def validate_part(path: Path):
    logging_utils.init()
    # Path objects do not support "+" with str; build the checksum path via
    # string formatting instead.
    sum_path = f"{path}.sum"
    _run_valsort(["-o", sum_path, path])
    logging.info(f"Validated output {path}")
    with open(sum_path, "rb") as fin:
        return os.path.getsize(path), fin.read()
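# Hedged sketch of the _run_valsort helper called above but not shown here.
# Judging from the valsort invocation in the variant below, it plausibly wraps
# subprocess.run and fails loudly on a nonzero exit code; the exact
# implementation is an assumption.
def _run_valsort(valsort_args: list):
    proc = subprocess.run([constants.VALSORT_PATH] + valsort_args,
                          capture_output=True)
    if proc.returncode != 0:
        logging.critical("\n" + proc.stderr.decode("ascii"))
        raise RuntimeError(f"valsort failed: {valsort_args}")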
def validate_part(path: Path):
    logging_utils.init()
    proc = subprocess.run([constants.VALSORT_PATH, path], capture_output=True)
    if proc.returncode != 0:
        logging.critical("\n" + proc.stderr.decode("ascii"))
        raise RuntimeError(f"Validation failed: {path}")
    logging.info(f"Validated output {path}")
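# Illustrative sketch, not from the source: assuming validate_part is wrapped
# as a Ray task, all output partitions can be checked in parallel. output_parts
# is a hypothetical list of the PartitionInfo objects returned by the reducers.
validate = ray.remote(validate_part)
ray.get([validate.remote(pinfo.path) for pinfo in output_parts])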
def mapper(args: Args, mapper_id: PartId, boundaries: List[int],
           path: Path) -> List[np.ndarray]:
    logging_utils.init()
    part = _load_partition(args, path)
    sort_fn = (_dummy_sort_and_partition
               if args.skip_sorting else sortlib.sort_and_partition)
    blocks = sort_fn(part, boundaries)
    return [part[offset:offset + size] for offset, size in blocks]
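# Hedged sketch of the _load_partition helper referenced above (not shown in
# this section). gensort writes flat binary files, so reading the partition as
# raw bytes matches how the mapper slices it; args is accepted only to mirror
# the call site. Treat this as an assumption, not the confirmed implementation.
def _load_partition(args: Args, path: Path) -> np.ndarray:
    return np.fromfile(path, dtype=np.uint8)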
def generate_part(args: Args, part_id: PartId, size: RecordCount,
                  offset: RecordCount) -> PartInfo:
    logging_utils.init()
    pinfo = _part_info(args, part_id)
    subprocess.run(
        [constants.GENSORT_PATH, f"-b{offset}", f"{size}", pinfo.path],
        check=True)
    logging.info(f"Generated input {pinfo}")
    return pinfo
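# Illustrative driver sketch, not from the source: assuming generate_part is
# wrapped as a Ray task, input partitions can be generated in parallel.
# records_per_part is a hypothetical name for the per-partition record count.
generate = ray.remote(generate_part)
parts = ray.get([
    generate.remote(args, i, records_per_part, i * records_per_part)
    for i in range(args.num_parts)
])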
def init():
    if args.ray_address is None:
        ray.init()
    else:
        ray.init(address=args.ray_address)
    logging_utils.init()
    logging.info(args)
    logging.info(ray.available_resources())
    os.makedirs(constants.WORK_DIR, exist_ok=True)
def generate_part(part_id: PartId, size: RecordCount,
                  offset: RecordCount) -> PartitionInfo:
    logging_utils.init()
    pinfo = _make_partition_info(part_id)
    if not args.skip_input:
        subprocess.run(
            [constants.GENSORT_PATH, f"-b{offset}", f"{size}", pinfo.path],
            check=True)
        logging.info(f"Generated input {pinfo}")
    return pinfo
def __init__(
    self,
    gauges: List[str],
    histograms: List[Tuple[str, List[int]]],
):
    self.counts = {m: 0 for m in gauges}
    self.gauges = {m: Gauge(m) for m in gauges}
    self.reset_gauges()
    self.histograms = {m: Histogram(m, boundaries=b) for m, b in histograms}
    logging_utils.init()
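# Illustrative usage, with hypothetical names: assuming the enclosing class is
# a ProgressTracker actor built on ray.util.metrics (Gauge, Histogram), it
# would be constructed from gauge names and (name, bucket boundaries) pairs.
tracker = ProgressTracker(
    gauges=["map_in_progress", "reduce_in_progress"],
    histograms=[("map_time_s", [1, 2, 5, 10, 30, 60])],
)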
def init(args: Args):
    if not args.ray_address:
        ray.init(resources={"worker": os.cpu_count()})
    else:
        ray.init(address=args.ray_address)
    logging_utils.init()
    logging.info(args)
    os.makedirs(constants.WORK_DIR, exist_ok=True)
    resources = ray.cluster_resources()
    logging.info(resources)
    # cluster_resources() reports quantities as floats; coerce to an integer
    # worker count before it is used for task scheduling.
    args.num_workers = int(resources["worker"])
    progress_tracker = tracing_utils.create_progress_tracker(args)
    return progress_tracker
def mapper(boundaries: List[int], mapper_id: PartId,
           path: Path) -> List[ray.ObjectRef]:
    logging_utils.init()
    task_id = f"M-{mapper_id} Mapper"
    logging.info(f"{task_id} starting")
    if args.skip_input:
        block_size = int(np.ceil(args.part_size / args.num_parts))
        return [
            ray.put(np.frombuffer(np.random.bytes(block_size), dtype=np.uint8))
            for _ in range(args.num_parts)
        ]
    part = _load_partition(path)
    sort_fn = (_dummy_sort_and_partition
               if args.skip_sorting else sortlib.sort_and_partition)
    blocks = sort_fn(part, boundaries)
    logging.info(f"{task_id} saving to object store")
    return [ray.put(part[offset:offset + size]) for offset, size in blocks]
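# Hedged sketch of the _dummy_sort_and_partition stand-in used when
# args.skip_sorting is set. The call sites only require a list of
# (offset, size) block descriptors, so a plausible no-op version slices the
# unsorted partition into len(boundaries) equal chunks; this is an assumption,
# not the confirmed implementation.
def _dummy_sort_and_partition(part: np.ndarray,
                              boundaries: List[int]) -> List[Tuple[int, int]]:
    step = int(np.ceil(part.size / len(boundaries)))
    blocks = []
    offset = 0
    for _ in boundaries:
        size = max(0, min(step, part.size - offset))  # clamp the final block
        blocks.append((offset, size))
        offset += step
    return blocks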
def reducer(reducer_id: PartId, *blocks) -> PartitionInfo:
    logging_utils.init()
    task_id = f"R-{reducer_id} Reducer"
    logging.info(f"{task_id} starting")
    blocks = [np.copy(ray.get(block)) for block in blocks]
    merge_fn = _dummy_merge if args.skip_sorting else sortlib.merge_partitions
    merger = merge_fn(blocks, args.reducer_batch_num_records)
    pinfo = _make_partition_info(reducer_id, "output")
    if args.skip_output:
        total = 0
        for datachunk in merger:
            total += len(datachunk)
    else:
        with open(pinfo.path, "wb") as fout:
            for datachunk in merger:
                fout.write(datachunk)
    logging.info(f"{task_id} done")
    return pinfo
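from typing import Iterable

# Hedged sketch of the _dummy_merge fallback referenced above. The real
# sortlib.merge_partitions performs a k-way merge of sorted blocks; this
# skip-sorting stand-in only needs to stream bytes back out in batches. The
# signature is inferred from merge_fn(blocks, args.reducer_batch_num_records);
# the 100-byte record size is the standard gensort record format.
RECORD_SIZE = 100  # 10-byte key + 90-byte value per gensort record

def _dummy_merge(blocks: List[np.ndarray],
                 batch_num_records: int) -> Iterable[np.ndarray]:
    batch_bytes = batch_num_records * RECORD_SIZE
    for block in blocks:
        for i in range(0, len(block), batch_bytes):
            yield block[i:i + batch_bytes]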
def init():
    ray.init(address="auto")
    logging_utils.init()
    logging.info(args)
    logging.info(ray.available_resources())
    os.makedirs(constants.WORK_DIR, exist_ok=True)