def write_datasource(self, datasource: Datasource[T],
                     **write_args) -> None:
    """Write the dataset to a custom datasource.

    Examples:
        >>> ds.write_datasource(CustomDatasourceImpl(...))

    Time complexity: O(dataset size / parallelism)

    Args:
        datasource: The datasource to write to.
        write_args: Additional write args to pass to the datasource.
    """
    write_tasks = datasource.prepare_write(self._blocks, **write_args)
    progress = ProgressBar("Write Progress", len(write_tasks))

    @ray.remote
    def remote_write(task: WriteTask) -> Any:
        return task()

    write_task_outputs = [remote_write.remote(w) for w in write_tasks]
    try:
        progress.block_until_complete(write_task_outputs)
        datasource.on_write_complete(write_tasks,
                                     ray.get(write_task_outputs))
    except Exception as e:
        datasource.on_write_failed(write_tasks, e)
        raise
    finally:
        progress.close()
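# A minimal sketch of the datasource contract that write_datasource relies
# on, inferred from the calls above (prepare_write / on_write_complete /
# on_write_failed). The InMemorySink class is hypothetical, not part of
# Ray's API; each write task is just a zero-arg callable run remotely.
class InMemorySink:
    def __init__(self):
        self.results = []

    def prepare_write(self, blocks, **write_args):
        # One write task per block; a real datasource would return
        # WriteTask objects that serialize the block to external storage.
        return [lambda b=b: ("wrote", b) for b in blocks]

    def on_write_complete(self, write_tasks, write_task_outputs):
        self.results.extend(write_task_outputs)

    def on_write_failed(self, write_tasks, error):
        self.results.clear()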
def sample_boundaries(blocks: BlockList[T], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many
    elements.
    """
    # Sample ~10 items per reducer, spread evenly across the input blocks.
    # Take at least one sample per block so sampling never degenerates to
    # zero items when there are more blocks than requested samples.
    n_samples = max(1, int(num_reducers * 10 / len(blocks)))

    @ray.remote
    def sample_block(block: Block[T]) -> np.ndarray:
        return BlockAccessor.for_block(block).sample(n_samples, key)

    sample_results = [sample_block.remote(block) for block in blocks]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    sample_items = np.concatenate(samples)
    sample_items.sort()
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.arange(0, 1, 1 / num_reducers)
    ]
    # The 0-quantile is just the minimum sample; drop it so exactly
    # (num_reducers - 1) boundaries are returned.
    return ret[1:]
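# A self-contained illustration (plain NumPy, no Ray) of why quantile
# boundaries computed from a small sample split the full dataset into
# roughly equal ranges. The data and sample sizes are arbitrary.
import numpy as np

data = np.random.default_rng(0).normal(size=100_000)
num_reducers = 4
sample = np.random.default_rng(1).choice(data, size=40)

# (num_reducers - 1) boundaries at the 1/4, 2/4, 3/4 sample quantiles.
boundaries = np.quantile(sample, np.arange(1, num_reducers) / num_reducers)

# Count how many items of the full dataset land in each range.
cuts = np.searchsorted(np.sort(data), boundaries)
counts = np.diff(cuts, prepend=0, append=len(data))
print(counts)  # roughly 25000 items per range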
def simple_shuffle(input_blocks: BlockList[T],
                   output_num_blocks: int) -> BlockList[T]:
    input_num_blocks = len(input_blocks)

    @ray.remote(num_returns=output_num_blocks)
    def shuffle_map(block: Block[T]) -> List[Block[T]]:
        block = BlockAccessor.for_block(block)
        # Ceiling division so the slices cover every row of the block.
        slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
        slices = []
        for i in range(output_num_blocks):
            slices.append(
                block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))
        num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
        assert num_rows == block.num_rows(), (num_rows, block.num_rows())
        # Needed to handle num_returns=1 edge case in Ray API.
        if len(slices) == 1:
            return slices[0]
        else:
            return slices

    @ray.remote(num_returns=2)
    def shuffle_reduce(
            *mapper_outputs: List[Block[T]]
    ) -> Tuple[Block[T], BlockMetadata]:
        builder = DelegatingArrowBlockBuilder()
        assert len(mapper_outputs) == input_num_blocks
        for block in mapper_outputs:
            builder.add_block(block)
        new_block = builder.build()
        accessor = BlockAccessor.for_block(new_block)
        new_metadata = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=accessor.schema(),
            input_files=None)
        return new_block, new_metadata

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)
    shuffle_map_out = [shuffle_map.remote(block) for block in input_blocks]
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    reduce_bar = ProgressBar("Shuffle Reduce", position=0,
                             total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.remote(
            *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))
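# A local, Ray-free sketch of the same map/reduce exchange: each mapper
# splits its input into one slice per reducer, and reducer j concatenates
# slice j from every mapper. Plain Python lists stand in for blocks; the
# local_shuffle helper is illustrative only.
import math

def local_shuffle(input_blocks, output_num_blocks):
    def shuffle_map(block):
        slice_sz = max(1, math.ceil(len(block) / output_num_blocks))
        return [block[i * slice_sz:(i + 1) * slice_sz]
                for i in range(output_num_blocks)]

    map_out = [shuffle_map(b) for b in input_blocks]
    # Transpose: reducer j receives map_out[i][j] from every mapper i.
    return [sum((map_out[i][j] for i in range(len(input_blocks))), [])
            for j in range(output_num_blocks)]

print(local_shuffle([[1, 2, 3], [4, 5, 6, 7]], 2))
# [[1, 2, 4, 5], [3, 6, 7]]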
def apply(self, fn: Any, remote_args: dict,
          blocks: List[Block[T]]) -> List[ObjectRef[Block]]:
    map_bar = ProgressBar("Map Progress", total=len(blocks))

    class Worker:
        def ready(self):
            return "ok"

        def process_block(self, block: Block[T]) -> Block[U]:
            return fn(block)

    if "num_cpus" not in remote_args:
        remote_args["num_cpus"] = 1
    Worker = ray.remote(**remote_args)(Worker)

    workers = [Worker.remote()]
    tasks = {w.ready.remote(): w for w in workers}
    ready_workers = set()
    blocks_in = blocks.copy()
    blocks_out = []

    while len(blocks_out) < len(blocks):
        ready, _ = ray.wait(
            list(tasks), timeout=0.01, num_returns=1, fetch_local=False)
        if not ready:
            # Autoscaling heuristic: if most workers have finished their
            # "ready" handshake (i.e. the pool is mostly busy), add an actor.
            if len(ready_workers) / len(workers) > 0.8:
                w = Worker.remote()
                workers.append(w)
                tasks[w.ready.remote()] = w
                map_bar.set_description(
                    "Map Progress ({} actors {} pending)".format(
                        len(ready_workers),
                        len(workers) - len(ready_workers)))
            continue

        [obj_id] = ready
        worker = tasks[obj_id]
        del tasks[obj_id]

        # Process task result.
        if worker in ready_workers:
            blocks_out.append(obj_id)
            map_bar.update(1)
        else:
            ready_workers.add(worker)

        # Schedule a new task.
        if blocks_in:
            tasks[worker.process_block.remote(blocks_in.pop())] = worker

    map_bar.close()
    return blocks_out
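# A minimal, self-contained sketch of the same ray.wait-driven actor pool,
# minus the autoscaling and progress reporting. Runnable as-is with Ray
# installed; MapWorker and pool_map are illustrative names, not Ray
# Datasets API.
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
class MapWorker:
    def process(self, item):
        return item * 2

def pool_map(items, num_workers=2):
    workers = [MapWorker.remote() for _ in range(num_workers)]
    idle, tasks, results = list(workers), {}, []
    items = list(items)
    while items or tasks:
        # Keep every idle worker busy, then wait for any task to finish.
        while idle and items:
            w = idle.pop()
            tasks[w.process.remote(items.pop())] = w
        [done], _ = ray.wait(list(tasks), num_returns=1)
        idle.append(tasks.pop(done))
        results.append(ray.get(done))
    return results

print(sorted(pool_map(range(8))))  # [0, 2, 4, 6, 8, 10, 12, 14]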
def sort_impl(blocks: BlockList[T], key: SortKeyT,
              descending: bool = False) -> BlockList[T]:
    if len(blocks) == 0:
        return BlockList([], [])

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]
    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks)
    num_reducers = num_mappers
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    @ray.remote(num_returns=num_reducers)
    def sort_block(block, boundaries):
        return BlockAccessor.for_block(block).sort_and_partition(
            boundaries, key, descending)

    @ray.remote(num_returns=2)
    def merge_sorted_blocks(
            *blocks: List[Block[T]]) -> Tuple[Block[T], BlockMetadata]:
        if len(blocks) == 1:
            blocks = blocks[0]  # Unwrap the single-element varargs tuple.
        return BlockAccessor.for_block(blocks[0]).merge_sorted_blocks(
            list(blocks), key, descending)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    for i, block in enumerate(blocks):
        map_results[i, :] = sort_block.remote(block, boundaries)
    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete([ret[0] for ret in map_results])
    map_bar.close()

    reduce_results = []
    for j in range(num_reducers):
        ret = merge_sorted_blocks.remote(*map_results[:, j].tolist())
        reduce_results.append(ret)
    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([ret[0] for ret in reduce_results])
    merge_bar.close()

    blocks = [b for b, _ in reduce_results]
    metadata = ray.get([m for _, m in reduce_results])
    return BlockList(blocks, metadata)
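# A Ray-free sketch of what sort_and_partition does per block: sort the
# block locally, then cut it at the sampled boundaries so partition j holds
# exactly the keys destined for reducer j. Plain NumPy arrays stand in for
# blocks; the helper name is illustrative.
import numpy as np

def sort_and_partition(rows, boundaries):
    rows = np.sort(rows)
    cuts = np.searchsorted(rows, boundaries)
    return np.split(rows, cuts)

parts = sort_and_partition(np.array([9, 1, 7, 4, 3, 8]), boundaries=[4, 8])
print([p.tolist() for p in parts])  # [[1, 3], [4, 7], [8, 9]]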
def simple_shuffle(input_blocks: List[ObjectRef[Block[T]]],
                   output_num_blocks: int) -> List[ObjectRef[Block[T]]]:
    input_num_blocks = len(input_blocks)

    @ray.remote(num_returns=output_num_blocks)
    def shuffle_map(block: Block[T]) -> List[Block[T]]:
        slice_sz = max(1, block.num_rows() // output_num_blocks)
        slices = []
        for i in range(output_num_blocks):
            # Extend the last slice to the end of the block so no rows are
            # dropped when num_rows isn't divisible by output_num_blocks.
            end = (block.num_rows() if i == output_num_blocks - 1 else
                   (i + 1) * slice_sz)
            slices.append(block.slice(i * slice_sz, end))
        return slices

    @ray.remote
    def shuffle_reduce(*mapper_outputs: List[Block[T]]) -> Block[T]:
        builder = DelegatingArrowBlockBuilder()
        assert len(mapper_outputs) == input_num_blocks
        for block in mapper_outputs:
            builder.add_block(block)
        return builder.build()

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)
    reduce_bar = ProgressBar("Shuffle Reduce", position=1,
                             total=output_num_blocks)

    shuffle_map_out = [shuffle_map.remote(block) for block in input_blocks]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])

    shuffle_reduce_out = [
        shuffle_reduce.remote(
            *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    reduce_bar.block_until_complete(shuffle_reduce_out)

    map_bar.close()
    reduce_bar.close()
    return shuffle_reduce_out
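# A quick self-contained check of the slicing math above: with the
# last-slice extension, the slices cover every row exactly once for any
# (num_rows, output_num_blocks) pair. Slice ends are clamped to num_rows
# the way a real block slice would be; slice_bounds is a hypothetical
# helper mirroring shuffle_map's arithmetic.
def slice_bounds(num_rows, output_num_blocks):
    slice_sz = max(1, num_rows // output_num_blocks)
    return [(i * slice_sz,
             num_rows if i == output_num_blocks - 1 else (i + 1) * slice_sz)
            for i in range(output_num_blocks)]

for num_rows in range(1, 100):
    for n_out in range(1, 20):
        covered = sum(
            max(0, min(end, num_rows) - min(start, num_rows))
            for start, end in slice_bounds(num_rows, n_out))
        assert covered == num_rows, (num_rows, n_out)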
def apply(self, fn: Any, remote_args: dict,
          blocks: BlockList[T]) -> BlockList[T]:
    map_bar = ProgressBar("Map Progress", total=len(blocks))

    class BlockWorker:
        def ready(self):
            return "ok"

        @ray.method(num_returns=2)
        def process_block(
                self, block: Block,
                meta: BlockMetadata) -> Tuple[Block, BlockMetadata]:
            new_block = fn(block)
            accessor = BlockAccessor.for_block(new_block)
            new_metadata = BlockMetadata(
                num_rows=accessor.num_rows(),
                size_bytes=accessor.size_bytes(),
                schema=accessor.schema(),
                input_files=meta.input_files)
            return new_block, new_metadata

    if "num_cpus" not in remote_args:
        remote_args["num_cpus"] = 1
    BlockWorker = ray.remote(**remote_args)(BlockWorker)

    self.workers = [BlockWorker.remote()]
    metadata_mapping = {}
    tasks = {w.ready.remote(): w for w in self.workers}
    ready_workers = set()
    blocks_in = list(zip(blocks, blocks.get_metadata()))
    blocks_out = []

    while len(blocks_out) < len(blocks):
        ready, _ = ray.wait(
            list(tasks), timeout=0.01, num_returns=1, fetch_local=False)
        if not ready:
            # Autoscaling heuristic: if most workers are busy, add an actor.
            if len(ready_workers) / len(self.workers) > 0.8:
                w = BlockWorker.remote()
                self.workers.append(w)
                tasks[w.ready.remote()] = w
                map_bar.set_description(
                    "Map Progress ({} actors {} pending)".format(
                        len(ready_workers),
                        len(self.workers) - len(ready_workers)))
            continue

        [obj_id] = ready
        worker = tasks[obj_id]
        del tasks[obj_id]

        # Process task result.
        if worker in ready_workers:
            blocks_out.append(obj_id)
            map_bar.update(1)
        else:
            ready_workers.add(worker)

        # Schedule a new task, tracking the metadata ref for each block ref.
        if blocks_in:
            block_ref, meta_ref = worker.process_block.remote(
                *blocks_in.pop())
            metadata_mapping[block_ref] = meta_ref
            tasks[block_ref] = worker

    new_metadata = ray.get([metadata_mapping[b] for b in blocks_out])
    map_bar.close()
    return BlockList(blocks_out, new_metadata)
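# A minimal runnable sketch of the pattern used above: an actor method
# declared with num_returns=2 hands back separate refs for the block and
# its metadata, so the driver can ray.wait on the block ref alone and fetch
# only the small metadata object. The Doubler actor is illustrative, not
# Ray Datasets API.
import ray

ray.init(ignore_reinit_error=True)

@ray.remote
class Doubler:
    @ray.method(num_returns=2)
    def process(self, rows):
        out = [r * 2 for r in rows]
        meta = {"num_rows": len(out)}
        return out, meta

actor = Doubler.remote()
block_ref, meta_ref = actor.process.remote([1, 2, 3])
ray.wait([block_ref])     # Track completion via the block ref alone...
print(ray.get(meta_ref))  # ...then fetch just the metadata: {'num_rows': 3}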