# NOTE: these excerpts assume the surrounding module imports are in scope:
# ray, math, numpy as np, typing's Any/List/Tuple, and the internal helpers
# (Block, BlockAccessor, BlockList, BlockMetadata, ProgressBar, Datasource,
# WriteTask, DelegatingArrowBlockBuilder, SortKeyT, ObjectRef, T).


def apply(self, fn: Any, remote_args: dict,
          blocks: BlockList[Any]) -> BlockList[Any]:
    map_bar = ProgressBar("Map Progress", total=len(blocks))

    kwargs = remote_args.copy()
    kwargs["num_returns"] = 2

    @ray.remote(**kwargs)
    def wrapped_fn(block: Block, meta: BlockMetadata):
        # Run fn on the worker and recompute the block's metadata there, so
        # the driver only ever fetches the (small) metadata objects.
        new_block = fn(block)
        accessor = BlockAccessor.for_block(new_block)
        new_meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=accessor.schema(),
            input_files=meta.input_files)
        return new_block, new_meta

    refs = [
        wrapped_fn.remote(b, m)
        for b, m in zip(blocks, blocks.get_metadata())
    ]
    new_blocks, new_metadata = zip(*refs)

    map_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    return BlockList(list(new_blocks), list(new_metadata))
def write_datasource(self, datasource: Datasource[T],
                     **write_args) -> None:
    """Write the dataset to a custom datasource.

    Examples:
        >>> ds.write_datasource(CustomDatasourceImpl(...))

    Time complexity: O(dataset size / parallelism)

    Args:
        datasource: The datasource to write to.
        write_args: Additional write args to pass to the datasource.
    """
    write_tasks = datasource.prepare_write(self._blocks, **write_args)
    progress = ProgressBar("Write Progress", len(write_tasks))

    @ray.remote
    def remote_write(task: WriteTask) -> Any:
        return task()

    write_task_outputs = [remote_write.remote(w) for w in write_tasks]
    try:
        progress.block_until_complete(write_task_outputs)
        datasource.on_write_complete(write_tasks,
                                     ray.get(write_task_outputs))
    except Exception as e:
        datasource.on_write_failed(write_tasks, e)
        raise
    finally:
        progress.close()
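# A minimal sketch of the Datasource contract that write_datasource exercises
# above, assuming only what the method itself calls: prepare_write returns a
# list of zero-argument callables (one "write task" per unit of work), each
# executed once on a remote worker, while on_write_complete/on_write_failed
# run back on the driver. DebugOutputDatasource and its behavior are
# illustrative assumptions, not part of the real Ray API.
class DebugOutputDatasource:
    """Hypothetical datasource that "writes" blocks by counting their rows."""

    def prepare_write(self, blocks, **write_args):
        def make_task(block_ref):
            def task():
                # Runs on a worker: fetch the block and return its row count.
                block = ray.get(block_ref)
                return BlockAccessor.for_block(block).num_rows()
            return task
        return [make_task(b) for b in blocks]

    def on_write_complete(self, write_tasks, write_task_results):
        print("wrote", sum(write_task_results), "rows")

    def on_write_failed(self, write_tasks, error):
        print("write failed:", error)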
def sample_boundaries(blocks: BlockList[T], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many
    elements.
    """
    n_samples = int(num_reducers * 10 / len(blocks))

    @ray.remote
    def sample_block(block: Block[T]) -> np.ndarray:
        return BlockAccessor.for_block(block).sample(n_samples, key)

    sample_results = [sample_block.remote(block) for block in blocks]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    sample_items = np.concatenate(samples)
    sample_items.sort()
    # Take evenly spaced quantiles of the pooled samples; dropping the q=0
    # entry leaves the num_reducers - 1 interior boundaries.
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.arange(0, 1, 1 / num_reducers)
    ]
    return ret[1:]
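# A small worked example of the boundary computation above (hypothetical
# values; `method=` is the modern NumPy spelling of the deprecated
# `interpolation=` argument used in the source). With num_reducers = 4 and
# pooled samples 1..100, np.arange(0, 1, 1 / 4) yields q = [0, 0.25, 0.5,
# 0.75]; dropping the q = 0 entry leaves 3 boundaries that split the domain
# into 4 roughly equal ranges.
import numpy as np

sample_items = np.arange(1, 101)
boundaries = [
    np.quantile(sample_items, q, method="nearest")
    for q in np.arange(0, 1, 1 / 4)
][1:]
print(boundaries)  # three values near the 25th, 50th, and 75th percentiles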
def simple_shuffle(input_blocks: BlockList[T],
                   output_num_blocks: int) -> BlockList[T]:
    input_num_blocks = len(input_blocks)

    @ray.remote(num_returns=output_num_blocks)
    def shuffle_map(block: Block[T]) -> List[Block[T]]:
        block = BlockAccessor.for_block(block)
        slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
        slices = []
        for i in range(output_num_blocks):
            slices.append(
                block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))
        num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
        assert num_rows == block.num_rows(), (num_rows, block.num_rows())
        # Needed to handle num_returns=1 edge case in Ray API.
        if len(slices) == 1:
            return slices[0]
        else:
            return slices

    @ray.remote(num_returns=2)
    def shuffle_reduce(
            *mapper_outputs: List[Block[T]]
    ) -> Tuple[Block[T], BlockMetadata]:
        builder = DelegatingArrowBlockBuilder()
        assert len(mapper_outputs) == input_num_blocks
        for block in mapper_outputs:
            builder.add_block(block)
        new_block = builder.build()
        accessor = BlockAccessor.for_block(new_block)
        new_metadata = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=accessor.schema(),
            input_files=None)
        return new_block, new_metadata

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)

    shuffle_map_out = [shuffle_map.remote(block) for block in input_blocks]
    if output_num_blocks == 1:
        # Handle the num_returns=1 edge case which doesn't return a list.
        shuffle_map_out = [[x] for x in shuffle_map_out]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])
    map_bar.close()

    reduce_bar = ProgressBar(
        "Shuffle Reduce", position=0, total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.remote(
            *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    reduce_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    reduce_bar.close()

    return BlockList(list(new_blocks), list(new_metadata))
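# A toy, Ray-free illustration of the all-to-all exchange simple_shuffle
# performs: mapper i emits one slice per reducer, giving an M x N matrix of
# map outputs, and reducer j concatenates column j. Names are hypothetical.
import math

def toy_shuffle(blocks, output_num_blocks):
    # "Map" side: split each block into output_num_blocks contiguous slices.
    map_out = []
    for block in blocks:
        sz = max(1, math.ceil(len(block) / output_num_blocks))
        map_out.append(
            [block[i * sz:(i + 1) * sz] for i in range(output_num_blocks)])
    # "Reduce" side: reducer j concatenates the j-th slice from every mapper.
    return [
        sum((row[j] for row in map_out), [])
        for j in range(output_num_blocks)
    ]

print(toy_shuffle([[1, 2, 3], [4, 5], [6, 7, 8, 9]], 2))
# -> [[1, 2, 4, 6, 7], [3, 5, 8, 9]]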
def apply(self, fn: Any, remote_args: dict,
          blocks: List[Block[T]]) -> List[ObjectRef[Block]]:
    map_bar = ProgressBar("Map Progress", total=len(blocks))

    if remote_args:
        fn = ray.remote(**remote_args)(fn)
    else:
        fn = ray.remote(fn)

    blocks = [fn.remote(b) for b in blocks]
    map_bar.block_until_complete(blocks)
    return blocks
def sort_impl(blocks: BlockList[T],
              key: SortKeyT,
              descending: bool = False) -> BlockList[T]:
    if len(blocks) == 0:
        return BlockList([], [])

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]

    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks)
    num_reducers = num_mappers
    boundaries = sample_boundaries(blocks, key, num_reducers)
    if descending:
        boundaries.reverse()

    @ray.remote(num_returns=num_reducers)
    def sort_block(block, boundaries):
        return BlockAccessor.for_block(block).sort_and_partition(
            boundaries, key, descending)

    @ray.remote(num_returns=2)
    def merge_sorted_blocks(
            *blocks: List[Block[T]]) -> Tuple[Block[T], BlockMetadata]:
        if len(blocks) == 1:
            # num_returns=1 edge case: the single return is the list itself.
            blocks = blocks[0]
        return BlockAccessor.for_block(blocks[0]).merge_sorted_blocks(
            list(blocks), key, descending)

    map_results = np.empty((num_mappers, num_reducers), dtype=object)
    for i, block in enumerate(blocks):
        map_results[i, :] = sort_block.remote(block, boundaries)
    map_bar = ProgressBar("Sort Map", len(map_results))
    map_bar.block_until_complete([ret[0] for ret in map_results])
    map_bar.close()

    reduce_results = []
    for j in range(num_reducers):
        ret = merge_sorted_blocks.remote(*map_results[:, j].tolist())
        reduce_results.append(ret)
    merge_bar = ProgressBar("Sort Merge", len(reduce_results))
    merge_bar.block_until_complete([ret[0] for ret in reduce_results])
    merge_bar.close()

    blocks = [b for b, _ in reduce_results]
    metadata = ray.get([m for _, m in reduce_results])
    return BlockList(blocks, metadata)
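# A toy, Ray-free sketch of the sample sort pattern used by sort_impl: each
# "mapper" sorts its block and cuts it at the shared boundaries; each
# "reducer" then merges the sorted pieces that fall into its range. The
# helper below is illustrative, not the Block API.
import bisect
import heapq

def toy_sort(blocks, boundaries):
    partitions = [[] for _ in range(len(boundaries) + 1)]
    for block in blocks:
        block = sorted(block)
        # Map side: cut the sorted block at each boundary.
        cuts = [bisect.bisect_left(block, b) for b in boundaries]
        prev = 0
        for j, cut in enumerate(cuts + [len(block)]):
            partitions[j].append(block[prev:cut])
            prev = cut
    # Reduce side: merge each partition's sorted runs.
    return [list(heapq.merge(*parts)) for parts in partitions]

print(toy_sort([[3, 1, 9], [7, 2], [8, 4, 6]], boundaries=[5]))
# -> [[1, 2, 3, 4], [6, 7, 8, 9]]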
def apply(self, fn: Any, remote_args: dict,
          blocks: BlockList[Any]) -> BlockList[Any]:
    map_bar = ProgressBar("Map Progress", total=len(blocks))

    kwargs = remote_args.copy()
    kwargs["num_returns"] = 2

    # Lazy init to avoid circular import. TODO(ekl) move these into a
    # separate remote functions file.
    global _remote_fn
    if _remote_fn is None:
        _remote_fn = ray.remote(map_block)

    refs = [
        _remote_fn.options(**kwargs).remote(b, m, fn)
        for b, m in zip(blocks, blocks.get_metadata())
    ]
    new_blocks, new_metadata = zip(*refs)

    map_bar.block_until_complete(list(new_blocks))
    new_metadata = ray.get(list(new_metadata))
    return BlockList(list(new_blocks), list(new_metadata))
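# map_block and _remote_fn are referenced above but not defined in this
# excerpt. Based on wrapped_fn in the first apply() variant, a plausible
# sketch (an assumption, not the actual definition) is:
_remote_fn = None  # module-level cache for the lazily created remote function

def map_block(block: Block, meta: BlockMetadata,
              fn: Any) -> Tuple[Block, BlockMetadata]:
    new_block = fn(block)
    accessor = BlockAccessor.for_block(new_block)
    new_meta = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=meta.input_files)
    return new_block, new_meta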
def simple_shuffle(input_blocks: List[ObjectRef[Block[T]]],
                   output_num_blocks: int) -> List[ObjectRef[Block[T]]]:
    input_num_blocks = len(input_blocks)

    @ray.remote(num_returns=output_num_blocks)
    def shuffle_map(block: Block[T]) -> List[Block[T]]:
        # Use ceiling division so the final slice keeps any remainder rows;
        # floor division would silently drop up to output_num_blocks - 1
        # trailing rows.
        slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
        slices = []
        for i in range(output_num_blocks):
            slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz))
        return slices

    @ray.remote
    def shuffle_reduce(*mapper_outputs: List[Block[T]]) -> Block[T]:
        builder = DelegatingArrowBlockBuilder()
        assert len(mapper_outputs) == input_num_blocks
        for block in mapper_outputs:
            builder.add_block(block)
        return builder.build()

    map_bar = ProgressBar("Shuffle Map", position=0, total=input_num_blocks)
    reduce_bar = ProgressBar(
        "Shuffle Reduce", position=1, total=output_num_blocks)

    shuffle_map_out = [shuffle_map.remote(block) for block in input_blocks]
    map_bar.block_until_complete([x[0] for x in shuffle_map_out])

    shuffle_reduce_out = [
        shuffle_reduce.remote(
            *[shuffle_map_out[i][j] for i in range(input_num_blocks)])
        for j in range(output_num_blocks)
    ]
    reduce_bar.block_until_complete(shuffle_reduce_out)

    map_bar.close()
    reduce_bar.close()
    return shuffle_reduce_out