def sort_impl(
    blocks: BlockList, clear_input_blocks: bool, key: SortKeyT, descending: bool = False
) -> Tuple[BlockList, dict]:
    stage_info = {}
    blocks_list = blocks.get_blocks()
    if len(blocks_list) == 0:
        return BlockList([], []), stage_info

    if isinstance(key, str):
        key = [(key, "descending" if descending else "ascending")]
    if isinstance(key, list):
        descending = key[0][1] == "descending"

    num_mappers = len(blocks_list)
    # Use same number of output partitions.
    num_reducers = num_mappers
    # TODO(swang): sample_boundaries could be fused with a previous stage.
    boundaries = sample_boundaries(blocks_list, key, num_reducers)
    if descending:
        boundaries.reverse()

    context = DatasetContext.get_current()
    if context.use_push_based_shuffle:
        sort_op_cls = PushBasedSortOp
    else:
        sort_op_cls = SimpleSortOp
    sort_op = sort_op_cls(
        map_args=[boundaries, key, descending], reduce_args=[key, descending]
    )
    return sort_op.execute(
        blocks,
        num_reducers,
        clear_input_blocks,
    )
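
# Usage sketch (illustrative assumption, not from the original source):
# sort_impl backs the public Dataset.sort() API, and the use_push_based_shuffle
# context flag checked above selects between PushBasedSortOp and SimpleSortOp.

import ray

ds_sorted = ray.data.range(5).sort(descending=True)  # dispatches into sort_impl
print(ds_sorted.take(5))  # -> [4, 3, 2, 1, 0]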
def from_pandas_refs(
    dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of Ray object references to pandas
    dataframes.

    Args:
        dfs: A Ray object reference to a pandas dataframe, or a list of Ray
            object references to pandas dataframes.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    if isinstance(dfs, ray.ObjectRef):
        dfs = [dfs]
    elif isinstance(dfs, list):
        for df in dfs:
            if not isinstance(df, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(df)}"
                )
    else:
        raise ValueError(
            "Expected Ray object ref or list of Ray object refs, "
            f"got {type(dfs)}"
        )

    context = DatasetContext.get_current()
    if context.enable_pandas_block:
        get_metadata = cached_remote_fn(_get_metadata)
        metadata = ray.get([get_metadata.remote(df) for df in dfs])
        return Dataset(
            ExecutionPlan(
                BlockList(dfs, metadata),
                DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
            ),
            0,
            False,
        )

    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)

    res = [df_to_block.remote(df) for df in dfs]
    blocks, metadata = map(list, zip(*res))
    metadata = ray.get(metadata)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
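
# Usage sketch (assumes a running Ray instance and pandas installed): this
# function is exposed publicly as ray.data.from_pandas_refs.

import pandas as pd
import ray

df_ref = ray.put(pd.DataFrame({"a": [1, 2, 3]}))
ds = ray.data.from_pandas_refs([df_ref])  # a bare ObjectRef also works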
def _test_equal_split_balanced(block_sizes, num_splits):
    blocks = []
    metadata = []
    total_rows = 0
    for block_size in block_sizes:
        block = list(range(total_rows, total_rows + block_size))
        blocks.append(ray.put(block))
        metadata.append(BlockAccessor.for_block(block).get_metadata(None, None))
        total_rows += block_size
    block_list = BlockList(blocks, metadata)
    ds = Dataset(
        ExecutionPlan(block_list, DatasetStats.TODO()),
        0,
        False,
    )

    splits = ds.split(num_splits, equal=True)
    split_counts = [split.count() for split in splits]
    assert len(split_counts) == num_splits
    expected_block_size = total_rows // num_splits
    # Check that all splits are the expected size.
    assert all([count == expected_block_size for count in split_counts])

    expected_total_rows = sum(split_counts)
    # Check that the expected number of rows were dropped.
    assert total_rows - expected_total_rows == total_rows % num_splits
    # Check that all rows are unique (content check).
    split_rows = [row for split in splits for row in split.take(total_rows)]
    assert len(set(split_rows)) == len(split_rows)
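
# Example invocation (illustrative): with 11 total rows split 3 ways, each
# split gets 11 // 3 == 3 rows and 11 % 3 == 2 rows are dropped, which is
# exactly what the assertions above verify.

_test_equal_split_balanced(block_sizes=[1, 10], num_splits=3)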
def from_arrow_refs(
    tables: Union[
        ObjectRef[Union["pyarrow.Table", bytes]],
        List[ObjectRef[Union["pyarrow.Table", bytes]]],
    ]
) -> Dataset[ArrowRow]:
    """Create a dataset from a set of Arrow tables.

    Args:
        tables: A Ray object reference to an Arrow table, or a list of Ray
            object references to Arrow tables; each table may also be given
            in its Arrow streaming format as bytes.

    Returns:
        Dataset holding Arrow records from the tables.
    """
    if isinstance(tables, ray.ObjectRef):
        tables = [tables]

    get_metadata = cached_remote_fn(_get_metadata)
    metadata = ray.get([get_metadata.remote(t) for t in tables])
    return Dataset(
        ExecutionPlan(
            BlockList(tables, metadata),
            DatasetStats(stages={"from_arrow_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
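
# Usage sketch (assumes a running Ray instance and pyarrow installed): this
# function is exposed publicly as ray.data.from_arrow_refs.

import pyarrow as pa
import ray

table_ref = ray.put(pa.table({"a": [1, 2, 3]}))
ds = ray.data.from_arrow_refs([table_ref])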
def _rewrite_read_stage(
    in_blocks: LazyBlockList,
) -> Tuple[BlockList, DatasetStats, Stage]:
    """Rewrite the read stage to a OneToOne stage over read tasks as input.

    For example, suppose the plan was [Read -> MapBatches(Fn)]. These stages
    cannot be fused, since read stages are handled specially. After rewriting
    to [GetReadTasks -> MapBatches(DoRead) -> MapBatches(Fn)], we can now fuse
    the latter two MapBatches stages into a single OneToOne stage:
    [GetReadTasks -> MapBatches(DoRead -> Fn)].

    Args:
        in_blocks: Lazy block list representing the read stage.

    Returns:
        Non-lazy block list containing read tasks for not-yet-read block
        partitions, new stats for the block list, and the new one-to-one
        read stage.
    """
    # Generate the "GetReadTasks" stage blocks.
    remote_args = in_blocks._remote_args
    blocks, metadata = [], []
    for read_task in in_blocks._tasks:
        blocks.append(ray.put(read_task._read_fn))
        metadata.append(read_task.get_metadata())
    block_list = BlockList(blocks, metadata)

    def block_fn(read_fn: Callable[[], Iterator[Block]]) -> Iterator[Block]:
        for block in read_fn():
            yield block

    stage = OneToOneStage("read", block_fn, "tasks", remote_args)
    stats = DatasetStats(stages={}, parent=None)
    return block_list, stats, stage
def _optimize_stages(self):
    """Optimize this pipeline, fusing stages together where possible."""
    context = DatasetContext.get_current()

    if not context.optimize_fuse_stages:
        self._optimized_stages = self._stages
        return

    # This dummy dataset will be used to get a set of optimized stages.
    dummy_ds = Dataset(
        ExecutionPlan(BlockList([], []), DatasetStats(stages={}, parent=None)),
        0,
        True,
    )
    # Apply all pipeline operations to the dummy dataset.
    for stage in self._stages:
        dummy_ds = stage(dummy_ds)
    # Get the optimized stages.
    _, _, stages = dummy_ds._plan._optimize()
    # Apply these optimized stages to the datasets underlying the pipeline.
    # These optimized stages will be executed by the PipelineExecutor.
    optimized_stages = []
    for stage in stages:
        optimized_stages.append(
            lambda ds, stage=stage: Dataset(
                ds._plan.with_stage(stage), ds._epoch, True
            )
        )
    self._optimized_stages = optimized_stages
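
# Sketch (illustrative assumption): stage fusion is gated on the context flag
# checked above, so it can be turned off globally, e.g. when debugging a
# pipeline whose fused stages are hard to attribute in stats.

from ray.data.context import DatasetContext

DatasetContext.get_current().optimize_fuse_stages = False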
def build(self, final_blocks: BlockList) -> "DatasetStats":
    stats = DatasetStats(
        stages={self.stage_name: final_blocks.get_metadata()},
        parent=self.parent,
    )
    stats.time_total_s = time.perf_counter() - self.start_time
    return stats
def from_items(items: List[Any], *, parallelism: int = -1) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> import ray
        >>> ds = ray.data.from_items([1, 2, 3, 4, 5])  # doctest: +SKIP
        >>> ds  # doctest: +SKIP
        Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>)
        >>> ds.take(2)  # doctest: +SKIP
        [1, 2]

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    detected_parallelism, _ = _autodetect_parallelism(
        parallelism,
        ray.util.get_current_placement_group(),
        DatasetContext.get_current(),
    )
    block_size = max(
        1,
        len(items) // detected_parallelism,
    )

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        for item in items[i : i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=stats.build()
            )
        )
        i += block_size

    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_items": metadata}, parent=None),
        ),
        0,
        False,
    )
def _rewrite_read_stage(
    in_blocks: LazyBlockList, stages: List[Stage]
) -> Tuple[BlockList, DatasetStats, List[Stage]]:
    """Rewrite the read stage to a OneToOne stage over read tasks as input.

    For example, suppose the plan was [Read -> MapBatches(Fn)]. These stages
    cannot be fused, since read stages are handled specially. After rewriting
    to [GetReadTasks -> MapBatches(DoRead) -> MapBatches(Fn)], we can now fuse
    the latter two MapBatches stages into a single OneToOne stage:
    [GetReadTasks -> MapBatches(DoRead -> Fn)].

    Args:
        in_blocks: Lazy block list representing the read stage.
        stages: List of current stages.

    Returns:
        Non-lazy block list containing read tasks for not-yet-read block
        partitions, new stats for the block list, and the new list of stages.
    """
    from ray.data._internal.stage_impl import RandomizeBlocksStage

    # Generate the "GetReadTasks" stage blocks.
    remote_args = in_blocks._remote_args
    blocks, metadata = [], []
    for read_task in in_blocks._tasks:
        blocks.append(ray.put(read_task._read_fn))
        metadata.append(read_task.get_metadata())
    block_list = BlockList(blocks, metadata)

    def block_fn(read_fn: Callable[[], Iterator[Block]]) -> Iterator[Block]:
        for block in read_fn():
            yield block

    name = "read"

    # Fuse a downstream randomize stage with the read stage if possible. This
    # is needed when .window() is called right after read->randomize, since it
    # forces execution. (See the sketch after this function.)
    if stages and isinstance(stages[0], RandomizeBlocksStage):
        block_list, _ = stages[0].do_randomize(block_list)
        stages = stages[1:]
        name += "->randomize_block_order"

    stage = OneToOneStage(name, block_fn, "tasks", remote_args)
    stats = DatasetStats(stages={}, parent=None)
    stages.insert(0, stage)
    return block_list, stats, stages
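
# Sketch (illustrative assumption): the read->randomize fusion above is
# exercised when a window() call forces execution immediately after a
# randomized read, e.g.:

import ray

pipe = ray.data.range(100).randomize_block_order().window(blocks_per_window=10)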
def from_numpy_refs(
    ndarrays: Union[ObjectRef[np.ndarray], List[ObjectRef[np.ndarray]]],
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of NumPy ndarray futures.

    Args:
        ndarrays: A Ray object reference to a NumPy ndarray or a list of Ray
            object references to NumPy ndarrays.

    Returns:
        Dataset holding the given ndarrays.
    """
    if isinstance(ndarrays, ray.ObjectRef):
        ndarrays = [ndarrays]
    elif isinstance(ndarrays, list):
        for ndarray in ndarrays:
            if not isinstance(ndarray, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(ndarray)}"
                )
    else:
        raise ValueError(
            f"Expected Ray object ref or list of Ray object refs, got {type(ndarrays)}"
        )

    ndarray_to_block = cached_remote_fn(_ndarray_to_block, num_returns=2)

    res = [ndarray_to_block.remote(ndarray) for ndarray in ndarrays]
    blocks, metadata = map(list, zip(*res))
    metadata = ray.get(metadata)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_numpy_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
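
# Usage sketch (assumes a running Ray instance): this function is exposed
# publicly as ray.data.from_numpy_refs.

import numpy as np
import ray

arr_ref = ray.put(np.arange(6).reshape(2, 3))
ds = ray.data.from_numpy_refs([arr_ref])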
def do_zip_all(block_list, clear_input_blocks: bool, *_):
    blocks1 = block_list.get_blocks()
    blocks2 = other.get_internal_block_refs()
    if clear_input_blocks:
        block_list.clear()
    if len(blocks1) != len(blocks2):
        # TODO(ekl) consider supporting if num_rows are equal.
        raise ValueError(
            "Cannot zip dataset of different num blocks: {} vs {}".format(
                len(blocks1), len(blocks2)
            )
        )

    def do_zip(block1: Block, block2: Block) -> Tuple[Block, BlockMetadata]:
        stats = BlockExecStats.builder()
        b1 = BlockAccessor.for_block(block1)
        result = b1.zip(block2)
        br = BlockAccessor.for_block(result)
        return result, br.get_metadata(input_files=[], exec_stats=stats.build())

    do_zip_fn = cached_remote_fn(do_zip, num_returns=2)

    blocks = []
    metadata = []
    for b1, b2 in zip(blocks1, blocks2):
        res, meta = do_zip_fn.remote(b1, b2)
        blocks.append(res)
        metadata.append(meta)

    # Early release memory.
    del blocks1, blocks2

    # TODO(ekl) it might be nice to have a progress bar here.
    metadata = ray.get(metadata)
    blocks = BlockList(blocks, metadata)
    return blocks, {}
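
# Usage sketch (illustrative assumption): do_zip_all implements Dataset.zip(),
# which pairs up blocks one-to-one and therefore requires equal block counts,
# as enforced above.

import ray

ds1 = ray.data.from_items([1, 2, 3])
ds2 = ray.data.from_items(["a", "b", "c"])
zipped = ds1.zip(ds2)  # raises ValueError if the block counts differ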
def execute(
    self,
    input_blocks: BlockList,
    output_num_blocks: int,
    clear_input_blocks: bool,
    *,
    map_ray_remote_args: Optional[Dict[str, Any]] = None,
    reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
    merge_factor: int = 2,
) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
    logger.info("Using experimental push-based shuffle.")
    # TODO(swang): For jobs whose reduce work is heavier than the map work,
    # we should support fractional merge factors.
    # TODO(swang): For large jobs, we should try to choose the merge factor
    # automatically, e.g., by running one test round of map and merge tasks
    # and comparing their run times.
    # TODO(swang): Add option to automatically reduce write amplification
    # during map-merge stage, by limiting how many partitions can be
    # processed concurrently.
    input_blocks_list = input_blocks.get_blocks()
    # Preemptively clear the blocks list since we will incrementally delete
    # the last remaining references as we submit the dependent map tasks
    # during the map-merge stage.
    if clear_input_blocks:
        input_blocks.clear()

    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    # The placement strategy for reduce tasks is overwritten to colocate
    # them with their inputs from the merge stage, so remove any
    # pre-specified scheduling strategy here.
    reduce_ray_remote_args = reduce_ray_remote_args.copy()
    reduce_ray_remote_args.pop("scheduling_strategy", None)

    map_fn = self._map_partition
    merge_fn = self._merge

    def map_partition(*args, **kwargs):
        return map_fn(self.map, *args, **kwargs)

    def merge(*args, **kwargs):
        return merge_fn(self.reduce, *args, **kwargs)

    shuffle_map = cached_remote_fn(map_partition)
    shuffle_merge = cached_remote_fn(merge)

    def submit_map_task(arg):
        mapper_idx, block = arg
        # NOTE(swang): Results are shuffled between map and merge tasks, so
        # there is no advantage to colocating specific map and merge tasks.
        # Therefore, we do not specify a node affinity policy for map tasks
        # in case the caller or Ray has a better scheduling strategy, e.g.,
        # based on data locality.
        map_result = shuffle_map.options(
            **map_ray_remote_args,
            num_returns=1 + schedule.num_merge_tasks_per_round,
        ).remote(
            mapper_idx,
            block,
            output_num_blocks,
            schedule,
            *self._map_args,
        )
        metadata_ref = map_result.pop(0)
        return metadata_ref, map_result

    def submit_merge_task(arg):
        merge_idx, map_results = arg
        num_merge_returns = schedule.get_num_reducers_per_merge_idx(merge_idx)
        merge_result = shuffle_merge.options(
            num_returns=1 + num_merge_returns,
            **schedule.get_merge_task_options(merge_idx),
        ).remote(
            *map_results,
            reduce_args=self._reduce_args,
        )
        metadata_ref = merge_result.pop(0)
        return metadata_ref, merge_result

    # Compute all constants used for task scheduling.
    num_cpus_per_node_map = _get_num_cpus_per_node_map()
    schedule = self._compute_shuffle_schedule(
        num_cpus_per_node_map,
        len(input_blocks_list),
        merge_factor,
        output_num_blocks,
    )

    # ObjectRef results from the last round of tasks. Used to add
    # backpressure during pipelining of map and merge tasks.
    last_map_metadata_results = []
    last_merge_metadata_results = []
    # Final outputs from the map-merge stage.
    # This is a map from merge task index to a nested list of merge results
    # (ObjectRefs). Each merge task index corresponds to a partition of P
    # final reduce tasks.
    all_merge_results = [[] for _ in range(schedule.num_merge_tasks_per_round)]
    shuffle_map_metadata = []
    shuffle_merge_metadata = []

    map_bar = ProgressBar("Shuffle Map", position=0, total=len(input_blocks_list))

    # Execute the map-merge stage. This submits tasks in rounds of M map
    # tasks and N merge tasks each. Task execution between map and merge is
    # pipelined, so that while executing merge for one round of inputs, we
    # also execute the map tasks for the following round.
    input_blocks_list = list(enumerate(input_blocks_list))
    while input_blocks_list:
        # Execute one round of the map stage.
        # Pop from the inputs so that we can clear the memory ASAP.
        round_input_blocks = []
        try:
            for _ in range(schedule.num_map_tasks_per_round):
                round_input_blocks.append(input_blocks_list.pop(0))
        except IndexError:
            pass
        (
            prev_map_metadata,
            last_map_metadata_results,
            map_results,
        ) = _execute_pipelined_stage(
            submit_map_task,
            last_map_metadata_results,
            round_input_blocks,
            progress_bar=map_bar,
        )
        shuffle_map_metadata += prev_map_metadata

        # Shuffle the map results for the merge tasks.
        merge_args = [
            (merge_idx, [map_result.pop(0) for map_result in map_results])
            for merge_idx in range(schedule.num_merge_tasks_per_round)
        ]
        assert all([not map_result for map_result in map_results])
        # Execute one round of the merge stage.
        (
            prev_merge_metadata,
            last_merge_metadata_results,
            merge_results,
        ) = _execute_pipelined_stage(
            submit_merge_task,
            last_merge_metadata_results,
            merge_args,
        )
        shuffle_merge_metadata += prev_merge_metadata
        for merge_idx, merge_result in enumerate(merge_results):
            all_merge_results[merge_idx].append(merge_result)
        del merge_results

    # Wait for last map and merge tasks to finish.
    prev_map_metadata, _, _ = _execute_pipelined_stage(
        None, last_map_metadata_results, [], progress_bar=map_bar
    )
    shuffle_map_metadata += prev_map_metadata
    map_bar.close()
    prev_merge_metadata, _, _ = _execute_pipelined_stage(
        None, last_merge_metadata_results, []
    )
    shuffle_merge_metadata += prev_merge_metadata

    # Execute and wait for the reduce stage.
    new_metadata, new_blocks = self._execute_reduce_stage(
        output_num_blocks, schedule, reduce_ray_remote_args, all_merge_results
    )

    stats = {
        "map": shuffle_map_metadata,
        "merge": shuffle_merge_metadata,
        "reduce": new_metadata,
    }

    return BlockList(list(new_blocks), list(new_metadata)), stats
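
# Sketch (illustrative assumption): this execute() path is opt-in via the
# dataset context (see use_push_based_shuffle in sort_impl above); shuffle
# operations such as random_shuffle() then route through it.

import ray
from ray.data.context import DatasetContext

DatasetContext.get_current().use_push_based_shuffle = True
ds = ray.data.range(1000).random_shuffle()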
def compute_to_blocklist(self) -> BlockList:
    """Launch all tasks and return a concrete BlockList."""
    blocks, metadata = self._get_blocks_with_metadata()
    return BlockList(blocks, metadata)
def _apply(
    self,
    fn: Any,
    remote_args: dict,
    block_list: BlockList,
    clear_input_blocks: bool,
    name: Optional[str] = None,
) -> BlockList:
    context = DatasetContext.get_current()

    # Handle empty datasets.
    if block_list.initial_num_blocks() == 0:
        return block_list

    blocks = block_list.get_blocks_with_metadata()
    if name is None:
        name = "map"
    name = name.title()
    map_bar = ProgressBar(name, total=len(blocks))

    if context.block_splitting_enabled:
        map_block = cached_remote_fn(_map_block_split).options(**remote_args)
        refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks]
    else:
        map_block = cached_remote_fn(_map_block_nosplit).options(
            **dict(remote_args, num_returns=2)
        )
        all_refs = [map_block.remote(b, fn, m.input_files) for b, m in blocks]
        data_refs = [r[0] for r in all_refs]
        refs = [r[1] for r in all_refs]

    # Release input block references.
    if clear_input_blocks:
        del blocks
        block_list.clear()

    # Common wait for non-data refs.
    try:
        results = map_bar.fetch_until_complete(refs)
    except (ray.exceptions.RayTaskError, KeyboardInterrupt) as e:
        # One or more mapper tasks failed, or we received a SIGINT signal
        # while waiting; either way, we cancel all map tasks.
        for ref in refs:
            ray.cancel(ref)
        # Wait until all tasks have failed or been cancelled.
        for ref in refs:
            try:
                ray.get(ref)
            except (ray.exceptions.RayTaskError, ray.exceptions.TaskCancelledError):
                pass
        # Reraise the original task failure exception.
        raise e from None

    new_blocks, new_metadata = [], []
    if context.block_splitting_enabled:
        for result in results:
            for block, metadata in result:
                new_blocks.append(block)
                new_metadata.append(metadata)
    else:
        for block, metadata in zip(data_refs, results):
            new_blocks.append(block)
            new_metadata.append(metadata)
    return BlockList(list(new_blocks), list(new_metadata))
def execute(
    self,
    input_blocks: BlockList,
    output_num_blocks: int,
    clear_input_blocks: bool,
    *,
    map_ray_remote_args: Optional[Dict[str, Any]] = None,
    reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
    merge_factor: int = 2,
) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
    logger.info("Using experimental push-based shuffle.")
    # TODO(swang): For jobs whose reduce work is heavier than the map work,
    # we should support fractional merge factors.
    # TODO(swang): For large jobs, we should try to choose the merge factor
    # automatically, e.g., by running one test round of map and merge tasks
    # and comparing their run times.
    # TODO(swang): Add option to automatically reduce write amplification
    # during map-merge stage, by limiting how many partitions can be
    # processed concurrently.
    input_blocks_list = input_blocks.get_blocks()
    # Preemptively clear the blocks list since we will incrementally delete
    # the last remaining references as we submit the dependent map tasks
    # during the map-merge stage.
    if clear_input_blocks:
        input_blocks.clear()

    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    # The placement strategy for reduce tasks is overwritten to colocate
    # them with their inputs from the merge stage, so remove any
    # pre-specified scheduling strategy here.
    reduce_ray_remote_args = reduce_ray_remote_args.copy()
    reduce_ray_remote_args.pop("scheduling_strategy", None)

    # Compute all constants used for task scheduling.
    num_cpus_per_node_map = _get_num_cpus_per_node_map()
    stage = self._compute_shuffle_schedule(
        num_cpus_per_node_map,
        len(input_blocks_list),
        merge_factor,
        output_num_blocks,
    )

    map_fn = self._map_partition
    merge_fn = self._merge

    def map_partition(*args, **kwargs):
        return map_fn(self.map, *args, **kwargs)

    def merge(*args, **kwargs):
        return merge_fn(self.reduce, *args, **kwargs)

    shuffle_map = cached_remote_fn(map_partition)
    shuffle_map = shuffle_map.options(
        **map_ray_remote_args,
        num_returns=1 + stage.num_merge_tasks_per_round,
    )

    map_stage_iter = _MapStageIterator(
        input_blocks_list,
        shuffle_map,
        [output_num_blocks, stage.merge_schedule, *self._map_args],
    )
    map_bar = ProgressBar("Shuffle Map", position=0, total=len(input_blocks_list))
    map_stage_executor = _PipelinedStageExecutor(
        map_stage_iter, stage.num_map_tasks_per_round, progress_bar=map_bar
    )

    shuffle_merge = cached_remote_fn(merge)
    merge_stage_iter = _MergeStageIterator(
        map_stage_iter, shuffle_merge, stage, self._reduce_args
    )
    merge_stage_executor = _PipelinedStageExecutor(
        merge_stage_iter, stage.num_merge_tasks_per_round, max_concurrent_rounds=2
    )

    # Execute the map-merge stage. This submits tasks in rounds of M map
    # tasks and N merge tasks each. Task execution between map and merge is
    # pipelined, so that while executing merge for one round of inputs, we
    # also execute the map tasks for the following round.
    map_done = False
    merge_done = False
    map_stage_metadata = []
    merge_stage_metadata = []
    while not (map_done and merge_done):
        try:
            map_stage_metadata += next(map_stage_executor)
        except StopIteration:
            map_done = True
            break

        try:
            merge_stage_metadata += next(merge_stage_executor)
        except StopIteration:
            merge_done = True
            break

    map_bar.close()
    all_merge_results = merge_stage_iter.pop_merge_results()

    # Execute and wait for the reduce stage.
    reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
    shuffle_reduce = cached_remote_fn(self.reduce)
    reduce_stage_iter = _ReduceStageIterator(
        stage,
        shuffle_reduce,
        all_merge_results,
        reduce_ray_remote_args,
        self._reduce_args,
    )

    max_reduce_tasks_in_flight = output_num_blocks
    ctx = DatasetContext.get_current()
    if ctx.pipeline_push_based_shuffle_reduce_tasks:
        # If pipelining is enabled, we should still try to utilize all
        # cores.
        max_reduce_tasks_in_flight = min(
            max_reduce_tasks_in_flight, sum(num_cpus_per_node_map.values())
        )

    reduce_stage_executor = _PipelinedStageExecutor(
        reduce_stage_iter,
        max_reduce_tasks_in_flight,
        max_concurrent_rounds=2,
        progress_bar=reduce_bar,
    )
    reduce_stage_metadata = []
    while True:
        try:
            reduce_stage_metadata += next(reduce_stage_executor)
        except StopIteration:
            break

    new_blocks = reduce_stage_iter.pop_reduce_results()
    sorted_blocks = [
        (block[0], block[1], reduce_stage_metadata[i])
        for i, block in enumerate(new_blocks)
    ]
    sorted_blocks.sort(key=lambda x: x[0])
    _, new_blocks, reduce_stage_metadata = zip(*sorted_blocks)
    del sorted_blocks

    assert (
        len(new_blocks) == output_num_blocks
    ), f"Expected {output_num_blocks} outputs, produced {len(new_blocks)}"
    reduce_bar.close()

    stats = {
        "map": map_stage_metadata,
        "merge": merge_stage_metadata,
        "reduce": reduce_stage_metadata,
    }
    return BlockList(list(new_blocks), list(reduce_stage_metadata)), stats
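
# Sketch (grounded in the ctx check above): reduce-task pipelining is itself
# gated on a context flag; when enabled, in-flight reduce tasks are capped at
# the cluster-wide CPU count rather than at output_num_blocks.

from ray.data.context import DatasetContext

DatasetContext.get_current().pipeline_push_based_shuffle_reduce_tasks = True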
def _apply(
    self,
    fn: Any,
    remote_args: dict,
    block_list: BlockList,
    clear_input_blocks: bool,
    name: Optional[str] = None,
) -> BlockList:
    """Note: this is not part of the Dataset public API."""
    context = DatasetContext.get_current()

    blocks_in = block_list.get_blocks_with_metadata()

    # Early release block references.
    if clear_input_blocks:
        block_list.clear()

    orig_num_blocks = len(blocks_in)
    results = []
    if name is None:
        name = "map"
    name = name.title()
    map_bar = ProgressBar(name, total=orig_num_blocks)

    class BlockWorker:
        def ready(self):
            return "ok"

        def map_block_split(
            self, block: Block, input_files: List[str]
        ) -> BlockPartition:
            return _map_block_split(block, fn, input_files)

        @ray.method(num_returns=2)
        def map_block_nosplit(
            self, block: Block, input_files: List[str]
        ) -> Tuple[Block, BlockMetadata]:
            return _map_block_nosplit(block, fn, input_files)

    if not remote_args:
        remote_args["num_cpus"] = 1
    remote_args["scheduling_strategy"] = context.scheduling_strategy

    BlockWorker = ray.remote(**remote_args)(BlockWorker)

    workers = [BlockWorker.remote() for _ in range(self.min_size)]
    tasks = {w.ready.remote(): w for w in workers}
    tasks_in_flight = collections.defaultdict(int)
    metadata_mapping = {}
    block_indices = {}
    ready_workers = set()

    while len(results) < orig_num_blocks:
        ready, _ = ray.wait(
            list(tasks.keys()), timeout=0.01, num_returns=1, fetch_local=False
        )
        if not ready:
            if (
                len(workers) < self.max_size
                and len(ready_workers) / len(workers) > 0.8
            ):
                w = BlockWorker.remote()
                workers.append(w)
                tasks[w.ready.remote()] = w
                map_bar.set_description(
                    "Map Progress ({} actors {} pending)".format(
                        len(ready_workers), len(workers) - len(ready_workers)
                    )
                )
            continue

        [obj_id] = ready
        worker = tasks.pop(obj_id)

        # Process task result.
        if worker in ready_workers:
            results.append(obj_id)
            tasks_in_flight[worker] -= 1
            map_bar.update(1)
        else:
            ready_workers.add(worker)
            map_bar.set_description(
                "Map Progress ({} actors {} pending)".format(
                    len(ready_workers), len(workers) - len(ready_workers)
                )
            )

        # Schedule a new task.
        while (
            blocks_in
            and tasks_in_flight[worker] < self.max_tasks_in_flight_per_actor
        ):
            block, meta = blocks_in.pop()
            if context.block_splitting_enabled:
                ref = worker.map_block_split.remote(block, meta.input_files)
            else:
                ref, meta_ref = worker.map_block_nosplit.remote(
                    block, meta.input_files
                )
                metadata_mapping[ref] = meta_ref
            tasks[ref] = worker
            block_indices[ref] = len(blocks_in)
            tasks_in_flight[worker] += 1

    map_bar.close()
    new_blocks, new_metadata = [], []
    # Put blocks in input order.
    results.sort(key=block_indices.get)
    if context.block_splitting_enabled:
        for result in ray.get(results):
            for block, metadata in result:
                new_blocks.append(block)
                new_metadata.append(metadata)
    else:
        for block in results:
            new_blocks.append(block)
            new_metadata.append(metadata_mapping[block])
        new_metadata = ray.get(new_metadata)
    return BlockList(new_blocks, new_metadata)
def _apply(
    self,
    block_fn: BlockTransform,
    remote_args: dict,
    block_list: BlockList,
    clear_input_blocks: bool,
    name: Optional[str] = None,
    fn: Optional[UDF] = None,
    fn_args: Optional[Iterable[Any]] = None,
    fn_kwargs: Optional[Dict[str, Any]] = None,
    fn_constructor_args: Optional[Iterable[Any]] = None,
    fn_constructor_kwargs: Optional[Dict[str, Any]] = None,
) -> BlockList:
    """Note: this is not part of the Dataset public API."""
    if fn_args is None:
        fn_args = tuple()
    if fn_kwargs is None:
        fn_kwargs = {}
    if fn_constructor_args is None:
        fn_constructor_args = tuple()
    if fn_constructor_kwargs is None:
        fn_constructor_kwargs = {}

    context = DatasetContext.get_current()

    blocks_in = block_list.get_blocks_with_metadata()

    # Early release block references.
    if clear_input_blocks:
        block_list.clear()

    orig_num_blocks = len(blocks_in)
    results = []
    if name is None:
        name = "map"
    name = name.title()
    map_bar = ProgressBar(name, total=orig_num_blocks)

    class BlockWorker:
        def __init__(
            self,
            *fn_constructor_args: Any,
            **fn_constructor_kwargs: Any,
        ):
            if not isinstance(fn, CallableClass):
                if fn_constructor_args or fn_constructor_kwargs:
                    raise ValueError(
                        "fn_constructor_{kw}args only valid for CallableClass "
                        f"UDFs, but got: {fn}"
                    )
                self.fn = fn
            else:
                self.fn = fn(*fn_constructor_args, **fn_constructor_kwargs)

        def ready(self):
            return "ok"

        def map_block_split(
            self,
            block: Block,
            input_files: List[str],
            *fn_args,
            **fn_kwargs,
        ) -> BlockPartition:
            return _map_block_split(
                block, block_fn, input_files, self.fn, *fn_args, **fn_kwargs
            )

        @ray.method(num_returns=2)
        def map_block_nosplit(
            self,
            block: Block,
            input_files: List[str],
            *fn_args,
            **fn_kwargs,
        ) -> Tuple[Block, BlockMetadata]:
            return _map_block_nosplit(
                block, block_fn, input_files, self.fn, *fn_args, **fn_kwargs
            )

    if "num_cpus" not in remote_args:
        remote_args["num_cpus"] = 1

    if "scheduling_strategy" not in remote_args:
        ctx = DatasetContext.get_current()
        if ctx.scheduling_strategy == DEFAULT_SCHEDULING_STRATEGY:
            remote_args["scheduling_strategy"] = "SPREAD"
        else:
            remote_args["scheduling_strategy"] = ctx.scheduling_strategy

    BlockWorker = ray.remote(**remote_args)(BlockWorker)

    workers = [
        BlockWorker.remote(*fn_constructor_args, **fn_constructor_kwargs)
        for _ in range(self.min_size)
    ]
    tasks = {w.ready.remote(): w for w in workers}
    tasks_in_flight = collections.defaultdict(int)
    metadata_mapping = {}
    block_indices = {}
    ready_workers = set()

    try:
        while len(results) < orig_num_blocks:
            ready, _ = ray.wait(
                list(tasks.keys()), timeout=0.01, num_returns=1, fetch_local=False
            )
            if not ready:
                if (
                    len(workers) < self.max_size
                    and len(ready_workers) / len(workers)
                    > self.ready_to_total_workers_ratio
                ):
                    w = BlockWorker.remote(
                        *fn_constructor_args, **fn_constructor_kwargs
                    )
                    workers.append(w)
                    tasks[w.ready.remote()] = w
                    map_bar.set_description(
                        "Map Progress ({} actors {} pending)".format(
                            len(ready_workers), len(workers) - len(ready_workers)
                        )
                    )
                continue

            [obj_id] = ready
            worker = tasks.pop(obj_id)

            # Process task result.
            if worker in ready_workers:
                results.append(obj_id)
                tasks_in_flight[worker] -= 1
                map_bar.update(1)
            else:
                ready_workers.add(worker)
                map_bar.set_description(
                    "Map Progress ({} actors {} pending)".format(
                        len(ready_workers), len(workers) - len(ready_workers)
                    )
                )

            # Schedule a new task.
            while (
                blocks_in
                and tasks_in_flight[worker] < self.max_tasks_in_flight_per_actor
            ):
                block, meta = blocks_in.pop()
                if context.block_splitting_enabled:
                    ref = worker.map_block_split.remote(
                        block,
                        meta.input_files,
                        *fn_args,
                        **fn_kwargs,
                    )
                else:
                    ref, meta_ref = worker.map_block_nosplit.remote(
                        block,
                        meta.input_files,
                        *fn_args,
                        **fn_kwargs,
                    )
                    metadata_mapping[ref] = meta_ref
                tasks[ref] = worker
                block_indices[ref] = len(blocks_in)
                tasks_in_flight[worker] += 1

        map_bar.close()
        self.num_workers += len(workers)
        new_blocks, new_metadata = [], []
        # Put blocks in input order.
        results.sort(key=block_indices.get)
        if context.block_splitting_enabled:
            for result in ray.get(results):
                for block, metadata in result:
                    new_blocks.append(block)
                    new_metadata.append(metadata)
        else:
            for block in results:
                new_blocks.append(block)
                new_metadata.append(metadata_mapping[block])
            new_metadata = ray.get(new_metadata)
        return BlockList(new_blocks, new_metadata)
    except Exception as e:
        try:
            for worker in workers:
                ray.kill(worker)
        except Exception as err:
            logger.exception(f"Error killing workers: {err}")
        finally:
            raise e
def execute(
    self,
    input_blocks: BlockList,
    output_num_blocks: int,
    clear_input_blocks: bool,
    *,
    map_ray_remote_args: Optional[Dict[str, Any]] = None,
    reduce_ray_remote_args: Optional[Dict[str, Any]] = None,
) -> Tuple[BlockList, Dict[str, List[BlockMetadata]]]:
    input_blocks_list = input_blocks.get_blocks()
    input_num_blocks = len(input_blocks_list)

    if map_ray_remote_args is None:
        map_ray_remote_args = {}
    if reduce_ray_remote_args is None:
        reduce_ray_remote_args = {}
    if "scheduling_strategy" not in reduce_ray_remote_args:
        reduce_ray_remote_args = reduce_ray_remote_args.copy()
        reduce_ray_remote_args["scheduling_strategy"] = "SPREAD"

    shuffle_map = cached_remote_fn(self.map)
    shuffle_reduce = cached_remote_fn(self.reduce)

    map_bar = ProgressBar("Shuffle Map", total=input_num_blocks)

    shuffle_map_out = [
        shuffle_map.options(
            **map_ray_remote_args,
            num_returns=1 + output_num_blocks,
        ).remote(i, block, output_num_blocks, *self._map_args)
        for i, block in enumerate(input_blocks_list)
    ]

    # The first item returned is the BlockMetadata.
    shuffle_map_metadata = []
    for i, refs in enumerate(shuffle_map_out):
        shuffle_map_metadata.append(refs[0])
        shuffle_map_out[i] = refs[1:]

    # Eagerly delete the input block references in order to eagerly release
    # the blocks' memory.
    del input_blocks_list
    if clear_input_blocks:
        input_blocks.clear()
    shuffle_map_metadata = map_bar.fetch_until_complete(shuffle_map_metadata)
    map_bar.close()

    reduce_bar = ProgressBar("Shuffle Reduce", total=output_num_blocks)
    shuffle_reduce_out = [
        shuffle_reduce.options(
            **reduce_ray_remote_args,
            num_returns=2,
        ).remote(
            *self._reduce_args,
            *[shuffle_map_out[i][j] for i in range(input_num_blocks)],
        )
        for j in range(output_num_blocks)
    ]

    # Eagerly delete the map block references in order to eagerly release
    # the blocks' memory.
    del shuffle_map_out
    new_blocks, new_metadata = zip(*shuffle_reduce_out)
    new_metadata = reduce_bar.fetch_until_complete(list(new_metadata))
    reduce_bar.close()

    stats = {
        "map": shuffle_map_metadata,
        "reduce": new_metadata,
    }

    return BlockList(list(new_blocks), list(new_metadata)), stats
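
# Usage sketch (illustrative assumption): this simple all-to-all implementation
# backs Dataset.random_shuffle() when push-based shuffle is disabled.

import ray

ds = ray.data.range(100).random_shuffle()  # one map round, then one reduce round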
def fast_repartition(blocks, num_blocks):
    from ray.data.dataset import Dataset

    wrapped_ds = Dataset(
        ExecutionPlan(blocks, DatasetStats(stages={}, parent=None)), 0, lazy=False
    )
    # Compute the (n-1) indices needed for an equal split of the data.
    count = wrapped_ds.count()
    dataset_format = wrapped_ds._dataset_format()
    indices = []
    cur_idx = 0
    for _ in range(num_blocks - 1):
        cur_idx += count / num_blocks
        indices.append(int(cur_idx))
    assert len(indices) < num_blocks, (indices, num_blocks)
    if indices:
        splits = wrapped_ds.split_at_indices(indices)
    else:
        splits = [wrapped_ds]
    # TODO(ekl) include stats for the split tasks. We may also want to
    # consider combining the split and coalesce tasks as an optimization.

    # Coalesce each split into a single block.
    reduce_task = cached_remote_fn(_ShufflePartitionOp.reduce).options(num_returns=2)
    reduce_bar = ProgressBar("Repartition", position=0, total=len(splits))
    reduce_out = [
        reduce_task.remote(False, None, *s.get_internal_block_refs())
        for s in splits
        if s.num_blocks() > 0
    ]

    # Early-release memory.
    del splits, blocks, wrapped_ds

    new_blocks, new_metadata = zip(*reduce_out)
    new_blocks, new_metadata = list(new_blocks), list(new_metadata)
    new_metadata = reduce_bar.fetch_until_complete(new_metadata)
    reduce_bar.close()

    # Handle empty blocks.
    if len(new_blocks) < num_blocks:
        from ray.data._internal.arrow_block import ArrowBlockBuilder
        from ray.data._internal.pandas_block import PandasBlockBuilder
        from ray.data._internal.simple_block import SimpleBlockBuilder

        num_empties = num_blocks - len(new_blocks)
        if dataset_format == "arrow":
            builder = ArrowBlockBuilder()
        elif dataset_format == "pandas":
            builder = PandasBlockBuilder()
        else:
            builder = SimpleBlockBuilder()
        empty_block = builder.build()
        empty_meta = BlockAccessor.for_block(empty_block).get_metadata(
            input_files=None, exec_stats=None
        )  # No stats for empty block.
        empty_blocks, empty_metadata = zip(
            *[(ray.put(empty_block), empty_meta) for _ in range(num_empties)]
        )
        new_blocks += empty_blocks
        new_metadata += empty_metadata

    return BlockList(new_blocks, new_metadata), {}
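
# Usage sketch (illustrative assumption): fast_repartition backs
# Dataset.repartition(..., shuffle=False), which splits and coalesces blocks
# without performing a full shuffle.

import ray

ds = ray.data.range(1000).repartition(10, shuffle=False)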