def from_pandas_refs(
    dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of Ray object references to Pandas
    dataframes.

    Args:
        dfs: A Ray object reference to a pandas dataframe, or a list of
            Ray object references to pandas dataframes.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    if isinstance(dfs, ray.ObjectRef):
        dfs = [dfs]
    elif isinstance(dfs, list):
        for df in dfs:
            if not isinstance(df, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(df)}"
                )
    else:
        raise ValueError(
            "Expected Ray object ref or list of Ray object refs, "
            f"got {type(dfs)}"
        )

    context = DatasetContext.get_current()
    if context.enable_pandas_block:
        get_metadata = cached_remote_fn(_get_metadata)
        metadata = [get_metadata.remote(df) for df in dfs]
        return Dataset(
            ExecutionPlan(BlockList(dfs, ray.get(metadata)), DatasetStats.TODO()),
            0,
            False,
        )

    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)

    res = [df_to_block.remote(df) for df in dfs]
    blocks, metadata = zip(*res)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, ray.get(list(metadata))),
            DatasetStats(stages={"from_pandas_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
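# Minimal usage sketch (not part of the source; assumes a Ray runtime is available
# via ray.init(), and the DataFrame contents are made up for illustration):
# from_pandas_refs accepts either a single ObjectRef to a pandas DataFrame or a
# list of such refs.
import pandas as pd
import ray

df1 = pd.DataFrame({"a": [1, 2, 3]})
df2 = pd.DataFrame({"a": [4, 5, 6]})
ds = ray.data.from_pandas_refs([ray.put(df1), ray.put(df2)])
assert ds.count() == 6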
def _test_equal_split_balanced(block_sizes, num_splits):
    blocks = []
    metadata = []
    total_rows = 0
    for block_size in block_sizes:
        block = list(range(total_rows, total_rows + block_size))
        blocks.append(ray.put(block))
        metadata.append(BlockAccessor.for_block(block).get_metadata(None, None))
        total_rows += block_size
    block_list = BlockList(blocks, metadata)
    ds = Dataset(
        ExecutionPlan(block_list, DatasetStats.TODO()),
        0,
        False,
    )

    splits = ds.split(num_splits, equal=True)
    split_counts = [split.count() for split in splits]
    assert len(split_counts) == num_splits
    expected_block_size = total_rows // num_splits
    # Check that all splits are the expected size.
    assert all([count == expected_block_size for count in split_counts])

    expected_total_rows = sum(split_counts)
    # Check that the expected number of rows were dropped.
    assert total_rows - expected_total_rows == total_rows % num_splits
    # Check that all rows are unique (content check).
    split_rows = [row for split in splits for row in split.take(total_rows)]
    assert len(set(split_rows)) == len(split_rows)
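# Example invocation sketch of the helper above (block sizes and split count are
# arbitrary): three blocks of sizes 1, 2, and 3 split into 2 equal parts should
# yield two 3-row splits with no rows dropped.
_test_equal_split_balanced([1, 2, 3], 2)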
def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = DelegatingBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=BlockExecStats.TODO))
        i += block_size

    return Dataset(BlockList(blocks, metadata), 0, DatasetStats.TODO())
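# Usage sketch (values are arbitrary; assumes a Ray runtime is available): with the
# default parallelism of 200 and only 5 items, block_size = max(1, 5 // 200) = 1,
# so each item lands in its own block.
import ray

ds = ray.data.from_items([1, 2, 3, 4, 5])
assert ds.count() == 5
assert ds.num_blocks() == 5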
def from_arrow_refs(
    tables: Union[
        ObjectRef[Union["pyarrow.Table", bytes]],
        List[ObjectRef[Union["pyarrow.Table", bytes]]],
    ]
) -> Dataset[ArrowRow]:
    """Create a dataset from a set of Arrow tables.

    Args:
        tables: A Ray object reference to an Arrow table, or a list of Ray object
            references to Arrow tables (or to the tables' streaming format in
            bytes).

    Returns:
        Dataset holding Arrow records from the tables.
    """
    if isinstance(tables, ray.ObjectRef):
        tables = [tables]

    get_metadata = cached_remote_fn(_get_metadata)
    metadata = [get_metadata.remote(t) for t in tables]
    return Dataset(
        ExecutionPlan(
            BlockList(tables, ray.get(metadata)),
            DatasetStats(stages={"from_arrow_refs": metadata}, parent=None),
        ),
        0,
        False,
    )
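# Usage sketch (table contents are made up; assumes a Ray runtime is available):
# from_arrow_refs takes a single ObjectRef to a pyarrow.Table or a list of refs.
import pyarrow as pa
import ray

table = pa.table({"a": [1, 2, 3]})
ds = ray.data.from_arrow_refs([ray.put(table)])
assert ds.count() == 3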
def _rewrite_read_stage(
    in_blocks: LazyBlockList,
) -> Tuple[BlockList, DatasetStats, Stage]:
    """Rewrite the read stage to a OneToOne stage over read tasks as input.

    For example, suppose the plan was [Read -> MapBatches(Fn)]. These stages cannot
    be fused, since read stages are handled specially. After rewriting to
    [GetReadTasks -> MapBatches(DoRead) -> MapBatches(Fn)], we can now fuse the
    latter two MapBatches stages into a single OneToOne stage:
    [GetReadTasks -> MapBatches(DoRead -> Fn)].

    Args:
        in_blocks: Lazy block list representing the read stage.

    Returns:
        Non-lazy block list containing read tasks for not-yet-read block
        partitions, new stats for the block list, and the new one-to-one read
        stage.
    """
    # Generate the "GetReadTasks" stage blocks.
    remote_args = in_blocks._remote_args
    blocks, metadata = [], []
    for read_task in in_blocks._tasks:
        blocks.append(ray.put(read_task._read_fn))
        metadata.append(read_task.get_metadata())
    block_list = BlockList(blocks, metadata)

    def block_fn(read_fn: Callable[[], Iterator[Block]]) -> Iterator[Block]:
        for block in read_fn():
            yield block

    stage = OneToOneStage("read", block_fn, "tasks", remote_args)
    stats = DatasetStats(stages={}, parent=None)
    return block_list, stats, stage
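# Conceptual sketch of why this rewrite enables fusion (the _example_* names below
# are hypothetical stand-ins, not the real Ray internals): once the read is
# expressed as a one-to-one block_fn over stored read functions, it can be chained
# with a downstream one-to-one transform into a single pass per read task.
def _example_read_fn():
    # Stands in for a ReadTask's read function: yields the blocks of one partition.
    yield [1, 2, 3]

def _example_fn(block):
    # Stands in for a user MapBatches function.
    return [x * 2 for x in block]

def _fused(read_fn):
    # Roughly what the fused OneToOne stage executes per read task.
    for block in read_fn():
        yield _example_fn(block)

assert list(_fused(_example_read_fn)) == [[2, 4, 6]]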
def _rewrite_read_stages(self) -> None:
    """Rewrites read stages into one-to-one stages."""
    if self._stages and self._has_read_stage():
        block_list, stage = self._rewrite_read_stage()
        self._in_blocks = block_list
        self._in_stats = DatasetStats(stages={}, parent=None)
        self._stages.insert(0, stage)
def _optimize_stages(self):
    """Optimize this pipeline, fusing stages together where possible."""
    context = DatasetContext.get_current()

    if not context.optimize_fuse_stages:
        self._optimized_stages = self._stages
        return

    # This dummy dataset will be used to get a set of optimized stages.
    dummy_ds = Dataset(
        ExecutionPlan(BlockList([], []), DatasetStats(stages={}, parent=None)),
        0,
        True,
    )
    # Apply all pipeline operations to the dummy dataset.
    for stage in self._stages:
        dummy_ds = stage(dummy_ds)
    # Get the optimized stages.
    _, _, stages = dummy_ds._plan._optimize()
    # Apply these optimized stages to the datasets underlying the pipeline.
    # These optimized stages will be executed by the PipelineExecutor.
    optimized_stages = []
    for stage in stages:
        optimized_stages.append(
            lambda ds, stage=stage: Dataset(
                ds._plan.with_stage(stage), ds._epoch, True))
    self._optimized_stages = optimized_stages
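# Usage sketch of the pipeline path that exercises this optimization (assuming the
# optimize_fuse_stages context flag is enabled, as it is by default): transforms
# applied to a windowed pipeline are recorded as stages and fused when each window
# executes.
import ray

pipe = ray.data.range(8).window(blocks_per_window=2).map(lambda x: x * 2)
rows = [row for row in pipe.iter_rows()]
assert rows == [0, 2, 4, 6, 8, 10, 12, 14]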
def __init__(self, in_blocks: BlockList, stats: DatasetStats, dataset_uuid=None):
    """Create a plan with no transformation stages.

    Args:
        in_blocks: Base list of blocks.
        stats: Stats for the base blocks.
        dataset_uuid: Dataset's UUID.
    """
    self._in_blocks = in_blocks
    self._in_stats = stats
    # A computed snapshot of some prefix of stages.
    self._snapshot_blocks = None
    self._snapshot_stats = None
    # Chains of stages.
    self._stages_before_snapshot = []
    self._stages_after_snapshot = []
    # Cache of optimized stages.
    self._last_optimized_stages = None

    self._dataset_uuid = dataset_uuid or uuid.uuid4().hex
    if not stats.dataset_uuid:
        stats.dataset_uuid = self._dataset_uuid
def stats(self) -> DatasetStats:
    """Create DatasetStats for this LazyBlockList."""
    return DatasetStats(
        stages={"read": self.get_metadata(fetch_if_missing=False)},
        parent=None,
        needs_stats_actor=True,
        stats_uuid=self._stats_uuid,
    )
def _rewrite_read_stages(
    blocks: BlockList,
    stats: DatasetStats,
    stages: List[Stage],
    dataset_uuid: str,
) -> Tuple[BlockList, DatasetStats, List[Stage]]:
    """Rewrites read stages into one-to-one stages, if needed."""
    if _is_lazy(blocks) and stages:
        blocks, stats, stage = _rewrite_read_stage(blocks)
        stats.dataset_uuid = dataset_uuid
        stages.insert(0, stage)
    return blocks, stats, stages
def from_numpy(ndarrays: List[ObjectRef[np.ndarray]]) -> Dataset[ArrowRow]:
    """Create a dataset from a set of NumPy ndarrays.

    Args:
        ndarrays: A list of Ray object references to NumPy ndarrays.

    Returns:
        Dataset holding the given ndarrays.
    """
    ndarray_to_block = cached_remote_fn(_ndarray_to_block, num_returns=2)

    res = [ndarray_to_block.remote(ndarray) for ndarray in ndarrays]
    blocks, metadata = zip(*res)
    return Dataset(
        BlockList(blocks, ray.get(list(metadata))), 0, DatasetStats.TODO())
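# Usage sketch (array contents are arbitrary; assumes a Ray runtime is available):
# this version of from_numpy expects a list of object refs to ndarrays, producing
# one block per ndarray.
import numpy as np
import ray

ds = ray.data.from_numpy([ray.put(np.arange(4)), ray.put(np.arange(4, 8))])
assert ds.num_blocks() == 2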
def __init__(self, in_blocks: BlockList, stats: DatasetStats, dataset_uuid=None):
    """Create a plan with no transformation stages.

    Args:
        in_blocks: Base list of blocks.
        stats: Stats for the base blocks.
        dataset_uuid: Dataset's UUID.
    """
    self._in_blocks = in_blocks
    self._out_blocks = None
    self._in_stats = stats
    self._out_stats = None
    self._stages = []
    self._dataset_uuid = dataset_uuid or uuid.uuid4().hex
    if not stats.dataset_uuid:
        stats.dataset_uuid = self._dataset_uuid
def _optimize_stages(self):
    """Optimize this pipeline, fusing stages together where possible."""
    context = DatasetContext.get_current()

    if not context.optimize_fuse_stages:
        self._optimized_stages = self._stages
        return

    dummy_ds = Dataset(
        ExecutionPlan(BlockList([], []), DatasetStats(stages={}, parent=None)),
        0,
        True,
    )
    for stage in self._stages:
        dummy_ds = stage(dummy_ds)
    dummy_ds._plan._optimize()
    optimized_stages = []
    for stage in dummy_ds._plan._stages:
        optimized_stages.append(
            lambda ds, stage=stage: Dataset(
                ds._plan.with_stage(stage), ds._epoch, True))
    self._optimized_stages = optimized_stages
def from_numpy_refs(
    ndarrays: Union[ObjectRef[np.ndarray], List[ObjectRef[np.ndarray]]],
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of NumPy ndarray futures.

    Args:
        ndarrays: A Ray object reference to a NumPy ndarray or a list of Ray object
            references to NumPy ndarrays.

    Returns:
        Dataset holding the given ndarrays.
    """
    if isinstance(ndarrays, ray.ObjectRef):
        ndarrays = [ndarrays]
    elif isinstance(ndarrays, list):
        for ndarray in ndarrays:
            if not isinstance(ndarray, ray.ObjectRef):
                raise ValueError(
                    "Expected list of Ray object refs, "
                    f"got list containing {type(ndarray)}"
                )
    else:
        raise ValueError(
            "Expected Ray object ref or list of Ray object refs, "
            f"got {type(ndarrays)}"
        )

    ndarray_to_block = cached_remote_fn(_ndarray_to_block, num_returns=2)

    res = [ndarray_to_block.remote(ndarray) for ndarray in ndarrays]
    blocks, metadata = zip(*res)
    return Dataset(
        ExecutionPlan(
            BlockList(blocks, ray.get(list(metadata))),
            DatasetStats(stages={"from_numpy": metadata}, parent=None),
        ),
        0,
        False,
    )
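# Usage sketch (array contents are arbitrary; assumes the function is exposed as
# ray.data.from_numpy_refs and a Ray runtime is available): a single object ref is
# also accepted and wrapped into a one-element list internally.
import numpy as np
import ray

ds = ray.data.from_numpy_refs(ray.put(np.arange(10)))
assert ds.num_blocks() == 1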
def from_pandas_refs(
    dfs: Union[ObjectRef["pandas.DataFrame"], List[ObjectRef["pandas.DataFrame"]]]
) -> Dataset[ArrowRow]:
    """Create a dataset from a list of Ray object references to Pandas
    dataframes.

    Args:
        dfs: A Ray object reference to a pandas dataframe, or a list of
            Ray object references to pandas dataframes.

    Returns:
        Dataset holding Arrow records read from the dataframes.
    """
    if isinstance(dfs, ray.ObjectRef):
        dfs = [dfs]

    df_to_block = cached_remote_fn(_df_to_block, num_returns=2)

    res = [df_to_block.remote(df) for df in dfs]
    blocks, metadata = zip(*res)
    return Dataset(
        BlockList(blocks, ray.get(list(metadata))), 0, DatasetStats.TODO())
def fast_repartition(blocks, num_blocks):
    from ray.data.dataset import Dataset

    wrapped_ds = Dataset(
        ExecutionPlan(blocks, DatasetStats(stages={}, parent=None)), 0, lazy=False
    )
    # Compute the (n-1) indices needed for an equal split of the data.
    count = wrapped_ds.count()
    dataset_format = wrapped_ds._dataset_format()
    indices = []
    cur_idx = 0
    for _ in range(num_blocks - 1):
        cur_idx += count / num_blocks
        indices.append(int(cur_idx))
    assert len(indices) < num_blocks, (indices, num_blocks)
    if indices:
        splits = wrapped_ds.split_at_indices(indices)
    else:
        splits = [wrapped_ds]
    # TODO(ekl) include stats for the split tasks. We may also want to
    # consider combining the split and coalesce tasks as an optimization.

    # Coalesce each split into a single block.
    reduce_task = cached_remote_fn(_ShufflePartitionOp.reduce).options(num_returns=2)
    reduce_bar = ProgressBar("Repartition", position=0, total=len(splits))
    reduce_out = [
        reduce_task.remote(False, None, *s.get_internal_block_refs())
        for s in splits
        if s.num_blocks() > 0
    ]
    # Early-release memory.
    del splits, blocks, wrapped_ds

    new_blocks, new_metadata = zip(*reduce_out)
    new_blocks, new_metadata = list(new_blocks), list(new_metadata)
    new_metadata = reduce_bar.fetch_until_complete(new_metadata)
    reduce_bar.close()

    # Handle empty blocks.
    if len(new_blocks) < num_blocks:
        from ray.data.impl.arrow_block import ArrowBlockBuilder
        from ray.data.impl.pandas_block import PandasBlockBuilder
        from ray.data.impl.simple_block import SimpleBlockBuilder

        num_empties = num_blocks - len(new_blocks)
        if dataset_format == "arrow":
            builder = ArrowBlockBuilder()
        elif dataset_format == "pandas":
            builder = PandasBlockBuilder()
        else:
            builder = SimpleBlockBuilder()
        empty_block = builder.build()
        empty_meta = BlockAccessor.for_block(empty_block).get_metadata(
            input_files=None, exec_stats=None
        )  # No stats for empty block.
        empty_blocks, empty_metadata = zip(
            *[(ray.put(empty_block), empty_meta) for _ in range(num_empties)]
        )
        new_blocks += empty_blocks
        new_metadata += empty_metadata

    return BlockList(new_blocks, new_metadata), {}
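# Usage sketch of the public path that exercises this helper (assuming
# fast_repartition backs Dataset.repartition(..., shuffle=False)): repartitioning
# 10 items into 16 blocks pads the result with empty blocks of the right format.
import ray

ds = ray.data.from_items(list(range(10))).repartition(16, shuffle=False)
assert ds.num_blocks() == 16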
def read_datasource(
    datasource: Datasource[T],
    *,
    parallelism: int = 200,
    ray_remote_args: Dict[str, Any] = None,
    _spread_resource_prefix: Optional[str] = None,
    **read_args,
) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the available partitioning of the datasource.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """
    # TODO(ekl) remove this feature flag.
    if "RAY_DATASET_FORCE_LOCAL_METADATA" in os.environ:
        read_tasks = datasource.prepare_read(parallelism, **read_args)
    else:
        # Prepare read in a remote task so that in Ray client mode, we aren't
        # attempting metadata resolution from the client machine.
        ctx = DatasetContext.get_current()
        prepare_read = cached_remote_fn(
            _prepare_read, retry_exceptions=False, num_cpus=0)
        read_tasks = ray.get(
            prepare_read.remote(datasource, ctx, parallelism,
                                _wrap_s3_filesystem_workaround(read_args)))

    context = DatasetContext.get_current()
    stats_actor = get_or_create_stats_actor()
    stats_uuid = uuid.uuid4()
    stats_actor.record_start.remote(stats_uuid)

    def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
        DatasetContext._set_current(context)
        stats = BlockExecStats.builder()

        # Execute the read task.
        block = task()

        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = stats.build()
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files,
                exec_stats=stats.build())
        stats_actor.record_task.remote(stats_uuid, i, metadata)
        return block

    if ray_remote_args is None:
        ray_remote_args = {}
    # Increase the read parallelism by default to maximize IO throughput. This
    # is particularly important when reading from e.g., remote storage.
    if "num_cpus" not in ray_remote_args:
        # Note that the too many workers warning triggers at 4x subscription,
        # so we go at 0.5 to avoid the warning message.
        ray_remote_args["num_cpus"] = 0.5
    remote_read = cached_remote_fn(remote_read)

    if _spread_resource_prefix is not None:
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, ray_remote_args)
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        resource_iter = itertools.repeat({})

    calls: List[Callable[[], ObjectRef[MaybeBlockPartition]]] = []
    metadata: List[BlockPartitionMetadata] = []

    for i, task in enumerate(read_tasks):
        calls.append(
            lambda i=i, task=task, resources=next(resource_iter): remote_read.
            options(**ray_remote_args, resources=resources).remote(i, task))
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        block_list.ensure_schema_for_first_block()

    return Dataset(
        block_list,
        0,
        DatasetStats(
            stages={"read": metadata},
            parent=None,
            stats_actor=stats_actor,
            stats_uuid=stats_uuid,
        ),
    )
def read_datasource(
    datasource: Datasource[T],
    *,
    parallelism: int = 200,
    ray_remote_args: Dict[str, Any] = None,
    _spread_resource_prefix: Optional[str] = None,
    **read_args,
) -> Dataset[T]:
    """Read a dataset from a custom data source.

    Args:
        datasource: The datasource to read data from.
        parallelism: The requested parallelism of the read. Parallelism may be
            limited by the available partitioning of the datasource.
        read_args: Additional kwargs to pass to the datasource impl.
        ray_remote_args: kwargs passed to ray.remote in the read tasks.

    Returns:
        Dataset holding the data read from the datasource.
    """
    # TODO(ekl) remove this feature flag.
    force_local = "RAY_DATASET_FORCE_LOCAL_METADATA" in os.environ
    pa_ds = _lazy_import_pyarrow_dataset()
    if pa_ds:
        partitioning = read_args.get("dataset_kwargs", {}).get("partitioning", None)
        if isinstance(partitioning, pa_ds.Partitioning):
            logger.info(
                "Forcing local metadata resolution since the provided partitioning "
                f"{partitioning} is not serializable."
            )
            force_local = True

    if force_local:
        read_tasks = datasource.prepare_read(parallelism, **read_args)
    else:
        # Prepare read in a remote task so that in Ray client mode, we aren't
        # attempting metadata resolution from the client machine.
        ctx = DatasetContext.get_current()
        prepare_read = cached_remote_fn(
            _prepare_read, retry_exceptions=False, num_cpus=0
        )
        read_tasks = ray.get(
            prepare_read.remote(
                datasource,
                ctx,
                parallelism,
                _wrap_arrow_serialization_workaround(read_args),
            )
        )

    context = DatasetContext.get_current()
    stats_actor = get_or_create_stats_actor()
    stats_uuid = uuid.uuid4()
    stats_actor.record_start.remote(stats_uuid)

    def remote_read(i: int, task: ReadTask, stats_actor) -> MaybeBlockPartition:
        DatasetContext._set_current(context)
        stats = BlockExecStats.builder()

        # Execute the read task.
        block = task()

        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = stats.build()
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files, exec_stats=stats.build()
            )
        stats_actor.record_task.remote(stats_uuid, i, metadata)
        return block

    if ray_remote_args is None:
        ray_remote_args = {}

    if "scheduling_strategy" not in ray_remote_args:
        ray_remote_args["scheduling_strategy"] = "SPREAD"

    remote_read = cached_remote_fn(remote_read)

    if _spread_resource_prefix is not None:
        if context.optimize_fuse_stages:
            logger.warning(
                "_spread_resource_prefix has no effect when optimize_fuse_stages "
                "is enabled. Tasks are spread by default."
            )
        # Use given spread resource prefix for round-robin resource-based
        # scheduling.
        nodes = ray.nodes()
        resource_iter = _get_spread_resources_iter(
            nodes, _spread_resource_prefix, ray_remote_args
        )
    else:
        # If no spread resource prefix given, yield an empty dictionary.
        resource_iter = itertools.repeat({})

    calls: List[Callable[[], ObjectRef[MaybeBlockPartition]]] = []
    metadata: List[BlockPartitionMetadata] = []

    for i, task in enumerate(read_tasks):
        calls.append(
            lambda i=i, task=task, resources=next(resource_iter): remote_read.options(
                **ray_remote_args, resources=resources
            ).remote(i, task, stats_actor)
        )
        metadata.append(task.get_metadata())

    block_list = LazyBlockList(calls, metadata)
    # TODO(ekl) consider refactoring LazyBlockList to take read_tasks explicitly.
    block_list._read_tasks = read_tasks
    block_list._read_remote_args = ray_remote_args

    # Get the schema from the first block synchronously.
    if metadata and metadata[0].schema is None:
        block_list.ensure_schema_for_first_block()

    stats = DatasetStats(
        stages={"read": metadata},
        parent=None,
        stats_actor=stats_actor,
        stats_uuid=stats_uuid,
    )
    return Dataset(
        ExecutionPlan(block_list, stats),
        0,
        False,
    )
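# Usage sketch: the public read_* helpers (e.g. read_csv) route through
# read_datasource with a file-based datasource. The path and file contents below
# are made up for the example; a tiny CSV is written first so the read has
# something to load.
import pandas as pd
import ray

pd.DataFrame({"a": [1, 2, 3]}).to_csv("/tmp/example.csv", index=False)
ds = ray.data.read_csv("/tmp/example.csv", parallelism=1)
assert ds.count() == 3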