def _shuffle_map(
    block: Block,
    idx: int,
    output_num_blocks: int,
    random_shuffle: bool,
    random_seed: Optional[int],
) -> List[Union[BlockMetadata, Block]]:
    """Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks]."""
    stats = BlockExecStats.builder()
    block = BlockAccessor.for_block(block)

    # Randomize the distribution of records to blocks.
    if random_shuffle:
        seed_i = random_seed + idx if random_seed is not None else None
        block = block.random_shuffle(seed_i)
        block = BlockAccessor.for_block(block)

    slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
    slices = []
    for i in range(output_num_blocks):
        slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

    # Randomize the distribution order of the blocks (this matters when
    # some blocks are larger than others).
    if random_shuffle:
        random = np.random.RandomState(seed_i)
        random.shuffle(slices)

    num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
    assert num_rows == block.num_rows(), (num_rows, block.num_rows())
    metadata = block.get_metadata(input_files=None, exec_stats=stats.build())
    return [metadata] + slices
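# A minimal sketch (stdlib only, not part of the Ray source) of the slicing
# arithmetic used by _shuffle_map above: with 10 rows and 3 output blocks,
# slice_sz = ceil(10 / 3) = 4, so the slices cover rows [0:4], [4:8], [8:10].
# The last slice end may run past num_rows; block.slice is assumed to clamp it,
# which the min() below makes explicit.
import math

num_rows, output_num_blocks = 10, 3
slice_sz = max(1, math.ceil(num_rows / output_num_blocks))
bounds = [
    (i * slice_sz, min((i + 1) * slice_sz, num_rows))
    for i in range(output_num_blocks)
]
assert bounds == [(0, 4), (4, 8), (8, 10)]
assert sum(hi - lo for lo, hi in bounds) == num_rows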
def _split_single_block(
    block_id: int,
    block: Block,
    meta: BlockMetadata,
    block_row: int,
    split_indices: List[int],
) -> Tuple[int, List[Tuple[ObjectRef[Block], BlockMetadata]]]:
    """Split the provided block at the given indices."""
    split_result = []
    block_accessor = BlockAccessor.for_block(block)
    prev_index = 0
    # Append the total row count as a final entry so we don't need to handle
    # the empty edge case.
    split_indices.append(block_row)
    for index in split_indices:
        logger.debug(f"slicing block {prev_index}:{index}")
        stats = BlockExecStats.builder()
        split_block = block_accessor.slice(prev_index, index, copy=True)
        accessor = BlockAccessor.for_block(split_block)
        split_meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=meta.schema,
            input_files=meta.input_files,
            exec_stats=stats.build(),
        )
        split_result.append((ray.put(split_block), split_meta))
        prev_index = index
    return (block_id, split_result)
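# A minimal sketch (not from the Ray source) of the index bookkeeping in
# _split_single_block above: appending the total row count to split_indices
# turns N split points into N + 1 contiguous [prev_index, index) ranges.
# The helper name _split_ranges is hypothetical, for illustration only.
def _split_ranges(block_row: int, split_indices: list) -> list:
    # block_row is the number of rows in the block; split_indices are the
    # row offsets at which to cut it.
    ranges = []
    prev_index = 0
    for index in split_indices + [block_row]:
        ranges.append((prev_index, index))
        prev_index = index
    return ranges

assert _split_ranges(10, [3, 7]) == [(0, 3), (3, 7), (7, 10)]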
def _shuffle_map(block: Block, idx: int, output_num_blocks: int,
                 random_shuffle: bool,
                 random_seed: Optional[int]) -> List[Block]:
    block = BlockAccessor.for_block(block)

    # Randomize the distribution of records to blocks.
    if random_shuffle:
        seed_i = random_seed + idx if random_seed is not None else None
        block = block.random_shuffle(seed_i)
        block = BlockAccessor.for_block(block)

    slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
    slices = []
    for i in range(output_num_blocks):
        slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

    # Randomize the distribution order of the blocks (this matters when
    # some blocks are larger than others).
    if random_shuffle:
        random = np.random.RandomState(seed_i)
        random.shuffle(slices)

    num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
    assert num_rows == block.num_rows(), (num_rows, block.num_rows())

    # Needed to handle num_returns=1 edge case in Ray API.
    if len(slices) == 1:
        return slices[0]
    else:
        return slices
def _sort_block(block, boundaries, key, descending):
    stats = BlockExecStats.builder()
    out = BlockAccessor.for_block(block).sort_and_partition(
        boundaries, key, descending)
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return out + [meta]
def aggregate_combined_blocks(
    blocks: List["pandas.DataFrame"], key: KeyFn, aggs: Tuple[AggregateFn]
) -> Tuple["pandas.DataFrame", BlockMetadata]:
    # TODO (kfstorm): A workaround to pass tests. Not efficient.
    block, metadata = ArrowBlockAccessor.aggregate_combined_blocks(
        [BlockAccessor.for_block(block).to_arrow() for block in blocks], key, aggs
    )
    return BlockAccessor.for_block(block).to_pandas(), metadata
def sort_and_partition(self, boundaries: List[T], key: "SortKeyT",
                       descending: bool) -> List["pandas.DataFrame"]:
    # TODO (kfstorm): A workaround to pass tests. Not efficient.
    delegated_result = BlockAccessor.for_block(
        self.to_arrow()).sort_and_partition(boundaries, key, descending)
    return [
        BlockAccessor.for_block(_).to_pandas() for _ in delegated_result
    ]
def merge_sorted_blocks(
        blocks: List["pandas.DataFrame"], key: "SortKeyT",
        _descending: bool) -> Tuple["pandas.DataFrame", BlockMetadata]:
    # TODO (kfstorm): A workaround to pass tests. Not efficient.
    block, metadata = ArrowBlockAccessor.merge_sorted_blocks(
        [BlockAccessor.for_block(block).to_arrow() for block in blocks],
        key,
        _descending,
    )
    return BlockAccessor.for_block(block).to_pandas(), metadata
def _partition_and_combine_block(block: Block[T], boundaries: List[KeyType],
                                 key: GroupKeyT,
                                 aggs: Tuple[AggregateFn]) -> List[Block]:
    """Partition the block and combine rows with the same key."""
    if key is None:
        partitions = [block]
    else:
        partitions = BlockAccessor.for_block(block).sort_and_partition(
            boundaries,
            [(key, "ascending")] if isinstance(key, str) else key,
            descending=False)
    return [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions]
def _format_batch(batch: Block, batch_format: str) -> BatchType:
    if batch_format == "native":
        batch = BlockAccessor.for_block(batch).to_native()
    elif batch_format == "pandas":
        batch = BlockAccessor.for_block(batch).to_pandas()
    elif batch_format == "pyarrow":
        batch = BlockAccessor.for_block(batch).to_arrow()
    elif batch_format == "numpy":
        batch = BlockAccessor.for_block(batch).to_numpy()
    else:
        raise ValueError(
            f"The given batch format: {batch_format} "
            f"is invalid. Supported batch type: {BatchType}")
    return batch
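# Hedged usage sketch for _format_batch above (assumes the BlockAccessor
# pandas conversion behaves as in Ray Data): an Arrow block is converted to
# the caller-requested batch format, and an unknown name raises ValueError.
import pyarrow as pa

block = pa.Table.from_pydict({"value": [1, 2, 3]})
df = _format_batch(block, "pandas")   # pandas.DataFrame with a "value" column
try:
    _format_batch(block, "polars")    # unsupported format name
except ValueError:
    pass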
def map(
    idx: int,
    block: Block,
    output_num_blocks: int,
    boundaries: List[T],
    key: SortKeyT,
    descending: bool,
) -> List[Union[BlockMetadata, Block]]:
    stats = BlockExecStats.builder()
    out = BlockAccessor.for_block(block).sort_and_partition(
        boundaries, key, descending)
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return [meta] + out
def _partition_and_combine_block(
        block: Block[T], boundaries: List[KeyType], key: KeyFn,
        aggs: Tuple[AggregateFn]) -> List[Union[Block, BlockMetadata]]:
    """Partition the block and combine rows with the same key."""
    stats = BlockExecStats.builder()
    if key is None:
        partitions = [block]
    else:
        partitions = BlockAccessor.for_block(block).sort_and_partition(
            boundaries,
            [(key, "ascending")] if isinstance(key, str) else key,
            descending=False)
    parts = [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions]
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return parts + [meta]
def __call__(self) -> MaybeBlockPartition:
    context = DatasetContext.get_current()
    result = self._read_fn()
    if not hasattr(result, "__iter__"):
        # Surface the deprecation instead of constructing an unused exception.
        warnings.warn(
            "Read function must return Iterable[Block], got {}. "
            "Probably you need to return `[block]` instead of "
            "`block`.".format(result),
            DeprecationWarning,
        )

    if context.block_splitting_enabled:
        partition: BlockPartition = []
        for block in result:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=self._metadata.input_files,
                exec_stats=None)  # No exec stats for the block splits.
            assert context.block_owner
            partition.append(
                (ray.put(block, _owner=context.block_owner), metadata))
        if len(partition) == 0:
            raise ValueError("Read task must return non-empty list.")
        return partition
    else:
        builder = DelegatingBlockBuilder()
        for block in result:
            builder.add_block(block)
        return builder.build()
def has_batch(self) -> bool:
    """Whether this Batcher has any full batches."""
    return self._buffer and (
        self._batch_size is None
        or sum(BlockAccessor.for_block(b).num_rows() for b in self._buffer)
        >= self._batch_size
    )
def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = DelegatingArrowBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))
def reduce(
    key: KeyFn, aggs: Tuple[AggregateFn], *mapper_outputs: List[Block]
) -> (Block, BlockMetadata):
    """Aggregate sorted and partially combined blocks."""
    return BlockAccessor.for_block(mapper_outputs[0]).aggregate_combined_blocks(
        list(mapper_outputs), key, aggs
    )
def _map_block_split(
    block: Block,
    block_fn: BlockTransform,
    input_files: List[str],
    fn: Optional[UDF],
    *fn_args,
    **fn_kwargs,
) -> BlockPartition:
    output = []
    stats = BlockExecStats.builder()
    if fn is not None:
        fn_args = (fn,) + fn_args
    for new_block in block_fn(block, *fn_args, **fn_kwargs):
        accessor = BlockAccessor.for_block(new_block)
        new_meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=accessor.schema(),
            input_files=input_files,
            exec_stats=stats.build(),
        )
        owner = DatasetContext.get_current().block_owner
        output.append((ray.put(new_block, _owner=owner), new_meta))
        stats = BlockExecStats.builder()
    return output
def _aggregate_combined_blocks(
    num_reducers: int, key: KeyFn, aggs: Tuple[AggregateFn], *blocks: Tuple[Block, ...]
) -> Tuple[Block[U], BlockMetadata]:
    """Aggregate sorted and partially combined blocks."""
    return BlockAccessor.for_block(blocks[0]).aggregate_combined_blocks(
        list(blocks), key, aggs
    )
def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    stats = BlockExecStats.builder()
    block = BlockAccessor.batch_to_block(ndarray)
    metadata = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()
    )
    return block, metadata
def _test_equal_split_balanced(block_sizes, num_splits):
    blocks = []
    metadata = []
    total_rows = 0
    for block_size in block_sizes:
        block = list(range(total_rows, total_rows + block_size))
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(None, None))
        total_rows += block_size
    block_list = BlockList(blocks, metadata)
    ds = Dataset(
        ExecutionPlan(block_list, DatasetStats.TODO()),
        0,
        False,
    )

    splits = ds.split(num_splits, equal=True)
    split_counts = [split.count() for split in splits]
    assert len(split_counts) == num_splits
    expected_block_size = total_rows // num_splits
    # Check that all splits are the expected size.
    assert all([count == expected_block_size for count in split_counts])
    expected_total_rows = sum(split_counts)
    # Check that the expected number of rows were dropped.
    assert total_rows - expected_total_rows == total_rows % num_splits
    # Check that all rows are unique (content check).
    split_rows = [row for split in splits for row in split.take(total_rows)]
    assert len(set(split_rows)) == len(split_rows)
def build(self) -> Block:
    if self._builder is None:
        if self._empty_block is not None:
            self._builder = BlockAccessor.for_block(self._empty_block).builder()
        else:
            self._builder = ArrowBlockBuilder()
    return self._builder.build()
def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    stats = BlockExecStats.builder()
    import pyarrow as pa
    from ray.data.extensions import TensorArray

    table = pa.Table.from_pydict({"value": TensorArray(ndarray)})
    return (table,
            BlockAccessor.for_block(table).get_metadata(
                input_files=None, exec_stats=stats.build()))
def write_block(write_path: str, block: Block):
    logger.debug(f"Writing {write_path} file.")
    fs = filesystem
    if isinstance(fs, _S3FileSystemWrapper):
        fs = fs.unwrap()
    with fs.open_output_stream(write_path) as f:
        _write_block_to_file(f, BlockAccessor.for_block(block))
def test_sort_arrow_with_empty_blocks(ray_start_regular, use_push_based_shuffle):
    ctx = ray.data.context.DatasetContext.get_current()

    try:
        original = ctx.use_push_based_shuffle
        ctx.use_push_based_shuffle = use_push_based_shuffle

        assert (
            BlockAccessor.for_block(pa.Table.from_pydict({})).sample(10, "A").num_rows
            == 0
        )

        partitions = BlockAccessor.for_block(
            pa.Table.from_pydict({})
        ).sort_and_partition([1, 5, 10], "A", descending=False)
        assert len(partitions) == 4
        for partition in partitions:
            assert partition.num_rows == 0

        assert (
            BlockAccessor.for_block(pa.Table.from_pydict({}))
            .merge_sorted_blocks([pa.Table.from_pydict({})], "A", False)[0]
            .num_rows
            == 0
        )

        ds = ray.data.from_items(
            [{"A": (x % 3), "B": x} for x in range(3)], parallelism=3
        )
        ds = ds.filter(lambda r: r["A"] == 0)
        assert [row.as_pydict() for row in ds.sort("A").iter_rows()] == [
            {"A": 0, "B": 0}
        ]

        # Test empty dataset.
        ds = ray.data.range_table(10).filter(lambda r: r["value"] > 10)
        assert (
            len(
                ray.data._internal.sort.sample_boundaries(
                    ds._plan.execute().get_blocks(), "value", 3
                )
            )
            == 2
        )
        assert ds.sort("value").count() == 0
    finally:
        ctx.use_push_based_shuffle = original
def _aggregate_combined_blocks(
        num_reducers: int, key: GroupKeyT, aggs: Tuple[AggregateFn],
        *blocks: Tuple[Block, ...]) -> Tuple[Block[U], BlockMetadata]:
    """Aggregate sorted and partially combined blocks."""
    if num_reducers == 1:
        blocks = [b[0] for b in blocks]  # Ray weirdness
    return BlockAccessor.for_block(blocks[0]).aggregate_combined_blocks(
        list(blocks), key, aggs)
def _map_block_nosplit(block: Block, fn: Any,
                       input_files: List[str]) -> Tuple[Block, BlockMetadata]:
    builder = DelegatingArrowBlockBuilder()
    for new_block in fn(block):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(input_files=input_files)
def __init__(self):
    super().__init__(
        init=lambda k: 0,
        accumulate_block=(lambda a, block: a + BlockAccessor.for_block(
            block).num_rows()),
        merge=lambda a1, a2: a1 + a2,
        name="count()",
    )
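# Hedged usage sketch (assumes Ray Data's public aggregate API of the same
# era as the snippet above): Count only needs each block's row count, which
# is why accumulate_block goes through BlockAccessor.num_rows().
import ray
from ray.data.aggregate import Count

ds = ray.data.range(100)
result = ds.aggregate(Count())  # expected to report a total of 100 rows
print(result)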
def next_batch(self) -> Block:
    """Get the next shuffled batch from the shuffle buffer.

    Returns:
        A batch represented as a Block.
    """
    assert self.has_batch() or (self._done_adding and self.has_any())
    # Add rows in the builder to the shuffle buffer.
    if self._builder.num_rows() > 0:
        if self._shuffle_buffer is not None:
            if self._batch_head > 0:
                # Compact the materialized shuffle buffer.
                # TODO(Clark): If alternating between adding blocks and fetching
                # shuffled batches, this aggressive compaction could be inefficient.
                self._shuffle_buffer = BlockAccessor.for_block(
                    self._shuffle_buffer).take(
                        self._shuffle_indices[self._batch_head:])
            # Add the unyielded rows from the existing shuffle buffer.
            self._builder.add_block(self._shuffle_buffer)
        # Build the new shuffle buffer.
        self._shuffle_buffer = self._builder.build()
        # Reset the builder.
        self._builder = DelegatingBlockBuilder()
        # Invalidate the shuffle indices.
        self._shuffle_indices = None
        self._batch_head = 0

    assert self._shuffle_buffer is not None
    buffer_size = BlockAccessor.for_block(self._shuffle_buffer).num_rows()
    # Truncate the batch to the buffer size, if necessary.
    batch_size = min(self._batch_size, buffer_size)
    if self._shuffle_indices is None:
        # Need to generate new shuffle indices.
        self._shuffle_indices = list(range(buffer_size))
        random.shuffle(self._shuffle_indices)

    # Get the shuffle indices for this batch.
    batch_indices = self._shuffle_indices[
        self._batch_head:self._batch_head + batch_size]
    self._batch_head += batch_size
    # Yield the shuffled batch.
    return BlockAccessor.for_block(self._shuffle_buffer).take(batch_indices)
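# A minimal sketch (stdlib only, not from the Ray source) of how next_batch
# above consumes the shuffle buffer: indices into the buffer are shuffled
# once, then handed out in contiguous chunks tracked by a moving batch head.
import random

buffer_size, batch_size = 10, 4
shuffle_indices = list(range(buffer_size))
random.shuffle(shuffle_indices)

batch_head = 0
batches = []
while batch_head < buffer_size:
    batch = shuffle_indices[batch_head:batch_head + batch_size]
    batch_head += batch_size
    batches.append(batch)

# Every buffered row is yielded exactly once, in shuffled order.
assert sorted(i for b in batches for i in b) == list(range(buffer_size))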
def process_block(self, block: Block,
                  meta: BlockMetadata) -> (Block, BlockMetadata):
    new_block = fn(block)
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=meta.input_files)
    return new_block, new_metadata
def add(self, block: Block):
    """Add a block to the block buffer.

    Args:
        block: Block to add to the block buffer.
    """
    assert self.can_add(block)
    self._buffer.append(block)
    self._buffer_size += BlockAccessor.for_block(block).num_rows()
def reduce(
    key: SortKeyT,
    descending: bool,
    *mapper_outputs: List[Block],
    partial_reduce: bool = False,
) -> (Block, BlockMetadata):
    return BlockAccessor.for_block(mapper_outputs[0]).merge_sorted_blocks(
        mapper_outputs, key, descending
    )