def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = DelegatingArrowBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))
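# A minimal, Ray-free sketch of the chunking arithmetic above. The helper
# name `split_into_blocks` is hypothetical; it only illustrates how
# max(1, len(items) // parallelism) caps the number of blocks while never
# producing an empty block for short inputs.
from typing import Any, List


def split_into_blocks(items: List[Any], parallelism: int = 200) -> List[List[Any]]:
    # Mirror the block-size computation used by from_items().
    block_size = max(1, len(items) // parallelism)
    return [items[i:i + block_size] for i in range(0, len(items), block_size)]


# With 5 items and parallelism=200, block_size clamps to 1, so each item
# becomes its own block (5 blocks total).
assert split_into_blocks([1, 2, 3, 4, 5]) == [[1], [2], [3], [4], [5]]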
def __call__(self) -> MaybeBlockPartition:
    context = DatasetContext.get_current()
    result = self._read_fn()
    if not hasattr(result, "__iter__"):
        # Raise instead of constructing the warning and discarding it.
        raise DeprecationWarning(
            "Read function must return Iterable[Block], got {}. "
            "Probably you need to return `[block]` instead of "
            "`block`.".format(result))

    if context.block_splitting_enabled:
        partition: BlockPartition = []
        for block in result:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=self._metadata.input_files)
            assert context.block_owner
            partition.append(
                (ray.put(block, _owner=context.block_owner), metadata))
        if len(partition) == 0:
            raise ValueError("Read task must return non-empty list.")
        return partition
    else:
        builder = DelegatingArrowBlockBuilder()
        for block in result:
            builder.add_block(block)
        return builder.build()
def _map_block_nosplit(block: Block, fn: Any,
                       input_files: List[str]) -> Tuple[Block, BlockMetadata]:
    builder = DelegatingArrowBlockBuilder()
    for new_block in fn(block):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(input_files=input_files)
def _shuffle_reduce(
        *mapper_outputs: List[Block]) -> Tuple[Block, BlockMetadata]:
    builder = DelegatingArrowBlockBuilder()
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(
        num_rows=accessor.num_rows(),
        size_bytes=accessor.size_bytes(),
        schema=accessor.schema(),
        input_files=None)
    return new_block, new_metadata
def sample_boundaries(blocks: List[ObjectRef[Block]], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many
    elements.
    """
    # TODO(Clark): Support multiple boundary sampling keys.
    if isinstance(key, list) and len(key) > 1:
        raise ValueError("Multiple boundary sampling keys not supported.")

    n_samples = int(num_reducers * 10 / len(blocks))
    sample_block = cached_remote_fn(_sample_block)
    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    samples = [s for s in samples if len(s) > 0]
    # The dataset is empty.
    if len(samples) == 0:
        return [None] * (num_reducers - 1)
    builder = DelegatingArrowBlockBuilder()
    for sample in samples:
        builder.add_block(sample)
    samples = builder.build()
    column = key[0][0] if isinstance(key, list) else None
    sample_items = BlockAccessor.for_block(samples).to_numpy(column)
    sample_items = np.sort(sample_items)
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.linspace(0, 1, num_reducers)
    ]
    return ret[1:]
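# A minimal sketch of the boundary computation on a plain NumPy array, with
# the Ray sampling and progress-bar machinery stripped out. The helper name
# `approx_boundaries` is hypothetical; it only illustrates how num_reducers
# evenly spaced quantiles, minus the 0th (the minimum), yield
# (num_reducers - 1) boundaries.
import numpy as np


def approx_boundaries(samples: np.ndarray, num_reducers: int) -> np.ndarray:
    qs = np.linspace(0, 1, num_reducers)
    # The snippet above picks the nearest sample value
    # (interpolation="nearest"); the default linear interpolation is used
    # here to keep the sketch portable across NumPy versions.
    boundaries = np.quantile(samples, qs)
    return boundaries[1:]


# 1000 uniform samples split for 4 reducers -> 3 boundaries near the
# 25th, 50th and 75th percentiles.
rng = np.random.default_rng(0)
print(approx_boundaries(rng.uniform(0, 1, 1000), num_reducers=4))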
def read_files(read_paths: List[str],
               fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper]):
    logger.debug(f"Reading {len(read_paths)} files.")
    if isinstance(fs, _S3FileSystemWrapper):
        fs = fs.unwrap()
    builder = DelegatingArrowBlockBuilder()
    for read_path in read_paths:
        with fs.open_input_stream(read_path) as f:
            data = read_file(f, read_path, **reader_args)
            if isinstance(data, (pa.Table, np.ndarray)):
                builder.add_block(data)
            else:
                builder.add(data)
    return builder.build()
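# `read_file` and `reader_args` are supplied by the enclosing datasource; the
# sketch below is only an illustration of what such a callback could look
# like for CSV input. The name `read_csv_file` is hypothetical, and the
# kwargs are assumed to be valid pyarrow.csv.read_csv options.
import pyarrow as pa
import pyarrow.csv as pa_csv


def read_csv_file(f: "pa.NativeFile", path: str, **reader_args) -> pa.Table:
    # Parse one CSV stream into an Arrow table, which read_files() then
    # appends to the builder via add_block().
    return pa_csv.read_csv(f, **reader_args)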
def next_batch(self) -> Block:
    """Get the next batch from the block buffer.

    Returns:
        A batch represented as a Block.
    """
    # If no batch size, short-circuit.
    if self._batch_size is None:
        assert len(self._buffer) == 1
        block = self._buffer[0]
        self._buffer = []
        return block

    output = DelegatingArrowBlockBuilder()
    leftover = []
    needed = self._batch_size
    for block in self._buffer:
        accessor = BlockAccessor.for_block(block)
        if needed <= 0:
            # We already have a full batch, so add this block to
            # the leftovers.
            leftover.append(block)
        elif accessor.num_rows() <= needed:
            # We need this entire block to fill out a batch.
            output.add_block(block)
            needed -= accessor.num_rows()
        else:
            # We only need part of the block to fill out a batch.
            output.add_block(accessor.slice(0, needed, copy=False))
            # Add the rest of the block to the leftovers.
            leftover.append(
                accessor.slice(needed, accessor.num_rows(), copy=False))
            needed = 0

    # Move the leftovers into the block buffer so they're the first
    # blocks consumed on the next batch extraction.
    self._buffer = leftover
    return output.build()
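# A minimal sketch of the same needed/leftover slicing logic on plain Python
# lists instead of Blocks. The helper name `take_batch` is hypothetical; it
# shows how the block straddling the batch boundary is split and its
# remainder kept for the next extraction.
from typing import List, Tuple


def take_batch(buffer: List[List[int]],
               batch_size: int) -> Tuple[List[int], List[List[int]]]:
    batch: List[int] = []
    leftover: List[List[int]] = []
    needed = batch_size
    for block in buffer:
        if needed <= 0:
            # Batch is already full; keep the whole block as leftover.
            leftover.append(block)
        elif len(block) <= needed:
            # The entire block fits into the batch.
            batch.extend(block)
            needed -= len(block)
        else:
            # Split the block: the head fills the batch, the tail is leftover.
            batch.extend(block[:needed])
            leftover.append(block[needed:])
            needed = 0
    return batch, leftover


batch, rest = take_batch([[1, 2, 3], [4, 5], [6]], batch_size=4)
assert batch == [1, 2, 3, 4] and rest == [[5], [6]]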
class BlockOutputBuffer(object):
    """Generates output blocks of a given size given a stream of inputs.

    This class is used to turn a stream of items / blocks of arbitrary size
    into a stream of blocks of ``target_max_block_size``. The caller should
    check ``has_next()`` after each ``add()`` call, and call ``next()`` to
    get the next block when ``has_next()`` returns True.

    When all items have been added, the caller must call ``finalize()`` and
    then check ``has_next()`` one last time.

    Examples:
        >>> # Yield a stream of output blocks.
        >>> output = BlockOutputBuffer(udf, 500 * 1024 * 1024)
        >>> for item in generator():
        ...     output.add(item)
        ...     if output.has_next():
        ...         yield output.next()
        >>> output.finalize()
        >>> if output.has_next():
        ...     yield output.next()
    """

    def __init__(self, block_udf: Optional[Callable[[Block], Block]],
                 target_max_block_size: int):
        from ray.data.impl.arrow_block import DelegatingArrowBlockBuilder
        self._target_max_block_size = target_max_block_size
        self._block_udf = block_udf
        self._buffer = DelegatingArrowBlockBuilder()
        self._returned_at_least_one_block = False
        self._finalized = False

    def add(self, item: Any) -> None:
        """Add a single item to this output buffer."""
        assert not self._finalized
        self._buffer.add(item)

    def add_block(self, block: Block) -> None:
        """Add a data block to this output buffer."""
        assert not self._finalized
        self._buffer.add_block(block)

    def finalize(self) -> None:
        """Must be called once all items have been added."""
        assert not self._finalized
        self._finalized = True

    def has_next(self) -> bool:
        """Returns true when a complete output block is produced."""
        if self._finalized:
            return not self._returned_at_least_one_block \
                or self._buffer.num_rows() > 0
        else:
            return self._buffer.get_estimated_memory_usage() > \
                self._target_max_block_size

    def next(self) -> Block:
        """Returns the next complete output block."""
        assert self.has_next()
        block = self._buffer.build()
        # Go through BlockAccessor so this works for both Arrow and simple
        # (list) blocks, which don't expose a num_rows attribute.
        if self._block_udf and BlockAccessor.for_block(block).num_rows() > 0:
            block = self._block_udf(block)
        self._buffer = DelegatingArrowBlockBuilder()
        self._returned_at_least_one_block = True
        return block
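# A minimal, Ray-free sketch of the same add/has_next/next/finalize protocol.
# The class name `SimpleOutputBuffer` is hypothetical, blocks are plain lists,
# and a row-count threshold stands in for the estimated-bytes threshold used
# by BlockOutputBuffer.
from typing import Any, Callable, List, Optional


class SimpleOutputBuffer:
    def __init__(self, udf: Optional[Callable[[List[Any]], List[Any]]],
                 target_rows: int):
        self._udf = udf
        self._target_rows = target_rows
        self._buffer: List[Any] = []
        self._returned_any = False
        self._finalized = False

    def add(self, item: Any) -> None:
        assert not self._finalized
        self._buffer.append(item)

    def finalize(self) -> None:
        assert not self._finalized
        self._finalized = True

    def has_next(self) -> bool:
        if self._finalized:
            # After finalize(), flush whatever remains (or an empty block if
            # nothing was ever returned).
            return not self._returned_any or len(self._buffer) > 0
        return len(self._buffer) >= self._target_rows

    def next(self) -> List[Any]:
        assert self.has_next()
        block, self._buffer = self._buffer, []
        if self._udf and block:
            block = self._udf(block)
        self._returned_any = True
        return block


# Stream 0..9 through the buffer in blocks of at most 4 rows.
buf = SimpleOutputBuffer(udf=None, target_rows=4)
out: List[List[int]] = []
for i in range(10):
    buf.add(i)
    if buf.has_next():
        out.append(buf.next())
buf.finalize()
if buf.has_next():
    out.append(buf.next())
assert out == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]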