def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = DelegatingArrowBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))
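# A minimal usage sketch of from_items (not part of the source above): with
# five items and the default parallelism of 200, block_size is
# max(1, 5 // 200) == 1, so each item is materialized as its own block.
import ray

ray.init()

ds = ray.data.from_items([1, 2, 3, 4, 5])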
def read_files(read_paths: List[str],
               fs: Union["pyarrow.fs.FileSystem", _S3FileSystemWrapper]):
    logger.debug(f"Reading {len(read_paths)} files.")
    if isinstance(fs, _S3FileSystemWrapper):
        fs = fs.unwrap()
    builder = DelegatingArrowBlockBuilder()
    for read_path in read_paths:
        with fs.open_input_stream(read_path) as f:
            data = read_file(f, read_path, **reader_args)
            if isinstance(data, (pa.Table, np.ndarray)):
                builder.add_block(data)
            else:
                builder.add(data)
    return builder.build()
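# read_files closes over read_file, reader_args, logger, np, and
# _S3FileSystemWrapper from its enclosing module. A hedged, self-contained
# sketch of that surrounding scope (the JSON-based read_file and the wrapper
# stub are illustrative stand-ins, not the library's actual implementations):
import logging

import numpy as np
import pyarrow as pa
import pyarrow.fs
import pyarrow.json

logger = logging.getLogger(__name__)
reader_args = {}  # extra kwargs forwarded to read_file


class _S3FileSystemWrapper:
    # Stand-in for the library's picklable S3 filesystem wrapper.
    def unwrap(self) -> "pyarrow.fs.FileSystem":
        raise NotImplementedError


def read_file(f, path: str, **kwargs) -> pa.Table:
    # Illustrative reader: parse each input file as line-delimited JSON.
    return pyarrow.json.read_json(f, **kwargs)


# With the above in scope, read_files can combine local files into one block:
fs = pyarrow.fs.LocalFileSystem()
block = read_files(["part-0.json", "part-1.json"], fs)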
class BlockOutputBuffer(object):
    """Generates output blocks of a given size given a stream of inputs.

    This class is used to turn a stream of items / blocks of arbitrary size
    into a stream of blocks of ``target_max_block_size``. The caller should
    check ``has_next()`` after each ``add()`` call, and call ``next()`` to
    get the next block when ``has_next()`` returns True.

    When all items have been added, the caller must call ``finalize()`` and
    then check ``has_next()`` one last time.

    Examples:
        >>> # Yield a stream of output blocks.
        >>> output = BlockOutputBuffer(udf, 500 * 1024 * 1024)
        >>> for item in generator():
        ...     output.add(item)
        ...     if output.has_next():
        ...         yield output.next()
        >>> output.finalize()
        >>> if output.has_next():
        ...     yield output.next()
    """

    def __init__(self, block_udf: Optional[Callable[[Block], Block]],
                 target_max_block_size: int):
        from ray.data.impl.arrow_block import DelegatingArrowBlockBuilder
        self._target_max_block_size = target_max_block_size
        self._block_udf = block_udf
        self._buffer = DelegatingArrowBlockBuilder()
        self._returned_at_least_one_block = False
        self._finalized = False

    def add(self, item: Any) -> None:
        """Add a single item to this output buffer."""
        assert not self._finalized
        self._buffer.add(item)

    def add_block(self, block: Block) -> None:
        """Add a data block to this output buffer."""
        assert not self._finalized
        self._buffer.add_block(block)

    def finalize(self) -> None:
        """Must be called once all items have been added."""
        assert not self._finalized
        self._finalized = True

    def has_next(self) -> bool:
        """Returns true when a complete output block is produced."""
        if self._finalized:
            return not self._returned_at_least_one_block \
                or self._buffer.num_rows() > 0
        else:
            return self._buffer.get_estimated_memory_usage() > \
                self._target_max_block_size

    def next(self) -> Block:
        """Returns the next complete output block."""
        # Local import, mirroring __init__: DelegatingArrowBlockBuilder is not
        # in scope at module level, so the name must be imported here as well.
        from ray.data.impl.arrow_block import DelegatingArrowBlockBuilder

        assert self.has_next()
        block = self._buffer.build()
        if self._block_udf and block.num_rows > 0:
            block = self._block_udf(block)
        self._buffer = DelegatingArrowBlockBuilder()
        self._returned_at_least_one_block = True
        return block
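# A runnable sketch of the docstring's driver loop, assuming dict items so the
# buffer builds Arrow tables; the 1 KiB target size is illustrative (chosen
# only to force splitting, versus the 500 MiB target in the example above).
def stream_blocks(items, target_max_block_size: int = 1024):
    output = BlockOutputBuffer(block_udf=None,
                               target_max_block_size=target_max_block_size)
    for item in items:
        output.add(item)
        if output.has_next():
            yield output.next()
    # Flush whatever is still buffered as the final (possibly small) block.
    output.finalize()
    if output.has_next():
        yield output.next()


for block in stream_blocks({"value": i} for i in range(10000)):
    print(block.num_rows)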