Example #1
def from_items(items: List[Any], parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = DelegatingArrowBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))
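The public entry point for this constructor is ray.data.from_items. A minimal usage sketch, assuming a local Ray session and using only documented calls (the exact record format of the returned rows depends on the Ray version):

import ray

ray.init()  # local session; adjust for an existing cluster

# The five items are split across blocks according to `parallelism`.
ds = ray.data.from_items([1, 2, 3, 4, 5], parallelism=2)
print(ds.take(5))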
Example #2
 def transform(block: Block[T]) -> Block[U]:
     block = BlockAccessor.for_block(block)
     builder = DelegatingArrowBlockBuilder()
     for row in block.iter_rows():
         for r2 in fn(row):
             builder.add(r2)
     return builder.build()
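This closure backs Dataset.flat_map, where fn maps one input row to a list of output rows and the builder flattens them into a single block. A hedged sketch against the public API (the duplicating lambda is illustrative, not from the source):

import ray

ds = ray.data.from_items([1, 2, 3])
# Each input row produces two output rows; flat_map flattens the lists.
doubled = ds.flat_map(lambda x: [x, x])
print(doubled.count())  # 6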
Example #3
 def shuffle_reduce(
         *mapper_outputs: List[Block[T]]) -> (Block[T], BlockMetadata):
     builder = DelegatingArrowBlockBuilder()
     assert len(mapper_outputs) == input_num_blocks
     for block in mapper_outputs:
         builder.add_block(block)
     new_block = builder.build()
     new_metadata = BlockMetadata(num_rows=new_block.num_rows(),
                                  size_bytes=new_block.size_bytes(),
                                  schema=new_block.schema(),
                                  input_files=None)
     return new_block, new_metadata
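This reducer sits on the internal shuffle path; from user code it is typically reached through block-reshaping operations such as Dataset.repartition. A minimal sketch using only public calls:

import ray

ds = ray.data.from_items(list(range(100)), parallelism=10)
# Repartitioning merges/splits blocks; on the reduce side, mapper outputs are
# concatenated with a builder, as shuffle_reduce does above.
print(ds.repartition(4).num_blocks())  # 4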
Example #4
        def transform(block: Block[T]) -> Block[U]:
            block = BlockAccessor.for_block(block)
            total_rows = block.num_rows()
            max_batch_size = batch_size
            if max_batch_size is None:
                max_batch_size = total_rows

            builder = DelegatingArrowBlockBuilder()

            for start in range(0, total_rows, max_batch_size):
                # Build a block for each batch.
                end = min(total_rows, start + max_batch_size)
                view = block.slice(start, end, copy=False)
                if batch_format == "pandas":
                    view = BlockAccessor.for_block(view).to_pandas()
                elif batch_format == "pyarrow":
                    view = BlockAccessor.for_block(view).to_arrow_table()
                else:
                    raise ValueError(
                        f"The given batch format {batch_format!r} is invalid. "
                        "Supported batch formats: 'pandas', 'pyarrow'.")

                applied = fn(view)
                if isinstance(applied, (list, pa.Table)):
                    # Lists and Arrow tables can be added to the builder as-is.
                    pass
                elif isinstance(applied, pd.DataFrame):
                    applied = pa.Table.from_pandas(applied)
                else:
                    raise ValueError(
                        "The map_batches UDF returned a value of type "
                        f"{type(applied)}, which is not allowed. The return "
                        "type must be list, pandas.DataFrame, or "
                        "pyarrow.Table.")
                builder.add_block(applied)

            return builder.build()
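The slicing loop above is what serves Dataset.map_batches. A hedged usage sketch with a pandas UDF (the "value" column name comes from the dict records built below, not from the source):

import pandas as pd
import ray

ds = ray.data.from_items([{"value": i} for i in range(8)])

def add_one(batch: pd.DataFrame) -> pd.DataFrame:
    # The UDF may return a list, pandas.DataFrame, or pyarrow.Table.
    batch["value"] = batch["value"] + 1
    return batch

out = ds.map_batches(add_one, batch_size=4, batch_format="pandas")
print(out.take(3))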
Example #5
 def read_files(read_paths: List[str],
                fs: Union["pyarrow.fs.FileSystem",
                          _S3FileSystemWrapper]):
     logger.debug(f"Reading {len(read_paths)} files.")
     if isinstance(fs, _S3FileSystemWrapper):
         fs = fs.unwrap()
     builder = DelegatingArrowBlockBuilder()
     for read_path in read_paths:
         with fs.open_input_stream(read_path) as f:
             data = read_file(f, read_path, **reader_args)
             if isinstance(data, pa.Table):
                 builder.add_block(data)
             else:
                 builder.add(data)
     return builder.build()
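read_files is the remote ingest task behind the file-based datasources; user code reaches it through readers such as ray.data.read_csv. A self-contained sketch (the /tmp path is a placeholder):

import pandas as pd
import ray

# Write a small CSV so the example is self-contained.
pd.DataFrame({"x": [1, 2, 3]}).to_csv("/tmp/example.csv", index=False)

# read_csv hands batches of paths to read_files, which opens each file on the
# resolved filesystem and appends the parsed tables to the block builder.
ds = ray.data.read_csv("/tmp/example.csv")
print(ds.take(3))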
Example #6
    def next_batch(self) -> Block:
        """Get the next batch from the block buffer.

        Returns:
            A batch represented as a Block.
        """
        # If no batch size, short-circuit.
        if self._batch_size is None:
            assert len(self._buffer) == 1
            block = self._buffer[0]
            self._buffer = []
            return block
        output = DelegatingArrowBlockBuilder()
        leftover = []
        needed = self._batch_size
        for block in self._buffer:
            accessor = BlockAccessor.for_block(block)
            if needed <= 0:
                # We already have a full batch, so add this block to
                # the leftovers.
                leftover.append(block)
            elif accessor.num_rows() <= needed:
                # We need this entire block to fill out a batch.
                output.add_block(block)
                needed -= accessor.num_rows()
            else:
                # We only need part of the block to fill out a batch.
                output.add_block(accessor.slice(0, needed, copy=False))
                # Add the rest of the block to the leftovers.
                leftover.append(
                    accessor.slice(needed, accessor.num_rows(), copy=False))
                needed = 0

        # Move the leftovers into the block buffer so they're the first
        # blocks consumed on the next batch extraction.
        self._buffer = leftover
        return output.build()
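This batching buffer ultimately serves Dataset.iter_batches. A hedged sketch of the public surface:

import ray

ds = ray.data.from_items([{"value": i} for i in range(10)])
# Batches are assembled by slicing and combining buffered blocks, exactly as
# next_batch does above; row counts here are 4, 4, and 2 (the remainder).
for batch in ds.iter_batches(batch_size=4, batch_format="pandas"):
    print(len(batch))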
Example #7
 def transform(block: Block[T]) -> Block[U]:
     builder = DelegatingArrowBlockBuilder()
     for row in block.iter_rows():
         builder.add(fn(row))
     return builder.build()
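This is the row-wise transform behind Dataset.map. A short usage sketch:

import ray

ds = ray.data.from_items([1, 2, 3])
# fn is applied to each row and the results are collected into a new block.
print(ds.map(lambda x: x * 2).take(3))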
Example #8
 def shuffle_reduce(*mapper_outputs: List[Block[T]]) -> Block[T]:
     builder = DelegatingArrowBlockBuilder()
     assert len(mapper_outputs) == input_num_blocks
     for block in mapper_outputs:
         builder.add_block(block)
     return builder.build()
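This is the same builder-based reducer as in Example #3, without the metadata bookkeeping. Dataset.random_shuffle is another public operation that appears to exercise this merge path; a hedged sketch:

import ray

ds = ray.data.from_items(list(range(20)), parallelism=4)
# Rows are scattered across mappers and merged back block-by-block by a
# builder-based reducer like the one above.
print(ds.random_shuffle().take(5))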