Code example #1
File: read_api.py Project: hngenc/ray
def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = DelegatingArrowBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))
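The slicing loop above amounts to fixed-size chunking: items are cut into contiguous runs of max(1, len(items) // parallelism) elements. A minimal, Ray-free sketch of that arithmetic (the chunk_items helper is hypothetical, for illustration only); note that a remainder can produce more chunks than `parallelism`:

def chunk_items(items, parallelism=200):
    block_size = max(1, len(items) // parallelism)
    return [items[i:i + block_size] for i in range(0, len(items), block_size)]

# 10 items at parallelism=4 gives block_size=2, hence 5 chunks (> 4).
assert chunk_items(list(range(10)), parallelism=4) == [
    [0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]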
Code example #2
File: datasource.py Project: stefanbschneider/ray
    def __call__(self) -> MaybeBlockPartition:
        context = DatasetContext.get_current()
        result = self._read_fn()
        if not hasattr(result, "__iter__"):
            raise DeprecationWarning(
                "Read function must return Iterable[Block], got {}. "
                "Probably you need to return `[block]` instead of "
                "`block`.".format(result))

        if context.block_splitting_enabled:
            partition: BlockPartition = []
            for block in result:
                metadata = BlockAccessor.for_block(block).get_metadata(
                    input_files=self._metadata.input_files)
                assert context.block_owner
                partition.append(
                    (ray.put(block, _owner=context.block_owner), metadata))
            if len(partition) == 0:
                raise ValueError("Read task must return non-empty list.")
            return partition
        else:
            builder = DelegatingArrowBlockBuilder()
            for block in result:
                builder.add_block(block)
            return builder.build()
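A hedged sketch of the two return shapes in __call__ above: with block splitting enabled the task returns a list of (block, metadata) pairs, otherwise every block is merged into one. Plain Python lists stand in for Arrow blocks; run_read_task is hypothetical and none of this is the Ray API.

def run_read_task(blocks, splitting_enabled):
    if splitting_enabled:
        # One (block, metadata) pair per input block.
        return [(block, {"num_rows": len(block)}) for block in blocks]
    # Otherwise merge everything into a single output block.
    merged = []
    for block in blocks:
        merged.extend(block)
    return merged

assert run_read_task([[1], [2, 3]], True) == [
    ([1], {"num_rows": 1}), ([2, 3], {"num_rows": 2})]
assert run_read_task([[1], [2, 3]], False) == [1, 2, 3]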
Code example #3
def _map_block_nosplit(block: Block, fn: Any,
                       input_files: List[str]) -> Tuple[Block, BlockMetadata]:
    builder = DelegatingArrowBlockBuilder()
    for new_block in fn(block):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(input_files=input_files)
Code example #4
File: shuffle.py Project: rlan/ray
def _shuffle_reduce(*mapper_outputs: List[Block]) -> Tuple[Block, BlockMetadata]:
    builder = DelegatingArrowBlockBuilder()
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(num_rows=accessor.num_rows(),
                                 size_bytes=accessor.size_bytes(),
                                 schema=accessor.schema(),
                                 input_files=None)
    return new_block, new_metadata
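Examples #3 and #4 share one merge idiom: pour every incoming block into a fresh builder, build once, then derive metadata from the merged result. A list-based sketch of that idiom; merge_blocks is a hypothetical helper and plain lists stand in for Arrow blocks.

def merge_blocks(blocks):
    merged = []
    for block in blocks:
        merged.extend(block)          # stands in for builder.add_block()
    meta = {"num_rows": len(merged)}  # stands in for BlockMetadata
    return merged, meta

block, meta = merge_blocks([[1, 2], [3], [4, 5, 6]])
assert block == [1, 2, 3, 4, 5, 6] and meta["num_rows"] == 6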
Code example #5
def read_files(read_paths: List[str],
               fs: Union["pyarrow.fs.FileSystem",
                         _S3FileSystemWrapper]):
    logger.debug(f"Reading {len(read_paths)} files.")
    if isinstance(fs, _S3FileSystemWrapper):
        fs = fs.unwrap()
    builder = DelegatingArrowBlockBuilder()
    for read_path in read_paths:
        with fs.open_input_stream(read_path) as f:
            data = read_file(f, read_path, **reader_args)
            if isinstance(data, (pa.Table, np.ndarray)):
                builder.add_block(data)
            else:
                builder.add(data)
    return builder.build()
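The branch above dispatches on payload type: tabular or array data is appended wholesale with add_block(), anything else row by row with add(). A minimal sketch of that dispatch; ListBuilder is a hypothetical stand-in for DelegatingArrowBlockBuilder.

import numpy as np

class ListBuilder:
    def __init__(self):
        self.rows = []

    def add(self, item):
        self.rows.append(item)         # one row at a time

    def add_block(self, block):
        self.rows.extend(list(block))  # a whole batch at once

    def build(self):
        return self.rows

builder = ListBuilder()
for data in (np.array([1, 2]), "single-item"):
    if isinstance(data, np.ndarray):
        builder.add_block(data)
    else:
        builder.add(data)
assert builder.build() == [1, 2, "single-item"]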
Code example #6
def sample_boundaries(blocks: List[ObjectRef[Block]], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.
    """
    # TODO(Clark): Support multiple boundary sampling keys.
    if isinstance(key, list) and len(key) > 1:
        raise ValueError("Multiple boundary sampling keys not supported.")

    # Sample at least one item per block, even when blocks vastly
    # outnumber reducers.
    n_samples = max(1, int(num_reducers * 10 / len(blocks)))

    sample_block = cached_remote_fn(_sample_block)

    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    sample_bar.block_until_complete(sample_results)
    sample_bar.close()

    samples = ray.get(sample_results)
    samples = [s for s in samples if len(s) > 0]
    # The dataset is empty
    if len(samples) == 0:
        return [None] * (num_reducers - 1)
    builder = DelegatingArrowBlockBuilder()
    for sample in samples:
        builder.add_block(sample)
    samples = builder.build()
    column = key[0][0] if isinstance(key, list) else None
    sample_items = BlockAccessor.for_block(samples).to_numpy(column)
    sample_items = np.sort(sample_items)
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.linspace(0, 1, num_reducers)
    ]
    return ret[1:]
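A NumPy-only illustration of the boundary computation above: np.linspace(0, 1, num_reducers) yields num_reducers evenly spaced quantiles including 0 and 1, and dropping the first leaves num_reducers - 1 ascending cut points. The sample data here is made up for illustration.

import numpy as np

sample_items = np.sort(np.array([5, 1, 9, 3, 7, 2, 8, 4, 6, 0]))
num_reducers = 4
boundaries = [
    # NumPy >= 1.22 spells this keyword method="nearest".
    np.quantile(sample_items, q, interpolation="nearest")
    for q in np.linspace(0, 1, num_reducers)
]
assert boundaries[1:] == [3, 6, 9]  # three cut points for four ranges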
Code example #7
    def next_batch(self) -> Block:
        """Get the next batch from the block buffer.

        Returns:
            A batch represented as a Block.
        """
        # If no batch size, short-circuit.
        if self._batch_size is None:
            assert len(self._buffer) == 1
            block = self._buffer[0]
            self._buffer = []
            return block
        output = DelegatingArrowBlockBuilder()
        leftover = []
        needed = self._batch_size
        for block in self._buffer:
            accessor = BlockAccessor.for_block(block)
            if needed <= 0:
                # We already have a full batch, so add this block to
                # the leftovers.
                leftover.append(block)
            elif accessor.num_rows() <= needed:
                # We need this entire block to fill out a batch.
                output.add_block(block)
                needed -= accessor.num_rows()
            else:
                # We only need part of the block to fill out a batch.
                output.add_block(accessor.slice(0, needed, copy=False))
                # Add the rest of the block to the leftovers.
                leftover.append(
                    accessor.slice(needed, accessor.num_rows(), copy=False))
                needed = 0

        # Move the leftovers into the block buffer so they're the first
        # blocks consumed on the next batch extraction.
        self._buffer = leftover
        return output.build()
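The same drain-and-slice logic in miniature: take batch_size rows from the front of the buffer, slice the block that straddles the boundary, and keep the remainder as leftovers for the next call. Plain lists stand in for blocks; this next_batch function is a hypothetical reduction of the method above.

def next_batch(buffer, batch_size):
    output, leftover, needed = [], [], batch_size
    for block in buffer:
        if needed <= 0:
            leftover.append(block)            # batch already full
        elif len(block) <= needed:
            output.extend(block)              # consume the whole block
            needed -= len(block)
        else:
            output.extend(block[:needed])     # consume a slice...
            leftover.append(block[needed:])   # ...and keep the rest
            needed = 0
    return output, leftover

batch, rest = next_batch([[1, 2, 3], [4, 5], [6, 7, 8]], batch_size=4)
assert batch == [1, 2, 3, 4] and rest == [[5], [6, 7, 8]]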
Code example #8
File: output_buffer.py Project: stefanbschneider/ray
class BlockOutputBuffer(object):
    """Generates output blocks of a given size given a stream of inputs.

    This class is used to turn a stream of items / blocks of arbitrary size
    into a stream of blocks of ``target_max_block_size``. The caller should
    check ``has_next()`` after each ``add()`` call, and call ``next()`` to get
    the next block when ``has_next()`` returns True.

    When all items have been added, the caller must call ``finalize()`` and
    then check ``has_next()`` one last time.

    Examples:
        >>> # Yield a stream of output blocks.
        >>> output = BlockOutputBuffer(udf, 500 * 1024 * 1024)
        >>> for item in generator():
        ...     output.add(item)
        ...     if output.has_next():
        ...         yield output.next()
        >>> output.finalize()
        >>> if output.has_next():
        ...     yield output.next()
    """

    def __init__(self, block_udf: Optional[Callable[[Block], Block]],
                 target_max_block_size: int):
        from ray.data.impl.arrow_block import DelegatingArrowBlockBuilder
        self._target_max_block_size = target_max_block_size
        self._block_udf = block_udf
        self._buffer = DelegatingArrowBlockBuilder()
        self._returned_at_least_one_block = False
        self._finalized = False

    def add(self, item: Any) -> None:
        """Add a single item to this output buffer."""
        assert not self._finalized
        self._buffer.add(item)

    def add_block(self, block: Block) -> None:
        """Add a data block to this output buffer."""
        assert not self._finalized
        self._buffer.add_block(block)

    def finalize(self) -> None:
        """Must be called once all items have been added."""
        assert not self._finalized
        self._finalized = True

    def has_next(self) -> bool:
        """Returns true when a complete output block is produced."""
        if self._finalized:
            return not self._returned_at_least_one_block \
                or self._buffer.num_rows() > 0
        else:
            return self._buffer.get_estimated_memory_usage() > \
                self._target_max_block_size

    def next(self) -> Block:
        """Returns the next complete output block."""
        assert self.has_next()
        block = self._buffer.build()
        accessor = BlockAccessor.for_block(block)
        if self._block_udf and accessor.num_rows() > 0:
            block = self._block_udf(block)
        self._buffer = DelegatingArrowBlockBuilder()
        self._returned_at_least_one_block = True
        return block
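Putting the protocol from the docstring into a single generator: add items, emit whenever a full block is ready, then finalize and flush. A hedged usage sketch; stream_blocks and row_stream are hypothetical, and the constructor arguments follow the docstring example above.

def stream_blocks(row_stream, target_max_block_size=500 * 1024 * 1024):
    output = BlockOutputBuffer(None, target_max_block_size)  # no block UDF
    for row in row_stream:
        output.add(row)
        if output.has_next():        # a full block is ready
            yield output.next()
    output.finalize()                # flush whatever is left
    if output.has_next():
        yield output.next()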