Exemple #1
0
    def __call__(self) -> MaybeBlockPartition:
        """Run the read function and package the blocks it produces.

        Returns:
            If ``block_splitting_enabled`` is set on the current
            ``DatasetContext``: a list of ``(block ref, metadata)`` pairs, one
            per block yielded by the read function. Otherwise: a single block
            built from all yielded blocks.

        Raises:
            ValueError: If block splitting is enabled and the read function
                yields no blocks.
        """
        # Function-scope import so this method does not depend on a
        # module-level ``import warnings``.
        import warnings

        context = DatasetContext.get_current()
        result = self._read_fn()
        if not hasattr(result, "__iter__"):
            # Merely constructing a DeprecationWarning instance is a no-op;
            # it has to go through ``warnings.warn`` to reach the user.
            warnings.warn(
                "Read function must return Iterable[Block], got {}. "
                "Probably you need to return `[block]` instead of "
                "`block`.".format(result),
                DeprecationWarning,
                stacklevel=2,
            )

        if context.block_splitting_enabled:
            partition: BlockPartition = []
            for block in result:
                metadata = BlockAccessor.for_block(block).get_metadata(
                    input_files=self._metadata.input_files,
                    exec_stats=None)  # No exec stats for the block splits.
                assert context.block_owner
                # Each block is stored with ``context.block_owner`` as owner.
                partition.append(
                    (ray.put(block, _owner=context.block_owner), metadata))
            if not partition:
                raise ValueError("Read task must return non-empty list.")
            return partition
        else:
            # No splitting: merge everything the read function yielded into
            # one block.
            builder = DelegatingBlockBuilder()
            for block in result:
                builder.add_block(block)
            return builder.build()
Exemple #2
0
    def __init__(
        self,
        batch_size: Optional[int],
        shuffle_buffer_min_size: int,
        shuffle_buffer_capacity: Optional[int] = None,
        shuffle_seed: Optional[int] = None,
    ):
        """Set up a batcher that yields randomly shuffled batches.

        Args:
            batch_size: Number of rows per batch. Required; ``None`` is
                rejected.
            shuffle_buffer_min_size: Minimum number of rows that must be in the
                local in-memory shuffle buffer in order to yield a batch. Must
                be at least ``batch_size``. Once no more rows will be added,
                the buffer is allowed to drain below this value and the final
                batch may have fewer than ``batch_size`` rows. Larger values
                improve shuffle randomness but may delay the first batch.
            shuffle_buffer_capacity: Soft maximum number of rows in the shuffle
                buffer. Must be at least ``shuffle_buffer_min_size +
                batch_size``. It is a soft maximum because the size of a block
                being added is not taken into account in the capacity check.
                Defaults to ``max(2 * shuffle_buffer_min_size,
                shuffle_buffer_min_size + batch_size)``.
            shuffle_seed: If not ``None``, passed to ``random.seed``.

        Raises:
            ValueError: If ``batch_size`` is ``None`` or either size
                constraint above is violated.
        """
        if batch_size is None:
            raise ValueError(
                "Must specify a batch_size if using a local shuffle.")
        self._batch_size = batch_size

        if shuffle_buffer_min_size < batch_size:
            raise ValueError(
                "Shuffle buffer min size must be at least as large as the batch size, "
                f"but got: shuffle_buffer_min_size={shuffle_buffer_min_size}, "
                f"batch_size={batch_size}")

        # Smallest capacity that still leaves room for one batch on top of
        # the minimum buffer size.
        capacity_floor = shuffle_buffer_min_size + batch_size
        if shuffle_buffer_capacity is None:
            shuffle_buffer_capacity = max(2 * shuffle_buffer_min_size,
                                          capacity_floor)
        if shuffle_buffer_capacity < capacity_floor:
            raise ValueError(
                "Shuffle buffer capacity must be at least as large as the shuffle "
                "buffer min size plus the batch size, but got: "
                f"shuffle_buffer_capacity={shuffle_buffer_capacity}, "
                f"shuffle_buffer_min_size={shuffle_buffer_min_size}, "
                f"batch_size={batch_size}")

        self._buffer_min_size = shuffle_buffer_min_size
        self._buffer_capacity = shuffle_buffer_capacity
        # Rows added but not yet merged into the materialized shuffle buffer.
        self._builder = DelegatingBlockBuilder()
        # Materialized shuffle buffer and its shuffled row order; both unset
        # until built by a batch retrieval.
        self._shuffle_buffer: Block = None
        self._shuffle_indices: List[int] = None
        # Position in the shuffle indices of the next row to yield.
        self._batch_head = 0
        self._done_adding = False

        if shuffle_seed is not None:
            # NOTE: this seeds the module-level ``random`` generator.
            random.seed(shuffle_seed)
Exemple #3
0
 def __init__(self, block_udf: Optional[Callable[[Block], Block]],
              target_max_block_size: int):
     """Create an empty, non-finalized output buffer.

     Args:
         block_udf: Optional callable stored for later use on built blocks.
         target_max_block_size: Stored size target for output blocks.
     """
     self._block_udf = block_udf
     self._target_max_block_size = target_max_block_size
     # Nothing has been handed out yet and more input may still arrive.
     self._finalized = False
     self._returned_at_least_one_block = False
     self._buffer = DelegatingBlockBuilder()
Exemple #4
0
def _map_block_nosplit(block: Block, fn: Any,
                       input_files: List[str]) -> Tuple[Block, BlockMetadata]:
    """Apply ``fn`` to ``block`` and merge everything it yields into one block.

    Args:
        block: Input block handed to ``fn``.
        fn: Callable invoked as ``fn(block)``; its result is iterated and each
            element is added to the output as a block.
        input_files: Forwarded to the output block's metadata.

    Returns:
        The merged output block and its metadata, including the execution
        stats collected since the start of this call.
    """
    exec_stats = BlockExecStats.builder()
    merged = DelegatingBlockBuilder()
    for piece in fn(block):
        merged.add_block(piece)
    out_block = merged.build()
    out_metadata = BlockAccessor.for_block(out_block).get_metadata(
        input_files=input_files, exec_stats=exec_stats.build())
    return out_block, out_metadata
Exemple #5
0
 def next(self) -> Block:
     """Build and return the buffered rows as one block, then reset the buffer.

     The stored block UDF, if any, is applied only when the built block has
     at least one row. Requires ``has_next()`` to be true.
     """
     assert self.has_next()
     output = self._buffer.build()
     accessor = BlockAccessor.for_block(output)
     apply_udf = bool(self._block_udf) and accessor.num_rows() > 0
     if apply_udf:
         output = self._block_udf(output)
     # Start collecting the next output block from scratch.
     self._buffer = DelegatingBlockBuilder()
     self._returned_at_least_one_block = True
     return output
Exemple #6
0
def from_items(items: List[Any], *, parallelism: int = -1) -> Dataset[Any]:
    """Build a dataset out of a list of in-memory Python objects.

    Examples:
        >>> import ray
        >>> ds = ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP
        >>> ds # doctest: +SKIP
        Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>)
        >>> ds.take(2) # doctest: +SKIP
        [1, 2]

    Args:
        items: List of local Python objects.
        parallelism: Requested parallelism, resolved through
            ``_autodetect_parallelism``. Every block holds at least one item,
            so the number of items can limit the parallelism achieved.

    Returns:
        Dataset holding the items.
    """
    detected_parallelism, _ = _autodetect_parallelism(
        parallelism,
        ray.util.get_current_placement_group(),
        DatasetContext.get_current(),
    )
    num_items = len(items)
    # Never go below one item per block, even with more slots than items.
    block_size = max(1, num_items // detected_parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    # A ``while`` loop is used on purpose: this avoids relying on the builtin
    # ``range`` name inside this module.
    offset = 0
    while offset < num_items:
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        chunk = items[offset:offset + block_size]
        for item in chunk:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        block_metadata = BlockAccessor.for_block(block).get_metadata(
            input_files=None, exec_stats=stats.build())
        metadata.append(block_metadata)
        offset += block_size

    block_list = BlockList(blocks, metadata)
    dataset_stats = DatasetStats(stages={"from_items": metadata}, parent=None)
    plan = ExecutionPlan(block_list, dataset_stats)
    return Dataset(plan, 0, False)
Exemple #7
0
    def next_batch(self) -> Block:
        """Return the next shuffled batch taken from the shuffle buffer.

        If rows were added since the last retrieval, the shuffle buffer is
        first rebuilt from those rows plus any not-yet-yielded rows of the
        previous buffer, and fresh shuffle indices are generated.

        Returns:
            A batch represented as a Block. It holds fewer than the
            configured batch size rows only when the buffer itself is smaller.
        """
        assert self.has_batch() or (self._done_adding and self.has_any())

        if self._builder.num_rows() > 0:
            # New rows are pending: fold them into the shuffle buffer.
            if self._shuffle_buffer is not None:
                if self._batch_head > 0:
                    # Keep only rows that have not been yielded yet.
                    # TODO(Clark): If alternating between adding blocks and
                    # fetching shuffled batches, this aggressive compaction
                    # could be inefficient.
                    old_accessor = BlockAccessor.for_block(
                        self._shuffle_buffer)
                    self._shuffle_buffer = old_accessor.take(
                        self._shuffle_indices[self._batch_head:])
                # Carry the unyielded rows over into the new buffer.
                self._builder.add_block(self._shuffle_buffer)
            self._shuffle_buffer = self._builder.build()
            self._builder = DelegatingBlockBuilder()
            # The old ordering no longer matches the rebuilt buffer.
            self._shuffle_indices = None
            self._batch_head = 0

        assert self._shuffle_buffer is not None
        total_rows = BlockAccessor.for_block(self._shuffle_buffer).num_rows()
        # A batch can never be larger than what the buffer holds.
        rows_to_take = min(self._batch_size, total_rows)

        if self._shuffle_indices is None:
            fresh_indices = list(range(total_rows))
            random.shuffle(fresh_indices)
            self._shuffle_indices = fresh_indices

        first = self._batch_head
        last = first + rows_to_take
        batch_indices = self._shuffle_indices[first:last]
        self._batch_head = last
        return BlockAccessor.for_block(
            self._shuffle_buffer).take(batch_indices)
Exemple #8
0
def _map_block_nosplit(
    block: Block,
    block_fn: BlockTransform,
    input_files: List[str],
    fn: Optional[UDF],
    *fn_args,
    **fn_kwargs,
) -> Tuple[Block, BlockMetadata]:
    """Run ``block_fn`` on ``block`` and merge its outputs into a single block.

    Args:
        block: Input block, passed as the first argument to ``block_fn``.
        block_fn: Callable whose result is iterated; each element is added to
            the output as a block.
        input_files: Forwarded to the output block's metadata.
        fn: Optional UDF. When not ``None`` it is passed to ``block_fn``
            immediately after ``block``, ahead of ``fn_args``.
        *fn_args: Extra positional arguments for ``block_fn``.
        **fn_kwargs: Extra keyword arguments for ``block_fn``.

    Returns:
        The merged output block and its metadata, including the execution
        stats collected since the start of this call.
    """
    exec_stats = BlockExecStats.builder()
    merged = DelegatingBlockBuilder()
    call_args = fn_args if fn is None else (fn, *fn_args)
    for piece in block_fn(block, *call_args, **fn_kwargs):
        merged.add_block(piece)
    out_block = merged.build()
    out_metadata = BlockAccessor.for_block(out_block).get_metadata(
        input_files=input_files, exec_stats=exec_stats.build())
    return out_block, out_metadata
 def reduce(random_shuffle: bool, random_seed: Optional[int],
            *mapper_outputs: List[Block]) -> (Block, BlockMetadata):
     """Merge the mapper outputs into one block, optionally shuffling it.

     Args:
         random_shuffle: Whether to randomly shuffle the merged block.
         random_seed: Passed to the accessor's ``random_shuffle``; may be
             ``None``.
         *mapper_outputs: Blocks to merge, added in the order given.

     Returns:
         The resulting block and a ``BlockMetadata`` describing it (row
         count, byte size, schema, exec stats; ``input_files`` is ``None``).
     """
     stats = BlockExecStats.builder()
     merged = DelegatingBlockBuilder()
     for mapper_block in mapper_outputs:
         merged.add_block(mapper_block)
     out_block = merged.build()
     accessor = BlockAccessor.for_block(out_block)
     if random_shuffle:
         out_block = accessor.random_shuffle(random_seed)
         # Metadata below must describe the shuffled block.
         accessor = BlockAccessor.for_block(out_block)
     out_metadata = BlockMetadata(
         num_rows=accessor.num_rows(),
         size_bytes=accessor.size_bytes(),
         schema=accessor.schema(),
         input_files=None,
         exec_stats=stats.build(),
     )
     return out_block, out_metadata
Exemple #10
0
def sample_boundaries(blocks: List[ObjectRef[Block]], key: SortKeyT,
                      num_reducers: int) -> List[T]:
    """
    Return (num_reducers - 1) items in ascending order from the blocks that
    partition the domain into ranges with approximately equally many elements.

    Args:
        blocks: References to the blocks to sample from.
        key: Sort key. If it is a list, it must contain at most one entry and
            the column is taken from ``key[0][0]``; otherwise no column is
            passed to ``to_numpy``.
        num_reducers: Number of ranges to partition the domain into.

    Returns:
        A list of ``num_reducers - 1`` boundary values. All entries are
        ``None`` when there is nothing to sample (no blocks, or every sample
        is empty).

    Raises:
        ValueError: If ``key`` is a list with more than one entry.
    """
    # TODO(Clark): Support multiple boundary sampling keys.
    if isinstance(key, list) and len(key) > 1:
        raise ValueError("Multiple boundary sampling keys not supported.")

    if len(blocks) == 0:
        # Without blocks the per-block sample count below would divide by
        # zero. Treat it like the empty-dataset case further down.
        return [None] * (num_reducers - 1)

    n_samples = int(num_reducers * 10 / len(blocks))

    sample_block = cached_remote_fn(_sample_block)

    sample_results = [
        sample_block.remote(block, n_samples, key) for block in blocks
    ]
    sample_bar = ProgressBar("Sort Sample", len(sample_results))
    samples = sample_bar.fetch_until_complete(sample_results)
    sample_bar.close()
    del sample_results
    samples = [s for s in samples if len(s) > 0]
    # The dataset is empty
    if len(samples) == 0:
        return [None] * (num_reducers - 1)
    builder = DelegatingBlockBuilder()
    for sample in samples:
        builder.add_block(sample)
    samples = builder.build()
    column = key[0][0] if isinstance(key, list) else None
    sample_items = BlockAccessor.for_block(samples).to_numpy(column)
    sample_items = np.sort(sample_items)
    # Quantiles at num_reducers evenly spaced points in [0, 1]; the first one
    # (q=0) is dropped, leaving num_reducers - 1 boundaries.
    ret = [
        np.quantile(sample_items, q, interpolation="nearest")
        for q in np.linspace(0, 1, num_reducers)
    ]
    return ret[1:]
Exemple #11
0
    def next_batch(self) -> Block:
        """Get the next batch from the block buffer.

        Blocks are consumed from the front of the buffer until the batch is
        full; any unconsumed rows stay in the buffer for the next call.

        Returns:
            A batch represented as a Block. It holds fewer than
            ``self._batch_size`` rows when the buffer runs out first.
        """
        assert self.has_batch() or (self._done_adding and self.has_any())
        # If no batch size, short-circuit.
        if self._batch_size is None:
            assert len(self._buffer) == 1
            block = self._buffer[0]
            self._buffer = []
            self._buffer_size = 0
            return block
        output = DelegatingBlockBuilder()
        leftover = []
        # Number of rows still missing from the batch being assembled.
        needed = self._batch_size
        for block in self._buffer:
            accessor = BlockAccessor.for_block(block)
            if needed <= 0:
                # We already have a full batch, so add this block to
                # the leftovers.
                leftover.append(block)
            elif accessor.num_rows() <= needed:
                # We need this entire block to fill out a batch.
                # We need to call `accessor.slice()` to ensure
                # the subsequent block's type are the same.
                output.add_block(
                    accessor.slice(0, accessor.num_rows(), copy=False))
                needed -= accessor.num_rows()
            else:
                # We only need part of the block to fill out a batch.
                output.add_block(accessor.slice(0, needed, copy=False))
                # Add the rest of the block to the leftovers.
                leftover.append(
                    accessor.slice(needed, accessor.num_rows(), copy=False))
                needed = 0

        # Move the leftovers into the block buffer so they're the first
        # blocks consumed on the next batch extraction.
        self._buffer = leftover
        # Subtract only the rows actually emitted. A short final batch
        # (needed > 0) would otherwise drive the tracked size negative.
        self._buffer_size -= self._batch_size - needed
        return output.build()
Exemple #12
0
class ShufflingBatcher(BatcherInterface):
    """Chunks blocks into shuffled batches, using a local in-memory shuffle buffer."""

    # Implementation Note:
    #
    # Added blocks are only handed to a block builder; nothing is shuffled at
    # add time. When a batch is requested via .next_batch() and rows have been
    # added since the previous retrieval, two operations are performed:
    #  1. The pending rows (plus any unyielded rows of the previous buffer) are
    #     built into a concrete shuffle buffer.
    #  2. A new list of random shuffle indices is generated for that buffer.
    # If no rows were added since the last retrieval, a batch is produced just
    # by taking the next run of shuffle indices out of the existing buffer.
    #
    # Callers are therefore encouraged to alternate between contiguous runs of
    # additions (up to the shuffle buffer capacity) and contiguous runs of
    # retrievals (down to the shuffle buffer minimum size), rather than
    # interleaving single adds with single retrievals.

    def __init__(
        self,
        batch_size: Optional[int],
        shuffle_buffer_min_size: int,
        shuffle_buffer_capacity: Optional[int] = None,
        shuffle_seed: Optional[int] = None,
    ):
        """Set up a batcher that yields randomly shuffled batches.

        Args:
            batch_size: Number of rows per batch. Required; ``None`` is
                rejected.
            shuffle_buffer_min_size: Minimum number of rows that must be in the
                local in-memory shuffle buffer in order to yield a batch. Must
                be at least ``batch_size``. Once no more rows will be added,
                the buffer is allowed to drain below this value and the final
                batch may have fewer than ``batch_size`` rows. Larger values
                improve shuffle randomness but may delay the first batch.
            shuffle_buffer_capacity: Soft maximum number of rows in the shuffle
                buffer. Must be at least ``shuffle_buffer_min_size +
                batch_size``. It is a soft maximum because the size of a block
                being added is not taken into account in the capacity check.
                Defaults to ``max(2 * shuffle_buffer_min_size,
                shuffle_buffer_min_size + batch_size)``.
            shuffle_seed: If not ``None``, passed to ``random.seed``.

        Raises:
            ValueError: If ``batch_size`` is ``None`` or either size
                constraint above is violated.
        """
        if batch_size is None:
            raise ValueError(
                "Must specify a batch_size if using a local shuffle.")
        self._batch_size = batch_size

        if shuffle_buffer_min_size < batch_size:
            raise ValueError(
                "Shuffle buffer min size must be at least as large as the batch size, "
                f"but got: shuffle_buffer_min_size={shuffle_buffer_min_size}, "
                f"batch_size={batch_size}")

        # Smallest capacity that still leaves room for one batch on top of
        # the minimum buffer size.
        capacity_floor = shuffle_buffer_min_size + batch_size
        if shuffle_buffer_capacity is None:
            shuffle_buffer_capacity = max(2 * shuffle_buffer_min_size,
                                          capacity_floor)
        if shuffle_buffer_capacity < capacity_floor:
            raise ValueError(
                "Shuffle buffer capacity must be at least as large as the shuffle "
                "buffer min size plus the batch size, but got: "
                f"shuffle_buffer_capacity={shuffle_buffer_capacity}, "
                f"shuffle_buffer_min_size={shuffle_buffer_min_size}, "
                f"batch_size={batch_size}")

        self._buffer_min_size = shuffle_buffer_min_size
        self._buffer_capacity = shuffle_buffer_capacity
        # Rows added but not yet merged into the materialized shuffle buffer.
        self._builder = DelegatingBlockBuilder()
        # Materialized shuffle buffer and its shuffled row order; both unset
        # until built by a batch retrieval.
        self._shuffle_buffer: Block = None
        self._shuffle_indices: List[int] = None
        # Position in the shuffle indices of the next row to yield.
        self._batch_head = 0
        self._done_adding = False

        if shuffle_seed is not None:
            # NOTE: this seeds the module-level ``random`` generator.
            random.seed(shuffle_seed)

    def add(self, block: Block):
        """Queue a block for inclusion in the shuffle buffer.

        Args:
            block: Block to add. ``can_add(block)`` must be true.
        """
        assert self.can_add(block)
        self._builder.add_block(block)

    def can_add(self, block: Block) -> bool:
        """Whether the block can be added to the shuffle buffer.

        The size of ``block`` itself is not considered: only the current
        buffer size is compared against the capacity, so a large block may
        push the buffer over the capacity. Adding is refused once
        ``done_adding()`` has been called.
        """
        within_capacity = self._buffer_size() <= self._buffer_capacity
        return within_capacity and not self._done_adding

    def done_adding(self) -> bool:
        """Mark the batcher as complete: no more blocks will be added.

        No more blocks should be added to the batcher after calling this.
        """
        self._done_adding = True

    def has_any(self) -> bool:
        """Whether this batcher holds at least one unyielded row."""
        return self._buffer_size() > 0

    def has_batch(self) -> bool:
        """Whether a batch can be retrieved right now."""
        buffer_size = self._buffer_size()
        # While blocks may still arrive, a batch is available only if yielding
        # it keeps the buffer at or above its configured minimum size.
        if buffer_size - self._batch_size >= self._buffer_min_size:
            return True
        # After the last block, a full batch's worth of rows is enough.
        return self._done_adding and buffer_size >= self._batch_size

    def _buffer_size(self) -> int:
        """Return the number of rows currently held and not yet yielded."""
        pending_rows = self._builder.num_rows()
        if self._shuffle_buffer is None:
            return pending_rows
        # The batch head doubles as the count of rows already yielded from
        # the materialized shuffle buffer, so those rows are excluded.
        materialized_rows = BlockAccessor.for_block(
            self._shuffle_buffer).num_rows()
        return pending_rows + (materialized_rows - self._batch_head)

    def next_batch(self) -> Block:
        """Return the next shuffled batch taken from the shuffle buffer.

        If rows were added since the last retrieval, the shuffle buffer is
        first rebuilt from those rows plus any not-yet-yielded rows of the
        previous buffer, and fresh shuffle indices are generated.

        Returns:
            A batch represented as a Block. It holds fewer than the
            configured batch size rows only when the buffer itself is smaller.
        """
        assert self.has_batch() or (self._done_adding and self.has_any())

        if self._builder.num_rows() > 0:
            # New rows are pending: fold them into the shuffle buffer.
            if self._shuffle_buffer is not None:
                if self._batch_head > 0:
                    # Keep only rows that have not been yielded yet.
                    # TODO(Clark): If alternating between adding blocks and
                    # fetching shuffled batches, this aggressive compaction
                    # could be inefficient.
                    old_accessor = BlockAccessor.for_block(
                        self._shuffle_buffer)
                    self._shuffle_buffer = old_accessor.take(
                        self._shuffle_indices[self._batch_head:])
                # Carry the unyielded rows over into the new buffer.
                self._builder.add_block(self._shuffle_buffer)
            self._shuffle_buffer = self._builder.build()
            self._builder = DelegatingBlockBuilder()
            # The old ordering no longer matches the rebuilt buffer.
            self._shuffle_indices = None
            self._batch_head = 0

        assert self._shuffle_buffer is not None
        total_rows = BlockAccessor.for_block(self._shuffle_buffer).num_rows()
        # A batch can never be larger than what the buffer holds.
        rows_to_take = min(self._batch_size, total_rows)

        if self._shuffle_indices is None:
            fresh_indices = list(range(total_rows))
            random.shuffle(fresh_indices)
            self._shuffle_indices = fresh_indices

        first = self._batch_head
        last = first + rows_to_take
        batch_indices = self._shuffle_indices[first:last]
        self._batch_head = last
        return BlockAccessor.for_block(
            self._shuffle_buffer).take(batch_indices)
Exemple #13
0
class BlockOutputBuffer(object):
    """Generates output blocks of a given size given a stream of inputs.

    Items, batches or blocks are accumulated until the estimated memory usage
    of the buffered data exceeds ``target_max_block_size``, at which point a
    block becomes available. The caller should check ``has_next()`` after each
    ``add()`` call, and call ``next()`` to get the next block when
    ``has_next()`` returns True.

    When all items have been added, the caller must call ``finalize()`` and
    then check ``has_next()`` one last time.

    Examples:
        >>> from ray.data._internal.output_buffer import BlockOutputBuffer
        >>> udf = ... # doctest: +SKIP
        >>> generator = ... # doctest: +SKIP
        >>> # Yield a stream of output blocks.
        >>> output = BlockOutputBuffer(udf, 500 * 1024 * 1024) # doctest: +SKIP
        >>> for item in generator(): # doctest: +SKIP
        ...     output.add(item) # doctest: +SKIP
        ...     if output.has_next(): # doctest: +SKIP
        ...         yield output.next() # doctest: +SKIP
        >>> output.finalize() # doctest: +SKIP
        >>> if output.has_next(): # doctest: +SKIP
        ...     yield output.next() # doctest: +SKIP
    """
    def __init__(self, block_udf: Optional[Callable[[Block], Block]],
                 target_max_block_size: int):
        """Create an empty, non-finalized output buffer.

        Args:
            block_udf: Optional callable applied by ``next()`` to each built
                block that has at least one row.
            target_max_block_size: Estimated memory usage above which the
                buffered data is reported as a complete block.
        """
        self._block_udf = block_udf
        self._target_max_block_size = target_max_block_size
        # Nothing has been handed out yet and more input may still arrive.
        self._finalized = False
        self._returned_at_least_one_block = False
        self._buffer = DelegatingBlockBuilder()

    def add(self, item: Any) -> None:
        """Append one item to the buffer; not allowed after ``finalize()``."""
        assert not self._finalized
        self._buffer.add(item)

    def add_batch(self, batch: DataBatch) -> None:
        """Append a data batch to the buffer; not allowed after ``finalize()``."""
        assert not self._finalized
        self._buffer.add_batch(batch)

    def add_block(self, block: Block) -> None:
        """Append a data block to the buffer; not allowed after ``finalize()``."""
        assert not self._finalized
        self._buffer.add_block(block)

    def finalize(self) -> None:
        """Signal that all input has been added. May be called only once."""
        assert not self._finalized
        self._finalized = True

    def has_next(self) -> bool:
        """Return whether ``next()`` may be called to obtain a block."""
        if not self._finalized:
            # Still receiving input: a block is ready only once the buffered
            # data is estimated to exceed the target size.
            return (self._buffer.get_estimated_memory_usage() >
                    self._target_max_block_size)
        # After finalization: flush remaining rows, and make sure at least
        # one block is produced even if no rows remain.
        return (not self._returned_at_least_one_block
                or self._buffer.num_rows() > 0)

    def next(self) -> Block:
        """Build and return the buffered rows as one block, then reset the buffer.

        The block UDF, if any, is applied only when the built block has at
        least one row. Requires ``has_next()`` to be true.
        """
        assert self.has_next()
        output = self._buffer.build()
        accessor = BlockAccessor.for_block(output)
        apply_udf = bool(self._block_udf) and accessor.num_rows() > 0
        if apply_udf:
            output = self._block_udf(output)
        # Start collecting the next output block from scratch.
        self._buffer = DelegatingBlockBuilder()
        self._returned_at_least_one_block = True
        return output