Example #1
def _shuffle_map(
    block: Block,
    idx: int,
    output_num_blocks: int,
    random_shuffle: bool,
    random_seed: Optional[int],
) -> List[Union[BlockMetadata, Block]]:
    """Returns list of [BlockMetadata, O1, O2, O3, ...output_num_blocks]."""
    stats = BlockExecStats.builder()
    block = BlockAccessor.for_block(block)

    # Randomize the distribution of records to blocks.
    if random_shuffle:
        seed_i = random_seed + idx if random_seed is not None else None
        block = block.random_shuffle(seed_i)
        block = BlockAccessor.for_block(block)

    slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
    slices = []
    for i in range(output_num_blocks):
        slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

    # Randomize the distribution order of the blocks (this matters when
    # some blocks are larger than others).
    if random_shuffle:
        random = np.random.RandomState(seed_i)
        random.shuffle(slices)

    num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
    assert num_rows == block.num_rows(), (num_rows, block.num_rows())
    metadata = block.get_metadata(input_files=None, exec_stats=stats.build())
    return [metadata] + slices
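The slicing arithmetic above is easier to see outside the Ray API. Below is a minimal sketch on a plain Python list; the function name and the use of a list in place of a Block are illustrative assumptions, not Ray code.

import math
import numpy as np

def slice_into_output_blocks(rows, output_num_blocks, seed=None):
    # Mirror _shuffle_map: optionally shuffle the rows, cut them into
    # ceil(len(rows) / output_num_blocks)-sized slices, then shuffle slice order.
    rows = list(rows)
    if seed is not None:
        np.random.RandomState(seed).shuffle(rows)
    slice_sz = max(1, math.ceil(len(rows) / output_num_blocks))
    slices = [rows[i * slice_sz:(i + 1) * slice_sz]
              for i in range(output_num_blocks)]
    if seed is not None:
        np.random.RandomState(seed).shuffle(slices)
    return slices

# 10 rows into 4 output blocks: the slice size is ceil(10 / 4) = 3, so the
# slice sizes are 3, 3, 3 and 1 before the optional reordering.
print(slice_into_output_blocks(list(range(10)), 4))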
Example #2
File: split.py Project: parasj/ray
def _split_single_block(
    block_id: int,
    block: Block,
    meta: BlockMetadata,
    block_row: int,
    split_indices: List[int],
) -> Tuple[int, List[Tuple[ObjectRef[Block], BlockMetadata]]]:
    """Split the provided block at the given indices."""
    split_result = []
    block_accessor = BlockAccessor.for_block(block)
    prev_index = 0
    # Append one more entry at the end so we don't
    # need to handle the empty edge case.
    split_indices.append(block_row)
    for index in split_indices:
        logger.debug(f"slicing block {prev_index}:{index}")
        stats = BlockExecStats.builder()
        split_block = block_accessor.slice(prev_index, index, copy=True)
        accessor = BlockAccessor.for_block(split_block)
        split_meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=meta.schema,
            input_files=meta.input_files,
            exec_stats=stats.build(),
        )
        split_result.append((ray.put(split_block), split_meta))
        prev_index = index
    return (block_id, split_result)
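The same split-at-indices loop, sketched on a plain Python list (illustrative only, not the Ray implementation). Appending len(rows) as a final index lets the tail slice fall out of the ordinary loop instead of being a special case, which is what the comment above refers to.

def split_at_indices(rows, split_indices):
    # Close the index list with len(rows) so the last slice needs no special case.
    indices = list(split_indices) + [len(rows)]
    pieces, prev = [], 0
    for index in indices:
        pieces.append(rows[prev:index])
        prev = index
    return pieces

# Splitting 10 rows at [3, 7] yields pieces of sizes 3, 4 and 3.
print(split_at_indices(list(range(10)), [3, 7]))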
Example #3
File: shuffle.py Project: rlan/ray
def _shuffle_map(block: Block, idx: int, output_num_blocks: int,
                 random_shuffle: bool,
                 random_seed: Optional[int]) -> List[Block]:
    block = BlockAccessor.for_block(block)

    # Randomize the distribution of records to blocks.
    if random_shuffle:
        seed_i = random_seed + idx if random_seed is not None else None
        block = block.random_shuffle(seed_i)
        block = BlockAccessor.for_block(block)

    slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
    slices = []
    for i in range(output_num_blocks):
        slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

    # Randomize the distribution order of the blocks (this matters when
    # some blocks are larger than others).
    if random_shuffle:
        random = np.random.RandomState(seed_i)
        random.shuffle(slices)

    num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
    assert num_rows == block.num_rows(), (num_rows, block.num_rows())
    # Needed to handle the num_returns=1 edge case in the Ray API.
    if len(slices) == 1:
        return slices[0]
    else:
        return slices
Example #4
def _sort_block(block, boundaries, key, descending):
    stats = BlockExecStats.builder()
    out = BlockAccessor.for_block(block).sort_and_partition(
        boundaries, key, descending)
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return out + [meta]
Example #5
 def aggregate_combined_blocks(
     blocks: List["pandas.DataFrame"], key: KeyFn, aggs: Tuple[AggregateFn]
 ) -> Tuple["pandas.DataFrame", BlockMetadata]:
     # TODO (kfstorm): A workaround to pass tests. Not efficient.
     block, metadata = ArrowBlockAccessor.aggregate_combined_blocks(
         [BlockAccessor.for_block(block).to_arrow() for block in blocks], key, aggs
     )
     return BlockAccessor.for_block(block).to_pandas(), metadata
Example #6
 def sort_and_partition(self, boundaries: List[T], key: "SortKeyT",
                        descending: bool) -> List["pandas.DataFrame"]:
     # TODO (kfstorm): A workaround to pass tests. Not efficient.
     delegated_result = BlockAccessor.for_block(
         self.to_arrow()).sort_and_partition(boundaries, key, descending)
     return [
         BlockAccessor.for_block(_).to_pandas() for _ in delegated_result
     ]
Example #7
 def merge_sorted_blocks(
         blocks: List["pandas.DataFrame"], key: "SortKeyT",
         _descending: bool) -> Tuple["pandas.DataFrame", BlockMetadata]:
     # TODO (kfstorm): A workaround to pass tests. Not efficient.
     block, metadata = ArrowBlockAccessor.merge_sorted_blocks(
         [BlockAccessor.for_block(block).to_arrow() for block in blocks],
         key,
         _descending,
     )
     return BlockAccessor.for_block(block).to_pandas(), metadata
Example #8
def _partition_and_combine_block(block: Block[T], boundaries: List[KeyType],
                                 key: GroupKeyT,
                                 aggs: Tuple[AggregateFn]) -> List[Block]:
    """Partition the block and combine rows with the same key."""
    if key is None:
        partitions = [block]
    else:
        partitions = BlockAccessor.for_block(block).sort_and_partition(
            boundaries, [(key, "ascending")] if isinstance(key, str) else key,
            descending=False)
    return [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions]
Example #9
def _format_batch(batch: Block, batch_format: str) -> BatchType:
    if batch_format == "native":
        batch = BlockAccessor.for_block(batch).to_native()
    elif batch_format == "pandas":
        batch = BlockAccessor.for_block(batch).to_pandas()
    elif batch_format == "pyarrow":
        batch = BlockAccessor.for_block(batch).to_arrow()
    elif batch_format == "numpy":
        batch = BlockAccessor.for_block(batch).to_numpy()
    else:
        raise ValueError(f"The given batch format: {batch_format} "
                         f"is invalid. Supported batch type: {BatchType}")
    return batch
Example #10
File: sort.py Project: tchordia/ray
 def map(
     idx: int,
     block: Block,
     output_num_blocks: int,
     boundaries: List[T],
     key: SortKeyT,
     descending: bool,
 ) -> List[Union[BlockMetadata, Block]]:
     stats = BlockExecStats.builder()
     out = BlockAccessor.for_block(block).sort_and_partition(
         boundaries, key, descending)
     meta = BlockAccessor.for_block(block).get_metadata(
         input_files=None, exec_stats=stats.build())
     return [meta] + out
Example #11
def _partition_and_combine_block(
        block: Block[T], boundaries: List[KeyType], key: KeyFn,
        aggs: Tuple[AggregateFn]) -> List[Union[Block, BlockMetadata]]:
    """Partition the block and combine rows with the same key."""
    stats = BlockExecStats.builder()
    if key is None:
        partitions = [block]
    else:
        partitions = BlockAccessor.for_block(block).sort_and_partition(
            boundaries, [(key, "ascending")] if isinstance(key, str) else key,
            descending=False)
    parts = [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions]
    meta = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build())
    return parts + [meta]
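For intuition, "combine rows with the same key" means folding each partition's rows into one partial aggregate per key, which the reduce stage later merges across blocks. Below is a minimal sketch with dict rows and a sum aggregation; both the row layout and the choice of aggregation are illustrative assumptions, not the AggregateFn machinery.

from collections import defaultdict

def combine_partition(rows, key):
    # Fold rows that share a key into a single partial aggregate (a sum here).
    combined = defaultdict(int)
    for row in rows:
        combined[row[key]] += row["value"]
    return dict(combined)

rows = [{"k": "a", "value": 1}, {"k": "b", "value": 2}, {"k": "a", "value": 3}]
print(combine_partition(rows, "k"))  # {'a': 4, 'b': 2}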
Example #12
    def __call__(self) -> MaybeBlockPartition:
        context = DatasetContext.get_current()
        result = self._read_fn()
        if not hasattr(result, "__iter__"):
            warnings.warn(
                "Read function must return Iterable[Block], got {}. "
                "Probably you need to return `[block]` instead of "
                "`block`.".format(result), DeprecationWarning)

        if context.block_splitting_enabled:
            partition: BlockPartition = []
            for block in result:
                metadata = BlockAccessor.for_block(block).get_metadata(
                    input_files=self._metadata.input_files,
                    exec_stats=None)  # No exec stats for the block splits.
                assert context.block_owner
                partition.append(
                    (ray.put(block, _owner=context.block_owner), metadata))
            if len(partition) == 0:
                raise ValueError("Read task must return non-empty list.")
            return partition
        else:
            builder = DelegatingBlockBuilder()
            for block in result:
                builder.add_block(block)
            return builder.build()
Example #13
 def has_batch(self) -> bool:
     """Whether this Batcher has any full batches."""
     return self._buffer and (
         self._batch_size is None
         or sum(BlockAccessor.for_block(b).num_rows() for b in self._buffer)
         >= self._batch_size
     )
Example #14
def from_items(items: List[Any], *, parallelism: int = 200) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> ray.data.from_items([1, 2, 3, 4, 5])

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.

    Returns:
        Dataset holding the items.
    """
    block_size = max(1, len(items) // parallelism)

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        builder = DelegatingArrowBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(input_files=None))
        i += block_size

    return Dataset(BlockList(blocks, metadata))
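Note the chunking arithmetic: block_size uses floor division, so the loop may emit a different number of blocks than the requested parallelism when the division is not exact. A quick standalone check with plain lists (no Ray involved):

items = list(range(11))
parallelism = 4
block_size = max(1, len(items) // parallelism)   # 11 // 4 = 2
chunks = [items[i:i + block_size] for i in range(0, len(items), block_size)]
print(len(chunks), [len(c) for c in chunks])     # 6 chunks of sizes 2, 2, 2, 2, 2, 1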
Example #15
 def reduce(
     key: KeyFn, aggs: Tuple[AggregateFn], *mapper_outputs: List[Block]
 ) -> (Block, BlockMetadata):
     """Aggregate sorted and partially combined blocks."""
     return BlockAccessor.for_block(mapper_outputs[0]).aggregate_combined_blocks(
         list(mapper_outputs), key, aggs
     )
Example #16
def _map_block_split(
    block: Block,
    block_fn: BlockTransform,
    input_files: List[str],
    fn: Optional[UDF],
    *fn_args,
    **fn_kwargs,
) -> BlockPartition:
    output = []
    stats = BlockExecStats.builder()
    if fn is not None:
        fn_args = (fn,) + fn_args
    for new_block in block_fn(block, *fn_args, **fn_kwargs):
        accessor = BlockAccessor.for_block(new_block)
        new_meta = BlockMetadata(
            num_rows=accessor.num_rows(),
            size_bytes=accessor.size_bytes(),
            schema=accessor.schema(),
            input_files=input_files,
            exec_stats=stats.build(),
        )
        owner = DatasetContext.get_current().block_owner
        output.append((ray.put(new_block, _owner=owner), new_meta))
        stats = BlockExecStats.builder()
    return output
Example #17
def _aggregate_combined_blocks(
    num_reducers: int, key: KeyFn, aggs: Tuple[AggregateFn], *blocks: Tuple[Block, ...]
) -> Tuple[Block[U], BlockMetadata]:
    """Aggregate sorted and partially combined blocks."""
    return BlockAccessor.for_block(blocks[0]).aggregate_combined_blocks(
        list(blocks), key, aggs
    )
Example #18
def _ndarray_to_block(ndarray: np.ndarray) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    block = BlockAccessor.batch_to_block(ndarray)
    metadata = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()
    )
    return block, metadata
Example #19
def _test_equal_split_balanced(block_sizes, num_splits):
    blocks = []
    metadata = []
    total_rows = 0
    for block_size in block_sizes:
        block = list(range(total_rows, total_rows + block_size))
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(None, None))
        total_rows += block_size
    block_list = BlockList(blocks, metadata)
    ds = Dataset(
        ExecutionPlan(block_list, DatasetStats.TODO()),
        0,
        False,
    )

    splits = ds.split(num_splits, equal=True)
    split_counts = [split.count() for split in splits]
    assert len(split_counts) == num_splits
    expected_block_size = total_rows // num_splits
    # Check that all splits are the expected size.
    assert all([count == expected_block_size for count in split_counts])
    expected_total_rows = sum(split_counts)
    # Check that the expected number of rows were dropped.
    assert total_rows - expected_total_rows == total_rows % num_splits
    # Check that all rows are unique (content check).
    split_rows = [row for split in splits for row in split.take(total_rows)]
    assert len(set(split_rows)) == len(split_rows)
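For concreteness, the drop-remainder arithmetic that the test asserts, on small numbers:

total_rows, num_splits = 10, 3
expected_block_size = total_rows // num_splits   # 3 rows per equal split
dropped = total_rows % num_splits                # 1 row is dropped
assert expected_block_size * num_splits + dropped == total_rows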
Example #20
 def build(self) -> Block:
     if self._builder is None:
         if self._empty_block is not None:
             self._builder = BlockAccessor.for_block(self._empty_block).builder()
         else:
             self._builder = ArrowBlockBuilder()
     return self._builder.build()
Example #21
def _ndarray_to_block(ndarray: np.ndarray) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    import pyarrow as pa
    from ray.data.extensions import TensorArray
    table = pa.Table.from_pydict({"value": TensorArray(ndarray)})
    return (table, BlockAccessor.for_block(table).get_metadata(
        input_files=None, exec_stats=stats.build()))
Example #22
 def write_block(write_path: str, block: Block):
     logger.debug(f"Writing {write_path} file.")
     fs = filesystem
     if isinstance(fs, _S3FileSystemWrapper):
         fs = fs.unwrap()
     with fs.open_output_stream(write_path) as f:
         _write_block_to_file(f, BlockAccessor.for_block(block))
Example #23
def test_sort_arrow_with_empty_blocks(ray_start_regular, use_push_based_shuffle):
    ctx = ray.data.context.DatasetContext.get_current()

    try:
        original = ctx.use_push_based_shuffle
        ctx.use_push_based_shuffle = use_push_based_shuffle

        assert (
            BlockAccessor.for_block(pa.Table.from_pydict({})).sample(10, "A").num_rows
            == 0
        )

        partitions = BlockAccessor.for_block(
            pa.Table.from_pydict({})
        ).sort_and_partition([1, 5, 10], "A", descending=False)
        assert len(partitions) == 4
        for partition in partitions:
            assert partition.num_rows == 0

        assert (
            BlockAccessor.for_block(pa.Table.from_pydict({}))
            .merge_sorted_blocks([pa.Table.from_pydict({})], "A", False)[0]
            .num_rows
            == 0
        )

        ds = ray.data.from_items(
            [{"A": (x % 3), "B": x} for x in range(3)], parallelism=3
        )
        ds = ds.filter(lambda r: r["A"] == 0)
        assert [row.as_pydict() for row in ds.sort("A").iter_rows()] == [
            {"A": 0, "B": 0}
        ]

        # Test empty dataset.
        ds = ray.data.range_table(10).filter(lambda r: r["value"] > 10)
        assert (
            len(
                ray.data._internal.sort.sample_boundaries(
                    ds._plan.execute().get_blocks(), "value", 3
                )
            )
            == 2
        )
        assert ds.sort("value").count() == 0
    finally:
        ctx.use_push_based_shuffle = original
Example #24
def _aggregate_combined_blocks(
        num_reducers: int, key: GroupKeyT, aggs: Tuple[AggregateFn],
        *blocks: Tuple[Block, ...]) -> Tuple[Block[U], BlockMetadata]:
    """Aggregate sorted and partially combined blocks."""
    if num_reducers == 1:
        blocks = [b[0] for b in blocks]  # Unwrap the num_returns=1 edge case in the Ray API.
    return BlockAccessor.for_block(blocks[0]).aggregate_combined_blocks(
        list(blocks), key, aggs)
Example #25
def _map_block_nosplit(block: Block, fn: Any,
                       input_files: List[str]) -> Tuple[Block, BlockMetadata]:
    builder = DelegatingArrowBlockBuilder()
    for new_block in fn(block):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(input_files=input_files)
Example #26
 def __init__(self):
     super().__init__(
         init=lambda k: 0,
         accumulate_block=(lambda a, block: a + BlockAccessor.for_block(
             block).num_rows()),
         merge=lambda a1, a2: a1 + a2,
         name="count()",
     )
Example #27
    def next_batch(self) -> Block:
        """Get the next shuffled batch from the shuffle buffer.

        Returns:
            A batch represented as a Block.
        """
        assert self.has_batch() or (self._done_adding and self.has_any())
        # Add rows in the builder to the shuffle buffer.
        if self._builder.num_rows() > 0:
            if self._shuffle_buffer is not None:
                if self._batch_head > 0:
                    # Compact the materialized shuffle buffer.
                    # TODO(Clark): If alternating between adding blocks and fetching
                    # shuffled batches, this aggressive compaction could be inefficient.
                    self._shuffle_buffer = BlockAccessor.for_block(
                        self._shuffle_buffer).take(
                            self._shuffle_indices[self._batch_head:])
                # Add the unyielded rows from the existing shuffle buffer.
                self._builder.add_block(self._shuffle_buffer)
            # Build the new shuffle buffer.
            self._shuffle_buffer = self._builder.build()
            # Reset the builder.
            self._builder = DelegatingBlockBuilder()
            # Invalidate the shuffle indices.
            self._shuffle_indices = None
            self._batch_head = 0

        assert self._shuffle_buffer is not None
        buffer_size = BlockAccessor.for_block(self._shuffle_buffer).num_rows()
        # Truncate the batch to the buffer size, if necessary.
        batch_size = min(self._batch_size, buffer_size)

        if self._shuffle_indices is None:
            # Need to generate new shuffle indices.
            self._shuffle_indices = list(range(buffer_size))
            random.shuffle(self._shuffle_indices)

        # Get the shuffle indices for this batch.
        batch_indices = self._shuffle_indices[
            self._batch_head:self._batch_head + batch_size]
        self._batch_head += batch_size
        # Yield the shuffled batch.
        return BlockAccessor.for_block(
            self._shuffle_buffer).take(batch_indices)
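The control flow above reduces to three pieces of state: a materialized buffer, a shuffled index list, and a head pointer that advances by batch_size on each call. A minimal standalone sketch of that idea follows; the class and its names are illustrative, not the Ray shuffle buffer.

import random

class TinyShuffleBuffer:
    def __init__(self, rows, batch_size, seed=None):
        self._rows = list(rows)
        self._batch_size = batch_size
        # Shuffle the indices once; batches are served by advancing a head pointer.
        self._indices = list(range(len(self._rows)))
        random.Random(seed).shuffle(self._indices)
        self._head = 0

    def next_batch(self):
        # Truncate the final batch to whatever is left in the buffer.
        batch_size = min(self._batch_size, len(self._indices) - self._head)
        picked = self._indices[self._head:self._head + batch_size]
        self._head += batch_size
        return [self._rows[i] for i in picked]

buf = TinyShuffleBuffer(range(10), batch_size=4, seed=0)
print(buf.next_batch(), buf.next_batch(), buf.next_batch())  # 4, 4, then 2 rows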
Example #28
 def process_block(self, block: Block,
                   meta: BlockMetadata) -> (Block, BlockMetadata):
     new_block = fn(block)
     accessor = BlockAccessor.for_block(new_block)
     new_metadata = BlockMetadata(num_rows=accessor.num_rows(),
                                  size_bytes=accessor.size_bytes(),
                                  schema=accessor.schema(),
                                  input_files=meta.input_files)
     return new_block, new_metadata
Example #29
    def add(self, block: Block):
        """Add a block to the block buffer.

        Args:
            block: Block to add to the block buffer.
        """
        assert self.can_add(block)
        self._buffer.append(block)
        self._buffer_size += BlockAccessor.for_block(block).num_rows()
Example #30
 def reduce(
     key: SortKeyT,
     descending: bool,
     *mapper_outputs: List[Block],
     partial_reduce: bool = False,
 ) -> (Block, BlockMetadata):
     return BlockAccessor.for_block(mapper_outputs[0]).merge_sorted_blocks(
         mapper_outputs, key, descending
     )