Example #1
    def map(
        idx: int,
        block: Block,
        output_num_blocks: int,
        block_udf: Optional[Callable[[Block], Iterable[Block]]],
        random_shuffle: bool,
        random_seed: Optional[int],
    ) -> List[Union[BlockMetadata, Block]]:
        stats = BlockExecStats.builder()
        if block_udf:
            # TODO(ekl) note that this effectively disables block splitting.
            blocks = list(block_udf(block))
            if len(blocks) > 1:
                builder = BlockAccessor.for_block(blocks[0]).builder()
                for b in blocks:
                    builder.add_block(b)
                block = builder.build()
            else:
                block = blocks[0]
        block = BlockAccessor.for_block(block)

        # Randomize the distribution of records to blocks.
        if random_shuffle:
            seed_i = random_seed + idx if random_seed is not None else None
            block = block.random_shuffle(seed_i)
            block = BlockAccessor.for_block(block)

        slice_sz = max(1, math.ceil(block.num_rows() / output_num_blocks))
        slices = []
        for i in range(output_num_blocks):
            slices.append(block.slice(i * slice_sz, (i + 1) * slice_sz, copy=True))

        # Randomize the distribution order of the blocks (this prevents empty
        # outputs when input blocks are very small).
        if random_shuffle:
            random = np.random.RandomState(seed_i)
            random.shuffle(slices)

        num_rows = sum(BlockAccessor.for_block(s).num_rows() for s in slices)
        assert num_rows == block.num_rows(), (num_rows, block.num_rows())
        metadata = block.get_metadata(input_files=None, exec_stats=stats.build())
        return [metadata] + slices
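
A minimal standalone sketch of the same split-then-shuffle pattern, using only numpy and the standard library (no Ray block types); the function name and values are illustrative, not part of the API above.

import math

import numpy as np


def split_and_shuffle(rows: np.ndarray, output_num_blocks: int, seed: int = 0):
    # Cut the input into contiguous chunks of ceil(n / output_num_blocks) rows,
    # mirroring the slice_sz computation above.
    slice_sz = max(1, math.ceil(len(rows) / output_num_blocks))
    slices = [rows[i * slice_sz:(i + 1) * slice_sz] for i in range(output_num_blocks)]
    # Shuffle the order of the slices so very small inputs don't always leave
    # the trailing outputs empty.
    np.random.RandomState(seed).shuffle(slices)
    return slices


print(split_and_shuffle(np.arange(10), output_num_blocks=4, seed=42))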
Example #2
def _ndarray_to_block(ndarray: np.ndarray) -> Block[np.ndarray]:
    stats = BlockExecStats.builder()
    import pyarrow as pa
    from ray.data.extensions import TensorArray

    table = pa.Table.from_pydict({"value": TensorArray(ndarray)})
    return (
        table,
        BlockAccessor.for_block(table).get_metadata(input_files=None,
                                                    exec_stats=stats.build()),
    )
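
For multi-dimensional arrays, Ray's TensorArray extension is what makes the single "value" column possible; for the simpler 1-D case, plain pyarrow illustrates the same table shape. A hedged, standalone sketch:

import numpy as np
import pyarrow as pa

# One column named "value", one row per array element (1-D only; TensorArray
# is needed for higher-dimensional ndarrays).
arr = np.arange(5)
table = pa.Table.from_pydict({"value": pa.array(arr)})
print(table.num_rows, table.schema)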
Example #3
 def reduce(random_shuffle: bool, random_seed: Optional[int],
            *mapper_outputs: List[Block]) -> (Block, BlockMetadata):
     stats = BlockExecStats.builder()
     builder = DelegatingBlockBuilder()
     for block in mapper_outputs:
         builder.add_block(block)
     new_block = builder.build()
     accessor = BlockAccessor.for_block(new_block)
     if random_shuffle:
         new_block = accessor.random_shuffle(random_seed)
         accessor = BlockAccessor.for_block(new_block)
     new_metadata = BlockMetadata(
         num_rows=accessor.num_rows(),
         size_bytes=accessor.size_bytes(),
         schema=accessor.schema(),
         input_files=None,
         exec_stats=stats.build(),
     )
     return new_block, new_metadata
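
A standalone sketch of the same reduce step, assuming Arrow-format mapper outputs: concatenate the partial blocks, then optionally shuffle row order with a seeded permutation. Names are illustrative.

from typing import Optional

import numpy as np
import pyarrow as pa


def reduce_arrow(*mapper_outputs: pa.Table, seed: Optional[int] = None) -> pa.Table:
    # Merge the partial blocks into one table.
    merged = pa.concat_tables(mapper_outputs)
    if seed is not None:
        # Row-level shuffle via a seeded permutation of row indices.
        indices = np.random.RandomState(seed).permutation(merged.num_rows)
        merged = merged.take(indices)
    return merged


a = pa.table({"x": [1, 2]})
b = pa.table({"x": [3, 4]})
print(reduce_arrow(a, b, seed=0)["x"].to_pylist())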
Example #4
def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
    stats = BlockExecStats.builder()
    import pyarrow as pa

    block = pa.table(df)
    return (
        block,
        BlockAccessor.for_block(block).get_metadata(
            input_files=None, exec_stats=stats.build()
        ),
    )
Example #5
def _shuffle_reduce(*mapper_outputs: List[Block]) -> (Block, BlockMetadata):
    builder = DelegatingArrowBlockBuilder()
    for block in mapper_outputs:
        builder.add_block(block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    new_metadata = BlockMetadata(num_rows=accessor.num_rows(),
                                 size_bytes=accessor.size_bytes(),
                                 schema=accessor.schema(),
                                 input_files=None)
    return new_block, new_metadata
Example #6
    def _write_block(
        self,
        f: "pyarrow.NativeFile",
        block: BlockAccessor,
        writer_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
        **writer_args,
    ):
        import pyarrow.parquet as pq

        writer_args = _resolve_kwargs(writer_args_fn, **writer_args)
        pq.write_table(block.to_arrow(), f, **writer_args)
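
A hedged usage sketch (not Ray's datasource plumbing): pq.write_table accepts either a path or an open output stream, so a NativeFile obtained from a pyarrow filesystem works the same way. The output path here is illustrative.

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import fs

table = pa.table({"value": [1, 2, 3]})
local = fs.LocalFileSystem()
# Write through a NativeFile, as the datasource does with its filesystem.
with local.open_output_stream("/tmp/example_block.parquet") as f:
    pq.write_table(table, f, compression="snappy")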
Example #7
 def _buffer_size(self) -> int:
     """Return shuffle buffer size."""
     buffer_size = self._builder.num_rows()
     if self._shuffle_buffer is not None:
         # Include the size of the concrete (materialized) shuffle buffer, adjusting
         # for the batch head position, which also serves as a counter of the number
         # of already-yielded rows from the current concrete shuffle buffer.
         buffer_size += (
             BlockAccessor.for_block(self._shuffle_buffer).num_rows() -
             self._batch_head)
     return buffer_size
Example #8
        def write_block(write_path: str, block: Block):
            logger.debug(f"Writing {write_path} file.")
            fs = filesystem
            if isinstance(fs, _S3FileSystemWrapper):
                fs = fs.unwrap()
            if _block_udf is not None:
                block = _block_udf(block)

            with fs.open_output_stream(write_path, **open_stream_args) as f:
                _write_block_to_file(f, BlockAccessor.for_block(block),
                                     **write_args)
Example #9
 def vectorized_mean(block: Block[T]) -> AggType:
     block_acc = BlockAccessor.for_block(block)
     count = block_acc.count(on)
     if count == 0 or count is None:
         # Empty or all null.
         return None
     sum_ = block_acc.sum(on, ignore_nulls)
     if sum_ is None:
         # ignore_nulls=False and at least one null.
         return None
     return [sum_, count]
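
The reason the accumulator is a [sum, count] pair rather than a mean: partial pairs from different blocks merge by element-wise addition, and the division happens only at finalize time. A standalone sketch (function names are illustrative):

def merge_mean_partials(a, b):
    # None marks an empty/all-null block; the other partial wins.
    if a is None:
        return b
    if b is None:
        return a
    return [a[0] + b[0], a[1] + b[1]]


def finalize_mean(acc):
    return None if acc is None else acc[0] / acc[1]


p1 = [6.0, 3]   # sum=6 over 3 rows
p2 = [10.0, 2]  # sum=10 over 2 rows
print(finalize_mean(merge_mean_partials(p1, p2)))  # 16.0 / 5 = 3.2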
Example #10
 def add_block(self, block: Any) -> None:
     if not isinstance(block, self._block_type):
          raise TypeError(
              f"Got a block of type {type(block)}, expected {self._block_type}. "
              "If you are mapping a function, ensure it returns an "
              "object with the expected type. Block:\n"
              f"{block}")
     accessor = BlockAccessor.for_block(block)
     self._tables.append(block)
     self._tables_size_bytes += accessor.size_bytes()
     self._num_rows += accessor.num_rows()
Example #11
def from_items(items: List[Any], *, parallelism: int = -1) -> Dataset[Any]:
    """Create a dataset from a list of local Python objects.

    Examples:
        >>> import ray
        >>> ds = ray.data.from_items([1, 2, 3, 4, 5]) # doctest: +SKIP
        >>> ds # doctest: +SKIP
        Dataset(num_blocks=5, num_rows=5, schema=<class 'int'>)
        >>> ds.take(2) # doctest: +SKIP
        [1, 2]

    Args:
        items: List of local Python objects.
        parallelism: The amount of parallelism to use for the dataset.
            Parallelism may be limited by the number of items.

    Returns:
        Dataset holding the items.
    """

    detected_parallelism, _ = _autodetect_parallelism(
        parallelism,
        ray.util.get_current_placement_group(),
        DatasetContext.get_current(),
    )
    block_size = max(
        1,
        len(items) // detected_parallelism,
    )

    blocks: List[ObjectRef[Block]] = []
    metadata: List[BlockMetadata] = []
    i = 0
    while i < len(items):
        stats = BlockExecStats.builder()
        builder = DelegatingBlockBuilder()
        for item in items[i:i + block_size]:
            builder.add(item)
        block = builder.build()
        blocks.append(ray.put(block))
        metadata.append(
            BlockAccessor.for_block(block).get_metadata(
                input_files=None, exec_stats=stats.build()))
        i += block_size

    return Dataset(
        ExecutionPlan(
            BlockList(blocks, metadata),
            DatasetStats(stages={"from_items": metadata}, parent=None),
        ),
        0,
        False,
    )
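
A standalone sketch of the block-size math above: items are chunked into blocks of max(1, len(items) // parallelism) elements, which (as in the loop above) can yield one more block than the requested parallelism.

def chunk_items(items, parallelism):
    block_size = max(1, len(items) // parallelism)
    return [items[i:i + block_size] for i in range(0, len(items), block_size)]


print(chunk_items(list(range(7)), parallelism=3))  # [[0, 1], [2, 3], [4, 5], [6]]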
Example #12
def _format_batch(batch: Block, batch_format: str) -> BatchType:
    import pyarrow as pa

    if batch_format == "native":
        # Always promote Arrow blocks to pandas for consistency, since
        # we lazily convert pandas->Arrow internally for efficiency.
        if isinstance(batch, pa.Table) or isinstance(batch, bytes):
            batch = BlockAccessor.for_block(batch)
            batch = batch.to_pandas()
        return batch
    elif batch_format == "pandas":
        batch = BlockAccessor.for_block(batch)
        return batch.to_pandas()
    elif batch_format == "pyarrow":
        batch = BlockAccessor.for_block(batch)
        return batch.to_arrow()
    else:
        raise ValueError(
            f"The given batch format: {batch_format} "
            f"is invalid. Supported batch type: {BatchType}"
        )
Example #13
def _map_block_split(block: Block, fn: Any,
                     input_files: List[str]) -> BlockPartition:
    output = []
    for new_block in fn(block):
        accessor = BlockAccessor.for_block(new_block)
        new_meta = BlockMetadata(num_rows=accessor.num_rows(),
                                 size_bytes=accessor.size_bytes(),
                                 schema=accessor.schema(),
                                 input_files=input_files)
        owner = DatasetContext.get_current().block_owner
        output.append((ray.put(new_block, _owner=owner), new_meta))
    return output
Example #14
    def _write_block(
        self,
        f: "pyarrow.NativeFile",
        block: BlockAccessor,
        writer_args_fn: Callable[[], Dict[str, Any]] = lambda: {},
        **writer_args,
    ):
        from pyarrow import csv

        writer_args = _resolve_kwargs(writer_args_fn, **writer_args)
        write_options = writer_args.pop("write_options", None)
        csv.write_csv(block.to_arrow(), f, write_options, **writer_args)
Example #15
 def _get(self, block_index, key):
     if block_index is None:
         return None
     block = self.blocks[block_index]
     column = block[self.key_field]
     if self.dataset_format == "arrow":
         column = _ArrowListWrapper(column)
     i = _binary_search_find(column, key)
     if i is None:
         return None
     acc = BlockAccessor.for_block(block)
     return acc._create_table_row(acc.slice(i, i + 1, copy=True))
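
_binary_search_find is Ray-internal; a hedged standalone equivalent using the standard library shows the idea: the block's key column is assumed sorted, so a binary search locates the row (or reports a miss) in O(log n).

import bisect


def binary_search_find(column, key):
    i = bisect.bisect_left(column, key)
    if i < len(column) and column[i] == key:
        return i
    return None  # key not present


print(binary_search_find([1, 3, 5, 9], 5))  # 2
print(binary_search_find([1, 3, 5, 9], 4))  # None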
Example #16
def _map_block_nosplit(
    block: Block, fn: Any, input_files: List[str]
) -> Tuple[Block, BlockMetadata]:
    stats = BlockExecStats.builder()
    builder = DelegatingBlockBuilder()
    for new_block in fn(block):
        builder.add_block(new_block)
    new_block = builder.build()
    accessor = BlockAccessor.for_block(new_block)
    return new_block, accessor.get_metadata(
        input_files=input_files, exec_stats=stats.build()
    )
Example #17
def test_sort_arrow_with_empty_blocks(ray_start_regular,
                                      use_push_based_shuffle):
    ctx = ray.data.context.DatasetContext.get_current()

    try:
        original = ctx.use_push_based_shuffle
        ctx.use_push_based_shuffle = use_push_based_shuffle

        assert (BlockAccessor.for_block(pa.Table.from_pydict({})).sample(
            10, "A").num_rows == 0)

        partitions = BlockAccessor.for_block(pa.Table.from_pydict(
            {})).sort_and_partition([1, 5, 10], "A", descending=False)
        assert len(partitions) == 4
        for partition in partitions:
            assert partition.num_rows == 0

        assert (BlockAccessor.for_block(pa.Table.from_pydict(
            {})).merge_sorted_blocks([pa.Table.from_pydict({})], "A",
                                     False)[0].num_rows == 0)

        ds = ray.data.from_items([{
            "A": (x % 3),
            "B": x
        } for x in range(3)],
                                 parallelism=3)
        ds = ds.filter(lambda r: r["A"] == 0)
        assert [row.as_pydict() for row in ds.sort("A").iter_rows()] == [{
            "A": 0,
            "B": 0
        }]

        # Test empty dataset.
        ds = ray.data.range_table(10).filter(lambda r: r["value"] > 10)
        assert (len(
            ray.data.impl.sort.sample_boundaries(
                ds._plan.execute().get_blocks(), "value", 3)) == 2)
        assert ds.sort("value").count() == 0
    finally:
        ctx.use_push_based_shuffle = original
Example #18
 def zip(self, other: "Block[T]") -> "Block[T]":
     acc = BlockAccessor.for_block(other)
     if not isinstance(acc, type(self)):
         raise ValueError(
             "Cannot zip {} with block of type {}".format(type(self), type(other))
         )
     if acc.num_rows() != self.num_rows():
         raise ValueError(
             "Cannot zip self (length {}) with block of length {}".format(
                 self.num_rows(), acc.num_rows()
             )
         )
     return self._zip(acc)
Example #19
 def vectorized_std(block: Block[T]) -> AggType:
     block_acc = BlockAccessor.for_block(block)
     count = block_acc.count(on)
     if count == 0 or count is None:
         # Empty or all null.
         return None
     sum_ = block_acc.sum(on, ignore_nulls)
     if sum_ is None:
         # ignore_nulls=False and at least one null.
         return None
     mean = sum_ / count
     M2 = block_acc.sum_of_squared_diffs_from_mean(on, ignore_nulls, mean)
     return [M2, mean, count]
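
What the [M2, mean, count] partials enable is the standard parallel-variance merge (Chan et al.), which combines per-block results without a second pass over the data. A hedged standalone sketch; Ray's own merge/finalize code is not shown in these snippets.

def merge_std_partials(a, b):
    M2_a, mean_a, n_a = a
    M2_b, mean_b, n_b = b
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    # Combine sums of squared differences from the (shifted) means.
    M2 = M2_a + M2_b + delta ** 2 * n_a * n_b / n
    return [M2, mean, n]


def finalize_std(acc, ddof=1):
    M2, _, n = acc
    return (M2 / (n - ddof)) ** 0.5


p1 = [2.0, 2.0, 3]    # M2, mean, count for the block [1, 2, 3]
p2 = [50.0, 15.0, 2]  # M2, mean, count for the block [10, 20]
print(finalize_std(merge_std_partials(p1, p2)))  # sample std of [1, 2, 3, 10, 20]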
Example #20
        def group_fn(batch):
            block_accessor = BlockAccessor.for_block(batch)
            boundaries = get_boundaries(block_accessor)
            builder = block_accessor.builder()
            start = 0
            for end in boundaries:
                group = block_accessor.slice(start, end, False)
                applied = fn(group)
                builder.add_block(applied)
                start = end

            rs = builder.build()
            return rs
Example #21
 def _get_write_path_for_block(self,
                               base_path,
                               *,
                               filesystem=None,
                               dataset_uuid=None,
                               block=None,
                               block_index=None,
                               file_format=None):
     num_rows = BlockAccessor.for_block(ray.get(block)).num_rows()
     suffix = f"{block_index:06}_{num_rows:02}_{dataset_uuid}" \
              f".test.{file_format}"
     print(f"Writing to: {base_path}/{suffix}")
     return f"{base_path}/{suffix}"
Example #22
 def map(
     idx: int,
     block: Block,
     output_num_blocks: int,
     boundaries: List[KeyType],
     key: KeyFn,
     aggs: Tuple[AggregateFn],
 ) -> List[Union[BlockMetadata, Block]]:
     """Partition the block and combine rows with the same key."""
     stats = BlockExecStats.builder()
     if key is None:
         partitions = [block]
     else:
         partitions = BlockAccessor.for_block(block).sort_and_partition(
             boundaries,
             [(key, "ascending")] if isinstance(key, str) else key,
             descending=False,
         )
     parts = [BlockAccessor.for_block(p).combine(key, aggs) for p in partitions]
     meta = BlockAccessor.for_block(block).get_metadata(
         input_files=None, exec_stats=stats.build()
     )
     return [meta] + parts
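
A hedged standalone sketch of the sort_and_partition idea for a single key column: sort, then cut at the boundary values with np.searchsorted so each output partition covers one key range.

import numpy as np


def sort_and_partition(keys, boundaries):
    keys = np.sort(np.asarray(keys))
    # searchsorted gives the cut points; np.split turns them into partitions.
    cuts = np.searchsorted(keys, boundaries)
    return np.split(keys, cuts)


print(sort_and_partition([7, 1, 5, 3, 9], boundaries=[4, 8]))
# [array([1, 3]), array([5, 7]), array([9])]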
Example #23
 def _zip(self, acc: BlockAccessor) -> "Block[T]":
     r = self.to_arrow()
     s = acc.to_arrow()
     for col_name in s.column_names:
         col = s.column(col_name)
         # Ensure the column names are unique after zip.
         if col_name in r.column_names:
             i = 1
             new_name = col_name
             while new_name in r.column_names:
                 new_name = "{}_{}".format(col_name, i)
                 i += 1
             col_name = new_name
         r = r.append_column(col_name, col)
     return r
Example #24
 def _zip(self, acc: BlockAccessor) -> "pandas.DataFrame":
     r = self.to_pandas().copy(deep=False)
     s = acc.to_pandas()
     for col_name in s.columns:
         col = s[col_name]
         # Ensure the column names are unique after zip.
         if col_name in r.columns:
             i = 1
             new_name = col_name
             while new_name in r.columns:
                 new_name = "{}_{}".format(col_name, i)
                 i += 1
             col_name = new_name
         r[col_name] = col
     return r
Example #25
 def process_block(
     self, block: Block, input_files: List[str]
 ) -> Iterable[Tuple[Block, BlockMetadata]]:
     output = []
     for new_block in fn(block):
         accessor = BlockAccessor.for_block(new_block)
         new_metadata = BlockMetadata(
             num_rows=accessor.num_rows(),
             size_bytes=accessor.size_bytes(),
             schema=accessor.schema(),
             input_files=input_files)
         owner = DatasetContext.get_current().block_owner
         output.append((ray.put(new_block,
                                _owner=owner), new_metadata))
     return output
Example #26
 def _get_write_path_for_block(
     self,
     base_path,
     *,
     filesystem=None,
     dataset_uuid=None,
     block=None,
     block_index=None,
     file_format=None,
 ):
     num_rows = BlockAccessor.for_block(ray.get(block)).num_rows()
     suffix = (
         f"{block_index:06}_{num_rows:02}_{dataset_uuid}" f".test.{file_format}"
     )
     return posixpath.join(base_path, suffix)
Example #27
    def __init__(self, on: Optional[KeyFn] = None, ignore_nulls: bool = True):
        self._set_key_fn(on)

        null_merge = _null_wrap_merge(ignore_nulls, max)

        super().__init__(
            init=_null_wrap_init(lambda k: float("-inf")),
            merge=null_merge,
            accumulate_block=_null_wrap_accumulate_block(
                ignore_nulls,
                lambda block: BlockAccessor.for_block(block).max(on, ignore_nulls),
                null_merge,
            ),
            finalize=_null_wrap_finalize(lambda a: a),
            name=(f"max({str(on)})"),
        )
Example #28
 def multiget(self, block_indices, keys):
     start = time.perf_counter()
     if self.dataset_format == "arrow" and len(set(block_indices)) == 1:
         # Fast path: use np.searchsorted for vectorized search on a single block.
         # This is ~3x faster than the naive case.
         block = self.blocks[block_indices[0]]
         col = block[self.key_field]
         indices = np.searchsorted(col, keys)
         acc = BlockAccessor.for_block(block)
         result = [acc._get_row(i, copy=True) for i in indices]
         # assert result == [self._get(i, k) for i, k in zip(block_indices, keys)]
     else:
         result = [self._get(i, k) for i, k in zip(block_indices, keys)]
     self.total_time += time.perf_counter() - start
     self.num_accesses += 1
     return result
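
A standalone numpy sketch of the fast path above: when all requested keys fall in one sorted block, np.searchsorted replaces per-key binary searches with a single vectorized call.

import numpy as np

sorted_keys = np.array([1, 3, 5, 7, 9])
values = np.array(["a", "b", "c", "d", "e"])
lookups = np.array([5, 1, 9])
# One vectorized binary search for all lookups at once.
indices = np.searchsorted(sorted_keys, lookups)
print(values[indices].tolist())  # ['c', 'a', 'e']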
Example #29
    def remote_read(i: int, task: ReadTask) -> MaybeBlockPartition:
        DatasetContext._set_current(context)
        stats = BlockExecStats.builder()

        # Execute the read task.
        block = task()

        if context.block_splitting_enabled:
            metadata = task.get_metadata()
            metadata.exec_stats = stats.build()
        else:
            metadata = BlockAccessor.for_block(block).get_metadata(
                input_files=task.get_metadata().input_files,
                exec_stats=stats.build())
        stats_actor.record_task.remote(stats_uuid, i, metadata)
        return block
Example #30
        def make_block(start: int, count: int) -> Block:
            if block_format == "arrow":
                import pyarrow as pa

                return pa.Table.from_arrays([np.arange(start, start + count)],
                                            names=["value"])
            elif block_format == "tensor":
                import pyarrow as pa

                tensor = np.ones(tensor_shape,
                                 dtype=np.int64) * np.expand_dims(
                                     np.arange(start, start + count),
                                     tuple(range(1, 1 + len(tensor_shape))),
                                 )
                return BlockAccessor.batch_to_block(tensor)
            else:
                return list(builtins.range(start, start + count))