Example #1
def add(self, item: Any) -> None:
    if self._builder is None:
        if isinstance(item, dict) or isinstance(item, ArrowRow):
            import pyarrow

            # Probe with a throwaway builder: commit to the Arrow format
            # only if this first item is actually representable in Arrow.
            try:
                check = ArrowBlockBuilder()
                check.add(item)
                check.build()
                self._builder = ArrowBlockBuilder()
            except (TypeError, pyarrow.lib.ArrowInvalid):
                self._builder = SimpleBlockBuilder()
        else:
            self._builder = SimpleBlockBuilder()
    self._builder.add(item)
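A minimal usage sketch, assuming this add method belongs to Ray's DelegatingBlockBuilder (the builder classes are Ray internals from ray.data.impl; the driver code below is hypothetical):

b = DelegatingBlockBuilder()
b.add({"a": 1})    # first item is a dict -> the Arrow probe succeeds
b.add({"a": 2})    # later items go straight to the chosen builder
block = b.build()  # an Arrow-format block, given the probe succeeded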
Example #2
    def add(self, item: Any) -> None:
        if self._builder is None:
            # TODO (kfstorm): Maybe we can use Pandas block format for dict.
            if isinstance(item, dict) or isinstance(item, ArrowRow):
                import pyarrow

                # Trial-add the first item to a throwaway builder: commit to
                # the Arrow format only if Arrow accepts the item.
                try:
                    check = ArrowBlockBuilder()
                    check.add(item)
                    check.build()
                    self._builder = ArrowBlockBuilder()
                except (TypeError, pyarrow.lib.ArrowInvalid):
                    self._builder = SimpleBlockBuilder()
            elif isinstance(item, PandasRow):
                # Pandas rows get a native Pandas block.
                self._builder = PandasBlockBuilder()
            else:
                self._builder = SimpleBlockBuilder()
        self._builder.add(item)
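The probe-and-fallback is the notable design choice here: rather than inspecting the first item's schema, the builder simply trial-adds it to a disposable ArrowBlockBuilder and catches the failure. A hedged sketch of the fallback path (hypothetical values; the builder classes are Ray internals):

b = DelegatingBlockBuilder()
b.add({"x": object()})  # pyarrow cannot convert a bare object(), so the
                        # probe raises ArrowInvalid and the builder falls
                        # back to SimpleBlockBuilder
block = b.build()       # a plain Python-list block, not an Arrow Table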
Example #3
def test_py_size_diff_values(ray_start_regular_shared):
    b = SimpleBlockBuilder()
    assert b.get_estimated_memory_usage() == 0
    for _ in range(10):
        b.add(LARGE_VALUE)
    assert_close(b.get_estimated_memory_usage(), 100120)
    for _ in range(100):
        b.add(SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 121120)
    for _ in range(100):
        b.add(LARGE_VALUE)
    assert_close(b.get_estimated_memory_usage(), 1166875)
    for _ in range(100):
        b.add(LARGE_VALUE)
    assert_close(b.get_estimated_memory_usage(), 2182927)
    b.add_block([SMALL_VALUE] * 1000)
    assert_close(b.get_estimated_memory_usage(), 2240613)
    assert len(b.build()) == 1310
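assert_close is a test helper defined elsewhere in the module; the builder's memory figures are estimates extrapolated from sampled object sizes, so exact equality would be too brittle. A minimal sketch of what such a helper presumably looks like (the tolerance value is an assumption):

def assert_close(actual, expected, tolerance=0.3):
    # Hypothetical helper: accept a bounded relative error in the estimate.
    assert abs(actual - expected) / expected < tolerance, (actual, expected)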
Example #4
def test_py_size(ray_start_regular_shared):
    b = SimpleBlockBuilder()
    assert b.get_estimated_memory_usage() == 0
    b.add(SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 111)
    b.add(SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 222)
    for _ in range(8):
        b.add(SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 1110)
    for _ in range(90):
        b.add(SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 11100)
    b.add_block([SMALL_VALUE] * 900)
    assert_close(b.get_estimated_memory_usage(), 111000)
    assert len(b.build()) == 1000
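SMALL_VALUE and LARGE_VALUE are fixtures defined at module scope in the original test file. Plausible stand-ins that reproduce the small/large size gap the assertions rely on (hypothetical values, not the originals):

# Hypothetical fixtures: any payloads with a large size gap would do.
SMALL_VALUE = "a" * 100
LARGE_VALUE = "a" * 10000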
Example #5
def fast_repartition(blocks, num_blocks):
    # These live at module scope in the original source; Ray-internal import
    # paths as of the Ray 1.x tree this snippet appears to come from.
    import ray
    from ray.data.block import BlockAccessor
    from ray.data.dataset import Dataset
    from ray.data.impl.block_list import BlockList
    from ray.data.impl.plan import ExecutionPlan
    from ray.data.impl.progress_bar import ProgressBar
    from ray.data.impl.remote_fn import cached_remote_fn
    from ray.data.impl.shuffle import _ShufflePartitionOp
    from ray.data.impl.stats import DatasetStats

    # Wrap the raw block list in a Dataset so split_at_indices() can be reused.
    wrapped_ds = Dataset(
        ExecutionPlan(blocks, DatasetStats(stages={}, parent=None)),
        0,
        lazy=False,
    )
    # Compute the (n-1) indices needed for an equal split of the data.
    count = wrapped_ds.count()
    dataset_format = wrapped_ds._dataset_format()
    indices = []
    cur_idx = 0
    for _ in range(num_blocks - 1):
        cur_idx += count / num_blocks
        indices.append(int(cur_idx))
    assert len(indices) < num_blocks, (indices, num_blocks)
    if indices:
        splits = wrapped_ds.split_at_indices(indices)
    else:
        splits = [wrapped_ds]
    # TODO(ekl) include stats for the split tasks. We may also want to
    # consider combining the split and coalesce tasks as an optimization.

    # Coalesce each split into a single block.
    reduce_task = cached_remote_fn(
        _ShufflePartitionOp.reduce).options(num_returns=2)
    reduce_bar = ProgressBar("Repartition", position=0, total=len(splits))
    reduce_out = [
        reduce_task.remote(False, None, *s.get_internal_block_refs())
        for s in splits if s.num_blocks() > 0
    ]

    # Early-release memory.
    del splits, blocks, wrapped_ds

    new_blocks, new_metadata = zip(*reduce_out)
    new_blocks, new_metadata = list(new_blocks), list(new_metadata)
    new_metadata = reduce_bar.fetch_until_complete(new_metadata)
    reduce_bar.close()

    # Handle empty blocks.
    if len(new_blocks) < num_blocks:
        from ray.data.impl.arrow_block import ArrowBlockBuilder
        from ray.data.impl.pandas_block import PandasBlockBuilder
        from ray.data.impl.simple_block import SimpleBlockBuilder

        num_empties = num_blocks - len(new_blocks)
        if dataset_format == "arrow":
            builder = ArrowBlockBuilder()
        elif dataset_format == "pandas":
            builder = PandasBlockBuilder()
        else:
            builder = SimpleBlockBuilder()
        empty_block = builder.build()
        empty_meta = BlockAccessor.for_block(empty_block).get_metadata(
            input_files=None, exec_stats=None)  # No stats for empty block.
        empty_blocks, empty_metadata = zip(*[(ray.put(empty_block), empty_meta)
                                             for _ in range(num_empties)])
        new_blocks += empty_blocks
        new_metadata += empty_metadata

    return BlockList(new_blocks, new_metadata), {}
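At the user level, this function presumably backs the non-shuffle path of Dataset.repartition() in this Ray version. A quick sanity check under that assumption:

import ray

ds = ray.data.range(1000)     # small in-memory dataset
ds = ds.repartition(10)       # presumably dispatches to fast_repartition
assert ds.num_blocks() == 10  # data split and coalesced into exactly 10 blocks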