def build(self) -> Block:
    """Finalize and return the built block.

    If nothing was ever added, fall back to the stored empty block's
    format when one exists, otherwise default to an Arrow block.
    """
    if self._builder is None:
        if self._empty_block is None:
            fallback = ArrowBlockBuilder()
        else:
            fallback = BlockAccessor.for_block(self._empty_block).builder()
        self._builder = fallback
    return self._builder.build()
def add(self, item: Any) -> None:
    """Append one row, choosing a concrete builder on first use."""
    if self._builder is None:
        if isinstance(item, (dict, ArrowRow)):
            import pyarrow
            try:
                # Probe whether Arrow can actually represent this item
                # before committing to the Arrow format.
                probe = ArrowBlockBuilder()
                probe.add(item)
                probe.build()
                self._builder = ArrowBlockBuilder()
            except (TypeError, pyarrow.lib.ArrowInvalid):
                self._builder = SimpleBlockBuilder()
        else:
            self._builder = SimpleBlockBuilder()
    self._builder.add(item)
class DelegatingBlockBuilder(BlockBuilder[T]):
    """Builder that defers the choice of block format until data arrives.

    The concrete builder (Arrow, Pandas, or simple list) is selected from
    the first row or block added; empty blocks are remembered separately so
    their schema can be reused if no real data ever shows up.
    """

    def __init__(self):
        self._builder = None
        self._empty_block = None

    def add(self, item: Any) -> None:
        """Append a single row, inferring the builder type on first use."""
        if self._builder is None:
            self._builder = self._builder_for(item)
        self._builder.add(item)

    @staticmethod
    def _builder_for(item: Any):
        # Pick a concrete builder based on the first item seen.
        # TODO (kfstorm): Maybe we can use Pandas block format for dict.
        if isinstance(item, (dict, ArrowRow)):
            import pyarrow
            try:
                # Probe Arrow compatibility before committing to that format.
                probe = ArrowBlockBuilder()
                probe.add(item)
                probe.build()
                return ArrowBlockBuilder()
            except (TypeError, pyarrow.lib.ArrowInvalid):
                return SimpleBlockBuilder()
        if isinstance(item, PandasRow):
            return PandasBlockBuilder()
        return SimpleBlockBuilder()

    def add_block(self, block: Block) -> None:
        """Append a whole block, adopting its format if none chosen yet."""
        accessor = BlockAccessor.for_block(block)
        if accessor.num_rows() == 0:
            # Don't infer types of empty lists. Store the block and use it if
            # no other data is added.
            # https://github.com/ray-project/ray/issues/20290
            self._empty_block = block
            return
        if self._builder is None:
            self._builder = accessor.builder()
        self._builder.add_block(block)

    def build(self) -> Block:
        """Finalize and return the built block (empty Arrow if no data)."""
        if self._builder is None:
            if self._empty_block is not None:
                self._builder = BlockAccessor.for_block(
                    self._empty_block).builder()
            else:
                self._builder = ArrowBlockBuilder()
        return self._builder.build()

    def num_rows(self) -> int:
        """Number of rows added so far (0 before any data)."""
        return 0 if self._builder is None else self._builder.num_rows()

    def get_estimated_memory_usage(self) -> int:
        """Estimated bytes held by the underlying builder (0 before any data)."""
        if self._builder is None:
            return 0
        return self._builder.get_estimated_memory_usage()
class DelegatingBlockBuilder(BlockBuilder[T]):
    """Builder that picks its concrete block format from the first input.

    Rows route to an Arrow, Pandas, or simple-list builder depending on the
    type of the first item added; whole blocks adopt that block's own format.
    """

    def __init__(self):
        self._builder = None

    def add(self, item: Any) -> None:
        """Append a single row, selecting a builder on first use."""
        if self._builder is None:
            # TODO (kfstorm): Maybe we can use Pandas block format for dict.
            if isinstance(item, (dict, ArrowRow)):
                import pyarrow
                try:
                    # Probe Arrow compatibility before committing to it.
                    probe = ArrowBlockBuilder()
                    probe.add(item)
                    probe.build()
                    self._builder = ArrowBlockBuilder()
                except (TypeError, pyarrow.lib.ArrowInvalid):
                    self._builder = SimpleBlockBuilder()
            elif isinstance(item, PandasRow):
                self._builder = PandasBlockBuilder()
            else:
                self._builder = SimpleBlockBuilder()
        self._builder.add(item)

    def add_block(self, block: Block) -> None:
        """Append a whole block, adopting its format if none chosen yet."""
        if self._builder is None:
            self._builder = BlockAccessor.for_block(block).builder()
        self._builder.add_block(block)

    def build(self) -> Block:
        """Finalize and return the built block (empty Arrow if no data)."""
        if self._builder is None:
            self._builder = ArrowBlockBuilder()
        return self._builder.build()

    def num_rows(self) -> int:
        """Number of rows added so far (0 before any data)."""
        return 0 if self._builder is None else self._builder.num_rows()

    def get_estimated_memory_usage(self) -> int:
        """Estimated bytes held by the underlying builder (0 before any data)."""
        if self._builder is None:
            return 0
        return self._builder.get_estimated_memory_usage()
def test_arrow_size_diff_values(ray_start_regular_shared):
    """Memory estimates and compaction counts with mixed-size values."""
    builder = ArrowBlockBuilder()
    assert builder.get_estimated_memory_usage() == 0

    # Two large values: no compaction yet, estimate grows linearly.
    builder.add(ARROW_LARGE_VALUE)
    assert builder._num_compactions == 0
    assert_close(builder.get_estimated_memory_usage(), 10019)
    builder.add(ARROW_LARGE_VALUE)
    assert builder._num_compactions == 0
    assert_close(builder.get_estimated_memory_usage(), 20038)

    # Interleave small values.
    for _ in range(10):
        builder.add(ARROW_SMALL_VALUE)
    assert_close(builder.get_estimated_memory_usage(), 25178)
    for _ in range(100):
        builder.add(ARROW_SMALL_VALUE)
    assert builder._num_compactions == 0
    assert_close(builder.get_estimated_memory_usage(), 35394)

    # Bulk-add large values and watch compactions kick in.
    for _ in range(13000):
        builder.add(ARROW_LARGE_VALUE)
    assert_close(builder.get_estimated_memory_usage(), 130131680)
    assert builder._num_compactions == 2
    for _ in range(4000):
        builder.add(ARROW_LARGE_VALUE)
    assert_close(builder.get_estimated_memory_usage(), 170129189)
    assert builder._num_compactions == 3

    assert builder.build().num_rows == 17112
def test_arrow_size(ray_start_regular_shared):
    """Memory estimate scales linearly with the number of small rows."""
    builder = ArrowBlockBuilder()
    assert builder.get_estimated_memory_usage() == 0

    builder.add(ARROW_SMALL_VALUE)
    assert_close(builder.get_estimated_memory_usage(), 118)
    builder.add(ARROW_SMALL_VALUE)
    assert_close(builder.get_estimated_memory_usage(), 236)

    # Grow to 10, 100, and 1000 rows; the estimate tracks proportionally.
    for _ in range(8):
        builder.add(ARROW_SMALL_VALUE)
    assert_close(builder.get_estimated_memory_usage(), 1180)
    for _ in range(90):
        builder.add(ARROW_SMALL_VALUE)
    assert_close(builder.get_estimated_memory_usage(), 11800)
    for _ in range(900):
        builder.add(ARROW_SMALL_VALUE)
    assert_close(builder.get_estimated_memory_usage(), 118000)

    assert builder.build().num_rows == 1000
def test_arrow_size_add_block(ray_start_regular_shared):
    """Adding prebuilt blocks never triggers compaction."""
    source = ArrowBlockBuilder()
    for _ in range(2000):
        source.add(ARROW_LARGE_VALUE)
    block = source.build()

    target = ArrowBlockBuilder()
    for _ in range(5):
        target.add_block(block)

    assert target._num_compactions == 0
    assert_close(target.get_estimated_memory_usage(), 100040020)
    assert target.build().num_rows == 10000
def fast_repartition(blocks, num_blocks):
    """Repartition ``blocks`` into ``num_blocks`` by split-then-coalesce.

    Splits the data at (n-1) evenly spaced row indices, coalesces each split
    into a single block via a remote reduce task, and pads with empty blocks
    of the matching format if fewer than ``num_blocks`` splits held data.
    Returns the new ``BlockList`` and an (empty) stats dict.
    """
    from ray.data.dataset import Dataset
    ds = Dataset(
        ExecutionPlan(blocks, DatasetStats(stages={}, parent=None)),
        0,
        lazy=False)
    # Compute the (n-1) indices needed for an equal split of the data.
    count = ds.count()
    dataset_format = ds._dataset_format()
    indices = []
    # NOTE: float accumulation (not multiplication) so the resulting split
    # points match the original behavior exactly.
    offset = 0
    for _ in range(num_blocks - 1):
        offset += count / num_blocks
        indices.append(int(offset))
    assert len(indices) < num_blocks, (indices, num_blocks)
    splits = ds.split_at_indices(indices) if indices else [ds]
    # TODO(ekl) include stats for the split tasks. We may also want to
    # consider combining the split and coalesce tasks as an optimization.

    # Coalesce each split into a single block.
    reduce_task = cached_remote_fn(
        _ShufflePartitionOp.reduce).options(num_returns=2)
    reduce_bar = ProgressBar("Repartition", position=0, total=len(splits))
    reduce_out = [
        reduce_task.remote(False, None, *s.get_internal_block_refs())
        for s in splits
        if s.num_blocks() > 0
    ]
    # Early-release memory.
    del splits, blocks, ds

    new_blocks, new_metadata = zip(*reduce_out)
    new_blocks, new_metadata = list(new_blocks), list(new_metadata)
    new_metadata = reduce_bar.fetch_until_complete(new_metadata)
    reduce_bar.close()

    # Handle empty blocks.
    if len(new_blocks) < num_blocks:
        from ray.data.impl.arrow_block import ArrowBlockBuilder
        from ray.data.impl.pandas_block import PandasBlockBuilder
        from ray.data.impl.simple_block import SimpleBlockBuilder

        num_empties = num_blocks - len(new_blocks)
        # Build one empty block in the dataset's native format and reuse it.
        if dataset_format == "arrow":
            empty_builder = ArrowBlockBuilder()
        elif dataset_format == "pandas":
            empty_builder = PandasBlockBuilder()
        else:
            empty_builder = SimpleBlockBuilder()
        empty_block = empty_builder.build()
        empty_meta = BlockAccessor.for_block(empty_block).get_metadata(
            input_files=None, exec_stats=None)  # No stats for empty block.
        new_blocks += [ray.put(empty_block) for _ in range(num_empties)]
        new_metadata += [empty_meta] * num_empties

    return BlockList(new_blocks, new_metadata), {}
def add(self, item: Any) -> None:
    """Append one row, choosing a concrete builder from its type on first use."""
    if self._builder is None:
        # TODO (kfstorm): Maybe we can use Pandas block format for dict.
        if isinstance(item, (dict, ArrowRow)):
            import pyarrow
            try:
                # Probe whether Arrow can represent this item before
                # committing to the Arrow format.
                probe = ArrowBlockBuilder()
                probe.add(item)
                probe.build()
                self._builder = ArrowBlockBuilder()
            except (TypeError, pyarrow.lib.ArrowInvalid):
                self._builder = SimpleBlockBuilder()
        elif isinstance(item, np.ndarray):
            # Tensor rows always go through Arrow.
            self._builder = ArrowBlockBuilder()
        elif isinstance(item, PandasRow):
            self._builder = PandasBlockBuilder()
        else:
            self._builder = SimpleBlockBuilder()
    self._builder.add(item)
def build(self) -> Block:
    """Finalize and return the built block, defaulting to an empty Arrow block."""
    builder = self._builder
    if builder is None:
        builder = ArrowBlockBuilder()
        self._builder = builder
    return builder.build()