Exemple #1
0
def test_arrow_size_add_block(ray_start_regular_shared):
    b = ArrowBlockBuilder()
    for _ in range(2000):
        b.add(ARROW_LARGE_VALUE)
    block = b.build()
    b2 = ArrowBlockBuilder()
    for _ in range(5):
        b2.add_block(block)
    assert b2._num_compactions == 0
    assert_close(b2.get_estimated_memory_usage(), 100040020)
    assert b2.build().num_rows == 10000
class DelegatingBlockBuilder(BlockBuilder[T]):
    def __init__(self):
        self._builder = None
        self._empty_block = None

    def add(self, item: Any) -> None:

        if self._builder is None:
            # TODO (kfstorm): Maybe we can use Pandas block format for dict.
            if isinstance(item, dict) or isinstance(item, ArrowRow):
                import pyarrow

                try:
                    check = ArrowBlockBuilder()
                    check.add(item)
                    check.build()
                    self._builder = ArrowBlockBuilder()
                except (TypeError, pyarrow.lib.ArrowInvalid):
                    self._builder = SimpleBlockBuilder()
            elif isinstance(item, PandasRow):
                self._builder = PandasBlockBuilder()
            else:
                self._builder = SimpleBlockBuilder()
        self._builder.add(item)

    def add_block(self, block: Block) -> None:
        accessor = BlockAccessor.for_block(block)
        if accessor.num_rows() == 0:
            # Don't infer types of empty lists. Store the block and use it if no
            # other data is added. https://github.com/ray-project/ray/issues/20290
            self._empty_block = block
            return
        if self._builder is None:
            self._builder = accessor.builder()
        self._builder.add_block(block)

    def build(self) -> Block:
        if self._builder is None:
            if self._empty_block is not None:
                self._builder = BlockAccessor.for_block(
                    self._empty_block).builder()
            else:
                self._builder = ArrowBlockBuilder()
        return self._builder.build()

    def num_rows(self) -> int:
        return self._builder.num_rows() if self._builder is not None else 0

    def get_estimated_memory_usage(self) -> int:
        if self._builder is None:
            return 0
        return self._builder.get_estimated_memory_usage()
Exemple #3
0
    def add(self, item: Any) -> None:

        if self._builder is None:
            if isinstance(item, dict) or isinstance(item, ArrowRow):
                import pyarrow
                try:
                    check = ArrowBlockBuilder()
                    check.add(item)
                    check.build()
                    self._builder = ArrowBlockBuilder()
                except (TypeError, pyarrow.lib.ArrowInvalid):
                    self._builder = SimpleBlockBuilder()
            else:
                self._builder = SimpleBlockBuilder()
        self._builder.add(item)
Exemple #4
0
def test_arrow_size(ray_start_regular_shared):
    b = ArrowBlockBuilder()
    assert b.get_estimated_memory_usage() == 0
    b.add(ARROW_SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 118)
    b.add(ARROW_SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 236)
    for _ in range(8):
        b.add(ARROW_SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 1180)
    for _ in range(90):
        b.add(ARROW_SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 11800)
    for _ in range(900):
        b.add(ARROW_SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 118000)
    assert b.build().num_rows == 1000
    def add(self, item: Any) -> None:

        if self._builder is None:
            # TODO (kfstorm): Maybe we can use Pandas block format for dict.
            if isinstance(item, dict) or isinstance(item, ArrowRow):
                import pyarrow

                try:
                    check = ArrowBlockBuilder()
                    check.add(item)
                    check.build()
                    self._builder = ArrowBlockBuilder()
                except (TypeError, pyarrow.lib.ArrowInvalid):
                    self._builder = SimpleBlockBuilder()
            elif isinstance(item, PandasRow):
                self._builder = PandasBlockBuilder()
            else:
                self._builder = SimpleBlockBuilder()
        self._builder.add(item)
class DelegatingBlockBuilder(BlockBuilder[T]):
    def __init__(self):
        self._builder = None

    def add(self, item: Any) -> None:

        if self._builder is None:
            # TODO (kfstorm): Maybe we can use Pandas block format for dict.
            if isinstance(item, dict) or isinstance(item, ArrowRow):
                import pyarrow

                try:
                    check = ArrowBlockBuilder()
                    check.add(item)
                    check.build()
                    self._builder = ArrowBlockBuilder()
                except (TypeError, pyarrow.lib.ArrowInvalid):
                    self._builder = SimpleBlockBuilder()
            elif isinstance(item, PandasRow):
                self._builder = PandasBlockBuilder()
            else:
                self._builder = SimpleBlockBuilder()
        self._builder.add(item)

    def add_block(self, block: Block) -> None:
        if self._builder is None:
            self._builder = BlockAccessor.for_block(block).builder()
        self._builder.add_block(block)

    def build(self) -> Block:
        if self._builder is None:
            self._builder = ArrowBlockBuilder()
        return self._builder.build()

    def num_rows(self) -> int:
        return self._builder.num_rows() if self._builder is not None else 0

    def get_estimated_memory_usage(self) -> int:
        if self._builder is None:
            return 0
        return self._builder.get_estimated_memory_usage()
Exemple #7
0
def test_arrow_size_diff_values(ray_start_regular_shared):
    b = ArrowBlockBuilder()
    assert b.get_estimated_memory_usage() == 0
    b.add(ARROW_LARGE_VALUE)
    assert b._num_compactions == 0
    assert_close(b.get_estimated_memory_usage(), 10019)
    b.add(ARROW_LARGE_VALUE)
    assert b._num_compactions == 0
    assert_close(b.get_estimated_memory_usage(), 20038)
    for _ in range(10):
        b.add(ARROW_SMALL_VALUE)
    assert_close(b.get_estimated_memory_usage(), 25178)
    for _ in range(100):
        b.add(ARROW_SMALL_VALUE)
    assert b._num_compactions == 0
    assert_close(b.get_estimated_memory_usage(), 35394)
    for _ in range(13000):
        b.add(ARROW_LARGE_VALUE)
    assert_close(b.get_estimated_memory_usage(), 130131680)
    assert b._num_compactions == 2
    for _ in range(4000):
        b.add(ARROW_LARGE_VALUE)
    assert_close(b.get_estimated_memory_usage(), 170129189)
    assert b._num_compactions == 3
    assert b.build().num_rows == 17112