class SimpleBlockBuilder(BlockBuilder[T]):
    """Block builder that buffers items in a plain Python list.

    Every buffered item is also reported to a ``SizeEstimator`` so the
    builder can answer ``get_estimated_memory_usage``.
    """

    def __init__(self):
        # Estimator fed with every item that enters the buffer.
        self._size_estimator = SizeEstimator()
        # Items collected so far, in insertion order.
        self._items = []

    def add(self, item: T) -> None:
        """Buffer a single item and report it to the size estimator."""
        self._items.append(item)
        self._size_estimator.add(item)

    def add_block(self, block: List[T]) -> None:
        """Buffer every item of ``block``.

        Raises:
            TypeError: if ``block`` is not a ``list``.
        """
        if isinstance(block, list):
            self._items.extend(block)
            for entry in block:
                self._size_estimator.add(entry)
            return

        message = (
            f"Got a block of type {type(block)}, expected list. "
            "If you are mapping a function, ensure it returns an "
            "object with the expected type. Block:\n"
            f"{block}"
        )
        raise TypeError(message)

    def num_rows(self) -> int:
        """Return the number of items buffered so far."""
        return len(self._items)

    def build(self) -> Block:
        """Return a new list holding the buffered items."""
        return self._items.copy()

    def get_estimated_memory_usage(self) -> int:
        """Return the size estimator's current byte count."""
        return self._size_estimator.size_bytes()
class SimpleBlockBuilder(BlockBuilder[T]):
    """Block builder that buffers items in a plain Python list.

    Every buffered item is also reported to a ``SizeEstimator`` so the
    builder can answer ``get_estimated_memory_usage``.
    """

    def __init__(self):
        # Items collected so far, in insertion order.
        self._items = []
        # Estimator fed with every item that enters the buffer.
        self._size_estimator = SizeEstimator()

    def add(self, item: T) -> None:
        """Buffer a single item and report it to the size estimator."""
        self._items.append(item)
        self._size_estimator.add(item)

    def add_block(self, block: List[T]) -> None:
        """Buffer every item of ``block``.

        Raises:
            TypeError: if ``block`` is not a ``list``.
        """
        # Raise explicitly instead of asserting: assertions are stripped
        # under ``python -O``, which would let a non-list iterable (e.g. a
        # string) be silently extended into the buffer.
        if not isinstance(block, list):
            raise TypeError(
                f"Got a block of type {type(block)}, expected list. "
                "If you are mapping a function, ensure it returns an "
                "object with the expected type. Block:\n"
                f"{block}"
            )
        self._items.extend(block)
        for item in block:
            self._size_estimator.add(item)

    def num_rows(self) -> int:
        """Return the number of items buffered so far."""
        return len(self._items)

    def build(self) -> Block:
        """Return a new list holding the buffered items."""
        return list(self._items)

    def get_estimated_memory_usage(self) -> int:
        """Return the size estimator's current byte count."""
        return self._size_estimator.size_bytes()
class TableBlockBuilder(BlockBuilder[T]):
    """Base builder for table-shaped blocks.

    Rows added one at a time are buffered column-wise as Python lists.
    Once the estimated size of that buffer reaches
    ``MAX_UNCOMPACTED_SIZE_BYTES`` it is converted into a table
    ("compacted") and the buffer is reset. Subclasses supply the concrete
    table operations through ``_table_from_pydict``, ``_concat_tables`` and
    ``_empty_table``.
    """

    def __init__(self, block_type):
        """Create an empty builder.

        Args:
            block_type: The table class that ``add_block`` accepts.
        """
        # The set of uncompacted Python values buffered.
        self._columns = collections.defaultdict(list)
        # The set of compacted tables we have built so far.
        self._tables: List[Any] = []
        self._tables_size_bytes = 0
        # Size estimator for un-compacted table values.
        self._uncompacted_size = SizeEstimator()
        self._num_rows = 0
        self._num_compactions = 0
        self._block_type = block_type

    def add(self, item: Union[dict, TableRow]) -> None:
        """Buffer a single row.

        Args:
            item: A ``dict`` mapping column name to value, or a ``TableRow``
                (converted with ``as_pydict``).

        Raises:
            ValueError: if ``item`` is neither a ``dict`` nor a ``TableRow``.
        """
        if isinstance(item, TableRow):
            item = item.as_pydict()
        if not isinstance(item, dict):
            raise ValueError(
                "Returned elements of an TableBlock must be of type `dict`, "
                "got {} (type {}).".format(item, type(item))
            )
        for key, value in item.items():
            self._columns[key].append(value)
        self._num_rows += 1
        # Record the row's size *before* the compaction check. Otherwise the
        # row that triggers a compaction would be counted both in the
        # compacted table and in the freshly reset estimator.
        self._uncompacted_size.add(item)
        self._compact_if_needed()

    def add_block(self, block: Any) -> None:
        """Add an already-built table and count its rows.

        Raises:
            TypeError: if ``block`` is not an instance of the builder's
                block type.
        """
        # Raise explicitly instead of asserting: assertions are stripped
        # under ``python -O``.
        if not isinstance(block, self._block_type):
            raise TypeError(
                f"Got a block of type {type(block)}, expected "
                f"{self._block_type}. If you are mapping a function, ensure "
                "it returns an object with the expected type. Block:\n"
                f"{block}"
            )
        self._store_table(block)
        self._num_rows += BlockAccessor.for_block(block).num_rows()

    def _store_table(self, block: Any) -> None:
        """Keep ``block`` and add its size, without touching the row count."""
        self._tables.append(block)
        self._tables_size_bytes += BlockAccessor.for_block(block).size_bytes()

    def _table_from_pydict(self, columns: Dict[str, List[Any]]) -> Block:
        """Build a table from column lists; must be provided by subclasses."""
        raise NotImplementedError

    def _concat_tables(self, tables: List[Block]) -> Block:
        """Concatenate tables into one; must be provided by subclasses."""
        raise NotImplementedError

    @staticmethod
    def _empty_table() -> Any:
        """Return an empty table; must be provided by subclasses."""
        raise NotImplementedError

    def build(self) -> Block:
        """Return a single table containing everything added so far."""
        # NOTE(review): rows still in the buffer are placed ahead of the
        # compacted tables -- confirm callers do not rely on insertion order.
        if self._columns:
            tables = [self._table_from_pydict(self._columns)]
        else:
            tables = []
        tables.extend(self._tables)
        if len(tables) > 1:
            return self._concat_tables(tables)
        elif len(tables) > 0:
            return tables[0]
        else:
            return self._empty_table()

    def num_rows(self) -> int:
        """Return the number of rows added via ``add`` and ``add_block``."""
        return self._num_rows

    def get_estimated_memory_usage(self) -> int:
        """Return compacted table bytes plus the estimated buffer bytes."""
        if self._num_rows == 0:
            return 0
        return self._tables_size_bytes + self._uncompacted_size.size_bytes()

    def _compact_if_needed(self) -> None:
        """Convert the row buffer into a table once it is large enough."""
        assert self._columns
        if self._uncompacted_size.size_bytes() < MAX_UNCOMPACTED_SIZE_BYTES:
            return
        block = self._table_from_pydict(self._columns)
        # Do not go through ``add_block`` here: these rows were already
        # counted one by one in ``add``, so adding the table's row count
        # again would double-count them.
        self._store_table(block)
        self._uncompacted_size = SizeEstimator()
        self._columns.clear()
        self._num_compactions += 1
class ArrowBlockBuilder(BlockBuilder[T]):
    """Builder that assembles a ``pyarrow.Table`` block.

    Rows added one at a time are buffered column-wise as Python lists.
    Once the estimated size of that buffer reaches
    ``MAX_UNCOMPACTED_SIZE_BYTES`` it is converted into a ``pyarrow.Table``
    ("compacted") and the buffer is reset.
    """

    def __init__(self):
        """Create an empty builder.

        Raises:
            ImportError: if ``pyarrow`` is not available.
        """
        if pyarrow is None:
            raise ImportError("Run `pip install pyarrow` for Arrow support")
        # The set of uncompacted Python values buffered.
        self._columns = collections.defaultdict(list)
        # The set of compacted tables we have built so far.
        self._tables: List["pyarrow.Table"] = []
        self._tables_nbytes = 0
        # Size estimator for un-compacted table values.
        self._uncompacted_size = SizeEstimator()
        self._num_rows = 0
        self._num_compactions = 0

    def add(self, item: Union[dict, ArrowRow]) -> None:
        """Buffer a single row.

        Args:
            item: A ``dict`` mapping column name to value, or an ``ArrowRow``
                (converted with ``as_pydict``).

        Raises:
            ValueError: if ``item`` is neither a ``dict`` nor an ``ArrowRow``.
        """
        if isinstance(item, ArrowRow):
            item = item.as_pydict()
        if not isinstance(item, dict):
            raise ValueError(
                "Returned elements of an ArrowBlock must be of type `dict`, "
                "got {} (type {}).".format(item, type(item)))
        for key, value in item.items():
            self._columns[key].append(value)
        self._num_rows += 1
        # Record the row's size *before* the compaction check. Otherwise the
        # row that triggers a compaction would be counted both in the
        # compacted table and in the freshly reset estimator.
        self._uncompacted_size.add(item)
        self._compact_if_needed()

    def add_block(self, block: "pyarrow.Table") -> None:
        """Add an already-built table and count its rows.

        Raises:
            TypeError: if ``block`` is not a ``pyarrow.Table``.
        """
        # Raise explicitly instead of asserting: assertions are stripped
        # under ``python -O``.
        if not isinstance(block, pyarrow.Table):
            raise TypeError(
                f"Got a block of type {type(block)}, expected "
                "pyarrow.Table. If you are mapping a function, ensure it "
                "returns an object with the expected type. Block:\n"
                f"{block}"
            )
        self._tables.append(block)
        self._tables_nbytes += block.nbytes
        self._num_rows += block.num_rows

    def build(self) -> Block:
        """Return a single ``pyarrow.Table`` with everything added so far."""
        # NOTE(review): rows still in the buffer are placed ahead of the
        # compacted tables -- confirm callers do not rely on insertion order.
        if self._columns:
            tables = [pyarrow.Table.from_pydict(self._columns)]
        else:
            tables = []
        tables.extend(self._tables)
        if len(tables) > 1:
            return pyarrow.concat_tables(tables, promote=True)
        elif len(tables) > 0:
            return tables[0]
        else:
            return pyarrow.Table.from_pydict({})

    def num_rows(self) -> int:
        """Return the number of rows added via ``add`` and ``add_block``."""
        return self._num_rows

    def get_estimated_memory_usage(self) -> int:
        """Return compacted table bytes plus the estimated buffer bytes."""
        if self._num_rows == 0:
            return 0
        return self._tables_nbytes + self._uncompacted_size.size_bytes()

    def _compact_if_needed(self) -> None:
        """Convert the row buffer into a table once it is large enough."""
        assert self._columns
        if self._uncompacted_size.size_bytes() < MAX_UNCOMPACTED_SIZE_BYTES:
            return
        block = pyarrow.Table.from_pydict(self._columns)
        # The rows were already counted in ``add``; only track the table
        # and its byte size here.
        self._tables.append(block)
        self._tables_nbytes += block.nbytes
        self._uncompacted_size = SizeEstimator()
        self._columns.clear()
        self._num_compactions += 1