def _ndarray_to_block(ndarray: np.ndarray) -> Tuple[Block[np.ndarray], BlockMetadata]:
    stats = BlockExecStats.builder()
    # Convert the user-facing ndarray into an internal block, then compute
    # its metadata (row count, size, schema) from the built block.
    block = BlockAccessor.batch_to_block(ndarray)
    metadata = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()
    )
    return block, metadata
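# Usage sketch (illustrative, not part of the original source): assuming numpy
# is imported as np and the helper above is in scope, a single ndarray batch
# can be converted into a (block, metadata) pair like this.
def _example_ndarray_block():
    block, meta = _ndarray_to_block(np.arange(8, dtype=np.int64))
    # meta.num_rows is expected to equal the batch's first dimension (8 here).
    return block, meta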
def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
    # TODO(ekl) Ideally numpy can read directly from the file, but it
    # seems like it requires the file to be seekable.
    buf = BytesIO()
    data = f.readall()
    buf.write(data)
    buf.seek(0)
    return BlockAccessor.batch_to_block(np.load(buf, allow_pickle=True))
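# Usage sketch (illustrative, not part of the original source): exercising the
# reader above against a local .npy file through pyarrow's filesystem API. The
# `reader` argument stands in for the datasource instance that defines
# _read_file and is an assumption.
def _example_read_npy(reader, path: str):
    import pyarrow.fs

    fs = pyarrow.fs.LocalFileSystem()
    with fs.open_input_file(path) as f:
        # _read_file buffers the entire file so np.load gets a seekable
        # source; the return value is an internal block.
        return reader._read_file(f, path)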
def add_batch(self, batch: DataBatch):
    """Add a user-facing data batch to the builder.

    This data batch will be converted to an internal block and then added to
    the underlying builder.
    """
    block = BlockAccessor.batch_to_block(batch)
    return self.add_block(block)
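# Usage sketch (illustrative, not part of the original source): `builder` is
# assumed to be an instance of the class defining add_batch above. A
# user-facing batch such as a dict of numpy arrays is converted into a block
# and appended by each call.
def _example_add_batches(builder, num_batches: int = 3):
    for i in range(num_batches):
        batch = {"value": np.arange(i * 4, (i + 1) * 4)}
        builder.add_batch(batch)
    return builder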
def prepare_read(
    self,
    parallelism: int,
    n: int,
    block_format: str = "list",
    tensor_shape: Tuple = (1,),
) -> List[ReadTask]:
    read_tasks: List[ReadTask] = []
    block_size = max(1, n // parallelism)

    # Example of a read task. In a real datasource, this would pull data
    # from an external system instead of generating dummy data.
    def make_block(start: int, count: int) -> Block:
        if block_format == "arrow":
            import pyarrow as pa

            return pa.Table.from_arrays(
                [np.arange(start, start + count)], names=["value"]
            )
        elif block_format == "tensor":
            import pyarrow as pa

            tensor = np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                np.arange(start, start + count),
                tuple(range(1, 1 + len(tensor_shape))),
            )
            return BlockAccessor.batch_to_block(tensor)
        else:
            return list(builtins.range(start, start + count))

    i = 0
    while i < n:
        count = min(block_size, n - i)
        if block_format == "arrow":
            _check_pyarrow_version()
            import pyarrow as pa

            schema = pa.Table.from_pydict({"value": [0]}).schema
        elif block_format == "tensor":
            _check_pyarrow_version()
            import pyarrow as pa

            tensor = np.ones(tensor_shape, dtype=np.int64) * np.expand_dims(
                np.arange(0, 10), tuple(range(1, 1 + len(tensor_shape)))
            )
            schema = BlockAccessor.batch_to_block(tensor).schema
        elif block_format == "list":
            schema = int
        else:
            raise ValueError("Unsupported block type", block_format)
        if block_format == "tensor":
            element_size = np.prod(tensor_shape)
        else:
            element_size = 1
        meta = BlockMetadata(
            num_rows=count,
            size_bytes=8 * count * element_size,
            schema=schema,
            input_files=None,
            exec_stats=None,
        )
        read_tasks.append(
            ReadTask(lambda i=i, count=count: [make_block(i, count)], meta)
        )
        i += block_size
    return read_tasks
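# Usage sketch (illustrative, not part of the original source): `datasource` is
# assumed to be an instance of the class defining prepare_read above. Each
# ReadTask wraps the lambda shown earlier, so invoking a task is assumed to
# return the list of blocks for its range.
def _example_prepare_and_read(datasource, n: int = 10, parallelism: int = 2):
    tasks = datasource.prepare_read(parallelism, n, block_format="arrow")
    blocks = []
    for task in tasks:
        # Each read function returns a single-element list holding one block.
        blocks.extend(task())
    return blocks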