Example 1
0
def _ndarray_to_block(
        ndarray: np.ndarray) -> "Tuple[Block[np.ndarray], BlockMetadata]":
    """Convert a NumPy ndarray into an internal block plus its metadata.

    Note: the original annotation claimed the return type was
    ``Block[np.ndarray]``, but the function actually returns a 2-tuple of
    (block, metadata); the annotation is corrected here (written as a
    string so no new imports are required).

    Args:
        ndarray: The array to wrap as a block.

    Returns:
        A tuple of the converted block and its ``BlockMetadata`` (which
        includes execution stats covering the conversion).
    """
    # Start timing before the conversion so exec stats cover it.
    stats = BlockExecStats.builder()
    block = BlockAccessor.batch_to_block(ndarray)
    metadata = BlockAccessor.for_block(block).get_metadata(
        input_files=None, exec_stats=stats.build()
    )
    return block, metadata
Example 2
0
 def _read_file(self, f: "pyarrow.NativeFile", path: str, **reader_args):
     # TODO(ekl) Ideally numpy can read directly from the file, but it
     # seems like it requires the file to be seekable.
     # BytesIO(data) yields a seekable buffer already positioned at 0,
     # so no explicit write()/seek() round-trip is needed.
     seekable = BytesIO(f.readall())
     ndarray = np.load(seekable, allow_pickle=True)
     return BlockAccessor.batch_to_block(ndarray)
Example 3
0
    def add_batch(self, batch: DataBatch):
        """Add a user-facing data batch to the builder.

        The batch is first converted into the internal block
        representation, then handed off to ``add_block``.
        """
        return self.add_block(BlockAccessor.batch_to_block(batch))
Example 4
0
        def make_block(start: int, count: int) -> Block:
            """Build a dummy block of ``count`` rows beginning at ``start``.

            The concrete block type is selected by the enclosing scope's
            ``block_format``: an Arrow table ("arrow"), a tensor block
            ("tensor"), or a plain Python list (fallback).
            """
            if block_format == "arrow":
                import pyarrow as pa

                return pa.Table.from_arrays([np.arange(start, start + count)],
                                            names=["value"])
            elif block_format == "tensor":
                # The unused local ``import pyarrow as pa`` was removed
                # from this branch; nothing here referenced it.
                tensor = np.ones(tensor_shape,
                                 dtype=np.int64) * np.expand_dims(
                                     np.arange(start, start + count),
                                     tuple(range(1, 1 + len(tensor_shape))),
                                 )
                return BlockAccessor.batch_to_block(tensor)
            else:
                return list(builtins.range(start, start + count))
Example 5
0
    def prepare_read(
            self,
            parallelism: int,
            n: int,
            block_format: str = "list",
            tensor_shape: Tuple = (1, ),
    ) -> List[ReadTask]:
        """Return read tasks that together generate ``n`` dummy rows.

        Args:
            parallelism: Target number of read tasks to create.
            n: Total number of rows to generate.
            block_format: One of "arrow", "tensor", or "list".
            tensor_shape: Per-row tensor shape ("tensor" format only).

        Returns:
            A list of ``ReadTask``s, each producing one block.

        Raises:
            ValueError: If ``block_format`` is not supported.
        """
        read_tasks: List[ReadTask] = []
        block_size = max(1, n // parallelism)

        # Example of a read task. In a real datasource, this would pull data
        # from an external system instead of generating dummy data.
        def make_block(start: int, count: int) -> Block:
            if block_format == "arrow":
                import pyarrow as pa

                return pa.Table.from_arrays([np.arange(start, start + count)],
                                            names=["value"])
            elif block_format == "tensor":
                tensor = np.ones(tensor_shape,
                                 dtype=np.int64) * np.expand_dims(
                                     np.arange(start, start + count),
                                     tuple(range(1, 1 + len(tensor_shape))),
                                 )
                return BlockAccessor.batch_to_block(tensor)
            else:
                return list(builtins.range(start, start + count))

        # The schema and per-row element size do not depend on the loop
        # variable, so compute them once up front instead of on every
        # iteration (the tensor path allocated a throwaway array per
        # block). This also validates ``block_format`` before any task
        # metadata is created.
        if block_format == "arrow":
            _check_pyarrow_version()
            import pyarrow as pa

            schema = pa.Table.from_pydict({"value": [0]}).schema
        elif block_format == "tensor":
            _check_pyarrow_version()
            tensor = np.ones(tensor_shape,
                             dtype=np.int64) * np.expand_dims(
                                 np.arange(0, 10),
                                 tuple(range(1, 1 + len(tensor_shape))))
            schema = BlockAccessor.batch_to_block(tensor).schema
        elif block_format == "list":
            schema = int
        else:
            raise ValueError("Unsupported block type", block_format)
        if block_format == "tensor":
            # np.prod replaces the deprecated np.product alias, which was
            # removed in NumPy 2.0.
            element_size = np.prod(tensor_shape)
        else:
            element_size = 1

        i = 0
        while i < n:
            count = min(block_size, n - i)
            meta = BlockMetadata(
                num_rows=count,
                size_bytes=8 * count * element_size,
                schema=schema,
                input_files=None,
                exec_stats=None,
            )
            # Bind i/count as lambda defaults so each task captures its
            # own values (avoids the late-binding closure pitfall).
            read_tasks.append(
                ReadTask(lambda i=i, count=count: [make_block(i, count)],
                         meta))
            i += block_size

        return read_tasks