Example 1
    def from_record_batches(
        cls,
        record_batches,
        output_types,
        output_shapes=None,
        columns=None,
        batch_size=None,
        batch_mode="keep_remainder",
    ):
        """Create an ArrowDataset directly from Arrow record batches.
        This constructor requires pyarrow to be installed.

        Args:
            record_batches: An Arrow record batch or sequence of record batches
            output_types: Tensor dtypes of the output tensors
            output_shapes: TensorShapes of the output tensors or None to
                            infer partial
            columns: A list of column indices to be used in the Dataset
            batch_size: Batch size of output tensors; setting a batch size here
                        will create batched tensors from Arrow memory and can be
                        more efficient than using tf.data.Dataset.batch().
                        NOTE: batch_size does not need to be set if batch_mode='auto'
            batch_mode: Mode of batching; supported strings:
                        "keep_remainder" (default, keeps partial batch data),
                        "drop_remainder" (discards partial batch data),
                        "auto" (sized to the number of records in each Arrow record batch)
        """
        import pyarrow as pa  # pylint: disable=import-outside-toplevel

        if isinstance(record_batches, pa.RecordBatch):
            record_batches = [record_batches]
        assert record_batches
        if columns is None:
            columns = tuple(range(record_batches[0].num_columns))
        if tf.executing_eagerly():
            sink = pa.BufferOutputStream()
            writer = pa.RecordBatchFileWriter(sink, record_batches[0].schema)
            for batch in record_batches:
                writer.write_batch(batch)
            writer.close()
            serialized_batches = None
            arrow_buffer = sink.getvalue()
        else:
            buf = io.BytesIO()
            writer = pa.RecordBatchFileWriter(buf, record_batches[0].schema)
            for batch in record_batches:
                writer.write_batch(batch)
            writer.close()
            serialized_batches = tf.convert_to_tensor(
                buf.getvalue(), dtype=dtypes.string, name="serialized_batches")
            arrow_buffer = None
        return cls(
            serialized_batches,
            columns,
            output_types,
            output_shapes,
            batch_size=batch_size,
            batch_mode=batch_mode,
            arrow_buffer=arrow_buffer,
        )
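A minimal usage sketch for the classmethod above (not taken from the source; it assumes arrow_io refers to the tensorflow_io Arrow module, as in the test examples further below):

import pyarrow as pa
import tensorflow as tf
import tensorflow_io.arrow as arrow_io  # assumed module path

batch = pa.RecordBatch.from_arrays(
    [pa.array(list(range(10)), type=pa.int32()),
     pa.array([x * 1.1 for x in range(10)], type=pa.float64())],
    ["ints", "floats"])

# batch_mode="auto" sizes each output batch to the Arrow record batch,
# so batch_size does not need to be passed.
dataset = arrow_io.ArrowDataset.from_record_batches(
    batch,
    output_types=(tf.dtypes.int32, tf.dtypes.float64),
    batch_mode="auto")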
Example 2
 def to_arrow(self, path: Pathlike) -> None:
     """
     Store the manifest in Apache Arrow streaming binary format.
     For very large manifests it can be ~5x larger than the corresponding compressed JSONL,
     but it allows reading the manifest with a relatively small memory footprint (~300MB).
     """
     import pyarrow as pa
     # If the underlying storage for manifests is already lazy, we can
     # access the arrow tables directly without the need to convert items.
     if self.is_lazy:
         # TODO: I don't want to add a special method for retrieving those in each manifest type;
         #       after this work is done, I will make a refactoring PR that renames these members
         #       to sth like ".data" so that it's uniform across manifests.
         from lhotse import RecordingSet, SupervisionSet, CutSet
         if isinstance(self, RecordingSet):
             table = self.recordings.table
         elif isinstance(self, SupervisionSet):
             table = self.segments.table
         elif isinstance(self, CutSet):
             table = self.cuts.table
         else:
             raise NotImplementedError(
                 f"Unsupported type of manifest for arrow serialization: {type(self)}"
             )
         with open(path, "wb") as f, pa.RecordBatchFileWriter(
                 f, schema=table.schema) as writer:
             for batch in table.to_batches():
                 writer.write_batch(batch)
     else:
         # We will take the first 1000 items from the manifest to infer the schema.
         # TODO: might want to sample items randomly in case their manifests vary...
         schema = pa.schema(
             pa.array(list(self.subset(first=1000).to_dicts())).type)
         # Open the file for writing and initialize the pyarrow batch writer.
         # Note that the batch size we determine here will be used to load whole chunks into
         # memory during deserialization.
         with open(path, "wb") as f, pa.RecordBatchFileWriter(
                 f, schema=schema) as writer:
             # We are (lazily) grouping the items in manifest into chunks,
             # each of ``batch_size`` items.
             batch_size = 10 * 1024
             chunks = grouper(n=batch_size, iterable=self.to_dicts())
             for chunk in chunks:
                 # We convert the items in each chunk into Arrow's columnar representation.
                 # To do this, we first iterate by available "columns" (i.e. dict keys),
                 # and for each of them create an Arrow array with the corresponding values.
                 # These arrays are then used to create an arrow Table.
                 arrays = [
                     pa.array([item.get(key) for item in chunk],
                              type=schema.field(key_idx).type)
                     for key_idx, key in enumerate(schema.names)
                 ]
                 table = pa.Table.from_arrays(arrays, schema=schema)
                 # The loop below will iterate only once, since we ensured there's exactly one batch.
                 for idx, batch in enumerate(
                         table.to_batches(max_chunksize=batch_size)):
                     writer.write_batch(batch)
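A possible read-back sketch for files produced by to_arrow (the path "cuts.arrow" is hypothetical): memory-mapping the file and opening it with pyarrow's IPC file reader keeps memory usage low, which is the motivation stated in the docstring above.

import pyarrow as pa

with pa.memory_map("cuts.arrow", "rb") as source:
    table = pa.ipc.open_file(source).read_all()
print(table.num_rows, table.schema)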
Example 3
def _serialize_table(obj):
    if __supports_pyarrow:
        try:
            f = BytesIO()
            data = pa.Table.from_pandas(obj, preserve_index=False)
            if __use_legacy_export:
                batch_writer = pa.RecordBatchFileWriter(f,
                                                        data.schema,
                                                        use_legacy_format=True)
            else:
                batch_writer = pa.RecordBatchFileWriter(f, data.schema)
            with batch_writer as writer:
                writer.write_table(data)
            return {
                "arrow": True,
                # Tornado will encode this
                "data": f.getvalue()
            }
        except Exception as e:
            print("Failed to serialize to Arrow")
            print(e)
            # fall through to the JSON format

    def _serialize_row(row_obj, column_types, row_name=None):
        row_data = []
        for i, irow in enumerate(row_obj):
            if column_types[i] == "Any":
                datatype = guess_type(irow)
                row_data.append(serialize(irow, datatype))
            else:
                row_data.append(serialize(irow, column_types[i])["value"])
        return {
            "children": [],
            "name": row_name,
            "data": [
                serialize(irow, column_types[i])
                for i, irow in enumerate(row_obj)
            ]
        }

    serialized_table = {"rows": [], "cols": [], "types": []}
    for col in obj:
        serialized_table["cols"].append(col)
        # Use the numpy types to infer the MavenType
        serialized_table["types"].append(
            guess_type(obj.dtypes[col].type, check_instanceof=False))

    for i, row in obj.iterrows():
        serialized_table["rows"].append(
            _serialize_row(row, serialized_table["types"], i))
    return serialized_table
Example 4
  def __init__(self,
               record_batches,
               columns,
               output_types,
               output_shapes=None):
    """Create an ArrowDataset directly from Arrow record batches.
    This constructor requires pyarrow to be installed.

    Args:
      record_batches: An Arrow record batch or sequence of record batches
      columns: A list of column indices to be used in the Dataset
      output_types: Tensor dtypes of the output tensors
      output_shapes: TensorShapes of the output tensors or None to
                     infer partial
    """
    self._columns = columns
    self._output_types = output_types
    self._output_shapes = output_shapes or \
        nest.map_structure(
            lambda _: tensorflow.TensorShape(None), self._output_types)
    import pyarrow as pa
    if isinstance(record_batches, pa.RecordBatch):
      record_batches = [record_batches]
    assert record_batches
    buf = io.BytesIO()
    writer = pa.RecordBatchFileWriter(buf, record_batches[0].schema)
    for batch in record_batches:
      writer.write_batch(batch)
    writer.close()
    self._serialized_batches = tensorflow.convert_to_tensor(
        buf.getvalue(),
        dtype=dtypes.string,
        name="serialized_batches")
    super(ArrowDataset, self).__init__(columns, output_types, output_shapes)
Example 5
  def test_feed_batches(self):
    """
    Test that an ArrowDataset can initialize an iterator to feed a placeholder
    """
    truth_data = TruthData(
        [list(range(10)), [x * 1.1 for x in range(10)]],
        (dtypes.int32, dtypes.float64),
        (tf.TensorShape([]), tf.TensorShape([])))
    batch = self.make_record_batch(truth_data)

    buf = io.BytesIO()
    writer = pa.RecordBatchFileWriter(buf, batch.schema)
    writer.write_batch(batch)
    writer.close()

    buf_placeholder = tf.compat.v1.placeholder(
        tf.dtypes.string, tf.TensorShape([]))

    dataset = arrow_io.ArrowDataset(
        buf_placeholder,
        list(range(len(truth_data.output_types))),
        truth_data.output_types,
        truth_data.output_shapes)
    it = dataset.make_initializable_iterator()
    next_element = it.get_next()

    with self.test_session() as sess:
      sess.run(it.initializer, feed_dict={buf_placeholder: buf.getvalue()})
      for row in range(len(truth_data.data)):
        value = sess.run(next_element)
        self.assertEqual(value[0], truth_data.data[0][row])
        self.assertAlmostEqual(value[1], truth_data.data[1][row], 4)
Example 6
def dataframe_to_arrow_table(dataframe: pd.DataFrame, columns: List[Column],
                             path: Path) -> atypes.ArrowTable:
    """
    Write `dataframe` to an Arrow file and return an ArrowTable backed by it.

    The result will consume little RAM, because its data is stored in an
    mmapped file.
    """
    arrow_columns = []
    if columns:
        arrays = []
        for pandas_column in columns:
            arrays.append(series_to_arrow_array(dataframe[pandas_column.name]))
            arrow_columns.append(pandas_column.to_arrow())

        arrow_table = pyarrow.Table.from_arrays(
            arrays, names=[c.name for c in columns])
        with pyarrow.RecordBatchFileWriter(str(path),
                                           arrow_table.schema) as writer:
            writer.write_table(arrow_table)
    else:
        path = None
        arrow_table = None

    return atypes.ArrowTable(
        path, arrow_table, atypes.TableMetadata(len(dataframe), arrow_columns))
Example 7
def h5ad_to_arrow(h5ad_file, arrow_file):
    adata = read_h5ad(h5ad_file)
    umap = adata.obsm['X_umap'].transpose()
    leiden = adata.obs['leiden'].to_numpy().astype('uint8')
    index = adata.obs.index

    # Only use the cell type predictions if they are available in the AnnData file.
    adata_is_annotated = has_cell_type_annotations(adata)
    if adata_is_annotated:
        predicted_cell_type = adata.obs[PREDICTED_ASCT_CELLTYPE].astype(str)
    else:
        predicted_cell_type = None

    df = DataFrame(
        data={
            'umap_x': umap[0],
            'umap_y': umap[1],
            'leiden': leiden,
            **({PREDICTED_ASCT_CELLTYPE: predicted_cell_type}
               if adata_is_annotated else {}),
        },
        index=index,
    )
    table = pa.Table.from_pandas(df)

    writer = pa.RecordBatchFileWriter(arrow_file, table.schema)
    writer.write(table)
    writer.close()
Example 8
def test_arrow_write_table(tmpdir):
    path = str(tmpdir.join('test.arrow'))
    with pa.OSFile(path, 'wb') as sink:
        with pa.RecordBatchFileWriter(sink, table.schema) as writer:
            writer.write_table(table)

    df = vaex.open(path)
Example 9
    def test_arrow_io_dataset_map_from_file(self):
        """test_arrow_io_dataset_map_from_file"""
        column = "a"
        dtype = dtypes.int64
        column_dtype = self.get_arrow_type(dtype, False)
        arr = pa.array(list(range(100)), column_dtype)
        table = pa.Table.from_arrays([arr], [column])
        spec = {column: dtype}

        with tempfile.NamedTemporaryFile(delete=False) as f:
            with pa.RecordBatchFileWriter(f.name, table.schema) as writer:
                for batch in table.to_batches():
                    writer.write_batch(batch)

        def from_file(_):
            reader = pa.RecordBatchFileReader(f.name)
            t = reader.read_all()
            tio = IOTensor.from_arrow(t, spec=spec)
            return tio(column).to_tensor()

        num_iters = 2
        ds = tf.data.Dataset.range(num_iters).map(from_file)
        expected = table[column].to_pylist()

        iter_count = 0
        for result in ds:
            npt.assert_array_equal(result, expected)
            iter_count += 1

        self.assertEqual(iter_count, num_iters)
        os.unlink(f.name)
Example 10
def arrow_table_context(
    *columns,
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[Tuple[pathlib.Path, pyarrow.Table]]:
    """Yield a Path and a pa.Table with its contents.

    Two calling conventions:

        with arrow_table_context(make_column("A", [1]), make_column("B", [2])) as (path, table):
            pass

        table = make_table(make_column("A", [1]), make_column("B", [1]))
        with arrow_table_context(table) as (path, _):
            pass
    """
    if len(columns) == 1 and isinstance(columns[0], pyarrow.Table):
        table = columns[0]
    else:
        table = make_table(*columns)

    with tempfile_context(dir=dir) as path:
        writer = pyarrow.RecordBatchFileWriter(path, table.schema)
        writer.write_table(table)
        writer.close()
        yield path, table
Example 11
def test_envvar_set_legacy_ipc_format():
    schema = pa.schema([pa.field('foo', pa.int32())])

    writer = pa.RecordBatchStreamWriter(pa.BufferOutputStream(), schema)
    assert not writer._use_legacy_format
    writer = pa.RecordBatchFileWriter(pa.BufferOutputStream(), schema)
    assert not writer._use_legacy_format

    import os
    os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'
    writer = pa.RecordBatchStreamWriter(pa.BufferOutputStream(), schema)
    assert writer._use_legacy_format
    writer = pa.RecordBatchFileWriter(pa.BufferOutputStream(), schema)
    assert writer._use_legacy_format

    del os.environ['ARROW_PRE_0_15_IPC_FORMAT']
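As a side note, recent pyarrow versions expose the same switch through a public option instead of the environment variable and the private attribute checked above; a sketch, assuming pyarrow >= 0.17:

import pyarrow as pa

schema = pa.schema([pa.field('foo', pa.int32())])
options = pa.ipc.IpcWriteOptions(use_legacy_format=True)
# pa.ipc.new_file is the options-aware way to construct a RecordBatchFileWriter.
with pa.ipc.new_file(pa.BufferOutputStream(), schema, options=options) as writer:
    pass  # write_batch / write_table as usual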
Example 12
    def test_tf_function(self):
        """Test that an ArrowDataset can be used in tf.function call"""
        if not tf.version.VERSION.startswith("2."):
            self.skipTest("Test requires TF2.0 for tf.function")

        truth_data = TruthData(
            [list(range(10)), [x * 1.1 for x in range(10)]],
            (dtypes.int32, dtypes.float64),
            (tf.TensorShape([]), tf.TensorShape([])),
        )

        @tf.function
        def create_arrow_dataset(serialized_batch):
            """Create an arrow dataset from input tensor"""
            dataset = arrow_io.ArrowDataset(
                serialized_batch,
                list(range(len(truth_data.output_types))),
                truth_data.output_types,
                truth_data.output_shapes,
            )
            return dataset

        batch = self.make_record_batch(truth_data)
        buf = io.BytesIO()
        writer = pa.RecordBatchFileWriter(buf, batch.schema)
        writer.write_batch(batch)
        writer.close()

        for row, results in enumerate(create_arrow_dataset(buf.getvalue())):
            value = [result.numpy() for result in results]
            self.assertEqual(value[0], truth_data.data[0][row])
            self.assertAlmostEqual(value[1], truth_data.data[1][row], 4)
Example 13
def arrow_file(table: pyarrow.Table) -> ContextManager[pathlib.Path]:
    with empty_file(suffix=".arrow") as path:
        with path.open("wb") as f:
            writer = pyarrow.RecordBatchFileWriter(f, table.schema)
            writer.write(table)
            writer.close()
        yield path
Example 14
    def enqueue_tensor(self, uri, data):
        if isinstance(data, np.ndarray):
            # tensor
            data = [data]
        if not isinstance(data, list):
            raise Exception(
                "Your input is invalid, only List of ndarray and ndarray are allowed."
            )

        sink = pa.BufferOutputStream()
        writer = None
        for d in data:
            shape = np.array(d.shape, dtype="float32")
            d = d.astype("float32").flatten()
            len_arr = np.array([len(shape), len(d)], dtype="float32")
            data_arr = np.concatenate([len_arr, shape, d])
            arrow_arr = pa.array(data_arr)
            batch = pa.RecordBatch.from_arrays([arrow_arr], ["0"])
            if writer is None:
                # initialize
                writer = pa.RecordBatchFileWriter(sink, batch.schema)
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        b = buf.to_pybytes()
        tensor_encoded = self.base64_encode_image(b)
        d = {"uri": uri, "tensor": tensor_encoded}
        self.__enqueue_data("tensor_stream", d)
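A hypothetical decoder for the payload built by enqueue_tensor (not part of the excerpt; it assumes base64_encode_image is plain base64): each record batch holds one float32 column laid out as [ndim, nvalues, shape..., values...].

import base64
import numpy as np
import pyarrow as pa

def decode_tensors(tensor_encoded):
    raw = base64.b64decode(tensor_encoded)
    reader = pa.ipc.open_file(pa.py_buffer(raw))
    tensors = []
    for i in range(reader.num_record_batches):
        flat = reader.get_batch(i).column(0).to_numpy()
        ndim = int(flat[0])
        shape = flat[2:2 + ndim].astype(np.int64)
        values = flat[2 + ndim:]
        tensors.append(values.reshape(shape))
    return tensors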
Example 15
 def dumps(self, batch):
     import pyarrow as pa
     import io
     sink = io.BytesIO()
     writer = pa.RecordBatchFileWriter(sink, batch.schema)
     writer.write_batch(batch)
     writer.close()
     return sink.getvalue()
Example 16
def compress(array: pa.Array) -> bytes:
    rb = pa.RecordBatch.from_arrays([array], ["array"])
    buf = io.BytesIO()
    writer = pa.RecordBatchFileWriter(buf, rb.schema)
    writer.write_batch(rb)
    writer.close()
    buf.seek(0)
    return brotli.compress(buf.read())
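A possible counterpart to compress (a sketch; decompress is not shown in the original excerpt): brotli-decompress the payload and read back the single record batch and its only column.

import brotli
import pyarrow as pa

def decompress(data: bytes) -> pa.Array:
    buf = pa.py_buffer(brotli.decompress(data))
    reader = pa.ipc.open_file(buf)
    # compress() wrote exactly one batch with a single column named "array".
    return reader.get_batch(0).column(0)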
Example 17
def save_results_arrow(filename, pdf2):
    # Save results to an Arrow IPC file.
    import pyarrow as pa
    table = pa.Table.from_pandas(pdf2)
    with open(filename, 'wb') as f:
        writer = pa.RecordBatchFileWriter(f, table.schema)
        writer.write(table)
        writer.close()
Example 18
def dataframe_to_arrowfile(df):
    ''' Pandas DataFrame to Arrow file format
    '''
    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
    sink = io.BytesIO()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()
    return sink.getvalue()
Example 19
    def test_register_object(self):
        data = [
            pa.array([1, 2, 3, 4]),
            pa.array(["foo", "bar", "baz", None]),
            pa.array([True, None, False, True]),
        ]
        batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)

        for i in range(10):
            writer.write_batch(batch)

        writer.close()
        buf = sink.getvalue()
        mymsg = DummyMessage()
        mymsg.name = "dummy"
        mymsg.description = "really dumb"

        mymenu = CronusObject()
        mymenu.name = "menu"
        menuinfo = MenuObjectInfo()
        menuinfo.created.GetCurrentTime()
        bufmenu = pa.py_buffer(mymenu.SerializeToString())

        myconfig = Configuration()
        myconfig.uuid = str(uuid.uuid4())
        myconfig.name = f"{myconfig.uuid}.config.dat"

        configinfo = ConfigObjectInfo()
        configinfo.created.GetCurrentTime()
        bufconfig = pa.py_buffer(myconfig.SerializeToString())

        with tempfile.TemporaryDirectory() as dirpath:
            _path = dirpath + "/test"
            store = BaseObjectStore(
                str(_path), "test")  # wrapper to the CronusStore message
            fileinfo = FileObjectInfo()
            fileinfo.type = 5
            fileinfo.aux.description = "Some dummy data"

            menu_uuid = store.register_content(mymenu, menuinfo).uuid
            config_uuid = store.register_content(myconfig, configinfo).uuid
            dataset = store.register_dataset(menu_uuid, config_uuid)
            store.new_partition(dataset.uuid, "key")
            path = dirpath + "/test/dummy.arrow"
            with pa.OSFile(str(path), "wb") as f:
                f.write(sink.getvalue())
            id_ = store.register_content(path,
                                         fileinfo,
                                         dataset_id=dataset.uuid,
                                         partition_key="key").uuid
            print(store[id_].address)
            buf = pa.py_buffer(store.get(id_))
            reader = pa.ipc.open_file(buf)
            self.assertEqual(reader.num_record_batches, 10)
Example 20
def csv_to_arrow(csv_file: Path, arrow_file: Path):
    df = pd.read_csv(csv_file, index_col=0)
    df.index.name = 'index'
    df.columns = ['umap_x', 'umap_y', 'leiden']

    table = pa.Table.from_pandas(df)

    writer = pa.RecordBatchFileWriter(arrow_file, table.schema)
    writer.write(table)
    writer.close()
Example 21
    def parquet_to_arrow(self):
        batches = self.dataset.to_batches(batch_size=self.batch_size)
        gc.collect()

        # Using context managers closes both the writer (which finalizes the IPC
        # file footer) and the output file.
        with pa.OSFile(self.arrow_file, 'wb') as sink:
            with pa.RecordBatchFileWriter(sink, self.dataset.schema) as writer:
                for batch in batches:
                    writer.write_batch(batch)
        self.filetype = "arrow"
Example 22
def pandas_to_arrow(frame):
    """
    Convert from a pandas dataframe to apache arrow serialized buffer
    """
    batch = pa.RecordBatch.from_pandas(frame, preserve_index=False)
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()
    arrow_buffer = sink.getvalue()
    return arrow_buffer.to_pybytes()
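A possible inverse helper (a sketch; not in the excerpt above): read the serialized buffer back into a pandas DataFrame.

import pyarrow as pa

def arrow_to_pandas(buf: bytes):
    """Deserialize bytes produced by pandas_to_arrow back into a DataFrame."""
    reader = pa.ipc.open_file(pa.py_buffer(buf))
    return reader.read_pandas()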
Example 23
    def func():
        df = table.to_pandas()

        batch = pa.RecordBatch.from_pandas(df)

        sink = io.BytesIO()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)
        writer.write_batch(batch)
        writer.close()

        buf_reader = pa.BufferReader(sink.getvalue())
        reader = pa.open_file(buf_reader)
        reader.read_all()
Example 24
def make_arrow(root, dataset_root):
    with open(f"{root}/karpathy/dataset_flickr30k.json", "r") as fp:
        captions = json.load(fp)

    captions = captions["images"]

    iid2captions = defaultdict(list)
    iid2split = dict()

    for cap in tqdm(captions):
        filename = cap["filename"]
        iid2split[filename] = cap["split"]
        for c in cap["sentences"]:
            iid2captions[filename].append(c["raw"])

    paths = list(glob(f"{root}/flickr30k-images/*.jpg"))
    random.shuffle(paths)
    caption_paths = [
        path for path in paths if path.split("/")[-1] in iid2captions
    ]

    if len(paths) == len(caption_paths):
        print("all images have caption annotations")
    else:
        print("not all images have caption annotations")
    print(
        len(paths),
        len(caption_paths),
        len(iid2captions),
    )

    bs = [
        path2rest(path, iid2captions, iid2split)
        for path in tqdm(caption_paths)
    ]

    for split in ["train", "val", "test"]:
        batches = [b for b in bs if b[-1] == split]

        dataframe = pd.DataFrame(
            batches,
            columns=["image", "caption", "image_id", "split"],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(f"{dataset_root}/f30k_caption_karpathy_{split}.arrow",
                       "wb") as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)
Example 25
    def write_to_arrow(cls, data_wrapper, path_to_outfile):
        data_schema = data_wrapper.table_schema().with_metadata(
            data_wrapper.schema_skyhook_metadata().to_byte_coercible())

        cls.logger.info('>>> writing data in single arrow file')
        with open(path_to_outfile, 'wb') as arrow_handle:
            # Close the writer (via the context manager) so the IPC file footer
            # is written; otherwise the file cannot be read back.
            with pyarrow.RecordBatchFileWriter(arrow_handle,
                                               data_schema) as batch_writer:
                for record_batch in data_wrapper.as_arrow_table(
                        data_schema).to_batches():
                    batch_writer.write_batch(record_batch)

        cls.logger.info('<<< data written')
Example 26
def test_write_empty_ipc_file():
    # ARROW-3894: IPC file was not being properly initialized when no record
    # batches are being written
    schema = pa.schema([('field', pa.int64())])

    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, schema)
    writer.close()

    buf = sink.getvalue()
    reader = pa.RecordBatchFileReader(pa.BufferReader(buf))
    table = reader.read_all()
    assert len(table) == 0
    assert table.schema.equals(schema)
Example 27
def test_deprecated_pyarrow_ns_apis():
    table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])
    sink = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(sink, table.schema) as writer:
        writer.write(table)

    with pytest.warns(FutureWarning,
                      match="please use pyarrow.ipc.open_stream"):
        pa.open_stream(sink.getvalue())

    sink = pa.BufferOutputStream()
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write(table)
    with pytest.warns(FutureWarning, match="please use pyarrow.ipc.open_file"):
        pa.open_file(sink.getvalue())
Example 28
 def write_columns(self, name, type, columns):
     path = self.datadir + '/' + self.outputs[name]
     self.write_type(path, type)
     schema = pa.schema(
         [pa.field(name, a.type) for (name, a) in columns.items()])
     t = pa.Table.from_arrays(list(columns.values()), schema=schema)
     with pa.output_stream(path + '/data.arrow') as sink:
         writer = pa.RecordBatchFileWriter(sink, t.schema)
         batches = t.to_batches(max_chunksize=len(t))
         if batches:
             assert len(batches) == 1
             writer.write_batch(batches[0])
         writer.close()
     with open(path + '/_SUCCESS', 'w'):
         pass
Example 29
    def write_recordbatchfile(self):
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, self.pa_schema)

        batches_size = 0
        while (batches_size // 1024**2) < self.maxfilesize:
            batch = self.write_batch_arrow()
            batches_size += pa.get_record_batch_size(batch)
            writer.write_batch(batch)
            if self.checkcount():
                break

        writer.close()
        buf = sink.getvalue()
        return buf
Example 30
def arrow_file(
    table: Union[Dict[str, List[Any]], pyarrow.Table],
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[pathlib.Path]:
    """
    Yield a path with `table` written to an Arrow file.
    """
    if isinstance(table, dict):
        table = pyarrow.Table.from_pydict(table)

    with tempfile_context(dir=dir) as path:
        writer = pyarrow.RecordBatchFileWriter(str(path), table.schema)
        writer.write_table(table)
        writer.close()
        yield path
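Example usage of the helper above (a sketch): the yielded path can be opened directly with pyarrow's IPC reader.

with arrow_file({"A": [1, 2, 3]}) as path:
    with pyarrow.OSFile(str(path), "rb") as source:
        assert pyarrow.ipc.open_file(source).read_all().num_rows == 3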