def from_record_batches(
    cls,
    record_batches,
    output_types,
    output_shapes=None,
    columns=None,
    batch_size=None,
    batch_mode="keep_remainder",
):
    """Create an ArrowDataset directly from Arrow record batches.

    This constructor requires pyarrow to be installed.

    Args:
        record_batches: An Arrow record batch or sequence of record batches
        output_types: Tensor dtypes of the output tensors
        output_shapes: TensorShapes of the output tensors or None to
            infer partial
        columns: A list of column indices to be used in the Dataset
        batch_size: Batch size of output tensors. Setting a batch size here
            will create batched tensors from Arrow memory and can be more
            efficient than using tf.data.Dataset.batch().
            NOTE: batch_size does not need to be set if batch_mode='auto'
        batch_mode: Mode of batching, supported strings:
            "keep_remainder" (default, keeps partial batch data),
            "drop_remainder" (discards partial batch data),
            "auto" (size to number of records in Arrow record batch)
    """
    import pyarrow as pa  # pylint: disable=import-outside-toplevel

    if isinstance(record_batches, pa.RecordBatch):
        record_batches = [record_batches]
    assert record_batches
    if columns is None:
        columns = tuple(range(record_batches[0].num_columns))
    if tf.executing_eagerly():
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, record_batches[0].schema)
        for batch in record_batches:
            writer.write_batch(batch)
        writer.close()
        serialized_batches = None
        arrow_buffer = sink.getvalue()
    else:
        buf = io.BytesIO()
        writer = pa.RecordBatchFileWriter(buf, record_batches[0].schema)
        for batch in record_batches:
            writer.write_batch(batch)
        writer.close()
        serialized_batches = tf.convert_to_tensor(
            buf.getvalue(), dtype=dtypes.string, name="serialized_batches")
        arrow_buffer = None
    return cls(
        serialized_batches,
        columns,
        output_types,
        output_shapes,
        batch_size=batch_size,
        batch_mode=batch_mode,
        arrow_buffer=arrow_buffer,
    )
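# A minimal usage sketch for from_record_batches (hypothetical data; assumes
# ArrowDataset is the class defined in the snippets here and that the
# tensorflow-io arrow module is importable as shown elsewhere in this file).
import pyarrow as pa
import tensorflow as tf

batch = pa.RecordBatch.from_arrays(
    [pa.array([1, 2, 3], type=pa.int32())], ["f0"])
dataset = ArrowDataset.from_record_batches(
    batch,
    output_types=(tf.dtypes.int32,),
    batch_mode="auto",  # with "auto", batch_size does not need to be set
)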
def to_arrow(self, path: Pathlike) -> None:
    """
    Store the manifest in the Apache Arrow binary IPC file format.
    For very large manifests it can be ~5x larger than a corresponding
    compressed JSONL, but it allows reading the manifest with a relatively
    small memory footprint (~300M).
    """
    import pyarrow as pa

    # If the underlying storage for manifests is already lazy, we can
    # access the arrow tables directly without the need to convert items.
    if self.is_lazy:
        # TODO: I don't want to add a special method for retrieving those in
        #  each manifest type; after this work is done, I will make a
        #  refactoring PR that renames these members to sth like ".data"
        #  so that it's uniform across manifests.
        from lhotse import RecordingSet, SupervisionSet, CutSet

        if isinstance(self, RecordingSet):
            table = self.recordings.table
        elif isinstance(self, SupervisionSet):
            table = self.segments.table
        elif isinstance(self, CutSet):
            table = self.cuts.table
        else:
            raise NotImplementedError(
                f"Unsupported type of manifest for arrow serialization: {type(self)}"
            )
        with open(path, "wb") as f, pa.RecordBatchFileWriter(
            f, schema=table.schema
        ) as writer:
            for batch in table.to_batches():
                writer.write_batch(batch)
    else:
        # We will take the first 1000 items from the manifest to infer the schema.
        # TODO: might want to sample items randomly in case their manifests vary...
        schema = pa.schema(
            pa.array(list(self.subset(first=1000).to_dicts())).type)

        # Open the file for writing and initialize the pyarrow batch writer.
        # Note that the batch size we determine here will be used to load
        # whole chunks into memory during deserialization.
        with open(path, "wb") as f, pa.RecordBatchFileWriter(
            f, schema=schema
        ) as writer:
            # We are (lazily) grouping the items in the manifest into chunks
            # of ``batch_size`` items each.
            batch_size = 10 * 1024
            chunks = grouper(n=batch_size, iterable=self.to_dicts())
            for chunk in chunks:
                # We convert the items in each chunk into Arrow's columnar
                # representation. To do this, we first iterate over the
                # available "columns" (i.e. dict keys), and for each of them
                # create an Arrow array with the corresponding values.
                # These arrays are then used to create an Arrow table.
                arrays = [
                    pa.array([item.get(key) for item in chunk],
                             type=schema.field(key_idx).type)
                    for key_idx, key in enumerate(schema.names)
                ]
                table = pa.Table.from_arrays(arrays, schema=schema)
                # Each chunk holds at most ``batch_size`` items, so the
                # table converts into exactly one record batch.
                for batch in table.to_batches(max_chunksize=batch_size):
                    writer.write_batch(batch)
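# A minimal reading counterpart for the file written above (a sketch, not
# part of the original API): memory-mapping the IPC file is what keeps the
# memory footprint small when the manifest is large.
import pyarrow as pa

def read_arrow_table(path: str) -> pa.Table:
    source = pa.memory_map(path)
    # Zero-copy: the table's columns reference the mapped file
    # instead of being copied into heap RAM up front.
    return pa.ipc.open_file(source).read_all()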
def _serialize_table(obj):
    if __supports_pyarrow:
        try:
            f = BytesIO()
            data = pa.Table.from_pandas(obj, preserve_index=False)
            if __use_legacy_export:
                batch_writer = pa.RecordBatchFileWriter(
                    f, data.schema, use_legacy_format=True)
            else:
                batch_writer = pa.RecordBatchFileWriter(f, data.schema)
            with batch_writer as writer:
                writer.write_table(data)
            return {
                "arrow": True,
                # Tornado will encode this
                "data": f.getvalue(),
            }
        except Exception as e:
            print("Failed to serialize to Arrow")
            print(e)
            # fall through to the JSON format

    def _serialize_row(row_obj, column_types, row_name=None):
        return {
            "children": [],
            "name": row_name,
            "data": [
                serialize(irow, column_types[i])
                for i, irow in enumerate(row_obj)
            ]
        }

    serialized_table = {"rows": [], "cols": [], "types": []}
    for col in obj:
        serialized_table["cols"].append(col)
        # Use the numpy types to infer the MavenType
        serialized_table["types"].append(
            guess_type(obj.dtypes[col].type, check_instanceof=False))
    for i, row in obj.iterrows():
        serialized_table["rows"].append(
            _serialize_row(row, serialized_table["types"], i))
    return serialized_table
def __init__(self, record_batches, columns, output_types, output_shapes=None):
    """Create an ArrowDataset directly from Arrow record batches.

    This constructor requires pyarrow to be installed.

    Args:
        record_batches: An Arrow record batch or sequence of record batches
        columns: A list of column indices to be used in the Dataset
        output_types: Tensor dtypes of the output tensors
        output_shapes: TensorShapes of the output tensors or None to
            infer partial
    """
    self._columns = columns
    self._output_types = output_types
    self._output_shapes = output_shapes or \
        nest.map_structure(
            lambda _: tensorflow.TensorShape(None), self._output_types)

    import pyarrow as pa

    if isinstance(record_batches, pa.RecordBatch):
        record_batches = [record_batches]
    assert record_batches

    buf = io.BytesIO()
    writer = pa.RecordBatchFileWriter(buf, record_batches[0].schema)
    for batch in record_batches:
        writer.write_batch(batch)
    writer.close()

    self._serialized_batches = tensorflow.convert_to_tensor(
        buf.getvalue(), dtype=dtypes.string, name="serialized_batches")

    super(ArrowDataset, self).__init__(columns, output_types, output_shapes)
def test_feed_batches(self):
    """
    Test that an ArrowDataset can initialize an iterator to feed a placeholder
    """
    truth_data = TruthData(
        [list(range(10)), [x * 1.1 for x in range(10)]],
        (dtypes.int32, dtypes.float64),
        (tf.TensorShape([]), tf.TensorShape([])))

    batch = self.make_record_batch(truth_data)

    buf = io.BytesIO()
    writer = pa.RecordBatchFileWriter(buf, batch.schema)
    writer.write_batch(batch)
    writer.close()

    buf_placeholder = tf.compat.v1.placeholder(
        tf.dtypes.string, tf.TensorShape([]))

    dataset = arrow_io.ArrowDataset(
        buf_placeholder,
        list(range(len(truth_data.output_types))),
        truth_data.output_types,
        truth_data.output_shapes)
    it = dataset.make_initializable_iterator()
    next_element = it.get_next()

    with self.test_session() as sess:
        sess.run(it.initializer,
                 feed_dict={buf_placeholder: buf.getvalue()})
        for row in range(len(truth_data.data)):
            value = sess.run(next_element)
            self.assertEqual(value[0], truth_data.data[0][row])
            self.assertAlmostEqual(value[1], truth_data.data[1][row], 4)
def dataframe_to_arrow_table(
    dataframe: pd.DataFrame, columns: List[Column], path: Path
) -> atypes.ArrowTable:
    """
    Write `dataframe` to an Arrow file and return an ArrowTable backed by it.

    The result will consume little RAM, because its data is stored in an
    mmapped file.
    """
    arrow_columns = []
    if columns:
        arrays = []
        for pandas_column in columns:
            arrays.append(series_to_arrow_array(dataframe[pandas_column.name]))
            arrow_columns.append(pandas_column.to_arrow())

        arrow_table = pyarrow.Table.from_arrays(
            arrays, names=[c.name for c in columns])

        with pyarrow.RecordBatchFileWriter(str(path), arrow_table.schema) as writer:
            writer.write_table(arrow_table)
    else:
        path = None
        arrow_table = None

    return atypes.ArrowTable(
        path, arrow_table, atypes.TableMetadata(len(dataframe), arrow_columns))
def h5ad_to_arrow(h5ad_file, arrow_file):
    adata = read_h5ad(h5ad_file)
    umap = adata.obsm['X_umap'].transpose()
    leiden = adata.obs['leiden'].to_numpy().astype('uint8')
    index = adata.obs.index

    # Only use the cell type predictions if they are available in the AnnData file.
    adata_is_annotated = has_cell_type_annotations(adata)
    if adata_is_annotated:
        predicted_cell_type = adata.obs[PREDICTED_ASCT_CELLTYPE].astype(str)
    else:
        predicted_cell_type = None

    df = DataFrame(
        data={
            'umap_x': umap[0],
            'umap_y': umap[1],
            'leiden': leiden,
            **({
                PREDICTED_ASCT_CELLTYPE: predicted_cell_type,
            } if adata_is_annotated else {})
        },
        index=index
    )
    table = pa.Table.from_pandas(df)

    writer = pa.RecordBatchFileWriter(arrow_file, table.schema)
    writer.write(table)
    writer.close()
def test_arrow_write_table(tmpdir):
    path = str(tmpdir.join('test.arrow'))
    with pa.OSFile(path, 'wb') as sink:
        with pa.RecordBatchFileWriter(sink, table.schema) as writer:
            writer.write_table(table)
    df = vaex.open(path)
def test_arrow_io_dataset_map_from_file(self):
    """test_arrow_io_dataset_map_from_file"""
    column = "a"
    dtype = dtypes.int64
    column_dtype = self.get_arrow_type(dtype, False)
    arr = pa.array(list(range(100)), column_dtype)
    table = pa.Table.from_arrays([arr], [column])
    spec = {column: dtype}

    with tempfile.NamedTemporaryFile(delete=False) as f:
        with pa.RecordBatchFileWriter(f.name, table.schema) as writer:
            for batch in table.to_batches():
                writer.write_batch(batch)

    def from_file(_):
        reader = pa.RecordBatchFileReader(f.name)
        t = reader.read_all()
        tio = IOTensor.from_arrow(t, spec=spec)
        return tio(column).to_tensor()

    num_iters = 2
    ds = tf.data.Dataset.range(num_iters).map(from_file)
    expected = table[column].to_pylist()

    iter_count = 0
    for result in ds:
        npt.assert_array_equal(result, expected)
        iter_count += 1

    self.assertEqual(iter_count, num_iters)
    os.unlink(f.name)
def arrow_table_context(
    *columns,
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[Tuple[pathlib.Path, pyarrow.Table]]:
    """Yield a Path and a pa.Table with its contents.

    Two calling conventions:

        with arrow_table_context(make_column("A", [1]), make_column("B", [2])) as (path, table):
            pass

        table = make_table(make_column("A", [1]), make_column("B", [1]))
        with arrow_table_context(table) as (path, _):
            pass
    """
    if len(columns) == 1 and isinstance(columns[0], pyarrow.Table):
        table = columns[0]
    else:
        table = make_table(*columns)

    with tempfile_context(dir=dir) as path:
        writer = pyarrow.RecordBatchFileWriter(path, table.schema)
        writer.write_table(table)
        writer.close()
        yield path, table
def test_envvar_set_legacy_ipc_format():
    schema = pa.schema([pa.field('foo', pa.int32())])

    writer = pa.RecordBatchStreamWriter(pa.BufferOutputStream(), schema)
    assert not writer._use_legacy_format
    writer = pa.RecordBatchFileWriter(pa.BufferOutputStream(), schema)
    assert not writer._use_legacy_format

    import os
    os.environ['ARROW_PRE_0_15_IPC_FORMAT'] = '1'
    writer = pa.RecordBatchStreamWriter(pa.BufferOutputStream(), schema)
    assert writer._use_legacy_format
    writer = pa.RecordBatchFileWriter(pa.BufferOutputStream(), schema)
    assert writer._use_legacy_format

    del os.environ['ARROW_PRE_0_15_IPC_FORMAT']
def test_tf_function(self):
    """Test that an ArrowDataset can be used in a tf.function call"""
    if not tf.version.VERSION.startswith("2."):
        self.skipTest("Test requires TF2.0 for tf.function")

    truth_data = TruthData(
        [list(range(10)), [x * 1.1 for x in range(10)]],
        (dtypes.int32, dtypes.float64),
        (tf.TensorShape([]), tf.TensorShape([])),
    )

    @tf.function
    def create_arrow_dataset(serialized_batch):
        """Create an arrow dataset from input tensor"""
        dataset = arrow_io.ArrowDataset(
            serialized_batch,
            list(range(len(truth_data.output_types))),
            truth_data.output_types,
            truth_data.output_shapes,
        )
        return dataset

    batch = self.make_record_batch(truth_data)
    buf = io.BytesIO()
    writer = pa.RecordBatchFileWriter(buf, batch.schema)
    writer.write_batch(batch)
    writer.close()

    for row, results in enumerate(create_arrow_dataset(buf.getvalue())):
        value = [result.numpy() for result in results]
        self.assertEqual(value[0], truth_data.data[0][row])
        self.assertAlmostEqual(value[1], truth_data.data[1][row], 4)
def arrow_file(table: pyarrow.Table) -> ContextManager[pathlib.Path]:
    with empty_file(suffix=".arrow") as path:
        with path.open("wb") as f:
            writer = pyarrow.RecordBatchFileWriter(f, table.schema)
            writer.write(table)
            writer.close()
        yield path
def enqueue_tensor(self, uri, data):
    if isinstance(data, np.ndarray):
        # A single ndarray is treated as a one-element list of tensors.
        data = [data]
    if not isinstance(data, list):
        raise Exception(
            "Invalid input: only an ndarray or a list of ndarrays is allowed."
        )

    sink = pa.BufferOutputStream()
    writer = None
    for d in data:
        # Encode each tensor as [rank, flat length, *shape, *flattened values],
        # all as float32, in a single-column record batch.
        shape = np.array(d.shape, dtype="float32")
        d = d.astype("float32").flatten()
        len_arr = np.array([len(shape), len(d)], dtype="float32")
        data_arr = np.concatenate([len_arr, shape, d])
        arrow_arr = pa.array(data_arr)
        batch = pa.RecordBatch.from_arrays([arrow_arr], ["0"])
        if writer is None:
            # Initialize the writer with the schema of the first batch.
            writer = pa.RecordBatchFileWriter(sink, batch.schema)
        writer.write_batch(batch)
    writer.close()

    buf = sink.getvalue()
    b = buf.to_pybytes()
    tensor_encoded = self.base64_encode_image(b)
    d = {"uri": uri, "tensor": tensor_encoded}
    self.__enqueue_data("tensor_stream", d)
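# A hypothetical decoder for the wire format above (a sketch, not part of the
# original client): each record batch holds one float32 vector laid out as
# [rank, flat length, *shape, *values].
import numpy as np
import pyarrow as pa

def decode_tensors(arrow_bytes: bytes):
    reader = pa.ipc.open_file(pa.BufferReader(arrow_bytes))
    tensors = []
    for i in range(reader.num_record_batches):
        vec = reader.get_batch(i).column(0).to_numpy()
        rank = int(vec[0])
        shape = vec[2:2 + rank].astype("int64")
        tensors.append(vec[2 + rank:].reshape(shape))
    return tensors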
def dumps(self, batch):
    import pyarrow as pa
    import io

    sink = io.BytesIO()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()
    return sink.getvalue()
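# A matching `loads` sketch (hypothetical, not from the original source):
# the file written above contains a single record batch, so we can read it
# back directly.
def loads(self, data):
    import pyarrow as pa

    reader = pa.ipc.open_file(pa.BufferReader(data))
    return reader.get_batch(0)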
def compress(array: pa.Array) -> bytes:
    rb = pa.RecordBatch.from_arrays([array], ["array"])
    buf = io.BytesIO()
    writer = pa.RecordBatchFileWriter(buf, rb.schema)
    writer.write_batch(rb)
    writer.close()
    buf.seek(0)
    return brotli.compress(buf.read())
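# A hypothetical inverse of compress (a sketch, assuming the same
# single-column "array" layout): brotli-decompress the payload, then read
# the lone record batch back and return its only column.
import brotli
import pyarrow as pa

def decompress(data: bytes) -> pa.Array:
    raw = brotli.decompress(data)
    reader = pa.ipc.open_file(pa.BufferReader(raw))
    return reader.get_batch(0).column(0)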
def save_results_arrow(filename, pdf2):
    # Save results.
    import pyarrow as pa

    table = pa.Table.from_pandas(pdf2)
    with open(filename, 'wb') as f:
        writer = pa.RecordBatchFileWriter(f, table.schema)
        writer.write(table)
        writer.close()
def dataframe_to_arrowfile(df):
    '''Pandas DataFrame to Arrow file format'''
    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
    sink = io.BytesIO()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()
    return sink.getvalue()
def test_register_object(self):
    data = [
        pa.array([1, 2, 3, 4]),
        pa.array(["foo", "bar", "baz", None]),
        pa.array([True, None, False, True]),
    ]
    batch = pa.RecordBatch.from_arrays(data, ["f0", "f1", "f2"])

    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    for i in range(10):
        writer.write_batch(batch)
    writer.close()

    mymsg = DummyMessage()
    mymsg.name = "dummy"
    mymsg.description = "really dumb"

    mymenu = CronusObject()
    mymenu.name = "menu"
    menuinfo = MenuObjectInfo()
    menuinfo.created.GetCurrentTime()
    bufmenu = pa.py_buffer(mymenu.SerializeToString())

    myconfig = Configuration()
    myconfig.uuid = str(uuid.uuid4())
    myconfig.name = f"{myconfig.uuid}.config.dat"
    configinfo = ConfigObjectInfo()
    configinfo.created.GetCurrentTime()
    bufconfig = pa.py_buffer(myconfig.SerializeToString())

    with tempfile.TemporaryDirectory() as dirpath:
        _path = dirpath + "/test"
        # Wrapper to the CronusStore message.
        store = BaseObjectStore(str(_path), "test")

        fileinfo = FileObjectInfo()
        fileinfo.type = 5
        fileinfo.aux.description = "Some dummy data"

        menu_uuid = store.register_content(mymenu, menuinfo).uuid
        config_uuid = store.register_content(myconfig, configinfo).uuid
        dataset = store.register_dataset(menu_uuid, config_uuid)
        store.new_partition(dataset.uuid, "key")

        path = dirpath + "/test/dummy.arrow"
        with pa.OSFile(str(path), "wb") as f:
            f.write(sink.getvalue())

        id_ = store.register_content(
            path, fileinfo, dataset_id=dataset.uuid, partition_key="key").uuid
        print(store[id_].address)

        buf = pa.py_buffer(store.get(id_))
        reader = pa.ipc.open_file(buf)
        self.assertEqual(reader.num_record_batches, 10)
def csv_to_arrow(csv_file: Path, arrow_file: Path):
    df = pd.read_csv(csv_file, index_col=0)
    df.index.name = 'index'
    df.columns = ['umap_x', 'umap_y', 'leiden']
    table = pa.Table.from_pandas(df)

    writer = pa.RecordBatchFileWriter(arrow_file, table.schema)
    writer.write(table)
    writer.close()
def parquet_to_arrow(self):
    batches = self.dataset.to_batches(batch_size=self.batch_size)
    gc.collect()
    sink = pa.OSFile(self.arrow_file, 'wb')
    writer = pa.RecordBatchFileWriter(sink, self.dataset.schema)
    for batch in batches:
        writer.write_batch(batch)
    writer.close()
    sink.close()
    self.filetype = "arrow"
def pandas_to_arrow(frame):
    """Convert a pandas dataframe to an Apache Arrow serialized buffer."""
    batch = pa.RecordBatch.from_pandas(frame, preserve_index=False)
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()
    arrow_buffer = sink.getvalue()
    return arrow_buffer.to_pybytes()
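# A hypothetical reverse helper (a sketch, not from the original module):
# reads the bytes produced by pandas_to_arrow back into a pandas dataframe.
import pyarrow as pa

def arrow_to_pandas(data: bytes):
    reader = pa.ipc.open_file(pa.BufferReader(data))
    return reader.read_all().to_pandas()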
def func():
    df = table.to_pandas()
    batch = pa.RecordBatch.from_pandas(df)

    sink = io.BytesIO()
    writer = pa.RecordBatchFileWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()

    buf_reader = pa.BufferReader(sink.getvalue())
    reader = pa.ipc.open_file(buf_reader)
    reader.read_all()
def make_arrow(root, dataset_root):
    with open(f"{root}/karpathy/dataset_flickr30k.json", "r") as fp:
        captions = json.load(fp)

    captions = captions["images"]

    iid2captions = defaultdict(list)
    iid2split = dict()

    for cap in tqdm(captions):
        filename = cap["filename"]
        iid2split[filename] = cap["split"]
        for c in cap["sentences"]:
            iid2captions[filename].append(c["raw"])

    paths = list(glob(f"{root}/flickr30k-images/*.jpg"))
    random.shuffle(paths)
    caption_paths = [
        path for path in paths if path.split("/")[-1] in iid2captions
    ]

    if len(paths) == len(caption_paths):
        print("all images have caption annotations")
    else:
        print("not all images have caption annotations")
    print(
        len(paths),
        len(caption_paths),
        len(iid2captions),
    )

    bs = [
        path2rest(path, iid2captions, iid2split) for path in tqdm(caption_paths)
    ]

    for split in ["train", "val", "test"]:
        batches = [b for b in bs if b[-1] == split]

        dataframe = pd.DataFrame(
            batches,
            columns=["image", "caption", "image_id", "split"],
        )

        table = pa.Table.from_pandas(dataframe)

        os.makedirs(dataset_root, exist_ok=True)
        with pa.OSFile(
            f"{dataset_root}/f30k_caption_karpathy_{split}.arrow", "wb"
        ) as sink:
            with pa.RecordBatchFileWriter(sink, table.schema) as writer:
                writer.write_table(table)
def write_to_arrow(cls, data_wrapper, path_to_outfile):
    data_schema = data_wrapper.table_schema().with_metadata(
        data_wrapper.schema_skyhook_metadata().to_byte_coercible())

    cls.logger.info('>>> writing data in single arrow file')
    with open(path_to_outfile, 'wb') as arrow_handle:
        # Use the writer as a context manager so that it is closed and the
        # IPC file footer is written; otherwise the file is unreadable.
        with pyarrow.RecordBatchFileWriter(arrow_handle, data_schema) as batch_writer:
            for record_batch in data_wrapper.as_arrow_table(data_schema).to_batches():
                batch_writer.write_batch(record_batch)
    cls.logger.info('<<< data written')
def test_write_empty_ipc_file():
    # ARROW-3894: IPC file was not being properly initialized when no record
    # batches are being written.
    schema = pa.schema([('field', pa.int64())])

    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, schema)
    writer.close()

    buf = sink.getvalue()
    reader = pa.RecordBatchFileReader(pa.BufferReader(buf))
    table = reader.read_all()
    assert len(table) == 0
    assert table.schema.equals(schema)
def test_deprecated_pyarrow_ns_apis():
    table = pa.table([pa.array([1, 2, 3, 4])], names=['a'])
    sink = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(sink, table.schema) as writer:
        writer.write(table)
    with pytest.warns(FutureWarning,
                      match="please use pyarrow.ipc.open_stream"):
        pa.open_stream(sink.getvalue())

    sink = pa.BufferOutputStream()
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write(table)
    with pytest.warns(FutureWarning,
                      match="please use pyarrow.ipc.open_file"):
        pa.open_file(sink.getvalue())
def write_columns(self, name, type, columns):
    path = self.datadir + '/' + self.outputs[name]
    self.write_type(path, type)

    schema = pa.schema(
        [pa.field(name, a.type) for (name, a) in columns.items()])
    t = pa.Table.from_arrays(list(columns.values()), schema=schema)

    with pa.output_stream(path + '/data.arrow') as sink:
        writer = pa.RecordBatchFileWriter(sink, t.schema)
        batches = t.to_batches(max_chunksize=len(t))
        if batches:
            assert len(batches) == 1
            writer.write_batch(batches[0])
        writer.close()

    with open(path + '/_SUCCESS', 'w'):
        pass
def write_recordbatchfile(self):
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchFileWriter(sink, self.pa_schema)

    batches_size = 0
    while (batches_size // 1024**2) < self.maxfilesize:
        batch = self.write_batch_arrow()
        batches_size += pa.get_record_batch_size(batch)
        writer.write_batch(batch)
        if self.checkcount():
            break

    writer.close()
    buf = sink.getvalue()
    return buf
def arrow_file(
    table: Union[Dict[str, List[Any]], pyarrow.Table],
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[pathlib.Path]:
    """Yield a path with `table` written to an Arrow file."""
    if isinstance(table, dict):
        table = pyarrow.Table.from_pydict(table)

    with tempfile_context(dir=dir) as path:
        writer = pyarrow.RecordBatchFileWriter(str(path), table.schema)
        writer.write_table(table)
        writer.close()
        yield path