def deserialize_pandas(buf, nthreads=None):
    """Deserialize a buffer protocol compatible object into a pandas DataFrame.

    Parameters
    ----------
    buf : buffer
        An object compatible with the buffer protocol
    nthreads : int, default None
        The number of threads to use to convert the buffer to a DataFrame,
        default all CPUs

    Returns
    -------
    df : pandas.DataFrame
    """
    buffer_reader = pa.BufferReader(buf)
    reader = pa.RecordBatchStreamReader(buffer_reader)
    table = reader.read_all()
    return table.to_pandas(nthreads=nthreads)
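# Hedged usage sketch (not part of the original source): deserialize_pandas
# expects the bytes of an Arrow IPC stream, for example one produced by a
# matching serialize step. The helper below is an assumption about how such a
# buffer could be built with public pyarrow APIs.
def _serialize_pandas_sketch(df):
    batch = pa.RecordBatch.from_pandas(df)
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
    writer.write_batch(batch)
    writer.close()
    return sink.getvalue()

# round trip: df_roundtrip = deserialize_pandas(_serialize_pandas_sketch(df))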
def test_stream_write_dispatch(stream_fixture):
    # ARROW-1616
    df = pd.DataFrame({
        'one': np.random.randn(5),
        'two': pd.Categorical(['foo', np.nan, 'bar', 'foo', 'foo'],
                              categories=['foo', 'bar'],
                              ordered=True)
    })
    table = pa.Table.from_pandas(df, preserve_index=False)
    batch = pa.RecordBatch.from_pandas(df, preserve_index=False)

    writer = stream_fixture._get_writer(stream_fixture.sink, table.schema)
    writer.write(table)
    writer.write(batch)
    writer.close()

    table = (pa.ipc.open_stream(pa.BufferReader(stream_fixture.get_source()))
             .read_all())
    assert_frame_equal(
        table.to_pandas(),
        pd.concat([df, df], ignore_index=True))
def recv_time_and_tensor(self, copy=False, flags=0, track=False):
    """Receive an Apache Arrow tensor and a timestamp.

    Arguments:
        copy: (optional) zmq copy flag.
        flags: (optional) zmq flags.
        track: (optional) zmq track flag.

    Returns:
        t: (datetime) timestamp.
        tensor: (pyarrow.Tensor) tensor.
    """
    frames = self.recv_multipart(copy=copy)
    t = struct.unpack(self.time_byte_format, frames[0].buffer)[0]
    t = datetime.datetime.fromtimestamp(t, pytz.utc)
    tensor = pa.ipc.read_tensor(pa.BufferReader(frames[1].buffer))
    return t, tensor
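# Hedged sketch of a matching sender (an assumption, not part of the original
# source): it packs a UTC timestamp with the same struct format used by the
# receiver above and serializes the tensor with pyarrow's IPC tensor writer
# before sending both frames over zmq. `self.time_byte_format` and
# `send_multipart` mirror the hypothetical socket wrapper the receiver lives on.
def send_time_and_tensor_sketch(self, ndarray, flags=0, copy=False, track=False):
    t = datetime.datetime.now(pytz.utc).timestamp()
    time_frame = struct.pack(self.time_byte_format, t)

    tensor = pa.Tensor.from_numpy(ndarray)
    sink = pa.BufferOutputStream()
    pa.ipc.write_tensor(tensor, sink)  # serialize the tensor into the sink

    return self.send_multipart([time_frame, sink.getvalue()],
                               flags=flags, copy=copy, track=track)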
def test_parquet_period(tmpdir, registered_period_type):
    # Parquet support for primitive extension types
    period_type, period_class = registered_period_type
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq
    filename = tmpdir / 'period_extension_type.parquet'
    pq.write_table(table, filename)

    # Stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema))
    # Since the type could be reconstructed, the extension type metadata is
    # absent.
    assert schema.field("ext").metadata == {}

    # When reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == period_type
    assert result.schema.field("ext").metadata == {b'PARQUET:field_id': b'1'}

    # Get the exact array class defined by the registered type.
    result_array = result.column("ext").chunk(0)
    assert type(result_array) is period_class

    # When the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.schema.field("ext").type == pa.int64()
    # The extension metadata is present for roundtripping.
    assert result.schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'test.period',
        b'PARQUET:field_id': b'1',
    }
def read_from_plasma(name):
    # get the current plasma_state
    with open("plasma_state.pkl", "rb") as f:
        plasma_state = pickle.load(f)
    # get the object ID for the dataframe
    object_id = plasma_state[name]
    # get the client and read from it
    client = plasma.connect("/tmp/plasma")
    # Fetch the Plasma object (get PlasmaBuffer from ObjectID)
    [data] = client.get_buffers([object_id])
    buffer = pa.BufferReader(data)
    # Convert object back into an Arrow RecordBatch
    reader = pa.RecordBatchStreamReader(buffer)
    record_batch = reader.read_next_batch()
    # Convert back into Pandas
    df = record_batch.to_pandas()
    # close out and finish
    client.disconnect()
    return df
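# Hedged counterpart sketch (an assumption, not from the original source):
# one way the dataframe could have been placed in Plasma so that
# read_from_plasma() above finds it. The pickle-based "plasma_state.pkl"
# registry mirrors what the reader expects, and the legacy pyarrow.plasma API
# (client.create / FixedSizeBufferWriter / seal) is used for the write.
def write_to_plasma_sketch(df, name):
    client = plasma.connect("/tmp/plasma")
    record_batch = pa.RecordBatch.from_pandas(df)

    # measure the serialized size with a mock sink
    mock_sink = pa.MockOutputStream()
    with pa.RecordBatchStreamWriter(mock_sink, record_batch.schema) as writer:
        writer.write_batch(record_batch)

    # allocate a Plasma buffer and write the IPC stream into it
    object_id = plasma.ObjectID(np.random.bytes(20))
    buf = client.create(object_id, mock_sink.size())
    with pa.RecordBatchStreamWriter(pa.FixedSizeBufferWriter(buf),
                                    record_batch.schema) as writer:
        writer.write_batch(record_batch)
    client.seal(object_id)
    client.disconnect()

    # remember the ObjectID under the given name, as the reader expects
    # (assumes the registry file already exists)
    with open("plasma_state.pkl", "rb") as f:
        plasma_state = pickle.load(f)
    plasma_state[name] = object_id
    with open("plasma_state.pkl", "wb") as f:
        pickle.dump(plasma_state, f)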
def get_df(self, df_name):
    """
    Given some name for a dataframe, read that thing off of Plasma.
    """
    # Grab the plasma ObjectID
    object_id = self.object_names[df_name]
    # Fetch the Plasma object
    [data] = self.client.get_buffers([object_id])
    buffer = pa.BufferReader(data)
    # Convert object back into an Arrow RecordBatch
    reader = pa.RecordBatchStreamReader(buffer)
    record_batch = reader.read_next_batch()
    # Convert back into Pandas
    return record_batch.to_pandas()
def test_chunked_binary_error_message():
    # ARROW-3058: As Feather does not yet support chunked columns, we at least
    # make sure it's clear to the user what is going on

    # 2^31 + 1 bytes
    values = [b'x'] + [b'x' * (1 << 20)] * 2 * (1 << 10)
    df = pd.DataFrame({'byte_col': values})

    # Works fine with version 2
    buf = io.BytesIO()
    write_feather(df, buf, version=2)
    result = read_feather(pa.BufferReader(buf.getvalue()))
    assert_frame_equal(result, df)

    with pytest.raises(ValueError,
                       match="'byte_col' exceeds 2GB maximum "
                             "capacity of a Feather binary column. This "
                             "restriction may be lifted in the future"):
        write_feather(df, io.BytesIO(), version=1)
def test_write_with_features():
    output = pa.BufferOutputStream()
    features = Features({"labels": ClassLabel(names=["neg", "pos"])})
    with ArrowWriter(stream=output, features=features) as writer:
        writer.write({"labels": 0})
        writer.write({"labels": 1})
        num_examples, num_bytes = writer.finalize()
    assert num_examples == 2
    assert num_bytes > 0
    assert writer._schema == features.arrow_schema
    assert writer._schema.metadata == features.arrow_schema.metadata

    stream = pa.BufferReader(output.getvalue())
    f = pa.ipc.open_stream(stream)
    pa_table: pa.Table = f.read_all()
    schema = pa_table.schema
    assert pa_table.num_rows == 2
    assert schema == features.arrow_schema
    assert schema.metadata == features.arrow_schema.metadata
    assert features == Features.from_arrow_schema(schema)
def test_compressed_recordbatch_stream(compression):
    # ARROW-4836: roundtrip a RecordBatch through a compressed stream
    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    try:
        stream = pa.CompressedOutputStream(raw, compression)
    except NotImplementedError as e:
        if compression == "bz2":
            pytest.skip(str(e))
        else:
            raise
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # Flush data

    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table
def _parquet_bytes_to_dict(column: str, index_buffer: bytes):
    reader = pa.BufferReader(index_buffer)
    # This could be done much more efficiently, but that would take a lot more
    # time to implement, so it will only be done on request.
    table = pq.read_table(reader)
    column_type = table.schema.field(column).type

    # `datetime.datetime` objects have a precision of up to microseconds only,
    # so arrow parses the type to `pa.timestamp("us")`. Since the values are
    # normalized to `numpy.datetime64[ns]` anyway, we do not care about this
    # and load the column type as `pa.timestamp("ns")`.
    if column_type == pa.timestamp("us"):
        column_type = pa.timestamp("ns")

    df = table.to_pandas()

    index_dct = dict(
        zip(df[column].values,
            (list(x) for x in df[_PARTITION_COLUMN_NAME].values))
    )
    return index_dct, column_type
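# Hedged sketch (an assumption about the surrounding module, not its actual
# serializer): the inverse operation, turning an index dict back into a Parquet
# byte payload that _parquet_bytes_to_dict() above could consume.
# _PARTITION_COLUMN_NAME is the same module-level constant used above.
def _dict_to_parquet_bytes_sketch(column, index_dct):
    df = pd.DataFrame({
        column: list(index_dct.keys()),
        _PARTITION_COLUMN_NAME: [list(v) for v in index_dct.values()],
    })
    table = pa.Table.from_pandas(df, preserve_index=False)
    sink = pa.BufferOutputStream()
    pq.write_table(table, sink)
    return sink.getvalue().to_pybytes()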
def test_bytes_reader():
    # Like a BytesIO, but zero-copy underneath for C++ consumers
    data = b'some sample data'
    f = pa.BufferReader(data)
    assert f.tell() == 0

    assert f.size() == len(data)

    assert f.read(4) == b'some'
    assert f.tell() == 4

    f.seek(0)
    assert f.tell() == 0

    f.seek(5)
    assert f.tell() == 5

    assert f.read(50) == b'sample data'

    f.close()
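# Hedged illustration (not part of the original test): the "zero-copy" remark
# above can be made visible with read_buffer(), which returns a pyarrow.Buffer
# that is a slice of the original memory rather than a Python bytes copy.
def bytes_reader_zero_copy_sketch():
    data = b'some sample data'
    buf = pa.py_buffer(data)
    reader = pa.BufferReader(buf)
    chunk = reader.read_buffer(4)          # pa.Buffer, no copy
    assert chunk.to_pybytes() == b'some'   # bytes are materialized on request
    assert chunk.address == buf.address    # same underlying memory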
def leak2():
    data = [pa.array(np.concatenate([np.random.randn(100000)] * 10))]
    table = pa.Table.from_arrays(data, ['foo'])
    while True:
        print('calling to_pandas')
        print('memory_usage: {0}'.format(memory_profiler.memory_usage()))
        df = table.to_pandas()

        batch = pa.RecordBatch.from_pandas(df)

        sink = io.BytesIO()
        writer = pa.RecordBatchFileWriter(sink, batch.schema)
        writer.write_batch(batch)
        writer.close()

        buf_reader = pa.BufferReader(sink.getvalue())
        reader = pa.open_file(buf_reader)
        reader.read_all()

        gc.collect()
def deserialize_pandas(buf, nthreads=None, use_threads=False):
    """Deserialize a buffer protocol compatible object into a pandas DataFrame.

    Parameters
    ----------
    buf : buffer
        An object compatible with the buffer protocol
    nthreads : int, default None
        Deprecated, use `use_threads` instead
    use_threads : bool, default False
        Whether to parallelize the conversion using multiple threads

    Returns
    -------
    df : pandas.DataFrame
    """
    use_threads = _deprecate_nthreads(use_threads, nthreads)
    buffer_reader = pa.BufferReader(buf)
    reader = pa.RecordBatchStreamReader(buffer_reader)
    table = reader.read_all()
    return table.to_pandas(use_threads=use_threads)
def stream_testing():
    # connect to plasma
    client = plasma.connect("/tmp/store", "", 0)

    # csv -> table -> record batch
    table = arrow_csv.read_csv('../data/million.csv')
    record_batch = table.to_batches()[0]

    # create an object id
    object_id = plasma.ObjectID(np.random.bytes(20))

    # record batch -> stream writer
    mock_sink = pa.MockOutputStream()
    stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema)
    stream_writer.write_batch(record_batch)
    stream_writer.close()

    # create buffer in plasma client
    data_size = mock_sink.size()
    buf = client.create(object_id, data_size)

    # stream writer -> write to plasma buffer
    stream = pa.FixedSizeBufferWriter(buf)
    stream_writer = pa.RecordBatchStreamWriter(stream, record_batch.schema)
    stream_writer.write_batch(record_batch)
    stream_writer.close()
    client.seal(object_id)

    # ----------------Reading Data back from plasma----------------------------
    # Get PlasmaBuffer from ObjectID
    [data] = client.get_buffers([object_id])
    buffer = pa.BufferReader(data)

    # PlasmaBuffer -> record batch
    reader = pa.RecordBatchStreamReader(buffer)
    record_batch = reader.read_next_batch()

    # record batch -> python dictionary
    py_dict = record_batch.to_pydict()
def test_open_file_from_buffer(file_fixture):
    # ARROW-2859; APIs accept the buffer protocol
    file_fixture.write_batches()
    source = file_fixture.get_source()

    reader1 = pa.ipc.open_file(source)
    reader2 = pa.ipc.open_file(pa.BufferReader(source))
    reader3 = pa.RecordBatchFileReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)

    st1 = reader1.stats
    assert st1.num_messages == 6
    assert st1.num_record_batches == 5
    assert reader2.stats == st1
    assert reader3.stats == st1
def test_direct_read_dictionary(use_legacy_dataset):
    # ARROW-3325
    repeats = 10
    nunique = 5

    data = [
        [util.rands(10) for i in range(nunique)] * repeats,
    ]
    table = pa.table(data, names=['f0'])

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    result = pq.read_table(pa.BufferReader(contents),
                           read_dictionary=['f0'],
                           use_legacy_dataset=use_legacy_dataset)

    # Compute dictionary-encoded subfield
    expected = pa.table([table[0].dictionary_encode()], names=['f0'])
    assert result.equals(expected)
def test_stream_options_roundtrip(stream_fixture, options):
    stream_fixture.use_legacy_ipc_format = None
    stream_fixture.options = options
    batches = stream_fixture.write_batches()
    file_contents = pa.BufferReader(stream_fixture.get_source())

    message = pa.ipc.read_message(stream_fixture.get_source())
    assert message.metadata_version == options.metadata_version

    reader = pa.ipc.open_stream(file_contents)

    assert reader.schema.equals(batches[0].schema)

    total = 0
    for i, next_batch in enumerate(reader):
        assert next_batch.equals(batches[i])
        total += 1

    assert total == len(batches)

    with pytest.raises(StopIteration):
        reader.read_next_batch()
def _parse_apache_arrow(self, data):
    if not pa:
        raise Exception("Apache Arrow is not supported")

    def is_metadata(schema):
        if not schema.metadata:
            return False
        return schema.metadata.get(b"GROONGA:data_type") == b"metadata"

    source = pa.BufferReader(data)
    while source.tell() < source.size():
        with pa.RecordBatchStreamReader(source) as reader:
            schema = reader.schema
            table = reader.read_all()
            if is_metadata(schema):
                self.status = table["return_code"][0].as_py()
                start_time_ns = table["start_time"][0].value
                start_time_s = start_time_ns / 1_000_000_000
                self.start_time = start_time_s
                self.elapsed = table["elapsed_time"][0].as_py()
            else:
                self._parse_apache_arrow_body(table)
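# Hedged sketch (an assumption inferred from the parser above, not Groonga's
# documented wire format): _parse_apache_arrow() loops while tell() < size(),
# so the payload may contain several back-to-back IPC streams, with the
# metadata stream tagged via b"GROONGA:data_type" = b"metadata" in its schema
# metadata. This shows how such a two-stream payload could be assembled.
def build_two_stream_payload_sketch(metadata_batch, body_batch):
    sink = pa.BufferOutputStream()

    # first stream: the metadata batch, marked through schema metadata
    tagged = metadata_batch.replace_schema_metadata(
        {b"GROONGA:data_type": b"metadata"})
    with pa.RecordBatchStreamWriter(sink, tagged.schema) as writer:
        writer.write_batch(tagged)

    # second stream: the actual result body, appended to the same sink
    with pa.RecordBatchStreamWriter(sink, body_batch.schema) as writer:
        writer.write_batch(body_batch)

    return sink.getvalue()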
def trim_buffers_ipc(ar):
    '''
    >>> ar = pa.array([1, 2, 3, 4], pa.int8())
    >>> ar.nbytes
    4
    >>> ar.slice(2, 2)  #doctest: +ELLIPSIS
    <pyarrow.lib.Int8Array object at 0x...>
    [
      3,
      4
    ]
    >>> ar.slice(2, 2).nbytes
    4
    >>> trim_buffers_ipc(ar.slice(2, 2)).nbytes  # expected 1
    2
    >>> trim_buffers_ipc(ar.slice(2, 2))  #doctest: +ELLIPSIS
    <pyarrow.lib.Int8Array object at 0x...>
    [
      3,
      4
    ]
    '''
    if len(ar) == 0:
        return ar
    schema = pa.schema({'x': ar.type})
    with pa.BufferOutputStream() as sink:
        with pa.ipc.new_stream(sink, schema) as writer:
            writer.write_table(pa.table({'x': ar}))
    with pa.BufferReader(sink.getvalue()) as source:
        with pa.ipc.open_stream(source) as reader:
            table = reader.read_all()
            assert table.num_columns == 1
            assert table.num_rows == len(ar)
            trimmed_ar = table.column(0)
    if isinstance(trimmed_ar, pa.ChunkedArray):
        assert len(trimmed_ar.chunks) == 1
        trimmed_ar = trimmed_ar.chunks[0]
    return trimmed_ar
def test_serialization_deprecated():
    with pytest.warns(FutureWarning):
        ser = pa.serialize(1)

    with pytest.warns(FutureWarning):
        pa.deserialize(ser.to_buffer())

    f = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        pa.serialize_to(12, f)

    buf = f.getvalue()
    f = pa.BufferReader(buf)
    with pytest.warns(FutureWarning):
        pa.read_serialized(f).deserialize()

    with pytest.warns(FutureWarning):
        pa.default_serialization_context()

    context = pa.lib.SerializationContext()
    with pytest.warns(FutureWarning):
        pa.register_default_serialization_handlers(context)
def test_stream_write_table_batches(self):
    # ARROW-504
    df = pd.DataFrame({
        'one': np.random.randn(20),
    })

    b1 = pa.RecordBatch.from_pandas(df[:10], preserve_index=False)
    b2 = pa.RecordBatch.from_pandas(df, preserve_index=False)

    table = pa.Table.from_batches([b1, b2, b1])

    writer = self._get_writer(self.sink, table.schema)
    writer.write_table(table, chunksize=15)
    writer.close()

    batches = list(pa.open_stream(pa.BufferReader(self._get_source())))

    assert list(map(len, batches)) == [10, 15, 5, 10]
    result_table = pa.Table.from_batches(batches)
    assert_frame_equal(
        result_table.to_pandas(),
        pd.concat([df[:10], df, df[:10]], ignore_index=True))
def test_parquet_writer_with_caller_provided_filesystem(use_legacy_dataset):
    out = pa.BufferOutputStream()

    class CustomFS(FileSystem):
        def __init__(self):
            self.path = None
            self.mode = None

        def open(self, path, mode='rb'):
            self.path = path
            self.mode = mode
            return out

    fs = CustomFS()
    fname = 'expected_fname.parquet'
    df = _test_dataframe(100)
    table = pa.Table.from_pandas(df, preserve_index=False)

    with pq.ParquetWriter(fname, table.schema, filesystem=fs, version='2.0') \
            as writer:
        writer.write_table(table)

    assert fs.path == fname
    assert fs.mode == 'wb'
    assert out.closed

    buf = out.getvalue()
    table_read = _read_table(pa.BufferReader(buf),
                             use_legacy_dataset=use_legacy_dataset)
    df_read = table_read.to_pandas()
    tm.assert_frame_equal(df_read, df)

    # Should raise ValueError when filesystem is passed with file-like object
    with pytest.raises(ValueError) as err_info:
        pq.ParquetWriter(pa.BufferOutputStream(), table.schema,
                         filesystem=fs)
    expected_msg = ("filesystem passed but where is file-like, so"
                    " there is nothing to open with filesystem.")
    assert str(err_info.value) == expected_msg
def test_open_stream_from_buffer(stream_fixture):
    # ARROW-2859
    stream_fixture.write_batches()
    source = stream_fixture.get_source()

    reader1 = pa.ipc.open_stream(source)
    reader2 = pa.ipc.open_stream(pa.BufferReader(source))
    reader3 = pa.RecordBatchStreamReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)

    st1 = reader1.stats
    assert st1.num_messages == 6
    assert st1.num_record_batches == 5
    assert reader2.stats == st1
    assert reader3.stats == st1

    assert tuple(st1) == tuple(stream_fixture.write_stats)
def test_to_arrow_str_dict(self):
    data = {
        "a": ["abcdefg", "abcdefg", "h"],
        "b": ["aaa", "bbb", "bbb"],
        "c": ["hello", "world", "world"]
    }

    tbl = Table(data)
    assert tbl.schema() == {"a": str, "b": str, "c": str}

    arr = tbl.view().to_arrow()

    # assert that we are actually generating dict arrays
    buf = pa.BufferReader(arr)
    reader = pa.ipc.open_stream(buf)
    arrow_table = reader.read_all()
    arrow_schema = arrow_table.schema
    for name in ("a", "b", "c"):
        arrow_type = arrow_schema.field(name).type
        assert pa.types.is_dictionary(arrow_type)

    # assert that data is symmetrical
    tbl2 = Table(arr)
    assert tbl2.view().to_dict() == data
def test_parquet_writer_context_obj(tempdir, use_legacy_dataset):
    df = _test_dataframe(100)
    df['unique_id'] = 0

    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
    out = pa.BufferOutputStream()

    with pq.ParquetWriter(out, arrow_table.schema, version='2.0') as writer:
        frames = []
        for i in range(10):
            df['unique_id'] = i
            arrow_table = pa.Table.from_pandas(df, preserve_index=False)
            writer.write_table(arrow_table)

            frames.append(df.copy())

    buf = out.getvalue()
    result = _read_table(pa.BufferReader(buf),
                         use_legacy_dataset=use_legacy_dataset)

    expected = pd.concat(frames, ignore_index=True)
    tm.assert_frame_equal(result.to_pandas(), expected)
def test_field_id_metadata():
    # ARROW-7080
    table = pa.table([
        pa.array([1], type='int32'),
        pa.array([[]], type=pa.list_(pa.int32())),
        pa.array([b'boo'], type='binary')
    ], ['f0', 'f1', 'f2'])

    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()

    pf = pq.ParquetFile(pa.BufferReader(contents))
    schema = pf.schema_arrow

    # Expected Parquet schema for reference
    #
    # required group field_id=0 schema {
    #   optional int32 field_id=1 f0;
    #   optional group field_id=2 f1 (List) {
    #     repeated group field_id=3 list {
    #       optional int32 field_id=4 item;
    #     }
    #   }
    #   optional binary field_id=5 f2;
    # }

    field_name = b'PARQUET:field_id'
    assert schema[0].metadata[field_name] == b'1'

    list_field = schema[1]
    assert list_field.metadata[field_name] == b'2'

    list_item_field = list_field.type.value_field
    assert list_item_field.metadata[field_name] == b'4'

    assert schema[2].metadata[field_name] == b'5'
def test_parquet(tmpdir, registered_period_type):
    # parquet support for extension types
    period_type, period_class = registered_period_type
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq
    filename = tmpdir / 'extension_type.parquet'
    pq.write_table(table, filename)

    # stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.ipc.read_schema(pa.BufferReader(decoded_schema))
    assert schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'pandas.period'}

    # when reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.column("ext").type == period_type

    # get the exact array class defined by the registered type.
    result_array = result.column("ext").chunk(0)
    assert type(result_array) == period_class

    # when the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.column("ext").type == pa.int64()
def _get_example_messages(self):
    _, batches = self.write_batches()
    file_contents = self._get_source()
    buf_reader = pa.BufferReader(file_contents)
    reader = pa.MessageReader.open_stream(buf_reader)
    return batches, list(reader)
def _simple_table_write_read(table, use_legacy_dataset):
    bio = pa.BufferOutputStream()
    pq.write_table(table, bio)
    contents = bio.getvalue()
    return pq.read_table(pa.BufferReader(contents),
                         use_legacy_dataset=use_legacy_dataset)
def restore_dataframe(
    store,
    key,
    filter_query=None,
    columns=None,
    predicate_pushdown_to_io=True,
    categories=None,
    predicates=None,
    date_as_object=False,
):
    check_predicates(predicates)
    # If we want to do columnar access we can benefit from partial reads,
    # otherwise a full read en bloc is the better option.
    if (not predicate_pushdown_to_io) or (columns is None and predicates is None):
        with pa.BufferReader(store.get(key)) as reader:
            table = pq.read_pandas(reader, columns=columns)
    else:
        if HAVE_BOTO and isinstance(store, BotoStore):
            # Parquet and seeks on S3 currently leak connections thus
            # we omit column projection to the store.
            reader = pa.BufferReader(store.get(key))
        else:
            reader = store.open(key)
            # Buffer at least 4 MB in requests. This is chosen because the
            # default block size of the Azure storage client is 4 MB.
            reader = BlockBuffer(reader, 4 * 1024 * 1024)
        try:
            parquet_file = ParquetFile(reader)
            if predicates and parquet_file.metadata.num_rows > 0:
                # We need to calculate different predicates for predicate
                # pushdown and the later DataFrame filtering. This is required
                # e.g. in the case where we have an `in` predicate as this has
                # different normalized values.
                columns_to_io = _columns_for_pushdown(columns, predicates)
                predicates_for_pushdown = _normalize_predicates(
                    parquet_file, predicates, True
                )
                predicates = _normalize_predicates(parquet_file, predicates, False)
                tables = _read_row_groups_into_tables(
                    parquet_file, columns_to_io, predicates_for_pushdown
                )

                if len(tables) == 0:
                    if ARROW_LARGER_EQ_0130:
                        table = parquet_file.schema.to_arrow_schema().empty_table()
                    else:
                        table = _empty_table_from_schema(parquet_file)
                else:
                    table = pa.concat_tables(tables)
            else:
                # ARROW-5139 Column projection with empty columns returns a
                # table without index
                if ARROW_LARGER_EQ_0130 and columns == []:
                    # Create an arrow table with expected index length.
                    df = (
                        parquet_file.schema.to_arrow_schema()
                        .empty_table()
                        .to_pandas(date_as_object=date_as_object)
                    )
                    index = pd.Int64Index(
                        pd.RangeIndex(start=0, stop=parquet_file.metadata.num_rows)
                    )
                    df = pd.DataFrame(df, index=index)
                    # convert back to table to keep downstream code untouched
                    # by this patch
                    table = pa.Table.from_pandas(df)
                else:
                    table = pq.read_pandas(reader, columns=columns)
        finally:
            reader.close()

    table = _fix_pyarrow_07992_table(table)
    table = _fix_pyarrow_0130_table(table)

    if columns is not None:
        missing_columns = set(columns) - set(table.schema.names)
        if missing_columns:
            raise ValueError(
                "Columns cannot be found in stored dataframe: {missing}".format(
                    missing=", ".join(sorted(missing_columns))
                )
            )

    df = table.to_pandas(categories=categories, date_as_object=date_as_object)
    df.columns = df.columns.map(ensure_unicode_string_type)
    if predicates:
        df = filter_df_from_predicates(
            df, predicates, strict_date_types=date_as_object
        )
    else:
        df = filter_df(df, filter_query)
    if columns is not None:
        return df.loc[:, columns]
    else:
        return df
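# Hedged usage sketch (assumptions, not taken from the original source): the
# store object, the key, and the DNF predicate layout of
# [[(column, op, value), ...], ...] are inferred from how restore_dataframe()
# uses them above; the key and column names below are hypothetical.
def restore_dataframe_usage_sketch(store):
    df = restore_dataframe(
        store,
        key="my_dataset/table/partition0.parquet",  # hypothetical storage key
        columns=["user_id", "amount"],              # column projection
        predicates=[[("amount", ">", 100)]],        # pushed down to row groups
        predicate_pushdown_to_io=True,
        date_as_object=False,
    )
    return df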