def write_mutable_tensor(self, session_id, name, payload_type, body):
    """Decode a chunk-write request body and forward it to the session actor.

    The wire layout of ``body`` is: an 8-byte int64 giving the size of the
    JSON-encoded index, the ASCII JSON index itself, then the value payload
    whose encoding is selected by ``payload_type`` (``None`` for the
    dataserializer format, ``'tensor'`` or ``'record_batch'`` for Arrow).
    """
    import pyarrow

    from ..serialize import dataserializer
    from ..tensor.core import Indexes

    session_ref = self.get_actor_ref(SessionActor.gen_uid(session_id))

    # First 8 bytes: int64 length of the JSON-encoded index that follows.
    header_size = np.frombuffer(body[0:8], dtype=np.int64).item()
    index_json = json.loads(body[8:8 + header_size].decode('ascii'))
    index = Indexes.from_json(index_json).indexes

    payload_offset = 8 + header_size
    if payload_type is None:
        value = dataserializer.loads(body[payload_offset:])
    elif payload_type == 'tensor':
        with pyarrow.BufferReader(body[payload_offset:]) as reader:
            value = pyarrow.read_tensor(reader).to_numpy()
    elif payload_type == 'record_batch':
        # Payload: int64 schema size, serialized schema, then the batch.
        schema_size = np.frombuffer(
            body[payload_offset:payload_offset + 8], dtype=np.int64).item()
        schema_offset = payload_offset + 8
        schema_end = schema_offset + schema_size
        with pyarrow.BufferReader(body[schema_offset:schema_end]) as reader:
            schema = pyarrow.read_schema(reader)
        with pyarrow.BufferReader(body[schema_end:]) as reader:
            batch = pyarrow.read_record_batch(reader, schema)
        value = batch.to_pandas().to_records(index=False)
    else:
        raise ValueError(f'Not supported payload type: {payload_type}')

    return session_ref.write_mutable_tensor(name, index, value)
def test_parquet(tmpdir, registered_period_type):
    """Extension types survive a parquet round-trip via the stored schema."""
    import base64

    import pyarrow.parquet as pq

    period_type = PeriodType('D')
    ext_array = pa.ExtensionArray.from_storage(
        period_type, pa.array([1, 2, 3, 4], pa.int64()))
    table = pa.table([ext_array], names=["ext"])

    filename = tmpdir / 'extension_type.parquet'
    pq.write_table(table, filename)

    # Parquet keeps the column as its storage type; the extension info is
    # carried by the serialized arrow schema stored in the file metadata.
    file_meta = pq.read_metadata(filename)
    assert file_meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in file_meta.metadata

    raw_schema = base64.b64decode(file_meta.metadata[b"ARROW:schema"])
    stored_schema = pa.read_schema(pa.BufferReader(raw_schema))
    assert stored_schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'pandas.period'
    }

    # While the type is registered, reading restores the extension type.
    registered_read = pq.read_table(filename)
    assert registered_read.column("ext").type == period_type

    # After unregistering, the column falls back to its storage type.
    pa.unregister_extension_type(period_type.extension_name)
    fallback_read = pq.read_table(filename)
    assert fallback_read.column("ext").type == pa.int64()
def _parse_arrow_schema(self):
    """Lazily decode the read session's serialized Arrow schema.

    No-op if the schema was already parsed; otherwise populates
    ``self._schema`` and ``self._column_names``.
    """
    if self._schema:
        return

    serialized = self._read_session.arrow_schema.serialized_schema
    self._schema = pyarrow.read_schema(pyarrow.py_buffer(serialized))
    self._column_names = [field.name for field in self._schema]
def _read_schema(self):
    """Read the serialized Arrow schema section from the open file.

    The section starts with a SCHEMA_HEAD_SERDES header giving the byte
    length of the serialized schema, followed by the schema bytes.
    Populates ``self._schema``.
    """
    # read schema bytes
    (hsize, ) = SCHEMA_HEAD_SERDES.from_file(self._fh)
    buf = self._fh.read(hsize)
    # pa.read_schema needs a readable source, not raw bytes.  Wrap them in
    # a zero-copy pa.BufferReader instead of the previous
    # io.BufferedReader(io.BytesIO(...)) double wrapper — this matches how
    # serialized schemas are parsed elsewhere in this codebase.
    self._schema = pa.read_schema(pa.BufferReader(buf))
def test_schema_batch_serialize_methods():
    """A serialized schema + record batch can be read back and compare equal."""
    frame = pd.DataFrame({
        'one': np.random.randn(5),
        'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux'],
    })
    batch = pa.RecordBatch.from_pandas(frame)

    schema_buf = batch.schema.serialize()
    batch_buf = batch.serialize()

    # Deserializing the batch requires the schema parsed first.
    schema_back = pa.read_schema(schema_buf)
    batch_back = pa.read_record_batch(batch_buf, schema_back)
    assert batch_back.equals(batch)
def test_schema_serialization_with_metadata():
    """Field- and schema-level metadata survive serialize/read_schema."""
    field_meta = {b'foo': b'bar', b'kind': b'field'}
    schema_meta = {b'foo': b'bar', b'kind': b'schema'}

    plain_field = pa.field('a', pa.int8())
    tagged_field = pa.field('b', pa.string(), metadata=field_meta)
    schema = pa.schema([plain_field, tagged_field], metadata=schema_meta)

    round_tripped = pa.read_schema(schema.serialize())

    assert round_tripped.equals(schema)
    assert round_tripped.metadata == schema_meta
    # Metadata is per-field: the untagged field stays None.
    assert round_tripped[0].metadata is None
    assert round_tripped[1].metadata == field_meta
def _parse_schema(self, encoded_schema):
    """Decode a base64-encoded serialized Arrow schema into a Schema."""
    raw = base64.b64decode(encoded_schema)
    return pa.read_schema(pa.BufferReader(raw))
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a GpuDataFrame

    Parameters
    ----------
    tdf : TDataFrame

    Returns
    -------
    gdf : GpuDataFrame
    """
    import pyarrow as pa
    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.core.dataframe import DataFrame
    from cudf._lib.arrow._cuda import Context, IpcMemHandle
    from numba import cuda

    # Open the CUDA IPC handle for the result payload; synchronize so the
    # exporting process's writes are visible before we read the buffer.
    ipc_handle = IpcMemHandle.from_buffer(pa.py_buffer(tdf.df_handle))
    ctx = Context()
    ipc_buf = ctx.open_ipc_buffer(ipc_handle)
    ipc_buf.context.synchronize()

    # The Arrow schema is delivered separately via shared memory
    # (presumably System V shm, given the shmdt() cleanup below — confirm).
    schema_buffer, shm_ptr = load_buffer(tdf.sm_handle, tdf.sm_size)

    buffer = pa.BufferReader(schema_buffer)
    schema = pa.read_schema(buffer)

    # Dictionary Memo functionality used to
    # deserialize on the C++ side is not
    # exposed on the pyarrow side, so we need to
    # handle this on our own.
    dict_memo = {}

    try:
        # The stream after the schema carries one record batch per
        # dictionary-encoded field, holding that field's dictionary values.
        # Capture them and rewrite the schema to use the raw index type.
        dict_batch_reader = pa.RecordBatchStreamReader(buffer)
        updated_fields = []

        for f in schema:
            if pa.types.is_dictionary(f.type):
                msg = dict_batch_reader.read_next_batch()
                dict_memo[f.name] = msg.column(0)
                updated_fields.append(pa.field(f.name, f.type.index_type))
            else:
                updated_fields.append(pa.field(f.name, f.type))

        schema = pa.schema(updated_fields)
    except pa.ArrowInvalid:
        # This message does not have any dictionary encoded
        # columns
        pass

    # Expose the device IPC buffer as a flat byte array for GpuArrowReader.
    dtype = np.dtype(np.byte)
    darr = cuda.devicearray.DeviceNDArray(
        shape=ipc_buf.size,
        strides=dtype.itemsize,
        dtype=dtype,
        gpu_data=ipc_buf.to_numba(),
    )
    reader = GpuArrowReader(schema, darr)
    df = DataFrame()
    # Attach tdf accessors onto this specific instance.
    df.set_tdf = MethodType(set_tdf, df)
    df.get_tdf = MethodType(get_tdf, df)

    for k, v in reader.to_dict().items():
        if k in dict_memo:
            # Re-attach the dictionary values captured above as indices +
            # dictionary pairs.
            df[k] = pa.DictionaryArray.from_arrays(v, dict_memo[k])
        else:
            df[k] = v

    df.set_tdf(tdf)

    # free shared memory from Python
    # https://github.com/omnisci/pymapd/issues/46
    # https://github.com/omnisci/pymapd/issues/31
    free_sm = shmdt(ctypes.cast(shm_ptr, ctypes.c_void_p))  # noqa
    return df
def parse_encoded_schema(b64_schema):
    """Turn a base64-encoded serialized Arrow schema back into a Schema."""
    decoded = base64.b64decode(b64_schema)
    return pa.read_schema(pa.BufferReader(decoded))