def write_mutable_tensor(self, session_id, name, payload_type, body):
    """Decode a serialized chunk write request and forward it to the session actor.

    ``body`` layout: [8-byte little int64 JSON length][index JSON][payload],
    where the payload encoding is selected by ``payload_type``.
    """
    import pyarrow
    from ..serialize import dataserializer
    from ..tensor.core import Indexes

    session_uid = SessionActor.gen_uid(session_id)
    session_ref = self.get_actor_ref(session_uid)

    # Header: an int64 length prefix followed by the JSON-encoded indexes.
    index_json_size = np.frombuffer(body[0:8], dtype=np.int64).item()
    index_json = json.loads(body[8:8 + index_json_size].decode('ascii'))
    index = Indexes.from_json(index_json).indexes

    payload_offset = 8 + index_json_size
    if payload_type is None:
        # Raw dataserializer payload.
        value = dataserializer.loads(body[payload_offset:])
    elif payload_type == 'tensor':
        # Arrow tensor IPC payload.
        with pyarrow.BufferReader(body[payload_offset:]) as reader:
            value = pyarrow.read_tensor(reader).to_numpy()
    elif payload_type == 'record_batch':
        # Payload layout: [8-byte int64 schema length][schema][record batch].
        schema_size = np.frombuffer(
            body[payload_offset:payload_offset + 8], dtype=np.int64).item()
        schema_offset = payload_offset + 8
        with pyarrow.BufferReader(
                body[schema_offset:schema_offset + schema_size]) as reader:
            schema = pyarrow.read_schema(reader)
        batch_offset = schema_offset + schema_size
        with pyarrow.BufferReader(body[batch_offset:]) as reader:
            batch = pyarrow.read_record_batch(reader, schema)
        value = batch.to_pandas().to_records(index=False)
    else:
        raise ValueError(f'Not supported payload type: {payload_type}')
    return session_ref.write_mutable_tensor(name, index, value)
def __iter__(self):
    """Yield each tensor stored in ``self.path`` as a numpy array.

    Fixes over the previous version:
    - ``try/finally`` guarantees the stream is closed even when the
      consumer abandons the generator before exhaustion (previously the
      ``close()`` after the loop was never reached in that case).
    - ``except Exception`` replaces a bare ``except:``, which also
      swallowed ``KeyboardInterrupt``/``SystemExit``.
    """
    input_stream = pa.input_stream(self.path)
    try:
        while True:
            try:
                # read_tensor raises once the stream is exhausted;
                # we treat any read failure as end-of-stream here
                # (best-effort behavior kept from the original).
                result = pa.read_tensor(input_stream)
            except Exception:
                break
            yield result.to_numpy()
    finally:
        input_stream.close()
def test_read_tensor(tmpdir):
    """Write a tensor into a memory-mapped file and read it back equal."""
    data = np.random.randn(10, 4)
    tensor = pa.Tensor.from_numpy(data)
    total_size = pa.get_tensor_size(tensor)
    path = os.path.join(str(tmpdir), 'pyarrow-tensor-ipc-read-tensor')

    # Write the tensor into a freshly created memory map of exact size.
    sink = pa.create_memory_map(path, total_size)
    pa.write_tensor(tensor, sink)

    # Read it back through a read-only mapping of the same file.
    source = pa.memory_map(path, mode='r')
    roundtripped = pa.read_tensor(source).to_numpy()
    np.testing.assert_equal(data, roundtripped)
def test_read_tensor(tmpdir):
    """Tensor written to a memory map is recovered unchanged on read."""
    expected = np.random.randn(10, 4)
    arrow_tensor = pa.Tensor.from_numpy(expected)
    nbytes = pa.get_tensor_size(arrow_tensor)
    path = os.path.join(str(tmpdir), 'pyarrow-tensor-ipc-read-tensor')

    # Write phase: memory map sized to the serialized tensor.
    writer = pa.create_memory_map(path, nbytes)
    pa.write_tensor(arrow_tensor, writer)

    # Read phase: open the file read-only and deserialize.
    reader = pa.memory_map(path, mode='r')
    actual = pa.read_tensor(reader).to_numpy()
    np.testing.assert_equal(expected, actual)
def test_tensor_ipc_roundtrip(tmpdir):
    """write_tensor followed by read_tensor yields an equal tensor."""
    values = np.random.randn(10, 4)
    original = pa.Tensor.from_numpy(values)
    path = os.path.join(str(tmpdir), 'pyarrow-tensor-ipc-roundtrip')
    mmap = pa.create_memory_map(path, 1024)
    pa.write_tensor(original, mmap)
    mmap.seek(0)  # rewind before reading back
    restored = pa.read_tensor(mmap)
    assert restored.equals(original)
def test_tensor_ipc_roundtrip(tmpdir):
    """A tensor survives a write/read cycle through a memory map."""
    source_array = np.random.randn(10, 4)
    written = pa.Tensor.from_numpy(source_array)
    target = os.path.join(str(tmpdir), 'pyarrow-tensor-ipc-roundtrip')
    buf = pa.create_memory_map(target, 1024)
    pa.write_tensor(written, buf)
    # Rewind the map so the read starts at the tensor header.
    buf.seek(0)
    assert pa.read_tensor(buf).equals(written)
def test_tensor_ipc_read_from_compressed(tempdir):
    """Regression test for ARROW-5910: read a tensor from a gzip stream."""
    payload = np.random.randn(10, 4)
    original = pa.Tensor.from_numpy(payload)
    path = tempdir / 'tensor-compressed-file'

    # Write the tensor through a gzip-compressed output stream.
    out_stream = pa.output_stream(path, compression='gzip')
    pa.write_tensor(original, out_stream)
    out_stream.close()

    # Read it back through a matching compressed input stream.
    recovered = pa.read_tensor(pa.input_stream(path, compression='gzip'))
    assert recovered.equals(original)
def test_store_arrow_objects(self):
    """Store a pyarrow tensor in plasma and read it back intact."""
    data = np.random.randn(10, 4)

    # Write an arrow tensor into a freshly created plasma object.
    object_id = random_object_id()
    tensor = pa.Tensor.from_numpy(data)
    data_size = pa.get_tensor_size(tensor)
    buf = self.plasma_client.create(object_id, data_size)
    stream = pa.FixedSizeBufferWriter(buf)
    pa.write_tensor(tensor, stream)
    self.plasma_client.seal(object_id)

    # Read the object back (renamed to avoid shadowing `tensor`).
    [stored_buf] = self.plasma_client.get_buffers([object_id])
    reader = pa.BufferReader(stored_buf)
    array = pa.read_tensor(reader).to_numpy()

    # The round-tripped array must match the original data.
    np.testing.assert_equal(data, array)
def test_store_arrow_objects(self):
    """A tensor written through plasma's buffer API reads back equal."""
    original = np.random.randn(10, 4)

    # Serialize the tensor directly into a sealed plasma buffer.
    oid = random_object_id()
    arrow_tensor = pa.Tensor.from_numpy(original)
    nbytes = pa.get_tensor_size(arrow_tensor)
    plasma_buf = self.plasma_client.create(oid, nbytes)
    writer = pa.FixedSizeBufferWriter(plasma_buf)
    pa.write_tensor(arrow_tensor, writer)
    self.plasma_client.seal(oid)

    # Fetch the buffer back and deserialize.
    fetched = self.plasma_client.get_buffers([oid])[0]
    result = pa.read_tensor(pa.BufferReader(fetched)).to_numpy()
    np.testing.assert_equal(original, result)
def test_tensor_ipc_roundtrip():
    """Round-trip a tensor through a memory-mapped file in the CWD, cleaning up after."""
    data = np.random.randn(10, 4)
    tensor = pa.Tensor.from_numpy(data)
    path = 'pyarrow-tensor-ipc-roundtrip'
    try:
        mapped = pa.create_memory_map(path, 1024)
        pa.write_tensor(tensor, mapped)
        mapped.seek(0)  # rewind to read from the start
        assert pa.read_tensor(mapped).equals(tensor)
    finally:
        # Always remove the scratch file, even if the assertion fails.
        _try_delete(path)
def test_tensor_ipc_strided(tmpdir):
    """Strided (non-contiguous) tensors survive the IPC round-trip."""
    base_2d = np.random.randn(10, 4)
    base_3d = np.random.randn(10, 6, 4)
    cases = [
        pa.Tensor.from_numpy(base_2d[::2]),          # strided along axis 0
        pa.Tensor.from_numpy(base_3d[::, ::2, ::]),  # strided along axis 1
    ]
    path = os.path.join(str(tmpdir), 'pyarrow-tensor-ipc-strided')
    mmap = pa.create_memory_map(path, 2048)
    for tensor in cases:
        mmap.seek(0)
        pa.write_tensor(tensor, mmap)
        mmap.seek(0)
        assert pa.read_tensor(mmap).equals(tensor)
def test_tensor_ipc_strided(tmpdir):
    """IPC preserves tensors built from strided numpy views."""
    arr_a = np.random.randn(10, 4)
    strided_a = pa.Tensor.from_numpy(arr_a[::2])
    arr_b = np.random.randn(10, 6, 4)
    strided_b = pa.Tensor.from_numpy(arr_b[::, ::2, ::])

    target = os.path.join(str(tmpdir), 'pyarrow-tensor-ipc-strided')
    mapped = pa.create_memory_map(target, 2048)
    # Reuse the same map for both tensors, rewinding between write and read.
    for candidate in (strided_a, strided_b):
        mapped.seek(0)
        pa.write_tensor(candidate, mapped)
        mapped.seek(0)
        loaded = pa.read_tensor(mapped)
        assert loaded.equals(candidate)
def get_np(oid):
    """Fetch the object stored under ``oid`` from plasma and return it as a numpy array."""
    [buff] = client.get_buffers([oid])
    tensor = pa.read_tensor(pa.BufferReader(buff))
    return tensor.to_numpy()