Example #1
    def write_mutable_tensor(self, session_id, name, payload_type, body):
        import json

        import numpy as np
        import pyarrow

        from ..serialize import dataserializer
        from ..tensor.core import Indexes
        session_uid = SessionActor.gen_uid(session_id)
        session_ref = self.get_actor_ref(session_uid)

        # body layout: [8-byte int64 index size][index JSON][payload]
        index_json_size = np.frombuffer(body[0:8], dtype=np.int64).item()
        index_json = json.loads(body[8:8 + index_json_size].decode('ascii'))
        index = Indexes.from_json(index_json).indexes
        if payload_type is None:
            value = dataserializer.loads(body[8 + index_json_size:])
        elif payload_type == 'tensor':
            tensor_chunk_offset = 8 + index_json_size
            with pyarrow.BufferReader(body[tensor_chunk_offset:]) as reader:
                value = pyarrow.read_tensor(reader).to_numpy()
        elif payload_type == 'record_batch':
            schema_size_offset = 8 + index_json_size
            schema_size = np.frombuffer(
                body[schema_size_offset:schema_size_offset + 8],
                dtype=np.int64).item()
            schema_offset = schema_size_offset + 8
            with pyarrow.BufferReader(body[schema_offset:schema_offset +
                                           schema_size]) as reader:
                schema = pyarrow.read_schema(reader)
            record_batch_offset = schema_offset + schema_size
            with pyarrow.BufferReader(body[record_batch_offset:]) as reader:
                record_batch = pyarrow.read_record_batch(reader, schema)
                value = record_batch.to_pandas().to_records(index=False)
        else:
            raise ValueError(f'Unsupported payload type: {payload_type}')
        return session_ref.write_mutable_tensor(name, index, value)
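For reference, the body parsed above is laid out as an 8-byte int64 index-size header, followed by the JSON-encoded index and then the payload. A minimal client-side sketch for the 'tensor' payload type could look like the following; build_tensor_body is a hypothetical helper, and it assumes the same pyarrow version that exposes the top-level read_tensor/write_tensor pair used above:

import json

import numpy as np
import pyarrow


def build_tensor_body(index_json, array):
    # 8-byte int64 size header followed by the JSON-encoded index
    index_bytes = json.dumps(index_json).encode('ascii')
    header = np.int64(len(index_bytes)).tobytes()

    # serialize the ndarray in the Arrow tensor IPC format, mirroring
    # the pyarrow.read_tensor call on the receiving side
    sink = pyarrow.BufferOutputStream()
    pyarrow.write_tensor(pyarrow.Tensor.from_numpy(array), sink)

    return header + index_bytes + sink.getvalue().to_pybytes()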
Example #2
def test_parquet(tmpdir, registered_period_type):
    # parquet support for extension types
    period_type = PeriodType('D')
    storage = pa.array([1, 2, 3, 4], pa.int64())
    arr = pa.ExtensionArray.from_storage(period_type, storage)
    table = pa.table([arr], names=["ext"])

    import pyarrow.parquet as pq

    filename = tmpdir / 'extension_type.parquet'
    pq.write_table(table, filename)

    # stored in parquet as storage type but with extension metadata saved
    # in the serialized arrow schema
    meta = pq.read_metadata(filename)
    assert meta.schema.column(0).physical_type == "INT64"
    assert b"ARROW:schema" in meta.metadata

    import base64
    decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"])
    schema = pa.read_schema(pa.BufferReader(decoded_schema))
    assert schema.field("ext").metadata == {
        b'ARROW:extension:metadata': b'freq=D',
        b'ARROW:extension:name': b'pandas.period'
    }

    # when reading in, properly create extension type if it is registered
    result = pq.read_table(filename)
    assert result.column("ext").type == period_type

    # when the type is not registered, read in as storage type
    pa.unregister_extension_type(period_type.extension_name)
    result = pq.read_table(filename)
    assert result.column("ext").type == pa.int64()
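PeriodType and the registered_period_type fixture are not defined in this snippet. Judging from the metadata asserted above (b'pandas.period' and b'freq=D'), they amount to roughly the following ExtensionType subclass registered via pa.register_extension_type; this is a sketch, and the actual fixture may differ in detail:

import pyarrow as pa


class PeriodType(pa.ExtensionType):
    def __init__(self, freq):
        # int64 storage holding period ordinals, tagged with a frequency
        self._freq = freq
        pa.ExtensionType.__init__(self, pa.int64(), 'pandas.period')

    @property
    def freq(self):
        return self._freq

    def __arrow_ext_serialize__(self):
        return 'freq={}'.format(self.freq).encode()

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        # recover the frequency from metadata such as b'freq=D'
        freq = serialized.decode().split('=')[1]
        return PeriodType(freq)


# presumably done by the registered_period_type fixture:
pa.register_extension_type(PeriodType('D'))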
Example #3
    def _parse_arrow_schema(self):
        if self._schema:
            return

        self._schema = pyarrow.read_schema(
            pyarrow.py_buffer(self._read_session.arrow_schema.serialized_schema)
        )
        self._column_names = [field.name for field in self._schema]
Example #4
    def _read_schema(self):
        # read schema bytes
        (hsize, ) = SCHEMA_HEAD_SERDES.from_file(self._fh)
        buf = self._fh.read(hsize)

        # Odd -- pyarrow.read_schema wants a readable buffer, and a bytes
        # object is insufficient. So we wrap it back up to pull out the schema
        wrap = io.BufferedReader(io.BytesIO(buf))
        self._schema = pa.read_schema(wrap)
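Assuming the same pyarrow behavior shown in Example #3, pa.read_schema also accepts an Arrow buffer directly, so the io wrapping can be avoided:

import pyarrow as pa

# pa.py_buffer wraps the bytes object in an Arrow buffer that
# pa.read_schema consumes without a file-like wrapper
schema = pa.read_schema(pa.py_buffer(buf))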
Example #5
def test_schema_batch_serialize_methods():
    nrows = 5
    df = pd.DataFrame({
        'one': np.random.randn(nrows),
        'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux']})
    batch = pa.RecordBatch.from_pandas(df)

    s_schema = batch.schema.serialize()
    s_batch = batch.serialize()  # batch message only; the schema is not included

    recons_schema = pa.read_schema(s_schema)
    # reading the batch back requires the schema, since batch.serialize()
    # does not embed it
    recons_batch = pa.read_record_batch(s_batch, recons_schema)
    assert recons_batch.equals(batch)
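For comparison, the IPC stream format embeds the schema alongside the batches, so no separate schema hand-off is needed. A sketch reusing the batch from the test above, with the same era of pyarrow API seen elsewhere in these examples:

import pyarrow as pa

# the stream writer records the schema up front, so the reader can
# reconstruct batches without a separate schema argument
sink = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(sink, batch.schema)
writer.write_batch(batch)
writer.close()

reader = pa.RecordBatchStreamReader(sink.getvalue())
assert reader.read_next_batch().equals(batch)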
Example #6
def test_schema_serialization_with_metadata():
    field_metadata = {b'foo': b'bar', b'kind': b'field'}
    schema_metadata = {b'foo': b'bar', b'kind': b'schema'}

    f0 = pa.field('a', pa.int8())
    f1 = pa.field('b', pa.string(), metadata=field_metadata)

    schema = pa.schema([f0, f1], metadata=schema_metadata)

    s_schema = schema.serialize()
    recons_schema = pa.read_schema(s_schema)

    assert recons_schema.equals(schema)
    assert recons_schema.metadata == schema_metadata
    assert recons_schema[0].metadata is None
    assert recons_schema[1].metadata == field_metadata
Example #7
    def _parse_schema(self, encoded_schema):
        return pa.read_schema(pa.BufferReader(
            base64.b64decode(encoded_schema)))
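A quick self-contained round trip illustrates the expected input format; SchemaParser here is a stand-in for whatever class the method above belongs to:

import base64

import pyarrow as pa


class SchemaParser:
    def _parse_schema(self, encoded_schema):
        return pa.read_schema(pa.BufferReader(
            base64.b64decode(encoded_schema)))


# serialize a schema, base64-encode it, and recover it
schema = pa.schema([pa.field('a', pa.int64())])
encoded = base64.b64encode(schema.serialize().to_pybytes())
assert SchemaParser()._parse_schema(encoded).equals(schema)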
Example #8
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a GpuDataFrame

    Parameters
    ----------
    tdf : TDataFrame

    Returns
    -------
    gdf : GpuDataFrame
    """

    import ctypes
    from types import MethodType

    import numpy as np
    import pyarrow as pa
    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.core.dataframe import DataFrame
    from cudf._lib.arrow._cuda import Context, IpcMemHandle
    from numba import cuda

    ipc_handle = IpcMemHandle.from_buffer(pa.py_buffer(tdf.df_handle))
    ctx = Context()
    ipc_buf = ctx.open_ipc_buffer(ipc_handle)
    ipc_buf.context.synchronize()

    schema_buffer, shm_ptr = load_buffer(tdf.sm_handle, tdf.sm_size)

    buffer = pa.BufferReader(schema_buffer)
    schema = pa.read_schema(buffer)

    # Dictionary Memo functionality used to
    # deserialize on the C++ side is not
    # exposed on the pyarrow side, so we need to
    # handle this on our own.
    dict_memo = {}

    try:
        dict_batch_reader = pa.RecordBatchStreamReader(buffer)
        updated_fields = []

        for f in schema:
            if pa.types.is_dictionary(f.type):
                msg = dict_batch_reader.read_next_batch()
                dict_memo[f.name] = msg.column(0)
                updated_fields.append(pa.field(f.name, f.type.index_type))
            else:
                updated_fields.append(pa.field(f.name, f.type))

        schema = pa.schema(updated_fields)
    except pa.ArrowInvalid:
        # This message does not have any dictionary encoded
        # columns
        pass

    dtype = np.dtype(np.byte)
    darr = cuda.devicearray.DeviceNDArray(
        shape=ipc_buf.size,
        strides=dtype.itemsize,
        dtype=dtype,
        gpu_data=ipc_buf.to_numba(),
    )

    reader = GpuArrowReader(schema, darr)
    df = DataFrame()
    df.set_tdf = MethodType(set_tdf, df)
    df.get_tdf = MethodType(get_tdf, df)

    for k, v in reader.to_dict().items():
        if k in dict_memo:
            df[k] = pa.DictionaryArray.from_arrays(v, dict_memo[k])
        else:
            df[k] = v

    df.set_tdf(tdf)

    # free shared memory from Python
    # https://github.com/omnisci/pymapd/issues/46
    # https://github.com/omnisci/pymapd/issues/31
    free_sm = shmdt(ctypes.cast(shm_ptr, ctypes.c_void_p))  # noqa

    return df
Example #9
def parse_encoded_schema(b64_schema):
    return pa.read_schema(pa.BufferReader(base64.b64decode(b64_schema)))