Example #1
def test_read_record_batch_on_stream_error_message():
    # ARROW-5374
    batch = pa.record_batch([pa.array([b"foo"], type=pa.utf8())],
                            names=['strs'])
    stream = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(stream, batch.schema) as writer:
        writer.write_batch(batch)
    buf = stream.getvalue()
    with pytest.raises(IOError, match="type record batch but got schema"):
        pa.read_record_batch(buf, batch.schema)
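The error arises because buf holds a complete IPC stream whose first message is the schema, while pa.read_record_batch expects a single record batch message. A minimal sketch of reading the same stream back correctly with the stock stream reader:

import pyarrow as pa

batch = pa.record_batch([pa.array(["foo"])], names=['strs'])
sink = pa.BufferOutputStream()
with pa.RecordBatchStreamWriter(sink, batch.schema) as writer:
    writer.write_batch(batch)

# A stream reader consumes the schema message first, then yields the batches
reader = pa.ipc.open_stream(sink.getvalue())
batches = list(reader)
assert batches[0].equals(batch)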
Example #2
    def _parse_arrow_message(self, message):
        self._parse_arrow_schema()

        return pyarrow.read_record_batch(
            pyarrow.py_buffer(message.arrow_record_batch.serialized_record_batch),
            self._schema,
        )
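For context, a self-contained round trip through pyarrow.py_buffer; the BigQuery Storage message type above is not reproduced here, so a locally serialized batch stands in for the wire bytes:

import pyarrow

batch = pyarrow.record_batch([pyarrow.array([1, 2, 3])], names=["x"])
wire_bytes = batch.serialize().to_pybytes()  # stand-in for the received payload
restored = pyarrow.read_record_batch(pyarrow.py_buffer(wire_bytes), batch.schema)
assert restored.equals(batch)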
Example #3
def decode_pyarrow_records(b64_schema, b64_records):
    """
    Decodes an encoded record set given a similarly encoded schema.
    Returns just the records, since the schema is provided separately.
    """
    pa_schema = AthenaSDKUtils.parse_encoded_schema(b64_schema)
    return pa.read_record_batch(base64.b64decode(b64_records), pa_schema)
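The matching encoder is not part of the snippet; a plausible sketch, assuming the records were serialized with RecordBatch.serialize() (encode_pyarrow_records is illustrative, and AthenaSDKUtils is project-specific):

import base64
import pyarrow as pa

def encode_pyarrow_records(batch):
    # Hypothetical inverse of decode_pyarrow_records: base64 over the
    # IPC message produced by RecordBatch.serialize()
    return base64.b64encode(batch.serialize().to_pybytes()).decode('ascii')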
Example #4
    def write_mutable_tensor(self, session_id, name, payload_type, body):
        import pyarrow

        from ..serialize import dataserializer
        from ..tensor.core import Indexes
        session_uid = SessionActor.gen_uid(session_id)
        session_ref = self.get_actor_ref(session_uid)

        index_json_size = np.frombuffer(body[0:8], dtype=np.int64).item()
        index_json = json.loads(body[8:8 + index_json_size].decode('ascii'))
        index = Indexes.from_json(index_json).indexes
        if payload_type is None:
            value = dataserializer.loads(body[8 + index_json_size:])
        elif payload_type == 'tensor':
            tensor_chunk_offset = 8 + index_json_size
            with pyarrow.BufferReader(body[tensor_chunk_offset:]) as reader:
                value = pyarrow.read_tensor(reader).to_numpy()
        elif payload_type == 'record_batch':
            schema_size = np.frombuffer(
                body[8 + index_json_size:8 + index_json_size + 8],
                dtype=np.int64).item()
            schema_offset = 8 + index_json_size + 8
            with pyarrow.BufferReader(
                    body[schema_offset:schema_offset + schema_size]) as reader:
                schema = pyarrow.read_schema(reader)
            record_batch_offset = schema_offset + schema_size
            with pyarrow.BufferReader(body[record_batch_offset:]) as reader:
                record_batch = pyarrow.read_record_batch(reader, schema)
                value = record_batch.to_pandas().to_records(index=False)
        else:
            raise ValueError(f'Unsupported payload type: {payload_type}')
        return session_ref.write_mutable_tensor(name, index, value)
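The handler above parses a hand-rolled frame: an int64 index-JSON size, the index JSON, then (for the 'record_batch' payload) an int64 schema size, the serialized schema, and finally the record batch message. A hypothetical encoder matching that layout (build_record_batch_payload is illustrative, not part of the project):

import json

import numpy as np
import pyarrow


def build_record_batch_payload(index_json, batch):
    # Frame layout: [int64 index size][index JSON]
    #               [int64 schema size][schema IPC][record batch IPC]
    index_bytes = json.dumps(index_json).encode('ascii')
    schema_bytes = batch.schema.serialize().to_pybytes()
    batch_bytes = batch.serialize().to_pybytes()
    return (np.int64(len(index_bytes)).tobytes() + index_bytes
            + np.int64(len(schema_bytes)).tobytes() + schema_bytes
            + batch_bytes)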
Example #5
def test_schema_batch_serialize_methods():
    nrows = 5
    df = pd.DataFrame({
        'one': np.random.randn(nrows),
        'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux']})
    batch = pa.RecordBatch.from_pandas(df)

    s_schema = batch.schema.serialize()
    s_batch = batch.serialize()

    recons_schema = pa.read_schema(s_schema)
    recons_batch = pa.read_record_batch(s_batch, recons_schema)
    assert recons_batch.equals(batch)
Example #6
def test_batch_serialize():
    batch = make_recordbatch(10)
    hbuf = batch.serialize()
    cbuf = cuda.serialize_record_batch(batch, global_context)
    # test that read_record_batch works properly:
    cuda.read_record_batch(batch.schema, cbuf)
    buf = cbuf.copy_to_host()
    assert hbuf.equals(buf)
    batch2 = pa.read_record_batch(buf, batch.schema)
    assert hbuf.equals(batch2.serialize())
    assert batch.num_columns == batch2.num_columns
    assert batch.num_rows == batch2.num_rows
    assert batch.column(0).equals(batch2.column(0))
    assert batch.equals(batch2)
Example #7
def test_batch_serialize():
    batch = make_recordbatch(10)
    hbuf = batch.serialize()
    cbuf = cuda.serialize_record_batch(batch, global_context)
    # test that read_record_batch works properly:
    cuda.read_record_batch(cbuf, batch.schema)
    buf = cbuf.copy_to_host()
    assert hbuf.equals(buf)
    batch2 = pa.read_record_batch(buf, batch.schema)
    assert hbuf.equals(batch2.serialize())
    assert batch.num_columns == batch2.num_columns
    assert batch.num_rows == batch2.num_rows
    assert batch.column(0).equals(batch2.column(0))
    assert batch.equals(batch2)
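Examples #6 and #7 differ only in the argument order of the cuda.read_record_batch call: the (buffer, schema) order in Example #7 matches the current pyarrow.cuda signature, while the (schema, buffer) form in Example #6 appears to predate an API change, so only one of the two runs against any given pyarrow release.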
Example #8
def _load_data(buf, schema):
    """
    Load a `pandas.DataFrame` from a buffer written to shared memory

    Parameters
    ----------
    buf : pyarrow.Buffer
    schema : pyarrow.Schema

    Returns
    -------
    df : pandas.DataFrame
    """
    import pyarrow as pa

    message = pa.read_message(buf)
    rb = pa.read_record_batch(message, schema)
    return rb.to_pandas()
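A minimal sketch of the producing side that _load_data expects (_dump_data and the shared-memory transport are assumptions; the reader only implies a single record batch message plus a schema shared out of band):

import pyarrow as pa


def _dump_data(df):
    # Hypothetical writer: serialize one record batch message; the schema
    # is handed to _load_data separately
    batch = pa.RecordBatch.from_pandas(df)
    return batch.serialize(), batch.schema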
Example #9
def _load_data(buf, schema, tdf=None):
    """
    Load a `pandas.DataFrame` from a buffer written to shared memory

    Parameters
    ----------
    buf : pyarrow.Buffer
    schema : pyarrow.Schema
    tdf : TDataFrame, optional

    Returns
    -------
    df : pandas.DataFrame
    """
    message = pa.read_message(buf)
    rb = pa.read_record_batch(message, schema)
    df = rb.to_pandas()
    df.set_tdf = MethodType(set_tdf, df)
    df.get_tdf = MethodType(get_tdf, df)
    df.set_tdf(tdf)
    return df
Example #10
    def test_read_record_batch(self):
        batches, messages = self._get_example_messages()

        for batch, message in zip(batches, messages[1:]):
            read_batch = pa.read_record_batch(message, batch.schema)
            assert read_batch.equals(batch)
Example #11
def test_message_read_record_batch(example_messages):
    batches, messages = example_messages

    for batch, message in zip(batches, messages[1:]):
        read_batch = pa.read_record_batch(message, batch.schema)
        assert read_batch.equals(batch)
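_get_example_messages and example_messages are test fixtures; messages[0] is the schema message, which is why the loops above zip against messages[1:]. A minimal sketch of such a fixture, assuming pa.MessageReader is used to split a written stream back into its messages:

import pyarrow as pa

def example_messages():
    batch = pa.record_batch([pa.array(range(5))], names=['f0'])
    sink = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(sink, batch.schema) as writer:
        writer.write_batch(batch)
    reader = pa.MessageReader.open_stream(pa.BufferReader(sink.getvalue()))
    return [batch], list(reader)  # messages[0] is the schema message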
Example #12
def get_dfs_arrow(object_ids):
    buffers = client.get_buffers(object_ids)
    return [
        pa.read_record_batch(pa.BufferReader(buf), test_schema()).to_pandas()
        for buf in buffers
    ]
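Here client.get_buffers and test_schema belong to the surrounding project (an object-store client); a self-contained version of the same round trip, showing that read_record_batch accepts a NativeFile such as pa.BufferReader:

import pyarrow as pa

batch = pa.record_batch([pa.array([1.0, 2.0])], names=['v'])
buf = batch.serialize()
df = pa.read_record_batch(pa.BufferReader(buf), batch.schema).to_pandas()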