def test_open_stream_from_buffer(stream_fixture):
    # ARROW-2859
    stream_fixture.write_batches()
    source = stream_fixture.get_source()

    reader1 = pa.ipc.open_stream(source)
    reader2 = pa.ipc.open_stream(pa.BufferReader(source))
    reader3 = pa.RecordBatchStreamReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)

    st1 = reader1.stats
    assert st1.num_messages == 6
    assert st1.num_record_batches == 5
    assert reader2.stats == st1
    assert reader3.stats == st1

    assert tuple(st1) == tuple(stream_fixture.write_stats)
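`stream_fixture` is not shown here; as a self-contained reference, a minimal hedged sketch of the writer side that would produce such a `source` (names and batch contents are illustrative):

import pyarrow as pa

# Schema message + 5 record batches = 6 messages, matching the stats above.
batch = pa.record_batch([pa.array([1, 2, 3])], names=["x"])
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    for _ in range(5):
        writer.write_batch(batch)
source = sink.getvalue()  # a pa.Buffer, accepted by all three readers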
Example #2
def read_arrow(source):
    # print('reading arrow file as arrow table from disk')
    reader = pa.RecordBatchStreamReader(source)
    pa_df = reader.read_all()
    return pa_df
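A hedged usage sketch for `read_arrow` (the file name is illustrative): write a table to disk as an Arrow IPC stream, then read it back through an OSFile handle.

import pyarrow as pa

table = pa.table({"a": [1, 2, 3]})
with pa.OSFile("data.arrow", "wb") as sink:
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)

with pa.OSFile("data.arrow", "rb") as f:
    assert read_arrow(f).equals(table)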
Example #3
import pyarrow as pa
import pandas as pd
import base64

streamB64String = (
    "/////3gGAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAABcAAAD0BQAAnAUAAFwFAAAYBQAA2AQAAJgEAABYBAAAGAQAANgDAACQAwAAUAMAAAADAAC8AgAAeAIAADACAADoAQAAnAEAAFQBAAAUAQAA0AAAAJAAAABIAAAABAAAAHr6//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAAMP3//wIAAAAHAAAADQAAAHNzX25ldF9wcm9maXQAAAC6+v//FAAAABQAAAAUAAAAAAAHARgAAAAAAAAAAAAAAHD9//8CAAAABwAAABMAAABzc19uZXRfcGFpZF9pbmNfdGF4AP76//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAAtP3//wIAAAAHAAAACwAAAHNzX25ldF9wYWlkADr7//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAA8P3//wIAAAAHAAAADQAAAHNzX2NvdXBvbl9hbXQAAAB6+///FAAAABQAAAAUAAAAAAAHARgAAAAAAAAAAAAAADD+//8CAAAABwAAAAoAAABzc19leHRfdGF4AAC2+///FAAAABQAAAAUAAAAAAAHARgAAAAAAAAAAAAAAGz+//8CAAAABwAAABEAAABzc19leHRfbGlzdF9wcmljZQAAAPr7//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAAsP7//wIAAAAHAAAAFQAAAHNzX2V4dF93aG9sZXNhbGVfY29zdAAAAEL8//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAA+P7//wIAAAAHAAAAEgAAAHNzX2V4dF9zYWxlc19wcmljZQAAhvz//xQAAAAUAAAAFAAAAAAABwEYAAAAAAAAAAAAAAA8////AgAAAAcAAAATAAAAc3NfZXh0X2Rpc2NvdW50X2FtdADK/P//FAAAABQAAAAUAAAAAAAHARgAAAAAAAAAAAAAAID///8CAAAABwAAAA4AAABzc19zYWxlc19wcmljZQAACv3//xQAAAAUAAAAFAAAAAAABwEYAAAAAAAAAAAAAADA////AgAAAAcAAAANAAAAc3NfbGlzdF9wcmljZQAAAEr9//8UAAAAFAAAABwAAAAAAAcBIAAAAAAAAAAAAAAACAAMAAgABAAIAAAAAgAAAAcAAAARAAAAc3Nfd2hvbGVzYWxlX2Nvc3QAAACW/f//FAAAABQAAAAUAAAAAAACARgAAAAAAAAAAAAAAIT9//8AAAABIAAAAAsAAABzc19xdWFudGl0eQDS/f//FAAAABQAAAAUAAAAAAACARgAAAAAAAAAAAAAAMD9//8AAAABQAAAABAAAABzc190aWNrZXRfbnVtYmVyAAAAABb+//8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAABP7//wAAAAFAAAAACwAAAHNzX3Byb21vX3NrAFL+//8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAAQP7//wAAAAFAAAAACwAAAHNzX3N0b3JlX3NrAI7+//8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAAfP7//wAAAAFAAAAACgAAAHNzX2FkZHJfc2sAAMr+//8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAAuP7//wAAAAFAAAAACwAAAHNzX2hkZW1vX3NrAAb///8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAA9P7//wAAAAFAAAAACwAAAHNzX2NkZW1vX3NrAEL///8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAAMP///wAAAAFAAAAADgAAAHNzX2N1c3RvbWVyX3NrAACC////FAAAABQAAAAUAAAAAAACARgAAAAAAAAAAAAAAHD///8AAAABQAAAAAoAAABzc19pdGVtX3NrAAC+////FAAAABQAAAAUAAAAAAACARgAAAAAAAAAAAAAAKz///8AAAABQAAAAA8AAABzc19zb2xkX3RpbWVfc2sAAAASABgAFAATABIADAAAAAgABAASAAAAFAAAABQAAAAcAAAAAAACASAAAAAAAAAAAAAAAAgADAAIAAcACAAAAAAAAAFAAAAADwAAAHNzX3NvbGRfZGF0ZV9zawD/////qAQAABQAAAAAAAAADAAWAA4AFQAQAAQADAAAAIALAAAAAAAAAAAEABAAAAAAAwoAGAAMAAgABAAKAAAAFAAAAPgCAAAKAAAAAAAAAAAAAAAuAAAAAAAAAAAAAAACAAAAAAAAAAgAAAAAAAAAUAAAAAAAAABYAAAAAAAAAAIAAAAAAAAAYAAAAAAAAABQAAAAAAAAALAAAAAAAAAAAgAAAAAAAAC4AAAAAAAAAFAAAAAAAAAACAEAAAAAAAACAAAAAAAAABABAAAAAAAAUAAAAAAAAABgAQAAAAAAAAIAAAAAAAAAaAEAAAAAAABQAAAAAAAAALgBAAAAAAAAAgAAAAAAAADAAQAAAAAAAFAAAAAAAAAAEAIAAAAAAAACAAAAAAAAABgCAAAAAAAAUAAAAAAAAABoAgAAAAAAAAIAAAAAAAAAcAIAAAAAAABQAAAAAAAAAMACAAAAAAAAAgAAAAAAAADIAgAAAAAAAFAAAAAAAAAAGAMAAAAAAAACAAAAAAAAACADAAAAAAAAUAAAAAAAAABwAwAAAAAAAAIAAAAAAAAAeAMAAAAAAAAoAAAAAAAAAKADAAAAAAAAAgAAAAAAAACoAwAAAAAAAKAAAAAAAAAASAQAAAAAAAACAAAAAAAAAFAEAAAAAAAAoAAAAAAAAADwBAAAAAAAAAIAAAAAAAAA+AQAAAAAAACgAAAAAAAAAJgFAAAAAAAAAgAAAAAAAACgBQAAAAAAAKAAAAAAAAAAQAYAAAAAAAACAAAAAAAAAEgGAAAAAAAAoAAAAAAAAADoBgAAAAAAAAIAAAAAAAAA8AYAAAAAAACgAAAAAAAAAJAHAAAAAAAAAgAAAAAAAACYBwAAAAAAAKAAAAAAAAAAOAgAAAAAAAACAAAAAAAAAEAIAAAAAAAAoAAAAAAAAADgCAAAAAAAAAIAAAAAAAAA6AgAAAAAAACgAAAAAAAAAIgJAAAAAAAAAgAAAAAAAACQCQAAAAAAAKAAAAAAAAAAMAoAAAAAAAACAAAAAAAAADgKAAAAAAAAoAAAAAAAAADYCgAAAAAAAAIAAAAAAAAA4AoAAAAAAACgAAAAAAAAAAAAAAAXAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAA"
    "AAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAD/AwAAAAAAAGVpJQAAAAAAZWklAAAAAABlaSUAAAAAAGVpJQAAAAAAZWklAAAAAABlaSUAAAAAAGVpJQAAAAAAZWklAAAAAABlaSUAAAAAAGVpJQAAAAAA/wMAAAAAAADX/wAAAAAAANf/AAAAAAAA1/8AAAAAAADX/wAAAAAAANf/AAAAAAAA1/8AAAAAAADX/wAAAAAAANf/AAAAAAAA1/8AAAAAAADX/wAAAAAAAP8DAAAAAAAAQwQAAAAAAADGBQAAAAAAAKkHAAAAAAAA1gEAAAAAAACUAQAAAAAAABUHAAAAAAAAnwUAAAAAAADLAAAAAAAAAC0HAAAAAAAAzAUAAAAAAAD/AwAAAAAAAAYAAAAAAAAABgAAAAAAAAAGAAAAAAAAAAYAAAAAAAAABgAAAAAAAAAGAAAAAAAAAAYAAAAAAAAABgAAAAAAAAAGAAAAAAAAAAYAAAAAAAAA/wMAAAAAAAABBwkAAAAAAAEHCQAAAAAAAQcJAAAAAAABBwkAAAAAAAEHCQAAAAAAAQcJAAAAAAABBwkAAAAAAAEHCQAAAAAAAQcJAAAAAAABBwkAAAAAAP8DAAAAAAAAZA0AAAAAAABkDQAAAAAAAGQNAAAAAAAAZA0AAAAAAABkDQAAAAAAAGQNAAAAAAAAZA0AAAAAAABkDQAAAAAAAGQNAAAAAAAAZA0AAAAAAAD/AwAAAAAAAEcDAAAAAAAARwMAAAAAAABHAwAAAAAAAEcDAAAAAAAARwMAAAAAAABHAwAAAAAAAEcDAAAAAAAARwMAAAAAAABHAwAAAAAAAEcDAAAAAAAA/wMAAAAAAAACAAAAAAAAAAIAAAAAAAAAAgAAAAAAAAACAAAAAAAAAAIAAAAAAAAAAgAAAAAAAAACAAAAAAAAAAIAAAAAAAAAAgAAAAAAAAACAAAAAAAAAP8DAAAAAAAAAgAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAACAAAAAAAAAAIAAAAAAAAAAQAAAAAAAAADAAAAAAAAAAEAAAAAAAAAAwAAAAAAAAD/AwAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAA/wMAAAAAAABPAAAAJQAAAGMAAAAOAAAAZAAAAFsAAAAFAAAASAAAAA4AAAA6AAAA/wMAAAAAAAB1BAAAAAAAAAAAAAAAAAAA2xgAAAAAAAAAAAAAAAAAAHQfAAAAAAAAAAAAAAAAAABpFgAAAAAAAAAAAAAAAAAAzAkAAAAAAAAAAAAAAAAAAIQkAAAAAAAAAAAAAAAAAAAsBAAAAAAAAAAAAAAAAAAAGCEAAAAAAAAAAAAAAAAAAIIEAAAAAAAAAAAAAAAAAADJAQAAAAAAAAAAAAAAAAAA/wMAAAAAAABPBwAAAAAAAAAAAAAAAAAAhScAAAAAAAAAAAAAAAAAAMg1AAAAAAAAAAAAAAAAAADOHQAAAAAAAAAAAAAAAAAAZg4AAAAAAAAAAAAAAAAAAFsqAAAAAAAAAAAAAAAAAAA3BgAAAAAAAAAAAAAAAAAArysAAAAAAAAAAAAAAAAAAJkEAAAAAAAAAAAAAAAAAAAWAgAAAAAAAAAAAAAAAAAA/wMAAAAAAAAYAQAAAAAAAAAAAAAAAAAAMxAAAAAAAAAAAAAAAAAAAM4gAAAAAAAAAAAAAAAAAABiAgAAAAAAAAAAAAAAAAAASQAAAAAAAAAAAAAAAAAAAGwkAAAAAAAAAAAAAAAAAACcAgAAAAAAAAAAAAAAAAAABhgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABgAQAAAAAAAAAAAAAAAAAA/wMAAAAAAADiJgAAAAAAAAAAAAAAAAAA+xEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/wMAAAAAAABoVgAAAAAAAAAAAAAAAAAAX1cCAAAAAAAAAAAAAAAAAKqvDAAAAAAAAAAAAAAAAABcIQAAAAAAAAAAAAAAAAAAhBwAAAAAAAAAAAAAAAAAAGTyDAAAAAAAAAAAAAAAAAAMDQAAAAAAAAAAAAAAAAAAsMEGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADATwAAAAAAAAAAAAAAAAAA/wMAAAAAAAAbYAEAAAAAAAAAAAAAAAAAp5cDAAAAAAAAAAAAAAAAANwpDAAAAAAAAAAAAAAAAAC+OQEAAAAAAAAAAAAAAAAAsNMDAAAAAAAAAAAAAAAAAOz6DAAAAAAAAAAAAAAAAADcFAAAAAAAAAAAAAAAAAAAwE4JAAAAAAAAAAAAAAAAABw/AAAAAAAAAAAAAAAAAACKZwAAAAAAAAAAAAAAAAAA/wMAAAAAAABhQQIAAAAAAAAAAAAAAAAAObYFAAAAAAAAAAAAAAAAAFjMFAAAAAAAAAAAAAAAAABEoQEAAAAAAAAAAAAAAAAA2J8FAAAAAAAAAAAAAAAAAFkODwAAAAAAAAAAAAAAAAATHwAAAAAAAAAAAAAAAAAAOEkMAAAAAAAAAAAAAAAAAF5AAAAAAAAAAAAAAAAAAAD8eAAAAAAAAAAAAAAAAAAA/wMAAAAAAABgAgAAAAAAAAAAAAAAAAAAQRcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAkQIAAAAAAAAAAAAAAAAAAG5jAAAAAAAAAAAAAAAAAADpAAAAAAAAAAAAAAAAAAAAMEUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/wMAAAAAAADiJgAAAAAAAAAAAAAAAAAA+xEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/wMAAAAAAACGLwAAAAA"
    "AAAAAAAAAAAAAZEUCAAAAAAAAAAAAAAAAAKqvDAAAAAAAAAAAAAAAAABcIQAAAAAAAAAAAAAAAAAAhBwAAAAAAAAAAAAAAAAAAGTyDAAAAAAAAAAAAAAAAAAMDQAAAAAAAAAAAAAAAAAAsMEGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADATwAAAAAAAAAAAAAAAAAA/wMAAAAAAADmMQAAAAAAAAAAAAAAAAAApVwCAAAAAAAAAAAAAAAAAKqvDAAAAAAAAAAAAAAAAABcIQAAAAAAAAAAAAAAAAAAFR8AAAAAAAAAAAAAAAAAANJVDQAAAAAAAAAAAAAAAAD1DQAAAAAAAAAAAAAAAAAA4AYHAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADATwAAAAAAAAAAAAAAAAAA/wMAAAAAAABrz/7/////////////////va3+/////////////////86FAAAAAAAAAAAAAAAAAACe5/7/////////////////1Ej8/////////////////3j3//////////////////8w+P//////////////////8HL9/////////////////+TA//////////////////826P///////////////////////wAAAAA="
)
streamBytes = base64.b64decode(streamB64String)  # b64decode already returns bytes

reader = pa.RecordBatchStreamReader(streamBytes)
df = reader.read_pandas()
print(df)
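For reference, a hedged sketch of the inverse direction: serializing a table to an IPC stream and base64-encoding it (the column name is illustrative, not the TPC-DS store_sales schema embedded above):

import base64

import pyarrow as pa

table = pa.table({"ss_quantity": [1, 2, 3]})  # illustrative data
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
    writer.write_table(table)
encoded = base64.b64encode(sink.getvalue().to_pybytes()).decode("ascii")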
Example #4
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a GpuDataFrame

    Parameters
    ----------
    tdf : TDataFrame

    Returns
    -------
    gdf : GpuDataFrame
    """

    import ctypes
    from types import MethodType

    import numpy as np
    import pyarrow as pa
    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.core.dataframe import DataFrame
    from cudf._lib.arrow._cuda import Context, IpcMemHandle
    from numba import cuda

    # load_buffer, shmdt, set_tdf and get_tdf are helpers expected to be
    # defined in the enclosing module (pymapd's shared-memory utilities).

    ipc_handle = IpcMemHandle.from_buffer(pa.py_buffer(tdf.df_handle))
    ctx = Context()
    ipc_buf = ctx.open_ipc_buffer(ipc_handle)
    ipc_buf.context.synchronize()

    schema_buffer, shm_ptr = load_buffer(tdf.sm_handle, tdf.sm_size)

    buffer = pa.BufferReader(schema_buffer)
    schema = pa.read_schema(buffer)

    # Dictionary Memo functionality used to
    # deserialize on the C++ side is not
    # exposed on the pyarrow side, so we need to
    # handle this on our own.
    dict_memo = {}

    try:
        dict_batch_reader = pa.RecordBatchStreamReader(buffer)
        updated_fields = []

        for f in schema:
            if pa.types.is_dictionary(f.type):
                msg = dict_batch_reader.read_next_batch()
                dict_memo[f.name] = msg.column(0)
                updated_fields.append(pa.field(f.name, f.type.index_type))
            else:
                updated_fields.append(pa.field(f.name, f.type))

        schema = pa.schema(updated_fields)
    except pa.ArrowInvalid:
        # This message does not have any dictionary encoded
        # columns
        pass

    dtype = np.dtype(np.byte)
    darr = cuda.devicearray.DeviceNDArray(
        shape=ipc_buf.size,
        strides=dtype.itemsize,
        dtype=dtype,
        gpu_data=ipc_buf.to_numba(),
    )

    reader = GpuArrowReader(schema, darr)
    df = DataFrame()
    df.set_tdf = MethodType(set_tdf, df)
    df.get_tdf = MethodType(get_tdf, df)

    for k, v in reader.to_dict().items():
        if k in dict_memo:
            df[k] = pa.DictionaryArray.from_arrays(v, dict_memo[k])
        else:
            df[k] = v

    df.set_tdf(tdf)

    # free shared memory from Python
    # https://github.com/omnisci/pymapd/issues/46
    # https://github.com/omnisci/pymapd/issues/31
    free_sm = shmdt(ctypes.cast(shm_ptr, ctypes.c_void_p))  # noqa

    return df
Example #5
def read_data(file_name):
    reader = pa.RecordBatchStreamReader(file_name)
    table = reader.read_all()
    print(str(table.to_pydict()))
Example #6
def deserialize_batch(header, frames):
    blob = frames[0]
    reader = pyarrow.RecordBatchStreamReader(pyarrow.BufferReader(blob))
    return reader.read_next_batch()
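deserialize_batch is the reading half of a serializer pair in the style of dask/distributed's custom serialization hooks; a minimal hedged sketch of the matching writer half (the header contents are protocol-specific and left empty here):

import pyarrow

def serialize_batch(batch):
    # Serialize one RecordBatch into a single frame.
    sink = pyarrow.BufferOutputStream()
    with pyarrow.RecordBatchStreamWriter(sink, batch.schema) as writer:
        writer.write_batch(batch)
    return {}, [sink.getvalue()]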
Example #7
def deserialize_data_frame(path):
    global read_data_frame, read_types, read_serializers, _pandas_native_types_, path_to_mmap
    path_to_mmap = path
    with pyarrow.OSFile(path, 'rb') as f:
        stream_reader = pyarrow.RecordBatchStreamReader(f)
        arrowtable = stream_reader.read_all()
        # metadata
        pandas_metadata = json.loads(
            arrowtable.schema.metadata[b'pandas'].decode('utf-8'))
        names = []
        for col in pandas_metadata['columns']:
            names.append(col['name'])
            read_types.append(col['metadata']['type_id'])
            ser_id = col['metadata']['serializer_id']
            if ser_id != '':
                read_serializers[col['name']] = ser_id

        # data
        read_data_frame = pandas.DataFrame()
        for arrowcolumn in arrowtable.itercolumns():
            typeidx = names.index(arrowcolumn.name)
            coltype = read_types[typeidx]
            if coltype in _pandas_native_types_:
                dfcol = arrowcolumn.to_pandas()
            else:
                if coltype == _types_.INTEGER_LIST or coltype == _types_.INTEGER_SET:
                    dfcol = pandas.Series(
                        collection_generator(arrowcolumn,
                                             coltype == _types_.INTEGER_SET, 4,
                                             'i'))
                elif coltype == _types_.LONG_LIST or coltype == _types_.LONG_SET:
                    dfcol = pandas.Series(
                        collection_generator(arrowcolumn,
                                             coltype == _types_.LONG_SET, 8,
                                             'q'))
                elif coltype == _types_.DOUBLE_LIST or coltype == _types_.DOUBLE_SET:
                    dfcol = pandas.Series(
                        collection_generator(arrowcolumn,
                                             coltype == _types_.DOUBLE_SET, 8,
                                             'd'))
                elif coltype == _types_.FLOAT_LIST or coltype == _types_.FLOAT_SET:
                    dfcol = pandas.Series(
                        collection_generator(arrowcolumn,
                                             coltype == _types_.FLOAT_SET, 4,
                                             'f'))
                elif coltype == _types_.BOOLEAN_LIST or coltype == _types_.BOOLEAN_SET:
                    dfcol = pandas.Series(
                        boolean_collection_generator(
                            arrowcolumn, coltype == _types_.BOOLEAN_SET))
                elif coltype == _types_.STRING_LIST or coltype == _types_.STRING_SET:
                    dfcol = pandas.Series(
                        string_collection_generator(
                            arrowcolumn, coltype == _types_.STRING_SET))
                elif coltype == _types_.BYTES_LIST or coltype == _types_.BYTES_SET:
                    dfcol = pandas.Series(
                        bytes_collection_generator(
                            arrowcolumn, coltype == _types_.BYTES_SET))
                else:
                    raise KeyError('Type with id ' + str(coltype) +
                                   ' cannot be deserialized!')
            # Note: we only have one index column (the KNIME RowKeys)
            if arrowcolumn.name in pandas_metadata['index_columns']:
                indexcol = dfcol
            else:
                read_data_frame[arrowcolumn.name] = dfcol

        if 'indexcol' not in locals():
            raise NameError(
                'Variable indexcol has not been set properly, exiting!')

        if len(read_data_frame.columns) > 0:
            read_data_frame.set_index(keys=indexcol, inplace=True)
        else:
            read_data_frame = pandas.DataFrame(index=indexcol)
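The b'pandas' schema metadata parsed above is what a writer such as pa.Table.from_pandas embeds; KNIME additionally stores its own per-column 'metadata' entries (type_id, serializer_id). A minimal sketch of the stock pandas part:

import json

import pandas
import pyarrow

df = pandas.DataFrame({"a": [1, 2]})
table = pyarrow.Table.from_pandas(df)
meta = json.loads(table.schema.metadata[b'pandas'].decode('utf-8'))
print(meta['index_columns'], [col['name'] for col in meta['columns']])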
Example #8
def _deserialize_pyarrow_table(buf):
    with pa.RecordBatchStreamReader(buf) as reader:
        return reader.read_all()
Example #9
def read_arrow_as_pandas(source):
    print('reading arrow file as pandas dataframe from disk')
    reader = pa.RecordBatchStreamReader(source)
    pa_df = reader.read_all()
    return pa_df.to_pandas()
Example #10
    def time_read_to_dataframe(self, *args):
        reader = pa.RecordBatchStreamReader(self.source)
        table = reader.read_all()
        df = table.to_pandas()
Example #11
def hello():
    channel = grpc.insecure_channel('untrusted:50051')
    stub = codeRunner_pb2_grpc.codeRunnerStub(channel)

    rand = random.choice([True, False])

    from pyarrow import csv
    fn = "IRAhandle_tweets_1.csv" if rand else "mimic.csv"
    table = csv.read_csv(fn)
    start = time.perf_counter()  # time.clock() was removed in Python 3.8

    print("data loaded")

    batches = table.to_batches()
    print(1)
    client = plasma.connect("/tmp/plasma")

    print(2)

    code = """
import time
while True:
    print(7)
    time.sleep(0.5)
""" if False else """
import os
import pyarrow
import sys

authors = dataTable.column("author")
newData = []
for i in range(len(authors)):
    newData.append(1 if i == 0 or authors[i] != authors[i-1] else newData[-1]+1)
newColumn = dataTable.column(3).from_array("authorTweetCount", [newData])
newTable = dataTable.append_column(newColumn)
    """ if rand else """
import os
import pyarrow
import sys

ages = dataTable.column("age")
maxV = max(ages.to_pylist())
newData = []
for i in ages:
    newData.append(1 if i == maxV else 0)
newColumn = dataTable.column(3).from_array("oldest", [newData])
newTable = dataTable.append_column(newColumn)
    """

    tables = []

    for i in range(len(batches)):
        id_ = randString()

        strId = makeID(id_)

        mock_sink = pyarrow.MockOutputStream()  # find data size
        stream_writer = pyarrow.RecordBatchStreamWriter(
            mock_sink, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()
        data_size = mock_sink.size()
        buf = client.create(strId, data_size)

        stream = pyarrow.FixedSizeBufferWriter(buf)
        stream_writer = pyarrow.RecordBatchStreamWriter(
            stream, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()

        client.seal(strId)
        print("sent batch " + str(i + 1))

        codeToSend = codeRunner_pb2.code(toRun=code, id_=id_)

        newId = stub.runCode(codeToSend, timeout=1)
        newId = newId.id_

        [data] = client.get_buffers([makeID(newId)])
        outputBuf = pyarrow.py_buffer(data.to_pybytes())
        buffer_ = pyarrow.BufferReader(outputBuf)
        reader = pyarrow.RecordBatchStreamReader(buffer_)
        if i == 0:
            datatable = reader.read_all()
        else:
            # read_all() already returns a Table; no need to round-trip
            # through to_batches()/from_batches()
            datatable = pyarrow.concat_tables(
                [datatable, reader.read_all()])

    html = str(datatable.column("authorTweetCount" if rand else "oldest").data)
    print("data received after " + str(time.clock() - start))

    return html
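The size-then-write pattern above (a MockOutputStream pass to measure, then a FixedSizeBufferWriter pass into the plasma allocation) distills into a small helper; a hedged sketch (note that the plasma module has since been deprecated and removed in recent pyarrow releases):

import pyarrow

def put_batch(client, object_id, batch):
    # Pass 1: serialize into a mock sink just to learn the exact size.
    mock_sink = pyarrow.MockOutputStream()
    with pyarrow.RecordBatchStreamWriter(mock_sink, batch.schema) as writer:
        writer.write_batch(batch)
    # Pass 2: allocate exactly that much in plasma and write for real.
    buf = client.create(object_id, mock_sink.size())
    stream = pyarrow.FixedSizeBufferWriter(buf)
    with pyarrow.RecordBatchStreamWriter(stream, batch.schema) as writer:
        writer.write_batch(batch)
    client.seal(object_id)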
Example #12
        def handle_batch() -> None:
            nonlocal dataframe
            nonlocal batch
            nonlocal error

            try:
                if dataframe is None:
                    batch = pyarrow.RecordBatchStreamReader(read_stream)
                    dataframe = batch.read_pandas()

                    if encoding is not None:

                        def decode(value: typing.Any) -> typing.Any:
                            if type(value) in (bytes, bytearray):
                                assert encoding is not None

                                return value.decode(encoding)

                            if type(value) is tuple:
                                return tuple(decode(child) for child in value)

                            if type(value) is list:
                                return [decode(child) for child in value]

                            if type(value) is numpy.ndarray:
                                return numpy.array(
                                    [decode(child) for child in value])

                            if type(value) is set:
                                return {decode(child) for child in value}

                            if type(value) is frozenset:
                                return frozenset(
                                    decode(child) for child in value)

                            if type(value) is dict:
                                return {
                                    key: decode(child)
                                    for key, child in value.items()
                                }

                            return value

                        dataframe = pandas.DataFrame({
                            column: (dataframe[column].apply(decode)
                                     if dataframe[column].dtype == 'O' else
                                     dataframe[column])
                            for column in dataframe
                        })
                else:
                    if encoding is not None:

                        def encode(value: typing.Any) -> typing.Any:
                            if type(value) is str:
                                assert encoding is not None

                                return value.encode(encoding)

                            if type(value) is tuple:
                                return tuple(encode(child) for child in value)

                            if type(value) is list:
                                return [encode(child) for child in value]

                            if type(value) is numpy.ndarray:
                                return numpy.array(
                                    [encode(child) for child in value])

                            if type(value) is set:
                                return {encode(child) for child in value}

                            if type(value) is frozenset:
                                return frozenset(
                                    encode(child) for child in value)

                            if type(value) is dict:
                                return {
                                    key: encode(child)
                                    for key, child in value.items()
                                }

                            return value

                        dataframe = pandas.DataFrame({
                            column: (dataframe[column].apply(encode)
                                     if dataframe[column].dtype == 'O' else
                                     dataframe[column])
                            for column in dataframe
                        })

                    table = pyarrow.Table.from_arrays([
                        pyarrow.array(dataframe[column].values)
                        for column in dataframe
                    ], dataframe.columns)
                    batch = pyarrow.RecordBatchStreamWriter(
                        write_stream, table.schema)
                    batch.write_table(table)
                    dataframe = None
                    batch.close()
                    write_stream.close()
            except pyarrow.ArrowInvalid:
                pass
            except BaseException as raw_error:  # pylint: disable=broad-except
                error = raw_error
Example #13
    def _request(self, is_redirected=False, **kwargs):
        """
        Request something to the server.
        """
        connection = Connection(self.server_address)

        try:
            # Authentication
            self.authenticate(connection,
                              user=kwargs.pop('user', None),
                              password=kwargs.pop('password', None))

            # Sending request
            if is_redirected:
                connection.send(
                    {key: kwargs[key]
                     for key in ('request_type', 'path')})
            else:
                connection.send(kwargs)

            data = connection.recv()

            # Redirecting request (if necessary)
            if type(data) is dict and 'redirection_address' in data:

                for key in data:

                    if key != 'redirection_address':
                        kwargs[key] = data[key]

                connection.kill()
                connection.connect(tuple(data['redirection_address']))
                self.authenticate(connection)
                connection.send(kwargs)
                data = connection.recv()

            if type(data) is dict and 'msg' in data and data['msg']:
                log.info(data['msg'])

            # Processing request
            if kwargs['request_type'] == 'sync_databases':

                while data['msg'] != 'Done!':
                    data = connection.recv()
                    log.info(data['msg'])

            elif kwargs['request_type'] == 'new_batch':
                send_tables(connection, kwargs['files'], data)
                data = connection.recv()
            elif kwargs['request_type'] == 'query':
                reader = pa.RecordBatchStreamReader(
                    pa.BufferReader(connection.recv().getbuffer()))
                log.info('Done!')
                data['batch'] = reader.read_next_batch()
            elif kwargs['request_type'] == 'add_attachment':
                log.info(
                    f"Transferring '{os.path.basename(kwargs['file'])}' ({humansize(os.path.getsize(kwargs['file']))})..."
                )
                connection.send_file(kwargs['file'])
                data = connection.recv()
            elif kwargs['request_type'] == 'download_attachment':
                connection.recv_file(
                    os.path.join(kwargs['output_path'], kwargs['name']))
                data = connection.recv()

            return data
        finally:
            connection.kill()
Example #14
import pyarrow.plasma as plasma
import binascii
import pyarrow as pa
import sys

client = plasma.connect("/tmp/plasma", "", 0)

[buffers] = client.get_buffers([plasma.ObjectID(b"A" * 20)])

data = pa.BufferReader(buffers)

#print(data.read())
batch = pa.RecordBatchStreamReader(data)
Example #15
    def deserialize(self, header: Dict, buffers: List, context: Dict):
        reader = pa.RecordBatchStreamReader(pa.BufferReader(buffers[0]))
        if header['type'] == 'Table':
            return reader.read_all()
        else:
            return reader.read_next_batch()
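A hedged sketch of the writer half implied by header['type']: Table and RecordBatch share the same stream format, so the header only needs to record which one to rebuild.

import pyarrow as pa

def serialize(data):
    sink = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(sink, data.schema) as writer:
        if isinstance(data, pa.Table):
            writer.write_table(data)
        else:
            writer.write_batch(data)
    return {'type': type(data).__name__}, [sink.getvalue()]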
Example #16
def get_dfs(object_ids):
    """Retrieve dataframes from the object store given their object IDs."""
    buffers = client.get_buffers(object_ids)
    return [pa.RecordBatchStreamReader(buf).read_next_batch().to_pandas()
            for buf in buffers]
Example #17
File: db.py Project: ksens/SciDB-Py
    def iquery(self,
               query,
               fetch=False,
               use_arrow=None,
               atts_only=False,
               as_dataframe=True,
               dataframe_promo=True,
               schema=None,
               upload_data=None,
               upload_schema=None):
        """Execute query in SciDB

        :param string query: SciDB AFL query to execute

        :param bool fetch: If ``True``, download SciDB array (default
          ``False``)

        :param bool use_arrow: If ``True``, download SciDB array using
          Apache Arrow library. Requires ``accelerated_io_tools`` and
          ``aio`` enabled in ``Shim``. If ``True``, a Pandas DataFrame
          is returned (``as_dataframe`` has no effect) and null-able
          types are promoted as per Pandas `promotion scheme
          <http://pandas.pydata.org/pandas-docs/stable/gotchas.html
          #na-type-promotions>`_ (``dataframe_promo`` has no
          effect). If ``None`` the ``use_arrow`` value set at
          connection time is used (default ``None``)

        :param bool atts_only: If ``True``, download only SciDB array
          attributes without dimensions (default ``False``)

        :param bool as_dataframe: If ``True``, return a Pandas
          DataFrame. If ``False``, return a NumPy array (default
          ``True``)

        :param bool dataframe_promo: If ``True``, null-able types are
          promoted as per Pandas `promotion scheme
          <http://pandas.pydata.org/pandas-docs/stable/gotchas.html
          #na-type-promotions>`_ If ``False``, object records are used
          for null-able types (default ``True``)

        :param schema: Schema of the SciDB array to use when
          downloading the array. Schema is not verified. If schema is
          a Schema instance, it is copied. Otherwise, a
          :py:class:`Schema` object is built using
          :py:func:`Schema.fromstring` (default ``None``)

        >>> DB().iquery('build(<x:int64>[i=0:1; j=0:1], i + j)', fetch=True)
           i  j    x
        0  0  0  0.0
        1  0  1  1.0
        2  1  0  1.0
        3  1  1  2.0

        >>> DB().iquery("input({sch}, '{fn}', 0, '{fmt}')",
        ...             fetch=True,
        ...             upload_data=numpy.arange(3, 6))
           i  x
        0  0  3
        1  1  4
        2  2  5

        """
        # Set use_arrow using local/global
        if use_arrow is None:
            use_arrow = self.use_arrow

        # Special case: -- - set_namespace - --
        if query.startswith('set_namespace(') and query[-1] == ')':
            param = query[len('set_namespace('):-1]
            # Unquote if quoted. Will be quoted when set in prefix.
            if param[0] == "'" and param[-1] == "'":
                param = param[1:-1]
            self.namespace = param
            return

        if upload_data is not None:
            if isinstance(upload_data, numpy.ndarray):
                if upload_schema is None:
                    try:
                        upload_schema = Schema.fromdtype(upload_data.dtype)
                    except Exception as e:
                        warnings.warn(
                            'Mapping NumPy dtype to SciDB schema failed. ' +
                            'Try providing an explicit upload_schema')
                        raise e

                # Convert upload data to bytes
                if upload_schema.is_fixsize():
                    upload_data = upload_data.tobytes()
                else:
                    upload_data = upload_schema.tobytes(upload_data)

            # Check if placeholders are present
            place_holders = set(
                field_name
                for _1, field_name, _3, _4 in self._formatter.parse(query))
            if 'fn' not in place_holders:
                warnings.warn(
                    'upload_data provided, but {fn} placeholder is missing',
                    stacklevel=2)
            if 'fmt' in place_holders and upload_schema is None:
                warnings.warn('upload_data and {fmt} placeholder provided, ' +
                              'but upload_schema is None',
                              stacklevel=2)

            # Check if upload data is bytes or file-like object
            if not (isinstance(upload_data, bytes) or isinstance(
                    upload_data, bytearray) or hasattr(upload_data, 'read')):
                warnings.warn('upload_data is not bytes or file-like object',
                              stacklevel=2)

            fn = self._shim(Shim.upload, data=upload_data).text
            query = query.format(
                sch=upload_schema,
                fn=fn,
                fmt=upload_schema.atts_fmt_scidb if upload_schema else None)

        if fetch:
            # Use provided schema or get schema from SciDB
            if schema:
                # Deep-copy schema since we might be mutating it
                if isinstance(schema, Schema):
                    if not atts_only and not use_arrow:
                        schema = copy.deepcopy(schema)
                else:
                    schema = Schema.fromstring(schema)
            else:
                # Execute 'show(...)' and Download text
                self._shim(Shim.execute_query,
                           query=DB._show_query.format(
                               query.replace("'", "\\'")),
                           save='tsv')
                schema = Schema.fromstring(
                    self._shim(Shim.read_lines, n=0).text)

            # Attributes and dimensions can collide. Run make_unique to
            # remove any collisions.
            #
            # make_unique fixes any collision, but if we don't
            # download the dimensions, we don't need to fix collisions
            # between dimensions and attributes. So, we use
            # make_unique only if there are collisions within the
            # attribute names.
            if ((not atts_only or len(set(
                (a.name for a in schema.atts))) < len(schema.atts))
                    and schema.make_unique()):
                # Dimensions or attributes were renamed due to
                # collisions. We need to cast.
                query = 'cast({}, {:h})'.format(query, schema)

            # Unpack
            if not atts_only and not use_arrow:
                # apply: add dimensions as attributes
                # project: place dimensions first
                query = 'project(apply({}, {}), {})'.format(
                    query,
                    ', '.join('{0}, {0}'.format(d.name) for d in schema.dims),
                    ', '.join(
                        i.name
                        for i in itertools.chain(schema.dims, schema.atts)))

                # update schema after apply
                schema.make_dims_atts()

            # Execute Query and Download content
            self._shim(Shim.execute_query,
                       query=query,
                       save='arrow' if use_arrow else schema.atts_fmt_scidb,
                       result_size_limit=self.result_size_limit,
                       atts_only=1 if atts_only or not use_arrow else 0)
            buf = self._shim(Shim.read_bytes, n=0).content

            # Build result
            if use_arrow:
                data = pyarrow.RecordBatchStreamReader(
                    pyarrow.BufferReader(buf)).read_pandas()
                # Place dimensions first
                if not atts_only:
                    data = data[[
                        i.name
                        for i in itertools.chain(schema.dims, schema.atts)
                    ]]

            elif schema.is_fixsize():
                data = numpy.frombuffer(buf, dtype=schema.atts_dtype)

                if as_dataframe:
                    data = pandas.DataFrame.from_records(data)

                    if dataframe_promo:
                        schema.promote(data)
            else:
                # Parse binary buffer
                data = schema.frombytes(buf, as_dataframe, dataframe_promo)

                if as_dataframe:
                    data = pandas.DataFrame.from_records(data)

            return data

        else:  # fetch=False
            self._shim(Shim.execute_query, query=query)

            # Special case: -- - load_library - --
            if query.startswith('load_library('):
                self.load_ops()
Example #18
def _ipc_read_batches(buf):
    reader = pa.RecordBatchStreamReader(buf)
    return [batch for batch in reader]
Example #19
def _deserialize_pyarrow_recordbatch(buf):
    with pa.RecordBatchStreamReader(buf) as reader:
        return reader.read_next_batch()
Example #20
def _deserialize_pyarrow_table(buf):
    reader = pyarrow.RecordBatchStreamReader(buf)
    table = reader.read_all()
    return table
Example #21
def ipc_read_batch(buf):
    reader = pa.RecordBatchStreamReader(buf)
    return reader.read_next_batch()
Example #22
def _deserialize_pyarrow_recordbatch(buf):
    reader = pyarrow.RecordBatchStreamReader(buf)
    batch = reader.read_next_batch()
    return batch
Example #23
def deserialize_table(header, frames):
    blob = frames[0]
    reader = pyarrow.RecordBatchStreamReader(pyarrow.BufferReader(blob))
    return reader.read_all()
Example #24
def deserialize_batch(header, frames):
    import pyarrow as pa
    blob = frames[0]
    reader = pa.RecordBatchStreamReader(pa.BufferReader(blob))
    return reader.read_next_batch()