def test_open_stream_from_buffer(stream_fixture):
    # ARROW-2859
    stream_fixture.write_batches()
    source = stream_fixture.get_source()

    reader1 = pa.ipc.open_stream(source)
    reader2 = pa.ipc.open_stream(pa.BufferReader(source))
    reader3 = pa.RecordBatchStreamReader(source)

    result1 = reader1.read_all()
    result2 = reader2.read_all()
    result3 = reader3.read_all()

    assert result1.equals(result2)
    assert result1.equals(result3)

    st1 = reader1.stats
    assert st1.num_messages == 6
    assert st1.num_record_batches == 5
    assert reader2.stats == st1
    assert reader3.stats == st1

    assert tuple(st1) == tuple(stream_fixture.write_stats)
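# A minimal sketch of the writer side such a test depends on (the real
# stream_fixture differs): a stream carrying five record batches yields
# six messages in total, one schema message plus one per batch, which is
# what the stats assertions above count.
import pyarrow as pa

batch = pa.RecordBatch.from_pydict({'x': [1, 2, 3]})
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    for _ in range(5):
        writer.write_batch(batch)

reader = pa.ipc.open_stream(sink.getvalue())
reader.read_all()
assert reader.stats.num_messages == 6
assert reader.stats.num_record_batches == 5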
def read_arrow(source):
    # print('reading arrow file as arrow table from disk')
    reader = pa.RecordBatchStreamReader(source)
    pa_df = reader.read_all()
    return pa_df
import pyarrow as pa
import pandas as pd
import base64

# Base64-encoded Arrow IPC stream payload (schema plus record batches;
# the schema carries store_sales-style column names). The string is
# split across adjacent literals purely for line length.
streamB64String = (
    "/////3gGAAAQAAAAAAAKAA4ABgANAAgACgAAAAAABAAQAAAAAAEKAAwAAAAIAAQACgAAAAgAAAAIAAAAAAAAABcAAAD0BQAAnAUAAFwFAAAYBQAA2AQAAJgEAABYBAAAGAQAANgDAACQAwAAUAMAAAADAAC8AgAAeAIAADACAADoAQAAnAEAAFQBAAAUAQAA0AAAAJAAAABIAAAABAAAAHr6//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAAMP3//wIAAAAHAAAADQAAAHNzX25ldF9wcm9maXQAAAC6+v//FAAAABQAAAAUAAAAAAAHARgAAAAAAAAAAAAAAHD9//8CAAAABwAAABMAAABzc19uZXRfcGFpZF9pbmNfdGF4AP76//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAAtP3//wIAAAAHAAAACwAAAHNzX25ldF9wYWlkADr7//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAA8P3//wIAAAAHAAAADQAAAHNzX2NvdXBvbl9hbXQAAAB6+///FAAAABQAAAAUAAAAAAAHARgAAAAAAAAAAAAAADD+//8CAAAABwAAAAoAAABzc19leHRfdGF4AAC2+///FAAAABQAAAAUAAAAAAAHARgAAAAAAAAAAAAAAGz+//8CAAAABwAAABEAAABzc19leHRfbGlzdF9wcmljZQAAAPr7//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAAsP7//wIAAAAHAAAAFQAAAHNzX2V4dF93aG9sZXNhbGVfY29zdAAAAEL8//8UAAAAFAAAABQAAAAAAAcBGAAAAAAAAAAAAAAA+P7//wIAAAAHAAAAEgAAAHNzX2V4dF9zYWxlc19wcmljZQAAhvz//xQAAAAUAAAAFAAAAAAABwEYAAAAAAAAAAAAAAA8////AgAAAAcAAAATAAAAc3NfZXh0X2Rpc2NvdW50X2FtdADK/P//FAAAABQAAAAUAAAAAAAHARgAAAAAAAAAAAAAAID///8CAAAABwAAAA4AAABzc19zYWxlc19wcmljZQAACv3//xQAAAAUAAAAFAAAAAAABwEYAAAAAAAAAAAAAADA////AgAAAAcAAAANAAAAc3NfbGlzdF9wcmljZQAAAEr9//8UAAAAFAAAABwAAAAAAAcBIAAAAAAAAAAAAAAACAAMAAgABAAIAAAAAgAAAAcAAAARAAAAc3Nfd2hvbGVzYWxlX2Nvc3QAAACW/f//FAAAABQAAAAUAAAAAAACARgAAAAAAAAAAAAAAIT9//8AAAABIAAAAAsAAABzc19xdWFudGl0eQDS/f//FAAAABQAAAAUAAAAAAACARgAAAAAAAAAAAAAAMD9//8AAAABQAAAABAAAABzc190aWNrZXRfbnVtYmVyAAAAABb+//8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAABP7//wAAAAFAAAAACwAAAHNzX3Byb21vX3NrAFL+//8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAAQP7//wAAAAFAAAAACwAAAHNzX3N0b3JlX3NrAI7+//8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAAfP7//wAAAAFAAAAACgAAAHNzX2FkZHJfc2sAAMr+//8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAAuP7//wAAAAFAAAAACwAAAHNzX2hkZW1vX3NrAAb///8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAA9P7//wAAAAFAAAAACwAAAHNzX2NkZW1vX3NrAEL///8UAAAAFAAAABQAAAAAAAIBGAAAAAAAAAAAAAAAMP///wAAAAFAAAAADgAAAHNzX2N1c3RvbWVyX3NrAACC////FAAAABQAAAAUAAAAAAACARgAAAAAAAAAAAAAAHD///8AAAABQAAAAAoAAABzc19pdGVtX3NrAAC+////FAAAABQAAAAUAAAAAAACARgAAAAAAAAAAAAAAKz///8AAAABQAAAAA8AAABzc19zb2xkX3RpbWVfc2sAAAASABgAFAATABIADAAAAAgABAASAAAAFAAAABQAAAAcAAAAAAACASAAAAAAAAAAAAAAAAgADAAIAAcACAAAAAAAAAFAAAAADwAAAHNzX3NvbGRfZGF0ZV9zawD/////qAQAABQAAAAAAAAADAAWAA4AFQAQAAQADAAAAIALAAAAAAAAAAAEABAAAAAAAwoAGAAMAAgABAAKAAAAFAAAAPgCAAAKAAAAAAAAAAAAAAAuAAAAAAAAAAAAAAACAAAAAAAAAAgAAAAAAAAAUAAAAAAAAABYAAAAAAAAAAIAAAAAAAAAYAAAAAAAAABQAAAAAAAAALAAAAAAAAAAAgAAAAAAAAC4AAAAAAAAAFAAAAAAAAAACAEAAAAAAAACAAAAAAAAABABAAAAAAAAUAAAAAAAAABgAQAAAAAAAAIAAAAAAAAAaAEAAAAAAABQAAAAAAAAALgBAAAAAAAAAgAAAAAAAADAAQAAAAAAAFAAAAAAAAAAEAIAAAAAAAACAAAAAAAAABgCAAAAAAAAUAAAAAAAAABoAgAAAAAAAAIAAAAAAAAAcAIAAAAAAABQAAAAAAAAAMACAAAAAAAAAgAAAAAAAADIAgAAAAAAAFAAAAAAAAAAGAMAAAAAAAACAAAAAAAAACADAAAAAAAAUAAAAAAAAABwAwAAAAAAAAIAAAAAAAAAeAMAAAAAAAAoAAAAAAAAAKADAAAAAAAAAgAAAAAAAACoAwAAAAAAAKAAAAAAAAAASAQAAAAAAAACAAAAAAAAAFAEAAAAAAAAoAAAAAAAAADwBAAAAAAAAAIAAAAAAAAA+AQAAAAAAACgAAAAAAAAAJgFAAAAAAAAAgAAAAAAAACgBQAAAAAAAKAAAAAAAAAAQAYAAAAAAAACAAAAAAAAAEgGAAAAAAAAoAAAAAAAAADoBgAAAAAAAAIAAAAAAAAA8AYAAAAAAACgAAAAAAAAAJAHAAAAAAAAAgAAAAAAAACYBwAAAAAAAKAAAAAAAAAAOAgAAAAAAAACAAAAAAAAAEAIAAAAAAAAoAAAAAAAAADgCAAAAAAAAAIAAAAAAAAA6AgAAAAAAACgAAAAAAAAAIgJAAAAAAAAAgAAAAAAAACQCQAAAAAAAKAAAAAAAAAAMAoAAAAAAAACAAAAAAAAADgKAAAAAAAAoAAAAAAAAADYCgAAAAAAAAIAAAAAAAAA4AoAAAAAAACgAAAAAAAAAAAAAAAXAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAA"
    "AAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAAKAAAAAAAAAAAAAAAAAAAACgAAAAAAAAAAAAAAAAAAAAoAAAAAAAAAAAAAAAAAAAD/AwAAAAAAAGVpJQAAAAAAZWklAAAAAABlaSUAAAAAAGVpJQAAAAAAZWklAAAAAABlaSUAAAAAAGVpJQAAAAAAZWklAAAAAABlaSUAAAAAAGVpJQAAAAAA/wMAAAAAAADX/wAAAAAAANf/AAAAAAAA1/8AAAAAAADX/wAAAAAAANf/AAAAAAAA1/8AAAAAAADX/wAAAAAAANf/AAAAAAAA1/8AAAAAAADX/wAAAAAAAP8DAAAAAAAAQwQAAAAAAADGBQAAAAAAAKkHAAAAAAAA1gEAAAAAAACUAQAAAAAAABUHAAAAAAAAnwUAAAAAAADLAAAAAAAAAC0HAAAAAAAAzAUAAAAAAAD/AwAAAAAAAAYAAAAAAAAABgAAAAAAAAAGAAAAAAAAAAYAAAAAAAAABgAAAAAAAAAGAAAAAAAAAAYAAAAAAAAABgAAAAAAAAAGAAAAAAAAAAYAAAAAAAAA/wMAAAAAAAABBwkAAAAAAAEHCQAAAAAAAQcJAAAAAAABBwkAAAAAAAEHCQAAAAAAAQcJAAAAAAABBwkAAAAAAAEHCQAAAAAAAQcJAAAAAAABBwkAAAAAAP8DAAAAAAAAZA0AAAAAAABkDQAAAAAAAGQNAAAAAAAAZA0AAAAAAABkDQAAAAAAAGQNAAAAAAAAZA0AAAAAAABkDQAAAAAAAGQNAAAAAAAAZA0AAAAAAAD/AwAAAAAAAEcDAAAAAAAARwMAAAAAAABHAwAAAAAAAEcDAAAAAAAARwMAAAAAAABHAwAAAAAAAEcDAAAAAAAARwMAAAAAAABHAwAAAAAAAEcDAAAAAAAA/wMAAAAAAAACAAAAAAAAAAIAAAAAAAAAAgAAAAAAAAACAAAAAAAAAAIAAAAAAAAAAgAAAAAAAAACAAAAAAAAAAIAAAAAAAAAAgAAAAAAAAACAAAAAAAAAP8DAAAAAAAAAgAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAACAAAAAAAAAAIAAAAAAAAAAQAAAAAAAAADAAAAAAAAAAEAAAAAAAAAAwAAAAAAAAD/AwAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAA/wMAAAAAAABPAAAAJQAAAGMAAAAOAAAAZAAAAFsAAAAFAAAASAAAAA4AAAA6AAAA/wMAAAAAAAB1BAAAAAAAAAAAAAAAAAAA2xgAAAAAAAAAAAAAAAAAAHQfAAAAAAAAAAAAAAAAAABpFgAAAAAAAAAAAAAAAAAAzAkAAAAAAAAAAAAAAAAAAIQkAAAAAAAAAAAAAAAAAAAsBAAAAAAAAAAAAAAAAAAAGCEAAAAAAAAAAAAAAAAAAIIEAAAAAAAAAAAAAAAAAADJAQAAAAAAAAAAAAAAAAAA/wMAAAAAAABPBwAAAAAAAAAAAAAAAAAAhScAAAAAAAAAAAAAAAAAAMg1AAAAAAAAAAAAAAAAAADOHQAAAAAAAAAAAAAAAAAAZg4AAAAAAAAAAAAAAAAAAFsqAAAAAAAAAAAAAAAAAAA3BgAAAAAAAAAAAAAAAAAArysAAAAAAAAAAAAAAAAAAJkEAAAAAAAAAAAAAAAAAAAWAgAAAAAAAAAAAAAAAAAA/wMAAAAAAAAYAQAAAAAAAAAAAAAAAAAAMxAAAAAAAAAAAAAAAAAAAM4gAAAAAAAAAAAAAAAAAABiAgAAAAAAAAAAAAAAAAAASQAAAAAAAAAAAAAAAAAAAGwkAAAAAAAAAAAAAAAAAACcAgAAAAAAAAAAAAAAAAAABhgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABgAQAAAAAAAAAAAAAAAAAA/wMAAAAAAADiJgAAAAAAAAAAAAAAAAAA+xEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/wMAAAAAAABoVgAAAAAAAAAAAAAAAAAAX1cCAAAAAAAAAAAAAAAAAKqvDAAAAAAAAAAAAAAAAABcIQAAAAAAAAAAAAAAAAAAhBwAAAAAAAAAAAAAAAAAAGTyDAAAAAAAAAAAAAAAAAAMDQAAAAAAAAAAAAAAAAAAsMEGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADATwAAAAAAAAAAAAAAAAAA/wMAAAAAAAAbYAEAAAAAAAAAAAAAAAAAp5cDAAAAAAAAAAAAAAAAANwpDAAAAAAAAAAAAAAAAAC+OQEAAAAAAAAAAAAAAAAAsNMDAAAAAAAAAAAAAAAAAOz6DAAAAAAAAAAAAAAAAADcFAAAAAAAAAAAAAAAAAAAwE4JAAAAAAAAAAAAAAAAABw/AAAAAAAAAAAAAAAAAACKZwAAAAAAAAAAAAAAAAAA/wMAAAAAAABhQQIAAAAAAAAAAAAAAAAAObYFAAAAAAAAAAAAAAAAAFjMFAAAAAAAAAAAAAAAAABEoQEAAAAAAAAAAAAAAAAA2J8FAAAAAAAAAAAAAAAAAFkODwAAAAAAAAAAAAAAAAATHwAAAAAAAAAAAAAAAAAAOEkMAAAAAAAAAAAAAAAAAF5AAAAAAAAAAAAAAAAAAAD8eAAAAAAAAAAAAAAAAAAA/wMAAAAAAABgAgAAAAAAAAAAAAAAAAAAQRcAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAkQIAAAAAAAAAAAAAAAAAAG5jAAAAAAAAAAAAAAAAAADpAAAAAAAAAAAAAAAAAAAAMEUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/wMAAAAAAADiJgAAAAAAAAAAAAAAAAAA+xEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/wMAAAAAAACGLwAAAAAAAAAAAAAAAAAAZEUCAAAAAAAAAAAAAAAAAKqvDAAAAAAAAAAAAAAAAABcIQAAAAAAAAAAAAAAAAAAhBwAAAAAAAAAAAAAAAAAAGTyDAAAAAAAAAAAAAAAAAAMDQAAAAAAAAAAAAAAAAAAsMEGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADATwAAAAAAAAAAAAAAAAAA/wMAAAAAAADmMQAAAAAAAAAAAAAAAAAApVwCAAAAAAAAAAAAAAAAAKqvDAAAAAAAAAAAAAAAAABcIQAAAAAAAAAAAAAAAAAAFR8AAAAAAAAAAAAAAAAAANJVDQAAAAAAAAAAAAAAAAD1DQAAAAAAAAAAAAAAAAAA4AYHAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADATwAAAAAAAAAAAAAAAAAA/wMAAAAAAABrz/7/////////////////va3+/////////////////86FAAAAAAAAAAAAAAAAAACe5/7/////////////////1Ej8/////////////////3j3//////////////////8w+P//////////////////8HL9/////////////////+TA//////////////////826P///////////////////////wAAAAA="
)
streamString = base64.b64decode(streamB64String)
streamBytes = bytes(streamString)
reader = pa.RecordBatchStreamReader(streamBytes)
df = reader.read_pandas()
print(df)
def _parse_tdf_gpu(tdf):
    """
    Parse the results of a select ipc_gpu into a GpuDataFrame

    Parameters
    ----------
    tdf : TDataFrame

    Returns
    -------
    gdf : GpuDataFrame
    """
    # ctypes, MethodType and numpy are used below; in the original
    # module they are imported at top level.
    import ctypes
    from types import MethodType

    import numpy as np
    import pyarrow as pa
    from cudf.comm.gpuarrow import GpuArrowReader
    from cudf.core.dataframe import DataFrame
    from cudf._lib.arrow._cuda import Context, IpcMemHandle
    from numba import cuda

    ipc_handle = IpcMemHandle.from_buffer(pa.py_buffer(tdf.df_handle))
    ctx = Context()
    ipc_buf = ctx.open_ipc_buffer(ipc_handle)
    ipc_buf.context.synchronize()

    # load_buffer (like shmdt, set_tdf and get_tdf below) is a helper
    # defined in the surrounding module; it attaches the shared memory
    # segment and returns (buffer, pointer).
    schema_buffer, shm_ptr = load_buffer(tdf.sm_handle, tdf.sm_size)

    buffer = pa.BufferReader(schema_buffer)
    schema = pa.read_schema(buffer)

    # Dictionary Memo functionality used to deserialize on the C++ side
    # is not exposed on the pyarrow side, so we need to handle
    # dictionary-encoded columns on our own.
    dict_memo = {}
    try:
        dict_batch_reader = pa.RecordBatchStreamReader(buffer)
        updated_fields = []
        for f in schema:
            if pa.types.is_dictionary(f.type):
                msg = dict_batch_reader.read_next_batch()
                dict_memo[f.name] = msg.column(0)
                updated_fields.append(pa.field(f.name, f.type.index_type))
            else:
                updated_fields.append(pa.field(f.name, f.type))
        schema = pa.schema(updated_fields)
    except pa.ArrowInvalid:
        # This message does not have any dictionary encoded columns
        pass

    dtype = np.dtype(np.byte)
    darr = cuda.devicearray.DeviceNDArray(
        shape=ipc_buf.size,
        strides=dtype.itemsize,
        dtype=dtype,
        gpu_data=ipc_buf.to_numba(),
    )
    reader = GpuArrowReader(schema, darr)
    df = DataFrame()
    df.set_tdf = MethodType(set_tdf, df)
    df.get_tdf = MethodType(get_tdf, df)

    for k, v in reader.to_dict().items():
        if k in dict_memo:
            df[k] = pa.DictionaryArray.from_arrays(v, dict_memo[k])
        else:
            df[k] = v

    df.set_tdf(tdf)

    # free shared memory from Python
    # https://github.com/omnisci/pymapd/issues/46
    # https://github.com/omnisci/pymapd/issues/31
    free_sm = shmdt(ctypes.cast(shm_ptr, ctypes.c_void_p))  # noqa
    return df
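# A minimal CPU-only sketch of the dictionary handling above: on the CPU
# path, pyarrow's stream reader resolves dictionary batches
# transparently, which is exactly the bookkeeping the GPU path has to
# reimplement by hand. Names here are illustrative.
import pyarrow as pa

table = pa.table({"color": pa.array(["red", "blue", "red"]).dictionary_encode()})
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
    writer.write_table(table)

reader = pa.RecordBatchStreamReader(sink.getvalue())
roundtripped = reader.read_all()
assert pa.types.is_dictionary(roundtripped.schema.field("color").type)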
def read_data(file_name):
    reader = pa.RecordBatchStreamReader(file_name)
    table = reader.read_all()
    print(str(table.to_pydict()))
def deserialize_batch(header, frames):
    blob = frames[0]
    reader = pyarrow.RecordBatchStreamReader(pyarrow.BufferReader(blob))
    return reader.read_next_batch()
def deserialize_data_frame(path):
    global read_data_frame, read_types, read_serializers, _pandas_native_types_, path_to_mmap
    path_to_mmap = path
    with pyarrow.OSFile(path, 'rb') as f:
        stream_reader = pyarrow.RecordBatchStreamReader(f)
        arrowtable = stream_reader.read_all()

    # metadata
    pandas_metadata = json.loads(
        arrowtable.schema.metadata[b'pandas'].decode('utf-8'))
    names = []
    for col in pandas_metadata['columns']:
        names.append(col['name'])
        read_types.append(col['metadata']['type_id'])
        ser_id = col['metadata']['serializer_id']
        if ser_id != '':
            read_serializers[col['name']] = ser_id

    # data
    read_data_frame = pandas.DataFrame()
    for arrowcolumn in arrowtable.itercolumns():
        typeidx = names.index(arrowcolumn.name)
        coltype = read_types[typeidx]
        if coltype in _pandas_native_types_:
            dfcol = arrowcolumn.to_pandas()
        else:
            if coltype == _types_.INTEGER_LIST or coltype == _types_.INTEGER_SET:
                dfcol = pandas.Series(
                    collection_generator(
                        arrowcolumn, coltype == _types_.INTEGER_SET, 4, 'i'))
            elif coltype == _types_.LONG_LIST or coltype == _types_.LONG_SET:
                dfcol = pandas.Series(
                    collection_generator(
                        arrowcolumn, coltype == _types_.LONG_SET, 8, 'q'))
            elif coltype == _types_.DOUBLE_LIST or coltype == _types_.DOUBLE_SET:
                dfcol = pandas.Series(
                    collection_generator(
                        arrowcolumn, coltype == _types_.DOUBLE_SET, 8, 'd'))
            elif coltype == _types_.FLOAT_LIST or coltype == _types_.FLOAT_SET:
                dfcol = pandas.Series(
                    collection_generator(
                        arrowcolumn, coltype == _types_.FLOAT_SET, 4, 'f'))
            elif coltype == _types_.BOOLEAN_LIST or coltype == _types_.BOOLEAN_SET:
                dfcol = pandas.Series(
                    boolean_collection_generator(
                        arrowcolumn, coltype == _types_.BOOLEAN_SET))
            elif coltype == _types_.STRING_LIST or coltype == _types_.STRING_SET:
                dfcol = pandas.Series(
                    string_collection_generator(
                        arrowcolumn, coltype == _types_.STRING_SET))
            elif coltype == _types_.BYTES_LIST or coltype == _types_.BYTES_SET:
                dfcol = pandas.Series(
                    bytes_collection_generator(
                        arrowcolumn, coltype == _types_.BYTES_SET))
            else:
                raise KeyError('Type with id ' + str(coltype)
                               + ' cannot be deserialized!')
        # Note: we only have one index column (the KNIME RowKeys)
        if arrowcolumn.name in pandas_metadata['index_columns']:
            indexcol = dfcol
        else:
            read_data_frame[arrowcolumn.name] = dfcol

    if 'indexcol' not in locals():
        raise NameError(
            'Variable indexcol has not been set properly, exiting!')
    if len(read_data_frame.columns) > 0:
        read_data_frame.set_index(keys=indexcol, inplace=True)
    else:
        read_data_frame = pandas.DataFrame(index=indexcol)
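# For orientation, the b'pandas' schema metadata consumed above is the
# JSON blob that pa.Table.from_pandas() attaches automatically; the
# type_id/serializer_id entries are KNIME-specific additions layered on
# top of it. A minimal sketch of what the stock producer side yields
# (column values are illustrative):
import json

import pandas
import pyarrow

df = pandas.DataFrame({'x': [1, 2]}, index=['Row0', 'Row1'])
table = pyarrow.Table.from_pandas(df)
meta = json.loads(table.schema.metadata[b'pandas'].decode('utf-8'))
print(meta['index_columns'])                 # which columns hold the index
print([c['name'] for c in meta['columns']])  # per-column descriptors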
def _deserialize_pyarrow_table(buf):
    with pa.RecordBatchStreamReader(buf) as reader:
        return reader.read_all()
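# A hedged round-trip sketch for the helper above; _serialize_pyarrow_table
# is a hypothetical counterpart, not part of the original module.
import pyarrow as pa

def _serialize_pyarrow_table(table):
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, table.schema) as writer:
        writer.write_table(table)
    return sink.getvalue()

table = pa.table({'a': [1, 2, 3]})
assert _deserialize_pyarrow_table(_serialize_pyarrow_table(table)).equals(table)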
def read_arrow_as_pandas(source):
    print('reading arrow file as pandas dataframe from disk')
    reader = pa.RecordBatchStreamReader(source)
    pa_df = reader.read_all()
    return pa_df.to_pandas()
def time_read_to_dataframe(self, *args):
    reader = pa.RecordBatchStreamReader(self.source)
    table = reader.read_all()
    df = table.to_pandas()
def hello():
    channel = grpc.insecure_channel('untrusted:50051')
    stub = codeRunner_pb2_grpc.codeRunnerStub(channel)
    rand = random.choice([True, False])

    from pyarrow import csv
    fn = "IRAhandle_tweets_1.csv" if rand else "mimic.csv"
    table = csv.read_csv(fn)
    start = time.clock()
    print("data loaded")
    batches = table.to_batches()
    print(1)
    client = plasma.connect("/tmp/plasma")
    print(2)
    code = """
import time
while True:
    print(7)
    time.sleep(0.5)
""" if False else """
import os
import pyarrow
import sys
authors = dataTable.column("author")
newData = []
for i in range(len(authors)):
    newData.append(1 if i == 0 or authors[i] != authors[i-1] else newData[-1]+1)
newColumn = dataTable.column(3).from_array("authorTweetCount", [newData])
newTable = dataTable.append_column(newColumn)
""" if rand else """
import os
import pyarrow
import sys
ages = dataTable.column("age")
maxV = max(ages.to_pylist())
newData = []
for i in ages:
    newData.append(1 if i == maxV else 0)
newColumn = dataTable.column(3).from_array("oldest", [newData])
newTable = dataTable.append_column(newColumn)
"""
    tables = []
    for i in range(len(batches)):
        id_ = randString()
        strId = makeID(id_)

        mock_sink = pyarrow.MockOutputStream()  # find data size
        stream_writer = pyarrow.RecordBatchStreamWriter(
            mock_sink, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()
        data_size = mock_sink.size()

        buf = client.create(strId, data_size)
        stream = pyarrow.FixedSizeBufferWriter(buf)
        stream_writer = pyarrow.RecordBatchStreamWriter(
            stream, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()
        client.seal(strId)
        print("sent batch " + str(i + 1))

        codeToSend = codeRunner_pb2.code(toRun=code, id_=id_)
        newId = stub.runCode(codeToSend, timeout=1)
        newId = newId.id_

        [data] = client.get_buffers([makeID(newId)])
        outputBuf = pyarrow.py_buffer(data.to_pybytes())
        buffer_ = pyarrow.BufferReader(outputBuf)
        reader = pyarrow.RecordBatchStreamReader(buffer_)
        if i == 0:
            datatable = reader.read_all()
        else:
            datatable = pyarrow.concat_tables([
                datatable,
                datatable.from_batches(reader.read_all().to_batches())
            ])

    html = str(datatable.column("authorTweetCount" if rand else "oldest").data)
    print("data received after " + str(time.clock() - start))
    return html
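# The MockOutputStream trick in the loop above is worth isolating: write
# the batch to a MockOutputStream first to learn the exact serialized
# size, then stream it into a pre-allocated fixed-size buffer. A minimal
# sketch without Plasma:
import pyarrow

batch = pyarrow.RecordBatch.from_pydict({'x': [1, 2, 3]})

# Pass 1: a MockOutputStream counts bytes without storing anything.
mock_sink = pyarrow.MockOutputStream()
writer = pyarrow.RecordBatchStreamWriter(mock_sink, batch.schema)
writer.write_batch(batch)
writer.close()
data_size = mock_sink.size()

# Pass 2: write the same stream into a buffer of exactly that size.
buf = pyarrow.allocate_buffer(data_size)
stream = pyarrow.FixedSizeBufferWriter(buf)
writer = pyarrow.RecordBatchStreamWriter(stream, batch.schema)
writer.write_batch(batch)
writer.close()

reader = pyarrow.RecordBatchStreamReader(pyarrow.BufferReader(buf))
assert reader.read_next_batch().equals(batch)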
def handle_batch() -> None:
    nonlocal dataframe
    nonlocal batch
    nonlocal error
    try:
        if dataframe is None:
            batch = pyarrow.RecordBatchStreamReader(read_stream)
            dataframe = batch.read_pandas()
            if encoding is not None:

                def decode(value: typing.Any) -> typing.Any:
                    if type(value) is bytes:
                        assert encoding is not None
                        return value.decode(encoding)
                    if type(value) is bytearray:
                        assert encoding is not None
                        return value.decode(encoding)
                    if type(value) is tuple:
                        return tuple(decode(child) for child in value)
                    if type(value) is list:
                        return [decode(child) for child in value]
                    if type(value) is numpy.ndarray:
                        return numpy.array(
                            [decode(child) for child in value])
                    if type(value) is set:
                        return {decode(child) for child in value}
                    if type(value) is frozenset:
                        return frozenset(
                            decode(child) for child in value)
                    if type(value) is dict:
                        return {
                            key: decode(child)
                            for key, child in value.items()
                        }
                    return value

                dataframe = pandas.DataFrame({
                    column: (dataframe[column].apply(decode)
                             if dataframe[column].dtype == 'O' else
                             dataframe[column])
                    for column in dataframe
                })
        else:
            if encoding is not None:

                def encode(value: typing.Any) -> typing.Any:
                    if type(value) is str:
                        assert encoding is not None
                        return value.encode(encoding)
                    if type(value) is tuple:
                        return tuple(encode(child) for child in value)
                    if type(value) is list:
                        return [encode(child) for child in value]
                    if type(value) is numpy.ndarray:
                        return numpy.array(
                            [encode(child) for child in value])
                    if type(value) is set:
                        return {encode(child) for child in value}
                    if type(value) is frozenset:
                        return frozenset(
                            encode(child) for child in value)
                    if type(value) is dict:
                        return {
                            key: encode(child)
                            for key, child in value.items()
                        }
                    return value

                dataframe = pandas.DataFrame({
                    column: (dataframe[column].apply(encode)
                             if dataframe[column].dtype == 'O' else
                             dataframe[column])
                    for column in dataframe
                })
            table = pyarrow.Table.from_arrays([
                pyarrow.array(dataframe[column].values)
                for column in dataframe
            ], dataframe.columns)
            batch = pyarrow.RecordBatchStreamWriter(
                write_stream, table.schema)
            batch.write_table(table)
            dataframe = None
            batch.close()
            write_stream.close()
    except pyarrow.ArrowInvalid:
        pass
    except BaseException as raw_error:  # pylint: disable=broad-except
        error = raw_error
def _request(self, is_redirected=False, **kwargs):
    """
    Send a request to the server.
    """
    connection = Connection(self.server_address)
    try:
        # Authentication
        self.authenticate(connection,
                          user=kwargs.pop('user', None),
                          password=kwargs.pop('password', None))

        # Sending request
        if is_redirected:
            connection.send(
                {key: kwargs[key] for key in ('request_type', 'path')})
        else:
            connection.send(kwargs)
        data = connection.recv()

        # Redirecting request (if necessary)
        if type(data) is dict and 'redirection_address' in data:
            for key in data:
                if key != 'redirection_address':
                    kwargs[key] = data[key]
            connection.kill()
            connection.connect(tuple(data['redirection_address']))
            self.authenticate(connection)
            connection.send(kwargs)
            data = connection.recv()

        if type(data) is dict and 'msg' in data and data['msg']:
            log.info(data['msg'])

        # Processing request
        if kwargs['request_type'] == 'sync_databases':
            while data['msg'] != 'Done!':
                data = connection.recv()
                log.info(data['msg'])
        elif kwargs['request_type'] == 'new_batch':
            send_tables(connection, kwargs['files'], data)
            data = connection.recv()
        elif kwargs['request_type'] == 'query':
            reader = pa.RecordBatchStreamReader(
                pa.BufferReader(connection.recv().getbuffer()))
            log.info('Done!')
            data['batch'] = reader.read_next_batch()
        elif kwargs['request_type'] == 'add_attachment':
            log.info(
                f"Transferring '{os.path.basename(kwargs['file'])}' "
                f"({humansize(os.path.getsize(kwargs['file']))})...")
            connection.send_file(kwargs['file'])
            data = connection.recv()
        elif kwargs['request_type'] == 'download_attachment':
            connection.recv_file(
                os.path.join(kwargs['output_path'], kwargs['name']))
            data = connection.recv()

        return data
    finally:
        connection.kill()
import pyarrow.plasma as plasma
import binascii
import pyarrow as pa
import sys

client = plasma.connect("/tmp/plasma", "", 0)
[buffers] = client.get_buffers([plasma.ObjectID(b"A" * 20)])
data = pa.BufferReader(buffers)
# print(data.read())
batch = pa.RecordBatchStreamReader(data)
def deserialize(self, header: Dict, buffers: List, context: Dict):
    reader = pa.RecordBatchStreamReader(pa.BufferReader(buffers[0]))
    if header['type'] == 'Table':
        return reader.read_all()
    else:
        return reader.read_next_batch()
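# A hedged sketch of the matching serializer for the handler above:
# write either a Table or a single RecordBatch into one IPC stream and
# record which it was in the header. Names are illustrative.
import pyarrow as pa

def serialize(data):
    sink = pa.BufferOutputStream()
    with pa.ipc.new_stream(sink, data.schema) as writer:
        if isinstance(data, pa.Table):
            writer.write_table(data)
        else:
            writer.write_batch(data)
    header = {'type': type(data).__name__}  # 'Table' or 'RecordBatch'
    return header, [sink.getvalue()]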
def get_dfs(object_ids):
    """Retrieve dataframes from the object store given their object IDs."""
    buffers = client.get_buffers(object_ids)
    return [pa.RecordBatchStreamReader(buf).read_next_batch().to_pandas()
            for buf in buffers]
def iquery(self,
           query,
           fetch=False,
           use_arrow=None,
           atts_only=False,
           as_dataframe=True,
           dataframe_promo=True,
           schema=None,
           upload_data=None,
           upload_schema=None):
    """Execute query in SciDB

    :param string query: SciDB AFL query to execute

    :param bool fetch: If ``True``, download SciDB array (default
      ``False``)

    :param bool use_arrow: If ``True``, download SciDB array using
      Apache Arrow library. Requires ``accelerated_io_tools`` and
      ``aio`` enabled in ``Shim``. If ``True``, a Pandas DataFrame is
      returned (``as_dataframe`` has no effect) and null-able types
      are promoted as per Pandas `promotion scheme
      <http://pandas.pydata.org/pandas-docs/stable/gotchas.html#na-type-promotions>`_
      (``dataframe_promo`` has no effect). If ``None`` the
      ``use_arrow`` value set at connection time is used (default
      ``None``)

    :param bool atts_only: If ``True``, download only SciDB array
      attributes without dimensions (default ``False``)

    :param bool as_dataframe: If ``True``, return a Pandas DataFrame.
      If ``False``, return a NumPy array (default ``True``)

    :param bool dataframe_promo: If ``True``, null-able types are
      promoted as per Pandas `promotion scheme
      <http://pandas.pydata.org/pandas-docs/stable/gotchas.html#na-type-promotions>`_
      If ``False``, object records are used for null-able types
      (default ``True``)

    :param schema: Schema of the SciDB array to use when downloading
      the array. Schema is not verified. If schema is a Schema
      instance, it is copied. Otherwise, a :py:class:``Schema`` object
      is built using :py:func:``Schema.fromstring`` (default ``None``)

    >>> DB().iquery('build(<x:int64>[i=0:1; j=0:1], i + j)', fetch=True)
       i  j    x
    0  0  0  0.0
    1  0  1  1.0
    2  1  0  1.0
    3  1  1  2.0

    >>> DB().iquery("input({sch}, '{fn}', 0, '{fmt}')",
    ...             fetch=True,
    ...             upload_data=numpy.arange(3, 6))
       i  x
    0  0  3
    1  1  4
    2  2  5
    """
    # Set use_arrow using local/global
    if use_arrow is None:
        use_arrow = self.use_arrow

    # Special case: -- - set_namespace - --
    if query.startswith('set_namespace(') and query[-1] == ')':
        param = query[len('set_namespace('):-1]
        # Unquote if quoted. Will be quoted when set in prefix.
        if param[0] == "'" and param[-1] == "'":
            param = param[1:-1]
        self.namespace = param
        return

    if upload_data is not None:
        if isinstance(upload_data, numpy.ndarray):
            if upload_schema is None:
                try:
                    upload_schema = Schema.fromdtype(upload_data.dtype)
                except Exception as e:
                    warnings.warn(
                        'Mapping NumPy dtype to SciDB schema failed. ' +
                        'Try providing an explicit upload_schema')
                    raise e

            # Convert upload data to bytes
            if upload_schema.is_fixsize():
                upload_data = upload_data.tobytes()
            else:
                upload_data = upload_schema.tobytes(upload_data)

        # Check if placeholders are present
        place_holders = set(
            field_name
            for _1, field_name, _3, _4 in self._formatter.parse(query))
        if 'fn' not in place_holders:
            warnings.warn(
                'upload_data provided, but {fn} placeholder is missing',
                stacklevel=2)
        if 'fmt' in place_holders and upload_schema is None:
            warnings.warn('upload_data and {fmt} placeholder provided, ' +
                          'but upload_schema is None',
                          stacklevel=2)

        # Check if upload data is bytes or file-like object
        if not (isinstance(upload_data, bytes)
                or isinstance(upload_data, bytearray)
                or hasattr(upload_data, 'read')):
            warnings.warn('upload_data is not bytes or file-like object',
                          stacklevel=2)

        fn = self._shim(Shim.upload, data=upload_data).text
        query = query.format(
            sch=upload_schema,
            fn=fn,
            fmt=upload_schema.atts_fmt_scidb if upload_schema else None)

    if fetch:
        # Use provided schema or get schema from SciDB
        if schema:
            # Deep-copy schema since we might be mutating it
            if isinstance(schema, Schema):
                if not atts_only and not use_arrow:
                    schema = copy.deepcopy(schema)
            else:
                schema = Schema.fromstring(schema)
        else:
            # Execute 'show(...)' and Download text
            self._shim(Shim.execute_query,
                       query=DB._show_query.format(
                           query.replace("'", "\\'")),
                       save='tsv')
            schema = Schema.fromstring(
                self._shim(Shim.read_lines, n=0).text)

        # Attributes and dimensions can collide. Run make_unique to
        # remove any collisions.
        #
        # make_unique fixes any collision, but if we don't download
        # the dimensions, we don't need to fix collisions between
        # dimensions and attributes. So, we use make_unique only if
        # there are collisions within the attribute names.
        if ((not atts_only
             or len(set((a.name for a in schema.atts))) < len(schema.atts))
                and schema.make_unique()):
            # Dimensions or attributes were renamed due to
            # collisions. We need to cast.
            query = 'cast({}, {:h})'.format(query, schema)

        # Unpack
        if not atts_only and not use_arrow:
            # apply: add dimensions as attributes
            # project: place dimensions first
            query = 'project(apply({}, {}), {})'.format(
                query,
                ', '.join('{0}, {0}'.format(d.name) for d in schema.dims),
                ', '.join(i.name
                          for i in itertools.chain(schema.dims,
                                                   schema.atts)))

            # update schema after apply
            schema.make_dims_atts()

        # Execute Query and Download content
        self._shim(
            Shim.execute_query,
            query=query,
            save='arrow' if use_arrow else schema.atts_fmt_scidb,
            result_size_limit=self.result_size_limit,
            atts_only=1 if atts_only or not use_arrow else 0)
        buf = self._shim(Shim.read_bytes, n=0).content

        # Build result
        if use_arrow:
            data = pyarrow.RecordBatchStreamReader(
                pyarrow.BufferReader(buf)).read_pandas()
            # Place dimensions first
            if not atts_only:
                data = data[[
                    i.name
                    for i in itertools.chain(schema.dims, schema.atts)
                ]]
        elif schema.is_fixsize():
            data = numpy.frombuffer(buf, dtype=schema.atts_dtype)
            if as_dataframe:
                data = pandas.DataFrame.from_records(data)
                if dataframe_promo:
                    schema.promote(data)
        else:
            # Parse binary buffer
            data = schema.frombytes(buf, as_dataframe, dataframe_promo)
            if as_dataframe:
                data = pandas.DataFrame.from_records(data)

        return data

    else:  # fetch=False
        self._shim(Shim.execute_query, query=query)

        # Special case: -- - load_library - --
        if query.startswith('load_library('):
            self.load_ops()
def _ipc_read_batches(buf):
    reader = pa.RecordBatchStreamReader(buf)
    return [batch for batch in reader]
def _deserialize_pyarrow_recordbatch(buf):
    with pa.RecordBatchStreamReader(buf) as reader:
        return reader.read_next_batch()
def _deserialize_pyarrow_table(buf):
    reader = pyarrow.RecordBatchStreamReader(buf)
    table = reader.read_all()
    return table
def ipc_read_batch(buf):
    reader = pa.RecordBatchStreamReader(buf)
    return reader.read_next_batch()
def _deserialize_pyarrow_recordbatch(buf):
    reader = pyarrow.RecordBatchStreamReader(buf)
    batch = reader.read_next_batch()
    return batch
def deserialize_table(header, frames):
    blob = frames[0]
    reader = pyarrow.RecordBatchStreamReader(pyarrow.BufferReader(blob))
    return reader.read_all()
def deserialize_batch(header, frames):
    import pyarrow as pa
    blob = frames[0]
    reader = pa.RecordBatchStreamReader(pa.BufferReader(blob))
    return reader.read_next_batch()