import csv
import gzip
import json
import os
import shutil
import string
import tempfile

import numpy as np
import pandas as pd
import pyarrow as pa


def generate_file(attrs, cache_dir):
    # `get_filename` is a helper from the surrounding module (not shown here).
    filename = get_filename(attrs)
    filepath = os.path.join(cache_dir, filename)
    if os.path.isfile(filepath):
        return
    print(f"Generating file for {attrs}")
    np.random.seed(0)
    # 25 random feature columns plus a 26th column holding their row sums.
    data = np.random.random((attrs['size'], 26))
    data[:, 25] = np.sum(data[:, :25], axis=1)
    open_mode = 'w'
    if attrs['format'] == 'arrow':
        open_mode = 'wb'
    if attrs['type'] == 'array':
        with open(filepath, open_mode) as fd:
            if attrs['format'] == 'csv':
                writer = csv.writer(fd)
                for row in data.tolist():
                    writer.writerow(row)
            elif attrs['format'] == 'json':
                json.dump(data.tolist(), fd)
            elif attrs['format'] == 'arrow':
                raise NotImplementedError()
    elif attrs['type'] == 'table':
        columns = string.ascii_uppercase
        data = [dict((col, x) for (col, x) in zip(columns, row))
                for row in data]
        df = pd.DataFrame(data)
        with open(filepath, open_mode) as fd:
            if attrs['format'] == 'csv':
                df.to_csv(fd, index=False)
            elif attrs['format'] == 'json':
                df.to_json(fd, orient='records')
            elif attrs['format'] == 'arrow':
                batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
                writer = pa.RecordBatchStreamWriter(fd, batch.schema)
                writer.write_batch(batch)
                writer.close()
    else:
        raise NotImplementedError()
    if attrs['compression'] == 'none':
        pass
    elif attrs['compression'] == 'gzip':
        # Compress in place: gzip into a temp file, then replace the original.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            with gzip.open(tmp, 'wb') as gz:
                with open(filepath, 'rb') as fd:
                    shutil.copyfileobj(fd, gz)
        os.unlink(filepath)
        shutil.copy(tmp.name, filepath)
        os.unlink(tmp.name)
    else:
        raise NotImplementedError()
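
# A minimal usage sketch (hypothetical values; `get_filename` must exist in
# the surrounding module, it is not part of the snippet above):
attrs = {'size': 10_000, 'type': 'table', 'format': 'arrow', 'compression': 'none'}
generate_file(attrs, cache_dir='/tmp/bench-cache')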
import random
import time

import grpc
import pyarrow
from pyarrow import plasma

# codeRunner_pb2 / codeRunner_pb2_grpc are generated gRPC stubs; randString
# and makeID are project-local helpers (not shown here).


def hello():
    channel = grpc.insecure_channel('untrusted:50051')
    stub = codeRunner_pb2_grpc.codeRunnerStub(channel)
    rand = random.choice([True, False])

    from pyarrow import csv
    fn = "IRAhandle_tweets_1.csv" if rand else "mimic.csv"
    table = csv.read_csv(fn)
    # time.clock() was removed in Python 3.8; use a monotonic timer instead.
    start = time.perf_counter()
    print("data loaded")
    batches = table.to_batches()
    print(1)
    client = plasma.connect("/tmp/plasma")
    print(2)

    # The first branch is dead debug code (guarded by `if False`), kept as in
    # the original; the live branches pick the snippet to run remotely.
    code = """
import time
while True:
    print(7)
    time.sleep(0.5)
""" if False else """
import os
import pyarrow
import sys
authors = dataTable.column("author")
newData = []
for i in range(len(authors)):
    newData.append(1 if i == 0 or authors[i] != authors[i-1] else newData[-1]+1)
newColumn = dataTable.column(3).from_array("authorTweetCount", [newData])
newTable = dataTable.append_column(newColumn)
""" if rand else """
import os
import pyarrow
import sys
ages = dataTable.column("age")
maxV = max(ages.to_pylist())
newData = []
for i in ages:
    newData.append(1 if i == maxV else 0)
newColumn = dataTable.column(3).from_array("oldest", [newData])
newTable = dataTable.append_column(newColumn)
"""

    for i in range(len(batches)):
        id_ = randString()
        strId = makeID(id_)

        # Write the batch to a mock sink first to find its serialized size.
        mock_sink = pyarrow.MockOutputStream()
        stream_writer = pyarrow.RecordBatchStreamWriter(mock_sink, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()
        data_size = mock_sink.size()

        # Then write it into a plasma buffer of exactly that size.
        buf = client.create(strId, data_size)
        stream = pyarrow.FixedSizeBufferWriter(buf)
        stream_writer = pyarrow.RecordBatchStreamWriter(stream, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()
        client.seal(strId)
        print("sent batch " + str(i + 1))

        codeToSend = codeRunner_pb2.code(toRun=code, id_=id_)
        newId = stub.runCode(codeToSend, timeout=1)
        newId = newId.id_

        [data] = client.get_buffers([makeID(newId)])
        outputBuf = pyarrow.py_buffer(data.to_pybytes())
        buffer_ = pyarrow.BufferReader(outputBuf)
        reader = pyarrow.RecordBatchStreamReader(buffer_)
        if i == 0:
            datatable = reader.read_all()
        else:
            # reader.read_all() already yields a Table; concatenate it directly
            # (the original round-tripped it through from_batches/to_batches).
            datatable = pyarrow.concat_tables([datatable, reader.read_all()])

    # Column.data is pre-1.0 pyarrow API, consistent with the plasma usage above.
    html = str(datatable.column("authorTweetCount" if rand else "oldest").data)
    print("data received after " + str(time.perf_counter() - start))
    return html
# Relies on module-level imports (os, tempfile, atexit, json, numpy as np,
# pyarrow) and KNIME-side helpers (_types_, to_pyarrow_type, the
# binary_from_* generators, get_first_not_None, close, PythonUtils).
def table_to_bytes(table):
    global _temp_dir
    if _temp_dir is None or not os.path.exists(_temp_dir):
        _temp_dir = tempfile.mkdtemp(prefix='knime-python-')
        # Delete temporary directory upon Python shutdown.
        atexit.register(close)
    fd, path = tempfile.mkstemp(suffix='.dat', prefix='python-to-java-', dir=_temp_dir, text=False)
    try:
        os.close(fd)
        mp = pyarrow.default_memory_pool()
        col_arrays = []
        col_names = []
        all_names = []
        missing_names = []

        # Add the index column to the list of columns.
        all_names.append("__index_level_0__")
        if len(table._data_frame.index) > 0:
            col_names.append("__index_level_0__")
            col_arrays.append(pyarrow.Array.from_pandas(table._data_frame.index,
                                                        type=to_pyarrow_type(_types_.STRING),
                                                        memory_pool=mp))
        else:
            missing_names.append("__index_level_0__")

        # Serialize the dataframe into a list of pyarrow.Array, column by column.
        for i in range(len(table._data_frame.columns)):
            # Missing column? -> save the name and don't send any buffer for the column.
            if table._data_frame.iloc[:, i].isnull().all():
                missing_names.append(table.get_name(i))
                all_names.append(table.get_name(i))
                continue
            # Convert collection types to binary.
            if table.get_type(i) == _types_.INTEGER_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_boolean_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_string_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_bytes_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.INTEGER_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_boolean_set_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_string_set_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_bytes_set_generator(table._data_frame.iloc[:, i])))
            # Workaround until numpy typecasts are implemented in pyarrow.
            elif table.get_type(i) == _types_.INTEGER and table._data_frame.iloc[:, i].dtype == np.int64:
                col_arrays.append(
                    pyarrow.Array.from_pandas(np.array(table._data_frame.iloc[:, i], dtype=np.int32),
                                              memory_pool=mp))
            # Workaround until fixed in pyarrow: it is assumed that the first
            # non-None object is a bytearray, if there is any.
            elif table.get_type(i) == _types_.BYTES and type(
                    get_first_not_None(table._data_frame.iloc[:, i])) == bytearray:
                # Materialize the map into a list; from_pandas cannot consume a lazy map object.
                col_arrays.append(
                    pyarrow.Array.from_pandas(list(map(lambda x: x if x is None else bytes(x),
                                                       table._data_frame.iloc[:, i])),
                                              memory_pool=mp))
            # Create pyarrow.Array.
            else:
                pa_type = to_pyarrow_type(table.get_type(i))
                # pyarrow.binary() type is not allowed as argument for type atm.
                if pa_type == pyarrow.binary():
                    col_arrays.append(pyarrow.BinaryArray.from_pandas(table._data_frame.iloc[:, i],
                                                                      memory_pool=mp))
                else:
                    col_arrays.append(
                        pyarrow.Array.from_pandas(table._data_frame.iloc[:, i], type=pa_type, memory_pool=mp))
            col_names.append(table.get_name(i))
            all_names.append(table.get_name(i))

        # Construct metadata.
        custom_metadata = {"index_columns": [all_names[0]],
                           "columns": [{"name": all_names[0],
                                        "metadata": {"serializer_id": "", "type_id": _types_.STRING}}],
                           "missing_columns": missing_names,
                           "num_rows": len(table._data_frame)}
        real_col_names = list(table._data_frame.columns)
        for name in all_names[1:]:
            col_idx = real_col_names.index(name)
            if table.get_type(col_idx) in [_types_.BYTES, _types_.BYTES_LIST, _types_.BYTES_SET]:
                custom_metadata['columns'].append(
                    {"name": name,
                     "metadata": {"serializer_id": table.get_column_serializers().get(name, ""),
                                  "type_id": table.get_type(col_idx)}})
            else:
                custom_metadata['columns'].append(
                    {"name": name, "metadata": {"serializer_id": "", "type_id": table.get_type(col_idx)}})

        metadata = {b'ArrowSerializationLibrary': json.dumps(custom_metadata).encode('utf-8')}

        # Empty record batches are not supported, therefore add a dummy array if the dataframe is empty.
        if not col_arrays:
            col_arrays.append(pyarrow.array([0]))
            col_names.append('dummy')

        batch = pyarrow.RecordBatch.from_arrays(col_arrays, col_names)
        schema = batch.schema.remove_metadata()
        schema = schema.add_metadata(metadata)

        # Write data to file and return the filepath.
        with pyarrow.OSFile(path, 'wb') as f:
            stream_writer = pyarrow.RecordBatchStreamWriter(f, schema)
            stream_writer.write_batch(batch)
            stream_writer.close()
        return bytearray(path, 'utf-8')
    except BaseException:
        PythonUtils.invoke_safely(None, os.remove, [path])
        raise
# Module context assumed: numpy as np, pyarrow as pa, warnings, a module-level
# logger, and the arrow_array_from_numpy_array helper.
def export(dataset, path, column_names=None, byteorder="=", shuffle=False, selection=False,
           progress=None, virtual=True, sort=None, ascending=True):
    """
    :param DatasetLocal dataset: dataset to export
    :param str path: path for file
    :param list[str] column_names: list of column names to export or None for all columns
    :param str byteorder: = for native, < for little endian and > for big endian
    :param bool shuffle: export rows in random order
    :param bool selection: export selection or not
    :param progress: progress callback that gets a progress fraction as argument and should return
        True to continue, or a default progress bar when progress=True
    :param bool virtual: When True, export virtual columns
    :return:
    """
    column_names = column_names or dataset.get_column_names(virtual=virtual, strings=True)
    for name in column_names:
        if name not in dataset.columns:
            warnings.warn('Exporting to arrow with virtual columns is not efficient')
    N = len(dataset) if not selection else dataset.selected_length(selection)
    if N == 0:
        raise ValueError("Cannot export empty table")
    if shuffle and sort:
        raise ValueError("Cannot shuffle and sort at the same time")
    if shuffle:
        random_index_column = "random_index"
        while random_index_column in dataset.get_column_names():
            random_index_column += "_new"
    partial_shuffle = shuffle and len(dataset) != N
    order_array = None
    if partial_shuffle:
        # If we only export a portion, we need to create the full-length
        # random_index array, and then take a section of it.
        shuffle_array_full = np.random.choice(len(dataset), len(dataset), replace=False)
        # shuffle_array[:] = shuffle_array_full[:N]
        shuffle_array = shuffle_array_full[shuffle_array_full < N]
        del shuffle_array_full
        order_array = shuffle_array
    elif shuffle:
        shuffle_array = np.random.choice(N, N, replace=False)
        order_array = shuffle_array
    if sort:
        if selection:
            raise ValueError("sorting selections not yet supported")
        logger.info("sorting...")
        indices = np.argsort(dataset.evaluate(sort))
        order_array = indices if ascending else indices[::-1]
        logger.info("sorting done")
    if selection:
        full_mask = dataset.evaluate_selection_mask(selection)
    else:
        full_mask = None
    arrow_arrays = []
    for column_name in column_names:
        mask = full_mask
        if selection:
            values = dataset.evaluate(column_name, filtered=False)
            values = values[mask]
        else:
            values = dataset.evaluate(column_name)
        if shuffle or sort:
            indices = order_array
            values = values[indices]
        arrow_arrays.append(arrow_array_from_numpy_array(values))
    if shuffle:
        arrow_arrays.append(arrow_array_from_numpy_array(order_array))
        column_names = column_names + [random_index_column]
    table = pa.Table.from_arrays(arrow_arrays, column_names)
    b = table.to_batches()
    with pa.OSFile(path, 'wb') as sink:
        writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
        writer.write_table(table)
        writer.close()  # write the end-of-stream marker before the file closes
# Nested callback: dataframe, batch, error, read_stream, write_stream and
# encoding all live in the enclosing scope.
def handle_batch() -> None:
    nonlocal dataframe
    nonlocal batch
    nonlocal error
    try:
        if dataframe is None:
            # First call: read the incoming Arrow stream into a DataFrame.
            batch = pyarrow.RecordBatchStreamReader(read_stream)
            dataframe = batch.read_pandas()
            if encoding is not None:
                def decode(value: typing.Any) -> typing.Any:
                    if type(value) is bytes:
                        assert encoding is not None
                        return value.decode(encoding)
                    if type(value) is bytearray:
                        assert encoding is not None
                        return value.decode(encoding)
                    if type(value) is tuple:
                        return tuple(decode(child) for child in value)
                    if type(value) is list:
                        return [decode(child) for child in value]
                    if type(value) is numpy.ndarray:
                        return numpy.array([decode(child) for child in value])
                    if type(value) is set:
                        return {decode(child) for child in value}
                    if type(value) is frozenset:
                        return frozenset(decode(child) for child in value)
                    if type(value) is dict:
                        return {
                            key: decode(child)
                            for key, child in value.items()
                        }
                    return value

                dataframe = pandas.DataFrame({
                    column: (dataframe[column].apply(decode)
                             if dataframe[column].dtype == 'O'
                             else dataframe[column])
                    for column in dataframe
                })
        else:
            # Second call: write the (possibly modified) DataFrame back out.
            if encoding is not None:
                def encode(value: typing.Any) -> typing.Any:
                    if type(value) is str:
                        assert encoding is not None
                        return value.encode(encoding)
                    if type(value) is tuple:
                        return tuple(encode(child) for child in value)
                    if type(value) is list:
                        return [encode(child) for child in value]
                    if type(value) is numpy.ndarray:
                        return numpy.array([encode(child) for child in value])
                    if type(value) is set:
                        return {encode(child) for child in value}
                    if type(value) is frozenset:
                        return frozenset(encode(child) for child in value)
                    if type(value) is dict:
                        return {
                            key: encode(child)
                            for key, child in value.items()
                        }
                    return value

                dataframe = pandas.DataFrame({
                    column: (dataframe[column].apply(encode)
                             if dataframe[column].dtype == 'O'
                             else dataframe[column])
                    for column in dataframe
                })
            table = pyarrow.Table.from_arrays([
                pyarrow.array(dataframe[column].values)
                for column in dataframe
            ], dataframe.columns)
            batch = pyarrow.RecordBatchStreamWriter(write_stream, table.schema)
            batch.write_table(table)
            dataframe = None
            batch.close()
            write_stream.close()
    except pyarrow.ArrowInvalid:
        pass
    except BaseException as raw_error:  # pylint: disable=broad-except
        error = raw_error
def export(dataset, path, column_names=None, byteorder="=", shuffle=False, selection=False,
           progress=None, virtual=True, sort=None, ascending=True):
    table = _export_table(dataset, column_names, byteorder, shuffle, selection, progress,
                          virtual, sort, ascending)
    b = table.to_batches()
    with pa.OSFile(path, 'wb') as sink:
        writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
        writer.write_table(table)
        writer.close()  # write the end-of-stream marker before the file closes
def _serialize_pyarrow_recordbatch(batch):
    output_stream = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(output_stream, schema=batch.schema) as wr:
        wr.write_batch(batch)
    return output_stream.getvalue()  # This will also close the stream.
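
# A buffer produced by _serialize_pyarrow_recordbatch can be read back with
# pa.ipc.open_stream; a minimal round-trip sketch (the input batch here is
# illustrative, not part of the original snippet):
import pyarrow as pa

batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3]})
buf = _serialize_pyarrow_recordbatch(batch)
reader = pa.ipc.open_stream(buf)   # accepts a pa.Buffer directly
roundtripped = reader.read_all()   # -> pa.Table with the same schema
assert roundtripped.schema == batch.schema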
# `args` is a module-level global; XAODEvents, XAODTransformer and the
# _open/_append/_close scratch-file helpers are defined elsewhere.
def write_branches_to_arrow(messaging, topic_name, file_path, file_id, attr_name_list,
                            chunk_size, server_endpoint, event_limit=None, object_store=None):
    scratch_writer = None
    event_iterator = XAODEvents(file_path, attr_name_list)
    transformer = XAODTransformer(event_iterator)

    batch_number = 0
    total_events = 0
    total_bytes = 0
    for pa_table in transformer.arrow_table(chunk_size, event_limit):
        if object_store:
            if not scratch_writer:
                scratch_writer = _open_scratch_file(args.result_format, pa_table)
            _append_table_to_scratch(args.result_format, scratch_writer, pa_table)

        total_events = total_events + pa_table.num_rows
        batches = pa_table.to_batches(chunksize=chunk_size)

        for batch in batches:
            if messaging:
                key = file_path + "-" + str(batch_number)

                sink = pa.BufferOutputStream()
                writer = pa.RecordBatchStreamWriter(sink, batch.schema)
                writer.write_batch(batch)
                writer.close()

                messaging.publish_message(topic_name, key, sink.getvalue())

                total_bytes = total_bytes + len(sink.getvalue().to_pybytes())
                avg_cell_size = len(sink.getvalue().to_pybytes()) / len(attr_name_list) / batch.num_rows
                print("Batch number " + str(batch_number) + ", " + str(batch.num_rows) +
                      " events published to " + topic_name,
                      "Avg Cell Size = " + str(avg_cell_size) + " bytes")
                batch_number += 1

    if object_store:
        _close_scratch_file(args.result_format, scratch_writer)
        print("Writing parquet to ", args.request_id, " as ", file_path.replace('/', ':'))
        object_store.upload_file(args.request_id, file_path.replace('/', ':'), "/tmp/out")
        os.remove("/tmp/out")

    print("===> Total Events ", total_events)
    print("===> Total Bytes ", total_bytes)
    if server_endpoint:
        post_status_update(server_endpoint, "File " + file_path + " complete")
        put_file_complete(server_endpoint, file_path, file_id, "success",
                          num_messages=batch_number, total_time="??",
                          total_events=total_events, total_bytes=total_bytes)
def _get_writer(self, sink, schema):
    return pa.RecordBatchStreamWriter(
        sink, schema, use_legacy_format=self.use_legacy_ipc_format)
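
# The bare use_legacy_format= keyword comes from older pyarrow releases; in
# current pyarrow the same switch lives on pa.ipc.IpcWriteOptions. A sketch of
# the equivalent for newer versions (assuming the same self.use_legacy_ipc_format flag):
def _get_writer_modern(self, sink, schema):
    options = pa.ipc.IpcWriteOptions(use_legacy_format=self.use_legacy_ipc_format)
    return pa.ipc.new_stream(sink, schema, options=options)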
def write_branches_to_arrow(self, transformer, topic_name, file_id, request_id):
    from .scratch_file_writer import ScratchFileWriter

    tick = time.time()
    scratch_writer = None
    total_messages = 0

    for pa_table in transformer.arrow_table():
        if self.object_store:
            if not scratch_writer:
                scratch_writer = ScratchFileWriter(file_format=self.file_format)
                scratch_writer.open_scratch_file(pa_table)
            scratch_writer.append_table_to_scratch(pa_table)

        if self.messaging:
            batches = pa_table.to_batches(max_chunksize=transformer.chunk_size)
            for batch in batches:
                messaging_tick = time.time()

                # Just need to make key unique to shard messages across brokers
                key = str.encode(transformer.file_path + "-" + str(total_messages))

                sink = pa.BufferOutputStream()
                writer = pa.RecordBatchStreamWriter(sink, batch.schema)
                writer.write_batch(batch)
                writer.close()
                self.messaging.publish_message(topic_name, key, sink.getvalue())

                self.avg_cell_size.append(
                    len(sink.getvalue().to_pybytes()) /
                    len(transformer.attr_name_list) / batch.num_rows)
                total_messages += 1
                self.messaging_timings.append(time.time() - messaging_tick)

    if self.object_store:
        object_store_tick = time.time()
        scratch_writer.close_scratch_file()

        print("Writing parquet to ", request_id, " as ",
              transformer.file_path.replace('/', ':'))
        self.object_store.upload_file(request_id, transformer.file_path.replace('/', ':'),
                                      scratch_writer.file_path)
        scratch_writer.remove_scratch_file()
        self.object_store_timing = time.time() - object_store_tick

    tock = time.time()

    if self.messaging:
        avg_avg_cell_size = sum(self.avg_cell_size) / len(self.avg_cell_size) \
            if len(self.avg_cell_size) else 0
        print("Wrote " + str(total_messages) + " events to " + topic_name,
              "Avg Cell Size = " + str(avg_avg_cell_size) + " bytes")

    # Parenthesize the subtraction: the original divided only `tick` by 60.
    print("Real time: " + str(round((tock - tick) / 60.0, 2)) + " minutes")
def _build_writer(self, schema: pa.Schema):
    self._schema: pa.Schema = schema
    self._type: pa.DataType = pa.struct(field for field in self._schema)
    self._features = Features.from_arrow_schema(self._schema)
    self.pa_writer = pa.RecordBatchStreamWriter(self.stream, schema)
# Module context assumed: google.cloud.bigquery_storage, pyarrow as pa, vaex.
def from_table(project, dataset, table, columns=None, condition=None, export=None,
               fs=None, fs_options=None, client_project=None, credentials=None):
    '''Download (stream) an entire Google BigQuery table locally.

    :param str project: The Google BigQuery project that owns the table.
    :param str dataset: The dataset the table is part of.
    :param str table: The name of the table
    :param list columns: A list of columns (field names) to download. If None, all columns will be downloaded.
    :param str condition: SQL text filtering statement, similar to a WHERE clause in a query. Aggregates are not supported.
    :param str export: Pass a filename or path to download the table as an Apache Arrow file, and leverage memory mapping. If `None` the DataFrame is in memory.
    :param fs: Valid if export is not None. {fs}
    :param fs_options: Valid if export is not None. {fs_options}
    :param str client_project: The ID of the project that executes the query. Will be passed when creating a job. If `None`, it will be set with the same value as `project`.
    :param credentials: The authorization credentials to attach to requests. See google.auth.credentials.Credentials for more details.
    :rtype: DataFrame

    Example:

    >>> import os
    >>> os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../path/to/project_access_key.json'
    >>> from vaex.contrib.io.gbq import from_table

    >>> client_project = 'my_project_id'
    >>> project = 'bigquery-public-data'
    >>> dataset = 'ml_datasets'
    >>> table = 'iris'
    >>> columns = ['species', 'sepal_width', 'petal_width']
    >>> conditions = 'species = "virginica"'

    >>> df = from_table(project=project, dataset=dataset, table=table,
    ...                 columns=columns, condition=conditions,
    ...                 client_project=client_project)
    >>> df.head(3)
    #    sepal_width    petal_width  species
    0            2.5            1.7  virginica
    1            2.5            2    virginica
    2            2.2            1.5  virginica
    '''
    # Instantiate the table path and the reading session
    bq_table = f'projects/{project}/datasets/{dataset}/tables/{table}'
    req_sess = google.cloud.bigquery_storage.types.ReadSession(
        table=bq_table,
        data_format=google.cloud.bigquery_storage.types.DataFormat.ARROW)

    # Read options
    req_sess.read_options.selected_fields = columns
    req_sess.read_options.row_restriction = condition

    # Instantiate the reading client
    client = google.cloud.bigquery_storage.BigQueryReadClient(credentials=credentials)

    parent = f'projects/{client_project or project}'
    session = client.create_read_session(parent=parent, read_session=req_sess,
                                         max_stream_count=1)
    reader = client.read_rows(session.streams[0].name)

    if export is None:
        arrow_table = reader.to_arrow(session)
        return vaex.from_arrow_table(arrow_table)
    else:
        # We need the schema first - get one RecordBatch manually and read it off.
        pages = reader.rows(session).pages
        first_batch = next(pages).to_arrow()
        schema = first_batch.schema

        # This does the writing - streams the batches to disk!
        with vaex.file.open(path=export, mode='wb', fs=fs, fs_options=fs_options) as sink:
            with pa.RecordBatchStreamWriter(sink, schema) as writer:
                writer.write_batch(first_batch)
                for page in pages:
                    batch = page.to_arrow()
                    writer.write_batch(batch)

        return vaex.open(export)
import json

import pyarrow as pa
import rados

# Read the sample dataset out of a Ceph pool via librados.
cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
ioctx = cluster.open_ioctx('hepdatapool')
data = ioctx.read('sample_dataset')
ioctx.close()
cluster.shutdown()

data = json.loads(data)

from skyhook import SkyhookDM

sk = SkyhookDM()
sk.connect('128.105.144.228')
sk.writeDataset('/users/xweichu/projects/aod', 'aod')

# Serialize an Arrow table into a single IPC stream buffer. `table` and
# `schema` are assumed to be defined earlier in the session.
batches = table.to_batches()
sink = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(sink, schema)
for batch in batches:
    writer.write_batch(batch)
writer.close()  # finalize the stream before extracting the buffer
buff = sink.getvalue()
buff_bytes = buff.to_pybytes()

# file location: http://uaf-1.t2.ucsd.edu/jeff_data/
from skyhook import SkyhookDM

sk = SkyhookDM()
sk.connect('128.105.144.211')
dst = sk.getDataset('dst')
dst.getFiles()
f = dst.getFiles()[0]
# Run the query against a single file, then against the whole dataset.
sk.runQuery(f, 'select event>X, project Events;1.Muon_dzErr,Events;1.SV_x,Events;1.Jet_puId')
sk.runQuery(dst, 'select event>X, project Events;1.Muon_dzErr,Events;1.SV_x,Events;1.Jet_puId')
def encode_to_stream(self, cols, out_stream: OutputStream):
    self._resettable_io.set_output_stream(out_stream)
    batch_writer = pa.RecordBatchStreamWriter(self._resettable_io, self._schema)
    batch_writer.write_batch(
        pandas_to_arrow(self._schema, self._timezone, self._field_types, cols))
def ipc_write_batch(batch):
    stream = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(stream, batch.schema)
    writer.write_batch(batch)
    writer.close()
    return stream.getvalue()
def _serialize(data: Any) -> Tuple[pa.Buffer, Serialization]:
    """Serializes an object to a ``pa.Buffer``.

    The way the object is serialized depends on the nature of the object:
    ``pa.RecordBatch`` and ``pa.Table`` are serialized using ``pyarrow``
    functions. All other cases are serialized through the ``pickle`` library.

    Args:
        data: The object/data to be serialized.

    Returns:
        Tuple of the serialized data (in ``pa.Buffer`` format) and the
        :class:`Serialization` that was used.

    Raises:
        SerializationError: If the data could not be serialized.

    Note:
        ``pickle`` does not include the code of custom functions or classes,
        it only pickles their names. Following the official `Python Docs
        <https://docs.python.org/3/library/pickle.html#what-can-be-pickled-and-unpickled>`_:
        "Thus the defining module must be importable in the unpickling
        environment, and the module must contain the named object, otherwise
        an exception will be raised."
    """
    if isinstance(data, (pa.RecordBatch, pa.Table)):
        # Use the intended pyarrow functionalities when possible.
        if isinstance(data, pa.Table):
            serialization = Serialization.ARROW_TABLE
        else:
            serialization = Serialization.ARROW_BATCH

        output_buffer = pa.BufferOutputStream()
        try:
            writer = pa.RecordBatchStreamWriter(output_buffer, data.schema)
            writer.write(data)
            writer.close()
        except pa.ArrowSerializationError:
            raise error.SerializationError(
                f"Could not serialize data of type {type(data)}.")

        serialized = output_buffer.getvalue()
    else:
        # All other cases use the pickle library.
        serialization = Serialization.PICKLE
        # Use the best protocol possible, for reference see:
        # https://docs.python.org/3/library/pickle.html#pickle-protocols
        try:
            serialized = pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
        except pickle.PicklingError:
            raise error.SerializationError(
                f"Could not pickle data of type {type(data)}.")

        # NOTE: zero-copy view on the bytes.
        serialized = pa.py_buffer(serialized)

    return serialized, serialization
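
# The matching deserializer is not shown in the snippet; a minimal sketch of a
# hypothetical inverse under the same Serialization enum (pa.ipc.open_stream
# understands the stream written above, pickle.loads reverses the pickle path):
def _deserialize(serialized: pa.Buffer, serialization: Serialization) -> Any:
    if serialization in (Serialization.ARROW_TABLE, Serialization.ARROW_BATCH):
        table = pa.ipc.open_stream(serialized).read_all()
        if serialization == Serialization.ARROW_BATCH:
            # _serialize wrote a single batch, so hand the first one back.
            return table.to_batches()[0]
        return table
    # PICKLE path: pa.Buffer supports the buffer protocol, so loads() takes it directly.
    return pickle.loads(serialized)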
def _get_writer(self, sink, schema):
    return pa.RecordBatchStreamWriter(sink, schema)
import pandas as pd
import numpy as np
import pyarrow as pa
import random
import sys

data_size = int(sys.argv[1]) / 10  # 1000 = 10GB of data, 100 = 1GB of data, 10 = 100MB of data
cols_num = int(data_size)
cols = range(cols_num)

df = pd.DataFrame(np.random.randint(0, 1000000000, size=(1249905, cols_num)), columns=cols)
table = pa.Table.from_pandas(df)

batches = table.to_batches()
sink = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(sink, table.schema)
for batch in batches:
    writer.write_batch(batch)
writer.close()  # write the end-of-stream marker so readers see a complete stream

buff = sink.getvalue()
buff = buff.to_pybytes()

with open('data', 'wb') as f:
    f.write(buff)

print('The data file with size of ' + str(sys.argv[1]) + ' MB was generated successfully!')
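
# To sanity-check the output, the stream file can be memory-mapped and read
# back; a small sketch (the 'data' filename matches the script above):
import pyarrow as pa

with pa.memory_map('data', 'rb') as source:
    table_back = pa.ipc.open_stream(source).read_all()
print(table_back.num_rows, 'rows,', table_back.num_columns, 'columns')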