Example 1
def generate_file(attrs, cache_dir):
    filename = get_filename(attrs)
    filepath = os.path.join(cache_dir, filename)
    if os.path.isfile(filepath):
        return

    print(f"Generating file for {attrs}")

    np.random.seed(0)

    data = np.random.random((attrs['size'], 26))
    data[:, 25] = np.sum(data[:, :25], axis=1)

    open_mode = 'w'
    if attrs['format'] == 'arrow':
        open_mode = 'wb'

    if attrs['type'] == 'array':
        with open(filepath, open_mode) as fd:
            if attrs['format'] == 'csv':
                writer = csv.writer(fd)
                for row in data.tolist():
                    writer.writerow(row)
            elif attrs['format'] == 'json':
                json.dump(data.tolist(), fd)
            elif attrs['format'] == 'arrow':
                raise NotImplementedError()

    elif attrs['type'] == 'table':
        columns = string.ascii_uppercase
        data = [dict(zip(columns, row)) for row in data]
        df = pd.DataFrame(data)

        with open(filepath, open_mode) as fd:
            if attrs['format'] == 'csv':
                df.to_csv(fd, index=False)
            elif attrs['format'] == 'json':
                df.to_json(fd, orient='records')
            elif attrs['format'] == 'arrow':
                batch = pa.RecordBatch.from_pandas(df, preserve_index=False)
                writer = pa.RecordBatchStreamWriter(fd, batch.schema)
                writer.write_batch(batch)
                writer.close()

    else:
        raise NotImplementedError()

    if attrs['compression'] == 'none':
        pass
    elif attrs['compression'] == 'gzip':
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            with gzip.open(tmp, 'wb') as gz:
                with open(filepath, 'rb') as fd:
                    shutil.copyfileobj(fd, gz)
        os.unlink(filepath)
        shutil.copy(tmp.name, filepath)
        os.unlink(tmp.name)
    else:
        raise NotImplementedError()
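
The shape of attrs is only implied by the lookups above. A minimal invocation sketch, assuming get_filename is available and using hypothetical values for each key:

# Hypothetical call; the keys mirror the branches in generate_file above.
attrs = {
    'size': 1000,           # number of rows to generate
    'type': 'table',        # 'array' or 'table'
    'format': 'csv',        # 'csv', 'json' or 'arrow'
    'compression': 'none',  # 'none' or 'gzip'
}
generate_file(attrs, cache_dir='/tmp/cache')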
Example 2
def hello():
    channel = grpc.insecure_channel('untrusted:50051')
    stub = codeRunner_pb2_grpc.codeRunnerStub(channel)

    rand = random.choice([True, False])

    from pyarrow import csv
    fn = "IRAhandle_tweets_1.csv" if rand else "mimic.csv"
    table = csv.read_csv(fn)
    start = time.perf_counter()

    print("data loaded")

    batches = table.to_batches()
    print(1)
    client = plasma.connect("/tmp/plasma")

    print(2)

    code = """
import time
while True:
    print(7)
    time.sleep(0.5)
""" if False else """
import os
import pyarrow
import sys

authors = dataTable.column("author")
newData = []
for i in range(len(authors)):
    newData.append(1 if i == 0 or authors[i] != authors[i-1] else newData[-1]+1)
newColumn = dataTable.column(3).from_array("authorTweetCount", [newData])
newTable = dataTable.append_column(newColumn)
    """ if rand else """
import os
import pyarrow
import sys

ages = dataTable.column("age")
maxV = max(ages.to_pylist())
newData = []
for i in ages:
    newData.append(1 if i == maxV else 0)
newColumn = dataTable.column(3).from_array("oldest", [newData])
newTable = dataTable.append_column(newColumn)
    """

    tables = []

    for i in range(len(batches)):
        id_ = randString()

        strId = makeID(id_)

        mock_sink = pyarrow.MockOutputStream()  #find data size
        stream_writer = pyarrow.RecordBatchStreamWriter(
            mock_sink, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()
        data_size = mock_sink.size()
        buf = client.create(strId, data_size)

        stream = pyarrow.FixedSizeBufferWriter(buf)
        stream_writer = pyarrow.RecordBatchStreamWriter(
            stream, batches[0].schema)
        stream_writer.write_batch(batches[i])
        stream_writer.close()

        client.seal(strId)
        print("sent batch " + str(i + 1))

        codeToSend = codeRunner_pb2.code(toRun=code, id_=id_)

        newId = stub.runCode(codeToSend, timeout=1)
        newId = newId.id_

        [data] = client.get_buffers([makeID(newId)])
        outputBuf = pyarrow.py_buffer(data.to_pybytes())
        buffer_ = pyarrow.BufferReader(outputBuf)
        reader = pyarrow.RecordBatchStreamReader(buffer_)
        if i == 0:
            datatable = reader.read_all()
        else:
            # read_all() already yields a Table, so concatenate it directly
            datatable = pyarrow.concat_tables([datatable, reader.read_all()])

    html = str(datatable.column("authorTweetCount" if rand else "oldest").data)
    print("data received after " + str(time.clock() - start))

    return html
Example 3
def table_to_bytes(table):
    global _temp_dir
    if _temp_dir is None or not os.path.exists(_temp_dir):
        _temp_dir = tempfile.mkdtemp(prefix='knime-python-')
        # Delete temporary directory upon Python shutdown.
        atexit.register(close)
    fd, path = tempfile.mkstemp(suffix='.dat', prefix='python-to-java-', dir=_temp_dir, text=False)
    try:
        os.close(fd)

        mp = pyarrow.default_memory_pool()
        col_arrays = []
        col_names = []
        all_names = []
        missing_names = []

        # add the index column to the list of columns
        all_names.append("__index_level_0__")
        if len(table._data_frame.index) > 0:
            col_names.append("__index_level_0__")
            col_arrays.append(pyarrow.Array.from_pandas(table._data_frame.index, type=to_pyarrow_type(_types_.STRING),
                                                        memory_pool=mp))
        else:
            missing_names.append("__index_level_0__")

        # Serialize the dataframe into a list of pyarrow.Array column by column
        for i in range(len(table._data_frame.columns)):
            # missing column ? -> save name and don't send any buffer for column
            if (table._data_frame.iloc[:, i].isnull().all()):
                missing_names.append(table.get_name(i))
                all_names.append(table.get_name(i))
                continue
            # Convert collection types to binary
            if table.get_type(i) == _types_.INTEGER_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_list_generator(table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_boolean_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_string_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_LIST:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_bytes_list_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.INTEGER_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:, i], '<i4')))
            elif table.get_type(i) == _types_.LONG_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:, i], '<i8')))
            elif table.get_type(i) == _types_.DOUBLE_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:, i], '<f8')))
            elif table.get_type(i) == _types_.FLOAT_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_set_generator(table._data_frame.iloc[:, i], '<f4')))
            elif table.get_type(i) == _types_.BOOLEAN_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_boolean_set_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.STRING_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_string_set_generator(table._data_frame.iloc[:, i])))
            elif table.get_type(i) == _types_.BYTES_SET:
                col_arrays.append(
                    pyarrow.Array.from_pandas(binary_from_bytes_set_generator(table._data_frame.iloc[:, i])))
            # Workaround until numpy typecasts are implemented in pyarrow
            elif table.get_type(i) == _types_.INTEGER and table._data_frame.iloc[:, i].dtype == np.int64:
                col_arrays.append(
                    pyarrow.Array.from_pandas(np.array(table._data_frame.iloc[:, i], dtype=np.int32), memory_pool=mp))
            # Workaround until fixed in pyarrow ... it is assumed that the first non-None object is bytearray if any
            elif table.get_type(i) == _types_.BYTES and type(
                    get_first_not_None(table._data_frame.iloc[:, i])) == bytearray:
                col_arrays.append(
                    pyarrow.Array.from_pandas(
                        [x if x is None else bytes(x) for x in table._data_frame.iloc[:, i]],
                        memory_pool=mp))
            # create pyarrow.Array
            else:
                pa_type = to_pyarrow_type(table.get_type(i))
                # pyarrow.binary() type is not allowed as argument for type atm
                if pa_type == pyarrow.binary():
                    col_arrays.append(pyarrow.BinaryArray.from_pandas(table._data_frame.iloc[:, i], memory_pool=mp))
                else:
                    col_arrays.append(
                        pyarrow.Array.from_pandas(table._data_frame.iloc[:, i], type=pa_type, memory_pool=mp))
            col_names.append(table.get_name(i))
            all_names.append(table.get_name(i))

        # Construct metadata
        custom_metadata = {"index_columns": [all_names[0]],
                           "columns": [
                               {"name": all_names[0], "metadata": {"serializer_id": "", "type_id": _types_.STRING}}],
                           "missing_columns": missing_names,
                           "num_rows": len(table._data_frame)}

        real_col_names = list(table._data_frame.columns)
        for name in all_names[1:]:
            col_idx = real_col_names.index(name)
            if table.get_type(col_idx) in [_types_.BYTES, _types_.BYTES_LIST, _types_.BYTES_SET]:
                custom_metadata['columns'].append({"name": name, "metadata": {
                    "serializer_id": table.get_column_serializers().get(name, ""), "type_id": table.get_type(col_idx)}})
            else:
                custom_metadata['columns'].append(
                    {"name": name, "metadata": {"serializer_id": "", "type_id": table.get_type(col_idx)}})

        metadata = {b'ArrowSerializationLibrary': json.dumps(custom_metadata).encode('utf-8')}

        # Empty record batches are not supported, therefore add a dummy array if dataframe is empty
        if not col_arrays:
            col_arrays.append(pyarrow.array([0]))
            col_names.append('dummy')

        batch = pyarrow.RecordBatch.from_arrays(col_arrays, col_names)

        schema = batch.schema.remove_metadata()
        schema = schema.add_metadata(metadata)

        # Write data to file and return filepath
        with pyarrow.OSFile(path, 'wb') as f:
            stream_writer = pyarrow.RecordBatchStreamWriter(f, schema)
            stream_writer.write_batch(batch)
            stream_writer.close()
        return bytearray(path, 'utf-8')
    except BaseException:
        PythonUtils.invoke_safely(None, os.remove, [path])
        raise
Example 4
def export(dataset,
           path,
           column_names=None,
           byteorder="=",
           shuffle=False,
           selection=False,
           progress=None,
           virtual=True,
           sort=None,
           ascending=True):
    """
    :param DatasetLocal dataset: dataset to export
    :param str path: path for file
    :param lis[str] column_names: list of column names to export or None for all columns
    :param str byteorder: = for native, < for little endian and > for big endian
    :param bool shuffle: export rows in random order
    :param bool selection: export selection or not
    :param progress: progress callback that gets a progress fraction as argument and should return True to continue,
            or a default progress bar when progress=True
    :param: bool virtual: When True, export virtual columns
    :return:
    """
    column_names = column_names or dataset.get_column_names(virtual=virtual,
                                                            strings=True)
    for name in column_names:
        if name not in dataset.columns:
            warnings.warn(
                'Exporting to arrow with virtual columns is not efficient')
    N = len(dataset) if not selection else dataset.selected_length(selection)
    if N == 0:
        raise ValueError("Cannot export empty table")

    if shuffle and sort:
        raise ValueError("Cannot shuffle and sort at the same time")

    if shuffle:
        random_index_column = "random_index"
        while random_index_column in dataset.get_column_names():
            random_index_column += "_new"
    partial_shuffle = shuffle and len(dataset) != N

    order_array = None
    if partial_shuffle:
        # if we only export a portion, we need to create the full length random_index array, and
        shuffle_array_full = np.random.choice(len(dataset),
                                              len(dataset),
                                              replace=False)
        # then take a section of it
        # shuffle_array[:] = shuffle_array_full[:N]
        shuffle_array = shuffle_array_full[shuffle_array_full < N]
        del shuffle_array_full
        order_array = shuffle_array
    elif shuffle:
        shuffle_array = np.random.choice(N, N, replace=False)
        order_array = shuffle_array

    if sort:
        if selection:
            raise ValueError("sorting selections not yet supported")
        logger.info("sorting...")
        indices = np.argsort(dataset.evaluate(sort))
        order_array = indices if ascending else indices[::-1]
        logger.info("sorting done")

    if selection:
        full_mask = dataset.evaluate_selection_mask(selection)
    else:
        full_mask = None

    arrow_arrays = []
    for column_name in column_names:
        mask = full_mask
        if selection:
            values = dataset.evaluate(column_name, filtered=False)
            values = values[mask]
        else:
            values = dataset.evaluate(column_name)
            if shuffle or sort:
                indices = order_array
                values = values[indices]
        arrow_arrays.append(arrow_array_from_numpy_array(values))
    if shuffle:
        arrow_arrays.append(arrow_array_from_numpy_array(order_array))
        column_names = column_names + [random_index_column]
    table = pa.Table.from_arrays(arrow_arrays, column_names)
    b = table.to_batches()
    with pa.OSFile(path, 'wb') as sink:
        writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
        writer.write_table(table)
        # Close the writer so the end-of-stream marker is written
        writer.close()
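
The file written above is in the Arrow IPC stream format, so it can be read back with pyarrow directly. A minimal read-back sketch, assuming path points at the exported file:

import pyarrow as pa

# Re-open the exported stream and materialize it as a Table.
with pa.OSFile(path, 'rb') as source:
    table_back = pa.ipc.open_stream(source).read_all()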
Example 5
        def handle_batch() -> None:
            nonlocal dataframe
            nonlocal batch
            nonlocal error

            try:
                if dataframe is None:
                    batch = pyarrow.RecordBatchStreamReader(read_stream)
                    dataframe = batch.read_pandas()

                    if encoding is not None:

                        def decode(value: typing.Any) -> typing.Any:
                            if type(value) in (bytes, bytearray):
                                assert encoding is not None

                                return value.decode(encoding)

                            if type(value) is tuple:
                                return tuple(decode(child) for child in value)

                            if type(value) is list:
                                return [decode(child) for child in value]

                            if type(value) is numpy.ndarray:
                                return numpy.array(
                                    [decode(child) for child in value])

                            if type(value) is set:
                                return {decode(child) for child in value}

                            if type(value) is frozenset:
                                return frozenset(
                                    decode(child) for child in value)

                            if type(value) is dict:
                                return {
                                    key: decode(child)
                                    for key, child in value.items()
                                }

                            return value

                        dataframe = pandas.DataFrame({
                            column: (dataframe[column].apply(decode)
                                     if dataframe[column].dtype == 'O' else
                                     dataframe[column])
                            for column in dataframe
                        })
                else:
                    if encoding is not None:

                        def encode(value: typing.Any) -> typing.Any:
                            if type(value) is str:
                                assert encoding is not None

                                return value.encode(encoding)

                            if type(value) is tuple:
                                return tuple(encode(child) for child in value)

                            if type(value) is list:
                                return [encode(child) for child in value]

                            if type(value) is numpy.ndarray:
                                return numpy.array(
                                    [encode(child) for child in value])

                            if type(value) is set:
                                return {encode(child) for child in value}

                            if type(value) is frozenset:
                                return frozenset(
                                    encode(child) for child in value)

                            if type(value) is dict:
                                return {
                                    key: encode(child)
                                    for key, child in value.items()
                                }

                            return value

                        dataframe = pandas.DataFrame({
                            column: (dataframe[column].apply(encode)
                                     if dataframe[column].dtype == 'O' else
                                     dataframe[column])
                            for column in dataframe
                        })

                    table = pyarrow.Table.from_arrays([
                        pyarrow.array(dataframe[column].values)
                        for column in dataframe
                    ], dataframe.columns)
                    batch = pyarrow.RecordBatchStreamWriter(
                        write_stream, table.schema)
                    batch.write_table(table)
                    dataframe = None
                    batch.close()
                    write_stream.close()
            except pyarrow.ArrowInvalid:
                pass
            except BaseException as raw_error:  # pylint: disable=broad-except
                error = raw_error
Example 6
def export(dataset, path, column_names=None, byteorder="=", shuffle=False, selection=False, progress=None, virtual=True, sort=None, ascending=True):
    table = _export_table(dataset, column_names, byteorder, shuffle, selection, progress, virtual, sort, ascending)
    b = table.to_batches()
    with pa.OSFile(path, 'wb') as sink:
        writer = pa.RecordBatchStreamWriter(sink, b[0].schema)
        writer.write_table(table)
        # Close the writer so the end-of-stream marker is written
        writer.close()
Example 7
def _serialize_pyarrow_recordbatch(batch):
    output_stream = pa.BufferOutputStream()
    with pa.RecordBatchStreamWriter(output_stream, schema=batch.schema) as wr:
        wr.write_batch(batch)
    return output_stream.getvalue()  # This will also close the stream.
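
The returned buffer holds a complete Arrow IPC stream, so the batch can be recovered with the matching reader. A minimal round-trip sketch with a hypothetical helper name:

def _deserialize_pyarrow_recordbatch(buf):
    # Inverse of the serializer above: open the stream and pull out the single batch.
    reader = pa.ipc.open_stream(buf)
    return reader.read_next_batch()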
Example 8
def write_branches_to_arrow(messaging,
                            topic_name,
                            file_path,
                            file_id,
                            attr_name_list,
                            chunk_size,
                            server_endpoint,
                            event_limit=None,
                            object_store=None):

    scratch_writer = None

    event_iterator = XAODEvents(file_path, attr_name_list)
    transformer = XAODTransformer(event_iterator)

    batch_number = 0
    total_events = 0
    total_bytes = 0
    for pa_table in transformer.arrow_table(chunk_size, event_limit):
        if object_store:
            if not scratch_writer:
                scratch_writer = _open_scratch_file(args.result_format,
                                                    pa_table)
            _append_table_to_scratch(args.result_format, scratch_writer,
                                     pa_table)

        total_events = total_events + pa_table.num_rows
        batches = pa_table.to_batches(chunksize=chunk_size)

        for batch in batches:
            if messaging:
                key = file_path + "-" + str(batch_number)

                sink = pa.BufferOutputStream()
                writer = pa.RecordBatchStreamWriter(sink, batch.schema)
                writer.write_batch(batch)
                writer.close()
                messaging.publish_message(topic_name, key, sink.getvalue())

                total_bytes = total_bytes + len(sink.getvalue().to_pybytes())

                avg_cell_size = len(sink.getvalue().to_pybytes()) / len(
                    attr_name_list) / batch.num_rows
                print(
                    "Batch number " + str(batch_number) + ", " +
                    str(batch.num_rows) + " events published to " + topic_name,
                    "Avg Cell Size = " + str(avg_cell_size) + " bytes")
                batch_number += 1

    if object_store:
        _close_scratch_file(args.result_format, scratch_writer)
        print("Writing parquet to ", args.request_id, " as ",
              file_path.replace('/', ':'))
        object_store.upload_file(args.request_id, file_path.replace('/', ':'),
                                 "/tmp/out")
        os.remove("/tmp/out")

    print("===> Total Events ", total_events)
    print("===> Total Bytes ", total_bytes)

    if server_endpoint:
        post_status_update(server_endpoint, "File " + file_path + " complete")

    put_file_complete(server_endpoint,
                      file_path,
                      file_id,
                      "success",
                      num_messages=batch_number,
                      total_time="??",
                      total_events=total_events,
                      total_bytes=total_bytes)
Example 9
 def _get_writer(self, sink, schema):
     return pa.RecordBatchStreamWriter(
         sink, schema, use_legacy_format=self.use_legacy_ipc_format)
Example 10
    def write_branches_to_arrow(self, transformer, topic_name, file_id,
                                request_id):
        from .scratch_file_writer import ScratchFileWriter

        tick = time.time()
        scratch_writer = None
        total_messages = 0

        for pa_table in transformer.arrow_table():
            if self.object_store:
                if not scratch_writer:
                    scratch_writer = ScratchFileWriter(
                        file_format=self.file_format)
                    scratch_writer.open_scratch_file(pa_table)

                scratch_writer.append_table_to_scratch(pa_table)

            if self.messaging:
                batches = pa_table.to_batches(
                    max_chunksize=transformer.chunk_size)

                for batch in batches:
                    messaging_tick = time.time()

                    # Just need to make key unique to shard messages across brokers
                    key = str.encode(transformer.file_path + "-" +
                                     str(total_messages))

                    sink = pa.BufferOutputStream()
                    writer = pa.RecordBatchStreamWriter(sink, batch.schema)
                    writer.write_batch(batch)
                    writer.close()
                    self.messaging.publish_message(topic_name, key,
                                                   sink.getvalue())

                    self.avg_cell_size.append(
                        len(sink.getvalue().to_pybytes()) /
                        len(transformer.attr_name_list) / batch.num_rows)
                    total_messages += 1
                    self.messaging_timings.append(time.time() - messaging_tick)

        if self.object_store:
            object_store_tick = time.time()
            scratch_writer.close_scratch_file()

            print("Writing parquet to ", request_id, " as ",
                  transformer.file_path.replace('/', ':'))

            self.object_store.upload_file(
                request_id, transformer.file_path.replace('/', ':'),
                scratch_writer.file_path)

            scratch_writer.remove_scratch_file()
            self.object_store_timing = time.time() - object_store_tick

        tock = time.time()

        if self.messaging:
            avg_avg_cell_size = sum(self.avg_cell_size) / len(self.avg_cell_size) \
                if len(self.avg_cell_size) else 0

            print("Wrote " + str(total_messages) + " events  to " + topic_name,
                  "Avg Cell Size = " + str(avg_avg_cell_size) + " bytes")

        print("Real time: " + str(round(tock - tick / 60.0, 2)) + " minutes")
Example 11
 def _build_writer(self, schema: pa.Schema):
     self._schema: pa.Schema = schema
     self._type: pa.DataType = pa.struct(field for field in self._schema)
     self._features = Features.from_arrow_schema(self._schema)
     self.pa_writer = pa.RecordBatchStreamWriter(self.stream, schema)
Example 12
def from_table(project,
               dataset,
               table,
               columns=None,
               condition=None,
               export=None,
               fs=None,
               fs_options=None,
               client_project=None,
               credentials=None):
    '''Download (stream) an entire Google BigQuery table locally.

    :param str project: The Google BigQuery project that owns the table.
    :param str dataset: The dataset the table is part of.
    :param str table: The name of the table
    :param list columns: A list of columns (field names) to download. If None, all columns will be downloaded.
    :param str condition: SQL text filtering statement, similar to a WHERE clause in a query. Aggregates are not supported.
    :param str export: Pass an filename or path to download the table as an Apache Arrow file, and leverage memory mapping. If `None` the DataFrame is in memory.
    :param fs: Valid if export is not None. {fs}
    :param fs_options: Valid if export is not None. {fs_options}
    :param str client_project: The ID of the project that executes the query. Will be passed when creating a job. If `None`, it will be set with the same value as `project`.
    :param credentials: The authorization credentials to attach to requests. See google.auth.credentials.Credentials for more details.
    :rtype: DataFrame

    Example:

    >>> import os
    >>> os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../path/to/project_access_key.json'
    >>> from vaex.contrib.io.gbq import from_table

    >>> client_project = 'my_project_id'
    >>> project = 'bigquery-public-data'
    >>> dataset = 'ml_datasets'
    >>> table = 'iris'
    >>> columns = ['species', 'sepal_width', 'petal_width']
    >>> conditions = 'species = "virginica"'
    >>> df = from_table(project=project,
    ...                 dataset=dataset,
    ...                 table=table,
    ...                 columns=columns,
    ...                 condition=conditions,
    ...                 client_project=client_project)
    >>> df.head(3)
    #    sepal_width    petal_width  species
    0            2.5            1.7  virginica
    1            2.5            2    virginica
    2            2.2            1.5  virginica
    >>>

    '''
    # Instantiate the table path and the reading session
    bq_table = f'projects/{project}/datasets/{dataset}/tables/{table}'
    req_sess = google.cloud.bigquery_storage.types.ReadSession(
        table=bq_table,
        data_format=google.cloud.bigquery_storage.types.DataFormat.ARROW)

    # Read options
    req_sess.read_options.selected_fields = columns
    req_sess.read_options.row_restriction = condition

    # Instantiate the reading client
    client = google.cloud.bigquery_storage.BigQueryReadClient(
        credentials=credentials)

    parent = f'projects/{client_project or project}'
    session = client.create_read_session(parent=parent,
                                         read_session=req_sess,
                                         max_stream_count=1)
    reader = client.read_rows(session.streams[0].name)

    if export is None:
        arrow_table = reader.to_arrow(session)
        return vaex.from_arrow_table(arrow_table)

    else:
        # We need to get the schema first - Get one RecordsBatch manually to get the schema
        # Get the pages iterator
        pages = reader.rows(session).pages
        # Get the first batch
        first_batch = next(pages).to_arrow()
        # Get the schema
        schema = first_batch.schema

        # This does the writing - streams the batches to disk!
        with vaex.file.open(path=export,
                            mode='wb',
                            fs=fs,
                            fs_options=fs_options) as sink:
            with pa.RecordBatchStreamWriter(sink, schema) as writer:
                writer.write_batch(first_batch)
                for page in pages:
                    batch = page.to_arrow()
                    writer.write_batch(batch)

        return vaex.open(export)
Example 13
cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
ioctx = cluster.open_ioctx('hepdatapool')
data = ioctx.read('sample_dataset')
ioctx.close()
cluster.shutdown()
data = json.loads(data)

from skyhook import SkyhookDM
sk = SkyhookDM()
sk.connect('128.105.144.228')
sk.writeDataset('/users/xweichu/projects/aod', 'aod')

# 'table' is assumed to be a pyarrow.Table assembled from the dataset above
batches = table.to_batches()
sink = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(sink, table.schema)
for batch in batches:
    writer.write_batch(batch)
writer.close()
buff = sink.getvalue()
buff_bytes = buff.to_pybytes()

# file location:http://uaf-1.t2.ucsd.edu/jeff_data/

from skyhook import SkyhookDM
sk = SkyhookDM()
sk.connect('128.105.144.211')
dst = sk.getDataset('dst')
dst.getFiles()
f = dst.getFiles()[0]
sk.runQuery(f,'select event>X, project Events;1.Muon_dzErr,Events;1.SV_x,Events;1.Jet_puId')
sk.runQuery(dst,'select event>X, project Events;1.Muon_dzErr,Events;1.SV_x,Events;1.Jet_puId')
Example 14
 def encode_to_stream(self, cols, out_stream: OutputStream):
     self._resettable_io.set_output_stream(out_stream)
     batch_writer = pa.RecordBatchStreamWriter(self._resettable_io, self._schema)
     batch_writer.write_batch(
         pandas_to_arrow(self._schema, self._timezone, self._field_types, cols))
Example 15
def ipc_write_batch(batch):
    stream = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(stream, batch.schema)
    writer.write_batch(batch)
    writer.close()
    return stream.getvalue()
Example 16
def _serialize(data: Any) -> Tuple[pa.Buffer, Serialization]:
    """Serializes an object to a ``pa.Buffer``.

    The way the object is serialized depends on the nature of the
    object: ``pa.RecordBatch`` and ``pa.Table`` are serialized using
    ``pyarrow`` functions. All other cases are serialized through the
    ``pickle`` library.

    Args:
        data: The object/data to be serialized.

    Returns:
        Tuple of the serialized data (in ``pa.Buffer`` format) and the
        :class:`Serialization` that was used.

    Raises:
        SerializationError: If the data could not be serialized.

    Note:
        ``pickle`` does not include the code of custom functions or
        classes, it only pickles their names. According to the official
        `Python Docs
        <https://docs.python.org/3/library/pickle.html#what-can-be-pickled-and-unpickled>`_:
        "Thus the defining module must be importable in the unpickling
        environment, and the module must contain the named object,
        otherwise an exception will be raised."

    """
    if isinstance(data, (pa.RecordBatch, pa.Table)):
        # Use the intended pyarrow functionalities when possible.
        if isinstance(data, pa.Table):
            serialization = Serialization.ARROW_TABLE
        else:
            serialization = Serialization.ARROW_BATCH

        output_buffer = pa.BufferOutputStream()
        try:
            writer = pa.RecordBatchStreamWriter(output_buffer, data.schema)
            writer.write(data)
            writer.close()
        except pa.ArrowSerializationError:
            raise error.SerializationError(
                f"Could not serialize data of type {type(data)}.")

        serialized = output_buffer.getvalue()

    else:
        # All other cases use the pickle library.
        serialization = Serialization.PICKLE

        # Use the best protocol possible, for reference see:
        # https://docs.python.org/3/library/pickle.html#pickle-protocols
        try:
            serialized = pickle.dumps(data, pickle.HIGHEST_PROTOCOL)
        except pickle.PicklingError:
            raise error.SerializationError(
                f"Could not pickle data of type {type(data)}.")

        # NOTE: zero-copy view on the bytes.
        serialized = pa.py_buffer(serialized)

    return serialized, serialization
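
A matching decoder is not part of this snippet; a minimal sketch of one, assuming the same Serialization flags, could dispatch like this:

def _deserialize(serialized: pa.Buffer, serialization: Serialization) -> Any:
    # Hypothetical inverse of _serialize; mirrors the branches above.
    if serialization is Serialization.ARROW_TABLE:
        return pa.ipc.open_stream(serialized).read_all()
    if serialization is Serialization.ARROW_BATCH:
        return pa.ipc.open_stream(serialized).read_next_batch()
    # PICKLE case: pa.Buffer supports the buffer protocol, so pickle can read it directly.
    return pickle.loads(serialized)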
Example 17
 def _get_writer(self, sink, schema):
     return pa.RecordBatchStreamWriter(sink, schema)
Example 18
import pandas as pd
import numpy as np
import pyarrow as pa
import random
import sys

data_size = int(sys.argv[1])/10

# 1000 = 10GB of data, 100 = 1GB of data, 10 = 100MB of data
cols_num = int(data_size)
cols = range(cols_num)
df = pd.DataFrame(np.random.randint(0, 1000000000, size=(1249905, cols_num)), columns=cols)
table = pa.Table.from_pandas(df)
batches = table.to_batches()
sink = pa.BufferOutputStream()
writer = pa.RecordBatchStreamWriter(sink, table.schema)

for batch in batches:
    writer.write_batch(batch)
# Close the writer so the end-of-stream marker is written before reading the buffer
writer.close()
buff = sink.getvalue()
buff = buff.to_pybytes()

with open('data', 'wb') as f:
    f.write(buff)

print('The data file with size of ' + str(sys.argv[1]) + ' MB was generated successfully! ')