Example #1
def test_orcfile_readwrite():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(table, buffer_output_stream)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    orc_file = orc.ORCFile(buffer_reader)
    output_table = orc_file.read()
    assert table.equals(output_table)
    # Check for default WriteOptions
    assert orc_file.compression == 'UNCOMPRESSED'
    assert orc_file.file_version == '0.12'
    assert orc_file.row_index_stride == 10000
    assert orc_file.compression_size == 65536

    # deprecated keyword order
    buffer_output_stream = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(buffer_output_stream, table)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    orc_file = orc.ORCFile(buffer_reader)
    output_table = orc_file.read()
    assert table.equals(output_table)
    # Check for default WriteOptions
    assert orc_file.compression == 'UNCOMPRESSED'
    assert orc_file.file_version == '0.12'
    assert orc_file.row_index_stride == 10000
    assert orc_file.compression_size == 65536
Example #2
    def _read_orc(self, filename):
        if ('CPU' in self.compute_type):
            if (filename.startswith('gs://')):
                fs = gcsfs.GCSFileSystem()
                with fs.open(filename, mode='rb') as file:
                    dataset = pyarrow_orc.ORCFile(file).read().to_pandas()
            else:
                with open(filename, mode='rb') as file:
                    dataset = pyarrow_orc.ORCFile(file).read().to_pandas()

        elif ('GPU' in self.compute_type):
            dataset = cudf.read_orc(filename)

        return dataset
Example #3
def test_orcfile_empty(datadir):
    from pyarrow import orc

    table = orc.ORCFile(datadir / 'TestOrcFile.emptyFile.orc').read()
    assert table.num_rows == 0

    expected_schema = pa.schema([
        ('boolean1', pa.bool_()),
        ('byte1', pa.int8()),
        ('short1', pa.int16()),
        ('int1', pa.int32()),
        ('long1', pa.int64()),
        ('float1', pa.float32()),
        ('double1', pa.float64()),
        ('bytes1', pa.binary()),
        ('string1', pa.string()),
        ('middle', pa.struct([
            ('list', pa.list_(pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ]))),
            ])),
        ('list', pa.list_(pa.struct([
            ('int1', pa.int32()),
            ('string1', pa.string()),
            ]))),
        ('map', pa.list_(pa.struct([
            ('key', pa.string()),
            ('value', pa.struct([
                ('int1', pa.int32()),
                ('string1', pa.string()),
                ])),
            ]))),
        ])
    assert table.schema == expected_schema
Example #4
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
    **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        raise ValueError("URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = libcudf.orc.read_orc(
            filepath_or_buffer, columns, stripe, skip_rows, num_rows, use_index
        )
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
Example #5
def test_orcfile_empty(datadir):
    from pyarrow import orc

    table = orc.ORCFile(datadir / "TestOrcFile.emptyFile.orc").read()
    assert table.num_rows == 0

    expected_schema = pa.schema([
        ("boolean1", pa.bool_()),
        ("byte1", pa.int8()),
        ("short1", pa.int16()),
        ("int1", pa.int32()),
        ("long1", pa.int64()),
        ("float1", pa.float32()),
        ("double1", pa.float64()),
        ("bytes1", pa.binary()),
        ("string1", pa.string()),
        ("middle",
         pa.struct([("list",
                     pa.list_(
                         pa.struct([("int1", pa.int32()),
                                    ("string1", pa.string())])))])),
        ("list",
         pa.list_(pa.struct([("int1", pa.int32()),
                             ("string1", pa.string())]))),
        ("map",
         pa.map_(pa.string(),
                 pa.struct([("int1", pa.int32()), ("string1", pa.string())]))),
    ])
    assert table.schema == expected_schema
Example #6
def check_example_file(orc_path, expected_df, need_fix=False):
    """
    Check an ORC file against the expected columns dictionary.
    """
    from pyarrow import orc

    orc_file = orc.ORCFile(orc_path)
    # Exercise ORCFile.read()
    table = orc_file.read()
    assert isinstance(table, pa.Table)
    table.validate()

    # This workaround is needed because of ARROW-3080
    orc_df = pd.DataFrame(table.to_pydict())

    assert set(expected_df.columns) == set(orc_df.columns)

    # reorder columns if necessary
    if not orc_df.columns.equals(expected_df.columns):
        expected_df = expected_df.reindex(columns=orc_df.columns)

    if need_fix:
        fix_example_values(orc_df, expected_df)

    check_example_values(orc_df, expected_df)
    # Exercise ORCFile.read_stripe()
    json_pos = 0
    for i in range(orc_file.nstripes):
        batch = orc_file.read_stripe(i)
        check_example_values(pd.DataFrame(batch.to_pydict()),
                             expected_df,
                             start=json_pos,
                             stop=json_pos + len(batch))
        json_pos += len(batch)
    assert json_pos == orc_file.nrows
Example #7
def read_orc(path, columns=None, **kwargs):
    """{docstring}"""
    warnings.warn("Using CPU via PyArrow to read ORC dataset, this will "
                  "be GPU accelerated in the future")
    orc_file = orc.ORCFile(path)
    pa_table = orc_file.read(columns=columns)
    return DataFrame.from_arrow(pa_table)
Example #8
def _read_orc_stripe(fs, path, stripe, columns=None):
    """Pull out specific data from specific part of ORC file"""
    from pyarrow import orc

    with fs.open(path, "rb") as f:
        o = orc.ORCFile(f)
        table = o.read_stripe(stripe, columns)
    return table.to_pandas(date_as_object=False)
Example #9
def test_orcfile_readwrite_with_writeoptions():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(
        table,
        buffer_output_stream,
        compression='snappy',
        file_version='0.11',
        row_index_stride=5000,
        compression_block_size=32768,
    )
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    orc_file = orc.ORCFile(buffer_reader)
    output_table = orc_file.read()
    assert table.equals(output_table)
    # Check for modified WriteOptions
    assert orc_file.compression == 'SNAPPY'
    assert orc_file.file_version == '0.11'
    assert orc_file.row_index_stride == 5000
    assert orc_file.compression_size == 32768

    # deprecated keyword order
    buffer_output_stream = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(
            buffer_output_stream,
            table,
            compression='uncompressed',
            file_version='0.11',
            row_index_stride=20000,
            compression_block_size=16384,
        )
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    orc_file = orc.ORCFile(buffer_reader)
    output_table = orc_file.read()
    assert table.equals(output_table)
    # Check for default WriteOptions
    assert orc_file.compression == 'UNCOMPRESSED'
    assert orc_file.file_version == '0.11'
    assert orc_file.row_index_stride == 20000
    assert orc_file.compression_size == 16384
Example #10
def test_orcfile_readwrite():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(table, buffer_output_stream)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)

    # deprecated keyword order
    buffer_output_stream = pa.BufferOutputStream()
    with pytest.warns(FutureWarning):
        orc.write_table(buffer_output_stream, table)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)
Example #11
def get_product_df():
    import pyarrow.orc as orc
    target_path = '/backup/ND/GC Product Master/part-00000-efc19fcb-f673-4438-af65-02560d10d5ac-c000.snappy.orc'
    with open(target_path, 'rb') as file:
        data = orc.ORCFile(file)
        product = data.read().to_pandas()
    if 'product_info.csv' not in os.listdir('/home/yezhipeng/info/'):
        product.to_csv('/home/yezhipeng/info/product_info.csv')
        print('copy saved to /home/yezhipeng/info/')
    return product
Example #12
def read_orc_metadata(path):
    """{docstring}"""

    orc_file = orc.ORCFile(path)

    num_rows = orc_file.nrows
    num_stripes = orc_file.nstripes
    col_names = orc_file.schema.names

    return num_rows, num_stripes, col_names
Example #13
def run_task(sync, files):
    """sync main flow"""
    if isinstance(files, str):
        files = [files]

    for file in files:
        sync_task = sync.sync_task
        adb_table = sync_task["adb_table"]
        tmp_mysql_table = adb_table
        if sync.dml_operator == 'INSERT INTO':
            tmp_mysql_table = f'{adb_table}_tmp'

        file_parts = file.split('/')
        suffix = file_parts[-1]
        local_name = f'{file_addr}/{suffix}'
        start = time.time()
        download(file, local_name)
        logger.info('oss read done')
        with open(local_name, 'rb') as f:
            data = orc.ORCFile(f)
            table = data.read()
            df = table.to_pandas()
            special_columns = sync_task.get('special_columns')
            if special_columns:
                sp = [c for c in special_columns if c in df.columns]
                if sp:
                    df = df.drop(columns=sp)
            num_columns = table.num_columns
            logger.info(table.num_rows)
            logger.info(num_columns)

        # find partitions
        for path in file_parts:
            if '=' in path:
                df[path.split('=')[0]] = path.split('=')[1]

        total = df.shape[0]
        logger.info(total)
        base = 10000
        rn = total // base
        all_tasks = []
        with ThreadPoolExecutor(5 * multiprocessing.cpu_count()) as executor:
            for i in range(rn):
                task = executor.submit(data_frame_to_mysql,
                                       df[i * base:(i + 1) * base],
                                       tmp_mysql_table)
                all_tasks.append(task)
            task = executor.submit(data_frame_to_mysql, df[rn * base:total],
                                   tmp_mysql_table)
            all_tasks.append(task)
        wait(all_tasks, return_when=ALL_COMPLETED)

        os.remove(local_name)
        logger.info('sync done!')
        logger.info(time.time() - start)
Example #14
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""

    from cudf import DataFrame

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs)
    if compression is not None:
        raise ValueError("URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = DataFrame._from_table(
            libcudf.orc.read_orc(
                filepath_or_buffer,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimals_as_float,
                force_decimal_scale,
                timestamp_type,
            ))
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripes is not None and len(stripes) > 0:
            pa_tables = [
                read_orc_stripe(orc_file, i, columns) for i in stripes
            ]
            pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
Example #15
def _make_empty_df(filepath_or_buffer, columns):
    orc_file = orc.ORCFile(filepath_or_buffer)
    schema = orc_file.schema
    col_names = schema.names if columns is None else columns
    return cudf.DataFrame({
        col_name: cudf.core.column.column_empty(
            row_count=0,
            dtype=schema.field(col_name).type.to_pandas_dtype(),
        )
        for col_name in col_names
    })
Example #16
def test_orcfile_readwrite():
    from pyarrow import orc

    buffer_output_stream = pa.BufferOutputStream()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(buffer_output_stream, table)
    buffer_reader = pa.BufferReader(buffer_output_stream.getvalue())
    output_table = orc.ORCFile(buffer_reader).read()
    assert table.equals(output_table)
Example #17
    def load_data(self,
                  filename='dataset.orc',
                  col_labels=airline_cols,
                  y_label='ArrDelayBinary'):

        target_filename = self.CSP_paths['train_data'] + '/' + filename
        self.log_to_file(f'\n> loading dataset from {target_filename}...\n')

        with PerfTimer() as ingestion_timer:
            if 'CPU' in self.compute_type:
                if 'ORC' in self.data_type:
                    if (target_filename.startswith('gs://')):
                        fs = gcsfs.GCSFileSystem()
                        with fs.open(target_filename, mode='rb') as file:
                            dataset = pyarrow_orc.ORCFile(
                                file).read().to_pandas()
                    else:
                        with open(target_filename, mode='rb') as file:
                            dataset = pyarrow_orc.ORCFile(
                                file).read().to_pandas()

                elif 'CSV' in self.data_type:
                    dataset = pd.read_csv(target_filename, names=col_labels)

            elif 'GPU' in self.compute_type:
                if 'ORC' in self.data_type:
                    dataset = cudf.read_orc(target_filename)
                elif 'CSV' in self.data_type:
                    dataset = cudf.read_csv(target_filename, names=col_labels)

        self.log_to_file(
            f'\t ingestion completed in {ingestion_timer.duration}')

        self.log_to_file(
            f'dataset descriptors: {dataset.shape}\n {dataset.dtypes}\n {dataset.columns}\n'
        )
        self.query_memory()

        # TODO: if mem_free below a threshold issue a warning [ ? ]
        return dataset, col_labels, y_label, ingestion_timer.duration
Example #18
def _read_orc_stripes(fs, path, stripes, schema, columns):
    # Construct a list of RecordBatch objects.
    # Each ORC stripe will correspond to a single RecordBatch.
    if columns is None:
        columns = list(schema)

    batches = []
    with fs.open(path, "rb") as f:
        o = orc.ORCFile(f)
        _stripes = range(o.nstripes) if stripes is None else stripes
        for stripe in _stripes:
            batches.append(o.read_stripe(stripe, columns))
    return batches
Example #19
def test_bytesio_readwrite():
    from pyarrow import orc
    from io import BytesIO

    buf = BytesIO()
    a = pa.array([1, None, 3, None])
    b = pa.array([None, "Arrow", None, "ORC"])
    table = pa.table({"int64": a, "utf8": b})
    orc.write_table(table, buf)
    buf.seek(0)
    orc_file = orc.ORCFile(buf)
    output_table = orc_file.read()
    assert table.equals(output_table)
Example #20
def test_read_orc(datadir, hdfs, test_url):
    fname = datadir / "orc" / "TestOrcFile.testSnappy.orc"
    # Read from local file system as buffer
    with open(fname, mode="rb") as f:
        buffer = BytesIO(f.read())
    # Write to hdfs
    hdfs.upload(basedir + "/file.orc", buffer)

    if test_url:
        hd_fpath = f"hdfs://{host}:{port}{basedir}/file.orc"
    else:
        hd_fpath = f"hdfs://{basedir}/file.orc"

    got = cudf.read_orc(hd_fpath)
    expect = orc.ORCFile(buffer).read().to_pandas()
    assert_eq(expect, got)
Example #21
def transform():
    parser = argparse.ArgumentParser(
        description='Returns total lines of data records')
    parser.add_argument("--fileinput", type=str, required=True)
    #parser.add_argument("--output_path", type=str, required=True)
    parser.add_argument("--totallines", type=str, required=True)
    args = parser.parse_args()

    filename = args.fileinput
    with open(filename, 'rb') as file:
        data = orc.ORCFile(file)
        df = data.read().to_pandas()

    Path(args.totallines).parent.mkdir(parents=True, exist_ok=True)
    with open(args.totallines, 'w') as f:
        f.write(str(len(df)))
Example #22
def test_write_orc(pdf, hdfs, test_url):
    # Orc writer doesn't support writing unsigned ints
    pdf["Integer2"] = pdf["Integer2"].astype("int64")
    gdf = cudf.from_pandas(pdf)
    if test_url:
        hd_fpath = "hdfs://{}:{}{}/test_orc_writer.orc".format(
            host, port, basedir)
    else:
        hd_fpath = "hdfs://{}/test_orc_writer.orc".format(basedir)

    gdf.to_orc(hd_fpath)

    assert hdfs.exists(f"{basedir}/test_orc_writer.orc")
    with hdfs.open(f"{basedir}/test_orc_writer.orc", mode="rb") as f:
        got = orc.ORCFile(f).read().to_pandas()

    assert_eq(pdf, got)
Example #23
File: orc.py Project: ziiin/cudf
def read_orc(path, engine='cudf', columns=None, skip_rows=None,
             num_rows=None):
    """{docstring}"""

    if engine == 'cudf':
        df = cpp_read_orc(
            path,
            columns,
            skip_rows,
            num_rows
        )
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(path)
        pa_table = orc_file.read(columns=columns)
        df = DataFrame.from_arrow(pa_table)

    return df
Example #24
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        raise ValueError("URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = libcudf.orc.read_orc(
            filepath_or_buffer,
            columns,
            stripe,
            skip_rows,
            num_rows,
            use_index,
            decimals_as_float,
            force_decimal_scale,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripe is not None:
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
Example #25
def test_column_selection(tempdir):
    from pyarrow import orc

    # create a table with nested types
    inner = pa.field('inner', pa.int64())
    middle = pa.field('middle', pa.struct([inner]))
    fields = [
        pa.field('basic', pa.int32()),
        pa.field('list', pa.list_(pa.field('item', pa.int32()))),
        pa.field('struct', pa.struct([middle,
                                      pa.field('inner2', pa.int64())])),
        pa.field(
            'list-struct',
            pa.list_(
                pa.field(
                    'item',
                    pa.struct([
                        pa.field('inner1', pa.int64()),
                        pa.field('inner2', pa.int64())
                    ])))),
        pa.field('basic2', pa.int64()),
    ]
    arrs = [[0], [[1, 2]], [{
        "middle": {
            "inner": 3
        },
        "inner2": 4
    }], [[{
        "inner1": 5,
        "inner2": 6
    }, {
        "inner1": 7,
        "inner2": 8
    }]], [9]]
    table = pa.table(arrs, schema=pa.schema(fields))

    path = str(tempdir / 'test.orc')
    orc.write_table(table, path)
    orc_file = orc.ORCFile(path)

    # default selecting all columns
    result1 = orc_file.read()
    assert result1.equals(table)

    # selecting with columns names
    result2 = orc_file.read(columns=["basic", "basic2"])
    assert result2.equals(table.select(["basic", "basic2"]))

    result3 = orc_file.read(columns=["list", "struct", "basic2"])
    assert result3.equals(table.select(["list", "struct", "basic2"]))

    # using dotted paths
    result4 = orc_file.read(columns=["struct.middle.inner"])
    expected4 = pa.table({"struct": [{"middle": {"inner": 3}}]})
    assert result4.equals(expected4)

    result5 = orc_file.read(columns=["struct.inner2"])
    expected5 = pa.table({"struct": [{"inner2": 4}]})
    assert result5.equals(expected5)

    result6 = orc_file.read(
        columns=["list", "struct.middle.inner", "struct.inner2"])
    assert result6.equals(table.select(["list", "struct"]))

    result7 = orc_file.read(columns=["list-struct.inner1"])
    expected7 = pa.table({"list-struct": [[{"inner1": 5}, {"inner1": 7}]]})
    assert result7.equals(expected7)

    # selecting with (Arrow-based) field indices
    result2 = orc_file.read(columns=[0, 4])
    assert result2.equals(table.select(["basic", "basic2"]))

    result3 = orc_file.read(columns=[1, 2, 3])
    assert result3.equals(table.select(["list", "struct", "list-struct"]))

    # error on non-existing name or index
    with pytest.raises(IOError):
        # liborc returns ParseError, which gets translated into IOError
        # instead of ValueError
        orc_file.read(columns=["wrong"])

    with pytest.raises(ValueError):
        orc_file.read(columns=[5])
Example #26
File: orc.py Project: vyasr/cudf
def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    filters : None or list of tuple or list of lists of tuples
        If not None, specifies a filter predicate used to filter out stripes
        using the statistics stored in the ORC metadata. Stripes that do not
        match the given filter predicate are not read. The predicate is
        expressed in disjunctive normal form (DNF) like
        `[[('x', '=', 0), ...], ...]`. DNF allows arbitrary boolean logical
        combinations of single-column predicates. The innermost tuples each
        describe a single column predicate. The list of inner predicates is
        interpreted as a conjunction (AND), forming a more selective,
        multiple-column predicate. Finally, the outermost list combines
        these filters as a disjunction (OR). Predicates may also be passed
        as a list of tuples. This form is interpreted as a single conjunction.
        To express OR in predicates, one must use the (preferred) notation of
        list of lists of tuples.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """

    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(path,
                                             mode="rb",
                                             storage_options=storage_options)
    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError(
                    "Incompatible schemas while parsing ORC files")
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError("Requested columns (%s) not in schema (%s)" %
                             (ex, set(schema)))
    else:
        columns = list(schema)

    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripes=[0], columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in (range(n) if filters is None else
                       cudf.io.orc._filter_stripes(filters, path)):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
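The DNF filter notation described in the docstring above is easiest to see with a small sketch. The path and column names below are hypothetical, and it assumes this read_orc is exposed as dask_cudf.read_orc in a working cudf/dask_cudf environment.

import dask_cudf

# (year = 2020 AND month > 6) OR (year = 2021): tuples inside an inner list
# are ANDed together, and the outer list ORs the inner lists.
filters = [
    [("year", "=", 2020), ("month", ">", 6)],
    [("year", "=", 2021)],
]
ddf = dask_cudf.read_orc("data/*.orc",
                         columns=["year", "month", "value"],
                         filters=filters)
print(ddf.head())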
Example #27
File: orc.py Project: vuule/cudf
def read_orc(path, columns=None, storage_options=None, **kwargs):
    """Read cudf dataframe from ORC file(s).

    Note that this function is mostly borrowed from upstream Dask.

    Parameters
    ----------
    path: str or list(str)
        Location of file(s), which can be a full URL with protocol specifier,
        and may include glob character if a single string.
    columns: None or list(str)
        Columns to load. If None, loads all.
    storage_options: None or dict
        Further parameters to pass to the bytes backend.

    Returns
    -------
    cudf.DataFrame
    """

    storage_options = storage_options or {}
    fs, fs_token, paths = get_fs_token_paths(path,
                                             mode="rb",
                                             storage_options=storage_options)
    schema = None
    nstripes_per_file = []
    for path in paths:
        with fs.open(path, "rb") as f:
            o = orc.ORCFile(f)
            if schema is None:
                schema = o.schema
            elif schema != o.schema:
                raise ValueError(
                    "Incompatible schemas while parsing ORC files")
            nstripes_per_file.append(o.nstripes)
    schema = _get_pyarrow_dtypes(schema, categories=None)
    if columns is not None:
        ex = set(columns) - set(schema)
        if ex:
            raise ValueError("Requested columns (%s) not in schema (%s)" %
                             (ex, set(schema)))
    else:
        columns = list(schema)

    with fs.open(paths[0], "rb") as f:
        meta = cudf.read_orc(f, stripe=0, columns=columns, **kwargs)

    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
    dsk = {}
    N = 0
    for path, n in zip(paths, nstripes_per_file):
        for stripe in range(n):
            dsk[(name, N)] = (
                _read_orc_stripe,
                fs,
                path,
                stripe,
                columns,
                kwargs,
            )
            N += 1

    divisions = [None] * (len(dsk) + 1)
    return dd.core.new_dd_object(dsk, name, meta, divisions)
Example #28
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimal_cols_as_float=None,
    timestamp_type=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""
    if decimal_cols_as_float is not None:
        warnings.warn(
            "`decimal_cols_as_float` is deprecated and will be removed in "
            "the future",
            FutureWarning,
        )
    from cudf import DataFrame

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Each source must have a corresponding stripe list. If a single stripe
    # list is provided rather than a list of lists of stripes, apply that
    # stripe list across all input sources.
    if stripes is not None:
        if any(not isinstance(stripe, list) for stripe in stripes):
            stripes = [stripes]

        # Must ensure a stripe for each source is specified, unless None
        if not len(stripes) == len(filepath_or_buffer):
            raise ValueError(
                "A list of stripes must be provided for each input source"
            )

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(
                passed_filesystem=None, path=source, **kwargs,
            )
            source = stringify_path(source)
            source = fs.sep.join([source, "*.orc"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            use_python_file_object=use_python_file_object,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        selected_stripes = _filter_stripes(
            filters, filepaths_or_buffers, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepaths_or_buffers[0], columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        return DataFrame._from_data(
            *liborc.read_orc(
                filepaths_or_buffers,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimal_cols_as_float,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        if len(filepath_or_buffer) > 1:
            raise NotImplementedError(
                "Using CPU via PyArrow only supports a single a "
                "single input source"
            )

        orc_file = orc.ORCFile(filepath_or_buffer[0])
        if stripes is not None and len(stripes) > 0:
            for stripe_source_file in stripes:
                pa_tables = [
                    read_orc_stripe(orc_file, i, columns)
                    for i in stripe_source_file
                ]
                pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df
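The comments in the example above describe the stripes convention: one stripe list per input source, with a flat list wrapped into a single-element list of lists. A minimal sketch, assuming this read_orc is available as cudf.read_orc and that the (hypothetical) files exist:

import cudf

# One stripe list per source: stripes 0 and 2 from the first file,
# stripe 1 from the second.
df = cudf.read_orc(["part-0.orc", "part-1.orc"], stripes=[[0, 2], [1]])

# A flat list is wrapped into [[0, 2]], so it must correspond to a single
# input source.
df0 = cudf.read_orc("part-0.orc", stripes=[0, 2])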
Example #29
    def read_metadata(
        cls,
        fs,
        paths,
        columns,
        index,
        split_stripes,
        aggregate_files,
        **kwargs,
    ):

        # Convert root directory to file list.
        # TODO: Handle hive-partitioned data
        if len(paths) == 1 and not fs.isfile(paths[0]):
            paths = fs.find(paths[0])

        schema = None
        parts = []

        def _get_schema(_o, schema):
            if schema is None:
                schema = _o.schema
            elif schema != _o.schema:
                raise ValueError("Incompatible schemas while parsing ORC files")
            return schema

        if split_stripes:
            offset = 0
            for path in paths:
                with fs.open(path, "rb") as f:
                    o = orc.ORCFile(f)
                    if schema is None:
                        schema = o.schema
                    elif schema != o.schema:
                        raise ValueError("Incompatible schemas while parsing ORC files")
                    _stripes = list(range(o.nstripes))
                    if offset:
                        parts.append([(path, _stripes[0:offset])])
                    while offset < o.nstripes:
                        parts.append(
                            [(path, _stripes[offset : offset + int(split_stripes)])]
                        )
                        offset += int(split_stripes)
                    if aggregate_files and int(split_stripes) > 1:
                        offset -= o.nstripes
                    else:
                        offset = 0
        else:
            for path in paths:
                if schema is None:
                    with fs.open(paths[0], "rb") as f:
                        o = orc.ORCFile(f)
                        schema = o.schema
                parts.append([(path, None)])

        schema = _get_pyarrow_dtypes(schema, categories=None)
        if columns is not None:
            ex = set(columns) - set(schema)
            if ex:
                raise ValueError(
                    "Requested columns (%s) not in schema (%s)" % (ex, set(schema))
                )

        # Check if we can aggregate adjacent parts together
        parts = cls._aggregate_files(aggregate_files, split_stripes, parts)

        columns = list(schema) if columns is None else columns
        index = [index] if isinstance(index, str) else index
        meta = _meta_from_dtypes(columns, schema, index, [])
        return parts, schema, meta
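To make the split_stripes bookkeeping above concrete, here is a standalone illustration (hypothetical file name and stripe count) of how a file's stripes are grouped into parts when aggregate_files is off; each part is a list of (path, stripe-index-list) tuples:

split_stripes = 2
stripes = list(range(5))  # pretend the file has 5 stripes
parts = [[("part-0.orc", stripes[i:i + split_stripes])]
         for i in range(0, len(stripes), split_stripes)]
print(parts)
# [[('part-0.orc', [0, 1])], [('part-0.orc', [2, 3])], [('part-0.orc', [4])]]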
Example #30
if isinstance(json, dict):
    if (len(json) > 1):
        DataFrame = json_normalize(flatten(json))
    else:
        DataFrame = json_normalize(list(json.values())[0])
else:
    FlattenedData = (flatten(_json) for _json in json)
    DataFrame = pd.DataFrame(FlattenedData)

import pandas as pd
import pyarrow.orc as orc
import pyarrow.parquet as parquet

with open("SampleORC", encoding="utf-16", errors='ignore') as file:
    #with open("SampleORC") as file:
    data = orc.ORCFile(file)
    df = data.read().to_pandas()

data = orc.ORCFile("SampleORC.orc")
df = data.read().to_pandas()

data2 = parquet.ParquetFile("SampleParquet.parquet")
df2 = data2.read().to_pandas()

from fastavro import reader
with open('SampleAvro.avro', 'rb') as fo:
    for record in reader(fo):
        print(record)

import numpy as np
import pandas as pd