Beispiel #1
0
def read_parquet(path, *args, **kwargs):
    """{docstring}"""
    # NOTE(review): the docstring is a ``{docstring}`` format placeholder,
    # presumably filled in by a decorator outside this view -- keep it
    # verbatim.

    # This path reads through PyArrow rather than a GPU reader; tell the
    # caller before the read starts.
    cpu_notice = ("Using CPU via PyArrow to read Parquet dataset, this will "
                  "be GPU accelerated in the future")
    warnings.warn(cpu_notice)

    # Extra positional/keyword arguments are forwarded untouched to PyArrow,
    # and the resulting Arrow table is converted into a DataFrame.
    return DataFrame.from_arrow(pq.read_pandas(path, *args, **kwargs))
Beispiel #2
0
def read_feather(path, *args, **kwargs):
    """{docstring}"""
    # NOTE(review): the docstring is a ``{docstring}`` format placeholder,
    # presumably filled in by a decorator outside this view -- keep it
    # verbatim.

    # This path reads through PyArrow rather than a GPU reader; tell the
    # caller before the read starts.
    cpu_notice = ("Using CPU via PyArrow to read feather dataset, this may "
                  "be GPU accelerated in the future")
    warnings.warn(cpu_notice)

    # Extra positional/keyword arguments are forwarded untouched to PyArrow,
    # and the resulting Arrow table is converted into a DataFrame.
    return DataFrame.from_arrow(feather.read_table(path, *args, **kwargs))
Beispiel #3
0
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
):
    """{docstring}"""
    # NOTE(review): the docstring is a ``{docstring}`` format placeholder,
    # presumably filled in by a decorator outside this view -- keep it
    # verbatim.

    # Resolve the user input into something the readers can consume; the
    # helper also reports a compression value for the source.
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None)
    if compression is not None:
        # The exception was previously constructed but never raised, so
        # compressed sources silently fell through to the readers below.
        raise ValueError(
            "URL content-encoding decompression is not supported")

    if engine == "cudf":
        df = cpp_read_orc(filepath_or_buffer, columns, stripe, skip_rows,
                          num_rows, use_index)
    else:
        # Any other engine value falls back to PyArrow. Only ``columns`` is
        # honoured here; ``stripe``, ``skip_rows``, ``num_rows`` and
        # ``use_index`` are not passed on.
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        pa_table = orc_file.read(columns=columns)
        df = DataFrame.from_arrow(pa_table)

    return df
Beispiel #4
0
def read_orc(path, columns=None, **kwargs):
    """{docstring}"""
    # NOTE(review): the docstring is a ``{docstring}`` format placeholder,
    # presumably filled in by a decorator outside this view -- keep it
    # verbatim.
    # ``kwargs`` is accepted by the signature but not used in the body.

    cpu_notice = ("Using CPU via PyArrow to read ORC dataset, this will "
                  "be GPU accelerated in the future")
    warnings.warn(cpu_notice)

    # Open the ORC file with PyArrow, read the requested columns and convert
    # the resulting Arrow table into a DataFrame.
    return DataFrame.from_arrow(orc.ORCFile(path).read(columns=columns))
Beispiel #5
0
def test_arrow_pandas_compat(pdf, gdf, preserve_index):
    """Check that both frames produce equal Arrow tables and round-trip.

    ``pdf`` and ``gdf`` each get a ``z`` column of ``range(10)`` that is then
    set as the index. Both are converted to Arrow with the given
    ``preserve_index`` and the tables must be equal; the table built from
    ``pdf`` is then converted back through both libraries and compared.
    """
    # Give both frames the same ``z`` column and promote it to the index.
    pdf['z'] = range(10)
    indexed_pdf = pdf.set_index('z')
    gdf['z'] = range(10)
    indexed_gdf = gdf.set_index('z')

    # Convert each frame to an Arrow table with the same index handling.
    table_from_pdf = pa.Table.from_pandas(
        indexed_pdf, preserve_index=preserve_index)
    table_from_gdf = indexed_gdf.to_arrow(preserve_index=preserve_index)

    assert table_from_pdf.equals(table_from_gdf)

    # Round-trip the same Arrow table back through both libraries.
    roundtrip_gdf = DataFrame.from_arrow(table_from_pdf)
    roundtrip_pdf = table_from_pdf.to_pandas()

    assert_eq(roundtrip_pdf, roundtrip_gdf)
Beispiel #6
0
Datei: orc.py Projekt: ziiin/cudf
def read_orc(path, engine='cudf', columns=None, skip_rows=None,
             num_rows=None):
    """{docstring}"""
    # NOTE(review): the docstring is a ``{docstring}`` format placeholder,
    # presumably filled in by a decorator outside this view -- keep it
    # verbatim.

    if engine != 'cudf':
        # Any other engine value falls back to PyArrow. Only ``columns`` is
        # honoured here; ``skip_rows`` and ``num_rows`` are not passed on.
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        arrow_table = orc.ORCFile(path).read(columns=columns)
        return DataFrame.from_arrow(arrow_table)

    return cpp_read_orc(path, columns, skip_rows, num_rows)
Beispiel #7
0
def read_parquet(path,
                 engine='cudf',
                 columns=None,
                 row_group=None,
                 skip_rows=None,
                 num_rows=None,
                 strings_to_categorical=False,
                 *args,
                 **kwargs):
    """{docstring}"""
    # NOTE(review): the docstring is a ``{docstring}`` format placeholder,
    # presumably filled in by a decorator outside this view -- keep it
    # verbatim.

    if engine != 'cudf':
        # Any other engine value falls back to PyArrow. ``columns`` plus the
        # extra ``args``/``kwargs`` are forwarded; ``row_group``,
        # ``skip_rows``, ``num_rows`` and ``strings_to_categorical`` are not.
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        arrow_table = pq.read_pandas(path, *args, columns=columns, **kwargs)
        return DataFrame.from_arrow(arrow_table)

    # The cudf reader receives only the named options; ``args`` and
    # ``kwargs`` are not passed to it.
    return cpp_read_parquet(path, columns, row_group, skip_rows, num_rows,
                            strings_to_categorical)
Beispiel #8
0
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    row_group=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    *args,
    **kwargs,
):
    """{docstring}"""
    # NOTE(review): the docstring is a ``{docstring}`` format placeholder,
    # presumably filled in by a decorator outside this view -- keep it
    # verbatim.

    # Resolve the user input into something the readers can consume; the
    # helper also reports a compression value for the source.
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None
    )
    if compression is not None:
        # The exception was previously constructed but never raised, so
        # compressed sources silently fell through to the readers below.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        # The cudf reader receives only the named options; ``args`` and
        # ``kwargs`` are not passed to it.
        df = cpp_read_parquet(
            filepath_or_buffer,
            columns,
            row_group,
            skip_rows,
            num_rows,
            strings_to_categorical,
        )
    else:
        # Any other engine value falls back to PyArrow. ``columns`` plus the
        # extra ``args``/``kwargs`` are forwarded; ``row_group``,
        # ``skip_rows``, ``num_rows`` and ``strings_to_categorical`` are not.
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        pa_table = pq.read_pandas(
            filepath_or_buffer, columns=columns, *args, **kwargs
        )
        df = DataFrame.from_arrow(pa_table)

    return df