def read_parquet(path, *args, **kwargs):
    """{docstring}"""
    # No GPU path exists yet for this signature: warn, then delegate the
    # actual parsing to PyArrow on the CPU and convert the result.
    warnings.warn("Using CPU via PyArrow to read Parquet dataset, this will "
                  "be GPU accelerated in the future")
    return DataFrame.from_arrow(pq.read_pandas(path, *args, **kwargs))
def read_feather(path, *args, **kwargs):
    """{docstring}"""
    # Feather is read on the CPU via PyArrow; emit the standard notice
    # before delegating, then lift the Arrow table into a cudf DataFrame.
    warnings.warn("Using CPU via PyArrow to read feather dataset, this may "
                  "be GPU accelerated in the future")
    return DataFrame.from_arrow(feather.read_table(path, *args, **kwargs))
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
):
    """{docstring}

    Parameters
    ----------
    filepath_or_buffer : str or file-like
        Path to, or open handle on, the ORC data. URLs are resolved through
        ``ioutils.get_filepath_or_buffer``.
    engine : {"cudf", other}, default "cudf"
        "cudf" uses the native GPU reader; anything else falls back to
        PyArrow on the CPU.
    columns, stripe, skip_rows, num_rows, use_index
        Passed through to the selected reader (PyArrow fallback honors
        ``columns`` only).

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        If the resolved source arrived with a content-encoding compression,
        which is not supported.
    """
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None
    )
    if compression is not None:
        # BUG FIX: the exception was previously constructed but never
        # raised, so unsupported compressed input was silently accepted.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )
    if engine == "cudf":
        df = cpp_read_orc(
            filepath_or_buffer, columns, stripe, skip_rows, num_rows, use_index
        )
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        pa_table = orc_file.read(columns=columns)
        df = DataFrame.from_arrow(pa_table)
    return df
def read_orc(path, columns=None, **kwargs):
    """{docstring}"""
    # CPU-only fallback: warn, open the ORC file through PyArrow, read the
    # requested columns, and convert the Arrow table to a cudf DataFrame.
    warnings.warn("Using CPU via PyArrow to read ORC dataset, this will "
                  "be GPU accelerated in the future")
    table = orc.ORCFile(path).read(columns=columns)
    return DataFrame.from_arrow(table)
def test_arrow_pandas_compat(pdf, gdf, preserve_index):
    """Round-trip pandas and cudf frames through Arrow and check parity."""
    # Give both frames the same integer column, promoted to the index, so
    # the preserve_index behavior is exercised identically on each side.
    pdf['z'] = range(10)
    gdf['z'] = range(10)
    pdf = pdf.set_index('z')
    gdf = gdf.set_index('z')

    # Both conversion paths must yield equal Arrow tables.
    pdf_arrow_table = pa.Table.from_pandas(pdf, preserve_index=preserve_index)
    gdf_arrow_table = gdf.to_arrow(preserve_index=preserve_index)
    assert pa.Table.equals(pdf_arrow_table, gdf_arrow_table)

    # And converting back from Arrow must agree between cudf and pandas.
    gdf2 = DataFrame.from_arrow(pdf_arrow_table)
    pdf2 = pdf_arrow_table.to_pandas()
    assert_eq(pdf2, gdf2)
def read_orc(path, engine='cudf', columns=None, skip_rows=None,
             num_rows=None):
    """{docstring}"""
    # Fast path: hand everything to the native GPU reader.
    if engine == 'cudf':
        return cpp_read_orc(path, columns, skip_rows, num_rows)
    # Fallback path: parse on the CPU with PyArrow (columns only), then
    # convert the Arrow table into a cudf DataFrame.
    warnings.warn("Using CPU via PyArrow to read ORC dataset.")
    table = orc.ORCFile(path).read(columns=columns)
    return DataFrame.from_arrow(table)
def read_parquet(path, engine='cudf', columns=None, row_group=None,
                 skip_rows=None, num_rows=None, strings_to_categorical=False,
                 *args, **kwargs):
    """{docstring}"""
    # Native GPU reader handles all the slicing options directly.
    if engine == 'cudf':
        return cpp_read_parquet(path, columns, row_group, skip_rows,
                                num_rows, strings_to_categorical)
    # Otherwise read on the CPU via PyArrow and convert to cudf.
    warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
    table = pq.read_pandas(path, columns=columns, *args, **kwargs)
    return DataFrame.from_arrow(table)
def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    row_group=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    *args,
    **kwargs,
):
    """{docstring}

    Parameters
    ----------
    filepath_or_buffer : str or file-like
        Path to, or open handle on, the Parquet data. URLs are resolved
        through ``ioutils.get_filepath_or_buffer``.
    engine : {"cudf", other}, default "cudf"
        "cudf" uses the native GPU reader; anything else falls back to
        PyArrow on the CPU.
    columns, row_group, skip_rows, num_rows, strings_to_categorical
        Passed through to the selected reader (PyArrow fallback honors
        ``columns`` plus any extra ``*args``/``**kwargs``).

    Returns
    -------
    DataFrame

    Raises
    ------
    ValueError
        If the resolved source arrived with a content-encoding compression,
        which is not supported.
    """
    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None
    )
    if compression is not None:
        # BUG FIX: the exception was previously constructed but never
        # raised, so unsupported compressed input was silently accepted.
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )
    if engine == "cudf":
        df = cpp_read_parquet(
            filepath_or_buffer,
            columns,
            row_group,
            skip_rows,
            num_rows,
            strings_to_categorical,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        pa_table = pq.read_pandas(
            filepath_or_buffer, columns=columns, *args, **kwargs
        )
        df = DataFrame.from_arrow(pa_table)
    return df