def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
    **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        df = libcudf.orc.read_orc(
            filepath_or_buffer, columns, stripe, skip_rows, num_rows, use_index
        )
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df

def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skiprows=None,
    num_rows=None,
    **kwargs,
):
    """{docstring}"""
    from cudf import DataFrame

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_avro` does not yet support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        return DataFrame._from_table(
            libcudf.avro.read_avro(
                filepath_or_buffer, columns, skiprows, num_rows
            )
        )
    else:
        raise NotImplementedError("read_avro currently only supports cudf")

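# Illustrative usage sketch (not part of the original source): assumes the
# reader above is exposed as `cudf.read_avro` and that "data.avro" is a
# hypothetical local file. Column projection and row bounds are pushed down
# to the GPU Avro reader.
def _example_read_avro():
    import cudf

    # Read two columns, skipping the first 10 rows and capping at 1000 rows.
    return cudf.read_avro(
        "data.avro", columns=["id", "value"], skiprows=10, num_rows=1000
    )
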
def read_orc_statistics(
    filepath_or_buffer, columns=None, **kwargs,
):
    """{docstring}"""

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_orc_statistics` does not support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    # Read in statistics and unpack
    (
        column_names,
        raw_file_statistics,
        raw_stripes_statistics,
    ) = libcudf.orc.read_raw_orc_statistics(filepath_or_buffer)

    # Parse column names
    column_names = [
        column_name.decode("utf-8") for column_name in column_names
    ]

    # Parse statistics
    cs = cs_pb2.ColumnStatistics()

    file_statistics = {
        column_names[i]: _parse_column_statistics(cs, raw_file_stats)
        for i, raw_file_stats in enumerate(raw_file_statistics)
        if columns is None or column_names[i] in columns
    }
    if any(
        not parsed_statistics
        for parsed_statistics in file_statistics.values()
    ):
        return None

    stripes_statistics = []
    for raw_stripe_statistics in raw_stripes_statistics:
        stripe_statistics = {
            column_names[i]: _parse_column_statistics(cs, raw_file_stats)
            for i, raw_file_stats in enumerate(raw_stripe_statistics)
            if columns is None or column_names[i] in columns
        }
        if any(
            not parsed_statistics
            for parsed_statistics in stripe_statistics.values()
        ):
            return None
        else:
            stripes_statistics.append(stripe_statistics)

    return file_statistics, stripes_statistics

def read_json(
    path_or_buf,
    engine="auto",
    dtype=True,
    lines=False,
    compression="infer",
    byte_range=None,
    *args,
    **kwargs,
):
    """{docstring}"""
    if engine == "cudf" and not lines:
        raise ValueError("cudf engine only supports JSON Lines format")
    if engine == "auto":
        engine = "cudf" if lines else "pandas"

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=path_or_buf, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_json` does not yet support reading multiple files"
        )

    path_or_buf, compression = ioutils.get_filepath_or_buffer(
        path_or_data=path_or_buf,
        compression=compression,
        iotypes=(BytesIO, StringIO),
        **kwargs,
    )
    if engine == "cudf":
        return cudf.DataFrame._from_table(
            libjson.read_json(
                path_or_buf, dtype, lines, compression, byte_range
            )
        )
    else:
        warnings.warn(
            "Using CPU via Pandas to read JSON dataset, this may "
            "be GPU accelerated in the future"
        )
        if kwargs.get("orient") == "table":
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                compression=compression,
                *args,
                **kwargs,
            )
        else:
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                dtype=dtype,
                compression=compression,
                *args,
                **kwargs,
            )
        df = cudf.from_pandas(pd_value)

    return df

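# Illustrative usage sketch (not part of the original source): assumes the
# reader above is exposed as `cudf.read_json`. The cudf engine only accepts
# JSON Lines input, so `lines=True` is required; the file names are
# hypothetical.
def _example_read_json():
    import cudf

    # GPU path: newline-delimited JSON with dtype inference.
    gdf_lines = cudf.read_json("records.jsonl", engine="cudf", lines=True)

    # CPU fallback path: regular JSON is routed through pandas.
    gdf_plain = cudf.read_json("records.json", engine="pandas")
    return gdf_lines, gdf_plain
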
def read_orc_statistics(
    filepaths_or_buffers, columns=None, **kwargs,
):
    """{docstring}"""

    files_statistics = []
    stripes_statistics = []
    for source in filepaths_or_buffers:
        filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source, compression=None, **kwargs
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )

        # Read in statistics and unpack
        (
            column_names,
            raw_file_statistics,
            raw_stripes_statistics,
        ) = liborc.read_raw_orc_statistics(filepath_or_buffer)

        # Parse column names
        column_names = [
            column_name.decode("utf-8") for column_name in column_names
        ]

        # Parse statistics
        cs = cs_pb2.ColumnStatistics()

        file_statistics = {
            column_names[i]: _parse_column_statistics(cs, raw_file_stats)
            for i, raw_file_stats in enumerate(raw_file_statistics)
            if columns is None or column_names[i] in columns
        }
        if any(
            not parsed_statistics
            for parsed_statistics in file_statistics.values()
        ):
            continue
        else:
            files_statistics.append(file_statistics)

        for raw_stripe_statistics in raw_stripes_statistics:
            stripe_statistics = {
                column_names[i]: _parse_column_statistics(cs, raw_file_stats)
                for i, raw_file_stats in enumerate(raw_stripe_statistics)
                if columns is None or column_names[i] in columns
            }
            if any(
                not parsed_statistics
                for parsed_statistics in stripe_statistics.values()
            ):
                continue
            else:
                stripes_statistics.append(stripe_statistics)

    return files_statistics, stripes_statistics

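# Illustrative usage sketch (not part of the original source): calls the
# multi-file `read_orc_statistics` defined directly above; "part0.orc" and
# "part1.orc" are hypothetical files. The return value pairs per-file
# statistics with per-stripe statistics, each keyed by column name.
def _example_read_orc_statistics():
    files_stats, stripes_stats = read_orc_statistics(
        ["part0.orc", "part1.orc"], columns=["id", "value"]
    )
    # Report which columns carry statistics in each stripe.
    for i, stripe in enumerate(stripes_stats):
        print(i, sorted(stripe.keys()))
    return files_stats, stripes_stats
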
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""
    from cudf import DataFrame

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        df = DataFrame._from_table(
            libcudf.orc.read_orc(
                filepath_or_buffer,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimals_as_float,
                force_decimal_scale,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripes is not None and len(stripes) > 0:
            pa_tables = [
                read_orc_stripe(orc_file, i, columns) for i in stripes
            ]
            pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df

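# Illustrative usage sketch (not part of the original source): assumes the
# reader above is exposed as `cudf.read_orc` and that "data.orc" is a
# hypothetical file. Selecting stripes avoids decoding the rest of the file.
def _example_read_orc():
    import cudf

    # Read only the first two stripes of the listed columns.
    return cudf.read_orc(
        "data.orc", columns=["id", "value"], stripes=[0, 1], use_index=True
    )
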
def read_json(
    path_or_buf,
    engine="auto",
    dtype=True,
    lines=False,
    compression="infer",
    byte_range=None,
    *args,
    **kwargs,
):
    """{docstring}"""
    if engine == "cudf" and not lines:
        raise ValueError("cudf engine only supports JSON Lines format")
    if engine == "auto":
        engine = "cudf" if lines else "pandas"

    path_or_buf, compression = ioutils.get_filepath_or_buffer(
        path_or_buf, compression, (BytesIO, StringIO), **kwargs
    )
    if engine == "cudf":
        df = libcudf.json.read_json(
            path_or_buf, dtype, lines, compression, byte_range
        )
    else:
        warnings.warn(
            "Using CPU via Pandas to read JSON dataset, this may "
            "be GPU accelerated in the future"
        )
        if kwargs.get("orient") == "table":
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                compression=compression,
                *args,
                **kwargs,
            )
        else:
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                dtype=dtype,
                compression=compression,
                *args,
                **kwargs,
            )
        df = cudf.from_pandas(pd_value)

    return df

def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skip_rows=None,
    num_rows=None,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        return cpp_read_avro(filepath_or_buffer, columns, skip_rows, num_rows)
    else:
        raise NotImplementedError("read_avro currently only supports cudf")

def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    stripe=None,
    skip_rows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        df = libcudf.orc.read_orc(
            filepath_or_buffer,
            columns,
            stripe,
            skip_rows,
            num_rows,
            use_index,
            decimals_as_float,
            force_decimal_scale,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripe is not None:
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df

def read_orc_statistics(
    filepath_or_buffer, **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    # Read in statistics and unpack
    statistics = libcudf.orc.read_orc_statistics(filepath_or_buffer)
    if not statistics:
        return None
    (
        column_names,
        raw_file_statistics,
        *raw_stripes_statistics,
    ) = statistics

    # Parse statistics
    cs = cs_pb2.ColumnStatistics()

    file_statistics = {}
    stripes_statistics = []
    for i, raw_file_stats in enumerate(raw_file_statistics):
        parsed_statistics = _parse_column_statistics(cs, raw_file_stats)
        if not parsed_statistics:
            return None
        file_statistics[column_names[i].decode("utf-8")] = parsed_statistics
    for raw_stripe_statistics in raw_stripes_statistics:
        stripe_statistics = {}
        for i, raw_file_stats in enumerate(raw_stripe_statistics):
            parsed_statistics = _parse_column_statistics(cs, raw_file_stats)
            if not parsed_statistics:
                return None
            stripe_statistics[
                column_names[i].decode("utf-8")
            ] = parsed_statistics
        stripes_statistics.append(stripe_statistics)

    return file_statistics, stripes_statistics

def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    row_group=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        df = libcudf.parquet.read_parquet(
            filepath_or_buffer,
            columns,
            row_group,
            skip_rows,
            num_rows,
            strings_to_categorical,
            use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        pa_table = pq.read_pandas(
            filepath_or_buffer, columns=columns, *args, **kwargs
        )
        df = cudf.DataFrame.from_arrow(pa_table)

    return df

def read_avro(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    skip_rows=None,
    num_rows=None,
    **kwargs,
):
    """{docstring}"""
    from cudf import DataFrame

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        return DataFrame._from_table(
            libcudfxx.avro.read_avro(
                filepath_or_buffer, columns, skip_rows, num_rows
            )
        )
    else:
        raise NotImplementedError("read_avro currently only supports cudf")

def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # A list of row groups per source should be passed. Make the list of
    # lists that is expected for multiple sources.
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source, compression=None, **kwargs
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(
            filepaths_or_buffers, format="parquet", partitioning="hive"
        )

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.get_row_group_fragments(filters):
                for rg_id in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_id)

        # TODO: Use this with pyarrow 1.0.0
        # # Load IDs of filtered row groups for each file in dataset
        # filtered_row_group_ids = {}
        # for fragment in dataset.get_fragments(filters):
        #     for row_group_fragment in fragment.split_by_row_group(filters):
        #         for row_group_info in row_group_fragment.row_groups:
        #             path = row_group_fragment.path
        #             if path not in filtered_row_group_ids:
        #                 filtered_row_group_ids[path] = [row_group_info.id]
        #             else:
        #                 filtered_row_group_ids[path].append(row_group_info.id)

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                # Keep only the filtered row-group IDs that the user also
                # requested (evaluated eagerly so the original selection is
                # used, not the rebound value).
                row_groups[i] = [
                    rg_id
                    for rg_id in filtered_rg_ids[file]
                    if rg_id in row_groups[i]
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skip_rows=skip_rows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs
            )
        )

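# Illustrative usage sketch (not part of the original source): assumes the
# reader above is exposed as `cudf.read_parquet` and that "data.parquet" is
# a hypothetical file. Filters use the pyarrow tuple form and are applied at
# row-group granularity, so only row groups that can match are read.
def _example_read_parquet_filters():
    import cudf

    # Keep only row groups whose statistics can satisfy year == 2019.
    return cudf.read_parquet(
        "data.parquet",
        columns=["year", "value"],
        filters=[("year", "==", 2019)],
    )
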
def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimal_cols_as_float=None,
    timestamp_type=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""
    if decimal_cols_as_float is not None:
        warnings.warn(
            "`decimal_cols_as_float` is deprecated and will be removed in "
            "the future",
            FutureWarning,
        )
    from cudf import DataFrame

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # Each source must have a correlating stripe list. If a single stripe
    # list is provided rather than a list of lists of stripes, extrapolate
    # that stripe list across all input sources.
    if stripes is not None:
        if any(not isinstance(stripe, list) for stripe in stripes):
            stripes = [stripes]

        # Must ensure a stripe for each source is specified, unless None
        if not len(stripes) == len(filepath_or_buffer):
            raise ValueError(
                "A list of stripes must be provided for each input source"
            )

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        if ioutils.is_directory(source, **kwargs):
            fs = ioutils._ensure_filesystem(
                passed_filesystem=None, path=source, **kwargs,
            )
            source = stringify_path(source)
            source = fs.sep.join([source, "*.orc"])

        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            use_python_file_object=use_python_file_object,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        selected_stripes = _filter_stripes(
            filters, filepaths_or_buffers, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepaths_or_buffers[0], columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        return DataFrame._from_data(
            *liborc.read_orc(
                filepaths_or_buffers,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimal_cols_as_float,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        if len(filepath_or_buffer) > 1:
            raise NotImplementedError(
                "Using CPU via PyArrow only supports a single input source"
            )
        orc_file = orc.ORCFile(filepath_or_buffer[0])
        if stripes is not None and len(stripes) > 0:
            for stripe_source_file in stripes:
                pa_tables = [
                    read_orc_stripe(orc_file, i, columns)
                    for i in stripe_source_file
                ]
                pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df

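# Illustrative usage sketch (not part of the original source): assumes the
# reader above is exposed as `cudf.read_orc` and that the listed paths are
# hypothetical. With multiple sources, `stripes` is a list of stripe lists,
# one per input file.
def _example_read_orc_multi():
    import cudf

    return cudf.read_orc(
        ["part0.orc", "part1.orc"],
        columns=["id", "value"],
        stripes=[[0], [0, 1]],  # stripe 0 of part0, stripes 0-1 of part1
    )
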
def read_json(
    path_or_buf,
    engine="auto",
    dtype=True,
    lines=False,
    compression="infer",
    byte_range=None,
    *args,
    **kwargs,
):
    """{docstring}"""
    if engine == "cudf" and not lines:
        raise ValueError("cudf engine only supports JSON Lines format")
    if engine == "auto":
        engine = "cudf" if lines else "pandas"
    if engine == "cudf":
        # Multiple sources are passed as a list. If a single source is
        # passed, wrap it in a list for unified processing downstream.
        if not is_list_like(path_or_buf):
            path_or_buf = [path_or_buf]

        filepaths_or_buffers = []
        for source in path_or_buf:
            if ioutils.is_directory(source, **kwargs):
                fs = ioutils._ensure_filesystem(
                    passed_filesystem=None, path=source
                )
                source = ioutils.stringify_pathlike(source)
                source = fs.sep.join([source, "*.json"])

            tmp_source, compression = ioutils.get_filepath_or_buffer(
                path_or_data=source,
                compression=compression,
                iotypes=(BytesIO, StringIO),
                **kwargs,
            )
            if isinstance(tmp_source, list):
                filepaths_or_buffers.extend(tmp_source)
            else:
                filepaths_or_buffers.append(tmp_source)

        return cudf.DataFrame._from_data(
            *libjson.read_json(
                filepaths_or_buffers, dtype, lines, compression, byte_range
            )
        )
    else:
        warnings.warn(
            "Using CPU via Pandas to read JSON dataset, this may "
            "be GPU accelerated in the future"
        )

        if not ioutils.ensure_single_filepath_or_buffer(
            path_or_data=path_or_buf, **kwargs,
        ):
            raise NotImplementedError(
                "`read_json` does not yet support reading "
                "multiple files via pandas"
            )

        path_or_buf, compression = ioutils.get_filepath_or_buffer(
            path_or_data=path_or_buf,
            compression=compression,
            iotypes=(BytesIO, StringIO),
            **kwargs,
        )

        if kwargs.get("orient") == "table":
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                compression=compression,
                *args,
                **kwargs,
            )
        else:
            pd_value = pd.read_json(
                path_or_buf,
                lines=lines,
                dtype=dtype,
                compression=compression,
                *args,
                **kwargs,
            )
        df = cudf.from_pandas(pd_value)

    return df

def read_csv(
    filepath_or_buffer,
    lineterminator="\n",
    quotechar='"',
    quoting=0,
    doublequote=True,
    header="infer",
    mangle_dupe_cols=True,
    usecols=None,
    sep=",",
    delimiter=None,
    delim_whitespace=False,
    skipinitialspace=False,
    names=None,
    dtype=None,
    skipfooter=0,
    skiprows=0,
    dayfirst=False,
    compression="infer",
    thousands=None,
    decimal=".",
    true_values=None,
    false_values=None,
    nrows=None,
    byte_range=None,
    skip_blank_lines=True,
    parse_dates=None,
    comment=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    prefix=None,
    index_col=None,
    use_python_file_object=True,
    **kwargs,
):
    """{docstring}"""

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_csv` does not yet support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        compression=compression,
        iotypes=(BytesIO, StringIO, NativeFile),
        use_python_file_object=use_python_file_object,
        **kwargs,
    )

    if na_values is not None and is_scalar(na_values):
        na_values = [na_values]

    return libcudf.csv.read_csv(
        filepath_or_buffer,
        lineterminator=lineterminator,
        quotechar=quotechar,
        quoting=quoting,
        doublequote=doublequote,
        header=header,
        mangle_dupe_cols=mangle_dupe_cols,
        usecols=usecols,
        sep=sep,
        delimiter=delimiter,
        delim_whitespace=delim_whitespace,
        skipinitialspace=skipinitialspace,
        names=names,
        dtype=dtype,
        skipfooter=skipfooter,
        skiprows=skiprows,
        dayfirst=dayfirst,
        compression=compression,
        thousands=thousands,
        decimal=decimal,
        true_values=true_values,
        false_values=false_values,
        nrows=nrows,
        byte_range=byte_range,
        skip_blank_lines=skip_blank_lines,
        parse_dates=parse_dates,
        comment=comment,
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        prefix=prefix,
        index_col=index_col,
    )

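# Illustrative usage sketch (not part of the original source): assumes the
# reader above is exposed as `cudf.read_csv` and that "data.csv" is a
# hypothetical file. Explicit dtypes and date parsing are forwarded to the
# GPU CSV reader.
def _example_read_csv():
    import cudf

    return cudf.read_csv(
        "data.csv",
        usecols=["ts", "id", "value"],
        dtype={"id": "int32", "value": "float64"},
        parse_dates=["ts"],
        na_values=["NULL"],
    )
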
def read_csv(
    filepath_or_buffer,
    lineterminator="\n",
    quotechar='"',
    quoting=0,
    doublequote=True,
    header="infer",
    mangle_dupe_cols=True,
    usecols=None,
    sep=",",
    delimiter=None,
    delim_whitespace=False,
    skipinitialspace=False,
    names=None,
    dtype=None,
    skipfooter=0,
    skiprows=0,
    dayfirst=False,
    compression="infer",
    thousands=None,
    decimal=".",
    true_values=None,
    false_values=None,
    nrows=None,
    byte_range=None,
    skip_blank_lines=True,
    parse_dates=None,
    comment=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    prefix=None,
    index_col=None,
    **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        compression=compression,
        iotypes=(BytesIO, StringIO),
        **kwargs,
    )

    return libcudf.csv.read_csv(
        filepath_or_buffer,
        lineterminator=lineterminator,
        quotechar=quotechar,
        quoting=quoting,
        doublequote=doublequote,
        header=header,
        mangle_dupe_cols=mangle_dupe_cols,
        usecols=usecols,
        sep=sep,
        delimiter=delimiter,
        delim_whitespace=delim_whitespace,
        skipinitialspace=skipinitialspace,
        names=names,
        dtype=dtype,
        skipfooter=skipfooter,
        skiprows=skiprows,
        dayfirst=dayfirst,
        compression=compression,
        thousands=thousands,
        decimal=decimal,
        true_values=true_values,
        false_values=false_values,
        nrows=nrows,
        byte_range=byte_range,
        skip_blank_lines=skip_blank_lines,
        parse_dates=parse_dates,
        comment=comment,
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        prefix=prefix,
        index_col=index_col,
    )

def read_csv(
    filepath_or_buffer,
    lineterminator="\n",
    quotechar='"',
    quoting=0,
    doublequote=True,
    header="infer",
    mangle_dupe_cols=True,
    usecols=None,
    sep=",",
    delimiter=None,
    delim_whitespace=False,
    skipinitialspace=False,
    names=None,
    dtype=None,
    skipfooter=0,
    skiprows=0,
    dayfirst=False,
    compression="infer",
    thousands=None,
    decimal=".",
    true_values=None,
    false_values=None,
    nrows=None,
    byte_range=None,
    skip_blank_lines=True,
    parse_dates=None,
    comment=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    prefix=None,
    index_col=None,
    **kwargs,
):
    """{docstring}"""

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_csv` does not yet support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer,
        compression=compression,
        iotypes=(BytesIO, StringIO),
        **kwargs,
    )

    if na_values is not None and is_scalar(na_values):
        na_values = [na_values]

    if keep_default_na is False:
        # TODO: Remove this error once the following issue is fixed:
        # https://github.com/rapidsai/cudf/issues/6680
        raise NotImplementedError(
            "keep_default_na=False is currently not supported, please refer "
            "to: https://github.com/rapidsai/cudf/issues/6680"
        )

    return libcudf.csv.read_csv(
        filepath_or_buffer,
        lineterminator=lineterminator,
        quotechar=quotechar,
        quoting=quoting,
        doublequote=doublequote,
        header=header,
        mangle_dupe_cols=mangle_dupe_cols,
        usecols=usecols,
        sep=sep,
        delimiter=delimiter,
        delim_whitespace=delim_whitespace,
        skipinitialspace=skipinitialspace,
        names=names,
        dtype=dtype,
        skipfooter=skipfooter,
        skiprows=skiprows,
        dayfirst=dayfirst,
        compression=compression,
        thousands=thousands,
        decimal=decimal,
        true_values=true_values,
        false_values=false_values,
        nrows=nrows,
        byte_range=byte_range,
        skip_blank_lines=skip_blank_lines,
        parse_dates=parse_dates,
        comment=comment,
        na_values=na_values,
        keep_default_na=keep_default_na,
        na_filter=na_filter,
        prefix=prefix,
        index_col=index_col,
    )

def read_orc(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    stripes=None,
    skiprows=None,
    num_rows=None,
    use_index=True,
    decimals_as_float=True,
    force_decimal_scale=None,
    timestamp_type=None,
    **kwargs,
):
    """{docstring}"""
    from cudf import DataFrame

    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
        path_or_data=filepath_or_buffer, **kwargs,
    )
    if not is_single_filepath_or_buffer:
        raise NotImplementedError(
            "`read_orc` does not yet support reading multiple files"
        )

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        path_or_data=filepath_or_buffer, compression=None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if filters is not None:
        selected_stripes = _filter_stripes(
            filters, filepath_or_buffer, stripes, skiprows, num_rows
        )

        # Return empty if everything was filtered
        if len(selected_stripes) == 0:
            return _make_empty_df(filepath_or_buffer, columns)
        else:
            stripes = selected_stripes

    if engine == "cudf":
        df = DataFrame._from_table(
            libcudf.orc.read_orc(
                filepath_or_buffer,
                columns,
                stripes,
                skiprows,
                num_rows,
                use_index,
                decimals_as_float,
                force_decimal_scale,
                timestamp_type,
            )
        )
    else:

        def read_orc_stripe(orc_file, stripe, columns):
            pa_table = orc_file.read_stripe(stripe, columns)
            if isinstance(pa_table, pa.RecordBatch):
                pa_table = pa.Table.from_batches([pa_table])
            return pa_table

        warnings.warn("Using CPU via PyArrow to read ORC dataset.")
        orc_file = orc.ORCFile(filepath_or_buffer)
        if stripes is not None and len(stripes) > 0:
            pa_tables = [
                read_orc_stripe(orc_file, i, columns) for i in stripes
            ]
            pa_table = pa.concat_tables(pa_tables)
        else:
            pa_table = orc_file.read(columns=columns)
        df = cudf.DataFrame.from_arrow(pa_table)

    return df

def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skiprows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    use_python_file_object=True,
    categorical_partitions=True,
    open_file_options=None,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Do not allow the user to set file-opening options
    # when `use_python_file_object=False` is specified
    if use_python_file_object is False:
        if open_file_options:
            raise ValueError(
                "open_file_options is not currently supported when "
                "use_python_file_object is set to False."
            )
        open_file_options = {}

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # A list of row groups per source should be passed. Make the list of
    # lists that is expected for multiple sources.
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    # Check columns input
    if columns is not None:
        if not is_list_like(columns):
            raise ValueError("Expected list like for columns")

    # Start by trying to construct a filesystem object, so we
    # can apply filters on remote file-systems
    fs, paths = ioutils._get_filesystem_and_paths(filepath_or_buffer, **kwargs)

    # Use pyarrow dataset to detect/process directory-partitioned
    # data and apply filters. Note that we can only support partitioned
    # data and filtering if the input is a single directory or list of
    # paths.
    partition_keys = []
    partition_categories = {}
    if fs and paths:
        (
            paths,
            row_groups,
            partition_keys,
            partition_categories,
        ) = _process_dataset(
            paths,
            fs,
            filters=filters,
            row_groups=row_groups,
            categorical_partitions=categorical_partitions,
        )
    elif filters is not None:
        raise ValueError("cudf cannot apply filters to open file objects.")
    filepath_or_buffer = paths if paths else filepath_or_buffer

    filepaths_or_buffers = []
    if use_python_file_object:
        open_file_options = _default_open_file_options(
            open_file_options, columns, row_groups, fs=fs,
        )
    for i, source in enumerate(filepath_or_buffer):
        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source,
            compression=None,
            fs=fs,
            use_python_file_object=use_python_file_object,
            open_file_options=open_file_options,
            **kwargs,
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        if isinstance(tmp_source, list):
            filepaths_or_buffers.extend(tmp_source)
        else:
            filepaths_or_buffers.append(tmp_source)

    # Warn user if they are not using cudf for IO
    # (There is a good chance this was not the intention)
    if engine != "cudf":
        warnings.warn(
            "Using CPU via PyArrow to read Parquet dataset. "
            "This option is both inefficient and unstable!"
        )
        if filters is not None:
            warnings.warn(
                "Parquet row-group filtering is only supported with "
                "'engine=cudf'. Use pandas or pyarrow API directly "
                "for full CPU-based filtering functionality."
            )

    return _parquet_to_frame(
        filepaths_or_buffers,
        engine,
        *args,
        columns=columns,
        row_groups=row_groups,
        skiprows=skiprows,
        num_rows=num_rows,
        strings_to_categorical=strings_to_categorical,
        use_pandas_metadata=use_pandas_metadata,
        partition_keys=partition_keys,
        partition_categories=partition_categories,
        **kwargs,
    )

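# Illustrative usage sketch (not part of the original source): assumes the
# reader above is exposed as `cudf.read_parquet` and that "dataset_dir" is a
# hypothetical hive-partitioned directory (e.g. dataset_dir/year=2021/...).
# Partition filters prune directories before any data is read, and
# `categorical_partitions=True` keeps partition keys as categorical columns.
def _example_read_parquet_partitioned():
    import cudf

    return cudf.read_parquet(
        "dataset_dir",
        columns=["id", "value"],
        filters=[("year", "==", 2021)],
        categorical_partitions=True,
    )
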