# NOTE: the imports below are what this excerpt appears to rely on; the exact
# module paths are assumptions based on cudf's io/parquet module and may
# differ between cudf versions.
import warnings
from collections import defaultdict

import pyarrow.dataset as ds
import pyarrow.parquet as pq

import cudf
from cudf._lib import parquet as libparquet
from cudf.utils import ioutils
from cudf.utils.dtypes import is_list_like


def _read_parquet(
    filepaths_or_buffers,
    engine,
    columns=None,
    row_groups=None,
    skiprows=None,
    num_rows=None,
    strings_to_categorical=None,
    use_pandas_metadata=None,
    *args,
    **kwargs,
):
    # Simple helper function to dispatch between
    # cudf and pyarrow to read parquet data
    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skiprows=skiprows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs
            )
        )
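

# Illustrative sketch (not part of the original module): how a caller might
# dispatch through _read_parquet. "data.parquet" is a hypothetical
# placeholder path.
def _example_dispatch():
    # GPU path: arguments are forwarded directly to libparquet.
    gdf = _read_parquet(["data.parquet"], "cudf", columns=["a", "b"])
    # CPU path: the same sources are read through pyarrow instead.
    cpu_gdf = _read_parquet(["data.parquet"], "pyarrow")
    return gdf, cpu_gdf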


def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    row_group=None,
    row_group_count=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    filepath_or_buffer, compression = ioutils.get_filepath_or_buffer(
        filepath_or_buffer, None, **kwargs
    )
    if compression is not None:
        raise ValueError(
            "URL content-encoding decompression is not supported"
        )

    if engine == "cudf":
        return libparquet.read_parquet(
            filepath_or_buffer,
            columns=columns,
            row_group=row_group,
            row_group_count=row_group_count,
            skip_rows=skip_rows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        # Fall back to pyarrow on the CPU and convert the resulting
        # Arrow table into a cudf DataFrame.
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        pa_table = pq.read_pandas(
            filepath_or_buffer, columns=columns, *args, **kwargs
        )
        return cudf.DataFrame.from_arrow(pa_table)
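

# Illustrative sketch (not part of the original module): reading a slice of
# row groups with the cudf engine versus falling back to pyarrow on the CPU.
# "data.parquet" is a hypothetical placeholder path.
def _example_single_source():
    # Read only the second row group of the file on the GPU.
    gdf = read_parquet("data.parquet", row_group=1, row_group_count=1)
    # Read selected columns through the pyarrow engine (emits a warning).
    cpu_gdf = read_parquet("data.parquet", engine="pyarrow", columns=["a"])
    return gdf, cpu_gdf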


def read_parquet(
    filepath_or_buffer,
    engine="cudf",
    columns=None,
    filters=None,
    row_groups=None,
    skip_rows=None,
    num_rows=None,
    strings_to_categorical=False,
    use_pandas_metadata=True,
    *args,
    **kwargs,
):
    """{docstring}"""

    # Multiple sources are passed as a list. If a single source is passed,
    # wrap it in a list for unified processing downstream.
    if not is_list_like(filepath_or_buffer):
        filepath_or_buffer = [filepath_or_buffer]

    # A list of row groups per source should be passed. Make the list of
    # lists that is expected for multiple sources.
    if row_groups is not None:
        if not is_list_like(row_groups):
            row_groups = [[row_groups]]
        elif not is_list_like(row_groups[0]):
            row_groups = [row_groups]

    filepaths_or_buffers = []
    for source in filepath_or_buffer:
        tmp_source, compression = ioutils.get_filepath_or_buffer(
            path_or_data=source, compression=None, **kwargs
        )
        if compression is not None:
            raise ValueError(
                "URL content-encoding decompression is not supported"
            )
        filepaths_or_buffers.append(tmp_source)

    if filters is not None:
        # Convert filters to ds.Expression
        filters = pq._filters_to_expression(filters)

        # Initialize ds.FilesystemDataset
        dataset = ds.dataset(
            filepaths_or_buffers, format="parquet", partitioning="hive"
        )

        # Load IDs of filtered row groups for each file in dataset
        filtered_rg_ids = defaultdict(list)
        for fragment in dataset.get_fragments(filter=filters):
            for rg_fragment in fragment.get_row_group_fragments(filters):
                for rg_id in rg_fragment.row_groups:
                    filtered_rg_ids[rg_fragment.path].append(rg_id)

        # TODO: Use this with pyarrow 1.0.0
        # # Load IDs of filtered row groups for each file in dataset
        # filtered_row_group_ids = {}
        # for fragment in dataset.get_fragments(filters):
        #     for row_group_fragment in fragment.split_by_row_group(filters):
        #         for row_group_info in row_group_fragment.row_groups:
        #             path = row_group_fragment.path
        #             if path not in filtered_row_group_ids:
        #                 filtered_row_group_ids[path] = [row_group_info.id]
        #             else:
        #                 filtered_row_group_ids[path].append(
        #                     row_group_info.id
        #                 )

        # Initialize row_groups to be selected
        if row_groups is None:
            row_groups = [None for _ in dataset.files]

        # Store IDs of selected row groups for each file
        for i, file in enumerate(dataset.files):
            if row_groups[i] is None:
                row_groups[i] = filtered_rg_ids[file]
            else:
                # Keep only the row groups that were both requested
                # explicitly and admitted by the filters. A list
                # comprehension is used here instead of a lazy filter()
                # object so the intersection is materialized before
                # row_groups[i] is reassigned.
                row_groups[i] = [
                    rg_id
                    for rg_id in filtered_rg_ids[file]
                    if rg_id in row_groups[i]
                ]

    if engine == "cudf":
        return libparquet.read_parquet(
            filepaths_or_buffers,
            columns=columns,
            row_groups=row_groups,
            skip_rows=skip_rows,
            num_rows=num_rows,
            strings_to_categorical=strings_to_categorical,
            use_pandas_metadata=use_pandas_metadata,
        )
    else:
        warnings.warn("Using CPU via PyArrow to read Parquet dataset.")
        return cudf.DataFrame.from_arrow(
            pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                columns=columns, *args, **kwargs
            )
        )
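

# Illustrative sketch (not part of the original module): filters use the
# pyarrow DNF form, a list of (column, op, value) tuples, and are intersected
# with any explicit per-source row_groups selection. Paths are hypothetical
# placeholders.
def _example_filtered_read():
    sources = ["part-0.parquet", "part-1.parquet"]
    gdf = read_parquet(
        sources,
        filters=[("a", ">", 10)],
        # At most row groups 0 and 1 of each source, further narrowed to the
        # row groups whose statistics can satisfy the filter.
        row_groups=[[0, 1], [0, 1]],
    )
    return gdf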