def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery"""
    from fastparquet import ParquetFile

    pf = ParquetFile(path, open_with=fs.open)
    relpath = path.replace(base, '').lstrip('/')
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base
    df = pf.to_pandas(all_columns, categories, index=index_names)

    if df.index.nlevels == 1:
        if index_names:
            df.index.name = storage_name_mapping.get(index_names[0], index_names[0])
    else:
        if index_names:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index_names]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in all_columns
                  if col not in (index_names or [])]

    if is_series:
        return df[df.columns[0]]
    else:
        return df
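# A hedged, illustrative sketch (not the dask/fastparquet implementation): the
# `cats` dictionary attached to the ParquetFile above comes from hive-style
# path segments such as "year=2020/part.0.parquet". The helper name
# `_toy_paths_to_cats` is hypothetical and only approximates what the real
# `_paths_to_cats`/`paths_to_cats` helpers do.
def _toy_paths_to_cats(fns):
    cats = {}
    for fn in fns:
        for segment in fn.split("/")[:-1]:
            if "=" in segment:
                key, value = segment.split("=", 1)
                cats.setdefault(key, set()).add(value)
    return {k: sorted(v) for k, v in cats.items()}

# _toy_paths_to_cats(["year=2020/part.0.parquet", "year=2021/part.0.parquet"])
# -> {"year": ["2020", "2021"]}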
def read_partition(fs, piece, columns, index, categories=(), pf=None, **kwargs):
    if isinstance(index, list):
        columns += index

    if pf:
        df = pf.read_row_group_file(
            piece, columns, categories, index=index, **kwargs.get("read", {})
        )
    else:
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = _paths_to_cats(fns, scheme)
        pf.fn = base
        df = pf.to_pandas(columns, categories, index=index)
    return df
def read_partition(fs, piece, columns, index, categories=(), pf=None, **kwargs):
    if isinstance(index, list):
        columns += index

    if pf is None:
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = _paths_to_cats(fns, scheme)
        pf.fn = base
        return pf.to_pandas(columns, categories, index=index)
    else:
        if isinstance(pf, tuple):
            pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
            pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
            pf.fmd.row_groups = None
        rg_piece = pf.row_groups[piece]
        pf.fmd.key_value_metadata = None
        return pf.read_row_group_file(
            rg_piece, columns, categories, index=index, **kwargs.get("read", {})
        )
def read_partition(
    cls, fs, piece, columns, index, categories=(), pf=None, **kwargs
):
    null_index_name = False
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need to use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index

    if pf is None:
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = paths_to_cats(fns, scheme)
        pf.fn = base
        if null_index_name and "__index_level_0__" in pf.columns:
            # See "Handling a None-labeled index" comment above
            index = ["__index_level_0__"]
            columns += index
        return pf.to_pandas(columns, categories, index=index)
    else:
        if isinstance(pf, tuple):
            if isinstance(pf[0], list):
                pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
            else:
                pf = ParquetFile(
                    pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
                )
            pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
            pf.fmd.row_groups = None
        rg_piece = pf.row_groups[piece]
        if null_index_name:
            if "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
                pf.fmd.key_value_metadata = None
        else:
            pf.fmd.key_value_metadata = None
        return pf.read_row_group_file(
            rg_piece, columns, categories, index=index, **kwargs.get("read", {})
        )
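# Hedged illustration of the "None-labeled index" cases handled above, using
# hypothetical examples of the "pandas" key-value metadata that parquet writers
# embed. A RangeIndex is described inline and has no physical column, whereas a
# materialized unnamed index shows up as a real "__index_level_0__" column,
# which is why the reader has to inspect pf.columns before choosing a strategy.
range_index_md = {
    "index_columns": [
        {"kind": "range", "name": None, "start": 0, "stop": 10, "step": 1}
    ]
}
materialized_index_md = {"index_columns": ["__index_level_0__"]}

for md in (range_index_md, materialized_index_md):
    needs_physical_column = any(isinstance(ix, str) for ix in md["index_columns"])
    print(needs_physical_column)  # False for the RangeIndex, True otherwise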
def _read_parquet_file(
    fs,
    base,
    fn,
    index,
    columns,
    series,
    categories,
    cs,
    dt,
    scheme,
    storage_name_mapping,
    *args
):
    """Read a single file with fastparquet, to be used in a task"""
    from fastparquet.api import ParquetFile
    from collections import OrderedDict

    name_storage_mapping = {v: k for k, v in storage_name_mapping.items()}
    if not isinstance(columns, (tuple, list)):
        columns = [columns]
        series = True
    if index:
        index, = index
        if index not in columns:
            columns = columns + [index]

    columns = [name_storage_mapping.get(col, col) for col in columns]
    index = name_storage_mapping.get(index, index)
    cs = OrderedDict([(k, v) for k, v in cs.items() if k in columns])

    pf = ParquetFile(fn, open_with=fs.open)
    pf.file_scheme = scheme
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = fn.replace(base, "").lstrip("/")
    pf.fn = base
    df = pf.to_pandas(columns=columns, index=index, categories=categories)

    if df.index.nlevels == 1:
        if index:
            df.index.name = storage_name_mapping.get(index, index)
    else:
        if index:
            df.index.names = [storage_name_mapping.get(name, name) for name in index]
    df.columns = [storage_name_mapping.get(col, col) for col in columns if col != index]

    if series:
        return df[df.columns[0]]
    else:
        return df
def _read_fp_multifile(fs, fs_token, paths, columns=None, categories=None, index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme, join_path

    base, fns = analyse_paths(paths)
    parsed_paths = [join_path(p) for p in paths]
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (
        meta,
        _,
        index_name,
        out_type,
        all_columns,
        index_names,
        storage_name_mapping,
    ) = _pf_validation(pf, columns, index, categories, [])
    name = "read-parquet-" + tokenize(fs_token, paths, all_columns, categories)
    dsk = {
        (name, i): (
            _read_pf_simple,
            fs,
            path,
            base,
            index_names,
            all_columns,
            out_type == Series,
            categories,
            pf.cats,
            pf.file_scheme,
            storage_name_mapping,
        )
        for i, path in enumerate(parsed_paths)
    }
    divisions = (None,) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
def _read_parquet_file(fs, base, fn, index, columns, series, categories,
                       cs, dt, scheme, storage_name_mapping, *args):
    """Read a single file with fastparquet, to be used in a task"""
    from fastparquet.api import ParquetFile
    from collections import OrderedDict

    name_storage_mapping = {v: k for k, v in storage_name_mapping.items()}
    if not isinstance(columns, (tuple, list)):
        columns = [columns]
        series = True
    if index:
        index, = index
        if index not in columns:
            columns = columns + [index]

    columns = [name_storage_mapping.get(col, col) for col in columns]
    index = name_storage_mapping.get(index, index)
    cs = OrderedDict([(k, v) for k, v in cs.items() if k in columns])

    pf = ParquetFile(fn, open_with=fs.open)
    pf.file_scheme = scheme
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = fn.replace(base, "").lstrip('/')
    pf.fn = base
    df = pf.to_pandas(columns=columns, index=index, categories=categories)

    if df.index.nlevels == 1:
        if index:
            df.index.name = storage_name_mapping.get(index, index)
    else:
        if index:
            df.index.names = [storage_name_mapping.get(name, name)
                              for name in index]
    df.columns = [storage_name_mapping.get(col, col)
                  for col in columns
                  if col != index]

    if series:
        return df[df.columns[0]]
    else:
        return df
def _read_fp_multifile(fs, fs_token, paths, columns=None, categories=None, index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme

    base, fns = analyse_paths(paths)
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (meta, _, index_name, out_type, all_columns, index_names,
     storage_name_mapping) = _pf_validation(pf, columns, index, categories, [])
    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, categories)
    dsk = {(name, i): (_read_pf_simple, fs, path, base,
                       index_names, all_columns, out_type == Series,
                       categories, pf.cats, pf.file_scheme,
                       storage_name_mapping)
           for i, path in enumerate(paths)}
    divisions = (None,) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
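# A minimal, generic illustration of the dask task-graph convention used by the
# `dsk` dictionary above: each key (name, i) maps to a tuple of
# (callable, *args), and the scheduler calls the callable with those arguments
# when that partition is computed. The names and paths below are made up.
import dask

def load(path):
    return path.upper()

dsk = {("load-example", i): (load, p) for i, p in enumerate(["a.parq", "b.parq"])}
print(dask.get(dsk, [("load-example", 0), ("load-example", 1)]))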
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases, because
    this also means we should avoid scanning every file in the dataset.
    If _metadata is available, set `gather_statistics=True` (if
    `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            pf = ParquetFile(
                paths, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
        else:
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # We have a _metadata file; let's use it
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {})
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
    else:
        if fs.isdir(paths[0]):
            # This is a directory, check for _metadata, then _common_metadata
            paths = fs.glob(paths[0] + fs.sep + "*")
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # Using _metadata file (best-case scenario)
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {})
                )
                if gather_statistics is None:
                    gather_statistics = True
            elif gather_statistics is not False:
                # Scan every file
                pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {}))
            else:
                # Use _common_metadata file if it is available.
                # Otherwise, just use 0th file
                if "_common_metadata" in relpaths:
                    pf = ParquetFile(
                        base + fs.sep + "_common_metadata",
                        open_with=fs.open,
                        **kwargs.get("file", {})
                    )
                else:
                    pf = ParquetFile(
                        paths[0], open_with=fs.open, **kwargs.get("file", {})
                    )
                scheme = get_file_scheme(fns)
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
        else:
            # There is only one file to read
            pf = ParquetFile(
                paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )

    return parts, pf, gather_statistics
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases, because
    this also means we should avoid scanning every file in the dataset.
    If _metadata is available, set `gather_statistics=True` (if
    `gather_statistics=None`).

    The `fast_metadata` output specifies that ParquetFile metadata parsing is
    fast enough for each worker to perform during `read_partition`. The value
    will be set to True if: (1) the path is a directory containing _metadata,
    (2) the path is a list of files containing _metadata, (3) there is only
    one file to read, or (4) `gather_statistics` is False. In other cases,
    the ParquetFile object will need to be stored in the task graph, because
    metadata parsing is too expensive.
    """
    parts = []
    fast_metadata = True
    if len(paths) > 1:
        base, fns = _analyze_paths(paths, fs)
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if "_metadata" not in fns:
                paths_use = paths
                fast_metadata = False
            else:
                paths_use = base + fs.sep + "_metadata"
            pf = ParquetFile(
                paths_use, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
        else:
            if "_metadata" in fns:
                # We have a _metadata file; let's use it
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {})
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        paths = fs.glob(paths[0] + fs.sep + "*")
        base, fns = _analyze_paths(paths, fs)
        if "_metadata" in fns:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(
                base + fs.sep + "_metadata",
                open_with=fs.open,
                sep=fs.sep,
                **kwargs.get("file", {})
            )
            if gather_statistics is None:
                gather_statistics = True
        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {}))
            fast_metadata = False
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                pf = ParquetFile(
                    base + fs.sep + "_common_metadata",
                    open_with=fs.open,
                    **kwargs.get("file", {})
                )
            else:
                pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {}))
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = paths.copy()
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(
            paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
        )

    return parts, pf, gather_statistics, fast_metadata, base
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases, because
    this also means we should avoid scanning every file in the dataset.
    If _metadata is available, set `gather_statistics=True` (if
    `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        paths, base, fns = _sort_and_analyze_paths(paths, fs)
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if "_metadata" not in fns:
                paths_use = paths
            else:
                paths_use = base + fs.sep + "_metadata"
            pf = ParquetFile(
                paths_use, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
        else:
            if "_metadata" in fns:
                # We have a _metadata file; let's use it
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {}),
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        paths = fs.glob(paths[0] + fs.sep + "*")
        paths, base, fns = _sort_and_analyze_paths(paths, fs)
        if "_metadata" in fns:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(
                base + fs.sep + "_metadata",
                open_with=fs.open,
                sep=fs.sep,
                **kwargs.get("file", {}),
            )
            if gather_statistics is None:
                gather_statistics = True
        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {}))
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                pf = ParquetFile(
                    base + fs.sep + "_common_metadata",
                    open_with=fs.open,
                    **kwargs.get("file", {}),
                )
                fns.remove("_common_metadata")
            else:
                pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {}))
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = [fs.sep.join([base, fn]) for fn in fns]
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(
            paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
        )

    # Ensure that there is no overlap between partition columns
    # and explicit columns in `pf`
    if pf.cats:
        _partitions = [p for p in pf.cats if p not in pf.columns]
        if not _partitions:
            pf.cats = {}
        elif len(_partitions) != len(pf.cats):
            raise ValueError(
                "No partition-columns should be written in the \n"
                "file unless they are ALL written in the file.\n"
                "columns: {} | partitions: {}".format(pf.columns, pf.cats.keys())
            )

    return parts, pf, gather_statistics, base
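# A minimal usage sketch for the variant directly above, assuming a local
# fsspec filesystem, a hypothetical "data" directory of parquet files, and that
# the helper (plus its module-level imports) is in scope. The 4-tuple unpacking
# matches this version's return value only.
import fsspec

fs = fsspec.filesystem("file")
parts, pf, gather_statistics, base = _determine_pf_parts(fs, ["data"], None)
# With a _metadata file present, `gather_statistics` is promoted from None to
# True and `parts` stays empty (row-group-based partitioning); when statistics
# are explicitly disabled, `parts` instead falls back to a plain list of paths.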
def read_metadata(
    fs, paths, categories=None, index=None, gather_statistics=None, filters=None, **kwargs
):
    if len(paths) > 1:
        if gather_statistics is not False:
            # this scans all the files, allowing index/divisions
            # and filtering
            pf = fastparquet.ParquetFile(
                paths, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
        else:
            base, fns = analyse_paths(paths)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {}))
            pf.file_scheme = scheme
            pf.cats = _paths_to_cats(fns, scheme)
            relpath = paths[0].replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                rg.cats = pf.cats
                rg.schema = pf.schema
                for ch in rg.columns:
                    ch.file_path = relpath
    else:
        try:
            pf = fastparquet.ParquetFile(
                paths[0] + fs.sep + "_metadata",
                open_with=fs.open,
                sep=fs.sep,
                **kwargs.get("file", {})
            )
            if gather_statistics is None:
                gather_statistics = True
        except Exception:
            pf = fastparquet.ParquetFile(
                paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )

    columns = None
    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata if x.key == "pandas"]
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        index_names = []
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]
    elif len(pandas_md) == 1:
        (
            index_names,
            column_names,
            storage_name_mapping,
            column_index_names,
        ) = _parse_pandas_metadata(json.loads(pandas_md[0]))
        # auto-ranges should not be created by fastparquet
        index_names = [n for n in index_names if n is not None]
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    if index is None and len(index_names) > 0:
        if len(index_names) == 1:
            index = index_names[0]
        else:
            index = index_names

    # Normalize user inputs
    column_names, index_names = _normalize_index_columns(
        columns, column_names, index, index_names
    )

    all_columns = index_names + column_names

    categories_dict = None
    if isinstance(categories, dict):
        categories_dict = categories

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # Check that categories are included in columns
    if categories and not set(categories).intersection(all_columns):
        raise ValueError(
            "categories not in available columns.\n"
            "categories: {} | columns: {}".format(categories, list(all_columns))
        )

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    index_cols = index or ()
    meta = _meta_from_dtypes(all_columns, dtypes, index_cols, column_index_names)

    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(
                pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                index=meta.index,
            )

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if gather_statistics and pf.row_groups:
        stats = []
        if filters is None:
            filters = []
        # make statistics conform in layout
        for (i, row_group) in enumerate(pf.row_groups):
            s = {"num-rows": row_group.num_rows, "columns": []}
            for col in pf.columns:
                d = {"name": col}
                if pf.statistics["min"][col][0] is not None:
                    cs_min = pf.statistics["min"][col][i]
                    cs_max = pf.statistics["max"][col][i]
                    if isinstance(cs_min, np.datetime64):
                        cs_min = pd.Timestamp(cs_min)
                        cs_max = pd.Timestamp(cs_max)
                    d.update(
                        {
                            "min": cs_min,
                            "max": cs_max,
                            "null_count": pf.statistics["null_count"][col][i],
                        }
                    )
                s["columns"].append(d)
            # Need this to filter out partitioned-on categorical columns
            s["filter"] = fastparquet.api.filter_out_cats(row_group, filters)
            stats.append(s)
    else:
        stats = None

    pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
    pf.fmd.row_groups = None

    # Create `parts` (list of row-group-descriptor dicts)
    parts = [
        {
            "piece": rg,
            "kwargs": {"pf": pf, "categories": categories_dict or categories},
        }
        for rg in pf.row_groups
    ]

    return (meta, stats, parts)
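# Hedged end-to-end sketch: the read_metadata/read_partition pair above is what
# dask.dataframe.read_parquet drives when the fastparquet engine is selected.
# The dataset path is hypothetical, and `gather_statistics` is the keyword used
# by the dask releases these snippets come from.
import dask.dataframe as dd

ddf = dd.read_parquet("data/", engine="fastparquet", gather_statistics=True)
# With per-row-group statistics available, dask can set known divisions and
# prune partitions when filtering on the index.
print(ddf.npartitions, ddf.divisions)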
def read_partition(
    cls,
    fs,
    pieces,
    columns,
    index,
    categories=(),
    root_cats=None,
    root_file_scheme=None,
    base_path=None,
    **kwargs,
):
    null_index_name = False
    base_path = False if not root_cats else base_path
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need to use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index

    # Use global `parquet_file` object. Need to reattach
    # the desired row_group
    parquet_file = kwargs.pop("parquet_file", None)

    # Always convert pieces to list
    if not isinstance(pieces, list):
        pieces = [pieces]

    sample = pieces[0]
    if isinstance(sample, tuple):
        if isinstance(sample[0], str):
            # We have paths to read from
            assert parquet_file is None

            row_groups = []
            rg_offset = 0
            parquet_file = ParquetFile(
                [p[0] for p in pieces],
                open_with=fs.open,
                root=base_path or False,
                **kwargs.get("file", {}),
            )
            for piece in pieces:
                _pf = (
                    parquet_file
                    if len(pieces) == 1
                    else ParquetFile(
                        piece[0],
                        open_with=fs.open,
                        root=base_path or False,
                        **kwargs.get("file", {}),
                    )
                )
                n_local_row_groups = len(_pf.row_groups)
                local_rg_indices = piece[1] or list(range(n_local_row_groups))
                row_groups += [
                    parquet_file.row_groups[rg + rg_offset] for rg in local_rg_indices
                ]
                rg_offset += n_local_row_groups
            update_parquet_file = len(row_groups) < len(parquet_file.row_groups)

        elif parquet_file:
            row_groups = []
            for piece in pieces:
                # `piece[1]` will contain actual row-group objects,
                # but they may be pickled
                rgs = piece[0]
                if isinstance(rgs, bytes):
                    rgs = pickle.loads(rgs)
                row_groups += rgs
            update_parquet_file = True

        else:
            raise ValueError("Neither path nor ParquetFile detected!")

        if update_parquet_file:
            with _FP_FILE_LOCK:
                parquet_file.fmd.row_groups = row_groups
                # NOTE: May lose cats after `_set_attrs` call
                save_cats = parquet_file.cats
                parquet_file._set_attrs()
                parquet_file.cats = save_cats

        if null_index_name:
            if "__index_level_0__" in parquet_file.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index

        # Update hive-partitioning information if necessary
        parquet_file.cats = root_cats or {}
        if root_cats:
            parquet_file.file_scheme = root_file_scheme

        parquet_file._dtypes = (
            lambda *args: parquet_file.dtypes
        )  # ugly patch, could be fixed

        if set(columns).issubset(
            parquet_file.columns + list(parquet_file.cats.keys())
        ):
            # Convert ParquetFile to pandas
            return parquet_file.to_pandas(
                columns=columns,
                categories=categories,
                index=index,
            )
        else:
            # Read necessary row-groups and concatenate
            dfs = []
            for row_group in row_groups:
                dfs.append(
                    parquet_file.read_row_group_file(
                        row_group,
                        columns,
                        categories,
                        index=index,
                        **kwargs.get("read", {}),
                    )
                )
            return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

    else:
        # `sample` is NOT a tuple
        raise ValueError(f"Expected tuple, got {type(sample)}")
def _collect_file_parts(
    cls,
    pf_or_files,
    dataset_info_kwargs,
):
    # Collect necessary information from dataset_info
    fs = dataset_info_kwargs["fs"]
    split_row_groups = dataset_info_kwargs["split_row_groups"]
    gather_statistics = dataset_info_kwargs["gather_statistics"]
    stat_col_indices = dataset_info_kwargs["stat_col_indices"]
    filters = dataset_info_kwargs["filters"]
    dtypes = dataset_info_kwargs["dtypes"]
    chunksize = dataset_info_kwargs["chunksize"]
    aggregation_depth = dataset_info_kwargs["aggregation_depth"]
    base_path = dataset_info_kwargs.get("base_path", None)
    root_cats = dataset_info_kwargs.get("root_cats", None)
    root_file_scheme = dataset_info_kwargs.get("root_file_scheme", None)
    has_metadata_file = dataset_info_kwargs["has_metadata_file"]

    # Get ParquetFile
    if not isinstance(pf_or_files, fastparquet.api.ParquetFile):
        # Construct local `ParquetFile` object
        pf = ParquetFile(
            pf_or_files,
            open_with=fs.open,
            root=base_path,
        )
        # Update hive-partitioning to match global cats/scheme
        pf.cats = root_cats or {}
        if root_cats:
            pf.file_scheme = root_file_scheme
    else:
        # We already have a ParquetFile object to work with
        pf = pf_or_files

    # Organize row-groups by file
    (
        file_row_groups,
        file_row_group_stats,
        file_row_group_column_stats,
        gather_statistics,
        base_path,
    ) = cls._organize_row_groups(
        pf,
        split_row_groups,
        gather_statistics,
        stat_col_indices,
        filters,
        dtypes,
        base_path,
        has_metadata_file,
        chunksize,
        aggregation_depth,
    )

    # Convert organized row-groups to parts
    parts, stats = _row_groups_to_parts(
        gather_statistics,
        split_row_groups,
        aggregation_depth,
        file_row_groups,
        file_row_group_stats,
        file_row_group_column_stats,
        stat_col_indices,
        cls._make_part,
        make_part_kwargs={
            "fs": fs,
            "pf": pf,
            "base_path": base_path,
            "partitions": pf.info.get("partitions", None),
        },
    )

    return parts, stats
def _collect_dataset_info(
    cls,
    paths,
    fs,
    categories,
    index,
    gather_statistics,
    filters,
    split_row_groups,
    chunksize,
    aggregate_files,
    ignore_metadata_file,
    metadata_task_size,
    require_extension=(".parq", ".parquet"),
    **kwargs,
):
    # Define the parquet-file (pf) object to use for metadata.
    # Also, initialize `parts`. If `parts` is populated here,
    # then each part will correspond to a file. Otherwise, each part will
    # correspond to a row group (populated later).
    #
    # This logic is mostly to handle `gather_statistics=False` cases,
    # because this also means we should avoid scanning every file in the
    # dataset. If _metadata is available, set `gather_statistics=True`
    # (if `gather_statistics=None`).

    parts = []
    _metadata_exists = False
    if len(paths) == 1 and fs.isdir(paths[0]):
        # This is a directory.
        # Check if _metadata and/or _common_metadata files exist
        base = paths[0]
        _metadata_exists = True
        if not ignore_metadata_file:
            _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"]))

        # Find all files if we are not using a _metadata file
        if ignore_metadata_file or not _metadata_exists:
            # For now, we need to discover every file under paths[0]
            paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs)
            _update_paths = False
            for fn in ["_metadata", "_common_metadata"]:
                try:
                    fns.remove(fn)
                    _update_paths = True
                except ValueError:
                    pass
            if _update_paths:
                paths = [fs.sep.join([base, fn]) for fn in fns]
            _metadata_exists = False
        if _metadata_exists:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(
                fs.sep.join([base, "_metadata"]),
                open_with=fs.open,
                **kwargs,
            )
            if gather_statistics is None:
                gather_statistics = True
        else:
            # Use 0th file
            # Note that "_common_metadata" can cause issues for
            # partitioned datasets.
            if require_extension:
                # Raise error if all files have been filtered by extension
                len0 = len(paths)
                paths = [path for path in paths if path.endswith(require_extension)]
                if len0 and paths == []:
                    raise ValueError(
                        "No files satisfy the `require_extension` criteria "
                        f"(files must end with {require_extension})."
                    )

            pf = ParquetFile(paths[:1], open_with=fs.open, root=base, **kwargs)
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            if not gather_statistics:
                parts = [fs.sep.join([base, fn]) for fn in fns]
    else:
        # This is a list of files
        paths, base, fns = _sort_and_analyze_paths(paths, fs)

        # Check if _metadata is in paths, and
        # remove it if ignore_metadata_file=True
        _metadata_exists = "_metadata" in fns
        if _metadata_exists and ignore_metadata_file:
            fns.remove("_metadata")
            _metadata_exists = False
        paths = [fs.sep.join([base, fn]) for fn in fns]

        if _metadata_exists:
            # We have a _metadata file; let's use it
            pf = ParquetFile(
                fs.sep.join([base, "_metadata"]),
                open_with=fs.open,
                **kwargs,
            )
        else:
            # Rely on metadata for 0th file.
            # Will need to pass a list of paths to read_partition
            scheme = get_file_scheme(fns)
            pf = ParquetFile(paths[:1], open_with=fs.open, root=base, **kwargs)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            if not gather_statistics:
                parts = paths.copy()

    # Check the `aggregate_files` setting
    aggregation_depth = _get_aggregation_depth(
        aggregate_files,
        list(pf.cats),
    )

    # Ensure that there is no overlap between partition columns
    # and explicit columns in `pf`
    if pf.cats:
        _partitions = [p for p in pf.cats if p not in pf.columns]
        if not _partitions:
            pf.cats = {}
        elif len(_partitions) != len(pf.cats):
            raise ValueError(
                "No partition-columns should be written in the \n"
                "file unless they are ALL written in the file.\n"
                "columns: {} | partitions: {}".format(pf.columns, pf.cats.keys())
            )

    return {
        "pf": pf,
        "paths": paths,
        "has_metadata_file": _metadata_exists,
        "parts": parts,
        "base": base,
        "fs": fs,
        "gather_statistics": gather_statistics,
        "categories": categories,
        "index": index,
        "filters": filters,
        "split_row_groups": split_row_groups,
        "chunksize": chunksize,
        "aggregate_files": aggregate_files,
        "aggregation_depth": aggregation_depth,
        "metadata_task_size": metadata_task_size,
        "kwargs": kwargs,
    }
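# Sketch of the hive-partitioned layout that the partition-column checks above
# guard against mixing up, written with fastparquet directly. The output
# directory name and data are hypothetical.
import pandas as pd
import fastparquet

df = pd.DataFrame({"x": range(6), "year": [2020, 2020, 2021, 2021, 2022, 2022]})
fastparquet.write("partitioned_data", df, file_scheme="hive", partition_on=["year"])

pf = fastparquet.ParquetFile("partitioned_data")
print(pf.cats)     # partition values recovered from the year=... directories
print(pf.columns)  # physical columns only; "year" lives in the directory names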