def read_partition(fs, piece, columns, index, categories=(), pf=None, **kwargs):
    if isinstance(index, list):
        columns += index

    if pf:
        df = pf.read_row_group_file(
            piece, columns, categories, index=index, **kwargs.get("read", {})
        )
    else:
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = _paths_to_cats(fns, scheme)
        pf.fn = base
        df = pf.to_pandas(columns, categories, index=index)
    return df
def read_partition(fs, piece, columns, index, categories=(), pf=None, **kwargs):
    if isinstance(index, list):
        columns += index

    if pf is None:
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = _paths_to_cats(fns, scheme)
        pf.fn = base
        return pf.to_pandas(columns, categories, index=index)
    else:
        if isinstance(pf, tuple):
            pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
            pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
            pf.fmd.row_groups = None
        rg_piece = pf.row_groups[piece]
        pf.fmd.key_value_metadata = None
        return pf.read_row_group_file(
            rg_piece, columns, categories, index=index, **kwargs.get("read", {})
        )
def read_partition(
    cls, fs, piece, columns, index, categories=(), pf=None, **kwargs
):
    null_index_name = False
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need to use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index

    if pf is None:
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = paths_to_cats(fns, scheme)
        pf.fn = base
        if null_index_name and "__index_level_0__" in pf.columns:
            # See "Handling a None-labeled index" comment above
            index = ["__index_level_0__"]
            columns += index
        return pf.to_pandas(columns, categories, index=index)
    else:
        if isinstance(pf, tuple):
            if isinstance(pf[0], list):
                pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
            else:
                pf = ParquetFile(
                    pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
                )
            pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
            pf.fmd.row_groups = None
        rg_piece = pf.row_groups[piece]
        if null_index_name:
            if "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
                pf.fmd.key_value_metadata = None
        else:
            pf.fmd.key_value_metadata = None
        return pf.read_row_group_file(
            rg_piece, columns, categories, index=index, **kwargs.get("read", {})
        )
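# For context on the "None-labeled index" branch above: the "pandas" entry in a
# parquet file's key-value metadata records the index either as a serialized
# column or as a RangeIndex descriptor. Abridged, Python-style sketch (the exact
# layout varies by writer version):
#
#     {"index_columns": ["__index_level_0__"], ...}        # index stored as a column
#     {"index_columns": [{"kind": "range", "name": None,
#                         "start": 0, "stop": 100, "step": 1}], ...}  # RangeIndex
#
# Only the RangeIndex case still needs the pandas metadata at read time, which
# is why read_partition clears pf.fmd.key_value_metadata in the other branches.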
def test_file_scheme():
    paths = [None, None]
    assert get_file_scheme(paths) == 'simple'
    paths = []
    assert get_file_scheme(paths) == 'empty'  # this is pointless
    paths = ['file']
    assert get_file_scheme(paths) == 'flat'
    paths = ['file', 'file']
    assert get_file_scheme(paths) == 'flat'
    paths = ['a=1/b=2/file', 'a=2/b=1/file']
    assert get_file_scheme(paths) == 'hive'
    paths = ['a=1/z=2/file', 'a=2/b=6/file']  # note key names do not match
    assert get_file_scheme(paths) == 'drill'
    paths = ['a=1/b=2/file', 'a=2/b/file']
    assert get_file_scheme(paths) == 'drill'
    paths = ['a/b/c/file', 'a/b/file']
    assert get_file_scheme(paths) == 'other'
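# The assertions above pin down how fastparquet classifies a dataset's file
# layout. As a rough illustration (a toy re-implementation, not fastparquet's
# actual get_file_scheme), the rules can be sketched like this; classify_paths
# is a hypothetical name and the logic only mirrors what the test expects.


def classify_paths(paths):
    if not paths:
        return "empty"
    if all(p is None for p in paths):
        # A single-file ("simple") dataset carries no per-row-group file paths.
        return "simple"
    # Keep only the directory components of each path.
    dir_parts = [p.split("/")[:-1] for p in paths]
    if all(len(d) == 0 for d in dir_parts):
        return "flat"
    if len({len(d) for d in dir_parts}) > 1:
        # Inconsistent directory depth: no usable partitioning scheme.
        return "other"
    # Hive-style: every directory level is "key=value" and the key names agree
    # across all paths at each level.
    for level in zip(*dir_parts):
        keys = {part.split("=")[0] if "=" in part else None for part in level}
        if None in keys or len(keys) != 1:
            # Directory-partitioned, but not consistently key=value: "drill".
            return "drill"
    return "hive"


# Re-checking a few of the expectations from test_file_scheme:
assert classify_paths(["a=1/b=2/file", "a=2/b=1/file"]) == "hive"
assert classify_paths(["a=1/z=2/file", "a=2/b=6/file"]) == "drill"
assert classify_paths(["a/b/c/file", "a/b/file"]) == "other"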
def _read_fp_multifile(
    fs, fs_token, paths, columns=None, categories=None, index=None
):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme, join_path

    base, fns = analyse_paths(paths)
    parsed_paths = [join_path(p) for p in paths]
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (
        meta,
        _,
        index_name,
        out_type,
        all_columns,
        index_names,
        storage_name_mapping,
    ) = _pf_validation(pf, columns, index, categories, [])
    name = "read-parquet-" + tokenize(fs_token, paths, all_columns, categories)
    dsk = {
        (name, i): (
            _read_pf_simple,
            fs,
            path,
            base,
            index_names,
            all_columns,
            out_type == Series,
            categories,
            pf.cats,
            pf.file_scheme,
            storage_name_mapping,
        )
        for i, path in enumerate(parsed_paths)
    }
    divisions = (None,) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
def _read_fp_multifile(fs, fs_token, paths, columns=None, categories=None,
                       index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme

    base, fns = analyse_paths(paths)
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (meta, _, index_name, out_type, all_columns, index_names,
     storage_name_mapping) = _pf_validation(
        pf, columns, index, categories, [])
    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, categories)
    dsk = {(name, i): (_read_pf_simple, fs, path, base, index_names,
                       all_columns, out_type == Series, categories, pf.cats,
                       pf.file_scheme, storage_name_mapping)
           for i, path in enumerate(paths)}
    divisions = (None, ) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
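# The dsk mapping built above follows dask's low-level graph convention: each
# key is a (name, partition-index) tuple and each value is a task tuple of
# (callable, *arguments). A minimal standalone example of the same pattern,
# with a toy function and made-up paths (not from the source):
from dask.threaded import get


def load(path):
    return path.upper()


example_paths = ["a.parquet", "b.parquet"]
graph_name = "read-parquet-example"
dsk = {(graph_name, i): (load, path) for i, path in enumerate(example_paths)}

# Computing a single partition of the toy graph runs the corresponding task.
assert get(dsk, (graph_name, 1)) == "B.PARQUET"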
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            pf = ParquetFile(
                paths, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
        else:
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # We have a _metadata file, lets use it
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {})
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(
                    paths[0], open_with=fs.open, **kwargs.get("file", {})
                )
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
    else:
        if fs.isdir(paths[0]):
            # This is a directory, check for _metadata, then _common_metadata
            paths = fs.glob(paths[0] + fs.sep + "*")
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # Using _metadata file (best-case scenario)
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {})
                )
                if gather_statistics is None:
                    gather_statistics = True
            elif gather_statistics is not False:
                # Scan every file
                pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {}))
            else:
                # Use _common_metadata file if it is available.
                # Otherwise, just use 0th file
                if "_common_metadata" in relpaths:
                    pf = ParquetFile(
                        base + fs.sep + "_common_metadata",
                        open_with=fs.open,
                        **kwargs.get("file", {})
                    )
                else:
                    pf = ParquetFile(
                        paths[0], open_with=fs.open, **kwargs.get("file", {})
                    )
                scheme = get_file_scheme(fns)
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
        else:
            # There is only one file to read
            pf = ParquetFile(
                paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )

    return parts, pf, gather_statistics
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).

    The `fast_metadata` output specifies that ParquetFile metadata parsing
    is fast enough for each worker to perform during `read_partition`.
    The value will be set to True if: (1) The path is a directory
    containing _metadata, (2) the path is a list of files containing
    _metadata, (3) there is only one file to read, or (4)
    `gather_statistics` is False.  In other cases, the ParquetFile object
    will need to be stored in the task graph, because metadata parsing is
    too expensive.
    """
    parts = []
    fast_metadata = True
    if len(paths) > 1:
        base, fns = _analyze_paths(paths, fs)
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if "_metadata" not in fns:
                paths_use = paths
                fast_metadata = False
            else:
                paths_use = base + fs.sep + "_metadata"
            pf = ParquetFile(
                paths_use, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
        else:
            if "_metadata" in fns:
                # We have a _metadata file, lets use it
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {})
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(
                    paths[0], open_with=fs.open, **kwargs.get("file", {})
                )
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        paths = fs.glob(paths[0] + fs.sep + "*")
        base, fns = _analyze_paths(paths, fs)
        if "_metadata" in fns:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(
                base + fs.sep + "_metadata",
                open_with=fs.open,
                sep=fs.sep,
                **kwargs.get("file", {})
            )
            if gather_statistics is None:
                gather_statistics = True
        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {}))
            fast_metadata = False
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                pf = ParquetFile(
                    base + fs.sep + "_common_metadata",
                    open_with=fs.open,
                    **kwargs.get("file", {})
                )
            else:
                pf = ParquetFile(
                    paths[0], open_with=fs.open, **kwargs.get("file", {})
                )
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = paths.copy()
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(
            paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
        )

    return parts, pf, gather_statistics, fast_metadata, base
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        paths, base, fns = _sort_and_analyze_paths(paths, fs)
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if "_metadata" not in fns:
                paths_use = paths
            else:
                paths_use = base + fs.sep + "_metadata"
            pf = ParquetFile(
                paths_use, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
        else:
            if "_metadata" in fns:
                # We have a _metadata file, lets use it
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {})
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(
                    paths[0], open_with=fs.open, **kwargs.get("file", {})
                )
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        paths = fs.glob(paths[0] + fs.sep + "*")
        paths, base, fns = _sort_and_analyze_paths(paths, fs)
        if "_metadata" in fns:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(
                base + fs.sep + "_metadata",
                open_with=fs.open,
                sep=fs.sep,
                **kwargs.get("file", {})
            )
            if gather_statistics is None:
                gather_statistics = True
        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths, open_with=fs.open, **kwargs.get("file", {}))
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                pf = ParquetFile(
                    base + fs.sep + "_common_metadata",
                    open_with=fs.open,
                    **kwargs.get("file", {})
                )
                fns.remove("_common_metadata")
            else:
                pf = ParquetFile(
                    paths[0], open_with=fs.open, **kwargs.get("file", {})
                )
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = [fs.sep.join([base, fn]) for fn in fns]
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(
            paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
        )

    # Ensure that there is no overlap between partition columns
    # and explicit columns in `pf`
    if pf.cats:
        _partitions = [p for p in pf.cats if p not in pf.columns]
        if not _partitions:
            pf.cats = {}
        elif len(_partitions) != len(pf.cats):
            raise ValueError(
                "No partition-columns should be written in the \n"
                "file unless they are ALL written in the file.\n"
                "columns: {} | partitions: {}".format(pf.columns, pf.cats.keys())
            )

    return parts, pf, gather_statistics, base
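# A minimal usage sketch for _determine_pf_parts, with hypothetical paths and
# data (assumes fsspec's local filesystem and a dataset already written under
# "./data.parquet" with dask/fastparquet). It just illustrates the shape of the
# return value: `parts` lists file paths when per-file metadata parsing is
# deferred to read_partition (otherwise it stays empty), `pf` is the
# fastparquet ParquetFile used for schema and statistics, and `base` is the
# common directory prefix.
import fsspec

fs = fsspec.filesystem("file")

# Directory input: the function globs the directory and prefers _metadata.
parts, pf, gather_statistics, base = _determine_pf_parts(
    fs, ["./data.parquet"], gather_statistics=None
)

# Explicit file list with gather_statistics=False: metadata is taken from the
# 0th file only, and `parts` carries one path per output partition.
file_paths = fs.glob("./data.parquet/*.parquet")
parts, pf, gather_statistics, base = _determine_pf_parts(
    fs, file_paths, gather_statistics=False
)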
def read_metadata(
    fs,
    paths,
    categories=None,
    index=None,
    gather_statistics=None,
    filters=None,
    **kwargs
):
    if len(paths) > 1:
        if gather_statistics is not False:
            # this scans all the files, allowing index/divisions
            # and filtering
            pf = fastparquet.ParquetFile(
                paths, open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
        else:
            base, fns = analyse_paths(paths)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(paths[0], open_with=fs.open, **kwargs.get("file", {}))
            pf.file_scheme = scheme
            pf.cats = _paths_to_cats(fns, scheme)
            relpath = paths[0].replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                rg.cats = pf.cats
                rg.schema = pf.schema
                for ch in rg.columns:
                    ch.file_path = relpath
    else:
        try:
            pf = fastparquet.ParquetFile(
                paths[0] + fs.sep + "_metadata",
                open_with=fs.open,
                sep=fs.sep,
                **kwargs.get("file", {})
            )
            if gather_statistics is None:
                gather_statistics = True
        except Exception:
            pf = fastparquet.ParquetFile(
                paths[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )

    columns = None
    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata if x.key == "pandas"]
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        index_names = []
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]
    elif len(pandas_md) == 1:
        (
            index_names,
            column_names,
            storage_name_mapping,
            column_index_names,
        ) = _parse_pandas_metadata(json.loads(pandas_md[0]))
        # auto-ranges should not be created by fastparquet
        index_names = [n for n in index_names if n is not None]
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    if index is None and len(index_names) > 0:
        if len(index_names) == 1:
            index = index_names[0]
        else:
            index = index_names

    # Normalize user inputs
    column_names, index_names = _normalize_index_columns(
        columns, column_names, index, index_names
    )
    all_columns = index_names + column_names

    categories_dict = None
    if isinstance(categories, dict):
        categories_dict = categories

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # Check that categories are included in columns
    if categories and not set(categories).intersection(all_columns):
        raise ValueError(
            "categories not in available columns.\n"
            "categories: {} | columns: {}".format(categories, list(all_columns))
        )

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    index_cols = index or ()
    meta = _meta_from_dtypes(all_columns, dtypes, index_cols, column_index_names)

    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(
                pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                index=meta.index,
            )

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if gather_statistics and pf.row_groups:
        stats = []
        if filters is None:
            filters = []
        # make statistics conform in layout
        for (i, row_group) in enumerate(pf.row_groups):
            s = {"num-rows": row_group.num_rows, "columns": []}
            for col in pf.columns:
                d = {"name": col}
                if pf.statistics["min"][col][0] is not None:
                    cs_min = pf.statistics["min"][col][i]
                    cs_max = pf.statistics["max"][col][i]
                    if isinstance(cs_min, np.datetime64):
                        cs_min = pd.Timestamp(cs_min)
                        cs_max = pd.Timestamp(cs_max)
                    d.update(
                        {
                            "min": cs_min,
                            "max": cs_max,
                            "null_count": pf.statistics["null_count"][col][i],
                        }
                    )
                s["columns"].append(d)
            # Need this to filter out partitioned-on categorical columns
            s["filter"] = fastparquet.api.filter_out_cats(row_group, filters)
            stats.append(s)
    else:
        stats = None

    pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
    pf.fmd.row_groups = None

    # Create `parts` (list of row-group-descriptor dicts)
    parts = [
        {
            "piece": rg,
            "kwargs": {"pf": pf, "categories": categories_dict or categories},
        }
        for rg in pf.row_groups
    ]

    return (meta, stats, parts)
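# Shape of the (meta, stats, parts) tuple returned by read_metadata, for
# reference (values below are illustrative, not taken from a real dataset):
#
#   meta  -> empty pandas DataFrame carrying the expected column/index dtypes
#   stats -> one entry per row group when gather_statistics is enabled, e.g.
#            [{"num-rows": 1000,
#              "columns": [{"name": "x", "min": 0, "max": 9, "null_count": 0}],
#              "filter": False},
#             ...]
#            and None otherwise
#   parts -> one row-group descriptor per output partition, e.g.
#            [{"piece": <row-group>, "kwargs": {"pf": pf, "categories": [...]}},
#             ...]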
def _collect_dataset_info(
    cls,
    paths,
    fs,
    categories,
    index,
    gather_statistics,
    filters,
    split_row_groups,
    chunksize,
    aggregate_files,
    ignore_metadata_file,
    metadata_task_size,
    require_extension=(".parq", ".parquet"),
    **kwargs,
):
    # Define the parquet-file (pf) object to use for metadata.
    # Also, initialize `parts`.  If `parts` is populated here,
    # then each part will correspond to a file.  Otherwise, each part will
    # correspond to a row group (populated later).
    #
    # This logic is mostly to handle `gather_statistics=False` cases,
    # because this also means we should avoid scanning every file in the
    # dataset.  If _metadata is available, set `gather_statistics=True`
    # (if `gather_statistics=None`).

    parts = []
    _metadata_exists = False
    if len(paths) == 1 and fs.isdir(paths[0]):

        # This is a directory.
        # Check if _metadata and/or _common_metadata files exists
        base = paths[0]
        _metadata_exists = True
        if not ignore_metadata_file:
            _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"]))

        # Find all files if we are not using a _metadata file
        if ignore_metadata_file or not _metadata_exists:
            # For now, we need to discover every file under paths[0]
            paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs)
            _update_paths = False
            for fn in ["_metadata", "_common_metadata"]:
                try:
                    fns.remove(fn)
                    _update_paths = True
                except ValueError:
                    pass
            if _update_paths:
                paths = [fs.sep.join([base, fn]) for fn in fns]
            _metadata_exists = False
        if _metadata_exists:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(
                fs.sep.join([base, "_metadata"]),
                open_with=fs.open,
                **kwargs,
            )
            if gather_statistics is None:
                gather_statistics = True
        else:
            # Use 0th file
            # Note that "_common_metadata" can cause issues for
            # partitioned datasets.
            if require_extension:
                # Raise error if all files have been filtered by extension
                len0 = len(paths)
                paths = [path for path in paths if path.endswith(require_extension)]
                if len0 and paths == []:
                    raise ValueError(
                        "No files satisfy the `require_extension` criteria "
                        f"(files must end with {require_extension})."
                    )

            pf = ParquetFile(paths[:1], open_with=fs.open, root=base, **kwargs)
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            if not gather_statistics:
                parts = [fs.sep.join([base, fn]) for fn in fns]
    else:
        # This is a list of files
        paths, base, fns = _sort_and_analyze_paths(paths, fs)

        # Check if _metadata is in paths, and
        # remove it if ignore_metadata_file=True
        _metadata_exists = "_metadata" in fns
        if _metadata_exists and ignore_metadata_file:
            fns.remove("_metadata")
            _metadata_exists = False
        paths = [fs.sep.join([base, fn]) for fn in fns]

        if _metadata_exists:
            # We have a _metadata file, lets use it
            pf = ParquetFile(
                fs.sep.join([base, "_metadata"]),
                open_with=fs.open,
                **kwargs,
            )
        else:
            # Rely on metadata for 0th file.
            # Will need to pass a list of paths to read_partition
            scheme = get_file_scheme(fns)
            pf = ParquetFile(paths[:1], open_with=fs.open, root=base, **kwargs)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            if not gather_statistics:
                parts = paths.copy()

    # Check the `aggregate_files` setting
    aggregation_depth = _get_aggregation_depth(
        aggregate_files,
        list(pf.cats),
    )

    # Ensure that there is no overlap between partition columns
    # and explicit columns in `pf`
    if pf.cats:
        _partitions = [p for p in pf.cats if p not in pf.columns]
        if not _partitions:
            pf.cats = {}
        elif len(_partitions) != len(pf.cats):
            raise ValueError(
                "No partition-columns should be written in the \n"
                "file unless they are ALL written in the file.\n"
                "columns: {} | partitions: {}".format(pf.columns, pf.cats.keys())
            )

    return {
        "pf": pf,
        "paths": paths,
        "has_metadata_file": _metadata_exists,
        "parts": parts,
        "base": base,
        "fs": fs,
        "gather_statistics": gather_statistics,
        "categories": categories,
        "index": index,
        "filters": filters,
        "split_row_groups": split_row_groups,
        "chunksize": chunksize,
        "aggregate_files": aggregate_files,
        "aggregation_depth": aggregation_depth,
        "metadata_task_size": metadata_task_size,
        "kwargs": kwargs,
    }
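# Sketch of how a downstream planning step might consume the dataset-info dict
# returned above. The caller below is hypothetical (`engine` stands in for the
# class defining this classmethod, and `my_paths` plus the keyword values are
# made up):
dataset_info = engine._collect_dataset_info(
    my_paths,
    fs,
    categories=None,
    index=None,
    gather_statistics=None,
    filters=None,
    split_row_groups=True,
    chunksize=None,
    aggregate_files=None,
    ignore_metadata_file=False,
    metadata_task_size=0,
)
pf = dataset_info["pf"]                     # fastparquet ParquetFile (schema, stats)
file_parts = dataset_info["parts"]          # non-empty => one part per data file
has_md = dataset_info["has_metadata_file"]  # True when a usable _metadata was found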