Code Example #1
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       pf=None,
                       **kwargs):
        if isinstance(index, list):
            columns += index

        if pf:
            # A ParquetFile was supplied: `piece` is a row-group descriptor to read directly.
            df = pf.read_row_group_file(piece,
                                        columns,
                                        categories,
                                        index=index,
                                        **kwargs.get("read", {}))
        else:
            # No ParquetFile supplied: `piece` is a file path, so rebuild the metadata for it.
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = _paths_to_cats(fns, scheme)
            pf.fn = base
            df = pf.to_pandas(columns, categories, index=index)

        return df
Code Example #2
File: fastparquet.py Project: yetudada/dask
    def read_partition(fs,
                       piece,
                       columns,
                       index,
                       categories=(),
                       pf=None,
                       **kwargs):
        if isinstance(index, list):
            columns += index

        if pf is None:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = _paths_to_cats(fns, scheme)
            pf.fn = base
            return pf.to_pandas(columns, categories, index=index)
        else:
            if isinstance(pf, tuple):
                pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(rg_piece,
                                          columns,
                                          categories,
                                          index=index,
                                          **kwargs.get("read", {}))
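The **kwargs threaded through read_partition are nested by purpose: options under the "read" key go to pf.read_row_group_file, while options under the "file" key are consumed by the ParquetFile constructor (inside _determine_pf_parts in the version above). A sketch of that nesting follows; the inner dicts are left empty on purpose, since the accepted option names depend on the fastparquet version in use.

# Illustrative nesting of the **kwargs consumed above; the empty inner dicts are
# placeholders, not a documented set of options.
kwargs = {
    "file": {},   # forwarded to ParquetFile(...) via kwargs.get("file", {})
    "read": {},   # forwarded to pf.read_row_group_file(...) via kwargs.get("read", {})
}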
Code Example #3
    def read_partition(
        cls, fs, piece, columns, index, categories=(), pf=None, **kwargs
    ):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        if pf is None:
            base, fns = _analyze_paths([piece], fs)
            scheme = get_file_scheme(fns)
            pf = ParquetFile(piece, open_with=fs.open)
            relpath = piece.replace(base, "").lstrip("/")
            for rg in pf.row_groups:
                for ch in rg.columns:
                    ch.file_path = relpath
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            pf.fn = base
            if null_index_name and "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
            return pf.to_pandas(columns, categories, index=index)
        else:
            if isinstance(pf, tuple):
                if isinstance(pf[0], list):
                    pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
                else:
                    pf = ParquetFile(
                        pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
                    )
                pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
                pf.fmd.row_groups = None
            rg_piece = pf.row_groups[piece]
            if null_index_name:
                if "__index_level_0__" in pf.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index
                    pf.fmd.key_value_metadata = None
            else:
                pf.fmd.key_value_metadata = None
            return pf.read_row_group_file(
                rg_piece, columns, categories, index=index, **kwargs.get("read", {})
            )
Code Example #4
def test_file_scheme():
    paths = [None, None]
    assert get_file_scheme(paths) == 'simple'
    paths = []
    assert get_file_scheme(paths) == 'empty'  # this is pointless
    paths = ['file']
    assert get_file_scheme(paths) == 'flat'
    paths = ['file', 'file']
    assert get_file_scheme(paths) == 'flat'
    paths = ['a=1/b=2/file', 'a=2/b=1/file']
    assert get_file_scheme(paths) == 'hive'
    paths = ['a=1/z=2/file', 'a=2/b=6/file']  # note key names do not match
    assert get_file_scheme(paths) == 'drill'
    paths = ['a=1/b=2/file', 'a=2/b/file']
    assert get_file_scheme(paths) == 'drill'
    paths = ['a/b/c/file', 'a/b/file']
    assert get_file_scheme(paths) == 'other'
Code Example #5
File: test_util.py Project: klahnakoski/fastparquet
def test_file_scheme():
    paths = [None, None]
    assert get_file_scheme(paths) == 'simple'
    paths = []
    assert get_file_scheme(paths) == 'empty'  # this is pointless
    paths = ['file']
    assert get_file_scheme(paths) == 'flat'
    paths = ['file', 'file']
    assert get_file_scheme(paths) == 'flat'
    paths = ['a=1/b=2/file', 'a=2/b=1/file']
    assert get_file_scheme(paths) == 'hive'
    paths = ['a=1/z=2/file', 'a=2/b=6/file']  # note key names do not match
    assert get_file_scheme(paths) == 'drill'
    paths = ['a=1/b=2/file', 'a=2/b/file']
    assert get_file_scheme(paths) == 'drill'
    paths = ['a/b/c/file', 'a/b/file']
    assert get_file_scheme(paths) == 'other'
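The tests above pin down what get_file_scheme returns for each directory layout. A small usage sketch distilled from those assertions, assuming fastparquet is installed and get_file_scheme is imported from fastparquet.util (as in Code Example #6):

from fastparquet.util import get_file_scheme

# Relative file names under the dataset root determine the partitioning scheme.
print(get_file_scheme(['file']))                          # 'flat'  (no directories)
print(get_file_scheme(['a=1/b=2/file', 'a=2/b=1/file']))  # 'hive'  (consistent key=value dirs)
print(get_file_scheme(['a=1/z=2/file', 'a=2/b=6/file']))  # 'drill' (key names differ)
print(get_file_scheme(['a/b/c/file', 'a/b/file']))        # 'other' (unequal, non-key directories)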
Code Example #6
File: parquet.py Project: zhch158/dask
def _read_fp_multifile(fs,
                       fs_token,
                       paths,
                       columns=None,
                       categories=None,
                       index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme, join_path

    base, fns = analyse_paths(paths)
    parsed_paths = [join_path(p) for p in paths]
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (
        meta,
        _,
        index_name,
        out_type,
        all_columns,
        index_names,
        storage_name_mapping,
    ) = _pf_validation(pf, columns, index, categories, [])
    name = "read-parquet-" + tokenize(fs_token, paths, all_columns, categories)
    dsk = {
        (name, i): (
            _read_pf_simple,
            fs,
            path,
            base,
            index_names,
            all_columns,
            out_type == Series,
            categories,
            pf.cats,
            pf.file_scheme,
            storage_name_mapping,
        )
        for i, path in enumerate(parsed_paths)
    }
    divisions = (None, ) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
Code Example #7
File: parquet.py Project: mrocklin/dask
def _read_fp_multifile(fs, fs_token, paths, columns=None,
                       categories=None, index=None):
    """Read dataset with fastparquet by assuming metadata from first file"""
    from fastparquet import ParquetFile
    from fastparquet.util import analyse_paths, get_file_scheme
    base, fns = analyse_paths(paths)
    scheme = get_file_scheme(fns)
    pf = ParquetFile(paths[0], open_with=fs.open)
    pf.file_scheme = scheme
    pf.cats = _paths_to_cats(fns, scheme)
    (meta, _, index_name, out_type, all_columns, index_names,
     storage_name_mapping) = _pf_validation(
        pf, columns, index, categories, [])
    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns,
                                      categories)
    dsk = {(name, i): (_read_pf_simple, fs, path, base,
                       index_names, all_columns, out_type == Series,
                       categories, pf.cats,
                       pf.file_scheme, storage_name_mapping)
           for i, path in enumerate(paths)}
    divisions = (None, ) * (len(paths) + 1)
    return out_type(dsk, name, meta, divisions)
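Both versions of _read_fp_multifile follow the same pattern: split the paths into a common base and relative names, infer the directory scheme, open a ParquetFile for the first file only, and graft the dataset-level scheme onto it. A minimal sketch of that pattern on a local dataset; the ./data directory and the fsspec usage are assumptions for illustration, and the dask-internal _paths_to_cats step is omitted.

import fsspec
from fastparquet import ParquetFile
from fastparquet.util import analyse_paths, get_file_scheme

fs = fsspec.filesystem("file")
paths = sorted(p for p in fs.find("data") if p.endswith(".parquet"))

base, fns = analyse_paths(paths)               # common base + relative file names
scheme = get_file_scheme(fns)                  # 'flat', 'hive', 'drill', ...
pf = ParquetFile(paths[0], open_with=fs.open)  # metadata taken from the first file only
pf.file_scheme = scheme                        # assume the rest of the dataset matches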
Code Example #8
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            pf = ParquetFile(paths,
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
        else:
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # We have a _metadata file, lets use it
                pf = ParquetFile(base + fs.sep + "_metadata",
                                 open_with=fs.open,
                                 sep=fs.sep,
                                 **kwargs.get("file", {}))
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
    else:
        if fs.isdir(paths[0]):
            # This is a directory, check for _metadata, then _common_metadata
            paths = fs.glob(paths[0] + fs.sep + "*")
            base, fns = _analyze_paths(paths, fs)
            relpaths = [path.replace(base, "").lstrip("/") for path in paths]
            if "_metadata" in relpaths:
                # Using _metadata file (best-case scenario)
                pf = ParquetFile(base + fs.sep + "_metadata",
                                 open_with=fs.open,
                                 sep=fs.sep,
                                 **kwargs.get("file", {}))
                if gather_statistics is None:
                    gather_statistics = True

            elif gather_statistics is not False:
                # Scan every file
                pf = ParquetFile(paths,
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            else:
                # Use _common_metadata file if it is available.
                # Otherwise, just use 0th file
                if "_common_metadata" in relpaths:
                    pf = ParquetFile(base + fs.sep + "_common_metadata",
                                     open_with=fs.open,
                                     **kwargs.get("file", {}))
                else:
                    pf = ParquetFile(paths[0],
                                     open_with=fs.open,
                                     **kwargs.get("file", {}))
                scheme = get_file_scheme(fns)
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                parts = paths.copy()
        else:
            # There is only one file to read
            pf = ParquetFile(paths[0],
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))

    return parts, pf, gather_statistics
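The directory branch above boils down to a precedence rule: prefer _metadata, otherwise scan every file when statistics are wanted, otherwise fall back to _common_metadata, and finally to the first data file. A standalone sketch of that precedence, assuming fs is an fsspec filesystem and root is a dataset directory; this illustrates the branch logic only and is not dask API.

from fastparquet import ParquetFile

def pick_metadata_source(fs, root, gather_statistics=None):
    # Sketch of the directory branch of _determine_pf_parts above.
    paths = fs.glob(root + fs.sep + "*")
    relpaths = [path.replace(root, "").lstrip("/") for path in paths]
    if "_metadata" in relpaths:
        # Best case: a dataset-level _metadata footer covering every row group.
        return ParquetFile(root + fs.sep + "_metadata", open_with=fs.open, sep=fs.sep)
    if gather_statistics is not False:
        # No _metadata, but statistics are wanted: scan every file.
        return ParquetFile(paths, open_with=fs.open)
    if "_common_metadata" in relpaths:
        # Schema only; cheap to open, but carries no row-group statistics.
        return ParquetFile(root + fs.sep + "_common_metadata", open_with=fs.open)
    # Last resort: take the schema from the first data file.
    return ParquetFile(paths[0], open_with=fs.open)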
Code Example #9
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """ Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).

    The `fast_metadata` output specifies that ParquetFile metadata parsing
    is fast enough for each worker to perform during `read_partition`. The
    value will be set to True if: (1) The path is a directory containing
    _metadata, (2) the path is a list of files containing _metadata, (3)
    there is only one file to read, or (4) `gather_statistics` is False.
    In other cases, the ParquetFile object will need to be stored in the
    task graph, because metadata parsing is too expensive.
    """
    parts = []
    fast_metadata = True
    if len(paths) > 1:
        base, fns = _analyze_paths(paths, fs)
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if "_metadata" not in fns:
                paths_use = paths
                fast_metadata = False
            else:
                paths_use = base + fs.sep + "_metadata"
            pf = ParquetFile(paths_use,
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
        else:
            if "_metadata" in fns:
                # We have a _metadata file, lets use it
                pf = ParquetFile(base + fs.sep + "_metadata",
                                 open_with=fs.open,
                                 sep=fs.sep,
                                 **kwargs.get("file", {}))
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        paths = fs.glob(paths[0] + fs.sep + "*")
        base, fns = _analyze_paths(paths, fs)
        if "_metadata" in fns:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(base + fs.sep + "_metadata",
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
            if gather_statistics is None:
                gather_statistics = True

        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths,
                             open_with=fs.open,
                             **kwargs.get("file", {}))
            fast_metadata = False
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                pf = ParquetFile(base + fs.sep + "_common_metadata",
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            else:
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = paths.copy()
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(paths[0],
                         open_with=fs.open,
                         sep=fs.sep,
                         **kwargs.get("file", {}))

    return parts, pf, gather_statistics, fast_metadata, base
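The fast_metadata conditions listed in the docstring collapse into a single rule: per-worker metadata parsing is acceptable whenever the code did not have to scan every data file for metadata. A sketch of that rule with hypothetical boolean inputs:

def fast_metadata_rule(has_metadata_file, single_file, gather_statistics):
    # Hypothetical condensation of the docstring above: metadata parsing is cheap
    # enough to repeat in read_partition unless every data file had to be scanned.
    return has_metadata_file or single_file or gather_statistics is False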
Code Example #10
File: fastparquet.py Project: peterpanmj/dask
def _determine_pf_parts(fs, paths, gather_statistics, **kwargs):
    """Determine how to access metadata and break read into ``parts``

    This logic is mostly to handle `gather_statistics=False` cases,
    because this also means we should avoid scanning every file in the
    dataset.  If _metadata is available, set `gather_statistics=True`
    (if `gather_statistics=None`).
    """
    parts = []
    if len(paths) > 1:
        paths, base, fns = _sort_and_analyze_paths(paths, fs)
        if gather_statistics is not False:
            # This scans all the files, allowing index/divisions
            # and filtering
            if "_metadata" not in fns:
                paths_use = paths
            else:
                paths_use = base + fs.sep + "_metadata"
            pf = ParquetFile(paths_use,
                             open_with=fs.open,
                             sep=fs.sep,
                             **kwargs.get("file", {}))
        else:
            if "_metadata" in fns:
                # We have a _metadata file, lets use it
                pf = ParquetFile(
                    base + fs.sep + "_metadata",
                    open_with=fs.open,
                    sep=fs.sep,
                    **kwargs.get("file", {}),
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                parts = paths.copy()
    elif fs.isdir(paths[0]):
        # This is a directory, check for _metadata, then _common_metadata
        paths = fs.glob(paths[0] + fs.sep + "*")
        paths, base, fns = _sort_and_analyze_paths(paths, fs)
        if "_metadata" in fns:
            # Using _metadata file (best-case scenario)
            pf = ParquetFile(
                base + fs.sep + "_metadata",
                open_with=fs.open,
                sep=fs.sep,
                **kwargs.get("file", {}),
            )
            if gather_statistics is None:
                gather_statistics = True

        elif gather_statistics is not False:
            # Scan every file
            pf = ParquetFile(paths,
                             open_with=fs.open,
                             **kwargs.get("file", {}))
        else:
            # Use _common_metadata file if it is available.
            # Otherwise, just use 0th file
            if "_common_metadata" in fns:
                pf = ParquetFile(
                    base + fs.sep + "_common_metadata",
                    open_with=fs.open,
                    **kwargs.get("file", {}),
                )
                fns.remove("_common_metadata")
            else:
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
            scheme = get_file_scheme(fns)
            pf.file_scheme = scheme
            pf.cats = paths_to_cats(fns, scheme)
            parts = [fs.sep.join([base, fn]) for fn in fns]
    else:
        # There is only one file to read
        base = None
        pf = ParquetFile(paths[0],
                         open_with=fs.open,
                         sep=fs.sep,
                         **kwargs.get("file", {}))

    # Ensure that there is no overlap between partition columns
    # and explicit columns in `pf`
    if pf.cats:
        _partitions = [p for p in pf.cats if p not in pf.columns]
        if not _partitions:
            pf.cats = {}
        elif len(_partitions) != len(pf.cats):
            raise ValueError("No partition-columns should be written in the \n"
                             "file unless they are ALL written in the file.\n"
                             "columns: {} | partitions: {}".format(
                                 pf.columns, pf.cats.keys()))

    return parts, pf, gather_statistics, base
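The trailing check added in this version rejects datasets where only some partition columns are also stored as physical columns: if all of them are, pf.cats is cleared; if none are, it is kept; anything in between raises. A self-contained illustration of the rule with hypothetical column and partition names:

def check_partition_overlap(columns, cats):
    # Same rule as the end of _determine_pf_parts above; inputs are hypothetical.
    not_in_file = [p for p in cats if p not in columns]
    if not not_in_file:
        return {}        # every partition column is also a file column: drop cats
    if len(not_in_file) != len(cats):
        raise ValueError("No partition-columns should be written in the file "
                         "unless they are ALL written in the file.")
    return cats          # no overlap at all: keep the inferred partitions

check_partition_overlap(["x", "y"], {"a": [1, 2]})           # {'a': [1, 2]}
check_partition_overlap(["x", "a"], {"a": [1, 2]})           # {}
# check_partition_overlap(["x", "a"], {"a": [1], "b": [2]})  # would raise ValueError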
Code Example #11
File: fastparquet.py Project: xvr-hlt/dask
    def read_metadata(fs,
                      paths,
                      categories=None,
                      index=None,
                      gather_statistics=None,
                      filters=None,
                      **kwargs):
        if len(paths) > 1:
            if gather_statistics is not False:
                # this scans all the files, allowing index/divisions
                # and filtering
                pf = fastparquet.ParquetFile(paths,
                                             open_with=fs.open,
                                             sep=fs.sep,
                                             **kwargs.get("file", {}))
            else:
                base, fns = analyse_paths(paths)
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[0],
                                 open_with=fs.open,
                                 **kwargs.get("file", {}))
                pf.file_scheme = scheme
                pf.cats = _paths_to_cats(fns, scheme)
                relpath = paths[0].replace(base, "").lstrip("/")  # relative path of the 0th file
                for rg in pf.row_groups:
                    rg.cats = pf.cats
                    rg.schema = pf.schema
                    for ch in rg.columns:
                        ch.file_path = relpath
        else:
            try:
                pf = fastparquet.ParquetFile(paths[0] + fs.sep + "_metadata",
                                             open_with=fs.open,
                                             sep=fs.sep,
                                             **kwargs.get("file", {}))
                if gather_statistics is None:
                    gather_statistics = True
            except Exception:
                pf = fastparquet.ParquetFile(paths[0],
                                             open_with=fs.open,
                                             sep=fs.sep,
                                             **kwargs.get("file", {}))

        columns = None
        if pf.fmd.key_value_metadata:
            pandas_md = [
                x.value for x in pf.fmd.key_value_metadata if x.key == "pandas"
            ]
        else:
            pandas_md = []

        if len(pandas_md) == 0:
            index_names = []
            column_names = pf.columns + list(pf.cats)
            storage_name_mapping = {k: k for k in column_names}
            column_index_names = [None]

        elif len(pandas_md) == 1:
            (
                index_names,
                column_names,
                storage_name_mapping,
                column_index_names,
            ) = _parse_pandas_metadata(json.loads(pandas_md[0]))
            #  auto-ranges should not be created by fastparquet
            index_names = [n for n in index_names if n is not None]
            column_names.extend(pf.cats)

        else:
            raise ValueError("File has multiple entries for 'pandas' metadata")

        if index is None and len(index_names) > 0:
            if len(index_names) == 1:
                index = index_names[0]
            else:
                index = index_names

        # Normalize user inputs
        column_names, index_names = _normalize_index_columns(
            columns, column_names, index, index_names)

        all_columns = index_names + column_names

        categories_dict = None
        if isinstance(categories, dict):
            categories_dict = categories

        if categories is None:
            categories = pf.categories
        elif isinstance(categories, string_types):
            categories = [categories]
        else:
            categories = list(categories)

        # Check that categories are included in columns
        if categories and not set(categories).intersection(all_columns):
            raise ValueError("categories not in available columns.\n"
                             "categories: {} | columns: {}".format(
                                 categories, list(all_columns)))

        dtypes = pf._dtypes(categories)
        dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

        index_cols = index or ()
        meta = _meta_from_dtypes(all_columns, dtypes, index_cols,
                                 column_index_names)

        # fastparquet doesn't handle multiindex
        if len(index_names) > 1:
            raise ValueError("Cannot read DataFrame with MultiIndex.")

        for cat in categories:
            if cat in meta:
                meta[cat] = pd.Series(
                    pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                    index=meta.index,
                )

        for catcol in pf.cats:
            if catcol in meta.columns:
                meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
            elif meta.index.name == catcol:
                meta.index = meta.index.set_categories(pf.cats[catcol])

        if gather_statistics and pf.row_groups:
            stats = []
            if filters is None:
                filters = []
            # make statistics conform in layout
            for (i, row_group) in enumerate(pf.row_groups):
                s = {"num-rows": row_group.num_rows, "columns": []}
                for col in pf.columns:
                    d = {"name": col}
                    if pf.statistics["min"][col][0] is not None:
                        cs_min = pf.statistics["min"][col][i]
                        cs_max = pf.statistics["max"][col][i]
                        if isinstance(cs_min, np.datetime64):
                            cs_min = pd.Timestamp(cs_min)
                            cs_max = pd.Timestamp(cs_max)
                        d.update({
                            "min": cs_min,
                            "max": cs_max,
                            "null_count": pf.statistics["null_count"][col][i],
                        })
                    s["columns"].append(d)
                # Need this to filter out partitioned-on categorical columns
                s["filter"] = fastparquet.api.filter_out_cats(
                    row_group, filters)
                stats.append(s)

        else:
            stats = None

        pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
        pf.fmd.row_groups = None

        # Create `parts` (list of row-group-descriptor dicts)
        parts = [{
            "piece": rg,
            "kwargs": {
                "pf": pf,
                "categories": categories_dict or categories
            },
        } for rg in pf.row_groups]

        return (meta, stats, parts)
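For orientation, each element of the stats list assembled above describes one row group and looks roughly like the following; the values here are invented, and columns without statistics carry only their name.

# Illustrative layout of one element of `stats` built above (values are invented).
example_row_group_stats = {
    "num-rows": 5000,
    "columns": [
        {"name": "x", "min": 0, "max": 499, "null_count": 0},
        {"name": "y"},   # no statistics recorded for this column
    ],
    "filter": False,     # result of fastparquet.api.filter_out_cats(row_group, filters)
}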
Code Example #12
File: fastparquet.py Project: vijaykriishna/dask
    def _collect_dataset_info(
            cls,
            paths,
            fs,
            categories,
            index,
            gather_statistics,
            filters,
            split_row_groups,
            chunksize,
            aggregate_files,
            ignore_metadata_file,
            metadata_task_size,
            require_extension=(".parq", ".parquet"),
            **kwargs,
    ):

        # Define the parquet-file (pf) object to use for metadata,
        # Also, initialize `parts`.  If `parts` is populated here,
        # then each part will correspond to a file.  Otherwise, each part will
        # correspond to a row group (populated later).
        #
        # This logic is mostly to handle `gather_statistics=False` cases,
        # because this also means we should avoid scanning every file in the
        # dataset.  If _metadata is available, set `gather_statistics=True`
        # (if `gather_statistics=None`).

        parts = []
        _metadata_exists = False
        if len(paths) == 1 and fs.isdir(paths[0]):

            # This is a directory.
            # Check if _metadata and/or _common_metadata files exists
            base = paths[0]
            _metadata_exists = True
            if not ignore_metadata_file:
                _metadata_exists = fs.isfile(fs.sep.join([base, "_metadata"]))

            # Find all files if we are not using a _metadata file
            if ignore_metadata_file or not _metadata_exists:
                # For now, we need to discover every file under paths[0]
                paths, base, fns = _sort_and_analyze_paths(fs.find(base), fs)
                _update_paths = False
                for fn in ["_metadata", "_common_metadata"]:
                    try:
                        fns.remove(fn)
                        _update_paths = True
                    except ValueError:
                        pass
                if _update_paths:
                    paths = [fs.sep.join([base, fn]) for fn in fns]
                _metadata_exists = False
            if _metadata_exists:
                # Using _metadata file (best-case scenario)
                pf = ParquetFile(
                    fs.sep.join([base, "_metadata"]),
                    open_with=fs.open,
                    **kwargs,
                )
                if gather_statistics is None:
                    gather_statistics = True
            else:
                # Use 0th file
                # Note that "_common_metadata" can cause issues for
                # partitioned datasets.
                if require_extension:
                    # Raise error if all files have been filtered by extension
                    len0 = len(paths)
                    paths = [
                        path for path in paths
                        if path.endswith(require_extension)
                    ]
                    if len0 and paths == []:
                        raise ValueError(
                            "No files satisfy the `require_extension` criteria "
                            f"(files must end with {require_extension}).")
                pf = ParquetFile(paths[:1],
                                 open_with=fs.open,
                                 root=base,
                                 **kwargs)
                scheme = get_file_scheme(fns)
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                if not gather_statistics:
                    parts = [fs.sep.join([base, fn]) for fn in fns]
        else:
            # This is a list of files
            paths, base, fns = _sort_and_analyze_paths(paths, fs)

            # Check if _metadata is in paths, and
            # remove it if ignore_metadata_file=True
            _metadata_exists = "_metadata" in fns
            if _metadata_exists and ignore_metadata_file:
                fns.remove("_metadata")
                _metadata_exists = False
            paths = [fs.sep.join([base, fn]) for fn in fns]

            if _metadata_exists:
                # We have a _metadata file, lets use it
                pf = ParquetFile(
                    fs.sep.join([base, "_metadata"]),
                    open_with=fs.open,
                    **kwargs,
                )
            else:
                # Rely on metadata for 0th file.
                # Will need to pass a list of paths to read_partition
                scheme = get_file_scheme(fns)
                pf = ParquetFile(paths[:1],
                                 open_with=fs.open,
                                 root=base,
                                 **kwargs)
                pf.file_scheme = scheme
                pf.cats = paths_to_cats(fns, scheme)
                if not gather_statistics:
                    parts = paths.copy()

        # Check the `aggregate_files` setting
        aggregation_depth = _get_aggregation_depth(
            aggregate_files,
            list(pf.cats),
        )

        # Ensure that there is no overlap between partition columns
        # and explicit columns in `pf`
        if pf.cats:
            _partitions = [p for p in pf.cats if p not in pf.columns]
            if not _partitions:
                pf.cats = {}
            elif len(_partitions) != len(pf.cats):
                raise ValueError(
                    "No partition-columns should be written in the \n"
                    "file unless they are ALL written in the file.\n"
                    "columns: {} | partitions: {}".format(
                        pf.columns, pf.cats.keys()))

        return {
            "pf": pf,
            "paths": paths,
            "has_metadata_file": _metadata_exists,
            "parts": parts,
            "base": base,
            "fs": fs,
            "gather_statistics": gather_statistics,
            "categories": categories,
            "index": index,
            "filters": filters,
            "split_row_groups": split_row_groups,
            "chunksize": chunksize,
            "aggregate_files": aggregate_files,
            "aggregation_depth": aggregation_depth,
            "metadata_task_size": metadata_task_size,
            "kwargs": kwargs,
        }
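One small detail in the branch above: require_extension defaults to a tuple, which works because str.endswith accepts a tuple of suffixes. A minimal illustration with made-up file names:

paths = ["part.0.parquet", "part.1.parq", "_SUCCESS"]   # made-up file names
require_extension = (".parq", ".parquet")
print([p for p in paths if p.endswith(require_extension)])
# ['part.0.parquet', 'part.1.parq']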