Example #1
def _pf_validation(pf, columns, index, categories, filters):
    """Validate user options against metadata in dataset

     columns, index and categories must be in the list of columns available
     (both data columns and path-based partitioning - subject to possible
     renaming, if pandas metadata is present). The output index will
     be inferred from any available pandas metadata, if not given.
     """
    from fastparquet.util import check_column_names
    check_column_names(pf.columns, categories)
    check_column_names(pf.columns + list(pf.cats or []), columns)
    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata
                     if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0]))
        )
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs

    if filters is None:
        filters = []

    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])

    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(pd.Categorical([],
                                                 categories=[UNKNOWN_CATEGORIES]),
                                  index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]
    return (meta, filters, index_names, out_type, all_columns, index_names,
            storage_name_mapping)
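
For reference, a minimal sketch of the 'pandas' key-value metadata that _pf_validation parses above (assuming a local dataset 'example.parq' written with pandas metadata; the file name is illustrative):

import json
import fastparquet

pf = fastparquet.ParquetFile('example.parq')
pandas_md = [kv.value for kv in (pf.fmd.key_value_metadata or [])
             if kv.key == 'pandas']
if pandas_md:
    md = json.loads(pandas_md[0])
    print(md['index_columns'])                 # columns stored as the index
    print([c['name'] for c in md['columns']])  # logical column names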
Example #2
def read_parquet(path,
                 columns=None,
                 filters=None,
                 categories=None,
                 index=None,
                 storage_options=None):
    """
    Read ParquetFile into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition.  It selects the index among the sorted columns if any exist.

    This uses the fastparquet project: http://fastparquet.readthedocs.io/en/latest

    Parameters
    ----------
    path : string
        Source directory for data.
        Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data.
    columns: list or None
        List of column names to load
    filters: list
        List of filters to apply, like ``[('x', '>', 0), ...]``
    index: string or None
        Name of index column to use if that column is sorted
    categories: list or None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
    storage_options : dict
        Key/value pairs to be passed on to the file-system backend, if any.

    Examples
    --------
    >>> df = read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    """
    if fastparquet is False:
        raise ImportError("fastparquet not installed")
    if filters is None:
        filters = []
    myopen = OpenFileCreator(path,
                             compression=None,
                             text=False,
                             **(storage_options or {}))

    if isinstance(columns, list):
        columns = tuple(columns)

    try:
        pf = fastparquet.ParquetFile(path + myopen.fs.sep + '_metadata',
                                     open_with=myopen,
                                     sep=myopen.fs.sep)
    except Exception:
        pf = fastparquet.ParquetFile(path, open_with=myopen, sep=myopen.fs.sep)

    check_column_names(pf.columns, categories)
    categories = categories or []
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [
        rg for rg in pf.row_groups
        if not (fastparquet.api.filter_out_stats(rg, filters, pf.helper))
        and not (fastparquet.api.filter_out_cats(rg, filters))
    ]

    # Find an index among the partially sorted columns
    minmax = fastparquet.api.sorted_partitioned_columns(pf)

    if index is False:
        index_col = None
    elif len(minmax) == 1:
        index_col = first(minmax)
    elif len(minmax) > 1:
        if index:
            index_col = index
        elif 'index' in minmax:
            index_col = 'index'
        else:
            raise ValueError("Multiple possible indexes exist: %s.  "
                             "Please select one with index='index-name'" %
                             sorted(minmax))
    else:
        index_col = None

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns, )
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col, )

    dtypes = {
        k: ('category' if k in categories else v)
        for k, v in pf.dtypes.items() if k in all_columns
    }

    meta = pd.DataFrame(
        {c: pd.Series([], dtype=d)
         for (c, d) in dtypes.items()},
        columns=[c for c in pf.columns if c in dtypes])
    meta = meta[list(all_columns)]

    for cat in categories:
        meta[cat] = pd.Series(
            pd.Categorical([], categories=[UNKNOWN_CATEGORIES]))

    if index_col:
        meta = meta.set_index(index_col)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i):
           (_read_parquet_row_group, myopen, pf.row_group_filename(rg),
            index_col, all_columns, rg, out_type == Series, categories,
            pf.helper, pf.cats, pf.dtypes)
           for i, rg in enumerate(rgs)}

    if index_col:
        divisions = list(
            minmax[index_col]['min']) + [minmax[index_col]['max'][-1]]
    else:
        divisions = (None, ) * (len(rgs) + 1)

    return out_type(dsk, name, meta, divisions)
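
A hypothetical call matching the docstring above (the path, column names, filter and storage option are illustrative only, not from the source):

df = read_parquet('s3://bucket/my-parquet-data',
                  columns=['x', 'y'],
                  filters=[('x', '>', 0)],
                  categories=['y'],
                  index='timestamp',
                  storage_options={'anon': True})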
Example #3
def parquet_to_databuffers(filename,
                           x,
                           y,
                           category,
                           width=512,
                           height=None,
                           xmin=None,
                           ymin=None,
                           xmax=None,
                           ymax=None,
                           projection=None):
    proj = lambda x, y, inverse: (x, y)
    root, ext = os.path.splitext(filename)
    if ext != '.parq':
        raise ValueError('Expected a .parq file, got ({}) {}'.format(
            ext, filename))

    pf = fastparquet.ParquetFile(filename)
    check_column_names(pf.columns,
                       [x, y, category])  # raise if columns not there
    description = {'source': {"filename": filename, "type": "parquet"}}
    if projection:
        description['projection'] = {"type": projection}
        proj = pyproj.Proj(init=projection, preserve_units=True)

    stats = pf.statistics
    if 'max' in stats:
        if xmax is None:
            xmax = np.max(stats['max'][x])
        if ymax is None:
            ymax = np.max(stats['max'][y])
    if 'min' in stats:
        if xmin is None:
            xmin = np.min(stats['min'][x])
        if ymin is None:
            ymin = np.min(stats['min'][y])
    if xmin is None or xmax is None or ymin is None or ymax is None:
        compute_xmin = (xmin is None)
        compute_xmax = (xmax is None)
        compute_ymin = (ymin is None)
        compute_ymax = (ymax is None)
        print('Incomplete stats, computing min/max')
        for df in pf.iter_row_groups(columns=[x, y]):
            if compute_xmin:
                if xmin is None:
                    xmin = df[x].min()
                else:
                    xmin = np.min([xmin, df[x].min()])
            if compute_ymin:
                if ymin is None:
                    ymin = df[y].min()
                else:
                    ymin = np.min([ymin, df[y].min()])
            if compute_xmax:
                if xmax is None:
                    xmax = df[x].max()
                else:
                    xmax = np.max([xmax, df[x].max()])
            if compute_ymax:
                if ymax is None:
                    ymax = df[y].max()
                else:
                    ymax = np.max([ymax, df[y].max()])
    xy_range = [[float(xmin), float(xmax)], [float(ymin), float(ymax)]]
    if ymax == ymin or xmax == xmin:
        raise ValueError('Invalid bounds: {}'.format(xy_range))
    if height is None:
        ratio = (ymax - ymin) / (xmax - xmin)
        height = int(width * ratio)
    bins = (width, height)
    # description['bounds'] = xy_range
    # description['bins'] = list(bins)
    print('Range: %s, bins: %s' % (xy_range, bins))
    histograms = {}
    counts = {}

    for df in pf.iter_row_groups(columns=[x, y, category],
                                 categories=[category]):
        print('Accessing row_group len=%d' % len(df))
        values = df[category].cat.categories
        cat_column = df[category]
        for i, cat in enumerate(values):
            df_cat = df.loc[cat_column == cat, [x, y]]
            (histo, xedges, yedges) = np.histogram2d(df_cat[x],
                                                     df_cat[y],
                                                     density=False,
                                                     bins=bins,
                                                     range=xy_range)
            if isinstance(bins, list):
                if (xedges != bins[0]).any():
                    print('X Edges differ: %s' % xedges)
                    bins = [xedges, yedges]
                if (yedges != bins[1]).any():
                    print('Y Edges differ: %s' % yedges)
                    bins = [xedges, yedges]
            else:
                bins = [xedges, yedges]
            if isinstance(cat, str):
                key = cat
            else:
                key = i + 1
            if key in histograms:
                histograms[key] += histo
            else:
                histograms[key] = histo
            counts[key] = len(df_cat) + counts.get(key, 0)

    if projection:
        xmin, ymin = proj(xmin, ymin, inverse=True)
        xmax, ymax = proj(xmax, ymax, inverse=True)
        xtype = "latitude"
        ytype = "longitude"
    else:
        xtype = "quantitative"
        ytype = "quantitative"

    description['encoding'] = {
        "x": {
            "field": x,
            "type": xtype,
            "bin": {
                "maxbins": width
            },
            "aggregate": "count",
            "scale": {
                "domain": [xmin, xmax],
                "range": [0, width]
            }
        },
        "y": {
            "field": y,
            "type": ytype,
            "bin": {
                "maxbins": height
            },
            "aggregate": "count",
            "scale": {
                "domain": [ymin, ymax],
                "range": [0, height]
            }
        },
        "z": {
            "field": category,
            "type": "nominal",  # or ordinal
            "scale": {
                "domain": list(histograms.keys())
            }
        }
    }

    print('Writing files')
    count = 0
    buffers = []
    for (key, histo) in histograms.items():
        histo = histo.T
        hmin = np.min(histo)
        hmax = np.max(histo)
        outfile = root + '_cat_%s.json' % key
        with open(outfile, 'w') as outf:
            json.dump(histo.tolist(), outf)
        data = {
            'url': outfile,
            'count': counts[key],
            'value': key,
            'range': [int(hmin), int(hmax)]
        }
        buffers.append(data)
        count += counts[key]
    description['buffers'] = buffers
    description['source']['rows'] = count
    with open(root + '_data.json', 'w') as outf:
        json.dump(description, outf, indent=2)
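
The per-category, per-row-group loop above boils down to summing np.histogram2d outputs over chunks; a self-contained sketch with synthetic data standing in for the parquet row groups:

import numpy as np

rng = np.random.default_rng(0)
bins = (512, 256)
xy_range = [[0.0, 1.0], [0.0, 1.0]]
total = np.zeros(bins)
for _ in range(4):                     # four fake "row groups"
    xs, ys = rng.random(10_000), rng.random(10_000)
    histo, xedges, yedges = np.histogram2d(xs, ys, bins=bins, range=xy_range)
    total += histo                     # accumulate counts across chunks
print(int(total.sum()))                # 40000 points binned in total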
Example #4
def _read_fastparquet(fs,
                      fs_token,
                      paths,
                      columns=None,
                      filters=None,
                      categories=None,
                      index=None,
                      infer_divisions=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if isinstance(paths, fastparquet.api.ParquetFile):
        pf = paths
    elif len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=fs.open,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0],
                                         open_with=fs.open,
                                         sep=fs.sep)

    # Validate infer_divisions
    if os.path.split(pf.fn)[-1] != '_metadata' and infer_divisions is True:
        raise NotImplementedError(
            "infer_divisions=True is not supported by the fastparquet engine for datasets "
            "that do not contain a global '_metadata' file")

    check_column_names(pf.columns, categories)
    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [
            x.value for x in pf.fmd.key_value_metadata if x.key == 'pandas'
        ]
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0])))
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs

    if filters is None:
        filters = []

    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    rgs = [
        rg for rg in pf.row_groups
        if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
        and not (fastparquet.api.filter_out_cats(rg, filters))
    ]

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])
    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(pd.Categorical(
                [], categories=[UNKNOWN_CATEGORIES]),
                                  index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, filters,
                                      categories)

    dsk = {(name, i): (_read_parquet_row_group, fs, pf.row_group_filename(rg),
                       index_names, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme, storage_name_mapping)
           for i, rg in enumerate(rgs)}
    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names and infer_divisions is not False:
        index_name = meta.index.name
        try:
            # is https://github.com/dask/fastparquet/pull/371 available in
            # current fastparquet installation?
            minmax = fastparquet.api.sorted_partitioned_columns(pf, filters)
        except TypeError:
            minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_name in minmax:
            divisions = minmax[index_name]
            divisions = divisions['min'] + [divisions['max'][-1]]
        else:
            if infer_divisions is True:
                raise ValueError((
                    "Unable to infer divisions for index of '{index_name}' because it is not known to be "
                    "sorted across partitions").format(index_name=index_name))

            divisions = (None, ) * (len(rgs) + 1)
    else:
        if infer_divisions is True:
            raise ValueError(
                'Unable to infer divisions because no index column was discovered'
            )

        divisions = (None, ) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
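
A hedged sketch of the divisions inference used above, via fastparquet's public sorted_partitioned_columns (assumes a local dataset 'example.parq'; the filters argument only exists in newer fastparquet, hence the try/except in the code above):

import fastparquet

pf = fastparquet.ParquetFile('example.parq')
minmax = fastparquet.api.sorted_partitioned_columns(pf)
for col, mm in minmax.items():
    # one boundary per row group, plus the final maximum
    divisions = list(mm['min']) + [mm['max'][-1]]
    print(col, divisions)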
Example #5
def _read_fastparquet(fs,
                      paths,
                      myopen,
                      columns=None,
                      filters=None,
                      categories=None,
                      index=None,
                      storage_options=None):
    import fastparquet
    from fastparquet.util import check_column_names
    if filters is None:
        filters = []

    if isinstance(columns, list):
        columns = tuple(columns)

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths,
                                     open_with=myopen,
                                     sep=myopen.fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=myopen,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0],
                                         open_with=myopen,
                                         sep=fs.sep)

    check_column_names(pf.columns, categories)
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [
        rg for rg in pf.row_groups
        if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
        and not (fastparquet.api.filter_out_cats(rg, filters))
    ]

    if index is False:
        index_col = None
    elif index is None:
        index_col = pf._get_index()
    else:
        index_col = index

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns, )
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col, )

    if categories is None:
        categories = pf.categories
    dtypes = pf._dtypes(categories)

    meta = _meta_from_dtypes(all_columns, tuple(pf.columns + list(pf.cats)),
                             dtypes, index_col)

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(pd.Categorical(
                [], categories=[UNKNOWN_CATEGORIES]),
                                  index=meta.index)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i):
           (_read_parquet_row_group, myopen, pf.row_group_filename(rg),
            index_col, all_columns, rg, out_type == Series, categories,
            pf.schema, pf.cats, pf.dtypes, pf.file_scheme)
           for i, rg in enumerate(rgs)}

    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_col:
        minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_col in minmax:
            divisions = (list(minmax[index_col]['min']) +
                         [minmax[index_col]['max'][-1]])
            divisions = [
                divisions[i] for i, rg in enumerate(pf.row_groups) if rg in rgs
            ] + [divisions[-1]]
        else:
            divisions = (None, ) * (len(rgs) + 1)
    else:
        divisions = (None, ) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
Example #6
def _read_fastparquet(fs,
                      fs_token,
                      paths,
                      columns=None,
                      filters=None,
                      categories=None,
                      index=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=fs.open,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0],
                                         open_with=fs.open,
                                         sep=fs.sep)

    check_column_names(pf.columns, categories)
    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [
            x.value for x in pf.fmd.key_value_metadata if x.key == 'pandas'
        ]
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0])))
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs

    if filters is None:
        filters = []

    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    rgs = [
        rg for rg in pf.row_groups
        if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
        and not (fastparquet.api.filter_out_cats(rg, filters))
    ]

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])
    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(pd.Categorical(
                [], categories=[UNKNOWN_CATEGORIES]),
                                  index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, filters,
                                      categories)

    dsk = {(name, i): (_read_parquet_row_group, fs, pf.row_group_filename(rg),
                       index_names, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme, storage_name_mapping)
           for i, rg in enumerate(rgs)}
    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names:
        index_name = meta.index.name
        minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_name in minmax:
            divisions = (list(minmax[index_name]['min']) +
                         [minmax[index_name]['max'][-1]])
            divisions = [
                divisions[i] for i, rg in enumerate(pf.row_groups) if rg in rgs
            ] + [divisions[-1]]
        else:
            divisions = (None, ) * (len(rgs) + 1)
    else:
        divisions = (None, ) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
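
A hedged sketch of the statistics-based row-group pruning idiom shared by these examples (the filter tuple and file name are illustrative):

import fastparquet

pf = fastparquet.ParquetFile('example.parq')
filters = [('x', '>', 0)]
rgs = [rg for rg in pf.row_groups
       if not fastparquet.api.filter_out_stats(rg, filters, pf.schema)
       and not fastparquet.api.filter_out_cats(rg, filters)]
print(len(rgs), 'of', len(pf.row_groups), 'row groups kept')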
Example #7
def _read_fastparquet(fs, paths, myopen, columns=None, filters=None,
                      categories=None, index=None, storage_options=None):
    import fastparquet
    from fastparquet.util import check_column_names
    if filters is None:
        filters = []

    if isinstance(columns, list):
        columns = tuple(columns)

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=myopen, sep=myopen.fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=myopen,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=myopen, sep=fs.sep)

    check_column_names(pf.columns, categories)
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [rg for rg in pf.row_groups if
           not (fastparquet.api.filter_out_stats(rg, filters, pf.schema)) and
           not (fastparquet.api.filter_out_cats(rg, filters))]

    if index is False:
        index_col = None
    elif index is None:
        index_col = pf._get_index()
    else:
        index_col = index

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns,)
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col,)

    if categories is None:
        categories = pf.categories
    dtypes = pf._dtypes(categories)

    meta = _meta_from_dtypes(all_columns, tuple(pf.columns + list(pf.cats)),
                             dtypes, index_col)

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(pd.Categorical([],
                                                 categories=[UNKNOWN_CATEGORIES]),
                                  index=meta.index)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i): (_read_parquet_row_group, myopen, pf.row_group_filename(rg),
                       index_col, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme)
           for i, rg in enumerate(rgs)}

    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_col:
        minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_col in minmax:
            divisions = (list(minmax[index_col]['min']) +
                         [minmax[index_col]['max'][-1]])
            divisions = [divisions[i] for i, rg in enumerate(pf.row_groups)
                         if rg in rgs] + [divisions[-1]]
        else:
            divisions = (None,) * (len(rgs) + 1)
    else:
        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
Example #8
def read_parquet(path, columns=None, filters=None, categories=None, index=None,
                 storage_options=None):
    """
    Read ParquetFile into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition.  It selects the index among the sorted columns if any exist.

    This uses the fastparquet project: http://fastparquet.readthedocs.io/en/latest

    Parameters
    ----------
    path : string
        Source directory for data.
        Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data.
    columns: list or None
        List of column names to load
    filters: list
        List of filters to apply, like ``[('x', '>', 0), ...]``
    index: string or None
        Name of index column to use if that column is sorted
    categories: list or None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
    storage_options : dict
        Key/value pairs to be passed on to the file-system backend, if any.

    Examples
    --------
    >>> df = read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    """
    if fastparquet is False:
        raise ImportError("fastparquet not installed")
    if filters is None:
        filters = []
    myopen = OpenFileCreator(path, compression=None, text=False,
                             **(storage_options or {}))

    if isinstance(columns, list):
        columns = tuple(columns)

    try:
        pf = fastparquet.ParquetFile(path + myopen.fs.sep + '_metadata',
                                     open_with=myopen,
                                     sep=myopen.fs.sep)
    except Exception:
        pf = fastparquet.ParquetFile(path, open_with=myopen, sep=myopen.fs.sep)

    check_column_names(pf.columns, categories)
    categories = categories or []
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [rg for rg in pf.row_groups if
           not(fastparquet.api.filter_out_stats(rg, filters, pf.helper)) and
           not(fastparquet.api.filter_out_cats(rg, filters))]

    # Find an index among the partially sorted columns
    minmax = fastparquet.api.sorted_partitioned_columns(pf)

    if index is False:
        index_col = None
    elif len(minmax) == 1:
        index_col = first(minmax)
    elif len(minmax) > 1:
        if index:
            index_col = index
        elif 'index' in minmax:
            index_col = 'index'
        else:
            raise ValueError("Multiple possible indexes exist: %s.  "
                             "Please select one with index='index-name'"
                             % sorted(minmax))
    else:
        index_col = None

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns,)
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col,)

    dtypes = {k: ('category' if k in categories else v) for k, v in
              pf.dtypes.items() if k in all_columns}

    meta = pd.DataFrame({c: pd.Series([], dtype=d)
                        for (c, d) in dtypes.items()},
                        columns=[c for c in pf.columns if c in dtypes])

    for cat in categories:
        meta[cat] = pd.Series(pd.Categorical([],
                              categories=[UNKNOWN_CATEGORIES]))

    if index_col:
        meta = meta.set_index(index_col)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i): (_read_parquet_row_group, myopen, pf.row_group_filename(rg),
                       index_col, all_columns, rg, out_type == Series,
                       categories, pf.helper, pf.cats, pf.dtypes)
           for i, rg in enumerate(rgs)}

    if index_col:
        divisions = list(minmax[index_col]['min']) + [minmax[index_col]['max'][-1]]
    else:
        divisions = (None,) * (len(rgs) + 1)

    return out_type(dsk, name, meta, divisions)
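
Another hypothetical call, this time forcing a specific sorted column as the index, or disabling index detection entirely with index=False (paths and names are illustrative):

df = read_parquet('hdfs:///data/events.parq', columns=['value'], index='timestamp')
raw = read_parquet('hdfs:///data/events.parq', index=False)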
Example #9
def _pf_validation(pf, columns, index, categories, filters):
    """Validate user options against metadata in dataset

     columns, index and categories must be in the list of columns available
     (both data columns and path-based partitioning - subject to possible
     renaming, if pandas metadata is present). The output index will
     be inferred from any available pandas metadata, if not given.
     """
    from fastparquet.util import check_column_names
    check_column_names(pf.columns, categories)
    check_column_names(pf.columns + list(pf.cats or []), columns)
    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata
                     if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0]))
        )
        #  auto-ranges should not be created by fastparquet
        index_names = [n for n in index_names if n is not None]
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs

    if filters is None:
        filters = []

    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])

    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(pd.Categorical([],
                                                 categories=[UNKNOWN_CATEGORIES]),
                                  index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]
    return (meta, filters, index_names, out_type, all_columns, index_names,
            storage_name_mapping)
Example #10
def _read_fastparquet(fs, fs_token, paths, columns=None, filters=None,
                      categories=None, index=None, infer_divisions=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if isinstance(paths, fastparquet.api.ParquetFile):
        pf = paths
    elif len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=fs.open,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=fs.open, sep=fs.sep)

    # Validate infer_divisions
    if os.path.split(pf.fn)[-1] != '_metadata' and infer_divisions is True:
        raise NotImplementedError("infer_divisions=True is not supported by the fastparquet engine for datasets "
                                  "that do not contain a global '_metadata' file")

    check_column_names(pf.columns, categories)
    check_column_names(pf.columns + list(pf.cats or []), columns)
    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0]))
        )
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs

    if filters is None:
        filters = []

    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    rgs = [rg for rg in pf.row_groups if
           not (fastparquet.api.filter_out_stats(rg, filters, pf.schema)) and
           not (fastparquet.api.filter_out_cats(rg, filters))]

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])
    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(pd.Categorical([],
                                                 categories=[UNKNOWN_CATEGORIES]),
                                  index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, filters,
                                      categories)

    dsk = {(name, i): (_read_parquet_row_group, fs, pf.row_group_filename(rg),
                       index_names, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme, storage_name_mapping)
           for i, rg in enumerate(rgs)}
    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names and infer_divisions is not False:
        index_name = meta.index.name
        try:
            # is https://github.com/dask/fastparquet/pull/371 available in
            # current fastparquet installation?
            minmax = fastparquet.api.sorted_partitioned_columns(pf, filters)
        except TypeError:
            minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_name in minmax:
            divisions = minmax[index_name]
            divisions = divisions['min'] + [divisions['max'][-1]]
        else:
            if infer_divisions is True:
                raise ValueError(
                    ("Unable to infer divisions for index of '{index_name}' because it is not known to be "
                     "sorted across partitions").format(index_name=index_name))

            divisions = (None,) * (len(rgs) + 1)
    else:
        if infer_divisions is True:
            raise ValueError(
                'Unable to infer divisions because no index column was discovered')

        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
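
A hedged sketch of the kind of empty "meta" frame a helper like _meta_from_dtypes is expected to return, built directly with pandas (column and index names are illustrative, not from the source):

import pandas as pd

dtypes = {'x': 'float64', 'y': 'int64', 'ts': 'datetime64[ns]'}
meta = pd.DataFrame({c: pd.Series([], dtype=d) for c, d in dtypes.items()},
                    columns=['x', 'y', 'ts'])
meta = meta.set_index('ts')   # zero rows, but the right columns, dtypes and index name
print(meta.dtypes)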
Example #11
def _read_fastparquet(fs,
                      paths,
                      myopen,
                      columns=None,
                      filters=None,
                      categories=None,
                      index=None,
                      storage_options=None):
    if filters is None:
        filters = []

    if isinstance(columns, list):
        columns = tuple(columns)

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths,
                                     open_with=myopen,
                                     sep=myopen.fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=myopen,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0],
                                         open_with=myopen,
                                         sep=fs.sep)

    check_column_names(pf.columns, categories)
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [
        rg for rg in pf.row_groups
        if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
        and not (fastparquet.api.filter_out_cats(rg, filters))
    ]

    # Find an index among the partially sorted columns
    minmax = fastparquet.api.sorted_partitioned_columns(pf)

    if index is False:
        index_col = None
    elif len(minmax) == 1:
        index_col = first(minmax)
    elif len(minmax) > 1:
        if index:
            index_col = index
        elif 'index' in minmax:
            index_col = 'index'
        else:
            raise ValueError("Multiple possible indexes exist: %s.  "
                             "Please select one with index='index-name'" %
                             sorted(minmax))
    else:
        index_col = None

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns, )
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col, )

    if categories is None:
        categories = pf.categories
    dtypes = pf._dtypes(categories)

    meta = _meta_from_dtypes(all_columns, pf.columns, dtypes)

    for cat in categories:
        meta[cat] = pd.Series(
            pd.Categorical([], categories=[UNKNOWN_CATEGORIES]))

    if index_col:
        meta = meta.set_index(index_col)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i):
           (_read_parquet_row_group, myopen, pf.row_group_filename(rg),
            index_col, all_columns, rg, out_type == Series, categories,
            pf.schema, pf.cats, pf.dtypes)
           for i, rg in enumerate(rgs)}

    if index_col:
        divisions = list(
            minmax[index_col]['min']) + [minmax[index_col]['max'][-1]]
    else:
        divisions = (None, ) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
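
A hedged sketch of the UNKNOWN_CATEGORIES placeholder used for categorical columns in "meta" (assuming the sentinel is importable from dask.dataframe.utils, where dask defines it):

import pandas as pd
from dask.dataframe.utils import UNKNOWN_CATEGORIES

meta = pd.DataFrame({'x': pd.Series([], dtype='float64')})
meta['group'] = pd.Series(pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                          index=meta.index)
print(meta.dtypes)   # 'group' is categorical with a single placeholder category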
Example #12
def _read_fastparquet(fs, paths, myopen, columns=None, filters=None,
                      categories=None, index=None, storage_options=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=myopen, sep=myopen.fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=myopen,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=myopen, sep=fs.sep)

    check_column_names(pf.columns, categories)
    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0]))
        )
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs

    if filters is None:
        filters = []

    column_names, out_type = _normalize_columns(columns, column_names)
    index_names = _normalize_index(index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    # TODO: figure out if partition_on <-> categories. I suspect not...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)
    file_cats = pf.cats
    if file_cats:
        all_columns.extend(list(file_cats))

    rgs = [rg for rg in pf.row_groups if
           not (fastparquet.api.filter_out_stats(rg, filters, pf.schema)) and
           not (fastparquet.api.filter_out_cats(rg, filters))]

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])
    # fastparquet / dask don't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(pd.Categorical([],
                                                 categories=[UNKNOWN_CATEGORIES]),
                                  index=meta.index)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i): (_read_parquet_row_group, myopen, pf.row_group_filename(rg),
                       index_names, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme, storage_name_mapping)
           for i, rg in enumerate(rgs)}

    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names:
        index_name = meta.index.name
        minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_name in minmax:
            divisions = (list(minmax[index_name]['min']) +
                         [minmax[index_name]['max'][-1]])
            divisions = [divisions[i] for i, rg in enumerate(pf.row_groups)
                         if rg in rgs] + [divisions[-1]]
        else:
            divisions = (None,) * (len(rgs) + 1)
    else:
        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
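
Finally, a hedged sketch using fastparquet's public API to do eagerly what the task graph above does lazily, reading each row group and concatenating the pieces (assumes a local, non-empty dataset 'example.parq'):

import pandas as pd
import fastparquet

pf = fastparquet.ParquetFile('example.parq')
pieces = list(pf.iter_row_groups(columns=pf.columns))
df = pd.concat(pieces, ignore_index=True)
print(len(df), 'rows read from', len(pf.row_groups), 'row groups')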