def _pf_validation(pf, columns, index, categories, filters):
    """Validate user options against metadata in dataset

    columns, index and categories must be in the list of columns available
    (both data columns and path-based partitioning - subject to possible
    renaming, if pandas metadata is present). The output index will be
    inferred from any available pandas metadata, if not given.
    """
    from fastparquet.util import check_column_names
    check_column_names(pf.columns, categories)
    check_column_names(pf.columns + list(pf.cats or []), columns)

    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata
                     if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0]))
        )
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs
    if filters is None:
        filters = []
    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])

    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(
                pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    return (meta, filters, index_names, out_type, all_columns, index_names,
            storage_name_mapping)
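# Illustrative, hedged sketch of the 'pandas' key-value metadata lookup that
# _pf_validation performs above. The 'data.parq' path is an assumption for
# illustration; the access pattern (pf.fmd.key_value_metadata, key 'pandas')
# matches the code in this module.
def _demo_pandas_metadata(path='data.parq'):
    import json
    import fastparquet

    pf = fastparquet.ParquetFile(path)
    pandas_md = [kv.value for kv in (pf.fmd.key_value_metadata or [])
                 if kv.key == 'pandas']
    if not pandas_md:
        return None
    md = json.loads(pandas_md[0])
    # The same JSON blob that _parse_pandas_metadata consumes: declared index
    # columns plus one entry per stored column.
    return md.get('index_columns'), [c['name'] for c in md.get('columns', [])]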
def read_parquet(path, columns=None, filters=None, categories=None,
                 index=None, storage_options=None):
    """
    Read ParquetFile into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition. It selects the index among the sorted columns if any exist.

    This uses the fastparquet project:
    http://fastparquet.readthedocs.io/en/latest

    Parameters
    ----------
    path : string
        Source directory for data. Prepend with protocol like ``s3://`` or
        ``hdfs://`` for remote data.
    columns : list or None
        List of column names to load
    filters : list
        List of filters to apply, like ``[('x', '>', 0), ...]``
    index : string or None
        Name of index column to use if that column is sorted
    categories : list or None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
    storage_options : dict
        Key/value pairs to be passed on to the file-system backend, if any.

    Examples
    --------
    >>> df = read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    """
    if fastparquet is False:
        raise ImportError("fastparquet not installed")
    if filters is None:
        filters = []
    myopen = OpenFileCreator(path, compression=None, text=False,
                             **(storage_options or {}))

    if isinstance(columns, list):
        columns = tuple(columns)

    try:
        pf = fastparquet.ParquetFile(path + myopen.fs.sep + '_metadata',
                                     open_with=myopen,
                                     sep=myopen.fs.sep)
    except Exception:
        pf = fastparquet.ParquetFile(path, open_with=myopen,
                                     sep=myopen.fs.sep)

    check_column_names(pf.columns, categories)
    categories = categories or []
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [rg for rg in pf.row_groups
           if not (fastparquet.api.filter_out_stats(rg, filters, pf.helper))
           and not (fastparquet.api.filter_out_cats(rg, filters))]

    # Find an index among the partially sorted columns
    minmax = fastparquet.api.sorted_partitioned_columns(pf)

    if index is False:
        index_col = None
    elif len(minmax) == 1:
        index_col = first(minmax)
    elif len(minmax) > 1:
        if index:
            index_col = index
        elif 'index' in minmax:
            index_col = 'index'
        else:
            raise ValueError("Multiple possible indexes exist: %s. "
                             "Please select one with index='index-name'"
                             % sorted(minmax))
    else:
        index_col = None

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns,)
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col,)

    dtypes = {k: ('category' if k in categories else v)
              for k, v in pf.dtypes.items()
              if k in all_columns}

    meta = pd.DataFrame({c: pd.Series([], dtype=d)
                         for (c, d) in dtypes.items()},
                        columns=[c for c in pf.columns if c in dtypes])
    meta = meta[list(all_columns)]

    for cat in categories:
        meta[cat] = pd.Series(
            pd.Categorical([], categories=[UNKNOWN_CATEGORIES]))

    if index_col:
        meta = meta.set_index(index_col)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i): (_read_parquet_row_group, myopen,
                       pf.row_group_filename(rg),
                       index_col, all_columns, rg, out_type == Series,
                       categories, pf.helper, pf.cats, pf.dtypes)
           for i, rg in enumerate(rgs)}

    if index_col:
        divisions = (list(minmax[index_col]['min']) +
                     [minmax[index_col]['max'][-1]])
    else:
        divisions = (None,) * (len(rgs) + 1)

    return out_type(dsk, name, meta, divisions)
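# Hedged sketch of the index/divisions inference used by read_parquet above:
# fastparquet reports which columns are sorted across row groups, and the
# per-row-group min/max values become dask divisions. 'data.parq' and the
# 'ts' column name are assumptions for illustration.
def _demo_divisions(path='data.parq', index='ts'):
    import fastparquet

    pf = fastparquet.ParquetFile(path)
    # Mapping of sorted column name -> {'min': [...], 'max': [...]} per row group
    minmax = fastparquet.api.sorted_partitioned_columns(pf)
    if index not in minmax:
        return None
    # One division per row-group start, plus the final maximum
    return list(minmax[index]['min']) + [minmax[index]['max'][-1]]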
def parquet_to_databuffers(filename, x, y, category, width=512, height=None,
                           xmin=None, ymin=None, xmax=None, ymax=None,
                           projection=None):
    proj = lambda x, y, inverse: (x, y)
    root, ext = os.path.splitext(filename)
    if ext != '.parq':
        raise ValueError('Expected a .parq file, got ({}) {}'.format(
            ext, filename))

    pf = fastparquet.ParquetFile(filename)
    check_column_names(pf.columns, [x, y, category])  # raise if columns not there
    description = {'source': {"filename": filename, "type": "parquet"}}

    if projection:
        description['projection'] = {"type": projection}
        proj = pyproj.Proj(init=projection, preserve_units=True)

    stats = pf.statistics
    if 'max' in stats:
        if xmax is None:
            xmax = np.max(stats['max'][x])
        if ymax is None:
            ymax = np.max(stats['max'][y])
    if 'min' in stats:
        if xmin is None:
            xmin = np.min(stats['min'][x])
        if ymin is None:
            ymin = np.min(stats['min'][y])

    if xmin is None or xmax is None or ymin is None or ymax is None:
        compute_xmin = (xmin is None)
        compute_xmax = (xmax is None)
        compute_ymin = (ymin is None)
        compute_ymax = (ymax is None)
        print('Incomplete stats, computing min/max')
        for df in pf.iter_row_groups(columns=[x, y]):
            if compute_xmin:
                if xmin is None:
                    xmin = df[x].min()
                else:
                    xmin = np.min([xmin, df[x].min()])
            if compute_ymin:
                if ymin is None:
                    ymin = df[y].min()
                else:
                    ymin = np.min([ymin, df[y].min()])
            if compute_xmax:
                if xmax is None:
                    xmax = df[x].max()
                else:
                    xmax = np.max([xmax, df[x].max()])
            if compute_ymax:
                if ymax is None:
                    ymax = df[y].max()
                else:
                    ymax = np.max([ymax, df[y].max()])

    xy_range = [[float(xmin), float(xmax)], [float(ymin), float(ymax)]]
    if ymax == ymin or xmax == xmin:
        raise ValueError('Invalid bounds: {}'.format(xy_range))

    if height is None:
        ratio = (ymax - ymin) / (xmax - xmin)
        height = int(width * ratio)
    bins = (width, height)
    # description['bounds'] = xy_range
    # description['bins'] = list(bins)
    print('Range: %s, bins: %s' % (xy_range, bins))

    histograms = {}
    counts = {}
    for df in pf.iter_row_groups(columns=[x, y, category],
                                 categories=[category]):
        print('Accessing row_group len=%d' % len(df))
        values = df[category].cat.categories
        cat_column = df[category]
        for i, cat in enumerate(values):
            df_cat = df.loc[cat_column == cat, [x, y]]
            (histo, xedges, yedges) = np.histogram2d(df_cat[x], df_cat[y],
                                                     normed=False,
                                                     bins=bins,
                                                     range=xy_range)
            if isinstance(bins, list):
                if (xedges != bins[0]).any():
                    print('X Edges differ: %s' % xedges)
                    bins = [xedges, yedges]
                if (yedges != bins[1]).any():
                    print('Y Edges differ: %s' % yedges)
                    bins = [xedges, yedges]
            else:
                bins = [xedges, yedges]
            if isinstance(cat, str):
                key = cat
            else:
                key = i + 1
            if key in histograms:
                histograms[key] += histo
            else:
                histograms[key] = histo
            counts[key] = len(df_cat) + counts.get(key, 0)

    if projection:
        xmin, ymin = proj(xmin, ymin, inverse=True)
        xmax, ymax = proj(xmax, ymax, inverse=True)
        xtype = "latitude"
        ytype = "longitude"
    else:
        xtype = "quantitative"
        ytype = "quantitative"

    description['encoding'] = {
        "x": {
            "field": x,
            "type": xtype,
            "bin": {"maxbins": width},
            "aggregate": "count",
            "scale": {
                "domain": [xmin, xmax],
                "range": [0, width]
            }
        },
        "y": {
            "field": y,
            "type": ytype,
            "bin": {"maxbins": height},
            "aggregate": "count",
            "scale": {
                "domain": [ymin, ymax],
                "range": [0, height]
            }
        },
        "z": {
            "field": category,
            "type": "nominal",  # or ordinal
            "scale": {
                "domain": list(histograms.keys())
            }
        }
    }

    print('Writing files')
    count = 0
    buffers = []
    for (key, histo) in histograms.items():
        histo = histo.T
        hmin = np.min(histo)
        hmax = np.max(histo)
        outfile = root + '_cat_%s.json' % key
        with open(outfile, 'w') as outf:
            json.dump(histo.tolist(), outf)
        data = {
            'url': outfile,
            'count': counts[key],
            'value': key,
            'range': [int(hmin), int(hmax)]
        }
        buffers.append(data)
        count += counts[key]

    description['buffers'] = buffers
    description['source']['rows'] = count
    with open(root + '_data.json', 'w') as outf:
        json.dump(description, outf, indent=2)
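# Hypothetical invocation of parquet_to_databuffers; the file name and column
# names are assumptions for illustration only.
def _demo_databuffers():
    # Bins the (x, y) points of 'trips.parq' into one 512-bin-wide 2D
    # histogram per distinct 'passenger_count' value, writing
    # trips_cat_<key>.json buffers plus a trips_data.json description.
    parquet_to_databuffers('trips.parq',
                           x='pickup_x', y='pickup_y',
                           category='passenger_count',
                           width=512)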
def _read_fastparquet(fs, fs_token, paths, columns=None, filters=None,
                      categories=None, index=None, infer_divisions=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if isinstance(paths, fastparquet.api.ParquetFile):
        pf = paths
    elif len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=fs.open,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=fs.open,
                                         sep=fs.sep)

    # Validate infer_divisions
    if os.path.split(pf.fn)[-1] != '_metadata' and infer_divisions is True:
        raise NotImplementedError(
            "infer_divisions=True is not supported by the fastparquet engine "
            "for datasets that do not contain a global '_metadata' file")

    check_column_names(pf.columns, categories)
    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata
                     if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0])))
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs
    if filters is None:
        filters = []
    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    rgs = [rg for rg in pf.row_groups
           if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
           and not (fastparquet.api.filter_out_cats(rg, filters))]

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])

    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(
                pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, filters,
                                      categories)

    dsk = {(name, i): (_read_parquet_row_group, fs, pf.row_group_filename(rg),
                       index_names, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme, storage_name_mapping)
           for i, rg in enumerate(rgs)}

    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names and infer_divisions is not False:
        index_name = meta.index.name
        try:
            # is https://github.com/dask/fastparquet/pull/371 available in
            # current fastparquet installation?
            minmax = fastparquet.api.sorted_partitioned_columns(pf, filters)
        except TypeError:
            minmax = fastparquet.api.sorted_partitioned_columns(pf)

        if index_name in minmax:
            divisions = minmax[index_name]
            divisions = divisions['min'] + [divisions['max'][-1]]
        else:
            if infer_divisions is True:
                raise ValueError(
                    ("Unable to infer divisions for index of '{index_name}' "
                     "because it is not known to be sorted across "
                     "partitions").format(index_name=index_name))
            divisions = (None,) * (len(rgs) + 1)
    else:
        if infer_divisions is True:
            raise ValueError(
                'Unable to infer divisions because no index column was '
                'discovered')
        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
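# Hedged sketch of the row-group pruning step used above: keep only row groups
# whose statistics (or partition values) might satisfy the filters, using the
# same fastparquet helpers this module calls. The path and filter are
# assumptions for illustration.
def _demo_filter_row_groups(path='data.parq', filters=(('x', '>', 0),)):
    import fastparquet

    pf = fastparquet.ParquetFile(path)
    filters = list(filters)
    rgs = [rg for rg in pf.row_groups
           if not fastparquet.api.filter_out_stats(rg, filters, pf.schema)
           and not fastparquet.api.filter_out_cats(rg, filters)]
    return len(rgs), len(pf.row_groups)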
def _read_fastparquet(fs, paths, myopen, columns=None, filters=None,
                      categories=None, index=None, storage_options=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if filters is None:
        filters = []

    if isinstance(columns, list):
        columns = tuple(columns)

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=myopen,
                                     sep=myopen.fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=myopen,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=myopen,
                                         sep=fs.sep)

    check_column_names(pf.columns, categories)
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [rg for rg in pf.row_groups
           if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
           and not (fastparquet.api.filter_out_cats(rg, filters))]

    if index is False:
        index_col = None
    elif index is None:
        index_col = pf._get_index()
    else:
        index_col = index

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns,)
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col,)

    if categories is None:
        categories = pf.categories

    dtypes = pf._dtypes(categories)

    meta = _meta_from_dtypes(all_columns, tuple(pf.columns + list(pf.cats)),
                             dtypes, index_col)

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(
                pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                index=meta.index)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i): (_read_parquet_row_group, myopen,
                       pf.row_group_filename(rg),
                       index_col, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme)
           for i, rg in enumerate(rgs)}

    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_col:
        minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_col in minmax:
            divisions = (list(minmax[index_col]['min']) +
                         [minmax[index_col]['max'][-1]])
            divisions = [divisions[i] for i, rg in enumerate(pf.row_groups)
                         if rg in rgs] + [divisions[-1]]
        else:
            divisions = (None,) * (len(rgs) + 1)
    else:
        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
def _read_fastparquet(fs, fs_token, paths, columns=None, filters=None,
                      categories=None, index=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=fs.open,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=fs.open,
                                         sep=fs.sep)

    check_column_names(pf.columns, categories)
    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata
                     if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0])))
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs
    if filters is None:
        filters = []
    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    rgs = [rg for rg in pf.row_groups
           if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
           and not (fastparquet.api.filter_out_cats(rg, filters))]

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])

    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(
                pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, filters,
                                      categories)

    dsk = {(name, i): (_read_parquet_row_group, fs, pf.row_group_filename(rg),
                       index_names, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme, storage_name_mapping)
           for i, rg in enumerate(rgs)}

    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names:
        index_name = meta.index.name
        minmax = fastparquet.api.sorted_partitioned_columns(pf)

        if index_name in minmax:
            divisions = (list(minmax[index_name]['min']) +
                         [minmax[index_name]['max'][-1]])
            divisions = [divisions[i] for i, rg in enumerate(pf.row_groups)
                         if rg in rgs] + [divisions[-1]]
        else:
            divisions = (None,) * (len(rgs) + 1)
    else:
        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
def _read_fastparquet(fs, paths, myopen, columns=None, filters=None,
                      categories=None, index=None, storage_options=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if filters is None:
        filters = []

    if isinstance(columns, list):
        columns = tuple(columns)

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=myopen,
                                     sep=myopen.fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=myopen,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=myopen,
                                         sep=fs.sep)

    check_column_names(pf.columns, categories)
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [rg for rg in pf.row_groups
           if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
           and not (fastparquet.api.filter_out_cats(rg, filters))]

    if index is False:
        index_col = None
    elif index is None:
        index_col = pf._get_index()
    else:
        index_col = index

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns,)
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col,)

    if categories is None:
        categories = pf.categories

    dtypes = pf._dtypes(categories)

    meta = _meta_from_dtypes(all_columns, tuple(pf.columns + list(pf.cats)),
                             dtypes, index_col)

    for cat in categories:
        meta[cat] = pd.Series(
            pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
            index=meta.index)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i): (_read_parquet_row_group, myopen,
                       pf.row_group_filename(rg),
                       index_col, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme)
           for i, rg in enumerate(rgs)}

    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_col:
        minmax = fastparquet.api.sorted_partitioned_columns(pf)
        if index_col in minmax:
            divisions = (list(minmax[index_col]['min']) +
                         [minmax[index_col]['max'][-1]])
            divisions = [divisions[i] for i, rg in enumerate(pf.row_groups)
                         if rg in rgs] + [divisions[-1]]
        else:
            divisions = (None,) * (len(rgs) + 1)
    else:
        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
def read_parquet(path, columns=None, filters=None, categories=None,
                 index=None, storage_options=None):
    """
    Read ParquetFile into a Dask DataFrame

    This reads a directory of Parquet data into a Dask.dataframe, one file per
    partition. It selects the index among the sorted columns if any exist.

    This uses the fastparquet project:
    http://fastparquet.readthedocs.io/en/latest

    Parameters
    ----------
    path : string
        Source directory for data. Prepend with protocol like ``s3://`` or
        ``hdfs://`` for remote data.
    columns : list or None
        List of column names to load
    filters : list
        List of filters to apply, like ``[('x', '>', 0), ...]``
    index : string or None
        Name of index column to use if that column is sorted
    categories : list or None
        For any fields listed here, if the parquet encoding is Dictionary,
        the column will be created with dtype category. Use only if it is
        guaranteed that the column is encoded as dictionary in all row-groups.
    storage_options : dict
        Key/value pairs to be passed on to the file-system backend, if any.

    Examples
    --------
    >>> df = read_parquet('s3://bucket/my-parquet-data')  # doctest: +SKIP

    See Also
    --------
    to_parquet
    """
    if fastparquet is False:
        raise ImportError("fastparquet not installed")
    if filters is None:
        filters = []
    myopen = OpenFileCreator(path, compression=None, text=False,
                             **(storage_options or {}))

    if isinstance(columns, list):
        columns = tuple(columns)

    try:
        pf = fastparquet.ParquetFile(path + myopen.fs.sep + '_metadata',
                                     open_with=myopen,
                                     sep=myopen.fs.sep)
    except Exception:
        pf = fastparquet.ParquetFile(path, open_with=myopen,
                                     sep=myopen.fs.sep)

    check_column_names(pf.columns, categories)
    categories = categories or []
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [rg for rg in pf.row_groups
           if not (fastparquet.api.filter_out_stats(rg, filters, pf.helper))
           and not (fastparquet.api.filter_out_cats(rg, filters))]

    # Find an index among the partially sorted columns
    minmax = fastparquet.api.sorted_partitioned_columns(pf)

    if index is False:
        index_col = None
    elif len(minmax) == 1:
        index_col = first(minmax)
    elif len(minmax) > 1:
        if index:
            index_col = index
        elif 'index' in minmax:
            index_col = 'index'
        else:
            raise ValueError("Multiple possible indexes exist: %s. "
                             "Please select one with index='index-name'"
                             % sorted(minmax))
    else:
        index_col = None

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns,)
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col,)

    dtypes = {k: ('category' if k in categories else v)
              for k, v in pf.dtypes.items()
              if k in all_columns}

    meta = pd.DataFrame({c: pd.Series([], dtype=d)
                         for (c, d) in dtypes.items()},
                        columns=[c for c in pf.columns if c in dtypes])

    for cat in categories:
        meta[cat] = pd.Series(
            pd.Categorical([], categories=[UNKNOWN_CATEGORIES]))

    if index_col:
        meta = meta.set_index(index_col)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i): (_read_parquet_row_group, myopen,
                       pf.row_group_filename(rg),
                       index_col, all_columns, rg, out_type == Series,
                       categories, pf.helper, pf.cats, pf.dtypes)
           for i, rg in enumerate(rgs)}

    if index_col:
        divisions = (list(minmax[index_col]['min']) +
                     [minmax[index_col]['max'][-1]])
    else:
        divisions = (None,) * (len(rgs) + 1)

    return out_type(dsk, name, meta, divisions)
def _pf_validation(pf, columns, index, categories, filters):
    """Validate user options against metadata in dataset

    columns, index and categories must be in the list of columns available
    (both data columns and path-based partitioning - subject to possible
    renaming, if pandas metadata is present). The output index will be
    inferred from any available pandas metadata, if not given.
    """
    from fastparquet.util import check_column_names
    check_column_names(pf.columns, categories)
    check_column_names(pf.columns + list(pf.cats or []), columns)

    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata
                     if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0]))
        )
        # auto-ranges should not be created by fastparquet
        index_names = [n for n in index_names if n is not None]
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs
    if filters is None:
        filters = []
    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])

    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(
                pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    return (meta, filters, index_names, out_type, all_columns, index_names,
            storage_name_mapping)
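# Hedged sketch of the categorical placeholder used when building ``meta``
# above: an empty Series of dtype category whose single sentinel value marks
# the real categories as not yet known. Assumes UNKNOWN_CATEGORIES comes from
# dask.dataframe.utils, as in this module's imports.
def _demo_unknown_categories():
    import pandas as pd
    from dask.dataframe.utils import UNKNOWN_CATEGORIES

    meta = pd.DataFrame({'x': pd.Series([], dtype='float64')})
    meta['cat_col'] = pd.Series(
        pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
        index=meta.index)
    return meta.dtypes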
def _read_fastparquet(fs, fs_token, paths, columns=None, filters=None,
                      categories=None, index=None, infer_divisions=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if isinstance(paths, fastparquet.api.ParquetFile):
        pf = paths
    elif len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=fs.open, sep=fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=fs.open,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=fs.open,
                                         sep=fs.sep)

    # Validate infer_divisions
    if os.path.split(pf.fn)[-1] != '_metadata' and infer_divisions is True:
        raise NotImplementedError(
            "infer_divisions=True is not supported by the fastparquet engine "
            "for datasets that do not contain a global '_metadata' file")

    check_column_names(pf.columns, categories)
    check_column_names(pf.columns + list(pf.cats or []), columns)

    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata
                     if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0]))
        )
        column_names.extend(pf.cats)
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs
    if filters is None:
        filters = []
    column_names, index_names, out_type = _normalize_index_columns(
        columns, column_names, index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)

    rgs = [rg for rg in pf.row_groups
           if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
           and not (fastparquet.api.filter_out_cats(rg, filters))]

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])

    # fastparquet doesn't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(
                pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                index=meta.index)

    for catcol in pf.cats:
        if catcol in meta.columns:
            meta[catcol] = meta[catcol].cat.set_categories(pf.cats[catcol])
        elif meta.index.name == catcol:
            meta.index = meta.index.set_categories(pf.cats[catcol])

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    name = 'read-parquet-' + tokenize(fs_token, paths, all_columns, filters,
                                      categories)

    dsk = {(name, i): (_read_parquet_row_group, fs, pf.row_group_filename(rg),
                       index_names, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme, storage_name_mapping)
           for i, rg in enumerate(rgs)}

    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names and infer_divisions is not False:
        index_name = meta.index.name
        try:
            # is https://github.com/dask/fastparquet/pull/371 available in
            # current fastparquet installation?
            minmax = fastparquet.api.sorted_partitioned_columns(pf, filters)
        except TypeError:
            minmax = fastparquet.api.sorted_partitioned_columns(pf)

        if index_name in minmax:
            divisions = minmax[index_name]
            divisions = divisions['min'] + [divisions['max'][-1]]
        else:
            if infer_divisions is True:
                raise ValueError(
                    ("Unable to infer divisions for index of '{index_name}' "
                     "because it is not known to be sorted across "
                     "partitions").format(index_name=index_name))
            divisions = (None,) * (len(rgs) + 1)
    else:
        if infer_divisions is True:
            raise ValueError(
                'Unable to infer divisions because no index column was '
                'discovered')
        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
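# The fastparquet feature probe used above, isolated as a hedged sketch: newer
# fastparquet accepts a ``filters`` argument to sorted_partitioned_columns
# (https://github.com/dask/fastparquet/pull/371); older releases raise
# TypeError, in which case the unfiltered statistics are used. The path is an
# assumption for illustration.
def _demo_sorted_columns_probe(path='data.parq', filters=None):
    import fastparquet

    pf = fastparquet.ParquetFile(path)
    try:
        return fastparquet.api.sorted_partitioned_columns(pf, filters)
    except TypeError:
        return fastparquet.api.sorted_partitioned_columns(pf)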
def _read_fastparquet(fs, paths, myopen, columns=None, filters=None,
                      categories=None, index=None, storage_options=None):
    if filters is None:
        filters = []

    if isinstance(columns, list):
        columns = tuple(columns)

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=myopen,
                                     sep=myopen.fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=myopen,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=myopen,
                                         sep=fs.sep)

    check_column_names(pf.columns, categories)
    name = 'read-parquet-' + tokenize(pf, columns, categories)

    rgs = [rg for rg in pf.row_groups
           if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
           and not (fastparquet.api.filter_out_cats(rg, filters))]

    # Find an index among the partially sorted columns
    minmax = fastparquet.api.sorted_partitioned_columns(pf)

    if index is False:
        index_col = None
    elif len(minmax) == 1:
        index_col = first(minmax)
    elif len(minmax) > 1:
        if index:
            index_col = index
        elif 'index' in minmax:
            index_col = 'index'
        else:
            raise ValueError("Multiple possible indexes exist: %s. "
                             "Please select one with index='index-name'"
                             % sorted(minmax))
    else:
        index_col = None

    if columns is None:
        all_columns = tuple(pf.columns + list(pf.cats))
    else:
        all_columns = columns
    if not isinstance(all_columns, tuple):
        out_type = Series
        all_columns = (all_columns,)
    else:
        out_type = DataFrame
    if index_col and index_col not in all_columns:
        all_columns = all_columns + (index_col,)

    if categories is None:
        categories = pf.categories

    dtypes = pf._dtypes(categories)

    meta = _meta_from_dtypes(all_columns, pf.columns, dtypes)

    for cat in categories:
        meta[cat] = pd.Series(
            pd.Categorical([], categories=[UNKNOWN_CATEGORIES]))

    if index_col:
        meta = meta.set_index(index_col)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i): (_read_parquet_row_group, myopen,
                       pf.row_group_filename(rg),
                       index_col, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes)
           for i, rg in enumerate(rgs)}

    if index_col:
        divisions = (list(minmax[index_col]['min']) +
                     [minmax[index_col]['max'][-1]])
    else:
        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)
def _read_fastparquet(fs, paths, myopen, columns=None, filters=None,
                      categories=None, index=None, storage_options=None):
    import fastparquet
    from fastparquet.util import check_column_names

    if len(paths) > 1:
        pf = fastparquet.ParquetFile(paths, open_with=myopen,
                                     sep=myopen.fs.sep)
    else:
        try:
            pf = fastparquet.ParquetFile(paths[0] + fs.sep + '_metadata',
                                         open_with=myopen,
                                         sep=fs.sep)
        except Exception:
            pf = fastparquet.ParquetFile(paths[0], open_with=myopen,
                                         sep=fs.sep)

    check_column_names(pf.columns, categories)
    if isinstance(columns, tuple):
        # ensure they tokenize the same
        columns = list(columns)

    name = 'read-parquet-' + tokenize(pf, columns, categories)

    if pf.fmd.key_value_metadata:
        pandas_md = [x.value for x in pf.fmd.key_value_metadata
                     if x.key == 'pandas']
    else:
        pandas_md = []

    if len(pandas_md) == 0:
        # Fall back to the storage information
        index_names = pf._get_index()
        if not isinstance(index_names, list):
            index_names = [index_names]
        column_names = pf.columns + list(pf.cats)
        storage_name_mapping = {k: k for k in column_names}
        column_index_names = [None]
    elif len(pandas_md) == 1:
        index_names, column_names, storage_name_mapping, column_index_names = (
            _parse_pandas_metadata(json.loads(pandas_md[0]))
        )
    else:
        raise ValueError("File has multiple entries for 'pandas' metadata")

    # Normalize user inputs
    if filters is None:
        filters = []
    column_names, out_type = _normalize_columns(columns, column_names)
    index_names = _normalize_index(index, index_names)

    if categories is None:
        categories = pf.categories
    elif isinstance(categories, string_types):
        categories = [categories]
    else:
        categories = list(categories)

    # TODO: write partition_on to pandas metadata...
    # TODO: figure out if partition_on <-> categories. I suspect not...
    all_columns = list(column_names)
    all_columns.extend(x for x in index_names if x not in column_names)
    file_cats = pf.cats
    if file_cats:
        all_columns.extend(list(file_cats))

    rgs = [rg for rg in pf.row_groups
           if not (fastparquet.api.filter_out_stats(rg, filters, pf.schema))
           and not (fastparquet.api.filter_out_cats(rg, filters))]

    dtypes = pf._dtypes(categories)
    dtypes = {storage_name_mapping.get(k, k): v for k, v in dtypes.items()}

    meta = _meta_from_dtypes(all_columns, dtypes, index_names, [None])

    # fastparquet / dask don't handle multiindex
    if len(index_names) > 1:
        raise ValueError("Cannot read DataFrame with MultiIndex.")
    elif len(index_names) == 0:
        index_names = None

    for cat in categories:
        if cat in meta:
            meta[cat] = pd.Series(
                pd.Categorical([], categories=[UNKNOWN_CATEGORIES]),
                index=meta.index)

    if out_type == Series:
        assert len(meta.columns) == 1
        meta = meta[meta.columns[0]]

    dsk = {(name, i): (_read_parquet_row_group, myopen,
                       pf.row_group_filename(rg),
                       index_names, all_columns, rg, out_type == Series,
                       categories, pf.schema, pf.cats, pf.dtypes,
                       pf.file_scheme, storage_name_mapping)
           for i, rg in enumerate(rgs)}

    if not dsk:
        # empty dataframe
        dsk = {(name, 0): meta}
        divisions = (None, None)
        return out_type(dsk, name, meta, divisions)

    if index_names:
        index_name = meta.index.name
        minmax = fastparquet.api.sorted_partitioned_columns(pf)

        if index_name in minmax:
            divisions = (list(minmax[index_name]['min']) +
                         [minmax[index_name]['max'][-1]])
            divisions = [divisions[i] for i, rg in enumerate(pf.row_groups)
                         if rg in rgs] + [divisions[-1]]
        else:
            divisions = (None,) * (len(rgs) + 1)
    else:
        divisions = (None,) * (len(rgs) + 1)

    if isinstance(divisions[0], np.datetime64):
        divisions = [pd.Timestamp(d) for d in divisions]

    return out_type(dsk, name, meta, divisions)