def read_partition(fs, piece, columns, index, categories=(), pf=None, **kwargs):
    if isinstance(index, list):
        columns += index
    if pf:
        # A shared ParquetFile was precomputed: read just this row group.
        df = pf.read_row_group_file(
            piece, columns, categories, index=index, **kwargs.get("read", {})
        )
    else:
        # No shared metadata: open the single file and patch its row-group
        # column chunks so file paths resolve relative to the dataset root.
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = _paths_to_cats(fns, scheme)
        pf.fn = base
        df = pf.to_pandas(columns, categories, index=index)
    return df
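# Illustrative aside: a minimal, self-contained sketch (not part of the
# original source) of the file_path-patching pattern used above, assuming
# fastparquet's classic ParquetFile API and a local fsspec filesystem.
# The paths and column names are hypothetical.
import os

import fsspec
import pandas as pd
from fastparquet import ParquetFile

fs = fsspec.filesystem("file")  # any fsspec filesystem exposes .open
os.makedirs("dataset", exist_ok=True)
pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}).to_parquet(
    "dataset/part.0.parquet", engine="fastparquet"
)

base, fn = "dataset", "dataset/part.0.parquet"
pf = ParquetFile(fn, open_with=fs.open)
for rg in pf.row_groups:
    for ch in rg.columns:
        # store the path relative to the dataset root, as the function above does
        ch.file_path = fn.replace(base, "").lstrip("/")
pf.fn = base  # data files now resolve as base + file_path
print(pf.to_pandas(columns=["a"]))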
def _read_pf_simple(fs, path, base, index_names, all_columns, is_series,
                    categories, cats, scheme, storage_name_mapping):
    """Read dataset with fastparquet using ParquetFile machinery"""
    from fastparquet import ParquetFile

    pf = ParquetFile(path, open_with=fs.open)
    relpath = path.replace(base, "").lstrip("/")
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = relpath
    pf.file_scheme = scheme
    pf.cats = cats
    pf.fn = base
    df = pf.to_pandas(all_columns, categories, index=index_names)

    # Restore user-facing names from the sanitized storage names
    if df.index.nlevels == 1:
        if index_names:
            df.index.name = storage_name_mapping.get(index_names[0], index_names[0])
    else:
        if index_names:
            df.index.names = [
                storage_name_mapping.get(name, name) for name in index_names
            ]
    df.columns = [
        storage_name_mapping.get(col, col)
        for col in all_columns
        if col not in (index_names or [])
    ]

    if is_series:
        return df[df.columns[0]]
    else:
        return df
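# Illustrative aside: a tiny sketch (not part of the original source) of the
# storage_name_mapping round-trip performed above. Columns are stored under
# sanitized names and renamed back to user-facing names after reading; the
# mapping below is hypothetical.
import pandas as pd

storage_name_mapping = {"col_0": "weight (kg)", "col_1": "height (m)"}
df = pd.DataFrame({"col_0": [70.0], "col_1": [1.8]})

# Same list-comprehension rename that _read_pf_simple applies to its result
df.columns = [storage_name_mapping.get(col, col) for col in df.columns]
assert list(df.columns) == ["weight (kg)", "height (m)"]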
def read_partition(fs, piece, columns, index, categories=(), pf=None, **kwargs):
    if isinstance(index, list):
        columns += index
    if pf is None:
        # No precomputed metadata: open this one file directly and patch the
        # row-group column chunks with paths relative to the dataset root.
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = _paths_to_cats(fns, scheme)
        pf.fn = base
        return pf.to_pandas(columns, categories, index=index)
    else:
        if isinstance(pf, tuple):
            # Rebuild the shared ParquetFile from the lightweight payload
            # shipped with the task.
            pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
            pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
            pf.fmd.row_groups = None
        # `piece` is an integer row-group index into the shared ParquetFile
        rg_piece = pf.row_groups[piece]
        pf.fmd.key_value_metadata = None
        return pf.read_row_group_file(
            rg_piece, columns, categories, index=index, **kwargs.get("read", {})
        )
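# Illustrative aside: a sketch (not part of the original source) of what the
# integer `piece` indexes in the precomputed-`pf` branch above -- one row
# group per output partition. The file name and row_group_offsets value are
# hypothetical.
import pandas as pd
from fastparquet import ParquetFile, write

write("shared.parquet", pd.DataFrame({"x": range(6)}), row_group_offsets=3)

pf = ParquetFile("shared.parquet")
for piece in range(len(pf.row_groups)):
    rg_piece = pf.row_groups[piece]
    part = pf.read_row_group_file(rg_piece, ["x"], [])
    print(piece, len(part))  # two pieces, 3 rows each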
def read_partition(
    cls, fs, piece, columns, index, categories=(), pf=None, **kwargs
):
    null_index_name = False
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need to use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index

    if pf is None:
        base, fns = _analyze_paths([piece], fs)
        scheme = get_file_scheme(fns)
        pf = ParquetFile(piece, open_with=fs.open)
        relpath = piece.replace(base, "").lstrip("/")
        for rg in pf.row_groups:
            for ch in rg.columns:
                ch.file_path = relpath
        pf.file_scheme = scheme
        pf.cats = paths_to_cats(fns, scheme)
        pf.fn = base
        if null_index_name and "__index_level_0__" in pf.columns:
            # See "Handling a None-labeled index" comment above
            index = ["__index_level_0__"]
            columns += index
        return pf.to_pandas(columns, categories, index=index)
    else:
        if isinstance(pf, tuple):
            if isinstance(pf[0], list):
                pf = _determine_pf_parts(fs, pf[0], pf[1], **kwargs)[1]
            else:
                pf = ParquetFile(
                    pf[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
                )
            pf._dtypes = lambda *args: pf.dtypes  # ugly patch, could be fixed
            pf.fmd.row_groups = None
        rg_piece = pf.row_groups[piece]
        if null_index_name:
            if "__index_level_0__" in pf.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index
                pf.fmd.key_value_metadata = None
        else:
            pf.fmd.key_value_metadata = None
        return pf.read_row_group_file(
            rg_piece, columns, categories, index=index, **kwargs.get("read", {})
        )
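# Illustrative aside: a sketch (not part of the original source) of the
# "pandas" key-value metadata that the None-labeled-index handling above
# inspects. Nulling pf.fmd.key_value_metadata is what stops fastparquet from
# reconstructing an index from this metadata. Assumes a recent fastparquet
# that records pandas metadata on write; the file name is hypothetical.
import pandas as pd
from fastparquet import ParquetFile, write

write("meta.parquet", pd.DataFrame({"x": [1, 2]}))

pf = ParquetFile("meta.parquet")
# JSON blob recording index_columns, e.g. a RangeIndex description
print(pf.key_value_metadata["pandas"])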
def _read_parquet_file(fs, base, fn, index, columns, series, categories,
                       cs, dt, scheme, storage_name_mapping, *args):
    """Read a single file with fastparquet, to be used in a task"""
    from collections import OrderedDict

    from fastparquet.api import ParquetFile

    # Invert the mapping: user-facing name -> on-disk storage name
    name_storage_mapping = {v: k for k, v in storage_name_mapping.items()}
    if not isinstance(columns, (tuple, list)):
        columns = [columns]
        series = True
    if index:
        index, = index
        if index not in columns:
            columns = columns + [index]

    columns = [name_storage_mapping.get(col, col) for col in columns]
    index = name_storage_mapping.get(index, index)
    cs = OrderedDict([(k, v) for k, v in cs.items() if k in columns])

    pf = ParquetFile(fn, open_with=fs.open)
    pf.file_scheme = scheme
    for rg in pf.row_groups:
        for ch in rg.columns:
            ch.file_path = fn.replace(base, "").lstrip("/")
    pf.fn = base
    df = pf.to_pandas(columns=columns, index=index, categories=categories)

    if df.index.nlevels == 1:
        if index:
            df.index.name = storage_name_mapping.get(index, index)
    else:
        if index:
            df.index.names = [
                storage_name_mapping.get(name, name) for name in index
            ]
    df.columns = [
        storage_name_mapping.get(col, col) for col in columns if col != index
    ]

    if series:
        return df[df.columns[0]]
    else:
        return df
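# Illustrative aside: the `series` convention used above in miniature (not
# part of the original source) -- a scalar column request is wrapped in a
# list for the read, then the one-column result is squeezed back to a
# Series. The data is hypothetical.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
columns, series = "a", False
if not isinstance(columns, (tuple, list)):
    columns = [columns]
    series = True
out = df[columns]
print(out[out.columns[0]] if series else out)  # prints a pandas Series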