def test_autocat(tempdir):
    tmp = str(tempdir)
    fn = os.path.join(tmp, "test.parq")
    data = pd.DataFrame(
        {'o': pd.Categorical(np.random.choice(['hello', 'world'], size=1000))})
    write(fn, data)
    pf = ParquetFile(fn)
    assert 'o' in pf.categories
    assert pf.categories['o'] == 2
    assert pf.dtypes['o'] == 'category'
    out = pf.to_pandas()
    assert out.dtypes['o'] == 'category'
    out = pf.to_pandas(categories={})
    assert str(out.dtypes['o']) != 'category'
    out = pf.to_pandas(categories=['o'])
    assert out.dtypes['o'] == 'category'
    out = pf.to_pandas(categories={'o': 2})
    assert out.dtypes['o'] == 'category'

    # regression test
    pf.fmd.key_value_metadata = [
        parquet_thrift.KeyValue(key='fastparquet.cats', value='{"o": 2}')
    ]
    pf._set_attrs()
    assert 'o' in pf.categories
    assert pf.categories['o'] == 2
    assert pf.dtypes['o'] == 'category'
    out = pf.to_pandas()
    assert out.dtypes['o'] == 'category'
    out = pf.to_pandas(categories={})
    assert str(out.dtypes['o']) != 'category'
    out = pf.to_pandas(categories=['o'])
    assert out.dtypes['o'] == 'category'
    out = pf.to_pandas(categories={'o': 2})
    assert out.dtypes['o'] == 'category'
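
# A minimal usage sketch (not part of the test above) of the behavior that
# test_autocat exercises: a pandas Categorical column written with fastparquet
# round-trips as a 'category' dtype by default, and `categories={}` opts out.
# The file name "example.parq" and size=10 are illustrative assumptions.
import os
import tempfile

import numpy as np
import pandas as pd
from fastparquet import ParquetFile, write

with tempfile.TemporaryDirectory() as tmp:
    fn = os.path.join(tmp, "example.parq")
    df = pd.DataFrame(
        {'o': pd.Categorical(np.random.choice(['hello', 'world'], size=10))})
    write(fn, df)

    pf = ParquetFile(fn)
    # Reading back with no arguments preserves the categorical dtype...
    assert str(pf.to_pandas().dtypes['o']) == 'category'
    # ...while an empty `categories` mapping decodes 'o' as plain strings.
    assert str(pf.to_pandas(categories={}).dtypes['o']) != 'category'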
def read_partition(cls, fs, piece, columns, index, categories=(), **kwargs):
    null_index_name = False
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            #
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need to use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index

    # Use global `parquet_file` object. Need to reattach
    # the desired row_group
    parquet_file = kwargs.pop("parquet_file", None)

    if isinstance(piece, tuple):
        if isinstance(piece[0], str):
            # We have a path to read from
            assert parquet_file is None
            parquet_file = ParquetFile(
                piece[0], open_with=fs.open, sep=fs.sep, **kwargs.get("file", {})
            )
            rg_indices = piece[1] or list(range(len(parquet_file.row_groups)))
            # `piece[1]` will contain row-group indices
            row_groups = [parquet_file.row_groups[rg] for rg in rg_indices]
        elif parquet_file:
            # `piece[0]` will contain actual row-group objects,
            # but they may be pickled
            row_groups = piece[0]
            if isinstance(row_groups, bytes):
                row_groups = pickle.loads(row_groups)
            parquet_file.fmd.row_groups = row_groups
            # NOTE: May lose cats after `_set_attrs` call
            save_cats = parquet_file.cats
            parquet_file._set_attrs()
            parquet_file.cats = save_cats
        else:
            raise ValueError("Neither path nor ParquetFile detected!")

        if null_index_name:
            if "__index_level_0__" in parquet_file.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index

        parquet_file._dtypes = (
            lambda *args: parquet_file.dtypes
        )  # ugly patch, could be fixed

        # Read necessary row-groups and concatenate
        dfs = []
        for row_group in row_groups:
            dfs.append(
                parquet_file.read_row_group_file(
                    row_group,
                    columns,
                    categories,
                    index=index,
                    **kwargs.get("read", {}),
                )
            )
        return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

    else:
        # `piece` is NOT a tuple
        raise ValueError(f"Expected tuple, got {type(piece)}")
def read_partition(
    cls,
    fs,
    pieces,
    columns,
    index,
    categories=(),
    root_cats=None,
    root_file_scheme=None,
    base_path=None,
    **kwargs,
):
    null_index_name = False
    base_path = False if not root_cats else base_path
    if isinstance(index, list):
        if index == [None]:
            # Handling a None-labeled index...
            #
            # The pandas metadata told us to read in an index
            # labeled `None`. If this corresponds to a `RangeIndex`,
            # fastparquet will need to use the pandas metadata to
            # construct the index. Otherwise, the index will correspond
            # to a column named "__index_level_0__". We will need to
            # check the `ParquetFile` object for this column below.
            index = []
            null_index_name = True
        columns += index

    # Use global `parquet_file` object. Need to reattach
    # the desired row_group
    parquet_file = kwargs.pop("parquet_file", None)

    # Always convert pieces to list
    if not isinstance(pieces, list):
        pieces = [pieces]

    sample = pieces[0]
    if isinstance(sample, tuple):
        if isinstance(sample[0], str):
            # We have paths to read from
            assert parquet_file is None
            row_groups = []
            rg_offset = 0
            parquet_file = ParquetFile(
                [p[0] for p in pieces],
                open_with=fs.open,
                root=base_path or False,
                **kwargs.get("file", {}),
            )
            for piece in pieces:
                _pf = (
                    parquet_file
                    if len(pieces) == 1
                    else ParquetFile(
                        piece[0],
                        open_with=fs.open,
                        root=base_path or False,
                        **kwargs.get("file", {}),
                    )
                )
                n_local_row_groups = len(_pf.row_groups)
                local_rg_indices = piece[1] or list(range(n_local_row_groups))
                row_groups += [
                    parquet_file.row_groups[rg + rg_offset]
                    for rg in local_rg_indices
                ]
                rg_offset += n_local_row_groups
            update_parquet_file = len(row_groups) < len(parquet_file.row_groups)
        elif parquet_file:
            row_groups = []
            for piece in pieces:
                # `piece[0]` will contain actual row-group objects,
                # but they may be pickled
                rgs = piece[0]
                if isinstance(rgs, bytes):
                    rgs = pickle.loads(rgs)
                row_groups += rgs
            update_parquet_file = True
        else:
            raise ValueError("Neither path nor ParquetFile detected!")

        if update_parquet_file:
            with _FP_FILE_LOCK:
                parquet_file.fmd.row_groups = row_groups
                # NOTE: May lose cats after `_set_attrs` call
                save_cats = parquet_file.cats
                parquet_file._set_attrs()
                parquet_file.cats = save_cats

        if null_index_name:
            if "__index_level_0__" in parquet_file.columns:
                # See "Handling a None-labeled index" comment above
                index = ["__index_level_0__"]
                columns += index

        # Update hive-partitioning information if necessary
        parquet_file.cats = root_cats or {}
        if root_cats:
            parquet_file.file_scheme = root_file_scheme

        parquet_file._dtypes = (
            lambda *args: parquet_file.dtypes
        )  # ugly patch, could be fixed

        if set(columns).issubset(
            parquet_file.columns + list(parquet_file.cats.keys())
        ):
            # Convert ParquetFile to pandas
            return parquet_file.to_pandas(
                columns=columns,
                categories=categories,
                index=index,
            )
        else:
            # Read necessary row-groups and concatenate
            dfs = []
            for row_group in row_groups:
                dfs.append(
                    parquet_file.read_row_group_file(
                        row_group,
                        columns,
                        categories,
                        index=index,
                        **kwargs.get("read", {}),
                    )
                )
            return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

    else:
        # `sample` is NOT a tuple
        raise ValueError(f"Expected tuple, got {type(sample)}")