import os
from pathlib import Path
from typing import List, Mapping, Optional

import fsspec
import pandas as pd
import pyarrow as pa
import pyarrow.dataset as ds
import pyarrow.parquet as pq


def detect_folders(
    bucket: str,
    fs: fsspec.AbstractFileSystem,
) -> Mapping[str, DiagnosticFolder]:
    """Return a ``DiagnosticFolder`` for each directory under ``bucket`` that
    contains a ``diags.nc`` file, keyed by the directory name."""
    diag_ncs = fs.glob(os.path.join(bucket, "*", "diags.nc"))
    return {
        Path(url).parent.name: DiagnosticFolder(fs, Path(url).parent.as_posix())
        for url in diag_ncs
    }

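# A minimal usage sketch (illustrative only, not part of the original module): assuming a
# bucket laid out as ``<bucket>/<run-name>/diags.nc`` and that ``DiagnosticFolder`` is
# defined or imported elsewhere in this module, each run name maps to its folder wrapper:
#
#     fs = fsspec.filesystem("gs")          # or fsspec.filesystem("file") for local paths
#     folders = detect_folders("my-bucket/diagnostics", fs)   # hypothetical bucket name
#     for run_name, folder in folders.items():
#         print(run_name, folder)
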
def write_parquet(
    fs: fsspec.AbstractFileSystem,
    path: str,
    df: pd.DataFrame,
    partition_cols: Optional[List[str]],
    schema: pa.Schema,
    **kwargs,
):
    """
    Write a single dataframe to parquet.
    """
    # Check partition values are valid before writing to parquet
    mappings = check_partition_columns(df=df, partition_columns=partition_cols)
    df = clean_partition_cols(df=df, mappings=mappings)

    # Dataframe -> pyarrow Table
    table = pa.Table.from_pandas(df, schema=schema)

    if "basename_template" not in kwargs and "ts_init" in df.columns:
        kwargs["basename_template"] = (
            f"{df['ts_init'].min()}-{df['ts_init'].max()}" + "-{i}.parquet"
        )

    # Write the actual file
    partitions = (
        ds.partitioning(
            schema=pa.schema(fields=[table.schema.field(c) for c in partition_cols]),
            flavor="hive",
        )
        if partition_cols
        else None
    )
    # Compare the pyarrow major version numerically; a plain string comparison breaks
    # once the major version reaches 10.
    if int(pa.__version__.split(".")[0]) >= 6:
        kwargs.update(existing_data_behavior="overwrite_or_ignore")
    files = set(fs.glob(f"{path}/**"))
    ds.write_dataset(
        data=table,
        base_dir=path,
        filesystem=fs,
        partitioning=partitions,
        format="parquet",
        **kwargs,
    )

    # Ensure data written by write_dataset is sorted
    new_files = set(fs.glob(f"{path}/**/*.parquet")) - files
    del df
    for fn in new_files:
        ndf = pd.read_parquet(fs.open(fn))
        # assert ndf.shape[0] == shape
        if "ts_init" in ndf.columns:
            ndf = ndf.sort_values("ts_init").reset_index(drop=True)
        pq.write_table(
            table=pa.Table.from_pandas(ndf),
            where=fn,
            filesystem=fs,
        )

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(table.schema, f"{path}/_common_metadata", version="2.6", filesystem=fs)

    # Write out any partition columns we had to modify due to filesystem requirements
    if mappings:
        write_partition_column_mappings(fs=fs, path=path, mappings=mappings)
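
# A minimal, hedged usage sketch (not part of the original code): write a tiny frame with a
# ``ts_init`` column, unpartitioned, to a local directory. The path ``/tmp/example_catalog``
# and the column layout are illustrative assumptions, and it presumes the project-internal
# helpers used by ``write_parquet`` (e.g. ``check_partition_columns``) are importable here.
if __name__ == "__main__":
    local_fs = fsspec.filesystem("file")
    example_df = pd.DataFrame({"ts_init": [2, 1, 3], "price": [1.0, 1.5, 2.0]})
    example_schema = pa.schema([("ts_init", pa.int64()), ("price", pa.float64())])
    write_parquet(
        fs=local_fs,
        path="/tmp/example_catalog",
        df=example_df,
        partition_cols=None,
        schema=example_schema,
    )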