def test_combined_preprocessing_mislabeled_coords():
    """Test that renaming is applied to data variables and that they are then moved to the coords."""
    # create a 2d dataset
    xlen, ylen, zlen = (10, 5, 1)
    ds = (
        create_test_ds("x", "y", "dummy", xlen, ylen, zlen)
        .squeeze()
        .drop_vars("dummy")
    )
    ds = ds.assign(depth=5.0)
    ds_pp = combined_preprocessing(ds)
    assert "lev" in ds_pp.coords
    np.testing.assert_allclose(ds.depth.data, ds_pp.lev.data)
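# `create_test_ds` is assumed to come from the surrounding test suite. A
# minimal sketch of such a helper, with assumed signature and coordinate
# values (not the suite's actual implementation):
def create_test_ds_sketch(xname, yname, zname, xlen, ylen, zlen):
    """Build a small random dataset with three named dims and 1D coords."""
    data = np.random.rand(xlen, ylen, zlen)
    return xr.Dataset(
        {"data": ([xname, yname, zname], data)},
        coords={
            xname: np.arange(xlen),
            yname: np.arange(ylen),
            zname: np.arange(zlen),
        },
    )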
def test_preserve_attrs():
    # create a 2d dataset
    xlen, ylen, zlen = (10, 5, 1)
    ds = (
        create_test_ds("x", "y", "dummy", xlen, ylen, zlen)
        .squeeze()
        .drop_vars("dummy")
    )
    ds.attrs = {"preserve_this": "here"}
    # TODO: there are a bunch of errors if the metadata is not complete.
    # Should those be ignored so the dataset is still returned?
    # For now, create the full set of required attributes.
    for att in exact_attrs:
        ds.attrs[att] = "a"
    ds_pp = combined_preprocessing(ds)
    assert ds_pp.attrs["preserve_this"] == "here"
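# `exact_attrs` is assumed to be a list of CMIP6 global attributes that
# combined_preprocessing expects to be present. A plausible (assumed) subset,
# shown for illustration only:
exact_attrs_example = [
    "source_id",
    "grid_label",
    "experiment_id",
    "table_id",
    "variant_label",
]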
def test_combined_preprocessing_dropped_coords(add_coords, shift):
    """Check if coordinates are properly dropped"""
    # create a 2d dataset
    xlen, ylen, zlen = (10, 5, 1)
    ds = (
        create_test_ds("x", "y", "dummy", xlen, ylen, zlen)
        .squeeze()
        .drop_vars("dummy")
    )
    x_bnds = xr.concat([ds.x, ds.x], "bnds")
    ds = ds.assign_coords(x_bounds=x_bnds)
    if add_coords:
        ds = ds.assign_coords(bnds=np.arange(len(ds.bnds)) + shift)
    ds = combined_preprocessing(ds)
    assert "bnds" not in ds.coords
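# `add_coords` and `shift` are plain function arguments, which suggests the
# test is driven by pytest parametrization. A plausible (assumed) set of
# decorators, not necessarily the suite's actual values:
#
# @pytest.mark.parametrize("shift", [0, 1])
# @pytest.mark.parametrize("add_coords", [True, False])
# def test_combined_preprocessing_dropped_coords(add_coords, shift):
#     ...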
def data(source_id, variable_id, experiment_id, grid_label, use_intake_esm, catalog="main"):
    zarr_kwargs = {
        "consolidated": True,
        # "decode_times": False,
        "decode_times": True,
        "use_cftime": True,
    }
    cat = google_cmip_col(catalog=catalog).search(
        source_id=source_id,
        experiment_id=experiment_id,
        variable_id=variable_id,
        # member_id="r1i1p1f1",
        table_id="Omon",
        grid_label=grid_label,
    )
    if len(cat.df["zstore"]) > 0:
        if use_intake_esm:
            ddict = cat.to_dataset_dict(
                zarr_kwargs=zarr_kwargs,
                preprocess=combined_preprocessing,
                storage_options={"token": "anon"},
            )
            _, ds = ddict.popitem()
        else:
            ##### debugging options
            # @charlesbluca suggested this to make this work in GHA
            # https://github.com/jbusecke/cmip6_preprocessing/pull/62#issuecomment-741928365
            # storage options can probably be passed to the mapper here as well
            mm = fsspec.get_mapper(cat.df["zstore"][0])
            ds_raw = xr.open_zarr(mm, **zarr_kwargs)
            ds = combined_preprocessing(ds_raw)
    else:
        ds = None
    return ds, cat
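# Hypothetical invocation of the `data` helper above; the model/variable
# combination is only an example and may not exist in the catalog:
if __name__ == "__main__":
    ds, cat = data(
        source_id="GFDL-ESM4",
        variable_id="thetao",
        experiment_id="historical",
        grid_label="gn",
        use_intake_esm=True,
    )
    if ds is not None:
        print(ds)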
def get_and_organize_cmip6_data(conf):
    # Query each variable/model combination and store the result in conf.dset_dict
    for experiment_id in conf.experiment_ids:
        for grid_label in conf.grid_labels:
            for source_id in conf.source_ids:
                for member_id in conf.member_ids:
                    for variable_id, table_id in zip(conf.variable_ids, conf.table_ids):
                        # Create a unique key to hold the dataset in the dictionary
                        key = "{}_{}_{}_{}_{}".format(
                            variable_id, experiment_id, grid_label, source_id, member_id
                        )

                        # Historical query string
                        query_string = (
                            "source_id=='{}' and table_id=='{}' and grid_label=='{}' "
                            "and experiment_id=='historical' and variable_id=='{}'"
                        ).format(source_id, table_id, grid_label, variable_id)
                        print("Running historical query on data: \n ==> {}\n".format(query_string))
                        ds_hist = perform_cmip6_query(conf, query_string)

                        # Future projection depending on the choice of experiment_id
                        query_string = (
                            "source_id=='{}' and table_id=='{}' and member_id=='{}' "
                            "and grid_label=='{}' and experiment_id=='{}' and variable_id=='{}'"
                        ).format(source_id, table_id, member_id, grid_label, experiment_id, variable_id)
                        print("Running projections query on data: \n ==> {}\n".format(query_string))
                        ds_proj = perform_cmip6_query(conf, query_string)

                        # Concatenate the historical and projection datasets
                        ds = xr.concat([ds_hist, ds_proj], dim="time")

                        # Remove the duplicate overlapping times (e.g. 2001-2014)
                        # _, index = np.unique(ds["time"], return_index=True)
                        # ds = ds.isel(time=index)

                        # Extract the time period of interest
                        ds = ds.sel(time=slice(conf.start_date, conf.end_date))
                        print(
                            "{} => Dates extracted range from {} to {}\n".format(
                                source_id, ds["time"].values[0], ds["time"].values[-1]
                            )
                        )

                        # Apply the preprocessing directly
                        dset_processed = combined_preprocessing(ds)
                        if variable_id in ["chl"]:
                            if source_id in ["CESM2", "CESM2-FV2", "CESM2-WACCM-FV2", "CESM2-WACCM"]:
                                dset_processed = dset_processed.isel(lev_partial=conf.selected_depth)
                            else:
                                dset_processed = dset_processed.isel(lev=conf.selected_depth)

                        # Save the dataset for variable_id in the dictionary
                        conf.dset_dict[key] = dset_processed
    return conf
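# `perform_cmip6_query` is assumed to run a pandas-style query against an
# intake-esm catalog dataframe and return one xarray.Dataset (or None when
# nothing matches). A minimal sketch under that assumption; `conf.cat`
# holding the opened catalog is also an assumption:
def perform_cmip6_query_sketch(conf, query_string):
    subset = conf.cat.df.query(query_string)
    if subset.empty:
        return None
    # Open the first matching zarr store anonymously from Google Cloud
    mapper = fsspec.get_mapper(subset["zstore"].values[0])
    return xr.open_zarr(mapper, consolidated=True, use_cftime=True)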
def organize_cmip6_datasets(self, config: CMIP6_config.Config_albedo, current_experiment_id):
    # for experiment_id in config.experiment_ids:
    for grid_label in config.grid_labels:
        for source_id in config.source_ids:
            if source_id in config.models.keys():
                model_object = config.models[source_id]
            else:
                model_object = CMIP6_model.CMIP6_MODEL(name=source_id)
            logging.info("[CMIP6_IO] Organizing CMIP6 model object {}".format(model_object.name))

            for member_id in config.member_ids:
                for variable_id, table_id in zip(config.variable_ids, config.table_ids):
                    # Historical query string
                    query_string = (
                        "source_id=='{}' and table_id=='{}' and member_id=='{}' "
                        "and grid_label=='{}' and experiment_id=='historical' and variable_id=='{}'"
                    ).format(source_id, table_id, member_id, grid_label, variable_id)
                    ds_hist = self.perform_cmip6_query(config, query_string)

                    # Future projection depending on the choice of experiment_id
                    query_string = (
                        "source_id=='{}' and table_id=='{}' and member_id=='{}' "
                        "and grid_label=='{}' and experiment_id=='{}' and variable_id=='{}'"
                    ).format(source_id, table_id, member_id, grid_label, current_experiment_id, variable_id)
                    print("query_string: {}".format(query_string))
                    ds_proj = self.perform_cmip6_query(config, query_string)

                    if isinstance(ds_proj, xr.Dataset) and isinstance(ds_hist, xr.Dataset):
                        # Concatenate the historical and projection datasets
                        ds = xr.concat([ds_hist, ds_proj], dim="time")

                        if ds.indexes["time"].dtype not in ["datetime64[ns]"]:
                            start_date = datetime.fromisoformat(config.start_date)
                            end_date = datetime.fromisoformat(config.end_date)
                            ds = self.to_360day_monthly(ds)
                        else:
                            start_date = config.start_date
                            end_date = config.end_date
                        ds = xr.decode_cf(ds)
                        logging.info(
                            "[CMIP6_IO] Variable: {} and units {}".format(variable_id, ds[variable_id].units)
                        )

                        if variable_id in ["prw"]:
                            # 1 kg of rain water spread over 1 square meter of surface is 1 mm in thickness.
                            # The pvlib functions take cm, so convert the values.
                            ds[variable_id].values = ds[variable_id].values / 10.0
                            # Set units on the variable (not the dataset) so the log below reflects the change
                            ds[variable_id].attrs["units"] = "cm"
                            logging.info(
                                "[CMIP6_IO] Minimum {} and maximum {} values after converting to {} units".format(
                                    np.nanmin(ds[variable_id].values),
                                    np.nanmax(ds[variable_id].values),
                                    ds[variable_id].units,
                                )
                            )

                        if variable_id in ["tas"]:
                            if ds[variable_id].units in ["K", "Kelvin", "kelvin"]:
                                ds[variable_id].values = ds[variable_id].values - 273.15
                                ds[variable_id].attrs["units"] = "C"
                                logging.info(
                                    "[CMIP6_IO] Minimum {} and maximum {} values after converting to {} units".format(
                                        np.nanmin(ds[variable_id].values),
                                        np.nanmax(ds[variable_id].values),
                                        ds[variable_id].units,
                                    )
                                )

                        # Remove the duplicate overlapping times (e.g. 2001-2014)
                        _, index = np.unique(ds["time"], return_index=True)
                        ds = ds.isel(time=index)

                        # if not isinstance((ds.indexes["time"]), pd.DatetimeIndex):
                        #     ds["time"] = ds.indexes["time"].to_datetimeindex()
                        ds["time"] = pd.to_datetime(ds.indexes["time"])

                        # Extract the time period of interest
                        ds = ds.sel(time=slice(start_date, end_date))
                        logging.info(
                            "[CMIP6_IO] {} => Extracted {} range from {} to {} for member {}".format(
                                source_id, variable_id, ds["time"].values[0], ds["time"].values[-1], member_id
                            )
                        )

                        # Apply the pre-processing directly
                        dset_processed = combined_preprocessing(ds)
                        if variable_id in ["chl"]:
                            if source_id in ["CESM2", "CESM2-FV2", "CESM2-WACCM-FV2"]:
                                dset_processed = dset_processed.isel(lev_partial=config.selected_depth)
                            else:
                                dset_processed = dset_processed.isel(lev=config.selected_depth)
                        if variable_id in ["ph"]:
                            logging.info(
                                "[CMIP6_IO] => Extract only depth level {}".format(config.selected_depth)
                            )
                            dset_processed = dset_processed.isel(lev=config.selected_depth)

                        # Save the info to the model object
                        if member_id not in model_object.member_ids:
                            model_object.member_ids.append(member_id)
                        if member_id not in model_object.ocean_vars.keys():
                            model_object.ocean_vars[member_id] = []
                        if variable_id not in model_object.ocean_vars[member_id]:
                            model_object.ocean_vars[member_id].append(variable_id)

                        self.dataset_into_model_dictionary(
                            member_id, variable_id, dset_processed, model_object
                        )
                    else:
                        logging.error("[CMIP6_IO] Error - unable to find variable {}".format(variable_id))

            self.models.append(model_object)
            logging.info(
                "[CMIP6_IO] Stored {} variables for model {}".format(
                    len(model_object.ocean_vars), model_object.name
                )
            )
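# `to_360day_monthly` is assumed to normalize a non-standard calendar onto a
# 360-day monthly axis so historical and scenario runs can be concatenated
# and sliced consistently. A minimal sketch of one way to do this with cftime
# (an assumption, not the class's actual implementation):
def to_360day_monthly_sketch(ds):
    """Replace each timestamp with mid-month in a 360-day calendar."""
    import cftime

    new_times = [
        cftime.Datetime360Day(t.year, t.month, 16) for t in ds.indexes["time"]
    ]
    return ds.assign_coords(time=new_times)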