Example #1
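# Assumed imports for these test snippets (not shown in the excerpt):
import numpy as np
import xarray as xr
from cmip6_preprocessing.preprocessing import combined_preprocessing
# create_test_ds and exact_attrs come from the package's test utilities.
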
def test_combined_preprocessing_mislabeled_coords():
    """Test if the renaming is applied to datavariables and then if they are moved to the coords."""
    # create a 2d dataset
    xlen, ylen, zlen = (10, 5, 1)
    ds = (create_test_ds("x", "y", "dummy", xlen, ylen, zlen)
          .squeeze()
          .drop_vars("dummy"))
    ds = ds.assign(depth=5.0)

    ds_pp = combined_preprocessing(ds)
    assert "lev" in ds_pp.coords
    np.testing.assert_allclose(ds.depth.data, ds_pp.lev.data)


def test_preserve_attrs():
    # create a 2d dataset
    xlen, ylen, zlen = (10, 5, 1)
    ds = (create_test_ds("x", "y", "dummy", xlen, ylen, zlen)
          .squeeze()
          .drop_vars("dummy"))
    ds.attrs = {"preserve_this": "here"}

    # TODO: preprocessing currently raises a bunch of errors if the metadata
    # is incomplete. Those should probably be ignored so the dataset is still
    # returned; for now, fill in the required attrs.
    for att in exact_attrs:
        ds.attrs[att] = "a"

    ds_pp = combined_preprocessing(ds)
    assert ds_pp.attrs["preserve_this"] == "here"
def test_combined_preprocessing_dropped_coords(add_coords, shift):
    """Check if coordinates are properly dropped"""
    # create a 2d dataset
    xlen, ylen, zlen = (10, 5, 1)
    ds = (create_test_ds("x", "y", "dummy", xlen, ylen, zlen)
          .squeeze()
          .drop_vars("dummy"))
    x_bnds = xr.concat([ds.x, ds.x], "bnds")
    ds = ds.assign_coords(x_bounds=x_bnds)

    if add_coords:
        ds = ds.assign_coords(bnds=np.arange(len(ds.bnds)) + shift)

    ds = combined_preprocessing(ds)

    assert "bnds" not in ds.coords
Example #4
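# Assumed imports for this snippet (not shown in the excerpt):
import fsspec
import xarray as xr
from cmip6_preprocessing.preprocessing import combined_preprocessing
from cmip6_preprocessing.utils import google_cmip_col
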
def data(source_id, variable_id, experiment_id, grid_label, use_intake_esm,
         catalog="main"):
    zarr_kwargs = {
        "consolidated": True,
        # "decode_times": False,
        "decode_times": True,
        "use_cftime": True,
    }

    cat = google_cmip_col(catalog=catalog).search(
        source_id=source_id,
        experiment_id=experiment_id,
        variable_id=variable_id,
        # member_id="r1i1p1f1",
        table_id="Omon",
        grid_label=grid_label,
    )

    if len(cat.df["zstore"]) > 0:
        if use_intake_esm:
            ddict = cat.to_dataset_dict(
                zarr_kwargs=zarr_kwargs,
                preprocess=combined_preprocessing,
                storage_options={"token": "anon"},
            )
            _, ds = ddict.popitem()
        else:
            ##### debugging options
            # @charlesbluca suggested this to make this work in GHA
            # https://github.com/jbusecke/cmip6_preprocessing/pull/62#issuecomment-741928365
            # (storage options could likely be passed to get_mapper as well)
            mm = fsspec.get_mapper(cat.df["zstore"][0])
            ds_raw = xr.open_zarr(mm, **zarr_kwargs)
            ds = combined_preprocessing(ds_raw)
    else:
        ds = None

    return ds, cat
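
For reference, a minimal call of this helper might look like the following (the argument values are illustrative, not taken from the original):

ds, cat = data("CanESM5", "thetao", "historical", "gn", use_intake_esm=True)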
Example #5
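# Assumed imports for this snippet (not shown in the excerpt):
import xarray as xr
from cmip6_preprocessing.preprocessing import combined_preprocessing
# perform_cmip6_query is a project-local helper wrapping the catalog search.
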
def get_and_organize_cmip6_data(conf):
    # Queried datasets are stored in conf.dset_dict, keyed by
    # variable/experiment/grid/source/member.

    for experiment_id in conf.experiment_ids:
        for grid_label in conf.grid_labels:
            for source_id in conf.source_ids:
                for member_id in conf.member_ids:
                    for variable_id, table_id in zip(conf.variable_ids,
                                                     conf.table_ids):

                        # Create unique key to hold dataset in dictionary
                        key = "{}_{}_{}_{}_{}".format(variable_id,
                                                      experiment_id,
                                                      grid_label, source_id,
                                                      member_id)
                        # Historical query string
                        query_string = "source_id=='{}' and table_id=='{}' and grid_label=='{}' and experiment_id=='historical' and variable_id=='{}'".format(
                            source_id, table_id, grid_label, variable_id)

                        print("Running historical query on data:\n ==> {}\n".format(query_string))
                        ds_hist = perform_cmip6_query(conf, query_string)

                        # Future projection depending on the choice of experiment_id
                        query_string = "source_id=='{}' and table_id=='{}' and member_id=='{}' and grid_label=='{}' and experiment_id=='{}' and variable_id=='{}'".format(
                            source_id,
                            table_id,
                            member_id,
                            grid_label,
                            experiment_id,
                            variable_id,
                        )
                        print("Running projections query on data:\n ==> {}\n".format(query_string))
                        ds_proj = perform_cmip6_query(conf, query_string)

                        # Concatenate the historical and projections datasets
                        ds = xr.concat([ds_hist, ds_proj], dim="time")
                        # Remove the duplicate overlapping times (e.g. 2001-2014)
                        #     _, index = np.unique(ds["time"], return_index=True)
                        #     ds = ds.isel(time=index)

                        # Extract the time period of interest
                        ds = ds.sel(time=slice(conf.start_date, conf.end_date))
                        print("{} => Dates extracted range from {} to {}\n".
                              format(source_id, ds["time"].values[0],
                                     ds["time"].values[-1]))
                        # pass the preprocessing directly
                        dset_processed = combined_preprocessing(ds)
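                        # Chlorophyll in these CESM variants sits on a
                        # "lev_partial" depth axis rather than "lev":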
                        if variable_id in ["chl"]:
                            if source_id in [
                                    "CESM2", "CESM2-FV2", "CESM2-WACCM-FV2",
                                    "CESM2-WACCM"
                            ]:
                                dset_processed = dset_processed.isel(
                                    lev_partial=conf.selected_depth)
                            else:
                                dset_processed = dset_processed.isel(
                                    lev=conf.selected_depth)

                        # Save the dataset for variable_id in the dictionary
                        conf.dset_dict[key] = dset_processed
    return conf
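
A hypothetical driver for this function (the Config object and the dictionary key below are assumptions inferred from the attribute accesses above):

conf = Config()  # hypothetical: supplies experiment_ids, grid_labels,
                 # source_ids, member_ids, variable_ids, table_ids,
                 # start/end dates, selected_depth and an empty dset_dict
conf = get_and_organize_cmip6_data(conf)
ds = conf.dset_dict["thetao_ssp585_gn_CESM2-WACCM_r1i1p1f1"]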
Example #6
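# Assumed context for this excerpt: a method of a project-local CMIP6_IO class
# (the class name is inferred from the log prefix), with module-level imports
# along the lines of:
# import logging
# from datetime import datetime
# import numpy as np
# import pandas as pd
# import xarray as xr
# from cmip6_preprocessing.preprocessing import combined_preprocessing
# CMIP6_config and CMIP6_model are project-local modules.
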
    def organize_cmip6_datasets(self, config: CMIP6_config.Config_albedo,
                                current_experiment_id):

        # for experiment_id in config.experiment_ids:
        for grid_label in config.grid_labels:
            for source_id in config.source_ids:

                if source_id in config.models:
                    model_object = config.models[source_id]
                else:
                    model_object = CMIP6_model.CMIP6_MODEL(name=source_id)

                logging.info(
                    "[CMIP6_IO] Organizing CMIP6 model object {}".format(
                        model_object.name))

                for member_id in config.member_ids:

                    for variable_id, table_id in zip(config.variable_ids,
                                                     config.table_ids):

                        # Historical query string
                        query_string = "source_id=='{}'and table_id=='{}' and member_id=='{}' and grid_label=='{}' and experiment_id=='historical' and variable_id=='{}'".format(
                            source_id, table_id, member_id, grid_label,
                            variable_id)

                        ds_hist = self.perform_cmip6_query(
                            config, query_string)

                        # Future projection depending on the choice of experiment_id
                        query_string = "source_id=='{}' and table_id=='{}' and member_id=='{}' and grid_label=='{}' and experiment_id=='{}' and variable_id=='{}'".format(
                            source_id,
                            table_id,
                            member_id,
                            grid_label,
                            current_experiment_id,
                            variable_id,
                        )
                        print("query_string: {}".format(query_string))
                        ds_proj = self.perform_cmip6_query(
                            config, query_string)

                        if isinstance(ds_proj, xr.Dataset) and isinstance(
                                ds_hist, xr.Dataset):
                            # Concatenate the historical and projections datasets
                            ds = xr.concat([ds_hist, ds_proj], dim="time")

                            if ds.indexes["time"].dtype not in [
                                    "datetime64[ns]"
                            ]:
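                                # Non-standard calendar: parse the configured
                                # dates manually and normalize the dataset to
                                # 360-day months before decoding.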
                                start_date = datetime.fromisoformat(
                                    config.start_date)
                                end_date = datetime.fromisoformat(
                                    config.end_date)
                                ds = self.to_360day_monthly(ds)
                            else:
                                start_date = config.start_date
                                end_date = config.end_date
                            ds = xr.decode_cf(ds)
                            logging.info(
                                "[CMIP6_IO] Variable: {} and units {}".format(
                                    variable_id, ds[variable_id].units))
                            if variable_id in ["prw"]:
                                # 1 kg of rain water spread over 1 square meter of surface is 1 mm in thickness
                                # The pvlib functions takes cm so we convert values
                                ds[variable_id].values = ds[
                                    variable_id].values / 10.0
                                ds.attrs["units"] = "cm"
                                logging.info(
                                    "[CMIP6_IO] Minimum {} and maximum {} values after converting to {} units"
                                    .format(np.nanmin(ds[variable_id].values),
                                            np.nanmax(ds[variable_id].values),
                                            ds[variable_id].units))

                            if variable_id in ["tas"]:
                                if ds[variable_id].units in [
                                        "K", "Kelvin", "kelvin"
                                ]:
                                    ds[variable_id].values = ds[
                                        variable_id].values - 273.15
                                    ds.attrs["units"] = "C"
                                    logging.info(
                                        "[CMIP6_IO] Minimum {} and maximum {} values after converting to {} units"
                                        .format(
                                            np.nanmin(ds[variable_id].values),
                                            np.nanmax(ds[variable_id].values),
                                            ds[variable_id].units))

                            # Remove the duplicate overlapping times (e.g. 2001-2014)
                            _, index = np.unique(ds["time"], return_index=True)
                            ds = ds.isel(time=index)
                            # if not isinstance((ds.indexes["time"]), pd.DatetimeIndex):
                            #     ds["time"] = ds.indexes["time"].to_datetimeindex()
                            # Extract the time period of interest
                            ds = ds.sel(time=slice(start_date, end_date))
                            ds["time"] = pd.to_datetime(ds.indexes["time"])

                            logging.info(
                                "[CMIP6_IO] {} => Extracted {} range from {} to {} for member {}"
                                .format(source_id, variable_id,
                                        ds["time"].values[0],
                                        ds["time"].values[-1], member_id))

                            # pass the pre-processing directly
                            dset_processed = combined_preprocessing(ds)
                            if variable_id in ["chl"]:
                                if source_id in [
                                        "CESM2", "CESM2-FV2", "CESM2-WACCM-FV2"
                                ]:
                                    dset_processed = dset_processed.isel(
                                        lev_partial=config.selected_depth)
                                else:
                                    dset_processed = dset_processed.isel(
                                        lev=config.selected_depth)
                            if variable_id in ["ph"]:

                                logging.info(
                                    "[CMIP6_IO] => Extract only depth level {}"
                                    .format(config.selected_depth))
                                dset_processed = dset_processed.isel(
                                    lev=config.selected_depth)

                            # Save the info to the model object
                            if member_id not in model_object.member_ids:
                                model_object.member_ids.append(member_id)

                            if member_id not in model_object.ocean_vars:
                                model_object.ocean_vars[member_id] = []
                            if variable_id not in model_object.ocean_vars[
                                    member_id]:
                                model_object.ocean_vars[member_id].append(
                                    variable_id)

                            self.dataset_into_model_dictionary(
                                member_id, variable_id, dset_processed,
                                model_object)

                        else:
                            logging.error(
                                "[CMIP6_IO] Error - unable to find variable {}"
                                .format(variable_id))

                self.models.append(model_object)
                logging.info(
                    "[CMIP6_IO] Stored ocean variables for {} member(s) of model {}"
                    .format(len(model_object.ocean_vars), model_object.name))
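
A hypothetical invocation of this method, for reference (the CMIP6_IO class name is inferred from the log prefix, and the config constructor from the type hint; neither appears in the excerpt):

io = CMIP6_IO()
config = CMIP6_config.Config_albedo()
io.organize_cmip6_datasets(config, current_experiment_id="ssp585")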