Example 1
def get_obs(
    obs: str,
    train_period: slice,
    variables: Union[str, List[str]],
    chunking_approach: Optional[str] = None,
    cache_within_rechunk: Optional[bool] = True,
) -> xr.Dataset:
    """
    Load the observation dataset (currently only ERA5) for the training period and,
    if a chunking_approach is given, rechunk it, caching the rechunked output when
    cache_within_rechunk is True.
    """
    if obs == 'ERA5':
        ds_obs = open_era5(variables=variables, time_period=train_period)
    else:
        raise NotImplementedError(
            'only ERA5 is available as observation dataset right now')

    if chunking_approach is None:
        return ds_obs

    if cache_within_rechunk:
        path_dict = {
            'obs': obs,
            'train_period': train_period,
            'variables': variables,
        }
        rechunked_path = make_rechunked_obs_path(
            chunking_approach=chunking_approach,
            **path_dict,
        )
    else:
        rechunked_path = None
    ds_obs_rechunked = rechunk_zarr_array_with_caching(
        zarr_array=ds_obs,
        chunking_approach=chunking_approach,
        output_path=rechunked_path,
    )

    return ds_obs_rechunked
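
A minimal usage sketch for get_obs defined above; the period, variable name, and chunking approach are illustrative (only 'ERA5' is supported as the obs source):

# Hypothetical call: load ERA5 for a one-year training period and rechunk so that each
# chunk holds a full time series; with cache_within_rechunk left at True, the rechunked
# output is cached at the path built by make_rechunked_obs_path.
ds_obs = get_obs(
    obs='ERA5',
    train_period=slice('1990', '1990'),
    variables=['tasmax'],
    chunking_approach='full_time',
)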
Example 2
def return_gcm_train_full_time(coarse_obs_full_time_ds: xr.Dataset, gcm: str,
                               scenario: str, variable: str,
                               train_period: slice, predict_period: slice,
                               bbox: BBox, **kwargs) -> xr.Dataset:
    """Returns GCM training rechunked dataset in full time.

    Parameters
    ----------
    coarse_obs_full_time_ds : xr.Dataset
        Output coarse observation dataset rechunked in full_time
    gcm : str
        Input GCM
    scenario: str
        Input GCM scenario
    variable: str
        The variable included in the dataset.
    train_period: slice
        Start and end year slice of training/historical period. Ex: slice('1990','1990')
    predict_period: slice
        Start and end year slice of predict period. Ex: slice('2020','2020')
    bbox : BBox
        Bounding box including latmin,latmax,lonmin,lonmax.
    **kwargs : dict, optional

    Returns
    -------
    xr.Dataset
        x_train rechunked dataset in full time.
    """
    gcm_train_ds = load_cmip(source_ids=gcm,
                             variable_ids=[variable],
                             return_type='xr')
    gcm_train_ds_180 = lon_to_180(gcm_train_ds)

    gcm_train_ds_subset = subset_dataset(
        gcm_train_ds_180,
        variable,
        train_period,
        bbox,
        chunking_schema={
            'time': 365,
            'lat': 150,
            'lon': 150
        },
    )

    # Force the CMIP timestamps to match the friendlier ERA5 timestamps (the two datasets use
    # different time encodings). This could introduce a subtle bug if the GCM timestamps did not
    # actually align, so a safer alternative would be an explicit datetime equivalence test.
    gcm_train_ds_subset['time'] = coarse_obs_full_time_ds.time.values

    gcm_train_subset_full_time_ds = rechunk_zarr_array_with_caching(
        gcm_train_ds_subset,
        chunking_approach='full_time',
        max_mem='1GB',
    )
    return gcm_train_subset_full_time_ds
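
The in-line comment above suggests replacing the blanket timestamp overwrite with a datetime equivalence test. A minimal sketch of such a check, assuming both time axes cover the same dates at daily resolution; the helper name assign_obs_time_if_aligned is hypothetical:

import xarray as xr


def assign_obs_time_if_aligned(gcm_ds: xr.Dataset, obs_ds: xr.Dataset) -> xr.Dataset:
    # Only adopt the obs timestamps when the two time axes have the same length and
    # start/end on the same calendar dates; otherwise raise instead of silently mis-aligning.
    if gcm_ds.sizes['time'] != obs_ds.sizes['time']:
        raise ValueError('GCM and obs time axes have different lengths')
    for position in (0, -1):
        gcm_stamp = gcm_ds.time.values[position]
        obs_stamp = obs_ds.time.values[position]
        # str() of numpy datetime64 or cftime objects starts with 'YYYY-MM-DD'
        if str(gcm_stamp)[:10] != str(obs_stamp)[:10]:
            raise ValueError(f'GCM and obs timestamps differ: {gcm_stamp} vs {obs_stamp}')
    return gcm_ds.assign_coords(time=obs_ds.time.values)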
Example 3
def maca_coarse_bias_correction_task(
    ds_gcm: xr.Dataset,
    ds_obs: xr.Dataset,
    train_period_start: str,
    train_period_end: str,
    variables: Union[str, List[str]],
    chunking_approach: str,
    batch_size: Optional[int] = 15,
    buffer_size: Optional[int] = 15,
    **kwargs,
):
    """
    Task that implements the coarse scale bias correction in MACA. The historical GCM is mapped to the
    coarsened historical observations in the bias correction. The GCM data is rechunked to match the
    observation data because the bias correction model in skdownscale requires both datasets to have
    the same chunks/blocks.

    Parameters
    ----------
    ds_gcm: xr.Dataset
        GCM dataset
    ds_obs: xr.Dataset
        Observation dataset
    train_period_start: str
        Start year of training/historical period
    train_period_end: str
        End year of training/historical period
    variables: List[str]
        Names of the variables used in obs and gcm dataset (including features and label)
    chunking_approach: str
        'full_space', 'full_time', 'matched' or None
    batch_size: Optional[int]
        The batch size in terms of day of year to bias correct together
    buffer_size: Optional[int]
        The buffer size in terms of day of year to include in the bias correction

    Returns
    -------
    bias_corrected: xr.Dataset
        Bias corrected GCM dataset
    """
    # TODO: test if this is needed if both ds_gcm and ds_obs are chunked in full time
    ds_gcm_rechunked = rechunk_zarr_array_with_caching(
        zarr_array=ds_gcm, template_chunk_array=ds_obs)

    historical_period = slice(train_period_start, train_period_end)
    bias_corrected = maca_bias_correction(
        ds_gcm=ds_gcm_rechunked,
        ds_obs=ds_obs,
        historical_period=historical_period,
        variables=variables,
        batch_size=batch_size,
        buffer_size=buffer_size,
    )

    return bias_corrected
Example 4
def prep_gard_input_task(
    obs: str,
    train_period: slice,
    predict_period: slice,
    variables: List[str],
    X_train: xr.Dataset,
    X_pred: xr.Dataset,
    gcm_identifier: str,
    bias_correction_method: str,
    bbox,
):
    """
    Prepare GARD inputs: rechunk the observation data (y_train) and the bias corrected
    GCM prediction data (X_pred) to match the chunking scheme of X_train.
    """
    # get observation data, then rechunk it to match the chunking scheme of X_train
    ds_obs = return_obs(obs=obs,
                        train_period=train_period,
                        variable=variables,
                        bbox=bbox)

    rechunked_obs_path = make_rechunked_obs_path(
        obs=obs,
        train_period=train_period,
        variable=variables,
        bbox=bbox,
        chunking_approach='matched',
    )
    y_train_rechunked = rechunk_zarr_array_with_caching(
        zarr_array=ds_obs,
        template_chunk_array=X_train,
        output_path=rechunked_obs_path)

    rechunked_gcm_path = make_bias_corrected_gcm_path(
        gcm_identifier=gcm_identifier,
        method=bias_correction_method,
        chunking_approach='matched')
    X_pred_rechunked = rechunk_zarr_array_with_caching(
        zarr_array=X_pred.sel(time=predict_period),
        template_chunk_array=X_train,
        output_path=rechunked_gcm_path,
    )

    return X_train, y_train_rechunked, X_pred_rechunked
Example 5
def rechunker_task(
    zarr_array: xr.Dataset,
    chunking_approach: Optional[str] = None,
    template_chunk_array: Optional[xr.Dataset] = None,
    naming_func: Optional[Callable] = None,
    **kwargs,
):
    """
    Task to rechunk a dataset

    Parameters
    ----------
    zarr_array : zarr or xarray dataset
        Dataset you want to rechunk.
    chunking_approach : str, optional
        Has to be one of `full_space` or `full_time`. If `full_space`, the data will be rechunked such that the space dimensions are contiguous (i.e. each chunk
        will contain full maps). If `full_time`, the data will be rechunked such that the time dimension is contiguous (i.e. each chunk will contain full time
        series). Either the chunking approach or the template chunk array must be provided.
    template_chunk_array: zarr or xarray dataset, optional
        A template dataset with the desired chunksizes. Either the chunking approach or the template chunk array must be provided.
    naming_func: callable, optional
        A function that returns a string representing the output caching location that the rechunk task should save to.
        The input arguments of this naming function must be provided as kwargs to this task.

    Returns
    -------
    rechunked_ds : xr.Dataset
        Rechunked dataset
    """
    if naming_func is not None:
        output_path = naming_func(chunking_approach=chunking_approach,
                                  **kwargs)
    else:
        output_path = None

    rechunked = rechunk_zarr_array_with_caching(
        zarr_array=zarr_array,
        chunking_approach=chunking_approach,
        template_chunk_array=template_chunk_array,
        output_path=output_path,
    )

    return rechunked
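
Two usage sketches for rechunker_task: one derives a cache path from a naming function (make_rechunked_obs_path, used the same way in get_obs above, with its arguments passed through as kwargs), the other matches a template's chunks without caching. All keyword values are illustrative:

# Rechunk obs into full_time chunks and cache at the path built by the naming function;
# the extra kwargs (obs, train_period, variables) are forwarded to make_rechunked_obs_path.
ds_full_time = rechunker_task(
    zarr_array=ds_obs,
    chunking_approach='full_time',
    naming_func=make_rechunked_obs_path,
    obs='ERA5',
    train_period=slice('1990', '1990'),
    variables=['tasmax'],
)

# Rechunk another dataset to match an existing template's chunks, without caching.
ds_matched = rechunker_task(zarr_array=ds_gcm, template_chunk_array=ds_obs)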
Example 6
def return_coarse_obs_full_time(coarse_obs_ds: xr.Dataset, gcm: str,
                                scenario: str, variable: str,
                                train_period: slice, predict_period: slice,
                                bbox: BBox, **kwargs) -> xr.Dataset:
    """

    Return coarse observation dataset that has been chunked in time.

    Parameters
    ----------
    coarse_obs_ds : xr.Dataset
        Input coarse observation dataset
    gcm : str
        Input GCM
    scenario: str
        Input GCM scenario
    variable: str
        The variable included in the dataset.
    train_period: slice
        Start and end year slice of training/historical period. Ex: slice('1990','1990')
    predict_period: slice
        Start and end year slice of predict period. Ex: slice('2020','2020')
    bbox : BBox
        Bounding box including latmin,latmax,lonmin,lonmax.
    **kwargs : dict, optional

    Returns
    -------
    xr.Dataset
        coarse_obs_full_time_ds rechunked dataset
    """
    coarse_obs_full_time_ds = rechunk_zarr_array_with_caching(
        coarse_obs_ds,
        chunking_approach='full_time',
        max_mem='1GB',
    )
    return coarse_obs_full_time_ds
Example 7
def interpolate_gcm_task(
    obs: str,
    gcm: str,
    scenario: str,
    train_period: slice,
    predict_period: slice,
    variables: Union[str, List[str]],
    chunking_approach: str,
    bbox,
):
    """
    Interpolate the GCM dataset to the grid of the observation dataset.
    Rechunk the final output according to chunking approach.
    Parameters
    ----------
    obs: str
        Name of obs dataset
    gcm: str
        Name of the GCM model
    scenario: str
        Name of the emission scenario
    train_period: slice
        Training/historical period bounds
    predict_period: slice
        Prediction (historical and/or future) period bounds
    variables: List[str]
        List of variables to get in obs dataset
    chunking_approach: str
        'full_space', 'full_time', or None
    bbox: BBox
        Bounding box including latmin, latmax, lonmin, lonmax

    Returns
    -------
    ds_gcm_interpolated_rechunked: xr.Dataset
        The GCM dataset that has been interpolated to the obs grid then rechunked.
    """
    # get gcm in full space chunks
    ds_gcm_full_space = get_gcm(
        gcm=gcm,
        scenario=scenario,
        variables=variables,
        train_period=train_period,
        predict_period=predict_period,
        chunking_approach="full_space",
        cache_within_rechunk=False,
        bbox=bbox,
    )

    # get obs as a template
    ds_obs = return_obs(obs=obs,
                        train_period=train_period,
                        variable=variables,
                        bbox=bbox)

    # interpolate gcm to obs resolution
    ds_gcm_interpolated = regrid_ds(
        ds=ds_gcm_full_space,
        target_grid_ds=ds_obs.isel(time=0).load(),
        chunking_approach="full_space",
    )

    # rechunked to final output chunking approach if needed
    ds_gcm_interpolated_rechunked = rechunk_zarr_array_with_caching(
        zarr_array=ds_gcm_interpolated,
        output_path=None,
        chunking_approach=chunking_approach,
    )

    return ds_gcm_interpolated_rechunked
Example 8
def coarsen_and_interpolate_obs_task(obs, train_period, predict_period,
                                     variables, gcm, scenario,
                                     chunking_approach, bbox, **kwargs):
    """
    Coarsen the observation dataset to the grid of the GCM specified in the inputs, then
    interpolate it back onto the original observation grid. Rechunk the final output according to the chunking approach.
    Parameters
    ----------
    obs: str
        Name of obs dataset
    gcm: str
        Name of GCM model
    scenario: str
        Name of the emission scenario
    train_period: slice
        Training/historical period bounds
    predict_period: slice
        Prediction (historical and/or future) period bounds
    variables: List[str]
        List of variables to get in obs dataset
    chunking_approach: str
        'full_space', 'full_time', or None
    bbox: BBox
        Bounding box including latmin, latmax, lonmin, lonmax
    **kwargs: Dict
        Other arguments to be used in generating the target path
    Returns
    -------
    ds_obs_interpolated_rechunked: xr.Dataset
        An observation dataset that has been coarsened, interpolated back to original grid, and then rechunked.
    """
    # get obs
    ds_obs = return_obs(obs=obs,
                        train_period=train_period,
                        variable=variables,
                        bbox=bbox)

    # regrid to coarse scale
    ds_obs_coarse = get_coarse_obs(
        obs_ds=ds_obs,
        gcm=gcm,
        scenario=scenario,
        variable=variables,
        train_period=train_period,
        predict_period=predict_period,
        bbox=bbox,
        **kwargs,
    )

    # interpolate to fine scale again
    ds_obs_interpolated = regrid_ds(
        ds=ds_obs_coarse,
        target_grid_ds=ds_obs.isel(time=0).chunk({
            'lat': -1,
            'lon': -1
        }),
        chunking_approach="full_space",
    )

    # rechunked to final output chunking approach if needed
    ds_obs_interpolated_rechunked = rechunk_zarr_array_with_caching(
        zarr_array=ds_obs_interpolated,
        output_path=None,
        chunking_approach=chunking_approach,
    )

    return ds_obs_interpolated_rechunked
Example 9
def fit_and_predict(gcm_train_subset_full_time_ds: xr.Dataset,
                    coarse_obs_full_time_ds: xr.Dataset,
                    gcm_predict_rechunked_ds: xr.Dataset,
                    gcm: str,
                    scenario: str,
                    variable: str,
                    train_period: slice,
                    predict_period: slice,
                    bbox: BBox,
                    dim: str = "time",
                    **kwargs) -> xr.Dataset:
    """Fit bcsd model on prepared CMIP data with obs at corresponding spatial scale.
    Then predict for a set of CMIP data (likely future).

    Parameters
    ----------
    gcm_train_subset_full_time_ds : xr.Dataset
        GCM training dataset chunked along space
    coarse_obs_full_time_ds : xr.Dataset
        Obs training dataset chunked along space
    gcm_predict_rechunked_ds : xr.Dataset
        GCM prediction dataset chunked along space.
    gcm : str
        Input GCM
    scenario: str
        Input GCM scenario
    variable: str
        The variable included in the dataset.
    train_period: slice
        Start and end year slice of training/historical period. Ex: slice('1990','1990')
    predict_period: slice
        Start and end year slice of predict period. Ex: slice('2020','2020')
    bbox : BBox
        Bounding box including latmin,latmax,lonmin,lonmax.
    dim : str, optional
        dimension on which you want to do the modelling, by default "time"
    **kwargs : dict, optional

    Returns
    -------
    bias_corrected_ds : xr.Dataset
        Bias-corrected dataset
    """
    if variable in ABSOLUTE_VARS:
        bcsd_model = BcsdTemperature(return_anoms=False)
    elif variable in RELATIVE_VARS:
        bcsd_model = BcsdPrecipitation(return_anoms=False)
    else:
        raise ValueError(
            f'{variable} is not in ABSOLUTE_VARS or RELATIVE_VARS')

    pointwise_model = PointWiseDownscaler(model=bcsd_model, dim=dim)

    coarse_obs_rechunked_validated_ds = rechunk_zarr_array_with_caching(
        coarse_obs_full_time_ds,
        template_chunk_array=gcm_train_subset_full_time_ds)
    pointwise_model.fit(gcm_train_subset_full_time_ds[variable],
                        coarse_obs_rechunked_validated_ds[variable])
    bias_corrected_da = pointwise_model.predict(
        gcm_predict_rechunked_ds[variable])

    bias_corrected_ds = bias_corrected_da.astype('float32').to_dataset(
        name=variable)

    return bias_corrected_ds
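
A sketch of how fit_and_predict might be wired to the earlier tasks: the training GCM comes from return_gcm_train_full_time (Example 2), the coarse obs from return_coarse_obs_full_time (Example 6), and gcm_predict_rechunked_ds from an analogous prediction-period step. The GCM name, scenario, years, and variable are illustrative:

# Hypothetical wiring of the BCSD fit/predict step; all inputs are assumed to be
# rechunked in full_time so each point-wise model sees a complete time series.
bias_corrected_ds = fit_and_predict(
    gcm_train_subset_full_time_ds=gcm_train_subset_full_time_ds,
    coarse_obs_full_time_ds=coarse_obs_full_time_ds,
    gcm_predict_rechunked_ds=gcm_predict_rechunked_ds,
    gcm='MIROC6',
    scenario='ssp370',
    variable='tasmax',
    train_period=slice('1990', '1990'),
    predict_period=slice('2020', '2020'),
    bbox=bbox,
)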
Example 10
def get_gcm(
    gcm: str,
    scenario: str,
    variables: Union[str, List[str]],
    train_period: slice,
    predict_period: slice,
    bbox,
    chunking_approach: Optional[str] = None,
    cache_within_rechunk: Optional[bool] = True,
) -> xr.Dataset:
    """
    Load and combine historical and future GCM into one dataset.
    Parameters
    ----------
    gcm: str
        Name of GCM
    scenario: str
        Name of scenario
    variables: str or list
        Name of variable(s) to load
    train_period: slice
        Start and end year slice of train/historical period. Ex: slice('1990','1990')
    predict_period: slice
        Start and end year slice of predict/future period. Ex: slice('2020','2020')
    bbox: BBox
        Bounding box including latmin, latmax, lonmin, lonmax
    chunking_approach: Optional[str]
        'full_space', 'full_time', or None
    cache_within_rechunk: Optional[bool]
        Whether to cache the rechunked dataset at a path generated from the input arguments
    Returns
    -------
    ds_gcm: xr.Dataset
        A dataset containing both historical and future period of GCM data
    """
    historical_gcm = load_cmip(
        activity_ids='CMIP',
        experiment_ids='historical',
        source_ids=gcm,
        variable_ids=variables,
        return_type='xr',
    )

    future_gcm = load_cmip(
        activity_ids='ScenarioMIP',
        experiment_ids=scenario,
        source_ids=gcm,
        variable_ids=variables,
        return_type='xr',
    )

    ds_gcm = xr.combine_by_coords([historical_gcm, future_gcm], combine_attrs='drop_conflicts')

    ds_gcm_train = subset_dataset(
        ds=ds_gcm,
        variable=variables[0],
        time_period=train_period,
        bbox=bbox,
    )
    ds_gcm_predict = subset_dataset(
        ds=ds_gcm,
        variable=variables[0],
        time_period=predict_period,
        bbox=bbox,
    )

    ds_gcm = xr.combine_by_coords([ds_gcm_train, ds_gcm_predict], combine_attrs='drop_conflicts')
    ds_gcm = ds_gcm.reindex(time=sorted(ds_gcm.time.values))

    if chunking_approach is None:
        return ds_gcm

    if cache_within_rechunk:
        path_dict = {
            'gcm': gcm,
            'scenario': scenario,
            'train_period': train_period,
            'predict_period': predict_period,
            'variables': variables,
        }
        rechunked_path = make_rechunked_gcm_path(chunking_approach=chunking_approach, **path_dict)
    else:
        rechunked_path = None
    ds_gcm_rechunked = rechunk_zarr_array_with_caching(
        zarr_array=ds_gcm,
        chunking_approach=chunking_approach,
        output_path=rechunked_path,
    )

    return ds_gcm_rechunked
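
A minimal usage sketch for get_gcm; the GCM name, scenario, years, and variable are illustrative:

# Hypothetical call: combine the historical and ssp370 runs of one GCM, subset the training
# and prediction periods over the bounding box, and rechunk the merged dataset to full_time.
ds_gcm = get_gcm(
    gcm='MIROC6',
    scenario='ssp370',
    variables=['tasmax'],
    train_period=slice('1990', '1990'),
    predict_period=slice('2020', '2020'),
    bbox=bbox,
    chunking_approach='full_time',
)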