Example #1
def make_rechunker_stores(
    output_path: Optional[str] = None,
) -> Tuple[fsspec.FSMap, fsspec.FSMap, str]:
    """Initialize two stores for rechunker to use as temporary and final rechunked locations
    Parameters
    ----------
    output_path : str, optional
        Output path for rechunker stores
    Returns
    -------
    temp_store, target_store, output_path : tuple[fsspec.mapping.FSMap, fsspec.mapping.FSMap, str]
        Stores where rechunker will write and the path to the target store
    """
    storage_options = config.get('storage.temporary.storage_options')
    path_tmp = config.get('storage.temporary.uri') + "/{}.zarr".format(
        temp_file_name())

    temp_store = fsspec.get_mapper(path_tmp, **storage_options)

    if output_path is None:
        output_path = config.get('storage.temporary.uri') + "/{}.zarr".format(
            temp_file_name())

    target_store = fsspec.get_mapper(output_path, **storage_options)
    return temp_store, target_store, output_path
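A minimal usage sketch (not from the repo): it assumes `config` has been populated with 'storage.temporary.uri' and 'storage.temporary.storage_options', and builds a small synthetic dataset just for illustration. The returned stores can also be handed directly to rechunker, as in Example #9.

import numpy as np
import xarray as xr

# Hypothetical usage: write a tiny synthetic dataset into the target store.
ds = xr.Dataset(
    {'tasmax': (('time', 'lat', 'lon'), np.zeros((10, 4, 4)))},
    coords={'time': np.arange(10), 'lat': np.arange(4), 'lon': np.arange(4)},
)
temp_store, target_store, target_path = make_rechunker_stores()
print(target_path)  # e.g. "<storage.temporary.uri>/<random-name>.zarr"
ds.to_zarr(target_store, mode='w', consolidated=True)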
Example #2
def test_config():
    # check that required config keys are always there
    assert config.get('storage.intermediate.uri')
    assert config.get('storage.results.uri')
    assert config.get('storage.temporary.uri')
    assert config.get('runtime.cloud.storage_options')
    assert config.get('runtime.local.storage_options')
    assert config.get('runtime.test.storage_options')
    assert config.get('runtime.pangeo.storage_options')
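For reference, a hedged sketch of how such keys might be overridden in a test. It assumes `config` behaves like a dask/donfig-style configuration object whose `set` accepts dotted keys and works as a context manager; the values are placeholders.

# Hypothetical test fixture: temporarily override config keys with dummy values.
def test_config_override():
    with config.set({
        'storage.temporary.uri': 'memory://tmp',
        'storage.temporary.storage_options': {},
    }):
        assert config.get('storage.temporary.uri') == 'memory://tmp'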
Example #3
def build_gcm_identifier(
    gcm: str,
    scenario: str,
    variable: str,
    train_period: slice,
    predict_period: slice,
    bbox: BBox,
    **kwargs,
) -> str:
    """
    Build the common identifier for GCM related data

    Parameters
    ----------
    gcm : str
        Name of the GCM model
    scenario : str
        Name of the future emission scenario to load
    variable : str or list of str
        Name of the variable(s) used in obs and gcm datasets (including features and label)
    train_period : slice
        Start and end year slice of training/historical period. Ex: slice('1990', '1990')
    predict_period : slice
        Start and end year slice of prediction period. Ex: slice('2020', '2040')
    bbox : BBox
        Bounding box including latmin, latmax, lonmin, lonmax.

    Returns
    -------
    identifier : str
        string to be used in gcm related paths as specified by the params
    """
    if isinstance(variable, str):
        variable = [variable]
    var_string = '_'.join(sorted(variable))

    gcm_identifier = config.get('storage.gcm_identifier_template').format(
        gcm=gcm,
        scenario=scenario,
        variable=var_string,
        train_period=f'{train_period.start}_{train_period.stop}',
        predict_period=f'{predict_period.start}_{predict_period.stop}',
        bbox=bbox,
    )
    return gcm_identifier
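A hedged usage sketch (not from the repo): the BBox constructor signature and the 'storage.gcm_identifier_template' value shown in the comment are assumptions based on the docstring above, and the config key must already be set for the call to work.

# Hypothetical call; BBox keyword names are assumed from the docstring.
bbox = BBox(latmin=30, latmax=50, lonmin=-120, lonmax=-100)
identifier = build_gcm_identifier(
    gcm='MIROC6',
    scenario='ssp370',
    variable=['tasmax', 'pr'],
    train_period=slice('1981', '2010'),
    predict_period=slice('2020', '2040'),
    bbox=bbox,
)
# With an assumed template like '{gcm}/{scenario}/{variable}/{train_period}/{predict_period}/{bbox}'
# this would yield something like 'MIROC6/ssp370/pr_tasmax/1981_2010/2020_2040/<bbox repr>'.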
Example #4
def build_obs_identifier(
    obs: str,
    variable: str,
    train_period: slice,
    bbox: BBox,
    **kwargs,
) -> str:
    """
    Build the common identifier for observation related data. The same pattern is used for: 1) chunked raw obs, 2) coarsened obs, and 3) coarsened then interpolated obs

    Parameters
    ----------
    obs : str
        Name of obs dataset
    variable : str or list of str
        Name of the variable(s) used in obs and gcm datasets (including features and label)
    train_period : slice
        Start and end year slice of training/historical period. Ex: slice('1990', '1990')
    bbox : BBox
        Dataclass containing latmin, latmax, lonmin, lonmax. The class can be found in utils.


    Returns
    -------
    identifier : str
        string to be used in obs related paths as specified by the params
    """
    if isinstance(variable, str):
        variable = [variable]
    var_string = '_'.join(sorted(variable))

    obs_identifier = config.get('storage.obs_identifier_template').format(
        obs=obs,
        train_period=f'{train_period.start}_{train_period.stop}',
        bbox=bbox,
        variable=var_string,
    )

    return obs_identifier
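A brief hedged sketch, reusing the assumed BBox from the earlier build_gcm_identifier example; note that a single variable string is accepted and normalized to a list, and that 'storage.obs_identifier_template' must be set in config.

# Hypothetical call; bbox is the assumed BBox instance from the previous sketch.
obs_id = build_obs_identifier(
    obs='ERA5',
    variable='tasmax',                 # a str is wrapped into ['tasmax'] internally
    train_period=slice('1981', '2010'),
    bbox=bbox,
)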
    pyramid_path_annual = make_annual_pyramid_path(gcm_identifier)

    return (
        gcm_grid_spec,
        obs_identifier,
        gcm_identifier,
        pyramid_path_daily,
        pyramid_path_monthly,
        pyramid_path_annual,
    )


@task(
    checkpoint=True,
    result=XpersistResult(
        CacheStore(config.get('storage.intermediate.uri')),
        serializer='xarray.zarr',
    ),
    target=make_interpolated_obs_path,
)
def coarsen_and_interpolate_obs_task(obs, train_period, predict_period,
                                     variables, gcm, scenario,
                                     chunking_approach, bbox, **kwargs):
    """
    Coarsen the observation dataset to the grid of the GCM model specified in the inputs, then
    interpolate back onto the observation grid. Rechunk the final output according to the chunking approach.
    Parameters
    ----------
    obs: str
        Name of obs dataset
    gcm: str
from cmip6_downscaling.workflows.paths import (
    make_annual_summary_path,
    make_bcsd_output_path,
    make_bias_corrected_path,
    make_coarse_obs_path,
    make_gcm_predict_path,
    make_monthly_summary_path,
    make_rechunked_gcm_path,
    make_return_obs_path,
    make_spatial_anomalies_path,
)

runtime = runtimes.get_runtime()

intermediate_cache_store = CacheStore(
    config.get("storage.intermediate.uri"),
    storage_options=config.get("storage.intermediate.storage_options"),
)
results_cache_store = CacheStore(
    config.get("storage.results.uri"),
    storage_options=config.get("storage.results.storage_options"),
)

# Transform Functions into Tasks -----------------------------------------------------------

return_obs_task = task(
    return_obs,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_return_obs_path,
)
get_coarse_obs_task = task(
Example #7
def load_cmip(
    activity_ids: str = "CMIP",
    experiment_ids: str = "historical",
    member_ids: str = "r1i1p1f1",
    source_ids: str = "MIROC6",
    table_ids: str = "day",
    grid_labels: str = "gn",
    variable_ids: List[str] = ["tasmax"],
    return_type: str = 'zarr',
) -> xr.Dataset:
    """Loads CMIP6 GCM dataset based on input criteria.
    Parameters
    ----------
    activity_ids : str or list, optional
        activity_id in the CMIP6 catalog, by default "CMIP"; e.g. "ScenarioMIP"
    experiment_ids : str or list, optional
        experiment_id in the CMIP6 catalog, by default "historical"; e.g. "ssp126", "ssp245", "ssp370", "ssp585"
    member_ids : str or list, optional
        member_id in the CMIP6 catalog, by default "r1i1p1f1"
    source_ids : str or list, optional
        source_id in the CMIP6 catalog, by default "MIROC6"
    table_ids : str or list, optional
        table_id in the CMIP6 catalog, by default "day"
    grid_labels : str or list, optional
        grid_label in the CMIP6 catalog, by default "gn"
    variable_ids : list, optional
        variable_id(s) in the CMIP6 catalog, by default ["tasmax"]
    return_type : str, optional
        Whether to return a consolidated zarr group ('zarr') or an xr.Dataset ('xr'), by default 'zarr'
    Returns
    -------
    ds : xr.Dataset or zarr group
        Dataset or zarr group with CMIP data
    """

    if isinstance(variable_ids, str):
        variable_ids = [variable_ids]

    col = cat.cmip6()

    for i, var in enumerate(variable_ids):
        stores = (
            col.search(
                activity_id=activity_ids,
                experiment_id=experiment_ids,
                member_id=member_ids,
                source_id=source_ids,
                table_id=table_ids,
                grid_label=grid_labels,
                variable_id=[var],
            )
            .df['zstore']
            .to_list()
        )

        storage_options = config.get('data_catalog.era5.storage_options')
        if len(stores) > 1:
            raise ValueError('can only get 1 store at a time')
        if return_type == 'zarr':
            ds = zarr.open_consolidated(stores[0], mode='r', storage_options=storage_options)
        elif return_type == 'xr':
            ds = xr.open_zarr(stores[0], consolidated=True, storage_options=storage_options)

        # flip the lats if necessary and drop the extra dims/vars like bnds
        ds = gcm_munge(ds)
        ds = lon_to_180(ds)

        # convert precipitation from kg m-2 s-1 to mm/day - helpful to prevent rounding errors from very tiny numbers
        if var == 'pr':
            ds['pr'] *= 86400

        if i == 0:
            ds_out = ds
        else:
            ds_out[var] = ds[var]

    return ds_out
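A hedged usage sketch of load_cmip (not from the repo): the variable name is a real CMIP6 variable_id, but whether this exact combination exists in the catalog is not guaranteed, and the printed dims comment is illustrative.

# Hypothetical call: open historical MIROC6 daily tasmax as an xarray Dataset.
ds = load_cmip(
    activity_ids='CMIP',
    experiment_ids='historical',
    member_ids='r1i1p1f1',
    source_ids='MIROC6',
    table_ids='day',
    grid_labels='gn',
    variable_ids=['tasmax'],
    return_type='xr',
)
print(ds['tasmax'].dims)  # e.g. ('time', 'lat', 'lon') after gcm_munge/lon_to_180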
    coarsen_and_interpolate_obs_task,
    interpolate_gcm_task,
    path_builder_task,
)
from cmip6_downscaling.workflows.paths import (
    make_bias_corrected_gcm_path,
    make_gard_post_processed_output_path,
    make_gard_predict_output_path,
    make_rechunked_obs_path,
)
from cmip6_downscaling.workflows.utils import rechunk_zarr_array_with_caching

runtime = get_runtime()

intermediate_cache_store = CacheStore(
    config.get('storage.intermediate.uri'),
    storage_options=config.get('storage.intermediate.storage_options'),
)
results_cache_store = CacheStore(
    config.get('storage.results.uri'),
    storage_options=config.get('storage.results.storage_options'),
)

fit_and_predict_task = task(
    gard_fit_and_predict,
    checkpoint=True,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_gard_predict_output_path,
)

read_scrf_task = task(read_scrf)
Example #9
def rechunk_zarr_array_with_caching(
    zarr_array: xr.Dataset,
    chunking_approach: Optional[str] = None,
    template_chunk_array: Optional[xr.Dataset] = None,
    output_path: Optional[str] = None,
    max_mem: str = "200MB",
    overwrite: bool = False,
) -> xr.Dataset:
    """Use `rechunker` package to adjust chunks of dataset to a form
    conducive for your processing.
    Parameters
    ----------
    zarr_array : zarr or xarray dataset
        Dataset you want to rechunk.
    chunking_approach : str, optional
        Has to be one of `full_space` or `full_time`. If `full_space`, the data will be rechunked such that the space dimensions are contiguous (i.e. each chunk
        will contain full maps). If `full_time`, the data will be rechunked such that the time dimension is contiguous (i.e. each chunk will contain full time
        series). Ignored if `template_chunk_array` is provided.
    template_chunk_array : xr.Dataset, optional
        Dataset whose chunk sizes are used as the template for rechunking. If provided, `chunking_approach` is ignored.
    output_path : str, optional
        Path where the output data is saved. If the output path is not empty, its content is loaded and the schema checked. If the schema check passes,
        the content is returned without rechunking again (i.e. caching); otherwise the content can be overwritten (see the overwrite option).
    max_mem : str
        The max memory you want to allow for a chunk. You probably want it to be around 100 MB, but that
        is also controlled by the `calc_auspicious_chunk_sizes` calls.
    overwrite : bool
        Whether to overwrite the content saved at output_path if it does not pass the schema check.
    Returns
    -------
    rechunked_ds : xr.Dataset
        Rechunked dataset
    """
    # determine the chunking schema
    if template_chunk_array is None:
        if chunking_approach == 'full_space':
            # if we need full maps, chunk along the time dimension
            chunk_dims = ('time',)
        elif chunking_approach == 'full_time':
            # if we need full time series, chunk along the lat/lon dimensions
            chunk_dims = ('lat', 'lon')
        else:
            raise NotImplementedError("chunking_approach must be in ['full_space', 'full_time']")
        example_var = list(zarr_array.data_vars)[0]
        chunk_def = calc_auspicious_chunks_dict(zarr_array[example_var],
                                                chunk_dims=chunk_dims)
    else:
        example_var = list(zarr_array.data_vars)[0]
        chunk_def = {
            'time': min(template_chunk_array.chunks['time'][0], len(zarr_array.time)),
            'lat': min(template_chunk_array.chunks['lat'][0], len(zarr_array.lat)),
            'lon': min(template_chunk_array.chunks['lon'][0], len(zarr_array.lon)),
        }
    chunks_dict = {
        'time': None,  # None means don't rechunk these coordinate arrays
        'lon': None,
        'lat': None,
    }
    for var in zarr_array.data_vars:
        chunks_dict[var] = chunk_def

    # make the schema for what you want the rechunking routine to produce
    # so that you can check whether what you passed in (zarr_array) already looks like that
    # if it does, you'll skip the rechunking!
    schema_dict = {}
    for var in zarr_array.data_vars:
        schema_dict[var] = DataArraySchema(chunks=chunk_def)
    target_schema = DatasetSchema(schema_dict)

    # make storage patterns
    if output_path is not None:
        output_path = config.get(
            'storage.intermediate.uri') + '/' + output_path
    temp_store, target_store, target_path = make_rechunker_stores(output_path)
    print(f'target path is {target_path}')

    # check and see if the output is empty, if there is content, check that it's chunked correctly
    if len(target_store) > 0:
        print('checking the cache')
        output = xr.open_zarr(target_store)
        try:
            # if the content in target path is correctly chunked, return
            target_schema.validate(output)
            return output

        except SchemaError:
            if overwrite:
                target_store.clear()
            else:
                raise NotImplementedError(
                    'The content in the output path is incorrectly chunked, but overwrite is disabled. '
                    'Either clear the output or enable overwriting by setting overwrite=True'
                )

    # process the input zarr array
    delete_chunks_encoding(zarr_array)
    try:
        print('checking the chunk')
        # now check if the input is already correctly chunked. If so, save to the output location and return
        target_schema.validate(zarr_array)
        zarr_array.to_zarr(target_store, mode='w', consolidated=True)
        return zarr_array

    except SchemaError:
        print('rechunking')
        try:
            rechunk_plan = rechunk(
                zarr_array,
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)
        except ValueError:
            print(
                'WARNING: Failed to write the zarr store, perhaps because of inconsistent chunk sizes; '
                'rechunking the input and retrying'
            )
            # clearing the store because the target store has already been created in the try statement above
            # and rechunker fails if there's already content at the target
            target_store.clear()
            zarr_array = zarr_array.chunk(chunks_dict[example_var])
            rechunk_plan = rechunk(
                zarr_array,
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)
        # ideally we would open with consolidated=True, but rechunker does not consolidate metadata right now;
        # a zarr.consolidate_metadata step could be added after the fact, but that is only worthwhile if this
        # rechunked_ds will be reopened multiple times
        rechunked_ds = xr.open_zarr(target_store)
        return rechunked_ds
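A hedged usage sketch of the two chunking approaches (not from the repo): `ds` is a placeholder xarray Dataset with time/lat/lon dimensions, and the output path is illustrative.

# Hypothetical calls: rechunk for full maps vs. full time series.
ds_space = rechunk_zarr_array_with_caching(ds, chunking_approach='full_space')
ds_time = rechunk_zarr_array_with_caching(
    ds,
    chunking_approach='full_time',
    output_path='era5_full_time.zarr',  # cached under storage.intermediate.uri
    overwrite=True,
)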