def make_rechunker_stores(
    output_path: Optional[str] = None,
) -> Tuple[fsspec.FSMap, fsspec.FSMap, str]:
    """Initialize two stores for rechunker to use as temporary and final rechunked locations

    Parameters
    ----------
    output_path : str, optional
        Output path for rechunker stores

    Returns
    -------
    temp_store, target_store, output_path : tuple[fsspec.FSMap, fsspec.FSMap, str]
        Stores where rechunker will write and the path to the target store
    """
    storage_options = config.get('storage.temporary.storage_options')
    path_tmp = config.get('storage.temporary.uri') + "/{}.zarr".format(temp_file_name())
    temp_store = fsspec.get_mapper(path_tmp, **storage_options)
    if output_path is None:
        output_path = config.get('storage.temporary.uri') + "/{}.zarr".format(temp_file_name())
    target_store = fsspec.get_mapper(output_path, **storage_options)
    return temp_store, target_store, output_path
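# Illustrative sketch (not from the original source): one way make_rechunker_stores might be
# called. The output path "example_output.zarr" is a hypothetical name; the temporary store
# always resolves from the configured 'storage.temporary.uri'. Wrapped in a helper so nothing
# runs at import time.
def _example_make_rechunker_stores():  # hypothetical helper, for illustration only
    temp_store, target_store, target_path = make_rechunker_stores(output_path="example_output.zarr")
    print(f'rechunker will write the final output to {target_path}')
    return temp_store, target_store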
def test_config():
    # check that required config keys are always there
    assert config.get('storage.intermediate.uri')
    assert config.get('storage.results.uri')
    assert config.get('storage.temporary.uri')
    assert config.get('runtime.cloud.storage_options')
    assert config.get('runtime.local.storage_options')
    assert config.get('runtime.test.storage_options')
    assert config.get('runtime.pangeo.storage_options')
def build_gcm_identifier(
    gcm: str,
    scenario: str,
    variable: str,
    train_period: slice,
    predict_period: slice,
    bbox: BBox,
    **kwargs,
) -> str:
    """
    Build the common identifier for GCM related data

    Parameters
    ----------
    gcm : str
        Name of the GCM model
    scenario : str
        Name of the future emission scenario to load
    variable : str
        Name of the variable used in obs and gcm dataset (including features and label)
    train_period : slice
        Start and end year slice of training/historical period. Ex: slice('1990', '1990')
    predict_period : slice
        Start and end year slice of prediction period. Ex: slice('2020', '2040')
    bbox : BBox
        Bounding box including latmin, latmax, lonmin, lonmax.

    Returns
    -------
    identifier : str
        String to be used in gcm related paths as specified by the params
    """
    if isinstance(variable, str):
        variable = [variable]
    var_string = '_'.join(sorted(variable))

    gcm_identifier = config.get('storage.gcm_identifier_template').format(
        gcm=gcm,
        scenario=scenario,
        variable=var_string,
        train_period=f'{train_period.start}_{train_period.stop}',
        predict_period=f'{predict_period.start}_{predict_period.stop}',
        bbox=bbox,
    )
    return gcm_identifier
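# Illustrative sketch (not from the original source): a hypothetical call to
# build_gcm_identifier. The model, scenario, periods, and bounding box values below are
# assumptions for demonstration; the resulting string is governed by the configured
# 'storage.gcm_identifier_template'.
def _example_build_gcm_identifier(bbox):  # hypothetical helper; bbox assumed to be a BBox instance
    return build_gcm_identifier(
        gcm='MIROC6',
        scenario='ssp370',
        variable=['tasmax', 'pr'],
        train_period=slice('1981', '2010'),
        predict_period=slice('2020', '2040'),
        bbox=bbox,
    )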
def build_obs_identifier(
    obs: str,
    variable: str,
    train_period: slice,
    bbox: BBox,
    **kwargs,
) -> str:
    """
    Build the common identifier for observation related data: the same pattern is used for
    1) chunked raw obs, 2) coarsened obs, and 3) coarsened then interpolated obs

    Parameters
    ----------
    obs : str
        Name of obs dataset
    variable : str
        Name of the variable used in obs and gcm dataset (including features and label)
    train_period : slice
        Start and end year slice of training/historical period. Ex: slice('1990', '1990')
    bbox : BBox
        Dataclass containing the latmin, latmax, lonmin, lonmax. Class can be found in utils.

    Returns
    -------
    identifier : str
        String to be used in obs related paths as specified by the params
    """
    if isinstance(variable, str):
        variable = [variable]
    var_string = '_'.join(sorted(variable))

    obs_identifier = config.get('storage.obs_identifier_template').format(
        obs=obs,
        train_period=f'{train_period.start}_{train_period.stop}',
        bbox=bbox,
        variable=var_string,
    )
    return obs_identifier
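# Illustrative sketch (not from the original source): a hypothetical call to
# build_obs_identifier. 'ERA5' and the training period are assumptions; the output format
# is governed by the configured 'storage.obs_identifier_template'.
def _example_build_obs_identifier(bbox):  # hypothetical helper; bbox assumed to be a BBox instance
    return build_obs_identifier(
        obs='ERA5',
        variable='tasmax',
        train_period=slice('1981', '2010'),
        bbox=bbox,
    )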
    pyramid_path_annual = make_annual_pyramid_path(gcm_identifier)

    return (
        gcm_grid_spec,
        obs_identifier,
        gcm_identifier,
        pyramid_path_daily,
        pyramid_path_monthly,
        pyramid_path_annual,
    )


@task(
    checkpoint=True,
    result=XpersistResult(
        CacheStore(config.get('storage.intermediate.uri')),
        serializer='xarray.zarr',
    ),
    target=make_interpolated_obs_path,
)
def coarsen_and_interpolate_obs_task(
    obs, train_period, predict_period, variables, gcm, scenario, chunking_approach, bbox, **kwargs
):
    """
    Coarsen the observation dataset to the grid of the GCM model specified in the inputs,
    then interpolate back to the observation grid. Rechunk the final output according to
    the chunking approach.

    Parameters
    ----------
    obs : str
        Name of obs dataset
    gcm : str
from cmip6_downscaling.workflows.paths import (
    make_annual_summary_path,
    make_bcsd_output_path,
    make_bias_corrected_path,
    make_coarse_obs_path,
    make_gcm_predict_path,
    make_monthly_summary_path,
    make_rechunked_gcm_path,
    make_return_obs_path,
    make_spatial_anomalies_path,
)

runtime = runtimes.get_runtime()

intermediate_cache_store = CacheStore(
    config.get("storage.intermediate.uri"),
    storage_options=config.get("storage.intermediate.storage_options"),
)
results_cache_store = CacheStore(
    config.get("storage.results.uri"),
    storage_options=config.get("storage.results.storage_options"),
)

# Transform Functions into Tasks -----------------------------------------------------------

return_obs_task = task(
    return_obs,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_return_obs_path,
)

get_coarse_obs_task = task(
def load_cmip(
    activity_ids: str = "CMIP",
    experiment_ids: str = "historical",
    member_ids: str = "r1i1p1f1",
    source_ids: str = "MIROC6",
    table_ids: str = "day",
    grid_labels: str = "gn",
    variable_ids: List[str] = ["tasmax"],
    return_type: str = 'zarr',
) -> xr.Dataset:
    """Loads CMIP6 GCM dataset based on input criteria.

    Parameters
    ----------
    activity_ids : str, optional
        activity_id in the CMIP6 catalog, by default "CMIP"; ex: "ScenarioMIP"
    experiment_ids : str, optional
        experiment_id in the CMIP6 catalog, by default "historical"; ex: "ssp126", "ssp245", "ssp370", "ssp585"
    member_ids : str, optional
        member_id in the CMIP6 catalog, by default "r1i1p1f1"
    source_ids : str, optional
        source_id in the CMIP6 catalog, by default "MIROC6"
    table_ids : str, optional
        table_id in the CMIP6 catalog, by default "day"
    grid_labels : str, optional
        grid_label in the CMIP6 catalog, by default "gn"
    variable_ids : list of str, optional
        variable_ids in the CMIP6 catalog, by default ['tasmax']
    return_type : str, optional
        Whether to return a zarr group ('zarr') or an xarray dataset ('xr'), by default 'zarr'

    Returns
    -------
    ds : xr.Dataset or zarr group
        Dataset or zarr group with CMIP data
    """
    if isinstance(variable_ids, str):
        variable_ids = [variable_ids]

    col = cat.cmip6()

    for i, var in enumerate(variable_ids):
        stores = (
            col.search(
                activity_id=activity_ids,
                experiment_id=experiment_ids,
                member_id=member_ids,
                source_id=source_ids,
                table_id=table_ids,
                grid_label=grid_labels,
                variable_id=[var],
            )
            .df['zstore']
            .to_list()
        )

        storage_options = config.get('data_catalog.era5.storage_options')
        if len(stores) > 1:
            raise ValueError('can only get 1 store at a time')

        if return_type == 'zarr':
            ds = zarr.open_consolidated(stores[0], mode='r', storage_options=storage_options)
        elif return_type == 'xr':
            ds = xr.open_zarr(stores[0], consolidated=True, storage_options=storage_options)

        # flip the lats if necessary and drop the extra dims/vars like bnds
        ds = gcm_munge(ds)
        ds = lon_to_180(ds)

        # convert to mm/day - helpful to prevent rounding errors from very tiny numbers
        if var == 'pr':
            ds['pr'] *= 86400

        if i == 0:
            ds_out = ds
        else:
            ds_out[var] = ds[var]

    return ds_out
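# Illustrative sketch (not from the original source): loading a single historical MIROC6
# variable as an xarray dataset. The argument values mirror the defaults in the signature
# above; return_type='xr' is chosen so downstream xarray operations work directly.
def _example_load_cmip():  # hypothetical helper, for illustration only
    ds = load_cmip(
        activity_ids='CMIP',
        experiment_ids='historical',
        member_ids='r1i1p1f1',
        source_ids='MIROC6',
        table_ids='day',
        grid_labels='gn',
        variable_ids=['tasmax'],
        return_type='xr',
    )
    return ds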
    coarsen_and_interpolate_obs_task,
    interpolate_gcm_task,
    path_builder_task,
)
from cmip6_downscaling.workflows.paths import (
    make_bias_corrected_gcm_path,
    make_gard_post_processed_output_path,
    make_gard_predict_output_path,
    make_rechunked_obs_path,
)
from cmip6_downscaling.workflows.utils import rechunk_zarr_array_with_caching

runtime = get_runtime()

intermediate_cache_store = CacheStore(
    config.get('storage.intermediate.uri'),
    storage_options=config.get('storage.intermediate.storage_options'),
)
results_cache_store = CacheStore(
    config.get('storage.results.uri'),
    storage_options=config.get('storage.results.storage_options'),
)

fit_and_predict_task = task(
    gard_fit_and_predict,
    checkpoint=True,
    result=XpersistResult(intermediate_cache_store, serializer="xarray.zarr"),
    target=make_gard_predict_output_path,
)

read_scrf_task = task(read_scrf)
def rechunk_zarr_array_with_caching(
    zarr_array: xr.Dataset,
    chunking_approach: Optional[str] = None,
    template_chunk_array: Optional[xr.Dataset] = None,
    output_path: Optional[str] = None,
    max_mem: str = "200MB",
    overwrite: bool = False,
) -> xr.Dataset:
    """Use the `rechunker` package to adjust the chunks of a dataset to a form conducive to your processing.

    Parameters
    ----------
    zarr_array : xr.Dataset
        Dataset you want to rechunk.
    chunking_approach : str
        Has to be one of `full_space` or `full_time`. If `full_space`, the data will be rechunked
        such that the space dimensions are contiguous (i.e. each chunk will contain full maps).
        If `full_time`, the data will be rechunked such that the time dimension is contiguous
        (i.e. each chunk will contain full time series).
    template_chunk_array : xr.Dataset, optional
        Dataset whose chunking scheme is used as a template instead of `chunking_approach`.
    output_path : str, optional
        Path where the output data is saved. If the output path is not empty, the content will be
        loaded and the schema checked. If the schema check passes, the content will be returned
        without rechunking again (i.e. caching); otherwise, the content can be overwritten
        (see the `overwrite` option).
    max_mem : str
        The max memory you want to allow for a chunk. Probably want it to be around 100 MB, but
        that is also controlled by the `calc_auspicious_chunks_dict` calls.
    overwrite : bool
        Whether to overwrite the content saved at `output_path` if that content does not pass the
        schema check.

    Returns
    -------
    rechunked_ds : xr.Dataset
        Rechunked dataset
    """
    # determine the chunking schema
    if template_chunk_array is None:
        if chunking_approach == 'full_space':
            # if we need full maps, chunk along the time dimension
            chunk_dims = ('time',)
        elif chunking_approach == 'full_time':
            # if we need full time series, chunk along the lat/lon dimensions
            chunk_dims = ('lat', 'lon')
        else:
            raise NotImplementedError("chunking_approach must be in ['full_space', 'full_time']")
        example_var = list(zarr_array.data_vars)[0]
        chunk_def = calc_auspicious_chunks_dict(zarr_array[example_var], chunk_dims=chunk_dims)
    else:
        example_var = list(zarr_array.data_vars)[0]
        chunk_def = {
            'time': min(template_chunk_array.chunks['time'][0], len(zarr_array.time)),
            'lat': min(template_chunk_array.chunks['lat'][0], len(zarr_array.lat)),
            'lon': min(template_chunk_array.chunks['lon'][0], len(zarr_array.lon)),
        }
    chunks_dict = {
        'time': None,  # write None here because you don't want to rechunk this array
        'lon': None,
        'lat': None,
    }
    for var in zarr_array.data_vars:
        chunks_dict[var] = chunk_def

    # make the schema for what you want the rechunking routine to produce
    # so that you can check whether what you passed in (zarr_array) already looks like that
    # if it does, you'll skip the rechunking!
    schema_dict = {}
    for var in zarr_array.data_vars:
        schema_dict[var] = DataArraySchema(chunks=chunk_def)
    target_schema = DatasetSchema(schema_dict)

    # make storage patterns
    if output_path is not None:
        output_path = config.get('storage.intermediate.uri') + '/' + output_path
    temp_store, target_store, target_path = make_rechunker_stores(output_path)
    print(f'target path is {target_path}')

    # check and see if the output is empty; if there is content, check that it's chunked correctly
    if len(target_store) > 0:
        print('checking the cache')
        output = xr.open_zarr(target_store)
        try:
            # if the content in the target path is correctly chunked, return it
            target_schema.validate(output)
            return output
        except SchemaError:
            if overwrite:
                target_store.clear()
            else:
                raise NotImplementedError(
                    'The content in the output path is incorrectly chunked, but overwrite is disabled. '
                    'Either clear the output or enable overwrite by setting overwrite=True'
                )

    # process the input zarr array
    delete_chunks_encoding(zarr_array)
    try:
        print('checking the chunk')
        # now check if the input is already correctly chunked. If so, save to the output location and return
        target_schema.validate(zarr_array)
        zarr_array.to_zarr(target_store, mode='w', consolidated=True)
        return zarr_array
    except SchemaError:
        print('rechunking')
        try:
            rechunk_plan = rechunk(
                zarr_array,
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)
        except ValueError:
            print(
                'WARNING: Failed to write zarr store, perhaps because of variable chunk sizes, trying to rechunk it'
            )
            # clear the store because the target store has already been created in the try statement above
            # and rechunker fails if there's already content at the target
            target_store.clear()
            zarr_array = zarr_array.chunk(chunks_dict[example_var])
            rechunk_plan = rechunk(
                zarr_array,
                chunks_dict,
                max_mem,
                target_store,
                temp_store=temp_store,
            )
            rechunk_plan.execute(retries=5)

        # ideally we want consolidated=True, but it seems that functionality isn't offered in rechunker right now;
        # we can add a consolidate_metadata step here to do it after the fact (once rechunker is done), but that is
        # only necessary if we'll reopen this rechunked_ds multiple times
        rechunked_ds = xr.open_zarr(target_store)
        return rechunked_ds
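# Illustrative sketch (not from the original source): rechunking an in-memory dataset so that
# each chunk holds full time series, caching the result under a hypothetical relative path
# inside the configured intermediate store.
def _example_rechunk_full_time(ds):  # hypothetical helper; ds assumed to be an xr.Dataset with time/lat/lon dims
    return rechunk_zarr_array_with_caching(
        ds,
        chunking_approach='full_time',
        output_path='example_rechunked.zarr',  # assumed name, appended to storage.intermediate.uri
        max_mem='200MB',
        overwrite=False,
    )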