def transform(self, X, **kwargs):
    """Apply transforms to the data, and transform with the final estimator

    Parameters
    ----------
    X : xarray.DataArray
        Data to transform on. Must fulfill input requirements of first step
        of the model or pipeline.
    feature_dim : str, optional
        Name of feature dimension.
    **transform_params : dict of string -> object
        Parameters to the ``transform`` called at the end of all
        transformations in the pipeline.

    Returns
    -------
    y_trans : xarray.DataArray
    """
    kws = {'feature_dim': DEFAULT_FEATURE_DIM}
    kws.update(kwargs)

    X = self._to_feature_x(X, feature_dim=kws['feature_dim'])

    if X.chunks:
        return xr.map_blocks(_transform_wrapper, X, args=[self._models], kwargs=kws)
    else:
        return _transform_wrapper(X, self._models, **kws)
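# The chunked/unchunked dispatch used above is a general pattern: call the worker
# directly on numpy-backed data, or hand it to xr.map_blocks when the input is
# dask-backed. A minimal standalone sketch of that pattern (illustrative names,
# not part of the original pipeline class):
import numpy as np
import xarray as xr

def _double(block):
    # worker applied to each block; must return an xarray object
    return block * 2

def apply_maybe_blockwise(da):
    # dispatch: map_blocks for dask-backed inputs, plain call otherwise
    if da.chunks:
        return xr.map_blocks(_double, da)
    return _double(da)

da = xr.DataArray(np.arange(12.0).reshape(3, 4), dims=("x", "y"))
assert apply_maybe_blockwise(da.chunk({"x": 1})).compute().equals(apply_maybe_blockwise(da))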
def compute_global_btt_quantile_complete_ensemble(sub_ds, name, q, nr_time_steps, sto_cache_size):
    chunk_dict = {"prob": 5}
    sub_ds = sub_ds.chunk(chunk_dict)

    fake_data = np.zeros((len(sub_ds.prob), len(sub_ds.time)))
    fake_array = xr.DataArray(data=fake_data, dims=['prob', "time"])
    fake_coords = {"prob": sub_ds.prob.data, "time": sub_ds.time.data}

    fake_ds = xr.Dataset(
        data_vars={name: fake_array},
        coords=fake_coords
    ).chunk(chunk_dict)

    ds_res = xr.map_blocks(
        func_chunk,
        sub_ds,
        args=(compute_global_btt_quantile, ),
        kwargs={
            "name": name,
            "q": q,
            "nr_time_steps": nr_time_steps,
            "time_step_in_days": params["time_step_in_days"],
            "nr_sites": None,
            "sto_cache_size": sto_cache_size
        },
        template=fake_ds
    )

    return ds_res
def predict(self, X, **kwargs):
    """Apply transforms to the data, and predict with the final estimator

    Parameters
    ----------
    X : xarray.DataArray
        Data to predict on. Must fulfill input requirements of first step
        of the model or pipeline.
    feature_dim : str, optional
        Name of feature dimension.
    **predict_params : dict of string -> object
        Parameters to the ``predict`` called at the end of all
        transformations in the pipeline. Note that while this may be used to
        return uncertainties from some models with return_std or return_cov,
        uncertainties that are generated by the transformations in the
        pipeline are not propagated to the final estimator.

    Returns
    -------
    y_pred : xarray.DataArray
    """
    kws = {'along_dim': self._dim, 'feature_dim': DEFAULT_FEATURE_DIM}
    kws.update(kwargs)

    X = self._to_feature_x(X, feature_dim=kws['feature_dim'])

    if X.chunks:
        return xr.map_blocks(_predict_wrapper, X, args=[self._models], kwargs=kws)
    else:
        return _predict_wrapper(X, self._models, **kws)
def smooth2d(da: Var, datastore: DataStore = None, chunks: dict = None, **kwargs) -> xr.DataArray:
    '''
    Return an xr.DataArray smoothed along two dimensions.
    Works with both chunked (dask) and unchunked (numpy) data.
    Metadata attrs are adjusted according to camps metadata conventions.
    '''
    if isinstance(da, camps.Variable):
        da = da(datastore=datastore, chunks=chunks, **kwargs)
    else:
        if chunks:
            da = da.camps.chunk(chunks)

    x = da.camps.x.name
    y = da.camps.y.name

    # rechunk so that multiple chunks don't span x and y dims
    if da.chunks is not None:
        da = da.chunk({x: -1, y: -1})

    dims = (x, y)
    kwargs['dims'] = dims  # kwargs are passed to smooth2d_block
    da = xr.map_blocks(smooth2d_block, da, kwargs=kwargs, template=da)

    da.attrs['smooth'] = 'smooth_9point'

    return da
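# smooth2d_block is defined elsewhere in camps; a hypothetical block worker of the
# same shape, assuming a 9-point (3x3) mean smoother, might look like the sketch
# below. Its output shape matches the input, which is why template=da works above.
import xarray as xr
from scipy.ndimage import uniform_filter

def smooth2d_block_sketch(block: xr.DataArray, *, dims, **_ignored) -> xr.DataArray:
    # smooth with a 3x3 moving average over the two requested dims only
    axes = tuple(block.get_axis_num(d) for d in dims)
    size = [3 if ax in axes else 1 for ax in range(block.ndim)]
    smoothed = uniform_filter(block.values, size=size, mode="nearest")
    return block.copy(data=smoothed)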
def test_map_blocks_kwargs(obj):
    expected = xr.full_like(obj, fill_value=np.nan)
    with raise_if_dask_computes():
        actual = xr.map_blocks(xr.full_like, obj, kwargs=dict(fill_value=np.nan))
    assert_chunks_equal(expected.chunk(), actual)
    xr.testing.assert_identical(actual.compute(), expected.compute())
def mld_dsigma(SALT, TEMP, dsigma=0.03, rho_chunks={'nlat': 16, 'nlon': 16}):
    """
    Compute MLD based on ∆σ criterion. Uses xarray.map_blocks.

    Parameters
    ----------
    SALT : xarray.DataArray
        Salinity
    TEMP : xarray.DataArray
        Potential temperature
    dsigma : float, optional
        The value for ∆σ.

    Returns
    -------
    MLD : xarray.DataArray
        The MLD (m) defined as the point in the water column where
        density exceeds rho[0] + dsigma.
    """
    # determine dimensionality
    dims_in = SALT.dims
    assert dims_in == TEMP.dims, 'dimension mismatch'
    assert 'z_t' in SALT.coords, 'z_t not found in SALT coords'

    # drop ancillary coordinates (this may not be necessary)
    SALT = SALT.reset_coords(drop=True)
    TEMP = TEMP.reset_coords(drop=True)

    # compute density
    rho = pop_tools.eos(SALT.chunk({'z_t': 10}), TEMP.chunk({'z_t': 10}), depth=SALT.z_t * 0.).compute()

    if 'nlat' in rho.dims:
        rho = rho.assign_coords({
            'nlat': xr.DataArray(np.arange(len(SALT.nlat)), dims=('nlat')),
            'nlon': xr.DataArray(np.arange(len(SALT.nlon)), dims=('nlon')),
        })
    rho = rho.chunk(rho_chunks).persist()

    # compute and return MLD
    template = rho.isel(z_t=0).drop('z_t')
    template.attrs['long_name'] = 'MLD'
    template.attrs['units'] = SALT.z_t.attrs['units']
    template.name = 'MLD'

    return xr.map_blocks(
        _interp_mld, rho,
        kwargs=dict(dsigma=dsigma),
        template=template,
    )
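# _interp_mld is not shown here; the essential map_blocks pattern is a block
# function that reduces away z_t, paired with a template that already has z_t
# removed. A generic sketch of that pattern with made-up data (the worker below is
# illustrative, not the package's actual interpolation):
import numpy as np
import xarray as xr

def first_exceedance_index(rho_block, *, dsigma):
    # index of the first level where density exceeds surface density + dsigma,
    # NaN where the threshold is never exceeded; reduces the z_t dimension
    exceed = rho_block > rho_block.isel(z_t=0) + dsigma
    return exceed.argmax("z_t").where(exceed.any("z_t"))

rho = xr.DataArray(
    np.cumsum(np.random.rand(4, 6, 6), axis=0),
    dims=("z_t", "nlat", "nlon"),
).chunk({"nlat": 3, "nlon": 3})

template = rho.isel(z_t=0)  # same horizontal chunks, z_t removed
mld_index = xr.map_blocks(first_exceedance_index, rho, kwargs=dict(dsigma=0.03), template=template).compute()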
def test_map_blocks_object_method(obj):
    def func(obj):
        result = obj + obj.x + 5 * obj.y
        return result

    with raise_if_dask_computes():
        expected = xr.map_blocks(func, obj)
        actual = obj.map_blocks(func)

    assert_identical(expected.compute(), actual.compute())
def test_map_blocks(obj):
    def func(obj):
        result = obj + obj.x + 5 * obj.y
        return result

    with raise_if_dask_computes():
        actual = xr.map_blocks(func, obj)
    expected = func(obj)
    assert_chunks_equal(expected.chunk(), actual)
    xr.testing.assert_identical(actual.compute(), expected.compute())
def test_map_blocks_change_name(map_da):
    def change_name(obj):
        obj = obj.copy(deep=True)
        obj.name = "new"
        return obj

    expected = change_name(map_da)
    with raise_if_dask_computes():
        actual = xr.map_blocks(change_name, map_da)

    xr.testing.assert_identical(actual.compute(), expected.compute())
def test_map_blocks_add_attrs(obj):
    def add_attrs(obj):
        obj = obj.copy(deep=True)
        obj.attrs["new"] = "new"
        obj.cxy.attrs["new2"] = "new2"
        return obj

    expected = add_attrs(obj)
    with raise_if_dask_computes():
        actual = xr.map_blocks(add_attrs, obj)

    xr.testing.assert_identical(actual.compute(), expected.compute())
def fit(self, X, *args, **kwargs):
    """Fit the model

    Fit all the transforms one after the other and transform the data,
    then fit the transformed data using the final estimator.

    Parameters
    ----------
    X : xarray.DataArray or xarray.Dataset
        Training data. Must fulfill input requirements of first step of
        the pipeline. If an xarray.Dataset is passed, it will be converted
        to an array using `to_array()`.
    y : xarray.DataArray, optional
        Training targets. Must fulfill label requirements for all steps
        of the pipeline.
    feature_dim : str, optional
        Name of feature dimension.
    **fit_params : dict of string -> object
        Parameters passed to the ``fit`` method of this model. If the
        model is a sklearn Pipeline, parameters can be passed to each
        step, where each parameter name is prefixed such that parameter
        ``p`` for step ``s`` has key ``s__p``.
    """
    kws = {'along_dim': self._dim, 'feature_dim': DEFAULT_FEATURE_DIM}
    kws.update(kwargs)

    assert len(args) <= 1
    args = list(args)
    args.append(self._model)

    X = self._to_feature_x(X, feature_dim=kws['feature_dim'])

    if X.chunks:
        reduce_dims = [self._dim, kws['feature_dim']]
        mask = _make_mask(X, reduce_dims)
        template = xr.full_like(mask, None, dtype=object)
        self._models = xr.map_blocks(_fit_wrapper, X, args=args, kwargs=kws, template=template)
    else:
        self._models = _fit_wrapper(X, *args, **kws)
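# _fit_wrapper and _make_mask belong to the package above and are not shown. The
# key idea is an object-dtype template whose reduced dims are removed, so each
# block can return arbitrary fitted estimators as array elements. A rough
# standalone sketch of that idea (names and the toy "model" are assumptions):
import numpy as np
import xarray as xr

def fit_block(block):
    # one fitted object per y element, reducing over x and feature
    out = np.empty(block.sizes["y"], dtype=object)
    for j in range(block.sizes["y"]):
        out[j] = {"mean": float(block.isel(y=j).mean())}
    return xr.DataArray(out, dims="y", coords={"y": block["y"]})

X = xr.DataArray(
    np.random.rand(5, 6, 2),
    dims=("x", "y", "feature"),
    coords={"y": np.arange(6)},
).chunk({"y": 3})

template = xr.full_like(X.isel(x=0, feature=0, drop=True), None, dtype=object)
models = xr.map_blocks(fit_block, X, template=template).compute()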
def test_map_blocks_error(map_da, map_ds):
    def bad_func(darray):
        return (darray * darray.x + 5 * darray.y)[:1, :1]

    with raises_regex(ValueError, "Length of the.* has changed."):
        xr.map_blocks(bad_func, map_da).compute()

    def returns_numpy(darray):
        return (darray * darray.x + 5 * darray.y).values

    with raises_regex(TypeError, "Function must return an xarray DataArray"):
        xr.map_blocks(returns_numpy, map_da)

    with raises_regex(TypeError, "args must be"):
        xr.map_blocks(operator.add, map_da, args=10)

    with raises_regex(TypeError, "kwargs must be"):
        xr.map_blocks(operator.add, map_da, args=[10], kwargs=[20])

    def really_bad_func(darray):
        raise ValueError("couldn't do anything.")

    with raises_regex(Exception, "Cannot infer"):
        xr.map_blocks(really_bad_func, map_da)

    ds_copy = map_ds.copy()
    ds_copy["cxy"] = ds_copy.cxy.chunk({"y": 10})

    with raises_regex(ValueError, "inconsistent chunks"):
        xr.map_blocks(bad_func, ds_copy)

    with raises_regex(TypeError, "Cannot pass dask collections"):
        xr.map_blocks(bad_func, map_da, args=[map_da.chunk()])

    with raises_regex(TypeError, "Cannot pass dask collections"):
        xr.map_blocks(bad_func, map_da, kwargs=dict(a=map_da.chunk()))
def generate_product(
    self,
    dc,
    path_prefix,
    aoi,
    output_projection,
    baseline_start_date,
    baseline_end_date,
    analysis_start_date,
    analysis_end_date,
    platform_base,
    platform_analysis,
    res,
    aoi_crs,
    **kwargs,
):
    ## Create datacube query

    dask_chunks = dict(time=10, x=500, y=500)

    query = create_base_query(aoi, res, output_projection, aoi_crs, dask_chunks)

    all_measurements = ["green", "red", "blue", "nir", "swir1", "swir2"]
    (
        baseline_product,
        baseline_measurement,
        baseline_water_product,
    ) = create_product_measurement(platform_base, all_measurements)
    (
        analysis_product,
        analysis_measurement,
        analysis_water_product,
    ) = create_product_measurement(platform_analysis, all_measurements)

    baseline_time_period = (baseline_start_date, baseline_end_date)
    analysis_time_period = (analysis_start_date, analysis_end_date)

    ## Create dask graph

    baseline_ds = dc.load(
        time=baseline_time_period,
        platform=platform_base,
        product=baseline_product,
        measurements=baseline_measurement,
        **query,
    )

    analysis_ds = dc.load(
        time=analysis_time_period,
        platform=platform_analysis,
        product=analysis_product,
        measurements=analysis_measurement,
        **query,
    )

    if is_dataset_empty(baseline_ds):
        raise Exception(
            "DataCube Load returned an empty Dataset. "
            + "Please check load parameters for Baseline Dataset!"
        )

    if is_dataset_empty(analysis_ds):
        raise Exception(
            "DataCube Load returned an empty Dataset. "
            + "Please check load parameters for Analysis Dataset!"
        )

    water_scenes_baseline = dc.load(
        product=baseline_water_product,
        measurements=["water_classification"],
        time=baseline_time_period,
        **query,
    )
    water_scenes_baseline = water_scenes_baseline.where(water_scenes_baseline >= 0)

    water_scenes_analysis = dc.load(
        product=analysis_water_product,
        measurements=["water_classification"],
        time=analysis_time_period,
        **query,
    )
    water_scenes_analysis = water_scenes_analysis.where(water_scenes_analysis >= 0)

    baseline_composite = geomedian(baseline_ds, baseline_product, all_measurements)
    analysis_composite = geomedian(analysis_ds, analysis_product, all_measurements)

    water_classes_base = water_scenes_baseline.where(water_scenes_baseline >= 0)
    water_classes_analysis = water_scenes_analysis.where(water_scenes_analysis >= 0)

    water_composite_base = water_classes_base.water_classification.mean(dim="time")
    water_composite_analysis = water_classes_analysis.water_classification.mean(
        dim="time"
    )

    baseline_composite = baseline_composite.rename(
        {"y": "latitude", "x": "longitude"}
    )
    water_composite_base = water_composite_base.rename(
        {"y": "latitude", "x": "longitude"}
    )
    analysis_composite = analysis_composite.rename(
        {"y": "latitude", "x": "longitude"}
    )
    water_composite_analysis = water_composite_analysis.rename(
        {"y": "latitude", "x": "longitude"}
    )

    # Spectral Parameter Anomaly

    parameter_baseline_composite = xr.map_blocks(
        frac_coverage_classify, baseline_composite, kwargs={"no_data": np.nan}
    )
    parameter_analysis_composite = xr.map_blocks(
        frac_coverage_classify, analysis_composite, kwargs={"no_data": np.nan}
    )

    frac_cov_baseline = parameter_baseline_composite.where(
        (water_composite_base <= 0.4) & (parameter_baseline_composite != -9999)
    )
    frac_cov_analysis = parameter_analysis_composite.where(
        (water_composite_analysis <= 0.4) & (parameter_analysis_composite != -9999)
    )

    parameter_anomaly = frac_cov_analysis - frac_cov_baseline

    ## Compute

    parameter_anomaly_output = parameter_anomaly.compute()

    ## Export products

    bs_output = parameter_anomaly_output.bs
    pv_output = parameter_anomaly_output.pv
    npv_output = parameter_anomaly_output.npv

    ## Write files

    result = []
    file_name = path.join(path_prefix, "land_change.tiff")
    import_export.export_xarray_to_geotiff(
        parameter_anomaly_output,
        file_name,
        crs=output_projection,
        x_coord="longitude",
        y_coord="latitude",
    )
    result.append(file_name)

    file_name = path.join(path_prefix, "bs_change.tiff")
    import_export.export_xarray_to_geotiff(
        bs_output,
        file_name,
        crs=output_projection,
        x_coord="longitude",
        y_coord="latitude",
    )
    result.append(file_name)

    file_name = path.join(path_prefix, "pv_change.tiff")
    import_export.export_xarray_to_geotiff(
        pv_output,
        file_name,
        crs=output_projection,
        x_coord="longitude",
        y_coord="latitude",
    )
    result.append(file_name)

    file_name = path.join(path_prefix, "npv_change.tiff")
    import_export.export_xarray_to_geotiff(
        npv_output,
        file_name,
        crs=output_projection,
        x_coord="longitude",
        y_coord="latitude",
    )
    result.append(file_name)

    return result
def calc_perturb(the_array):
    the_mean = calc_mean(the_array)
    the_perturb = the_array - the_mean
    return the_perturb

#
# save the perturbation and mean in separate dictionaries
#
vars = ['TABS', 'W', 'TR01']
perturb_dict_keys = ['temp_prime', 'w_prime', 'tr_prime']
mean_dict_keys = ['temp_mean', 'w_mean', 'tr_mean']

perturb_dict = {}
key_pairs = zip(perturb_dict_keys, vars)
for key, a_var in key_pairs:
    perturb_dict[key] = \
        xr.map_blocks(calc_perturb, zarr_slim_ds[a_var].chunk((tlim, zlim, ylim, xlim)))

mean_dict = {}
key_pairs = zip(mean_dict_keys, vars)
for key, a_var in key_pairs:
    mean_dict[key] = \
        xr.map_blocks(calc_mean, zarr_slim_ds[a_var].chunk((tlim, zlim, ylim, xlim)))

#
# now make a new dataset with these variables
#
new_ds = xr.Dataset(perturb_dict)
#
# and add the remaining means
#
for key, value in mean_dict.items():
def main():
    #client = Client(n_workers=2, threads_per_worker=1, memory_limit="3GB")
    #client

    # +
    data_folder = "/home/hmetzler/Desktop/CARDAMOM/"  # local
    #data_folder = "/home/data/CARDAMOM/"  # matagorda
    filestem = "cardamom_for_holger_10_ensembles"
    #filestem = "cardamom_for_holger"
    #chunk_dict = {"ens": 20}
    chunk_dict = {"ens": 2}
    #filestem = "cardamom_for_holger"
    #chunk_dict = {"ens": 100}

    ds = xr.open_dataset(data_folder + filestem + ".nc")
    #.isel(
    #    ens=slice(None, 6),
    #    time=slice(None, 5)
    #)
    ds = ds.chunk(chunk_dict)
    ds
    # -

    # there is no multi-dimensional 'groupby' in xarray data structures
    def nested_groupby_apply(dataset, groupby, apply_fn, **kwargs):
        if len(groupby) == 1:
            return dataset.groupby(groupby[0]).apply(apply_fn, **kwargs)
        else:
            return dataset.groupby(groupby[0]).apply(
                nested_groupby_apply, groupby=groupby[1:], apply_fn=apply_fn, **kwargs
            )

    comp_dict = {'zlib': True, 'complevel': 9}

    # +
    # %%time

    # compute in parallel the model runs and save them to ds_mrs in netcdf format
    small_template = xr.Dataset(
        data_vars={
            'x': xr.DataArray(
                data=np.ndarray(dtype=float, shape=(len(ds.ens.data), )),
                dims=['ens']
            )
        }
    ).chunk(chunk_dict)

    def func(single_site_ds):
        res = CARDAMOMlib.compute_pwc_mr_fd_ds(single_site_ds)
        return res

    def func_chunk(chunk_ds):
        print('\nChunk start:', chunk_ds.ens.data[0], '\n')
        res = nested_groupby_apply(chunk_ds, ['ens', 'lat', 'lon'], func)

        filename = filestem + "_{:03d}-{:03d}.nc".format(res.ens.data[0], res.ens.data[-1])
        encoding = {var: comp_dict for var in res.data_vars}
        res.to_netcdf(filename, encoding=encoding)
        print(res)
        del res

        return xr.Dataset(
            data_vars={
                'x': xr.DataArray(data=np.zeros((chunk_dict['ens'], )), dims=['ens'])
            }
        )

    _ = xr.map_blocks(func_chunk, ds, template=small_template).compute()
    # -

    ds.close()
    del ds
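# The pattern above uses map_blocks mainly for its scheduling: each block writes
# its own file as a side effect and returns a tiny placeholder that matches a
# small template. A minimal sketch of that side-effect pattern (illustrative file
# names and data, assuming a netCDF backend is installed):
import numpy as np
import xarray as xr

def process_and_save(block):
    # do the real work and persist it per block (side effect), then return a
    # cheap placeholder with the expected per-block shape
    block.to_netcdf(f"block_{int(block.ens.data[0]):03d}.nc")
    return xr.Dataset({"x": xr.DataArray(np.zeros(block.sizes["ens"]), dims="ens", coords={"ens": block.ens})})

ds = xr.Dataset(
    {"v": (("ens", "time"), np.random.rand(4, 3))},
    coords={"ens": np.arange(4), "time": np.arange(3)},
).chunk({"ens": 2})

small_template = xr.Dataset(
    {"x": xr.DataArray(np.zeros(4), dims="ens", coords={"ens": np.arange(4)})}
).chunk({"ens": 2})
_ = xr.map_blocks(process_and_save, ds, template=small_template).compute()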
def process_fractional_cover(
    dc,
    product,
    query_x_from,
    query_x_to,
    query_y_from,
    query_y_to,
    time_from,
    time_to,
    output_crs,
    query_crs="EPSG:4326",
    dask_time_chunk_size="10",
    dask_x_chunk_size="600",
    dask_y_chunk_size="600",
    **kwargs,
):
    nodata = -9999
    time = (time_from, time_to)

    data_bands = ["red", "green", "blue", "nir", "swir1", "swir2"]

    # Product here is a geomedian product
    if product.startswith("ls"):
        resolution = (-30, 30)
        water_product = product[:3] + "_water_classification"
    else:
        resolution = (-10, 10)
        # TODO: Change when S2 WOFS ready
        water_product = None
        return None

    query = {}
    query["output_crs"] = output_crs
    query["resolution"] = resolution
    query["dask_chunks"] = {
        "time": int(dask_time_chunk_size),
        "x": int(dask_x_chunk_size),
        "y": int(dask_y_chunk_size),
    }
    if query_crs != "EPSG:4326":
        query["crs"] = query_crs

    query["x"] = (float(query_x_from), float(query_x_to))
    query["y"] = (float(query_y_from), float(query_y_to))

    water_scenes = dc.load(product=water_product, measurements=["water_classification"], time=time, **query)
    water_scenes = water_scenes.where(water_scenes >= 0)
    water_composite_mean = water_scenes.water_classification.mean(dim="time")

    land_composite = dc.load(product=product, measurements=data_bands, time=time, **query)
    if len(land_composite.dims) == 0 or len(land_composite.data_vars) == 0:
        return None

    # Fractional Cover Classification
    frac_classes = xr.map_blocks(frac_coverage_classify, land_composite, kwargs={"no_data": nodata})

    # Mask to remove clouds, cloud shadow, and water.
    frac_cov_masked = frac_classes.where((frac_classes != nodata) & (water_composite_mean <= 0.4))

    ## Compute
    fractional_cover = frac_cov_masked.compute()

    return fractional_cover
def func(single_site_ds):
    res = CARDAMOMlib.compute_pwc_mr_fd_ds(single_site_ds)
    return res

def func_chunk(chunk_ds):
    print('\nChunk start:', chunk_ds.ens.data[0], '\n')
    res = nested_groupby_apply(chunk_ds, ['ens', 'lat', 'lon'], func)

    filename = filestem + "_{:03d}-{:03d}.nc".format(res.ens.data[0], res.ens.data[-1])
    encoding = {var: comp_dict for var in res.data_vars}
    res.to_netcdf(filename, encoding=encoding)
    print(res)
    del res
    gc.collect()

    return xr.Dataset(
        data_vars={
            'x': xr.DataArray(data=np.zeros((chunk_dict['ens'], )), dims=['ens'])
        }
    )

_ = xr.map_blocks(func_chunk, ds, template=small_template).compute()
# -

ds.close()
del ds
def test_map_blocks_to_array(map_ds):
    with raise_if_dask_computes():
        actual = xr.map_blocks(lambda x: x.to_array(), map_ds)

    # to_array does not preserve name, so cannot use assert_identical
    assert_equal(actual.compute(), map_ds.to_array().compute())
def generate_product(
    self,
    dc,
    path_prefix,
    aoi,
    output_projection,
    start_date,
    end_date,
    platform,
    res,
    aoi_crs,
    **kwargs,
):
    ## Create datacube query

    dask_chunks = dict(time=10, x=600, y=600)

    query = create_base_query(aoi, res, output_projection, aoi_crs, dask_chunks)

    all_measurements = ["green", "red", "blue", "nir", "swir1", "swir2"]
    product, measurement, water_product = create_product_measurement(
        platform, all_measurements
    )

    time = (start_date, end_date)

    ## Create dask graph

    ds = dc.load(
        time=time,
        platform=platform,
        product=product,
        measurements=measurement,
        **query,
    )

    if is_dataset_empty(ds):
        raise Exception(
            "DataCube Load returned an empty Dataset. "
            + "Please check load parameters for Baseline Dataset!"
        )

    water_scenes = dc.load(
        product=water_product,
        measurements=["water_classification"],
        time=time,
        **query,
    )
    water_scenes = water_scenes.where(water_scenes >= 0)

    water_composite_mean = water_scenes.water_classification.mean(dim="time")
    water_composite_mean = water_composite_mean.rename(
        {"x": "longitude", "y": "latitude"}
    )

    land_composite = geomedian(ds, product, all_measurements)
    land_composite = land_composite.rename({"x": "longitude", "y": "latitude"})

    # Fractional Cover Classification
    frac_classes = xr.map_blocks(
        frac_coverage_classify, land_composite, kwargs={"no_data": np.nan}
    )

    # Mask to remove clouds, cloud shadow, and water.
    # Keep pixels with valid fractional cover (not NaN) over land (water fraction <= 0.4).
    frac_cov_masked = frac_classes.where(
        frac_classes.notnull() & (water_composite_mean <= 0.4)
    )

    ## Compute

    fractional_cover_output = frac_cov_masked.compute()

    ## Write file

    file_name = path.join(path_prefix, "fractional_cover.tiff")
    import_export.export_xarray_to_geotiff(
        fractional_cover_output,
        file_name,
        crs=output_projection,
        x_coord="longitude",
        y_coord="latitude",
    )

    return [file_name]
# %%time

# compute in parallel the model runs and save them to ds_mrs in netcdf format
def func(single_site_ds):
    res = CARDAMOMlib.compute_pwc_mr_fd_ds(single_site_ds)
    return res

def func_chunk(chunk_ds):
    res = nested_groupby_apply(chunk_ds, ['ens', 'lat', 'lon'], func)
    return res

ds_mrs = xr.map_blocks(func_chunk, ds, template=ds_mr_template).compute()
#ds_mrs = xr.map_blocks(func_chunk, ds, template=ds).compute()
print(ds_mrs)

comp_dict = {'zlib': True, 'complevel': 9}
encoding = {var: comp_dict for var in ds_mrs.data_vars}
ds_mrs.to_netcdf(filestem + "_pwc_mrs_fd" + ".nc", encoding=encoding)
ds_mrs.close()

# +
# %%time

# compute Delta 14C values on entire grid (ens x lat x lon) (400 x 2 x 2)
def func(sub_ds):
#            flush=True
#        )
#        write_to_logfile(
#            'chunk finished,',
#            "lat:", chunk_ds.lat[0].data,
#            "lon:", chunk_ds.lon[0].data,
#            "prob:", chunk_ds.prob[0].data
#        )

    return res_ds
# -

fake_ds = make_fake_ds(ds_sub).chunk(chunk_dict)
ds_pwc_mr_fd = xr.map_blocks(func_chunk, ds_sub, template=fake_ds)

fake_ds

ds_pwc_mr_fd

# +
# %%time

c = ds_sub.chunks
nr_chunks = np.prod([len(val) for val in c.values()])
nr_singles = len(ds_sub.lat) * len(ds_sub.lon) * len(ds_sub.prob)
write_to_logfile(
    'starting:',
#    nr_chunks, "chunks, ",
    nr_singles, "singles"
)
def test_map_blocks_convert_args_to_list(obj):
    expected = obj + 10
    with raise_if_dask_computes():
        actual = xr.map_blocks(operator.add, obj, [10])
    assert_chunks_equal(expected.chunk(), actual)
    xr.testing.assert_identical(actual.compute(), expected.compute())
def reduce(self, xx: xr.Dataset) -> xr.Dataset:
    template = xr.Dataset({m: xx.nbart_blue for m in self.measurements})
    return xr.map_blocks(nmask_pmod.summarise, xx, template=template)
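# nmask_pmod.summarise is external; the pattern above builds a template Dataset
# with one variable per output measurement, each borrowing the shape and chunks of
# an existing band. A self-contained sketch of the same idea (the summariser below
# is a made-up stand-in):
import numpy as np
import xarray as xr

def summarise_sketch(xx: xr.Dataset) -> xr.Dataset:
    # produce one variable per output measurement, shaped like the blue band
    blue = xx["nbart_blue"]
    return xr.Dataset({"total": blue * 0 + blue.sum(), "peak": blue * 0 + blue.max()})

xx = xr.Dataset({"nbart_blue": (("y", "x"), np.random.rand(4, 4))}).chunk({"y": 2})
template = xr.Dataset({m: xx.nbart_blue for m in ["total", "peak"]})
out = xr.map_blocks(summarise_sketch, xx, template=template).compute()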
fake_coords = {
    'lat': ds_sub.lat.data,
    'lon': ds_sub.lon.data,
    'prob': ds_sub.prob.data
}

fake_ds = xr.Dataset(
    data_vars={
        'abs_err': fake_array,
        'rel_err': fake_array
    },
    coords=fake_coords
).chunk(chunk_dict)

fake_ds

# +
# %%time
ds_data_consistency = xr.map_blocks(func_chunk, ds_sub, template=fake_ds)
# -

comp_dict = {'zlib': True, 'complevel': 9}
encoding = {var: comp_dict for var in ds_data_consistency.data_vars}
ds_data_consistency.to_netcdf(
    data_folder + filestem + output_folder + "data_consistency.nc",
    encoding=encoding,
    compute=True
)

ds.close()
del ds
ds_sub.close()
del ds_sub
ds_data_consistency.close()
del ds_data_consistency
def bootstrap_func(compute_index_func: Callable, **kwargs) -> xr.DataArray:
    """Bootstrap the computation of percentile-based exceedance indices.

    Indices measuring exceedance over percentile-based thresholds may contain
    artificial discontinuities at the beginning and end of the reference period
    used for calculating the percentile. A bootstrap resampling procedure can
    reduce those discontinuities by iteratively removing each year the indice
    is computed on from the percentile estimate and replacing it with another
    year within the reference period.

    Parameters
    ----------
    compute_index_func : Callable
        Indice function.
    kwargs : dict
        Arguments to `func`.

    Returns
    -------
    xr.DataArray
        The result of func with bootstrapping.

    References
    ----------
    Zhang, X., Hegerl, G., Zwiers, F. W., & Kenyon, J. (2005). Avoiding
    Inhomogeneity in Percentile-Based Indices of Temperature Extremes,
    Journal of Climate, 18(11), 1641-1651, https://doi.org/10.1175/JCLI3366.1

    Notes
    -----
    This function is meant to be used by the `percentile_bootstrap` decorator.
    The parameters of the percentile calculation (percentile, window,
    reference_period) are stored in the attributes of the percentile DataArray.
    The bootstrap algorithm implemented here does the following::

        For each temporal grouping in the calculation of the indice
            If the group `g_t` is in the reference period
                For every other group `g_s` in the reference period
                    Replace group `g_t` by `g_s`
                    Compute percentile on resampled time series
                    Compute indice function using percentile
                Average output from indice function over all resampled time series
            Else compute indice function using original percentile
    """
    # Identify the input and the percentile arrays from the bound arguments
    per_key = None
    for name, val in kwargs.items():
        if isinstance(val, DataArray):
            if "percentile_doy" in val.attrs.get("history", ""):
                per_key = name
            else:
                da_key = name

    # Extract the DataArray inputs from the arguments
    da: DataArray = kwargs.pop(da_key)
    per_da: Optional[DataArray] = kwargs.pop(per_key, None)
    if per_da is None:
        # per may be empty on non doy percentiles
        raise KeyError(
            "`bootstrap` can only be used with percentiles computed using `percentile_doy`"
        )

    # Boundary years of reference period
    clim = per_da.attrs["climatology_bounds"]

    if xclim.core.utils.uses_dask(da) and len(da.chunks[da.get_axis_num("time")]) > 1:
        warnings.warn(
            "The input data is chunked on time dimension and must be fully re-chunked to"
            " run percentile bootstrapping."
" Beware, this operation can significantly increase the number of tasks dask" " has to handle.", stacklevel=2, ) chunking = {d: "auto" for d in da.dims} chunking["time"] = -1 # no chunking on time to use map_block da = da.chunk(chunking) # overlap of studied `da` and reference period used to compute percentile overlap_da = da.sel(time=slice(*clim)) if len(overlap_da.time) == len(da.time): raise KeyError( "`bootstrap` is unnecessary when all years are overlapping between reference " "(percentiles period) and studied (index period) periods") if len(overlap_da) == 0: raise KeyError( "`bootstrap` is unnecessary when no year overlap between reference " "(percentiles period) and studied (index period) periods.") pdoy_args = dict( window=per_da.attrs["window"], alpha=per_da.attrs["alpha"], beta=per_da.attrs["beta"], per=per_da.percentiles.data[()], ) bfreq = _get_bootstrap_freq(kwargs["freq"]) # Group input array in years, with an offset matching freq overlap_years_groups = overlap_da.resample(time=bfreq).groups da_years_groups = da.resample(time=bfreq).groups per_template = per_da.copy(deep=True) acc = [] # Compute bootstrapped index on each year of overlapping years for year_key, year_slice in da_years_groups.items(): kw = {da_key: da.isel(time=year_slice), **kwargs} if _get_year_label(year_key) in overlap_da.get_index("time").year: # If the group year is in both reference and studied periods, run the bootstrap bda = build_bootstrap_year_da(overlap_da, overlap_years_groups, year_key) if BOOTSTRAP_DIM not in per_template.dims: per_template = per_template.expand_dims( {BOOTSTRAP_DIM: np.arange(len(bda._bootstrap))}) if xclim.core.utils.uses_dask(bda): chunking = { d: bda.chunks[bda.get_axis_num(d)] for d in set(bda.dims).intersection( set(per_template.dims)) } per_template = per_template.chunk(chunking) per = xr.map_blocks( percentile_doy. __wrapped__, # strip history update from percentile_doy obj=bda, kwargs={ **pdoy_args, "copy": False }, template=per_template, ) if "percentiles" not in per_da.dims: per = per.squeeze("percentiles") kw[per_key] = per value = compute_index_func(**kw).mean(dim=BOOTSTRAP_DIM, keep_attrs=True) else: # Otherwise, run the normal computation using the original percentile kw[per_key] = per_da value = compute_index_func(**kw) acc.append(value) result = xr.concat(acc, dim="time") result.attrs["units"] = value.attrs["units"] return result
def generate_product(
    self,
    dc,
    path_prefix,
    aoi,
    output_projection,
    start_date,
    end_date,
    platform,
    res,
    aoi_crs,
    **kwargs,
):
    ## Create datacube query

    dask_chunks = dict(time=10, x=1000, y=1000)

    query = create_base_query(aoi, res, output_projection, aoi_crs, dask_chunks)

    all_measurements = ["green", "red", "blue", "nir", "swir1", "swir2"]
    product, measurement, water_product = create_product_measurement(
        platform, all_measurements
    )

    time = (start_date, end_date)

    ## Create dask graph

    ds = dc.load(
        time=time,
        platform=platform,
        product=product,
        measurements=measurement,
        **query,
    )

    if is_dataset_empty(ds):
        raise Exception(
            "DataCube Load returned an empty Dataset. "
            + "Please check load parameters for Baseline Dataset!"
        )

    water_scenes = dc.load(
        product=water_product,
        measurements=["water_classification"],
        time=time,
        **query,
    )

    # Set land to no_data
    water_dataset = water_scenes.where(water_scenes > 0)

    good_quality = mask_good_quality(ds, product)
    ds_clear = ds.where(good_quality)
    ds_clear_land = ds_clear.where(water_dataset.water_classification > 0)
    tsm_dataset = xr.map_blocks(tsm, ds_clear_land)

    mean_tsm = tsm_dataset.mean(dim=["time"])
    max_tsm = tsm_dataset.max(dim=["time"])
    min_tsm = tsm_dataset.min(dim=["time"])

    ## Compute

    mean_tsm, max_tsm, min_tsm = dask.compute(mean_tsm, max_tsm, min_tsm)

    ## Write files

    result = []

    file_name = path.join(path_prefix, "mean_tsm.tiff")
    import_export.export_xarray_to_geotiff(
        mean_tsm,
        file_name,
        crs=output_projection,
        x_coord="x",
        y_coord="y",
    )
    result.append(file_name)

    file_name = path.join(path_prefix, "min_tsm.tiff")
    import_export.export_xarray_to_geotiff(
        min_tsm,
        file_name,
        crs=output_projection,
        x_coord="x",
        y_coord="y",
    )
    result.append(file_name)

    file_name = path.join(path_prefix, "max_tsm.tiff")
    import_export.export_xarray_to_geotiff(
        max_tsm,
        file_name,
        crs=output_projection,
        x_coord="x",
        y_coord="y",
    )
    result.append(file_name)

    return result
def test_map_blocks_ds_transformations(func, map_ds):
    with raise_if_dask_computes():
        actual = xr.map_blocks(func, map_ds)

    assert_identical(actual.compute(), func(map_ds).compute())
def to_stations(da: Var, *, stations: pd.DataFrame, datastore: DataStore = None, chunks: dict = None, **kwargs) -> xr.DataArray:
    if isinstance(da, camps.Variable):
        da = da(datastore=datastore, chunks=chunks, **kwargs)
    elif isinstance(da, xr.DataArray):
        if chunks:
            da = da.camps.chunk(chunks)

    # Determine x and y
    # Use Projected crs as common crs for grid and station
    try:
        x = da.camps.projx.name
        y = da.camps.projy.name
    except KeyError:  # projected coordinates do not exist
        if da.camps.grid_mapping:
            # try to make them from grid_mapping and lat/lons
            da = da.metpy.assign_crs(da.camps.grid_mapping)
            da = da.metpy.assign_y_x()
            da = da.drop('metpy_crs')
            x = da.camps.projx.name
            y = da.camps.projy.name
        else:
            # exception for mercator data without grid_mapping
            lat = da.camps.latitude
            if lat.ndim == 1:
                y = lat.name
            lon = da.camps.longitude
            if lon.ndim == 1:
                x = lon.name

            # ensure longitude expressed as degrees east from prime meridian with -179 and 180 bounds
            lon_attrs = lon.attrs
            da[lon.name] = xr.where(lon > 180, lon - 360, lon)
            da[lon.name].attrs = lon_attrs

            # Add the lat lon grid mapping since it didn't have one
            # Latitude and longitude on the WGS 1984 datum
            lat_lon_wgs84 = {
                'grid_mapping_name': "latitude_longitude",
                'longitude_of_prime_meridian': 0.0,
                'semi_major_axis': 6378137.0,
                'inverse_flattening': 298.257223563
            }
            gm = xr.DataArray()
            gm.attrs.update(lat_lon_wgs84)
            da = da.assign_coords({'lat_lon_wgs84': gm})
            da.attrs['grid_mapping'] = 'lat_lon_wgs84'

    pyproj_crs = CFProjection(da.camps.grid_mapping).to_pyproj()
    stations['x'], stations['y'] = Proj(pyproj_crs)(stations.lon.values, stations.lat.values)

    # rechunk so that multiple chunks don't span x and y dims
    if da.chunks is not None:
        da = da.chunk({x: -1, y: -1})
        da = da.unify_chunks()

    # load x,y data
    da[x].load()
    da[y].load()

    # make horizontal space 1-D by stacking x and y
    stacked = da.stack(xy=(x, y))
    gridxy = np.column_stack((stacked[x].data, stacked[y].data))

    stationxy = np.column_stack((stations.x, stations.y))

    tree = cKDTree(gridxy)  # fast nearest neighbor search algorithm
    dist_ix = tree.query(stationxy)  # find distance to nearest and index of nearest

    # make a grid polygon
    from shapely.geometry import Polygon, MultiPoint, Point
    unit_y = xr.DataArray(np.ones(da[y].shape), dims=da[y].dims)
    unit_x = xr.DataArray(np.ones(da[x].shape), dims=da[x].dims)
    edge_x = edge(da[x] * unit_y)  # broadcast constant x along the y dim
    edge_y = edge(unit_x * da[y])  # broadcast constant y along the x dim
    xy = zip(edge_x, edge_y)
    grid_polygon = Polygon(xy)

    # make station points
    xy = zip(stations.x.values, stations.y.values)
    station_points = MultiPoint(list(xy))

    # determine which stations lie outside grid domain (polygon)
    stations['point_str'] = pd.Series([str(p) for p in station_points])
    stations['ix'] = stations.index
    points_outside_grid = station_points - station_points.intersection(grid_polygon)
    if not points_outside_grid.is_empty:
        if isinstance(points_outside_grid, Point):
            points_outside_grid = [points_outside_grid]
        ix_stations_outside_grid = stations.set_index('point_str').loc[
            [str(p) for p in points_outside_grid]
        ].ix
    else:
        ix_stations_outside_grid = list()  # let be empty list

    print(len(ix_stations_outside_grid))

    def nearest_worker(da: xr.DataArray, *, x, y, ix, ix_nan) -> xr.DataArray:
        da = da.stack(station=(x, y))  # squash the horizontal space dims into one
        da = da.isel(station=ix)
        da = da.drop_vars('station')  # remove station coord
        da.loc[{'station': ix_nan}] = np.nan  # use integer index location to set stations outside grid to missing
        return da
    # make template for expressing the change in shape in map_blocks
    template = da.copy()
    # reshape action
    template = template.stack(station=(x, y))  # combine the lat/lon dims into one dim called station
    template = template.isel(station=[0] * len(stations))  # select only the first grid point, repeated once per station
    template = template.drop('station')  # drop the multiindex lat/lon coord associated with 'station' from the 0th grid point

    mb_kwargs = dict(x=x, y=y, ix=dist_ix[1], ix_nan=ix_stations_outside_grid)
    da = xr.map_blocks(nearest_worker, da, kwargs=mb_kwargs, template=template)

    # remove any metadata that may be leftover from the grid
    da = da.drop_vars([x, y], errors='ignore')

    # configure station metadata
    # prep station coord with numeric index called 'station'
    stations = stations.reset_index()
    stations.index.set_names('station', inplace=True)

    # assign the new coords
    da = da.assign_coords({'platform_id': stations.call})
    da.platform_id.attrs['standard_name'] = 'platform_id'
    # assign the new coords with numeric index
    da = da.assign_coords({'lat': stations.lat})
    da.lat.attrs['standard_name'] = 'latitude'
    da.lat.attrs['units'] = 'degrees_north'
    da = da.assign_coords({'lon': stations.lon})
    da.lon.attrs['standard_name'] = 'longitude'
    da.lon.attrs['units'] = 'degrees_east'

    # drop the numeric index;
    da = da.reset_index('station', drop=True)

    return da
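# The same shape-changing trick in isolation: when the block function collapses
# the grid to a new 'station' dimension, map_blocks cannot infer the output, so a
# template with the final shape must be supplied. Illustrative sketch, not the
# camps implementation:
import numpy as np
import xarray as xr

def pick_nearest(block, *, iy, ix):
    # collapse the 2-D grid to a 1-D 'station' dim by nearest-neighbor indices
    return xr.DataArray(block.values[iy, ix], dims="station")

da = xr.DataArray(np.random.rand(5, 4), dims=("y", "x")).chunk()
iy, ix = [0, 4, 2], [1, 3, 0]  # nearest grid indices for three stations

template = xr.DataArray(np.empty(3), dims="station").chunk()
nearest = xr.map_blocks(pick_nearest, da, kwargs=dict(iy=iy, ix=ix), template=template).compute()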
# +
chunk_dict = {"lat": 1, "lon": 1, "prob": 1}

ds_sub = ds.isel(
    lat=slice(0, None, 1),
    lon=slice(0, None, 1),
    prob=slice(0, 5, 1)
).chunk(chunk_dict)
#ds_sub = ds.chunk(chunk_dict)
ds_sub

# +
# %%time

#ds_data_consistency = xr.map_blocks(func_chunk, ds_sub, template=fake_ds)
ds_data_consistency = xr.map_blocks(func_chunk, ds_sub, template=ds_sub).compute()
# -

comp_dict = {'zlib': True, 'complevel': 9}
encoding = {var: comp_dict for var in ds_data_consistency.data_vars}
ds_data_consistency.to_netcdf(
    data_folder + filestem + output_folder + "data_consistency.nc",
    encoding=encoding,
    compute=True
)

ds.close()
del ds
ds_sub.close()
del ds_sub
ds_data_consistency.close()
del ds_data_consistency
def interp_to_isosurfaces(da: Var, *,
                          level_data: Var,
                          isovalues: Union[Number, List[Number]],
                          datastore: DataStore = None,
                          chunks: dict = None,
                          **kwargs) -> xr.DataArray:
    if isinstance(da, camps.Variable):
        da = da(datastore=datastore, chunks=chunks, **kwargs)
    elif isinstance(da, xr.DataArray):
        if chunks:
            da = da.camps.chunk(chunks)

    if isinstance(level_data, camps.Variable):
        level_data = level_data(datastore=datastore, chunks=chunks, **kwargs)
    elif isinstance(level_data, xr.DataArray):
        if chunks:
            level_data = level_data.camps.chunk(chunks)

    # rechunk so that multiple chunks don't span z dim
    if da.chunks is not None:
        da = da.camps.chunk({'z': -1})

    if da.coords.keys() != level_data.coords.keys():
        raise ValueError('data and level variable coords do not match')

    # detach and re-attach non-NUG coords
    z = da.camps.z.name
    saved_coords = dict()
    for coord in da.coords:
        if len(da[coord].dims) > 1:
            if z in da[coord].dims:
                raise ValueError('non-NUG coords spanning z axis are not allowed')
            saved_coords[coord] = da[coord]
            da = da.drop(coord)
            level_data = level_data.drop(coord)

    # prep for computation via map_blocks
    kwargs['isovalues'] = isovalues

    # inputs prepped as dataset as map_blocks does not take multiple chunked arrays
    ds = xr.Dataset()
    ds['variable'] = da
    ds['level_variable'] = level_data

    # prep output template (the output will have this meta structure)
    template = interp_to_isosurface_meta_template(ds, isovalues)

    # horizontal dims will be either x,y grid or stations, for now don't worry about handling stations
    # rename input z axis dim name to output z axis dim name (determined by meta template; only z is touched)
    x = da.camps.x.name
    y = da.camps.y.name
    z_in = da.camps.z.name
    z_final = template.camps.z.name
    ds = ds.rename({z_in: z_final})
    kwargs['space_dims'] = [z_final, x, y]

    # perform work on each chunked block individually with the interp_to_isosurface_worker
    da = xr.map_blocks(interp_to_isosurface_worker, ds, kwargs=kwargs, template=template)

    # re-assign coords that spanned more than one dimension
    da = da.assign_coords(saved_coords)

    return da
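# Packing the two inputs into one Dataset, as done above, is the standard way to
# get multiple chunk-aligned arrays through xr.map_blocks, which takes a single
# chunked object (dask collections are not allowed in args/kwargs). A generic
# illustration of that workaround (not camps code):
import numpy as np
import xarray as xr

def diff_worker(ds: xr.Dataset) -> xr.DataArray:
    # operate on two chunk-aligned arrays delivered together in one Dataset block
    return ds["variable"] - ds["level_variable"]

a = xr.DataArray(np.random.rand(4, 4), dims=("y", "x")).chunk({"y": 2})
b = xr.DataArray(np.random.rand(4, 4), dims=("y", "x")).chunk({"y": 2})
pair = xr.Dataset({"variable": a, "level_variable": b})
out = xr.map_blocks(diff_worker, pair).compute()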