def __gt__(self, other):
    """Return whether this longitude is greater than ``other``.

    When ``other`` is a ``Longitude`` the comparison is hemisphere-aware:
    a ``'W'`` longitude is never greater than an ``'E'`` one, two ``'W'``
    longitudes are ordered by *decreasing* ``longitude`` value, and
    otherwise the ``longitude`` values are compared directly. Any other
    operand is compared element-wise through ``xr.apply_ufunc``.
    """
    if not isinstance(other, Longitude):
        # Reflected element-wise form: ``other < self``.
        return xr.apply_ufunc(np.less, other, self)

    if self.hemisphere == 'W':
        if other.hemisphere == 'E':
            return False
        # Within the 'W' branch a smaller stored value ranks greater.
        return self.longitude < other.longitude

    if other.hemisphere == 'W':
        return True
    return self.longitude > other.longitude
def aggregate_da(da, agg_dims, suf='_agg'):
    """Block-aggregate ``da`` along the dimensions listed in ``agg_dims``.

    Parameters
    ----------
    da : xarray.DataArray
        Array to aggregate.
    agg_dims : dict
        Mapping from dimension name to the integer block length used along
        that dimension.
    suf : str, optional
        Suffix appended to every aggregated dimension name to form the name
        of the corresponding output dimension. Default is ``'_agg'``.

    Returns
    -------
    xarray.DataArray
        Result of applying ``block_reduce`` (with its default reduction) to
        ``da``; each aggregated dimension ``d`` is replaced by ``d + suf``,
        whose coordinate is the block-wise mean of the original coordinate.
    """
    input_core_dims = list(agg_dims)
    n_agg = len(input_core_dims)
    core_block_size = tuple(agg_dims[k] for k in input_core_dims)
    # apply_ufunc moves the core dims to the end, so every leading
    # (non-aggregated) axis gets a block length of 1.
    block_size = (da.ndim - n_agg) * (1,) + core_block_size
    output_core_dims = [dim + suf for dim in input_core_dims]
    # block_reduce pads a trailing partial block instead of dropping it, so
    # the output length is the *ceiling* of size / block. Floor division
    # declared one element too few for non-divisible sizes, which breaks the
    # dask='parallelized' path where the declared size must match the result.
    output_sizes = {
        (dim + suf): -(-da.shape[da.get_axis_num(dim)] // agg_dims[dim])
        for dim in input_core_dims
    }
    output_dtypes = da.dtype
    da_out = xr.apply_ufunc(
        block_reduce,
        da,
        kwargs={'block_size': block_size},
        input_core_dims=[input_core_dims],
        output_core_dims=[output_core_dims],
        output_sizes=output_sizes,
        output_dtypes=[output_dtypes],
        dask='parallelized',
    )
    for dim in input_core_dims:
        # NOTE(review): for a partial trailing block the padding value takes
        # part in this mean, so the last coordinate may be biased -- confirm
        # whether callers rely on non-divisible sizes.
        new_coord = block_reduce(da[dim].data, (agg_dims[dim],), func=np.mean)
        da_out.coords[dim + suf] = (dim + suf, new_coord)
    return da_out
def xr_moment(x, dim, order=1):
    """Calculate a statistical moment of an xarray object along ``dim``.

    Parameters
    ----------
    x : xarray object
        Input data.
    dim : str
        Dimension over which to calculate the moment. It is handed to
        ``xr.apply_ufunc`` as the single core dimension, so the reduction
        runs over the last axis.
    order : int, optional
        Order of the moment, forwarded as the ``moment`` keyword.
        Default is 1.

    Returns
    -------
    moment :
        Calculated moment as an xarray object, with ``dim`` reduced. NaNs
        are omitted (``nan_policy='omit'``) and the declared output dtype
        is ``float``.
    """
    reduction_kwargs = {'moment': order, 'axis': -1, 'nan_policy': 'omit'}
    core_dims = [[dim]]
    return xr.apply_ufunc(
        moment,
        x,
        input_core_dims=core_dims,
        kwargs=reduction_kwargs,
        dask='parallelized',
        output_dtypes=[float],
    )
def destagger(xarr, dim, **kwargs):
    """Destagger an interface-located variable along a dimension.

    Parameters
    ----------
    xarr : xr.DataArray
        Input data array (its ``dtype`` is used as the output dtype).
    dim : str
        Dimension to destagger the data along.
    **kwargs
        Forwarded unchanged to ``destagger_dask`` (the original
        documentation names ``mode``, passed on to ``numpy.take``).

    Returns
    -------
    destaggered : xr.DataArray
        Cell-centered array with the same ``dim`` as the input.

    See Also
    --------
    numpy.take

    Examples
    --------
    >>> x = xr.DataArray(np.arange(0, 5), [('x', np.arange(0, 5))])
    >>> destagger(x, 'x')
    <xarray.DataArray (x: 5)>
    array([ 0.5,  1.5,  2.5,  3.5,  2. ])
    Coordinates:
      * x        (x) int64 0 1 2 3 4
    """
    # ``dim`` is both consumed and produced by the wrapped function.
    ufunc_options = dict(
        input_core_dims=[[dim]],
        output_core_dims=[[dim]],
        dask='parallelized',
        output_dtypes=[xarr.dtype],
        kwargs=kwargs,
    )
    return apply_ufunc(destagger_dask, xarr, **ufunc_options)
def smape(a, b, dim=None, weights=None, skipna=False, keep_attrs=False):
    """Symmetric Mean Absolute Percentage Error.

    .. math::
        \\mathrm{SMAPE} = \\frac{1}{n} \\sum_{i=1}^{n}
            \\frac{ \\vert a_{i} - b_{i} \\vert }
                 { \\vert a_{i} \\vert + \\vert b_{i} \\vert  }

    .. note::
        Percent error is reported as decimal percent. I.e., a value of 1 is
        100%.

    Parameters
    ----------
    a : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    b : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    dim : str, list
        The dimension(s) to apply the smape along. Note that this dimension
        will be reduced as a result. Defaults to None reducing all
        dimensions.
    weights : xarray.Dataset or xarray.DataArray or None
        Weights matching dimensions of ``dim`` to apply during the function.
    skipna : bool
        If True, skip NaNs when computing function.
    keep_attrs : bool
        If True, the attributes (attrs) will be copied from the first input
        to the new one. If False (default), the new object will be returned
        without attributes.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        Symmetric Mean Absolute Percentage Error.

    References
    ----------
    https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error

    Examples
    --------
    >>> import numpy as np
    >>> import xarray as xr
    >>> from xskillscore import smape
    >>> a = xr.DataArray(np.random.rand(5, 3, 3),
    ...                  dims=['time', 'x', 'y'])
    >>> b = xr.DataArray(np.random.rand(5, 3, 3),
    ...                  dims=['time', 'x', 'y'])
    >>> smape(a, b, dim='time')
    """
    dim, axis = _preprocess_dims(dim, a)
    # Broadcast everything except the dimensions that are about to be reduced.
    a, b = xr.broadcast(a, b, exclude=dim)
    weights = _preprocess_weights(a, dim, dim, weights)
    core_dims = _determine_input_core_dims(dim, weights)

    reduction_kwargs = {"axis": axis, "skipna": skipna}
    return xr.apply_ufunc(
        _smape,
        a,
        b,
        weights,
        input_core_dims=core_dims,
        kwargs=reduction_kwargs,
        dask="parallelized",
        output_dtypes=[float],
        keep_attrs=keep_attrs,
    )
def xr_moment(x, dim, order=1):
    """Return the moment of order ``order`` of ``x`` along ``dim``.

    ``dim`` is passed to ``xr.apply_ufunc`` as the single core dimension,
    so ``moment`` reduces the last axis. NaNs are omitted and the declared
    output dtype is ``float``.
    """
    moment_options = {'moment': order, 'axis': -1, 'nan_policy': 'omit'}
    return xr.apply_ufunc(
        moment,
        x,
        input_core_dims=[[dim]],
        kwargs=moment_options,
        dask='parallelized',
        output_dtypes=[float],
    )
# %% [markdown]
# # Calculate height range (h_range)
#
# A simple way of finding active subglacial lakes is to see where
# there has been a noticeably rapid change in elevation over
# a short period of time such as 2-5 metres a year (or ~4x91-day ICESat-2 cycles).
# 'Range of height' is a quick way to do this,
# basically just doing maximum height minus minimum height.

# %%
# Calculate height range across cycles, parallelized using dask
ds["h_range"]: xr.DataArray = xr.apply_ufunc(
    deepicedrain.nanptp,  # min point to max point (range) that handles NaN values
    ds.h_corr,
    input_core_dims=[["cycle_number"]],
    dask="allowed",
    output_dtypes=[ds.h_corr.dtype],
    # apply_ufunc moves the core dimension ("cycle_number") to the last axis,
    # so reduce over axis -1; this equals axis 1 for 2-D input and stays
    # correct if h_corr ever gains extra dimensions.
    kwargs={"axis": -1},
)

# %%
# %%time
# Compute height range. Also include all height and time info
ds_ht: xr.Dataset = ds[["h_range", "h_corr", "delta_time"]].compute()

# %%
# Non-parallelized
# h_range = deepicedrain.nanptp(a=ds.h_corr[0:1], axis=1)

# Ensure no height range values which are zero (usually due to only 1 data point)
# assert len(dask.array.argwhere(dsh.h_range <= 0.0).compute()) == 0
def regrid(ds, dimx, dimy, **kwargs):
    """
    Interpolate Dataset or DataArray `ds` to a new grid, using rasterio's
    reproject facility.

    See also: https://mapbox.github.io/rasterio/topics/resampling.html

    Parameters
    ----------
    ds : xr.Dataset|xr.DataArray
      N-dim data on a spatial grid
    dimx : pd.Index
      New x-coordinates in destination crs.
      dimx.name MUST refer to x-coord of ds.
    dimy : pd.Index
      New y-coordinates in destination crs.
      dimy.name MUST refer to y-coord of ds.
    **kwargs :
      Arguments passed to rio.warp.reproject; of note:
      - resampling selects the resampling method (the original notes list
        average, cubic, bilinear and nearest)
      - src_crs, dst_crs define the different crs (default: 'longlat')
      ``dst_shape``, ``src_transform`` and ``dst_transform`` are always
      overwritten with values derived from `ds`, `dimx` and `dimy`.

    Returns
    -------
    xr.Dataset|xr.DataArray
      Reprojected data carrying `dimx`/`dimy` as its spatial coordinates
      (with the coordinate attrs of `ds`) and the attrs of `ds`.
    """
    namex, namey = dimx.name, dimy.name

    ds = maybe_swap_spatial_dims(ds, namex, namey)

    dst_shape = len(dimy), len(dimx)
    kwargs.update(
        dst_shape=dst_shape,
        src_transform=_as_transform(ds.indexes[namex], ds.indexes[namey]),
        dst_transform=_as_transform(dimx, dimy),
    )
    for crs_key in ("src_crs", "dst_crs"):
        kwargs.setdefault(crs_key, 'longlat')

    def _reproject(src, dst_shape, **kwargs):
        # Leading (non-spatial) axes are kept; only the last two are regridded.
        out = np.empty(src.shape[:-2] + dst_shape, dtype=src.dtype)
        rio.warp.reproject(np.asarray(src), out, **kwargs)
        return out

    if isinstance(ds, xr.Dataset):
        data_vars = ds.data_vars.values()
    else:
        data_vars = (ds, )
    dtypes = {da.dtype for da in data_vars}
    assert len(dtypes) == 1, \
        "regrid can only reproject datasets with homogeneous dtype"

    # Temporary output dimension names avoid clashing with the input dims;
    # they are renamed back right after the call.
    regridded = xr.apply_ufunc(
        _reproject,
        ds,
        input_core_dims=[[namey, namex]],
        output_core_dims=[['yout', 'xout']],
        output_dtypes=[dtypes.pop()],
        output_sizes={'yout': dst_shape[0], 'xout': dst_shape[1]},
        dask='parallelized',
        kwargs=kwargs,
    )
    regridded = regridded.rename({'yout': namey, 'xout': namex})
    new_coords = {
        namey: (namey, dimy, ds.coords[namey].attrs),
        namex: (namex, dimx, ds.coords[namex].attrs),
    }
    regridded = regridded.assign_coords(**new_coords)
    return regridded.assign_attrs(**ds.attrs)
def second_derivative(self, dim):
    """Compute second derivative with the 4th order accurate centered scheme.

    It is fully functional with all boundary conditions available on
    Xcompact3d and stretched mesh in y direction.
    The **attribute** ``BC`` is used to store Boundary Condition information
    in a dictionary (see examples), default is ``ncl1 = ncln = 2`` and
    ``npaire = 1``.

    Parameters
    ----------
    dim : str
        Coordinate used for the derivative.

    Returns
    -------
    :obj:`xarray.DataArray`
        **differentiated**

    Examples
    -------

    >>> da.attrs['BC'] = {
    ...     'x': {
    ...         'ncl1': 1,
    ...         'ncln': 1,
    ...         'npaire': 0
    ...     },
    ...     'y': {
    ...         'ncl1': 2,
    ...         'ncln': 1,
    ...         'npaire': 1,
    ...         'istret': 0,
    ...         'beta': 1.0
    ...     },
    ...     'z': {
    ...         'ncl1': 0,
    ...         'ncln': 0,
    ...         'npaire': 1
    ...     }
    ... }
    >>> da.x3d.second_derivative('x')

    Notes
    -----
    The **attribute** ``BC`` is automatically defined for ``ux``, ``uy``,
    ``uz``, ``pp`` and ``phi`` when read from the disc with
    :obj:`xcompact3d_toolbox.io.readfield` or initialized at
    :obj:`xcompact3d_toolbox.sandbox.init_dataset`.
    """
    coord = self._data_array[dim]

    # Boundary-condition flags: if any of the three is missing, all of them
    # fall back to the defaults together (same as before).
    try:
        bc = self._data_array.attrs["BC"][dim]
        ncl1, ncln, npaire = bc["ncl1"], bc["ncln"], bc["npaire"]
    except (KeyError, TypeError):
        ncl1, ncln, npaire = 2, 2, 1

    # n and m are needed by the stretched-mesh branch on *every* call, not
    # only when the operator is first built; computing them inside the cache
    # check left them unbound on the second call for a stretched dimension.
    n = coord.size
    m = n if ncl1 == 0 and ncln == 0 else n - 1

    if dim not in self._Dxx:
        d = (coord[-1] - coord[0]).values / m
        self._Dxx[dim] = SecondDerivative(n, d, ncl1, ncln, npaire)

    # Mesh-stretching parameters; both default together when either is absent.
    try:
        bc = self._data_array.attrs["BC"][dim]
        istret, beta = bc["istret"], bc["beta"]
    except (KeyError, TypeError):
        istret = 0
        beta = 1.0

    operator = self._Dxx[dim]
    d2 = xr.apply_ufunc(
        lambda f: operator.dot(f),
        self._data_array,
        input_core_dims=[[dim]],
        output_core_dims=[[dim]],
        dask="parallelized",
        vectorize=True,
        output_dtypes=[param["mytype"]],
    )

    if istret == 0:
        return d2

    # Stretched mesh: rescale the uniform-grid derivative and subtract the
    # first-derivative correction term.
    yly = (coord[-1] - coord[0]).values
    yp, ppy, pp2y, pp4y = stretching(istret, beta, yly, m, n)

    da_pp2y = xr.DataArray(pp2y, coords=[coord], name="pp2y")
    da_pp4y = xr.DataArray(pp4y, coords=[coord], name="pp4y")

    return da_pp2y * d2 - da_pp4y * self._data_array.x3d.first_derivative(dim)
def xr_linregress(x, y, dim="time"):
    """Calculates linear regression along dimension `dim`.

    Results are equivalent to `scipy.stats.linregress`.

    Parameters
    ----------
    x : {xr.DataArray}
        Independent variable for linear regression. E.g. time.
    y : {xr.DataArray, xr.Dataset}
        Dependent variable.
    dim : str
        Dimension over which to perform linear regression. Must be present
        in both `x` and `y` (the default is 'time').

    Returns
    -------
    xr.Dataset
        Dataset with the variables ``slope``, ``intercept``, ``r_value``,
        ``p_value`` and ``std_err``. The naming convention follows
        `scipy.stats.linregress`.
    """
    # Align the NaNs first so both inputs share the same valid samples.
    # The order matters: `y` is masked with the already-masked `x`.
    x = x.where(~np.isnan(y))
    y = y.where(~np.isnan(x))
    # TODO: think about making this optional? Right now I err on the side of
    # caution

    # Inspired by this post https://stackoverflow.com/a/60352716 but adjusted,
    # so that results are exactly as with scipy.stats.linregress for 1d
    # vectors.
    n_valid = y.notnull().sum(dim)
    all_nan = np.isnan(y).all(dim)

    x_mean, y_mean = x.mean(dim), y.mean(dim)
    x_std, y_std = x.std(dim), y.std(dim)

    covariance = ((x - x_mean) * (y - y_mean)).sum(dim) / (n_valid)
    correlation = covariance / (x_std * y_std)

    slope = covariance / (x_std**2)
    intercept = y_mean - x_mean * slope

    dof = n_valid - 2
    tiny = 1.0e-20  # keeps the denominator non-zero when |r| == 1
    t_stat = correlation * np.sqrt(
        dof / ((1.0 - correlation + tiny) * (1.0 + correlation + tiny)))
    std_err = slope / t_stat

    # Two-sided p-value from the survival function of Student's t.
    one_sided = xr.apply_ufunc(
        stats.distributions.t.sf,
        abs(t_stat),
        dof,
        dask="parallelized",
        output_dtypes=[y.dtype],
    )
    p_value = (one_sided * 2)

    results = {
        "slope": slope,
        "intercept": intercept,
        "r_value": correlation.fillna(0).where(~all_nan),
        "p_value": p_value,
        "std_err": std_err.where(~np.isinf(std_err), 0),
    }
    return xr.Dataset(results)
def seeds_init(
    varr: xr.DataArray,
    wnd_size=500,
    method="rolling",
    stp_size=200,
    nchunk=100,
    max_wnd=10,
    diff_thres=2,
):
    """
    Generate over-complete set of seeds by finding local maxima across frames.

    This function computes the maximum intensity projection of a subset of
    frames and finds the local maxima. The subsetting uses either a rolling
    window or random sampling of frames. `wnd_size`, `stp_size` and `nchunk`
    control different aspects of the subsetting. `max_wnd` and `diff_thres`
    control how local maxima are computed. The set of all local maxima found
    in this process constitutes an overly-complete set of seeds, representing
    putative locations of cells.

    Parameters
    ----------
    varr : xr.DataArray
        Input movie data. Should have dimensions "frame", "height" and
        "width".
    wnd_size : int, optional
        Number of frames in each chunk, for which a max projection will be
        calculated. By default `500`.
    method : str, optional
        Either `"rolling"` or `"random"`. Controls whether to use rolling
        window or random sampling of frames to construct chunks. By default
        `"rolling"`.
    stp_size : int, optional
        Number of frames between the center of each chunk when stepping
        through the data with rolling windows. Only used if `method is
        "rolling"`. By default `200`.
    nchunk : int, optional
        Number of chunks to sample randomly. Only used if `method is
        "random"`. By default `100`.
    max_wnd : int, optional
        Radius (in pixels) of the disk window used for computing local
        maxima. Local maxima are defined as pixels with maximum intensity in
        such a window. By default `10`.
    diff_thres : int, optional
        Intensity threshold for the difference between local maxima and its
        neighbours. Any local maximum that is not brighter than its neighbor
        (defined by the same disk window) by `diff_thres` intensity values
        will be filtered out. By default `2`.

    Returns
    -------
    seeds : pd.DataFrame
        Seeds dataframe with each seed as a row. Has column "height" and
        "width" which are location of the seeds. Also has column "seeds"
        which is an integer showing how many chunks where the seed is
        considered a local maxima.

    Raises
    ------
    ValueError
        If `method` is neither `"rolling"` nor `"random"`.
    KeyError
        If the environment variable ``MINIAN_INTERMEDIATE`` is not set.
    """
    # Fail early with a clear message; previously an unknown method only
    # surfaced later as a NameError on the unbound chunk list.
    if method not in ("rolling", "random"):
        raise ValueError(
            f"method must be 'rolling' or 'random', got {method!r}")
    int_path = os.environ["MINIAN_INTERMEDIATE"]
    print("constructing chunks")
    nfm = len(varr.coords["frame"])
    if method == "rolling":
        nstp = np.ceil(nfm / stp_size) + 1
        centers = np.linspace(0, nfm - 1, int(nstp))
        hwnd = np.ceil(wnd_size / 2)
        # One slice of roughly wnd_size frames around each center, clipped at 0.
        max_idx = [
            slice(int(np.floor(c - hwnd).clip(0)), int(np.ceil(c + hwnd)))
            for c in centers
        ]
    else:
        # randint's upper bound is exclusive: use nfm (not nfm - 1) so the
        # last frame can be sampled as well.
        max_idx = [np.random.randint(0, nfm, wnd_size) for _ in range(nchunk)]
    print("computing max projections")
    res = [max_proj_frame(varr, cur_idx) for cur_idx in max_idx]
    max_res = xr.concat(res, "sample")
    max_res = save_minian(max_res.rename("max_res"), int_path, overwrite=True)
    print("calculating local maximum")
    loc_max = xr.apply_ufunc(
        local_max_roll,
        max_res,
        input_core_dims=[["height", "width"]],
        output_core_dims=[["height", "width"]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[np.uint8],
        kwargs=dict(k0=2, k1=max_wnd, diff=diff_thres),
    ).sum("sample")
    # Keep only pixels flagged in at least one chunk; the count becomes the
    # "seeds" column.
    seeds = (
        loc_max.where(loc_max > 0)
        .rename("seeds")
        .to_dataframe()
        .dropna()
        .reset_index()
    )
    return seeds[["height", "width", "seeds"]]
def __ge__(self, other):
    """Return whether this longitude is greater than or equal to ``other``.

    For a ``Longitude`` operand the result combines ``>`` and ``==``; any
    other operand is compared element-wise through ``xr.apply_ufunc``.
    """
    if not isinstance(other, Longitude):
        # Reflected element-wise form: ``other <= self``.
        return xr.apply_ufunc(np.less_equal, other, self)
    return self > other or self == other
def effective_sample_size(a, b, dim, skipna=False):
    """Effective sample size for temporally correlated data.

    .. note::
        This metric should only be applied over the time dimension,
        since it is designed for temporal autocorrelation. Weights
        are not included due to the reliance on temporal
        autocorrelation.

    The effective sample size extracts the number of independent samples
    between two time series being correlated. This is derived by assessing
    the magnitude of the lag-1 autocorrelation coefficient in each of the
    time series being correlated. A higher autocorrelation induces a lower
    effective sample size which raises the correlation coefficient for
    a given p value.

    .. math::
        N_{eff} = N\\left( \\frac{1 -
                   \\rho_{f}\\rho_{o}}{1 + \\rho_{f}\\rho_{o}} \\right),

    where :math:`\\rho_{f}` and :math:`\\rho_{o}` are the lag-1
    autocorrelation coefficients for the forecast and observations.

    Parameters
    ----------
    a : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    b : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    dim : str, list
        The dimension to apply the function along. Exactly one dimension
        is supported.
    skipna : bool
        If True, skip NaNs when computing function.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        Effective sample size.

    Raises
    ------
    ValueError
        If more than one dimension is given.

    Warns
    -----
    UserWarning
        If the dimension is not named ``'time'``.

    References
    ----------
    * Bretherton, Christopher S., et al. "The effective number of spatial
      degrees of freedom of a time-varying field." Journal of climate 12.7
      (1999): 1990-2009.
    * Wilks, Daniel S. Statistical methods in the atmospheric sciences.
      Vol. 100. Academic press, 2011.
    """
    dim, _ = _preprocess_dims(dim)
    if len(dim) > 1:
        raise ValueError(
            'Effective sample size should only be applied to a singular time dimension.'
        )
    new_dim = dim[0]
    if new_dim != 'time':
        warnings.warn(
            f"{dim} is not 'time'. Make sure that you are applying this over a "
            f'temporal dimension.')

    # Both inputs are reduced along the same single core dimension.
    reduction_kwargs = {'axis': -1, 'skipna': skipna}
    return xr.apply_ufunc(
        _effective_sample_size,
        a,
        b,
        input_core_dims=[[new_dim], [new_dim]],
        kwargs=reduction_kwargs,
        dask='parallelized',
        output_dtypes=[float],
    )
def fuzzy_where(
    cond: xr.DataArray, x, y, join="left"
) -> xr.DataArray:
    """Apply ``duck_array_ops.where(cond, x, y)`` with a configurable join.

    Parameters
    ----------
    cond : xr.DataArray
        Condition passed as the first argument to ``where``.
    x, y
        Second and third arguments passed to ``where``.
    join : str, optional
        Used for both the ``join`` and ``dataset_join`` options of
        ``xr.apply_ufunc``. Default is ``"left"``.

    Returns
    -------
    xr.DataArray
        Result of the element-wise ``where``.
    """
    from xarray.core import duck_array_ops

    alignment = dict(join=join, dataset_join=join)
    return xr.apply_ufunc(
        duck_array_ops.where, cond, x, y, dask="allowed", **alignment
    )
def __call__(self, correlations, n):
    """Apply ``self.correct`` to ``correlations`` via ``xr.apply_ufunc``.

    ``n`` is forwarded unchanged as the second argument of every
    ``self.correct`` call.
    """
    def _correct(correlation):
        return self.correct(correlation, n)

    return xr.apply_ufunc(_correct, correlations)
# NOTE(review): the `return` below is the tail of a definition that starts
# before this chunk (presumably the `find_ridges` used further down); it is
# left untouched.
return find_ridges_spherical_hessian(x, sigma=4, tolerance_threshold=0.0015e-3, scheme='second_order', return_eigvectors=True)[return_idx]


def func_to_apply(x):
    """Run `find_ridges` on every 'time' group of `x` and mask the result.

    Values of the ridge field are kept only where `x > 1.2`.
    """
    ridge = x.groupby('time').apply(find_ridges)
    ridge = ridge.where(x > 1.2)
    return ridge


# Process every input file from index 61 onwards: open lazily, sort by time,
# transform, detect ridges block-wise and write the result under `outpath`.
for i, filename in enumerate(file_list[61:]):
    print(1)
    # print('*--- Reading file {} of {} ---*'.format(i, filename))
    # Output file keeps the input file's base name.
    outname = filename.split('/')[-1]
    da = xr.open_dataarray(filename, chunks={'time': 60})
    da = da.sortby('time')
    # Half of the natural logarithm of the input field.
    da = .5 * xr.apply_ufunc(np.log, da, dask='allowed')
    print(da)
    with ProgressBar():
        # `template=da`: the result is declared to have the same structure as the input.
        ridges = da.map_blocks(func_to_apply, template=da).compute(scheduler='processes', num_workers=4)
    print('Calculation end.')
    print('Writing output.')
    ridges.to_netcdf(outpath + outname)
def compute_dataset(cube_func: CubeFunc,
                    *input_cubes: xr.Dataset,
                    input_cube_schema: CubeSchema = None,
                    input_var_names: Sequence[str] = None,
                    input_params: Dict[str, Any] = None,
                    output_var_name: str = 'output',
                    output_var_dims: AbstractSet[str] = None,
                    output_var_dtype: Any = np.float64,
                    output_var_attrs: Dict[str, Any] = None,
                    vectorize: bool = None,
                    cube_asserted: bool = False) -> xr.Dataset:
    """
    Compute a new output dataset with a single variable named *output_var_name*
    from variables named *input_var_names* contained in zero, one, or more
    input data cubes in *input_cubes* using a cube factory function *cube_func*.

    *cube_func* is called concurrently for each of the chunks of the input variables.
    It is expected to return a chunk block which is of type ``np.ndarray``.

    If *input_cubes* is not empty, *cube_func* receives variables as specified by *input_var_names*.
    If *input_cubes* is empty, *input_var_names* must be empty too, and *input_cube_schema*
    must be given, so that a new cube can be created.

    The full signature of *cube_func* is::

        def cube_func(*input_vars: np.ndarray,
                      input_params: Dict[str, Any] = None,
                      dim_coords: Dict[str, np.ndarray] = None,
                      dim_ranges: Dict[str, Tuple[int, int]] = None) -> np.ndarray:
            pass

    The arguments are:

    * ``input_vars``: the variables according to the given *input_var_names*;
    * ``input_params``: is this call's *input_params*, a mapping from parameter name to value;
    * ``dim_coords``: a mapping from dimension names to the current chunk's coordinate arrays;
    * ``dim_ranges``: a mapping from dimension names to the current chunk's index ranges.

    Only the ``input_vars`` argument is mandatory. The keyword arguments
    ``input_params``, ``dim_coords``, ``dim_ranges`` do not need to be present at all.

    *output_var_dims* may be given in the case, where ...
    TODO: describe new output_var_dims...

    :param cube_func: The cube factory function.
    :param input_cubes: An optional sequence of input cube datasets, must be provided if
        *input_cube_schema* is not.
    :param input_cube_schema: An optional input cube schema, must be provided if
        *input_cubes* is not.
    :param input_var_names: A sequence of variable names
    :param input_params: Optional dictionary with processing parameters passed to *cube_func*.
    :param output_var_name: Optional name of the output variable, defaults to ``'output'``.
    :param output_var_dims: Optional set of names of the output dimensions,
        used in the case *cube_func* reduces dimensions.
    :param output_var_dtype: Optional numpy datatype of the output variable,
        defaults to ``np.float64``.
    :param output_var_attrs: Optional metadata attributes for the output variable.
    :param vectorize: Whether all *input_cubes* have the same variables which are concatenated
        and passed as vectors to *cube_func*. Not implemented yet.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected
        to be a valid cube.
    :return: A new dataset that contains the computed output variable.
    :raise NotImplementedError: if *vectorize* is given.
    :raise ValueError: if neither *input_cubes* nor *input_cube_schema* is given,
        or if a name in *input_var_names* is not found in any input cube.
    """
    if vectorize is not None:
        # TODO: support vectorize = all cubes have same variables and cube_func
        #       receives variables as vectors (with extra dim)
        raise NotImplementedError('vectorize is not supported yet')

    # Validate every input cube exactly once.
    if not cube_asserted:
        for cube in input_cubes:
            assert_cube(cube)

    # Check compatibility of inputs
    if input_cubes:
        input_cube_schema = CubeSchema.new(input_cubes[0])
        for cube in input_cubes:
            # Identity check: "!=" on datasets is an element-wise comparison,
            # not a test for "is this a different cube".
            if cube is not input_cubes[0]:
                # noinspection PyUnusedLocal
                other_schema = CubeSchema.new(cube)
                # TODO (forman): broadcast all cubes to same shape, rechunk to same chunks
    elif input_cube_schema is None:
        raise ValueError('input_cube_schema must be given')

    output_var_name = output_var_name or 'output'

    # Collect named input variables, raise if not found
    input_var_names = input_var_names or []
    input_vars = []
    for var_name in input_var_names:
        input_var = None
        for cube in input_cubes:
            if var_name in cube.data_vars:
                input_var = cube[var_name]
                break
        if input_var is None:
            raise ValueError(
                f'variable {var_name!r} not found in any of cubes')
        input_vars.append(input_var)

    # Find out, if cube_func uses any of _PREDEFINED_KEYWORDS
    has_input_params, has_dim_coords, has_dim_ranges = _inspect_cube_func(
        cube_func, input_var_names)

    def cube_func_wrapper(index_chunk, *input_var_chunks):
        # Note, xarray.apply_ufunc does a test call with empty input arrays,
        # so index_chunk.size == 0 is a valid case
        empty_call = index_chunk.size == 0

        # TODO: when output_var_dims is given, index_chunk must be reordered
        #   as core dimensions are moved to the end of index_chunk and input_var_chunks
        if not empty_call:
            index_chunk = index_chunk.ravel()

        if index_chunk.size < 2 * input_cube_schema.ndim:
            if not empty_call:
                warnings.warn(
                    f"unexpected index_chunk of size {index_chunk.size} received!"
                )
                return None

        # index_chunk holds a (start, end) pair per cube dimension.
        dim_ranges = None
        if has_dim_ranges or has_dim_coords:
            dim_ranges = {}
            for i in range(input_cube_schema.ndim):
                dim_name = input_cube_schema.dims[i]
                if not empty_call:
                    start = int(index_chunk[2 * i + 0])
                    end = int(index_chunk[2 * i + 1])
                    dim_ranges[dim_name] = start, end
                else:
                    dim_ranges[dim_name] = ()

        # Slice every coordinate variable down to the current chunk.
        dim_coords = None
        if has_dim_coords:
            dim_coords = {}
            for coord_var_name, coord_var in input_cube_schema.coords.items():
                coord_slices = [slice(None)] * coord_var.ndim
                for i in range(input_cube_schema.ndim):
                    dim_name = input_cube_schema.dims[i]
                    if dim_name in coord_var.dims:
                        j = coord_var.dims.index(dim_name)
                        coord_slices[j] = slice(*dim_ranges[dim_name])
                dim_coords[coord_var_name] = coord_var[tuple(
                    coord_slices)].values

        # Pass only the keyword arguments cube_func actually declares.
        kwargs = {}
        if has_input_params:
            kwargs['input_params'] = input_params
        if has_dim_ranges:
            kwargs['dim_ranges'] = dim_ranges
        if has_dim_coords:
            kwargs['dim_coords'] = dim_coords

        return cube_func(*input_var_chunks, **kwargs)

    index_var = _gen_index_var(input_cube_schema)

    all_input_vars = [index_var] + input_vars

    input_core_dims = None
    if output_var_dims:
        input_core_dims = []
        has_warned = False
        for i, input_var in enumerate(all_input_vars):
            # Every dimension that is not an output dimension is a core
            # (reduced) dimension and must live in a single chunk.
            var_core_dims = [
                dim for dim in input_var.dims if dim not in output_var_dims
            ]
            must_rechunk = False
            if var_core_dims and input_var.chunks:
                for var_core_dim in var_core_dims:
                    dim_index = input_var.dims.index(var_core_dim)
                    dim_chunk_size = input_var.chunks[dim_index][0]
                    dim_shape_size = input_var.shape[dim_index]
                    if dim_chunk_size != dim_shape_size:
                        must_rechunk = True
                        break
            if must_rechunk:
                if not has_warned:
                    warnings.warn(
                        f'Input variables must not be chunked in dimension(s): {", ".join(var_core_dims)}.\n'
                        f'Rechunking applies, which may drastically decrease runtime performance '
                        f'and increase memory usage.')
                    has_warned = True
                all_input_vars[i] = input_var.chunk(
                    {var_core_dim: -1 for var_core_dim in var_core_dims})
            input_core_dims.append(var_core_dims)

    output_var = xr.apply_ufunc(cube_func_wrapper,
                                *all_input_vars,
                                dask='parallelized',
                                input_core_dims=input_core_dims,
                                output_dtypes=[output_var_dtype])
    if output_var_attrs:
        output_var.attrs.update(output_var_attrs)
    return xr.Dataset({output_var_name: output_var},
                      coords=input_cube_schema.coords)
def __eq__(self, other):
    """Return whether this longitude equals ``other``.

    Two ``Longitude`` objects are equal when both ``hemisphere`` and
    ``longitude`` match; any other operand is compared element-wise
    through ``xr.apply_ufunc``.
    """
    if not isinstance(other, Longitude):
        return xr.apply_ufunc(np.equal, other, self)
    same_hemisphere = self.hemisphere == other.hemisphere
    return same_hemisphere and self.longitude == other.longitude
'SpeciesConc_NO2', 'SpeciesConc_HNO3', 'SpeciesConc_PAN', 'SpeciesConc_CO', 'SpeciesConc_CH2O', 'SpeciesConc_SO2', 'SpeciesConc_NH3',
]
# NOTE(review): the list closed above starts before this chunk; presumably it
# is the `relevant_spc` used on the next line -- confirm.
# Restrict the dataset to the selected species, drop the remaining
# non-matching variables and convert to moles using the dry-air mass.
spc_mmr = spc_mmr[relevant_spc]
spc_mmr = drop_non_dmmr_variables(spc_mmr)
spc_mol = dmmr_to_moles(spc_mmr, mass_dry_air=met.Met_AD)
print("Calculating tropospheric totals ...")
# `total_below` receives the full 'lev' column of each species plus the two
# scalar met fields for that column; vectorize=True loops over all columns.
tropo_total = xr.apply_ufunc(total_below, spc_mol, met.Met_TropP, met.Met_PS1WET, input_core_dims=[['lev'], [], []], vectorize=True)
# Collapse the horizontal dimensions to a single total per variable.
global_tropo_total = tropo_total.sum(dim=('nf', 'Ydim', 'Xdim'))
# Prefix every variable name with 'Global'.
global_tropo_total = global_tropo_total.rename(
    {old_name: f'Global{old_name}' for old_name in global_tropo_total.keys()})
# strat_total = xr.apply_ufunc(
#     total_below,
#     spc_mol, met.Met_TropP, met.Met_PS1WET, kwargs=dict(above_instead=True),
#     input_core_dims=[['lev'], [], []],
#     vectorize=True
# )
print("Subsetting budget dataset ...")
def spearman_r_eff_p_value(a, b, dim, skipna=False):
    """
    2-tailed p-value associated with Spearman rank correlation
    coefficient, accounting for autocorrelation.

    .. note::
        This metric should only be applied over the time dimension,
        since it is designed for temporal autocorrelation. Weights
        are not included due to the reliance on temporal
        autocorrelation.

    The effective p value is computed by replacing the sample size
    :math:`N` in the t-statistic with the effective sample size,
    :math:`N_{eff}`. The same Spearman's rank correlation coefficient
    :math:`r` is used as when computing the standard p value.

    .. math::

        t = r\\sqrt{ \\frac{N_{eff} - 2}{1 - r^{2}} },

    where :math:`N_{eff}` is computed via the autocorrelation in the
    forecast and observations.

    .. math::

        N_{eff} = N\\left( \\frac{1 -
                   \\rho_{f}\\rho_{o}}{1 + \\rho_{f}\\rho_{o}} \\right),

    where :math:`\\rho_{f}` and :math:`\\rho_{o}` are the lag-1
    autocorrelation coefficients for the forecast and observations.

    Parameters
    ----------
    a : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    b : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    dim : str, list
        The dimension to compute the p value over. Exactly one dimension
        is supported.
    skipna : bool
        If True, skip NaNs when computing function.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        2-tailed p-value of Spearman's correlation coefficient,
        accounting for autocorrelation.

    Raises
    ------
    ValueError
        If more than one dimension is given.

    Warns
    -----
    UserWarning
        If the dimension is not named ``'time'``.

    References
    ----------
    * Bretherton, Christopher S., et al. "The effective number of spatial
      degrees of freedom of a time-varying field." Journal of climate 12.7
      (1999): 1990-2009.
    * Wilks, Daniel S. Statistical methods in the atmospheric sciences.
      Vol. 100. Academic press, 2011.

    See Also
    --------
    xarray.apply_ufunc
    scipy.stats.spearmanr
    xskillscore.core.np_deterministic._spearman_r_eff_p_value
    """
    dim, _ = _preprocess_dims(dim)
    if len(dim) > 1:
        raise ValueError(
            'Effective sample size should only be applied to a singular time dimension.'
        )
    new_dim = dim[0]
    if new_dim != 'time':
        warnings.warn(
            f"{dim} is not 'time'. Make sure that you are applying this over a "
            f'temporal dimension.')

    # Both inputs are reduced along the same single core dimension.
    reduction_kwargs = {'axis': -1, 'skipna': skipna}
    return xr.apply_ufunc(
        _spearman_r_eff_p_value,
        a,
        b,
        input_core_dims=[[new_dim], [new_dim]],
        kwargs=reduction_kwargs,
        dask='parallelized',
        output_dtypes=[float],
    )
def __le__(self, other):
    """Return whether this longitude is less than or equal to ``other``.

    For a ``Longitude`` operand the result combines ``<`` and ``==``; any
    other operand is compared element-wise through ``xr.apply_ufunc``.
    """
    if not isinstance(other, Longitude):
        # Reflected element-wise form: ``other >= self``.
        return xr.apply_ufunc(np.greater_equal, other, self)
    return self < other or self == other
def gmm_refine(
    varr: xr.DataArray,
    seeds: pd.DataFrame,
    q=(0.1, 99.9),
    n_components=2,
    valid_components=1,
    mean_mask=True,
) -> Tuple[pd.DataFrame, xr.DataArray, GaussianMixture]:
    """
    Filter seeds by fitting a GMM to peak-to-peak values.

    This function assumes that the distribution of peak-to-peak values of
    fluorescence across all seeds can be modeled by a Gaussian Mixture Model
    (GMM) with different means. It computes the peak-to-peak value for all
    the seeds, then fits a GMM with `n_components` to the distribution, and
    filters out the seeds belonging to the `n_components - valid_components`
    gaussians with lower means.

    Parameters
    ----------
    varr : xr.DataArray
        The input movie data. Should have dimension "spatial" and "frame".
    seeds : pd.DataFrame
        The input over-complete set of seeds to be filtered. Modified in
        place: the column "mask_gmm" is added or overwritten.
    q : tuple, optional
        Percentile to use to compute the peak-to-peak values. For a given
        seed with corresponding fluorescent fluctuation `f`, the
        peak-to-peak value for that seed is computed as `np.percentile(f,
        q[1]) - np.percentile(f, q[0])`. By default `(0.1, 99.9)`.
    n_components : int, optional
        Number of components (Gaussians) in the GMM model. By default `2`.
    valid_components : int, optional
        Number of components (Gaussians) to be considered as modeling the
        distribution of peak-to-peak values of valid seeds. Should be
        smaller than `n_components`. By default `1`.
    mean_mask : bool, optional
        Whether to apply additional criteria where a seed is valid only if
        its peak-to-peak value exceeds the mean of the lowest gaussian
        distribution. Only useful in corner cases where the distributions
        of the gaussians heavily overlap. By default `True`.

    Returns
    -------
    seeds : pd.DataFrame
        The resulting seeds dataframe with an additional column "mask_gmm",
        indicating whether the seed is considered valid by this function. If
        the column already exists in input `seeds` it will be overwritten.
    varr_pv : xr.DataArray
        The computed peak-to-peak values for each seed.
    gmm : GaussianMixture
        The fitted GMM model object.

    See Also
    -------
    sklearn.mixture.GaussianMixture
    """
    print("selecting seeds")
    varr_sub = varr.sel(
        spatial=[tuple(hw) for hw in seeds[["height", "width"]].values])
    print("computing peak-valley values")
    # The percentile needs every frame of a pixel in one chunk; rechunk once
    # and reuse it for both percentiles.
    varr_frames = varr_sub.chunk(dict(frame=-1))

    def _frame_percentile(pct):
        # Percentile of each seed's trace along the "frame" dimension.
        return xr.apply_ufunc(
            np.percentile,
            varr_frames,
            input_core_dims=[["frame"]],
            kwargs=dict(q=pct, axis=-1),
            dask="parallelized",
            output_dtypes=[varr_sub.dtype],
        )

    varr_valley = _frame_percentile(q[0])
    varr_peak = _frame_percentile(q[1])
    varr_pv = (varr_peak - varr_valley).compute()
    print("fitting GMM models")
    dat = varr_pv.values.reshape(-1, 1)
    gmm = GaussianMixture(n_components=n_components)
    gmm.fit(dat)
    # Components with the `valid_components` highest means are the valid ones.
    idg = np.argsort(gmm.means_.reshape(-1))[-valid_components:]
    idx_valid = np.isin(gmm.predict(dat), idg)
    if mean_mask:
        # `means_` has shape (n_components, 1): sorting along its last axis
        # is a no-op, so `np.sort(means_)[0]` was simply the *first*
        # component's mean. Take the true minimum over all components.
        lowest_mean = gmm.means_.min()
        idx_mean = dat.reshape(-1) > lowest_mean
        idx_valid = np.logical_and(idx_mean, idx_valid)
    seeds["mask_gmm"] = idx_valid
    return seeds, varr_pv, gmm
def compute_aggregates(perf_da, baseline_ds):
    """Summarize the raw function evaluations of an experiment per method.

    Parameters
    ----------
    perf_da : :class:`xarray:xarray.DataArray`
        Experimental results with every function evaluation. Has dimensions
        ``(ITER, SUGGEST, TEST_CASE, METHOD, TRIAL)`` and is assumed to
        contain no nan values.
    baseline_ds : :class:`xarray:xarray.Dataset`
        Baseline performance. Has variables
        ``(PERF_MED, PERF_MEAN, PERF_CLIP, PERF_BEST)`` with dimensions
        ``(ITER, TEST_CASE)``, ``(ITER, TEST_CASE)``, ``(TEST_CASE,)`` and
        ``(TEST_CASE,)``. `PERF_MED` / `PERF_MEAN` are the random-search
        baselines for median / mean summaries, `PERF_CLIP` is the upper bound
        used to clip poor performance before averaging, and `PERF_BEST` is an
        estimate of the global minimum.

    Returns
    -------
    agg_result : :class:`xarray:xarray.Dataset`
        Per method and test case summary with variables
        ``(PERF_MED, LB_MED, UB_MED, NORMED_MED, PERF_MEAN, LB_MEAN, UB_MEAN,
        NORMED_MEAN)``, each of dimensions ``(ITER, METHOD, TEST_CASE)``. The
        ``NORMED_*`` variables are rescaled so that the optimum maps to 0;
        `NORMED_MED` maps the random-search median baseline to 1 and
        `NORMED_MEAN` maps the clip value to 1.
    summary : :class:`xarray:xarray.Dataset`
        Per method summary over all test cases with variables
        ``(PERF_MED, LB_MED, UB_MED, PERF_MEAN, LB_MEAN, UB_MEAN, NORMED_MEAN,
        LB_NORMED_MEAN, UB_NORMED_MEAN)``, each of dimensions
        ``(ITER, METHOD)``.
    """
    validate_agg_perf(perf_da, min_trial=1)

    # Sanity checks on the layout and content of the baseline.
    assert isinstance(baseline_ds, xr.Dataset)
    for var_name, expected_dims in (
        (PERF_BEST, (TEST_CASE,)),
        (PERF_CLIP, (TEST_CASE,)),
        (PERF_MED, (ITER, TEST_CASE)),
        (PERF_MEAN, (ITER, TEST_CASE)),
    ):
        assert tuple(baseline_ds[var_name].dims) == expected_dims
    assert xru.coord_compat((perf_da, baseline_ds), (ITER, TEST_CASE))
    assert not any(np.any(np.isnan(baseline_ds[kk].values)) for kk in baseline_ds)

    # Containers for the per test case aggregates.
    agg_result = xru.ds_like(
        perf_da,
        (PERF_MED, LB_MED, UB_MED, NORMED_MED, PERF_MEAN, LB_MEAN, UB_MEAN, NORMED_MEAN),
        (ITER, METHOD, TEST_CASE),
    )
    baseline_mean_da = xru.only_dataarray(xru.ds_like(perf_da, ["ref"], (ITER, TEST_CASE)))

    # Raw coordinate values are easier to work with than xr objects here.
    test_cases = perf_da.coords[TEST_CASE].values
    methods = perf_da.coords[METHOD].values

    for func_name in test_cases:
        case = {TEST_CASE: func_name}
        rand_perf_med = baseline_ds[PERF_MED].sel(case, drop=True).values
        rand_perf_mean = baseline_ds[PERF_MEAN].sel(case, drop=True).values
        best_opt = baseline_ds[PERF_BEST].sel(case, drop=True).values
        base_clip_val = baseline_ds[PERF_CLIP].sel(case, drop=True).values

        assert np.all(np.diff(rand_perf_med) <= 0), "Baseline should be decreasing with iteration"
        assert np.all(np.diff(rand_perf_mean) <= 0), "Baseline should be decreasing with iteration"
        assert np.all(rand_perf_med > best_opt)
        assert np.all(rand_perf_mean > best_opt)
        assert np.all(rand_perf_mean <= base_clip_val)

        baseline_mean_da.loc[case] = linear_rescale(
            rand_perf_mean, best_opt, base_clip_val, 0.0, 1.0, enforce_bounds=False
        )

        for method_name in methods:
            cell = {TEST_CASE: func_name, METHOD: method_name}

            # Best suggestion at each iteration, then the running best so far.
            curr_da = perf_da.sel(cell, drop=True).min(dim=SUGGEST)
            assert curr_da.dims == (ITER, TRIAL)
            best_so_far = np.minimum.accumulate(curr_da.values, axis=0)

            # Median performance with its confidence interval.
            med_perf, LB, UB = qt.quantile_and_CI(best_so_far, EVAL_Q, alpha=ALPHA)
            assert med_perf.shape == rand_perf_med.shape
            agg_result[PERF_MED].loc[cell] = med_perf
            agg_result[LB_MED].loc[cell] = LB
            agg_result[UB_MED].loc[cell] = UB

            # Normalized median, which is better suited for aggregation.
            agg_result[NORMED_MED].loc[cell] = linear_rescale(
                med_perf, best_opt, rand_perf_med, 0.0, 1.0, enforce_bounds=False
            )

            # Mean performance (after clipping) with its error bar.
            clipped = np.minimum(base_clip_val, best_so_far)
            mean_perf = np.mean(clipped, axis=1)
            assert mean_perf.shape == rand_perf_mean.shape
            EB = t_EB(clipped, alpha=ALPHA, axis=1)
            assert EB.shape == rand_perf_mean.shape
            agg_result[PERF_MEAN].loc[cell] = mean_perf
            agg_result[LB_MEAN].loc[cell] = mean_perf - EB
            agg_result[UB_MEAN].loc[cell] = mean_perf + EB

            # Normalized mean, which is better suited for aggregation.
            agg_result[NORMED_MEAN].loc[cell] = linear_rescale(
                mean_perf, best_opt, base_clip_val, 0.0, 1.0, enforce_bounds=False
            )
    assert not any(np.any(np.isnan(agg_result[kk].values)) for kk in agg_result)

    # Summary score of each method over all test cases.
    summary = xru.ds_like(
        perf_da,
        (
            PERF_MED,
            LB_MED,
            UB_MED,
            PERF_MEAN,
            LB_MEAN,
            UB_MEAN,
            NORMED_MEAN,
            LB_NORMED_MEAN,
            UB_NORMED_MEAN,
        ),
        (ITER, METHOD),
    )

    summary[PERF_MED], summary[LB_MED], summary[UB_MED] = xr.apply_ufunc(
        qt.quantile_and_CI,
        agg_result[NORMED_MED],
        input_core_dims=[[TEST_CASE]],
        kwargs={"q": EVAL_Q, "alpha": ALPHA},
        output_core_dims=[[], [], []],
    )

    summary[PERF_MEAN] = agg_result[NORMED_MEAN].mean(dim=TEST_CASE)
    EB = xr.apply_ufunc(t_EB, agg_result[NORMED_MEAN], input_core_dims=[[TEST_CASE]])
    summary[LB_MEAN] = summary[PERF_MEAN] - EB
    summary[UB_MEAN] = summary[PERF_MEAN] + EB

    # Express the mean scores relative to the averaged rescaled baseline.
    normalizer = baseline_mean_da.mean(dim=TEST_CASE)
    summary[NORMED_MEAN] = summary[PERF_MEAN] / normalizer
    summary[LB_NORMED_MEAN] = summary[LB_MEAN] / normalizer
    summary[UB_NORMED_MEAN] = summary[UB_MEAN] / normalizer

    assert all(tuple(summary[kk].dims) == (ITER, METHOD) for kk in summary)
    return agg_result, summary
def pnr_refine(
    varr: xr.DataArray,
    seeds: pd.DataFrame,
    noise_freq=0.25,
    thres: Union[float, str] = 1.5,
    q=(0.1, 99.9),
    med_wnd: Optional[int] = None,
) -> Tuple[pd.DataFrame, xr.DataArray, Optional[GaussianMixture]]:
    """
    Filter seeds by thresholding peak-to-noise ratio.

    For each seed the noise is defined as the high-pass filtered fluorescence
    trace. The peak-to-noise ratio (pnr) is the ratio between the peak-to-peak
    value of the original trace and that of the noise trace. Optionally the
    baseline is estimated by median filtering and removed from the trace before
    the ratio is computed. Instead of a hard threshold, a two-component
    Gaussian Mixture Model can be fitted to the distribution of pnr, in which
    case only seeds assigned to the higher-mean Gaussian are considered valid.

    Parameters
    ----------
    varr : xr.DataArray
        Input movie data, should have dimensions "height", "width" and "frame".
    seeds : pd.DataFrame
        The input over-complete set of seeds to be filtered. Must contain the
        columns "height" and "width".
    noise_freq : float, optional
        Cut-off frequency of the high-pass filter used to define noise, as a
        fraction of the sampling frequency. By default `0.25`.
    thres : Union[float, str], optional
        Threshold of the peak-to-noise ratio. If `"auto"` a
        :class:`GMM <sklearn.mixture.GaussianMixture>` is fitted to the
        distribution of pnr instead. By default `1.5`.
    q : tuple, optional
        Percentiles used to compute the peak-to-peak values, i.e.
        `np.percentile(f, q[1]) - np.percentile(f, q[0])` for a trace `f`.
        By default `(0.1, 99.9)`.
    med_wnd : int, optional
        Size of the median filter window used to remove the baseline. If
        `None` no filtering is done. By default `None`.

    Returns
    -------
    seeds : pd.DataFrame
        The input dataframe, modified in place, with a column "mask_pnr"
        marking valid seeds (overwritten if it already exists).
    pnr : xr.DataArray
        The computed peak-to-noise ratio of each seed.
    gmm : GaussianMixture, optional
        The GMM fitted to the distribution of pnr. `None` unless `thres` is
        `"auto"`.
    """
    print("selecting seeds")
    # Vectorized indexing on dask arrays produces a single chunk. To avoid
    # memory issues, split the seeds into about 128 groups of at most 100
    # seeds. The lower bound of 1 keeps the grouping key well defined when
    # there are fewer than 128 seeds (chunk size would otherwise be 0).
    chk_size = max(1, min(int(len(seeds) / 128), 100))
    vsub_ls = []
    for _, seed_sub in seeds.groupby(np.arange(len(seeds)) // chk_size):
        vsub = varr.sel(
            height=seed_sub["height"].to_xarray(),
            width=seed_sub["width"].to_xarray(),
        )
        vsub_ls.append(vsub)
    varr_sub = xr.concat(vsub_ls, "index")

    if med_wnd:
        print("removing baseline")
        # The baseline-corrected traces must replace `varr_sub`, otherwise the
        # correction is computed but never used for the ratio below.
        varr_sub = xr.apply_ufunc(
            med_baseline,
            varr_sub,
            input_core_dims=[["frame"]],
            output_core_dims=[["frame"]],
            dask="parallelized",
            kwargs={"wnd": med_wnd},
            vectorize=True,
            output_dtypes=[varr.dtype],
        )

    print("computing peak-noise ratio")
    pnr = xr.apply_ufunc(
        pnr_perseed,
        varr_sub,
        input_core_dims=[["frame"]],
        output_core_dims=[[]],
        kwargs={"freq": noise_freq, "q": q},
        vectorize=True,
        dask="parallelized",
        output_dtypes=[float],
    ).compute()

    if thres == "auto":
        # Fit and predict on the same NaN-free data: the estimator rejects
        # non-finite input, so predicting on the raw values could fail.
        dat = np.nan_to_num(pnr.values.reshape(-1, 1))
        gmm = GaussianMixture(n_components=2)
        gmm.fit(dat)
        idg = np.argsort(gmm.means_.reshape(-1))[-1]
        idx_valid = np.isin(gmm.predict(dat), idg)
        seeds["mask_pnr"] = idx_valid
    else:
        mask = pnr > thres
        mask_df = mask.to_pandas().rename("mask_pnr")
        seeds["mask_pnr"] = mask_df
        gmm = None
    return seeds, pnr, gmm
def ensemble_percentiles(
    ens: xr.Dataset,
    values: Tuple[int, int, int] = (10, 50, 90),
    keep_chunk_size: Optional[bool] = None,
) -> xr.Dataset:
    """Compute percentiles across the members of a climate simulation ensemble.

    Parameters
    ----------
    ens : xr.Dataset
        Ensemble dataset (see xclim.ensembles.create_ensemble). Every data
        variable is reduced along its "realization" dimension.
    values : Tuple[int, int, int]
        Percentile values to calculate. Default: (10, 50, 90).
    keep_chunk_size : Optional[bool]
        Only relevant for dask-backed ensembles chunked along "realization",
        whose chunks along that axis are always merged.
        If True, the data is additionally rechunked along the dimension with
        the largest chunks so that chunks keep approximately the same size.
        If False, no shrinking is done, resulting in much larger chunks.
        If None, it is enabled when the merged chunks would exceed 2E8
        elements (decided on the first variable that needs rechunking).

    Returns
    -------
    xr.Dataset
        One variable named ``{variable}_p{percentile:02d}`` per input variable
        and percentile.

    Examples
    --------
    >>> from xclim import ensembles
    >>> import glob
    >>> ncfiles = glob.glob('/*tas*.nc')

    Create ensemble dataset

    >>> ens = ensembles.create_ensemble(ncfiles)

    Calculate default ensemble percentiles

    >>> ens_percs = ensembles.ensemble_percentiles(ens)
    >>> print(ens_percs['tas_p10'])

    Calculate non-default percentiles (25th and 75th)

    >>> ens_percs = ensembles.ensemble_percentiles(ens, values=(25, 50, 75))
    >>> print(ens_percs['tas_p25'])

    If the original array has many small chunks, it might be more efficient to do:

    >>> ens_percs = ensembles.ensemble_percentiles(ens, keep_chunk_size=False)
    >>> print(ens_percs['tas_p25'])
    """

    def _largest_chunk(item):
        """Rank dimensions by their largest chunk, never picking realization."""
        dim_name, dim_chunks = item
        return 0 if dim_name == "realization" else max(dim_chunks)

    ds_out = xr.Dataset(attrs=ens.attrs)
    for name in ens.data_vars:
        da = ens[name]
        # The percentile computation forbids any chunking along realization.
        if len(ens.chunks.get("realization", [])) > 1:
            if keep_chunk_size is None:
                # Smart rechunking if merged chunks exceed 2E8 elements.
                merged_size = (
                    np.prod(da.isel(realization=0).data.chunksize)
                    * ens.realization.size
                )
                keep_chunk_size = merged_size > 2e8
            if keep_chunk_size:
                # Rechunk the dimension whose chunks are the largest.
                chk_dim, chks = max(ens.chunks.items(), key=_largest_chunk)
                var = da.chunk(
                    {"realization": -1, chk_dim: len(chks) * ens.realization.size}
                )
            else:
                var = da.chunk({"realization": -1})
        else:
            var = da

        for p in values:
            perc = xr.apply_ufunc(
                _calc_perc,
                var,
                input_core_dims=[["realization"]],
                output_core_dims=[[]],
                keep_attrs=True,
                kwargs=dict(p=p),
                dask="parallelized",
                output_dtypes=[da.dtype],
            )
            out_name = f"{name}_p{p:02d}"
            perc.name = out_name
            ds_out[out_name] = perc

            # Append to an existing description, or create one.
            attrs = ds_out[out_name].attrs
            if "description" in attrs:
                attrs["description"] = (
                    f"{attrs['description']} : {p}th percentile of ensemble"
                )
            else:
                attrs["description"] = f"{p}th percentile of ensemble"

    ds_out.attrs["history"] = update_history(
        f"Computation of the percentiles on {ens.realization.size} ensemble members.",
        ds_out,
    )
    return ds_out
def vector_norm(x, dim, ord=None):
    """Return the norm of `x` taken along dimension `dim`.

    Parameters
    ----------
    x : xarray object
        Input passed to :func:`xarray.apply_ufunc`.
    dim : str
        Dimension that is reduced by the norm.
    ord : optional
        Order of the norm, forwarded to :func:`numpy.linalg.norm`.
    """
    # apply_ufunc moves the core dimension to the last axis, hence axis=-1.
    norm_kwargs = {"ord": ord, "axis": -1}
    return xr.apply_ufunc(
        np.linalg.norm,
        x,
        input_core_dims=[[dim]],
        kwargs=norm_kwargs,
    )
# Open the input file and pick the variable to analyse.
ds = xr.open_dataset(data_path + '/' + name_file)
var = ds[name_var]

#bounds for calculation
lat_s = 0.
lat_n = 60.
lon_w = 285.
lon_e = 352.5

# Cosine-of-latitude weights restricted to the latitude band of the box.
weights = np.cos(ds.lat * np.pi / 180.).sel(lat=slice(lat_s, lat_n))

# Weighted mean of the variable over the lat/lon box.
amo_index = var.sel(lat=slice(lat_s, lat_n), lon=slice(
    lon_w, lon_e)).weighted(weights).mean(dim=['lat', 'lon'])

# Mean value of each calendar month, and the deviations from it.
climatology_mean = amo_index.groupby("time.month").mean('time')
anomalies = xr.apply_ufunc(
    lambda x, m: (x - m),
    amo_index.groupby("time.month"),
    climatology_mean,
)

# Plot the anomalies; positive values are filled in red.
# NOTE(review): the x axis is 480 evenly spaced points between 1979 and 2018,
# which assumes the series has exactly 480 time steps -- TODO confirm.
fig = plt.figure(1, figsize=(9, 4))
ax = fig.add_subplot(111)
ax.plot(np.linspace(1979, 2018, 480), anomalies, lw=1, color='k')
ax.axhline(0., color='k')
ax.fill_between(np.linspace(1979, 2018, 480), anomalies, where=anomalies > 0., facecolor='red', alpha=0.7)
# NOTE(review): the statement below is cut off in this view of the file.
ax.fill_between(np.linspace(1979, 2018, 480), anomalies, where=anomalies < 0.,
def detrend(da, dim, detrend_type="constant"):
    """
    Remove a constant or linear trend from a DataArray.

    Parameters
    ----------
    da : xarray.DataArray
        The data to detrend.
    dim : str or list
        Dimension(s) along which to detrend: one dimension or a list of two
        dimensions for linear detrending. ``None`` selects all dimensions of
        `da`. If dask data are passed, they must be contiguous (a single
        chunk) along `dim` for linear detrending.
    detrend_type : {'constant', 'linear', None}
        If ``constant``, the mean over `dim` is subtracted.
        If ``linear``, a linear least-squares fit is removed.
        If ``None``, the data is returned unchanged.

    Returns
    -------
    da : xarray.DataArray
        The detrended data.

    Raises
    ------
    NotImplementedError
        If `detrend_type` is not a valid option, or linear detrending is
        requested along more than two dimensions.
    ValueError
        If dask data are chunked along a dimension to detrend linearly.

    Notes
    -----
    This function will act lazily in the presence of dask arrays on the
    input.
    """
    # Normalize `dim` to a list of dimension names.
    if dim is None:
        dim = list(da.dims)
    elif isinstance(dim, str):
        dim = [dim]

    if detrend_type not in ("constant", "linear", None):
        raise NotImplementedError(
            "%s is not a valid detrending option. Valid "
            "options are: 'constant','linear', or None." % detrend_type
        )

    if detrend_type is None:
        return da
    if detrend_type == "constant":
        return da - da.mean(dim=dim)

    # Only linear detrending is left at this point.
    data = da.data
    axis_num = [da.get_axis_num(d) for d in dim]
    if getattr(data, "chunks", None):
        # The fit needs every detrended axis in a single chunk.
        if any(len(data.chunks[axis]) != 1 for axis in axis_num):
            raise ValueError("Contiguous chunks required for detrending.")

    n_dims = len(dim)
    if n_dims == 1:
        dt = xr.apply_ufunc(
            sps.detrend,
            da,
            axis_num[0],
            output_dtypes=[da.dtype],
            dask="parallelized",
        )
    elif n_dims == 2:
        dt = xr.apply_ufunc(
            _detrend_2d_ufunc,
            da,
            input_core_dims=[dim],
            output_core_dims=[dim],
            output_dtypes=[da.dtype],
            vectorize=True,
            dask="parallelized",
        )
    else:  # pragma: no cover
        raise NotImplementedError(
            "Only 1D and 2D detrending are implemented so far."
        )
    return dt
def bounds_to_vertices(
    bounds: DataArray,
    bounds_dim: str,
    core_dims=None,
    order: Optional[str] = "counterclockwise",
) -> DataArray:
    """
    Convert a bounds variable to vertices.

    Two cases are covered:
     - 1D coordinates, with bounds of shape (N, 2),
       converted to vertices of shape (N+1,)
     - 2D coordinates, with bounds of shape (N, M, 4),
       converted to vertices of shape (N+1, M+1).

    Parameters
    ----------
    bounds : DataArray
        The bounds to convert.
    bounds_dim : str
        The name of the bounds dimension of `bounds` (the one of length 2 or 4).
    core_dims : list, optional
        Core dimensions for apply_ufunc. Must not include `bounds_dim`. The
        shape of (*core_dims, bounds_dim) must be (N, 2) or (N, M, 4).
        Defaults to every dimension of `bounds` other than `bounds_dim`.
    order : {'counterclockwise', 'clockwise', None}
        Valid for 2D coordinates only (i.e. bounds of shape (..., N, M, 4)),
        ignored otherwise. Order the bounds are given in, assuming that
        ax0-ax1-upward is a right handed coordinate system, where ax0 and ax1
        are the two first dimensions of `bounds`. If None, the
        counterclockwise version is computed and then verified; if the check
        fails the clockwise version is returned. See Notes for more details.

    Returns
    -------
    DataArray
        Either of shape (N+1,) or (N+1, M+1). New vertex dimensions are named
        from the initial dimension with the suffix "_vertices".

    Raises
    ------
    ValueError
        If the number of core dimensions and the size of `bounds_dim` are
        neither (1, 2) nor (2, 4).

    Notes
    -----
    Getting the correct axes "order" is tricky. There are no real standards
    for dimension names or even axes order, even though the CF conventions
    mention ax0-ax1-upward (counterclockwise bounds) as being the default.
    Moreover, xarray can transpose data without raising any warning or error,
    which makes attributes unreliable.

    Please refer to the CF conventions document :
    http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#cell-boundaries.
    """
    if core_dims is None:
        core_dims = [name for name in bounds.dims if name != bounds_dim]

    # Each vertex dimension is one element longer than the matching core one.
    vertex_sizes = {
        f"{name}_vertices": bounds.sizes[name] + 1 for name in core_dims
    }
    vertex_dims = list(vertex_sizes)

    n_core_dims = len(core_dims)
    nbounds = bounds[bounds_dim].size

    is_1d_case = n_core_dims == 1 and nbounds == 2
    is_2d_case = n_core_dims == 2 and nbounds == 4
    if not (is_1d_case or is_2d_case):
        raise ValueError(
            f"Bounds format not understood. Got {bounds.dims} with shape {bounds.shape}."
        )

    helper_kwargs = {"n_core_dims": n_core_dims, "nbounds": nbounds, "order": order}
    return xr.apply_ufunc(
        _bounds_helper,
        bounds,
        input_core_dims=[core_dims + [bounds_dim]],
        dask="parallelized",
        kwargs=helper_kwargs,
        output_core_dims=[vertex_dims],
        dask_gufunc_kwargs=dict(output_sizes=vertex_sizes),
        output_dtypes=[bounds.dtype],
    )
def _resample_iterations_idx(
    init, iterations, dim="member", replace=True, chunk=True, dim_max=None
):
    """Resample over ``dim`` by index ``iterations`` times.

    .. note::
        This is a much faster way to bootstrap than resampling each iteration
        individually and applying the function to it. However, this will create a
        DataArray with dimension ``iteration`` of size ``iterations``. It is probably
        best to do this out-of-memory with ``dask`` if you are doing a large number
        of iterations or using spatial output (i.e., not time series data).

    Args:
        init (xr.DataArray, xr.Dataset): Initialized prediction ensemble.
        iterations (int): Number of bootstrapping iterations.
        dim (str): Dimension name to bootstrap over. Defaults to ``'member'``.
        replace (bool): Bootstrapping with or without replacement. Defaults to
            ``True``.
        chunk: (bool): Auto-chunk along chunking_dims to get optimal blocksize
        dim_max (int): Number of indices from `dim` to return. Not implemented.

    Returns:
        xr.DataArray, xr.Dataset: Bootstrapped data with additional dim
            ``iteration``
    """
    if dask.is_dask_collection(init):
        init = init.chunk({"lead": -1, "member": -1})
        init = init.copy(deep=True)

    def select_bootstrap_indices_ufunc(x, idx):
        """Select the multi-level indices ``idx`` from ``x`` for all iterations."""
        # `apply_ufunc` sometimes adds a singleton dimension on the end, so it is
        # squeezed out here. Numpy multi-level indexing lets us pick a different
        # set of, e.g., ensemble members per iteration and build one large array
        # that has ``iterations`` as a dimension.
        picked = x.squeeze()[idx.squeeze().transpose()]
        return np.moveaxis(picked, 0, -1)

    if dask.is_dask_collection(init):
        if chunk:
            chunking_dims = [d for d in init.dims if d not in CLIMPRED_DIMS]
            init = _chunk_before_resample_iterations_idx(
                init, iterations, chunking_dims
            )

    n_samples = init[dim].size
    if replace:
        # With replacement: independent random indices per iteration.
        idx = np.random.randint(0, n_samples, (iterations, n_samples))
    else:
        # Without replacement: one arange per iteration, each shuffled in place.
        idx = np.linspace(
            (np.arange(n_samples)),
            (np.arange(n_samples)),
            iterations,
            dtype="int",
        )
        for row in idx:
            np.random.shuffle(row)

    idx_da = xr.DataArray(
        idx,
        dims=("iteration", dim),
        coords=({"iteration": range(iterations), dim: init[dim]}),
    )

    if isinstance(init, xr.DataArray):
        transpose_kwargs = {"transpose_coords": False}
    else:
        transpose_kwargs = {}

    return xr.apply_ufunc(
        select_bootstrap_indices_ufunc,
        init.transpose(dim, ..., **transpose_kwargs),
        idx_da,
        dask="parallelized",
        output_dtypes=[float],
    )
def open_nwm_dataset(paths: list,
                     chunks: dict = None,
                     attrs_keep: list = [
                         'featureType', 'proj4', 'station_dimension',
                         'esri_pe_string', 'Conventions', 'model_version'
                     ],
                     spatial_indices: list = None,
                     drop_variables: list = None,
                     npartitions: int = None,
                     profile: int = False) -> xr.Dataset:
    """Open a collection of files as one merged dataset.

    Every path is preprocessed in parallel through a dask bag, the resulting
    datasets are grouped and merged (by member and lead time when a 'member'
    coordinate is present, by lead time otherwise), and the groups are
    finally merged along lead time.

    Parameters
    ----------
    paths : list
        Paths of the files to open.
    chunks : dict, optional
        Forwarded to the preprocessing step and, when not None, used to chunk
        the final dataset.
    attrs_keep : list, optional
        Names of the global attributes to retain; all others are dropped.
        With None, no global attribute is retained.
    spatial_indices : list, optional
        Forwarded to the preprocessing step.
    drop_variables : list, optional
        Forwarded to the preprocessing step.
    npartitions : int, optional
        Number of dask bag partitions. Defaults to four times the number of
        processes of the pool found in the dask config.
    profile : int, optional
        If truthy, timing checkpoints are reported along the way.

    Returns
    -------
    xr.Dataset
        The merged dataset with an added 'valid_time' variable.
    """
    if profile:
        then = timesince()

    def _checkpoint(label):
        """Report the time elapsed since the previous checkpoint."""
        nonlocal then
        if profile:
            then = timesince(then)
            print(label)

    # This is totally arbitrary but seems to work ok.
    if npartitions is None:
        npartitions = dask.config.get('pool')._processes * 4
        # This choice does not seem to work well or at all, error?
        # npartitions = len(sorted(paths))
    paths_bag = dask.bag.from_sequence(paths, npartitions=npartitions)
    _checkpoint('after paths_bag')

    preprocessed = paths_bag.map(
        preprocess_nwm_data,
        chunks=chunks,
        spatial_indices=spatial_indices,
        drop_variables=drop_variables)
    ds_list = preprocessed.filter(is_not_none).compute()
    _checkpoint("after ds_list preprocess/filter")

    # Group by and merge by choices
    if 'member' in ds_list[0].coords:
        stages = [(group_member_lead_time, merge_reference_time),
                  (group_lead_time, merge_member)]
    else:
        stages = [(group_lead_time, merge_reference_time)]

    for group, merge in stages:
        _checkpoint('before sort')
        the_sort = sorted(ds_list, key=group)

        _checkpoint('after sort, before group')
        ds_groups = [list(it) for k, it in itertools.groupby(the_sort, group)]

        _checkpoint('after group, before merge')
        # npartitons = len(ds_groups)
        group_bag = dask.bag.from_sequence(ds_groups, npartitions=npartitions)
        ds_list = group_bag.map(merge).compute()

        _checkpoint('after merge')
        del group_bag, ds_groups, the_sort

    nwm_dataset = merge_lead_time(ds_list)
    del ds_list

    # Create a valid_time variable.
    def calc_valid_time(ref, lead):
        return np.datetime64(int(ref) + int(lead), 'ns')

    nwm_dataset['valid_time'] = xr.apply_ufunc(
        calc_valid_time,
        nwm_dataset['reference_time'],
        nwm_dataset['lead_time'],
        vectorize=True)

    # Xarray sets nan as the fill value when there is none. Dont allow that...
    for key in nwm_dataset.variables:
        encoding = nwm_dataset[key].encoding
        if '_FillValue' not in encoding:
            encoding.update({'_FillValue': None})

    # Clean up attributes
    new_attrs = collections.OrderedDict()
    if attrs_keep is not None:
        for key in nwm_dataset.attrs:
            if key in attrs_keep:
                new_attrs[key] = nwm_dataset.attrs[key]
    nwm_dataset.attrs = new_attrs

    # Break into chunked dask array
    if chunks is not None:
        nwm_dataset = nwm_dataset.chunk(chunks=chunks)

    return nwm_dataset
def _adapt_freq_group(ds, dim=["time"]):
    """Adapt the frequency of values <= ``thresh`` of ``ds.sim`` to ``ds.ref``.

    ``thresh`` is read from the enclosing scope.

    Parameters
    ----------
    ds : xr.Dataset
        Dataset with variables ``sim`` (series to correct) and ``ref``
        (reference series).
    dim : list of str
        Dimension(s) over which the frequencies are computed. When ``sim`` has
        a "window" dimension, only the first entry is used for the correction
        itself.

    Returns
    -------
    xr.Dataset
        Variables ``pth`` (value of ref with the same rank as ``thresh`` in
        sim), ``dP0`` (proportion of values to correct) and ``sim_ad`` (the
        frequency-adapted sim).
    """
    if isinstance(ds.sim.data, dsk.Array):
        # In order to be efficient and lazy, some classical numpy ops will be
        # replaced by dask's version
        mod = dsk
        kws = {"chunks": ds.sim.chunks}
    else:
        mod = np
        kws = {}

    # Compute the probability of finding a value <= thresh
    # This is the "dry-day frequency" in the precipitation case
    P0_sim = ecdf(ds.sim, thresh, dim=dim)
    P0_ref = ecdf(ds.ref, thresh, dim=dim)

    # The proportion of values <= thresh in sim that need to be corrected,
    # compared to ref
    dP0 = (P0_sim - P0_ref) / P0_sim

    # Compute : ecdf_ref^-1( ecdf_sim( thresh ) )
    # The value in ref with the same rank as the first non zero value in sim.
    pth = xr.apply_ufunc(
        np.nanpercentile,
        ds.ref,
        P0_sim * 100,  # np.percentile takes values in [0, 100], ecdf outputs in [0, 1]
        input_core_dims=[dim, []],
        dask="parallelized",
        vectorize=True,
        output_dtypes=[ds.ref.dtype],
    ).where(dP0 > 0)  # pth is meaningless when freq. adaptation is not needed

    if "window" in ds.sim.dims:
        # P0_sim was computed using the window, but only the original
        # timeseries (the central window element) is corrected.
        sim = ds.sim.isel(window=(ds.sim.window.size - 1) // 2)
        dim = [dim[0]]
    else:
        sim = ds.sim

    # Get the percentile rank of each value in sim.
    # da.rank() doesn't work with dask arrays.
    rank = (
        xr.apply_ufunc(
            lambda da: np.argsort(np.argsort(da, axis=-1), axis=-1),
            sim,
            input_core_dims=[dim],
            output_core_dims=[dim],
            dask="parallelized",
            output_dtypes=[sim.dtype],
        )
        / sim.notnull().sum(dim=dim)
    )

    # Frequency-adapted sim
    sim_ad = sim.where(
        dP0 < 0,  # dP0 < 0 means no-adaptation.
        sim.where(
            (rank < P0_ref) | (rank > P0_sim),  # Preserve current values
            # Generate random numbers ~ U[T0, Pth]
            (pth.broadcast_like(sim) - thresh)
            * mod.random.random_sample(size=sim.shape, **kws)
            + thresh,
        ),
    )

    # Set some metadata
    sim_ad.attrs.update(ds.sim.attrs)
    pth.attrs[
        "long_name"
    ] = "Smallest value of the timeseries not corrected by frequency adaptation."
    # f-string so that the threshold value is actually interpolated.
    dP0.attrs[
        "long_name"
    ] = f"Proportion of values smaller than {thresh} in the timeseries corrected by frequency adaptation"

    # Tell group_apply that these will need reshaping (regrouping)
    # This is needed since if any variable comes out a groupby with the
    # original group axis, the whole output is broadcasted back to the
    # original dims.
    pth.attrs["_group_apply_reshape"] = True
    dP0.attrs["_group_apply_reshape"] = True
    return xr.Dataset(data_vars={"pth": pth, "dP0": dP0, "sim_ad": sim_ad})
def smooth(da, k=3):
    """Smooth `da` along time with a Wiener filter of window size `k`.

    Parameters
    ----------
    da : xarray.DataArray
        Data with dimensions "y", "x" and "time"; it is transposed to that
        order before filtering.
    k : int, optional
        Window size along the "time" axis; the window has size 1 along "y"
        and "x". By default 3.

    Returns
    -------
    xarray.DataArray
        The filtered data, with dimensions ordered ("y", "x", "time").

    Raises
    ------
    ValueError
        If `da` is one-dimensional.
    """
    if da.ndim == 1:
        raise ValueError("'Smooth' does not currently operate on 1D timeseries")
    da = da.transpose("y", "x", "time")

    def _wiener_along_time(arr, window):
        # Filter the array handed over by apply_ufunc, not the enclosing `da`.
        return wiener(arr, (1, 1, window))

    return xr.apply_ufunc(_wiener_along_time, da, k, dask='allowed')
def median_absolute_error(a, b, dim=None, skipna=False, keep_attrs=False):
    """
    Median Absolute Error.

    .. math::
        \\mathrm{median}(\\vert a - b\\vert)

    Parameters
    ----------
    a : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    b : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    dim : str, list
        The dimension(s) to apply the median absolute error along. These
        dimensions are reduced in the result. Defaults to None, reducing all
        dimensions.
    skipna : bool
        If True, skip NaNs when computing function.
    keep_attrs : bool
        If True, the attributes (attrs) will be copied from the first input
        to the new one. If False (default), the new object will be returned
        without attributes.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        Median Absolute Error.

    See Also
    --------
    sklearn.metrics.median_absolute_error

    Examples
    --------
    >>> import numpy as np
    >>> import xarray as xr
    >>> from xskillscore import median_absolute_error
    >>> a = xr.DataArray(np.random.rand(5, 3, 3), dims=['time', 'x', 'y'])
    >>> b = xr.DataArray(np.random.rand(5, 3, 3), dims=['time', 'x', 'y'])
    >>> median_absolute_error(a, b, dim='time')
    """
    dim, axis = _preprocess_dims(dim, a)
    # Align the two inputs on every dimension except the reduced ones.
    a, b = xr.broadcast(a, b, exclude=dim)
    reducer_kwargs = {"axis": axis, "skipna": skipna}
    return xr.apply_ufunc(
        _median_absolute_error,
        a,
        b,
        input_core_dims=[dim, dim],
        kwargs=reducer_kwargs,
        dask="parallelized",
        output_dtypes=[float],
        keep_attrs=keep_attrs,
    )
def ensemble_percentiles(
    ens: Union[xr.Dataset, xr.DataArray],
    values: Sequence[int] = (10, 50, 90),
    keep_chunk_size: Optional[bool] = None,
    split: bool = True,
) -> xr.Dataset:
    """Compute percentiles across the members of a climate simulation ensemble.

    Parameters
    ----------
    ens : Union[xr.Dataset, xr.DataArray]
        Ensemble dataset or dataarray (see xclim.ensembles.create_ensemble).
        For a dataset, only variables with a "realization" dimension are
        processed.
    values : Sequence[int]
        Percentile values to calculate. Default: (10, 50, 90).
    keep_chunk_size : Optional[bool]
        Only relevant for dask-backed ensembles chunked along "realization",
        whose chunks along that axis are always merged.
        If True, the data is additionally rechunked along the dimension with
        the largest chunks so that chunks keep approximately the same size.
        If False, no shrinking is done, resulting in much larger chunks.
        If None, it is enabled when the merged chunks would exceed 2E8
        elements.
    split : bool
        Whether to store each percentile in its own variable (True) or to
        concatenate the output along a new "percentiles" dimension (False).

    Returns
    -------
    Union[xr.Dataset, xr.DataArray]
        A dataset when `ens` is a dataset or when `split` is True; otherwise
        a dataarray with an additional "percentiles" dimension.

    Examples
    --------
    >>> from xclim.ensembles import create_ensemble, ensemble_percentiles

    Create ensemble dataset:

    >>> ens = create_ensemble(temperature_datasets)

    Calculate default ensemble percentiles:

    >>> ens_percs = ensemble_percentiles(ens)

    Calculate non-default percentiles (25th and 75th)

    >>> ens_percs = ensemble_percentiles(ens, values=(25, 50, 75))

    If the original array has many small chunks, it might be more efficient to do:

    >>> ens_percs = ensemble_percentiles(ens, keep_chunk_size=False)
    """
    if isinstance(ens, xr.Dataset):
        # Process every ensemble variable on its own, then merge the results.
        per_variable = [
            ensemble_percentiles(
                da, values, keep_chunk_size=keep_chunk_size, split=split
            )
            for da in ens.data_vars.values()
            if "realization" in da.dims
        ]
        out = xr.merge(per_variable)
        out.attrs.update(ens.attrs)
        out.attrs["xclim_history"] = update_history(
            f"Computation of the percentiles on {ens.realization.size} ensemble members.",
            ens,
        )
        return out

    # The percentile computation forbids any chunking along realization.
    if ens.chunks:
        real_axis = ens.get_axis_num("realization")
        if len(ens.chunks[real_axis]) > 1:
            if keep_chunk_size is None:
                # Smart rechunking if merged chunks exceed 2E8 elements.
                merged_size = (
                    np.prod(ens.isel(realization=0).data.chunksize)
                    * ens.realization.size
                )
                keep_chunk_size = merged_size > 2e8

            if keep_chunk_size:
                # Rechunk the dimension whose chunks are the largest.
                chk_dim, chks = max(
                    enumerate(ens.chunks),
                    key=lambda kv: 0 if kv[0] == real_axis else max(kv[1]),
                )
                ens = ens.chunk(
                    {
                        "realization": -1,
                        ens.dims[chk_dim]: len(chks) * ens.realization.size,
                    }
                )
            else:
                ens = ens.chunk({"realization": -1})

    out = xr.apply_ufunc(
        _calc_perc,
        ens,
        input_core_dims=[["realization"]],
        output_core_dims=[["percentiles"]],
        keep_attrs=True,
        kwargs=dict(p=values),
        dask="parallelized",
        output_dtypes=[ens.dtype],
        dask_gufunc_kwargs=dict(output_sizes={"percentiles": len(values)}),
    )
    percentile_coord = xr.DataArray(list(values), dims=("percentiles",))
    out = out.assign_coords(percentiles=percentile_coord)

    if split:
        # One variable per percentile, each with its own description.
        out = out.to_dataset(dim="percentiles")
        for key, perc_da in out.data_vars.items():
            perc_da.attrs.update(ens.attrs)
            previous = perc_da.attrs.get("description", "")
            perc_da.attrs["description"] = (
                previous + f" {key}th percentile of ensemble."
            )
            out[key] = perc_da
            out = out.rename(name_dict={key: f"{ens.name}_p{int(key):02d}"})

    out.attrs["xclim_history"] = update_history(
        f"Computation of the percentiles on {ens.realization.size} ensemble members.",
        ens,
    )
    return out
import numpy as np
import xarray as xr

# Two random 1000 x 100 arrays that share 'dim_1' but have differently named
# row dimensions.
da = xr.DataArray(np.random.rand(1000, 100)).rename({'dim_0': 'rows_a'})
db = xr.DataArray(np.random.rand(1000, 100)).rename({'dim_0': 'rows_b'})


def print_shape(a):
    """Print the shape of `a` and return zeros with one entry per row."""
    print(a.shape)
    n_rows = a.shape[0]
    return np.zeros(shape=(n_rows))


def print_two_shapes(a, b):
    """Print both shapes and return zeros of shape (rows of a, rows of b)."""
    for arr in (a, b):
        print(arr.shape)
    out_shape = (a.shape[0], b.shape[0])
    return np.zeros(shape=out_shape)


# Show the shape apply_ufunc hands to a function of one input.
print('\nprint_shape')
xr.apply_ufunc(print_shape, da, input_core_dims=[['dim_1']])

# Show the shapes apply_ufunc hands to a function of two inputs.
print('\nprint_two_shapes')
xr.apply_ufunc(
    print_two_shapes,
    da,
    db,
    input_core_dims=[['dim_1'], ['dim_1']],
)
def wrap_xarray_ufunc(
    ufunc,
    *datasets,
    ufunc_kwargs=None,
    func_args=None,
    func_kwargs=None,
    dask_kwargs=None,
    **kwargs,
):
    """Wrap make_ufunc with xarray.apply_ufunc.

    Parameters
    ----------
    ufunc : callable
    datasets : xarray.dataset
    ufunc_kwargs : dict
        Keyword arguments passed to `make_ufunc`. The dict is copied and never
        modified.
            - 'n_dims', int, by default the number of core dimensions of the
              last entry of ``input_core_dims`` (2 with the default
              ``("chain", "draw")``)
            - 'n_output', int, by default 1
            - 'n_input', int, by default len(datasets)
            - 'index', slice, by default Ellipsis
            - 'ravel', bool, by default True
    func_args : tuple
        Arguments passed to 'ufunc'.
    func_kwargs : dict
        Keyword arguments passed to 'ufunc'.
            - 'out_shape', int, by default None
    dask_kwargs : dict
        Dask related kwargs passed to :func:`xarray:xarray.apply_ufunc`.
        Use :meth:`~arviz.Dask.enable_dask` to set default kwargs.
    **kwargs
        Passed to xarray.apply_ufunc.

    Returns
    -------
    xarray.dataset
    """
    # Work on a copy: the defaults filled in below depend on this particular
    # call and must not leak into a dict the caller may reuse.
    ufunc_kwargs = {} if ufunc_kwargs is None else dict(ufunc_kwargs)
    ufunc_kwargs.setdefault("n_input", len(datasets))
    if func_args is None:
        func_args = tuple()
    if func_kwargs is None:
        func_kwargs = {}
    if dask_kwargs is None:
        dask_kwargs = {}

    # By default every positional input is reduced over chain and draw.
    kwargs.setdefault(
        "input_core_dims",
        tuple(("chain", "draw") for _ in range(len(func_args) + len(datasets))),
    )
    ufunc_kwargs.setdefault("n_dims", len(kwargs["input_core_dims"][-1]))
    kwargs.setdefault(
        "output_core_dims",
        tuple([] for _ in range(ufunc_kwargs.get("n_output", 1))),
    )

    callable_ufunc = make_ufunc(ufunc, **ufunc_kwargs)

    return apply_ufunc(
        callable_ufunc,
        *datasets,
        *func_args,
        kwargs=func_kwargs,
        **dask_kwargs,
        **kwargs,
    )
def pearson_correlation(x, y, dim):
    """Apply ``pearson_correlation_gufunc`` to ``x`` and ``y`` along ``dim``.

    Parameters
    ----------
    x, y : xarray objects
        Inputs handed to ``xr.apply_ufunc``; both must contain ``dim``.
    dim : str
        Name of the dimension declared as the core dimension of both inputs.

    Returns
    -------
    The result of ``xr.apply_ufunc``. No output core dimensions are declared
    and the output dtype is declared as float.
    """
    # The same single core dimension is used for both inputs.
    shared_core_dims = [[dim], [dim]]
    result = xr.apply_ufunc(
        pearson_correlation_gufunc,
        x,
        y,
        input_core_dims=shared_core_dims,
        dask='parallelized',
        output_dtypes=[float],
    )
    return result
# For converting to a 10m height for use with WN
def logu(u, zin=40, zout=10, z0=0.01):
    """Rescale a wind speed from height ``zin`` to height ``zout``.

    Uses the ratio of two logarithmic profiles:
    ``u * log(zout / z0) / log(zin / z0)``.

    Parameters
    ----------
    u : scalar or array_like
        Wind speed at height ``zin``.
    zin : float, optional
        Height the input wind speed refers to. Default is 40.
    zout : float, optional
        Height to convert the wind speed to. Default is 10.
    z0 : float, optional
        Roughness length, in the same units as ``zin`` and ``zout``.
        Default is 0.01, the value that used to be hard-coded.

    Returns
    -------
    Wind speed rescaled to ``zout``, same shape as ``u``.
    """
    return u * np.log(zout / z0) / np.log(zin / z0)


# In[ ]:

gem = xr.open_mfdataset("out.nc", combine='by_coords')

# In[ ]:

# The GEM winds were output at 40m, convert to a 10m windspeed
gem['u10'] = xr.apply_ufunc(
    logu, gem['u'], dask='allowed'
)  # the GEM-CHM file has u @ 40m reference height

# zonal and meridional components
gem['U10'] = -gem['u10'] * np.sin(gem['vw_dir'] * np.pi / 180.)
gem['V10'] = -gem['u10'] * np.cos(gem['vw_dir'] * np.pi / 180.)

# air temp needs to be in K
gem['t'] += 273.15

# In[ ]:

gem

# In[ ]:
def run_snap_biophys(dataset, variable):
    """Compute specified variable using the SNAP algorithm.

    See ATBD at https://step.esa.int/docs/extra/ATBD_S2ToolBox_L2B_V1.1.pdf

    Three extra layers (cosine of the view zenith, cosine of the sun zenith
    and cosine of the sun-minus-view azimuth difference) are appended to
    ``dataset.band_data`` along ``band`` before ``_compute_variable`` is
    applied over the stacked ``xy`` dimension.

    Parameters
    ----------
    dataset : xr dataset
        xarray dataset. The code reads ``band_data``, ``view_zenith``,
        ``sun_zenith``, ``sun_azimuth``, ``view_azimuth`` and the ``x``,
        ``y`` and ``time`` coordinates from it.
    variable : str
        Options 'FAPAR', 'FCOVER', 'LAI', 'LAI_Cab' or 'LAI_Cw'

    Returns
    -------
    xarray dataset
        Result of ``dataset.assign`` with the computed array stored under the
        lower-cased variable name.
    """

    def _angle_layer(angle_deg, label):
        # Build one extra "band": cos(angle) broadcast against an all-ones
        # array shaped like the first band, with a trailing length-1 axis.
        # NOTE(review): the transposed array's leading axes are labelled
        # ("y", "x") below; this presumes band_data[:, 0] transposes into
        # that order -- confirm against the dataset layout.
        ones_t = np.ones_like(dataset.band_data[:, 0, :, :]).T
        cosines = np.cos(np.radians(angle_deg)).values
        layer = (ones_t * cosines)[..., np.newaxis]
        return xr.DataArray(
            layer,
            coords=[dataset.y, dataset.x, dataset.time, [label]],
            dims=["y", "x", "time", "band"],
        )

    # generate view angle bands/layers
    view_layer = _angle_layer(dataset.view_zenith, "view_zenith")
    sun_layer = _angle_layer(dataset.sun_zenith, "sun_zenith")
    rel_azimuth_layer = _angle_layer(
        dataset.sun_azimuth - dataset.view_azimuth, "relative_azimuth"
    )

    layers = [dataset.band_data, view_layer, sun_layer, rel_azimuth_layer]
    stacked = xr.concat(layers, dim="band").stack(xy=("x", "y"))

    computed = xr.apply_ufunc(
        _compute_variable,
        stacked,
        input_core_dims=[["band", "xy"]],
        output_core_dims=[["xy"]],
        kwargs={"variable": variable},
        vectorize=True,
    )
    return dataset.assign({variable.lower(): computed.unstack()})
def cross_phase(da1, da2, spacing_tol=1e-3, dim=None, detrend=None,
                window=False, chunks_to_segments=False):
    r"""
    Calculates the cross-phase between da1 and da2.

    Returned values are in [-pi, pi].

    .. math::
        da1' = da1 - \overline{da1};\ \ da2' = da2 - \overline{da2}
    .. math::
        cp = \text{Arg} [\mathbb{F}(da1') \, \mathbb{F}(da2')^*]

    Parameters
    ----------
    da1 : `xarray.DataArray`
        The data to be transformed
    da2 : `xarray.DataArray`
        The data to be transformed
    spacing_tol: float, optional
        Spacing tolerance. Fourier transform should not be applied to uneven
        grid but this restriction can be relaxed with this setting. Use
        caution.
    dim : str or sequence of str, optional
        The single dimension along which to take the real Fourier
        transformation. If `None`, the dimensions of `da1` are used; they
        must equal those of `da2` and still amount to exactly one dimension.
    detrend : str, optional
        If `constant`, the mean across the transform dimensions will be
        subtracted before calculating the Fourier transform (FT).
        If `linear`, the linear least-square fit along one axis will be
        subtracted before the FT. It will give an error if the length of
        `dim` is longer than one.
    window : bool, optional
        Whether to apply a Hann window to the data before the Fourier
        transform is taken
    chunks_to_segments : bool, optional
        Forwarded unchanged to `dft` for both inputs.

    Returns
    -------
    cp : `xarray.DataArray`
        Cross-phase as a function of frequency.

    Raises
    ------
    ValueError
        If `dim` is `None` and the two inputs have different dimensions, or
        if more than one dimension is selected.
    """
    if dim is None:
        if da1.dims != da2.dims:
            raise ValueError('The two datasets have different dimensions')
        dim = list(da1.dims)
    elif isinstance(dim, str):
        dim = [dim]
    else:
        # Accept any sequence of names (list, tuple, ...). Wrapping a tuple
        # as a single list element would slip past the length check below.
        try:
            dim = list(dim)
        except TypeError:
            # A single non-string, non-iterable dimension name.
            dim = [dim]

    if len(dim) > 1:
        raise ValueError('Cross phase calculation should only be done along '
                         'a single dimension.')

    # The transform is never shifted; the only dimension is the real one.
    dft_options = dict(dim=dim, real=dim[0], shift=False, detrend=detrend,
                       window=window, chunks_to_segments=chunks_to_segments)
    daft1 = dft(da1, spacing_tol, **dft_options)
    daft2 = dft(da2, spacing_tol, **dft_options)

    # Use the dask implementations only when both transforms are chunked.
    if daft1.chunks and daft2.chunks:
        angle, conj = dsar.angle, dsar.conj
    else:
        angle, conj = np.angle, np.conj

    def _cross_phase(a, b):
        return angle(a * conj(b))

    cp = xr.apply_ufunc(_cross_phase, daft1, daft2, dask='allowed')

    if da1.name and da2.name:
        cp.name = "{}_{}_phase".format(da1.name, da2.name)

    return cp