Example #1
 def __gt__(self, other):
     if isinstance(other, Longitude):
         if self.hemisphere == 'W':
             if other.hemisphere == 'E':
                 return False
             else:
                 return self.longitude < other.longitude
         else:
             if other.hemisphere == 'W':
                 return True
             else:
                 return self.longitude > other.longitude
     else:
         return xr.apply_ufunc(np.less, other, self)
Example #2
def aggregate_da(da, agg_dims, suf='_agg'):
    input_core_dims = list(agg_dims)
    n_agg = len(input_core_dims)
    core_block_size = tuple([agg_dims[k] for k in input_core_dims])
    block_size = (da.ndim - n_agg)*(1,) + core_block_size
    output_core_dims = [dim + suf for dim in input_core_dims]
    output_sizes = {(dim + suf): da.shape[da.get_axis_num(dim)]//agg_dims[dim] for dim in input_core_dims}
    output_dtypes = da.dtype
    da_out = xr.apply_ufunc(block_reduce, da, kwargs={'block_size': block_size},
                            input_core_dims=[input_core_dims],
                            output_core_dims=[output_core_dims],
                            output_sizes=output_sizes,
                            output_dtypes=[output_dtypes],
                            dask='parallelized')
    for dim in input_core_dims:
        new_coord = block_reduce(da[dim].data, (agg_dims[dim],), func=np.mean)
        da_out.coords[dim + suf] = (dim + suf, new_coord)
    return da_out
Example #3
def xr_moment(x, dim, order=1):
    """Calculate statistical moment of an XArray DataArray object.

    Parameters
    ----------
    x : xarray object
    dim : str or sequence of str
        Dimension(s) over which to calculate moment.
    order : int or array_like of ints, optional
        Order of central moment that is returned. Default is 1 (mean).

    Returns
    -------
    moment : Calculated moment as an XArray object.

    """
    return xr.apply_ufunc(
        moment, x, input_core_dims=[[dim]],
        kwargs={'moment': order, 'axis': -1, 'nan_policy': 'omit'},
        dask='parallelized', output_dtypes=[float]
    )
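A brief usage sketch for the function above (hypothetical data; it assumes the wrapped `moment` is `scipy.stats.moment`):

import numpy as np
import xarray as xr
from scipy.stats import moment

x = xr.DataArray(np.random.rand(100, 4), dims=["time", "station"])
# Second central moment (variance) of each station's time series
var_per_station = xr_moment(x, dim="time", order=2)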
Example #4
def destagger(xarr, dim, **kwargs):
    """Destagger an inteface located variable along a dimension

    Parameters
    ----------
    xarr : xr.Dataset
        input DataArray
    dim : str
        dimension to destagger the data along
    mode : str
        Passed to np.take

    Returns
    -------
    destaggered : xr.Dataset
        cell centered DataArray

    See Also
    --------
    numpy.take

    Examples
    --------
    >>> x = xr.DataArray(np.arange(0, 5), [('x', np.arange(0, 5))])
    >>> destagger(x, 'x')
    <xarray.DataArray (x: 5)>
    array([ 0.5,  1.5,  2.5,  3.5,  2. ])
    Coordinates:
      * x        (x) int64 0 1 2 3 4
    """
    return apply_ufunc(destagger_dask, xarr,
                       input_core_dims=[[dim]],
                       output_core_dims=[[dim]],
                       dask='parallelized',
                       output_dtypes=[xarr.dtype],
                       kwargs=kwargs)
Example #5
def smape(a, b, dim=None, weights=None, skipna=False, keep_attrs=False):
    """Symmetric Mean Absolute Percentage Error.

    .. math::
        \\mathrm{SMAPE} = \\frac{1}{n} \\sum_{i=1}^{n}
                          \\frac{ \\vert a_{i} - b_{i} \\vert }
                          { \\vert a_{i} \\vert + \\vert b_{i} \\vert  }

    .. note::
        Percent error is reported as decimal percent. I.e., a value of 1 is
        100%.

    Parameters
    ----------
    a : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
        (Truth which will be divided by)
    b : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    dim : str, list
        The dimension(s) to apply the smape along. Note that this dimension will
        be reduced as a result. Defaults to None reducing all dimensions.
    weights : xarray.Dataset or xarray.DataArray or None
        Weights matching dimensions of ``dim`` to apply during the function.
    skipna : bool
        If True, skip NaNs when computing function.
    keep_attrs : bool
        If True, the attributes (attrs) will be copied
        from the first input to the new one.
        If False (default), the new object will
        be returned without attributes.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        Symmetric Mean Absolute Percentage Error.

    References
    ----------
    https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error

    Examples
    --------
    >>> import numpy as np
    >>> import xarray as xr
    >>> from xskillscore import smape
    >>> a = xr.DataArray(np.random.rand(5, 3, 3),
    ...                  dims=['time', 'x', 'y'])
    >>> b = xr.DataArray(np.random.rand(5, 3, 3),
    ...                  dims=['time', 'x', 'y'])
    >>> smape(a, b, dim='time')
    """
    dim, axis = _preprocess_dims(dim, a)
    a, b = xr.broadcast(a, b, exclude=dim)
    weights = _preprocess_weights(a, dim, dim, weights)
    input_core_dims = _determine_input_core_dims(dim, weights)

    return xr.apply_ufunc(
        _smape,
        a,
        b,
        weights,
        input_core_dims=input_core_dims,
        kwargs={
            "axis": axis,
            "skipna": skipna
        },
        dask="parallelized",
        output_dtypes=[float],
        keep_attrs=keep_attrs,
    )
Example #6
def xr_moment(x, dim, order=1):
    return xr.apply_ufunc(
        moment, x, input_core_dims=[[dim]],
        kwargs={'moment': order, 'axis': -1, 'nan_policy': 'omit'},
        dask='parallelized', output_dtypes=[float]
    )
Example #7
# %% [markdown]
# # Calculate height range (h_range)
#
# A simple way of finding active subglacial lakes is to see where
# there has been a noticeably rapid change in elevation over
# a short period of time such as 2-5 metres a year (or ~4x91-day ICESat-2 cycles).
# 'Range of height' is a quick way to do this,
# basically just doing maximum height minus minimum height.
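
# %% [markdown]
# A rough sketch (an assumption, not the actual `deepicedrain.nanptp` source) of what
# such a NaN-aware peak-to-peak helper could look like:

# %%
import numpy as np


def nanptp_sketch(a, axis=None):
    """NaN-aware peak-to-peak: nanmax minus nanmin along the given axis."""
    return np.nanmax(a, axis=axis) - np.nanmin(a, axis=axis)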

# %%
# Calculate height range across cycles, parallelized using dask
ds["h_range"]: xr.DataArray = xr.apply_ufunc(
    deepicedrain.nanptp,  # min point to max point (range) that handles NaN values
    ds.h_corr,
    input_core_dims=[["cycle_number"]],
    dask="allowed",
    output_dtypes=[ds.h_corr.dtype],
    kwargs={"axis": 1},
)

# %%
# %%time
# Compute height range. Also include all height and time info
ds_ht: xr.Dataset = ds[["h_range", "h_corr", "delta_time"]].compute()

# %%
# Non-parallelized
# h_range = deepicedrain.nanptp(a=ds.h_corr[0:1], axis=1)
# Ensure no height range values which are zero (usually due to only 1 data point)
# assert len(dask.array.argwhere(dsh.h_range <= 0.0).compute()) == 0
Example #8
def regrid(ds, dimx, dimy, **kwargs):
    """
    Interpolate Dataset or DataArray `ds` to a new grid, using rasterio's
    reproject facility.

    See also: https://mapbox.github.io/rasterio/topics/resampling.html

    Parameters
    ----------
    ds : xr.Dataset|xr.DataArray
      N-dim data on a spatial grid
    dimx : pd.Index
      New x-coordinates in destination crs.
      dimx.name MUST refer to x-coord of ds.
    dimy : pd.Index
      New y-coordinates in destination crs.
      dimy.name MUST refer to y-coord of ds.
    **kwargs :
      Arguments passed to rio.warp.reproject; of note:
      - resampling is one of gis.Resampling.{average,cubic,bilinear,nearest}
      - src_crs, dst_crs define the different crs (default: latlong)
    """
    namex = dimx.name
    namey = dimy.name

    ds = maybe_swap_spatial_dims(ds, namex, namey)

    src_transform = _as_transform(ds.indexes[namex], ds.indexes[namey])
    dst_transform = _as_transform(dimx, dimy)
    dst_shape = len(dimy), len(dimx)

    kwargs.update(dst_shape=dst_shape,
                  src_transform=src_transform,
                  dst_transform=dst_transform)
    kwargs.setdefault("src_crs", 'longlat')
    kwargs.setdefault("dst_crs", 'longlat')

    def _reproject(src, dst_shape, **kwargs):
        dst = np.empty(src.shape[:-2] + dst_shape, dtype=src.dtype)
        rio.warp.reproject(np.asarray(src), dst, **kwargs)
        return dst

    data_vars = ds.data_vars.values() if isinstance(ds, xr.Dataset) else (ds, )
    dtypes = {da.dtype for da in data_vars}
    assert len(
        dtypes
    ) == 1, "regrid can only reproject datasets with homogeneous dtype"

    return (xr.apply_ufunc(_reproject,
                           ds,
                           input_core_dims=[[namey, namex]],
                           output_core_dims=[['yout', 'xout']],
                           output_dtypes=[dtypes.pop()],
                           output_sizes={
                               'yout': dst_shape[0],
                               'xout': dst_shape[1]
                           },
                           dask='parallelized',
                           kwargs=kwargs).rename({
                               'yout': namey,
                               'xout': namex
                           }).assign_coords(
                               **{
                                   namey: (namey, dimy,
                                           ds.coords[namey].attrs),
                                   namex: (namex, dimx, ds.coords[namex].attrs)
                               }).assign_attrs(**ds.attrs))
Example #9
    def second_derivative(self, dim):
        """Compute second derivative with the 4th order accurate centered scheme.

        It is fully functional with all boundary conditions available on
        Xcompact3d and stretched mesh in y direction.
        The **attribute** ``BC`` is used to store Boundary Condition information
        in a dictionary (see examples), default is ``ncl1 = ncln = 2`` and
        ``npaire = 1``.

        Parameters
        ----------
        dim : str
            Coordinate used for the derivative.

        Returns
        -------
        :obj:`xarray.DataArray`
            **differentiated**

        Examples
        --------

        >>> da.attrs['BC'] = {
        ...     'x': {
        ...         'ncl1': 1,
        ...         'ncln': 1,
        ...         'npaire': 0
        ...     },
        ...     'y': {
        ...         'ncl1': 2,
        ...         'ncln': 1,
        ...         'npaire': 1,
        ...         'istret': 0,
        ...         'beta': 1.0
        ...     },
        ...     'z': {
        ...         'ncl1': 0,
        ...         'ncln': 0,
        ...         'npaire': 1
        ...     }
        ... }
        >>> da.x3d.second_derivative('x')

        Notes
        -----
        The **attribute** ``BC`` is automatically defined for ``ux``, ``uy``,
        ``uz``, ``pp`` and ``phi`` when read from the disc with
        :obj:`xcompact3d_toolbox.io.readfield` or initialized at
        :obj:`xcompact3d_toolbox.sendbox.init_dataset`.
        """
        if dim not in self._Dxx:
            try:
                ncl1 = self._data_array.attrs["BC"][dim]["ncl1"]
                ncln = self._data_array.attrs["BC"][dim]["ncln"]
                npaire = self._data_array.attrs["BC"][dim]["npaire"]
            except KeyError:
                ncl1, ncln, npaire = 2, 2, 1

            n = self._data_array[dim].size
            m = n if ncl1 == 0 and ncln == 0 else n - 1
            d = (self._data_array[dim][-1] -
                 self._data_array[dim][0]).values / m
            self._Dxx[dim] = SecondDerivative(n, d, ncl1, ncln, npaire)

        try:
            istret = self._data_array.attrs["BC"][dim]["istret"]
            beta = self._data_array.attrs["BC"][dim]["beta"]
        except KeyError:
            istret = 0
            beta = 1.0

        if istret == 0:

            return xr.apply_ufunc(
                lambda f: self._Dxx[dim].dot(f),
                self._data_array,
                input_core_dims=[[dim]],
                output_core_dims=[[dim]],
                dask="parallelized",
                vectorize=True,
                output_dtypes=[param["mytype"]],
            )

        else:

            yly = (self._data_array[dim][-1] - self._data_array[dim][0]).values

            yp, ppy, pp2y, pp4y = stretching(istret, beta, yly, m, n)

            da_pp2y = xr.DataArray(pp2y,
                                   coords=[self._data_array[dim]],
                                   name="pp2y")
            da_pp4y = xr.DataArray(pp4y,
                                   coords=[self._data_array[dim]],
                                   name="pp4y")

            return da_pp2y * xr.apply_ufunc(
                lambda f: self._Dxx[dim].dot(f),
                self._data_array,
                input_core_dims=[[dim]],
                output_core_dims=[[dim]],
                dask="parallelized",
                vectorize=True,
                output_dtypes=[param["mytype"]],
            ) - da_pp4y * self._data_array.x3d.first_derivative(dim)
Example #10
def xr_linregress(x, y, dim="time"):
    """Calculates linear regression along dimension `dim`.
    Results are equivalent to `scipy.stats.linregress`.

    Parameters
    ----------
    x : {xr.DataArray}
        Independent variable for linear regression. E.g. time.
    y : {xr.DataArray, xr.Dataset}
        Dependent variable.
    dim : str
        Dimension over which to perform linear regression.
        Must be present in both `x` and `y` (the default is 'time').

    Returns
    -------
    xr.Dataset
        Returns a dataset containing the parameter values
        for each data variable in `y`. The naming convention
        follows `scipy.stats.linregress`.

    """
    # align the nan Values before...
    x = x.where(~np.isnan(y))
    y = y.where(~np.isnan(x))
    # TODO: think about making this optional? Right now I err on the side of caution

    # Inspired by this post https://stackoverflow.com/a/60352716 but adjusted, so that
    # results are exactly as with scipy.stats.linregress for 1d vectors.

    n = y.notnull().sum(dim)

    nanmask = np.isnan(y).all(dim)

    xmean = x.mean(dim)
    ymean = y.mean(dim)
    xstd = x.std(dim)
    ystd = y.std(dim)

    cov = ((x - xmean) * (y - ymean)).sum(dim) / (n)
    cor = cov / (xstd * ystd)

    slope = cov / (xstd**2)
    intercept = ymean - xmean * slope

    df = n - 2
    TINY = 1.0e-20
    tstats = cor * np.sqrt(df / ((1.0 - cor + TINY) * (1.0 + cor + TINY)))
    stderr = slope / tstats

    pval = (xr.apply_ufunc(
        stats.distributions.t.sf,
        abs(tstats),
        df,
        dask="parallelized",
        output_dtypes=[y.dtype],
    ) * 2)

    return xr.Dataset({
        "slope": slope,
        "intercept": intercept,
        "r_value": cor.fillna(0).where(~nanmask),
        "p_value": pval,
        "std_err": stderr.where(~np.isinf(stderr), 0),
    })
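A short usage sketch with synthetic data (hypothetical names; it assumes `scipy.stats` is imported as `stats`, as the snippet above requires):

import numpy as np
import xarray as xr
from scipy import stats

time = xr.DataArray(np.arange(50.0), dims="time")
noise = xr.DataArray(np.random.randn(50), dims="time")
y = 2.0 * time + 1.0 + noise
fit = xr_linregress(time, y, dim="time")
print(float(fit.slope), float(fit.intercept), float(fit.p_value))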
Example #11
def seeds_init(
    varr: xr.DataArray,
    wnd_size=500,
    method="rolling",
    stp_size=200,
    nchunk=100,
    max_wnd=10,
    diff_thres=2,
):
    """
    Generate over-complete set of seeds by finding local maxima across frames.

    This function computes the maximum intensity projection of a subset of
    frames and finds the local maxima. The subsetting uses either a rolling
    window or random sampling of frames. `wnd_size`, `stp_size` and `nchunk`
    control different aspects of the subsetting. `max_wnd` and `diff_thres`
    control how local maxima are computed. The set of all local maxima found in
    this process constitutes an over-complete set of seeds, representing
    putative locations of cells.

    Parameters
    ----------
    varr : xr.DataArray
        Input movie data. Should have dimensions "frame", "height" and "width".
    wnd_size : int, optional
        Number of frames in each chunk, for which a max projection will be
        calculated. By default `500`.
    method : str, optional
        Either `"rolling"` or `"random"`. Controls whether to use rolling window
        or random sampling of frames to construct chunks. By default
        `"rolling"`.
    stp_size : int, optional
        Number of frames between the center of each chunk when stepping through
        the data with rolling windows. Only used if `method is "rolling"`. By
        default `200`.
    nchunk : int, optional
        Number of chunks to sample randomly. Only used if `method is "random"`.
        By default `100`.
    max_wnd : int, optional
        Radius (in pixels) of the disk window used for computing local maxima.
        Local maxima are defined as pixels with maximum intensity in such a
        window. By default `10`.
    diff_thres : int, optional
        Intensity threshold for the difference between a local maximum and its
        neighbours. Any local maximum that is not brighter than its neighbours
        (defined by the same disk window) by `diff_thres` intensity values will
        be filtered out. By default `2`.

    Returns
    -------
    seeds : pd.DataFrame
        Seeds dataframe with each seed as a row. Has columns "height" and "width"
        which are the locations of the seeds. Also has a column "seeds" which is an
        integer counting in how many chunks the seed is considered a local
        maximum.
    """
    int_path = os.environ["MINIAN_INTERMEDIATE"]
    print("constructing chunks")
    idx_fm = varr.coords["frame"]
    nfm = len(idx_fm)
    if method == "rolling":
        nstp = np.ceil(nfm / stp_size) + 1
        centers = np.linspace(0, nfm - 1, int(nstp))
        hwnd = np.ceil(wnd_size / 2)
        max_idx = list(
            map(
                lambda c: slice(int(np.floor(c - hwnd).clip(0)),
                                int(np.ceil(c + hwnd))),
                centers,
            ))
    elif method == "random":
        max_idx = [
            np.random.randint(0, nfm - 1, wnd_size) for _ in range(nchunk)
        ]
    print("computing max projections")
    res = [max_proj_frame(varr, cur_idx) for cur_idx in max_idx]
    max_res = xr.concat(res, "sample")
    max_res = save_minian(max_res.rename("max_res"), int_path, overwrite=True)
    print("calculating local maximum")
    loc_max = xr.apply_ufunc(
        local_max_roll,
        max_res,
        input_core_dims=[["height", "width"]],
        output_core_dims=[["height", "width"]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[np.uint8],
        kwargs=dict(k0=2, k1=max_wnd, diff=diff_thres),
    ).sum("sample")
    seeds = (loc_max.where(
        loc_max > 0).rename("seeds").to_dataframe().dropna().reset_index())
    return seeds[["height", "width", "seeds"]]
Example #12
 def __ge__(self, other):
     if isinstance(other, Longitude):
         return self > other or self == other
     else:
         return xr.apply_ufunc(np.less_equal, other, self)
Example #13
def effective_sample_size(a, b, dim, skipna=False):
    """Effective sample size for temporally correlated data.

    .. note::
        This metric should only be applied over the time dimension,
        since it is designed for temporal autocorrelation. Weights
        are not included due to the reliance on temporal
        autocorrelation.

    The effective sample size extracts the number of independent samples
    between two time series being correlated. This is derived by assessing
    the magnitude of the lag-1 autocorrelation coefficient in each of the time series
    being correlated. A higher autocorrelation induces a lower effective sample
    size which raises the correlation coefficient for a given p value.

     .. math::
        N_{eff} = N\\left( \\frac{1 -
                   \\rho_{f}\\rho_{o}}{1 + \\rho_{f}\\rho_{o}} \\right),

    where :math:`\\rho_{f}` and :math:`\\rho_{o}` are the lag-1 autocorrelation
    coefficients for the forecast and observations.

    Parameters
    ----------
    a : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    b : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    dim : str, list
        The dimension(s) to apply the function along.
    skipna : bool
        If True, skip NaNs when computing function.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        Effective sample size.

    References
    ----------
    * Bretherton, Christopher S., et al. "The effective number of spatial degrees of
      freedom of a time-varying field." Journal of climate 12.7 (1999): 1990-2009.
    * Wilks, Daniel S. Statistical methods in the atmospheric sciences. Vol. 100.
      Academic press, 2011.

    """
    dim, _ = _preprocess_dims(dim)
    if len(dim) > 1:
        raise ValueError(
            'Effective sample size should only be applied to a singular time dimension.'
        )
    else:
        new_dim = dim[0]
    if new_dim != 'time':
        warnings.warn(
            f"{dim} is not 'time'. Make sure that you are applying this over a "
            f'temporal dimension.')

    return xr.apply_ufunc(
        _effective_sample_size,
        a,
        b,
        input_core_dims=[[new_dim], [new_dim]],
        kwargs={
            'axis': -1,
            'skipna': skipna
        },
        dask='parallelized',
        output_dtypes=[float],
    )
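For reference, the formula above reduces to roughly the following numpy sketch (an assumption about what `_effective_sample_size` computes, not its actual source; NaN handling is omitted):

import numpy as np

def _effective_sample_size_sketch(a, b, axis=-1):
    """N_eff = N * (1 - rho_f * rho_o) / (1 + rho_f * rho_o), using lag-1 autocorrelations."""
    def lag1_autocorr(x):
        x = np.moveaxis(x, axis, -1)
        x0 = x[..., :-1] - x[..., :-1].mean(axis=-1, keepdims=True)
        x1 = x[..., 1:] - x[..., 1:].mean(axis=-1, keepdims=True)
        return (x0 * x1).mean(axis=-1) / (x0.std(axis=-1) * x1.std(axis=-1))

    n = a.shape[axis]
    rho_f, rho_o = lag1_autocorr(a), lag1_autocorr(b)
    return n * (1 - rho_f * rho_o) / (1 + rho_f * rho_o)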
Example #14
def fuzzy_where(cond: xr.DataArray, x, y, join="left") -> xr.DataArray:
    from xarray.core import duck_array_ops
    return xr.apply_ufunc(duck_array_ops.where, cond, x, y, join=join, dataset_join=join, dask="allowed")
Example #15
 def __call__(self, correlations, n):
     return xr.apply_ufunc(lambda correlation: self.correct(correlation, n),
                           correlations)
Example #16
    return find_ridges_spherical_hessian(x,
                                         sigma=4,
                                         tolerance_threshold=0.0015e-3,
                                         scheme='second_order',
                                         return_eigvectors=True)[return_idx]


def func_to_apply(x):
    ridge = x.groupby('time').apply(find_ridges)
    ridge = ridge.where(x > 1.2)
    return ridge


for i, filename in enumerate(file_list[61:]):
    print(1)

    #  print('*--- Reading file {} of {} ---*'.format(i, filename))
    outname = filename.split('/')[-1]
    da = xr.open_dataarray(filename, chunks={'time': 60})
    da = da.sortby('time')
    da = .5 * xr.apply_ufunc(np.log, da, dask='allowed')
    print(da)
    with ProgressBar():
        ridges = da.map_blocks(func_to_apply,
                               template=da).compute(scheduler='processes',
                                                    num_workers=4)

    print('Calculation end.')
    print('Writing output.')
    ridges.to_netcdf(outpath + outname)
Example #17
def compute_dataset(cube_func: CubeFunc,
                    *input_cubes: xr.Dataset,
                    input_cube_schema: CubeSchema = None,
                    input_var_names: Sequence[str] = None,
                    input_params: Dict[str, Any] = None,
                    output_var_name: str = 'output',
                    output_var_dims: AbstractSet[str] = None,
                    output_var_dtype: Any = np.float64,
                    output_var_attrs: Dict[str, Any] = None,
                    vectorize: bool = None,
                    cube_asserted: bool = False) -> xr.Dataset:
    """
    Compute a new output dataset with a single variable named *output_var_name*
    from variables named *input_var_names* contained in zero, one, or more
    input data cubes in *input_cubes* using a cube factory function *cube_func*.

    *cube_func* is called concurrently for each of the chunks of the input variables.
    It is expected to return a chunk block which is of type ``np.ndarray``.

    If *input_cubes* is not empty, *cube_func* receives variables as specified by *input_var_names*.
    If *input_cubes* is empty, *input_var_names* must be empty too, and *input_cube_schema*
    must be given, so that a new cube can be created.

    The full signature of *cube_func* is:::

        def cube_func(*input_vars: np.ndarray,
                      input_params: Dict[str, Any] = None,
                      dim_coords: Dict[str, np.ndarray] = None,
                      dim_ranges: Dict[str, Tuple[int, int]] = None) -> np.ndarray:
            pass

    The arguments are:

    * ``input_vars``: the variables according to the given *input_var_names*;
    * ``input_params``: is this call's *input_params*, a mapping from parameter name to value;
    * ``dim_coords``: a mapping from dimension names to the current chunk's coordinate arrays;
    * ``dim_ranges``: a mapping from dimension names to the current chunk's index ranges.

    Only the ``input_vars`` argument is mandatory. The keyword arguments
    ``input_params``, ``dim_coords``, ``dim_ranges`` do not need to be present at all.

    *output_var_dims* may be given in the case where ...
    TODO: describe new output_var_dims...

    :param cube_func: The cube factory function.
    :param input_cubes: An optional sequence of input cube datasets, must be provided if *input_cube_schema* is not.
    :param input_cube_schema: An optional input cube schema, must be provided if *input_cubes* is not.
    :param input_var_names: A sequence of variable names
    :param input_params: Optional dictionary with processing parameters passed to *cube_func*.
    :param output_var_name: Optional name of the output variable, defaults to ``'output'``.
    :param output_var_dims: Optional set of names of the output dimensions,
        used in the case *cube_func* reduces dimensions.
    :param output_var_dtype: Optional numpy datatype of the output variable, defaults to ``np.float64``.
    :param output_var_attrs: Optional metadata attributes for the output variable.
    :param vectorize: Whether all *input_cubes* have the same variables which are concatenated and passed as vectors
        to *cube_func*. Not implemented yet.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new dataset that contains the computed output variable.
    """
    if vectorize is not None:
        # TODO: support vectorize = all cubes have same variables and cube_func
        #       receives variables as vectors (with extra dim)
        raise NotImplementedError('vectorize is not supported yet')

    if not cube_asserted:
        for cube in input_cubes:
            assert_cube(cube)

    # Check compatibility of inputs
    if input_cubes:
        input_cube_schema = CubeSchema.new(input_cubes[0])
        for cube in input_cubes:
            if not cube_asserted:
                assert_cube(cube)
            if cube != input_cubes[0]:
                # noinspection PyUnusedLocal
                other_schema = CubeSchema.new(cube)
                # TODO (forman): broadcast all cubes to same shape, rechunk to same chunks
    elif input_cube_schema is None:
        raise ValueError('input_cube_schema must be given')

    output_var_name = output_var_name or 'output'

    # Collect named input variables, raise if not found
    input_var_names = input_var_names or []
    input_vars = []
    for var_name in input_var_names:
        input_var = None
        for cube in input_cubes:
            if var_name in cube.data_vars:
                input_var = cube[var_name]
                break
        if input_var is None:
            raise ValueError(
                f'variable {var_name!r} not found in any of cubes')
        input_vars.append(input_var)

    # Find out, if cube_func uses any of _PREDEFINED_KEYWORDS
    has_input_params, has_dim_coords, has_dim_ranges = _inspect_cube_func(
        cube_func, input_var_names)

    def cube_func_wrapper(index_chunk, *input_var_chunks):
        nonlocal input_cube_schema, input_var_names, input_params, input_vars
        nonlocal has_input_params, has_dim_coords, has_dim_ranges

        # Note, xarray.apply_ufunc does a test call with empty input arrays,
        # so index_chunk.size == 0 is a valid case
        empty_call = index_chunk.size == 0

        # TODO: when output_var_dims is given, index_chunk must be reordered
        #   as core dimensions are moved to the end of index_chunk and input_var_chunks
        if not empty_call:
            index_chunk = index_chunk.ravel()

        if index_chunk.size < 2 * input_cube_schema.ndim:
            if not empty_call:
                warnings.warn(
                    f"unexpected index_chunk of size {index_chunk.size} received!"
                )
                return None

        dim_ranges = None
        if has_dim_ranges or has_dim_coords:
            dim_ranges = {}
            for i in range(input_cube_schema.ndim):
                dim_name = input_cube_schema.dims[i]
                if not empty_call:
                    start = int(index_chunk[2 * i + 0])
                    end = int(index_chunk[2 * i + 1])
                    dim_ranges[dim_name] = start, end
                else:
                    dim_ranges[dim_name] = ()

        dim_coords = None
        if has_dim_coords:
            dim_coords = {}
            for coord_var_name, coord_var in input_cube_schema.coords.items():
                coord_slices = [slice(None)] * coord_var.ndim
                for i in range(input_cube_schema.ndim):
                    dim_name = input_cube_schema.dims[i]
                    if dim_name in coord_var.dims:
                        j = coord_var.dims.index(dim_name)
                        coord_slices[j] = slice(*dim_ranges[dim_name])
                dim_coords[coord_var_name] = coord_var[tuple(
                    coord_slices)].values

        kwargs = {}
        if has_input_params:
            kwargs['input_params'] = input_params
        if has_dim_ranges:
            kwargs['dim_ranges'] = dim_ranges
        if has_dim_coords:
            kwargs['dim_coords'] = dim_coords

        return cube_func(*input_var_chunks, **kwargs)

    index_var = _gen_index_var(input_cube_schema)

    all_input_vars = [index_var] + input_vars

    input_core_dims = None
    if output_var_dims:
        input_core_dims = []
        has_warned = False
        for i in range(len(all_input_vars)):
            input_var = all_input_vars[i]
            var_core_dims = [
                dim for dim in input_var.dims if dim not in output_var_dims
            ]
            must_rechunk = False
            if var_core_dims and input_var.chunks:
                for var_core_dim in var_core_dims:
                    dim_index = input_var.dims.index(var_core_dim)
                    dim_chunk_size = input_var.chunks[dim_index][0]
                    dim_shape_size = input_var.shape[dim_index]
                    if dim_chunk_size != dim_shape_size:
                        must_rechunk = True
                        break
            if must_rechunk:
                if not has_warned:
                    warnings.warn(
                        f'Input variables must not be chunked in dimension(s): {", ".join(var_core_dims)}.\n'
                        f'Rechunking applies, which may drastically decrease runtime performance '
                        f'and increase memory usage.')
                    has_warned = True
                all_input_vars[i] = input_var.chunk(
                    {var_core_dim: -1
                     for var_core_dim in var_core_dims})
            input_core_dims.append(var_core_dims)

    output_var = xr.apply_ufunc(cube_func_wrapper,
                                *all_input_vars,
                                dask='parallelized',
                                input_core_dims=input_core_dims,
                                output_dtypes=[output_var_dtype])
    if output_var_attrs:
        output_var.attrs.update(output_var_attrs)
    return xr.Dataset({output_var_name: output_var},
                      coords=input_cube_schema.coords)
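A minimal sketch of a *cube_func* that satisfies the signature described in the docstring above (the variable names and the `scale` parameter are hypothetical):

import numpy as np

def scaled_product(precip: np.ndarray, temp: np.ndarray, input_params=None) -> np.ndarray:
    """Hypothetical cube_func: multiply two input variables chunk-wise and apply a scale factor."""
    scale = (input_params or {}).get("scale", 1.0)
    return scale * precip * temp

# Hypothetical call, assuming `cube` is a valid input cube with variables "precip" and "temp":
# result = compute_dataset(scaled_product, cube,
#                          input_var_names=["precip", "temp"],
#                          input_params={"scale": 0.5},
#                          output_var_name="scaled_product")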
Example #18
 def __eq__(self, other):
     if isinstance(other, Longitude):
         return (self.hemisphere == other.hemisphere and
                 self.longitude == other.longitude)
     else:
         return xr.apply_ufunc(np.equal, other, self)
Example #19
    'SpeciesConc_NO2',
    'SpeciesConc_HNO3',
    'SpeciesConc_PAN',
    'SpeciesConc_CO',
    'SpeciesConc_CH2O',
    'SpeciesConc_SO2',
    'SpeciesConc_NH3',
]
spc_mmr = spc_mmr[relevant_spc]
spc_mmr = drop_non_dmmr_variables(spc_mmr)
spc_mol = dmmr_to_moles(spc_mmr, mass_dry_air=met.Met_AD)

print("Calculating tropospheric totals ...")
tropo_total = xr.apply_ufunc(total_below,
                             spc_mol,
                             met.Met_TropP,
                             met.Met_PS1WET,
                             input_core_dims=[['lev'], [], []],
                             vectorize=True)
global_tropo_total = tropo_total.sum(dim=('nf', 'Ydim', 'Xdim'))
global_tropo_total = global_tropo_total.rename(
    {old_name: f'Global{old_name}'
     for old_name in global_tropo_total.keys()})

# strat_total = xr.apply_ufunc(
#     total_below,
#     spc_mol, met.Met_TropP, met.Met_PS1WET, kwargs=dict(above_instead=True),
#     input_core_dims=[['lev'], [], []],
#     vectorize=True
# )

print("Subsetting budget dataset ...")
Example #20
def spearman_r_eff_p_value(a, b, dim, skipna=False):
    """
    2-tailed p-value associated with Spearman rank correlation coefficient,
    accounting for autocorrelation.

    .. note::
        This metric should only be applied over the time dimension,
        since it is designed for temporal autocorrelation. Weights
        are not included due to the reliance on temporal
        autocorrelation.

    The effective p value is computed by replacing the sample size :math:`N` in the
    t-statistic with the effective sample size, :math:`N_{eff}`. The same Spearman's
    rank correlation coefficient :math:`r` is used as when computing the standard p
    value.

    .. math::
        t = r\\sqrt{ \\frac{N_{eff} - 2}{1 - r^{2}} },

    where :math:`N_{eff}` is computed via the autocorrelation in the forecast and
    observations.

    .. math::
        N_{eff} = N\\left( \\frac{1 -
                   \\rho_{f}\\rho_{o}}{1 + \\rho_{f}\\rho_{o}} \\right),

    where :math:`\\rho_{f}` and :math:`\\rho_{o}` are the lag-1 autocorrelation
    coefficients for the forecast and observations.

    Parameters
    ----------
    a : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    b : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    dim : str, list
        The dimension(s) to compute the p value over.
    skipna : bool
        If True, skip NaNs when computing function.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        2-tailed p-value of Spearman's correlation coefficient, accounting for
        autocorrelation.

    References
    ----------
    * Bretherton, Christopher S., et al. "The effective number of spatial degrees of
      freedom of a time-varying field." Journal of climate 12.7 (1999): 1990-2009.
    * Wilks, Daniel S. Statistical methods in the atmospheric sciences. Vol. 100.
      Academic press, 2011.

    See Also
    --------
    xarray.apply_ufunc
    scipy.stats.spearmanr
    xskillscore.core.np_deterministic._spearman_r_eff_p_value

    """
    dim, _ = _preprocess_dims(dim)
    if len(dim) > 1:
        raise ValueError(
            'Effective sample size should only be applied to a singular time dimension.'
        )
    else:
        new_dim = dim[0]
    if new_dim != 'time':
        warnings.warn(
            f"{dim} is not 'time'. Make sure that you are applying this over a "
            f'temporal dimension.')

    return xr.apply_ufunc(
        _spearman_r_eff_p_value,
        a,
        b,
        input_core_dims=[[new_dim], [new_dim]],
        kwargs={
            'axis': -1,
            'skipna': skipna
        },
        dask='parallelized',
        output_dtypes=[float],
    )
Example #21
 def __le__(self, other):
     if isinstance(other, Longitude):
         return self < other or self == other
     else:
         return xr.apply_ufunc(np.greater_equal, other, self)
Example #22
def gmm_refine(
    varr: xr.DataArray,
    seeds: pd.DataFrame,
    q=(0.1, 99.9),
    n_components=2,
    valid_components=1,
    mean_mask=True,
) -> Tuple[pd.DataFrame, xr.DataArray, GaussianMixture]:
    """
    Filter seeds by fitting a GMM to peak-to-peak values.

    This function assumes that the distribution of peak-to-peak values of
    fluorescence across all seeds can be modeled by a Gaussian Mixture Model
    (GMM) with different means. It computes the peak-to-peak value for all the
    seeds, then fits a GMM with `n_components` to the distribution, and filters
    out the seeds belonging to the `n_components - valid_components` Gaussians
    with lower means.

    Parameters
    ----------
    varr : xr.DataArray
        The input movie data. Should have dimensions "spatial" and "frame".
    seeds : pd.DataFrame
        The input over-complete set of seeds to be filtered.
    q : tuple, optional
        Percentile to use to compute the peak-to-peak values. For a given seed
        with corresponding fluorescent fluctuation `f`, the peak-to-peak value
        for that seed is computed as `np.percentile(f, q[1]) - np.percentile(f,
        q[0])`. By default `(0.1, 99.9)`.
    n_components : int, optional
        Number of components (Gaussians) in the GMM model. By default `2`.
    valid_components : int, optional
        Number of components (Gaussians) to be considered as modeling the
        distribution of peak-to-peak values of valid seeds. Should be smaller
        than `n_components`. By default `1`.
    mean_mask : bool, optional
        Whether to apply an additional criterion where a seed is valid only if its
        peak-to-peak value exceeds the mean of the lowest Gaussian distribution.
        Only useful in corner cases where the Gaussian distributions heavily
        overlap. By default `True`.

    Returns
    -------
    seeds : pd.DataFrame
        The resulting seeds dataframe with an additional column "mask_gmm",
        indicating whether the seed is considered valid by this function. If the
        column already exists in input `seeds` it will be overwritten.
    varr_pv : xr.DataArray
        The computed peak-to-peak values for each seed.
    gmm : GaussianMixture
        The fitted GMM model object.

    See Also
    --------
    sklearn.mixture.GaussianMixture
    """
    print("selecting seeds")
    varr_sub = varr.sel(
        spatial=[tuple(hw) for hw in seeds[["height", "width"]].values])
    print("computing peak-valley values")
    varr_valley = xr.apply_ufunc(
        np.percentile,
        varr_sub.chunk(dict(frame=-1)),
        input_core_dims=[["frame"]],
        kwargs=dict(q=q[0], axis=-1),
        dask="parallelized",
        output_dtypes=[varr_sub.dtype],
    )
    varr_peak = xr.apply_ufunc(
        np.percentile,
        varr_sub.chunk(dict(frame=-1)),
        input_core_dims=[["frame"]],
        kwargs=dict(q=q[1], axis=-1),
        dask="parallelized",
        output_dtypes=[varr_sub.dtype],
    )
    varr_pv = varr_peak - varr_valley
    varr_pv = varr_pv.compute()
    print("fitting GMM models")
    dat = varr_pv.values.reshape(-1, 1)
    gmm = GaussianMixture(n_components=n_components)
    gmm.fit(dat)
    idg = np.argsort(gmm.means_.reshape(-1))[-valid_components:]
    idx_valid = np.isin(gmm.predict(dat), idg)
    if mean_mask:
        idx_mean = dat > np.sort(gmm.means_)[0]
        idx_valid = np.logical_and(idx_mean.squeeze(), idx_valid)
    seeds["mask_gmm"] = idx_valid
    return seeds, varr_pv, gmm
Example #23
def compute_aggregates(perf_da, baseline_ds):
    """Aggregate function evaluations in the experiments to get performance summaries of each method.

    Parameters
    ----------
    perf_da : :class:`xarray:xarray.DataArray`
        Aggregate experimental results with each function evaluation in the experiments. `perf_da` has dimensions
        ``(ITER, SUGGEST, TEST_CASE, METHOD, TRIAL)`` and is assumed to have no nan values.
    baseline_ds : :class:`xarray:xarray.Dataset`
        Dataset with baseline performance. It has variables ``(PERF_MED, PERF_MEAN, PERF_CLIP, PERF_BEST)`` with
        dimensions ``(ITER, TEST_CASE)``, ``(ITER, TEST_CASE)``, ``(TEST_CASE,)``, and ``(TEST_CASE,)``, respectively.
        `PERF_MED` is a baseline of performance based on random search when using medians to summarize performance.
        Likewise, `PERF_MEAN` is for means. `PERF_CLIP` is an upper bound to clip poor performance when using the mean.
        `PERF_BEST` is an estimate on the global minimum.

    Returns
    -------
    agg_result : :class:`xarray:xarray.Dataset`
        Dataset with summary of performance for each method and test case combination. Contains variables:
        ``(PERF_MED, LB_MED, UB_MED, NORMED_MED, PERF_MEAN, LB_MEAN, UB_MEAN, NORMED_MEAN)``
        each with dimensions ``(ITER, METHOD, TEST_CASE)``. `PERF_MED` is a median summary of performance with `LB_MED`
        and `UB_MED` as error bars. `NORMED_MED` is a rescaled `PERF_MED` so we expect the optimal performance is 0,
        and random search gives 1 at all `ITER`. Likewise, `PERF_MEAN`, `LB_MEAN`, `UB_MEAN`, `NORMED_MEAN` are for
        mean performance.
    summary : :class:`xarray:xarray.Dataset`
        Dataset with overall summary of performance of each method. Contains variables
        ``(PERF_MED, LB_MED, UB_MED, PERF_MEAN, LB_MEAN, UB_MEAN)``
        each with dimensions ``(ITER, METHOD)``.
    """
    validate_agg_perf(perf_da, min_trial=1)

    assert isinstance(baseline_ds, xr.Dataset)
    assert tuple(baseline_ds[PERF_BEST].dims) == (TEST_CASE,)
    assert tuple(baseline_ds[PERF_CLIP].dims) == (TEST_CASE,)
    assert tuple(baseline_ds[PERF_MED].dims) == (ITER, TEST_CASE)
    assert tuple(baseline_ds[PERF_MEAN].dims) == (ITER, TEST_CASE)
    assert xru.coord_compat((perf_da, baseline_ds), (ITER, TEST_CASE))
    assert not any(np.any(np.isnan(baseline_ds[kk].values)) for kk in baseline_ds)

    # Now actually get the aggregate performance numbers per test case
    agg_result = xru.ds_like(
        perf_da,
        (PERF_MED, LB_MED, UB_MED, NORMED_MED, PERF_MEAN, LB_MEAN, UB_MEAN, NORMED_MEAN),
        (ITER, METHOD, TEST_CASE),
    )
    baseline_mean_da = xru.only_dataarray(xru.ds_like(perf_da, ["ref"], (ITER, TEST_CASE)))
    # Using values here since just clearer to get raw items than xr object for func_name
    for func_name in perf_da.coords[TEST_CASE].values:
        rand_perf_med = baseline_ds[PERF_MED].sel({TEST_CASE: func_name}, drop=True).values
        rand_perf_mean = baseline_ds[PERF_MEAN].sel({TEST_CASE: func_name}, drop=True).values
        best_opt = baseline_ds[PERF_BEST].sel({TEST_CASE: func_name}, drop=True).values
        base_clip_val = baseline_ds[PERF_CLIP].sel({TEST_CASE: func_name}, drop=True).values

        assert np.all(np.diff(rand_perf_med) <= 0), "Baseline should be decreasing with iteration"
        assert np.all(np.diff(rand_perf_mean) <= 0), "Baseline should be decreasing with iteration"
        assert np.all(rand_perf_med > best_opt)
        assert np.all(rand_perf_mean > best_opt)
        assert np.all(rand_perf_mean <= base_clip_val)

        baseline_mean_da.loc[{TEST_CASE: func_name}] = linear_rescale(
            rand_perf_mean, best_opt, base_clip_val, 0.0, 1.0, enforce_bounds=False
        )
        for method_name in perf_da.coords[METHOD].values:
            # Take the minimum over all suggestion at given iter + sanity check perf_da
            curr_da = perf_da.sel({METHOD: method_name, TEST_CASE: func_name}, drop=True).min(dim=SUGGEST)
            assert curr_da.dims == (ITER, TRIAL)

            # Want to evaluate minimum so far during optimization
            perf_array = np.minimum.accumulate(curr_da.values, axis=0)

            # Compute median perf and CI on it
            med_perf, LB, UB = qt.quantile_and_CI(perf_array, EVAL_Q, alpha=ALPHA)
            assert med_perf.shape == rand_perf_med.shape
            agg_result[PERF_MED].loc[{TEST_CASE: func_name, METHOD: method_name}] = med_perf
            agg_result[LB_MED].loc[{TEST_CASE: func_name, METHOD: method_name}] = LB
            agg_result[UB_MED].loc[{TEST_CASE: func_name, METHOD: method_name}] = UB

            # Now store normed version, which is better for aggregation
            normed = linear_rescale(med_perf, best_opt, rand_perf_med, 0.0, 1.0, enforce_bounds=False)
            agg_result[NORMED_MED].loc[{TEST_CASE: func_name, METHOD: method_name}] = normed

            # Compute mean perf and CI on it
            perf_array = np.minimum(base_clip_val, perf_array)
            mean_perf = np.mean(perf_array, axis=1)
            assert mean_perf.shape == rand_perf_mean.shape
            EB = t_EB(perf_array, alpha=ALPHA, axis=1)
            assert EB.shape == rand_perf_mean.shape
            agg_result[PERF_MEAN].loc[{TEST_CASE: func_name, METHOD: method_name}] = mean_perf
            agg_result[LB_MEAN].loc[{TEST_CASE: func_name, METHOD: method_name}] = mean_perf - EB
            agg_result[UB_MEAN].loc[{TEST_CASE: func_name, METHOD: method_name}] = mean_perf + EB

            # Now store normed version, which is better for aggregation
            normed = linear_rescale(mean_perf, best_opt, base_clip_val, 0.0, 1.0, enforce_bounds=False)
            agg_result[NORMED_MEAN].loc[{TEST_CASE: func_name, METHOD: method_name}] = normed
    assert not any(np.any(np.isnan(agg_result[kk].values)) for kk in agg_result)

    # Compute summary score over all test cases, summarize performance of each method
    summary = xru.ds_like(
        perf_da,
        (PERF_MED, LB_MED, UB_MED, PERF_MEAN, LB_MEAN, UB_MEAN, NORMED_MEAN, LB_NORMED_MEAN, UB_NORMED_MEAN),
        (ITER, METHOD),
    )
    summary[PERF_MED], summary[LB_MED], summary[UB_MED] = xr.apply_ufunc(
        qt.quantile_and_CI,
        agg_result[NORMED_MED],
        input_core_dims=[[TEST_CASE]],
        kwargs={"q": EVAL_Q, "alpha": ALPHA},
        output_core_dims=[[], [], []],
    )

    summary[PERF_MEAN] = agg_result[NORMED_MEAN].mean(dim=TEST_CASE)
    EB = xr.apply_ufunc(t_EB, agg_result[NORMED_MEAN], input_core_dims=[[TEST_CASE]])
    summary[LB_MEAN] = summary[PERF_MEAN] - EB
    summary[UB_MEAN] = summary[PERF_MEAN] + EB

    normalizer = baseline_mean_da.mean(dim=TEST_CASE)
    summary[NORMED_MEAN] = summary[PERF_MEAN] / normalizer
    summary[LB_NORMED_MEAN] = summary[LB_MEAN] / normalizer
    summary[UB_NORMED_MEAN] = summary[UB_MEAN] / normalizer

    assert all(tuple(summary[kk].dims) == (ITER, METHOD) for kk in summary)
    return agg_result, summary
Example #24
def pnr_refine(
    varr: xr.DataArray,
    seeds: pd.DataFrame,
    noise_freq=0.25,
    thres: Union[float, str] = 1.5,
    q=(0.1, 99.9),
    med_wnd: Optional[int] = None,
) -> Tuple[pd.DataFrame, xr.DataArray, Optional[GaussianMixture]]:
    """
    Filter seeds by thresholding peak-to-noise ratio.

    For each input seed, the noise is defined as the high-pass filtered
    fluorescence trace of the seed. The peak-to-noise ratio (pnr) of that seed
    is then defined as the ratio between the peak-to-peak value of the original
    fluorescence trace and that of the noise trace. Optionally, if abrupt
    changes in baseline fluorescence are expected, then the baseline can be
    estimated by median-filtering the fluorescence trace and subtracted from the
    original trace before computing the peak-to-noise ratio. In addition, if a
    hard threshold of pnr is not desired, then a Gaussian Mixture Model with 2
    components can be fitted to the distribution of pnr across all seeds, and
    only seeds with pnr belonging to the higher-mean Gaussian will be considered
    valid.

    Parameters
    ----------
    varr : xr.DataArray
        Input movie data, should have dimensions "height", "width" and "frame".
    seeds : pd.DataFrame
        The input over-complete set of seeds to be filtered.
    noise_freq : float, optional
        Cut-off frequency for the high-pass filter used to define noise,
        specified as fraction of sampling frequency. By default `0.25`.
    thres : Union[float, str], optional
        Threshold of the peak-to-noise ratio. If `"auto"` then a :class:`GMM
        <sklearn.mixture.GaussianMixture>` will be fit to the distribution of
        pnr. By default `1.5`.
    q : tuple, optional
        Percentile to use to compute the peak-to-peak values. For a given
        fluorescence fluctuation `f`, the peak-to-peak value for that seed is
        computed as `np.percentile(f, q[1]) - np.percentile(f, q[0])`. By
        default `(0.1, 99.9)`.
    med_wnd : int, optional
        Size of the median filter window to remove baseline. If `None` then no
        filtering will be done. By default `None`.

    Returns
    -------
    seeds : pd.DataFrame
        The resulting seeds dataframe with an additional column "mask_pnr",
        indicating whether the seed is considered valid by this function. If the
        column already exists in input `seeds` it will be overwritten.
    pnr : xr.DataArray
        The computed peak-to-noise ratio for each seed.
    gmm : GaussianMixture, optional
        The GMM model object fitted to the distribution of pnr. Will be `None`
        unless `thres` is `"auto"`.
    """
    print("selecting seeds")
    # vectorized indexing on dask arrays produces a single chunk.
    # to avoid memory issues, split seeds into 128 chunks, with chunk size no greater than 100
    chk_size = min(int(len(seeds) / 128), 100)
    vsub_ls = []
    for _, seed_sub in seeds.groupby(np.arange(len(seeds)) // chk_size):
        vsub = varr.sel(height=seed_sub["height"].to_xarray(),
                        width=seed_sub["width"].to_xarray())
        vsub_ls.append(vsub)
    varr_sub = xr.concat(vsub_ls, "index")
    if med_wnd:
        print("removing baseline")
        varr = xr.apply_ufunc(
            med_baseline,
            varr_sub,
            input_core_dims=[["frame"]],
            output_core_dims=[["frame"]],
            dask="parallelized",
            kwargs={"wnd": med_wnd},
            vectorize=True,
            output_dtypes=[varr.dtype],
        )
    print("computing peak-noise ratio")
    pnr = xr.apply_ufunc(
        pnr_perseed,
        varr_sub,
        input_core_dims=[["frame"]],
        output_core_dims=[[]],
        kwargs={
            "freq": noise_freq,
            "q": q
        },
        vectorize=True,
        dask="parallelized",
        output_dtypes=[float],
    ).compute()
    if thres == "auto":
        gmm = GaussianMixture(n_components=2)
        gmm.fit(np.nan_to_num(pnr.values.reshape(-1, 1)))
        idg = np.argsort(gmm.means_.reshape(-1))[-1]
        idx_valid = np.isin(gmm.predict(pnr.values.reshape(-1, 1)), idg)
        seeds["mask_pnr"] = idx_valid
    else:
        mask = pnr > thres
        mask_df = mask.to_pandas().rename("mask_pnr")
        seeds["mask_pnr"] = mask_df
        gmm = None
    return seeds, pnr, gmm
Example #25
def ensemble_percentiles(
    ens: xr.Dataset,
    values: Tuple[int, int, int] = (10, 50, 90),
    keep_chunk_size: Optional[bool] = None,
) -> xr.Dataset:
    """Calculate ensemble statistics between a results from an ensemble of climate simulations.

    Returns a Dataset containing ensemble percentiles for input climate simulations.

    Parameters
    ----------
    ens: xr.Dataset
      Ensemble dataset (see xclim.ensembles.create_ensemble).
    values : Tuple[int, int, int]
      Percentile values to calculate. Default: (10, 50, 90).
    keep_chunk_size : Optional[bool]
      For ensembles using dask arrays, all chunks along the 'realization' axis are merged.
      If True, the dataset is rechunked along the dimension with the largest chunks, so that the chunks keep the same size (approx).
      If False, no shrinking is performed, resulting in much larger chunks.
      If not defined, the function decides which is best.

    Returns
    -------
    xr.Dataset
      Dataset containing data variables of the requested ensemble statistics.

    Examples
    --------
    >>> from xclim import ensembles
    >>> import glob
    >>> ncfiles = glob.glob('/*tas*.nc')
    Create ensemble dataset
    >>> ens = ensembles.create_ensemble(ncfiles)
    Calculate default ensemble percentiles
    >>> ens_percs = ensembles.ensemble_percentiles(ens)
    >>> print(ens_percs['tas_p10'])
    Calculate non-default percentiles (25th and 75th)
    >>> ens_percs = ensembles.ensemble_percentiles(ens, values=(25, 50, 75))
    >>> print(ens_percs['tas_p25'])
    If the original array has many small chunks, it might be more efficient to do:
    >>> ens_percs = ensembles.ensemble_percentiles(ens, keep_chunk_size=False)
    >>> print(ens_percs['tas_p25'])
    """

    ds_out = xr.Dataset(attrs=ens.attrs)
    for v in ens.data_vars:
        # Percentile calculation forbids any chunks along realization
        if len(ens.chunks.get("realization", [])) > 1:
            if keep_chunk_size is None:
                # Enable smart rechunking if chunk size exceeds 2E8 elements after merging along realization
                keep_chunk_size = (
                    np.prod(ens[v].isel(realization=0).data.chunksize) *
                    ens.realization.size > 2e8)
            if keep_chunk_size:
                # Smart rechunk on dimension where chunks are the largest
                chkDim, chks = max(
                    ens.chunks.items(),
                    key=lambda kv: 0 if kv[0] == "realization" else max(kv[1]),
                )
                var = ens[v].chunk({
                    "realization": -1,
                    chkDim: len(chks) * ens.realization.size
                })
            else:
                var = ens[v].chunk({"realization": -1})
        else:
            var = ens[v]

        for p in values:
            perc = xr.apply_ufunc(
                _calc_perc,
                var,
                input_core_dims=[["realization"]],
                output_core_dims=[[]],
                keep_attrs=True,
                kwargs=dict(p=p),
                dask="parallelized",
                output_dtypes=[ens[v].dtype],
            )

            perc.name = f"{v}_p{p:02d}"
            ds_out[perc.name] = perc

            if "description" in ds_out[perc.name].attrs:
                ds_out[perc.name].attrs[
                    "description"] = f"{ds_out[perc.name].attrs['description']} : {p}th percentile of ensemble"
            else:
                ds_out[perc.name].attrs[
                    "description"] = f"{p}th percentile of ensemble"

    ds_out.attrs["history"] = update_history(
        f"Computation of the percentiles on {ens.realization.size} ensemble members.",
        ds_out,
    )
    return ds_out
Example #26
def vector_norm(x, dim, ord=None):
    return xr.apply_ufunc(
        np.linalg.norm, x, input_core_dims=[[dim]], kwargs={"ord": ord, "axis": -1}
    )
Example #27
ds = xr.open_dataset(data_path + '/' + name_file)
var = ds[name_var]

#bounds for calculation
lat_s = 0.
lat_n = 60.
lon_w = 285.
lon_e = 352.5
weights = np.cos(ds.lat * np.pi / 180.).sel(lat=slice(lat_s, lat_n))
amo_index = var.sel(lat=slice(lat_s, lat_n), lon=slice(
    lon_w, lon_e)).weighted(weights).mean(dim=['lat', 'lon'])

climatology_mean = amo_index.groupby("time.month").mean('time')
anomalies = xr.apply_ufunc(
    lambda x, m: (x - m),
    amo_index.groupby("time.month"),
    climatology_mean,
)

fig = plt.figure(1, figsize=(9, 4))
ax = fig.add_subplot(111)
ax.plot(np.linspace(1979, 2018, 480), anomalies, lw=1, color='k')
ax.axhline(0., color='k')
ax.fill_between(np.linspace(1979, 2018, 480),
                anomalies,
                where=anomalies > 0.,
                facecolor='red',
                alpha=0.7)
ax.fill_between(np.linspace(1979, 2018, 480),
                anomalies,
                where=anomalies < 0.,
Example #28
def detrend(da, dim, detrend_type="constant"):
    """
    Detrend a DataArray

    Parameters
    ----------
    da : xarray.DataArray
        The data to detrend
    dim : str or list
        Dimensions along which to apply detrend.
        Can be either one dimension or a list with two dimensions.
        Higher-dimensional detrending is not supported.
        If dask data are passed, the data must be chunked along dim.
    detrend_type : {'constant', 'linear'}
        If ``constant``, a constant offset will be removed from each dim.
        If ``linear``, a linear least-squares fit will be estimated and removed
        from the data.

    Returns
    -------
    da : xarray.DataArray
        The detrended data.

    Notes
    -----
    This function will act lazily in the presence of dask arrays on the
    input.
    """

    if dim is None:
        dim = list(da.dims)
    else:
        if isinstance(dim, str):
            dim = [dim]

    if detrend_type not in ["constant", "linear", None]:
        raise NotImplementedError(
            "%s is not a valid detrending option. Valid "
            "options are: 'constant','linear', or None." % detrend_type
        )

    if detrend_type is None:
        return da
    elif detrend_type == "constant":
        return da - da.mean(dim=dim)
    elif detrend_type == "linear":
        data = da.data
        axis_num = [da.get_axis_num(d) for d in dim]
        chunks = getattr(data, "chunks", None)
        if chunks:
            axis_chunks = [data.chunks[a] for a in axis_num]
            if not all([len(ac) == 1 for ac in axis_chunks]):
                raise ValueError("Contiguous chunks required for detrending.")
        if len(dim) == 1:
            dt = xr.apply_ufunc(
                sps.detrend,
                da,
                axis_num[0],
                output_dtypes=[da.dtype],
                dask="parallelized",
            )
        elif len(dim) == 2:
            dt = xr.apply_ufunc(
                _detrend_2d_ufunc,
                da,
                input_core_dims=[dim],
                output_core_dims=[dim],
                output_dtypes=[da.dtype],
                vectorize=True,
                dask="parallelized",
            )
        else:  # pragma: no cover
            raise NotImplementedError(
                "Only 1D and 2D detrending are implemented so far."
            )

    return dt
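
A minimal usage sketch of the function above, assuming scipy.signal is imported as sps as in the snippet: linearly detrending a noisy ramp leaves residuals with near-zero mean.

# Hypothetical usage of detrend() on an in-memory DataArray.
import numpy as np
import xarray as xr

t = np.arange(100, dtype=float)
da = xr.DataArray(2.5 * t + np.random.randn(100), dims=["time"])
resid = detrend(da, "time", detrend_type="linear")
print(float(resid.mean()))  # approximately 0 once the trend is removed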
Beispiel #29
0
def bounds_to_vertices(
    bounds: DataArray,
    bounds_dim: str,
    core_dims=None,
    order: Optional[str] = "counterclockwise",
) -> DataArray:
    """
    Convert a bounds variable to vertices. There are two covered cases:
     - 1D coordinates, with bounds of shape (N, 2),
       converted to vertices of shape (N+1,)
     - 2D coordinates, with bounds of shape (N, M, 4),
       converted to vertices of shape (N+1, M+1).

    Parameters
    ----------
    bounds : DataArray
        The bounds to convert.
    bounds_dim : str
        The name of the bounds dimension of `bounds` (the one of length 2 or 4).
    order : {'counterclockwise', 'clockwise', None}
        Valid for 2D coordinates only (i.e. bounds of shape (..., N, M, 4)), ignored otherwise.
        Order the bounds are given in, assuming that ax0-ax1-upward is a right handed
        coordinate system, where ax0 and ax1 are the two first dimensions of `bounds`.
        If None, the counterclockwise version is computed and then verified. If the
        check fails the clockwise version is returned. See Notes for more details.
    core_dims : list, optional
        List of core dimensions for apply_ufunc. This must not include bounds_dim.
        The shape of (*core_dims, bounds_dim) must be (N, 2) or (N, M, 4).

    Returns
    -------
    DataArray
        Either of shape (N+1,) or (N+1, M+1). New vertex dimensions are named
        from the initial dimension and suffix "_vertices".

    Notes
    -----
    Getting the correct axes "order" is tricky. There are no real standards for
    dimension names or even axes order, even though the CF conventions mentions the
    ax0-ax1-upward (counterclockwise bounds) as being the default. Moreover, xarray can
    transpose data without raising any warning or error, which makes attributes
    unreliable.

    Please refer to the CF conventions document : http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#cell-boundaries.
    """

    if core_dims is None:
        core_dims = [dim for dim in bounds.dims if dim != bounds_dim]

    output_sizes = {
        f"{dim}_vertices": bounds.sizes[dim] + 1
        for dim in core_dims
    }
    output_core_dims = list(output_sizes.keys())

    n_core_dims = len(core_dims)
    nbounds = bounds[bounds_dim].size

    if not (n_core_dims == 2 and nbounds == 4) and not (n_core_dims == 1
                                                        and nbounds == 2):
        raise ValueError(
            f"Bounds format not understood. Got {bounds.dims} with shape {bounds.shape}."
        )

    return xr.apply_ufunc(
        _bounds_helper,
        bounds,
        input_core_dims=[core_dims + [bounds_dim]],
        dask="parallelized",
        kwargs={
            "n_core_dims": n_core_dims,
            "nbounds": nbounds,
            "order": order
        },
        output_core_dims=[output_core_dims],
        dask_gufunc_kwargs=dict(output_sizes=output_sizes),
        output_dtypes=[bounds.dtype],
    )
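
For intuition, the 1-D case reduces to simple concatenation when the bounds are contiguous; a plain NumPy sketch (not the _bounds_helper used above):

# Idea sketch only: contiguous 1-D bounds of shape (N, 2) -> N+1 vertices.
import numpy as np

bounds = np.array([[0.0, 1.0], [1.0, 2.0], [2.0, 3.0]])    # (N=3, 2)
vertices = np.concatenate([bounds[:, 0], bounds[-1:, 1]])  # left edges + last right edge
print(vertices)  # [0. 1. 2. 3.]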
Beispiel #30
0
def _resample_iterations_idx(
    init, iterations, dim="member", replace=True, chunk=True, dim_max=None
):
    """Resample over ``dim`` by index ``iterations`` times.

    .. note::
        This is a much faster way to bootstrap than resampling each iteration
        individually and applying the function to it. However, this will create a
        DataArray with dimension ``iteration`` of size ``iterations``. It is probably
        best to do this out-of-memory with ``dask`` if you are doing a large number
        of iterations or using spatial output (i.e., not time series data).

    Args:
        init (xr.DataArray, xr.Dataset): Initialized prediction ensemble.
        iterations (int): Number of bootstrapping iterations.
        dim (str): Dimension name to bootstrap over. Defaults to ``'member'``.
        replace (bool): Bootstrapping with or without replacement. Defaults to ``True``.
        chunk (bool): Auto-chunk along chunking_dims to get an optimal block size. Defaults to ``True``.
        dim_max (int): Number of indices from `dim` to return. Not implemented.

    Returns:
        xr.DataArray, xr.Dataset: Bootstrapped data with additional dimension ``iteration``.

    """
    if dask.is_dask_collection(init):
        init = init.chunk({"lead": -1, "member": -1})
        init = init.copy(deep=True)

    def select_bootstrap_indices_ufunc(x, idx):
        """Selects multi-level indices ``idx`` from xarray object ``x`` for all
        iterations."""
        # `apply_ufunc` sometimes adds a singleton dimension on the end, so we squeeze
        # it out here. This leverages multi-level indexing from numpy, so we can
        # select a different set of, e.g., ensemble members for each iteration and
        # construct one large DataArray with ``iterations`` as a dimension.
        return np.moveaxis(x.squeeze()[idx.squeeze().transpose()], 0, -1)

    if dask.is_dask_collection(init):
        if chunk:
            chunking_dims = [d for d in init.dims if d not in CLIMPRED_DIMS]
            init = _chunk_before_resample_iterations_idx(
                init, iterations, chunking_dims
            )

    # resample with or without replacement
    if replace:
        idx = np.random.randint(0, init[dim].size, (iterations, init[dim].size))
    elif not replace:
        # create 2d np.arange()
        idx = np.linspace(
            (np.arange(init[dim].size)),
            (np.arange(init[dim].size)),
            iterations,
            dtype="int",
        )
        # shuffle each line
        for ndx in np.arange(iterations):
            np.random.shuffle(idx[ndx])
    idx_da = xr.DataArray(
        idx,
        dims=("iteration", dim),
        coords=({"iteration": range(iterations), dim: init[dim]}),
    )
    transpose_kwargs = (
        {"transpose_coords": False} if isinstance(init, xr.DataArray) else {}
    )
    return xr.apply_ufunc(
        select_bootstrap_indices_ufunc,
        init.transpose(dim, ..., **transpose_kwargs),
        idx_da,
        dask="parallelized",
        output_dtypes=[float],
    )
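
The speed of this approach comes from plain NumPy fancy indexing: a single (iteration, member) index array selects a different set of members for every iteration at once. A toy illustration:

# Toy version of the indexing done in select_bootstrap_indices_ufunc.
import numpy as np

members, iterations, ntime = 5, 3, 4
data = np.random.rand(members, ntime)                       # (member, time)
idx = np.random.randint(0, members, (iterations, members))  # one resample per iteration
resampled = data[idx]                                       # (iteration, member, time)
print(resampled.shape)  # (3, 5, 4)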
Beispiel #31
0
def open_nwm_dataset(paths: list,
                     chunks: dict = None,
                     attrs_keep: list = [
                         'featureType', 'proj4', 'station_dimension',
                         'esri_pe_string', 'Conventions', 'model_version'
                     ],
                     spatial_indices: list = None,
                     drop_variables: list = None,
                     npartitions: int = None,
                     profile: int = False) -> xr.Dataset:

    if profile:
        then = timesince()

    # This is totally arbitrary but seems to work OK.
    if npartitions is None:
        npartitions = dask.config.get('pool')._processes * 4
    # This choice does not seem to work well or at all, error?
    # npartitions = len(sorted(paths))
    paths_bag = dask.bag.from_sequence(paths, npartitions=npartitions)

    if profile:
        then = timesince(then)
        print('after paths_bag')

    ds_list = paths_bag.map(
        preprocess_nwm_data,
        chunks=chunks,
        spatial_indices=spatial_indices,
        drop_variables=drop_variables).filter(is_not_none).compute()

    if profile:
        then = timesince(then)
        print("after ds_list preprocess/filter")

    # Group by and merge by choices
    have_members = 'member' in ds_list[0].coords
    if have_members:
        group_list = [group_member_lead_time, group_lead_time]
        merge_list = [merge_reference_time, merge_member]
    else:
        group_list = [group_lead_time]
        merge_list = [merge_reference_time]

    for group, merge in zip(group_list, merge_list):

        if profile:
            then = timesince(then)
            print('before sort')

        the_sort = sorted(ds_list, key=group)

        if profile:
            then = timesince(then)
            print('after sort, before group')

        ds_groups = [list(it) for k, it in itertools.groupby(the_sort, group)]

        if profile:
            then = timesince(then)
            print('after group, before merge')

        # npartitions = len(ds_groups)
        group_bag = dask.bag.from_sequence(ds_groups, npartitions=npartitions)
        ds_list = group_bag.map(merge).compute()

        if profile:
            then = timesince(then)
            print('after merge')

        del group_bag, ds_groups, the_sort

    nwm_dataset = merge_lead_time(ds_list)
    del ds_list

    # Create a valid_time variable.
    def calc_valid_time(ref, lead):
        return np.datetime64(int(ref) + int(lead), 'ns')

    nwm_dataset['valid_time'] = xr.apply_ufunc(calc_valid_time,
                                               nwm_dataset['reference_time'],
                                               nwm_dataset['lead_time'],
                                               vectorize=True)

    # Xarray sets nan as the fill value when there is none. Don't allow that.
    for key, val in nwm_dataset.variables.items():
        if '_FillValue' not in nwm_dataset[key].encoding:
            nwm_dataset[key].encoding.update({'_FillValue': None})

    # Clean up attributes
    new_attrs = collections.OrderedDict()
    if attrs_keep is not None:
        for key, value in nwm_dataset.attrs.items():
            if key in attrs_keep:
                new_attrs[key] = nwm_dataset.attrs[key]

    nwm_dataset.attrs = new_attrs

    # Break into chunked dask array
    if chunks is not None:
        nwm_dataset = nwm_dataset.chunk(chunks=chunks)

    return nwm_dataset
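
A hypothetical call (the paths, chunk sizes and partition count below are placeholders, and the preprocess/merge helpers must be importable alongside the function):

# Hypothetical usage; file list and chunking are placeholders.
from glob import glob

paths = sorted(glob('/data/nwm/short_range/*channel_rt*.nc'))
ds = open_nwm_dataset(paths, chunks={'feature_id': 10000}, npartitions=8)
print(ds['valid_time'])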
Beispiel #32
0
    def _adapt_freq_group(ds, dim=["time"]):
        if isinstance(ds.sim.data, dsk.Array):
            # In order to be efficient and lazy, some classical numpy ops will be replaced by dask's version
            mod = dsk
            kws = {"chunks": ds.sim.chunks}
        else:
            mod = np
            kws = {}

        # Compute the probability of finding a value <= thresh
        # This is the "dry-day frequency" in the precipitation case
        P0_sim = ecdf(ds.sim, thresh, dim=dim)
        P0_ref = ecdf(ds.ref, thresh, dim=dim)

        # The proportion of values <= thresh in sim that need to be corrected, compared to ref
        dP0 = (P0_sim - P0_ref) / P0_sim

        # Compute : ecdf_ref^-1( ecdf_sim( thresh ) )
        # The value in ref with the same rank as the first non-zero value in sim.
        pth = xr.apply_ufunc(
            np.nanpercentile,
            ds.ref,
            P0_sim *
            100,  # np.percentile takes values in [0, 100], ecdf outputs in [0, 1]
            input_core_dims=[dim, []],
            dask="parallelized",
            vectorize=True,
            output_dtypes=[ds.ref.dtype],
        ).where(
            dP0 > 0)  # pth is meaningless when freq. adaptation is not needed

        if "window" in ds.sim.dims:
            # P0_sim was computed using the window, but only the original timeseries is corrected.
            sim = ds.sim.isel(window=(ds.sim.window.size - 1) // 2)
            dim = [dim[0]]
        else:
            sim = ds.sim

        # Get the percentile rank of each value in sim.
        # da.rank() doesn't work with dask arrays.
        rank = (xr.apply_ufunc(
            lambda da: np.argsort(np.argsort(da, axis=-1), axis=-1),
            sim,
            input_core_dims=[dim],
            output_core_dims=[dim],
            dask="parallelized",
            output_dtypes=[sim.dtype],
        ) / sim.notnull().sum(dim=dim))

        # Frequency-adapted sim
        sim_ad = sim.where(
            dP0 < 0,  # dP0 < 0 means no-adaptation.
            sim.where(
                (rank < P0_ref) | (rank > P0_sim),  # Preserve current values
                # Generate random numbers ~ U[thresh, pth]
                (pth.broadcast_like(sim) - thresh) *
                mod.random.random_sample(size=sim.shape, **kws) + thresh,
            ),
        )

        # Set some metadata
        sim_ad.attrs.update(ds.sim.attrs)
        pth.attrs["long_name"] = (
            "Smallest value of the timeseries not corrected by frequency adaptation."
        )
        dP0.attrs["long_name"] = (
            f"Proportion of values smaller than {thresh} in the timeseries corrected by frequency adaptation."
        )

        # Tell group_apply that these will need reshaping (regrouping)
        # This is needed since if any variable comes out a groupby with the original group axis, the whole output is broadcasted back to the original dims.
        pth.attrs["_group_apply_reshape"] = True
        dP0.attrs["_group_apply_reshape"] = True
        return xr.Dataset(data_vars={"pth": pth, "dP0": dP0, "sim_ad": sim_ad})
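
The double argsort used for the ranks is the usual replacement for da.rank(), which (as the comment above notes) does not work with dask arrays. A quick NumPy check of the trick:

# np.argsort(np.argsort(x)) gives the 0-based rank of each element of x.
import numpy as np

x = np.array([0.3, 0.1, 0.7, 0.5])
print(np.argsort(np.argsort(x)))  # [1 0 3 2]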
Beispiel #33
0
from scipy.signal import wiener


def smooth(da, k=3):
    if len(da.shape) == 1:
        raise Exception("'Smooth' does not currently operate on 1D timeseries")
    da = da.transpose("y", "x", "time")
    # Filter the array handed in by apply_ufunc, not the closed-over `da`.
    func = lambda arr, k: wiener(arr, (1, 1, k))
    return xr.apply_ufunc(func, da, k, dask='allowed')
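
A minimal usage sketch with a made-up (y, x, time) cube; the Wiener window only runs along time because the kernel size is (1, 1, k):

# Hypothetical usage of smooth() above.
import numpy as np
import xarray as xr

cube = xr.DataArray(np.random.rand(4, 5, 30), dims=("y", "x", "time"))
smoothed = smooth(cube, k=5)
print(smoothed.dims)  # ('y', 'x', 'time')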
Beispiel #34
0
def median_absolute_error(a, b, dim=None, skipna=False, keep_attrs=False):
    """
    Median Absolute Error.

    .. math::
        \\mathrm{median}(\\vert a - b\\vert)

    Parameters
    ----------
    a : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    b : xarray.Dataset or xarray.DataArray
        Labeled array(s) over which to apply the function.
    dim : str, list
        The dimension(s) to apply the median absolute error along.
        Note that this dimension will be reduced as a result.
        Defaults to None, reducing all dimensions.
    skipna : bool
        If True, skip NaNs when computing function.
    keep_attrs : bool
        If True, the attributes (attrs) will be copied
        from the first input to the new one.
        If False (default), the new object will
        be returned without attributes.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        Median Absolute Error.

    See Also
    --------
    sklearn.metrics.median_absolute_error

    Examples
    --------
    >>> import numpy as np
    >>> import xarray as xr
    >>> from xskillscore import median_absolute_error
    >>> a = xr.DataArray(np.random.rand(5, 3, 3),
    ...                  dims=['time', 'x', 'y'])
    >>> b = xr.DataArray(np.random.rand(5, 3, 3),
    ...                  dims=['time', 'x', 'y'])
    >>> median_absolute_error(a, b, dim='time')
    """
    dim, axis = _preprocess_dims(dim, a)
    a, b = xr.broadcast(a, b, exclude=dim)

    return xr.apply_ufunc(
        _median_absolute_error,
        a,
        b,
        input_core_dims=[dim, dim],
        kwargs={
            "axis": axis,
            "skipna": skipna
        },
        dask="parallelized",
        output_dtypes=[float],
        keep_attrs=keep_attrs,
    )
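
The private helper _median_absolute_error is not shown in this snippet; a plausible sketch of what it must do (an assumption, not the actual xskillscore code) is a (nan)median of the absolute differences along the reduced axis:

# Assumed shape of the wrapped helper (hypothetical, for illustration only).
import numpy as np

def _median_absolute_error(a, b, axis=None, skipna=False):
    medianfunc = np.nanmedian if skipna else np.median
    return medianfunc(np.abs(a - b), axis=axis)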
Beispiel #35
0
def ensemble_percentiles(
    ens: Union[xr.Dataset, xr.DataArray],
    values: Sequence[int] = (10, 50, 90),
    keep_chunk_size: Optional[bool] = None,
    split: bool = True,
) -> xr.Dataset:
    """Calculate ensemble statistics between a results from an ensemble of climate simulations.

    Returns a Dataset containing ensemble percentiles for input climate simulations.

    Parameters
    ----------
    ens: Union[xr.Dataset, xr.DataArray]
      Ensemble dataset or dataarray (see xclim.ensembles.create_ensemble).
    values : Sequence[int]
      Percentile values to calculate. Default: (10, 50, 90).
    keep_chunk_size : Optional[bool]
      For ensembles using dask arrays, all chunks along the 'realization' axis are merged.
      If True, the dataset is rechunked along the dimension with the largest chunks, so that the chunks keep approximately the same size.
      If False, no shrinking is performed, resulting in much larger chunks.
      If not defined, the function decides which is best.
    split : bool
      Whether to split each percentile into a new variable or to concatenate the output along a new
      "percentiles" dimension.

    Returns
    -------
    Union[xr.Dataset, xr.DataArray]
      A dataset containing the requested ensemble percentiles if split is True; otherwise, the
      same type as ens with a new "percentiles" dimension.

    Examples
    --------
    >>> from xclim.ensembles import create_ensemble, ensemble_percentiles

    Create ensemble dataset:

    >>> ens = create_ensemble(temperature_datasets)

    Calculate default ensemble percentiles:

    >>> ens_percs = ensemble_percentiles(ens)

    Calculate non-default percentiles (25th and 75th)

    >>> ens_percs = ensemble_percentiles(ens, values=(25, 50, 75))

    If the original array has many small chunks, it might be more efficient to do:

    >>> ens_percs = ensemble_percentiles(ens, keep_chunk_size=False)
    """
    if isinstance(ens, xr.Dataset):
        out = xr.merge(
            [
                ensemble_percentiles(
                    da, values, keep_chunk_size=keep_chunk_size, split=split
                )
                for da in ens.data_vars.values()
                if "realization" in da.dims
            ]
        )
        out.attrs.update(ens.attrs)
        out.attrs["xclim_history"] = update_history(
            f"Computation of the percentiles on {ens.realization.size} ensemble members.",
            ens,
        )

        return out

    # Percentile calculation forbids any chunks along realization
    if ens.chunks and len(ens.chunks[ens.get_axis_num("realization")]) > 1:
        if keep_chunk_size is None:
            # Enable smart rechunking if chunk size exceeds 2e8 elements after merging along realization
            keep_chunk_size = (
                np.prod(ens.isel(realization=0).data.chunksize) * ens.realization.size
                > 2e8
            )
        if keep_chunk_size:
            # Smart rechunk on dimension where chunks are the largest
            chk_dim, chks = max(
                enumerate(ens.chunks),
                key=lambda kv: 0
                if kv[0] == ens.get_axis_num("realization")
                else max(kv[1]),
            )
            ens = ens.chunk(
                {"realization": -1, ens.dims[chk_dim]: len(chks) * ens.realization.size}
            )
        else:
            ens = ens.chunk({"realization": -1})

    out = xr.apply_ufunc(
        _calc_perc,
        ens,
        input_core_dims=[["realization"]],
        output_core_dims=[["percentiles"]],
        keep_attrs=True,
        kwargs=dict(p=values),
        dask="parallelized",
        output_dtypes=[ens.dtype],
        dask_gufunc_kwargs=dict(output_sizes={"percentiles": len(values)}),
    )

    out = out.assign_coords(
        percentiles=xr.DataArray(list(values), dims=("percentiles",))
    )

    if split:
        out = out.to_dataset(dim="percentiles")
        for p, perc in out.data_vars.items():
            perc.attrs.update(ens.attrs)
            perc.attrs["description"] = (
                perc.attrs.get("description", "") + f" {p}th percentile of ensemble."
            )
            out[p] = perc
            out = out.rename(name_dict={p: f"{ens.name}_p{int(p):02d}"})

    out.attrs["xclim_history"] = update_history(
        f"Computation of the percentiles on {ens.realization.size} ensemble members.",
        ens,
    )

    return out
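
_calc_perc is likewise not shown; a plausible sketch (an assumption, not xclim's actual implementation) for the list-of-percentiles call used here computes nan-aware percentiles over the collapsed realization axis and returns them as a trailing axis, matching output_core_dims=[['percentiles']]:

# Assumed behaviour of _calc_perc for a sequence of percentiles (hypothetical).
import numpy as np

def _calc_perc(arr, p=(10, 50, 90)):
    out = np.nanpercentile(arr, p, axis=-1)  # (len(p),) + arr.shape[:-1]
    return np.moveaxis(out, 0, -1)           # arr.shape[:-1] + (len(p),)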
Beispiel #36
0
import numpy as np
import xarray as xr

da = xr.DataArray(np.random.rand(1000, 100))
da = da.rename({'dim_0': 'rows_a'})
db = xr.DataArray(np.random.rand(1000, 100))
db = db.rename({'dim_0': 'rows_b'})

def print_shape(a):
    print(a.shape)
    return np.zeros(shape=(a.shape[0]))

def print_two_shapes(a, b):
    print(a.shape)
    print(b.shape)
    return np.zeros(shape=(a.shape[0], b.shape[0]))

print('\nprint_shape')
xr.apply_ufunc(
    print_shape,
    da,
    input_core_dims=[['dim_1']]
)

print('\nprint_two_shapes')
xr.apply_ufunc(
    print_two_shapes,
    da, db,
    input_core_dims=[['dim_1'], ['dim_1']]
)
Beispiel #37
0
def wrap_xarray_ufunc(
    ufunc,
    *datasets,
    ufunc_kwargs=None,
    func_args=None,
    func_kwargs=None,
    dask_kwargs=None,
    **kwargs,
):
    """Wrap make_ufunc with xarray.apply_ufunc.

    Parameters
    ----------
    ufunc : callable
    datasets : xarray.Dataset
    ufunc_kwargs : dict
        Keyword arguments passed to `make_ufunc`.
            - 'n_dims', int, by default 2
            - 'n_output', int, by default 1
            - 'n_input', int, by default len(datasets)
            - 'index', slice, by default Ellipsis
            - 'ravel', bool, by default True
    func_args : tuple
        Arguments passed to 'ufunc'.
    func_kwargs : dict
        Keyword arguments passed to 'ufunc'.
            - 'out_shape', int, by default None
    dask_kwargs : dict
        Dask related kwargs passed to :func:`xarray:xarray.apply_ufunc`.
        Use :meth:`~arviz.Dask.enable_dask` to set default kwargs.
    **kwargs
        Passed to xarray.apply_ufunc.

    Returns
    -------
    xarray.Dataset
    """
    if ufunc_kwargs is None:
        ufunc_kwargs = {}
    ufunc_kwargs.setdefault("n_input", len(datasets))
    if func_args is None:
        func_args = tuple()
    if func_kwargs is None:
        func_kwargs = {}
    if dask_kwargs is None:
        dask_kwargs = {}

    kwargs.setdefault(
        "input_core_dims",
        tuple(
            ("chain", "draw") for _ in range(len(func_args) + len(datasets))))
    ufunc_kwargs.setdefault("n_dims", len(kwargs["input_core_dims"][-1]))
    kwargs.setdefault(
        "output_core_dims",
        tuple([] for _ in range(ufunc_kwargs.get("n_output", 1))))

    callable_ufunc = make_ufunc(ufunc, **ufunc_kwargs)

    return apply_ufunc(callable_ufunc,
                       *datasets,
                       *func_args,
                       kwargs=func_kwargs,
                       **dask_kwargs,
                       **kwargs)
Beispiel #38
0
def pearson_correlation(x, y, dim):
    return xr.apply_ufunc(
        pearson_correlation_gufunc, x, y,
        input_core_dims=[[dim], [dim]],
        dask='parallelized',
        output_dtypes=[float])
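
pearson_correlation_gufunc is not defined in this snippet; a common definition (and an assumption here) works on the last axis so that apply_ufunc can map it over the core dimension:

# Assumed gufunc-style helpers operating on the last axis (hypothetical).
def covariance_gufunc(x, y):
    return ((x - x.mean(axis=-1, keepdims=True))
            * (y - y.mean(axis=-1, keepdims=True))).mean(axis=-1)

def pearson_correlation_gufunc(x, y):
    return covariance_gufunc(x, y) / (x.std(axis=-1) * y.std(axis=-1))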
Beispiel #39
0
# For converting to a 10m height for use with WN
def logu(u, zin=40, zout=10):
    z0 = 0.01
    newu = u * np.log(zout / z0) / np.log(zin / z0)
    return newu


# In[ ]:

gem = xr.open_mfdataset("out.nc", combine='by_coords')

# In[ ]:

# The GEM winds were output at 40 m; convert to a 10 m wind speed
gem['u10'] = xr.apply_ufunc(
    logu, gem['u'],
    dask='allowed')  # the GEM-CHM file has u @ 40m reference height

# Zonal and meridional components
gem['U10'] = -gem['u10'] * np.sin(gem['vw_dir'] * np.pi / 180.)
gem['V10'] = -gem['u10'] * np.cos(gem['vw_dir'] * np.pi / 180.)

# air temp needs to be in K
gem['t'] += 273.15

# In[ ]:

gem

# In[ ]:
Beispiel #40
0
def run_snap_biophys(dataset, variable):
    """Compute specified variable using the SNAP algorithm.

    See ATBD at https://step.esa.int/docs/extra/ATBD_S2ToolBox_L2B_V1.1.pdf

    Parameters
    ----------
    dataset : xr dataset
        xarray dataset.
    variable : str
        Options 'FAPAR', 'FCOVER', 'LAI', 'LAI_Cab' or 'LAI_Cw'

    Returns
    -------
    xarray dataset
        Adds the specified variable array to dataset (variable name in
        lowercase).

    """
    # generate view angle bands/layers
    vz = (np.ones_like(dataset.band_data[:, 0, :, :]).T *
          np.cos(np.radians(dataset.view_zenith)).values)
    vz = vz[..., np.newaxis]
    vzarr = xr.DataArray(
        vz,
        coords=[dataset.y, dataset.x, dataset.time, ["view_zenith"]],
        dims=["y", "x", "time", "band"],
    )

    sz = (np.ones_like(dataset.band_data[:, 0, :, :]).T *
          np.cos(np.radians(dataset.sun_zenith)).values)
    sz = sz[..., np.newaxis]
    szarr = xr.DataArray(
        sz,
        coords=[dataset.y, dataset.x, dataset.time, ["sun_zenith"]],
        dims=["y", "x", "time", "band"],
    )

    raz = (
        np.ones_like(dataset.band_data[:, 0, :, :]).T *
        np.cos(np.radians(dataset.sun_azimuth - dataset.view_azimuth)).values)
    raz = raz[..., np.newaxis]
    razarr = xr.DataArray(
        raz,
        coords=[dataset.y, dataset.x, dataset.time, ["relative_azimuth"]],
        dims=["y", "x", "time", "band"],
    )

    newarr = xr.concat([dataset.band_data, vzarr, szarr, razarr], dim="band")
    newarr = newarr.stack(xy=("x", "y"))
    arr = xr.apply_ufunc(
        _compute_variable,
        newarr,
        input_core_dims=[["band", "xy"]],
        output_core_dims=[["xy"]],
        kwargs={
            "variable": variable
        },
        vectorize=True,
    ).unstack()
    return dataset.assign({variable.lower(): arr})
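
A hypothetical call, assuming `dataset` already carries band_data plus the view_zenith, sun_zenith, sun_azimuth and view_azimuth angles used above:

# Hypothetical usage: adds a lowercase 'lai' variable computed by SNAP.
out = run_snap_biophys(dataset, "LAI")
print(out["lai"])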
Beispiel #41
0
def cross_phase(da1, da2, spacing_tol=1e-3, dim=None, detrend=None,
                window=False, chunks_to_segments=False):
    """
    Calculates the cross-phase between da1 and da2.

    Returned values are in [-pi, pi].

    .. math::
        da1' = da1 - \\overline{da1};\\ \\ da2' = da2 - \\overline{da2}
    .. math::
        cp = \\text{Arg} [\\mathbb{F}(da1') \\mathbb{F}(da2')^*]

    Parameters
    ----------
    da1 : `xarray.DataArray`
        The data to be transformed
    da2 : `xarray.DataArray`
        The data to be transformed
    spacing_tol: float, optional
        Spacing tolerance. The Fourier transform should not be applied to an uneven grid,
        but this restriction can be relaxed with this setting. Use caution.
    dim : list, optional
        The dimension along which to take the real Fourier transformation.
        If `None`, all dimensions will be transformed.
    detrend : str, optional
        If `constant`, the mean across the transform dimensions will be
        subtracted before calculating the Fourier transform (FT).
        If `linear`, the linear least-square fit along one axis will be
        subtracted before the FT. It will give an error if the length of
        `dim` is longer than one.
    window : bool, optional
        Whether to apply a Hann window to the data before the Fourier
        transform is taken

    Returns
    -------
    cp : `xarray.DataArray`
        Cross-phase as a function of frequency.
    """

    if dim is None:
        dim = da1.dims
        dim2 = da2.dims
        if dim != dim2:
            raise ValueError('The two datasets have different dimensions')
    elif not isinstance(dim, list):
        dim = [dim]
    if len(dim) > 1:
        raise ValueError('Cross phase calculation should only be done along '
                         'a single dimension.')

    daft1 = dft(da1, spacing_tol,
                dim=dim, real=dim[0], shift=False, detrend=detrend,
                window=window, chunks_to_segments=chunks_to_segments)
    daft2 = dft(da2, spacing_tol,
                dim=dim, real=dim[0], shift=False, detrend=detrend,
                window=window, chunks_to_segments=chunks_to_segments)

    if daft1.chunks and daft2.chunks:
        _cross_phase = lambda a, b: dsar.angle(a * dsar.conj(b))
    else:
        _cross_phase = lambda a, b: np.angle(a * np.conj(b))
    cp = xr.apply_ufunc(_cross_phase, daft1, daft2, dask='allowed')

    if da1.name and da2.name:
        cp.name = "{}_{}_phase".format(da1.name, da2.name)

    return cp
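
A quick NumPy sanity check of the quantity computed above, angle(F(a) * conj(F(b))), independent of the dft helper: two cosines offset by a quarter period give a cross-phase of about +pi/2 at the signal frequency.

# Independent check of angle(F(a) * conj(F(b))) for two shifted cosines.
import numpy as np

n, k0 = 256, 8                                    # record length, cycles per record
t = np.arange(n)
a = np.cos(2 * np.pi * k0 * t / n)
b = np.cos(2 * np.pi * k0 * t / n - np.pi / 2)    # b lags a by a quarter period
fa, fb = np.fft.rfft(a), np.fft.rfft(b)
print(np.angle(fa[k0] * np.conj(fb[k0])))         # ~ +pi/2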