Example No. 1
    def adjust(self, sim: DataArray, **kwargs):
        """Return bias-adjusted data. Refer to the class documentation for the algorithm details.

        Parameters
        ----------
        sim : DataArray
          Time series to be bias-adjusted, usually a model output.
        kwargs :
          Algorithm-specific keyword arguments, see class doc.
        """
        if not self._trained:
            raise ValueError("train() must be called before adjusting.")

        if hasattr(self, "group"):
            # Right now there is no other way of getting the main adjustment dimension
            _raise_on_multiple_chunk(sim, self.group.dim)

            if (self.group.prop == "dayofyear"
                    and get_calendar(sim) != self._hist_calendar):
                warn(
                    ("This adjustment was trained on a simulation with the "
                     f"{self._hist_calendar} calendar but the sim input uses "
                     f"{get_calendar(sim)}. This is not recommended with dayofyear "
                     "grouping and could give strange results."),
                    stacklevel=4,
                )

        scen = self._adjust(sim, **kwargs)
        params = ", ".join([f"{k}={repr(v)}" for k, v in kwargs.items()])
        scen.attrs["xclim_history"] = update_history(
            f"Bias-adjusted with {str(self)}.adjust(sim, {params})", sim)
        return scen
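A minimal usage sketch of the train-then-adjust workflow this method belongs to, assuming the instantiate/train/adjust style of the xclim.sdba API shown above (the class name and constructor arguments follow EmpiricalQuantileMapping but may differ between versions):

import numpy as np
import pandas as pd
import xarray as xr
from xclim import sdba

# Synthetic daily series with a constant +2 degC bias in the model.
time = pd.date_range("2000-01-01", periods=3 * 365, freq="D")
ref = xr.DataArray(20 + 5 * np.random.randn(time.size), dims="time",
                   coords={"time": time}, attrs={"units": "degC"})
hist = (ref + 2).assign_attrs(units="degC")
sim = hist.copy()

EQM = sdba.EmpiricalQuantileMapping(nquantiles=20, group="time", kind="+")
EQM.train(ref, hist)    # must be called before adjust()
scen = EQM.adjust(sim)  # records the call in scen.attrs["xclim_history"]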
Example No. 2
def test_update_history():
    a = xr.DataArray([0], attrs={"history": "Text1"}, name="a")
    b = xr.DataArray([0], attrs={"history": "Text2"})
    c = xr.Dataset(attrs={"history": "Text3"})

    merged = update_history("text", a, new_name="d", b=b, c=c)

    assert "d: text" in merged.split("\n")[-1]
    assert merged.startswith("a: Text1")
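For context, a minimal sketch of the behaviour this test exercises, assuming update_history is importable from xclim.core.formatting (the expected output shape below is inferred from the assertions above):

import xarray as xr
from xclim.core.formatting import update_history  # assumed import path

a = xr.DataArray([0], attrs={"history": "Text1"}, name="a")
b = xr.DataArray([0], attrs={"history": "Text2"})
merged = update_history("text", a, new_name="d", b=b)
print(merged)
# The merged string starts with the inputs' existing histories and ends with a
# new, timestamped line describing this operation, roughly:
#   a: Text1
#   b: Text2
#   [<timestamp>] d: text ...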
Example No. 3
def ensemble_mean_std_max_min(ens: xr.Dataset) -> xr.Dataset:
    """Calculate ensemble statistics between a results from an ensemble of climate simulations.

    Returns an xarray Dataset containing ensemble mean, standard-deviation, minimum and maximum for input climate
    simulations.

    Parameters
    ----------
    ens: xr.Dataset
      Ensemble dataset (see xclim.ensembles.create_ensemble).

    Returns
    -------
    xr.Dataset
      Dataset with data variables of ensemble statistics.

    Examples
    --------
    >>> from xclim.ensembles import create_ensemble, ensemble_mean_std_max_min

    Create the ensemble dataset:

    >>> ens = create_ensemble(temperature_datasets)

    Calculate ensemble statistics:

    >>> ens_mean_std = ensemble_mean_std_max_min(ens)
    """
    ds_out = xr.Dataset(attrs=ens.attrs)
    for v in ens.data_vars:

        ds_out[f"{v}_mean"] = ens[v].mean(dim="realization")
        ds_out[f"{v}_stdev"] = ens[v].std(dim="realization")
        ds_out[f"{v}_max"] = ens[v].max(dim="realization")
        ds_out[f"{v}_min"] = ens[v].min(dim="realization")
        # Copy the source variable's attributes to the derived statistics and
        # note the statistic in the description.
        for stat in ("mean", "stdev", "max", "min"):
            vv = f"{v}_{stat}"
            ds_out[vv].attrs = ens[v].attrs
            if "description" in ds_out[vv].attrs:
                ds_out[vv].attrs["description"] = (
                    ds_out[vv].attrs["description"] + " : " + stat + " of ensemble"
                )
    ds_out.attrs["xclim_history"] = update_history(
        f"Computation of statistics on {ens.realization.size} ensemble members.", ds_out
    )
    return ds_out
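A short sketch of the expected output layout, reusing the temperature_datasets fixture referenced in the docstring above (each member is assumed to contain a `tas` variable):

from xclim.ensembles import create_ensemble, ensemble_mean_std_max_min

ens = create_ensemble(temperature_datasets)
stats = ensemble_mean_std_max_min(ens)
print(list(stats.data_vars))  # ['tas_mean', 'tas_stdev', 'tas_max', 'tas_min']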
Example No. 4
    def adjust(
        cls,
        ref: xr.DataArray,
        hist: xr.DataArray,
        sim: xr.DataArray,
        **kwargs,
    ):
        """Return bias-adjusted data. Refer to the class documentation for the algorithm details.

        Parameters
        ----------
        ref : DataArray
          Training target, usually a reference time series drawn from observations.
        hist : DataArray
          Training data, usually a model output whose biases are to be adjusted.
        sim : DataArray
          Time series to be bias-adjusted, usually a model output.
        kwargs :
          Algorithm-specific keyword arguments, see class doc.
        """
        kwargs = parse_group(cls._adjust, kwargs)
        skip_checks = kwargs.pop("skip_input_checks", False)

        if not skip_checks:
            if "group" in kwargs:
                cls._check_inputs(ref, hist, sim, group=kwargs["group"])

            (ref, hist, sim), _ = cls._harmonize_units(ref, hist, sim)

        out = cls._adjust(ref, hist, sim, **kwargs)

        if isinstance(out, xr.DataArray):
            out = out.rename("scen").to_dataset()

        scen = out.scen

        params = ", ".join([f"{k}={repr(v)}" for k, v in kwargs.items()])
        infostr = f"{cls.__name__}.adjust(ref, hist, sim, {params})"
        scen.attrs["history"] = update_history(f"Bias-adjusted with {infostr}", sim)
        scen.attrs["bias_adjustment"] = infostr
        scen.attrs["units"] = ref.units

        if OPTIONS[SDBA_EXTRA_OUTPUT]:
            return out
        return scen
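A tiny, runnable sketch of how the call is recorded in the output attributes, mirroring the `params`/`infostr` construction above (`SomeAdjustment` and the keyword values are hypothetical):

kwargs = {"group": "time.month", "nquantiles": 50}
params = ", ".join([f"{k}={repr(v)}" for k, v in kwargs.items()])
infostr = f"SomeAdjustment.adjust(ref, hist, sim, {params})"
print(infostr)
# SomeAdjustment.adjust(ref, hist, sim, group='time.month', nquantiles=50)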
Example No. 5
    def adjust(self, sim: DataArray, *args, **kwargs):
        """Return bias-adjusted data. Refer to the class documentation for the algorithm details.

        Parameters
        ----------
        sim : DataArray
          Time series to be bias-adjusted, usually a model output.
        args : xr.DataArray
          Other DataArrays needed for the adjustment (usually none).
        kwargs :
          Algorithm-specific keyword arguments, see class doc.
        """
        skip_checks = kwargs.pop("skip_input_checks", False)
        if not skip_checks:
            (sim, *args), _ = self._harmonize_units(sim, *args, target=self.train_units)

            if "group" in self:
                self._check_inputs(sim, *args, group=self.group)

            sim = convert_units_to(sim, self.train_units)
        out = self._adjust(sim, *args, **kwargs)

        if isinstance(out, xr.DataArray):
            out = out.rename("scen").to_dataset()

        scen = out.scen

        # Keep attrs
        scen.attrs.update(sim.attrs)
        for name, crd in sim.coords.items():
            if name in scen.coords:
                scen[name].attrs.update(crd.attrs)

        params = ", ".join([f"{k}={repr(v)}" for k, v in kwargs.items()])
        infostr = f"{str(self)}.adjust(sim, {params})"
        scen.attrs["history"] = update_history(f"Bias-adjusted with {infostr}", sim)
        scen.attrs["bias_adjustment"] = infostr
        scen.attrs["units"] = self.train_units

        if OPTIONS[SDBA_EXTRA_OUTPUT]:
            return out
        return scen
Example No. 6
def fit(
    da: xr.DataArray,
    dist: str = "norm",
    method: str = "ML",
    dim: str = "time",
    **fitkwargs,
) -> xr.DataArray:
    """Fit an array to a univariate distribution along the time dimension.

    Parameters
    ----------
    da : xr.DataArray
      Time series to be fitted along the time dimension.
    dist : str
      Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm
      (see scipy.stats for full list). If the PWM method is used, only the following distributions are
      currently supported: 'expon', 'gamma', 'genextreme', 'genpareto', 'gumbel_r', 'pearson3', 'weibull_min'.
    method : {"ML", "PWM"}
      Fitting method, either maximum likelihood (ML) or probability weighted moments (PWM), also called L-Moments.
      The PWM method is usually more robust to outliers.
    dim : str
      The dimension along which to fit the distribution (default: "time").
    **fitkwargs
      Other arguments passed directly to :py:func:`_fitstart` and to the distribution's `fit`.

    Returns
    -------
    xr.DataArray
      An array of fitted distribution parameters.

    Notes
    -----
    Coordinates for which all values are NaNs will be dropped before fitting the distribution. If the array
    still contains NaNs, the distribution parameters will be returned as NaNs.
    """
    method_name = {"ML": "maximum likelihood", "PWM": "probability weighted moments"}

    # Get the distribution
    dc = get_dist(dist)
    if method == "PWM":
        lm3dc = get_lm3_dist(dist)

    shape_params = [] if dc.shapes is None else dc.shapes.split(",")
    dist_params = shape_params + ["loc", "scale"]

    # xarray.apply_ufunc does not yet support multiple outputs with dask parallelism.
    duck = dask.array if isinstance(da.data, dask.array.Array) else np
    data = duck.apply_along_axis(
        _fitfunc_1d,
        da.get_axis_num(dim),
        da,
        dist=dc if method == "ML" else lm3dc,
        nparams=len(dist_params),
        method=method,
        **fitkwargs,
    )

    # Coordinates for the distribution parameters
    coords = dict(da.coords.items())
    if dim in coords:
        coords.pop(dim)
    coords["dparams"] = dist_params

    # Dimensions for the distribution parameters
    dims = [d if d != dim else "dparams" for d in da.dims]

    out = xr.DataArray(data=data, coords=coords, dims=dims)
    out.attrs = prefix_attrs(
        da.attrs, ["standard_name", "long_name", "units", "description"], "original_"
    )
    attrs = dict(
        long_name=f"{dist} parameters",
        description=f"Parameters of the {dist} distribution",
        method=method,
        estimator=method_name[method].capitalize(),
        scipy_dist=dist,
        units="",
        xclim_history=update_history(
            f"Estimate distribution parameters by {method_name[method]} method along dimension {dim}.",
            new_name="fit",
            data=da,
        ),
    )
    out.attrs.update(attrs)
    return out
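A minimal usage sketch, assuming this `fit` is importable (e.g. `from xclim.indices.stats import fit` in recent versions):

import numpy as np
import pandas as pd
import xarray as xr

tas = xr.DataArray(
    15 + 8 * np.random.randn(365),
    dims="time",
    coords={"time": pd.date_range("2001-01-01", periods=365)},
    attrs={"units": "degC"},
)
params = fit(tas, dist="norm", method="ML")
# The time dimension is replaced by "dparams"; parameters are selected by name.
print(float(params.sel(dparams="loc")), float(params.sel(dparams="scale")))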
Example No. 7
def change_significance(
    fut: Union[xr.DataArray, xr.Dataset],
    ref: Union[xr.DataArray, xr.Dataset] = None,
    test: str = "ttest",
    **kwargs,
) -> Tuple[Union[xr.DataArray, xr.Dataset], Union[xr.DataArray, xr.Dataset]]:
    """Robustness statistics qualifying how the members of an ensemble agree on the existence of change and on its sign.

    Parameters
    ----------
    fut : Union[xr.DataArray, xr.Dataset]
      Future period values along 'realization' and 'time' (..., nr, nt1),
      or, if `ref` is None, delta values along 'realization' (..., nr).
    ref : Union[xr.DataArray, xr.Dataset], optional
      Reference period values along 'realization' and 'time' (..., nr, nt2).
      The size of the 'time' axis does not need to match the one of `fut`.
      But their 'realization' axes must be identical.
      If `None` (default), values of `fut` are assumed to be deltas instead of
      a distribution across the future period.
      `fut` and `ref` must be of the same type (Dataset or DataArray). If they are
      Dataset, they must have the same variables (name and coords).
    test : {'ttest', 'welch-ttest', 'threshold', None}
      Name of the statistical test used to determine if there was significant change. See notes.
    **kwargs
      Other arguments specific to the statistical test.

      For 'ttest' and 'welch-ttest':
        p_change : float (default : 0.05)
          p-value threshold for rejecting the hypothesis of no significant change.
      For 'threshold': (Only one of those must be given.)
        abs_thresh : float (no default)
          Threshold for the (absolute) change to be considered significant.
        rel_thresh : float (no default, in [0, 1])
          Threshold for the relative change (in reference to ref) to be significant.
          Only valid if `ref` is given.

    Returns
    -------
    change_frac
      The fraction of members that show significant change [0, 1].
      Passing `test=None` yields change_frac = 1 everywhere. Same type as `fut`.
    pos_frac
      The fraction of members showing significant change that show a positive change ]0, 1].
      Null values are returned where no members show significant change.

      The table below shows the coefficient needed to retrieve the number of members
      that have the indicated characteristics, by multiplying it by the total
      number of members (`fut.realization.size`).

      +-----------------+------------------------------+------------------------+
      |                 | Significant change           | Non significant change |
      +-----------------+------------------------------+------------------------+
      | Any direction   | change_frac                  | 1 - change_frac        |
      +-----------------+------------------------------+------------------------+
      | Positive change | pos_frac * change_frac       | N.A.                   |
      +-----------------+------------------------------+                        |
      | Negative change | (1 - pos_frac) * change_frac |                        |
      +-----------------+------------------------------+------------------------+

    Notes
    -----
    Available statistical tests are :

      'ttest' :
        Single sample T-test. Same test as used by [tebaldi2011]_. The future
        values are compared against the reference mean (over 'time'). Change is qualified
        as 'significant' when the test's p-value is below the user-provided `p_change`
        value.
      'welch-ttest' :
        Two-sided T-test, without assuming equal population variance. Same
        significance criterion as 'ttest'.
      'threshold' :
        Change is considered significant if the absolute delta exceeds a given
        threshold (absolute or relative).
      None :
        Significant change is not tested and, thus, members showing no change are
        included in the `pos_frac` output.

    References
    ----------
    .. [tebaldi2011] Tebaldi C., Arblaster, J.M. and Knutti, R. (2011) Mapping model agreement on future climate projections. GRL. doi:10.1029/2011GL049863


    Example
    -------
    This example computes the mean temperature in an ensemble and compares two time
    periods, qualifying significant change through a single sample T-test.

    >>> import xclim
    >>> from xclim import ensembles
    >>> ens = ensembles.create_ensemble(temperature_datasets)
    >>> tgmean = xclim.atmos.tg_mean(tas=ens.tas, freq='YS')
    >>> fut = tgmean.sel(time=slice('2020', '2050'))
    >>> ref = tgmean.sel(time=slice('1990', '2020'))
    >>> chng_f, pos_f = ensembles.change_significance(fut, ref, test='ttest')

    If the deltas were already computed beforehand, the 'threshold' test can still
    be used, here with a 2 K threshold.

    >>> delta = fut.mean('time') - ref.mean('time')
    >>> chng_f, pos_f = ensembles.change_significance(delta, test='threshold', abs_thresh=2)
    """
    test_params = {
        "ttest": ["p_change"],
        "welch-ttest": ["p_change"],
        "threshold": ["abs_thresh", "rel_thresh"],
    }
    changed = None
    if ref is None:
        delta = fut
        n_valid_real = delta.notnull().sum("realization")
        if test not in ["threshold", None]:
            raise ValueError(
                "When deltas are given (ref=None), 'test' must be one of ['threshold', None]"
            )
    else:
        delta = fut.mean("time") - ref.mean("time")
        n_valid_real = fut.notnull().all("time").sum("realization")

    if test == "ttest":
        p_change = kwargs.setdefault("p_change", 0.05)

        # Test hypothesis of no significant change
        pvals = xr.apply_ufunc(
            lambda f, r: spstats.ttest_1samp(f, r, axis=-1, nan_policy="omit")[
                1],
            fut,
            ref.mean("time"),
            input_core_dims=[["realization", "time"], ["realization"]],
            output_core_dims=[["realization"]],
            vectorize=True,
            dask="parallelized",
            output_dtypes=[float],
        )
        # When p < p_change, the hypothesis of no significant change is rejected.
        changed = pvals < p_change
    elif test == "welch-ttest":
        p_change = kwargs.setdefault("p_change", 0.05)

        # Test hypothesis of no significant change
        # equal_var=False -> Welch's T-test
        pvals = xr.apply_ufunc(
            lambda f, r: spstats.ttest_ind(
                f, r, axis=-1, equal_var=False, nan_policy="omit")[1],
            fut,
            ref,
            input_core_dims=[["realization", "time"], ["realization", "time"]],
            output_core_dims=[["realization"]],
            exclude_dims={"time"},
            vectorize=True,
            dask="parallelized",
            output_dtypes=[float],
        )

        # When p < p_change, the hypothesis of no significant change is rejected.
        changed = pvals < p_change
    elif test == "threshold":
        if "abs_thresh" in kwargs and "rel_thresh" not in kwargs:
            changed = abs(delta) > kwargs["abs_thresh"]
        elif "rel_thresh" in kwargs and "abs_thresh" not in kwargs and ref is not None:
            changed = abs(delta / ref.mean("time")) > kwargs["rel_thresh"]
        else:
            raise ValueError(
                "Invalid argument combination for test='threshold'.")
    elif test is not None:
        raise ValueError(
            f"Statistical test {test} must be one of {', '.join(test_params.keys())}."
        )

    if test is not None:
        delta_chng = delta.where(changed)
        change_frac = changed.sum("realization") / n_valid_real
    else:
        delta_chng = delta
        change_frac = xr.ones_like(delta.isel(realization=0))

    # Test that models agree on the sign of the change
    # This returns NaN (0 / 0) where no model shows significant change.
    pos_frac = (delta_chng > 0).sum("realization") / (change_frac *
                                                      n_valid_real)

    # Metadata
    kwargs_str = ", ".join(
        [f"{k}: {v}" for k, v in kwargs.items() if k in test_params[test]])
    test_str = (
        f"Significant change was tested with test {test} with parameters {kwargs_str}."
    )
    das = {"fut": fut} if ref is None else {"fut": fut, "ref": ref}
    pos_frac.attrs.update(
        description=
        "Fraction of members showing significant change that agree on a positive change. "
        + test_str,
        units="",
        test=str(test),
        xclim_history=update_history(
            f"pos_frac from change_significance(fut=fut, ref=ref, test={test}, {kwargs_str})",
            **das,
        ),
    )
    change_frac.attrs.update(
        description="Fraction of members showing significant change. " +
        test_str,
        units="",
        test=str(test),
        xclim_history=update_history(
            f"change_frac from change_significance(fut=fut, ref=ref, test={test}, {kwargs_str})",
            **das,
        ),
    )
    return change_frac, pos_frac
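A small sketch applying the table above to recover member counts, continuing the docstring example (`fut` and `ref` as defined there):

change_frac, pos_frac = change_significance(fut, ref, test="ttest")
n = fut.realization.size
n_changed = change_frac * n                # members showing significant change
n_pos = pos_frac * change_frac * n         # ... with a significant positive change
n_neg = (1 - pos_frac) * change_frac * n   # ... with a significant negative change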
Example No. 8
def robustness_coefficient(
        fut: Union[xr.DataArray, xr.Dataset],
        ref: Union[xr.DataArray,
                   xr.Dataset]) -> Union[xr.DataArray, xr.Dataset]:
    """Robustness coefficient quantifying the robustness of a climate change signal in an ensemble.

    Taken from Knutti and Sedlacek (2013).

    The robustness metric is defined as R = 1 − A1 / A2 , where A1 is defined
    as the integral of the squared area between two cumulative density functions
    characterizing the individual model projections and the multi-model mean
    projection and A2 is the integral of the squared area between two cumulative
    density functions characterizing the multi-model mean projection and the historical
    climate. (Description taken from [knutti2013]_)

    A value of R equal to one implies perfect model agreement. Higher model spread or
    smaller signal decreases the value of R.

    Parameters
    ----------
    fut : Union[xr.DataArray, xr.Dataset]
      Future ensemble values along 'realization' and 'time' (nr, nt). Can be a dataset,
      in which case the coefficient is computed for each variable.
    ref : Union[xr.DataArray, xr.Dataset]
      Reference period values along 'time' (nt). Same type as `fut`.

    Returns
    -------
    R
      The robustness coefficient, ]-inf, 1], float. Same type as `fut` or `ref`.

    References
    ----------
    .. [knutti2013] Knutti, R. and Sedláček, J. (2013) Robustness and uncertainties in the new CMIP5 climate model projections. Nat. Clim. Change. doi:10.1038/nclimate1716
    """
    def _knutti_sedlacek(reference, future):
        def diff_cdf_sq_area_int(x1, x2):
            """Exact integral of the squared area between the non-parametric CDFs of 2 vectors."""
            # Non-parametric CDF on points x1 and x2
            # i.e. y1(x) is the proportion of x1 <= x
            y1 = (np.arange(x1.size) + 1) / x1.size
            y2 = (np.arange(x2.size) + 1) / x2.size

            x2_in_1 = np.searchsorted(x1, x2,
                                      side="right")  # Where to insert x2 in x1
            x1_in_2 = np.searchsorted(x2, x1,
                                      side="right")  # Where to insert x1 in x2

            # Merge to get all "discontinuities" of the CDF difference
            # y1 with repeated value (to the right) where x2 is inserted
            # Same for y2. 0s are prepended where needed.
            x = np.insert(x1, x2_in_1, x2)
            y1_f = np.insert(y1, x2_in_1, np.r_[0, y1][x2_in_1])
            y2_f = np.insert(y2, x1_in_2, np.r_[0, y2][x1_in_2])

            # Discrete integral of the squared difference (distance) between the two CDFs.
            return np.sum(np.diff(x) * (y1_f - y2_f)[:-1]**2)

        # Get sorted vectors
        v_fut = np.sort(future.flatten())  # "cumulative" models distribution
        v_favg = np.sort(future.mean(axis=-1))  # Multi-model mean
        v_ref = np.sort(reference)  # Historical values

        A1 = diff_cdf_sq_area_int(v_fut, v_favg)  # noqa
        A2 = diff_cdf_sq_area_int(v_ref, v_favg)  # noqa

        return 1 - A1 / A2

    R = xr.apply_ufunc(  # noqa
        _knutti_sedlacek,
        ref,
        fut,
        input_core_dims=[["time"], ["realization", "time"]],
        exclude_dims={"time"},
        vectorize=True,
        dask="parallelized",
        output_dtypes=[float],
    )
    R.attrs.update(
        name="R",
        long_name="Ensemble robustness coefficient",
        description=
        "Ensemble robustness coefficient as defined by Knutti and Sedláček (2013).",
        reference=
        "Knutti, R. and Sedláček, J. (2013) Robustness and uncertainties in the new CMIP5 climate model projections. Nat. Clim. Change.",
        units="",
        xclim_history=update_history("knutti_sedlacek(fut, ref)",
                                     ref=ref,
                                     fut=fut),
    )
    return R
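A minimal usage sketch with synthetic data, assuming this function is exposed as shown (e.g. `xclim.ensembles.robustness_coefficient` in the corresponding xclim version):

import numpy as np
import xarray as xr

nt, nr = 30, 5
ref = xr.DataArray(np.random.randn(nt), dims="time")
# Five members sharing a clear +1 shift relative to the reference period.
fut = xr.DataArray(1.0 + np.random.randn(nr, nt), dims=("realization", "time"))
R = robustness_coefficient(fut, ref)
print(float(R))  # approaches 1 when the members agree on the change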
Example No. 9
def fit(
    da: xr.DataArray,
    dist: str = "norm",
    method: str = "ML",
    dim: str = "time",
    **fitkwargs,
) -> xr.DataArray:
    """Fit an array to a univariate distribution along the time dimension.

    Parameters
    ----------
    da : xr.DataArray
      Time series to be fitted along the time dimension.
    dist : str
      Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm
      (see scipy.stats for full list). If the PWM method is used, only the following distributions are
      currently supported: 'expon', 'gamma', 'genextreme', 'genpareto', 'gumbel_r', 'pearson3', 'weibull_min'.
    method : {"ML", "PWM"}
      Fitting method, either maximum likelihood (ML) or probability weighted moments (PWM), also called L-Moments.
      The PWM method is usually more robust to outliers.
    dim : str
      The dimension along which to fit the distribution (default: "time").
    fitkwargs
      Other arguments passed directly to :py:func:`_fitstart` and to the distribution's `fit`.

    Returns
    -------
    xr.DataArray
      An array of fitted distribution parameters.

    Notes
    -----
    Coordinates for which all values are NaNs will be dropped before fitting the distribution. If the array
    still contains NaNs, the distribution parameters will be returned as NaNs.
    """
    method_name = {
        "ML": "maximum likelihood",
        "PWM": "probability weighted moments"
    }

    # Get the distribution
    dc = get_dist(dist)
    if method == "PWM":
        lm3dc = get_lm3_dist(dist)

    shape_params = [] if dc.shapes is None else dc.shapes.split(",")
    dist_params = shape_params + ["loc", "scale"]

    data = xr.apply_ufunc(
        _fitfunc_1d,
        da,
        input_core_dims=[[dim]],
        output_core_dims=[["dparams"]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[float],
        keep_attrs=True,
        kwargs=dict(
            dist=dc if method == "ML" else lm3dc,
            nparams=len(dist_params),
            method=method,
            **fitkwargs,
        ),
        dask_gufunc_kwargs={"output_sizes": {
            "dparams": len(dist_params)
        }},
    )

    # Add coordinates for the distribution parameters and transpose to original shape (with dim -> dparams)
    dims = [d if d != dim else "dparams" for d in da.dims]
    out = data.assign_coords(dparams=dist_params).transpose(*dims)

    out.attrs = prefix_attrs(
        da.attrs, ["standard_name", "long_name", "units", "description"],
        "original_")
    attrs = dict(
        long_name=f"{dist} parameters",
        description=f"Parameters of the {dist} distribution",
        method=method,
        estimator=method_name[method].capitalize(),
        scipy_dist=dist,
        units="",
        history=update_history(
            f"Estimate distribution parameters by {method_name[method]} method along dimension {dim}.",
            new_name="fit",
            data=da,
        ),
    )
    out.attrs.update(attrs)
    return out
Example No. 10
def parametric_cdf(p: xr.DataArray, v: Union[float, Sequence]) -> xr.DataArray:
    """Return the cumulative distribution function corresponding to the given distribution parameters and value.

    Parameters
    ----------
    p : xr.DataArray
      Distribution parameters returned by the `fit` function.
      The array should have dimension `dparams` storing the distribution parameters,
      and attribute `scipy_dist`, storing the name of the distribution.
    v : Union[float, Sequence]
      Value(s) at which to evaluate the CDF.

    Returns
    -------
    xarray.DataArray
      An array of parametric CDF values estimated from the distribution parameters.

    """
    v = np.atleast_1d(v)

    # Get the distribution
    dist = p.attrs["scipy_dist"]
    dc = get_dist(dist)

    # Create a lambda function to facilitate passing arguments to dask. There is probably a better way to do this.
    def func(x):
        return dc.cdf(v, *x)

    data = xr.apply_ufunc(
        func,
        p,
        input_core_dims=[["dparams"]],
        output_core_dims=[["cdf"]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[float],
        keep_attrs=True,
        dask_gufunc_kwargs={"output_sizes": {
            "cdf": len(v)
        }},
    )

    # Assign cdf coordinates and transpose to preserve original dimension order
    dims = [d if d != "dparams" else "cdf" for d in p.dims]
    out = data.assign_coords(cdf=v).transpose(*dims)
    out.attrs = unprefix_attrs(p.attrs, ["units", "standard_name"],
                               "original_")

    attrs = dict(
        long_name=f"{dist} cdf",
        description=f"CDF estimated by the {dist} distribution",
        cell_methods="dparams: cdf",
        history=update_history(
            "Compute parametric cdf from distribution parameters",
            new_name="parametric_cdf",
            parameters=p,
        ),
    )
    out.attrs.update(attrs)
    return out
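A minimal sketch chaining the `fit` function shown earlier with `parametric_cdf` (assuming both are importable from the same module, e.g. `xclim.indices.stats`):

import numpy as np
import pandas as pd
import xarray as xr

tas = xr.DataArray(
    15 + 8 * np.random.randn(365),
    dims="time",
    coords={"time": pd.date_range("2001-01-01", periods=365)},
    attrs={"units": "degC"},
)
params = fit(tas, dist="norm")
cdf = parametric_cdf(params, v=[10.0, 20.0])  # P(X <= 10) and P(X <= 20), in the fitted units
print(cdf.values)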
Example No. 11
def ensemble_percentiles(
    ens: Union[xr.Dataset, xr.DataArray],
    values: Sequence[int] = (10, 50, 90),
    keep_chunk_size: Optional[bool] = None,
    split: bool = True,
) -> xr.Dataset:
    """Calculate ensemble statistics between a results from an ensemble of climate simulations.

    Returns a Dataset containing ensemble percentiles for input climate simulations.

    Parameters
    ----------
    ens: Union[xr.Dataset, xr.DataArray]
      Ensemble dataset or dataarray (see xclim.ensembles.create_ensemble).
    values : Sequence[int]
      Percentile values to calculate. Default: (10, 50, 90).
    keep_chunk_size : Optional[bool]
      For ensembles using dask arrays, all chunks along the 'realization' axis are merged.
      If True, the dataset is rechunked along the dimension with the largest chunks, so that the chunks keep approximately the same size.
      If False, no shrinking is performed, resulting in much larger chunks.
      If not defined, the function decides which is best.
    split : bool
      Whether to split each percentile into a new variable or to concatenate the output along a new
      "percentiles" dimension.

    Returns
    -------
    Union[xr.Dataset, xr.DataArray]
      If split is True, a Dataset with one variable per requested percentile;
      otherwise, same type as `ens`, with a new "percentiles" dimension.

    Examples
    --------
    >>> from xclim.ensembles import create_ensemble, ensemble_percentiles

    Create ensemble dataset:

    >>> ens = create_ensemble(temperature_datasets)

    Calculate default ensemble percentiles:

    >>> ens_percs = ensemble_percentiles(ens)

    Calculate non-default percentiles (25th and 75th)

    >>> ens_percs = ensemble_percentiles(ens, values=(25, 50, 75))

    If the original array has many small chunks, it might be more efficient to do:

    >>> ens_percs = ensemble_percentiles(ens, keep_chunk_size=False)
    """
    if isinstance(ens, xr.Dataset):
        out = xr.merge(
            [
                ensemble_percentiles(
                    da, values, keep_chunk_size=keep_chunk_size, split=split
                )
                for da in ens.data_vars.values()
                if "realization" in da.dims
            ]
        )
        out.attrs.update(ens.attrs)
        out.attrs["xclim_history"] = update_history(
            f"Computation of the percentiles on {ens.realization.size} ensemble members.",
            ens,
        )

        return out

    # Percentile calculation forbids any chunks along realization
    if ens.chunks and len(ens.chunks[ens.get_axis_num("realization")]) > 1:
        if keep_chunk_size is None:
            # Enable smart rechunking if the chunk size exceeds 2e8 elements after merging along realization
            keep_chunk_size = (
                np.prod(ens.isel(realization=0).data.chunksize) * ens.realization.size
                > 2e8
            )
        if keep_chunk_size:
            # Smart rechunk on dimension where chunks are the largest
            chkDim, chks = max(
                enumerate(ens.chunks),
                key=lambda kv: 0
                if kv[0] == ens.get_axis_num("realization")
                else max(kv[1]),
            )
            ens = ens.chunk(
                {"realization": -1, ens.dims[chkDim]: len(chks) * ens.realization.size}
            )
        else:
            ens = ens.chunk({"realization": -1})

    out = xr.apply_ufunc(
        _calc_perc,
        ens,
        input_core_dims=[["realization"]],
        output_core_dims=[["percentiles"]],
        keep_attrs=True,
        kwargs=dict(p=values),
        dask="parallelized",
        output_dtypes=[ens.dtype],
        output_sizes={"percentiles": len(values)},
    )

    out = out.assign_coords(
        percentiles=xr.DataArray(list(values), dims=("percentiles",))
    )

    if split:
        out = out.to_dataset(dim="percentiles")
        for p, perc in out.data_vars.items():
            perc.attrs.update(ens.attrs)
            perc.attrs["description"] = (
                perc.attrs.get("description", "") + f" {p}th percentile of ensemble."
            )
            out[p] = perc
            out = out.rename(name_dict={p: f"{ens.name}_p{int(p):02d}"})

    out.attrs["xclim_history"] = update_history(
        f"Computation of the percentiles on {ens.realization.size} ensemble members.",
        ens,
    )

    return out
Example No. 12
    def adjust(
        self,
        scen: xr.DataArray,
        sim: xr.DataArray,
        frac: float = 0.25,
        power: float = 1.0,
    ):
        """Return second order bias-adjusted data. Refer to the class documentation for the algorithm details.

        Parameters
        ----------
        scen : DataArray
          Bias-adjusted time series.
        sim : DataArray
          Time series to be bias-adjusted, source of scen.
        frac : float
          Fraction over which the transition from `scen` to the corrected values is applied, see class doc.
        power : float
          Power of the transition weights, see class doc.
        """
        if not self._trained:
            raise ValueError("train() must be called before adjusting.")

        def _adjust_extremes_1d(scen, sim, ref_params, thresh, *, dist,
                                cluster_thresh):
            # Clusters of large values of sim
            _, _, sim_posmax, sim_maxs = get_clusters_1d(
                sim, thresh, cluster_thresh)

            new_scen = scen.copy()
            if sim_posmax.size == 0:
                # Happens if everything is under `cluster_thresh`
                return new_scen

            # Fit the dist, force location at thresh
            sim_fit = stats._fitfunc_1d(sim_maxs,
                                        dist=dist,
                                        nparams=len(ref_params),
                                        method="ML",
                                        floc=thresh)

            # Cumulative density function for extreme values in sim's distribution
            sim_cdf = dist.cdf(sim_maxs, *sim_fit)
            # Equivalent value of sim's CDF's but in ref's distribution.
            new_sim = dist.ppf(sim_cdf, *ref_params) + thresh

            # Get the transition weights based on frac and power values
            transition = (((sim_maxs - sim_maxs.min()) /
                           ((sim_maxs.max()) - sim_maxs.min())) / frac)**power
            np.clip(transition, None, 1, out=transition)

            # Apply smooth linear transition between scen and corrected scen
            new_scen_trans = (new_sim * transition) + (scen[sim_posmax] *
                                                       (1.0 - transition))

            # We change new_scen to the new data
            new_scen[sim_posmax] = new_scen_trans
            return new_scen

        new_scen = xr.apply_ufunc(
            _adjust_extremes_1d,
            scen,
            sim,
            self.ds.fit_params,
            self.ds.thresh,
            input_core_dims=[["time"], ["time"], ["dparams"], []],
            output_core_dims=[["time"]],
            vectorize=True,
            kwargs={
                "dist": stats.get_dist("genpareto"),
                "cluster_thresh": convert_units_to(self.cluster_thresh, sim),
            },
            dask="parallelized",
            output_dtypes=[scen.dtype],
        )

        params = f"frac={frac}, power={power}"
        new_scen.attrs["xclim_history"] = update_history(
            f"Second order bias-adjustment with {str(self)}.adjust(sim, {params})",
            sim)
        return new_scen
Example No. 13
    def __call__(self, *args, **kwds):
        # Bind call arguments. We need to use the class signature, not the instance, otherwise it removes the first
        # argument.
        ba = self._sig.bind(*args, **kwds)
        ba.apply_defaults()

        # Update attributes
        out_attrs = self.format(self.cf_attrs, ba.arguments)
        for locale in LOCALES:
            out_attrs.update(
                self.format(
                    get_local_attrs(
                        self,
                        locale,
                        names=self._cf_names,
                        fill_missing=False,
                        append_locale_name=True,
                    ),
                    args=ba.arguments,
                    formatter=get_local_formatter(locale),
                ))
        vname = self.format({"var_name": self.var_name},
                            ba.arguments)["var_name"]

        # Update the signature with the values of the actual call.
        cp = OrderedDict()
        for (k, v) in ba.signature.parameters.items():
            if v.default is not None and isinstance(v.default,
                                                    (float, int, str)):
                cp[k] = v.replace(default=ba.arguments[k])
            else:
                cp[k] = v

        # Assume the first arguments are always the DataArray.
        das = OrderedDict()
        for i in range(self._nvar):
            das[self._parameters[i]] = ba.arguments.pop(self._parameters[i])

        # Get history and cell method attributes from source data
        attrs = defaultdict(str)
        attrs["cell_methods"] = merge_attributes("cell_methods",
                                                 new_line=" ",
                                                 missing_str=None,
                                                 **das)
        if "cell_methods" in out_attrs:
            attrs["cell_methods"] += " " + out_attrs.pop("cell_methods")
        attrs["history"] = update_history(
            f"{self.identifier}{ba.signature.replace(parameters=cp.values())}",
            new_name=vname,
            **das,
        )
        attrs.update(out_attrs)

        # Pre-computation validation checks
        for da in das.values():
            self.validate(da)
        self.cfprobe(*das.values())

        # Compute the indicator values, ignoring NaNs.
        out = self.compute(**das, **ba.kwargs)

        # Convert to output units
        out = convert_units_to(out, self.units, self.context)

        # Update netCDF attributes
        out.attrs.update(attrs)

        # Bind call arguments to the `missing` function, whose signature might be different from `compute`.
        mba = signature(self.missing).bind(*das.values(), **ba.arguments)

        # Mask results that do not meet criteria defined by the `missing` method.
        mask = self.missing(*mba.args, **mba.kwargs)
        ma_out = out.where(~mask)

        return ma_out.rename(vname)
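A minimal sketch of what this `__call__` machinery provides to end users, assuming a standard indicator such as `xclim.atmos.tg_mean` (the history attribute name varies between versions, `history` vs `xclim_history`):

import numpy as np
import pandas as pd
import xarray as xr
import xclim

tas = xr.DataArray(
    280 + 10 * np.random.randn(365),
    dims="time",
    coords={"time": pd.date_range("2001-01-01", periods=365)},
    attrs={"units": "K"},
    name="tas",
)
out = xclim.atmos.tg_mean(tas=tas, freq="YS")
print(out.attrs["history"])  # the bound call signature recorded through update_history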
Example No. 14
def fit(da: xr.DataArray, dist: str = "norm", method="ML"):
    """Fit an array to a univariate distribution along the time dimension.

    Parameters
    ----------
    da : xr.DataArray
      Time series to be fitted along the time dimension.
    dist : str
      Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm
      (see scipy.stats for full list). If the PWM method is used, only the following distributions are
      currently supported: 'expon', 'gamma', 'genextreme', 'genpareto', 'gumbel_r', 'pearson3', 'weibull_min'.
    method : {"ML", "PWM"}
      Fitting method, either maximum likelihood (ML) or probability weighted moments (PWM), also called L-Moments.
      The PWM method is usually more robust to outliers.

    Returns
    -------
    xr.DataArray
      An array of fitted distribution parameters.

    Notes
    -----
    Coordinates for which all values are NaNs will be dropped before fitting the distribution. If the array
    still contains NaNs, the distribution parameters will be returned as NaNs.
    """
    method_name = {"ML": "maximum likelihood", "PWM": "probability weighted moments"}

    # Get the distribution
    dc = get_dist(dist)
    if method == "PWM":
        lm3dc = get_lm3_dist(dist)

    shape_params = [] if dc.shapes is None else dc.shapes.split(",")
    dist_params = shape_params + ["loc", "scale"]

    # Fit the parameters.
    # This would also be the place to impose constraints on the series minimum length if needed.
    def fitfunc(arr):
        """Fit distribution parameters."""
        x = np.ma.masked_invalid(arr).compressed()

        # Return NaNs if array is empty.
        if len(x) <= 1:
            return [np.nan] * len(dist_params)

        # Estimate parameters
        if method == "ML":
            args, kwargs = _fit_start(x, dist)
            params = dc.fit(x, *args, **kwargs)
        elif method == "PWM":
            params = list(lm3dc.lmom_fit(x).values())

        # Fill with NaNs if one of the parameters is NaN
        params = np.asarray(params)
        if np.isnan(params).any():
            params[:] = np.nan

        return params

    # xarray.apply_ufunc does not yet support multiple outputs with dask parallelism.
    duck = dask.array if isinstance(da.data, dask.array.Array) else np
    data = duck.apply_along_axis(fitfunc, da.get_axis_num("time"), da)

    # Coordinates for the distribution parameters
    coords = dict(da.coords.items())
    coords.pop("time")
    coords["dparams"] = dist_params

    # Dimensions for the distribution parameters
    dims = [d if d != "time" else "dparams" for d in da.dims]

    out = xr.DataArray(data=data, coords=coords, dims=dims)
    out.attrs = prefix_attrs(
        da.attrs, ["standard_name", "long_name", "units", "description"], "original_"
    )
    attrs = dict(
        long_name=f"{dist} parameters",
        description=f"Parameters of the {dist} distribution",
        method=method,
        estimator=method_name[method].capitalize(),
        scipy_dist=dist,
        units="",
        xclim_history=update_history(
            f"Estimate distribution parameters by {method_name[method]} method.",
            new_name="fit",
            data=da,
        ),
    )
    out.attrs.update(attrs)
    return out
Example No. 15
def parametric_quantile(p: xr.DataArray, q: Union[float, Sequence]) -> xr.DataArray:
    """Return the value corresponding to the given distribution parameters and quantile.

    Parameters
    ----------
    p : xr.DataArray
      Distribution parameters returned by the `fit` function. The array should have dimension `dparams` storing the
      distribution parameters, and attribute `scipy_dist`, storing the name of the distribution.
    q : Union[float, Sequence]
      Quantile to compute, which must be between 0 and 1 inclusive.

    Returns
    -------
    xarray.DataArray
      An array of parametric quantiles estimated from the distribution parameters.

    Notes
    -----
    When all quantiles are above 0.5, the `isf` method is used instead of `ppf` because accuracy is sometimes better.
    """
    q = np.atleast_1d(q)

    # Get the distribution
    dist = p.attrs["scipy_dist"]
    dc = get_dist(dist)

    # Create a lambda function to facilitate passing arguments to dask. There is probably a better way to do this.
    if np.all(q > 0.5):

        def func(x):
            return dc.isf(1 - q, *x)

    else:

        def func(x):
            return dc.ppf(q, *x)

    duck = dask.array if isinstance(p.data, dask.array.Array) else np
    data = duck.apply_along_axis(func, p.get_axis_num("dparams"), p)

    # Create coordinate for the return periods
    coords = dict(p.coords.items())
    coords.pop("dparams")
    coords["quantile"] = q
    # Create dimensions
    dims = [d if d != "dparams" else "quantile" for d in p.dims]

    out = xr.DataArray(data=data, coords=coords, dims=dims)
    out.attrs = p.attrs
    out.attrs["standard_name"] = f"{dist} quantile"
    out.attrs[
        "long_name"
    ] = f"{dist} return period values for {p.attrs.get('standard_name', '')}"
    out.attrs["cell_methods"] = (
        out.attrs.get("cell_methods", "") + " dparams: ppf"
    ).strip()
    out.attrs["units"] = p.attrs["original_units"]

    out.attrs["history"] = update_history(
        "Compute parametric quantiles from distribution parameters",
        new_name="parametric_quantile",
        parameters=p,
    )
    return out
Example No. 16
def parametric_quantile(p: xr.DataArray, q: Union[float, Sequence]) -> xr.DataArray:
    """Return the value corresponding to the given distribution parameters and quantile.

    Parameters
    ----------
    p : xr.DataArray
      Distribution parameters returned by the `fit` function.
      The array should have dimension `dparams` storing the distribution parameters,
      and attribute `scipy_dist`, storing the name of the distribution.
    q : Union[float, Sequence]
      Quantile to compute, which must be between `0` and `1`, inclusive.

    Returns
    -------
    xarray.DataArray
      An array of parametric quantiles estimated from the distribution parameters.

    Notes
    -----
    When all quantiles are above 0.5, the `isf` method is used instead of `ppf` because accuracy is sometimes better.
    """
    q = np.atleast_1d(q)

    # Get the distribution
    dist = p.attrs["scipy_dist"]
    dc = get_dist(dist)

    # Create a lambda function to facilitate passing arguments to dask. There is probably a better way to do this.
    if np.all(q > 0.5):

        def func(x):
            return dc.isf(1 - q, *x)

    else:

        def func(x):
            return dc.ppf(q, *x)

    data = xr.apply_ufunc(
        func,
        p,
        input_core_dims=[["dparams"]],
        output_core_dims=[["quantile"]],
        vectorize=True,
        dask="parallelized",
        output_dtypes=[float],
        keep_attrs=True,
        dask_gufunc_kwargs={"output_sizes": {
            "quantile": len(q)
        }},
    )

    # Assign quantile coordinates and transpose to preserve original dimension order
    dims = [d if d != "dparams" else "quantile" for d in p.dims]
    out = data.assign_coords(quantile=q).transpose(*dims)
    out.attrs = unprefix_attrs(p.attrs, ["units", "standard_name"],
                               "original_")

    attrs = dict(
        long_name=f"{dist} quantiles",
        description=f"Quantiles estimated by the {dist} distribution",
        cell_methods="dparams: ppf",
        history=update_history(
            "Compute parametric quantiles from distribution parameters",
            new_name="parametric_quantile",
            parameters=p,
        ),
    )
    out.attrs.update(attrs)
    return out
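A minimal return-period style sketch chaining `fit` and `parametric_quantile` (assuming both are importable from the same module, e.g. `xclim.indices.stats`):

import numpy as np
import pandas as pd
import xarray as xr

annual_max = xr.DataArray(
    np.random.gumbel(loc=20, scale=5, size=50),
    dims="time",
    coords={"time": pd.date_range("1951-01-01", periods=50, freq="YS")},
    attrs={"units": "mm/d"},
)
params = fit(annual_max, dist="gumbel_r")
# Value exceeded on average once every 100 years, i.e. the 0.99 quantile.
q100 = parametric_quantile(params, q=0.99)
print(q100.values)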
Example No. 17
def fit(da: xr.DataArray, dist: str = "norm"):
    """Fit an array to a univariate distribution along the time dimension.

    Parameters
    ----------
    da : xr.DataArray
      Time series to be fitted along the time dimension.
    dist : str
      Name of the univariate distribution, such as beta, expon, genextreme, gamma, gumbel_r, lognorm, norm
      (see scipy.stats).

    Returns
    -------
    xr.DataArray
      An array of distribution parameters fitted using the method of Maximum Likelihood.

    Notes
    -----
    Coordinates for which all values are NaNs will be dropped before fitting the distribution. If the array
    still contains NaNs, the distribution parameters will be returned as NaNs.
    """
    # Get the distribution
    dc = get_dist(dist)
    shape_params = [] if dc.shapes is None else dc.shapes.split(",")
    dist_params = shape_params + ["loc", "scale"]

    # Fit the parameters.
    # This would also be the place to impose constraints on the series minimum length if needed.
    def fitfunc(arr):
        """Fit distribution parameters."""
        x = np.ma.masked_invalid(arr).compressed()

        # Return NaNs if array is empty.
        if len(x) <= 1:
            return [np.nan] * len(dist_params)

        # Fill with NaNs if one of the parameters is NaN
        params = np.asarray(dc.fit(x))
        if np.isnan(params).any():
            params[:] = np.nan

        return params

    # xarray.apply_ufunc does not yet support multiple outputs with dask parallelism.
    data = dask.array.apply_along_axis(fitfunc, da.get_axis_num("time"), da)

    # Count the number of values used for the fit.
    # n = da.notnull().count(dim='time')

    # Coordinates for the distribution parameters
    coords = dict(da.coords.items())
    coords.pop("time")
    coords["dparams"] = dist_params

    # Dimensions for the distribution parameters
    dims = [d if d != "time" else "dparams" for d in da.dims]

    out = xr.DataArray(data=data, coords=coords, dims=dims)
    out.attrs = da.attrs
    out.attrs["original_name"] = da.attrs.get("standard_name", "")
    out.attrs["original_units"] = da.attrs.get("units", "")
    out.attrs[
        "description"
    ] = f"Parameters of the {dist} distribution fitted over {out.attrs['original_name']}"
    out.attrs["estimator"] = "Maximum likelihood"
    out.attrs["scipy_dist"] = dist
    out.attrs["units"] = ""
    out.attrs["history"] = update_history(
        "Estimate distribution parameters by maximum likelihood.",
        new_name="fit",
        data=da,
    )
    return out
Example No. 18
def ensemble_percentiles(
    ens: xr.Dataset,
    values: Tuple[int, int, int] = (10, 50, 90),
    keep_chunk_size: Optional[bool] = None,
) -> xr.Dataset:
    """Calculate ensemble statistics between a results from an ensemble of climate simulations.

    Returns a Dataset containing ensemble percentiles for input climate simulations.

    Parameters
    ----------
    ens: xr.Dataset
      Ensemble dataset (see xclim.ensembles.create_ensemble).
    values : Tuple[int, int, int]
      Percentile values to calculate. Default: (10, 50, 90).
    keep_chunk_size : Optional[bool]
      For ensembles using dask arrays, all chunks along the 'realization' axis are merged.
      If True, the dataset is rechunked along the dimension with the largest chunks, so that the chunks keep approximately the same size.
      If False, no shrinking is performed, resulting in much larger chunks.
      If not defined, the function decides which is best.

    Returns
    -------
    xr.Dataset
      Dataset containing data variables of requested ensemble statistics.

    Examples
    --------
    >>> from xclim import ensembles
    >>> import glob
    >>> ncfiles = glob.glob('/*tas*.nc')

    Create the ensemble dataset:

    >>> ens = ensembles.create_ensemble(ncfiles)

    Calculate default ensemble percentiles:

    >>> ens_percs = ensembles.ensemble_percentiles(ens)
    >>> print(ens_percs['tas_p10'])

    Calculate non-default percentiles (25th and 75th):

    >>> ens_percs = ensembles.ensemble_percentiles(ens, values=(25, 50, 75))
    >>> print(ens_percs['tas_p25'])

    If the original array has many small chunks, it might be more efficient to do:

    >>> ens_percs = ensembles.ensemble_percentiles(ens, keep_chunk_size=False)
    >>> print(ens_percs['tas_p25'])
    """

    ds_out = xr.Dataset(attrs=ens.attrs)
    for v in ens.data_vars:
        # Percentile calculation forbids any chunks along realization
        if len(ens.chunks.get("realization", [])) > 1:
            if keep_chunk_size is None:
                # Enable smart rechunking if the chunk size exceeds 2e8 elements after merging along realization
                keep_chunk_size = (
                    np.prod(ens[v].isel(realization=0).data.chunksize) *
                    ens.realization.size > 2e8)
            if keep_chunk_size:
                # Smart rechunk on dimension where chunks are the largest
                chkDim, chks = max(
                    ens.chunks.items(),
                    key=lambda kv: 0 if kv[0] == "realization" else max(kv[1]),
                )
                var = ens[v].chunk({
                    "realization": -1,
                    chkDim: len(chks) * ens.realization.size
                })
            else:
                var = ens[v].chunk({"realization": -1})
        else:
            var = ens[v]

        for p in values:
            perc = xr.apply_ufunc(
                _calc_perc,
                var,
                input_core_dims=[["realization"]],
                output_core_dims=[[]],
                keep_attrs=True,
                kwargs=dict(p=p),
                dask="parallelized",
                output_dtypes=[ens[v].dtype],
            )

            perc.name = f"{v}_p{p:02d}"
            ds_out[perc.name] = perc

            if "description" in ds_out[perc.name].attrs:
                ds_out[perc.name].attrs[
                    "description"] = f"{ds_out[perc.name].attrs['description']} : {p}th percentile of ensemble"
            else:
                ds_out[perc.name].attrs[
                    "description"] = f"{p}th percentile of ensemble"

    ds_out.attrs["history"] = update_history(
        f"Computation of the percentiles on {ens.realization.size} ensemble members.",
        ds_out,
    )
    return ds_out