Exemple #1
0
def eva_model_pot(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with high peaks-over-threshold extremes extracted.

    The extremes are taken from `battery_wl_preprocessed` using the "POT"
    method with a threshold of 1.35 and `r` set to "24H".
    """
    pot_settings = {
        "method": "POT",
        "extremes_type": "high",
        "threshold": 1.35,
        "r": "24H",
    }
    model = EVA(data=battery_wl_preprocessed)
    model.get_extremes(**pot_settings)
    return model
Exemple #2
0
def eva_model_bm(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with high block-maxima extremes extracted.

    The extremes are taken from `battery_wl_preprocessed` using the "BM"
    method with a block size of "365.2425D" and `errors="raise"`.
    """
    bm_settings = {
        "method": "BM",
        "extremes_type": "high",
        "block_size": "365.2425D",
        "errors": "raise",
    }
    model = EVA(data=battery_wl_preprocessed)
    model.get_extremes(**bm_settings)
    return model
def _calculate_return_value(
    args: typing.Tuple[pd.Series,  # ts (time series)
                       float,  # return_period
                       typing.Union[str, pd.Timedelta],  # return_period_size
                       float,  # threshold
                       typing.Union[str, pd.Timedelta],  # r
                       str,  # extremes_type
                       typing.Union[str,
                                    scipy.stats.rv_continuous],  # distribution
                       str,  # distribution_name
                       typing.Optional[float],  # alpha
                       int,  # n_samples
                       ],
) -> typing.Dict[str, typing.Union[str, typing.Optional[float]]]:
    """
    Fit a POT model for one threshold/distribution pair and get a return value.

    Parameters
    ----------
    args : tuple
        All inputs packed into a single tuple, in this order:
        ts, return_period, return_period_size, threshold, r, extremes_type,
        distribution, distribution_name, alpha, n_samples.
        NOTE(review): presumably packed this way so the function can be
        handed to a pool-style `map` call - confirm against the caller.

    Returns
    -------
    dict
        Dictionary with keys "distribution_name", "threshold", "rv", "cil"
        and "ciu", where the last three are the values returned by
        `EVA.get_return_value`.

    """
    (ts, return_period, return_period_size, threshold, r, extremes_type,
     distribution, distribution_name, alpha, n_samples) = args

    # Extract extremes and fit the requested distribution
    pot_model = EVA(data=ts)
    pot_model.get_extremes(
        method="POT",
        extremes_type=extremes_type,
        threshold=threshold,
        r=r,
    )
    pot_model.fit_model(model="MLE", distribution=distribution)

    def return_value_for(sample_count):
        # Same query every time; only the number of samples varies
        return pot_model.get_return_value(
            return_period=return_period,
            return_period_size=return_period_size,
            alpha=alpha,
            n_samples=sample_count,
        )

    # TODO - this is a hack to avoid spawning nested subprocesses
    # The sample count is raised in steps of 10 until it reaches `n_samples`.
    # NOTE(review): this warm-up may be redundant when `alpha` is None -
    # confirm against `EVA.get_return_value` before simplifying.
    warmup_count = n_samples % 10
    while warmup_count < n_samples:
        warmup_count += 10
        return_value_for(warmup_count)

    rv, cil, ciu = return_value_for(n_samples)
    return {
        "distribution_name": distribution_name,
        "threshold": threshold,
        "rv": rv,
        "cil": cil,
        "ciu": ciu,
    }
Exemple #4
0
    def test_from_extremes(self):
        """Check the default kwargs `EVA.from_extremes` fills in for BM and POT."""

        def make_extremes():
            # 100 values (0..99) on evenly spaced timestamps from 2000 to 2050
            timestamps = pd.date_range(start="2000", end="2050", periods=100)
            series = pd.Series(
                data=np.arange(100),
                index=timestamps,
                name="water level [m]",
            )
            return timestamps, series

        # Test default BM arguments
        index, extremes = make_extremes()
        bm_model = EVA.from_extremes(
            extremes=extremes,
            method="BM",
            extremes_type="high",
        )
        assert bm_model.extremes_method == "BM"
        assert bm_model.extremes_type == "high"
        assert bm_model.extremes_kwargs["errors"] == "ignore"
        assert bm_model.extremes_kwargs["min_last_block"] is None

        # The block size is expected to equal the mean spacing of the index
        mean_spacing = (index.max() - index.min()) / (len(index) - 1)
        expected_block_size = mean_spacing.to_numpy().astype(float)
        actual_block_size = (
            bm_model.extremes_kwargs["block_size"].to_numpy().astype(float))
        assert np.isclose(expected_block_size,
                          actual_block_size,
                          rtol=0,
                          atol=1e-6)

        # Test default POT arguments
        for extremes_type, expected_threshold in (("high", 0), ("low", 99)):
            _, extremes = make_extremes()
            pot_model = EVA.from_extremes(
                extremes=extremes,
                method="POT",
                extremes_type=extremes_type,
            )
            assert np.isclose(pot_model.extremes_kwargs["threshold"],
                              expected_threshold,
                              rtol=0,
                              atol=1e-6)
            assert pot_model.extremes_kwargs["r"] == pd.to_timedelta("24H")
Exemple #5
0
def eva_model_bm_emcee(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with high BM extremes and a fitted "Emcee" model.

    Extremes use a block size of "365.2425D" with `errors="raise"`; the
    model is then fitted with 10 walkers and 100 samples.
    """
    bm_settings = {
        "method": "BM",
        "extremes_type": "high",
        "block_size": "365.2425D",
        "errors": "raise",
    }
    model = EVA(data=battery_wl_preprocessed)
    model.get_extremes(**bm_settings)
    model.fit_model("Emcee", n_walkers=10, n_samples=100)
    return model
Exemple #6
0
def eva_model_bm_mle(battery_wl_preprocessed) -> EVA:
    """
    Build an EVA model with high BM extremes and a fitted "MLE" model.

    Extremes are extracted from `battery_wl_preprocessed` with the "BM"
    method and `errors="raise"`, then the model is fitted with "MLE".
    """
    eva_model = EVA(data=battery_wl_preprocessed)
    eva_model.get_extremes(
        method="BM",
        extremes_type="high",
        # "365.2425D" instead of "1Y": pandas no longer accepts the ambiguous
        # "Y" unit when parsing timedeltas, and this is the same duration
        # used by the other block-maxima fixtures.
        block_size="365.2425D",
        errors="raise",
    )
    eva_model.fit_model("MLE")
    return eva_model
Exemple #7
0
    def test_init_errors(self):
        """Check the errors and warnings `EVA(...)` emits for malformed `data`."""

        def dated(values, years):
            # Series whose index is built from the given date strings
            return pd.Series(data=values, index=pd.DatetimeIndex(years))

        not_numeric = r"`data`.*not numeric.*converting"

        # `data` that is not a pandas.Series is rejected
        with pytest.raises(
                TypeError,
                match=r"invalid type.*`data` argument.*pandas.Series"):
            EVA(data=1)

        # Numeric strings trigger a warning and end up as numbers
        with pytest.warns(RuntimeWarning, match=not_numeric):
            model = EVA(data=dated(["1", "2", "3"], ["2020", "2021", "2022"]))
            assert np.allclose(model.data.values, [1, 2, 3])

        # Non-numeric strings are expected to both warn and raise
        with pytest.warns(RuntimeWarning, match=not_numeric), pytest.raises(
                TypeError,
                match=r"invalid dtype.*`data` argument.*numeric"):
            EVA(data=dated(["a", "b", "c"], ["2020", "2021", "2022"]))

        # A plain string index is not accepted as a date-time index
        with pytest.raises(TypeError,
                           match=r"index of `data`.*date-time.*not"):
            EVA(data=pd.Series(data=[1, 2, 3], index=["2020", "2021", "2022"]))

        # An unsorted index triggers a warning and ends up sorted
        with pytest.warns(RuntimeWarning,
                          match=r"index is not sorted.*sorting"):
            model = EVA(data=dated([1, 2, 3], ["2022", "2021", "2020"]))
            assert np.allclose(model.data.index.year.values,
                               [2020, 2021, 2022])

        # Null values trigger a warning and are dropped
        with pytest.warns(RuntimeWarning,
                          match=r"Null values found.*removing invalid"):
            model = EVA(data=dated([1, 2, np.nan, 3],
                                   ["2020", "2021", "2022", "2023"]))
            assert np.allclose(model.data.values, [1, 2, 3])
            assert np.allclose(model.data.index.year.values,
                               [2020, 2021, 2023])
Exemple #8
0
def eva_model(battery_wl_preprocessed) -> EVA:
    """Wrap `battery_wl_preprocessed` in an EVA model without extracting extremes."""
    model = EVA(data=battery_wl_preprocessed)
    return model
Exemple #9
0
    def test_set_extremes_errors(self):
        """Check that `EVA.set_extremes` rejects invalid extremes and kwargs."""
        eva_model = EVA(data=pd.Series(
            data=np.arange(100),
            index=pd.date_range(start="2000", end="2050", periods=100),
            name="water level [m]",
        ))

        def dated(values, years, **series_kwargs):
            # Series whose index is built from the given date strings
            return pd.Series(data=values,
                             index=pd.DatetimeIndex(years),
                             **series_kwargs)

        def valid_extremes():
            # Extremes that are acceptable on their own, so that only the
            # keyword arguments under test can trigger the failure
            return dated([1, 2, 3], ["2020", "2021", "2022"],
                         name=eva_model.data.name)

        # Test invalid `extremes`
        # Builders are lazy so each object is created inside `pytest.raises`
        invalid_extremes = [
            (
                TypeError,
                r"invalid type.*must be pandas.Series",
                lambda: [1, 2, 3],
            ),
            (
                TypeError,
                r"invalid index.*must be date-time",
                lambda: pd.Series(data=[1, 2, 3], index=[1, 2, 3]),
            ),
            (
                TypeError,
                r"`extremes` must have numeric values",
                lambda: dated(["a", "b", "c"], ["2020", "2021", "2022"]),
            ),
            (
                ValueError,
                "name doesn't match",
                lambda: dated([1, 2, 3], ["2020", "2021", "2022"],
                              name="different name"),
            ),
            (
                ValueError,
                ".+time range must fit within.+",
                lambda: dated([1, 2, 3], ["1990", "2021", "2022"]),
            ),
        ]
        for error_type, pattern, build_extremes in invalid_extremes:
            with pytest.raises(error_type, match=pattern):
                eva_model.set_extremes(build_extremes())

        # Each entry: expected exception, message pattern, keyword arguments
        invalid_kwargs = [
            # Test invalid general kwargs
            (
                ValueError,
                r"`method` must be either.+",
                {"method": "wrong method"},
            ),
            (
                ValueError,
                r"`extremes_type` must be either.+",
                {"method": "BM", "extremes_type": "wrong type"},
            ),
            # Test invalid BM kwargs
            (
                ValueError,
                r"`block_size` must be a positive.+",
                {"method": "BM", "extremes_type": "high",
                 "block_size": "-1D"},
            ),
            (
                ValueError,
                r"invalid value.+`errors` argument",
                {"method": "BM", "extremes_type": "high",
                 "errors": "wrong errors"},
            ),
            (
                ValueError,
                r"`min_last_block` must be a number.+",
                {"method": "BM", "extremes_type": "high",
                 "min_last_block": 2.0},
            ),
            # Test invalid POT kwargs
            (
                ValueError,
                r"invalid `threshold` value",
                {"method": "POT", "extremes_type": "high", "threshold": 2},
            ),
            (
                ValueError,
                r"`r` must be a positive.+",
                {"method": "POT", "extremes_type": "high", "r": "-1D"},
            ),
            # Test unrecognized arguments
            (
                TypeError,
                r"unrecognized arguments.+",
                {"method": "BM", "extremes_type": "high",
                 "unrecognized_argument": 1},
            ),
        ]
        for error_type, pattern, kwargs in invalid_kwargs:
            with pytest.raises(error_type, match=pattern):
                eva_model.set_extremes(valid_extremes(), **kwargs)
def plot_aic_scores(
        ts: pd.Series,
        thresholds=None,
        r: typing.Union[str, pd.Timedelta] = "24H",
        extremes_type: str = "high",
        distributions: typing.Optional[typing.List[typing.Union[
            str, scipy.stats.rv_continuous]]] = None,
        ax: typing.Optional[plt.Axes] = None,
        figsize: tuple = (8, 5),
) -> plt.Axes:
    """
    Plot AIC scores for each distribution and threshold.

    Used to investigate which distribution better explains data variance for each
    threshold value. Does NOT indicate which threshold value is better because
    it will always have the same shape - logarithmic curve.

    Parameters
    ----------
    ts : pandas.Series
        Time series of the signal.
    thresholds : array-like, optional
        An array of thresholds for which the AIC plot is plotted.
        If None (default), plots AIC for 100 equally-spaced thresholds
        between 90th (10th if extremes_type='low') percentile
        and 10th largest (smallest if extremes_type='low') value in the series.
    r : pandas.Timedelta or value convertible to timedelta, optional
        Duration of window used to decluster the exceedances.
        By default r='24H' (24 hours).
        See pandas.to_timedelta for more information.
    extremes_type : str, optional
        high (default) - extreme high values
        low - extreme low values
    distributions : list, optional
        List of distributions for which the AIC curves are plotted.
        By default these are "genpareto" and "expon".
        A distribution must be either a name of distribution from scipy.stats
        or a subclass of scipy.stats.rv_continuous.
        See https://docs.scipy.org/doc/scipy/reference/stats.html
    ax : matplotlib.axes._axes.Axes, optional
        If provided, then the plot is drawn on this axes.
        If None (default), new figure and axes are created
    figsize : tuple, optional
        Figure size in inches in format (width, height).
        By default it is (8, 5).

    Returns
    -------
    plt.Axes
        Axes object.

    """
    # Get default `thresholds`
    if thresholds is None:
        thresholds = get_default_thresholds(
            ts=ts,
            extremes_type=extremes_type,
            num=100,
        )

    # Get default `distributions`
    if distributions is None:
        distributions = [
            "genpareto",
            "expon",
        ]
    # Distributions given as objects are labelled by their `name` attribute
    distribution_names: typing.List[str] = [
        distribution if isinstance(distribution, str) else distribution.name
        for distribution in distributions
    ]

    # Calculate AIC values
    model = EVA(data=ts)
    records = []
    for distribution, distribution_name in zip(distributions,
                                               distribution_names):
        for threshold in thresholds:
            model.get_extremes(
                method="POT",
                extremes_type=extremes_type,
                threshold=threshold,
                r=r,
            )
            model.fit_model(model="MLE", distribution=distribution)
            records.append({
                "distribution_name": distribution_name,
                "threshold": threshold,
                "aic": model.AIC,
            })
    # Explicit columns keep the frame sortable even when there are no records
    results = pd.DataFrame(
        data=records,
        columns=["distribution_name", "threshold", "aic"],
    ).sort_values("threshold", ascending=True)

    with plt.rc_context(rc=pyextremes_rc):
        if ax is None:
            _, ax = plt.subplots(figsize=figsize, dpi=96)
            ax.grid(False)

        # Wrap around the color cycle so that any number of distributions
        # can be plotted without running past the end of the color list
        colors = pyextremes_rc["axes.prop_cycle"].by_key()["color"]
        for i, (distribution_name,
                df) in enumerate(results.groupby("distribution_name")):
            ax.plot(
                df.loc[:, "threshold"],
                df.loc[:, "aic"],
                color=colors[i % len(colors)],
                lw=2,
                ls="-",
                label=distribution_name,
                zorder=(i + 3) * 5,
            )

        # Plot legend
        ax.legend(frameon=True, framealpha=0.9)

        # Label axes
        ax.set_xlabel("Threshold")
        ax.set_ylabel("AIC Score")

        return ax
Exemple #11
0
def plot_return_value_stability(
        ts: pd.Series,
        return_period,
        return_period_size: typing.Union[str, pd.Timedelta] = "365.2425D",
        thresholds=None,
        r: typing.Union[str, pd.Timedelta] = "24H",
        extremes_type: str = "high",
        distributions: typing.Optional[typing.List[typing.Union[
            str, scipy.stats.rv_continuous]]] = None,
        alpha: typing.Optional[float] = None,
        n_samples: int = 100,
        figsize: tuple = (8, 5),
) -> tuple:  # pragma: no cover
    """
    Plot return value stability plot for given threshold values.

    The return value stability plot shows return values for given return period
    for given thresholds.
    The purpose of this plot is to investigate stability and sensitivity of the
    Generalized Pareto Distribution model to threshold value.
    Threshold value selection should still be guided by the mean residual life plot
    and the parameter stability plot. This plot should be used as additional check.

    Parameters
    ----------
    ts : pandas.Series
        Time series of the signal.
    return_period : number
        Return period.
        Given as a multiple of `return_period_size`.
    return_period_size : str or pandas.Timedelta, optional
        Size of return period (default='365.2425D').
        If set to '30D', then a return period of 12
        would be roughly equivalent to a 1 year return period (360 days).
    thresholds : array-like, optional
        An array of thresholds for which the return value stability plot is plotted.
        If None (default), plots return values for 100 equally-spaced thresholds
        between 90th (10th if extremes_type='low') percentile
        and 10th largest (smallest if extremes_type='low') value in the series.
    r : str or pandas.Timedelta, optional
        Duration of window used to decluster the exceedances.
        By default r='24H' (24 hours).
    extremes_type : str, optional
        high (default) - extreme high values
        low - extreme low values
    distributions : list, optional
        List of distributions for which the return value curves are plotted.
        By default these are "genpareto" and "expon".
        A distribution must be either a name of distribution from scipy.stats
        or a subclass of scipy.stats.rv_continuous.
        See https://docs.scipy.org/doc/scipy/reference/stats.html
    alpha : float, optional
        Confidence interval width in the range (0, 1).
        If None (default), then confidence interval is not shown.
    n_samples : int, optional
        Number of bootstrap samples used to estimate
        confidence interval bounds (default=100).
        Ignored if `alpha` is None.
    figsize : tuple, optional
        Figure size in inches in format (width, height).
        By default it is (8, 5).

    Returns
    -------
    figure : matplotlib.figure.Figure
        Figure object.
    axes : matplotlib.axes._axes.Axes
        Axes object.

    """
    # Get default `thresholds`
    if thresholds is None:
        thresholds = get_default_thresholds(
            ts=ts,
            extremes_type=extremes_type,
            num=100,
        )

    # Get default `distributions`
    if distributions is None:
        distributions = [
            "genpareto",
            "expon",
        ]

    # Instantiate model
    model = EVA(data=ts)

    # Calculate return values for each threshold and distribution.
    # Curves are stored per position in `distributions` (not keyed by the
    # distribution itself), so a distribution listed twice still yields
    # curves with one value per threshold.
    curves: typing.List[tuple] = []
    for distribution in distributions:
        return_values: typing.List[float] = []
        ci_lower: typing.List[typing.Optional[float]] = []
        ci_upper: typing.List[typing.Optional[float]] = []
        for threshold in thresholds:
            model.get_extremes(
                method="POT",
                extremes_type=extremes_type,
                threshold=threshold,
                r=r,
            )
            model.fit_model(
                model="MLE",
                distribution=distribution,
            )
            rv, cil, ciu = model.get_return_value(
                return_period=return_period,
                return_period_size=return_period_size,
                alpha=alpha,
                n_samples=n_samples,
            )
            return_values.append(rv)
            ci_lower.append(cil)
            ci_upper.append(ciu)

        # Distributions given as objects are labelled by their `name`
        # attribute so that the legend shows a readable name
        if isinstance(distribution, str):
            distribution_name = distribution
        else:
            distribution_name = distribution.name
        curves.append((distribution_name, return_values, ci_lower, ci_upper))

    with plt.rc_context(rc=pyextremes_rc):
        # Create figure and axes
        fig, ax = plt.subplots(figsize=figsize, dpi=96)
        ax.grid(False)

        # Wrap around the color cycle so that any number of distributions
        # can be plotted without running past the end of the color list
        colors = pyextremes_rc["axes.prop_cycle"].by_key()["color"]

        # Plot central estimate of return values
        for i, (distribution_name, return_values, ci_lower,
                ci_upper) in enumerate(curves):
            color = colors[i % len(colors)]
            ax.plot(
                thresholds,
                return_values,
                color=color,
                lw=2,
                ls="-",
                label=distribution_name,
                zorder=(i + 3) * 5,
            )

            # Plot confidence bounds
            if alpha is not None:
                for ci in [ci_lower, ci_upper]:
                    ax.plot(
                        thresholds,
                        ci,
                        color=color,
                        lw=1,
                        ls="--",
                        zorder=(i + 2) * 5,
                    )
                ax.fill_between(
                    thresholds,
                    ci_lower,
                    ci_upper,
                    facecolor=color,
                    edgecolor="None",
                    alpha=0.25,
                    zorder=(i + 1) * 5,
                )

        # Plot legend
        ax.legend(
            frameon=True,
            framealpha=0.9,
        )

        # Label axes
        ax.set_xlabel("Threshold")
        ax.set_ylabel("Return value")

        return fig, ax