def test_single_cluster():
    """Check that POT returns one extreme when all data falls in one cluster.

    The series is hourly over a single day and the threshold is placed so
    that every value is an exceedance; the assertions require the
    declustering window of 24 hours to yield exactly one extreme per case.
    """
    timestamps = pd.date_range(start="2021/01/01", end="2021/01/02", freq="1H")
    data = pd.Series(data=np.random.random(size=len(timestamps)), index=timestamps)

    # (extremes_type, threshold that every value exceeds, expected extreme)
    cases = (
        ("high", data.min() - 1, data.max()),  # Test high extremes
        ("low", data.max() + 1, data.min()),  # Test low extremes
    )
    for kind, threshold, expected in cases:
        extremes = get_extremes(
            ts=data,
            method="POT",
            extremes_type=kind,
            threshold=threshold,
            r="24H",
        )
        assert len(extremes) == 1
        assert np.isclose(extremes.values[0], expected)
def test_invalid_arguments(battery_wl):
    """Check that POT extraction rejects invalid argument values."""
    # Arguments shared by both calls; each case overrides one of them
    shared = {"ts": battery_wl, "method": "POT", "threshold": 2}

    # Test wrong extremes_type value
    with pytest.raises(ValueError, match=r"invalid value.*extremes_type"):
        get_extremes(extremes_type="BAD EXTREMES_TYPE VALUE", r="24H", **shared)

    # Test wrong r type
    with pytest.raises(ValueError, match=r"invalid value.*'r' argument"):
        get_extremes(extremes_type="high", r="abc", **shared)
def test_invalid_arguments(battery_wl):
    """Check that BM extraction rejects invalid argument values and types."""
    valid = {
        "ts": battery_wl,
        "method": "BM",
        "extremes_type": "high",
        "block_size": "365.2425D",
        "errors": "coerce",
    }

    # (expected exception, expected message pattern, the one bad argument)
    cases = (
        # Test wrong extremes_type value
        (
            ValueError,
            r"invalid value.*extremes_type",
            {"extremes_type": "BAD EXTREMES_TYPE VALUE"},
        ),
        # Test wrong block_size type
        (TypeError, r"invalid type.*block_size", {"block_size": 1}),
        # Test wrong errors value
        (
            ValueError,
            r"invalid value.*errors.*argument",
            {"errors": "BAD ERRORS VALUE"},
        ),
    )
    for exception, pattern, bad_argument in cases:
        with pytest.raises(exception, match=pattern):
            get_extremes(**{**valid, **bad_argument})
def test_extreme_value_extraction(battery_wl, extremes_type):
    """Check BM extraction under each of the three `errors` policies."""
    shared = {
        "ts": battery_wl,
        "method": "BM",
        "extremes_type": extremes_type,
        "block_size": "365.2425D",
    }

    # Test errors=raise
    with pytest.raises(ValueError):
        get_extremes(errors="raise", **shared)

    # Test errors=ignore
    with pytest.warns(NoDataBlockWarning, match=r"blocks contained no data"):
        extremes_ignored = get_extremes(errors="ignore", **shared)
    assert len(extremes_ignored) == 96

    # Test errors=coerce
    with pytest.warns(NoDataBlockWarning, match=r"blocks contained no data"):
        extremes_coerced = get_extremes(errors="coerce", **shared)
    assert len(extremes_coerced) == 100

    # The most extreme value and the mean must agree between both policies
    peak = "max" if extremes_type == "high" else "min"
    assert np.isclose(
        getattr(extremes_ignored, peak)(),
        getattr(extremes_coerced, peak)(),
    )
    assert np.isclose(extremes_ignored.mean(), extremes_coerced.mean())
def test_extreme_value_extraction(battery_wl, extremes_type, threshold):
    """Check POT extraction: global extreme, cluster count, and spacing."""
    extremes = get_extremes(
        ts=battery_wl,
        method="POT",
        extremes_type=extremes_type,
        threshold=threshold,
        r="24H",
    )

    if extremes_type == "high":
        assert np.isclose(extremes.max(), battery_wl.max())
        assert len(extremes) == 117
    elif extremes_type == "low":
        assert np.isclose(extremes.min(), battery_wl.min())
        assert len(extremes) == 104

    # Consecutive extremes must be separated by more than the window `r`
    gaps = np.diff(extremes.index)
    window = pd.to_timedelta("24H").to_numpy()
    assert np.all(gaps > window)
def test_min_last_block(battery_wl, extremes_type):
    """Check that `min_last_block=0.9` yields one extreme fewer than `None`.

    All remaining extremes must match the untrimmed result to within 0.01.
    """
    shared = {
        "ts": battery_wl,
        "method": "BM",
        "extremes_type": extremes_type,
        "block_size": "365.2425D",
        "errors": "coerce",
    }

    with pytest.warns(NoDataBlockWarning, match=r"blocks contained no data"):
        extremes_full = get_extremes(min_last_block=None, **shared)

    with pytest.warns(NoDataBlockWarning, match=r"blocks contained no data"):
        extremes_trimmed = get_extremes(min_last_block=0.9, **shared)

    assert len(extremes_full) - len(extremes_trimmed) == 1
    assert np.allclose(
        extremes_full.values[:-1],
        extremes_trimmed.values,
        atol=0.01,
        rtol=0,
    )
def get_extremes(self, method: str, extremes_type: str = "high", **kwargs) -> None:
    """
    Get extreme events from time series.

    Extracts extreme values from the 'self.data' attribute and stores them
    in the private extremes attribute, together with the extraction method,
    the extremes type, the extraction keyword arguments and a freshly built
    extremes transformer. Any previously declared model is discarded.

    Parameters
    ----------
    method : str
        Extreme value extraction method.
        Supported values:
            BM - Block Maxima
            POT - Peaks Over Threshold
    extremes_type : str, optional
        high (default) - get extreme high values
        low - get extreme low values
    kwargs
        if method is BM:
            block_size : str or pandas.Timedelta, optional
                Block size (default='1Y').
            errors : str, optional
                raise (default) - raise an exception
                    when encountering a block with no data
                ignore - ignore blocks with no data
                coerce - get extreme values for blocks with no data
                    as mean of all other extreme events in the series
                    with index being the middle point of corresponding interval
        if method is POT:
            threshold : float
                Threshold used to find exceedances.
            r : str or pandas.Timedelta, optional
                Duration of window used to decluster the exceedances.
                By default r='24H' (24 hours).

    Raises
    ------
    AssertionError
        If extraction succeeded but `method` is neither 'BM' nor 'POT'.

    """
    description = f"for method='{method}' and extremes_type='{extremes_type}'"

    logger.debug(f"extracting extreme values {description}")
    self.__extremes = get_extremes(
        method=method,
        ts=self.data,
        extremes_type=extremes_type,
        **kwargs,
    )
    self.__extremes_method = method
    self.__extremes_type = extremes_type
    logger.info(f"successfully extracted extreme values {description}")

    # Record the arguments in effect, filling in defaults for omitted ones
    logger.debug("collecting extreme value properties ")
    self.__extremes_kwargs = {}
    if method not in ("BM", "POT"):
        raise AssertionError
    if method == "BM":
        block_size = kwargs.get("block_size", "1Y")
        self.__extremes_kwargs["block_size"] = pd.to_timedelta(block_size)
        self.__extremes_kwargs["errors"] = kwargs.get("errors", "raise")
    else:
        self.__extremes_kwargs["threshold"] = kwargs["threshold"]
        window = kwargs.get("r", "24H")
        self.__extremes_kwargs["r"] = pd.to_timedelta(window)
    logger.info("successfully collected extreme value properties")

    logger.debug("creating extremes transformer")
    self.__extremes_transformer = ExtremesTransformer(
        extremes=self.__extremes,
        extremes_type=self.__extremes_type,
    )
    logger.info("successfully created extremes transformer")

    # A model fitted to the previous extremes is no longer valid
    logger.info("removing any previously declared models")
    self.__model = None
def _calculate_modified_parameters(
    args: typing.Tuple[
        pd.Series,  # ts (time series)
        str,  # extremes_type
        float,  # threshold
        typing.Union[str, pd.Timedelta],  # r
        typing.Optional[float],  # alpha
        int,  # n_samples
        int,  # seed
    ],
) -> typing.Dict[str, typing.Optional[float]]:
    """
    Calculate GPD shape and modified scale parameters for one threshold.

    Parameters
    ----------
    args : tuple
        Packed arguments, in order:
        ts, extremes_type, threshold, r, alpha, n_samples, seed.
        `alpha` is the confidence interval width; if None, confidence
        bounds are not calculated. `n_samples` is the number of bootstrap
        samples and `seed` seeds the random generator used to draw them.

    Returns
    -------
    result : dict
        Dictionary with keys 'threshold', 'shape', 'scale',
        'shape_ci_lower', 'shape_ci_upper', 'scale_ci_lower',
        'scale_ci_upper'. The four confidence bound entries are None
        if `alpha` is None.

    """
    ts, extremes_type, threshold, r, alpha, n_samples, seed = args

    result: typing.Dict[str, typing.Optional[float]] = {"threshold": threshold}

    # Extract declustered exceedances for this threshold
    extremes = get_extremes(
        ts=ts,
        method="POT",
        extremes_type=extremes_type,
        threshold=threshold,
        r=r,
    )
    transformer = ExtremesTransformer(
        extremes=extremes,
        extremes_type=extremes_type,
    )

    # Central estimates; location is fixed at the threshold
    shape, _, scale = scipy.stats.genpareto.fit(
        data=transformer.transformed_extremes,
        floc=threshold,
    )
    result["shape"] = shape
    result["scale"] = scale - shape * threshold

    # No confidence interval requested
    if alpha is None:
        result.update(
            shape_ci_lower=None,
            shape_ci_upper=None,
            scale_ci_lower=None,
            scale_ci_upper=None,
        )
        return result

    # Bootstrap: refit the distribution on resampled extremes
    # NOTE(review): resampling draws from `extremes.values` while the central
    # estimate above uses `transformed_extremes` - confirm this is intended
    rng_generator = np.random.default_rng(seed=seed)
    sample_size = len(extremes)
    fit_parameters = []
    for _ in range(n_samples):
        resampled = rng_generator.choice(
            a=extremes.values,
            size=sample_size,
            replace=True,
        )
        fit_parameters.append(
            scipy.stats.genpareto.fit(data=resampled, floc=threshold)
        )

    # Rows of the transposed array: shape, location, scale
    by_parameter = np.transpose(fit_parameters)
    quantiles = [(1 - alpha) / 2, (1 + alpha) / 2]
    result["shape_ci_lower"], result["shape_ci_upper"] = np.quantile(
        a=by_parameter[0],
        q=quantiles,
    )
    result["scale_ci_lower"], result["scale_ci_upper"] = np.quantile(
        a=by_parameter[2] - by_parameter[0] * threshold,
        q=quantiles,
    )
    return result
def plot_parameter_stability(
    ts: pd.Series,
    thresholds=None,
    r: typing.Union[str, pd.Timedelta] = "24H",
    extremes_type: str = "high",
    alpha: typing.Optional[float] = None,
    n_samples: int = 100,
    figsize: tuple = (8, 5),
) -> tuple:  # pragma: no cover
    """
    Plot parameter stability plot for given threshold values.

    The parameter stability plot shows shape and modified scale parameters
    of the Generalized Pareto Distribution (GPD).
    Both shape and modified scale parameters should be approximately constant above
    a threshold for which the GPD model is valid.
    The strategy is to select the smallest (largest for extremes_type='low')
    threshold value immediately above (below for extremes_type='low')
    which the GPD parameters are approximately constant.

    Parameters
    ----------
    ts : pandas.Series
        Time series of the signal.
    thresholds : array-like, optional
        An array of thresholds for which the parameter stability plot is plotted.
        If None (default), default thresholds are obtained
        from `get_default_thresholds` with num=100.
    r : str or pandas.Timedelta, optional
        Duration of window used to decluster the exceedances.
        By default r='24H' (24 hours).
    extremes_type : str, optional
        high (default) - extreme high values
        low - extreme low values
    alpha : float, optional
        Confidence interval width in the range (0, 1).
        If None (default), then confidence interval is not shown.
    n_samples : int, optional
        Number of bootstrap samples used to estimate
        confidence interval bounds (default=100).
        Ignored if `alpha` is None.
        If larger than 50, bootstrap samples are calculated
        using a pool of worker processes.
    figsize : tuple, optional
        Figure size in inches in format (width, height).
        By default it is (8, 5).

    Returns
    -------
    figure : matplotlib.figure.Figure
        Figure object.
    axes : tuple
        Tuple with two Axes objects: (shape axes, modified scale axes).

    """
    # Get default thresholds
    if thresholds is None:
        thresholds = get_default_thresholds(
            ts=ts,
            extremes_type=extremes_type,
            num=100,
        )

    # Calculate shape and modified scale parameters for each threshold
    shape_parameters: typing.Dict[str, typing.List[float]] = {
        "values": [],
        "ci_lower": [],
        "ci_upper": [],
    }
    scale_parameters: typing.Dict[str, typing.List[float]] = {
        "values": [],
        "ci_lower": [],
        "ci_upper": [],
    }

    # Loop invariants
    distribution = scipy.stats.genpareto
    fit_function = distribution.fit
    min_samples_per_core = 50
    seed_upper_bound = int(1e6)  # integer bound for np.random.randint
    quantiles = None if alpha is None else [(1 - alpha) / 2, (1 + alpha) / 2]

    for threshold in thresholds:
        # Get extremes
        extremes = get_extremes(
            ts=ts,
            method="POT",
            extremes_type=extremes_type,
            threshold=threshold,
            r=r,
        )
        extremes_transformer = ExtremesTransformer(
            extremes=extremes,
            extremes_type=extremes_type,
        )

        # Get central estimates for shape and scale parameters
        c, _, scale = fit_function(
            data=extremes_transformer.transformed_extremes,
            floc=threshold,
        )
        shape_parameters["values"].append(c)
        scale_parameters["values"].append(scale - c * threshold)

        # Confidence bounds are optional
        if alpha is None:
            continue

        fixed_parameters = {"floc": threshold}
        if n_samples <= min_samples_per_core:
            # Calculate without multiprocessing
            seed = np.random.randint(low=0, high=seed_upper_bound, size=None)
            fit_parameters = get_fit_parameters(
                params=(
                    n_samples,
                    fit_function,
                    extremes,
                    fixed_parameters,
                    seed,
                )
            )
        else:
            # Find number of cores
            n_cores = min(
                os.cpu_count() or 2,
                int(np.ceil(n_samples / min_samples_per_core)),
            )

            # Split samples across cores; the first `extra` cores take one more
            base, extra = divmod(n_samples, n_cores)
            core_samples = [
                base + 1 if i < extra else base for i in range(n_cores)
            ]

            # Get unique random seed for each core
            seeds: typing.List[int] = []
            while len(seeds) < n_cores:
                seed = np.random.randint(low=0, high=seed_upper_bound, size=None)
                if seed not in seeds:
                    seeds.append(seed)

            # Calculate new fit parameters using processor pool
            with multiprocessing.Pool(processes=n_cores) as pool:
                fit_parameters = list(
                    itertools.chain.from_iterable(
                        pool.map(
                            get_fit_parameters,
                            zip(
                                core_samples,
                                itertools.repeat(fit_function),
                                itertools.repeat(extremes),
                                itertools.repeat(fixed_parameters),
                                seeds,
                            ),
                        )
                    )
                )

        # Calculate confidence bounds.
        # Each fit result is assumed to be the raw output of `fit_function`,
        # i.e. (shape, location, scale), so the scale lives in column 2 -
        # same layout as the central estimate unpacked above.
        transposed = np.transpose(fit_parameters)
        shapes = transposed[0]
        scales = transposed[2] - shapes * threshold
        cil, ciu = np.quantile(a=shapes, q=quantiles)
        shape_parameters["ci_lower"].append(cil)
        shape_parameters["ci_upper"].append(ciu)
        cil, ciu = np.quantile(a=scales, q=quantiles)
        scale_parameters["ci_lower"].append(cil)
        scale_parameters["ci_upper"].append(ciu)

    with plt.rc_context(rc=pyextremes_rc):
        # Create figure
        fig = plt.figure(figsize=figsize, dpi=96)

        # Create gridspec
        gs = matplotlib.gridspec.GridSpec(
            nrows=2,
            ncols=1,
            wspace=0.1,
            hspace=0.1,
            width_ratios=[1],
            height_ratios=[1, 1],
        )

        # Create and configure axes
        ax_shape = fig.add_subplot(gs[0, 0])
        ax_scale = fig.add_subplot(gs[1, 0])

        for ax, parameters in (
            (ax_shape, shape_parameters),
            (ax_scale, scale_parameters),
        ):
            # Plot central estimates
            ax.plot(
                thresholds,
                parameters["values"],
                ls="-",
                color="#F85C50",
                lw=2,
                zorder=15,
            )

            # Plot confidence bounds
            if alpha is not None:
                for ci in (parameters["ci_lower"], parameters["ci_upper"]):
                    ax.plot(
                        thresholds,
                        ci,
                        color="#5199FF",
                        lw=1,
                        ls="--",
                        zorder=10,
                    )
                ax.fill_between(
                    thresholds,
                    parameters["ci_lower"],
                    parameters["ci_upper"],
                    facecolor="#5199FF",
                    edgecolor="None",
                    alpha=0.25,
                    zorder=5,
                )

        # Configure axes: the x-axis is labelled on the bottom panel only
        ax_shape.tick_params(axis="x", which="both", labelbottom=False, length=0)
        ax_scale.set_xlim(ax_shape.get_xlim())

        # Label axes
        ax_shape.set_ylabel(r"Shape, $\xi$")
        ax_scale.set_ylabel(r"Modified scale, $\sigma^*$")
        ax_scale.set_xlabel("Threshold")

        return fig, (ax_shape, ax_scale)
def test_get_extremes():
    """Check that an unsupported `method` value raises ValueError."""
    series = pd.Series([1, 2, 3])
    with pytest.raises(ValueError, match=r"invalid value.*method.*argument"):
        get_extremes(
            ts=series,
            method="BAD METHOD",
            extremes_type="high",
        )