# NOTE: the imports below assume the pykelihood package layout; the helpers
# `get_quantiles_and_confidence_intervals` /
# `get_quantiles_and_confidence_intervals_uniform_scale` and
# `MixtureExponentialModel` are expected to be provided elsewhere in this package.
import math
from typing import List, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pykelihood import kernels
from pykelihood.distributions import Distribution, Exponential, Uniform
from pykelihood.metrics import opposite_log_likelihood
from pykelihood.profiler import Profiler

# Optional dependency for fitting/sampling a univariate exponential Hawkes process.
try:
    from hawkeslib import UnivariateExpHawkesProcess as UEHP
except ImportError:
    UEHP = None


def qq_plot_exponential_scale(
    fit: Distribution,
    data: Union[pd.DataFrame, np.ndarray, pd.Series],
    ax=None,
    path_to_figure: str = None,
    figure_name="qq_plot",
    ci_confidence=0.99,
):
    """
    QQ plot of the fitted distribution against the data on the unit
    exponential scale, with a pointwise confidence envelope.
    """
    # Resolve the axes lazily: a `plt.gca()` default argument would be
    # evaluated once at import time and silently reused across calls.
    if ax is None:
        ax = plt.gca()
    (
        theoretical,
        empirical,
        lower_bound,
        upper_bound,
    ) = get_quantiles_and_confidence_intervals_uniform_scale(fit, data, ci_confidence)
    # Map the uniform-scale quantiles to the unit exponential scale.
    unit_exp = Exponential()
    theo_exp = unit_exp.inverse_cdf(theoretical)
    empi_exp = unit_exp.inverse_cdf(empirical)
    lb_exp = unit_exp.inverse_cdf(lower_bound)
    ub_exp = unit_exp.inverse_cdf(upper_bound)
    n = len(data)
    ax.scatter(theo_exp, empi_exp, s=5, color="navy")
    ax.plot(theo_exp, theo_exp, label="$x=y$", color="navy")
    ax.fill_betweenx(y=empi_exp, x1=lb_exp, x2=ub_exp, alpha=0.2, color="navy")
    ax.legend()
    ax.set_xlabel(f"Theoretical unit Exponential quantiles ({n} observations)")
    ax.set_ylabel("Empirical quantiles")
    ax.set_title("QQ Plot: Unit Exponential Scale")
    plt.tight_layout()
    if path_to_figure is not None:
        plt.savefig(f"{path_to_figure}/{figure_name}.png")
    return ax
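# Usage sketch (hypothetical data; `Exponential.fit` and the quantile helper
# are assumed available in this package):
#
#     rng = np.random.default_rng(0)
#     obs = pd.Series(rng.exponential(scale=2.0, size=500))
#     qq_plot_exponential_scale(Exponential.fit(obs), obs, path_to_figure=".")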
def HawkesByThinningModified(T, mu, alpha, theta, phi=0):
    """
    Implements Ogata's modified thinning algorithm for sampling from a
    univariate Hawkes process with exponential decay.

    :param T: the maximum time
    :param mu: the exogenous (background) intensity
    :param alpha: the infectivity factor
    :param theta: rate parameter of the exponential delay density
    :param phi: (optional) starting value of phi, the running sum of
        exponential decays encoding the history up to the starting time,
        which makes it possible to draw conditional samples
    :return: list of sampled timestamps in ascending order
    """
    t = 0.0
    d = 0.0  # time elapsed since the last accepted point
    P = []
    while t < T:
        # Upper bound on the intensity until the next candidate point.
        M = mu + alpha * theta * (1 + phi)
        e = Exponential(rate=M).rvs(1)
        t += e
        exp_decay = np.exp(-theta * (e + d))
        lambda_ = mu + alpha * theta * exp_decay * (1 + phi)
        u = Uniform(loc=0.0, scale=M).rvs(1)
        if t < T and u <= lambda_:
            # Accept the candidate and fold it into the history term phi.
            P.append(float(t))
            phi = exp_decay * (1 + phi)
            d = 0
        else:
            # Reject: accumulate the elapsed time so the decay stays correct.
            d += e
    return P
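# Usage sketch (hypothetical parameters): baseline intensity mu=0.5,
# branching ratio alpha=0.3 and decay rate theta=1.0 on [0, 100].
#
#     events = HawkesByThinningModified(T=100.0, mu=0.5, alpha=0.3, theta=1.0)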
def PoissonByThinning(T, lambd, M):
    """
    Samples a (possibly inhomogeneous) Poisson process on [0, T] by thinning.

    :param T: maximum time
    :param lambd: function or float describing the intensity of the Poisson process
    :param M: upper bound on lambd(.) over [0, T]
    :return: sample timestamps in ascending order
    """
    # Wrap a constant intensity in a function so both cases are handled uniformly.
    if not callable(lambd):
        functional_lambd = lambda x: lambd
    else:
        functional_lambd = lambd
    P = []
    t = 0
    while t < T:
        # Candidate points arrive at the dominating rate M...
        e = Exponential(loc=0.0, rate=M).rvs(1)
        t += e
        u = Uniform(loc=0.0, scale=M).rvs(1)
        # ...and are kept with probability lambd(t) / M.
        if t < T and u <= functional_lambd(t):
            P.append(float(t))
    return P
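# Usage sketch (hypothetical intensity): an inhomogeneous Poisson process
# with sinusoidal intensity, bounded above by M = 2.
#
#     events = PoissonByThinning(T=50.0, lambd=lambda t: 1 + np.sin(t), M=2.0)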
def HawkesByThinning(T, lambd, tau=None):
    """
    Samples a Hawkes process by thinning, following
    https://arxiv.org/pdf/1507.02822.pdf.

    :param T: maximum time
    :param lambd: intensity function; must be non-increasing between arrival
        times so that its value just after an arrival bounds it until the next
    :param tau: true inter-arrival times (optional); if not provided, the
        simulated ones are used
    :return: sample timestamps in ascending order
    """
    epsilon = 1e-10
    P = []
    # If no history is given, alias tau to P so that accepted points
    # immediately feed back into the intensity.
    tau = P if tau is None else tau
    t = 0
    while t < T:
        # The intensity just after t dominates it until the next arrival.
        M = lambd(t + epsilon, tau)
        e = Exponential(rate=M).rvs(1)
        t += e
        u = Uniform(scale=M).rvs(1)
        if t < T and u <= lambd(t, tau):
            P.append(float(t))
    return P
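# Usage sketch (hypothetical kernel): `lambd` takes the current time and the
# history of accepted points, e.g. an exponential kernel with hypothetical
# parameters mu, alpha, theta (non-increasing between arrivals, as required):
#
#     mu, alpha, theta = 0.5, 0.3, 1.0
#     lambd = lambda t, tau: mu + alpha * theta * np.sum(
#         np.exp(-theta * (t - np.array(tau)[np.array(tau) < t]))
#     )
#     events = HawkesByThinning(T=100.0, lambd=lambd)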
def Kgaps_diagnostic_plots(
    range_K: Union[List, np.ndarray],
    inter_exceedance_times: pd.Series,
    proba_of_exceedance: float,
    path_to_figure: str,
    ci_confidence=0.99,
):
    """
    :param range_K: range over which the parameter K of the model varies;
        should be in the same unit as the inter-exceedance times
    :param inter_exceedance_times: observed inter-exceedance times
    :param proba_of_exceedance: normalizing factor corresponding to 1-F(un)
        in the model, i.e. the probability of exceeding the defined threshold
    :param path_to_figure: path to save the plots
    :return: plots the diagnostic graphs
    """
    Ks = range_K
    thetas = []
    CIs = []
    for K in Ks:
        # K-gaps model: spacings shorter than K are truncated to zero and the
        # remainder is normalized by the probability of exceedance.
        iet_normalised_to_cluster_distance = (
            np.clip(inter_exceedance_times - K, 0.0, a_max=None) * proba_of_exceedance
        )
        mem = MixtureExponentialModel.fit(iet_normalised_to_cluster_distance)
        theta = mem.theta()
        ll = Profiler(mem, iet_normalised_to_cluster_distance)
        CI = [
            ll.profiles["theta"]["theta"].min(),
            ll.profiles["theta"]["theta"].max(),
        ]
        thetas.append(theta)
        CIs.append(CI)
        freq, bins = np.histogram(iet_normalised_to_cluster_distance, bins="auto")
        freq = freq / len(iet_normalised_to_cluster_distance)
        x = (bins[:-1] + bins[1:]) / 2
        (
            theoretical,
            empirical,
            lower_bound,
            upper_bound,
        ) = get_quantiles_and_confidence_intervals(
            Exponential(rate=theta), iet_normalised_to_cluster_distance, ci_confidence
        )
        fig = plt.figure(figsize=(10, 5))
        gs = fig.add_gridspec(1, 2)
        ax = []
        for i in range(1):
            for j in range(2):
                axe = fig.add_subplot(gs[i, j])
                ax.append(axe)
        plt.subplots_adjust(wspace=0.2)
        ax0, ax1 = ax
        ax0.bar(x, freq, color="navy")
        ax0.set_title(
            f"Spacings between exceedances distribution for K={round(K, 1)} days"
        )
        ax0.set_xlabel("Interval")
        ax0.set_ylabel("Frequency")
        ax1.scatter(theoretical, empirical, s=5, color="navy")
        ax1.plot(theoretical, theoretical, label="$x=y$", color="navy")
        ax1.fill_betweenx(
            y=empirical, x1=lower_bound, x2=upper_bound, alpha=0.5, color="navy"
        )
        ax1.legend()
        ax1.set_title("QQ Plot of positive spacing vs Exponential distribution")
        ax1.set_ylabel("Empirical")
        ax1.set_xlabel(r"Exponential with $\theta$" + f"= {round(theta, 2)}")
        fig.savefig(f"{path_to_figure}/positive_spacings_summary_{round(K, 1)}.png")
    serie = pd.Series([float(t) for t in thetas], index=Ks)
    CI = pd.DataFrame(CIs)
    plt.clf()
    plt.scatter(serie.index, serie, marker="x", color="navy")
    plt.vlines(serie.index, CI[0], CI[1], color="navy")
    plt.xlabel("Inter-cluster interval $K$ (days)")
    plt.ylabel(r"$\hat{\theta}$")
    plt.title("Extremal index estimate")
    plt.savefig(f"{path_to_figure}/K-gaps_extremal_index_estimate.png")
    plt.clf()
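# Usage sketch (hypothetical inputs; requires `MixtureExponentialModel`,
# `Profiler` and `get_quantiles_and_confidence_intervals` to be available):
#
#     iet = exceedance_times.diff().dropna()  # inter-exceedance times in days
#     Kgaps_diagnostic_plots(
#         range_K=np.arange(1.0, 11.0),
#         inter_exceedance_times=iet,
#         proba_of_exceedance=0.05,
#         path_to_figure=".",
#     )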
def cum_number_exceedances(
    data: pd.DataFrame,
    length_total_period: Union[int, float],
    origin: pd.Timestamp,
    path_to_figure: str,
    compare_to_hawkes=False,
    figure_name="cum_number_of_exceedances",
):
    """
    Computes the observed and simulated cumulative number of exceedances and
    compares a homogeneous Poisson process estimate with the one obtained
    using a Hawkes process to model inter-exceedance times.

    :param data: pandas DataFrame containing the columns "data", "threshold"
        and "time" (normalized between 0 and 1 to reduce numerical errors).
    :param length_total_period: length of the total period, used for scaling
        (graphical purposes only, unlike in the previous graphs comparing the
        Hawkes and homogeneous Poisson processes, where a discrete scale is
        preferred over a "continuous" one normalized between 0 and 1 because
        of precision concerns with the input block sizes and deltas); same
        unit as the inter-exceedance times.
    :param origin: start date, used for scaling (graphical purposes).
    :param path_to_figure: path to save the figure.
    :return: plots diagnostic graphs.
    """
    data_gpd = data.set_index("time")[["data", "threshold"]]
    data_gpd = (
        data_gpd[data_gpd["data"] >= data_gpd["threshold"]]
        .reset_index()
        .assign(iat=lambda x: x["time"].diff())
        .fillna(0.0)
    )
    # Empirical cumulative count of exceedances, indexed by (normalized) time.
    realized = data.set_index("time").assign(
        bool=lambda x: (x["data"] >= x["threshold"]).astype(int)
    )["bool"]
    realized = realized[realized == 1].cumsum().reset_index().set_index("bool")["time"]

    def swap_axes(s):
        return pd.Series(s.index.values, index=s)

    realized_ = swap_axes(realized)
    x = pd.to_datetime(length_total_period * data_gpd["time"], unit="d", origin=origin)
    realized_.index = x
    # Simulated Poisson process: i.i.d. exponential inter-arrival times.
    exp_fit = Exponential.fit(data_gpd["iat"], loc=0.0)
    simulations = []
    for _ in range(500):
        simulations.append(
            pd.DataFrame(np.cumsum(exp_fit.rvs(len(data_gpd))), columns=[_])
        )
    simulations = pd.concat(simulations, axis=1)
    # Reindex each simulated path by event time so paths can be averaged pointwise.
    s_pp = (
        simulations.stack()
        .reset_index()
        .pivot(index=0, columns="level_1", values="level_0")
        .interpolate("index", axis=0)
        .bfill()
    )
    mean_pp = s_pp.mean(axis=1)
    quantile_inf_pp = np.quantile(s_pp, q=0.001, axis=1)
    # 0.999 upper quantile, symmetric with the 0.001 lower bound and
    # consistent with the Hawkes branch below.
    quantile_sup_pp = np.quantile(s_pp, q=0.999, axis=1)
    mean_pp.index = pd.to_datetime(
        length_total_period * mean_pp.index, unit="d", origin=origin
    )
    to_return = [realized_, mean_pp, quantile_inf_pp, quantile_sup_pp]
    plt.plot(mean_pp, label="Poisson Process Simulated", color="salmon")
    plt.fill_between(
        x=mean_pp.index,
        y1=quantile_inf_pp,
        y2=quantile_sup_pp,
        color="salmon",
        alpha=0.2,
    )
    if compare_to_hawkes:
        if UEHP is not None:
            # Fit and sample with hawkeslib's univariate exponential Hawkes process.
            uv = UEHP()
            uv.fit(np.array(data_gpd["time"]))
            simulations_hp = []
            for _ in range(500):
                simulations_hp.append(pd.DataFrame(uv.sample(T=1), columns=[_]))
        else:
            from pykelihood.samplers import HawkesByThinningModified

            mu = 1 / (len(data_gpd) / len(data))
            alpha = 0
            theta = 0

            def score(dist, data):
                # Penalize non-stationary fits (branching ratio alpha >= 1).
                if dist.rate.alpha >= 1.0:
                    return 10**10
                else:
                    return opposite_log_likelihood(dist, data)

            hawkes_fit = Exponential.fit(
                data_gpd["iat"],
                x0=(mu, alpha, theta),
                loc=0.0,
                rate=kernels.hawkes_with_exp_kernel(np.array(data_gpd["time"])),
                score=score,
            )
            mu, alpha, theta = hawkes_fit.optimisation_params
            simulations_hp = []
            for _ in range(500):
                simulations_hp.append(
                    pd.DataFrame(
                        HawkesByThinningModified(1, mu, alpha, theta),
                        columns=[_],
                    )
                )
        simulations_hp = pd.concat(simulations_hp, axis=1)
        s_hp = (
            simulations_hp.stack()
            .reset_index()
            .pivot(index=0, columns="level_1", values="level_0")
            .interpolate("index", axis=0)
            .bfill()
        )
        mean_hp = s_hp.mean(axis=1)
        quantile_inf_hp = np.nanquantile(s_hp, q=0.001, axis=1)
        quantile_sup_hp = np.nanquantile(s_hp, q=0.999, axis=1)
        mean_hp.index = pd.to_datetime(
            length_total_period * mean_hp.index, unit="d", origin=origin
        )
        to_return.extend([mean_hp, quantile_inf_hp, quantile_sup_hp])
        plt.plot(mean_hp, label="Hawkes Process Simulated", color="royalblue")
        plt.fill_between(
            x=mean_hp.index,
            y1=quantile_inf_hp,
            y2=quantile_sup_hp,
            color="royalblue",
            alpha=0.2,
        )
    plt.plot(realized_, label="Empirical", color="slategrey")
    plt.title("Cumulative Exceedances over Threshold")
    plt.ylabel("Number of Exceedances")
    plt.xlabel("Time")
    plt.legend()
    plt.savefig(f"{path_to_figure}/{figure_name}.png")
    plt.clf()
    return to_return
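# Usage sketch (hypothetical DataFrame `df` with columns "data", "threshold"
# and "time" normalized to [0, 1]):
#
#     curves = cum_number_exceedances(
#         df,
#         length_total_period=3650,           # ten years, in days
#         origin=pd.Timestamp("2000-01-01"),
#         path_to_figure=".",
#         compare_to_hawkes=True,
#     )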
def mean_cluster_size(
    data: pd.DataFrame,
    block_sizes: Union[List, np.ndarray],
    path_to_figure: str,
    compare_to_hawkes=False,
    figure_name="mean_cluster_size",
):
    """
    Estimates the extremal index by computing the mean cluster size over
    blocks of the sizes pre-defined in the parameter block_sizes, given that
    the block contains an exceedance (i.e. the mean of pi, the cluster-size
    distribution associated with the extremal index). Compares the estimate
    obtained using a homogeneous Poisson process with that of a Hawkes process.

    :param data: pandas DataFrame containing the columns "data" for the
        variable of interest, "threshold" for the defined (possibly seasonal)
        threshold(s), and "days_since_start", a non-normalized version of the
        time in days.
    :param block_sizes: range of cluster sizes.
    :param path_to_figure: path to save the figure.
    :return: plots diagnostic graphs.
    """

    def loop_mean_cluster_size(
        indices_exceedances, length_observation_period, block_sizes
    ):
        local_count = []
        for bs in block_sizes:
            # Count exceedances in each block of size bs...
            hp = np.histogram(
                indices_exceedances,
                bins=[
                    bs * i for i in range(math.ceil(length_observation_period / bs))
                ],
            )[0]
            # ...keeping only blocks that contain at least one exceedance.
            hp = hp[hp > 0]
            local_count.append(np.mean(hp))
        local_count = pd.Series(local_count, index=block_sizes)
        return local_count

    data_extremal_index = data[["data", "days_since_start", "threshold"]]
    data_extremal_index = (
        data_extremal_index[
            data_extremal_index["data"] >= data_extremal_index["threshold"]
        ]
        .assign(iat=lambda x: x["days_since_start"].diff())
        .fillna(0.0)
    )
    # Empirical
    empirical_mean_cluster_size = []
    for bs in block_sizes:
        exceedances_clusters = np.histogram(
            data_extremal_index["days_since_start"],
            bins=[bs * i for i in range(math.ceil(len(data) / bs))],
        )[0]
        exceedances_clusters = exceedances_clusters[
            exceedances_clusters > 0
        ]  # given that the block contains an exceedance
        empirical_mean_cluster_size.append(np.mean(exceedances_clusters))
    empirical_mean_cluster_size = pd.Series(
        empirical_mean_cluster_size, index=block_sizes
    )
    # Simulated
    exp_fit = Exponential.fit(
        data_extremal_index["iat"], loc=0.0, x0=[len(data_extremal_index) / len(data)]
    )
    exceedances_pp = []
    for i in range(1000):
        exceedances_pp.append(exp_fit.rvs(len(data_extremal_index)).cumsum())
    pp_cluster_sizes = []
    for i in range(len(exceedances_pp)):  # PP
        ex_pp = exceedances_pp[i].copy()
        local_count_pp = loop_mean_cluster_size(
            ex_pp, data["days_since_start"].max(), block_sizes
        )
        pp_cluster_sizes.append(local_count_pp)
    pp_cluster_sizes = pd.concat(pp_cluster_sizes, axis=1)
    x = block_sizes
    mean_pp = pp_cluster_sizes.mean(axis=1)
    quantile_inf_pp = np.nanquantile(pp_cluster_sizes, q=0.001, axis=1)
    quantile_sup_pp = np.nanquantile(pp_cluster_sizes, q=0.999, axis=1)
    to_return = pd.concat(
        [
            empirical_mean_cluster_size.rename("realized"),
            mean_pp.rename("mean_pp"),
            pd.DataFrame(
                [quantile_inf_pp, quantile_sup_pp], index=["pp_lb", "pp_ub"], columns=x
            ).T,
        ],
        axis=1,
    )
    plt.bar(
        x=x,
        height=empirical_mean_cluster_size,
        label="Empirical",
        width=0.3,
        color="slategrey",
        alpha=0.6,
    )
    plt.plot(x, mean_pp, label="Poisson Process Simulated", color="salmon")
    plt.fill_between(
        x=x, y1=quantile_inf_pp, y2=quantile_sup_pp, alpha=0.2, color="salmon"
    )
    if compare_to_hawkes:
        if UEHP is not None:
            uv = UEHP()
            uv.fit(np.array(data_extremal_index["days_since_start"]))
            exceedances_hp = []
            for _ in range(1000):
                exceedances_hp.append(
                    pd.DataFrame(
                        uv.sample(T=data["days_since_start"].max()), columns=[_]
                    )
                )
        else:
            from pykelihood.samplers import HawkesByThinningModified

            mu = 1 / (len(data_extremal_index) / len(data))
            alpha = 0
            theta = 0

            def score(dist, data):
                # Penalize non-stationary fits (branching ratio alpha >= 1).
                if dist.rate.alpha >= 1.0:
                    return 10**10
                else:
                    return opposite_log_likelihood(dist, data)

            hawkes_fit = Exponential.fit(
                data_extremal_index["iat"],
                x0=(mu, alpha, theta),
                loc=0.0,
                rate=kernels.hawkes_with_exp_kernel(
                    np.array(data_extremal_index["days_since_start"])
                ),
                score=score,
            )
            mu, alpha, theta = hawkes_fit.optimisation_params
            exceedances_hp = []
            for _ in range(1000):
                exceedances_hp.append(
                    pd.DataFrame(
                        HawkesByThinningModified(
                            data["days_since_start"].max(), mu, alpha, theta
                        ),
                        columns=[_],
                    )
                )
        hp_cluster_sizes = []
        for i in range(len(exceedances_hp)):  # HP
            ex_hp = exceedances_hp[i].copy()
            local_count_hp = loop_mean_cluster_size(
                ex_hp, data["days_since_start"].max(), block_sizes
            )
            hp_cluster_sizes.append(local_count_hp)
        hp_cluster_sizes = pd.concat(hp_cluster_sizes, axis=1)
        mean_hp = hp_cluster_sizes.mean(axis=1)
        quantile_inf_hp = np.nanquantile(hp_cluster_sizes, q=0.001, axis=1)
        quantile_sup_hp = np.nanquantile(hp_cluster_sizes, q=0.999, axis=1)
        to_return = pd.concat(
            [
                to_return,
                mean_hp.rename("mean_hp"),
                pd.DataFrame(
                    [quantile_inf_hp, quantile_sup_hp],
                    columns=x,
                    index=["hp_lb", "hp_ub"],
                ).T,
            ],
            axis=1,
        )
        plt.plot(x, mean_hp, label="Hawkes Process Simulated", color="royalblue")
        plt.fill_between(
            x=x, y1=quantile_inf_hp, y2=quantile_sup_hp, alpha=0.3, color="royalblue"
        )
    plt.title("Mean Cluster Size")
    plt.xlabel(r"Block size $r$ (days)")
    plt.ylabel(r"Number of exceedances per block $\theta^{-1}_r(u)$")
    plt.legend(loc="upper left")
    plt.savefig(f"{path_to_figure}/{figure_name}.png")
    plt.clf()
    return to_return
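# Usage sketch (hypothetical DataFrame `df` with columns "data", "threshold"
# and "days_since_start"; block sizes in days):
#
#     summary = mean_cluster_size(df, block_sizes=np.arange(1, 31), path_to_figure=".")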
def extremogram_plot(
    data: pd.DataFrame,
    h_range: Union[List, np.ndarray],
    path_to_figure: str,
    compare_to_hawkes=False,
    figure_name="extremogram",
):
    """
    Plots the observed extremogram, i.e. the probability that the observation
    at time t+h is an exceedance given that the observation at time t was one.
    Compares the estimate obtained using a homogeneous Poisson process with
    that of a Hawkes process.

    :param data: pandas DataFrame containing the columns "data" for the
        variable of interest, "threshold" for the (possibly seasonal)
        threshold(s), and "days_since_start", a non-normalized version of the
        time in days.
    :param h_range: range over which the lag h varies.
    :param path_to_figure: path to save the extremogram.
    :return: plots diagnostic graphs.
    """

    def extremogram_loop(h_range, indices_exceedances):
        counts = []
        indices_exceedances = np.array(indices_exceedances)
        for h in h_range:
            # Pairwise differences (e_i + h) - e_j: pairs within half a day
            # of each other count as a joint exceedance at lag h.
            mat1 = np.tile(indices_exceedances, (len(indices_exceedances), 1))
            mat2 = np.tile(indices_exceedances + h, (len(indices_exceedances), 1))
            diff = mat2.T - mat1
            diff = diff[np.abs(diff) < 0.5]
            counts.append(len(diff) / len(indices_exceedances))
        return counts

    # Empirical
    data_extremogram = data[["data", "days_since_start", "threshold"]]
    data_extremogram = (
        data_extremogram[data_extremogram["data"] >= data["threshold"]]
        .assign(iat=lambda x: x["days_since_start"].diff())
        .fillna(0.0)
    )
    indices_exceedances = [
        int(round(e, 0)) for e in list(data_extremogram["days_since_start"])
    ]
    extremogram_realized = extremogram_loop(h_range, indices_exceedances)
    extremogram_realized = pd.Series(extremogram_realized, index=h_range)
    # Simulated Poisson process
    exp_fit = Exponential.fit(
        data_extremogram["iat"], loc=0.0, x0=[len(data_extremogram) / len(data)]
    )
    exceedances_pp = []
    for i in range(1000):
        exceedances_pp.append(exp_fit.rvs(len(data_extremogram)).cumsum())
    count_pp = []
    for i in range(len(exceedances_pp)):  # PP
        ex_pp = exceedances_pp[i].copy()
        indices_exceedances = [e for e in ex_pp]
        local_count_pp = extremogram_loop(h_range, indices_exceedances)
        local_count_pp = pd.Series(local_count_pp, index=h_range)
        count_pp.append(local_count_pp)
    count_pp = pd.concat(count_pp, axis=1)
    mean_pp = count_pp.mean(axis=1)
    quantile_inf_pp = np.quantile(count_pp, q=0.001, axis=1)
    quantile_sup_pp = np.quantile(count_pp, q=0.999, axis=1)
    to_return = pd.concat(
        [
            extremogram_realized.rename("realized"),
            mean_pp.rename("mean_pp"),
            pd.DataFrame(
                [quantile_inf_pp, quantile_sup_pp],
                index=["pp_lb", "pp_ub"],
                columns=h_range,
            ).T,
        ],
        axis=1,
    )
    plt.bar(
        x=h_range,
        height=extremogram_realized,
        label="Empirical",
        width=0.8,
        color="slategrey",
        alpha=0.6,
    )
    plt.plot(h_range, mean_pp, label="Poisson Process Simulated", color="salmon")
    plt.fill_between(
        x=h_range, y1=quantile_inf_pp, y2=quantile_sup_pp, alpha=0.2, color="salmon"
    )
    # Simulated Hawkes process
    if compare_to_hawkes:
        if UEHP is not None:
            uv = UEHP()
            uv.fit(np.array(data_extremogram["days_since_start"]))
            exceedances_hp = []
            for _ in range(1000):
                exceedances_hp.append(uv.sample(data["days_since_start"].max()))
        else:
            from pykelihood.samplers import HawkesByThinningModified

            mu = 1 / (len(data_extremogram) / len(data))
            alpha = 0
            theta = 0

            def score(dist, data):
                # Penalize non-stationary fits (branching ratio alpha >= 1).
                if dist.rate.alpha >= 1.0:
                    return 10**10
                else:
                    return opposite_log_likelihood(dist, data)

            hawkes_fit = Exponential.fit(
                data_extremogram["iat"],
                x0=(mu, alpha, theta),
                loc=0.0,
                rate=kernels.hawkes_with_exp_kernel(
                    np.array(data_extremogram["days_since_start"])
                ),
                score=score,
            )
            mu, alpha, theta = hawkes_fit.optimisation_params
            exceedances_hp = []
            for _ in range(1000):
                exceedances_hp.append(
                    HawkesByThinningModified(
                        data["days_since_start"].max(), mu, alpha, theta
                    )
                )
        count_hp = []
        for i in range(len(exceedances_hp)):  # HP
            ex_hp = exceedances_hp[i]
            indices_exceedances = [e for e in ex_hp]
            local_count_hp = extremogram_loop(h_range, indices_exceedances)
            local_count_hp = pd.Series(local_count_hp, index=h_range)
            count_hp.append(local_count_hp)
        count_hp = pd.concat(count_hp, axis=1)
        mean_hp = count_hp.mean(axis=1)
        quantile_inf_hp = np.nanquantile(count_hp, q=0.001, axis=1)
        quantile_sup_hp = np.nanquantile(count_hp, q=0.999, axis=1)
        to_return = pd.concat(
            [
                to_return,
                mean_hp.rename("mean_hp"),
                pd.DataFrame(
                    [quantile_inf_hp, quantile_sup_hp],
                    index=["hp_lb", "hp_ub"],
                    columns=h_range,
                ).T,
            ],
            axis=1,
        )
        plt.plot(h_range, mean_hp, label="Hawkes Process Simulated", color="royalblue")
        plt.fill_between(
            x=h_range,
            y1=quantile_inf_hp,
            y2=quantile_sup_hp,
            alpha=0.3,
            color="royalblue",
        )
    plt.title("Extremogram")
    plt.xlabel(r"$h$")
    plt.ylabel(r"$\pi_h(u)$")
    plt.legend()
    plt.savefig(f"{path_to_figure}/{figure_name}.png")
    plt.clf()
    return to_return
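# Usage sketch (hypothetical DataFrame `df` with columns "data", "threshold"
# and "days_since_start"; lags h in days):
#
#     pi_h = extremogram_plot(df, h_range=np.arange(1, 31), path_to_figure=".")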