def add_spikes_asymmetric(ts: torch.Tensor, xi: List[float] = [1 / 50.0, 1 / 25.0]):
    """
    Perturb 15% of a time series with one-sided Generalized Pareto spikes.

    Half of the selected positions (rounded down) receive negative spikes
    drawn with heaviness ``xi[0]``; the remaining positions receive positive
    spikes drawn with heaviness ``xi[1]``. The series is modified in place
    and the same object is returned.

    Arguments:
        ts: time series
        xi: [float, float], GenPareto heaviness parameter for [lower, upper]
            noise respectively
    """
    total = int(0.15 * ts.shape[0])
    n_lower = int(total / 2)
    n_upper = total - n_lower

    # Distinct positions; the first n_lower get downward spikes, the rest upward.
    chosen = np.random.choice(np.arange(len(ts)), replace=False, size=total)

    plan = (
        (-1, xi[0], n_lower, chosen[:n_lower]),
        (1, xi[1], n_upper, chosen[n_lower:]),
    )
    for sign, heaviness, count, positions in plan:
        ts[positions] += stats.genpareto(heaviness).rvs(count) * sign
    return ts
def _kstest(self, loc, scale, conc, samples): # Uses the Kolmogorov-Smirnov test for goodness of fit. ks, _ = sp_stats.kstest( samples, sp_stats.genpareto(conc, loc=loc, scale=scale).cdf) # Return True when the test passes. return ks < 0.02
def find_optimal_tail(self):
    """
    Fit every candidate tail and remember the one with the smallest AU2 statistic.

    For each tail yielded by ``self.generate_tails(self.data)`` the sorted CDF
    values and a frozen ``genpareto`` built from the fit parameters are stored
    in ``self.cdf_list`` and ``self.rv_list``. Afterwards the AU2,
    Cramér-von Mises and Anderson-Darling statistics are computed for every
    stored CDF, and the index and CDF of the tail with the minimal AU2
    statistic are saved in ``self.optimal_tail_index`` / ``self.optimal_tail``.

    Returns:
        None
    """
    # Start from empty result lists so repeated calls do not accumulate.
    self.cdf_list = []
    self.rv_list = []

    for position, candidate in enumerate(self.generate_tails(self.data)):
        # Progress indicator, rewritten in place via the carriage return.
        print("\t" + str(position) + "/" + str(self.data.size), end='\r', flush=True)
        tail_cdf, params = self.fit_tail(candidate)
        self.cdf_list.append(tail_cdf)
        # Keep a frozen distribution per tail for later sampling.
        shape, loc, scale = params[0], params[1], params[2]
        self.rv_list.append(genpareto(c=shape, loc=loc, scale=scale))

    # Goodness-of-fit statistics, one value per fitted tail.
    self.au_2_data = np.array(list(map(au2, self.cdf_list)))
    self.cramer_data = np.array(list(map(cramer_von_mises, self.cdf_list)))
    self.anderson_data = np.array(list(map(anderson_darling, self.cdf_list)))

    self.optimal_tail_index = self.au_2_data.argmin()
    self.optimal_tail = self.cdf_list[self.au_2_data.argmin()]
def testCDF(self, dist):
    """Check ``dist.cdf`` against scipy's genpareto CDF at a sampled point."""
    samples = self.evaluate(dist.sample())
    cdf_under_test = dist.cdf(samples)
    self.assertEqual(dist.batch_shape, cdf_under_test.shape)

    params = self.evaluate([dist.loc, dist.scale, dist.concentration])
    loc, scale, conc = params
    reference = sp_stats.genpareto(conc, loc=loc, scale=scale)
    expected_cdf = reference.cdf(samples)
    self.assertAllClose(expected_cdf, self.evaluate(cdf_under_test), rtol=5e-5)
def testMean(self, dist):
    """Check ``dist.mean()`` against scipy's genpareto mean.

    The comparison is skipped for very small nonzero concentrations.
    """
    params = self.evaluate([dist.loc, dist.scale, dist.concentration])
    loc, scale, conc = params
    self.assertEqual(dist.batch_shape, dist.mean().shape)

    tiny_nonzero_conc = np.abs(conc) < 1e-5 and conc != 0
    if tiny_nonzero_conc:
        # scipy does badly at small nonzero concentrations.
        return

    reference = sp_stats.genpareto(conc, loc=loc, scale=scale)
    expected = reference.mean()
    actual = self.evaluate(dist.mean())
    self.assertAllClose(expected, actual, rtol=5e-4)
def extremeDistribution_peaksOverThreshold(x, x_e, t_x, t_st, u):
    '''Approximate the short-term extreme distribution with the peaks over
    threshold method.

    Parameters
    ----------
    x : np.array
        Independent random variable (global peaks)
    x_e : np.array
        Array of x values at which to evaluate the short-term extreme CDF;
        only the values ``>= u`` are used
    t_x : float
        Time length of the x array
    t_st : float
        Short-term period
    u : float
        Threshold below which peaks (x) are ignored

    Returns
    -------
    stextreme_dist : ecmDist object
        Probability distribution of the short-term extreme.
    peaks_dist : ecmDist object
        Probability distribution of the peaks.
    peaksOverThreshold_dist : scipy.stats rv_frozen
        Probability distribution of the peaks over threshold.
    pot_params : np.array length 3
        Parameters of peak over threshold's distribution using
        Generalized Pareto [shape_c, loc, scale].
    '''
    # Exceedances: peaks strictly above the threshold, measured from it.
    ordered_peaks = np.sort(x)
    exceedances = ordered_peaks[ordered_peaks > u] - u
    n_peaks = len(x)
    n_exceedances = len(exceedances)

    # Generalized Pareto fit with the location pinned at zero.
    pot_params = stats.genpareto.fit(exceedances, floc=0.)
    shape_c, loc, scale = pot_params[0], pot_params[1], pot_params[2]
    peaksOverThreshold_dist = stats.genpareto(c=shape_c, loc=loc, scale=scale)

    # Distribution of all peaks, evaluated only at or above the threshold.
    eval_points = x_e[x_e >= u]
    survival_over_threshold = 1. - peaksOverThreshold_dist.cdf(eval_points - u)
    exceedance_fraction = float(n_exceedances) / float(n_peaks)
    peaks_cdf = 1. - (exceedance_fraction * survival_over_threshold)
    peaks_dist = ecmDist(eval_points, cdf=peaks_cdf)

    # Short-term extreme: peak CDF raised to the expected number of peaks
    # in the short-term period.
    period_ratio = t_st / t_x
    n_short_term = float(n_peaks) * period_ratio
    stextreme_dist = ecmDist(eval_points, cdf=peaks_cdf ** n_short_term)

    return stextreme_dist, peaks_dist, peaksOverThreshold_dist, pot_params
def extremeDistribution_peaksOverThreshold(x, x_e, t_x, t_st, u):
    '''Approximate the short-term extreme distribution with the peaks over
    threshold method.

    Parameters
    ----------
    x : np.array
        Independent random variable (global peaks)
    x_e : np.array
        Array of x values at which to evaluate the short-term extreme CDF;
        only the values ``>= u`` are used
    t_x : float
        Time length of the x array
    t_st : float
        Short-term period
    u : float
        Threshold below which peaks (x) are ignored

    Returns
    -------
    stextreme_dist : ecmDist object
        Probability distribution of the short-term extreme.
    peaks_dist : ecmDist object
        Probability distribution of the peaks.
    peaksOverThreshold_dist : scipy.stats rv_frozen
        Probability distribution of the peaks over threshold.
    pot_params : np.array length 3
        Parameters of peak over threshold's distribution
        (Generalized Pareto) [shape_c, loc, scale].
    '''
    # Peaks strictly above the threshold, expressed as excess over it.
    peaks_sorted = np.sort(x)
    excess = peaks_sorted[peaks_sorted > u] - u
    total_count = len(x)
    excess_count = len(excess)

    # Fit a Generalized Pareto to the excesses (location fixed at 0).
    pot_params = stats.genpareto.fit(excess, floc=0.)
    peaksOverThreshold_dist = stats.genpareto(
        c=pot_params[0], loc=pot_params[1], scale=pot_params[2])

    # CDF of the peaks on the evaluation grid at or above the threshold.
    grid = x_e[x_e >= u]
    tail_survival = 1. - peaksOverThreshold_dist.cdf(grid - u)
    fraction_over = float(excess_count) / float(total_count)
    peaks_cdf = 1. - (fraction_over * tail_survival)
    peaks_dist = ecmDist(grid, cdf=peaks_cdf)

    # Short-term extreme CDF: peaks CDF to the power of the number of peaks
    # expected within the short-term period.
    peaks_in_short_term = float(total_count) * (t_st / t_x)
    ste_cdf = peaks_cdf ** peaks_in_short_term
    stextreme_dist = ecmDist(grid, cdf=ste_cdf)

    return stextreme_dist, peaks_dist, peaksOverThreshold_dist, pot_params
def testMean(self, dist):
    """Check ``dist.mean()`` against scipy's genpareto mean in float64."""
    params = self.evaluate([dist.loc, dist.scale, dist.concentration])
    loc, scale, conc = params
    note_template = 'Location: {}, scale: {}, concentration: {}'
    hp.note(note_template.format(loc, scale, conc))
    self.assertEqual(dist.batch_shape, dist.mean().shape)

    # scipy doesn't seem to be very accurate for small concentrations, so
    # the reference distribution is built from float64 parameters.
    reference = sp_stats.genpareto(
        np.float64(conc), loc=np.float64(loc), scale=np.float64(scale))
    expected = reference.mean()
    actual = self.evaluate(dist.mean())
    self.assertAllClose(expected, actual, rtol=5e-4)
def testVariance(self, dist):
    """Check ``dist.variance()`` against scipy's genpareto variance.

    The comparison is skipped for very small nonzero concentrations and when
    scipy reports a non-positive variance.
    """
    params = self.evaluate([dist.loc, dist.scale, dist.concentration])
    loc, scale, conc = params
    self.assertEqual(dist.batch_shape, dist.variance().shape)

    reference = sp_stats.genpareto(conc, loc=loc, scale=scale)
    expected = reference.var()

    tiny_nonzero_conc = np.abs(conc) < 1e-4 and conc != 0
    if tiny_nonzero_conc:
        # scipy does badly at small nonzero concentrations.
        return
    if expected <= 0:
        # scipy sometimes returns nonsense zero or negative variances.
        return

    actual = self.evaluate(dist.variance())
    # Diagnostic line written to stderr before the comparison.
    print('var', loc, scale, conc, expected, actual, file=sys.stderr)
    self.assertAllClose(expected, actual, rtol=.01)
def testCDF(self, dist):
    """Check ``dist.cdf`` against scipy's genpareto CDF, noting both values."""
    samples = self.evaluate(dist.sample())
    cdf_under_test = dist.cdf(samples)
    self.assertEqual(dist.batch_shape, cdf_under_test.shape)

    params = self.evaluate([dist.loc, dist.scale, dist.concentration])
    loc, scale, conc = params
    reference = sp_stats.genpareto(conc, loc=loc, scale=scale)
    expected_cdf = reference.cdf(samples)
    actual_cdf = self.evaluate(cdf_under_test)

    # Record the inputs and both results for the failure report.
    msg = ('Location: {}, scale: {}, concentration: {}, xs: {} '
           'scipy cdf: {}, tfp cdf: {}')
    details = msg.format(loc, scale, conc, samples, expected_cdf, actual_cdf)
    hp.note(details)
    self.assertAllClose(expected_cdf, actual_cdf, rtol=5e-5)
def testLogPDF(self, dist):
    """Check ``dist.log_prob`` and ``dist.prob`` against scipy's genpareto logpdf."""
    samples = self.evaluate(dist.sample())

    log_density = dist.log_prob(samples)
    self.assertEqual(dist.batch_shape, log_density.shape)
    density = dist.prob(samples)
    self.assertEqual(dist.batch_shape, density.shape)

    params = self.evaluate([dist.loc, dist.scale, dist.concentration])
    loc, scale, conc = params
    reference = sp_stats.genpareto(conc, loc=loc, scale=scale)
    expected_logp = reference.logpdf(samples)
    actual_logp = self.evaluate(log_density)

    self.assertAllClose(expected_logp, actual_logp, rtol=1e-5)
    # The density must agree with the exponentiated reference log-density.
    self.assertAllClose(np.exp(expected_logp), self.evaluate(density), rtol=1e-5)
def _p(test_i, null_i, M_i, d_i):
    """Estimate a p-value for ``test_i`` from the null sample ``null_i``.

    An ECDF-based estimate is always computed; when ``M_i < m`` a Generalized
    Pareto distribution is additionally fitted to the largest ``n_i`` null
    values and, if the fit is accepted by the check below, a tail-based
    estimate is returned instead.

    Relies on names that are not defined in this block and come from the
    enclosing scope: ``n``, ``m``, ``N``, ``ecdf_pseudocount``,
    ``decrease_n_by``, ``ad_test``, ``warn``, ``genpareto`` and ``np``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original layout.

    Returns:
        4-tuple ``(p_value, used_gpd, gpd_fit_p_value, raw_ecdf_estimate)``.
        ``used_gpd`` is ``True`` only for the GPD-based estimate; in the ECDF
        fallback the third element is ``nan``. When every value of ``null_i``
        is NaN the result is ``(nan, False, nan, nan)``.
    """
    gpd_fit = None
    gpd_fit_p_value = None
    n_i = n
    # TODO: no need to sort as much as N numbers, do partial sort:
    # but this requires some tests (both performance and unit)
    # null_i_partitioned = np.partition(null_i, n_i+1)
    # null_i_first_n_sorted = sorted(null_i_partitioned[:-n_i+1])
    null_i = sorted(null_i)
    t = None
    if all(np.isnan(null_i)):
        return np.nan, False, np.nan, np.nan
    # compute ECDF-based, biased estimate of the p-value
    # presumably d_i flags null values at least as extreme as test_i -- verify against caller
    raw_ecdf_estimate = (ecdf_pseudocount + d_i.sum()) / (N + 1)
    if M_i < m:
        # fit GPD, reducing $n$ until convergence
        while n_i > 0:
            # -1 because Python has 0-based indexing
            # Threshold: midpoint of the two sorted values just below the
            # n_i largest ones. NOTE(review): needs len(null_i) >= n_i + 2.
            t = (null_i[-n_i-1] + null_i[-n_i-2]) / 2
            y_untill_n = null_i[-n_i:]
            # null_i is a plain list after sorted(); this subtraction assumes
            # t is a NumPy scalar so the result becomes an array -- TODO confirm
            exceedences = y_untill_n - t
            assert all(y_untill_n >= t)
            assert len(exceedences) == n_i
            fit = genpareto.fit(exceedences)
            fitted = genpareto(*fit)
            gpd_fit = fitted
            gpd_fit_p_value = ad_test(exceedences, fitted).pvalue
            # NOTE(review): the loop stops as soon as the reported p-value is
            # <= 0.05, and the GPD estimate below is used only when it is
            # < 0.05. Whether a small value means a good fit depends on
            # ad_test's definition -- confirm.
            if gpd_fit_p_value <= 0.05:
                break
            else:
                n_i -= decrease_n_by
    if gpd_fit and gpd_fit_p_value < 0.05:
        # Tail estimate: fraction of values in the tail times the fitted
        # survival probability of the excess of test_i over the threshold.
        return n_i / N * (1 - gpd_fit.cdf(test_i - t)), True, gpd_fit_p_value, raw_ecdf_estimate
    else:
        if gpd_fit:
            # TODO: get index and highlight which observation could not be fitted!
            warn(f'A good GPD fit could not be reached, using ECDF estimate instead')
        return raw_ecdf_estimate, False, np.nan, raw_ecdf_estimate
def testVariance(self, dist):
    """Check ``dist.variance()`` against scipy's genpareto variance in float64.

    The comparison is skipped when scipy reports a non-positive variance.
    """
    params = self.evaluate([dist.loc, dist.scale, dist.concentration])
    loc, scale, conc = params
    self.assertEqual(dist.batch_shape, dist.variance().shape)

    # scipy doesn't seem to be very accurate for small concentrations, so
    # the reference distribution is built from float64 parameters.
    reference = sp_stats.genpareto(
        np.float64(conc), loc=np.float64(loc), scale=np.float64(scale))
    expected = reference.var()
    if expected <= 0:
        # scipy sometimes returns nonsense zero or negative variances.
        return

    actual = self.evaluate(dist.variance())
    msg = ('Location: {}, scale: {}, concentration: {}, '
           'scipy variance: {}, tfp variance: {}')
    details = msg.format(loc, scale, conc, expected, actual)
    hp.note(details)
    self.assertAllClose(expected, actual)
def square_error_genpareto(shape):
    """Return weighted squared errors of a standard GPD with the given shape.

    Compares the distribution's mean and its quantiles at
    ``percentile_lower`` / ``percentile_upper`` with the targets ``mean``,
    ``lejp`` and ``uejp``. The targets, percentiles and weights are read
    from the enclosing scope.

    Returns:
        list of three values: weighted squared error of the mean, of the
        lower quantile and of the upper quantile, in that order.
    """
    from scipy.stats import genpareto

    candidate = genpareto(c=shape)
    model_mean = candidate.mean()
    model_lower = candidate.ppf(percentile_lower)
    model_upper = candidate.ppf(percentile_upper)

    mean_term = np.power(mean - model_mean, 2.0) * mean_error_weight
    lower_term = np.power(lejp - model_lower, 2.0) * lejp_error_weight
    upper_term = np.power(uejp - model_upper, 2.0) * uejp_error_weight
    return [mean_term, lower_term, upper_term]
def test_ppf(self):
    """Check ``GenParetoDist.ppf`` against scipy's genpareto quantiles."""
    xi, mu, sig = 0.1, 0.05, 0.5
    model = GenParetoDist(xi, mu, sig)
    reference = genpareto(c=xi, loc=mu, scale=sig)

    # Probabilities strictly inside (0, 1).
    probabilities = np.array(np.linspace(0.01, 0.99), dtype=float)
    ours = model.ppf(probabilities)
    theirs = reference.ppf(probabilities)

    for position in range(len(probabilities)):
        self.assertAlmostEqual(ours[position], theirs[position], delta=1e-10)
def testVariance(self, dist):
    """Check ``dist.variance()`` against scipy's genpareto variance in float64.

    Examples where scipy's reference value is unusable are discarded through
    ``hp.assume``.
    """
    params = self.evaluate([dist.loc, dist.scale, dist.concentration])
    loc, scale, conc = params

    # scipy doesn't seem to be very accurate for small concentrations, so
    # the reference distribution is built from float64 parameters.
    reference = sp_stats.genpareto(
        np.float64(conc), loc=np.float64(loc), scale=np.float64(scale))
    expected = reference.var()

    # scipy sometimes returns nonsense zero or negative variances.
    hp.assume(expected > 0)
    # scipy gets bad answers for very small concentrations even in 64-bit.
    # https://github.com/scipy/scipy/issues/11168
    hp.assume(conc > 1e-5)

    self.assertEqual(dist.batch_shape, dist.variance().shape)
    actual = self.evaluate(dist.variance())
    msg = ('Location: {}, scale: {}, concentration: {}, '
           'scipy variance: {}, tfp variance: {}')
    details = msg.format(loc, scale, conc, expected, actual)
    hp.note(details)
    self.assertAllClose(expected, actual)
def add_spikes(ts: torch.Tensor, only_upper_spikes: bool = False):
    """
    Perturb 15% of a time series with heavy-tailed (Generalized Pareto) spikes.

    Spike magnitudes are drawn from ``genpareto(1 / 50)`` and each gets a
    random sign, unless ``only_upper_spikes`` forces all signs to be
    positive. The series is modified in place and the same object is
    returned.

    Arguments:
        ts: time series
        only_upper_spikes: boolean to indicate upper-tailed or two-tailed spikes
    """
    n_spikes = int(0.15 * ts.shape[0])
    positions = np.random.choice(np.arange(len(ts)), replace=False, size=n_spikes)

    # Signs are always drawn, then overridden for the upper-tailed case.
    signs = np.random.choice([-1, 1], replace=True, size=n_spikes)
    if only_upper_spikes:
        signs = np.ones_like(signs)

    magnitudes = stats.genpareto(1 / 50).rvs(n_spikes)
    ts[positions] += magnitudes * signs
    return ts
def fit_tail(tail):
    """
    Fit a Generalized Pareto distribution to a tail and evaluate its CDF there.

    Args:
        tail (numpy.ndarray): tail to fit

    Returns:
        numpy.ndarray, tuple: CDF of the data under the fitted distribution,
        sorted in ascending order, and the fit parameters (c, loc, scale).
    """
    # The location is pinned to zero because the data is expected to be
    # transformed already, so the Pareto distribution starts at 0. Check
    # generate_tails for further information.
    fit_out = genpareto.fit(tail, floc=0)
    shape, loc, scale = fit_out[0], fit_out[1], fit_out[2]

    fitted = genpareto(c=shape, loc=loc, scale=scale)

    # CDF values of the tail under the fitted model, in ascending order.
    cdf_of_tail = np.sort(fitted.cdf(tail))
    return cdf_of_tail, fit_out
def montecarlo_simulation(self, mc_steps=None):
    """
    Run a Monte Carlo simulation for the optimal tail.

    Samples are drawn from the frozen distribution stored for the optimal
    tail, a Generalized Pareto is refitted to every sample, and it is
    counted how often each goodness-of-fit statistic of the refit exceeds
    the value observed for the optimal tail.

    Args:
        mc_steps: number of Monte Carlo steps to run; defaults to
            ``self.mc_steps`` when ``None``.

    Returns:
        int: number of steps whose AU2 statistic exceeded the observed one
        int: number of steps whose Anderson-Darling statistic exceeded the observed one
        int: number of steps whose Cramér-von Mises statistic exceeded the observed one
        int: number of Monte Carlo steps

    Raises:
        RuntimeError: if the fit for the optimal tail has not been run before.
    """
    fits_missing = (
        self.optimal_tail_index is None
        or self.rv_list is None
        or self.cdf_list is None
    )
    if fits_missing:
        raise RuntimeError("Fits have to run before the Monte Carlo simulation")

    mc_steps = self.mc_steps if mc_steps is None else mc_steps

    exceed_au2 = 0
    exceed_a2 = 0
    exceed_w2 = 0

    # Make sure every thread has a different seed. np.random.seed() reseeds
    # the global generator and returns None, so the RandomState below is
    # constructed with seed None.
    fresh_seed = np.random.seed()
    rng = np.random.RandomState(fresh_seed)
    simulated = self.rv_list[self.optimal_tail_index].rvs(
        size=(mc_steps, self.optimal_tail.size), random_state=rng)

    for step, sample in enumerate(simulated):
        # Progress indicator, rewritten in place via the carriage return.
        print("\t" + str(step) + "/" + str(mc_steps), end='\r', flush=True)

        # Refit on the sample sorted in descending order, location fixed at 0.
        descending = np.sort(sample)[::-1]
        refit = genpareto.fit(descending, floc=0)
        refitted = genpareto(c=refit[0], loc=refit[1], scale=refit[2])
        sample_cdf = np.sort(refitted.cdf(sample))

        if au2(sample_cdf) > self.au_2_data[self.optimal_tail_index]:
            exceed_au2 += 1
        if anderson_darling(sample_cdf) > self.anderson_data[self.optimal_tail_index]:
            exceed_a2 += 1
        if cramer_von_mises(sample_cdf) > self.cramer_data[self.optimal_tail_index]:
            exceed_w2 += 1

    return exceed_au2, exceed_a2, exceed_w2, mc_steps
def inter_arrival_dist(num_samples):
    """Draw ``num_samples`` values from a mixed discrete/continuous distribution.

    Every sample is drawn independently. A uniform number selects one of the
    discrete values ``0..13`` with the probabilities listed in ``probs``; if
    the number exceeds the total discrete mass, the sample is drawn from a
    Generalized Pareto distribution with ``loc=15``.

    Args:
        num_samples: number of samples to draw.

    Returns:
        list: ``int`` entries for discrete outcomes and floating-point
        entries for Generalized Pareto draws, ``num_samples`` in total.
    """
    dist = genpareto(0.154971, loc=15, scale=16.0292)
    probs = [0.00536, 0.00047, 0.17820, 0.09239, 0.00018, 0.02740, 0.00065,
             0.00606, 0.00023, 0.00837, 0.08989, 0.00092, 0.00326, 0.01980]

    # Cumulative thresholds, accumulated left to right exactly as
    # sum(probs[0:k]) would be for k = 1..len(probs).
    # NOTE(review): only 14 probabilities are listed, so the value 14 is
    # never produced -- confirm whether a 15th probability is missing.
    thresholds = []
    running = 0
    for prob in probs:
        running += prob
        thresholds.append(running)

    output = []
    for _ in range(num_samples):
        # Fresh uniform draw per sample: drawing it once outside the loop
        # made every sample fall into the same bucket.
        number = random.random()
        for value, bound in enumerate(thresholds):
            if number < bound:
                output.append(value)
                break
        else:
            # Beyond the discrete mass: sample the continuous tail.
            output.append(dist.rvs())
    return output
# 2x2 grid of P-P plots, one panel per fitted distribution family.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, dpi=1300)

# (axis, distribution family, name of its shape keyword, fitted parameters, title)
# Each parameter sequence is read as (shape, loc, scale).
panels = (
    (ax1, stats.gennorm, 'beta', parametros_normal, 'Normal generalizada'),
    (ax2, stats.genpareto, 'c', parametros_pareto, 'Pareto generalizada'),
    (ax3, stats.dweibull, 'c', parametros_weibull, 'Weibull doble'),
    (ax4, stats.gamma, 'a', parametros_gamma, 'Gamma'),
)
for axis, family, shape_keyword, params, title in panels:
    frozen = family(**{shape_keyword: params[0]}, loc=params[1], scale=params[2])
    pp_plot(logeados, frozen, line=True, ax=axis)
    axis.set_title(title, fontsize=11)
def _run_gpd_p(x, x0=0, side="upper", nx=260, fit_alpha=0.05, plot=False):
    """Fit tail with generalized Pareto distribution to get p-value of x0.

    Based on Knijnenburg et al, 2009
    (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2687965/)
    Here we use a Kolmogorov-Smirnov test for equality of distributions
    instead of Anderson-Darling as used in their paper.

    When at least 10 values of ``x`` are as extreme as ``x0``, or when no
    acceptable GPD fit is found, the empirical estimate
    ``(epc + 1) / (n + 1)`` is returned instead of a fitted p-value.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source -- confirm against the original layout.

    Parameters
    ==========
    x: array
        Array of values containing the bootstrap or permutation distribution
    x0: float
        The value the distribution is being tested against
    side: string
        Specify the tail of the distribution to be tested;
        must be one of ["upper", "lower"]
    nx: int
        Starting value for the number of exceedances to begin counting down
        from while attempting to fit the GPD (capped at ``len(x)`` and
        reduced by 10 before every fit attempt)
    fit_alpha: float
        Alpha used to reject the null hypothesis that the tail of the data
        comes from the fitted GPD.
    plot: bool
        If true, ``plot_tail(tail, fitted_gpd.cdf)`` is called after an
        accepted fit.

    Returns
    =======
    p: float
        fitted p-value, or the empirical estimate in the fallback cases

    Raises
    ======
    ValueError
        If ``side`` is neither "upper" nor "lower".
    Exception
        If the fitted p-value is 0 while ``k <= 0``.
    """
    x = np.sort(x)
    fit_p = 0
    n = len(x)
    if nx > len(x):
        nx = len(x)
    # epc: number of values at least as extreme as x0 on the requested side.
    if side == "upper":
        epc = np.count_nonzero(x >= x0)
    elif side == "lower":
        epc = np.count_nonzero(x <= x0)
    else:
        raise ValueError(f'side must be one of ["upper", "lower"], you provided {side}')
    if epc >= 10:
        # Enough extreme values for the empirical estimate; no fit needed.
        # TODO: binomial estimate of this
        return (epc + 1) / (n + 1)
    # Shrink the tail by 10 values per attempt until the KS test no longer
    # rejects the fit or 10 or fewer values would remain.
    while (fit_p < fit_alpha) & (nx > 10):
        nx -= 10
        if side == "upper":
            # Threshold: midpoint between the smallest tail value and the
            # value just below it.
            t = np.mean([x[-1 * nx], x[-1 * nx - 1]])
            tail = x[-1 * nx :] - t
        else:
            t = np.mean([x[nx], x[nx + 1]])
            tail = np.sort((x[:nx]) - t)
        fit_params = stats.genpareto.fit(tail)
        fitted_gpd = stats.genpareto(*fit_params)
        # NOTE(review): args holds the values returned by genpareto.fit;
        # scipy documents that order as (shape, loc, scale), so index 2 is
        # presumably the scale rather than the shape "K" the comments below
        # refer to -- confirm.
        k = fitted_gpd.args[2]
        fit_stat, fit_p = stats.kstest(tail, fitted_gpd.cdf)
    if fit_p < fit_alpha:
        print(
            "Could not fit GPD to tail of distribution, returning empirical cdf based p.",
            flush=True,
        )
        return (epc + 1) / (n + 1)
        # raise Exception("Could not fit GPD to tail of distribution")
    if plot:
        plot_tail(tail, fitted_gpd.cdf)
    if side == "upper":
        # Tail fraction times the fitted probability of exceeding x0.
        p = nx / n * (1 - fitted_gpd.cdf(x0 - t))
        # If p == 0 and K > 0 then we're in a domain where
        # GPD is finite and unsuitable for extrapolation
        # In these cases, return the pvalue for the extreme of x,
        # which will be conservative
        if (p == 0) & (k > 0):
            p = nx / n * (1 - fitted_gpd.cdf(x.max() - t))
            if p == 0:
                return (epc + 1) / (n + 1)
                # raise Exception("p = 0")
        elif (p == 0) & (k <= 0):
            raise Exception("p=0 and k is not > 0")
    else:
        # Lower side: same logic using the CDF and the minimum of x.
        p = nx / n * (fitted_gpd.cdf(x0 - t))
        if (p == 0) & (k > 0):
            p = nx / n * (fitted_gpd.cdf(x.min() - t))
            if p == 0:
                return (epc + 1) / (n + 1)
                # raise Exception("p = 0")
        elif (p == 0) & (k <= 0):
            raise Exception("p=0 and k is not > 0")
    # return nx, t, fitted_gpd, p
    return p
# The results for the following look strange, maybe a refactoring error.
he, h = hess_ndt(logmps, parsgpd, argsgpd, options)
hessian_eigenvalues = np.linalg.eigh(he)[0]
print(hessian_eigenvalues)


def f(params):
    """Evaluate ``logmps`` at ``params`` with the fixed extra arguments ``argsgpd``."""
    return logmps(params, *argsgpd)


print(f(parsgpd))

# Binned estimators on a 50-bin histogram of the sample.
fp2, bp2 = np.histogram(p2rvs, bins=50)
'fitbinned t-distribution'
gpdparest_mlebinel = fitbinned(stats.genpareto, fp2, bp2, x0p)
gpdparest_gmmbinelidentity = fitbinnedgmm(stats.genpareto, fp2, bp2, x0p)
print('gpdparest_mlebinel', gpdparest_mlebinel)
print('gpdparest_gmmbinelidentity', gpdparest_gmmbinelidentity)

# Quantile-matching GMM with the estimator's default quantiles.
gpdparest_gmmquantile2 = fitquantilesgmm(
    stats.genpareto, p2rvs, start=x0p, pquant=None, frozen=None)
print('gpdparest_gmmquantile2', gpdparest_gmmquantile2)

# Same estimator with ten explicitly chosen quantiles.
chosen_quantiles = np.linspace(0.01, 0.99, 10)
quantile_estimate = fitquantilesgmm(
    stats.genpareto, p2rvs, start=x0p, pquant=chosen_quantiles, frozen=None)
print(quantile_estimate)

# Re-bin using edges taken from the quantiles of a GPD with shape 2.
bin_edges = stats.genpareto(2).ppf(np.linspace(0, 0.99, 10))
fp2, bp2 = np.histogram(p2rvs, bins=bin_edges)
print('fitbinnedgmm equal weight bins')
equal_weight_estimate = fitbinnedgmm(stats.genpareto, fp2, bp2, x0p)
print(equal_weight_estimate)
self.block5 = nn.ConvTranspose2d(64, out_channels, 4, 2, 1) def forward(self, latent, continuous_code): inp = torch.cat((latent, continuous_code), 1) out = self.block1(inp) out = self.block2(out) out = self.block3(out) out = self.block4(out) return torch.tanh(self.block5(out)) latentdim = 20 G = Generator(in_channels=latentdim, out_channels=1).cuda() genpareto_params = (1.33, 0, 0.0075761900937239765) threshold = -0.946046018600464 rv = genpareto(*genpareto_params) G.load_state_dict(torch.load('ExGAN/G999.pt')) G.eval() c = 0.75 k = 10 for tau in [0.05, 0.01]: tau_prime = tau / (c**k) val = rv.ppf(1 - tau_prime) + threshold t = time.time() code = Variable(torch.ones(100, 1, 1, 1) * val).cuda(2) latent = Variable(FloatTensor(torch.randn((100, latentdim, 1, 1)))).cuda(2) images = G(latent, code) print(time.time() - t) torch.save(0.5 * (images + 1), 'ExGAN' + str(tau) + '.pt')
def loglikelihood(par):
    """Return the negated mean GPD log-density of ``x`` for the given parameters.

    ``par[0]`` is the logarithm of the scale and ``par[1]`` the shape; the
    location is fixed at 0. ``x`` is read from the enclosing scope.
    """
    log_scale, shape = par[0], par[1]
    scale = np.exp(log_scale)
    candidate = genpareto(loc=0, scale=scale, c=shape)
    log_densities = candidate.logpdf(x)
    return -np.mean(log_densities)
def fit_marginal_models(self, n=100, c=0, policy="veto", qq_plots=False, seed=1):
    """Fit Generalised Pareto models to the negative portion of post-interconnection power margin distributions and return the fitted parameters

    **Parameters**:

    `n` (`int`): Number of simulated samples with which to fit the models

    `c` (`int`): Assumed interconnection capacity

    `policy` (`string`): Either 'veto' or 'share'

    `qq_plots` (`bool`): if `True`, outputs QQ-plots of fitted models

    `seed` (`int`): random seed (accepted for interface compatibility; this method does not use it)

    **Returns**:

    `dict` with the keys `a1_scale`, `a1_shape`, `a2_scale` and `a2_shape`
    """
    # np.inf is used instead of the np.Inf alias, which NumPy 2.0 removed.
    a1_shortfalls = self.hindcast.simulate_region(n=n, m=(-1, np.inf), c=c, policy=policy)
    a2_shortfalls = self.hindcast.simulate_region(n=n, m=(np.inf, -1), c=c, policy=policy)

    # Keep each area's own column and flip the sign.
    a1_shortfalls = -a1_shortfalls[:, 0]
    a2_shortfalls = -a2_shortfalls[:, 1]

    pars0 = self._cdf_univar_gp_fitter(a1_shortfalls, upper_endpoint_lb=-self.hindcast_limits[0])
    pars1 = self._cdf_univar_gp_fitter(a2_shortfalls, upper_endpoint_lb=-self.hindcast_limits[1])

    # The fitter's solution vector: x[0] is exponentiated to give the scale,
    # x[1] is used directly as the shape.
    a1_scale_est = np.exp(pars0.x[0])
    a1_shape_est = pars0.x[1]
    a2_scale_est = np.exp(pars1.x[0])
    a2_shape_est = pars1.x[1]
    # NOTE: standard errors derived from pars.hess_inv are not computed here.

    if qq_plots:
        q_grid = np.linspace(0.01, 0.99, 99)
        fig = plt.figure(figsize=(5, 5))
        panels = (
            (211, 'Area 1', a1_shortfalls, a1_scale_est, a1_shape_est),
            (212, 'Area 2', a2_shortfalls, a2_scale_est, a2_shape_est),
        )
        for position, title, shortfalls, scale_est, shape_est in panels:
            empirical = np.quantile(shortfalls, q=q_grid)
            fitted = genpareto(loc=0, scale=scale_est, c=shape_est).ppf(q=q_grid)
            ax = fig.add_subplot(position)
            ax.scatter(empirical, fitted, color='#004C99')
            # Identity line: points on it mean the fitted quantiles match.
            ax.plot([0, max(empirical)], [0, max(empirical)], '--', color='#FF8000')
            ax.set_xlabel('Empirical quantiles')
            ax.set_ylabel('Fitted quantiles')
            ax.set_title(title)
        plt.tight_layout()
        plt.show()

    res = {
        "a1_scale": a1_scale_est,
        "a1_shape": a1_shape_est,
        "a2_scale": a2_scale_est,
        "a2_shape": a2_shape_est}

    return res
## P-P plot of the full distribution
# 2x2 grid, one panel per fitted distribution family.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, dpi=1300)

# (axis, distribution family, name of its shape keyword, fitted parameters, title)
# Each parameter sequence is read as (shape, loc, scale).
panels = (
    (ax1, stats.gennorm, 'beta', parametros_normal, 'Normal generalizada'),
    (ax2, stats.genpareto, 'c', parametros_pareto, 'Pareto generalizada'),
    (ax3, stats.dweibull, 'c', parametros_weibull, 'Weibull doble'),
    (ax4, stats.gamma, 'a', parametros_gamma, 'Gamma'),
)
for axis, family, shape_keyword, params, title in panels:
    frozen = family(**{shape_keyword: params[0]}, loc=params[1], scale=params[2])
    pp_plot(logeados, frozen, line=True, ax=axis)
    axis.set_title(title, fontsize=11)
def all_dists():
    """Return a dict mapping distribution names to frozen scipy.stats distributions.

    The parameters were taken from the scipy.stats official documentation
    examples. The dict has 87 entries (89 before ``frechet_r`` and
    ``frechet_l`` were disabled). All entries use ``loc=0.0, scale=1.0``.
    """
    # scipy renamed ``gilbrat`` to ``gibrat`` in 1.9 and removed the old name
    # in 1.11; use whichever exists so the function works on both. The dict
    # key keeps the historical spelling for existing callers.
    gibrat = getattr(stats, "gibrat", None)
    if gibrat is None:
        gibrat = stats.gilbrat

    return {
        "alpha": stats.alpha(a=3.57, loc=0.0, scale=1.0),
        "anglit": stats.anglit(loc=0.0, scale=1.0),
        "arcsine": stats.arcsine(loc=0.0, scale=1.0),
        "beta": stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        "betaprime": stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        "bradford": stats.bradford(c=0.299, loc=0.0, scale=1.0),
        "burr": stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        "cauchy": stats.cauchy(loc=0.0, scale=1.0),
        "chi": stats.chi(df=78, loc=0.0, scale=1.0),
        "chi2": stats.chi2(df=55, loc=0.0, scale=1.0),
        "cosine": stats.cosine(loc=0.0, scale=1.0),
        "dgamma": stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        "dweibull": stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        "erlang": stats.erlang(a=2, loc=0.0, scale=1.0),
        "expon": stats.expon(loc=0.0, scale=1.0),
        "exponnorm": stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        "exponweib": stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        "exponpow": stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        "f": stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        "fatiguelife": stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        "fisk": stats.fisk(c=3.09, loc=0.0, scale=1.0),
        "foldcauchy": stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        "foldnorm": stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        # Disabled entries:
        # "frechet_r": stats.frechet_r(c=1.89, loc=0.0, scale=1.0),
        # "frechet_l": stats.frechet_l(c=3.63, loc=0.0, scale=1.0),
        "genlogistic": stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        "genpareto": stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        "gennorm": stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        "genexpon": stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0),
        "genextreme": stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        "gausshyper": stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18, loc=0.0, scale=1.0),
        "gamma": stats.gamma(a=1.99, loc=0.0, scale=1.0),
        "gengamma": stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        "genhalflogistic": stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0),
        "gilbrat": gibrat(loc=0.0, scale=1.0),
        "gompertz": stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        "gumbel_r": stats.gumbel_r(loc=0.0, scale=1.0),
        "gumbel_l": stats.gumbel_l(loc=0.0, scale=1.0),
        "halfcauchy": stats.halfcauchy(loc=0.0, scale=1.0),
        "halflogistic": stats.halflogistic(loc=0.0, scale=1.0),
        "halfnorm": stats.halfnorm(loc=0.0, scale=1.0),
        "halfgennorm": stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        "hypsecant": stats.hypsecant(loc=0.0, scale=1.0),
        "invgamma": stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        "invgauss": stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        "invweibull": stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        "johnsonsb": stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        "johnsonsu": stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        "ksone": stats.ksone(n=1e03, loc=0.0, scale=1.0),
        "kstwobign": stats.kstwobign(loc=0.0, scale=1.0),
        "laplace": stats.laplace(loc=0.0, scale=1.0),
        "levy": stats.levy(loc=0.0, scale=1.0),
        "levy_l": stats.levy_l(loc=0.0, scale=1.0),
        "levy_stable": stats.levy_stable(alpha=0.357, beta=-0.675, loc=0.0, scale=1.0),
        "logistic": stats.logistic(loc=0.0, scale=1.0),
        "loggamma": stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        "loglaplace": stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        "lognorm": stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        "lomax": stats.lomax(c=1.88, loc=0.0, scale=1.0),
        "maxwell": stats.maxwell(loc=0.0, scale=1.0),
        "mielke": stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        "nakagami": stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        "ncx2": stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        "ncf": stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        "nct": stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        "norm": stats.norm(loc=0.0, scale=1.0),
        "pareto": stats.pareto(b=2.62, loc=0.0, scale=1.0),
        "pearson3": stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        "powerlaw": stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        "powerlognorm": stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0),
        "powernorm": stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        "rdist": stats.rdist(c=0.9, loc=0.0, scale=1.0),
        "reciprocal": stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0),
        "rayleigh": stats.rayleigh(loc=0.0, scale=1.0),
        "rice": stats.rice(b=0.775, loc=0.0, scale=1.0),
        "recipinvgauss": stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        "semicircular": stats.semicircular(loc=0.0, scale=1.0),
        "t": stats.t(df=2.74, loc=0.0, scale=1.0),
        "triang": stats.triang(c=0.158, loc=0.0, scale=1.0),
        "truncexpon": stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        "truncnorm": stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        "tukeylambda": stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        "uniform": stats.uniform(loc=0.0, scale=1.0),
        "vonmises": stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        "vonmises_line": stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0),
        "wald": stats.wald(loc=0.0, scale=1.0),
        "weibull_min": stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        "weibull_max": stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        "wrapcauchy": stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
    }
def value_distribution(num_samples):
    """Draw ``num_samples`` values from a fixed Generalized Pareto distribution.

    The distribution has shape 0.348238, location 0 and scale 214.476.
    """
    shape = 0.348238
    frozen = genpareto(shape, loc=0, scale=214.476)
    samples = frozen.rvs(size=num_samples)
    return samples
from scipy.stats import chi2, expon, genpareto, lognorm, pareto
import matplotlib.pyplot as plt

# Frozen distributions built from the previously estimated parameters.
dist_lognorm = lognorm(s=lognorm_params[1], scale=np.exp(lognorm_params[0]))
dist_pareto = pareto(b=pareto_params)
dist_chi2 = chi2(df=chi2_params)
dist_genpareto = genpareto(c=genpareto_params)
dist_expon = expon(scale=1 / exp_params)

# Probability grid on [0, 1] for evaluating the quantile functions.
x = np.linspace(0, 1, num=500)

# Comment out (with #) the non-relevant lines and rerun only this portion to
# get a better view of the fit.
#plt.plot(x, dist_gamma.ppf(x), color ='red', label='gamma')
#plt.plot(x, dist_pareto.ppf(x), color ='brown', label='pareto')
#plt.plot(x, dist_chi2.ppf(x), color ='purple', label='chi')
#plt.plot(x, dist_genpareto.ppf(x), color ='purple', label='chi')
import numpy as np
from scipy.stats import genpareto
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# Shape parameter used throughout the example.
c = 0.1

# First four moments: mean, variance, skewness, kurtosis.
mean, var, skew, kurt = genpareto.stats(c, moments='mvsk')

# Probability density function between the 1% and 99% quantiles.
x = np.linspace(genpareto.ppf(0.01, c), genpareto.ppf(0.99, c), 100)
ax.plot(x, genpareto.pdf(x, c), 'r-', lw=5, alpha=0.6, label='genpareto pdf')

# The same density through a "frozen" distribution with the shape fixed.
rv = genpareto(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Round-trip check of cdf against ppf (the result is not used here).
vals = genpareto.ppf([0.001, 0.5, 0.999], c)
np.allclose([0.001, 0.5, 0.999], genpareto.cdf(vals, c))

# Histogram of random draws, normalised to a density so it is comparable
# with the pdf curves. `density=True` replaces the `normed` keyword, which
# Matplotlib removed in 3.1.
r = genpareto.rvs(c, size=1000)
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
# Binned estimators on the existing histogram (fp2, bp2).
gpdparest_mlebinel = fitbinned(stats.genpareto, fp2, bp2, x0p)
gpdparest_gmmbinelidentity = fitbinnedgmm(stats.genpareto, fp2, bp2, x0p)
print('gpdparest_mlebinel', gpdparest_mlebinel)
print('gpdparest_gmmbinelidentity', gpdparest_gmmbinelidentity)

# Quantile-matching GMM with the estimator's default quantiles.
gpdparest_gmmquantile2 = fitquantilesgmm(stats.genpareto, p2rvs, start=x0p,
                                         pquant=None, frozen=None)
print('gpdparest_gmmquantile2', gpdparest_gmmquantile2)

# something wrong : something hard coded ?
# Recorded failure, kept as comments: as a non-raw string literal the Windows
# paths below contained invalid escape sequences and accidental tab escapes.
#
# >>> fitquantilesgmm(stats.genpareto, p2rvs, start=x0p,
#                     pquant=np.linspace(0.5,0.95,10), frozen=None)
# Traceback (most recent call last):
#   File "<pyshell#6>", line 1, in <module>
#     fitquantilesgmm(stats.genpareto, p2rvs, start=x0p,
#                     pquant=np.linspace(0.5,0.95,10), frozen=None)
#   File "C:\...\scikits\statsmodels\sandbox\stats\distribution_estimators.py", line 224, in fitquantilesgmm
#     parest = optimize.fmin(lambda params:np.sum(momentcondquant(distfn, params, mom2s,(pq,xqs), shape=None)**2), start)
#   File "c:\...\scipy-trunk_after\trunk\dist\scipy-0.8.0.dev6156.win32\programs\python25\lib\site-packages\scipy\optimize\optimize.py", line 183, in fmin
#     fsim[0] = func(x0)
#   File "c:\...\scipy-trunk_after\trunk\dist\scipy-0.8.0.dev6156.win32\programs\python25\lib\site-packages\scipy\optimize\optimize.py", line 103, in function_wrapper
#     return function(x, *args)
#   File "C:\...\scikits\statsmodels\sandbox\stats\distribution_estimators.py", line 224, in <lambda>
#     parest = optimize.fmin(lambda params:np.sum(momentcondquant(distfn, params, mom2s,(pq,xqs), shape=None)**2), start)
#   File "C:\...\scikits\statsmodels\sandbox\stats\distribution_estimators.py", line 210, in momentcondquant
#     cdfdiff = distfn.cdf(xq, *params) - pq
# ValueError: shape mismatch: objects cannot be broadcast to a single shape

# Same estimator with ten explicitly chosen quantiles.
print(fitquantilesgmm(stats.genpareto, p2rvs, start=x0p,
                      pquant=np.linspace(0.01, 0.99, 10), frozen=None))

# Re-bin using edges taken from the quantiles of a GPD with shape 2.
fp2, bp2 = np.histogram(p2rvs,
                        bins=stats.genpareto(2).ppf(np.linspace(0, 0.99, 10)))
print('fitbinnedgmm equal weight bins')
print(fitbinnedgmm(stats.genpareto, fp2, bp2, x0p))
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, dpi=1300) pp_plot(logeados, stats.gennorm(beta=parametros_gennormal[0], loc=parametros_gennormal[1], scale=parametros_gennormal[2]), line=True, ax=ax1) ax1.set_title('Normal generalizada', fontsize=11) pp_plot(logeados, stats.genpareto(c=parametros_pareto[0], loc=parametros_pareto[1], scale=parametros_pareto[2]), line=True, ax=ax2) ax2.set_title('Pareto generalizada', fontsize=11) pp_plot(logeados, stats.dweibull(c=parametros_weibull[0], loc=parametros_weibull[1], scale=parametros_weibull[2]), line=True, ax=ax3) ax3.set_title('Weibull doble', fontsize=11) pp_plot(logeados, stats.gamma(a=parametros_gamma[0],
import numpy as np
from scipy.stats import genpareto
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# Shape parameter used throughout the example.
c = 0.1

# First four moments: mean, variance, skewness, kurtosis.
mean, var, skew, kurt = genpareto.stats(c, moments='mvsk')

# Probability density function between the 1% and 99% quantiles.
x = np.linspace(genpareto.ppf(0.01, c), genpareto.ppf(0.99, c), 100)
ax.plot(x, genpareto.pdf(x, c), 'r-', lw=5, alpha=0.6, label='genpareto pdf')

# The same density through a "frozen" distribution with the shape fixed.
rv = genpareto(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Round-trip check of cdf against ppf (the result is not used here).
vals = genpareto.ppf([0.001, 0.5, 0.999], c)
np.allclose([0.001, 0.5, 0.999], genpareto.cdf(vals, c))

# Histogram of random draws, normalised to a density so it is comparable
# with the pdf curves. `density=True` replaces the `normed` keyword, which
# Matplotlib removed in 3.1.
r = genpareto.rvs(c, size=1000)
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()