def fit(self, data):
    """
    Fit a Generalised Pareto Distribution (GPD) to ``data``.

    The threshold is the ``self.threshold``-th percentile of the strictly
    positive records. The data are padded with zeros up to
    ``int(self.numsim * NPYR)`` entries so the exceedance rate is
    expressed relative to the full padded record.

    :param data: array of data values.
    :type data: :class:`numpy.ndarray`

    :returns: tuple ``(w, location, scale, shape)`` where ``w`` holds the
              return levels for ``self.intervals``. Every element is
              ``self.nodata`` when there are fewer than
              ``self.minrecords`` values, when the fit raises, or when
              the fitted shape is positive.
    """
    nodata = self.nodata

    def _missing():
        # Fresh "no result" tuple so callers never share one array.
        return (nodata * np.ones(len(self.intervals)),
                nodata, nodata, nodata)

    # Guard first: no point computing a percentile we will not use.
    if len(data) < self.minrecords:
        return _missing()

    recs = data[data > 0]
    mu = scoreatpercentile(recs, self.threshold)

    # Fill each day that a cyclone isn't recorded with zero so we get
    # the correct rate for the return periods
    datafilled = np.zeros(int(self.numsim * NPYR))
    datafilled[-len(data):] = data
    log.debug("The length of the filled data is {0}".format(
        len(datafilled)))

    exceedances = datafilled[datafilled > mu]
    rate = float(len(exceedances)) / float(len(datafilled))
    log.debug("The calculated rate is: {0}".format(rate))

    try:
        shape, location, scale = genpareto.fit(exceedances, floc=mu)
    except Exception:
        # Best effort: a failed fit is reported as missing values.
        return _missing()

    w = gpdReturnLevel(self.intervals, mu, shape, scale, rate)

    # Fits with a positive shape parameter are rejected and reported as
    # missing values.
    if shape > 0:
        return _missing()
    return w, location, scale, shape
def gpdfit(data, years, numsim, missingValue=-9999, minrecords=50,
           threshold=99.5):
    """
    Fit a Generalised Pareto Distribution to the data. For a quick
    evaluation, we use the 99.5th percentile as a threshold.

    :param data: array of data values.
    :type data: :class:`numpy.ndarray`
    :param years: array of years for which to calculate return period
                  values.
    :type years: :class:`numpy.ndarray`
    :param int numsim: number of simulations created.
    :param float missingValue: value to insert if fit does not converge.
    :param int minrecords: minimum number of valid observations required
                           to perform fitting.
    :param float threshold: Threshold for performing the fitting. Default
                            is the 99.5th percentile

    :returns: tuple ``(Rpeval, location, scale, shape)``: return period
              values as a :class:`numpy.ndarray` followed by the
              location, scale and shape parameters. All four are filled
              with ``missingValue`` when there are too few records, when
              the fit raises, or when the fitted shape is positive.
    """
    loc, scl, shp = [missingValue, missingValue, missingValue]
    Rp = missingValue * np.ones(len(years))

    log.debug("The length of the data currently is {0}".format(len(data)))

    if len(data) < minrecords:
        return Rp, loc, scl, shp

    # Only computed once we know there is enough data to use it.
    mu = scoreatpercentile(data, threshold)

    # Fill each day that a cyclone isn't recorded with zero so we get
    # the correct rate for the return periods
    datafilled = np.zeros(int(numsim * 365.25))
    datafilled[-len(data):] = data
    log.debug("The length of the filled data is {0}".format(len(datafilled)))

    exceedances = datafilled[datafilled > mu]
    rate = float(len(exceedances)) / float(len(datafilled))
    log.debug("The calculated rate is: {0}".format(rate))

    try:
        shape, location, scale = genpareto.fit(exceedances, floc=mu)
    except Exception:
        # Best effort: a failed fit is reported as missing values, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return Rp, loc, scl, shp

    Rpeval = gpdReturnLevel(years, mu, shape, scale, rate)

    # Fits with a positive shape parameter are rejected.
    if shape > 0:
        return Rp, loc, scl, shp
    return Rpeval, location, scale, shape
def calculateShape(mu, data):
    """
    Fit a Generalised Pareto Distribution to the exceedances over ``mu``.

    :param float mu: threshold parameter for the GPD distribution.
    :param data: :class:`numpy.ndarray` of data values to fit.

    :returns: the parameter tuple returned by ``genpareto.fit`` for the
              values of ``data`` above ``mu``, shifted down by ``mu``.
    """
    # The exceedance rate that used to be computed here was never used
    # and only added a ZeroDivisionError on empty input.
    exceedances = data[data > mu] - mu
    return genpareto.fit(exceedances)
def calculateShape(mu, data):
    """
    Fit a Generalised Pareto Distribution to the exceedances over ``mu``.

    :param float mu: threshold parameter for the GPD distribution.
    :param data: :class:`numpy.ndarray` of data values to fit.

    :returns: the parameter tuple returned by ``genpareto.fit`` for the
              values of ``data`` above ``mu``, shifted down by ``mu``.
    """
    # The exceedance rate that used to be computed here was never used
    # and only added a ZeroDivisionError on empty input.
    exceedances = data[data > mu] - mu
    return genpareto.fit(exceedances)
def GeneralizedPareto_ICDF(x, p):
    '''
    Generalized Pareto fit.

    Fits a generalized Pareto distribution to ``x`` and returns the
    inverse cumulative distribution function (percent points) of that
    fitted distribution evaluated at the probabilities ``p``.
    '''
    # fit a generalized pareto and get params
    shape, loc, scale = genpareto.fit(x)
    # The location is estimated by the fit, so it must be used when
    # evaluating the distribution; dropping it evaluates a different
    # distribution from the one that was fitted.
    icdf = genpareto.ppf(p, shape, loc=loc, scale=scale)
    return icdf
def GeneralizedPareto_CDF(x):
    '''
    Generalized Pareto fit.

    Fits a generalized Pareto distribution to ``x`` and returns the
    cumulative distribution function of that fitted distribution
    evaluated at ``x``.
    '''
    # fit a generalized pareto and get params
    shape, loc, scale = genpareto.fit(x)
    # The location is estimated by the fit, so it must be used when
    # evaluating the distribution; dropping it evaluates a different
    # distribution from the one that was fitted.
    cdf = genpareto.cdf(x, shape, loc=loc, scale=scale)
    return cdf
def CalculaParametros(self):
    """
    Fit an extreme-value distribution to ``self.dadoSerie`` and print it.

    A generalized Pareto distribution is fitted when ``self.tipoSerie``
    is ``'Parcial'`` and a generalized extreme value distribution when it
    is ``'Anual'``.

    Returns the ``(shape, location, scale)`` tuple produced by the fit.
    Any other ``tipoSerie`` falls through and returns ``None``.
    """
    if self.tipoSerie == 'Parcial':
        Parametro = genpareto.fit(self.dadoSerie)
        # '%f' (not '%.f'): '%.f' rounds the shape to zero decimals.
        print('Parametros com Pareto: \nForma: %f, Localidade: %f, Escala: %f' % (Parametro[0],Parametro[1],Parametro[2]))
        return Parametro
    elif self.tipoSerie == 'Anual':
        Parametro = genextreme.fit(self.dadoSerie)
        print('Parametros com Gev: \nForma: %f, Localidade: %f, Escala: %f' % (Parametro[0],Parametro[1],Parametro[2]))
        return Parametro
def generalized_pareto_distribution_fit(peaks_over_th, threshold, loc=None, scale=None):
    """
    Fit the exceedances over ``threshold`` to a Generalized Pareto
    distribution.

    ``loc`` and ``scale`` are forwarded to ``genpareto.fit`` only when
    they are not ``None``.

    Returns a list ``[shape, location, scale]`` in which the location
    entry is overwritten with ``threshold``.
    """
    # NOTE: omitting loc/scale is known to give different results from
    # passing the default parameter values explicitly, which is why
    # absent arguments are left out of the call rather than defaulted.
    optional_args = {}
    if loc is not None:
        optional_args["loc"] = loc
    if scale is not None:
        optional_args["scale"] = scale

    fitted = list(genpareto.fit(peaks_over_th - threshold, **optional_args))

    # Set the localization parameter equal to the threshold
    fitted[1] = threshold
    return fitted
def _p(test_i, null_i, M_i, d_i):
    """
    Estimate a p-value for one observation from its null distribution.

    Returns a 4-tuple: the p-value estimate, a bool telling whether the
    GPD tail estimate (True) or the ECDF estimate (False) was returned,
    the goodness-of-fit p-value of the GPD fit (NaN when the ECDF
    estimate is returned), and the raw ECDF estimate. All four are
    NaN/False/NaN/NaN when every null value is NaN.

    Reads ``n``, ``N``, ``m``, ``ecdf_pseudocount``, ``decrease_n_by``,
    ``ad_test`` and ``warn`` from the enclosing scope.
    NOTE(review): presumably ``N`` is the number of null samples, ``n``
    the initial number of tail exceedances and ``m`` the exceedance-count
    cut-off below which the GPD tail is used; confirm against the
    enclosing function.
    """
    gpd_fit = None
    gpd_fit_p_value = None
    n_i = n

    # TODO: no need to sort as much as N numbers, do partial sort:
    #   but this requires some tests (both performance and unit)
    #   null_i_partitioned = np.partition(null_i, n_i+1)
    #   null_i_first_n_sorted = sorted(null_i_partitioned[:-n_i+1])
    null_i = sorted(null_i)
    t = None

    if all(np.isnan(null_i)):
        return np.nan, False, np.nan, np.nan

    # compute ecdf based, biased estimate of p-value
    raw_ecdf_estimate = (ecdf_pseudocount + d_i.sum()) / (N + 1)

    if M_i < m:
        # fit GPD, reducing $n$ until convergence
        while n_i > 0:
            # Threshold: midpoint of two sorted null values just below
            # the top n_i values.
            # -1 because Python has 0-based indexing
            t = (null_i[-n_i-1] + null_i[-n_i-2]) / 2

            y_untill_n = null_i[-n_i:]
            # `null_i` is a list here; the subtraction relies on `t`
            # being a NumPy scalar so the result is an array.
            # assumes null_i holds NumPy floats; TODO confirm
            exceedences = y_untill_n - t

            assert all(y_untill_n >= t)
            assert len(exceedences) == n_i

            fit = genpareto.fit(exceedences)
            fitted = genpareto(*fit)
            gpd_fit = fitted

            gpd_fit_p_value = ad_test(exceedences, fitted).pvalue

            # NOTE(review): the loop stops, and the GPD estimate below is
            # used, when the test p-value is <= 0.05. If `ad_test` is a
            # standard goodness-of-fit test, a small p-value would mean a
            # poor fit. Confirm the intended direction of this check.
            if gpd_fit_p_value <= 0.05:
                break
            else:
                n_i -= decrease_n_by

    if gpd_fit and gpd_fit_p_value < 0.05:
        return n_i / N * (1 - gpd_fit.cdf(test_i - t)), True, gpd_fit_p_value, raw_ecdf_estimate
    else:
        if gpd_fit:
            # TODO: get index and highlight which observation could not be fitted!
            warn(f'A good GPD fit could not be reached, using ECDF estimate instead')
        return raw_ecdf_estimate, False, np.nan, raw_ecdf_estimate
def fit_tail(tail):
    """
    Fit ``tail`` with scipy's genpareto and evaluate the fitted cdf on it.

    Args:
        tail (numpy.ndarray): tail to fit

    Returns:
        numpy.ndarray, tuple: cdf of the data under the fitted
        distribution, sorted in ascending order, and the fit parameters
        (c, loc, scale).
    """
    # The location is pinned to zero because the data is expected to be
    # transformed beforehand; see generate_tails for further information.
    params = genpareto.fit(tail, floc=0)
    shape, location, scale = params[0], params[1], params[2]

    # Freeze a distribution with the estimated parameters.
    frozen = genpareto(c=shape, loc=location, scale=scale)

    # Evaluate the cdf on the data and sort it ascending, in place.
    sorted_cdf = frozen.cdf(tail)
    sorted_cdf.sort()
    return sorted_cdf, params
def montecarlo_simulation(self, mc_steps=None):
    """
    Runs Monte Carlo simulation for the optimal position.

    Random samples are drawn from the distribution stored for the optimal
    tail index, each sample is refitted with genpareto (location fixed at
    zero), and the three test statistics of the refit are compared with
    the values stored for the data.

    Args:
        mc_steps: number of Monte Carlo steps to run. Defaults to
            ``self.mc_steps`` when ``None``.

    Returns:
        int: number of samples whose AU2 statistic exceeds the data's
        int: number of samples whose Anderson-Darling statistic exceeds
            the data's
        int: number of samples whose Cramér-von Mises statistic exceeds
            the data's
        int: number of montecarlo steps

    Raises:
        RuntimeError: if the function gets called when the fit for the
            optimal tail start has not been run before.
    """
    fits_missing = (self.optimal_tail_index is None
                    or self.rv_list is None
                    or self.cdf_list is None)
    if fits_missing:
        raise RuntimeError("Fits have to run before the Monte Carlo simulation")

    if mc_steps is None:
        mc_steps = self.mc_steps

    # make sure every thread has a different seed
    rng = np.random.RandomState(np.random.seed())

    tail_idx = self.optimal_tail_index
    sample_shape = (mc_steps, self.optimal_tail.size)
    samples = self.rv_list[tail_idx].rvs(size=sample_shape, random_state=rng)

    exceed_au2 = 0
    exceed_a2 = 0
    exceed_w2 = 0
    for step, sample in enumerate(samples):
        print("\t" + str(step) + "/" + str(mc_steps), end='\r', flush=True)

        descending = np.sort(sample)[::-1]
        params = genpareto.fit(descending, floc=0)
        refit = genpareto(c=params[0], loc=params[1], scale=params[2])
        sorted_cdf = np.sort(refit.cdf(sample))

        if au2(sorted_cdf) > self.au_2_data[self.optimal_tail_index]:
            exceed_au2 += 1
        if anderson_darling(sorted_cdf) > self.anderson_data[self.optimal_tail_index]:
            exceed_a2 += 1
        if cramer_von_mises(sorted_cdf) > self.cramer_data[self.optimal_tail_index]:
            exceed_w2 += 1

    return exceed_au2, exceed_a2, exceed_w2, mc_steps
def SetUpSemiParametricCDFPlot(c1, u, plotvsc1=False, name="Semi-Parametric Fit", xlabel="", ylabel=""):
    """
    Compute the semi-parametric CDF/PDF fit of ``c1`` for later plotting.

    A generalized Pareto distribution is fitted to ``abs(v) - u`` for
    every value ``v`` of ``c1`` whose magnitude exceeds the threshold
    ``u``. The fitted parameters are handed to
    ``HybridSemiParametricGPDCDF`` and ``HybridSemiParametricGPDPDF``,
    evaluated on 1000 equally spaced points between ``min(c1)`` and
    ``max(c1)``, or on ``c1`` itself when ``plotvsc1`` is not False.

    Returns a 17-tuple: the result dict ``{'%.10f' % u: (r2c, r2p)}``,
    ``c1``, ``u``, the five outputs of the CDF helper, the five outputs
    of the PDF helper, then ``plotvsc1``, ``name``, ``xlabel`` and
    ``ylabel`` unchanged.
    """
    # `== False` (not `not plotvsc1`) is kept on purpose so that values
    # such as None still select `c1`, exactly as before.
    x = np.linspace(min(c1), max(c1), 1000) if plotvsc1 == False else c1  # noqa: E712

    exceedances = [abs(rvs) - u for rvs in c1 if abs(rvs) > u]

    # Single fit attempt with warnings silenced. The former
    # `while fits == None` retry loop could only ever repeat the same
    # deterministic call, i.e. spin forever.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        fits = genpareto.fit(exceedances)

    r1c, r2c, r3c, r4c, bwArr_Cdf = HybridSemiParametricGPDCDF(
        x, u, c1, fits[0], loc=fits[1], scale=fits[2])
    # NOTE(review): the results of the next two calls are unused here.
    # The calls are kept in case the helpers have side effects; confirm
    # and remove if they are pure.
    pd.Series(r1c).apply(Empirical_StepWise_CDF(sorted(c1)))
    DualSortByL1(r1c, r2c)

    r1p, r2p, r3p, r4p, bwArr_pdf = HybridSemiParametricGPDPDF(
        x, u, c1, fits[0], loc=fits[1], scale=fits[2])
    DualSortByL1(r1p, r2p)  # result unused, see note above

    result = {'%.10f' % (u): (r2c, r2p)}

    plt.subplots_adjust(hspace=0.48)
    return result, c1, u, r1c, r2c, r3c, r4c, bwArr_Cdf, r1p, r2p, r3p, r4p, bwArr_pdf, plotvsc1, name, xlabel, ylabel
def CalculaParametros(self):
    """
    Extract the peaks of ``self.dadoSerie`` and fit a distribution to them.

    For ``self.tipoSerie == 'Parcial'`` a limit is obtained from
    ``lp.LimiteParcial(...).AchaLimite(2)``, the partial-maximum series
    for that limit is built, and a generalized Pareto distribution is
    fitted to its peaks. For ``'Anual'`` the annual-maximum series is
    built and a generalized extreme value distribution is fitted.

    Prints the fitted parameters and returns the
    ``(shape, location, scale)`` tuple produced by the fit. Any other
    ``tipoSerie`` falls through and returns ``None``.
    """
    if self.tipoSerie == 'Parcial':
        # Find the threshold value
        limite = lp.LimiteParcial(self.dadoSerie).AchaLimite(2)
        print(limite)
        Parciais = se.Series(self.dadoSerie).serieMaxParcial(limite)
        _, PicosParciais = se.Series(Parciais).separaDados()
        Parametro = genpareto.fit(PicosParciais)
        # '%f' (not '%.f'): '%.f' rounds the shape to zero decimals.
        print('Parametros com Pareto: \nForma: %f, Localidade: %f, Escala: %f' % (Parametro[0],Parametro[1],Parametro[2]))
        return Parametro
    elif self.tipoSerie == 'Anual':
        Anuais = se.Series(self.dadoSerie).serieMaxAnual()
        _, PicosAnuais = se.Series(Anuais).separaDados()
        Parametro = genextreme.fit(PicosAnuais)
        print('Parametros com Gev: \nForma: %f, Localidade: %f, Escala: %f' % (Parametro[0],Parametro[1],Parametro[2]))
        return Parametro
def gpdfit(data, rate, years, missingValue=-9999, minrecords=50,
           thresh=99.7):
    """
    Fit a Generalised Pareto Distribution to the data. For a quick
    evaluation, we use the 99.7th percentile as a threshold.

    :param data: array of data values.
    :type data: :class:`numpy.ndarray`
    :param rate: ignored; the exceedance rate is recomputed from ``data``
                 (kept for backward compatibility of the signature).
    :param years: array of years for which to calculate return period
                  values.
    :type years: :class:`numpy.ndarray`
    :param float missingValue: value to insert if fit does not converge.
    :param int minrecords: minimum number of valid (positive)
                           observations required to perform fitting.
    :param float thresh: Threshold for performing the fitting. Default is
                         the 99.7th percentile

    :return: tuple ``(w, location, scale, shape)``: return period values
             followed by the location (the threshold), scale and shape
             parameters of the distribution, determined using MLE (as per
             :meth:`scipy.stats.rv_continuous.fit`). All four are filled
             with ``missingValue`` when there are too few positive
             records or when the fit raises.
    """
    loc, scale, shp = [missingValue, missingValue, missingValue]
    w = missingValue * np.ones(len(years))

    # Always hand back the 4-tuple; a bare `return` here used to yield
    # None and made the documented missing-value result unreachable.
    if len(data[data > 0]) < minrecords:
        return w, loc, scale, shp

    mu = scoreatpercentile(data, thresh)
    exceedances = data[data > mu]

    # Rate is the fraction of records above the threshold. The
    # denominator is the record count; `mu` is a scalar and has no len().
    rate = float(len(exceedances)) / float(len(data))

    try:
        params = genpareto.fit(exceedances, floc=mu)
    except Exception:
        # Best effort: a failed fit is reported as missing values.
        return w, loc, scale, shp

    w = gpdReturnLevel(years, mu, params[0], params[2], rate)
    return w, mu, params[2], params[0]
# Script section: builds Value-at-Risk tables from four estimators
# (empirical, EWMA, GEV block maxima, GPD peaks-over-threshold) and checks
# them against one crash loss.
# NOTE(review): `log_return`, `port_loss`, `sort_port_loss`, `port_value`
# and the helpers (EWMA, block_maximuns, EMP_VaR, EWMA_VaR, GEV_VaR,
# GPD_VaR, see_crash) are defined outside this excerpt.

# Both EWMA parameters share the same value.
lamda = theta = 0.97
# Initial estimates from the first 499 rows of the log returns.
mu = np.mean(log_return.iloc[:499,])
# NOTE(review): named `covariance` but computed with np.std; confirm this
# is what EWMA() expects.
covariance = np.std(log_return.iloc[:499,])
ewma_mu,ewma_covariance = EWMA(mu,covariance,lamda,theta,log_return)

# Block maxima: fit `gev` to the per-block maxima of the portfolio loss.
blocks_size = 125
num_blocks = math.floor(log_return.shape[0]/blocks_size)
max_in_blocks = block_maximuns(num_blocks,blocks_size, port_loss)
gev_values = gev.fit(max_in_blocks)

# Peaks over threshold: the threshold is taken from `sort_port_loss` at the
# index corresponding to `alpha_thresh`.
# assumes sort_port_loss is sorted ascending; TODO confirm
alpha_thresh = 0.95
u_value = sort_port_loss[math.floor(log_return.shape[0] * alpha_thresh)]
cdf_u_value = math.ceil(log_return.shape[0] * alpha_thresh) / log_return.shape[0]
# Exceedances over the threshold, shifted down by the threshold.
sort_data = port_loss[port_loss > u_value].dropna()- u_value
# First column of the exceedances as a 1-D array.
y_data = np.array(sort_data)[:, 0].T
gpd_values = gpd.fit(y_data)

# Grid of confidence levels on which the VaR tables are evaluated.
high_alpha = 0.9999
low_alpha = 0.99
delta_alpha = 0.000099
alpha = np.arange(low_alpha,high_alpha,delta_alpha)

emp_var_table = EMP_VaR(alpha, sort_port_loss)
ewma_var_table = EWMA_VaR(ewma_mu[0],ewma_covariance[0], alpha,port_value )
gev_var_table = GEV_VaR(gev_values,blocks_size,alpha)
gpd_var_table = GPD_VaR(gpd_values,u_value,cdf_u_value,alpha)

# Loss implied by a single hard-coded crash log return.
crash_log_return = -0.099452258
crash_loss = -port_value * crash_log_return
emp_pre = see_crash(emp_var_table, crash_loss, delta_alpha)
ewma_pre = see_crash(ewma_var_table, crash_loss, delta_alpha)
label="rnds") ax.legend(loc="best", frameon=False) print("maximum: {}".format(max(random_numbers))) print("minimum: {}".format(min(random_numbers))) sample_mean = np.mean(random_numbers) sample_mean_sq = np.mean(random_numbers**2) sample_variance = np.var(random_numbers) sample_variance_sq = np.var(random_numbers**2) sigma_est = 0.5 * sample_mean * sample_mean_sq / ( sample_mean_sq - sample_mean**2) # OK gamma_est = 0.5 - (sample_mean**2 / (2 * (sample_mean_sq - sample_mean**2))) # ok print("sigma closed est: {}".format(sigma_est)) print("gamma closed est: {}".format(gamma_est)) sigma_moments = 0.5 * sample_mean * (sample_mean**2 / sample_variance + 1) gamma_moments = -0.5 * (sample_mean**2 / sample_variance - 1) print("sigma moments est: {}".format(sigma_moments)) print("gamma moments est: {}".format(gamma_moments)) # print(sigma_moments - sigma_est) # print(gamma_moments - gamma_est) # print(1 - sigma/sigma_moments) mle_fit = genpareto.fit(random_numbers) print("gamma mle {}".format(mle_fit[0])) print("mu mle {}".format(mle_fit[1])) print("sigma mle {}".format(mle_fit[2])) plt.show()
y, e, w = honu_filter.run(desired_output, filter_data) elbnd = pa.detection.ELBND(w, e, function="sum") dw = np.copy(w) dw[1:] = np.abs(np.diff(dw, n=1, axis=0)) dw_count = int(dw.shape[0]) hpp = np.ones((dw_count - gev_window, filter_len)) for i in range(gev_window, dw.shape[0]): if i % 100 == 0: pass # print((str(datetime.now())), " processing: ", i) for j in range(filter_len): poted_values = pot(dw[i - gev_window:i, j], 1) if dw[i, j] > poted_values[-1]: fit = genpareto.fit(poted_values, floc=[poted_values[-1]]) fit = genpareto.fit(poted_values, floc=fit[1], fscale=fit[2]) if j == 0: #print(fit[2]) mu_check.append(poted_values[-1]) gamma = fit[0] mu = fit[1] sigma = fit[2] #gpd_params_dict[str(j + 1)]["gamma"].append(gamma) #gpd_params_dict[str(j + 1)]["mu"].append(mu[0]) #gpd_params_dict[str(j + 1)]["sigma"].insert(sigma) if dw[i, j] >= fit[1]: hpp[i - gev_window, j] = 1 - genpareto.cdf(
def extremesl_fit(station_data_file, pipeline_id):
    """
    Fit a generalized Pareto distribution to each station's extremes.

    Loads the pickled station data, fits a GPD per station to the
    declustered extremes above the station's threshold, drops stations
    whose negative log-likelihood from ``gplike`` is infinite, and writes
    the remaining results to ``<pipeline_id>_fit.pkl`` next to this
    module.

    :param station_data_file: path of the pickle holding the station data
        (a mapping from station id to a dict with the keys ``pot_vals``,
        ``decl_extremes`` and ``obs``).
    :param pipeline_id: identifier used to build the output file name.
    :returns: 0 on success.
    :raises OSError: if the station data file cannot be opened.
    :raises Exception: if no station has a valid GPD fit.
    """
    # Load the station data file. A failed open is now re-raised after the
    # message instead of falling through to an unbound file handle.
    # SECURITY: pickle.load can execute arbitrary code; only feed this
    # function files from a trusted source.
    try:
        with open(station_data_file, 'rb') as f:
            station_data = pickle.load(f)
    except OSError:
        print("Cannot open station data file: {}\n".format(station_data_file))
        raise

    # Keep track of station IDs that don't produce valid GPD fit
    bad_stations = []

    # Initialize dictionary to hold fitted data
    fitted_data = {}

    # Loop through the available station data
    for station_id in station_data:

        # range of Peak-Over-Threshold values and corresponding heights
        potvals = station_data[station_id]['pot_vals']
        # extreme values based on POT with 95%
        extremes_prepo = station_data[station_id]['decl_extremes']

        # Location parameter = threshold water level above which return
        # levels are estimated with the GPD.
        # NOTE(review): `pot_vals` is used as the threshold directly; an
        # earlier lookup by `gpd_pot_threshold` was disabled. Confirm that
        # `pot_vals` is a scalar here.
        loc = potvals

        # extremes that exceed location parameter
        extremes_loc = extremes_prepo[extremes_prepo['height'] > loc]
        extremes_loc_timesorted = extremes_loc.sort_values(by=['mytime'])

        # count declustered days with extreme obs per year
        decl_esls_pyear = extremes_loc.groupby(
            extremes_loc_timesorted.mytime.dt.year, as_index=True).count()
        esl_years = np.unique(extremes_loc_timesorted['mytime'].dt.year.values)

        # Average exceedances per year for the configured threshold.
        # NOTE(review): assumes hourly observations with a full
        # 365.25 * 24 hours per year; gaps in the record bias this.
        avg_exceed = 365.25 * 24 * len(extremes_loc) / len(
            station_data[station_id]['obs'])

        ##### FIT GPD
        exceedances = extremes_loc['height'].values - loc

        # provide initial guess using method of moments (based on gpfit.m)
        xbar = np.mean(exceedances)  # mean of extremes
        s2 = np.var(exceedances)  # variance of extremes
        k0 = -.5 * ((xbar**2) / s2 - 1)  # initial guesses
        sigma0 = .5 * xbar * ((xbar**2) / s2 + 1)
        xmax = max(exceedances)

        # Method of moments invalid (code based on gpfit.m)
        if (k0 < 0 and xmax >= -sigma0 / k0):
            # assume exponential distribution
            k0 = 0
            sigma0 = xbar

        # Fit gpd based on exceedances and initial guess; optimization
        # values differ slightly from gpfit in MATLAB.
        # NOTE(review): `k0` is not passed to the fit, and `loc=0` is only
        # a starting value in scipy (`floc=0` would hold it fixed).
        gp_params = genpareto.fit(exceedances, loc=0, scale=sigma0)

        # calculate covariance matrix of estimated parameters
        gp_nlogl, gp_cov = gplike(gp_params[0], gp_params[2], exceedances)

        # Are the GPD parameters outside the supported range? If so, add
        # this station to the list of bad stations.
        # `np.inf`: the `np.Inf` alias was removed in NumPy 2.0.
        if gp_nlogl == np.inf:
            print(
                'GDP parameters of ' + station_id +
                ' are outside of the supported range, confidence intervals and standard errors cannot be computed reliably.'
            )
            print(station_id + ' will be ommitted from the results.')
            bad_stations.append(station_id)

        ###### QUERY HISTORICAL RETURN FREQUENCIES FOR TEST HEIGHTS
        # get MHHW for Gumbel distribution below Pareto
        obs = station_data[station_id]['obs']
        obs2D = obs.groupby(pd.Grouper(key='mytime', freq='2D')).max()  # 2-daily maxima

        # MHHW as mean of the maximum value per 2 days (to account for a
        # tidal cycle longer than 1 day)
        mhhw = np.nanmean(obs2D.height)
        mhhwFreq = 365.25 / 2  # assume MHHW is exceeded once every 2 days

        ##### STORE DATA IN DICTIONARY
        fitted_data[station_id] = {
            'loc': loc,
            'gp_shape': gp_params[0],
            'gp_scale': gp_params[2],
            'gp_cov': gp_cov,
            'extremes_loc': extremes_loc,
            'avg_exceed': avg_exceed,
            'decl_esls_pyear': decl_esls_pyear,
            'esl_years': esl_years,
            'mhhw': mhhw,
            'mhhwFreq': mhhwFreq,
        }

    # Remove the stations that don't support GPD fit
    for bad_station in bad_stations:
        del fitted_data[bad_station]

    # If there are no more stations left, raise an error
    if len(fitted_data) == 0:
        raise Exception("No stations available with valid GPD fit")

    # Write station data to an output pickle
    outdir = os.path.dirname(__file__)
    with open(os.path.join(outdir, "{}_fit.pkl".format(pipeline_id)),
              'wb') as outfile:
        pickle.dump(fitted_data, outfile)

    return 0
def _PlotSemiParametricFitResults(self, c1, u, r1c, r2c, r3c, r4c, bwArr_Cdf, r1p, r2p, r3p, r4p, bwArr_pdf, plotvsc1=False, name="Semi-Parametric Fit", xlabel="", ylabel=""):
    '''
    Wrapper to plot the results of SemiParametricCDFFit.

    Draws the CDF panel (hybrid fit, empirical step-wise CDF of ``c1`` and
    the sorted smoother) and the PDF panel (hybrid fit, sorted smoother
    and a 15-bin density histogram of ``c1``) on a new figure labelled
    ``name``. ``u`` and ``plotvsc1`` are accepted for signature
    compatibility but do not influence the plot.

    Returns None.
    '''
    # This wrapper used to refit a generalized Pareto distribution to the
    # exceedances of `c1` in a retry loop and never used the result; that
    # dead work has been removed.
    # NOTE(review): the figure is created with 3 rows but the panels below
    # are placed on a 2-row grid via plt.subplot; kept as in the original.
    fig, ax = plt.subplots(3, 1, sharey=True, figsize=(7, 7))
    fig.subplots_adjust(wspace=0)

    # FigureCanvas.set_window_title was removed from Matplotlib; the
    # manager method works on old and new versions alike.
    manager = getattr(fig.canvas, "manager", None)
    if manager is not None:
        manager.set_window_title(name)
    fig.set_label(name)

    # --- CDF panel ---
    plt.subplot(2, 1, 1)
    emp = pd.Series(r1c).apply(Empirical_StepWise_CDF(sorted(c1)))
    r1s, r2cs = DualSortByL1(r1c, r2c)
    plt.plot(r3c, r4c, linewidth=2)
    plt.plot(r1c, emp, linewidth=2)
    plt.plot(r1s, r2cs, linewidth=2)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title("Semi Parametric CDF with BandWidth {0}".format(
        bwArr_Cdf).replace('[ ', "list_").replace(']', "_"))
    plt.legend(["Fitted_HybridCDF", "ECDF Comparison", "CDF_Smoother"],
               loc='best')

    # --- PDF panel ---
    plt.subplot(2, 1, 2)
    r1s, r2ps = DualSortByL1(r1p, r2p)
    plt.plot(r3p, r4p, linewidth=2)
    plt.plot(r1s, r2ps, linewidth=2)
    # `density=True` replaces the `normed=True` keyword, which was removed
    # from Matplotlib.
    plt.hist(np.array(c1), bins=15, density=True)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title("Semi Parametric PDF with BandWidth {0}".format(
        bwArr_pdf).replace('[ ', "list_").replace(']', "_"))
    plt.legend(["Fitted_HybridPDF", "PDF_Smoother", "Data Histogram"],
               loc='best')

    plt.subplots_adjust(hspace=0.48)
def fit_resample(self):
    """
    Draw a synthetic sample from the stored GPD and refit it.

    Generates ``self.size`` generalized Pareto variates with shape
    ``self.shape``, location ``self.location`` and scale ``self.scale``,
    and returns the parameter tuple of ``genpareto.fit`` on that sample.
    """
    synthetic = genpareto.rvs(
        self.shape,
        loc=self.location,
        scale=self.scale,
        size=self.size,
    )
    return genpareto.fit(synthetic)
#print(bins_ot) fig, ax = plt.subplots(nrows=2, ncols=2) fig.tight_layout() ax[-1, -1].axis('off') hist_ot = ax[0][0].hist(x=lat, bins=bins_ot, histtype='stepfilled', alpha=0.3) ax[0][0].set_xlabel('latency [\u03BCs]', fontsize=8) ax[0][0].set_yscale('log') #print(hist_ot[0]) hist_ot_norm = ax[1][0].hist(x=lat, bins=bins_ot, density=True, histtype='stepfilled', alpha=0.3) # Fit using the fitter of the genpareto class (shown in red). ret = gpareto.fit(lat, loc=threshold) ax[1][0].plot(x, gpareto.pdf(x, c=ret[0], loc=ret[1], scale=ret[2]), 'r-', lw=1, color='red', alpha=0.8) ax[1][0].set_xlabel('latency [\u03BCs]', fontsize=8) print(ret) print('\ngoodness-of-fit: ' + '{:03.3f}'.format(chi2_test(hist_ot_norm, n_bins=n_bins, c=ret[0], loc=ret[1], scale=ret[2], norm=len(lat)))) print("\n curve_fit:") # Fit using the curve_fit fitter. Fix the value of the "loc" parameter. popt, pcov = cfit(lambda x, c, scale: gpareto.pdf(x, c=c, loc=threshold, scale=scale),
def gpd_fit(y):
    """
    Fit a generalized Pareto distribution to ``y`` with location fixed at 0.

    Warnings raised during the fit are suppressed.

    Returns the pair ``(shape, scale)`` of the maximum-likelihood fit.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        shape, _unused_loc, scale = genpareto.fit(y, floc=0)
    return shape, scale
def SemiParametricCDFFit(c1, u, plotvsc1=False, name="Semi-Parametric Fit", xlabel="", ylabel=""):
    '''
    Calculates a SemiParametric fit to the data in c1 and plots it.

    Uses a gaussian kernel estimation within the centre of the distribution
    of c1, which is decided by the threshold u. Uses a Generalised Pareto
    distribution to fit both tails outside of the threshold governed by u.

    Returns a dict with the single key '%.10f' % u mapping to a tuple
    holding the range (y points) of the (SemiPara-CDF, SemiPara-PDF);
    if (plotvsc1 = False) => the y points depend on 1000 equally spaced
        points between min(c1) and max(c1).
    if (plotvsc1 = True) => the y points depend on the points in c1 and
        maintain the order of c1 in the outputted array, i.e. F_n(c1)
        where F_n is the semiparametric fitted function.

    Reference on histograms and kernel density estimation:
    https://mglerner.github.io/posts/histograms-and-kernel-density-estimation-kde-2.html?p=28
    '''
    # `== False` (not `not plotvsc1`) is kept on purpose so that values
    # such as None still select `c1`, exactly as before.
    x = np.linspace(min(c1), max(c1), 1000) if plotvsc1 == False else c1  # noqa: E712

    # NOTE(review): the figure is created with 3 rows but the panels below
    # are placed on a 2-row grid via plt.subplot; kept as in the original.
    fig, ax = plt.subplots(3, 1, sharey=True, figsize=(7, 7))
    fig.subplots_adjust(wspace=0)

    # FigureCanvas.set_window_title was removed from Matplotlib; the
    # manager method works on old and new versions alike.
    manager = getattr(fig.canvas, "manager", None)
    if manager is not None:
        manager.set_window_title(name)
    fig.set_label(name)

    # Magnitudes beyond the threshold, shifted down by the threshold; both
    # tails are pooled through abs().
    exceedances = [abs(rvs) - u for rvs in c1 if abs(rvs) > u]

    # Single fit attempt with warnings silenced. The former
    # `while fits == None` retry loop could only ever repeat the same
    # deterministic call, i.e. spin forever.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        fits = genpareto.fit(exceedances)

    # --- CDF panel ---
    plt.subplot(2, 1, 1)
    r1, r2c, r3, r4, bwArr = HybridSemiParametricGPDCDF(
        x, u, c1, fits[0], loc=fits[1], scale=fits[2])
    emp = pd.Series(r1).apply(Empirical_StepWise_CDF(sorted(c1)))
    r1s, r2cs = DualSortByL1(r1, r2c)
    plt.plot(r3, r4, linewidth=2)
    plt.plot(r1, emp, linewidth=2)
    plt.plot(r1s, r2cs, linewidth=2)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(
        "Semi Parametric CDF with BandWidth {0}".format(bwArr).replace(
            '[ ', "list_").replace(']', "_"))
    plt.legend(["Fitted_HybridCDF", "ECDF Comparison", "CDF_Smoother"],
               loc='best')

    # --- PDF panel ---
    plt.subplot(2, 1, 2)
    r1, r2p, r3, r4, bwArr = HybridSemiParametricGPDPDF(
        x, u, c1, fits[0], loc=fits[1], scale=fits[2])
    r1s, r2ps = DualSortByL1(r1, r2p)
    plt.plot(r3, r4, linewidth=2)
    plt.plot(r1s, r2ps, linewidth=2)
    # `density=True` replaces the `normed=True` keyword, which was removed
    # from Matplotlib.
    plt.hist(np.array(c1), bins=15, density=True)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(
        "Semi Parametric PDF with BandWidth {0}".format(bwArr).replace(
            '[ ', "list_").replace(']', "_"))
    plt.legend(["Fitted_HybridPDF", "PDF_Smoother", "Data Histogram"],
               loc='best')

    plt.subplots_adjust(hspace=0.48)
    return {'%.10f' % (u): (r2c, r2p)}
# Script section: compares four approximations of the upper tail of a
# compound Poisson sample (normal/CLT, shifted gamma, generalized Pareto,
# empirical) on a log-log plot.
# NOTE(review): `lamda`, `mu`, `sigma`, `num_values`, `mean_X`, `mean_X_sq`,
# `mean_X_cube`, `mean_SN`, `var_SN` and the helpers
# `compound_poisson_distribution` / `emp_cdf_tail` are defined outside this
# excerpt.

# Skewness term and the parameters of the gamma approximation derived
# from it.
skew_SN = (mean_X_cube) * (lamda * (mean_X_sq)**3)**(-1 / 2)
gamma_alpha = 4 * (skew_SN)**(-2)
gamma_beta = (gamma_alpha / (lamda * mean_X_sq))**(1 / 2)
gamma_k = lamda * mean_X - gamma_alpha / gamma_beta

# One sorted sample is used only to locate the quantile range to plot.
sort_comp_poisson_rnd = compound_poisson_distribution(
    lamda, num_values, mu, sigma)
sort_comp_poisson_rnd.sort()
alpha_low = .99
alpha_high = .99999
low_val = sort_comp_poisson_rnd[math.floor(num_values * alpha_low)]
high_val = sort_comp_poisson_rnd[math.floor(num_values * alpha_high)]

# GPD threshold = lower quantile. The exceedances are taken from a second
# sample produced by a separate call to the generator.
mu_gp = low_val
sample_data = compound_poisson_distribution(lamda, num_values, mu, sigma)
data_gp = sample_data[sample_data > mu_gp] - mu_gp
gpd_value = gpd.fit(data_gp)

# Evaluation grid from low_val to high_val in steps of 1/1000 of the range.
cdf_values = np.arange(low_val, high_val + (high_val - low_val) / 1000,
                       (high_val - low_val) / 1000)
norm_cdf_tail = 1 - norm.cdf(cdf_values, mean_SN, (var_SN)**(1 / 2))
gamma_cdf_tail = 1 - gamma.cdf(
    cdf_values - gamma_k, gamma_alpha, scale=1 / gamma_beta)
# NOTE(review): 0.01 and 0.99 look like (1 - alpha_low) and alpha_low
# written out by hand; they must be changed together with alpha_low.
# The fitted location gpd_value[1] is not passed to genpareto.cdf.
GP_cdf_tail = 1 - (genpareto.cdf(
    cdf_values - low_val, gpd_value[0], scale=gpd_value[2]) * 0.01 + 0.99)
emp_cdf_tails = emp_cdf_tail(sample_data, cdf_values)

plt.loglog(cdf_values, norm_cdf_tail, label='CLT')
plt.loglog(cdf_values, gamma_cdf_tail, label='GAMMA')
plt.loglog(cdf_values, GP_cdf_tail, label='GP')
plt.loglog(cdf_values, emp_cdf_tails, label='EMP')