def estat(x, y, nboot=1000, replace=False, method='log', fitting=False):
    '''
    Energy distance statistics test.

    The two samples are stacked, standardized column-wise, and the energy
    statistic between them is compared with the statistics obtained from
    ``nboot`` random re-assignments of the pooled rows.

    Parameters
    ----------
    x, y : array-like
        Samples to compare; they are stacked with ``np.vstack``.
    nboot : int
        Number of resampling rounds.
    replace : bool
        If True, draw row indices with replacement; otherwise use a random
        permutation of the pooled rows.
    method : str
        Passed through to ``energy``.
    fitting : bool
        If True, fit a GEV to the resampled statistics and take the p-value
        from its survival function.

    Returns
    -------
    tuple
        ``(p, en, param)`` when ``fitting`` is True, else
        ``(p, en, en_boot)``.

    Reference
    ---------
    Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free
    multivariate goodness-of-fit tests, two-sample comparison and unfolding.
    Nuc Instr and Meth in Phys Res A 537: 626-636
    Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics
    based on distances. J Stat Planning & Infer 143: 1249-1272
    Brian Lau, multdist, https://github.com/brian-lau/multdist
    '''
    n_x = len(x)
    n_total = n_x + len(y)

    pooled = np.vstack([x, y])
    pooled = (pooled - pooled.mean(0)) / pooled.std(0)

    def draw_indices(size):
        # Bootstrap indices (with replacement) or a plain permutation.
        if replace:
            return random.randint(size, size=size)
        return random.permutation(size)

    en = energy(pooled[:n_x], pooled[n_x:], method)

    en_boot = np.zeros(nboot, 'f')
    for round_no in range(nboot):
        order = draw_indices(n_total)
        en_boot[round_no] = energy(pooled[order[:n_x]],
                                   pooled[order[n_x:]], method)

    if not fitting:
        p = (en_boot >= en).sum() / nboot
        return p, en, en_boot

    param = genextreme.fit(en_boot)
    p = genextreme.sf(en, *param)
    return p, en, param
def Plot_Dist_Train_Extreme(r_err, GT_val, bin1=500, bin2=500,
                            interval1=0.95, interval2=0.99):
    """Plot the ``"Err"`` distributions with a GEV fit and interval markers.

    A GEV is fitted to ``r_err["Err"]`` with its location fixed at the sample
    median; the central intervals ``interval1`` and ``interval2`` of that fit
    are drawn as dotted vertical lines. The plotted density curve comes from
    a second, unconstrained fit of the same data. Histograms of
    ``r_err["Err"]`` (gray) and ``GT_val["Err"]`` (red) are overlaid.

    Returns
    -------
    tuple
        ``(c, loc, scale, fig, ax)`` where the parameters are those of the
        median-anchored fit.
    """
    train_err = np.array(r_err["Err"], dtype=float)

    # Fit with the location pinned to the median; used for the intervals.
    c, loc, scale = genextreme.fit(train_err, floc=np.median(train_err))
    min_extreme1, max_extreme1 = genextreme.interval(interval1, c, loc, scale)
    min_extreme2, max_extreme2 = genextreme.interval(interval2, c, loc, scale)

    x = np.linspace(min(train_err), max(train_err), 2000)
    fig, ax = plt.subplots(figsize=(30, 10))
    plt.xlim(0, 0.4)

    # The curve uses a free (unconstrained) refit of the same data.
    free_params = genextreme.fit(train_err)
    plt.plot(x, genextreme.pdf(x, *free_params), linewidth=5)

    # histtype options: {'bar', 'barstacked', 'step', 'stepfilled'}
    plt.hist(np.array(r_err["Err"], dtype=float), bins=bin1, alpha=0.3,
             density=True, edgecolor='black', facecolor='gray',
             linewidth=3, histtype='stepfilled')
    plt.hist(np.asarray(GT_val["Err"]), bins=bin2, alpha=0.3, density=True,
             edgecolor='red', facecolor='red', linewidth=3,
             histtype='stepfilled')

    plt.xlabel('Lengths Counts')
    plt.ylabel('Probability')
    plt.title(r'max_extreme1=%.3f,max_extreme2=%.3f'
              % (max_extreme1, max_extreme2))
    ax.tick_params(left=False, bottom=False)

    markers = (
        (min_extreme1, max_extreme1, "5th", "95th"),
        (min_extreme2, max_extreme2, "1st", "99th"),
    )
    for low, high, low_label, high_label in markers:
        for bound in (low, high):
            ax.axvline(bound, alpha=0.9, ymax=0.20, linestyle=":",
                       linewidth=3, color="red")
        ax.text(low, 8, low_label, size=20, alpha=0.8, color="red")
        ax.text(high, 8, high_label, size=20, alpha=.8, color="red")

    print("95% CI upper bound:", max_extreme1)
    print("99% CI upper bound:", max_extreme2)
    print("Median RE:", np.median(np.array(GT_val["Err"], dtype=float)))
    return c, loc, scale, fig, ax
def get_gev_fit(data):
    """
    Tries GEV fits with several starting loc values and keeps the first
    acceptable one.

    MIGHT BE DIFFERENT THAN OTHER SCRIPT--BE CAREFUL

    Starting locations are tried in this order: the sample mode, the sample
    mean, then mode minus one standard deviation. A fit is rejected when its
    fitted location exceeds ``mode + std``. If all three are rejected, a fit
    with starting shape ``c=0`` is returned unconditionally.

    Parameters
    ----------
    data : array-like
        1-D sample to fit.

    Returns
    -------
    tuple
        ``(shape, loc, scale)`` of the selected fit. One line naming the
        selected strategy is printed.
    """
    # mode() returns a length-1 array on older SciPy and a scalar on recent
    # releases; ravel() makes both indexable the same way.
    md = np.ravel(mode(data)[0])[0]
    std = np.std(data)
    limit = md + std

    attempts = (
        (md, 'GEV fit with mode'),
        (np.mean(data), 'GEV fit with mean'),
        (md - std, 'GEV fit with mode minus std deviation'),
    )
    for start_loc, message in attempts:
        shape, loc, scale = gev.fit(data, loc=start_loc)
        # Same acceptance rule as before: reject only when loc > mode + std.
        if not loc > limit:
            print(message)
            return shape, loc, scale

    print('GEV fit with c=0')
    shape, loc, scale = gev.fit(data, 0)
    return shape, loc, scale
def get_gev_fit(data):
    """
    Tries GEV fits with several starting loc values and keeps the first
    acceptable one.

    BUG--doesn't always fit well. Might be related to x-range (needs to be
    wide enough to catch extremes)
    This also needs to get moved to UTIL eventually

    Starting locations are tried in this order: the sample mode, the sample
    mean, then mode minus one standard deviation. A fit is rejected when its
    fitted location exceeds ``mode + std``. If all three are rejected, a fit
    with starting shape ``c=0`` is returned unconditionally.

    Parameters
    ----------
    data : array-like
        1-D sample to fit.

    Returns
    -------
    tuple
        ``(shape, loc, scale)`` of the selected fit. One line naming the
        selected strategy is printed.
    """
    # mode() returns a length-1 array on older SciPy and a scalar on recent
    # releases; ravel() makes both indexable the same way.
    md = np.ravel(mode(data)[0])[0]
    std = np.std(data)
    limit = md + std

    attempts = (
        (md, 'GEV fit with mode'),
        (np.mean(data), 'GEV fit with mean'),
        (md - std, 'GEV fit with mode minus std deviation'),
    )
    for start_loc, message in attempts:
        shape, loc, scale = gev.fit(data, loc=start_loc)
        # Same acceptance rule as before: reject only when loc > mode + std.
        if not loc > limit:
            print(message)
            return shape, loc, scale

    print('GEV fit with c=0')
    shape, loc, scale = gev.fit(data, 0)
    return shape, loc, scale
def fit_gev(data, user_estimates=None, generate_estimates=False):
    """Fit a GEV, optionally seeding the location and scale estimates.

    Parameters
    ----------
    data : numpy ndarray
    user_estimates : list, optional
        Two-element ``[loc, scale]`` starting estimates. An empty or missing
        value means "no user estimates".
    generate_estimates : bool, default False
        Fit GEV to every second data point first and use that location and
        scale as starting estimates (useful for large datasets). Ignored when
        ``user_estimates`` is given.

    Returns
    -------
    shape : float
        Shape parameter
    loc : float
        Location parameter
    scale : float
        Scale parameter
    """
    if user_estimates:
        loc_estimate, scale_estimate = user_estimates
        return gev.fit(data, loc=loc_estimate, scale=scale_estimate)

    if generate_estimates:
        # Only the location and scale of the subset fit are reused.
        _, loc_estimate, scale_estimate = gev.fit(data[::2])
        return gev.fit(data, loc=loc_estimate, scale=scale_estimate)

    return gev.fit(data)
def construct_IDF(self):
    """Fill ``self.idf`` with GEV return levels for every AMS column.

    Without confidence intervals (``self.ci`` falsy) each column of
    ``self.reformatted_ams`` is fitted once and ``self.idf[col]`` holds
    ``gev.isf(self.quantiles, ...)``.

    With confidence intervals each column is resampled with replacement
    ``self.number_bootstrap`` times, every resample is fitted, and
    ``self.idf[col]`` holds the lower, median and upper percentile curves
    concatenated in that order. The percentile levels follow from
    ``self.alpha``.
    """
    ams = self.reformatted_ams

    if not self.ci:
        for col in ams.columns:
            shape, loc, scale = gev.fit(ams[col])
            self.idf[col] = gev.isf(self.quantiles,
                                    c=shape, loc=loc, scale=scale)
        return

    n_obs = len(ams)
    p_lo = ((1.0 - self.alpha) / 2.0) * 100
    p_up = (self.alpha + ((1.0 - self.alpha) / 2.0)) * 100

    for col in ams.columns:
        values = ams[col].values
        levels = []
        for _ in range(self.number_bootstrap):
            sample = np.random.choice(values, replace=True, size=n_obs)
            shape, loc, scale = gev.fit(sample)
            levels.append(gev.isf(self.quantiles,
                                  c=shape, loc=loc, scale=scale))
        # One vectorized call gives the three curves as rows
        # (lower, median, upper); ravel() concatenates them in that order.
        bands = np.percentile(np.asarray(levels), [p_lo, 50, p_up], axis=0)
        self.idf[col] = np.ravel(bands)
def _compare_resamples(self, tvalues, null_max_tvalues, null_min_tvalues): pvalues = [] maxparams = genextreme.fit(null_max_tvalues) minparams = genextreme.fit([-x for x in null_min_tvalues]) for tvalue in tvalues: pvalue = genextreme.sf(tvalue, *maxparams) if tvalue >= 0 else genextreme.sf(-tvalue, *minparams) pvalues.append(pvalue) return pvalues
def testcase():
    """Smoke-test ``gev.fit`` on the data returned by ``read_data()``.

    Reports whether every value is exactly of type ``float``, then fits the
    first 2000 points and finally the full data set. The fitted parameters
    are not returned.
    """
    data = read_data()

    # Exact type comparison on purpose: subclasses of float do not count.
    all_float = all(type(value) == float for value in data)
    print(f"All values were floats? {all_float}")

    print("Starting test with only first 2000 points of data...")
    shape, loc, scale = gev.fit(data[0:2000])

    print("Starting test with all points of data...")
    shape, loc, scale = gev.fit(data)
def generalized_extreme_value_distribution_fit(annual_maxima, loc=None, scale=None):
    """Fit a GEV distribution to ``annual_maxima``.

    ``loc`` and ``scale`` are forwarded to ``genextreme.fit`` as starting
    estimates only when they are not None.

    BUG (known): omitting an estimate gives different results than passing
    the default parameter value explicitly.

    Returns
    -------
    tuple
        The ``(shape, loc, scale)`` tuple returned by ``genextreme.fit``.
    """
    start_estimates = {}
    if loc is not None:
        start_estimates['loc'] = loc
    if scale is not None:
        start_estimates['scale'] = scale
    return genextreme.fit(annual_maxima, **start_estimates)
def test_fit(self, datasets):
    """Check ``GEV.fit`` against ``genextreme.fit`` on every dataset.

    Location and scale must match; the shape is compared against the negated
    scipy shape.
    """
    for sample in datasets:
        expected_c, expected_loc, expected_scale = genextreme.fit(sample)
        fitted = GEV.fit(sample)
        assert fitted.loc() == approx(expected_loc)
        assert fitted.scale() == approx(expected_scale)
        assert fitted.shape() == approx(-expected_c)
def getZScoreDistExpFunction(y_data):
    """Fit a GEV to ``y_data`` and return its probability density function.

    The returned callable takes ``x`` and evaluates the fitted density there.
    """
    from scipy.stats import genextreme as ge

    shape, mean, sigma = ge.fit(y_data)
    return lambda x: ge.pdf(x, shape, loc=mean, scale=sigma)
def fit(self, data, c=0, loc=1, scale=1):
    """Fit a GEV to ``data`` and store the parameters on the instance.

    The ``c``, ``loc`` and ``scale`` arguments are accepted but not read:
    the fitted values replace them immediately.

    Sets ``self.c``, ``self.loc``, ``self.scale``, ``self.epsilon`` (the
    negated shape) and ``self.params``, then calls ``self.setParams``.

    Returns
    -------
    tuple
        ``(c, loc, scale)`` as stored on the instance.
    """
    fitted_c, fitted_loc, fitted_scale = genextreme.fit(data)
    self.c = fitted_c
    self.loc = fitted_loc
    self.scale = fitted_scale
    self.epsilon = -self.c
    self.params = {'c': self.c, 'loc': self.loc, 'scale': self.scale}
    self.setParams(self.params)
    return (self.c, self.loc, self.scale)
def test_lm():
    """Print GEV parameters obtained by several fitting methods for one sample.

    Compares the L-moments fit and MLE fit from ``lmoments3``, the custom
    optimizer (``optimize_stationary_for_period``, with and without
    L-moments), and ``scipy.stats.genextreme.fit``; then prints two 10-year
    return levels. Nothing is asserted; the output is meant to be inspected.
    """
    # Only used for the mean printed below.
    x = [
        360.228515625, 513.506103515625, 273.85031127929688,
        340.94839477539062, 244.13925170898438, 283.414306640625,
        394.42819213867188, 284.3604736328125, 281.26956176757812,
        241.46173095703125, 489.75482177734375, 236.31536865234375,
        407.55133056640625, 244.6295166015625, 432.40670776367188,
        260.501953125, 517.23052978515625, 317.6553955078125,
        407.61935424804688, 275.0709228515625, 330.369140625,
        285.92086791992188, 247.9954833984375, 344.34811401367188,
        379.55596923828125, 330.80569458007812, 312.35330200195312,
        251.79550170898438, 372.66928100585938, 239.72474670410156
    ]
    # print(get_initial_params_using_lm(x))
    print(np.mean(x))
    # NOTE(review): this value is overwritten by the L-moments fit below
    # before it is read.
    pars = [128.28104749, 578.4927539, 0.62410911]
    # Sample that all fits below are run on.
    data = [
        588.4747314453125, 693.6640625, 519.03155517578125,
        716.58013916015625, 686.29168701171875, 432.65786743164062,
        682.72113037109375, 730.12603759765625, 698.971923828125,
        491.75332641601562, 597.258544921875, 487.13619995117188,
        482.33123779296875, 573.57861328125, 801.67169189453125,
        616.41668701171875, 690.954833984375, 671.31646728515625,
        680.87554931640625, 534.18414306640625, 427.86019897460938,
        236.22953796386719, 691.40972900390625, 599.84637451171875,
        545.3563232421875, 553.059814453125, 549.1295166015625,
        658.3983154296875, 719.122802734375, 636.84906005859375
    ]
    import lmoments3
    from lmoments3 import distr
    the_moments = lmoments3.lmom_ratios(sorted(data), 5)
    pars = distr.gev.lmom_fit(sorted(data), lmom_ratios=the_moments)
    print("Fitted params using lmoments: ", pars)
    # Relies on the value order of the mapping returned by lmom_fit;
    # presumably (shape, loc, scale) -- TODO confirm against lmoments3.
    xi, mu, sigma = pars.values()
    # The shape is negated when handed to the project's own functions.
    print(objective_function_stationary_high([sigma, mu, -xi], data))
    print("Fitted using MLE: ", distr.gev.fit(sorted(data)))
    print(
        "Fitted using custom method (Huziy et al 2013), not using l-moments: ",
        optimize_stationary_for_period(np.array(sorted(data))))
    print(
        "Fitted using custom method (Huziy et al 2013), using l-moments: ",
        optimize_stationary_for_period(np.array(sorted(data)), use_lmoments=True))
    from scipy.stats import genextreme
    print("Fitted using scipy.stats.genextreme: ", genextreme.fit(np.array(sorted(data))))
    # Return levels: first with the fitted shape, then with a fixed -0.5.
    print("10 year high flow return level: ", get_high_ret_level_stationary([sigma, mu, -xi, 0], 10))
    print("10 year high flow return level: ", get_high_ret_level_stationary([sigma, mu, -0.5, 0], 10))
def gevfit(sr):
    """Fit a GEV to ``sr``, print summary statistics and plot PDF and CDF.

    Parameters
    ----------
    sr : pandas.Series-like
        Sample to fit; ``sr.name`` is used in the plot titles.

    Prints the fitted parameters, the median, mean, standard deviation and
    the 95% interval of the fitted distribution, then shows a two-panel
    figure: normalized histogram with the fitted PDF, and cumulative
    histogram with the fitted CDF. Returns nothing.
    """
    c, mu, sigma = gev.fit(sr)

    # Arguments follow the label order: shape, location (mu), scale (sigma).
    print("""
    GEV Fit Parameters:
    shape parameter c: %s
    location parameter mu: %s
    scale parameter sigma: %s
    """ % (c, mu, sigma))
    print("Median", gev.median(c, mu, sigma))
    print("Mean", gev.mean(c, mu, sigma))
    print("Std dev", gev.std(c, mu, sigma))
    print("95% interval: ", gev.interval(0.95, c, mu, sigma))

    x = np.linspace(np.min(sr) - 5, np.max(sr) + 5, 500)
    gev_pdf = gev.pdf(x, c, mu, sigma)
    gev_cdf = gev.cdf(x, c, mu, sigma)

    plt.figure(figsize=(12, 6))

    ax1 = plt.subplot(1, 2, 1)
    # `density` replaces the `normed` keyword removed from Matplotlib.
    plt.hist(sr, density=True, alpha=0.2, label='Raw Data', bins='auto')
    plt.plot(x, gev_pdf, 'r--', label='GEV Fit')
    plt.legend(loc='upper left')
    ax1.set_title('%s_Probability Density Fraction' % (sr.name))
    ax1.set_xlabel('Predicted Fatigue Limit (MPa)')
    ax1.set_ylabel('Probability')
    ax1.grid()

    ax2 = plt.subplot(1, 2, 2)
    plt.hist(sr, density=True, alpha=0.2, label='Raw Data',
             cumulative=True, bins='auto')
    plt.plot(x, gev_cdf, 'r--', label='GEV Fit')
    plt.legend(loc='upper left')
    ax2.set_title('%s_Cumulative Density Fraction' % (sr.name))
    ax2.set_xlabel('Predicted Fatigue Limit (MPa)')
    ax2.set_ylabel('Density')
    ax2.grid()

    plt.show()
def mvs(self):
    """Fit a GEV to ``self.data`` and store shape, loc and scale.

    Raises
    ------
    ValueError
        If ``self.data`` is None.

    Returns
    -------
    tuple
        ``(shape, loc, scale)`` as stored on the instance.
    """
    if self.data is None:
        raise ValueError("Data not's None")

    self.shape, self.loc, self.scale = genextreme.fit(self.data)
    return self.shape, self.loc, self.scale
def estat(x, y, nboot=1000, maxt=60., replace=False, method='log',
          fitting=False):
    """
    Energy distance statistics test.

    The two samples are stacked, standardized column-wise (NaN-aware), and
    the energy statistic between them is compared with the statistics from
    up to ``nboot`` random re-assignments of the pooled rows. Resampling
    stops early once ``maxt`` seconds have elapsed.

    Parameters
    ----------
    x, y : array-like
        Samples to compare; they are stacked with ``np.vstack``.
    nboot : int
        Maximum number of resampling rounds.
    maxt : float
        Time budget in seconds for the resampling loop.
    replace : bool
        If True, draw row indices with replacement; otherwise use a random
        permutation of the pooled rows.
    method : str
        Passed through to ``energy``.
    fitting : bool
        If True, fit a GEV to the resampled statistics and take the p-value
        from its survival function.

    Returns
    -------
    tuple
        ``(p, en, param)`` when ``fitting`` is True, else
        ``(p, en, en_boot)``. After an early exit ``en_boot`` holds only the
        completed rounds.

    References
    ----------
    * Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free
      multivariate goodness-of-fit tests, two-sample comparison and
      unfolding. Nuc Instr and Meth in Phys Res A 537: 626-636
    * Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics
      based on distances. J Stat Planning & Infer 143: 1249-1272
    * Brian Lau, multdist, https://github.com/brian-lau/multdist
    """
    n, N = len(x), len(x) + len(y)
    stack = np.vstack([x, y])
    stack = (stack - np.nanmean(stack, 0)) / np.nanstd(stack, 0)

    if replace:
        def rand(size):
            return np.random.randint(size, size=size)
    else:
        rand = np.random.permutation

    en = energy(stack[:n], stack[n:], method)
    en_boot = np.zeros(nboot, 'f')
    started = t.time()
    for i in range(nboot):
        idx = rand(N)
        en_boot[i] = energy(stack[idx[:n]], stack[idx[n:]], method)
        if t.time() - started > maxt:
            # Round i has just been computed, so i + 1 rounds are valid.
            # Keep all of them and use the same count as the denominator.
            done = i + 1
            print("Time consumed, exit bootstrap (N={})".format(done))
            en_boot, nboot = en_boot[:done], done
            break

    if fitting:
        param = genextreme.fit(en_boot)
        p = genextreme.sf(en, *param)
        return p, en, param

    p = (en_boot >= en).sum() / nboot
    return p, en, en_boot
def srednie(plik_in):
    """Plot the chain-length histogram with a fitted GEV density on top.

    Parameters
    ----------
    plik_in : str
        Path of a whitespace-separated text file; the second field of each
        line is read as a chain length and the third as a domain count.

    The histogram itself is drawn from 'Danio_reviewed_out.txt' (read with
    pandas), while the GEV is fitted to the chain lengths parsed from
    ``plik_in``. The fitted parameters are printed and the figure is shown.
    Returns nothing.
    """
    listy = []
    domeny = []
    # The file is only read, so plain read mode is enough ('r+' would also
    # demand write permission).
    with open(plik_in, 'r') as f:
        for line in f:
            fields = line.split()
            listy.append(float(fields[1]))
            domeny.append(float(fields[2]))

    # Domain counts of zero are treated as one. The list is parsed (which
    # validates the third field) but is not used by the plot below.
    domeny = [1.0 if el == 0.0 else el for el in domeny]

    # -------------------------DANIO RERIO REVIEWED----------------------
    data4 = pd.read_csv(
        'Danio_reviewed_out.txt', sep='\t',
        names=['Nazwa białka', 'Długość łańcucha', 'Liczba domen'])

    # histogram of the chain length
    dwiekolumny4 = data4[data4.columns[1:3]]
    np.seterr(divide='ignore', invalid='ignore')
    dwiekolumny4.hist(column='Długość łańcucha', bins=100, figsize=(10, 10),
                      color='mediumvioletred', density=True)

    # Fit with a starting shape of -1.
    p = genextreme.fit(listy, -1)
    print(p)

    grid = np.linspace(0, 3500)
    plt.plot(grid, genextreme.pdf(grid, p[0], p[1], p[2]), 'b--', lw=3,
             label='Generalized extreme value distribution ')
    plt.title('Danio rerio reviewed - Histogram długości łańucha',
              color='black')
    plt.xlabel('Długość łańcucha')
    plt.ylabel('Liczebność')
    plt.legend(loc='upper right')
    pylab.xlim([-10, 3500])
    plt.show()
def mvs(self):
    """Fit a GEV to ``self.data`` and store the result on the instance.

    Sets ``self.estimador`` to ``'MVS'``, stores shape, loc and scale, and
    keeps a frozen ``genextreme`` distribution in ``self.dist``.

    Raises
    ------
    e.DataNotExist
        If ``self.data`` is None.

    Returns
    -------
    tuple
        ``(shape, loc, scale)`` as stored on the instance.
    """
    if self.data is None:
        raise e.DataNotExist("Data not's None", 35)

    fitted = genextreme.fit(data=self.data)
    self.estimador = 'MVS'
    self.shape, self.loc, self.scale = fitted[0], fitted[1], fitted[2]
    self.dist = genextreme(c=self.shape, loc=self.loc, scale=self.scale)
    return self.shape, self.loc, self.scale
def extreme_value_prob_fit(NPM, perc):
    """Fit a GEV to per-row summaries of the largest absolute values.

    For every row of ``NPM`` the ``round(t * perc)`` largest absolute values
    are kept (``t`` is the number of columns) and summarized by a 5% trimmed
    mean; a GEV is then fitted to those row summaries.

    Returns
    -------
    tuple
        The ``(shape, loc, scale)`` tuple from ``genextreme.fit``.
    """
    n_rows, n_cols = NPM.shape[0], NPM.shape[1]
    n_top = int(round(n_cols * perc))

    row_summary = np.zeros(n_rows)
    for row_idx in range(n_rows):
        magnitudes = np.sort(np.abs(NPM[row_idx, :]))
        top_values = magnitudes[n_cols - n_top:]
        row_summary[row_idx] = trim_mean(top_values, 0.05)

    return genextreme.fit(row_summary)
def calculate_required_effort(data_before_capture, n_bootstrapping, success_probability):
    """Bootstrap the effort needed to reach ``success_probability``.

    The effort-per-sighting values derived from ``data_before_capture`` are
    resampled with replacement ``n_bootstrapping`` times. Each resample is
    fitted with a GEV and the quantile at ``success_probability`` is stored.

    Returns
    -------
    numpy.ndarray
        One required-effort value per bootstrap round.
    """
    effort_per_sighting = calculate_effort_per_sighting(data_before_capture)
    sample_size = len(effort_per_sighting)

    required_effort = np.zeros(n_bootstrapping)
    for round_no in range(n_bootstrapping):
        resampled = np.random.choice(effort_per_sighting, sample_size)
        shape, loc, scale = genextreme.fit(resampled)
        required_effort[round_no] = genextreme.ppf(
            success_probability, shape, loc, scale)
    return required_effort
def FitGEV_KMA_Frechet(bmus, n_clusters, var):
    '''
    Returns stationary GEV/Gumbel_L params for KMA bmus and variable series

    bmus        - KMA bmus (time series of KMA centroids)
    n_clusters  - number of KMA clusters
    var         - time series of variable to fit to GEV/Gumbel_L

    returns np.array (n_clusters x parameters). parameters = (shape, loc, scale)
    for gumbel distributions shape value will be ~0 (0.0000000001)

    Clusters are numbered from 1. A cluster without members gets a NaN row.
    The GEV row stores the negated scipy shape. When the fitted scipy shape
    is negative, the GEV row is kept only if it improves the negative
    log-likelihood over the Gumbel fit by at least 1.92.
    '''
    param_GEV = np.empty((n_clusters, 3))

    for row in range(n_clusters):
        members = np.where((bmus == row + 1))[0]
        if len(members) == 0:
            param_GEV[row, :] = [np.nan, np.nan, np.nan]
            continue

        # variable at the cluster positions, NaNs dropped
        sample = var[members]
        sample = sample[~np.isnan(sample)]

        # Gumbel_l fit on the negated sample and its negative loglikelihood
        loc_gl, scale_gl = gumbel_l.fit(-sample)
        theta_gl = (0.0000000001, -1 * loc_gl, scale_gl)
        nLogL_gl = genextreme.nnlf(theta_gl, sample)

        # GEV fit (starting shape -0.1) and its negative loglikelihood
        shape_gev, loc_gev, scale_gev = genextreme.fit(sample, -0.1)
        nLogL_gev = genextreme.nnlf((shape_gev, loc_gev, scale_gev), sample)

        # stored with negated shape
        theta_gev_fix = (-shape_gev, loc_gev, scale_gev)

        # significance test only applies to the Frechet case
        # TODO: cant replicate ML exact solution
        gumbel_wins = shape_gev < 0 and not (nLogL_gl - nLogL_gev >= 1.92)
        chosen = theta_gl if gumbel_wins else theta_gev_fix
        param_GEV[row, :] = list(chosen)

    return param_GEV
def plot(self, FAPname, Nlevels, cheat=True):
    '''Plot the periodogram with significance levels from the bootstrap.

    FAPname  - path prefix; the file FAPname + 'FAPNormTest.txt' is loaded
    Nlevels  - how many of the levels 50, 90, 95, 99, 99.9 (percent) to draw
    cheat    - when True, also draw a vertical line at 1 / self.cheat and the
               levels returned by self.APFAP (drawn in red)

    The bootstrap values are fitted with a GEV and the significance levels
    are its quantiles. Unfinished code for calculating the FAP based on the
    z-levels is still present (commented out).'''
    P = self.search()
    Ptop = np.amax(P)
    # frequency at the first occurrence of the highest power
    ftop = self.flist[np.where(P == Ptop)[0][0]]
    Levels = np.array([50, 90, 95, 99, 99.9])[:Nlevels]
    # Neff = 0
    # for i in range(self.Neval-2):
    #     if (P[i]<P[i+1]) & (P[i+1]>P[i+2]):
    #         Neff = Neff + 1
    # # print(Neff)
    # Neff = self.fmax * 1/(self.flist[1]-self.flist[0])
    # # print(Neff)
    FAPfile = np.loadtxt(FAPname + 'FAPNormTest.txt')
    #PLevels = scoreperc(FAPfile,Levels)
    fit = gev.fit(FAPfile)
    # power values at the requested quantiles of the fitted GEV
    PLevels = gev.ppf(Levels / 100, *fit)
    plt.figure(figsize=(20, 14))
    plt.hlines(PLevels, self.fmin, self.fmax, 'g')
    plt.plot(self.flist, P)
    # plt.ylim() is read here, before the limits are reset on the next line
    plt.text(self.fmax - (self.fmax - self.fmin) / 2.25, plt.ylim()[1], 'False alarm probability')
    plt.ylim(0, Ptop + 0.1)
    # label each level line with 1 - level, i.e. the false alarm probability
    for i in range(Nlevels):
        plt.text(self.fmax - (self.fmax - self.fmin) / 3, PLevels[i] + 0.003, str(np.round(1 - Levels[i] / 100, 3)))
    # open circle around the highest peak
    plt.plot(ftop, Ptop, 'r', marker='o', linestyle='none', markerfacecolor='none', markersize=35)
    if cheat == True:
        plt.vlines(1 / self.cheat, 0, Ptop, 'g')
        CheatLevels = self.APFAP(1 - Levels / 100)
        plt.hlines(CheatLevels, self.fmin, self.fmax, 'r')
    plt.xlabel('Frequency [1/day]')
    plt.ylabel('Lomb-Scargle Power')
    plt.title(
        'Lomb - Scargle Periodogram for {planet}'.format(planet=self.name))
    print('Highest probability of period = {p} days'.format(
        p=round(1 / ftop, 3)))
def extreme_value_prob_fit(NPM, perc):
    """Fit a GEV to per-row summaries of the largest absolute values.

    For every row of ``NPM`` the ``round(t * perc)`` largest absolute values
    are kept (``t`` is the number of columns), the top 10% of that selection
    is discarded, and the rest is averaged; a GEV is then fitted to those
    row means.

    Returns
    -------
    tuple
        The ``(shape, loc, scale)`` tuple from ``genextreme.fit``.
    """
    n_rows, n_cols = NPM.shape[0], NPM.shape[1]
    n_top = int(round(n_cols * perc))

    row_mean = np.zeros(n_rows)
    for row_idx in range(n_rows):
        magnitudes = np.sort(np.abs(NPM[row_idx, :]))
        top_values = magnitudes[n_cols - n_top:]
        # keep the lowest 90% of the selected values
        keep = int(np.floor(0.90 * top_values.shape[0]))
        row_mean[row_idx] = np.mean(top_values[0:keep])

    return genextreme.fit(row_mean)
def CalculaParametros(self):
    """Fit the distribution that matches the series type and return it.

    ``self.tipoSerie == 'Parcial'`` fits a generalized Pareto distribution
    to ``self.dadoSerie``; ``'Anual'`` fits a GEV. The fitted shape,
    location and scale are printed.

    Returns
    -------
    tuple or None
        ``(shape, loc, scale)`` from the fit, or None for any other series
        type.
    """
    if self.tipoSerie == 'Parcial':
        Parametro = genpareto.fit(self.dadoSerie)
        # '%f' for the shape as well: '%.f' rounded it to zero decimals.
        print('Parametros com Pareto: \nForma: %f, Localidade: %f, Escala: %f' % (Parametro[0], Parametro[1], Parametro[2]))
        return Parametro
    elif self.tipoSerie == 'Anual':
        Parametro = genextreme.fit(self.dadoSerie)
        print('Parametros com Gev: \nForma: %f, Localidade: %f, Escala: %f' % (Parametro[0], Parametro[1], Parametro[2]))
        return Parametro
def fit_GEV(self, theta=None, save=False):
    """
    Fits GEV parameters to the positive block maxima using maximum
    likelihood (if theta is None) and draws a Q-Q plot.
    -----------------------------------------------------------------------
    theta: Fitted [mu, sigma, xi]; when given, no fit is performed.
    save: When truthy, the plot is also written to
          "<save_path>/plots/<self.name>-qq.pdf".
    -----------------------------------------------------------------------
    Returns: Fitted [mu, sigma, xi] (xi is the negated scipy shape).
    """
    # Only strictly positive maxima take part, in ascending order.
    block_max = sorted(value for value in self.block_max if value > 0)
    n_obs = len(block_max)

    if theta is None:
        shape, mu, sigma = genextreme.fit(block_max)
        theta = [mu, sigma, -shape]

    # Plotting positions i / (n + 1) for i = 1..n.
    emp_p = (np.array(range(n_obs)) + 1.0) / (n_obs + 1.0)
    emp_q = quantile(theta, emp_p)

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(emp_q, block_max, s=10, color="k")
    ax.plot([0, 1], [0, 1], transform=ax.transAxes, color="k")
    ax.set(xlabel="Theoretical quantiles", ylabel="Empirical quantiles")

    # Same limits on both axes, padded by a tenth of the joint range.
    both = np.concatenate((emp_q, block_max))
    lowest, highest = min(both), max(both)
    pad = (highest - lowest) / 10
    limits = [lowest - pad, highest + pad]
    ax.grid(True)
    plt.axis("square")
    ax.axis([*limits, *limits])

    if save:
        fig.tight_layout()
        plt.savefig("%s/plots/%s-qq.pdf" % (save_path, self.name),
                    bbox_inches="tight")
    plt.show()
    return theta
def calculate_pvalue_gev(original_score: float, scores: list) -> float:
    """Calculates a p-value for a target/query combination from shuffled scores.

    A generalized extreme value distribution is fitted to ``scores`` and its
    density is integrated numerically from -inf to ``original_score``.

    >>> i = IntaRNApvalue(['-q', 'AGGAUG', '-t', 'UUUAUCGUU', '--scores', '10', '-m', 'b', '--threads', '3'])
    >>> i.calculate_pvalue_gev(-10.0, [-1.235, -1.435645, -6.234234, -12.999, -15.23, -6.98, -6.23, -2.78])
    0.17611816922560236
    """
    shape, loc, scale = gev.fit(scores)
    # integ returns a sequence whose first item is the integral value.
    integral = integ(lambda x: gev.pdf(x, shape, loc=loc, scale=scale),
                     -np.inf, original_score)
    return integral[0]
def _process_block(self) -> None: assert len(self.block) == self.block_size # We've finished a block. Take the extremum for it # as an observation for the GEV distribution and reset the block. self.extrema.append(self.take_extremum(self.block)) self.extremum = self.take_extremum(self.extrema) self.block.clear() # Fit a generalized extreme value (GEV) distribution to our # extremum samples. self.dist = gev(*gev.fit(self.extrema)) self.p = getattr(self.dist, self.find_p)(self.extremum) self.return_period = 1.0 / (self.block_size * self.p) # type: ignore self.return_time = self.return_period * self.avg_t self.can_report = True
def calc_match_statistics(self, oligo, charges, modifications, ms, ppm_error,
                          score_to_test, random_oligo_to_test=1000):
    """Estimate how unusual ``score_to_test`` is among random oligos.

    ``random_oligo_to_test`` random sequences with the same length as
    ``oligo`` are generated, fragmented, matched against ``ms`` and scored
    with ``self.simple_score``. A GEV is fitted to those scores and the
    p-value is the probability, under that fit, of a random score at least
    as large as ``score_to_test`` (survival function).

    Returns
    -------
    tuple
        ``(dist_df, p_value, score_to_test, extreme_to_plot)`` where
        ``dist_df`` holds the random sequences and scores and
        ``extreme_to_plot`` is the frozen fitted distribution.
    """
    fr = Fragmentor()
    matcher = Matcher()
    column_headers = ['Sequence', 'Score']
    min_char = len(oligo)
    max_char = len(oligo)
    allchar = ['A', 'G', 'C', 'T', 'U']

    data_to_save = []
    for _ in range(random_oligo_to_test):
        random_oligo = "".join(
            choice(allchar) for _ in range(randint(min_char, max_char)))
        fragments = fr.fragment_oligo(random_oligo)
        df_search_space = matcher.create_search_space(
            fragments, charges, modifications)
        df_results = matcher.match_oligo_fragments_pandas(
            df_search_space, ms, ppm_error)
        score = self.simple_score(df_results)
        print('Oligo: {0:<30} Score: {1:7.3f}'.format(random_oligo, score))
        data_to_save.append([random_oligo, score])

    dist_df = pd.DataFrame(data_to_save, columns=column_headers)
    c, loc, scale = genextreme.fit(dist_df.Score)
    print(("Extreme value fits c = {0}, loc = {1}, scale = {2}").format(
        c, loc, scale))
    extreme_to_plot = genextreme(c, loc, scale)

    # A p-value is a tail probability, not a density: use sf() rather than
    # pdf(), which is not bounded by 1.
    # NOTE(review): assumes larger scores mean better matches -- confirm.
    p_value = extreme_to_plot.sf(score_to_test)
    print(("p value of score {0} = {1}").format(score_to_test, p_value))
    return dist_df, p_value, score_to_test, extreme_to_plot
def gev_fit(var_fit):
    """Return return periods and values from a GEV fitted to ``var_fit``.

    The fitted CDF is evaluated on 200 points between 0 and 10; points with
    a CDF value of 0.1 or less are dropped.

    Returns
    -------
    tuple
        ``(ts, vv)``: return periods ``1 / (1 - cdf)`` and the matching
        grid values.
    """
    start_shape = -0.1
    grid = np.linspace(0, 10, 200)

    fitted = genextreme.fit(var_fit, start_shape)
    cdf_values = genextreme.cdf(grid, fitted[0], fitted[1], fitted[2])

    keep = cdf_values > 0.1
    return_periods = 1 / (1 - cdf_values[keep])
    # TODO gev params 95% confidence intervals
    return return_periods, grid[keep]
def doGev(dis, retPerYr):
    """Compute return levels per column from a GEV fitted to negated data.

    Parameters
    ----------
    dis : numpy.ndarray
        2-D array; each column is fitted separately after dropping NaNs.
    retPerYr : numpy.ndarray
        Return periods; the probabilities used are ``1 - 1/retPerYr``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_columns, n_return_periods)``. A row stays NaN
        when the column has 15 or fewer valid values or when any computed
        level is negative.
    """
    prob = 1 - 1 / retPerYr
    n_points = dis.shape[1]
    retLev = np.full([n_points, len(retPerYr)], np.nan)

    for point in range(n_points):
        series = dis[:, point]
        series = series[~np.isnan(series)]
        if len(series) <= 15:
            continue
        # The GEV is fitted to the negated series and the sign is restored
        # on the resulting quantiles.
        shape, loc, scale = gev.fit(-series)
        levels = -gev.ppf(prob, shape, loc=loc, scale=scale)
        if sum(levels < 0) == 0:
            retLev[point, :] = levels
    return retLev
def rl_bootstrap(data, T=100, nsim=1000):
    """Return bootstrapped return levels.

    :param data: list of input data
    :param T: timestep period
    :param nsim: number of recalculations

    :returns: list with one return level per resample of ``data``
    """
    from scipy.stats import genextreme as gev

    levels = []
    for _ in range(nsim):
        shape, loc, scale = gev.fit(resample(data))
        levels.append(RL(T, loc, scale, shape))
    return levels
def plot_probability_density(annual_max, station_id):
    """Plot the fitted GEV density over a normalized histogram.

    Parameters
    ----------
    annual_max : sequence of float
        Annual maximum wind speeds.
    station_id : str
        Prefix of the x-axis label.

    A GEV is fitted to the sorted values (starting shape 0). The density is
    drawn in red over a ten-bin histogram spanning the data range padded by
    0.5 on each side.
    """
    xi, mu, sigma = genextreme.fit(sorted(annual_max), 0)

    min_x = min(annual_max) - 0.5
    max_x = max(annual_max) + 0.5
    x = np.linspace(min_x, max_x, num=100)
    # pdf() is vectorized; no need to evaluate point by point.
    y = genextreme.pdf(x, xi, loc=mu, scale=sigma)

    fig = plt.figure(figsize=(12, 6))
    axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    xlabel = (station_id + " - Annual Max Wind Speed (m/s)")
    axes.set_title("Probability Density & Normalized Histogram")
    axes.set_xlabel(xlabel)
    axes.plot(x, y, color='Red')
    # `density` replaces the `normed` keyword removed from Matplotlib.
    axes.hist(annual_max,
              bins=arange(min_x, max_x, abs((max_x - min_x) / 10)),
              density=True, color='Yellow')
def CalculateEVDParameters (MaxSimilarities):
    # Fit a GEV to the maximum similarities and return
    # (Xi, Mu, Sigma, eXi, eMu, eSigma), where Xi is the absolute value of
    # the fitted scipy shape and the three standard errors are always 0.
    #
    # Side effect: zero entries of MaxSimilarities are replaced by 1 in
    # place, and every entry is printed.
    #
    # Previous implementation through R / rpy2, kept for reference:
    # from rpy2.robjects.packages import importr
    # import rpy2.rpy_classic as rpy
    # import rpy2.robjects as robjects
    # from rpy2.robjects import r
    # Create an R object of sorted scores
    # maxsims = robjects.FloatVector( sorted(MaxSimilarities) )
    # MLE Estimates using "ismev" package
    # r.library("ismev")
    # gev_fit = robjects.r['gev.fit'](maxsims)
    # Mu = location, Sigma = scale, Xi = shape
    # Mu = gev_fit[6][0]
    # Sigma = gev_fit[6][1]
    # Xi = gev_fit[6][2]
    # Standard errors for the Mu, Sigma and Xi
    # eMu = gev_fit[8][0]
    # eSigma = gev_fit[8][1]
    # eXi = gev_fit[8][2]
    # print "baseR: ", Xi,Mu,Sigma,eXi,eMu,eSigma
    i = 0
    while i < len(MaxSimilarities):
        # Trailing comma: the value printed after the replacement below
        # continues on the same output line.
        print i, " -- ", MaxSimilarities[i], " == ",
        if(MaxSimilarities[i] == 0):
            MaxSimilarities[i] = 1
        print MaxSimilarities[i]
        i += 1
    from scipy.stats import genextreme
    import warnings
    # Warnings raised during the fit are suppressed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        gev_shape,gev_loc,gev_scale = genextreme.fit( sorted(MaxSimilarities) )
    print "scipy: shape=", gev_shape, " loc=", gev_loc, " scale=", gev_scale
    # NOTE(review): abs() discards the sign of the fitted shape -- confirm
    # that callers expect a non-negative Xi.
    Xi = abs(gev_shape)
    Mu = gev_loc
    Sigma = gev_scale
    # Standard errors are not available from scipy here; reported as 0.
    eXi = 0
    eMu = 0
    eSigma = 0
    return (Xi,Mu,Sigma,eXi,eMu,eSigma)
def plot_return_values(annual_max, station_id):
    """Plot return levels against return period on a log x-axis.

    The red curve is the inverse survival function at 1/T for T = 1..499;
    the blue dots are the sorted observations plotted at Nmax/rank.
    ``station_id`` is used as the plot title.
    """
    fig, axes = plt.subplots(figsize=(20,6))
    T=np.r_[1:500]
    # GEV fit on the sorted data with a starting shape of 0.
    mle = genextreme.fit(sorted(annual_max), 0)
    mu = mle[1]
    sigma = mle[2]
    xi = mle[0]
    # print "The mean, sigma, and shape parameters are %s, %s, and %s, resp." % (mu, sigma, xi)
    # NOTE(review): the curve uses shape 0 rather than the fitted xi, which
    # is otherwise unused here -- confirm this is intended.
    sT = genextreme.isf(1./T, 0, mu, sigma)
    # NOTE(review): `hold` is only referenced as a bare name; it must exist
    # in the module namespace or this line raises NameError.
    axes.semilogx(T, sT, 'r'), hold
    # Ranks 1..n; the largest observation is plotted at period Nmax/1.
    N=np.r_[1:len(annual_max)+1]; Nmax=max(N);
    axes.plot(Nmax/N, sorted(annual_max)[::-1], 'bo')
    title = station_id
    axes.set_title(title)
    axes.set_xlabel('Return Period (yrs)')
    axes.set_ylabel('Wind Speed (m/s)')
    axes.grid(True)
def test_lm():
    """Print GEV parameters obtained by several fitting methods for one sample.

    Compares the L-moments fit and MLE fit from ``lmoments3``, the custom
    optimizer (``optimize_stationary_for_period``, with and without
    L-moments), and ``scipy.stats.genextreme.fit``; then prints two 10-year
    return levels. Nothing is asserted; the output is meant to be inspected.
    """
    # Only used for the mean printed below.
    x = [360.228515625, 513.506103515625, 273.85031127929688, 340.94839477539062,
         244.13925170898438, 283.414306640625, 394.42819213867188, 284.3604736328125,
         281.26956176757812, 241.46173095703125, 489.75482177734375, 236.31536865234375,
         407.55133056640625, 244.6295166015625, 432.40670776367188, 260.501953125,
         517.23052978515625, 317.6553955078125, 407.61935424804688, 275.0709228515625,
         330.369140625, 285.92086791992188, 247.9954833984375, 344.34811401367188,
         379.55596923828125, 330.80569458007812, 312.35330200195312, 251.79550170898438,
         372.66928100585938, 239.72474670410156]
    # print(get_initial_params_using_lm(x))
    print(np.mean(x))
    # NOTE(review): this value is overwritten by the L-moments fit below
    # before it is read.
    pars = [128.28104749, 578.4927539, 0.62410911]
    # Sample that all fits below are run on.
    data = [588.4747314453125, 693.6640625, 519.03155517578125, 716.58013916015625,
            686.29168701171875, 432.65786743164062, 682.72113037109375, 730.12603759765625,
            698.971923828125, 491.75332641601562, 597.258544921875, 487.13619995117188,
            482.33123779296875, 573.57861328125, 801.67169189453125, 616.41668701171875,
            690.954833984375, 671.31646728515625, 680.87554931640625, 534.18414306640625,
            427.86019897460938, 236.22953796386719, 691.40972900390625, 599.84637451171875,
            545.3563232421875, 553.059814453125, 549.1295166015625, 658.3983154296875,
            719.122802734375, 636.84906005859375]
    import lmoments3
    from lmoments3 import distr
    the_moments = lmoments3.lmom_ratios(sorted(data), 5)
    pars = distr.gev.lmom_fit(sorted(data), lmom_ratios=the_moments)
    print("Fitted params using lmoments: ", pars)
    # Relies on the value order of the mapping returned by lmom_fit;
    # presumably (shape, loc, scale) -- TODO confirm against lmoments3.
    xi, mu, sigma = pars.values()
    # The shape is negated when handed to the project's own functions.
    print(objective_function_stationary_high([sigma, mu, -xi], data))
    print("Fitted using MLE: ", distr.gev.fit(sorted(data)))
    print("Fitted using custom method (Huziy et al 2013), not using l-moments: ", optimize_stationary_for_period(
        np.array(sorted(data))))
    print("Fitted using custom method (Huziy et al 2013), using l-moments: ",
          optimize_stationary_for_period(np.array(sorted(data)), use_lmoments=True))
    from scipy.stats import genextreme
    print("Fitted using scipy.stats.genextreme: ", genextreme.fit(np.array(sorted(data))))
    # Return levels: first with the fitted shape, then with a fixed -0.5.
    print("10 year high flow return level: ", get_high_ret_level_stationary([sigma, mu, -xi, 0], 10))
    print("10 year high flow return level: ", get_high_ret_level_stationary([sigma, mu, -0.5, 0], 10))
def CalculaParametros(self):
    """Fit the distribution that matches the series type and return it.

    ``'Parcial'``: a threshold is obtained from ``LimiteParcial``, the
    partial-duration series above it is extracted, and a generalized Pareto
    distribution is fitted to its peaks. ``'Anual'``: the annual maximum
    series is extracted and a GEV is fitted to its peaks. The threshold
    (partial case) and the fitted parameters are printed.

    Returns
    -------
    tuple or None
        ``(shape, loc, scale)`` from the fit, or None for any other series
        type.
    """
    if self.tipoSerie == 'Parcial':
        # Finding the threshold value:
        limite = lp.LimiteParcial(self.dadoSerie).AchaLimite(2)
        print(limite)
        Parciais = se.Series(self.dadoSerie).serieMaxParcial(limite)
        datasP, PicosParciais = se.Series(Parciais).separaDados()
        Parametro = genpareto.fit(PicosParciais)
        # '%f' for the shape as well: '%.f' rounded it to zero decimals.
        print('Parametros com Pareto: \nForma: %f, Localidade: %f, Escala: %f' % (Parametro[0], Parametro[1], Parametro[2]))
        return Parametro
    elif self.tipoSerie == 'Anual':
        Anuais = se.Series(self.dadoSerie).serieMaxAnual()
        datasA, PicosAnuais = se.Series(Anuais).separaDados()
        Parametro = genextreme.fit(PicosAnuais)
        print('Parametros com Gev: \nForma: %f, Localidade: %f, Escala: %f' % (Parametro[0], Parametro[1], Parametro[2]))
        return Parametro
def eventdistribution(data, per=(5, 95), nsim=1000,
                      rp=(10., 20., 50., 100., 200., 500., 1000.),
                      rp_scale_factor=1, white_noise=False):
    """
    returns a matrix with (returnperiod,lower_percentil,return_level, upper_percentil)

    :param data: values of timeseries
    :param per: lower and upper percentile defining the uncertainty
    :param nsim: Number of returns for bootstrap calculation
    :param rp: list of return timesteps
    :param rp_scale_factor: scale factor applied to every return period
                            before the return levels are computed
    :param white_noise: add a white noise (random number between 0 to std/10).
                        In case of singular timeseries

    :returns: array with four rows: ``rp`` as given, lower percentile,
              return level, upper percentile
    """
    from scipy.stats import genextreme as gev
    from numpy import asarray, percentile, vstack

    if white_noise:
        s = std(data) / 10
        data = [n + uniform(0, s) for n in data]

    s, a, b = gev.fit(data)

    rl = []
    per_low = []
    per_high = []
    # Element-wise scaling: multiplying a list by an integer would repeat
    # the list instead of scaling its values.
    for T in asarray(rp, dtype=float) * rp_scale_factor:
        rl.append(RL(T, a, b, s))
        RL_bt = rl_bootstrap(data, T=T, nsim=nsim)
        # Honour the requested percentiles (defaults are 5 and 95).
        per_low.append(percentile(RL_bt, per[0]))
        per_high.append(percentile(RL_bt, per[1]))

    rl_c = vstack((rp, per_low, rl, per_high))
    return (rl_c)
# <codecell> annual_max_levels = yx # <headingcell level=4> # Fit data to GEV distribution # <codecell> def sea_levels_gev_pdf(x): return genextreme.pdf(x, xi, loc=mu, scale=sigma) # <codecell> mle = genextreme.fit(sorted(annual_max_levels), 0) mu = mle[1] sigma = mle[2] xi = mle[0] print "The mean, sigma, and shape parameters are %s, %s, and %s, resp." % (mu, sigma, xi) # <headingcell level=4> # Probability Density Plot # <codecell> min_x = min(annual_max_levels)-0.5 max_x = max(annual_max_levels)+0.5 x = np.linspace(min_x, max_x, num=100) y = [sea_levels_gev_pdf(z) for z in x]
for i in annual_max: data_levels.append(float(i)) annual_max = data_levels # <headingcell level=4> # Fit data to GEV distribution # <codecell> def gev_pdf(x): return genextreme.pdf(x, xi, loc=mu, scale=sigma) # <codecell> mle = genextreme.fit(sorted(annual_max), 0) mu = mle[1] sigma = mle[2] xi = mle[0] print "The mean, sigma, and shape parameters are %s, %s, and %s, resp." % (mu, sigma, xi) # <headingcell level=4> # Probability Density Plot # <codecell> min_x = min(annual_max_levels)-0.5 max_x = max(annual_max_levels)+0.5 x = np.linspace(min_x, max_x, num=100) y = [gev_pdf(z) for z in x]
def create_gev_models():
    """
    parses the top values and creates extreme value dist and
    saves the parameters to the database

    Three groups of files are processed: the 150-strain runs for every
    software, the emmax strain sweep (300, 450, 900 strains) and the emmax
    environment-variance sweep (0.05, 0.50). Every file is fitted with a GEV
    using a starting shape of -1 and the sample mean as starting location.
    The first two groups are written with one bulk insert, the third with a
    second one. Returns 0.
    """
    def fit_values(values):
        # starting shape -1, starting location at the sample mean
        return genextreme.fit(values, -1, loc=values.mean())

    base_dir = '/home/dtgillis/ccsim_workspace/evd/'
    dir_dict = {'inf': 'CC_0_0_0_150/', '1': 'CC_0_0_0_150.1/',
                '5': 'CC_0_0_0_150.5/', '10': 'CC_0_0_0_150.10/'}
    gev_model_list = []

    for mouse_per in ['inf', '1', '5', '10']:
        for software in exp.Software.objects.all():
            # htree is skipped for 5 and 10 mice per strain
            if software.name == 'htree' and mouse_per in ('5', '10'):
                continue
            data_file = base_dir + dir_dict[mouse_per] + software.name + '.evd'
            np_extreme_values = np.genfromtxt(data_file)
            # plink and emmax values are converted with -log
            if software.name in ['plink', 'emmax']:
                np_extreme_values = -np.log(np_extreme_values)
            shape, location, scale = fit_values(np_extreme_values)
            gev_model_list.append(exp.GevModelParam(
                software=software, mouse_per_strain=mouse_per,
                location=location, scale=scale, shape=shape,
                strains=150, var_env=.25))

    ## additive large strain numbers models
    base_dir = '/home/dtgillis/ccsim_workspace/evd/strain_sweep'
    for mouse_per in ['inf']:
        for software in exp.Software.objects.filter(name='emmax'):
            for strain_num in [300, 450, 900]:
                data_file = (base_dir + os.sep + 'CC_0_0_0_' + str(strain_num)
                             + '.' + software.name + '.top')
                np_extreme_values = -np.log(np.genfromtxt(data_file))
                shape, location, scale = fit_values(np_extreme_values)
                gev_model_list.append(exp.GevModelParam(
                    software=software, mouse_per_strain=mouse_per,
                    location=location, scale=scale, shape=shape,
                    strains=strain_num, var_env=.25))
    exp.GevModelParam.objects.bulk_create(gev_model_list)

    ## environment variance sweep models
    base_dir = '/home/dtgillis/ccsim_workspace/evd/env_sweep'
    gev_model_list = []
    for mouse_per in ['inf', '1', '5', '10']:
        for var_env in [.05, .50]:
            for software in exp.Software.objects.filter(name='emmax'):
                prefix = base_dir + os.sep + 'CC_0_0_0_' + str(int(var_env * 100))
                if mouse_per == 'inf':
                    data_file = prefix + '_150.emmax.top'
                else:
                    data_file = prefix + '_150.' + mouse_per + '.emmax.top'
                np_extreme_values = -np.log(np.genfromtxt(data_file))
                shape, location, scale = fit_values(np_extreme_values)
                gev_model_list.append(exp.GevModelParam(
                    software=software, mouse_per_strain=mouse_per,
                    location=location, scale=scale, shape=shape,
                    strains=150, var_env=var_env))
    exp.GevModelParam.objects.bulk_create(gev_model_list)
    return 0
# <codecell> annual_max = list(annual_max_dict.values()) # <markdowncell> # ### Fit observation data to GEV distribution # <codecell> def gev_pdf(x): return genextreme.pdf(x, xi, loc=mu, scale=sigma) # <codecell> mle = genextreme.fit(sorted(annual_max), 0) mu = mle[1] sigma = mle[2] xi = mle[0] print "The mean, sigma, and shape parameters are %s, %s, and %s, resp." % (mu, sigma, xi) # <markdowncell> # ### Probability Density Plot # <codecell> min_x = min(annual_max)-0.5 max_x = max(annual_max)+0.5 x = np.linspace(min_x, max_x, num=100) y = [gev_pdf(z) for z in x]
def FAP(R_max, K, L, n, fap_levels):
    """Return the levels matching ``fap_levels`` from a GEV fitted to ``R_max``.

    Parameters
    ----------
    R_max : array-like
        Sample of maxima the GEV is fitted to.
    K, L, n : float
        Enter the formula only through the ratio ``K*L / (fap_levels*n)``.
    fap_levels : float or array-like
        False-alarm probability level(s).

    Returns
    -------
    float or numpy.ndarray
        ``mu - sig/epsilon * (1 - (-log(K*L/(fap_levels*n)))**(-epsilon))``
        with ``epsilon`` the negated scipy shape.
    """
    # The starting shape must be positional: scipy's fit() rejects `c=` as
    # an unknown keyword (only loc/scale starting values are keywords).
    shape, mu, sig = gev.fit(R_max, -0.2, loc=3.e-4, scale=1)
    #fap_levels = 1./fap_levels
    epsilon = -shape
    return mu - sig / epsilon * (1 - (-log(K * L / (fap_levels * n))) ** (-epsilon))