def gumbel_max_test(X, y, min_count=5):
    """Group-wise maximum test scored with the standard Gumbel CDF.

    The inputs are passed through ``_check_Xy`` (with ``norm_y=True``) and
    the result is delegated to ``_groupby_agg`` using the ``'max'``
    aggregation.  The callback handed to ``_groupby_agg`` removes the
    ``'max'`` column from the aggregated frame and stores
    ``gumbel_r.cdf`` of its values under ``'pval'``.

    Parameters
    ----------
    X, y :
        Forwarded to ``_check_Xy``.
    min_count : int, default 5
        Minimum group size forwarded to ``_groupby_agg``; values below 5
        are raised to 5.

    Returns
    -------
    Whatever ``_groupby_agg`` returns.
    """
    x, y = _check_Xy(X, y, norm_y=True)
    # never accept fewer than five observations per group
    min_count = max(min_count, 5)

    def _max_to_pval(df):
        # mutates ``df`` in place and returns None, exactly like the
        # ``__setitem__`` call it replaces
        df['pval'] = gumbel_r.cdf(df.pop('max').values)

    return _groupby_agg(x, y, 'max', min_count=min_count, func=_max_to_pval)
def _cond_ext_pdf(self, x, i):
    """Evaluate the conditional extreme value cdf at ``x``.

    Parameters
    ----------
    x :
        Point to evaluate; it is mapped to Gumbel margins with
        ``self._to_gumbel`` before anything else happens.
    i : int
        Index of the largest component (quantile-wise).

    Returns
    -------
    ``gumbel.cdf(y[j]) + exp(-y[j]) - sum(exp(-y / alpha)) ** alpha`` where
    ``y`` is the transformed point, ``j`` the other component and ``alpha``
    is ``self.alpha``.
    """
    # transform data to Gumbel margins first
    y = self._to_gumbel(x)

    # the not-so-large component
    j = 0 if i != 0 else 1
    y_small = y[j]

    # Integrating the asymptotic density Y_j | Y_i = y_i w.r.t. a standard
    # Gumbel from y[i] to infinity, and using the fact that Gumbel ~ exp(1).
    margin_term = gumbel.cdf(y_small, loc=0, scale=1)
    dependence_term = np.sum(np.exp(-y / self.alpha))**self.alpha
    return margin_term + np.exp(-y_small) - dependence_term
def figure_gumbel_vs_normal():
    """Draw a standard Gumbel next to a normal with the same mean/variance.

    The left panel shows the two density functions, the right panel the two
    cumulative functions, both on 100 points between -3 and 5.  The normal
    curve uses location 0.577 and scale 1.282.  The finished figure is
    wrapped with ``xmle.Show`` and returned.
    """
    from scipy.stats import gumbel_r, norm

    fig, axs = plt.subplots(1, 2, figsize=(7, 3), squeeze=True)
    x = np.linspace(-3, 5, 100)

    # (axis, distribution method to evaluate, panel title)
    panels = (
        (axs[0], 'pdf',
         "Probability Density Functions\n(same mean and variance)"),
        (axs[1], 'cdf',
         "Cumulative Density Functions\n(same mean and variance)"),
    )
    for ax, method, title in panels:
        ax.plot(x, getattr(gumbel_r, method)(x), label='Gumbel')
        ax.plot(x, getattr(norm, method)(x, 0.577, 1.282), label='Normal')
        ax.set_title(title)
        ax.legend()

    return xmle.Show(fig)
def ReturnPeriod(self, MapsPath, prefix, DistributionPrF, TraceF, SubsF,
                 replacementF, HydrologicalInputsPath, SubIDMapF, ExtraSubsF,
                 Fromfile, Tofile, SaveTo, wpath):
    """Convert zipped maximum-depth maps into zipped return-period maps.

    For every selected max-depth file the peak discharge of each sub-basin
    over the rows ``timestep - 14 .. timestep`` of its hydrograph is turned
    into a return period ``1 / (1 - F)`` where ``F`` is the Gumbel cdf with
    the sub-basin's fitted ``loc``/``scale``.  Every wet cell (depth > 0) of
    the map then receives the return period of the sub-basin it lies in.

    Parameters
    ----------
    MapsPath : str
        Directory listed with ``os.listdir``; entries starting with
        ``prefix`` are treated as zipped ASCII max-depth maps.
    prefix : str
        File-name prefix; the text between the prefix and the last four
        characters of the archived file name is parsed as the time step.
    DistributionPrF : str
        CSV file with the columns ``ID``, ``loc`` and ``scale``.
    TraceF : str
        Header-less CSV whose three columns are named SubID, US and DS here.
    SubsF : str
        Header-less CSV; only its first column (sub-basin ids) is read.
    replacementF : str
        CSV with the columns ``missing`` and ``replacement``.
    HydrologicalInputsPath : str
        Directory holding one ``<US node>.txt`` hydrograph per node.
    SubIDMapF : str
        Raster of sub-basin ids, opened with ``gdal.Open``.
    ExtraSubsF : str
        CSV with the columns ``addSub`` and ``RIMSub``.
    Fromfile : int
        Index of the first max-depth file to process.
    Tofile : int or "end"
        Index one past the last file; ``"end"`` or a too-large value means
        all remaining files.
    SaveTo : str
        Directory that receives ``ReturnPeriod<timestep>.zip``.
    wpath : str
        Prefix concatenated (without separator) with
        ``"CheckWaterDepth.txt"`` for the log of failed files.

    Returns
    -------
    None
    """
    # list of the Max Depth files only
    MaxDepthList = [name for name in os.listdir(MapsPath)
                    if name.startswith(prefix)]

    # Read Inputs
    # distribution parameters for each upstream computational node
    DistributionPr = pd.read_csv(DistributionPrF)
    USnode = pd.read_csv(TraceF, header=None)
    USnode.columns = ['SubID', 'US', 'DS']
    # get the sub basin Id from the guide file it is the same shape in
    # RIM1.0 and RIM2.0
    SubsID = pd.read_csv(SubsF, header=None, usecols=[0])
    ReplacementSub = pd.read_csv(replacementF)

    # read the hydrograph for all the US nodes
    # the hydrograph of the sub-basin in row 10 fixes the length of the index
    ReferenceNode = int(USnode.loc[SubsID.loc[10, 0] - 1, 'US'])
    ind = range(
        1,
        len(pd.read_csv(HydrologicalInputsPath + "/" + str(ReferenceNode) +
                        ".txt").values))
    Hydrographs = pd.DataFrame(index=ind, columns=SubsID[0].to_list())

    for i in range(len(SubsID)):
        SubID = SubsID.loc[i, 0]
        # the sub-basins are listed in order, so SubID = 343 is expected in
        # row 342 (SubID - 1) of USnode
        try:
            USID = int(USnode.loc[SubID - 1, 'US'])
            if USID != -1:
                Hydrographs.loc[:, SubID] = pd.read_csv(
                    HydrologicalInputsPath + "/" + str(USID) +
                    ".txt").values[:len(Hydrographs)]
        except Exception:
            # fall back to the replacement sub-basin given for this one
            OtherSubLoc = np.where(ReplacementSub['missing'] == SubID)[0][0]
            USID = int(USnode.loc[
                ReplacementSub.loc[OtherSubLoc, 'replacement'] - 1, 'US'])
            if USID != -1:
                Hydrographs.loc[:, SubID] = pd.read_csv(
                    HydrologicalInputsPath + "/" + str(USID) +
                    ".txt").values[:len(Hydrographs)]

    # read sub basin map id
    SubIDMap = gdal.Open(SubIDMapF)
    SubIDMapV = SubIDMap.ReadAsArray()
    # read the added subs reference text file
    ExtraSubs = pd.read_csv(ExtraSubsF)
    # built once instead of once per raster cell
    RoutedSubs = set(SubsID[0].tolist())

    check = list()
    Klist = list()

    if Tofile == "end" or Tofile > len(MaxDepthList):
        Tofile = len(MaxDepthList)

    for k in range(Fromfile, Tofile):
        try:
            # open the zip file
            Compressedfile = zipfile.ZipFile(MapsPath + "/" + MaxDepthList[k])
        except Exception:
            print("Error Opening the compressed file")
            # keep the log numeric so np.savetxt(fmt='%6d') below can write
            # it; -1 marks a file name that carries no readable time step
            try:
                FailedStep = int(MaxDepthList[k][len(prefix):-4])
            except ValueError:
                FailedStep = -1
            check.append(FailedStep)
            Klist.append(k)
            continue

        # the archive and its member are closed as soon as they are read
        with Compressedfile:
            # get the file name
            fname = Compressedfile.infolist()[0]
            # get the time step from the file name
            timestep = int(fname.filename[len(prefix):-4])
            print("File= " + str(timestep))
            with Compressedfile.open(fname) as ASCIIF:
                f = ASCIIF.readlines()

        # six header lines, then one text line per raster row
        SpatialRef = f[:6]
        ASCIIRaw = f[6:]
        rows = len(ASCIIRaw)
        cols = len(ASCIIRaw[0].split())
        MaxDepth = np.ones((rows, cols), dtype=np.float32)
        # read the ascii file
        for i in range(rows):
            MaxDepth[i, :] = list(map(float, ASCIIRaw[i].split()))

        # get the Peak of the hydrograph for the whole event
        # (14 days before the end of the event)
        MaxValuedf = Hydrographs.loc[timestep - 14:timestep, :]
        MaxValues = MaxValuedf.max().values.tolist()

        # Return period of the max Q at this time step for each sub-basin
        T = list()
        for i, MaxQ in enumerate(MaxValues):
            # if the sub basin is a lateral and not routed in RIM it will
            # not have a hydrograph
            if np.isnan(MaxQ):
                T.append(np.nan)
                continue
            try:
                DSnode = USnode.loc[SubsID.loc[i, 0] - 1, 'US']
                loc = np.where(DistributionPr['ID'] == DSnode)[0][0]
            except IndexError:
                OtherSubLoc = np.where(
                    ReplacementSub['missing'] == SubsID.loc[i, 0])[0][0]
                DSnode = USnode.loc[
                    ReplacementSub.loc[OtherSubLoc, 'replacement'] - 1, 'US']
                loc = np.where(DistributionPr['ID'] == DSnode)[0][0]
            # Non Exceedance probability of the peak discharge
            F = gumbel_r.cdf(MaxQ,
                             loc=DistributionPr.loc[loc, 'loc'],
                             scale=DistributionPr.loc[loc, 'scale'])
            # return period T = 1/(1-F)
            T.append(round(1 / (1 - F), 2))

        try:
            RetunPeriodMap = np.ones((rows, cols), dtype=np.float32) * 0
            for i in range(rows):
                for j in range(cols):
                    Depth = MaxDepth[i, j]
                    if np.isnan(Depth) or not Depth > 0:
                        continue
                    CellSub = SubIDMapV[i, j]
                    if CellSub in RoutedSubs:
                        # the sub basin is in the Sub ID list: take its
                        # return period directly
                        RetunPeriodMap[i, j] = T[np.where(
                            SubsID[0] == CellSub)[0][0]]
                    else:
                        # the sub ID is one of the added subs not routed by
                        # RIM, so it is listed in ExtraSubs with a reference
                        # to a SubID routed by RIM
                        RIMSub = ExtraSubs.loc[np.where(
                            ExtraSubs['addSub'] == CellSub)[0][0], 'RIMSub']
                        RetunPeriodMap[i, j] = T[np.where(
                            SubsID[0] == RIMSub)[0][0]]
        except Exception:
            print("Error")
            check.append(timestep)
            Klist.append(k)
            continue

        # save the return period ASCII file
        fname = "ReturnPeriod" + str(timestep) + ".asc"
        with open(SaveTo + "/" + fname, 'w') as File:
            # write the header lines; only the line terminator is removed so
            # both "\r\n" and "\n" inputs keep their full header values
            for HeaderLine in SpatialRef:
                File.write(HeaderLine.decode().rstrip("\r\n") + "\n")
            for i in range(np.shape(RetunPeriodMap)[0]):
                File.writelines(
                    list(map(self.StringSpace, RetunPeriodMap[i, :])))
                File.write("\n")

        # zip the file
        with zipfile.ZipFile(SaveTo + "/" + fname[:-4] + ".zip", "w",
                             zipfile.ZIP_DEFLATED) as newzip:
            newzip.write(SaveTo + "/" + fname, arcname=fname)
        # delete the uncompressed file
        os.remove(SaveTo + "/" + fname)

    # (time step, file index) of every file that could not be processed
    check = list(zip(check, Klist))
    if len(check) > 0:
        np.savetxt(wpath + "CheckWaterDepth.txt", check, fmt='%6d')
def StatisticalProperties(self, PathNodes, PathTS, StartDate, WarmUpPeriod,
                          SavePlots, SavePath, SeparateFiles=False,
                          Filter=False, Distibution="GEV",
                          EstimateParameters=False, Quartile=0,
                          RIMResults=False, SignificanceLevel=0.1):
    """Fit an extreme value distribution to the annual maxima of each node.

    StatisticalProperties reads the discharge time series of a set of
    computational nodes (e.g. the SWIM ``.dat`` result file) and calculates
    some statistical properties.  The code assumes that the time series are
    of a daily temporal resolution, and that the hydrological year is
    1-Nov/31-Oct (Petrow and Merz, 2009, JoH).

    Parameters
    ----------
    PathNodes : str
        File with the IDs of the computational nodes to analyse; the time
        series file should contain the discharge of these nodes in order.
    PathTS : str
        The time series file, or, when ``SeparateFiles`` is True, the
        directory that holds one ``<node id>.txt`` file per node.
    StartDate : str
        Beginning date of the time series, formatted ``"%Y-%m-%d"``.
    WarmUpPeriod : int
        Number of days to neglect at the beginning of the simulation.
    SavePlots : bool
        When True the pdf/cdf figure and the probability plot of every node
        are written to ``<SavePath>/Figures/``.
    SavePath : str
        Directory where the two result CSV files are saved.
    SeparateFiles : bool, default False
        True if the discharge data are stored in one file per node.
    Filter : bool or number, default False
        Any non-bool value is treated as a gap marker and removed from the
        annual maxima before fitting.
    Distibution : str, default "GEV"
        ``"GEV"`` fits ``genextreme``; any other value fits ``gumbel_r``.
    EstimateParameters : bool, default False
        True estimates the Gumbel parameters with
        ``Gumbel.EstimateParameter``; False uses the scipy ``fit`` method.
    Quartile : float, default 0
        Quantile of the annual maxima handed to
        ``Gumbel.EstimateParameter`` as threshold.
    RIMResults : bool, default False
        With ``SeparateFiles``, read each file with ``self.ReadRIMResult``
        instead of ``np.loadtxt``.
    SignificanceLevel : float, default 0.1
        Passed to the ``ProbapilityPlot`` helpers for the confidence
        interval.

    Returns
    -------
    None
        ``Statistical Properties.csv`` (mean, std, min, 5%, 25%, median,
        75%, 95%, max, t_beg, t_end, nyr, q1.5, q2, q5, q10, q25, q50, q100,
        q200, q500, q1000) and ``DistributionProperties.csv`` are written to
        ``SavePath``; the same tables are stored on ``self.StatisticalPr``
        and ``self.DistributionPr``.

    Raises
    ------
    NotImplementedError
        If ``EstimateParameters`` is True together with ``"GEV"``.
    """
    # ndmin=1 keeps a file with a single node id iterable
    ComputationalNodes = np.loadtxt(PathNodes, dtype=np.uint16, ndmin=1)

    # hydrographs
    if SeparateFiles:
        TS = pd.DataFrame()
        if RIMResults:
            for i in range(len(ComputationalNodes)):
                TS.loc[:, int(ComputationalNodes[i])] = self.ReadRIMResult(
                    PathTS + "/" + str(int(ComputationalNodes[i])) + '.txt')
        else:
            for i in range(len(ComputationalNodes)):
                TS.loc[:, int(ComputationalNodes[i])] = np.loadtxt(
                    PathTS + "/" + str(int(ComputationalNodes[i])) + '.txt')

        StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
        EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
        ind = pd.date_range(StartDate, EndDate)
        TS.index = ind
    else:
        TS = pd.read_csv(PathTS, delimiter=r'\s+', header=None)
        StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
        EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
        TS.index = pd.date_range(StartDate, EndDate, freq="D")
        # delete the first two columns
        del TS[0], TS[1]
        TS.columns = ComputationalNodes

    # neglect the warm up period at the start of the time series
    TS = TS.loc[StartDate + dt.timedelta(days=WarmUpPeriod):EndDate, :]

    # List of the table output, including some general data and the return
    # periods.
    col_csv = [
        'mean', 'std', 'min', '5%', '25%', 'median', '75%', '95%', 'max',
        't_beg', 't_end', 'nyr'
    ]
    rp_name = [
        'q1.5', 'q2', 'q5', 'q10', 'q25', 'q50', 'q100', 'q200', 'q500',
        'q1000'
    ]
    col_csv = col_csv + rp_name

    # Declare a dataframe for the output file, with as index the gauge
    # numbers and as columns all the output names.
    StatisticalPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                 columns=col_csv)
    StatisticalPr.index.name = 'ID'
    DistributionPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                  columns=['loc', 'scale'])
    DistributionPr.index.name = 'ID'

    # required return periods: exactly one entry per name in rp_name, in the
    # same order, because the quantiles are zipped with rp_name below
    T = np.array([1.5, 2, 5, 10, 25, 50, 100, 200, 500, 1000])
    # Non Exceedance probabilities of the chosen return periods
    F = 1 - (1 / T)

    # Iteration over all the gauge numbers.
    for i in ComputationalNodes:
        QTS = TS.loc[:, i]
        # The time series is resampled to the annual maxima, and turned into
        # a numpy array.  The hydrological year is 1-Nov/31-Oct.
        amax = QTS.resample('A-OCT').max().values

        if not isinstance(Filter, bool):
            amax = amax[amax != Filter]

        if EstimateParameters:
            # estimate the parameters through an optimization
            threshold = np.quantile(amax, Quartile)
            if Distibution == "GEV":
                # no parameters would be available for the code below
                raise NotImplementedError("Still to be finished later")
            param = Gumbel.EstimateParameter(amax, Gumbel.ObjectiveFn,
                                             threshold)
            param_dist = [param[1], param[2]]
        else:
            # estimate the parameters through the maximum likelihood method
            if Distibution == "GEV":
                param_dist = genextreme.fit(amax)
            else:
                # A gumbel distribution is fitted to the annual maxima
                param_dist = gumbel_r.fit(amax)

        if Distibution == "GEV":
            # genextreme.fit returns (shape c, loc, scale)
            DistributionPr.loc[i, 'c'] = param_dist[0]
            DistributionPr.loc[i, 'loc'] = param_dist[1]
            DistributionPr.loc[i, 'scale'] = param_dist[2]
        else:
            DistributionPr.loc[i, 'loc'] = param_dist[0]
            DistributionPr.loc[i, 'scale'] = param_dist[1]

        # get the Discharge corresponding to the return periods
        if Distibution == "GEV":
            Qrp = genextreme.ppf(F, param_dist[0], loc=param_dist[1],
                                 scale=param_dist[2])
        else:
            Qrp = gumbel_r.ppf(F, loc=param_dist[0], scale=param_dist[1])

        # sort the amax (in place)
        amax.sort()
        # empirical probabilities of the sorted maxima from ST.Weibul
        cdf_Weibul = ST.Weibul(amax)

        # ProbapilityPlot is called with the fitted parameters, the
        # empirical probabilities, the sorted maxima and the significance
        # level; its three results are used as theoretical values and
        # upper/lower confidence limits in the plots below
        Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
        if Distibution == "GEV":
            Qth, Qupper, Qlower = GEV.ProbapilityPlot(
                param_dist, cdf_Weibul, amax, SignificanceLevel)
            # fitted pdf/cdf use the same (c, loc, scale) triple
            pdf_fitted = genextreme.pdf(Qx, param_dist[0],
                                        loc=param_dist[1],
                                        scale=param_dist[2])
            cdf_fitted = genextreme.cdf(Qx, param_dist[0],
                                        loc=param_dist[1],
                                        scale=param_dist[2])
        else:
            Qth, Qupper, Qlower = Gumbel.ProbapilityPlot(
                param_dist, cdf_Weibul, amax, SignificanceLevel)
            pdf_fitted = gumbel_r.pdf(Qx, loc=param_dist[0],
                                      scale=param_dist[1])
            cdf_fitted = gumbel_r.cdf(Qx, loc=param_dist[0],
                                      scale=param_dist[1])

        if SavePlots:
            fig = plt.figure(60, figsize=(20, 10))
            gs = gridspec.GridSpec(nrows=1, ncols=2, figure=fig)
            # Plot the histogram and the fitted distribution, save it for
            # each gauge.
            ax1 = fig.add_subplot(gs[0, 0])
            ax1.plot(Qx, pdf_fitted, 'r-')
            ax1.hist(amax, density=True)
            ax1.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
            ax1.set_ylabel('pdf', fontsize=15)

            ax2 = fig.add_subplot(gs[0, 1])
            ax2.plot(Qx, cdf_fitted, 'r-')
            ax2.plot(amax, cdf_Weibul, '.-')
            ax2.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
            ax2.set_ylabel('cdf', fontsize=15)

            plt.savefig(SavePath + "/" + "Figures/" + str(i) + '.png',
                        format='png')
            plt.close()

            fig = plt.figure(70, figsize=(10, 8))
            plt.plot(Qth, amax, 'd', color='#606060', markersize=12,
                     label='Gumbel Distribution')
            plt.plot(Qth, Qth, '^-.', color="#3D59AB",
                     label="Weibul plotting position")
            if Distibution != "GEV":
                # the confidence limits are only drawn for the Gumbel fit
                CILabel = str(int((1 - SignificanceLevel) * 100)) + " % CI)"
                plt.plot(Qth, Qlower, '*--', color="#DC143C", markersize=12,
                         label='Lower limit (' + CILabel)
                plt.plot(Qth, Qupper, '*--', color="#DC143C", markersize=12,
                         label='Upper limit (' + CILabel)

            plt.legend(fontsize=15, framealpha=1)
            plt.xlabel('Theoretical Annual Discharge(m3/s)', fontsize=15)
            plt.ylabel('Annual Discharge(m3/s)', fontsize=15)
            plt.savefig(SavePath + "/" + "Figures/F-" + str(i) + '.png',
                        format='png')
            plt.close()

        StatisticalPr.loc[i, 'mean'] = QTS.mean()
        StatisticalPr.loc[i, 'std'] = QTS.std()
        StatisticalPr.loc[i, 'min'] = QTS.min()
        StatisticalPr.loc[i, '5%'] = QTS.quantile(0.05)
        StatisticalPr.loc[i, '25%'] = QTS.quantile(0.25)
        StatisticalPr.loc[i, 'median'] = QTS.quantile(0.50)
        StatisticalPr.loc[i, '75%'] = QTS.quantile(0.75)
        StatisticalPr.loc[i, '95%'] = QTS.quantile(0.95)
        StatisticalPr.loc[i, 'max'] = QTS.max()
        StatisticalPr.loc[i, 't_beg'] = QTS.index.min()
        StatisticalPr.loc[i, 't_end'] = QTS.index.max()
        StatisticalPr.loc[i, 'nyr'] = (
            StatisticalPr.loc[i, 't_end'] -
            StatisticalPr.loc[i, 't_beg']).days / 365.25
        for irp, irp_name in zip(Qrp, rp_name):
            StatisticalPr.loc[i, irp_name] = irp

        # Print for prompt and check progress.
        print("Gauge", i, "done.")

    # Output files
    StatisticalPr.to_csv(SavePath + "/" + "Statistical Properties.csv")
    self.StatisticalPr = StatisticalPr
    DistributionPr.to_csv(SavePath + "/" + "DistributionProperties.csv")
    self.DistributionPr = DistributionPr
def rank_histogram_cal(X, R, Thresh=None, gumbel_params=None):
    """Calculate the "corrected" forecast probabilities from a rank histogram.

    Refer to: Hamill, T. M. and S. J. Colucci (1998). "Evaluation of
    Eta-RSM Ensemble Probabilistic Precipitation Forecasts." Monthly
    Weather Review 126(3): 711-724.

    The sorted members ``X0 <= X1 <= ... <= Xn`` split the value axis into
    rank bins that carry the probabilities ``R0, R1, ..., R{n+1}``::

        [0, X0) -> R0,  [X0, X1) -> R1,  ...,  [Xn, inf) -> R{n+1}

    Probability is spread linearly inside a bounded bin and follows the
    Gumbel cdf ``F`` above the highest member, which gives the cumulative
    probability ``C(t) = P(V < t)``::

        t <= X0           : (t / X0) * R0
        X{k-1} < t <= Xk  : R0 + ... + R{k-1} + (t - X{k-1})/(Xk - X{k-1}) * Rk
        t > Xn            : R0 + ... + Rn + (F(t) - F(Xn))/(1 - F(Xn)) * R{n+1}

    Every category probability is a difference of ``C``, so the categories
    never overlap and always add up to ``sum(R)``.

    Args:
        X (np.array): 1d array, ensemble forecast, N members.
        R (np.array): 1d array, corresponding rank histogram, N+1 values,
            computed from historical forecasts and observations.
        Thresh (np.array): 1d array, precipitation category thresholds
            [T1, T2, ..., Tn]; T1 should be larger than 0.
        gumbel_params (list): Gumbel parameters using the method of moments
            (Wilks 1995), [location, scale].  The probability beyond the
            highest ensemble member is assumed to have the shape of a
            Gumbel distribution.

    Return:
        np.array, the probability for each category,
        [P(0 <= V < T1), P(T1 <= V < T2), ..., P(Tn <= V)].

    Raises:
        ValueError: if ``R`` does not hold exactly one more value than ``X``.

    Examples:
        X = [0, 0, 0, 0, 0, 0, 0.02, 0.04, 0.05, 0.07, 0.10, 0.11, 0.23,
             0.26, 0.35]
        R = [0.25, 0.13, 0.09, 0.07, 0.05, 0.05, 0.04, 0.04, 0.03, 0.03,
             0.03, 0.02, 0.02, 0.03, 0.05, 0.07]
        Thresh = [0.01, 0.1, 0.25, 0.5, 1.0, 2.0]
        print(rank_histogram_cal(X, R, Thresh=Thresh))
        # approximately [0.66, 0.15, 0.06, 0.12, 0.01, 0.0, 0.0]
    """
    # sort ensemble forecast
    X = np.sort(X)
    nX = X.size
    R = np.asarray(R, dtype=float)
    if R.size != nX + 1:
        raise ValueError(
            "R must contain one more value than X (got %d and %d)" %
            (R.size, nX))

    # set precipitation category thresholds.
    if Thresh is None:
        Thresh = [0.1, 10, 25, 50, 100, 250]
    Thresh = np.sort(Thresh)

    # set gumbel params, default parameters from Hamill (1998) paper.
    if gumbel_params is None:
        gumbel_params = [0.03, 0.0898]

    def gumbel(x):
        return gumbel_r.cdf(x, loc=gumbel_params[0], scale=gumbel_params[1])

    def prob_below(t):
        """Return C(t) = P(V < t) as described in the docstring."""
        ind = np.searchsorted(X, t)
        if ind == 0:
            # first bin spans [0, X0); nothing lies below a zero member
            if X[0] <= 0:
                return 0.0
            return (t / X[0]) * R[0]
        if ind == nX:
            # beyond the highest member: Gumbel shaped tail
            top = gumbel(X[-1])
            return np.sum(R[:nX]) + (gumbel(t) - top) / (1.0 - top) * R[nX]
        # X[ind-1] < t <= X[ind], so the bin width is strictly positive
        width = X[ind] - X[ind - 1]
        return np.sum(R[:ind]) + (t - X[ind - 1]) / width * R[ind]

    # the probabilities of each category
    nt = Thresh.size
    cumulative = np.array([prob_below(t) for t in Thresh], dtype=float)
    P = np.zeros(nt + 1)
    # P(0 <= V < T1)
    P[0] = cumulative[0]
    # P(T1 <= V < T2), ..., P(Tn-1 <= V < Tn)
    P[1:nt] = np.diff(cumulative)
    # P(Tn <= V)
    P[nt] = np.sum(R) - cumulative[-1]
    return P
def F_Gumbel(r, m, s):
    """Return the Gumbel cdf at ``r`` for the parameters derived from m, s.

    ``p_Gumbel(m, s)`` is unpacked as ``(scale, loc)`` and the pair is
    handed to ``gumbel_r.cdf``.
    """
    scale, loc = p_Gumbel(m, s)
    # keywords make the (loc, scale) order expected by scipy explicit
    return gumbel_r.cdf(r, loc=loc, scale=scale)
# Display the probability density function (``pdf``):
x = np.linspace(gumbel_r.ppf(0.01), gumbel_r.ppf(0.99), 100)
ax.plot(x, gumbel_r.pdf(x), 'r-', lw=5, alpha=0.6, label='gumbel_r pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:
rv = gumbel_r()
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:
vals = gumbel_r.ppf([0.001, 0.5, 0.999])
np.allclose([0.001, 0.5, 0.999], gumbel_r.cdf(vals))
# True

# Generate random numbers:
r = gumbel_r.rvs(size=1000)

# And compare the histogram.  ``density=True`` replaces the ``normed``
# keyword, which current Matplotlib releases no longer accept.
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
s, a, ma, ta = alignFunction( seqA, seqB, matScore, gapOpen, gapExtend, ScoreOnly=True) sscores.append(s) # Fit extreme value distribution to the scramble alignment data miu, beta = gumbel_r.fit(sscores) print("Length of sscores: ", len(sscores)) print("Computing histogram for {} scramble scores".format(N)) print("Max scrambled score:", max(sscores)) print("Min scrambled score:", min(sscores)) print("Median of scrambled scores:", np.median(sscores)) print("Gumbel miu:", miu) print("Gumbel beta:", beta) print("Probability of unscrambled score in a random alignment: ", 1-gumbel_r.cdf(uscore, miu, beta)) print() # Generate the basename for save files basename = "smith" if args.alignment_method == "local" else "needle" basename += "_{}_{}_{}_{:3.1f}_{:3.1f}".format( N, len(seqA), smatrix, abs(gapOpen), abs(gapExtend)) # Create the plot fig, ax = plt.subplots() ax.set_title("S-W, {} aligns,len {}, matrix {}, gapo {}, gape {}".format( N, len(seqA), smatrix, gapOpen, gapExtend)) counts, bins, _ = ax.hist(sscores, bins=np.arange( min(sscores), max(sscores)), align='left', rwidth=0.95) x = np.arange(bins[0], bins[-1], 0.01) ax.plot(x, sum(counts)*(bins[1]-bins[0])*gumbel_r.pdf(x, miu, beta),
# Fit a Gumbel distribution to the maxima in ``hmax``.
loc, scale = gumbel_r.fit(hmax)

# Figure 1: fitted pdf over the normalised histogram of the data.
fig, ax = plt.subplots()
x = np.linspace(gumbel_r.ppf(0.01, loc=loc, scale=scale),
                gumbel_r.ppf(0.99, loc=loc, scale=scale), 100)
ax.plot(x, gumbel_r.pdf(x, loc=loc, scale=scale),
        'r-', lw=5, alpha=0.6, label='gumbel_r pdf')
ax.hist(hmax, density=True)

# Figure 2: fitted cdf against the empirical cdf (the curve drawn here is
# the cdf, so it is labelled as such).
fig, ax = plt.subplots()
ax.plot(x, gumbel_r.cdf(x, loc=loc, scale=scale),
        'r-', lw=5, alpha=0.6, label='gumbel_r cdf')
ecdf = statsmodels.distributions.ECDF(hmax)
ax.plot(x, ecdf(x))

# Figure 3: probability plot of the data against the fitted distribution.
fig, ax = plt.subplots()
probplot(hmax, dist=gumbel_r, sparams=(loc, scale), plot=ax)
ax.grid()

#%%
from scipy.signal import find_peaks
def gumbel_max_test(X, y, min_count=5):
    """Group-wise maximum test scored with the standard Gumbel CDF.

    The inputs are passed through ``_check_Xy`` (with ``norm_y=True``) and
    the result is delegated to ``_groupby_agg`` using the ``'max'``
    aggregation.  The callback handed to ``_groupby_agg`` removes the
    ``'max'`` column from the aggregated frame and stores
    ``gumbel_r.cdf`` of its values under ``'pval'``.

    Parameters
    ----------
    X, y :
        Forwarded to ``_check_Xy``.
    min_count : int, default 5
        Minimum group size forwarded to ``_groupby_agg``; values below 5
        are raised to 5.

    Returns
    -------
    Whatever ``_groupby_agg`` returns.
    """
    x, y = _check_Xy(X, y, norm_y=True)
    # never accept fewer than five observations per group
    min_count = max(min_count, 5)

    def _max_to_pval(df):
        # mutates ``df`` in place and returns None, exactly like the
        # ``__setitem__`` call it replaces
        df['pval'] = gumbel_r.cdf(df.pop('max').values)

    return _groupby_agg(x, y, 'max', min_count=min_count, func=_max_to_pval)
# calculate the number of samples smaller than the reference score
# (added to the counter initialised earlier in the script)
number_value_smaller += sum(
    1 for score in permutation_scores if score < reference_score)

# estimate the number of total samples + 1
number_samples = len(permutation_scores) + 1

# calculate the p-value
p_value = 1 - (float(number_value_smaller) / float(number_samples))
print(p_value)

# Task 5: Compute the associated p-value using an estimated Gumbel
# distribution
# estimate the parameters loc and scale using the fit function
loc, scale = gumbel_r.fit(permutation_scores)

# calculate the p-value
p_value = 1 - gumbel_r.cdf(reference_score, loc=loc, scale=scale)
print(p_value)

# Task 6: Plot the histogram and the fitted probability density function
# with the reference score as vertical line
fig, ax = plt.subplots(1, 1)
x = np.linspace(gumbel_r.ppf(0.01, loc=loc, scale=scale),
                gumbel_r.ppf(0.99, loc=loc, scale=scale), 1000)
ax.plot(x, gumbel_r.pdf(x, loc=loc, scale=scale),
        'k-', lw=2, label='frozen pdf')
# ``density=True`` replaces the ``normed`` keyword, which current
# Matplotlib releases no longer accept.
ax.hist(permutation_scores, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
ax.axvline(reference_score)
plt.show()
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import gumbel_r
from scipy.stats import gumbel_l
from scipy.stats import genextreme


def _reduced_variate(prob):
    """Map non-exceedance probabilities onto the Gumbel reduced variate."""
    return -np.log(-np.log(prob))


# Daily discharge sample and an evenly spaced grid across its range.
dataN = np.loadtxt("../data/Qdaily.txt")
x_pdf = np.linspace(dataN.min(), dataN.max(), num=100)

# Fitted Gumbel cdf on the grid; the last two fitted values are the
# location and the scale, anything before them would be shape parameters.
param = gumbel_r.fit(dataN)
cdf1 = gumbel_r.cdf(x_pdf, *param[:-2], loc=param[-2], scale=param[-1])
plt.plot(x_pdf, _reduced_variate(cdf1), 'o')
print(param)

# Empirical cdf from a 200-bin histogram, plotted at the right bin edges.
num_bins = 200
counts, bin_edges = np.histogram(dataN, bins=num_bins)
cdf2 = counts.cumsum() / counts.sum()
plt.plot(bin_edges[1:], _reduced_variate(cdf2), 'x')
plt.show()