import numpy as np
from scipy.stats import gumbel_r, powerlaw, genextreme


def fit_incs_pd(ascs, ascs_month, months, flag=0):
    # this function accepts a list of increasing periods and a desired month
    # and returns the parameters of the distribution of increments for that month
    # (fitweibull and fitlognorm are project helpers assumed to be defined elsewhere)
    ascs_select = list()
    for m in range(len(ascs)):
        # if ascs_month[m] == months:
        if np.in1d(ascs_month[m], months):
            ascs_select.append(ascs[m])
    L = len(ascs_select)
    incs = list()
    for k in range(L):
        asc_temp = ascs_select[k]
        if hasattr(asc_temp, "__len__"):
            asc1 = asc_temp[1:]
            asc2 = asc_temp[:-1]
            incs.extend(np.subtract(asc1, asc2))
        else:
            pass
    incs = list(filter(lambda a: a > 0, incs))
    if flag == 0:
        optparms = fitweibull(incs)
    elif flag == 1:
        optparms = fitlognorm(incs)
    elif flag == 2:
        optparms = gumbel_r.fit(incs)
    elif flag == 3:
        optparms = powerlaw.fit(incs)
    elif flag == 4:
        optparms = genextreme.fit(incs)
    return optparms
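# --- Hedged usage sketch (data and month values are illustrative, not from
# the source): flag=2 selects scipy's gumbel_r, so no project helpers are
# needed; gumbel_r.fit returns a (loc, scale) tuple.
ascs = [np.array([1.0, 1.4, 2.1]), np.array([0.5, 1.2, 1.3, 2.0])]
ascs_month = [3, 3]
loc, scale = fit_incs_pd(ascs, ascs_month, months=[3], flag=2)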
def fetch_duration_params(self, start_id):
    d = dict()
    start_time = time.time()
    journey_df = self.df_from_sql(
        f"""
        SELECT "EndStation Id"
              ,Duration / 60 AS Duration
        FROM journeys
        WHERE "StartStation Id" = {start_id}
        AND year >= {self.min_year}
        AND weekday_ind = 1
        -- Query plan more efficient if we specify these rather than ESid > 0
        AND "EndStation Id" != -1
        AND "EndStation Id" IS NOT NULL
        {self.additional_filters}
        """
    )
    print(f"fetched {len(journey_df)} journeys for station {start_id} "
          f"in {time.time() - start_time} seconds")
    start_time = time.time()
    journey_df.dropna(subset=['Duration'], inplace=True)
    for end_id in journey_df["EndStation Id"].unique():
        durations = journey_df.loc[journey_df["EndStation Id"] == end_id]['Duration'].values
        # creates a tuple of scipy.stats.gumbel_r parameters (loc, scale)
        params = gumbel_r.fit(durations)
        d[end_id] = params
    print(f"\tfitted {len(journey_df)} journeys in {(time.time() - start_time) / 60} minutes")
    return d
import sys
import warnings

from scipy.stats import gumbel_r as gumbel  # assumed alias for scipy's right-skewed Gumbel


def __init__(self, data, block_size=0):
    if len(data) < 10:
        raise Exception("Not enough data to make predictions")
    self.__data = data
    histogram = {}
    if block_size:
        self.__block_size = block_size
    else:
        # Block sizes between 20 and 50 are ok if
        # all the dataset can be represented in less
        # than 20 blocks
        self.__block_size = len(data) // 20
    if self.__block_size < 1:
        warnings.warn("Invalid block size, set it to 1")
        self.__block_size = 1
    block_maxima = []
    current_block_size = 0
    current_block_maximum = -sys.maxsize - 1
    # Compute block maxima
    for value in data:
        if value in histogram:
            histogram[value] += 1
        else:
            histogram[value] = 1
        current_block_size += 1
        if value > current_block_maximum:
            current_block_maximum = value
        if current_block_size == self.__block_size:
            block_maxima.append(current_block_maximum)
            # reset to the smallest representable value so that negative
            # data are handled correctly
            current_block_maximum = -sys.maxsize - 1
            current_block_size = 0
    # Build original 1-cdf histogram
    self.__values = sorted(histogram.keys())
    self.__values.pop()
    self.__frequencies = []
    previous = 0.0
    for value in self.__values:
        previous = histogram[value] / float(len(self.__data)) + previous
        self.__frequencies.append(1 - previous)
    # Fit Gumbel distribution to block maxima
    params = gumbel.fit(block_maxima)
    self.__shape = 0
    self.__location = params[0]
    self.__scale = params[1]
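# --- Self-contained sketch of the block-maxima step above (synthetic data,
# variable names are illustrative): split a series into fixed-size blocks,
# take each block's maximum, and fit a Gumbel distribution to those maxima.
import numpy as np
from scipy.stats import gumbel_r

data = np.random.exponential(scale=1.0, size=1000)
block_size = 50
block_maxima = [data[j:j + block_size].max()
                for j in range(0, len(data), block_size)]
loc, scale = gumbel_r.fit(block_maxima)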
import numpy as np
import scipy.optimize as so
from scipy.stats import gumbel_r


def EstimateParameter(data, ObjFunc, threshold):
    """
    There are two likelihood functions (L1 and L2), one for values above some
    threshold (x >= C) and one for values below (x < C); the likeliest
    parameters are those that maximize the product of the two functions,
    max(L1 * L2). Here L1 is still the product of the probability density
    function values at each xi, but L2 is the probability that the threshold
    value C will be exceeded (1 - F(C)).

    Parameters
    ----------
    data : array-like
        Series to fit.
    ObjFunc : callable
        Objective function for the truncated fit, minimized by
        scipy.optimize.fmin.
    threshold : float
        Truncation threshold C.

    Returns
    -------
    Param : list
        Optimized parameters [threshold, loc, scale].

    Example
    -------
    from Hapi.statisticaltools import StatisticalTools as ST
    Param_dist = Gumbel.EstimateParameter(data, Gumbel.ObjectiveFn, threshold)
    """
    # obj_func = lambda p, x: (-np.log(Gumbel.Pdf(x, p[0], p[1]))).sum()
    # # first we make a simple Gumbel fit
    # Par1 = so.fmin(obj_func, [0.5, 0.5], args=(np.array(data),))
    Par1 = gumbel_r.fit(data)
    # then we use the result as a starting value for the truncated Gumbel fit
    Param = so.fmin(ObjFunc, [threshold, Par1[0], Par1[1]],
                    args=(np.array(data),), maxiter=500, maxfun=500)
    # Param_dist = [Param[1], Param[2]]
    return Param
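# --- Hedged sketch (an illustration, not the package's ObjectiveFn): one
# literal reading of the docstring's max(L1 * L2) criterion as a negative
# log-likelihood that so.fmin can minimize. p mirrors the starting vector
# [threshold, loc, scale] used above.
def truncated_gumbel_nll(p, x):
    c, loc, scale = p
    if scale <= 0:
        return np.inf  # keep the optimizer inside the valid domain
    above = x[x >= c]
    # L1: joint density of the values above the threshold
    log_l1 = gumbel_r.logpdf(above, loc=loc, scale=scale).sum()
    # L2: probability that the threshold C is exceeded, 1 - F(C)
    log_l2 = np.log(1.0 - gumbel_r.cdf(c, loc=loc, scale=scale))
    return -(log_l1 + log_l2)

# e.g. Param = so.fmin(truncated_gumbel_nll, [threshold, Par1[0], Par1[1]],
#                      args=(np.array(data),))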
def StatisticalProperties(self, PathNodes, PathTS, StartDate, WarmUpPeriod,
                          SavePlots, SavePath, SeparateFiles=False,
                          Filter=False, Distibution="GEV",
                          EstimateParameters=False, Quartile=0,
                          RIMResults=False, SignificanceLevel=0.1):
    """
    =============================================================================
    StatisticalProperties(PathNodes, PathTS, StartDate, WarmUpPeriod, SavePlots,
                          SavePath, SeparateFiles=False, Filter=False,
                          Distibution="GEV", EstimateParameters=False,
                          Quartile=0, RIMResults=False, SignificanceLevel=0.1)
    =============================================================================
    StatisticalProperties reads the SWIM output file (.dat file) that contains
    the time series of discharge for some computational nodes and calculates
    some statistical properties.

    The code assumes that the time series has a daily temporal resolution and
    that the hydrological year is 1-Nov/31-Oct (Petrow and Merz, 2009, JoH).

    Parameters
    ----------
    1-PathNodes : [String]
        the name of the file which contains the IDs of the computational
        nodes you want to do the statistical analysis for; the ObservedFile
        should contain the discharge time series of these nodes in order.
    2-PathTS : [String]
        the name of the SWIM result file (the .dat file).
    3-StartDate : [string]
        the beginning date of the time series.
    4-WarmUpPeriod : [integer]
        the number of days to neglect at the beginning of the simulation
        (warm-up period).
    5-SavePlots : [Bool]
        whether to save the fitted-distribution plots.
    6-SavePath : [String]
        the path where you want to save the statistical properties.
    7-SeparateFiles : [Bool]
        if the discharge data are stored in separate files, not all in one
        file, SeparateFiles should be True; default [False].
    8-Filter : [Bool/float]
        observed or RIM-result data may have gaps (days where the model did
        not run or gaps in the observations); if these gap days are filled
        with a specific value and you want to ignore it, give Filter = the
        value you want to exclude.
    9-RIMResults : [Bool]
        True if the files are results from RIM, False if observed, as the
        format differs between the two; default [False].
    10-Distibution : [String]
        "GEV" or "Gumbel"; the distribution fitted to the annual maxima.
    11-EstimateParameters : [Bool]
        if True, estimate the parameters through the truncated-fit
        optimization instead of the maximum likelihood fit.
    12-Quartile : [float]
        quantile used as the truncation threshold when EstimateParameters
        is True.
    13-SignificanceLevel : [float]
        significance level of the confidence interval; default [0.1].

    Returns
    -------
    1-Statistical Properties.csv:
        file containing some statistical properties like mean, std, min, 5%,
        25%, median, 75%, 95%, max, t_beg, t_end, nyr, q1.5, q2, q5, q10,
        q25, q50, q100, q200, q500.
    """
    ComputationalNodes = np.loadtxt(PathNodes, dtype=np.uint16)
    # hydrographs
    if SeparateFiles:
        TS = pd.DataFrame()
        if RIMResults:
            for i in range(len(ComputationalNodes)):
                TS.loc[:, int(ComputationalNodes[i])] = self.ReadRIMResult(
                    PathTS + "/" + str(int(ComputationalNodes[i])) + '.txt')
        else:
            for i in range(len(ComputationalNodes)):
                TS.loc[:, int(ComputationalNodes[i])] = np.loadtxt(
                    PathTS + "/" + str(int(ComputationalNodes[i])) + '.txt')  # ,skiprows = 0
        StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
        EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
        ind = pd.date_range(StartDate, EndDate)
        TS.index = ind
    else:
        TS = pd.read_csv(PathTS, delimiter=r'\s+', header=None)
        StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
        EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
        TS.index = pd.date_range(StartDate, EndDate, freq="D")
        # delete the first two columns
        del TS[0], TS[1]
        TS.columns = ComputationalNodes

    # neglect the first year (warm-up year) in the time series
    TS = TS.loc[StartDate + dt.timedelta(days=WarmUpPeriod):EndDate, :]

    # List of the table output, including some general data and the return periods.
    col_csv = ['mean', 'std', 'min', '5%', '25%', 'median',
               '75%', '95%', 'max', 't_beg', 't_end', 'nyr']
    rp_name = ['q1.5', 'q2', 'q5', 'q10', 'q25', 'q50',
               'q100', 'q200', 'q500', 'q1000']
    col_csv = col_csv + rp_name

    # In a table where duplicates are removed (np.unique), find the number of
    # gauges contained in the .csv file.
    # no_gauge = len(ComputationalNodes)

    # Declare a dataframe for the output file, with the gauge numbers as index
    # and all the output names as columns.
    StatisticalPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                 columns=col_csv)
    StatisticalPr.index.name = 'ID'
    DistributionPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                  columns=['loc', 'scale'])
    DistributionPr.index.name = 'ID'

    # required return periods, one per entry of rp_name
    T = [1.5, 2, 5, 10, 25, 50, 100, 200, 500, 1000]
    T = np.array(T)
    # these values are the non-exceedance probabilities (F) of the chosen
    # return periods
    # F = [1/3, 0.5, 0.8, 0.9, 0.96, 0.98, 0.99, 0.995, 0.998]
    F = 1 - (1 / T)

    # Iteration over all the gauge numbers.
    for i in ComputationalNodes:
        QTS = TS.loc[:, i]
        # The time series is resampled to the annual maxima and turned into a
        # numpy array.
        # The hydrological year is 1-Nov/31-Oct (from Petrow and Merz, 2009, JoH).
        amax = QTS.resample('A-OCT').max().values

        if type(Filter) != bool:
            amax = amax[amax != Filter]

        if EstimateParameters:
            # estimate the parameters through an optimization
            # alpha = (np.sqrt(6) / np.pi) * amax.std()
            # beta = amax.mean() - 0.5772 * alpha
            # param_dist = [beta, alpha]
            threshold = np.quantile(amax, Quartile)
            if Distibution == "GEV":
                print("Still to be finished later")
            else:
                param = Gumbel.EstimateParameter(amax, Gumbel.ObjectiveFn,
                                                 threshold)
                param_dist = [param[1], param[2]]
        else:
            # estimate the parameters through a maximum likelihood method
            if Distibution == "GEV":
                param_dist = genextreme.fit(amax)
            else:
                # a Gumbel distribution is fitted to the annual maxima
                param_dist = gumbel_r.fit(amax)

        if Distibution == "GEV":
            DistributionPr.loc[i, 'c'] = param_dist[0]
            DistributionPr.loc[i, 'loc'] = param_dist[1]
            DistributionPr.loc[i, 'scale'] = param_dist[2]
        else:
            DistributionPr.loc[i, 'loc'] = param_dist[0]
            DistributionPr.loc[i, 'scale'] = param_dist[1]
        # Return periods from the fitted distribution are stored.
        # get the discharge corresponding to the return periods
        if Distibution == "GEV":
            Qrp = genextreme.ppf(F, param_dist[0], loc=param_dist[1],
                                 scale=param_dist[2])
        else:
            Qrp = gumbel_r.ppf(F, loc=param_dist[0], scale=param_dist[1])

        # to get the non-exceedance probability for a specific value,
        # sort the amax
        amax.sort()
        # calculate F (the exceedance probability based on Weibull)
        cdf_Weibul = ST.Weibul(amax)
        # the ProbapilityPlot method calculates the theoretical values based
        # on the distribution parameters and the empirical (Weibull) cdf, and
        # calculates the confidence interval
        if Distibution == "GEV":
            Qth, Qupper, Qlower = GEV.ProbapilityPlot(
                param_dist, cdf_Weibul, amax, SignificanceLevel)
            # to calculate the theoretical F
            Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
            pdf_fitted = genextreme.pdf(Qx, param_dist[0], loc=param_dist[1],
                                        scale=param_dist[2])
            cdf_fitted = genextreme.cdf(Qx, param_dist[0], loc=param_dist[1],
                                        scale=param_dist[2])
        else:
            Qth, Qupper, Qlower = Gumbel.ProbapilityPlot(
                param_dist, cdf_Weibul, amax, SignificanceLevel)
            # gumbel_r.interval(SignificanceLevel)
            # to calculate the theoretical F
            Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
            pdf_fitted = gumbel_r.pdf(Qx, loc=param_dist[0],
                                      scale=param_dist[1])
            cdf_fitted = gumbel_r.cdf(Qx, loc=param_dist[0],
                                      scale=param_dist[1])
        # then calculate T (the return period)
        T = 1 / (1 - F)

        if SavePlots:
            fig = plt.figure(60, figsize=(20, 10))
            gs = gridspec.GridSpec(nrows=1, ncols=2, figure=fig)
            # Plot the histogram and the fitted distribution, save it for each gauge.
            ax1 = fig.add_subplot(gs[0, 0])
            ax1.plot(Qx, pdf_fitted, 'r-')
            ax1.hist(amax, density=True)
            ax1.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
            ax1.set_ylabel('pdf', fontsize=15)

            ax2 = fig.add_subplot(gs[0, 1])
            ax2.plot(Qx, cdf_fitted, 'r-')
            ax2.plot(amax, cdf_Weibul, '.-')
            ax2.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
            ax2.set_ylabel('cdf', fontsize=15)

            plt.savefig(SavePath + "/" + "Figures/" + str(i) + '.png',
                        format='png')
            plt.close()

            fig = plt.figure(70, figsize=(10, 8))
            plt.plot(Qth, amax, 'd', color='#606060', markersize=12,
                     label='Gumbel Distribution')
            plt.plot(Qth, Qth, '^-.', color="#3D59AB",
                     label="Weibul plotting position")
            if Distibution != "GEV":
                plt.plot(Qth, Qlower, '*--', color="#DC143C", markersize=12,
                         label='Lower limit (' + str(int(
                             (1 - SignificanceLevel) * 100)) + " % CI)")
                plt.plot(Qth, Qupper, '*--', color="#DC143C", markersize=12,
                         label='Upper limit (' + str(int(
                             (1 - SignificanceLevel) * 100)) + " % CI)")
            plt.legend(fontsize=15, framealpha=1)
            plt.xlabel('Theoretical Annual Discharge(m3/s)', fontsize=15)
            plt.ylabel('Annual Discharge(m3/s)', fontsize=15)
            plt.savefig(SavePath + "/" + "Figures/F-" + str(i) + '.png',
                        format='png')
            plt.close()

        StatisticalPr.loc[i, 'mean'] = QTS.mean()
        StatisticalPr.loc[i, 'std'] = QTS.std()
        StatisticalPr.loc[i, 'min'] = QTS.min()
        StatisticalPr.loc[i, '5%'] = QTS.quantile(0.05)
        StatisticalPr.loc[i, '25%'] = QTS.quantile(0.25)
        StatisticalPr.loc[i, 'median'] = QTS.quantile(0.50)
        StatisticalPr.loc[i, '75%'] = QTS.quantile(0.75)
        StatisticalPr.loc[i, '95%'] = QTS.quantile(0.95)
        StatisticalPr.loc[i, 'max'] = QTS.max()
        StatisticalPr.loc[i, 't_beg'] = QTS.index.min()
        StatisticalPr.loc[i, 't_end'] = QTS.index.max()
        StatisticalPr.loc[i, 'nyr'] = (StatisticalPr.loc[i, 't_end'] -
                                       StatisticalPr.loc[i, 't_beg']).days / 365.25
        for irp, irp_name in zip(Qrp, rp_name):
            StatisticalPr.loc[i, irp_name] = irp
        # Print to the prompt to check progress.
print("Gauge", i, "done.") # # Output file StatisticalPr.to_csv(SavePath + "/" + "Statistical Properties.csv") self.StatisticalPr = StatisticalPr DistributionPr.to_csv(SavePath + "/" + "DistributionProperties.csv") self.DistributionPr = DistributionPr
fbest, pbest = my_per.get_best_frequencies()  # return the best n_local_optima frequencies
bestperiod = 1. / fbest[0]
bestperiod2 = 1. / fbest[1]
bestperiod3 = 1. / fbest[2]
bestperiod4 = 1. / fbest[3]

# Bootstrap the periodogram maxima on permuted (shuffled) magnitudes
pbest_bootstrap = np.zeros(shape=(100, 2))
for index in range(pbest_bootstrap.shape[0]):
    P = np.random.permutation(len(mjd))
    my_per.set_data(mjd, mag[P], err[P])
    my_per.frequency_grid_evaluation(fmin=0.0, fmax=4.0, fresolution=1e-3)
    my_per.finetune_best_frequencies(fresolution=1e-4,
                                     n_local_optima=pbest_bootstrap.shape[1])
    _, pbest_bootstrap[index, :] = my_per.get_best_frequencies()

# Fit a Gumbel distribution to the bootstrapped maxima and derive
# significance thresholds for the periodogram peaks
param = gumbel_r.fit(pbest_bootstrap.ravel())
rv = gumbel_r(loc=param[0], scale=param[1])
x = np.linspace(rv.ppf(0.001), rv.ppf(0.999), 100)
p_vals = [0.01, 0.05, 0.08]
sig1 = rv.ppf(1. - p_vals[0])
sig5 = rv.ppf(1. - p_vals[1])
sig8 = rv.ppf(1. - p_vals[2])

bestpower = pbest[0]
bestpower2 = pbest[1]
bestpower3 = pbest[2]
bestpower4 = pbest[3]
# exit(0)
print("Calculating distribution of scores for {} scrambled alignments.".format(N))
sscores = []
# Compute the N random (scrambled) alignments
for i in tqdm(range(N)):
    seqB = "".join(sample(seqB, len(seqB)))
    s, a, ma, ta = alignFunction(seqA, seqB, matScore, gapOpen, gapExtend,
                                 ScoreOnly=True)
    sscores.append(s)

# Fit an extreme value (Gumbel) distribution to the scrambled-alignment scores
miu, beta = gumbel_r.fit(sscores)
print("Length of sscores: ", len(sscores))
print("Computing histogram for {} scramble scores".format(N))
print("Max scrambled score:", max(sscores))
print("Min scrambled score:", min(sscores))
print("Median of scrambled scores:", np.median(sscores))
print("Gumbel miu:", miu)
print("Gumbel beta:", beta)
print("Probability of unscrambled score in a random alignment: ",
      1 - gumbel_r.cdf(uscore, miu, beta))
print()

# Generate the basename for save files
basename = "smith" if args.alignment_method == "local" else "needle"
basename += "_{}_{}_{}_{:3.1f}_{:3.1f}".format(
    N, len(seqA), smatrix, abs(gapOpen), abs(gapExtend))
fig, ax = plt.subplots()
ericeira_wts.wave_data['Tp'].hist(density=True, bins=np.arange(22))
ax.set_title('Tp')

#%% Extremes
ericeira_wts.plot_timeseries()
hmax = ericeira_wts.maxima()
hmax.plot()

#%%
from scipy.stats import gumbel_r
from scipy.stats import probplot
import statsmodels.distributions

loc, scale = gumbel_r.fit(hmax)

fig, ax = plt.subplots()
x = np.linspace(gumbel_r.ppf(0.01, loc=loc, scale=scale),
                gumbel_r.ppf(0.99, loc=loc, scale=scale), 100)
ax.plot(x, gumbel_r.pdf(x, loc=loc, scale=scale),
        'r-', lw=5, alpha=0.6, label='gumbel_r pdf')
ax.hist(hmax, density=True)

fig, ax = plt.subplots()
ax.plot(x, gumbel_r.cdf(x, loc=loc, scale=scale), 'r-',
        lw=5, alpha=0.6, label='gumbel_r cdf')
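# --- Hedged follow-up sketch: probplot is imported above but unused in this
# excerpt; a Gumbel probability plot of hmax against the fitted parameters
# could be drawn like this (scipy.stats.probplot takes the distribution via
# dist= and its loc/scale via sparams=).
fig, ax = plt.subplots()
probplot(hmax, sparams=(loc, scale), dist=gumbel_r, plot=ax)
ax.set_title('Gumbel probability plot of maxima')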
def StatisticalProperties(self, PathNodes, PathTS, StartDate, WarmUpPeriod,
                          SavePlots, SavePath, SeparateFiles=False,
                          Filter=False):
    """
    =============================================================================
    StatisticalProperties(PathNodes, PathTS, StartDate, WarmUpPeriod,
                          SavePlots, SavePath, SeparateFiles=False,
                          Filter=False)
    =============================================================================
    StatisticalProperties reads the SWIM output file (.dat file) that contains
    the time series of discharge for some computational nodes and calculates
    some statistical properties.

    The code assumes that the time series has a daily temporal resolution and
    that the hydrological year is 1-Nov/31-Oct (Petrow and Merz, 2009, JoH).

    Parameters
    ----------
    1-PathNodes : [String]
        the name of the file which contains the IDs of the computational
        nodes you want to do the statistical analysis for; the ObservedFile
        should contain the discharge time series of these nodes in order.
    2-PathTS : [String]
        the name of the SWIM result file (the .dat file).
    3-StartDate : [string]
        the beginning date of the time series.
    4-WarmUpPeriod : [integer]
        the number of days to neglect at the beginning of the simulation
        (warm-up period).
    5-SavePlots : [Bool]
        whether to save the fitted-distribution plots.
    6-SavePath : [String]
        the path where you want to save the statistical properties.

    Returns
    -------
    1-Statistical Properties.csv:
        file containing some statistical properties like mean, std, min, 5%,
        25%, median, 75%, 95%, max, t_beg, t_end, nyr, q1.5, q2, q5, q10,
        q25, q50, q100, q200, q500.
    """
    ComputationalNodes = np.loadtxt(PathNodes, dtype=np.uint16)
    # hydrographs
    if SeparateFiles:
        ObservedTS = pd.DataFrame()
        for i in range(len(ComputationalNodes)):
            ObservedTS.loc[:, int(ComputationalNodes[i])] = np.loadtxt(
                PathTS + "/" + str(int(ComputationalNodes[i])) + '.txt')  # ,skiprows = 0
        StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
        EndDate = StartDate + dt.timedelta(days=ObservedTS.shape[0] - 1)
        ind = pd.date_range(StartDate, EndDate)
        ObservedTS.index = ind
    else:
        ObservedTS = pd.read_csv(PathTS, delimiter=r'\s+', header=None)
        StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
        EndDate = StartDate + dt.timedelta(days=ObservedTS.shape[0] - 1)
        ObservedTS.index = pd.date_range(StartDate, EndDate, freq="D")
        # delete the first two columns
        del ObservedTS[0], ObservedTS[1]
        ObservedTS.columns = ComputationalNodes

    # neglect the first year (warm-up year) in the time series
    ObservedTS = ObservedTS.loc[StartDate + dt.timedelta(days=WarmUpPeriod):EndDate, :]

    # List of the table output, including some general data and the return periods.
    col_csv = ['mean', 'std', 'min', '5%', '25%', 'median',
               '75%', '95%', 'max', 't_beg', 't_end', 'nyr']
    rp_name = ['q1.5', 'q2', 'q5', 'q10', 'q25', 'q50',
               'q100', 'q200', 'q500']
    col_csv = col_csv + rp_name

    # In a table where duplicates are removed (np.unique), find the number of
    # gauges contained in the .csv file.
    # no_gauge = len(ComputationalNodes)

    # Declare a dataframe for the output file, with the gauge numbers as index
    # and all the output names as columns.
    StatisticalPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                 columns=col_csv)
    StatisticalPr.index.name = 'ID'
    DistributionPr = pd.DataFrame(np.nan, index=ComputationalNodes,
                                  columns=['loc', 'scale'])
    DistributionPr.index.name = 'ID'

    # required return periods, one per entry of rp_name
    T = [1.5, 2, 5, 10, 25, 50, 100, 200, 500]
    T = np.array(T)
    # these values are the non-exceedance probabilities (F) of the chosen
    # return periods
    # F = [1/3, 0.5, 0.8, 0.9, 0.96, 0.98, 0.99, 0.995, 0.998]
    F = 1 - (1 / T)

    # Iteration over all the gauge numbers.
    for i in ComputationalNodes:
        QTS = ObservedTS.loc[:, i]
        # The time series is resampled to the annual maxima and turned into a
        # numpy array.
        # The hydrological year is 1-Nov/31-Oct (from Petrow and Merz, 2009, JoH).
        amax = QTS.resample('A-OCT').max().values
        if type(Filter) != bool:
            amax = amax[amax != Filter]

        # A Gumbel distribution is fitted to the annual maxima
        param_dist = gumbel_r.fit(amax)
        DistributionPr.loc[i, 'loc'] = param_dist[0]
        DistributionPr.loc[i, 'scale'] = param_dist[1]

        # Return periods from the fitted distribution are stored.
        # get the discharge corresponding to the return periods
        Qrp = gumbel_r.ppf(F, loc=param_dist[0], scale=param_dist[1])
        # to get the non-exceedance probability for a specific value:
        # gumbel_r.cdf(Qrp, loc=param_dist[0], scale=param_dist[1])
        # then calculate T (the return period)
        T = 1 / (1 - F)

        # Plot the histogram and the fitted distribution, save it for each gauge.
        Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
        pdf_fitted = gumbel_r.pdf(Qx, loc=param_dist[0], scale=param_dist[1])
        if SavePlots:
            plt.plot(Qx, pdf_fitted, 'r-')
            plt.hist(amax, density=True)
            plt.savefig(SavePath + "/" + "Figures/" + str(i) + '.png',
                        format='png')
            plt.close()

        StatisticalPr.loc[i, 'mean'] = QTS.mean()
        StatisticalPr.loc[i, 'std'] = QTS.std()
        StatisticalPr.loc[i, 'min'] = QTS.min()
        StatisticalPr.loc[i, '5%'] = QTS.quantile(0.05)
        StatisticalPr.loc[i, '25%'] = QTS.quantile(0.25)
        StatisticalPr.loc[i, 'median'] = QTS.quantile(0.50)
        StatisticalPr.loc[i, '75%'] = QTS.quantile(0.75)
        StatisticalPr.loc[i, '95%'] = QTS.quantile(0.95)
        StatisticalPr.loc[i, 'max'] = QTS.max()
        StatisticalPr.loc[i, 't_beg'] = QTS.index.min()
        StatisticalPr.loc[i, 't_end'] = QTS.index.max()
        StatisticalPr.loc[i, 'nyr'] = (StatisticalPr.loc[i, 't_end'] -
                                       StatisticalPr.loc[i, 't_beg']).days / 365.25
        for irp, irp_name in zip(Qrp, rp_name):
            StatisticalPr.loc[i, irp_name] = irp
        # Print to the prompt to check progress.
        print("Gauge", i, "done.")

    # Output files
    StatisticalPr.to_csv(SavePath + "/" + "Statistical Properties.csv")
    self.StatisticalPr = StatisticalPr
    DistributionPr.to_csv(SavePath + "/" + "DistributionProperties.csv")
    self.DistributionPr = DistributionPr
number_value_smaller = 0
# count the number of permutation scores smaller than the reference score
for score in permutation_scores:
    if score < reference_score:
        number_value_smaller += 1
# the number of total samples + 1
number_samples = len(permutation_scores) + 1
# calculate the empirical p-value
p_value = 1 - (float(number_value_smaller) / float(number_samples))
print(p_value)

# Task 5: Compute the associated p-value using a fitted Gumbel distribution
# estimate the loc and scale parameters using the fit function
loc, scale = gumbel_r.fit(permutation_scores)
# calculate the p-value
p_value = 1 - gumbel_r.cdf(reference_score, loc=loc, scale=scale)
print(p_value)

# Task 6: Plot the histogram and the fitted probability density function
# with the reference score as a vertical line
fig, ax = plt.subplots(1, 1)
x = np.linspace(gumbel_r.ppf(0.01, loc=loc, scale=scale),
                gumbel_r.ppf(0.99, loc=loc, scale=scale), 1000)
ax.plot(x, gumbel_r.pdf(x, loc=loc, scale=scale), 'k-', lw=2,
        label='frozen pdf')
ax.hist(permutation_scores, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
ax.axvline(reference_score)
def compare_to_blank(blank_model_size, p_val=0.05, sparse_rounds=False,
                     interactome_interface_instance=None):
    """
    Recovers the statistics on the circulation nodes and shows the visual of a
    circulation system. There is no issue with using the same interactome
    interface instance, because instances are forked when threads are
    generated and will not interfere.

    :param blank_model_size: the number of uniprots in the blank model
    :param p_val: desired p-value for the returned terms
    :param sparse_rounds: if set to a number, the sparse computation technique
        is used with the number of rounds equal to the integer value of that
        argument
    :param interactome_interface_instance:
    :return: None if no significant nodes, the node and group characteristic
        dictionaries otherwise
    """
    def get_max_for_each_degree(sample_sub_array):
        degrees = np.unique(sample_sub_array[1, :])
        max_array = []
        for degree in degrees:
            mask = sample_sub_array[1, :] == degree
            max_array.append([sample_sub_array[0, mask].max(), degree])
        m_arr = np.array(max_array)
        return m_arr.T

    if interactome_interface_instance is None:
        interactome_interface_instance = InteractomeInterface(True, True)
        interactome_interface_instance.fast_load()
    md5_hash = interactome_interface_instance.md5_hash()

    background_sub_array_list = []
    max_sub_array_list = []
    count = 0

    log.info("looking to test against:"
             "\t size: %s \t sys_hash: %s \t sparse_rounds: %s"
             % (blank_model_size, md5_hash, sparse_rounds))
    log.info("samples found to test against:\t %s" %
             interactome_rand_samp_db.find({'size': blank_model_size,
                                            'sys_hash': md5_hash,
                                            'sparse_rounds': sparse_rounds}).count())

    for i, sample in enumerate(interactome_rand_samp_db.find(
            {'size': blank_model_size,
             'sys_hash': md5_hash,
             'sparse_rounds': sparse_rounds})):
        _, node_currents = pickle.loads(sample['currents'])
        dictionary_system = interactome_interface_instance.format_node_props(
            node_currents, limit=0)
        background_sub_array = list(dictionary_system.values())
        background_sub_array_list.append(np.array(background_sub_array).T)
        max_arr = get_max_for_each_degree(np.array(background_sub_array).T)
        max_sub_array_list.append(max_arr)
        count = i

    # This part declares the pre-operators required for the verification of a
    # real sample
    background_array = np.concatenate(tuple(background_sub_array_list), axis=1)
    max_array = np.concatenate(tuple(max_sub_array_list), axis=1)

    node_currents = interactome_interface_instance.node_current
    dictionary_system = interactome_interface_instance.format_node_props(node_currents)
    curr_inf_conf_tot = np.array(
        [[int(key)] + list(val) for key, val in dictionary_system.items()]).T
    node_ids, query_array = (curr_inf_conf_tot[0, :],
                             curr_inf_conf_tot[(1, 2), :])

    log.info("stats on %s samples" % count)

    background_density = kde_compute(background_array[(1, 0), :], 50, count)
    base_bi_corr = background_array[(0, 1), :]

    r_rels = []
    r_std_nodes = []

    # TODO: idea for improved statistics: cluster a test node of degree k
    # with the 100 nodes with the closest degrees
    samples_scatter_and_hist(background_array, query_array)

    degrees = np.unique(query_array[1, :])
    combined_p_vals = np.ones_like(query_array[1, :])

    for degree in degrees.tolist():
        mask = query_array[1, :] == degree
        entry = query_array[:, mask]
        background_set = background_array[:, background_array[1, :] == degree]
        max_set = max_array[:, max_array[1, :] == degree]

        # Fit a Gumbel distribution to the per-sample maxima for this degree
        # and convert the query currents into upper-tail p-values
        params = gumbel_r.fit(max_set[0, :])
        mu = params[-2]
        beta = params[-1]

        frozen_gumbel = gumbel_r(loc=mu, scale=beta)
        p_vals = 1 - frozen_gumbel.cdf(entry[0, :])
        combined_p_vals[mask] = p_vals
        # TODO: insert into appropriate locations => we will assume that the
        # order is preserved
        # samples_scatter_and_hist(max_set, entry)

    r_nodes = background_density(query_array[(1, 0), :])  # this was used as a p-value, which is problematic
    r_nodes = combined_p_vals

    for point in query_array.T:
        selector = np.logical_and(base_bi_corr[1, :] > point[1] * 0.9,
                                  base_bi_corr[1, :] < point[1] * 1.1)
        r_rels.append(point[0] / np.mean(base_bi_corr[0, selector]))
        r_std_nodes.append((point[0] - np.mean(base_bi_corr[0, selector]))
                           / np.std(base_bi_corr[0, selector]))

    r_rels = np.array(r_rels)
    r_std_nodes = np.array(r_std_nodes)

    not_random_nodes = [node_id for node_id in node_ids[r_nodes < p_val].tolist()]

    # basically, the second element below is the set of nodes that contribute
    # to the information flow through the node considered non-random
    log.debug('debug, not random nodes: %s', not_random_nodes)
    log.debug('debug bulbs_id_disp_name: %s',
              list(interactome_interface_instance.neo4j_id_2_display_name.items())[:10])

    node_char_list = [
        [int(nr_node_id),
         interactome_interface_instance.neo4j_id_2_display_name[nr_node_id]]
        + dictionary_system[nr_node_id]
        + r_nodes[node_ids == float(nr_node_id)].tolist()
        for nr_node_id in not_random_nodes]

    nodes_dict = np.hstack((node_ids[:, np.newaxis], r_nodes[:, np.newaxis],
                            r_rels[:, np.newaxis], r_std_nodes[:, np.newaxis]))
    nodes_dict = dict((node[0], (node[1], node[2], node[3]))
                      for node in nodes_dict.tolist())
    nodes_dict = defaultdict(lambda: (1., 0., 0.), nodes_dict)
    # corresponds to the cases of super-low flow - never significant

    # TODO: pull the groups corresponding to non-random associations.

    return sorted(node_char_list, key=lambda x: x[4]), nodes_dict
import numpy as np
from matplotlib import pyplot as plt
from scipy.stats import gumbel_r
from scipy.stats import gumbel_l
from scipy.stats import genextreme

dataN = np.loadtxt("../data/Qdaily.txt")

# plot the fitted cdf on Gumbel reduced-variate axes: -ln(-ln(F)) is linear
# in x for a Gumbel-distributed sample
x_pdf = np.linspace(np.min(dataN), np.max(dataN), num=100)
param = gumbel_r.fit(dataN)
cdf1 = gumbel_r.cdf(x_pdf, *param[:-2], loc=param[-2], scale=param[-1])
plt.plot(x_pdf, -np.log(-np.log(cdf1)), 'o')
print(param)

# empirical cdf from a histogram, on the same reduced-variate axes
num_bins = 200
counts, bin_edges = np.histogram(dataN, bins=num_bins)
cdf2 = np.cumsum(counts) / np.sum(counts)
plt.plot(bin_edges[1:], -np.log(-np.log(cdf2)), 'x')
plt.show()
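# --- Hedged follow-up sketch (not in the excerpt): on the reduced-variate
# axes above the fitted cdf is a straight line, and return levels for a
# chosen return period T (in units of the blocks being fitted) can be read
# off directly with the inverse cdf.
T = np.array([10.0, 50.0, 100.0])     # illustrative return periods
F = 1 - 1 / T                         # non-exceedance probabilities
return_levels = gumbel_r.ppf(F, loc=param[-2], scale=param[-1])
print(dict(zip(T, return_levels)))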
def lake_dike_system(h_0, forcings, parameters_cc, lake_par, dike_par,
                     wind_par, policy):
    """Simulate the entire lake + dike system.

    Args:
        h_0 (float): Initial condition, water level in the lake at t = 0
        forcings (pd.DataFrame): External forcings historically observed
        parameters_cc (dict): parameters that set the change in the
            historically observed forcings
        lake_par (dict): parameters for the model of the lake
            K:
            A: lake surface  # TODO modify
        dike_par (dict): parameters for the model of the dike
            slope:
            crown_height:
            gamma_b:
            gamma_beta:
            gamma_f:
            q_critical:
        wind_par (dict): parameters for the model of the wind effects on the
            water level
        policy (dict):
            pumping_capacity
            h_target

    Returns:
        F (float): Frequency of dike failure
        supply_relative_deficit (float): Relative water supply deficit over
            the simulation horizon
    """
    # implement policy structural actions
    lake_par['K'] = lake_par['K'] * policy['sluices widening']
    dike_par['height'] = dike_par['height'] + policy['raise dikes']

    # model of water demand
    # oversimplified model: water demand proportional to potential evaporation
    forcings['water demand'] = 0
    # max demand (estimated) / max potential evaporation (under stationary conditions)
    k_demand = 500 / 0.005  # m^3/s / ???
    forcings['water demand'] = forcings['potential evaporation'] * k_demand

    # model of rainfall-runoff
    S_lat = 1419 * 1000000  # km^2 to m^2
    alpha = 0.8
    forcings['inflow lateral'] = (forcings['precipitation'] * S_lat * alpha
                                  / lake_par['Delta_t'])  # rational formula

    # Variate forcings (bottom-up climate change analysis)
    forcings_cc = variate_forcings(forcings, parameters_cc)

    # model of lake
    model_output = lake_sim(h_0, forcings_cc, lake_par,
                            wind_par['Afsluitdijk'], policy)

    # supply deficit
    water_demand_daily = forcings_cc['water demand'].resample('D').mean()
    supply_relative_deficit = (sum(water_demand_daily - model_output['water supply'])
                               / sum(water_demand_daily))
    # ((forcings.index[-1] - forcings.index[0]).days / 365.25)  # simulation horizon, in years

    # Dike boundary conditions: fit a Gumbel distribution to the yearly
    # maximum water levels
    h_year_max = yearly_max_wl(model_output['average water level'],
                               forcings.resample('D').mean(),
                               wind_par['Roggebotsluizen'])
    mu_wl, sigma_wl = gumbel_r.fit(h_year_max.values)
    water_level_pdf = gumbel_r.freeze(loc=mu_wl, scale=sigma_wl)  # TEST THIS

    # Dike failure
    F = frequency_failure(water_level_pdf, dike_par, base_year=1000, N=1000,
                          constant_waves=True)

    return (F, supply_relative_deficit)