Example 1
from scipy.stats import gumbel_r


def gumbel_max_test(X, y, min_count=5):
    # _check_Xy and _groupby_agg are module-internal helpers of the source
    # project: the first validates the inputs and standardizes y, the second
    # aggregates y per group of x and applies `func` to the result
    x, y = _check_Xy(X, y, norm_y=True)
    min_count = max(min_count, 5)  # enforce a floor of 5 observations per group
    return _groupby_agg(x, y, 'max', min_count=min_count,
                        func=lambda df: df.__setitem__(
                            'pval', gumbel_r.cdf(df.pop('max').values)))
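Because _check_Xy and _groupby_agg are private to the source project, the function above is not runnable on its own. A minimal self-contained sketch of the same grouped-max idea in plain pandas, on made-up data:

import numpy as np
import pandas as pd
from scipy.stats import gumbel_r

rng = np.random.default_rng(0)
df = pd.DataFrame({'x': rng.integers(0, 3, size=200),
                   'y': rng.normal(size=200)})
df['y'] = (df['y'] - df['y'].mean()) / df['y'].std()  # rough analogue of norm_y=True
grouped = df.groupby('x')['y'].agg(['max', 'count'])
grouped = grouped[grouped['count'] >= 5]               # the min_count filter
grouped['pval'] = gumbel_r.cdf(grouped['max'].values)  # as in the lambda above
print(grouped)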
Example 2

    def _cond_ext_pdf(self, x, i):
        # conditional extreme-value CDF
        # x: the point at which to evaluate
        # i: index of the largest component (quantile-wise)

        # transform data to Gumbel margins first
        y = self._to_gumbel(x)
        j = 1 if i == 0 else 0  # the other, smaller component

        # integrate the asymptotic density of Y_j | Y_i = y_i w.r.t. a standard
        # Gumbel from y[i] to infinity, using the fact that exp(-Y) ~ Exp(1)
        # when Y is standard Gumbel

        return gumbel.cdf(y[j], loc=0, scale=1) + np.exp(-y[j]) - np.sum(
            np.exp(-y / self.alpha))**self.alpha
Example 3

def figure_gumbel_vs_normal():
    import numpy as np
    import matplotlib.pyplot as plt
    import xmle
    from scipy.stats import gumbel_r, norm

    fig, axs = plt.subplots(1, 2, figsize=(7, 3), squeeze=True)

    x = np.linspace(-3, 5, 100)
    axs[0].plot(x, gumbel_r.pdf(x), label='Gumbel')
    # normal with the same mean (0.5772, the Euler-Mascheroni constant) and
    # standard deviation (pi/sqrt(6) = 1.2825) as the standard Gumbel
    axs[0].plot(x, norm.pdf(x, 0.577, 1.282), label='Normal')
    axs[0].set_title("Probability Density Functions\n(same mean and variance)")
    axs[0].legend()

    axs[1].plot(x, gumbel_r.cdf(x), label='Gumbel')
    axs[1].plot(x, norm.cdf(x, 0.577, 1.282), label='Normal')
    axs[1].set_title("Cumulative Distribution Functions\n(same mean and variance)")
    axs[1].legend()
    return xmle.Show(fig)
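Where the constants 0.577 and 1.282 come from: a standard Gumbel has mean equal to the Euler-Mascheroni constant and standard deviation pi/sqrt(6). A quick check using nothing beyond numpy and scipy:

import numpy as np
from scipy.stats import gumbel_r

mean, var = gumbel_r.stats(moments='mv')
print(float(mean), float(np.sqrt(var)))    # ~0.5772, ~1.2825
print(np.euler_gamma, np.pi / np.sqrt(6))  # the closed-form values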
Example 4
    def ReturnPeriod(self, MapsPath, prefix, DistributionPrF, TraceF, SubsF,
                     replacementF, HydrologicalInputsPath, SubIDMapF,
                     ExtraSubsF, Fromfile, Tofile, SaveTo, wpath):

        AllResults = os.listdir(MapsPath)
        # keep only the max-depth files (those whose names start with the prefix)
        MaxDepthList = [f for f in AllResults if f.startswith(prefix)]
        # Read Inputs
        # read the distribution parameters for each upstream computational node
        DistributionPr = pd.read_csv(DistributionPrF)
        USnode = pd.read_csv(TraceF, header=None)
        USnode.columns = ['SubID', 'US', 'DS']
        # get the sub-basin IDs from the guide file; it has the same shape in RIM1.0 and RIM2.0
        SubsID = pd.read_csv(SubsF, header=None, usecols=[0])

        ReplacementSub = pd.read_csv(replacementF)

        # read the hydrograph for all the US nodes
        #StartDate = "1950-1-1"
        #StartDate = dt.datetime.strptime(StartDate,"%Y-%m-%d")
        #ind = pd.date_range(StartDate, StartDate + dt.timedelta(days = NoYears*365), freq = "D")

        ind = range(
            1,
            len(
                pd.read_csv(HydrologicalInputsPath + "/" +
                            str(int(USnode.loc[SubsID.loc[10, 0] - 1, 'US'])) +
                            ".txt").values))

        Hydrographs = pd.DataFrame(index=ind, columns=SubsID[0].to_list())

        for i in range(len(SubsID)):
            #    i=1
            # the SubIDs in USnode are listed in order, so SubID = 343 is found
            # in row 342 (SubID - 1)
            # np.where(USnode['SubID'] == SubsID.loc[i,0])
            try:
                if int(USnode.loc[SubsID.loc[i, 0] - 1, 'US']) != -1:
                    Hydrographs.loc[:, SubsID.loc[i, 0]] = pd.read_csv(
                        HydrologicalInputsPath + "/" +
                        str(int(USnode.loc[SubsID.loc[i, 0] - 1, 'US'])) +
                        ".txt").values[:len(Hydrographs)]
            except Exception:
                OtherSubLoc = np.where(
                    ReplacementSub['missing'] == SubsID.loc[i, 0])[0][0]
                if int(USnode.loc[ReplacementSub.loc[OtherSubLoc,
                                                     'replacement'] - 1,
                                  'US']) != -1:
                    Hydrographs.loc[:, SubsID.loc[i, 0]] = pd.read_csv(
                        HydrologicalInputsPath + "/" + str(
                            int(USnode.loc[ReplacementSub.loc[OtherSubLoc,
                                                              'replacement'] -
                                           1, 'US'])) +
                        ".txt").values[:len(Hydrographs)]

        # read sub basin map id
        SubIDMap = gdal.Open(SubIDMapF)
        SubIDMapV = SubIDMap.ReadAsArray()
        #NoValue = SubIDMap.GetRasterBand(1).GetNoDataValue()
        #SubIDMapV[SubIDMapV == NoValue] = 0
        #plt.imshow(SubIDMapV)

        # read the added subs reference text file
        ExtraSubs = pd.read_csv(ExtraSubsF)

        # self.StringSpace (used below) formats the numbers for the ASCII file

        # read the max-depth maps
        check = list()
        Klist = list()

        if Tofile == "end" or Tofile > len(MaxDepthList):
            Tofile = len(MaxDepthList)

        #Fromfile = 48
        #Tofile = Fromfile +1

        for k in range(Fromfile, Tofile):

            try:
                # open the zip file
                Compressedfile = zipfile.ZipFile(MapsPath + "/" +
                                                 MaxDepthList[k])
            except Exception:
                print("Error opening the compressed file " + MaxDepthList[k])
                check.append(MaxDepthList[k][len(prefix):-4])
                Klist.append(k)
                continue

            # get the file name
            fname = Compressedfile.infolist()[0]
            # get the time step from the file name
            timestep = int(fname.filename[len(prefix):-4])
            print("File= " + str(timestep))

            ASCIIF = Compressedfile.open(fname)
            f = ASCIIF.readlines()
            SpatialRef = f[:6]
            ASCIIRaw = f[6:]
            # ASCIIF = Compressedfile.open(fname)
            # ASCIIRaw = ASCIIF.readlines()[6:]
            rows = len(ASCIIRaw)
            cols = len(ASCIIRaw[0].split())
            MaxDepth = np.ones((rows, cols), dtype=np.float32)
            # read the ascii file
            for i in range(rows):
                x = ASCIIRaw[i].split()
                MaxDepth[i, :] = list(map(float, x))

            # check on the values of the water depth
        #    if np.shape(MaxDepth[np.isnan(MaxDepth)])[0] > 0:
        #        check.append(timestep)
        #        print("Error Check Max Depth values")
        #        continue

        # plotting to check values
        #    fromrow = np.where(MaxDepth == MaxDepth.max())[0][0]
        #    fromcol = np.where(MaxDepth == MaxDepth.max())[1][0]
        #    plt.imshow(MaxDepth[fromrow-20:fromrow+20,fromcol-20:fromcol+20])
        #    plt.imshow(MaxDepth)
        #    plt.colorbar()

        # get the Peak of the hydrograph for the whole event
        # (14 days before the end of the event)
            MaxValuedf = Hydrographs.loc[timestep - 14:timestep, :]
            MaxValues = MaxValuedf.max().values.tolist()
            T = list()

            # calculate the return period for the max Q at this time step for each sub-basin
            for i in range(len(MaxValues)):
                # if the sub basin is a lateral and not routed in RIM it will not have a
                # hydrograph
                if np.isnan(MaxValues[i]):
                    T.append(np.nan)
                else:
                    #np.where(USnode['SubID'] == SubsID.loc[i,0])
                    try:
                        DSnode = USnode.loc[SubsID.loc[i, 0] - 1, 'US']
                        loc = np.where(DistributionPr['ID'] == DSnode)[0][0]
                    except IndexError:
                        OtherSubLoc = np.where(
                            ReplacementSub['missing'] == SubsID.loc[i,
                                                                    0])[0][0]
                        DSnode = USnode.loc[ReplacementSub.loc[OtherSubLoc,
                                                               'replacement'] -
                                            1, 'US']
                        loc = np.where(DistributionPr['ID'] == DSnode)[0][0]

                    # to get the Non Exceedance probability for a specific Value
                    F = gumbel_r.cdf(MaxValues[i],
                                     loc=DistributionPr.loc[loc, 'loc'],
                                     scale=DistributionPr.loc[loc, 'scale'])
                    # then calculate the return period T = 1/(1-F)
                    T.append(round(1 / (1 - F), 2))

            try:
                RetunPeriodMap = np.zeros((rows, cols), dtype=np.float32)
                for i in range(rows):
                    for j in range(cols):
                        # print("i = " + str(i) + ", j= " + str(j))
                        if not np.isnan(MaxDepth[i, j]):
                            if MaxDepth[i, j] > 0:
                                # print("i = " + str(i) + ", j= " + str(j))
                                # if the sub basin is in the Sub ID list
                                if SubIDMapV[i, j] in SubsID[0].tolist():
                                    # print("Sub = " + str(SubIDMapV[i,j]))
                                    # go get the return period directly
                                    RetunPeriodMap[i, j] = T[np.where(
                                        SubsID[0] == SubIDMapV[i, j])[0][0]]
                                else:
                                    # print("Extra  Sub = " + str(SubIDMapV[i,j]))
                                    # the sub ID is one of the added subs not routed by RIM
                                    # so it existed in the ExtraSubs list with a reference to
                                    # a SubID routed by RIM
                                    RIMSub = ExtraSubs.loc[np.where(
                                        ExtraSubs['addSub'] == SubIDMapV[
                                            i, j])[0][0], 'RIMSub']
                                    RetunPeriodMap[i, j] = T[np.where(
                                        SubsID[0] == RIMSub)[0][0]]
            except Exception:
                print("Error at timestep " + str(timestep))
                check.append(timestep)
                Klist.append(k)
                continue

            # save the return period ASCII file
            fname = "ReturnPeriod" + str(timestep) + ".asc"

            with open(SaveTo + "/" + fname, 'w') as File:
                # write the first lines
                for i in range(len(SpatialRef)):
                    File.write(str(SpatialRef[i].decode()[:-2]) + "\n")

                for i in range(np.shape(RetunPeriodMap)[0]):
                    File.writelines(
                        list(map(self.StringSpace, RetunPeriodMap[i, :])))
                    File.write("\n")

            # zip the file
            with zipfile.ZipFile(SaveTo + "/" + fname[:-4] + ".zip", "w",
                                 zipfile.ZIP_DEFLATED) as newzip:
                newzip.write(SaveTo + "/" + fname, arcname=fname)
            # delete the file
            os.remove(SaveTo + "/" + fname)

        check = list(zip(check, Klist))
        if len(check) > 0:
            np.savetxt(wpath + "CheckWaterDepth.txt", check, fmt='%6d')
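Stripped of the file handling, the statistical core of the loop above is two lines: F from the fitted Gumbel CDF, then T = 1/(1-F). A stand-alone sketch with made-up parameters:

from scipy.stats import gumbel_r

loc, scale = 120.0, 35.0   # hypothetical fitted Gumbel parameters for one node
q_peak = 260.0             # hypothetical peak discharge of the event

F = gumbel_r.cdf(q_peak, loc=loc, scale=scale)  # non-exceedance probability
T = 1.0 / (1.0 - F)                             # return period in years
print(round(T, 2))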
Example 5
    def StatisticalProperties(self,
                              PathNodes,
                              PathTS,
                              StartDate,
                              WarmUpPeriod,
                              SavePlots,
                              SavePath,
                              SeparateFiles=False,
                              Filter=False,
                              Distibution="GEV",
                              EstimateParameters=False,
                              Quartile=0,
                              RIMResults=False,
                              SignificanceLevel=0.1):
        """
        =============================================================================
          StatisticalProperties(PathNodes, PathTS, StartDate, WarmUpPeriod, SavePlots, SavePath,
                              SeparateFiles=False, Filter=False, Distibution="GEV",
                              EstimateParameters=False, Quartile=0, RIMResults=False,
                              SignificanceLevel=0.1)
        =============================================================================

        StatisticalProperties reads the SWIM output file (the .dat file) that
        contains the discharge time series of some computational nodes
        and calculates several statistical properties.

        The code assumes a daily temporal resolution and a hydrological year
        running 1-Nov/31-Oct (Petrow and Merz, 2009, JoH).

        Parameters
        ----------
            1-PathNodes : [String]
                the name of the file which contains the IDs of the computational
                nodes you want the statistical analysis for; the file given in
                PathTS should contain the discharge time series of these nodes in order.
            2-PathTS : [String]
                the name of the SWIM result file (the .dat file).
            3-StartDate : [string]
                the beginning date of the time series.
            4-WarmUpPeriod : [integer]
                the number of days to neglect at the beginning of the
                simulation (warm-up period).
            5-SavePlots : [Bool]
                if True, save the fitted-distribution plots for each gauge.
            6-SavePath : [String]
                the path where you want to  save the statistical properties.
            7-SeparateFiles: [Bool]
                if the discharge data are stored in separate files rather than
                in one file, SeparateFiles should be True. default [False].
            8-Filter: [Bool/Number]
                observed or RIM-result data may contain gaps (days where the model
                did not run, or missing observations) filled with a specific value;
                to ignore those days, set Filter to that value. default [False].
            9-RIMResults: [Bool]
                whether the files are RIM results or observed data, as the two
                formats differ. default [False].
            10-Distibution: [String]
                the distribution to fit, "GEV" or "Gumbel". default ["GEV"].
            11-EstimateParameters: [Bool]
                if True, estimate the parameters through an optimization using a
                threshold instead of a maximum-likelihood fit. default [False].
            12-Quartile: [Float]
                the quantile used as threshold when EstimateParameters is True.
                default [0].
            13-SignificanceLevel: [Float]
                significance level of the confidence interval shown in the
                probability plot. default [0.1].

        Returns
        -------
            1-Statistical Properties.csv:
                file containing some statistical properties like mean, std, min, 5%, 25%,
                median, 75%, 95%, max, t_beg, t_end, nyr, q1.5, q2, q5, q10, q25, q50,
                q100, q200, q500.
            2-DistributionProperties.csv:
                file containing the fitted distribution parameters ('loc' and 'scale',
                plus the shape 'c' for GEV) for each node.
        """

        ComputationalNodes = np.loadtxt(PathNodes, dtype=np.uint16)
        # hydrographs
        if SeparateFiles:
            TS = pd.DataFrame()
            if RIMResults:
                for i in range(len(ComputationalNodes)):
                    TS.loc[:, int(ComputationalNodes[i])] = self.ReadRIMResult(
                        PathTS + "/" + str(int(ComputationalNodes[i])) +
                        '.txt')
            else:
                for i in range(len(ComputationalNodes)):
                    TS.loc[:, int(ComputationalNodes[i])] = np.loadtxt(
                        PathTS + "/" + str(int(ComputationalNodes[i])) +
                        '.txt')

            StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
            EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
            ind = pd.date_range(StartDate, EndDate)
            TS.index = ind
        else:
            TS = pd.read_csv(PathTS, delimiter=r'\s+', header=None)
            StartDate = dt.datetime.strptime(StartDate, "%Y-%m-%d")
            EndDate = StartDate + dt.timedelta(days=TS.shape[0] - 1)
            TS.index = pd.date_range(StartDate, EndDate, freq="D")
            # delete the first two columns
            del TS[0], TS[1]
            TS.columns = ComputationalNodes

        # neglect the warm-up period at the beginning of the time series
        TS = TS.loc[StartDate + dt.timedelta(days=WarmUpPeriod):EndDate, :]

        # List of the table output, including some general data and the return periods.
        col_csv = [
            'mean', 'std', 'min', '5%', '25%', 'median', '75%', '95%', 'max',
            't_beg', 't_end', 'nyr'
        ]
        rp_name = [
            'q1.5', 'q2', 'q5', 'q10', 'q25', 'q50', 'q100', 'q200', 'q500',
            'q1000'
        ]
        col_csv = col_csv + rp_name

        # In a table where duplicates are removed (np.unique), find the number of
        # gauges contained in the .csv file.
        # no_gauge = len(ComputationalNodes)
        # Declare a dataframe for the output file, with the gauge numbers as index
        # and all the output names as columns.
        StatisticalPr = pd.DataFrame(np.nan,
                                     index=ComputationalNodes,
                                     columns=col_csv)
        StatisticalPr.index.name = 'ID'
        DistributionPr = pd.DataFrame(np.nan,
                                      index=ComputationalNodes,
                                      columns=['loc', 'scale'])
        DistributionPr.index.name = 'ID'
        # required return periods (must match rp_name above one-to-one)
        T = [1.5, 2, 5, 10, 25, 50, 100, 200, 500, 1000]
        T = np.array(T)
        # these values are the Non Exceedance probability (F) of the chosen
        # return periods F = 1 - (1/T)
        # Non Exceedance propabilities
        #F = [1/3, 0.5, 0.8, 0.9, 0.96, 0.98, 0.99, 0.995, 0.998]
        F = 1 - (1 / T)
        # Iteration over all the gauge numbers.
        for i in ComputationalNodes:
            QTS = TS.loc[:, i]
            # The time series is resampled to the annual maxima, and turned into a
            # numpy array.
            # The hydrological year is 1-Nov/31-Oct (from Petrow and Merz, 2009, JoH).
            amax = QTS.resample('A-OCT').max().values

            if not isinstance(Filter, bool):
                amax = amax[amax != Filter]
            if EstimateParameters:
                # estimate the parameters through an optimization
                # alpha = (np.sqrt(6) / np.pi) * amax.std()
                # beta = amax.mean() - 0.5772 * alpha
                # param_dist = [beta, alpha]
                threshold = np.quantile(amax, Quartile)
                if Distibution == "GEV":
                    print("Still to be finished later")
                else:
                    param = Gumbel.EstimateParameter(amax, Gumbel.ObjectiveFn,
                                                     threshold)
                    param_dist = [param[1], param[2]]

            else:
                # estimate the parameters through the maximum-likelihood method
                if Distibution == "GEV":
                    param_dist = genextreme.fit(amax)
                else:
                    # a Gumbel distribution is fitted to the annual maxima
                    param_dist = gumbel_r.fit(amax)

            if Distibution == "GEV":
                DistributionPr.loc[i, 'c'] = param_dist[0]
                DistributionPr.loc[i, 'loc'] = param_dist[1]
                DistributionPr.loc[i, 'scale'] = param_dist[2]
            else:
                DistributionPr.loc[i, 'loc'] = param_dist[0]
                DistributionPr.loc[i, 'scale'] = param_dist[1]

            # Return periods from the fitted distribution are stored.
            # get the Discharge coresponding to the return periods
            if Distibution == "GEV":
                Qrp = genextreme.ppf(F,
                                     param_dist[0],
                                     loc=param_dist[1],
                                     scale=param_dist[2])
            else:
                Qrp = gumbel_r.ppf(F, loc=param_dist[0], scale=param_dist[1])
            # to get the Non Exceedance probability for a specific Value
            # sort the amax
            amax.sort()
            # calculate F (the non-exceedance probability, Weibull plotting position)
            cdf_Weibul = ST.Weibul(amax)
            # the ProbapilityPlot method computes the theoretical quantiles from the fitted
            # distribution parameters and the empirical (Weibull) cdf, and the confidence interval
            if Distibution == "GEV":
                Qth, Qupper, Qlower = GEV.ProbapilityPlot(
                    param_dist, cdf_Weibul, amax, SignificanceLevel)
                # to calculate the F theoretical
                Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
                pdf_fitted = genextreme.pdf(Qx,
                                            param_dist[0],
                                            loc=param_dist[1],
                                            scale=param_dist[2])
                cdf_fitted = genextreme.cdf(Qx,
                                            param_dist[0],
                                            loc=param_dist[1],
                                            scale=param_dist[2])
            else:
                Qth, Qupper, Qlower = Gumbel.ProbapilityPlot(
                    param_dist, cdf_Weibul, amax, SignificanceLevel)
                # gumbel_r.interval(SignificanceLevel)
                # to calculate the F theoretical
                Qx = np.linspace(0, 1.5 * float(amax.max()), 10000)
                pdf_fitted = gumbel_r.pdf(Qx,
                                          loc=param_dist[0],
                                          scale=param_dist[1])
                cdf_fitted = gumbel_r.cdf(Qx,
                                          loc=param_dist[0],
                                          scale=param_dist[1])
            # then calculate the return period T = 1/(1-F)
            if SavePlots:
                fig = plt.figure(60, figsize=(20, 10))
                gs = gridspec.GridSpec(nrows=1, ncols=2, figure=fig)
                # Plot the histogram and the fitted distribution, save it for each gauge.
                ax1 = fig.add_subplot(gs[0, 0])
                ax1.plot(Qx, pdf_fitted, 'r-')
                ax1.hist(amax, density=True)
                ax1.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
                ax1.set_ylabel('pdf', fontsize=15)

                ax2 = fig.add_subplot(gs[0, 1])
                ax2.plot(Qx, cdf_fitted, 'r-')
                ax2.plot(amax, cdf_Weibul, '.-')
                ax2.set_xlabel('Annual Discharge(m3/s)', fontsize=15)
                ax2.set_ylabel('cdf', fontsize=15)

                plt.savefig(SavePath + "/" + "Figures/" + str(i) + '.png',
                            format='png')
                plt.close()

                fig = plt.figure(70, figsize=(10, 8))
                plt.plot(Qth,
                         amax,
                         'd',
                         color='#606060',
                         markersize=12,
                         label=Distibution + ' distribution')
                plt.plot(Qth,
                         Qth,
                         '^-.',
                         color="#3D59AB",
                         label="Weibull plotting position")
                if Distibution != "GEV":
                    plt.plot(Qth,
                             Qlower,
                             '*--',
                             color="#DC143C",
                             markersize=12,
                             label='Lower limit (' +
                             str(int(
                                 (1 - SignificanceLevel) * 100)) + " % CI)")
                    plt.plot(Qth,
                             Qupper,
                             '*--',
                             color="#DC143C",
                             markersize=12,
                             label='Upper limit (' +
                             str(int(
                                 (1 - SignificanceLevel) * 100)) + " % CI)")

                plt.legend(fontsize=15, framealpha=1)
                plt.xlabel('Theoretical Annual Discharge(m3/s)', fontsize=15)
                plt.ylabel('Annual Discharge(m3/s)', fontsize=15)
                plt.savefig(SavePath + "/" + "Figures/F-" + str(i) + '.png',
                            format='png')
                plt.close()

            StatisticalPr.loc[i, 'mean'] = QTS.mean()
            StatisticalPr.loc[i, 'std'] = QTS.std()
            StatisticalPr.loc[i, 'min'] = QTS.min()
            StatisticalPr.loc[i, '5%'] = QTS.quantile(0.05)
            StatisticalPr.loc[i, '25%'] = QTS.quantile(0.25)
            StatisticalPr.loc[i, 'median'] = QTS.quantile(0.50)
            StatisticalPr.loc[i, '75%'] = QTS.quantile(0.75)
            StatisticalPr.loc[i, '95%'] = QTS.quantile(0.95)
            StatisticalPr.loc[i, 'max'] = QTS.max()
            StatisticalPr.loc[i, 't_beg'] = QTS.index.min()
            StatisticalPr.loc[i, 't_end'] = QTS.index.max()
            StatisticalPr.loc[
                i, 'nyr'] = (StatisticalPr.loc[i, 't_end'] -
                             StatisticalPr.loc[i, 't_beg']).days / 365.25
            for irp, irp_name in zip(Qrp, rp_name):
                StatisticalPr.loc[i, irp_name] = irp

            # Print for prompt and check progress.
            print("Gauge", i, "done.")
        #
        # Output file
        StatisticalPr.to_csv(SavePath + "/" + "Statistical Properties.csv")
        self.StatisticalPr = StatisticalPr
        DistributionPr.to_csv(SavePath + "/" + "DistributionProperties.csv")
        self.DistributionPr = DistributionPr
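Stripped of file handling and plotting, the core of StatisticalProperties is: resample to annual maxima over the hydrological year, fit a distribution, and evaluate the quantiles at F = 1 - 1/T. A self-contained sketch on simulated daily discharge (all values made up):

import numpy as np
import pandas as pd
from scipy.stats import gumbel_r

rng = np.random.default_rng(0)
idx = pd.date_range("1990-01-01", "2019-12-31", freq="D")
QTS = pd.Series(rng.gamma(2.0, 50.0, size=len(idx)), index=idx)

amax = QTS.resample('A-OCT').max().values    # annual maxima, hydrological year ending in October
loc, scale = gumbel_r.fit(amax)              # maximum-likelihood Gumbel fit

T = np.array([2, 5, 10, 50, 100])            # return periods in years
F = 1 - 1 / T                                # non-exceedance probabilities
Qrp = gumbel_r.ppf(F, loc=loc, scale=scale)  # discharge at each return period
print(dict(zip(T.tolist(), np.round(Qrp, 1))))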
Example 6
import numpy as np
from scipy.stats import gumbel_r


def rank_histogram_cal(X, R, Thresh=None, gumbel_params=None):
    """Calculation of "Corrected" Forecast Probability Distribution Using Rank Histogram.

    refer to:
    Hamill, T. M. and S. J. Colucci (1998). "Evaluation of Eta–RSM Ensemble Probabilistic 
        Precipitation Forecasts." monthly weather review 126(3): 711-724.

    The computation scheme is as follows:
                R0,  R1,  R2,  R3,  R4,...,Rn,  R{n+1}
                   X0,  X1,  X2,  X3,  ,...,  Xn
    if  [0, Ta)                                                 , (Ta/X0)*R0
    if  [0,                Ta)                                  , R0+R1+(Ta-X1)/(X2-X1)*R2
    if  [0,                                               Ta)   , R0+R1+...+R{n-1} + (F(Ta)-F(Xn))/(1-F(Xn))*R{n+1}
    if  [Ta, Tb)                                                , ((Tb-Ta)/X0)*R0
    if                                                  [Ta, Tb), (F(Tb)-F(Ta))/(1-F(Xn))*R{n+1}
    if               [Ta,                                 Tb)   , (X1-Ta)/(X1-X0)*R1+R2+...+R{n-1} + (F(Tb)-F(Xn))/(1-F(Xn))*R{n+1}
    if               [Ta,       Tb)                             , (X1-Ta)/(X1-X0)*R1+R2+(Tb-X2)/(X3-X2)*R3
    if      [Ta,                                          inf)  , ((X0-Ta)/X0)*R0+R1+...+R{n+1}
    if                         [Ta,                       inf)  , ((X3-Ta)/(X3-X2))*R3+R4+...+R{n+1}
    if                                                  [Ta,inf), (1-F(Ta))/(1-F(Xn))*R{n+1}

    Args:
        X (np.array): 1d array, ensemble forecast, N member
        R (np.array): 1d array, the corresponding rank histogram, N+1 values,
                      computed from historical forecasts and observations.
        Thresh (np.array): 1d array, precipitation category thresholds
                           [T1, T2, ..., Tn]; T1 should be larger than 0.
        gumbel_params (list): Gumbel parameters [location, scale], estimated with
                              the method of moments (Wilks 1995). The probability mass
                              beyond the highest ensemble member is assumed to follow
                              a Gumbel distribution.

    Return:
        np.array, the probability of each category,
            [P(0 <= V < T1), P(T1 <= V < T2), ..., P(Tn <= V)].

    Examples:
        X = [0, 0, 0, 0, 0, 0, 0.02, 0.04, 0.05, 0.07, 0.10, 0.11, 0.23, 0.26, 0.35]
        R = [0.25, 0.13, 0.09, 0.07, 0.05, 0.05, 0.04, 0.04,0.03, 0.03, 0.03, 0.02, 0.02, 0.03, 0.05, 0.07]
        Thresh = [0.01, 0.1, 0.25, 0.5, 1.0, 2.0]
        print(rank_histogram_cal(X, R, Thresh=Thresh))
        # the answer should be [0.66, 0.15, 0.06, 0.11, 0.01, 0.0, 0.0]
    """

    # sort ensemble forecast
    X = np.sort(X)
    nX = X.size

    # set precipitation category thresholds.
    if Thresh is None:
        Thresh = [0.1, 10, 25, 50, 100, 250]
    Thresh = np.sort(Thresh)

    # set gumbel params
    # default parameters from Hamill(1998) paper.
    if gumbel_params is None:
        gumbel_params = [0.03, 0.0898]
    gumbel = lambda x: gumbel_r.cdf(
        x, loc=gumbel_params[0], scale=gumbel_params[1])

    # the probability of each category
    nt = Thresh.size
    P = np.zeros(nt + 1)

    # calculate P(0 <= V < T1)
    ind = np.searchsorted(X, Thresh[0])
    if ind == 0:
        P[0] = (Thresh[0] / X[0]) * R[0]
    elif ind == nX:
        P[0] = np.sum(R[0:nX]) + (gumbel(Thresh[0]) - gumbel(X[nX - 1])) / (
            1.0 - gumbel(X[nX - 1])) * R[nX]
    else:
        P[0] = np.sum(R[0:ind]) + (Thresh[0] -
                                   X[ind - 1]) / (X[ind] - X[ind - 1]) * R[ind]

    # calculate  P(T1 <= V < T2), ..., P(Tn-1 <= V < Tn)
    for it, _ in enumerate(Thresh[0:-1]):
        # get threshold range
        Ta = Thresh[it]
        Tb = Thresh[it + 1]

        if Tb < X[0]:
            P[it + 1] = ((Tb - Ta) / X[0]) * R[0]
        elif Ta >= X[-1]:
            P[it + 1] = (gumbel(Tb) - gumbel(Ta)) / (1.0 - gumbel(X[-1])) * R[nX]
        else:
            inda = np.searchsorted(X, Ta)
            indb = np.searchsorted(X, Tb)
            if indb == nX:
                P[it+1] = (X[inda] - Ta)/(X[inda]-X[inda-1])*R[inda] + \
                           np.sum(R[(inda+1):(indb)]) + (gumbel(Tb)-gumbel(X[-1]))/(1.0-gumbel(X[-1]))*R[nX]
            else:
                P[it+1] = (X[inda] - Ta)/(X[inda]-X[inda-1])*R[inda] + \
                           np.sum(R[(inda+1):(indb)]) + (Tb-X[indb-1])/(X[indb]-X[indb-1])*R[indb]

    # calculate P(Tn <= V)
    ind = np.searchsorted(X, Thresh[-1])
    if ind == 0:
        P[nt] = ((X[0] - Thresh[-1]) / X[0]) * R[0] + np.sum(R[1:])
    elif ind == nX:
        P[nt] = (1.0 - gumbel(Thresh[-1])) / (1.0 - gumbel(X[-1])) * R[nX]
    else:
        P[nt] = (X[ind] - Thresh[-1]) / (
            X[ind] - X[ind - 1]) * R[ind] + np.sum(R[(ind + 1):])

    return P
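As a quick check, the docstring example can be run as-is; with the rank histogram and thresholds above it should reproduce the quoted probabilities:

import numpy as np

X = [0, 0, 0, 0, 0, 0, 0.02, 0.04, 0.05, 0.07, 0.10, 0.11, 0.23, 0.26, 0.35]
R = [0.25, 0.13, 0.09, 0.07, 0.05, 0.05, 0.04, 0.04,
     0.03, 0.03, 0.03, 0.02, 0.02, 0.03, 0.05, 0.07]
Thresh = [0.01, 0.1, 0.25, 0.5, 1.0, 2.0]
print(np.round(rank_histogram_cal(X, R, Thresh=Thresh), 2))
# expected, per the docstring: [0.66, 0.15, 0.06, 0.11, 0.01, 0.0, 0.0]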
Example 7
def F_Gumbel(r, m, s):
    scale, loc = p_Gumbel(m, s)

    return gumbel_r.cdf(r, loc, scale)
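The helper p_Gumbel is not shown in the snippet; judging by the call, it converts a mean and a standard deviation into Gumbel parameters. A plausible method-of-moments reconstruction (an assumption, not the source's code):

import numpy as np

def p_Gumbel(m, s):
    # hypothetical helper: method-of-moments Gumbel parameters
    # from sample mean m and standard deviation s
    scale = s * np.sqrt(6) / np.pi      # since std = scale * pi / sqrt(6)
    loc = m - np.euler_gamma * scale    # since mean = loc + gamma * scale
    return scale, loc                   # matches the unpacking order above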
Example 8

from scipy.stats import gumbel_r
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(1, 1)

# Display the probability density function (``pdf``):

x = np.linspace(gumbel_r.ppf(0.01), gumbel_r.ppf(0.99), 100)
ax.plot(x, gumbel_r.pdf(x), 'r-', lw=5, alpha=0.6, label='gumbel_r pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = gumbel_r()
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = gumbel_r.ppf([0.001, 0.5, 0.999])
np.allclose([0.001, 0.5, 0.999], gumbel_r.cdf(vals))
# True

# Generate random numbers:

r = gumbel_r.rvs(size=1000)

# And compare the histogram:

ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
Example 9
    # inside a loop over N scrambled (shuffled) sequence alignments
    s, a, ma, ta = alignFunction(
        seqA, seqB, matScore, gapOpen, gapExtend, ScoreOnly=True)

    sscores.append(s)

# Fit extreme value distribution to the scramble alignment data
miu, beta = gumbel_r.fit(sscores)
print("Length of sscores: ", len(sscores))
print("Computing histogram for {} scramble scores".format(N))
print("Max scrambled score:", max(sscores))
print("Min scrambled score:", min(sscores))
print("Median of scrambled scores:", np.median(sscores))
print("Gumbel miu:", miu)
print("Gumbel beta:", beta)
print("Probability of unscrambled score in a random alignment: ",
      1-gumbel_r.cdf(uscore, miu, beta))
print()

# Generate the basename for save files
basename = "smith" if args.alignment_method == "local" else "needle"
basename += "_{}_{}_{}_{:3.1f}_{:3.1f}".format(
    N, len(seqA), smatrix, abs(gapOpen), abs(gapExtend))

# Create the plot
fig, ax = plt.subplots()
ax.set_title("S-W, {} aligns,len {}, matrix {}, gapo {}, gape {}".format(
    N, len(seqA), smatrix, gapOpen, gapExtend))
counts, bins, _ = ax.hist(sscores, bins=np.arange(
    min(sscores), max(sscores)), align='left', rwidth=0.95)
x = np.arange(bins[0], bins[-1], 0.01)
# scale the fitted pdf by (total count * bin width) to overlay it on the count histogram
ax.plot(x, sum(counts) * (bins[1] - bins[0]) * gumbel_r.pdf(x, miu, beta))
Example 10
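The snippet below assumes an array hmax of block maxima (e.g. annual maximum water levels) plus the usual imports. A hypothetical setup so it runs end-to-end:

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.distributions
from scipy.stats import gumbel_r, probplot

rng = np.random.default_rng(7)
hmax = gumbel_r.rvs(loc=2.0, scale=0.5, size=60, random_state=rng)  # made-up maxima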
loc, scale = gumbel_r.fit(hmax)
fig, ax = plt.subplots()
x = np.linspace(gumbel_r.ppf(0.01, loc=loc, scale=scale),
                gumbel_r.ppf(0.99, loc=loc, scale=scale), 100)
ax.plot(x,
        gumbel_r.pdf(x, loc=loc, scale=scale),
        'r-',
        lw=5,
        alpha=0.6,
        label='gumbel_r pdf')
ax.hist(hmax, density=True)

fig, ax = plt.subplots()
ax.plot(x,
        gumbel_r.cdf(x, loc=loc, scale=scale),
        'r-',
        lw=5,
        alpha=0.6,
        label='gumbel_r cdf')
ecdf = statsmodels.distributions.ECDF(hmax)
ax.plot(x, ecdf(x))

fig, ax = plt.subplots()
probplot(hmax, dist=gumbel_r, sparams=(loc, scale), plot=ax)
ax.grid()

Example 12
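permutation_scores and reference_score are not defined in the snippet; a hypothetical setup (simulated scores) so the tasks below run:

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gumbel_r

rng = np.random.default_rng(1)
permutation_scores = rng.gumbel(loc=50.0, scale=5.0, size=999)  # made-up null scores
reference_score = 75.0                                          # made-up observed score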
# calculate the number of samples smaller than the reference score
number_value_smaller = 0
for score in permutation_scores:
    if score < reference_score:
        number_value_smaller += 1
# total number of samples plus one (the reference score itself counts as a sample)
number_samples = len(permutation_scores) + 1
# calculate the p-value
p_value = 1 - (float(number_value_smaller) / float(number_samples))
print(p_value)

# Task 5: Compute the associated p-value using an estimated Gumbel distribution

# estimate the parameters loc and scale using the fit function
loc, scale = gumbel_r.fit(permutation_scores)

# calculate the p-value from the upper tail of the fitted Gumbel
p_value = 1 - gumbel_r.cdf(reference_score, loc=loc, scale=scale)
print(p_value)


# Task 6: Plot the histogram and the fitted probability density function with the reference score as vertical line

fig, ax = plt.subplots(1, 1)
x = np.linspace(gumbel_r.ppf(0.01, loc=loc, scale=scale),
                gumbel_r.ppf(0.99, loc=loc, scale=scale), 1000)

ax.plot(x, gumbel_r.pdf(x, loc=loc, scale=scale), 'k-', lw=2, label='fitted pdf')
ax.hist(permutation_scores, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
ax.axvline(reference_score)
plt.show()
import numpy as np
from matplotlib import pyplot as plt

from scipy.stats import gumbel_r
from scipy.stats import gumbel_l
from scipy.stats import genextreme

dataN = np.loadtxt("../data/Qdaily.txt")

x_pdf = np.linspace(np.min(dataN), np.max(dataN), num=100)
param = gumbel_r.fit(dataN)
cdf1 = gumbel_r.cdf(x_pdf, loc=param[0], scale=param[1])  # param is just (loc, scale)
# on Gumbel probability paper the reduced variate -ln(-ln F(x)) is linear in x,
# so a good Gumbel fit plots as a straight line
plt.plot(x_pdf, -np.log(-np.log(cdf1)), 'o')

print(param)

num_bins = 200
counts, bin_edges = np.histogram(dataN, bins=num_bins)
cdf2 = np.cumsum(counts) / np.sum(counts)
plt.plot(bin_edges[1:], -np.log(-np.log(cdf2)), 'x')

plt.show()