Ejemplo n.º 1
1
def estat(x, y, nboot=1000, replace=False, method='log', fitting=False):
    '''
    Energy distance statistics test.

    ``x`` and ``y`` are stacked with ``np.vstack`` and standardised
    column-wise. The statistic returned by ``energy`` for the original
    split is then compared with ``nboot`` statistics computed on re-drawn
    splits of the pooled rows.

    Parameters
    ----------
    x, y : array-like
        The two samples (rows are pooled with ``np.vstack``).
    nboot : int
        Number of resampled statistics to compute.
    replace : bool
        If True the row indices are drawn with ``random.randint`` (with
        replacement); otherwise ``random.permutation`` is used.
    method : object
        Forwarded unchanged to ``energy``.
    fitting : bool
        If True a ``genextreme`` distribution is fitted to the resampled
        statistics and its survival function gives the p-value.

    Returns
    -------
    tuple
        ``(p, en, param)`` when ``fitting`` is True, otherwise
        ``(p, en, en_boot)``.

    Reference
    ---------
    Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free
      multivariate goodness-of-fit tests, two-sample comparison and unfolding.
      Nuc Instr and Meth in Phys Res A 537: 626-636
    Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics
      based on distances. J Stat Planning & Infer 143: 1249-1272
    Brian Lau, multdist, https://github.com/brian-lau/multdist
    '''
    n_first = len(x)
    n_total = len(x) + len(y)

    pooled = np.vstack([x, y])
    pooled = (pooled - pooled.mean(0)) / pooled.std(0)

    def draw_with_replacement(count):
        return random.randint(count, size=count)

    draw_indices = draw_with_replacement if replace else random.permutation

    observed = energy(pooled[:n_first], pooled[n_first:], method)

    # Single-precision storage, as in the reference implementation.
    resampled = np.zeros(nboot, 'f')
    for k in range(nboot):
        order = draw_indices(n_total)
        resampled[k] = energy(pooled[order[:n_first]],
                              pooled[order[n_first:]], method)

    if not fitting:
        # Empirical p-value: share of resampled statistics >= observed.
        return (resampled >= observed).sum() / nboot, observed, resampled

    gev_params = genextreme.fit(resampled)
    return genextreme.sf(observed, *gev_params), observed, gev_params
Ejemplo n.º 2
0
def Plot_Dist_Train_Extreme(r_err, GT_val,bin1=500,bin2=500,interval1 = 0.95,interval2=0.99):
    """Plot training and ground-truth error histograms with a fitted GEV curve.

    A ``genextreme`` distribution is fitted to ``r_err["Err"]`` with its
    location fixed at the sample median; the equal-tailed intervals of that
    fit for ``interval1`` and ``interval2`` are drawn as dotted vertical
    lines. The plotted density curve comes from a second, unconstrained fit
    to the same data.

    Parameters
    ----------
    r_err : mapping
        Object whose ``"Err"`` entry holds the training errors.
    GT_val : mapping
        Object whose ``"Err"`` entry holds the ground-truth errors.
    bin1, bin2 : int
        Histogram bin counts for the training / ground-truth errors.
    interval1, interval2 : float
        Central probability mass passed to ``genextreme.interval``.

    Returns
    -------
    tuple
        ``(c, loc, scale, fig, ax)`` where the first three values come from
        the median-anchored fit.
    """
    covMat = np.asarray(r_err["Err"], dtype=float)
    median = np.median(covMat)
    c, loc, scale = genextreme.fit(covMat, floc=median)
    min_extreme1,max_extreme1 = genextreme.interval(interval1,c,loc,scale)
    min_extreme2,max_extreme2 = genextreme.interval(interval2,c,loc,scale)
    x = np.linspace(covMat.min(), covMat.max(), 2000)
    fig,ax = plt.subplots(figsize = (30,10))
    plt.xlim(0,0.4)
    # NOTE(review): the curve uses an unconstrained fit, not the
    # median-anchored (c, loc, scale) returned below -- confirm this is intended.
    plt.plot(x, genextreme.pdf(x, *genextreme.fit(covMat)), linewidth=5)
    plt.hist(covMat,bins=bin1,alpha=0.3,density=True,edgecolor='black',facecolor='gray', linewidth=3,histtype='stepfilled')
    plt.hist(np.asarray(GT_val["Err"]), bins=bin2, alpha=0.3,density=True,edgecolor='red',facecolor='red', linewidth=3,histtype='stepfilled')
    plt.xlabel('Lengths Counts')
    plt.ylabel('Probability')
    plt.title(r'max_extreme1=%.3f,max_extreme2=%.3f' %(max_extreme1, max_extreme2))
    ax.tick_params(left = False, bottom = False)

    # genextreme.interval is equal-tailed, so a level of 0.95 ends at the
    # 2.5th and 97.5th percentiles; derive the labels from the level itself.
    marked = ((interval1, min_extreme1, max_extreme1),
              (interval2, min_extreme2, max_extreme2))
    for level, lower, upper in marked:
        lower_pct = (1.0 - level) / 2.0 * 100.0
        upper_pct = 100.0 - lower_pct
        ax.axvline(lower, alpha = 0.9, ymax = 0.20, linestyle = ":",linewidth=3,color="red")
        ax.axvline(upper, alpha = 0.9, ymax = 0.20, linestyle = ":",linewidth=3,color="red")
        ax.text(lower, 8, "%gth" % lower_pct, size = 20, alpha = 0.8,color="red")
        ax.text(upper, 8, "%gth" % upper_pct, size = 20, alpha =.8,color="red")

    print("%g%% CI upper bound:" % (interval1 * 100),max_extreme1)
    print("%g%% CI upper bound:" % (interval2 * 100),max_extreme2)
    print("Median RE:",np.median(np.array(GT_val["Err"], dtype=float)))

    return c, loc, scale, fig,ax
Ejemplo n.º 3
0
def get_gev_fit(data):
    """
    Tries GEV fits with several starting values for loc and keeps the first
    acceptable one.

    A fit is accepted when its fitted location does not exceed
    ``mode + std`` of the data. Starting values are tried in this order:
    the mode, the mean, and ``mode - std``. If none is accepted a final fit
    is made with a starting shape of 0. Exactly one message naming the fit
    that was kept is printed.

    MIGHT BE DIFFERENT THAN OTHER SCRIPT--BE CAREFUL

    Returns
    -------
    tuple
        ``(shape, loc, scale)`` of the selected fit.
    """
    # Older SciPy returns the mode as a length-1 array, newer SciPy as a
    # scalar; atleast_1d makes the indexing valid for both.
    md = np.atleast_1d(mode(data)[0])[0]
    std = np.std(data)
    limit = md + std

    attempts = (
        ('GEV fit with mode', md),
        ('GEV fit with mean', np.mean(data)),
        ('GEV fit with mode minus std deviation', md - std),
    )
    for message, loc_guess in attempts:
        shape, loc, scale = gev.fit(data, loc=loc_guess)
        if not loc > limit:
            print(message)
            return shape, loc, scale

    # Every location-seeded fit was rejected: fall back to a shape guess of 0.
    print('GEV fit with c=0')
    shape, loc, scale = gev.fit(data, 0)
    return shape, loc, scale
Ejemplo n.º 4
0
def get_gev_fit(data):
    """
    Tries GEV fits with several starting values for loc and keeps the first
    acceptable one.

    A fit is accepted when its fitted location does not exceed
    ``mode + std`` of the data. Starting values are tried in this order:
    the mode, the mean, and ``mode - std``; if all are rejected, a fit with a
    starting shape of 0 is returned. One message naming the kept fit is
    printed.

    BUG--doesn't always fit well. Might be related to x-range (needs to be
    wide enough to catch extremes).
    This also needs to get moved to UTIL eventually.

    Returns
    -------
    tuple
        ``(shape, loc, scale)`` of the selected fit.
    """
    # Works for both the array-valued (older SciPy) and scalar-valued
    # (newer SciPy) result of scipy.stats.mode.
    md = np.atleast_1d(mode(data)[0])[0]
    std = np.std(data)
    limit = md + std

    candidates = (
        (md, 'GEV fit with mode'),
        (np.mean(data), 'GEV fit with mean'),
        (md - std, 'GEV fit with mode minus std deviation'),
    )
    for start_loc, label in candidates:
        shape, loc, scale = gev.fit(data, loc=start_loc)
        if loc > limit:
            continue  # rejected: try the next starting value
        print(label)
        return shape, loc, scale

    print('GEV fit with c=0')
    shape, loc, scale = gev.fit(data, 0)
    return shape, loc, scale
Ejemplo n.º 5
0
def fit_gev(data, user_estimates=None, generate_estimates=False):
    """Fit a GEV, optionally seeding the location and scale estimates.

    Parameters
    ----------
    data : numpy ndarray
    user_estimates : sequence of two floats, optional
        Starting estimates ``(loc, scale)``. ``None`` or an empty sequence
        means "not supplied". Lists, tuples and arrays are accepted.
    generate_estimates : bool, default False
        Fit GEV to every second data point first and use that location and
        scale as starting estimates (useful for large datasets). Ignored
        when ``user_estimates`` is supplied.

    Returns
    -------
    shape : float
        Shape parameter
    loc : float
        Location parameter
    scale : float
        Scale parameter
    """
    # len() rather than truthiness so that ndarray estimates are accepted.
    has_user_estimates = user_estimates is not None and len(user_estimates) > 0

    if has_user_estimates:
        loc_estimate, scale_estimate = user_estimates
    elif generate_estimates:
        _, loc_estimate, scale_estimate = gev.fit(data[::2])
    else:
        shape, loc, scale = gev.fit(data)
        return shape, loc, scale

    shape, loc, scale = gev.fit(data,
                                loc=loc_estimate,
                                scale=scale_estimate)
    return shape, loc, scale
Ejemplo n.º 6
0
    def construct_IDF(self):
        """Fill ``self.idf`` with GEV-based values for every column.

        For each column of ``self.reformatted_ams`` a GEV is fitted and
        ``gev.isf`` is evaluated at ``self.quantiles``.

        When ``self.ci`` is truthy the fit is repeated on
        ``self.number_bootstrap`` resamples (drawn with replacement) and the
        stored entry is the lower, median and upper percentile curves joined
        into one array; the percentiles are derived from ``self.alpha``.
        Otherwise a single fit to the column is stored.
        """
        ams = self.reformatted_ams

        if not self.ci:
            for name in ams.columns:
                shape, loc, scale = gev.fit(ams[name])
                self.idf[name] = gev.isf(self.quantiles, c=shape,
                                         loc=loc, scale=scale)
            return

        # Bootstrap: one row of isf values per resample, per column.
        resampled_levels = {}
        for name in ams.columns:
            rows = []
            for _ in range(self.number_bootstrap):
                draw = np.random.choice(
                    ams[name].values, replace=True, size=len(ams))
                shape, loc, scale = gev.fit(draw)
                rows.append(
                    gev.isf(self.quantiles, c=shape, loc=loc, scale=scale))
            resampled_levels[name] = np.asarray(rows)

        tail = (1.0 - self.alpha) / 2.0
        p_lo = tail * 100
        p_up = (self.alpha + tail) * 100
        for name in ams.columns:
            lower, median, upper = (
                np.apply_along_axis(np.percentile, 0,
                                    resampled_levels[name], pct)
                for pct in (p_lo, 50, p_up))
            self.idf[name] = np.append(lower, np.append(median, upper))
Ejemplo n.º 7
0
 def _compare_resamples(self, tvalues, null_max_tvalues, null_min_tvalues):
     """Return one tail probability per value in ``tvalues``.

     A ``genextreme`` distribution is fitted to ``null_max_tvalues`` and
     another to the negated ``null_min_tvalues``. Non-negative values are
     scored with the survival function of the first fit; negative values
     are negated and scored with the survival function of the second.
     """
     upper_fit = genextreme.fit(null_max_tvalues)
     lower_fit = genextreme.fit([-value for value in null_min_tvalues])

     def tail_probability(stat):
         if stat >= 0:
             return genextreme.sf(stat, *upper_fit)
         return genextreme.sf(-stat, *lower_fit)

     return [tail_probability(stat) for stat in tvalues]
Ejemplo n.º 8
0
def testcase():
    """Fit a GEV to the first 2000 points of ``read_data()`` and then to all.

    Also reports whether every value in the data is exactly of type
    ``float``. The fitted parameters are not returned.
    """
    data = read_data()
    # Exact type comparison on purpose: subclasses of float do not count.
    all_float = all(type(value) == float for value in data)
    print(f"All values were floats? {all_float}")

    print("Starting test with only first 2000 points of data...")
    shape, loc, scale = gev.fit(data[0:2000])

    print("Starting test with all points of data...")
    shape, loc, scale = gev.fit(data)
Ejemplo n.º 9
0
def generalized_extreme_value_distribution_fit(annual_maxima,
                                               loc=None,
                                               scale=None):
    """Fit ``genextreme`` to ``annual_maxima`` and return the fitted parameters.

    ``loc`` and ``scale`` are forwarded to ``genextreme.fit`` only when they
    are not ``None``.
    """
    # Known quirk (from the original author): passing explicit values gives
    # different results than leaving them out, so omitted arguments are
    # never forwarded.
    supplied = {}
    if loc is not None:
        supplied['loc'] = loc
    if scale is not None:
        supplied['scale'] = scale

    return genextreme.fit(annual_maxima, **supplied)
Ejemplo n.º 10
0
def estat(x, y, nboot=1000, replace=False, method='log', fitting=False):
    '''
    Energy distance statistics test.

    The rows of ``x`` and ``y`` are pooled, standardised per column, and the
    value of ``energy`` for the original split is ranked against ``nboot``
    values obtained from re-drawn row indices.

    Parameters
    ----------
    x, y : array-like
        Samples to compare; pooled with ``np.vstack``.
    nboot : int
        Number of resampling rounds.
    replace : bool
        True draws indices with ``random.randint`` (with replacement),
        False uses ``random.permutation``.
    method : object
        Passed straight through to ``energy``.
    fitting : bool
        True: p-value from the survival function of a ``genextreme`` fit to
        the resampled statistics. False: empirical p-value.

    Returns
    -------
    tuple
        ``(p, en, param)`` if ``fitting`` else ``(p, en, en_boot)``.

    Reference
    ---------
    Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free
      multivariate goodness-of-fit tests, two-sample comparison and unfolding.
      Nuc Instr and Meth in Phys Res A 537: 626-636
    Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics
      based on distances. J Stat Planning & Infer 143: 1249-1272
    Brian Lau, multdist, https://github.com/brian-lau/multdist

    '''
    size_x = len(x)
    size_all = len(x) + len(y)

    data = np.vstack([x, y])
    data = (data - data.mean(0)) / data.std(0)

    if replace:
        def pick(count):
            return random.randint(count, size=count)
    else:
        pick = random.permutation

    def split_energy(rows):
        # Energy between the first size_x selected rows and the rest.
        return energy(data[rows[:size_x]], data[rows[size_x:]], method)

    en = energy(data[:size_x], data[size_x:], method)

    en_boot = np.zeros(nboot, 'f')
    for round_no in range(nboot):
        en_boot[round_no] = split_energy(pick(size_all))

    if fitting:
        param = genextreme.fit(en_boot)
        return genextreme.sf(en, *param), en, param

    return (en_boot >= en).sum() / nboot, en, en_boot
 def test_fit(self, datasets):
     """Check ``GEV.fit`` against ``genextreme.fit`` on every dataset."""
     for sample in datasets:
         ref_c, ref_loc, ref_scale = genextreme.fit(sample)
         fitted = GEV.fit(sample)
         assert fitted.loc() == approx(ref_loc)
         assert fitted.scale() == approx(ref_scale)
         # GEV.shape() is expected to be the negative of scipy's c.
         assert fitted.shape() == approx(-ref_c)
Ejemplo n.º 12
0
 def getZScoreDistExpFunction(y_data):
     """Fit ``genextreme`` to ``y_data`` and return the fitted density as a callable."""
     from scipy.stats import genextreme as ge

     shape, mean, sigma = ge.fit(y_data)

     def density(x):
         return ge.pdf(x, shape, loc=mean, scale=sigma)

     return density
Ejemplo n.º 13
0
 def fit(self, data, c=0, loc=1, scale=1):
     """Fit ``genextreme`` to ``data`` and store the result on the instance.

     The ``c``, ``loc`` and ``scale`` arguments are not used as inputs; they
     are replaced by the fitted values. Sets ``self.c``, ``self.loc``,
     ``self.scale``, ``self.epsilon`` (``-c``) and ``self.params``, passes
     ``self.params`` to ``self.setParams`` and returns ``(c, loc, scale)``.
     """
     fitted = genextreme.fit(data)
     self.c, self.loc, self.scale = fitted
     self.epsilon = -self.c
     self.params = dict(zip(('c', 'loc', 'scale'),
                            (self.c, self.loc, self.scale)))
     self.setParams(self.params)
     return (self.c, self.loc, self.scale)
Ejemplo n.º 14
0
def test_lm():
    """Print GEV parameters for one sample using several fitting methods.

    The fits compared are: L-moments and MLE from ``lmoments3``, the
    project's ``optimize_stationary_for_period`` (with and without
    L-moments) and ``scipy.stats.genextreme``. Two 10-year return levels
    from ``get_high_ret_level_stationary`` are printed at the end. Nothing
    is asserted or returned.
    """
    # First sample: only its mean is printed.
    x = [
        360.228515625, 513.506103515625, 273.85031127929688,
        340.94839477539062, 244.13925170898438, 283.414306640625,
        394.42819213867188, 284.3604736328125, 281.26956176757812,
        241.46173095703125, 489.75482177734375, 236.31536865234375,
        407.55133056640625, 244.6295166015625, 432.40670776367188,
        260.501953125, 517.23052978515625, 317.6553955078125,
        407.61935424804688, 275.0709228515625, 330.369140625,
        285.92086791992188, 247.9954833984375, 344.34811401367188,
        379.55596923828125, 330.80569458007812, 312.35330200195312,
        251.79550170898438, 372.66928100585938, 239.72474670410156
    ]

    #    print(get_initial_params_using_lm(x))
    print(np.mean(x))
    # These values are overwritten by the L-moments fit further down.
    pars = [128.28104749, 578.4927539, 0.62410911]
    # Second sample: the one every fit below is applied to.
    data = [
        588.4747314453125, 693.6640625, 519.03155517578125, 716.58013916015625,
        686.29168701171875, 432.65786743164062, 682.72113037109375,
        730.12603759765625, 698.971923828125, 491.75332641601562,
        597.258544921875, 487.13619995117188, 482.33123779296875,
        573.57861328125, 801.67169189453125, 616.41668701171875,
        690.954833984375, 671.31646728515625, 680.87554931640625,
        534.18414306640625, 427.86019897460938, 236.22953796386719,
        691.40972900390625, 599.84637451171875, 545.3563232421875,
        553.059814453125, 549.1295166015625, 658.3983154296875,
        719.122802734375, 636.84906005859375
    ]

    import lmoments3
    from lmoments3 import distr

    the_moments = lmoments3.lmom_ratios(sorted(data), 5)
    pars = distr.gev.lmom_fit(sorted(data), lmom_ratios=the_moments)

    print("Fitted params using lmoments: ", pars)
    # NOTE(review): relies on pars.values() yielding shape, loc, scale in
    # that order -- confirm against lmoments3.
    xi, mu, sigma = pars.values()
    # The project helpers take [sigma, mu, -xi]; presumably they use the
    # opposite sign convention for the shape -- TODO confirm.
    print(objective_function_stationary_high([sigma, mu, -xi], data))

    print("Fitted using MLE: ", distr.gev.fit(sorted(data)))

    print(
        "Fitted using custom method (Huziy et al 2013), not using l-moments: ",
        optimize_stationary_for_period(np.array(sorted(data))))
    print(
        "Fitted using custom method (Huziy et al 2013), using l-moments: ",
        optimize_stationary_for_period(np.array(sorted(data)),
                                       use_lmoments=True))

    from scipy.stats import genextreme

    print("Fitted using scipy.stats.genextreme: ",
          genextreme.fit(np.array(sorted(data))))
    # Return level from the L-moments parameters ...
    print("10 year high flow return level: ",
          get_high_ret_level_stationary([sigma, mu, -xi, 0], 10))
    # ... and again with the third entry replaced by a fixed -0.5.
    print("10 year high flow return level: ",
          get_high_ret_level_stationary([sigma, mu, -0.5, 0], 10))
Ejemplo n.º 15
0
def gevfit(sr):
    """Fit a GEV to ``sr``, print summary statistics and plot the fit.

    Prints the fitted parameters plus the median, mean, standard deviation
    and 95% interval of the fitted distribution, then shows a two-panel
    figure: density histogram with fitted PDF, and cumulative histogram with
    fitted CDF.

    Parameters
    ----------
    sr : array-like with a ``name`` attribute
        Data to fit; ``sr.name`` is used in the plot titles
        (presumably a pandas Series -- confirm against callers).
    """
    c, mu, sigma = gev.fit(sr)

    # Values are listed in the same order as the labels (shape, loc, scale).
    print("""
          GEV Fit Parameters:
          shape parameter c: %s
          location parameter mu: %s
          scale parameter sigma: %s
          """ % (c, mu, sigma))

    print("Median", gev.median(c, mu, sigma))
    print("Mean", gev.mean(c, mu, sigma))
    print("Std dev", gev.std(c, mu, sigma))
    print("95% interval: ", gev.interval(0.95, c, mu, sigma))

    # Evaluate the fitted curves slightly beyond the data range.
    x = np.linspace(np.min(sr) - 5, np.max(sr) + 5, 500)
    gev_pdf = gev.pdf(x, c, mu, sigma)
    gev_cdf = gev.cdf(x, c, mu, sigma)

    plt.figure(figsize=(12, 6))

    ax1 = plt.subplot(1, 2, 1)
    # `density` replaces the `normed` keyword that matplotlib removed.
    plt.hist(sr, density=True, alpha=0.2, label='Raw Data', bins='auto')
    plt.plot(x, gev_pdf, 'r--', label='GEV Fit')
    plt.legend(loc='upper left')
    ax1.set_title('%s_Probability Density Fraction' % (sr.name))
    ax1.set_xlabel('Predicted Fatigue Limit (MPa)')
    ax1.set_ylabel('Probability')
    ax1.grid()

    ax2 = plt.subplot(1, 2, 2)
    plt.hist(sr,
             density=True,
             alpha=0.2,
             label='Raw Data',
             cumulative=True,
             bins='auto')
    plt.plot(x, gev_cdf, 'r--', label='GEV Fit')
    plt.legend(loc='upper left')
    ax2.set_title('%s_Cumulative Density Fraction' % (sr.name))
    ax2.set_xlabel('Predicted Fatigue Limit (MPa)')
    ax2.set_ylabel('Density')
    ax2.grid()

    plt.show()
Ejemplo n.º 16
0
    def mvs(self):
        """Fit ``genextreme`` to ``self.data`` and store shape, loc and scale.

        Raises
        ------
        ValueError
            If ``self.data`` is ``None``.

        Returns
        -------
        tuple
            ``(shape, loc, scale)`` as stored on the instance.
        """
        if self.data is None:
            raise ValueError("Data not's None")

        self.shape, self.loc, self.scale = genextreme.fit(self.data)
        return self.shape, self.loc, self.scale
Ejemplo n.º 17
0
def estat(x,
          y,
          nboot=1000,
          maxt=60.,
          replace=False,
          method='log',
          fitting=False):
    """
    Energy distance statistics test.

    The rows of ``x`` and ``y`` are pooled and standardised per column
    (NaN-aware mean and standard deviation). The value of ``energy`` for the
    original split is compared with values from re-drawn row indices.

    Parameters
    ----------
    x, y : array-like
        Samples to compare; pooled with ``np.vstack``.
    nboot : int
        Maximum number of resampling rounds.
    maxt : float
        Time budget for the resampling loop, compared against ``t.time()``.
        When exceeded, the loop stops and only the rounds completed so far
        are used.
    replace : bool
        True draws indices with replacement, False uses a permutation.
    method : object
        Forwarded unchanged to ``energy``.
    fitting : bool
        True: p-value from the survival function of a ``genextreme`` fit to
        the resampled statistics. False: empirical p-value.

    Returns
    -------
    tuple
        ``(p, en, param)`` if ``fitting`` else ``(p, en, en_boot)``;
        ``en_boot`` holds only the completed rounds.

    References
    ----------

    * Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free
      multivariate goodness-of-fit tests, two-sample comparison and unfolding.
      Nuc Instr and Meth in Phys Res A 537: 626-636

    * Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics
      based on distances. J Stat Planning & Infer 143: 1249-1272

    * Brian Lau, multdist, https://github.com/brian-lau/multdist

    """

    n, N = len(x), len(x) + len(y)
    stack = np.vstack([x, y])
    stack = (stack - np.nanmean(stack, 0)) / np.nanstd(stack, 0)
    if replace:

        def rand(x):
            return np.random.randint(x, size=x)

    else:
        rand = np.random.permutation

    en = energy(stack[:n], stack[n:], method)
    en_boot = np.zeros(nboot, 'f')
    s = t.time()
    for i in range(nboot):
        idx = rand(N)
        en_boot[i] = energy(stack[idx[:n]], stack[idx[n:]], method)
        if t.time() - s > maxt:
            # Rounds 0..i are complete, so keep i + 1 values and use the same
            # count as the denominator of the empirical p-value.
            completed = i + 1
            print("Time consumed, exit bootstrap (N={})".format(completed))
            en_boot, nboot = en_boot[:completed], completed
            break

    if fitting:
        param = genextreme.fit(en_boot)
        p = genextreme.sf(en, *param)
        return p, en, param
    else:
        p = (en_boot >= en).sum() / nboot
        return p, en, en_boot
def srednie(plik_in):
    """Plot the chain-length histogram with a fitted GEV density on top.

    The second whitespace-separated column of every line in ``plik_in`` is
    read as a float and a ``genextreme`` distribution is fitted to those
    values. The histogram itself is drawn from the 'Długość łańcucha' column
    of the hard-coded file ``Danio_reviewed_out.txt``.

    Parameters
    ----------
    plik_in : str
        Path of the text file supplying the values for the GEV fit.
    """
    # The file is only read, so no write access is requested.
    with open(plik_in, 'r') as f:
        listy = [float(line.split()[1]) for line in f]

    # ------------------------- DANIO RERIO REVIEWED ----------------------
    data4 = pd.read_csv(
        'Danio_reviewed_out.txt',
        sep='\t',
        names=['Nazwa białka', 'Długość łańcucha', 'Liczba domen'])

    # histogram of the chain length
    dwiekolumny4 = data4[data4.columns[1:3]]
    np.seterr(divide='ignore', invalid='ignore')
    dwiekolumny4.hist(column='Długość łańcucha',
                      bins=100,
                      figsize=(10, 10),
                      color='mediumvioletred',
                      density=True)

    # Single fit, started from a shape guess of -1.
    p = genextreme.fit(listy, -1)
    print(p)
    grid = np.linspace(0, 3500)
    plt.plot(grid,
             genextreme.pdf(grid, p[0], p[1], p[2]),
             'b--',
             lw=3,
             label='Generalized extreme value distribution ')

    plt.title('Danio rerio reviewed - Histogram długości łańucha',
              color='black')
    plt.xlabel('Długość łańcucha')
    plt.ylabel('Liczebność')
    plt.legend(loc='upper right')
    pylab.xlim([-10, 3500])
    plt.show()
Ejemplo n.º 19
0
    def mvs(self):
        """Fit ``genextreme`` to ``self.data`` and store the result.

        Sets ``self.shape``, ``self.loc``, ``self.scale``, marks
        ``self.estimador`` as ``'MVS'`` and stores the frozen fitted
        distribution in ``self.dist``.

        Raises
        ------
        e.DataNotExist
            If ``self.data`` is ``None``.

        Returns
        -------
        tuple
            ``(shape, loc, scale)``.
        """
        if self.data is None:
            raise e.DataNotExist("Data not's None", 35)

        fitted_shape, fitted_loc, fitted_scale = genextreme.fit(data=self.data)
        self.estimador = 'MVS'
        self.shape = fitted_shape
        self.loc = fitted_loc
        self.scale = fitted_scale
        self.dist = genextreme(c=self.shape, loc=self.loc, scale=self.scale)

        return self.shape, self.loc, self.scale
Ejemplo n.º 20
0
def extreme_value_prob_fit(NPM, perc):
    """Fit ``genextreme`` to the trimmed mean of each row's largest magnitudes.

    For every row of ``NPM`` the absolute values are sorted, the largest
    ``int(round(n_columns * perc))`` entries are kept and their 5% trimmed
    mean is taken. The fit to those per-row values is returned.
    """
    n_rows, n_cols = NPM.shape[0], NPM.shape[1]
    n_keep = int(round(n_cols * perc))

    row_stats = np.zeros(n_rows)
    for row in range(n_rows):
        largest = np.sort(np.abs(NPM[row, :]))[n_cols - n_keep:]
        row_stats[row] = trim_mean(largest, 0.05)

    return genextreme.fit(row_stats)
def calculate_required_effort(data_before_capture, n_bootstrapping,
                              success_probability):
    """Bootstrap the effort quantile at ``success_probability``.

    The efforts returned by ``calculate_effort_per_sighting`` are resampled
    ``n_bootstrapping`` times with ``np.random.choice``. A ``genextreme``
    distribution is fitted to each resample and its ``ppf`` at
    ``success_probability`` is stored.

    Returns
    -------
    numpy.ndarray
        One quantile per bootstrap round.
    """
    effort_per_sighting = calculate_effort_per_sighting(data_before_capture)
    sample_size = len(effort_per_sighting)

    required_effort = np.zeros(n_bootstrapping)
    for round_no in range(n_bootstrapping):
        resample = np.random.choice(effort_per_sighting, sample_size)
        shape, loc, scale = genextreme.fit(resample)
        required_effort[round_no] = genextreme.ppf(success_probability,
                                                   shape, loc, scale)
    return required_effort
Ejemplo n.º 22
0
def FitGEV_KMA_Frechet(bmus, n_clusters, var):
    '''
    Returns stationary GEV/Gumbel_L params for KMA bmus and variable series

    bmus        - KMA bmus (time series of KMA centroids, numbered from 1)
    n_clusters  - number of KMA clusters
    var         - time series of variable to fit to GEV/Gumbel_L

    returns np.array (n_clusters x parameters). parameters = (shape, loc, scale)
    for gumbel distributions shape value will be ~0 (0.0000000001)
    clusters without any member get a row of NaN

    The GEV shape is stored with its sign flipped. When the fitted shape is
    negative, the GEV row is kept only if it improves the negative
    log-likelihood over the Gumbel fit by at least 1.92; otherwise the
    Gumbel parameters are stored.
    '''

    gumbel_shape = 0.0000000001
    param_GEV = np.empty((n_clusters, 3))

    for row in range(n_clusters):
        members = np.where((bmus == row + 1))[0]
        if len(members) == 0:
            param_GEV[row, :] = [np.nan, np.nan, np.nan]
            continue

        # variable at cluster positions, NaN removed
        values = var[members]
        values = values[~np.isnan(values)]

        # Gumbel_l fit on the negated series and its negative loglikelihood
        loc_gl, scale_gl = gumbel_l.fit(-values)
        theta_gl = (gumbel_shape, -1 * loc_gl, scale_gl)
        nLogL_gl = genextreme.nnlf(theta_gl, values)

        # GEV fit (starting shape -0.1) and its negative loglikelihood
        shape_gev, loc_gev, scale_gev = genextreme.fit(values, -0.1)
        nLogL_gev = genextreme.nnlf((shape_gev, loc_gev, scale_gev), values)

        # TODO: cant replicate ML exact solution
        gev_is_significant = nLogL_gl - nLogL_gev >= 1.92
        if shape_gev < 0 and not gev_is_significant:
            chosen = theta_gl
        else:
            chosen = (-shape_gev, loc_gev, scale_gev)
        param_GEV[row, :] = list(chosen)

    return param_GEV
Ejemplo n.º 23
0
    def plot(self, FAPname, Nlevels, cheat=True):
        '''Plots the periodogram with significance levels.

        The levels are quantiles of a GEV fitted to the values loaded from
        ``FAPname + 'FAPNormTest.txt'``; ``Nlevels`` selects how many of the
        levels 50, 90, 95, 99 and 99.9 percent are drawn. The highest peak
        is circled and the corresponding period is printed.

        With ``cheat == True`` a vertical line at ``1 / self.cheat`` and the
        levels returned by ``self.APFAP`` are drawn as well.'''
        power = self.search()
        Ptop = np.amax(power)
        ftop = self.flist[np.where(power == Ptop)[0][0]]
        Levels = np.array([50, 90, 95, 99, 99.9])[:Nlevels]

        FAPfile = np.loadtxt(FAPname + 'FAPNormTest.txt')
        gev_params = gev.fit(FAPfile)
        PLevels = gev.ppf(Levels / 100, *gev_params)

        plt.figure(figsize=(20, 14))
        plt.hlines(PLevels, self.fmin, self.fmax, 'g')
        plt.plot(self.flist, power)
        # The heading is placed at the current upper y-limit, before the
        # limits are changed on the next line.
        heading_x = self.fmax - (self.fmax - self.fmin) / 2.25
        plt.text(heading_x, plt.ylim()[1], 'False alarm probability')
        plt.ylim(0, Ptop + 0.1)

        for i in range(Nlevels):
            label_x = self.fmax - (self.fmax - self.fmin) / 3
            label = str(np.round(1 - Levels[i] / 100, 3))
            plt.text(label_x, PLevels[i] + 0.003, label)

        plt.plot(ftop,
                 Ptop,
                 'r',
                 marker='o',
                 linestyle='none',
                 markerfacecolor='none',
                 markersize=35)

        if cheat == True:
            plt.vlines(1 / self.cheat, 0, Ptop, 'g')
            CheatLevels = self.APFAP(1 - Levels / 100)
            plt.hlines(CheatLevels, self.fmin, self.fmax, 'r')

        plt.xlabel('Frequency [1/day]')
        plt.ylabel('Lomb-Scargle Power')
        plt.title(
            'Lomb - Scargle Periodogram for {planet}'.format(planet=self.name))
        best_period = round(1 / ftop, 3)
        print('Highest probability of period = {p} days'.format(p=best_period))
Ejemplo n.º 24
0
def extreme_value_prob_fit(NPM, perc):
    """Fit ``genextreme`` to a per-row mean of large magnitudes.

    For every row of ``NPM`` the absolute values are sorted and the largest
    ``int(round(n_columns * perc))`` entries are selected. The top part of
    that selection is then dropped (only the first 90% in ascending order is
    kept) and the mean of the remainder is taken. The fit to those per-row
    means is returned.
    """
    n_rows, n_cols = NPM.shape[0], NPM.shape[1]
    n_keep = int(round(n_cols * perc))

    row_means = np.zeros(n_rows)
    for row in range(n_rows):
        largest = np.sort(np.abs(NPM[row, :]))[n_cols - n_keep:]
        cutoff = int(np.floor(0.90 * largest.shape[0]))
        row_means[row] = np.mean(largest[0:cutoff])

    return genextreme.fit(row_means)
Ejemplo n.º 25
0
    def CalculaParametros(self):
        """Fit the distribution matching ``self.tipoSerie`` to ``self.dadoSerie``.

        ``'Parcial'`` fits a generalized Pareto distribution, ``'Anual'`` a
        generalized extreme value distribution. The fitted shape, location
        and scale are printed and returned.

        Returns
        -------
        tuple or None
            ``(shape, loc, scale)``, or ``None`` for any other
            ``self.tipoSerie``.
        """
        if self.tipoSerie == 'Parcial':
            Parametro = genpareto.fit(self.dadoSerie)
            # %f for all three values, matching the GEV message below
            # (the shape was previously printed with zero decimals).
            print('Parametros com Pareto: \nForma: %f, Localidade: %f, Escala: %f' %
                  (Parametro[0],Parametro[1],Parametro[2]))
            return  Parametro
        elif self.tipoSerie == 'Anual':
            Parametro = genextreme.fit(self.dadoSerie)
            print('Parametros com Gev: \nForma: %f, Localidade: %f, Escala: %f' %
                  (Parametro[0],Parametro[1],Parametro[2]))
            return Parametro
        return None
Ejemplo n.º 26
0
    def fit_GEV(self, theta=None, save=False):
        """
        Draws a Q-Q plot of the positive block maxima against GEV quantiles.
        If theta is None the parameters are first fitted with
        genextreme.fit, and scipy's shape c is negated to give xi.
        -----------------------------------------------------------------------
        theta: Fitted [mu, sigma, xi].
        save:  If truthy the figure is written to
               "<save_path>/plots/<self.name>-qq.pdf" before being shown.
        -----------------------------------------------------------------------
        Returns: Fitted [mu, sigma, xi].
        """

        # Only strictly positive maxima are used, in ascending order.
        block_max = sorted(x for x in self.block_max if x > 0)

        if theta is None:
            c, loc, scale = genextreme.fit(block_max)
            theta = [loc, scale, -c]

        # Plotting positions k / (n + 1) for k = 1..n.
        n_obs = len(block_max)
        emp_p = (np.array(range(n_obs)) + 1.0) / (n_obs + 1.0)
        emp_q = quantile(theta, emp_p)

        fig, ax = plt.subplots(figsize=(6, 4))

        ax.scatter(emp_q, block_max, s=10, color="k")
        ax.plot([0, 1], [0, 1], transform=ax.transAxes, color="k")

        ax.set(xlabel="Theoretical quantiles", ylabel="Empirical quantiles")

        # Same limits on both axes, padded by a tenth of the joint range.
        both = np.concatenate((emp_q, block_max))
        lowest, highest = min(both), max(both)
        pad = (highest - lowest) / 10
        limits = [lowest - pad, highest + pad]

        ax.grid(True)

        plt.axis("square")
        ax.axis([*limits, *limits])

        if save:
            fig.tight_layout()
            plt.savefig("%s/plots/%s-qq.pdf" % (save_path, self.name),
                        bbox_inches="tight")
        plt.show()

        return theta
Ejemplo n.º 27
0
    def calculate_pvalue_gev(original_score: float, scores: list) -> float:
        """Return the fitted GEV probability mass below ``original_score``.

        A generalized extreme value distribution is fitted to ``scores`` and
        its density is integrated numerically from -inf to
        ``original_score``.

        >>> i = IntaRNApvalue(['-q', 'AGGAUG', '-t', 'UUUAUCGUU', '--scores', '10', '-m', 'b', '--threads', '3'])
        >>> i.calculate_pvalue_gev(-10.0, [-1.235, -1.435645, -6.234234, -12.999, -15.23, -6.98, -6.23, -2.78])
        0.17611816922560236
        """
        fitted_shape, fitted_loc, fitted_scale = gev.fit(scores)

        def density(value):
            return gev.pdf(value, fitted_shape,
                           loc=fitted_loc, scale=fitted_scale)

        # integ returns a sequence whose first entry is the integral value.
        integral = integ(density, -np.inf, original_score)
        return integral[0]
Ejemplo n.º 28
0
 def _process_block(self) -> None:
     """Close the current block and refresh the GEV-based estimates.

     The extremum of the block is appended to ``self.extrema``, the block is
     cleared and a GEV is fitted to all extrema collected so far. The method
     named by ``self.find_p`` is evaluated on the frozen fit at the overall
     extremum to give ``self.p``, from which ``self.return_period`` and
     ``self.return_time`` are derived.
     """
     assert len(self.block) == self.block_size

     # The finished block contributes one observation for the GEV fit.
     block_extremum = self.take_extremum(self.block)
     self.extrema.append(block_extremum)
     self.extremum = self.take_extremum(self.extrema)
     self.block.clear()

     fitted_params = gev.fit(self.extrema)
     self.dist = gev(*fitted_params)
     probability_of = getattr(self.dist, self.find_p)
     self.p = probability_of(self.extremum)

     per_block = self.block_size * self.p
     self.return_period = 1.0 / per_block  # type: ignore
     self.return_time = self.return_period * self.avg_t
     self.can_report = True
Ejemplo n.º 29
0
    def calc_match_statistics(self,
                              oligo,
                              charges,
                              modifications,
                              ms,
                              ppm_error,
                              score_to_test,
                              random_oligo_to_test=1000):
        """Estimate how unusual ``score_to_test`` is against random oligos.

        ``random_oligo_to_test`` random sequences with the same length as
        ``oligo`` are fragmented, matched against ``ms`` and scored with
        ``self.simple_score``. A ``genextreme`` distribution is fitted to
        those scores and the p-value is its survival function at
        ``score_to_test``, i.e. the fitted probability of a random score at
        least that large.

        Returns
        -------
        tuple
            ``(dist_df, p_value, score_to_test, extreme_to_plot)``: the
            DataFrame of random sequences and scores, the p-value, the
            tested score and the frozen fitted distribution.
        """

        fr = Fragmentor()
        matcher = Matcher()
        column_headers = ['Sequence', 'Score']

        min_char = len(oligo)
        max_char = len(oligo)
        allchar = ['A', 'G', 'C', 'T', 'U']

        data_to_save = []

        for _attempt in range(random_oligo_to_test):

            random_oligo = "".join(
                choice(allchar) for _ in range(randint(min_char, max_char)))

            fragments = fr.fragment_oligo(random_oligo)
            df_search_space = matcher.create_search_space(
                fragments, charges, modifications)
            df_results = matcher.match_oligo_fragments_pandas(
                df_search_space, ms, ppm_error)

            score = self.simple_score(df_results)

            print('Oligo: {0:<30} Score: {1:7.3f}'.format(random_oligo, score))

            data_to_save.append([random_oligo, score])

        dist_df = pd.DataFrame(data_to_save, columns=column_headers)

        c, loc, scale = genextreme.fit(dist_df.Score)
        print(("Extreme value fits c = {0}, loc = {1}, scale = {2}").format(
            c, loc, scale))

        extreme_to_plot = genextreme(c, loc, scale)
        # A p-value is a tail probability, so use the survival function;
        # the density (pdf) at the score is not a probability.
        p_value = extreme_to_plot.sf(score_to_test)
        print(("p value of score {0} = {1}").format(score_to_test, p_value))

        return dist_df, p_value, score_to_test, extreme_to_plot
Ejemplo n.º 30
0
    def gev_fit(var_fit):
        """Return return periods and values from a GEV fit to ``var_fit``.

        The fitted CDF is evaluated on 200 evenly spaced values between 0
        and 10; only values whose CDF exceeds 0.1 are kept.

        Returns
        -------
        tuple
            ``(ts, vv)`` where ``ts = 1 / (1 - cdf)`` and ``vv`` are the
            corresponding values.
        """
        grid = np.linspace(0, 10, 200)

        # starting shape of -0.1 for the optimiser
        shape, loc, scale = genextreme.fit(var_fit, -0.1)
        cdf = genextreme.cdf(grid, shape, loc, scale)

        keep = cdf > 0.1

        # TODO gev params 95% confidence intervals

        return 1 / (1 - cdf[keep]), grid[keep]
Ejemplo n.º 31
0
def doGev(dis, retPerYr):
    """Return GEV-based levels for every column of ``dis``.

    For each column with more than 15 non-NaN values a GEV is fitted to the
    negated series and the negated ``ppf`` at ``1 - 1/retPerYr`` is taken.
    The result is stored only when none of the levels is negative; all other
    rows stay NaN.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dis.shape[1], len(retPerYr))``.
    """
    prob = 1-1/retPerYr
    n_points = dis.shape[1]
    retLev = np.ones([n_points, len(retPerYr)])*np.nan

    for point in range(n_points):
        series = dis[:, point]
        series = series[~np.isnan(series)]
        if len(series) <= 15:
            continue  # too few observations for a fit

        shape, loc, scale = gev.fit(-series)
        levels = -gev.ppf(prob, shape, loc=loc, scale=scale)
        if sum(levels < 0) == 0:
            retLev[point, :] = levels

    return retLev
Ejemplo n.º 32
0
def rl_bootstrap(data, T=100, nsim=1000):
    """Return a list of bootstrapped return levels.

    :param data: list of input data
    :param T: timestep period, passed to ``RL``
    :param nsim: number of resample/fit rounds
    :return: list with one ``RL(T, loc, scale, shape)`` value per round
    """

    from scipy.stats import genextreme as gev

    def one_round():
        # Fit a GEV to a fresh resample and convert it to a return level.
        shape, loc, scale = gev.fit(resample(data))
        return RL(T, loc, scale, shape)

    return [one_round() for _ in range(0, nsim, 1)]
Ejemplo n.º 33
0
def plot_probability_density(annual_max, station_id):
    """Plot the fitted GEV density over a normalised histogram of ``annual_max``.

    The GEV is fitted to the sorted data with a starting shape of 0. The
    x-range extends 0.5 beyond the smallest and largest value, and the
    histogram uses 10 equal-width bins covering that whole range.

    Parameters
    ----------
    annual_max : sequence of float
        Annual maxima.
    station_id : str
        Prefix of the x-axis label.
    """
    xi, mu, sigma = genextreme.fit(sorted(annual_max), 0)
    min_x = min(annual_max)-0.5
    max_x = max(annual_max)+0.5
    x = np.linspace(min_x, max_x, num=100)
    y = genextreme.pdf(x, xi, loc=mu, scale=sigma)

    fig = plt.figure(figsize=(12,6))
    axes = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    xlabel = (station_id + " - Annual Max Wind Speed (m/s)")
    axes.set_title("Probability Density & Normalized Histogram")
    axes.set_xlabel(xlabel)
    axes.plot(x, y, color='Red')
    # 11 edges including max_x: an arange-based edge list stops one step
    # short and silently drops the largest observations from the histogram.
    bin_edges = np.linspace(min_x, max_x, 11)
    # `density` replaces the `normed` keyword that matplotlib removed.
    axes.hist(annual_max, bins=bin_edges, density=True, color='Yellow')
Ejemplo n.º 34
0
def CalculateEVDParameters (MaxSimilarities):
	"""Fit a GEV distribution to ``MaxSimilarities`` with SciPy.

	Every entry equal to 0 is overwritten with 1 *in place* before the fit,
	so the caller's sequence is modified.  Each entry is echoed to stdout
	(its value before and after that substitution), followed by the fitted
	parameters.

	Returns the 6-tuple ``(Xi, Mu, Sigma, eXi, eMu, eSigma)``: the absolute
	value of SciPy's shape parameter, the location, the scale, and three
	standard-error slots that are always 0 in this implementation.
	"""

	# Disabled rpy2 / R "ismev" implementation, kept for reference.
#	from rpy2.robjects.packages import importr 
#	import rpy2.rpy_classic as rpy
#	import rpy2.robjects as robjects
#	from rpy2.robjects import r
	# Create an R object of sorted scores
#	maxsims = robjects.FloatVector( sorted(MaxSimilarities) )
	# MLE Estimates using "ismev" package
#	r.library("ismev")
#	gev_fit = robjects.r['gev.fit'](maxsims)
	#Mu = location, Sigma = scale, Xi = shape
#	Mu    = gev_fit[6][0] 
#	Sigma = gev_fit[6][1]
#	Xi    = gev_fit[6][2]
	#Standard errors for the Mu, Sigma and Xi
#	eMu    = gev_fit[8][0]
#	eSigma = gev_fit[8][1]
#	eXi    = gev_fit[8][2]
#	print "baseR: ", Xi,Mu,Sigma,eXi,eMu,eSigma

	# Echo every score and replace exact zeros by 1 (mutates the argument).
	i = 0
	while i < len(MaxSimilarities):
		print i, " -- ", MaxSimilarities[i], " == ",
		if(MaxSimilarities[i] == 0):
			MaxSimilarities[i] = 1
		print MaxSimilarities[i]
		i += 1

	from scipy.stats import genextreme
	import warnings
	# Warnings raised while fitting are suppressed.
	with warnings.catch_warnings():
		warnings.simplefilter("ignore")
		gev_shape,gev_loc,gev_scale = genextreme.fit( sorted(MaxSimilarities) )	

	print "scipy: shape=", gev_shape, " loc=", gev_loc, " scale=", gev_scale

	# NOTE(review): abs() discards the sign of the fitted shape parameter;
	# confirm that downstream code expects a non-negative Xi.
	Xi     = abs(gev_shape)
	Mu     = gev_loc
	Sigma  = gev_scale
	# No standard errors are computed here; the slots are hard-coded to 0.
	eXi    = 0
	eMu    = 0
	eSigma = 0

	return (Xi,Mu,Sigma,eXi,eMu,eSigma)
Ejemplo n.º 35
0
def plot_return_values(annual_max, station_id):
    """Plot GEV return levels against return period for one station.

    A GEV distribution is fitted to ``annual_max``. The fitted return-level
    curve for return periods 1..499 is drawn in red on a logarithmic x-axis,
    and the observations are overlaid as blue dots at their empirical return
    periods (sample size divided by rank, largest value first).

    :param annual_max: sequence of numeric values
    :param station_id: used as the plot title
    """
    fig, axes = plt.subplots(figsize=(20, 6))
    T = np.r_[1:500]

    # genextreme.fit returns (shape, loc, scale); the 0 is only the starting
    # guess for the shape parameter.
    xi, mu, sigma = genextreme.fit(sorted(annual_max), 0)

    # Evaluate the curve with the fitted shape, so that it belongs to the
    # same distribution as the fitted location and scale.
    sT = genextreme.isf(1. / T, xi, mu, sigma)
    axes.semilogx(T, sT, 'r')

    # Empirical return period of the k-th largest observation: n / k.
    # float() keeps this a true division on Python 2 as well.
    ranks = np.r_[1:len(annual_max) + 1]
    n_obs = float(max(ranks))
    axes.plot(n_obs / ranks, sorted(annual_max)[::-1], 'bo')

    axes.set_title(station_id)
    axes.set_xlabel('Return Period (yrs)')
    axes.set_ylabel('Wind Speed (m/s)')
    axes.grid(True)
Ejemplo n.º 36
0
Archivo: gevfit.py Proyecto: guziy/RPN
def test_lm():
    """Print GEV parameter estimates for a hard-coded sample.

    The same sample is fitted with L-moments (lmoments3), with lmoments3's
    MLE, with the module's own optimiser (with and without L-moments) and
    with scipy.stats.genextreme. Everything is written to stdout; nothing
    is asserted or returned.
    """
    x = [
        360.228515625, 513.506103515625, 273.85031127929688,
        340.94839477539062, 244.13925170898438, 283.414306640625,
        394.42819213867188, 284.3604736328125, 281.26956176757812,
        241.46173095703125, 489.75482177734375, 236.31536865234375,
        407.55133056640625, 244.6295166015625, 432.40670776367188,
        260.501953125, 517.23052978515625, 317.6553955078125,
        407.61935424804688, 275.0709228515625, 330.369140625,
        285.92086791992188, 247.9954833984375, 344.34811401367188,
        379.55596923828125, 330.80569458007812, 312.35330200195312,
        251.79550170898438, 372.66928100585938, 239.72474670410156,
    ]

    #    print(get_initial_params_using_lm(x))
    print(np.mean(x))

    # Placeholder values; replaced by the L-moments fit further down.
    pars = [128.28104749, 578.4927539, 0.62410911]

    data = [
        588.4747314453125, 693.6640625, 519.03155517578125,
        716.58013916015625, 686.29168701171875, 432.65786743164062,
        682.72113037109375, 730.12603759765625, 698.971923828125,
        491.75332641601562, 597.258544921875, 487.13619995117188,
        482.33123779296875, 573.57861328125, 801.67169189453125,
        616.41668701171875, 690.954833984375, 671.31646728515625,
        680.87554931640625, 534.18414306640625, 427.86019897460938,
        236.22953796386719, 691.40972900390625, 599.84637451171875,
        545.3563232421875, 553.059814453125, 549.1295166015625,
        658.3983154296875, 719.122802734375, 636.84906005859375,
    ]

    import lmoments3
    from lmoments3 import distr

    ordered = sorted(data)

    # L-moments fit: the first five L-moment ratios feed the GEV estimator.
    ratios = lmoments3.lmom_ratios(ordered, 5)
    pars = distr.gev.lmom_fit(ordered, lmom_ratios=ratios)
    print("Fitted params using lmoments: ", pars)

    xi, mu, sigma = pars.values()
    # The project's helpers take the shape with the opposite sign.
    objective = objective_function_stationary_high([sigma, mu, -xi], data)
    print(objective)

    mle_pars = distr.gev.fit(ordered)
    print("Fitted using MLE: ", mle_pars)

    custom_plain = optimize_stationary_for_period(np.array(ordered))
    print("Fitted using custom method (Huziy et al 2013), not using l-moments: ",
          custom_plain)
    custom_lmom = optimize_stationary_for_period(np.array(ordered),
                                                 use_lmoments=True)
    print("Fitted using custom method (Huziy et al 2013), using l-moments: ",
          custom_lmom)

    from scipy.stats import genextreme

    scipy_pars = genextreme.fit(np.array(ordered))
    print("Fitted using scipy.stats.genextreme: ", scipy_pars)

    for shape_arg in (-xi, -0.5):
        level = get_high_ret_level_stationary([sigma, mu, shape_arg, 0], 10)
        print("10 year high flow return level: ", level)
Ejemplo n.º 37
0
    def CalculaParametros(self):
        """Fit the extreme-value distribution matching ``self.tipoSerie``.

        * ``'Parcial'``: a threshold is obtained from ``lp.LimiteParcial``,
          the partial series above it is extracted, and a generalized Pareto
          distribution is fitted to its peaks.
        * ``'Anual'``: the annual-maximum series is extracted and a GEV
          distribution is fitted to its peaks.

        The fitted parameters are printed and returned as the
        ``(shape, loc, scale)`` tuple produced by SciPy's ``fit``. For any
        other ``tipoSerie`` nothing is fitted and ``None`` is returned.
        """
        if self.tipoSerie == 'Parcial':
            # Find the threshold value.
            limite = lp.LimiteParcial(self.dadoSerie).AchaLimite(2)
            print(limite)
            Parciais = se.Series(self.dadoSerie).serieMaxParcial(limite)
            datasP, PicosParciais = se.Series(Parciais).separaDados()
            Parametro = genpareto.fit(PicosParciais)
            # '%f' (not '%.f'): the shape is usually a small fraction and
            # would otherwise be printed rounded to an integer.
            print('Parametros com Pareto: \nForma: %f, Localidade: %f, Escala: %f' %
                  (Parametro[0], Parametro[1], Parametro[2]))
            return Parametro
        elif self.tipoSerie == 'Anual':
            Anuais = se.Series(self.dadoSerie).serieMaxAnual()
            datasA, PicosAnuais = se.Series(Anuais).separaDados()
            Parametro = genextreme.fit(PicosAnuais)
            print('Parametros com Gev: \nForma: %f, Localidade: %f, Escala: %f' %
                  (Parametro[0], Parametro[1], Parametro[2]))
            return Parametro
Ejemplo n.º 38
0
def eventdistribution(data, per=[5,95], nsim=1000, rp = [ 10., 20., 50., 100., 200.,500., 1000. ], rp_scale_factor=1, white_noise=False):
    """
    Return a matrix with the rows
    (return period, lower percentile, return level, upper percentile).

    A GEV distribution is fitted to ``data``; for every return period the
    return level is computed with ``RL`` and its uncertainty band is taken
    from the bootstrap sample produced by ``rl_bootstrap``.

    :param data: values of the timeseries
    :param per: lower and upper percentile (``per[0]``, ``per[1]``) defining
                the uncertainty band
    :param nsim: number of bootstrap recalculations per return period
    :param rp: list of return periods
    :param rp_scale_factor: factor each entry of ``rp`` is multiplied by
                            before the return level is computed; the first
                            row of the result keeps the unscaled ``rp``
    :param white_noise: add white noise (a random number between 0 and
                        std/10) to every value, for singular timeseries
    :return: array with four rows and one column per entry of ``rp``
    """

    from scipy.stats import genextreme as gev
    from numpy import percentile, vstack

    if white_noise:
        noise_width = std(data)/10
        data = [n + uniform(0, noise_width) for n in data]

    # gev.fit returns (shape, loc, scale)
    shape, loc, scale = gev.fit(data)

    lower_pct, upper_pct = per[0], per[1]

    rl = []
    per_low = []
    per_high = []

    for period in rp:
        # Scale each period individually: `rp * factor` on a list would
        # repeat the list (or fail for a float factor) instead of scaling.
        T = period * rp_scale_factor

        rl.append(RL(T, loc, scale, shape))

        RL_bt = rl_bootstrap(data, T=T, nsim=nsim)
        per_low.append(percentile(RL_bt, lower_pct))
        per_high.append(percentile(RL_bt, upper_pct))

    rl_c = vstack((rp, per_low, rl, per_high))

    return (rl_c)
# <codecell>

# `yx` is defined outside this excerpt; presumably the annual maxima
# computed in an earlier cell (confirm).
annual_max_levels = yx

# <headingcell level=4>

# Fit data to GEV distribution

# <codecell>

def sea_levels_gev_pdf(x):
    """Return the GEV probability density at ``x``.

    Reads the module-level ``xi`` (shape), ``mu`` (location) and ``sigma``
    (scale), which are assigned by the fitting cell below; that cell must
    run before this function is called.
    """
    return genextreme.pdf(x, xi, loc=mu, scale=sigma)

# <codecell>

# genextreme.fit returns (shape, loc, scale); the second positional
# argument (0) is the starting guess for the shape parameter.
mle = genextreme.fit(sorted(annual_max_levels), 0)
mu = mle[1]
sigma = mle[2]
xi = mle[0]
# Note: `mu` is the fitted location parameter, although the message below
# calls it "mean".
print "The mean, sigma, and shape parameters are %s, %s, and %s, resp." % (mu, sigma, xi)

# <headingcell level=4>

# Probability Density Plot

# <codecell>

# Evaluate the fitted density on 100 points spanning the data range padded
# by 0.5 on each side.
min_x = min(annual_max_levels)-0.5
max_x = max(annual_max_levels)+0.5
x = np.linspace(min_x, max_x, num=100)
y = [sea_levels_gev_pdf(z) for z in x]
Ejemplo n.º 40
0
# Convert every annual maximum to float. `data_levels` is created outside
# this excerpt; the values are appended to whatever it already holds.
for i in annual_max:
    data_levels.append(float(i))
annual_max = data_levels    

# <headingcell level=4>

# Fit data to GEV distribution

# <codecell>

def gev_pdf(x):
    """Return the GEV probability density at ``x``.

    Reads the module-level ``xi`` (shape), ``mu`` (location) and ``sigma``
    (scale), which are assigned by the fitting cell below; that cell must
    run before this function is called.
    """
    return genextreme.pdf(x, xi, loc=mu, scale=sigma)

# <codecell>

# genextreme.fit returns (shape, loc, scale); the second positional
# argument (0) is the starting guess for the shape parameter.
mle = genextreme.fit(sorted(annual_max), 0)
mu = mle[1]
sigma = mle[2]
xi = mle[0]
# Note: `mu` is the fitted location parameter, although the message below
# calls it "mean".
print "The mean, sigma, and shape parameters are %s, %s, and %s, resp." % (mu, sigma, xi)

# <headingcell level=4>

# Probability Density Plot

# <codecell>

# NOTE(review): the plotting range is taken from `annual_max_levels`, while
# the fit above uses `annual_max`. `annual_max_levels` is not assigned in
# this excerpt; confirm it exists and that using it here is intended.
min_x = min(annual_max_levels)-0.5
max_x = max(annual_max_levels)+0.5
x = np.linspace(min_x, max_x, num=100)
y = [gev_pdf(z) for z in x]
Ejemplo n.º 41
0
def create_gev_models():
    """
    Fit a GEV distribution to each extreme-value file and save the fitted
    parameters to the database.

    Three groups of files are processed: the per-software ``.evd`` files
    for every mice-per-strain setting, the emmax strain-number sweep and
    the emmax environmental-variance sweep. The first two groups are
    written with one ``bulk_create`` call, the third with a second one.

    Always returns 0.
    """

    def fit_gev(path, negate_log):
        # Load the values, optionally map them through -log, and fit with
        # -1 as shape start value and the sample mean as location start.
        values = np.genfromtxt(path)
        if negate_log:
            values = -np.log(values)
        return genextreme.fit(values, -1, loc=values.mean())

    def build_row(software, mouse_per, fitted, strains, var_env):
        shape, location, scale = fitted
        return exp.GevModelParam(
            software=software, mouse_per_strain=mouse_per,
            location=location, scale=scale, shape=shape,
            strains=strains, var_env=var_env)

    # --- per-software models, 150 strains --------------------------------
    evd_root = '/home/dtgillis/ccsim_workspace/evd/'
    sub_dirs = {'inf': 'CC_0_0_0_150/',
                '1': 'CC_0_0_0_150.1/',
                '5': 'CC_0_0_0_150.5/',
                '10': 'CC_0_0_0_150.10/'}
    rows = []
    for mouse_per in ['inf', '1', '5', '10']:
        for software in exp.Software.objects.all():
            # htree is skipped for 5 and 10 mice per strain
            if software.name == 'htree' and mouse_per in ('5', '10'):
                continue
            path = evd_root + sub_dirs[mouse_per] + software.name + '.evd'
            # only plink and emmax values go through -log
            fitted = fit_gev(path, software.name in ['plink', 'emmax'])
            rows.append(build_row(software, mouse_per, fitted, 150, .25))

    # --- additive models with large strain numbers ------------------------
    strain_root = '/home/dtgillis/ccsim_workspace/evd/strain_sweep'
    for mouse_per in ['inf']:
        for software in exp.Software.objects.filter(name='emmax'):
            for strain_num in [300, 450, 900]:
                path = (strain_root + os.sep + 'CC_0_0_0_' + str(strain_num)
                        + '.' + software.name + '.top')
                fitted = fit_gev(path, True)
                rows.append(
                    build_row(software, mouse_per, fitted, strain_num, .25))

    exp.GevModelParam.objects.bulk_create(rows)

    # --- environmental-variance sweep -------------------------------------
    env_root = '/home/dtgillis/ccsim_workspace/evd/env_sweep'
    rows = []
    for mouse_per in ['inf', '1', '5', '10']:
        for var_env in [.05, .50]:
            for software in exp.Software.objects.filter(name='emmax'):
                prefix = env_root + os.sep + 'CC_0_0_0_' + str(int(var_env * 100))
                if mouse_per == 'inf':
                    path = prefix + '_150.emmax.top'
                else:
                    path = prefix + '_150.' + mouse_per + '.emmax.top'
                fitted = fit_gev(path, True)
                rows.append(
                    build_row(software, mouse_per, fitted, 150, var_env))

    exp.GevModelParam.objects.bulk_create(rows)

    return 0
Ejemplo n.º 42
0
# <codecell>

# Collect the values of `annual_max_dict` (built outside this excerpt)
# into a list.
annual_max = list(annual_max_dict.values()) 

# <markdowncell>

# ### Fit observation data to GEV distribution

# <codecell>

def gev_pdf(x):
    """Return the GEV probability density at ``x``.

    Reads the module-level ``xi`` (shape), ``mu`` (location) and ``sigma``
    (scale), which are assigned by the fitting cell below; that cell must
    run before this function is called.
    """
    return genextreme.pdf(x, xi, loc=mu, scale=sigma)

# <codecell>

# genextreme.fit returns (shape, loc, scale); the second positional
# argument (0) is the starting guess for the shape parameter.
mle = genextreme.fit(sorted(annual_max), 0)
mu = mle[1]
sigma = mle[2]
xi = mle[0]
# Note: `mu` is the fitted location parameter, although the message below
# calls it "mean".
print "The mean, sigma, and shape parameters are %s, %s, and %s, resp." % (mu, sigma, xi)

# <markdowncell>

# ### Probability Density Plot

# <codecell>

# Evaluate the fitted density on 100 points spanning the data range padded
# by 0.5 on each side.
min_x = min(annual_max)-0.5
max_x = max(annual_max)+0.5
x = np.linspace(min_x, max_x, num=100)
y = [gev_pdf(z) for z in x]
def FAP(R_max, K, L, n, fap_levels):
	"""Evaluate the fitted GEV level(s) for the given ``fap_levels``.

	A GEV distribution is fitted to ``R_max`` and the function returns
	``mu - sig/eps * (1 - (-log(K*L / (fap_levels*n)))**(-eps))``, where
	``mu`` and ``sig`` are the fitted location and scale and ``eps`` is the
	fitted shape with its sign flipped relative to SciPy's ``c``.

	:param R_max: sample the GEV distribution is fitted to
	:param K: factor in the numerator of the log argument
	:param L: factor in the numerator of the log argument
	:param n: factor in the denominator of the log argument
	:param fap_levels: scalar or array; presumably false-alarm
		probabilities (confirm against the caller)
	:return: value(s) of the expression above, same shape as ``fap_levels``
	"""
	# The shape start value (-0.2) must be passed positionally; SciPy's fit
	# only accepts `loc` and `scale` as keyword start values and rejects an
	# unknown `c=` keyword.
	epsilon, mu, sig = gev.fit(R_max, -0.2, loc=3.e-4, scale=1)
	#fap_levels = 1./fap_levels
	epsilon = -epsilon
	return mu - sig/epsilon*(1-(-log(K*L/(fap_levels*n)))**(-epsilon))