from scipy.stats import norm, genpareto


def HybridNormalGPDCDF(xs, u, mu, sigma, shape, loc, scale):
    '''
    Params:
        xs: unsorted list of data to fit the semi-parametric CDF to.
        u: threshold at which to switch from the Gaussian CDF fit in the
           center to GPD tail fitting.
        mu: mean of the data.
        sigma: standard deviation of the data.
        shape: GPD least-squares estimated shape parameter.
        loc: GPD least-squares estimated location parameter.
        scale: GPD least-squares estimated scale parameter.
    Returns:
        an array equivalent to xs.apply(semiparametric_fittedfunction), i.e.
        F_n(xs) where F_n is the fitted CDF.
    '''
    out = list()
    # mirror the threshold around the mean to get lower (l) and upper (h) cutoffs
    l = mu - abs(u - mu)
    h = mu + abs(u - mu)
    #print('u = %.10f,l = %.10f,h = %.10f'%(u,l,h))
    for x in xs:
        if x < l:
            nrm = norm.cdf(l, mu, sigma)
            out.append(nrm *
                       (1 - genpareto.cdf(l - x, shape, loc=loc, scale=scale)))
        elif x >= h:
            nrm = norm.cdf(h, mu, sigma)
            out.append((1 - nrm) *
                       genpareto.cdf(x - h, shape, loc=loc, scale=scale) + nrm)
        else:
            out.append(norm.cdf(x, mu, sigma))
    return out
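A minimal usage sketch for the hybrid CDF above. The least-squares GPD estimates the docstring assumes are not shown here, so scipy's MLE fit of the upper-tail exceedances stands in for them; the synthetic data and threshold choice are illustrative assumptions.

import numpy as np
from scipy.stats import genpareto

rng = np.random.default_rng(0)
data = rng.normal(0.0, 1.0, 5000)
mu, sigma = data.mean(), data.std()
u = np.quantile(data, 0.95)            # upper-tail threshold
excesses = data[data > u] - u          # exceedances over u
shape, loc, scale = genpareto.fit(excesses, floc=0)  # MLE stand-in

fitted = HybridNormalGPDCDF(data, u, mu, sigma, shape, loc, scale)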
Example 2
    def get_pvalue(sorted_scores, stat, n):
        # approximate the gpd tail
        n_exceed = 250
        is_gpd_fitted = False
        while n_exceed >= 10:
            exceedances = sorted_scores[:n_exceed]
            # check if the n_exceed largest permutation values follow GPD
            #   with Anderson-Darling goodness-of-fit test
            try:
                ad = eva.gpdAd(FloatVector(exceedances))
                ad_pval = ad.rx2('p.value')[0]
            except Exception:
                n_exceed -= 10
                continue
            # H0 = exceedances come from a GPD
            if ad_pval > 0.05:
                is_gpd_fitted = True
                break
            n_exceed -= 10
        if not is_gpd_fitted:
            #print('GPD good fit is never reached - use ECDF instead...')
            return None
        # compute the exceedance threshold t
        t = float((sorted_scores[n_exceed] + sorted_scores[n_exceed - 1]) / 2)
        # estimate shape and scale params with maximum likelihood
        gpd_fit = eva.gpdFit(FloatVector(sorted_scores),
                             threshold=t,
                             method='mle')
        scale, shape = gpd_fit.rx2('par.ests')[0], gpd_fit.rx2('par.ests')[1]

        # compute the GPD p-value for the observed statistic
        f_gpd = genpareto.cdf(x=stat - t, c=shape, scale=scale)
        return n_exceed / n * (1 - f_gpd)
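The return value implements the tail approximation p ≈ (M/n) * (1 - F_GPD(stat - t)) with M exceedances above the threshold t. A pure-SciPy sketch of the same idea, with a made-up helper name and without the Anderson-Darling screen that the rpy2/eva version performs; it assumes more than n_exceed permutation scores:

import numpy as np
from scipy.stats import genpareto

def gpd_tail_pvalue(perm_scores, stat, n_exceed=250):
    scores = np.sort(perm_scores)[::-1]           # descending
    t = (scores[n_exceed] + scores[n_exceed - 1]) / 2
    excesses = scores[:n_exceed] - t              # exceedances over t
    shape, _, scale = genpareto.fit(excesses, floc=0)
    f_gpd = genpareto.cdf(stat - t, shape, scale=scale)
    return n_exceed / len(scores) * (1 - f_gpd)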
Example 3
def gpd_ad(x, tp):
    # Anderson-Darling goodness-of-fit statistic for a GPD fitted to the
    # excesses of x over the tail-probability threshold tp
    u, y = get_excesses(x, tp)
    xi, sigma = gpd_fit(y)
    z = genpareto.cdf(y, xi, 0, sigma)
    z = np.sort(z)
    n = len(z)
    i = np.linspace(1, n, n)
    stat = -n - (1 / n) * np.sum(
        (2 * i - 1) * (np.log(z) + np.log1p(-z[::-1])))
    return u, stat, xi, sigma
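The stat line is the usual Anderson-Darling formula A^2 = -n - (1/n) * sum_i (2i - 1) * [ln z_i + ln(1 - z_{n+1-i})]. Since get_excesses and gpd_fit are project helpers, this self-check inlines the steps on data that really is GPD; under the true parameters A^2 averages about 1.

import numpy as np
from scipy.stats import genpareto

rng = np.random.default_rng(1)
y = genpareto.rvs(0.2, scale=1.0, size=500, random_state=rng)
z = np.sort(genpareto.cdf(y, 0.2, 0, 1.0))
n = len(z)
i = np.arange(1, n + 1)
a2 = -n - np.mean((2 * i - 1) * (np.log(z) + np.log1p(-z[::-1])))
print(a2)   # averages about 1 when the fitted model is correct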
Example 4

    def _margin_tail_cdf(self, x, i):
        # CDF of the GP approximation for component i
        # (no need to weight it by p; that's done elsewhere)
        if self.shapes[i] != 0:
            return gp.cdf(x,
                          c=self.shapes[i],
                          loc=self.u[i],
                          scale=self.scales[i])
        else:
            # shape == 0 is the exponential limit of the GPD
            return expdist.cdf(x, loc=self.u[i], scale=self.scales[i])
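Here gp and expdist are presumably the module's aliases for scipy's genpareto and expon; the else branch exists because the GPD with shape 0 reduces to the exponential distribution. A quick check of that limit (scipy handles c = 0 exactly):

import numpy as np
from scipy.stats import genpareto, expon

x = np.linspace(0.0, 5.0, 6)
assert np.allclose(genpareto.cdf(x, c=0.0, loc=0.0, scale=2.0),
                   expon.cdf(x, loc=0.0, scale=2.0))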
Example 5
    def EstimaProbabilidade(self, Magnitude, Parametros):
        # partial-duration series use the GPD; annual-maximum series use the GEV
        if self.tipoSerie == 'Parcial':
            probabilidade = genpareto.cdf(Magnitude, Parametros[0],
                                        loc = Parametros[1],
                                        scale = Parametros[2])

        elif self.tipoSerie == 'Anual':
            probabilidade = genextreme.cdf(Magnitude, Parametros[0],
                                            loc = Parametros[1],
                                            scale = Parametros[2])
        return probabilidade
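A hypothetical call for the partial-duration ('Parcial') branch, with made-up (shape, loc, scale) values in scipy's parameter order; the class plumbing around tipoSerie is assumed to be set up elsewhere.

from scipy.stats import genpareto

Parametros = (0.1, 50.0, 12.0)   # illustrative shape, loc, scale
prob = genpareto.cdf(80.0, Parametros[0],
                     loc=Parametros[1], scale=Parametros[2])
print(prob)   # estimated non-exceedance probability of a magnitude of 80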
Example 6

def survival_function(sample, threshold, fit_method,
                      alpha):  # plot the survival function (1 - CDF)
    [shape, scale, sample, sample_excess,
     sample_over_thresh] = gpdfit(sample, threshold, fit_method)

    n = len(sample_over_thresh)
    y_surv = 1 - np.arange(1, n + 1) / n

    i_initial = 0

    n = len(sample)
    for i in range(0, n):
        if sample[i] > threshold + 0.0001:
            i_initial = i
            break
    # Confidence band half-width from the Dvoretzky-Kiefer-Wolfowitz inequality
    e = (mt.log(2 / alpha) / (2 * len(sample_over_thresh)))**0.5
    F1 = []
    F2 = []
    for i in range(i_initial, len(sample)):
        F1.append(y_surv[i - i_initial] - e)
        F2.append(y_surv[i - i_initial] + e)

    x_points = np.arange(0, max(sample), 0.001)
    surv_func = 1 - genpareto.cdf(x_points, shape, loc=threshold, scale=scale)

    #Plotting survival function
    plt.figure(9)
    plt.plot(x_points,
             surv_func,
             color='black',
             label='Theoretical Survival Function')
    plt.xlabel('Data')
    plt.ylabel('Survival Function')
    plt.title('Data Survival Function Plot')
    plt.scatter(sorted(sample_over_thresh),
                y_surv,
                label='Empirical Survival Function')
    plt.plot(sorted(sample_over_thresh),
             F1,
             linestyle='--',
             color='red',
             alpha=0.8,
             lw=0.9,
             label='Dvoretzky–Kiefer–Wolfowitz Confidence Bands')
    plt.plot(sorted(sample_over_thresh),
             F2,
             linestyle='--',
             color='red',
             alpha=0.8,
             lw=0.9)
    plt.legend()
    plt.show()
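The band half-width used above comes from the Dvoretzky-Kiefer-Wolfowitz inequality: for m tail points and confidence level 1 - alpha, eps = sqrt(ln(2/alpha) / (2m)). A quick worked value:

import math

m, alpha = 200, 0.05
eps = math.sqrt(math.log(2 / alpha) / (2 * m))
print(round(eps, 3))   # 0.096: the band sits ~0.096 above and below the ECDF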
Example 7
def GeneralizedPareto_CDF(x):
    '''
    Generalized Pareto fit.
    Returns the cumulative probability function evaluated at x.
    '''

    # fit a generalized pareto and get params (the fitted loc is discarded,
    # so the cdf below is evaluated with the default loc = 0)
    shape, _, scale = genpareto.fit(x)

    # get generalized pareto CDF
    cdf = genpareto.cdf(x, shape, scale=scale)

    return cdf
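Note that genpareto.fit also estimates a location parameter, which this snippet discards while evaluating the cdf with the default loc=0. If x holds excesses over a threshold, pinning loc at fit time keeps the fit and the cdf call consistent; a sketch under that assumption, with synthetic data standing in for x:

import numpy as np
from scipy.stats import genpareto

x = genpareto.rvs(0.2, scale=1.0, size=1000, random_state=0)  # synthetic excesses
shape, loc, scale = genpareto.fit(x, floc=0)
cdf = genpareto.cdf(x, shape, loc=loc, scale=scale)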
Example 8

def HybridSemiParametricGPDCDF(xs, u, ydata, shape, loc, scale):
    '''
    Params:
        xs: unsorted list of data to fit the semi-parametric CDF to.
        u: threshold at which to switch from Gaussian kernel estimation to
           GPD tail fitting.
        ydata: data used to build the kernel-smoothed CDF in the center.
        shape: GPD least-squares estimated shape parameter.
        loc: GPD least-squares estimated location parameter.
        scale: GPD least-squares estimated scale parameter.
    Returns:
        an array equivalent to xs.apply(semiparametric_fittedfunction), i.e.
        F_n(xs) where F_n is the fitted CDF.
    '''
    #print("Starting Canonical Maximum Likelihood")
    out = list()
    mu = mean(ydata)
    # mirror the threshold around the mean to get lower (l) and upper (h) cutoffs
    l = mu - abs(u - mu)
    h = mu + abs(u - mu)
    #print('u = %.10f,l = %.10f,h = %.10f'%(u,l,h))
    srtdxs = sorted(list(xs) + [l, h])
    bandwidth = 0.2
    cdf_smoother, bandwidth = kde_statsmodels_m_cdf_output(ydata,
                                                           srtdxs,
                                                           bandwidth=bandwidth)
    d = dict(zip(srtdxs, cdf_smoother))

    for x in xs:
        if x < l:
            nrm = d[l]
            out.append(nrm *
                       (1 - genpareto.cdf(l - x, shape, loc=loc, scale=scale)))
        elif x >= h:
            nrm = d[h]
            out.append((1 - nrm) *
                       genpareto.cdf(x - h, shape, loc=loc, scale=scale) + nrm)
        else:
            out.append(d[x])
    return xs, out, srtdxs, cdf_smoother, bandwidth
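kde_statsmodels_m_cdf_output is a project helper not shown here. A plausible stand-in built on statsmodels' KDEMultivariate (an assumption based on the helper's name, not the author's implementation) that matches the (values, bandwidth) return shape used above:

import statsmodels.api as sm

def kde_statsmodels_m_cdf_output(ydata, points, bandwidth=0.2):
    # kernel-smoothed CDF evaluated at the given points; returns the
    # smoothed values and the bandwidth actually used
    kde = sm.nonparametric.KDEMultivariate(data=ydata, var_type='c',
                                           bw=[bandwidth])
    return kde.cdf(points), kde.bw[0]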
Example 9

def gpdcdf(sample, threshold, fit_method,
           alpha):  # plot the GPD CDF with empirical points
    [shape, scale, sample, sample_excess,
     sample_over_thresh] = gpdfit(sample, threshold, fit_method)  #fit the data

    n = len(sample_over_thresh)
    y = np.arange(1, n + 1) / n  #empirical probabilities

    i_initial = 0
    n = len(sample)
    for i in range(0, n):
        if sample[i] > threshold + 0.0001:
            i_initial = i
            break

    # Confidence band half-width from the Dvoretzky-Kiefer-Wolfowitz
    # inequality, based on the empirical points
    e = (mt.log(2 / alpha) / (2 * len(sample_over_thresh)))**0.5
    F1 = []
    F2 = []
    for i in range(i_initial, len(sample)):
        F1.append(y[i - i_initial] - e)
        F2.append(y[i - i_initial] + e)

    x_points = np.arange(0, max(sample),
                         0.001)  #generating points to apply in the cdf
    cdf = genpareto.cdf(x_points, shape, loc=threshold,
                        scale=scale)  #getting theoretical cdf

    #Plotting cdf
    plt.figure(7)
    plt.plot(x_points, cdf, color='black', label='Theoretical CDF')
    plt.xlabel('Data')
    plt.ylabel('CDF')
    plt.title('Data Cumulative Distribution Function')
    plt.scatter(sorted(sample_over_thresh), y, label='Empirical CDF')
    plt.plot(sorted(sample_over_thresh),
             F1,
             linestyle='--',
             color='red',
             alpha=0.8,
             lw=0.9,
             label='Dvoretzky–Kiefer–Wolfowitz Confidence Bands')
    plt.plot(sorted(sample_over_thresh),
             F2,
             linestyle='--',
             color='red',
             alpha=0.8,
             lw=0.9)
    plt.legend()
    plt.show()
Example 10

def ppplot(sample, threshold, fit_method,
           alpha):  # probability-probability plot to diagnose the model
    [shape, scale, sample, sample_excess,
     sample_over_thresh] = gpdfit(sample, threshold, fit_method)  #fit the data
    n = len(sample_over_thresh)
    #Getting empirical probabilities
    y = np.arange(1, n + 1) / n
    #Getting theoretical probabilities
    cdf_pp = genpareto.cdf(sample_over_thresh,
                           shape,
                           loc=threshold,
                           scale=scale)

    #Getting Confidence Intervals using the Dvoretzky–Kiefer–Wolfowitz method
    i_initial = 0
    n = len(sample)
    for i in range(0, n):
        if sample[i] > threshold + 0.0001:
            i_initial = i
            break
    e = (mt.log(2 / alpha) / (2 * len(sample_over_thresh)))**0.5
    F1 = []
    F2 = []
    for i in range(i_initial, len(sample)):
        F1.append(y[i - i_initial] - e)
        F2.append(y[i - i_initial] + e)

    #Plotting PP
    plt.figure(6)
    sns.regplot(x=y,
                y=cdf_pp,
                ci=None,
                line_kws={
                    'color': 'black',
                    'label': 'Regression Line'
                })
    plt.plot(y,
             F1,
             linestyle='--',
             color='red',
             alpha=0.5,
             lw=0.8,
             label='Dvoretzky–Kiefer–Wolfowitz Confidence Bands')
    plt.plot(y, F2, linestyle='--', color='red', alpha=0.5, lw=0.8)
    plt.legend()
    plt.title('P-P Plot')
    plt.xlabel('Empirical Probability')
    plt.ylabel('Theoretical Probability')
    plt.show()
Example 11
    def EstimaFrequencias(self, Parametros):
        # partial-duration series: fit frequencies with the GPD;
        # annual-maximum series: with the GEV
        if self.tipoSerie == 'Parcial':
            limite = lp.LimiteParcial(self.dadoSerie).AchaLimite(2)
            Parciais = se.Series(self.dadoSerie).serieMaxParcial(limite)
            datasP, PicosParciais = se.Series(Parciais).separaDados()
            PicosParciais.sort(reverse = True)
            print(PicosParciais)
            frequencias = genpareto.cdf(PicosParciais, Parametros[0],
                                        loc = Parametros[1],
                                        scale = Parametros[2])

        elif self.tipoSerie == 'Anual':
            Anuais = se.Series(self.dadoSerie).serieMaxAnual()
            datasA, PicosAnuais = se.Series(Anuais).separaDados()
            PicosAnuais.sort(reverse = True)
            print(PicosAnuais)
            frequencias = genextreme.cdf(PicosAnuais, Parametros[0],
                                            loc = Parametros[1],
                                            scale = Parametros[2])
        return frequencias
Example 12
def extremal_distribution_fit(data,
                              var_name,
                              sample,
                              threshold,
                              fit_type,
                              x_min,
                              x_max,
                              n_points,
                              loc=None,
                              scale=None,
                              cumulative=True):
    # Initialization of the output variables
    param = None
    x = None
    y = None
    y_rp = None

    if fit_type == 'gpd':
        # Fit the exceedances over threshold to Generalized Pareto distribution
        param = generalized_pareto_distribution_fit(sample, threshold, loc,
                                                    scale)

        # Calculate the pdf and/or cdf
        x = np.linspace(x_min, x_max, n_points)

        if cumulative:
            y = genpareto.cdf(x, param[0], param[1], param[2])

            # Calculate the number of extreme peaks per year
            n_peaks_year = len(sample) / len(
                data[var_name].index.year.unique())
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genpareto.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'coles':
        # Fit the exceedances over threshold to Generalized Pareto distribution
        param = generalized_pareto_distribution_fit(sample, threshold, loc,
                                                    scale)

        x = np.arange(1, 501)
        u = param[1]
        sigma = param[2]
        xi = param[0]

        # Mean number of observations per year
        n_y = len(data[var_name]) / len(data[var_name].index.year.unique())
        # Proportion of observations that exceed the threshold
        z_u = len(sample) / len(data[var_name])
        # n_y * z_u is then the mean number of POT events per year
        y_rp = u + (sigma / xi) * (((x * n_y * z_u)**xi) - 1)

    elif fit_type == 'gev':
        param = generalized_extreme_value_distribution_fit(sample, loc, scale)

        # Calculate the pdf and/or cdf
        x = np.linspace(x_min, x_max, n_points)

        if cumulative:
            y = genextreme.cdf(x, param[0], param[1], param[2])

            # Calculate the number of extreme peaks per year
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            # the pdf for the GEV fit uses genextreme, matching the cdf branch
            y = genextreme.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'poisson':
        # Calculate the pdf and/or cdf
        x = np.linspace(x_min, x_max, n_points)

        # Fit the exceedances over threshold to Generalized Pareto distribution
        gpd_param = generalized_pareto_distribution_fit(
            sample, threshold, loc, scale)

        # Poisson parameter (number of extreme events per year)
        poisspareto_param = len(sample) / len(
            data[var_name].index.year.unique())
        # Poisson pareto parameters
        poisspareto_param = [
            poisspareto_param, gpd_param[0], gpd_param[2], gpd_param[1]
        ]
        # Equivalent gev parameters
        param = [0, 0, 0]
        param[0] = -poisspareto_param[1]
        param[1] = poisspareto_param[2] * (poisspareto_param[0]**
                                           poisspareto_param[1])
        param[2] = poisspareto_param[3] + (
            (poisspareto_param[2] / poisspareto_param[1]) *
            ((poisspareto_param[0]**poisspareto_param[1]) - 1))

        if cumulative:
            y = genextreme.cdf(x, param[0], param[2], param[1])

            # Calculate the number of extreme peaks per year
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genextreme.pdf(x, param[0], param[2], param[1])

    return param, x, y, y_rp
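The 'poisson' branch converts Poisson-GPD parameters to an equivalent GEV (scipy's genextreme takes c = -xi). A sanity check of that conversion with illustrative parameter values: the annual-maximum CDF exp(-lambda * (1 - F_GPD(x))) should match the converted GEV CDF.

import numpy as np
from scipy.stats import genpareto, genextreme

lam, xi, sigma, u = 4.0, 0.2, 1.5, 10.0
x = np.linspace(11.0, 25.0, 50)

poisson_gpd = np.exp(-lam * (1 - genpareto.cdf(x, xi, loc=u, scale=sigma)))
loc_eq = u + (sigma / xi) * (lam**xi - 1)       # equivalent GEV location
scale_eq = sigma * lam**xi                      # equivalent GEV scale
gev = genextreme.cdf(x, -xi, loc=loc_eq, scale=scale_eq)
assert np.allclose(poisson_gpd, gev)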
Example 13
                    fit = genpareto.fit(poted_values, floc=poted_values[-1])
                    fit = genpareto.fit(poted_values,
                                        floc=fit[1],
                                        fscale=fit[2])
                    if j == 0:
                        #print(fit[2])
                        mu_check.append(poted_values[-1])
                    gamma = fit[0]
                    mu = fit[1]

                    sigma = fit[2]
                    #gpd_params_dict[str(j + 1)]["gamma"].append(gamma)
                    #gpd_params_dict[str(j + 1)]["mu"].append(mu[0])
                    #gpd_params_dict[str(j + 1)]["sigma"].insert(sigma)
                    if dw[i, j] >= fit[1]:
                        hpp[i - gev_window, j] = 1 - genpareto.cdf(
                            dw[i, j], fit[0], fit[1], fit[2]) + 1e-50
                        #gpd_params[j].append(fit)

        totalhpp1 = -np.log10(np.prod(hpp, axis=1))
        min_index = np.argmax(totalhpp1)
        le = pa.detection.learning_entropy(w, m=1200, order=1)

        snr[seed_counter] = 10 * np.log10(
            (np.std(desired_output[gev_window:])**2) / (noise_sigma**2))
        # print(Fore.RED + "experiment number: " + str(seed_counter))
        # print(Fore.GREEN + "SNR: " + (str(snr[seed_counter])))
        # print(Fore.BLACK + "min_index GPD: " + str(min_index))
        if min_index > 199 and min_index < 211:
            gpd_result[seed_counter] = 1

        max_index_elbnd = np.argmax(elbnd[-400:])
Example 14
def genpareto_gradient_cdf(x, c, scale):
    """Gradient of the Generalized Pareto distribution function w.r.t. the
    shape and scale parameters

    :param x: array_like
        quantiles
    :param c: positive number
        shape parameter
    :param scale: positive number
        scale parameter (default=1)
    :return: (2 x n)-matrix where n is the size of x
        The first row is the gradient of the cdf w.r.t. the shape parameter
        evaluated at x; the second row is the gradient of the cdf w.r.t. the
        scale parameter evaluated at x
    """

    output = np.zeros(shape=(2, x.size))

    # the gradient is nonzero only inside the support, where 1 + c*x/scale > 0
    cond = 0 < (1 + c * x / scale)

    # scale must be passed by keyword: a third positional argument to scipy's
    # genpareto.cdf/pdf would be read as loc, not scale
    output[0] = np.where(
        cond,
        (-1 / c**2 * np.log(1 + c * x / scale) + x / (c * (scale + c * x)))
        * (1 - genpareto.cdf(x, c, scale=scale)),
        0)
    output[1] = -x / scale * genpareto.pdf(x, c, scale=scale)

    return output
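A finite-difference check of the analytic gradient, assuming the function above (with the keyword scale fix) is in scope; central differences with step h should agree to well within the tolerance.

import numpy as np
from scipy.stats import genpareto

x = np.array([0.5, 1.0, 2.0])
c, scale, h = 0.3, 1.5, 1e-6

grad = genpareto_gradient_cdf(x, c, scale)
fd_shape = (genpareto.cdf(x, c + h, scale=scale)
            - genpareto.cdf(x, c - h, scale=scale)) / (2 * h)
fd_scale = (genpareto.cdf(x, c, scale=scale + h)
            - genpareto.cdf(x, c, scale=scale - h)) / (2 * h)
assert np.allclose(grad[0], fd_shape, atol=1e-5)
assert np.allclose(grad[1], fd_scale, atol=1e-5)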
Example 15
import numpy as np
from scipy.stats import genpareto
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1)

c = 0.1
mean, var, skew, kurt = genpareto.stats(c, moments='mvsk')
x = np.linspace(genpareto.ppf(0.01, c), genpareto.ppf(0.99, c), 100)
ax.plot(x, genpareto.pdf(x, c), 'r-', lw=5, alpha=0.6, label='genpareto pdf')
rv = genpareto(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
vals = genpareto.ppf([0.001, 0.5, 0.999], c)
np.allclose([0.001, 0.5, 0.999], genpareto.cdf(vals, c))
r = genpareto.rvs(c, size=1000)
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
Example 16

quantileroot.mainloop()

#Probability Plot
line2 = [.01 * x for x in range(100)]
plt.title('Probability Plot')
plt.ylabel('Model')
plt.xlabel('Empirical')
plt.scatter(listofpercents, [
    genparetodist(x, Threshold, Maxsigma, Maxxis)
    for x in xvalues['x' + str(threshposition)]
],
            s=.5)
plt.scatter(listofpercents, [
    genpareto.cdf(y, Maxxis, loc=0, scale=Maxsigma)
    for y in yvalues['y' + str(threshposition)]
],
            s=.5,
            label='Simulation Data')
plt.axis([0, 1, 0, 1])
plt.plot(line2, line2, 'b', label='Best Model Fit')
plt.legend(bbox_to_anchor=(0., 1.1, 1., .102),
           loc=3,
           ncol=1,
           mode="expand",
           borderaxespad=0.)
plt.show()
print('The dotted line (real data) should agree reasonably well with the model to support a linear fit.')
print()
Example 17
    gamma_k = lamda * mean_X - gamma_alpha / gamma_beta
    sort_comp_poisson_rnd = compound_poisson_distribution(
        lamda, num_values, mu, sigma)
    sort_comp_poisson_rnd.sort()
    alpha_low = .99
    alpha_high = .99999
    low_val = sort_comp_poisson_rnd[math.floor(num_values * alpha_low)]
    high_val = sort_comp_poisson_rnd[math.floor(num_values * alpha_high)]
    mu_gp = low_val
    sample_data = compound_poisson_distribution(lamda, num_values, mu, sigma)
    data_gp = sample_data[sample_data > mu_gp] - mu_gp
    gpd_value = gpd.fit(data_gp)

    cdf_values = np.arange(low_val, high_val + (high_val - low_val) / 1000,
                           (high_val - low_val) / 1000)

    norm_cdf_tail = 1 - norm.cdf(cdf_values, mean_SN, (var_SN)**(1 / 2))
    gamma_cdf_tail = 1 - gamma.cdf(
        cdf_values - gamma_k, gamma_alpha, scale=1 / gamma_beta)
    # unconditional tail: the GPD models the excess above the 99th percentile,
    # so 1 - (0.99 + 0.01 * F_GPD) = 0.01 * (1 - F_GPD)
    GP_cdf_tail = 1 - (genpareto.cdf(
        cdf_values - low_val, gpd_value[0], scale=gpd_value[2]) * 0.01 + 0.99)
    emp_cdf_tails = emp_cdf_tail(sample_data, cdf_values)

    plt.loglog(cdf_values, norm_cdf_tail, label='CLT')
    plt.loglog(cdf_values, gamma_cdf_tail, label='GAMMA')
    plt.loglog(cdf_values, GP_cdf_tail, label='GP')
    plt.loglog(cdf_values, emp_cdf_tails, label='EMP')
    plt.title('LOG-LOG plot of 1-F_SN vs x')
    plt.legend()
    plt.show()