def HybridNormalGPDPDF(xs, u, mu, sigma, shape, loc, scale):
    '''
    Params: 
        xs: unsorted list of data to fit the semi-parametric PDF to.
        u: threshold at which to switch from the Gaussian PDF fit in the center to GPD tail fitting.
        mu: mean of the data.
        sigma: standard deviation of the data.
        shape: GPD least-squares estimated shape parameter.
        loc: GPD least-squares estimated location parameter.
        scale: GPD least-squares estimated scale parameter.
    Returns:
        an array that would result from xs.apply(semiparametric_fittedfunction), i.e. F_n(xs) where F_n is the fitted PDF.
    '''
    out = list()
    l = (mu - abs(u - mu))
    h = (mu + abs(u - mu))
    #print('u = %.10f,l = %.10f,h = %.10f'%(u,l,h))
    for x in xs:
        if x < l:
            out.append(
                norm.cdf(l, mu, sigma) *
                genpareto.pdf(l - x, shape, loc=loc, scale=scale))
        elif x >= h:
            out.append((1 - norm.cdf(h, mu, sigma)) *
                       genpareto.pdf(x - h, shape, loc=loc, scale=scale))
        else:
            out.append(norm.pdf(x, mu, sigma))
    return out
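
A minimal usage sketch for HybridNormalGPDPDF, assuming scipy.stats is available; the sample, threshold and GPD parameters below are illustrative placeholders rather than values from a real fit:

# Illustrative call; in practice shape/loc/scale would come from a least-squares GPD
# fit of the tail exceedances, not the hard-coded values used here.
import numpy as np
from scipy.stats import norm, genpareto

data = np.random.normal(0.0, 1.0, size=5000)    # hypothetical sample
mu, sigma = data.mean(), data.std()
u = np.quantile(data, 0.95)                     # hypothetical tail threshold
shape, loc, scale = 0.1, 0.0, sigma             # placeholder GPD parameters
densities = HybridNormalGPDPDF(data, u, mu, sigma, shape, loc, scale)
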
def _margin_tail_pdf(self, x, i):
    # density of GP approximation (no need to weight it by p, that's done elsewhere)
    # i = component index
    if self.shapes[i] != 0:
        return gp.pdf(x,
                      c=self.shapes[i],
                      loc=self.u[i],
                      scale=self.scales[i])
    else:
        return expdist.pdf(x, loc=self.u[i], scale=self.scales[i])
def HybridSemiParametricGPDPDF(xs, u, ydata, shape, loc, scale):
    '''
    Params: 
        xs: unsorted list of data to evaluate the semi-parametric PDF at.
        u: threshold at which to switch from Gaussian kernel density estimation to GPD tail fitting.
        ydata: data used to build the kernel density estimate; its mean defines the center of the distribution.
        shape: GPD least-squares estimated shape parameter.
        loc: GPD least-squares estimated location parameter.
        scale: GPD least-squares estimated scale parameter.
    Returns:
        xs, the fitted density values F_n(xs), the sorted evaluation grid, the smoothed PDF on that grid, and the KDE bandwidth.
    '''
    out = list()
    mu = mean(ydata)
    l = (mu - abs(u - mu))
    h = (mu + abs(u - mu))
    #print('u = %.10f,l = %.10f,h = %.10f'%(u,l,h))
    bandwidth = 0.2
    srtdxs = sorted(list(xs) + [l, h])
    cdf_smoother, bandwidth = kde_statsmodels_m_cdf_output(ydata,
                                                           srtdxs,
                                                           bandwidth=bandwidth)
    d_cdf = dict(zip(srtdxs, cdf_smoother))
    pdf_smoother, bandwidth = kde_statsmodels_m_pdf_output(ydata,
                                                           srtdxs,
                                                           bandwidth=bandwidth)
    d_pdf = dict(zip(srtdxs, pdf_smoother))
    for x in xs:
        if x < l:
            out.append(d_cdf[l] *
                       genpareto.pdf(l - x, shape, loc=loc, scale=scale))
        elif x >= h:
            out.append((1 - d_cdf[h]) *
                       genpareto.pdf(x - h, shape, loc=loc, scale=scale))
        else:
            out.append(d_pdf[x])
    return xs, out, srtdxs, pdf_smoother, bandwidth
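
HybridSemiParametricGPDPDF depends on two kernel-smoothing helpers, kde_statsmodels_m_pdf_output and kde_statsmodels_m_cdf_output, that are not shown above. A plausible sketch using statsmodels' KDEMultivariate, under the assumption that each helper returns the smoothed values on the grid together with the bandwidth actually used (the real implementations may differ):

# Hypothetical sketch of the KDE helpers called above; written against
# statsmodels.nonparametric.KDEMultivariate, which may not match the originals.
import statsmodels.api as sm

def kde_statsmodels_m_pdf_output(ydata, grid, bandwidth=0.2):
    # Gaussian kernel density estimate of the PDF, evaluated on `grid`.
    kde = sm.nonparametric.KDEMultivariate(data=[ydata], var_type='c', bw=[bandwidth])
    return kde.pdf(grid), kde.bw[0]

def kde_statsmodels_m_cdf_output(ydata, grid, bandwidth=0.2):
    # Kernel-smoothed CDF on the same grid, used to weight the GPD tails.
    kde = sm.nonparametric.KDEMultivariate(data=[ydata], var_type='c', bw=[bandwidth])
    return kde.cdf(grid), kde.bw[0]
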
Example #4
def plot_extreme_value_distribution(extremes, fitted_params=None, bins=50,
        ax=None):
    """
    Plots the fitted extreme value distribution probability density
    function overlaid on a histogram of the empirical extreme values.

    Parameters:
        extremes : array_like
            Extreme values.
        fitted_params : tuple of floats
            Parameters of the fitted distribution; if None, the distribution
            is fitted to ``extremes`` with the location held fixed.
        bins : int
            Number of bins to use in the histogram.
        ax : matplotlib.axes.Axes
            Plotting axis.
    Returns: 
        ax : matplotlib.axes.Axes
            Plotting axis.
    """

    if ax is None:
        fig, ax = plt.subplots()

    if fitted_params is None:
        fitted_params = fit_distribution(extremes, fix_loc=True)
    
    # Plot histogram of extreme values
    ax.hist(extremes, bins=bins, density=True, label='Extreme values')

    # Plot MLE pdf
    quantiles = np.linspace(
        genpareto.ppf(0.01, *fitted_params),
        genpareto.ppf(0.99, *fitted_params),
        99
    )
    fitted_pdf = genpareto.pdf(quantiles, *fitted_params)
    ax.plot(quantiles, fitted_pdf, label='Fitted GPD pdf')

    ax.set(
        title='Best fit generalized Pareto distribution to extremes',
        xlabel='Extreme values [m]',
        ylabel='Density'
    )
    ax.legend(loc='upper right')

    return ax
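
plot_extreme_value_distribution falls back to a fit_distribution helper when no parameters are passed in; that helper is not shown. One plausible implementation with scipy's MLE fitter, assuming fix_loc means pinning the GPD location at zero (the helper name and keyword come from the call above, the body is a guess):

# Hypothetical fit_distribution helper; the original implementation is not shown here.
from scipy.stats import genpareto

def fit_distribution(extremes, fix_loc=True):
    # Maximum-likelihood GPD fit; optionally pin loc at 0 so only shape and scale vary.
    if fix_loc:
        return genpareto.fit(extremes, floc=0)
    return genpareto.fit(extremes)
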
Example #5
def returnPeriodUncertainty(data, mu, xi, sigma, intervals):
    """
    Calculate uncertainty around a fit, holding threshold fixed

    :param data: :class:`numpy.ndarray` containing the observed values (with
                 missing values removed).
    :param float mu: Threshold parameter (also called location).
    :param float xi: Shape parameter.
    :param float sigma: Scale parameter.
    :param intervals: :class:`numpy.ndarray` or float of return period intervals
              to evaluate return level uncertainties for. 

    :returns: Array of standard deviation values for each return period, based
              on all permutations of shape and scale parameters with standard
              errors.
    :rtype: :class:`numpy.ndarray`

    """
    sortedmax = np.sort(data[data > mu])
    nobs = len(sortedmax)
    rate = float(nobs) / float(len(data))
    emppdf = empiricalPDF(data[data > mu])

    # Perform the curve fitting, holding ``mu`` fixed and allowing
    # ``xi`` and ``sigma`` to vary.
    try:
        popt, pcov = curve_fit(lambda x, xi, sigma: \
                               genpareto.pdf(x, xi, loc=mu, scale=sigma),
                               sortedmax, emppdf, (xi, sigma))
    except RuntimeError as e:
        LOG.exception("Curve fitting failed: %s", e)
        return np.zeros(len(intervals))

    sd = np.sqrt(np.diag(pcov))

    svals = (sigma - sd[1], sigma, sigma + sd[1])
    mvals = (mu, mu, mu)
    xvals = (xi - sd[0], xi, xi + sd[0])

    rpvalues = np.array([
        returnLevels(intervals, m, xii, s, rate)
        for (s, m, xii) in product(svals, mvals, xvals)
    ])

    rpFitError = np.std(rpvalues, axis=0)

    return rpFitError
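
returnPeriodUncertainty delegates the return-level calculation to returnLevels, which is not reproduced here. For a peaks-over-threshold GPD fit the standard return-level formula is z_T = mu + (sigma / xi) * ((lambda * T)^xi - 1), with lambda the exceedance rate per year; a sketch along those lines (the argument order is taken from the call above, the npyr factor is an assumption):

# Hypothetical returnLevels helper based on the standard GPD return-level formula;
# the original implementation may differ (e.g. in how the exceedance rate is scaled).
import numpy as np

def returnLevels(intervals, mu, xi, sigma, rate, npyr=365.25):
    # Return levels for return periods `intervals` (years), given threshold mu,
    # shape xi, scale sigma and per-observation exceedance rate `rate`.
    intervals = np.asarray(intervals, dtype=float)
    return mu + (sigma / xi) * (np.power(intervals * npyr * rate, xi) - 1)
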
Example #6
def chi2_test(hist, n_bins, c, loc, scale, norm):
    """ Simple Chi^2 test for the goodness of the fit.
    """
    chi2 = n_empty_bins = 0
    for i in range(len(hist[0])):
        if hist[0][i] == 0:
            # Ignore this empty bin.
            n_empty_bins += 1
            continue

        # Get the center of bin i.
        x = (hist[1][i] + hist[1][i + 1]) / 2
        fit_val = gpareto.pdf(x, c=c, loc=loc, scale=scale)
        chi = (fit_val - hist[0][i]) / np.sqrt(hist[0][i])
        chi2 += chi**2

    return norm * chi2 / (n_bins - n_empty_bins)
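
A minimal, illustrative call of chi2_test, assuming a density-normalized numpy histogram of exceedances and GPD parameters from a prior fit (all data and parameter values below are made up):

# Illustrative usage of chi2_test; gpareto matches the alias used inside the function.
import numpy as np
from scipy.stats import genpareto as gpareto

exceedances = gpareto.rvs(0.1, loc=0.0, scale=1.0, size=2000)   # hypothetical sample
n_bins = 40
hist = np.histogram(exceedances, bins=n_bins, density=True)
c, loc, scale = gpareto.fit(exceedances, floc=0)
score = chi2_test(hist, n_bins=n_bins, c=c, loc=loc, scale=scale, norm=len(exceedances))
print(score)
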
def gpdpdf(sample, threshold, fit_method, bin_method,
           alpha):  # get PDF plot with histogram to diagnose the model
    [shape, scale, sample, sample_excess,
     sample_over_thresh] = gpdfit(sample, threshold, fit_method)  #Fit the data
    x_points = np.arange(0, max(sample),
                         0.001)  #define a range of points for drawing the pdf
    pdf = genpareto.pdf(x_points, shape, loc=0,
                        scale=scale)  #get the pdf values

    #Plotting PDF
    plt.figure(4)
    plt.xlabel('Data')
    plt.ylabel('PDF')
    plt.title('Data Probability Density Function')
    plt.plot(x_points, pdf, color='black', label='Theoretical PDF')
    plt.hist(sample_excess, bins=bin_method, density=True)  #draw histograms
    plt.legend()
    plt.show()
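
gpdpdf relies on a gpdfit helper that is not included above. A plausible sketch using scipy's MLE fitter, with the return signature inferred from the unpacking in gpdpdf (the real helper presumably supports several fit_method options):

# Hypothetical gpdfit helper inferred from how gpdpdf unpacks its result;
# the original may implement least-squares or other methods selected by fit_method.
import numpy as np
from scipy.stats import genpareto

def gpdfit(sample, threshold, fit_method='mle'):
    sample = np.asarray(sample)
    sample_over_thresh = sample[sample > threshold]     # peaks over threshold
    sample_excess = sample_over_thresh - threshold      # excesses above the threshold
    # Maximum-likelihood fit of the excesses with the location fixed at 0.
    shape, _, scale = genpareto.fit(sample_excess, floc=0)
    return [shape, scale, sample, sample_excess, sample_over_thresh]
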
Example #9
def fittedPDF(data, mu, sigma, xi):
    """
    Calculate probability density function values given data and
    GPD fit parameters.

    :param data: :class:`numpy.ndarray` of data values.
    :param float mu: Location parameter of the fitted GPD.
    :param float sigma: Shape parameter of the fitted GPD.
    :param float xi: Scale parameter of the fitted GPD.

    :returns: :class:`numpy.ndarray` of PDF values at the data points.

    """

    LOG.debug("Calculating fitted GPD PDF")
    res = genpareto.pdf(np.sort(data[data > mu]), 
                        sigma, loc=mu, scale=xi)

    return res
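
Note that in this snippet sigma is passed to genpareto as the shape argument and xi as the scale, exactly as documented above. A minimal illustrative call (the data and parameters are made up, and LOG is assumed to be a module-level logger):

# Illustrative call of fittedPDF; values are invented and LOG is a plain logger.
import logging
import numpy as np
from scipy.stats import genpareto

LOG = logging.getLogger(__name__)
data = genpareto.rvs(0.1, loc=10.0, scale=2.0, size=1000)   # hypothetical exceedances
pdf_vals = fittedPDF(data, mu=10.0, sigma=0.1, xi=2.0)
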
Example #10
def genpareto_gradient_cdf(x, c, scale):
    """Gradient of the Generalized Pareto Distribution function w.r.t. to the scale and shape parameter

    :param x: array_like
        quantiles
    :param c:  positive number
        shape parameter
    :param scale:positive number
        scale parameter (default=1)
    :return: (2 X n)-matrix where n is equal to the size of x
        The first row  corresponds to the gradient of the cdf w.r.t. the shape parameter evaluated at x
        The second row corresponds to the gradient of the cdf w.r.t. the scale parameter evaluated at x
    """

    output = np.zeros(shape=(2, x.size))

    cond = 0 < (1+c*x/scale)

    # Pass ``scale`` by keyword so it is not interpreted as the ``loc`` parameter.
    output[0] = np.where(cond, (-1/c**2*log(1 + c*x/scale) + x/(c*(scale + c * x)))*(1 - genpareto.cdf(x, c, scale=scale)), 0)
    output[1] = -x/scale*genpareto.pdf(x, c, scale=scale)

    return output
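
A quick sanity check of the analytic gradient is to compare it against central finite differences of genpareto.cdf (purely illustrative values; the numpy log import also covers the bare log call inside the function):

# Finite-difference check of genpareto_gradient_cdf; not part of the original snippet.
import numpy as np
from numpy import log
from scipy.stats import genpareto

x = np.linspace(0.1, 5.0, 50)
c, scale, eps = 0.2, 1.5, 1e-6

grad = genpareto_gradient_cdf(x, c, scale)
fd_shape = (genpareto.cdf(x, c + eps, scale=scale) - genpareto.cdf(x, c - eps, scale=scale)) / (2 * eps)
fd_scale = (genpareto.cdf(x, c, scale=scale + eps) - genpareto.cdf(x, c, scale=scale - eps)) / (2 * eps)
print(np.allclose(grad[0], fd_shape, atol=1e-5), np.allclose(grad[1], fd_scale, atol=1e-5))
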
# data below the threshold
l_1 = where(epsi < epsi_bar)[1]
l_2 = where(p_quant <= p_bar)[0]
epsi_ex = epsi_bar - epsi[
    0, l_1]  # dataset of the conditional excess distribution

# MLFP quantile and Generalized Pareto Distribution
q_MLFP = zeros((k_, len(l_2)))
f_MLFP = zeros((k_, len(l_1)))
for k in range(k_):
    csi_MLFP, sigma_MLFP = FitGenParetoMLFP(
        epsi_ex, p[k, l_1]
    )  # Maximum Likelihood optimization with Generalized Pareto Distribution
    f_MLFP[k, :] = genpareto.pdf(sort(epsi_ex),
                                 c=0,
                                 scale=csi_MLFP,
                                 loc=sigma_MLFP - 1)

    q_MLFP[k, :] = QuantileGenParetoMLFP(epsi_bar, p_bar, csi_MLFP, sigma_MLFP,
                                         p_quant[l_2])[0]  # MLFP-quantile

# historical quantile below the threshold
q_bt = q_hist[0, l_2]
# histogram of the pdf of the Conditional Excess Distribution
t_ex_ = len(epsi_ex)
options = namedtuple('options', 'n_bins')
options.n_bins = round(12 * log(t_ex_))
hgram_ex, x_bin = HistogramFP(epsi_ex.reshape(1, -1),
                              ones((1, t_ex_)) / t_ex_, options)
Example #12
import numpy as np
from scipy.stats import genpareto
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

c = 0.1
mean, var, skew, kurt = genpareto.stats(c, moments='mvsk')
x = np.linspace(genpareto.ppf(0.01, c), genpareto.ppf(0.99, c), 100)
ax.plot(x, genpareto.pdf(x, c), 'r-', lw=5, alpha=0.6, label='genpareto pdf')
rv = genpareto(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')
vals = genpareto.ppf([0.001, 0.5, 0.999], c)
np.allclose([0.001, 0.5, 0.999], genpareto.cdf(vals, c))
r = genpareto.rvs(c, size=1000)
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
Example #13
def extremal_distribution_fit(data,
                              var_name,
                              sample,
                              threshold,
                              fit_type,
                              x_min,
                              x_max,
                              n_points,
                              loc=None,
                              scale=None,
                              cumulative=True):
    # Initialization of the output variables
    param = None
    x = None
    y = None
    y_rp = None

    if fit_type == 'gpd':
        # Fit the exceedances over threshold to Generalized Pareto distribution
        param = generalized_pareto_distribution_fit(sample, threshold, loc,
                                                    scale)

        # Calculate the pdf and/or cdf
        x = np.linspace(x_min, x_max, n_points)

        if cumulative:
            y = genpareto.cdf(x, param[0], param[1], param[2])

            # Calculate the number of extreme peaks per year
            n_peaks_year = len(sample) / len(
                data[var_name].index.year.unique())
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genpareto.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'coles':
        # Fit the exceedances over threshold to Generalized Pareto distribution
        param = generalized_pareto_distribution_fit(sample, threshold, loc,
                                                    scale)

        x = np.arange(1, 501)
        u = param[1]
        sigma = param[2]
        xi = param[0]

        # Mean number of observations per year
        n_y = len(data[var_name]) / len(data[var_name].index.year.unique())
        # Fraction of observations that are peaks over threshold (POT)
        z_u = len(sample) / len(data[var_name])
        # n_y * z_u is then the number of POT per year
        y_rp = u + (sigma / xi) * (((x * n_y * z_u)**xi) - 1)

    elif fit_type == 'gev':
        param = generalized_extreme_value_distribution_fit(sample, loc, scale)

        # Calculate the pdf and/or cdf
        x = np.linspace(x_min, x_max, n_points)

        if cumulative:
            y = genextreme.cdf(x, param[0], param[1], param[2])

            # Calculate the number of extreme peaks per year
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genextreme.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'poisson':
        # Calculate the pdf and/or cdf
        x = np.linspace(x_min, x_max, n_points)

        # Fit the exceedances over threshold to Generalized Pareto distribution
        gpd_param = generalized_pareto_distribution_fit(
            sample, threshold, loc, scale)

        # Poisson parameter (number of extreme events per year)
        poisspareto_param = len(sample) / len(
            data[var_name].index.year.unique())
        # Poisson pareto parameters
        poisspareto_param = [
            poisspareto_param, gpd_param[0], gpd_param[2], gpd_param[1]
        ]
        # Equivalent gev parameters
        param = [0, 0, 0]
        param[0] = -poisspareto_param[1]
        param[1] = poisspareto_param[2] * (poisspareto_param[0]**
                                           poisspareto_param[1])
        param[2] = poisspareto_param[3] + (
            (poisspareto_param[2] / poisspareto_param[1]) *
            ((poisspareto_param[0]**poisspareto_param[1]) - 1))

        if cumulative:
            y = genextreme.cdf(x, param[0], param[2], param[1])

            # Calculate the number of extreme peaks per year
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genextreme.pdf(x, param[0], param[2], param[1])

    return param, x, y, y_rp
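
extremal_distribution_fit calls generalized_pareto_distribution_fit, which is not included above. A plausible sketch with scipy, assuming the helper returns (shape, loc, scale) with the location pinned to the threshold unless loc and scale are supplied (inferred from how param is indexed, not the original code):

# Hypothetical generalized_pareto_distribution_fit helper; inferred from how
# `param` is used above (param[0]=shape, param[1]=loc, param[2]=scale).
from scipy.stats import genpareto

def generalized_pareto_distribution_fit(sample, threshold, loc=None, scale=None):
    if loc is not None and scale is not None:
        # Fit only the shape, holding location and scale fixed at the given values.
        return genpareto.fit(sample, floc=loc, fscale=scale)
    # Fit shape and scale with the location pinned at the threshold.
    return genpareto.fit(sample, floc=threshold)
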
Example #14
fig, ax = plt.subplots(nrows=2, ncols=2)
fig.tight_layout()
ax[-1, -1].axis('off')

hist_ot = ax[0][0].hist(x=lat, bins=bins_ot, histtype='stepfilled', alpha=0.3)
ax[0][0].set_xlabel('latency [\u03BCs]', fontsize=8)
ax[0][0].set_yscale('log')
#print(hist_ot[0])

hist_ot_norm = ax[1][0].hist(x=lat, bins=bins_ot,
                             density=True, histtype='stepfilled', alpha=0.3)

# Fit using the fitter of the genpareto class (shown in red).
ret = gpareto.fit(lat, loc=threshold)
ax[1][0].plot(x, gpareto.pdf(x, c=ret[0],  loc=ret[1],  scale=ret[2]),
              'r-', lw=1, color='red',  alpha=0.8)

ax[1][0].set_xlabel('latency [\u03BCs]', fontsize=8)
print(ret)
print('\ngoodness-of-fit: ' + '{:03.3f}'.format(chi2_test(hist_ot_norm,
                                                          n_bins=n_bins,
                                                          c=ret[0],
                                                          loc=ret[1],
                                                          scale=ret[2],
                                                          norm=len(lat))))

print("\n curve_fit:")
# Fit using the curve_fit fitter. Fix the value of the "loc" parameter.
popt, pcov = cfit(lambda x, c, scale: gpareto.pdf(x, c=c, loc=threshold, scale=scale),
                  x, hist_ot_norm[0],
Example #15
           borderaxespad=0.)
plt.show()
print('The dotted line (real data) should agree reasonably well with the model for the linear fit to be valid.')
print('\n')

#Density/Histogram Plot
plt.title('Density Plot')
plt.ylabel('Density')
plt.xlabel('Threshold Exceedances')
plt.hist(numpy.array(xvalues['x' + str(threshposition)]),
         density=True,
         bins=25,
         facecolor='y',
         label='Simulation Data')
plt.plot(xvalues['x' + str(threshposition)], [
    genpareto.pdf(xx, Maxxis, loc=Threshold, scale=Maxsigma)
    for xx in xvalues['x' + str(threshposition)]
],
         'r-',
         lw=3,
         label='Model Fit')
plt.legend(bbox_to_anchor=(0., 1.1, 1., .102),
           loc=3,
           ncol=1,
           mode="expand",
           borderaxespad=0.)
plt.show()

#Return Level Plot
#Generate the real rplist for the raw data
randproblist = []