from scipy.stats import norm, genpareto


def HybridNormalGPDPDF(xs, u, mu, sigma, shape, loc, scale):
    '''
    Params:
        xs: unsorted list of data to fit the semi-parametric PDF to.
        u: threshold at which to switch from the Gaussian PDF fit in the
            center to GPD tail fitting.
        mu: mean of the data.
        sigma: standard deviation of the data.
        shape: GPD least-squares estimated shape parameter.
        loc: GPD least-squares estimated location parameter.
        scale: GPD least-squares estimated scale parameter.
    Returns:
        an array that would result from xs.apply(semiparametric_fittedfunction),
        i.e. f_n(xs) where f_n is the fitted PDF.
    '''
    out = list()
    # Lower and upper switch points, placed symmetrically about the mean.
    l = mu - abs(u - mu)
    h = mu + abs(u - mu)
    for x in xs:
        if x < l:
            # Lower tail: GPD density of the excess below l, weighted by the
            # Gaussian mass below l.
            out.append(norm.cdf(l, mu, sigma) *
                       genpareto.pdf(l - x, shape, loc=loc, scale=scale))
        elif x >= h:
            # Upper tail: GPD density of the excess above h, weighted by the
            # Gaussian mass above h.
            out.append((1 - norm.cdf(h, mu, sigma)) *
                       genpareto.pdf(x - h, shape, loc=loc, scale=scale))
        else:
            # Center: plain Gaussian density.
            out.append(norm.pdf(x, mu, sigma))
    return out
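A minimal usage sketch for HybridNormalGPDPDF; the threshold choice and the GPD tail parameters below are illustrative placeholders, not fitted values:

import numpy as np
from scipy.stats import norm, genpareto

rng = np.random.default_rng(0)
data = rng.normal(loc=0.0, scale=1.0, size=1000)
mu, sigma = data.mean(), data.std()
# Hypothetical tail parameters; in practice these come from a
# least-squares GPD fit to the excesses.
densities = HybridNormalGPDPDF(data, u=mu + 2 * sigma, mu=mu, sigma=sigma,
                               shape=0.1, loc=0.0, scale=0.5)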
def _margin_tail_pdf(self, x, i):
    # Density of the GP tail approximation for component i (no need to
    # weight it by p; that is done elsewhere).
    if self.shapes[i] != 0:
        return gp.pdf(x, c=self.shapes[i], loc=self.u[i], scale=self.scales[i])
    else:
        # In the shape -> 0 limit the GPD reduces to an exponential.
        return expdist.pdf(x, loc=self.u[i], scale=self.scales[i])
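The shape == 0 branch reflects the standard limit in which the GPD reduces to an exponential distribution. A small self-contained check of that limit, using only scipy.stats:

import numpy as np
from scipy.stats import genpareto, expon

x = np.linspace(0.0, 5.0, 50)
# For a tiny shape parameter, the GPD density is numerically close to the
# exponential density with the same location and scale.
assert np.allclose(genpareto.pdf(x, c=1e-8, loc=0.0, scale=2.0),
                   expon.pdf(x, loc=0.0, scale=2.0), atol=1e-6)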
def HybridSemiParametricGPDPDF(xs, u, ydata, shape, loc, scale):
    '''
    Params:
        xs: unsorted list of data to fit the semi-parametric PDF to.
        u: threshold at which to switch from Gaussian kernel estimation in the
            center to GPD tail fitting.
        ydata: data used to build the kernel PDF and CDF smoothers.
        shape: GPD least-squares estimated shape parameter.
        loc: GPD least-squares estimated location parameter.
        scale: GPD least-squares estimated scale parameter.
    Returns:
        a tuple (xs, out, srtdxs, pdf_smoother, bandwidth), where out is the
        array of fitted PDF values at xs.
    '''
    out = list()
    mu = np.mean(ydata)
    # Lower and upper switch points, placed symmetrically about the mean.
    l = mu - abs(u - mu)
    h = mu + abs(u - mu)
    bandwidth = 0.2
    srtdxs = sorted(list(xs) + [l, h])
    # Kernel CDF/PDF smoothers (external statsmodels-based helpers).
    cdf_smoother, bandwidth = kde_statsmodels_m_cdf_output(ydata, srtdxs,
                                                           bandwidth=bandwidth)
    d_cdf = dict(zip(srtdxs, cdf_smoother))
    pdf_smoother, bandwidth = kde_statsmodels_m_pdf_output(ydata, srtdxs,
                                                           bandwidth=bandwidth)
    d_pdf = dict(zip(srtdxs, pdf_smoother))
    for x in xs:
        if x < l:
            # Lower tail: GPD density of the excess below l, weighted by the
            # kernel CDF mass below l.
            out.append(d_cdf[l] *
                       genpareto.pdf(l - x, shape, loc=loc, scale=scale))
        elif x >= h:
            # Upper tail: GPD density of the excess above h, weighted by the
            # kernel CDF mass above h.
            out.append((1 - d_cdf[h]) *
                       genpareto.pdf(x - h, shape, loc=loc, scale=scale))
        else:
            # Center: kernel density estimate.
            out.append(d_pdf[x])
    return xs, out, srtdxs, pdf_smoother, bandwidth
def plot_extreme_value_distribution(extremes, fitted_params=None, bins=50, ax=None):
    """
    Plots the fitted extreme value distribution probability density function
    overlaid on a histogram of the empirical extreme values.

    Parameters:
        extremes : array_like
            Extreme values.
        fitted_params : tuple of floats
            Parameters of the fitted distribution.
        bins : int
            Number of bins to use in the histogram.
        ax : matplotlib.axes.Axes
            Plotting axis.

    Returns:
        ax : matplotlib.axes.Axes
            Plotting axis.
    """
    if ax is None:
        fig, ax = plt.subplots()
    if fitted_params is None:
        fitted_params = fit_distribution(extremes, fix_loc=True)

    # Plot histogram of extreme values.
    ax.hist(extremes, bins=bins, density=True, label='Extreme values')

    # Plot the MLE pdf between the 1st and 99th percentiles.
    quantiles = np.linspace(genpareto.ppf(0.01, *fitted_params),
                            genpareto.ppf(0.99, *fitted_params), 99)
    fitted_pdf = genpareto.pdf(quantiles, *fitted_params)
    ax.plot(quantiles, fitted_pdf, label='Fitted GPD pdf')
    ax.set(title='Best fit generalized Pareto distribution to extremes',
           xlabel='Extreme values [m]',
           ylabel='Density')
    ax.legend(loc='upper right')
    return ax
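A usage sketch for plot_extreme_value_distribution with simulated exceedances; fitted_params is passed explicitly so the external fit_distribution helper is not needed, and the parameter tuple is simply the one used to simulate:

import numpy as np
from scipy.stats import genpareto
import matplotlib.pyplot as plt

params = (0.2, 0.0, 1.0)  # (shape, loc, scale) used to simulate
extremes = genpareto.rvs(*params, size=500, random_state=0)
ax = plot_extreme_value_distribution(extremes, fitted_params=params)
plt.show()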
def returnPeriodUncertainty(data, mu, xi, sigma, intervals):
    """
    Calculate uncertainty around a fit, holding the threshold fixed.

    :param data: :class:`numpy.ndarray` containing the observed values
        (with missing values removed).
    :param float mu: Threshold parameter (also called location).
    :param float xi: Shape parameter.
    :param float sigma: Scale parameter.
    :param intervals: :class:`numpy.ndarray` or float of return period
        intervals to evaluate return level uncertainties for.

    :returns: Array of standard deviation values for each return period,
        based on all permutations of shape and scale parameters with
        standard errors.
    :rtype: :class:`numpy.ndarray`
    """
    sortedmax = np.sort(data[data > mu])
    nobs = len(sortedmax)
    rate = float(nobs) / float(len(data))
    emppdf = empiricalPDF(data[data > mu])

    # Perform the curve fitting, holding ``mu`` fixed and allowing
    # ``xi`` and ``sigma`` to vary.
    try:
        popt, pcov = curve_fit(
            lambda x, xi, sigma: genpareto.pdf(x, xi, loc=mu, scale=sigma),
            sortedmax, emppdf, (xi, sigma))
    except RuntimeError as e:
        LOG.exception("Curve fitting failed: %s", e)
        return np.zeros(len(intervals))

    sd = np.sqrt(np.diag(pcov))
    svals = (sigma - sd[1], sigma, sigma + sd[1])
    mvals = (mu, mu, mu)
    xvals = (xi - sd[0], xi, xi + sd[0])
    rpvalues = np.array([returnLevels(intervals, m, xii, s, rate)
                         for (s, m, xii) in product(svals, mvals, xvals)])
    rpFitError = np.std(rpvalues, axis=0)
    return rpFitError
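The permutation step at the end can be illustrated in isolation. The sketch below replaces the external returnLevels helper with the standard GPD return-level formula q(T) = mu + (sigma/xi) * ((rate*T)**xi - 1); all parameter values and standard errors are hypothetical:

import numpy as np
from itertools import product

mu, xi, sigma, rate = 10.0, 0.1, 2.0, 0.02
sd_xi, sd_sigma = 0.05, 0.3            # assumed standard errors
intervals = np.array([100., 500., 1000.])

svals = (sigma - sd_sigma, sigma, sigma + sd_sigma)
xvals = (xi - sd_xi, xi, xi + sd_xi)
# Return levels for every permutation of perturbed shape and scale.
rpvalues = np.array([mu + (s / x) * ((rate * intervals)**x - 1)
                     for s, x in product(svals, xvals)])
print(np.std(rpvalues, axis=0))        # per-interval spread across permutations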
def chi2_test(hist, n_bins, c, loc, scale, norm):
    """
    Simple Chi^2 test for the goodness of the fit.

    ``hist`` is the (values, bin_edges) pair returned by a density
    histogram; ``norm`` rescales the statistic to the sample size.
    """
    chi2 = n_empty_bins = 0
    for i in range(len(hist[0])):
        if hist[0][i] == 0:
            # Ignore this empty bin.
            n_empty_bins += 1
            continue
        # Evaluate the fitted density at the center of bin i.
        x = (hist[1][i] + hist[1][i + 1]) / 2
        fit_val = gpareto.pdf(x, c=c, loc=loc, scale=scale)
        chi = (fit_val - hist[0][i]) / np.sqrt(hist[0][i])
        chi2 += chi**2
    return norm * chi2 / (n_bins - n_empty_bins)
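A self-contained sketch of driving chi2_test from np.histogram output; it assumes gpareto is an alias for scipy.stats.genpareto and uses scipy's MLE fitter rather than the histogram-based fit used elsewhere in this section:

import numpy as np
from scipy.stats import genpareto as gpareto

sample = gpareto.rvs(0.1, loc=0.0, scale=1.0, size=2000, random_state=1)
n_bins = 40
hist = np.histogram(sample, bins=n_bins, density=True)
# MLE fit with the location fixed at zero.
c_hat, loc_hat, scale_hat = gpareto.fit(sample, floc=0.0)
print(chi2_test(hist, n_bins, c_hat, loc_hat, scale_hat, norm=len(sample)))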
def gpdpdf(sample, threshold, fit_method, bin_method, alpha):
    # PDF plot with histogram to diagnose the fitted model.
    [shape, scale, sample, sample_excess, sample_over_thresh] = gpdfit(
        sample, threshold, fit_method)  # fit the data
    # Define a range of points over which to draw the pdf.
    x_points = np.arange(0, max(sample), 0.001)
    pdf = genpareto.pdf(x_points, shape, loc=0, scale=scale)  # pdf values

    # Plot the PDF.
    plt.figure(4)
    plt.xlabel('Data')
    plt.ylabel('PDF')
    plt.title('Data Probability Density Function')
    plt.plot(x_points, pdf, color='black', label='Theoretical PDF')
    plt.hist(sample_excess, bins=bin_method, density=True)  # draw the histogram
    plt.legend()
    plt.show()
def fittedPDF(data, mu, sigma, xi):
    """
    Calculate probability density function values given data and GPD
    fit parameters.

    :param data: :class:`numpy.ndarray` of data values.
    :param float mu: Location parameter of the fitted GPD.
    :param float sigma: Shape parameter of the fitted GPD.
    :param float xi: Scale parameter of the fitted GPD.

    .. note:: The naming is unconventional: ``sigma`` is passed to
        :func:`genpareto.pdf` as the shape and ``xi`` as the scale.

    :returns: :class:`numpy.ndarray` of PDF values at the data points.
    """
    LOG.debug("Calculating fitted GPD PDF")
    res = genpareto.pdf(np.sort(data[data > mu]), sigma, loc=mu, scale=xi)
    return res
import numpy as np
from scipy.stats import genpareto


def genpareto_gradient_cdf(x, c, scale):
    """Gradient of the Generalized Pareto distribution function w.r.t.
    the shape and scale parameters.

    :param x: array_like, quantiles
    :param c: positive number, shape parameter
    :param scale: positive number, scale parameter
    :return: (2 x n)-matrix where n is the size of x.
        The first row is the gradient of the cdf w.r.t. the shape
        parameter evaluated at x; the second row is the gradient of the
        cdf w.r.t. the scale parameter evaluated at x.
    """
    output = np.zeros(shape=(2, x.size))
    cond = 0 < (1 + c * x / scale)
    # Note: ``scale`` must be passed by keyword; passed positionally it
    # would be interpreted as ``loc`` by scipy.
    output[0] = np.where(
        cond,
        (-1 / c**2 * np.log(1 + c * x / scale) + x / (c * (scale + c * x)))
        * (1 - genpareto.cdf(x, c, scale=scale)),
        0)
    output[1] = -x / scale * genpareto.pdf(x, c, scale=scale)
    return output
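A quick finite-difference check of the analytic gradient against scipy's cdf (assuming the corrected keyword usage above):

import numpy as np
from scipy.stats import genpareto

x = np.linspace(0.1, 3.0, 5)
c, scale, eps = 0.2, 1.5, 1e-6
grad = genpareto_gradient_cdf(x, c, scale)
# Central differences w.r.t. the shape and scale parameters.
num_dc = (genpareto.cdf(x, c + eps, scale=scale)
          - genpareto.cdf(x, c - eps, scale=scale)) / (2 * eps)
num_dscale = (genpareto.cdf(x, c, scale=scale + eps)
              - genpareto.cdf(x, c, scale=scale - eps)) / (2 * eps)
assert np.allclose(grad[0], num_dc, atol=1e-5)
assert np.allclose(grad[1], num_dscale, atol=1e-5)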
# Data below the threshold.
l_1 = where(epsi < epsi_bar)[1]
l_2 = where(p_quant <= p_bar)[0]
epsi_ex = epsi_bar - epsi[0, l_1]  # dataset of the conditional excess distribution

# MLFP quantile and Generalized Pareto distribution.
q_MLFP = zeros((k_, len(l_2)))
f_MLFP = zeros((k_, len(l_1)))
for k in range(k_):
    # Maximum likelihood optimization with the Generalized Pareto distribution.
    csi_MLFP, sigma_MLFP = FitGenParetoMLFP(epsi_ex, p[k, l_1])
    # Evaluate the fitted GPD density (csi is the shape, sigma the scale; the
    # original passed csi as the scale with c=0, which mislabelled the
    # parameters).
    f_MLFP[k, :] = genpareto.pdf(sort(epsi_ex), c=csi_MLFP, loc=0,
                                 scale=sigma_MLFP)
    # MLFP quantile.
    q_MLFP[k, :] = QuantileGenParetoMLFP(epsi_bar, p_bar, csi_MLFP, sigma_MLFP,
                                         p_quant[l_2])[0]

# Historical quantile below the threshold.
q_bt = q_hist[0, l_2]

# Histogram of the pdf of the conditional excess distribution.
t_ex_ = len(epsi_ex)
options = namedtuple('options', 'n_bins')
options.n_bins = round(12 * log(t_ex_))
hgram_ex, x_bin = HistogramFP(epsi_ex.reshape(1, -1), ones((1, t_ex_)) / t_ex_,
                              options)
import numpy as np
from scipy.stats import genpareto
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

c = 0.1
mean, var, skew, kurt = genpareto.stats(c, moments='mvsk')

# Plot the pdf between the 1st and 99th percentiles.
x = np.linspace(genpareto.ppf(0.01, c), genpareto.ppf(0.99, c), 100)
ax.plot(x, genpareto.pdf(x, c), 'r-', lw=5, alpha=0.6, label='genpareto pdf')

# A "frozen" distribution fixes the shape parameter.
rv = genpareto(c)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check the accuracy of cdf and ppf.
vals = genpareto.ppf([0.001, 0.5, 0.999], c)
np.allclose([0.001, 0.5, 0.999], genpareto.cdf(vals, c))

# Generate random numbers and compare the histogram
# (``normed`` was removed from matplotlib; use ``density``).
r = genpareto.rvs(c, size=1000)
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
def extremal_distribution_fit(data, var_name, sample, threshold, fit_type,
                              x_min, x_max, n_points, loc=None, scale=None,
                              cumulative=True):
    # Initialization of the output variables.
    param = None
    x = None
    y = None
    y_rp = None

    if fit_type == 'gpd':
        # Fit the exceedances over the threshold to a Generalized Pareto
        # distribution.
        param = generalized_pareto_distribution_fit(sample, threshold, loc, scale)

        # Calculate the pdf and/or cdf.
        x = np.linspace(x_min, x_max, n_points)
        if cumulative:
            y = genpareto.cdf(x, param[0], param[1], param[2])
            # Calculate the number of extreme peaks per year.
            n_peaks_year = len(sample) / len(data[var_name].index.year.unique())
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genpareto.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'coles':
        # Fit the exceedances over the threshold to a Generalized Pareto
        # distribution.
        param = generalized_pareto_distribution_fit(sample, threshold, loc, scale)

        x = np.arange(1, 501)
        u = param[1]
        sigma = param[2]
        xi = param[0]

        # Mean number of data points in a year.
        n_y = len(data[var_name]) / len(data[var_name].index.year.unique())
        # Total number of POT / total number of data points.
        z_u = len(sample) / len(data[var_name])
        # n_y * z_u is the number of POT per year.
        y_rp = u + (sigma / xi) * (((x * n_y * z_u)**xi) - 1)

    elif fit_type == 'gev':
        param = generalized_extreme_value_distribution_fit(sample, loc, scale)

        # Calculate the pdf and/or cdf.
        x = np.linspace(x_min, x_max, n_points)
        if cumulative:
            y = genextreme.cdf(x, param[0], param[1], param[2])
            # Calculate the number of extreme peaks per year.
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            # The original called genpareto.pdf here; genextreme matches the
            # cdf branch above.
            y = genextreme.pdf(x, param[0], param[1], param[2])

    elif fit_type == 'poisson':
        # Calculate the pdf and/or cdf.
        x = np.linspace(x_min, x_max, n_points)

        # Fit the exceedances over the threshold to a Generalized Pareto
        # distribution.
        gpd_param = generalized_pareto_distribution_fit(sample, threshold,
                                                        loc, scale)

        # Poisson parameter (number of extreme events per year).
        poisspareto_param = len(sample) / len(data[var_name].index.year.unique())
        # Poisson-Pareto parameters.
        poisspareto_param = [poisspareto_param, gpd_param[0], gpd_param[2],
                             gpd_param[1]]
        # Equivalent GEV parameters.
        param = [0, 0, 0]
        param[0] = -poisspareto_param[1]
        param[1] = poisspareto_param[2] * (poisspareto_param[0]**poisspareto_param[1])
        param[2] = poisspareto_param[3] + ((poisspareto_param[2] / poisspareto_param[1])
                                           * ((poisspareto_param[0]**poisspareto_param[1]) - 1))

        if cumulative:
            y = genextreme.cdf(x, param[0], param[2], param[1])
            # Calculate the number of extreme peaks per year.
            n_peaks_year = 1
            y_rp = return_period_curve(n_peaks_year, y)
        else:
            y = genextreme.pdf(x, param[0], param[2], param[1])

    return param, x, y, y_rp
fig, ax = plt.subplots(nrows=2, ncols=2)
fig.tight_layout()
ax[-1, -1].axis('off')

hist_ot = ax[0][0].hist(x=lat, bins=bins_ot, histtype='stepfilled', alpha=0.3)
ax[0][0].set_xlabel('latency [\u03BCs]', fontsize=8)
ax[0][0].set_yscale('log')

hist_ot_norm = ax[1][0].hist(x=lat, bins=bins_ot, density=True,
                             histtype='stepfilled', alpha=0.3)

# Fit using the fitter of the genpareto class (shown in red).
ret = gpareto.fit(lat, loc=threshold)
ax[1][0].plot(x, gpareto.pdf(x, c=ret[0], loc=ret[1], scale=ret[2]),
              'r-', lw=1, alpha=0.8)
ax[1][0].set_xlabel('latency [\u03BCs]', fontsize=8)
print(ret)
print('\ngoodness-of-fit: ' + '{:03.3f}'.format(
    chi2_test(hist_ot_norm, n_bins=n_bins, c=ret[0], loc=ret[1],
              scale=ret[2], norm=len(lat))))

print("\n curve_fit:")
# Fit using the curve_fit fitter, fixing the value of the "loc" parameter.
# (The call was truncated in the source; the closing arguments are restored
# minimally here.)
popt, pcov = cfit(lambda x, c, scale: gpareto.pdf(x, c=c, loc=threshold,
                                                  scale=scale),
                  x, hist_ot_norm[0])
# (Tail of a plt.legend(...) call truncated in the source.)
borderaxespad=0.)
plt.show()

print('The dotted line (real data) should reasonably agree with the model '
      'to justify a linear fit.')
print('\n')

# Density/histogram plot.
plt.title('Density Plot')
plt.ylabel('Density')
plt.xlabel('Threshold Exceedances')
plt.hist(numpy.array(xvalues['x' + str(threshposition)]), density=True,
         bins=25, facecolor='y', label='Simulation Data')
plt.plot(xvalues['x' + str(threshposition)],
         [genpareto.pdf(xx, Maxxis, loc=Threshold, scale=Maxsigma)
          for xx in xvalues['x' + str(threshposition)]],
         'r-', lw=3, label='Model Fit')
plt.legend(bbox_to_anchor=(0., 1.1, 1., .102), loc=3, ncol=1, mode="expand",
           borderaxespad=0.)
plt.show()

# Return level plot: generate the real return-period list for the raw data.
randproblist = []