Example #1
0
    def fit(self, data):
        """
        Fit a Generalised Pareto Distribution to the exceedances in ``data``.

        The threshold ``mu`` is the ``self.threshold``-th percentile of the
        positive records. The data are padded with zeros up to
        ``self.numsim * NPYR`` entries so the exceedance rate is expressed per
        time step of the full simulated period.

        :param data: array of data values.
        :type data: :class:`numpy.ndarray`

        :returns: tuple ``(w, location, scale, shape)`` where ``w`` holds the
                  return levels for ``self.intervals``. Every element is
                  filled with ``self.nodata`` when there are fewer than
                  ``self.minrecords`` values, when the fit raises, or when
                  the fitted shape parameter is positive.
        """
        # Placeholders handed back whenever no usable fit is available. They
        # are kept under their own names so the fitted values cannot clobber
        # them.
        loc, scl, shp = [self.nodata] * 3
        nodata_levels = self.nodata * np.ones(len(self.intervals))

        if len(data) < self.minrecords:
            return nodata_levels, loc, scl, shp

        recs = data[data > 0]
        mu = scoreatpercentile(recs, self.threshold)

        # Fill each day that a cyclone isn't recorded with zero so we get
        # the correct rate for the return periods
        datafilled = np.zeros(int(self.numsim * NPYR))
        datafilled[-len(data):] = data
        log.debug("The length of the filled data is {0}".format(
            len(datafilled)))

        exceedances = datafilled[datafilled > mu]
        rate = float(len(exceedances)) / float(len(datafilled))
        log.debug("The calculated rate is: {0}".format(rate))

        try:
            shape, location, scale = genpareto.fit(exceedances, floc=mu)
        except Exception:
            # The optimiser could not produce parameters: report "no data".
            return nodata_levels, loc, scl, shp

        if shape > 0:  # or Rpeval[0] < 0.0:
            # Positive shape fits are rejected, matching the behaviour of
            # the function-style ``gpdfit`` implementation.
            return nodata_levels, loc, scl, shp

        w = gpdReturnLevel(self.intervals, mu, shape, scale, rate)
        return w, location, scale, shape
Example #2
0
def gpdfit(data,
           years,
           numsim,
           missingValue=-9999,
           minrecords=50,
           threshold=99.5):
    """
    Fit a Generalised Pareto Distribution to the data. For a quick evaluation,
    we use the 99.5th percentile as a threshold.

    :param data: array of data values.
    :type data: :class:`numpy.ndarray`
    :param years: array of years for which to calculate return period values.
    :type years: :class:`numpy.ndarray`
    :param int numsim: number of simulations created.
    :param float missingValue: value to insert if fit does not converge.
    :param int minrecords: minimum number of valid observations required to
                           perform fitting.
    :param float threshold: Threshold for performing the fitting. Default is
                            the 99.5th percentile

    :returns: tuple ``(Rpeval, location, scale, shape)``: the return period
              values for ``years`` followed by the fitted location, scale and
              shape parameters. Every element is ``missingValue`` when there
              are fewer than ``minrecords`` values, when the fit raises, or
              when the fitted shape parameter is positive.
    """
    mu = scoreatpercentile(data, threshold)

    loc, scl, shp = [missingValue, missingValue, missingValue]
    Rp = missingValue * np.ones(len(years))

    log.debug("The length of the data currently is {0}".format(len(data)))

    if len(data) < minrecords:
        return Rp, loc, scl, shp

    # Fill each day that a cyclone isn't recorded with zero so we get
    # the correct rate for the return periods
    datafilled = np.zeros(int(numsim * 365.25))
    datafilled[-len(data):] = data
    log.debug("The length of the filled data is {0}".format(len(datafilled)))

    exceedances = datafilled[datafilled > mu]
    rate = float(len(exceedances)) / float(len(datafilled))
    log.debug("The calculated rate is: {0}".format(rate))

    try:
        shape, location, scale = genpareto.fit(exceedances, floc=mu)
    except Exception:
        # Fitting failed: hand back the missing-value placeholders. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return Rp, loc, scl, shp

    Rpeval = gpdReturnLevel(years, mu, shape, scale, rate)
    if shape > 0:  # or Rpeval[0] < 0.0:
        return Rp, loc, scl, shp
    return Rpeval, location, scale, shape
def calculateShape(mu, data):
    """
    Fit a Generalised Pareto Distribution to the exceedances of ``data``
    over the threshold ``mu``.

    :param float mu: threshold parameter for the GPD distribution.
    :param data: :class:`numpy.ndarray` of data values to fit.

    :returns: the parameter tuple produced by ``genpareto.fit`` for the
              values ``data[data > mu] - mu``.
    """
    exceedances = data[data > mu] - mu
    return genpareto.fit(exceedances)
def calculateShape(mu, data):
    """
    Fit a Generalised Pareto Distribution to the values of ``data`` that
    exceed ``mu``, after subtracting ``mu`` from them.

    :param float mu: threshold parameter for the GPD distribution.
    :param data: :class:`numpy.ndarray` of data values to fit.

    :returns: the parameter tuple returned by ``genpareto.fit``.
    """
    over_threshold = data[data > mu]
    return genpareto.fit(over_threshold - mu)
Example #5
0
def GeneralizedPareto_ICDF(x, p):
    '''
    Generalized Pareto fit
    Returns inverse cumulative probability function at p points

    x: sample the distribution is fitted to.
    p: probabilities at which the fitted quantile function is evaluated.
    '''

    # fit a generalized pareto and get params; the location is free in this
    # fit, so it must be carried into the evaluation below
    shape, loc, scale = genpareto.fit(x)

    # get percent points (inverse of CDF) of the distribution that was
    # actually fitted (shape, location and scale)
    icdf = genpareto.ppf(p, shape, loc=loc, scale=scale)

    return icdf
Example #6
0
def GeneralizedPareto_CDF(x):
    '''
    Generalized Pareto fit
    Returns cumulative probability function at x.

    x: sample the distribution is fitted to and evaluated at.
    '''

    # fit a generalized pareto and get params; the location is free in this
    # fit, so it must be carried into the evaluation below
    shape, loc, scale = genpareto.fit(x)

    # get generalized pareto CDF of the distribution that was actually
    # fitted (shape, location and scale)
    cdf = genpareto.cdf(x, shape, loc=loc, scale=scale)

    return cdf
Example #7
0
    def CalculaParametros(self):
        """
        Fit a distribution to ``self.dadoSerie`` and return its parameters.

        ``self.tipoSerie == 'Parcial'`` fits a Generalised Pareto
        distribution, ``'Anual'`` fits a Generalised Extreme Value
        distribution. The fitted parameters are printed and returned as the
        tuple produced by the corresponding ``fit`` call. Any other value of
        ``self.tipoSerie`` falls through and returns ``None``.
        """
        if self.tipoSerie == 'Parcial':
            Parametro = genpareto.fit(self.dadoSerie)
            # '%f' (not '%.f') so the shape is printed with its decimals,
            # like the other two parameters.
            print('Parametros com Pareto: \nForma: %f, Localidade: %f, Escala: %f' %
                  (Parametro[0], Parametro[1], Parametro[2]))
            return Parametro
        elif self.tipoSerie == 'Anual':
            Parametro = genextreme.fit(self.dadoSerie)
            print('Parametros com Gev: \nForma: %f, Localidade: %f, Escala: %f' %
                  (Parametro[0], Parametro[1], Parametro[2]))
            return Parametro
Example #8
0
def generalized_pareto_distribution_fit(peaks_over_th,
                                        threshold,
                                        loc=None,
                                        scale=None):
    """
    Fit the exceedances over ``threshold`` to a Generalized Pareto
    distribution.

    ``loc`` and ``scale`` are forwarded to ``genpareto.fit`` only when they
    are not ``None``.

    Returns a list ``[shape, threshold, scale]``: the fitted shape and scale,
    with the location entry replaced by ``threshold``.
    """
    # BUG (noted by the original author): passing the documented default
    # values explicitly gives different results than omitting them, so a
    # keyword is only sent when the caller supplied it.
    start_values = {}
    if loc is not None:
        start_values['loc'] = loc
    if scale is not None:
        start_values['scale'] = scale

    shape, _, fitted_scale = genpareto.fit(peaks_over_th - threshold,
                                           **start_values)

    # Set the localization parameter equal to the threshold
    return [shape, threshold, fitted_scale]
Example #9
0
    def _p(test_i, null_i, M_i, d_i):
        """
        Estimate a p-value for one test statistic, using a GPD tail fit when
        one is accepted and an ECDF-based estimate otherwise.

        Reads ``n``, ``N``, ``m``, ``ecdf_pseudocount``, ``decrease_n_by``,
        ``ad_test`` and ``warn`` from the enclosing scope.

        :param test_i: observed statistic; evaluated as ``test_i - t`` under
                       the fitted tail.
        :param null_i: null-distribution values; sorted ascending here.
        :param M_i: compared against ``m`` to decide whether a tail fit is
                    attempted (presumably an exceedance count, verify
                    against caller).
        :param d_i: object with ``.sum()``; its sum enters the ECDF estimate.

        :returns: tuple ``(p_value, used_gpd, gpd_fit_p_value,
                  raw_ecdf_estimate)``. When every null value is NaN the
                  result is ``(nan, False, nan, nan)``.
        """
        gpd_fit = None
        gpd_fit_p_value = None

        # Number of largest null values used as the tail; shrunk in the loop.
        n_i = n
        
        # TODO: no need to sort as much as N numbers, do partial sort:
        #  but this requires some tests (both performance and unit)
        # null_i_partitioned = np.partition(null_i, n_i+1)
        # null_i_first_n_sorted = sorted(null_i_partitioned[:-n_i+1])
        # NOTE(review): sorted() returns a plain list; the arithmetic and
        # comparisons on its slices below only work when the elements are
        # NumPy scalars (reflected operators). Confirm callers pass arrays.
        null_i = sorted(null_i)
        t = None
        
        if all(np.isnan(null_i)):
            return np.nan, False, np.nan, np.nan
        
        # compute ecdf based, biased estimate of p-value
        raw_ecdf_estimate = (ecdf_pseudocount + d_i.sum()) / (N + 1)
        
        if M_i < m:
            # fit GPD, reducing n_i until the goodness-of-fit test below
            # triggers the break
            while n_i > 0:
                
                # -1 because Python has 0-based indexing
                # Threshold: midpoint of the two sorted values just below
                # the n_i largest ones.
                t = (null_i[-n_i-1] + null_i[-n_i-2]) / 2
                
                y_untill_n = null_i[-n_i:]
                exceedences = y_untill_n - t

                assert all(y_untill_n >= t)
                assert len(exceedences) == n_i
                
                fit = genpareto.fit(exceedences)
                fitted = genpareto(*fit)
                gpd_fit = fitted
                
                gpd_fit_p_value = ad_test(exceedences, fitted).pvalue

                # NOTE(review): the loop stops once the p-value is <= 0.05
                # and the fit is used below when it is < 0.05; confirm this
                # matches the intended meaning of ad_test's p-value.
                if gpd_fit_p_value <= 0.05:
                    break
                else:
                    n_i -= decrease_n_by

        if gpd_fit and gpd_fit_p_value < 0.05:
            # Tail probability under the fitted GPD, scaled by the fraction
            # of null values that form the tail.
            return n_i / N * (1 - gpd_fit.cdf(test_i - t)), True, gpd_fit_p_value, raw_ecdf_estimate
        else:
            if gpd_fit:
                # TODO: get index and highlight which observation could not be fitted!
                warn(f'A good GPD fit could not be reached, using ECDF estimate instead')
            
            return raw_ecdf_estimate, False, np.nan, raw_ecdf_estimate
Example #10
0
    def fit_tail(tail):
        """
        Fit a generalized Pareto distribution to a tail with scipy's
        genpareto and evaluate the fitted cdf at the tail values.

        Args:
            tail (numpy.ndarray): tail to fit

        Returns:
            numpy.ndarray, tuple: ascending cdf values of the tail under the
            fitted distribution, fit parameters (c, loc, scale).
        """
        # The tail is expected to be transformed already, which puts the
        # location of the pareto distribution at 0 -- hence floc=0. Check
        # generate_tails for further information.
        params = genpareto.fit(tail, floc=0)
        shape, location, scale = params[0], params[1], params[2]

        # Evaluate the frozen, fitted distribution at the tail values and
        # order the result ascending.
        tail_cdf = genpareto(c=shape, loc=location, scale=scale).cdf(tail)
        tail_cdf.sort()
        return tail_cdf, params
Example #11
0
    def montecarlo_simulation(self, mc_steps=None):
        """
        Runs Monte Carlo simulation for the optimal position.

        Random samples are drawn from the distribution stored for the optimal
        tail, each sample is refitted with location fixed at 0, and the three
        test statistics of the refit are compared with the values stored for
        the data.

        Args:
            mc_steps: number of Monte Carlo steps to run; defaults to
                ``self.mc_steps`` when None.

        Returns:
            int: number of samples whose AU2 statistic exceeded the data's
            int: number of samples whose Anderson-Darling statistic exceeded
                the data's
            int: number of samples whose Cramér-von Mises statistic exceeded
                the data's
            int: number of montecarlo steps

        Raises:
            RuntimeError: if the function gets called before the fit for the
                optimal tail start has been run.
        """
        fits_missing = (self.optimal_tail_index is None
                        or self.rv_list is None
                        or self.cdf_list is None)
        if fits_missing:
            raise RuntimeError("Fits have to run before the Monte Carlo simulation")

        mc_steps = self.mc_steps if mc_steps is None else mc_steps

        # make sure every thread has a different seed
        rng = np.random.RandomState(np.random.seed())

        sample_shape = (mc_steps, self.optimal_tail.size)
        samples = self.rv_list[self.optimal_tail_index].rvs(
            size=sample_shape, random_state=rng)

        # Exceedance counters for the three statistics.
        exceed_au2 = 0
        exceed_a2 = 0
        exceed_w2 = 0
        for step, sample in enumerate(samples):
            print(f"\t{step}/{mc_steps}", end='\r', flush=True)
            descending = np.sort(sample)[::-1]
            params = genpareto.fit(descending, floc=0)
            refit = genpareto(c=params[0], loc=params[1], scale=params[2])
            sorted_cdf = np.sort(refit.cdf(sample))
            if au2(sorted_cdf) > self.au_2_data[self.optimal_tail_index]:
                exceed_au2 += 1
            if anderson_darling(sorted_cdf) > self.anderson_data[self.optimal_tail_index]:
                exceed_a2 += 1
            if cramer_von_mises(sorted_cdf) > self.cramer_data[self.optimal_tail_index]:
                exceed_w2 += 1

        return exceed_au2, exceed_a2, exceed_w2, mc_steps
def SetUpSemiParametricCDFPlot(c1,
                               u,
                               plotvsc1=False,
                               name="Semi-Parametric Fit",
                               xlabel="",
                               ylabel=""):
    """
    Compute the inputs needed to plot a semi-parametric fit of ``c1``.

    A generalized Pareto distribution is fitted to ``abs(value) - u`` for
    every value of ``c1`` whose magnitude exceeds ``u``; the fitted
    parameters are passed to ``HybridSemiParametricGPDCDF`` and
    ``HybridSemiParametricGPDPDF``.

    :param c1: data values.
    :param u: threshold on ``abs(value)`` separating the tails.
    :param plotvsc1: when False the hybrid functions are evaluated on 1000
                     equally spaced points between ``min(c1)`` and
                     ``max(c1)``; otherwise on ``c1`` itself.
    :param name: passed through unchanged.
    :param xlabel: passed through unchanged.
    :param ylabel: passed through unchanged.

    :returns: ``(result, c1, u, r1c, r2c, r3c, r4c, bwArr_Cdf, r1p, r2p,
              r3p, r4p, bwArr_pdf, plotvsc1, name, xlabel, ylabel)`` where
              ``result`` maps ``'%.10f' % u`` to ``(r2c, r2p)``.
    """
    result = dict()
    # The comparison form is kept on purpose so non-bool arguments select
    # the same branch as before.
    if plotvsc1 == False:  # noqa: E712
        x = np.linspace(min(c1), max(c1), 1000)
    else:
        x = c1

    exceedances = [abs(rvs) - u for rvs in c1 if abs(rvs) > u]

    # Fit once with warnings silenced. The previous "retry while the result
    # is None" loop re-ran the same deterministic fit on failure and could
    # never make progress, so an error now propagates to the caller.
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore')
        fits = genpareto.fit(exceedances)

    r1c, r2c, r3c, r4c, bwArr_Cdf = HybridSemiParametricGPDCDF(
        x, u, c1, fits[0], loc=fits[1], scale=fits[2])
    # NOTE(review): the three helper results below are never used; the
    # calls are kept in case the helpers touch their arguments -- confirm
    # and remove.
    emp = pd.Series(r1c).apply(Empirical_StepWise_CDF(sorted(c1)))
    r1s, r2cs = DualSortByL1(r1c, r2c)

    r1p, r2p, r3p, r4p, bwArr_pdf = HybridSemiParametricGPDPDF(
        x, u, c1, fits[0], loc=fits[1], scale=fits[2])
    r1s, r2ps = DualSortByL1(r1p, r2p)

    result['%.10f' % (u)] = (r2c, r2p)

    plt.subplots_adjust(hspace=0.48)
    return result, c1, u, r1c, r2c, r3c, r4c, bwArr_Cdf, r1p, r2p, r3p, r4p, bwArr_pdf, plotvsc1, name, xlabel, ylabel
Example #13
0
    def CalculaParametros(self):
        """
        Extract the peaks of ``self.dadoSerie`` and fit a distribution to them.

        ``self.tipoSerie == 'Parcial'``: a limit is obtained from
        ``lp.LimiteParcial(...).AchaLimite(2)``, the partial-maximum series
        above it is built, and a Generalised Pareto distribution is fitted
        to its peaks. ``'Anual'``: the annual-maximum series is built and a
        Generalised Extreme Value distribution is fitted to its peaks.

        The fitted parameters are printed and returned as the tuple produced
        by the corresponding ``fit`` call. Any other value of
        ``self.tipoSerie`` falls through and returns ``None``.
        """
        if self.tipoSerie == 'Parcial':
            # Find the threshold value:
            limite = lp.LimiteParcial(self.dadoSerie).AchaLimite(2)
            print(limite)
            Parciais = se.Series(self.dadoSerie).serieMaxParcial(limite)
            _, PicosParciais = se.Series(Parciais).separaDados()
            Parametro = genpareto.fit(PicosParciais)
            # '%f' (not '%.f') so the shape is printed with its decimals,
            # like the other two parameters.
            print('Parametros com Pareto: \nForma: %f, Localidade: %f, Escala: %f' %
                  (Parametro[0], Parametro[1], Parametro[2]))
            return Parametro
        elif self.tipoSerie == 'Anual':
            Anuais = se.Series(self.dadoSerie).serieMaxAnual()
            _, PicosAnuais = se.Series(Anuais).separaDados()
            Parametro = genextreme.fit(PicosAnuais)
            print('Parametros com Gev: \nForma: %f, Localidade: %f, Escala: %f' %
                  (Parametro[0], Parametro[1], Parametro[2]))
            return Parametro
Example #14
0
def gpdfit(data, rate, years, missingValue=-9999, minrecords=50, thresh=99.7):
    """
    Fit a Generalised Pareto Distribution to the data. For a quick evaluation,
    we use the 99.7th percentile as a threshold.

    :param data: array of data values.
    :type data: :class:`numpy.ndarray`
    :param rate: accepted for interface compatibility; the value is replaced
                 by the exceedance rate computed from ``data``.
    :param years: array of years for which to calculate return period values.
    :type years: :class:`numpy.ndarray`
    :param float missingValue: value to insert if fit does not converge.
    :param int minrecords: minimum number of valid observations required to
                           perform fitting.
    :param float thresh: Threshold for performing the fitting. Default is the
                     99.7th percentile

    :return: tuple ``(w, mu, scale, shape)``: the return period values, the
             threshold used as location, and the scale and shape parameters
             determined using MLE (as per :meth:`scipy.stats.rv_continuous`).
             All four are ``missingValue`` when fewer than ``minrecords``
             positive values are available or when the fit raises.
    :rtype: tuple
    """
    loc, scale, shp = [missingValue, missingValue, missingValue]
    w = missingValue * np.ones(len(years))

    # Too few positive records: return the placeholders, not a bare None,
    # so callers can always unpack four values.
    if len(data[data > 0]) < minrecords:
        return w, loc, scale, shp

    mu = scoreatpercentile(data, thresh)
    exceedances = data[data > mu]

    # Fraction of all records above the threshold. ``mu`` is a scalar, so
    # the denominator is the number of records.
    rate = float(len(exceedances)) / float(len(data))

    try:
        params = genpareto.fit(exceedances, floc=mu)
    except Exception:
        return w, loc, scale, shp

    w = gpdReturnLevel(years, mu, params[0], params[2], rate)

    return w, mu, params[2], params[0]
    # NOTE(review): headerless fragment -- the ``def`` line of the function
    # these statements belong to is not part of this file view. The names
    # log_return, port_loss, sort_port_loss, port_value, gev, gpd and the
    # helper functions come from code that is not shown here.

    # EWMA inputs: both smoothing factors share one value; the starting mean
    # and spread come from the first 499 rows of log_return.
    lamda = theta = 0.97
    mu = np.mean(log_return.iloc[:499,])
    covariance = np.std(log_return.iloc[:499,])
    ewma_mu,ewma_covariance = EWMA(mu,covariance,lamda,theta,log_return)

    # Block maxima of port_loss in blocks of 125 rows, fitted with gev.fit
    # (presumably scipy's genextreme -- TODO confirm).
    blocks_size = 125
    num_blocks = math.floor(log_return.shape[0]/blocks_size)
    max_in_blocks = block_maximuns(num_blocks,blocks_size, port_loss)
    gev_values = gev.fit(max_in_blocks)

    # Peaks over threshold: u_value is read from sort_port_loss at the 95%
    # position; losses above it, minus u_value, are fitted with gpd.fit
    # (presumably scipy's genpareto -- TODO confirm).
    alpha_thresh = 0.95
    u_value = sort_port_loss[math.floor(log_return.shape[0] * alpha_thresh)]
    cdf_u_value = math.ceil(log_return.shape[0] * alpha_thresh) / log_return.shape[0]
    sort_data = port_loss[port_loss > u_value].dropna()- u_value
    y_data = np.array(sort_data)[:, 0].T
    gpd_values = gpd.fit(y_data)

    # Grid of confidence levels from 0.99 up to (excluding) 0.9999.
    high_alpha = 0.9999
    low_alpha = 0.99
    delta_alpha = 0.000099
    alpha = np.arange(low_alpha,high_alpha,delta_alpha)

    # One table per method, each evaluated on the alpha grid.
    emp_var_table = EMP_VaR(alpha, sort_port_loss)
    ewma_var_table = EWMA_VaR(ewma_mu[0],ewma_covariance[0], alpha,port_value )
    gev_var_table = GEV_VaR(gev_values,blocks_size,alpha)
    gpd_var_table = GPD_VaR(gpd_values,u_value,cdf_u_value,alpha)

    # Loss implied by a fixed log return, passed to see_crash together with
    # the empirical and EWMA tables.
    crash_log_return = -0.099452258
    crash_loss = -port_value * crash_log_return
    emp_pre = see_crash(emp_var_table, crash_loss, delta_alpha)
    ewma_pre = see_crash(ewma_var_table, crash_loss, delta_alpha)
Example #16
0
        label="rnds")
ax.legend(loc="best", frameon=False)

print(f"maximum: {max(random_numbers)}")
print(f"minimum: {min(random_numbers)}")

# Sample moments of the draws and of their squares.
sample_mean = np.mean(random_numbers)
sample_mean_sq = np.mean(random_numbers**2)
sample_variance = np.var(random_numbers)
sample_variance_sq = np.var(random_numbers**2)

# Closed-form estimates written in terms of the first two raw moments.
sigma_est = 0.5 * sample_mean * sample_mean_sq / (
    sample_mean_sq - sample_mean**2)  # OK
gamma_est = 0.5 - (sample_mean**2 / (2 *
                                     (sample_mean_sq - sample_mean**2)))  # ok
print(f"sigma closed est: {sigma_est}")
print(f"gamma closed est: {gamma_est}")

# Method-of-moments estimates written in terms of mean and variance.
mean_sq_over_variance = sample_mean**2 / sample_variance
sigma_moments = 0.5 * sample_mean * (mean_sq_over_variance + 1)
gamma_moments = -0.5 * (mean_sq_over_variance - 1)
print(f"sigma moments est:  {sigma_moments}")
print(f"gamma moments est: {gamma_moments}")

# Maximum-likelihood fit; genpareto.fit returns (shape, loc, scale).
mle_fit = genpareto.fit(random_numbers)
for mle_label, mle_value in zip(("gamma", "mu", "sigma"), mle_fit):
    print(f"{mle_label} mle {mle_value}")
plt.show()
Example #17
0
        y, e, w = honu_filter.run(desired_output, filter_data)
        elbnd = pa.detection.ELBND(w, e, function="sum")

        dw = np.copy(w)
        dw[1:] = np.abs(np.diff(dw, n=1, axis=0))
        dw_count = int(dw.shape[0])

        hpp = np.ones((dw_count - gev_window, filter_len))
        for i in range(gev_window, dw.shape[0]):
            if i % 100 == 0:
                pass  # print((str(datetime.now())), " processing: ", i)
            for j in range(filter_len):
                poted_values = pot(dw[i - gev_window:i, j], 1)

                if dw[i, j] > poted_values[-1]:
                    fit = genpareto.fit(poted_values, floc=[poted_values[-1]])
                    fit = genpareto.fit(poted_values,
                                        floc=fit[1],
                                        fscale=fit[2])
                    if j == 0:
                        #print(fit[2])
                        mu_check.append(poted_values[-1])
                    gamma = fit[0]
                    mu = fit[1]

                    sigma = fit[2]
                    #gpd_params_dict[str(j + 1)]["gamma"].append(gamma)
                    #gpd_params_dict[str(j + 1)]["mu"].append(mu[0])
                    #gpd_params_dict[str(j + 1)]["sigma"].insert(sigma)
                    if dw[i, j] >= fit[1]:
                        hpp[i - gev_window, j] = 1 - genpareto.cdf(
Example #18
0
def extremesl_fit(station_data_file, pipeline_id):
    """
    Fit a Generalised Pareto Distribution to the extremes of every station
    and write the results to ``<pipeline_id>_fit.pkl`` next to this module.

    :param station_data_file: path of a pickle holding a dict keyed by
        station id; each entry provides ``'pot_vals'``, ``'decl_extremes'``
        and ``'obs'``.
    :param pipeline_id: prefix of the output pickle's file name.

    :returns: ``0`` on success.
    :raises OSError: if the station data file cannot be opened (a message is
        printed first).
    :raises Exception: if no station is left with a valid GPD fit.
    """
    # Load the station data file. Opening and loading are separated so only
    # an open failure produces the "Cannot open" message, and the handle is
    # always closed.
    # SECURITY NOTE(review): pickle.load can execute arbitrary code; only
    # use this with station files from a trusted source.
    try:
        f = open(station_data_file, 'rb')
    except OSError:
        print("Cannot open station data file: {}\n".format(station_data_file))
        raise

    with f:
        station_data = pickle.load(f)

    # Keep track of station IDs that don't produce valid GPD fit
    bad_stations = []

    # Initialize dictionary to hold fitted data
    fitted_data = {}

    # Loop through the available station data
    for station_id in station_data:

        # range of Peak-Over-Threshold values and corresponding heights
        potvals = station_data[station_id]['pot_vals']

        # extreme values based on POT with 95%
        extremes_prepo = station_data[station_id]['decl_extremes']

        # Location parameter = threshold water level above which return
        # levels are estimated with the GPD. The stored POT value is used
        # directly (an earlier variant looked it up by threshold).
        loc = potvals

        # extremes that exceed location parameter
        extremes_loc = extremes_prepo[extremes_prepo['height'] > loc]
        extremes_loc_timesorted = extremes_loc.sort_values(by=['mytime'])

        # count declustered days with extreme obs per year
        decl_esls_pyear = extremes_loc.groupby(
            extremes_loc_timesorted.mytime.dt.year, as_index=True).count()
        esl_years = np.unique(extremes_loc_timesorted['mytime'].dt.year.values)

        # average exceedances per year for configured threshold
        # NOTE (original author): not completely correct, because a year of
        # observations may hold fewer hours than 365.25 * 24.
        avg_exceed = 365.25 * 24 * len(extremes_loc) / len(
            station_data[station_id]['obs'])

        ##### FIT GPD
        exceedances = extremes_loc['height'].values - loc

        # provide initial guess using method of moments (based on gpfit.m)
        xbar = np.mean(exceedances)  # mean of extremes
        s2 = np.var(exceedances)  # variance of extremes
        k0 = -.5 * ((xbar**2) / s2 - 1)  # initial guesses
        sigma0 = .5 * xbar * ((xbar**2) / s2 + 1)
        xmax = max(exceedances)

        # Method of moments invalid (code based on gpfit.m)
        if (k0 < 0 and xmax >= -sigma0 / k0):
            # assume exponential distribution
            k0 = 0
            sigma0 = xbar

        # fit gpd based on exceedances and initial guess; only the scale
        # guess is forwarded to the fit. Optimization values differ slightly
        # from gpfit in MATLAB.
        gp_params = genpareto.fit(exceedances, loc=0, scale=sigma0)

        # calculate covariance matrix of estimated parameters
        gp_nlogl, gp_cov = gplike(gp_params[0], gp_params[2], exceedances)

        # Are the GPD parameters outside the supported range? If so, add
        # this station to the list of bad stations.
        # np.inf is used because the np.Inf alias is gone in NumPy 2.
        if gp_nlogl == np.inf:
            print(
                'GDP parameters of ' + station_id +
                ' are outside of the supported range, confidence intervals and standard errors cannot be computed reliably.'
            )
            print(station_id + ' will be ommitted from the results.')
            bad_stations.append(station_id)

        ###### QUERY HISTORICAL RETURN FREQUENCIES FOR TEST HEIGHTS
        # get MHHW for Gumbel distribution below Pareto
        obs = station_data[station_id]['obs']
        obs2D = obs.groupby(pd.Grouper(key='mytime',
                                       freq='2D')).max()  # 2-daily maxima
        # MHHW as mean of the maximum value per 2 days (to account for
        # tidal cycles longer than 1 day)
        mhhw = np.nanmean(obs2D.height)
        # assume MHHW is exceeded once every 2 days
        mhhwFreq = 365.25 / 2

        ##### STORE DATA IN DICTIONARY
        fitted_data[station_id] = {
            'loc': loc,
            'gp_shape': gp_params[0],
            'gp_scale': gp_params[2],
            'gp_cov': gp_cov,
            'extremes_loc': extremes_loc,
            'avg_exceed': avg_exceed,
            'decl_esls_pyear': decl_esls_pyear,
            'esl_years': esl_years,
            'mhhw': mhhw,
            'mhhwFreq': mhhwFreq,
        }

    # Remove the stations that don't support GPD fit
    for bad_station in bad_stations:
        del fitted_data[bad_station]

    # If there are no more stations left, raise an error
    if len(fitted_data) == 0:
        raise Exception("No stations available with valid GPD fit")

    # Write station data to an output pickle
    outdir = os.path.dirname(__file__)
    outpath = os.path.join(outdir, "{}_fit.pkl".format(pipeline_id))
    with open(outpath, 'wb') as outfile:
        pickle.dump(fitted_data, outfile)

    return 0
Example #19
0
    def _PlotSemiParametricFitResults(self,
                                      c1,
                                      u,
                                      r1c,
                                      r2c,
                                      r3c,
                                      r4c,
                                      bwArr_Cdf,
                                      r1p,
                                      r2p,
                                      r3p,
                                      r4p,
                                      bwArr_pdf,
                                      plotvsc1=False,
                                      name="Semi-Parametric Fit",
                                      xlabel="",
                                      ylabel=""):
        '''
        Wrapper to plot the results of SemiParametricCDFFit.

        Draws two panels on a new figure: the precomputed CDF curves
        (r3c/r4c, the empirical step-wise CDF of c1 evaluated at r1c, and
        r1c/r2c after DualSortByL1) and the precomputed PDF curves (r3p/r4p,
        r1p/r2p after DualSortByL1) over a 15-bin histogram of c1.
        bwArr_Cdf and bwArr_pdf only appear in the panel titles; name is
        used as window title and figure label.

        Returns None.
        '''
        # NOTE(review): x is computed but never used in this method.
        x = np.linspace(min(c1), max(c1), 1000) if plotvsc1 == False else c1

        us = list([u])
        # NOTE(review): the figure is created with 3 rows, while the
        # plt.subplot calls below address a 2-row grid.
        fig, ax = plt.subplots(3,
                               len(us),
                               sharey=True,
                               figsize=(7, 7 * len(us)))
        fig.subplots_adjust(wspace=0)
        # NOTE(review): canvas.set_window_title may not exist in recent
        # Matplotlib releases -- confirm against the pinned version.
        fig.canvas.set_window_title(name)
        fig.canvas.figure.set_label(name)
        result = dict()
        i = 1
        for u in us:
            # Split c1 into exceedances of |value| over u and the rest.
            exceedances = list()
            internals = list()
            for rvs in c1:
                if abs(rvs) > u:
                    exceedances.append(abs(rvs) - u)
                else:
                    internals.append(rvs)
            # NOTE(review): the fit result is not used anywhere below, and
            # the loop would repeat the same call forever if it kept
            # raising a Warning.
            fits = None
            while fits == None:
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore')
                    try:
                        fits = genpareto.fit(exceedances)
                    except Warning as e:
                        print('error found:', e)
                    warnings.filterwarnings('default')
            internals = np.array(internals).reshape((len(internals), 1))
            # Disabled alternative (Gaussian centre with GPD tails), kept
            # by the original author:
            #c1s = np.array(c1).reshape((len(c1),1))
            #cdf_smoother = kde_statsmodels_m_cdf(internals,x,bandwidth=0.2)
            #pdf_smoother = kde_statsmodels_m_pdf(internals,x,bandwidth=0.2)
            #plt.subplot(2,len(us),i)
            #plt.plot(x, HybridNormalGPDCDF(x,u,mean(c1),sd(c1),fits[0],loc=fits[1],scale=fits[2]), linewidth=2)
            #plt.plot(x, norm.cdf(x,mean(c1),sd(c1)), linewidth=2)
            #plt.plot(x, HybridNormalGPDPDF(x,u,mean(c1),sd(c1),fits[0],loc=fits[1],scale=fits[2]), linewidth=2)
            #plt.plot(x, norm.pdf(x,mean(c1),sd(c1)), linewidth=2)
            #plt.hist(np.array(c1), bins=15, normed=True)
            #plt.xlabel(xlabel)
            #plt.ylabel(ylabel)
            #plt.title("Generalised Pareto Tails on Gaussian Fitted Center")
            #plt.legend(["Fitted_HybridCDF", "Fitted_Normal_CDF", "Fitted_HybridPDF", "Fitted_Normal_PDF", "Data Histogram"],loc='best')

            # CDF panel.
            plt.subplot(2, len(us), i)

            emp = pd.Series(r1c).apply(Empirical_StepWise_CDF(sorted(c1)))
            r1s, r2cs = DualSortByL1(r1c, r2c)
            plt.plot(r3c, r4c, linewidth=2)
            plt.plot(r1c, emp, linewidth=2)
            plt.plot(r1s, r2cs, linewidth=2)
            #plt.plot(r1, norm.cdf(r1,mean(c1),sd(c1)), linewidth=2)
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            plt.title("Semi Parametric CDF with BandWidth {0}".format(
                bwArr_Cdf).replace('[ ', "list_").replace(']', "_"))
            #plt.legend(["Fitted_HybridCDF", "ECDF Comparison", "CDF_Smoother", "Fitted_NormalCDF", "Fitted_HybridPDF", "PDF_Smoother", "Fitted_Normal_PDF", "Student_T Hist"],loc='best')
            plt.legend(["Fitted_HybridCDF", "ECDF Comparison", "CDF_Smoother"],
                       loc='best')

            # PDF panel.
            plt.subplot(2, len(us), i + 1)

            r1s, r2ps = DualSortByL1(r1p, r2p)
            plt.plot(r3p, r4p, linewidth=2)
            plt.plot(r1s, r2ps, linewidth=2)
            #plt.plot(r1, norm.pdf(r1,mean(c1),sd(c1)), linewidth=2)
            # NOTE(review): the `normed` keyword may be rejected by recent
            # Matplotlib releases (`density` is its replacement) -- confirm
            # against the pinned version.
            plt.hist(np.array(c1), bins=15, normed=True)
            plt.xlabel(xlabel)
            plt.ylabel(ylabel)
            plt.title("Semi Parametric PDF with BandWidth {0}".format(
                bwArr_pdf).replace('[ ', "list_").replace(']', "_"))
            plt.legend(["Fitted_HybridPDF", "PDF_Smoother", "Data Histogram"],
                       loc='best')

            # result is filled in but not returned by this method.
            result['%.10f' % (u)] = (r2c, r2p)
            i += 3

            plt.subplots_adjust(hspace=0.48)
Example #20
0
 def fit_resample(self):
     """Draw ``self.size`` variates from the generalized Pareto
     distribution with this object's shape, location and scale, and return
     the parameters of a fresh ``genpareto.fit`` on that sample."""
     synthetic = genpareto.rvs(self.shape,
                               loc=self.location,
                               scale=self.scale,
                               size=self.size)
     refit_params = genpareto.fit(synthetic)
     return refit_params
Example #21
0
#print(bins_ot)

# 2x2 grid of axes; the bottom-right panel is switched off.
fig, ax = plt.subplots(nrows=2, ncols=2)
fig.tight_layout()
ax[-1, -1].axis('off')

# Top-left: raw-count histogram of the latencies on a log y-axis.
hist_ot = ax[0][0].hist(x=lat, bins=bins_ot, histtype='stepfilled', alpha=0.3)
ax[0][0].set_xlabel('latency [\u03BCs]', fontsize=8)
ax[0][0].set_yscale('log')
#print(hist_ot[0])

# Bottom-left: the same histogram normalised to a density, so the fitted
# pdf can be drawn on top of it.
hist_ot_norm = ax[1][0].hist(x=lat, bins=bins_ot,
                             density=True, histtype='stepfilled', alpha=0.3)

# Fit using the fitter of the genpareto class (shown in red).
# NOTE(review): `loc=threshold` is passed as `loc`, not `floc`; if gpareto
# is scipy.stats.genpareto that is only a starting guess, and the location
# is still optimised -- confirm this is intended.
ret = gpareto.fit(lat, loc=threshold)
# NOTE(review): the colour is given twice ('r-' and color='red').
ax[1][0].plot(x, gpareto.pdf(x, c=ret[0],  loc=ret[1],  scale=ret[2]),
              'r-', lw=1, color='red',  alpha=0.8)

ax[1][0].set_xlabel('latency [\u03BCs]', fontsize=8)
print(ret)
# chi2_test is defined elsewhere; it receives the density histogram and the
# fitted parameters, and its result is printed with three decimals.
print('\ngoodness-of-fit: ' + '{:03.3f}'.format(chi2_test(hist_ot_norm,
                                                          n_bins=n_bins,
                                                          c=ret[0],
                                                          loc=ret[1],
                                                          scale=ret[2],
                                                          norm=len(lat))))

print("\n curve_fit:")
# Fit using the curve_fit fitter. Fix the value of the "loc" parameter.
popt, pcov = cfit(lambda x, c, scale: gpareto.pdf(x, c=c, loc=threshold, scale=scale),
Example #22
0
def gpd_fit(y):
    """Return the maximum-likelihood shape and scale of a generalized
    Pareto distribution fitted to ``y`` with the location fixed at 0.

    Warnings raised during the fit are suppressed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        fitted = genpareto.fit(y, floc=0)
    xi_mle, _, sig_mle = fitted
    return xi_mle, sig_mle
def SemiParametricCDFFit(c1,
                         u,
                         plotvsc1=False,
                         name="Semi-Parametric Fit",
                         xlabel="",
                         ylabel=""):
    '''
    Fit and plot a semi-parametric CDF and PDF for the sample ``c1``.

    Every observation with ``abs(value) > u`` is treated as a tail
    exceedance; the amounts ``abs(value) - u`` (both tails pooled) are fitted
    with a Generalised Pareto distribution via ``genpareto.fit``. The fitted
    parameters are passed to ``HybridSemiParametricGPDCDF`` and
    ``HybridSemiParametricGPDPDF``, whose outputs are plotted. According to
    the original description of this function those helpers use a Gaussian
    kernel estimate for the centre of the distribution and the GPD for the
    tails; that logic lives in the helpers, not here.

    Background on kernel density estimation:
    https://mglerner.github.io/posts/histograms-and-kernel-density-estimation-kde-2.html?p=28

    :param c1: sample of observations.
    :param u: threshold on ``abs(value)`` separating centre from tails.
    :param plotvsc1: if false, the helpers are evaluated on 1000 equally
                     spaced points between ``min(c1)`` and ``max(c1)``;
                     if true, they are evaluated on ``c1`` itself.
    :param name: figure label and window title.
    :param xlabel: x-axis label for both panels.
    :param ylabel: y-axis label for both panels.
    :return: dict mapping ``'%.10f' % u`` to a tuple made of the second value
             returned by the CDF helper and the second value returned by the
             PDF helper.
    '''
    x = c1 if plotvsc1 else np.linspace(min(c1), max(c1), 1000)

    us = [u]
    # NOTE(review): the figure is created with 3 rows, but the panels below
    # are addressed through a 2-row grid with plt.subplot(2, ...). The layout
    # is kept as it was -- confirm whether 3 rows are really intended.
    fig, _ = plt.subplots(3, len(us), sharey=True, figsize=(7, 7 * len(us)))
    fig.subplots_adjust(wspace=0)
    # The window title is set through the figure manager; the canvas-level
    # set_window_title was removed from matplotlib.
    fig.canvas.manager.set_window_title(name)
    fig.set_label(name)
    result = dict()
    i = 1
    for u in us:
        # Distance beyond the threshold for every observation in either tail.
        exceedances = [abs(rvs) - u for rvs in c1 if abs(rvs) > u]

        # Fit once with warnings silenced. The fit is deterministic, so the
        # former "retry until it stops warning" loop could only repeat the
        # same outcome (or spin forever); real errors now propagate.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            fits = genpareto.fit(exceedances)

        # First panel: hybrid CDF, empirical step-wise CDF and smoother.
        plt.subplot(2, len(us), i)
        r1, r2c, r3, r4, bwArr = HybridSemiParametricGPDCDF(x,
                                                            u,
                                                            c1,
                                                            fits[0],
                                                            loc=fits[1],
                                                            scale=fits[2])
        emp = pd.Series(r1).apply(Empirical_StepWise_CDF(sorted(c1)))
        r1s, r2cs = DualSortByL1(r1, r2c)
        plt.plot(r3, r4, linewidth=2)
        plt.plot(r1, emp, linewidth=2)
        plt.plot(r1s, r2cs, linewidth=2)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(
            "Semi Parametric CDF with BandWidth {0}".format(bwArr).replace(
                '[ ', "list_").replace(']', "_"))
        plt.legend(["Fitted_HybridCDF", "ECDF Comparison", "CDF_Smoother"],
                   loc='best')

        # Second panel: hybrid PDF and smoother over a density histogram.
        plt.subplot(2, len(us), i + 1)
        r1, r2p, r3, r4, bwArr = HybridSemiParametricGPDPDF(x,
                                                            u,
                                                            c1,
                                                            fits[0],
                                                            loc=fits[1],
                                                            scale=fits[2])
        r1s, r2ps = DualSortByL1(r1, r2p)
        plt.plot(r3, r4, linewidth=2)
        plt.plot(r1s, r2ps, linewidth=2)
        # `density=True` replaces the removed `normed=True` keyword.
        plt.hist(np.array(c1), bins=15, density=True)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)
        plt.title(
            "Semi Parametric PDF with BandWidth {0}".format(bwArr).replace(
                '[ ', "list_").replace(']', "_"))
        plt.legend(["Fitted_HybridPDF", "PDF_Smoother", "Data Histogram"],
                   loc='best')

        result['%.10f' % (u)] = (r2c, r2p)
        i += 3

        plt.subplots_adjust(hspace=0.48)
    return result
    skew_SN = (mean_X_cube) * (lamda * (mean_X_sq)**3)**(-1 / 2)

    gamma_alpha = 4 * (skew_SN)**(-2)
    gamma_beta = (gamma_alpha / (lamda * mean_X_sq))**(1 / 2)
    gamma_k = lamda * mean_X - gamma_alpha / gamma_beta
    sort_comp_poisson_rnd = compound_poisson_distribution(
        lamda, num_values, mu, sigma)
    sort_comp_poisson_rnd.sort()
    alpha_low = .99
    alpha_high = .99999
    low_val = sort_comp_poisson_rnd[math.floor(num_values * alpha_low)]
    high_val = sort_comp_poisson_rnd[math.floor(num_values * alpha_high)]
    mu_gp = low_val
    sample_data = compound_poisson_distribution(lamda, num_values, mu, sigma)
    data_gp = sample_data[sample_data > mu_gp] - mu_gp
    gpd_value = gpd.fit(data_gp)

    cdf_values = np.arange(low_val, high_val + (high_val - low_val) / 1000,
                           (high_val - low_val) / 1000)

    norm_cdf_tail = 1 - norm.cdf(cdf_values, mean_SN, (var_SN)**(1 / 2))
    gamma_cdf_tail = 1 - gamma.cdf(
        cdf_values - gamma_k, gamma_alpha, scale=1 / gamma_beta)
    GP_cdf_tail = 1 - (genpareto.cdf(
        cdf_values - low_val, gpd_value[0], scale=gpd_value[2]) * 0.01 + 0.99)
    emp_cdf_tails = emp_cdf_tail(sample_data, cdf_values)

    plt.loglog(cdf_values, norm_cdf_tail, label='CLT')
    plt.loglog(cdf_values, gamma_cdf_tail, label='GAMMA')
    plt.loglog(cdf_values, GP_cdf_tail, label='GP')
    plt.loglog(cdf_values, emp_cdf_tails, label='EMP')