Example #1
 def _setup(dataset1: np.ndarray,
            dataset2: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     dataset1 = dataset1.flatten()
     dataset2 = dataset2.flatten()
     assert dataset1.size == dataset2.size
     afactor = dataset1.size / (dataset1.size + 1)
     first_ecdf = empirical_distribution.ECDF(dataset1)(dataset1) * afactor
     second_ecdf = empirical_distribution.ECDF(dataset2)(dataset2) * afactor
     return first_ecdf, second_ecdf
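A minimal usage sketch for the snippet above (not part of the original project); it assumes numpy and that `empirical_distribution` is statsmodels' statsmodels.distributions.empirical_distribution module, as in the code shown.

import numpy as np
from statsmodels.distributions import empirical_distribution

rng = np.random.default_rng(0)
a = rng.normal(size=100)
b = rng.normal(loc=0.5, size=100)
ecdf_a, ecdf_b = _setup(a, b)   # both ECDF vectors are scaled by n / (n + 1)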
Example #2
def gof_ks(x, Rho, P):
    """
  The function performs the Kolmogorov-Smirnov goodness-of-fit test for the Vasicek distribution.
  Parameters:
    x   : A numeric vector in the interval of (0, 1) to test
    Rho : The Rho parameter in the Vasicek distribution
    P   : The P parameter in the Vasicek distribution
  Returns:
    A dictionary with ks-statistic and pvalue
  Example:
    import py_vsk
    x = py_vsk.vsk_rvs(100, Rho = 0.2, P = 0.1)
    gof_ks(x, Rho = 0.2, P = 0.1)
    # {'ks': 0.09, 'pvalue': 0.8154147124661313}
  """

    _x = sorted([_ for _ in x if _ > 0 and _ < 1 and not numpy.isnan(_)])

    ocdf = empirical_distribution.ECDF(_x)(_x)

    ecdf = [_["cdf"] for _ in vsk_cdf(_x, Rho=Rho, P=P)]

    _rst = ks_2samp(ecdf, ocdf)

    return ({"ks": _rst.statistic, "pvalue": _rst.pvalue})
Example #3
def Covariance_valuation(data):

    x, y = np.shape(data)
    #data is the data matrix at all time steps. The dimension would be X*Y
    #data 2 is required if calculating dissimilarity

    #Step 1: Transform the data into empirical CDF
    P = np.zeros((x, y))
    for i in range(0, y):
        ECDF = edis.ECDF(data[:, i])
        P[:, i] = ECDF(data[:, i])


    #Step 2: Transform the ECDF into a uniform distribution
    Y = 2 * (P - 0.5)
    #Calculate different indices
    #M is the surfeit index, which is the mean state of the system
    M = 1 / y * np.sum(Y, axis=1)
    #S is the severity index showing, system-wide, how extreme the states are
    S = 1 / y * np.sum(np.abs(Y), axis=1)
    #D measures how dissimilar the states of all sites are with respect to each other
    D1 = np.zeros((x, y - 1))
    D2 = np.zeros((x, y))
    for i in range(0, y - 1):
        for j in range(i + 1, y):
            D2[:, j] = np.abs(Y[:, i] - Y[:, j])
        D1[:, i] = np.sum(D2, axis=1)
    D = 1 / y**2 * np.sum(D1, axis=1)

    return M, S, D
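A minimal usage sketch for the function above, with synthetic data (illustrative only); it assumes numpy as np and statsmodels' empirical_distribution imported as edis, matching the snippet.

import numpy as np
from statsmodels.distributions import empirical_distribution as edis

data = np.random.default_rng(1).normal(size=(500, 4))   # 500 time steps, 4 sites
M, S, D = Covariance_valuation(data)
print(M.shape, S.shape, D.shape)                         # -> (500,) (500,) (500,)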
Example #4
def omega_empirical(returns, target_rtn=0, log=True, plot=False, steps=1000):
    """
    Omega Ratio based on empirical distribution.
    """
    # validate_return_type(return_type)

    if not log:
        returns = pct_to_log_return(returns)

    # TODO
    ecdf = sde.ECDF(returns)

    # Generate computation space
    x = np.linspace(start=returns.min(), stop=returns.max(), num=steps)
    y = ecdf(x)

    norm_cdf = ss.norm.cdf(x, loc=returns.mean(), scale=returns.std(ddof=1))

    # Plot empirical distribution CDF versus Normal CDF with same mean and
    # stdev
    if plot:
        fig, ax = plt.subplots()
        fig.set_size_inches((12, 6))
        ax.plot(x, y, c="r", ls="--", lw=1.5, alpha=0.8, label="ECDF")
        ax.plot(x, norm_cdf, alpha=0.3, ls="-", c="b", lw=5, label="Normal CDF")
        ax.legend(loc="best")
        plt.show(fig)
        plt.close(fig)
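The body above stops at the TODO and only plots the two CDFs. As a hedged sketch (not the original library's implementation), the Omega ratio could be read off the ECDF as the area above the curve to the right of the target divided by the area under the curve to the left of it, assuming sde is statsmodels' empirical_distribution as in the snippet.

import numpy as np
import statsmodels.distributions.empirical_distribution as sde

def omega_from_ecdf(returns, target_rtn=0.0, steps=1000):
    # Omega(L) ~= integral_L^max (1 - F(x)) dx / integral_min^L F(x) dx
    ecdf = sde.ECDF(returns)
    x = np.linspace(returns.min(), returns.max(), num=steps)
    y = ecdf(x)
    dx = x[1] - x[0]
    above = x >= target_rtn
    gain = np.sum(1.0 - y[above]) * dx   # expected gain above the target
    loss = np.sum(y[~above]) * dx        # expected shortfall below the target
    return gain / loss if loss > 0 else np.inf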
Example #5
def ecdfer(df: pd.DataFrame,
           ascending: bool = True,
           prediction_column: str = "prediction",
           ecdf_column: str = "prediction_ecdf",
           max_range: int = 1000) -> LearnerReturnType:
    """
    Learns an Empirical Cumulative Distribution Function from the specified column
    in the input DataFrame. It is usually used in the prediction column to convert
    a predicted probability into a score from 0 to 1000.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas DataFrame that must contain a `prediction_column` column.

    ascending : bool
        Whether to compute an ascending ECDF or a descending one.

    prediction_column : str
        The name of the column in `df` to learn the ECDF from.

    ecdf_column : str
        The name of the new ECDF column added by this function

    max_range : int
        The maximum value for the ECDF. It will go
        from 0 to max_range.
    """

    if ascending:
        base = 0
        sign = 1
    else:
        base = max_range
        sign = -1

    values = df[prediction_column]

    ecdf = ed.ECDF(values)

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        return new_df.assign(
            **{
                ecdf_column: (
                    base + sign * max_range * ecdf(new_df[prediction_column]))
            })

    p.__doc__ = learner_pred_fn_docstring("ecdefer")

    log = {
        'ecdfer': {
            'nobs': len(values),
            'prediction_column': prediction_column,
            'ascending': ascending,
            'transformed_column': [ecdf_column]
        }
    }

    return p, p(df), log
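A minimal usage sketch (assuming this is the fklearn-style learner, which returns the prediction function, the transformed DataFrame, and a log dict):

import numpy as np
import pandas as pd

train = pd.DataFrame({"prediction": np.random.default_rng(2).uniform(size=100)})
p, train_scored, log = ecdfer(train, ascending=True, max_range=1000)
scores = p(pd.DataFrame({"prediction": [0.1, 0.5, 0.9]}))   # adds "prediction_ecdf"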
Example #6
def interp_ecfd(sample):
    # https://stackoverflow.com/a/44163082
    sample_edf = edf.ECDF(sample)
    slope_changes = sorted(set(sample))
    sample_edf_values_at_slope_changes = [
        sample_edf(item) for item in slope_changes
    ]
    inverted_edf = interp1d(sample_edf_values_at_slope_changes, slope_changes)
    return inverted_edf
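A minimal usage sketch: the returned interpolator behaves as an approximate empirical quantile function (inverse CDF), valid on the range of ECDF values actually observed (assuming edf and interp1d are imported as in the snippet).

import numpy as np

sample = np.random.default_rng(3).normal(size=1000)
inv_cdf = interp_ecfd(sample)
print(inv_cdf(0.5))   # roughly the sample median
print(inv_cdf(0.9))   # roughly the empirical 90th percentile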
Example #7
def linear_interpolation(array, sample_size):
    """
    Sampling from 1D array with linear interpolation as inverse cdf
    :param array: input 1D array of Numbers
    :param sample_size: number of samples
    :return: array of simulated data
    """
    sample_edf = edf.ECDF(array)
    slope_changes = sorted(set(array))
    sample_edf_values_at_slope_changes = [
        sample_edf(item) for item in slope_changes
    ]
    # uniform draws below 1/n (the smallest ECDF value) would fall outside the
    # interpolation range, so extrapolate flat to the sample min / max
    inverted_edf = interp1d(sample_edf_values_at_slope_changes, slope_changes,
                            bounds_error=False,
                            fill_value=(slope_changes[0], slope_changes[-1]))
    return inverted_edf(np.random.uniform(0, 1, sample_size))
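A minimal usage sketch: draw 500 new values whose distribution approximates that of a toy dataset (again assuming edf and interp1d are imported as in the snippet).

import numpy as np

data = np.random.default_rng(4).exponential(size=1000)
simulated = linear_interpolation(data, sample_size=500)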
Example #8
def gof_chisq(x, Rho, P, n=10):
    """
  The function performs the chi-square goodness-of-fit test for the Vasicek distribution.
  Parameters:
    x   : A numeric vector in the interval of (0, 1) to test
    Rho : The Rho parameter in the Vasicek distribution
    P   : The P parameter in the Vasicek distribution
    n   : The number of groups for the chi-square test. The value should be picked such
          that all observed and expected frequencies are at least 5.
  Returns:
    A dictionary with chi-square statistic, pvalue, and a table to calculate chi-square
  Example:
    import py_vsk
    x = py_vsk.vsk_rvs(100, Rho = 0.2, P = 0.1)
    gof_chisq(x, Rho = 0.2, P = 0.1)['stat']
    # {'chisq': 11.0, 'pvalue': 0.27570893677222197}
  """

    _x = sorted([_ for _ in x if _ > 0 and _ < 1 and not numpy.isnan(_)])

    ocdf = empirical_distribution.ECDF(_x)(_x)

    ecdf = [_["cdf"] for _ in vsk_cdf(_x, Rho=Rho, P=P)]

    _cut = [_ for _ in sorted(qcut(ecdf, n) + [0, 1])]

    ogrp = numpy.searchsorted(_cut, ocdf).tolist()

    egrp = numpy.searchsorted(_cut, ecdf).tolist()

    _tbl = [
        dict(
            zip(["group", "observed", "expected"], [
                g,
                len([_ for _ in ogrp if _ == g]),
                len([_ for _ in egrp if _ == g])
            ])) for g in sorted(set(egrp))
    ]

    _rst = chisquare([_["observed"] for _ in _tbl],
                     [_["expected"] for _ in _tbl])

    return ({
        "stat": {
            "chisq": _rst.statistic,
            "pvalue": _rst.pvalue
        },
        "tbl": _tbl
    })
Example #9
File: pbo.py Project: jijoy/pypbo
def pbo(M,
        S,
        metric_func,
        threshold,
        n_jobs=1,
        verbose=False,
        plot=False,
        hist=False):
    '''
    Based on http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2326253

    Features:
    * training and test sets are of equal size, providing comparable accuracy
    to both IS and OOS Sharpe ratios.
    * CSCV is symmetric, decline in performance can only result from
    overfitting, not arbitrary discrepancies between the training and test
    sets.
    * CSCV respects the time-dependence and other season-dependent features
    present in the data.
    * Results are deterministic, can be replicated.
    * Dispersion in the distribution of logits conveys relevant info regarding
    the robustness of the strategy selection process.
    * Model-free, non-parametric. Logits distribution resembles the cumulative
    Normal distribution if w_bar are close to uniform distribution (i.e. the
    backtest appears to be information-less). Therefore, for good backtesting,
    the distribution of logits will be centered in a significantly positive
    value, and its tail will marginally cover the region of negative logit
    values.

    Limitations:
    * CSCV is symmetric, for some strategies, K-fold CV might be better.
    * Not suitable for time series with strong auto-correlation, especially
    when S is large.
    * Assumes all the sample statistics carry the same weight.
    * Entirely possible that all the N strategy configs have high but similar
    Sharpe ratios. Therefore, PBO may appear high, however, 'overfitting' here
    is among many 'skilful' strategies.

    Parameters:

    M:
        returns data, numpy or dataframe format.
    S:
        number of chunks to divide M into; must be an even number. Paper suggests
        setting S = 16. See paper for details on the choice of S.
    metric_func:
        evaluation function for returns data
    threshold:
        used as the cutoff for the probability of OOS loss calculation. For the
        Sharpe ratio, this should be 0 to indicate probability of loss.
    n_jobs:
        if greater than 1 then enable parallel mode
    hist:
        Default False, whether to plot a histogram for the rank of logits.
        Some problems exist when S >= 10; need to look at why numpy /
        matplotlib behaves this way.

    Returns:
    PBO result in namedtuple, instance of PBO.
    '''
    if S % 2 == 1:
        raise ValueError(
            'S must be an even integer, {:.1f} was given'.format(S))

    n_jobs = int(n_jobs)
    if n_jobs < 1:
        n_jobs = 1

    if isinstance(M, pd.DataFrame):
        # convert to numpy values
        if verbose:
            print('Convert from DataFrame to numpy array.')
        M = M.values

    # Paper suggests T should be 2x the no. of observations used by investor
    # to choose a model config, due to the fact that CSCV compares combinations
    # of T/2 observations with their complements.
    T, N = M.shape
    residual = T % S
    if residual != 0:
        M = M[residual:]
        T, N = M.shape

    sub_T = T // S

    if verbose:
        print('Total sample size: {:,d}, chunk size: {:,d}'.format(T, sub_T))

    # generate subsets, each of length sub_T
    Ms = []
    Ms_values = []
    for i in range(S):
        start, end = i * sub_T, (i + 1) * sub_T
        Ms.append((i, M[start:end, :]))
        Ms_values.append(M[start:end, :])
    Ms_values = np.array(Ms_values)

    if verbose:
        print('No. of Chunks: {:,d}'.format(len(Ms)))

    # generate combinations
    Cs = [x for x in itr.combinations(Ms, S // 2)]
    if verbose:
        print('No. of combinations = {:,d}'.format(len(Cs)))

    # Ms_index used to find J_bar (complementary OOS part)
    Ms_index = set([x for x in range(len(Ms))])

    # create J and J_bar
    if n_jobs < 2:
        J = []
        J_bar = []

        for i in range(len(Cs)):
            # make sure chunks are concatenated in their original order
            order = [x for x, _ in Cs[i]]
            sort_ind = np.argsort(order)

            Cs_values = np.array([v for _, v in Cs[i]])
            # if verbose:
            #     print('Cs index = {}, '.format(order), end='')
            joined = np.concatenate(Cs_values[sort_ind, :])
            J.append(joined)

            # find Cs_bar
            Cs_bar_index = list(sorted(Ms_index - set(order)))
            # if verbose:
            # print('Cs_bar_index = {}'.format(Cs_bar_index))
            J_bar.append(np.concatenate(Ms_values[Cs_bar_index, :]))

        # compute matrices for J and J_bar, e.g. Sharpe ratio
        R = [metric_func(j) for j in J]
        R_bar = [metric_func(j) for j in J_bar]

        # compute ranks of metrics
        R_rank = [ss.rankdata(x) for x in R]
        R_bar_rank = [ss.rankdata(x) for x in R_bar]

        # find highest metric, rn contains the index position of max value
        # in each set of R (IS)
        rn = [np.argmax(r) for r in R_rank]
        # use above index to find R_bar (OOS) in same index position
        # i.e. the same config / setting
        rn_bar = [R_bar_rank[i][rn[i]] for i in range(len(R_bar_rank))]

        # formula in paper used N+1 as the denominator for w_bar.
        w_bar = [float(r) / N for r in rn_bar]
        # logit(0.5) gives 0, so if a w_bar value equals the median, its logit is 0
        logits = [spec.logit(w) for w in w_bar]
    else:
        # use joblib for parallel calc
        # print('Run in parallel mode.')
        cores = job.Parallel(n_jobs=n_jobs)(job.delayed(pbo_core_calc)(
            Cs_x, Ms, Ms_values, Ms_index, metric_func, verbose)
                                            for Cs_x in Cs)
        # core_df = pd.DataFrame(cores, columns=PBOCore._fields)
        # convert to values needed.
        # # core_df = pd.DataFrame.from_records(cores)

        # J = core_df.J.values
        # J_bar = core_df.J_bar.values
        # R = core_df.R.values
        # R_bar = core_df.R_bar.values
        # R_rank = core_df.R_rank.values
        # R_bar_rank = core_df.R_bar_rank.values
        # rn = core_df.rn.values
        # rn_bar = core_df.rn_bar.values
        # w_bar = core_df.w_bar.values
        # logits = core_df.logits.values

        J = [c.J for c in cores]
        J_bar = [c.J_bar for c in cores]
        R = [c.R for c in cores]
        R_bar = [c.R_bar for c in cores]
        R_rank = [c.R_rank for c in cores]
        R_bar_rank = [c.R_bar_rank for c in cores]
        rn = [c.rn for c in cores]
        rn_bar = [c.rn_bar for c in cores]
        w_bar = [c.w_bar for c in cores]
        logits = [c.logits for c in cores]

    # prob of overfitting
    phi = np.array([1.0 if lam <= 0 else 0.0 for lam in logits]) / len(Cs)
    pbo_test = np.sum(phi)

    # performance degradation
    R_n_star = np.array([R[i][rn[i]] for i in range(len(R))])
    R_bar_n_star = np.array([R_bar[i][rn[i]] for i in range(len(R_bar))])
    lm = ss.linregress(x=R_n_star, y=R_bar_n_star)

    prob_oos_loss = np.sum(
        [1.0 if r < threshold else 0.0
         for r in R_bar_n_star]) / len(R_bar_n_star)

    # Stochastic dominance
    y = np.linspace(min(R_bar_n_star),
                    max(R_bar_n_star),
                    endpoint=True,
                    num=1000)
    R_bar_n_star_cdf = smd.ECDF(R_bar_n_star)
    optimized = R_bar_n_star_cdf(y)

    R_bar_cdf = smd.ECDF(np.concatenate(R_bar))
    non_optimized = R_bar_cdf(y)

    dom_df = pd.DataFrame(
        dict(optimized_IS=optimized, non_optimized_OOS=non_optimized))
    dom_df.index = y
    # visually, non_optimized curve above optimized curve indicates good
    # backtest with low overfitting.
    dom_df['SD2'] = dom_df.non_optimized_OOS - dom_df.optimized_IS

    result = PBO(pbo_test, prob_oos_loss, lm, dom_df, Cs, J, J_bar, R, R_bar,
                 R_rank, R_bar_rank, rn, rn_bar, w_bar, logits, R_n_star,
                 R_bar_n_star)

    if plot:
        plot_pbo(result, hist=hist)

    return result
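A minimal usage sketch, assuming the module-level imports and the PBO namedtuple from pypbo's pbo.py are in scope. The metric function here is a naive per-column Sharpe ratio on a toy returns matrix; the paper suggests S = 16, but S = 8 keeps the number of combinations small for a quick run.

import numpy as np

rng = np.random.default_rng(5)
M_toy = rng.normal(loc=0.0005, scale=0.01, size=(800, 20))   # 20 strategy configs

def naive_sharpe(block):
    return block.mean(axis=0) / block.std(axis=0, ddof=1)

result = pbo(M_toy, S=8, metric_func=naive_sharpe, threshold=0, n_jobs=1)
print(result[0])   # pbo_test, the estimated probability of backtest overfitting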
Example #10
def discrete_ecdfer(df: pd.DataFrame,
                    ascending: bool = True,
                    prediction_column: str = "prediction",
                    ecdf_column: str = "prediction_ecdf",
                    max_range: int = 1000,
                    round_method: Callable = int) -> LearnerReturnType:
    """
    Learns an Empirical Cumulative Distribution Function from the specified column
    in the input DataFrame. It is usually used in the prediction column to convert
    a predicted probability into a score from 0 to 1000.

    Parameters
    ----------
    df : pandas.DataFrame
        A Pandas DataFrame that must contain a `prediction_column` column.

    ascending : bool
        Whether to compute an ascending ECDF or a descending one.

    prediction_column : str
        The name of the column in `df` to learn the ECDF from.

    ecdf_column : str
        The name of the new ECDF column added by this function.

    max_range : int
        The maximum value for the ECDF. It will go
        from 0 to max_range.

    round_method: Callable
        A function to perform the rounding of transformed values, e.g. int, ceil, floor, round.
    """

    if ascending:
        base = 0
        sign = 1
    else:
        base = max_range
        sign = -1

    values = df[prediction_column]

    ecdf = ed.ECDF(values)

    df_ecdf = pd.DataFrame()
    df_ecdf['x'] = ecdf.x
    df_ecdf['y'] = pd.Series(base +
                             sign * max_range * ecdf.y).apply(round_method)

    boundaries = df_ecdf.groupby("y").agg((min, max))["x"]["min"].reset_index()

    y = boundaries["y"]
    x = boundaries["min"]
    side = ecdf.side

    log = {
        'discrete_ecdfer': {
            'map': dict(zip(x, y)),
            'round_method': round_method,
            'nobs': len(values),
            'prediction_column': prediction_column,
            'ascending': ascending,
            'transformed_column': [ecdf_column]
        }
    }

    del ecdf
    del values
    del df_ecdf

    def p(new_df: pd.DataFrame) -> pd.DataFrame:
        if not ascending:
            tind = np.searchsorted(-x, -new_df[prediction_column])
        else:
            tind = np.searchsorted(x, new_df[prediction_column], side) - 1

        return new_df.assign(**{ecdf_column: y[tind].values})

    return p, p(df), log
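A minimal usage sketch mirroring the ecdfer example above (same assumption about the fklearn-style return value); the default round_method=int truncates the scaled ECDF values.

import numpy as np
import pandas as pd

train = pd.DataFrame({"prediction": np.random.default_rng(6).uniform(size=100)})
p, train_scored, log = discrete_ecdfer(train, ascending=True, max_range=1000)
scores = p(pd.DataFrame({"prediction": [0.1, 0.5, 0.9]}))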
Example #11
    #M is the surfeit index, which is the mean state of the system
    M = 1 / y * np.sum(Y, axis=1)
    #S is the severity index showing, system-wide, how extreme the states are
    S = 1 / y * np.sum(np.abs(Y), axis=1)
    #D measures how dissimilar the states of all sites are with respect to each other
    D1 = np.zeros((x, y - 1))
    D2 = np.zeros((x, y))
    for i in range(0, y - 1):
        for j in range(i + 1, y):
            D2[:, j] = np.abs(Y[:, i] - Y[:, j])
        D1[:, i] = np.sum(D2, axis=1)
    D = 1 / y**2 * np.sum(D1, axis=1)

    return M, S, D

A = edis.ECDF(Q[:, 0])
B = edis.ECDF(Q_stronger[:, 0])
C = edis.ECDF(Q_weaker[:, 0])

import matplotlib.pyplot as plt

#Temperature
fig, ax = plt.subplots()
ax.hist(Q, bins=300, histtype='step', label='Reconstruct')
ax.hist(Q_stronger, bins=300, histtype='step', label='Stronger')
ax.hist(Q_weaker, bins=300, histtype='step', label='Weaker')
fig.legend()
ax.set_title('Histogram')
fig.savefig('Netload_PDF.png')
#
def bc_a_bootstrap_ratio_cookie_buckets(treatment_cookie_buckets, control_cookie_buckets,
                                        bins_boundaries, num_of_boot_samples, ci_level,
                                        estimator_type='quantile',
                                        quantile=0.5, paired=False, return_bootstrap_est=False, return_interval=False):
    """
    Function that computes BCa bootstrap CI

    Parameters
    ----------
      treatment_cookie_buckets: pd.DataFrame
        dataframe corresponding to the treatment group in which each row corresponds to a cookie bucket
          and the number of columns corresponds to the number of bins in a histogram

      control_cookie_buckets: pd.DataFrame
        dataframe corresponding to the control group in which each row corresponds to a cookie bucket
          and the number of columns corresponds to the number of bins in a histogram

      bins_boundaries: list of bins boundaries
        Example: [1,3,5,9,16] corresponds to 4 bins

      num_of_boot_samples: int
        number of bootstrap samples to use

      ci_level: float in (0,1)
        level at which to construct the confidence interval

      estimator_type: 'quantile' or 'mean'
        whether to perform a test when the effect size is in terms of mean or quantile

      quantile: (0,1)
        quantile for which the test is to be performed

      return_bootstrap_est: Bool
        whether to return estimators corresponding to bootstrap samples

    Returns
    ----------
      test_result: Bool
        whether the test rejects (True) the null hypothesis

      If return_bootstrap_est is True, then estimators computed on bootstrap samples are returned
      (it might also be worth returning the interval itself, for length and shape comparison)

    """
    # convert to arrays for treatment and control groups

    # !!! some notebooks have to be updated to pass arrays

    # data_treat = treatment_cookie_buckets.values
    # data_control = control_cookie_buckets.values

    # stacking bootstrap estimators
    bootstrap_est = list()

    # number of histogram bins
    num_of_bins = len(bins_boundaries)-1

    # obtain bins given boundaries
    bins_tuples = [(bins_boundaries[i-1],
                    bins_boundaries[i])
                   for i in range(1, num_of_bins+1)]

    # get number of cookie buckets for treatment and check that
    # it matches control
    num_of_cookie_buckets = treatment_cookie_buckets.shape[0]
    assert num_of_cookie_buckets == control_cookie_buckets.shape[0]

    # possible indices to perform resampling
    indices = np.arange(num_of_cookie_buckets)

    for i in range(num_of_boot_samples):
        if paired:
            # get bootstrap indices for treatment and control separately
            ind_resampled = np.random.choice(
                indices, size=num_of_cookie_buckets)
            # get a bootstrap array for treatment and control separately
            # and compute the resulting histogram
            boot_treat = treatment_cookie_buckets[ind_resampled, :].sum(axis=0)
            boot_control = control_cookie_buckets[ind_resampled, :].sum(axis=0)
            # compute estimator
        else:
            # get bootstrap indices for treatment and control separately
            ind_treat = np.random.choice(indices, size=num_of_cookie_buckets)
            ind_control = np.random.choice(indices, size=num_of_cookie_buckets)
            # get a bootstrap array for treatment and control separately
            # and compute the resulting histogram
            boot_treat = treatment_cookie_buckets[ind_treat, :].sum(axis=0)
            boot_control = control_cookie_buckets[ind_control, :].sum(axis=0)

        # compute estimator
        if estimator_type == 'quantile':
            quant_treat = compute_quantile_hist_data(
                boot_treat, bins_tuples, quantile)
            quant_control = compute_quantile_hist_data(
                boot_control, bins_tuples, quantile)
            assert quant_control > 0
            bootstrap_est += [100 * (quant_treat / quant_control - 1)]

    # convert list to array for simplicity
    bootstrap_est = np.array(bootstrap_est)

    # compute estimator on original sample
    quant_treat = compute_quantile_hist_data(
        treatment_cookie_buckets.sum(axis=0), bins_tuples, quantile)
    quant_control = compute_quantile_hist_data(
        control_cookie_buckets.sum(axis=0), bins_tuples, quantile)
    est_ratio = 100 * (quant_treat / quant_control - 1)

    # compute bias correction
    pre_z = (bootstrap_est <= est_ratio).mean()
    z_0 = norm.ppf(pre_z)

    # compute leave-one-bucket-out estimators
    leave_one_out_est = list()

    for i in range(num_of_cookie_buckets):
        # leave one cookie bucket out at a time
        ind_treat = np.delete(indices, i)
        ind_control = np.delete(indices, i)
        # get corresponding histograms
        current_data_treat = treatment_cookie_buckets[ind_treat, :].sum(axis=0)
        current_data_control = control_cookie_buckets[ind_control, :].sum(
            axis=0)
        # compute estimator
        if estimator_type == 'quantile':
            quant_treat = compute_quantile_hist_data(
                current_data_treat, bins_tuples, quantile)
            quant_control = compute_quantile_hist_data(
                current_data_control, bins_tuples, quantile)
            leave_one_out_est += [100 * (quant_treat / quant_control - 1)]

    # convert list to array for simplicity
    leave_one_out_est = np.array(leave_one_out_est)

    # take the mean for further comp of infl fns
    est_mean = leave_one_out_est.mean()

    # compute influence functions
    infl_fns = (num_of_cookie_buckets-1) * (est_mean-leave_one_out_est)

    # compute acceleration factor
    num = sum(infl_fns ** 3)
    den = sum(infl_fns ** 2) ** (3/2)
    accel_factor = num / (6 * den)

    # compute left and right quantiles of standard normal
    left_q, right_q = norm.ppf([(1 - ci_level)/2, (1 + ci_level)/2])

    # transform using bias correction and acceleration
    left_bound = z_0 + (z_0 + left_q) / (1 - accel_factor * (z_0 + left_q))
    right_bound = z_0 + (z_0 + right_q) / (1 - accel_factor * (z_0 + right_q))

    # apply gaussian transform
    left_bound, right_bound = norm.cdf([left_bound, right_bound])

    # apply inverse transform using empirical cdf of bootstrap samples
    sample_edf = edf.ECDF(bootstrap_est)
    slope_changes = sorted(set(bootstrap_est))

    sample_edf_values_at_slope_changes = [
        sample_edf(item) for item in slope_changes]
    inverted_edf = interp1d(sample_edf_values_at_slope_changes, slope_changes)

    left_bound, right_bound = inverted_edf([left_bound, right_bound])

    if return_interval:
        if return_bootstrap_est:
            # in case we want to return the estimators for analyses
            if 0 < left_bound or right_bound < 0:
                return True, [left_bound, right_bound], bootstrap_est
            else:
                return False, [left_bound, right_bound], bootstrap_est
        else:
            # in case we just need to perform the test
            if 0 < left_bound or right_bound < 0:
                return True, [left_bound, right_bound]
            else:
                return False, [left_bound, right_bound]
    else:
        if return_bootstrap_est:
            # in case we want to return the estimators for analyses
            if 0 < left_bound or right_bound < 0:
                return True, bootstrap_est
            else:
                return False, bootstrap_est
        else:
            # in case we just need to perform the test
            if 0 < left_bound or right_bound < 0:
                return True
            else:
                return False
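To make the bias-correction and acceleration steps above easier to follow, here is a condensed, hedged sketch of the same BCa recipe applied to a simple statistic (the sample mean). Everything here is illustrative: the function name is hypothetical, and np.quantile stands in for the inverted-ECDF lookup used above.

import numpy as np
from scipy.stats import norm

def bca_mean_interval(data, num_boot=2000, ci_level=0.95, seed=None):
    rng = np.random.default_rng(seed)
    data = np.asarray(data, dtype=float)
    n = len(data)
    est = data.mean()
    boot = np.array([rng.choice(data, size=n, replace=True).mean()
                     for _ in range(num_boot)])
    z_0 = norm.ppf((boot <= est).mean())                      # bias correction
    jack = np.array([np.delete(data, i).mean() for i in range(n)])
    infl = (n - 1) * (jack.mean() - jack)                     # influence values
    accel = np.sum(infl**3) / (6 * np.sum(infl**2) ** 1.5)    # acceleration factor
    left_q, right_q = norm.ppf([(1 - ci_level) / 2, (1 + ci_level) / 2])
    adj = [norm.cdf(z_0 + (z_0 + q) / (1 - accel * (z_0 + q)))
           for q in (left_q, right_q)]
    return np.quantile(boot, adj)                             # plays the role of inverted_edf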
def run(output_suffix):
    # Import historical temperature data
    df_temp = pd.read_excel(
        'Synthetic_streamflows/input/hist_temps_1953_2007.xlsx')
    his_temp_matrix = df_temp.values

    # Import calendar
    calender = pd.read_excel(
        'Synthetic_streamflows/input/BPA_hist_streamflow.xlsx',
        sheet_name='Calender',
        header=None)
    calender = calender.values
    julian = calender[:, 2]

    ###############################
    # Synthetic HDD CDD calculation

    # Simulation data
    sim_weather = pd.read_csv(
        'Synthetic_weather/output/synthetic_weather_data' + output_suffix +
        '.csv',
        header=0)

    # Load temperature data only
    cities = [
        'SALEM_T', 'EUGENE_T', 'SEATTLE_T', 'BOISE_T', 'PORTLAND_T',
        'SPOKANE_T', 'FRESNO_T', 'LOS ANGELES_T', 'SAN DIEGO_T',
        'SACRAMENTO_T', 'SAN JOSE_T', 'SAN FRANCISCO_T', 'TUCSON_T',
        'PHOENIX_T', 'LAS VEGAS_T'
    ]
    sim_temperature = sim_weather[cities]

    # Convert temperatures to Fahrenheit
    sim_temperature = (sim_temperature * (9 / 5)) + 32
    sim_temperature = sim_temperature.values

    num_cities = len(cities)
    num_sim_days = len(sim_temperature)

    HDD_sim = np.zeros((num_sim_days, num_cities))
    CDD_sim = np.zeros((num_sim_days, num_cities))

    # calculate daily records of heating (HDD) and cooling (CDD) degree days
    for i in range(0, num_sim_days):
        for j in range(0, num_cities):
            HDD_sim[i, j] = np.max((0, 65 - sim_temperature[i, j]))
            CDD_sim[i, j] = np.max((0, sim_temperature[i, j] - 65))

    # calculate annual totals of heating and cooling degree days for each city
    annual_HDD_sim = np.zeros((int(len(HDD_sim) / 365), num_cities))
    annual_CDD_sim = np.zeros((int(len(CDD_sim) / 365), num_cities))

    for i in range(0, int(len(HDD_sim) / 365)):
        for j in range(0, num_cities):
            annual_HDD_sim[i,
                           j] = np.sum(HDD_sim[0 + (i * 365):365 + (i * 365),
                                               j])
            annual_CDD_sim[i,
                           j] = np.sum(CDD_sim[0 + (i * 365):365 + (i * 365),
                                               j])

    ########################################################################
    #Calculate HDD and CDD for historical temperature data

    num_cities = len(cities)
    num_days = len(his_temp_matrix)

    # daily records
    HDD = np.zeros((num_days, num_cities))
    CDD = np.zeros((num_days, num_cities))

    for i in range(0, num_days):
        for j in range(0, num_cities):
            HDD[i, j] = np.max((0, 65 - his_temp_matrix[i, j + 1]))
            CDD[i, j] = np.max((0, his_temp_matrix[i, j + 1] - 65))

    # annual sums
    annual_HDD = np.zeros((int(len(HDD) / 365), num_cities))
    annual_CDD = np.zeros((int(len(CDD) / 365), num_cities))

    for i in range(0, int(len(HDD) / 365)):
        for j in range(0, num_cities):
            annual_HDD[i, j] = np.sum(HDD[0 + (i * 365):365 + (i * 365), j])
            annual_CDD[i, j] = np.sum(CDD[0 + (i * 365):365 + (i * 365), j])

    ###########################################################################################
    #This section is used for calculating total hydro

    # Load relevant streamflow data (1953-2007)
    BPA_streamflow = pd.read_excel(
        'Synthetic_streamflows/input/BPA_hist_streamflow.xlsx',
        sheet_name='Inflows',
        header=0)
    Hoover_streamflow = pd.read_csv(
        'Synthetic_streamflows/input/Hoover_hist_streamflow.csv', header=0)
    CA_streamflow = pd.read_excel(
        'Synthetic_streamflows/input/CA_hist_streamflow.xlsx', header=0)
    Willamette_streamflow = pd.read_csv(
        'Synthetic_streamflows/input/Willamette_hist_streamflow.csv', header=0)

    # headings
    name_Will = list(Willamette_streamflow.loc[:, 'Albany':])
    name_CA = list(CA_streamflow.loc[:, 'ORO_fnf':])
    name_BPA = list(BPA_streamflow.loc[:, '1M':])

    # number of streamflow gages considered
    num_BPA = len(name_BPA)
    num_CA = len(name_CA)
    num_Will = len(name_Will)
    num_gages = num_BPA + num_CA + num_Will + 1

    # Calculate historical totals for 1953-2007
    years = range(1953, 2008)

    for y in years:

        y_index = years.index(y)

        BPA = BPA_streamflow.loc[BPA_streamflow['year'] == y, '1M':]
        CA = CA_streamflow.loc[CA_streamflow['year'] == y, 'ORO_fnf':]
        WB = Willamette_streamflow.loc[Willamette_streamflow['year'] == y,
                                       'Albany':]
        HO = Hoover_streamflow.loc[Hoover_streamflow['year'] == y, 'Discharge']

        BPA_sums = np.reshape(np.sum(BPA, axis=0).values, (1, num_BPA))
        CA_sums = np.reshape(np.sum(CA, axis=0).values, (1, num_CA))
        WB_sums = np.reshape(np.sum(WB, axis=0).values, (1, num_Will))
        HO_sums = np.reshape(np.sum(HO, axis=0), (1, 1))

        # matrix of annual flows for each stream gage
        joined = np.column_stack((BPA_sums, CA_sums, WB_sums, HO_sums))

        if y_index < 1:

            hist_totals = joined

        else:

            hist_totals = np.vstack((hist_totals, joined))

    BPA_headers = np.reshape(list(BPA_streamflow.loc[:, '1M':]), (1, num_BPA))
    CA_headers = np.reshape(list(CA_streamflow.loc[:, 'ORO_fnf':]),
                            (1, num_CA))
    WB_headers = np.reshape(list(Willamette_streamflow.loc[:, 'Albany':]),
                            (1, num_Will))
    HO_headers = np.reshape(['Hoover'], (1, 1))

    headers = np.column_stack(
        (BPA_headers, CA_headers, WB_headers, HO_headers))

    # annual streamflow totals for 1953-2007
    df_hist_totals = pd.DataFrame(hist_totals)
    df_hist_totals.columns = headers[0, :]
    df_hist_totals.loc[38, '83L'] = df_hist_totals.loc[36, '83L']
    added_value = abs(np.min((df_hist_totals))) + 5
    log_hist_total = np.log(df_hist_totals + abs(added_value))

    A = df_hist_totals.values
    B = np.column_stack((A, annual_HDD, annual_CDD))

    x, y = np.shape(B)
    #data is the data matrix at all time steps. The dimension would be X*Y
    #data 2 is required if calculating dissimilarity

    #Step 1: Transform the data into empirical CDF
    P = np.zeros((x, y))
    for i in range(0, y):
        ECDF = edis.ECDF(B[:, i])
        P[:, i] = ECDF(B[:, i])

    Y = 2 * (P - 0.5)

    new_cols = ['Name'] + ['type_' + str(i) for i in range(0, 141)]

    #remove constant zero columns
    need_to_remove = [1, 17, 22, 24, 27, 32, 34, 36, 37, 38, 44, 107, 108, 109]
    Y2 = np.delete(Y, need_to_remove, axis=1)
    Y[:, 107] = 1
    mean = np.mean(Y, axis=0)
    cov = np.cov(Y, rowvar=0)
    runs = int(num_sim_days / 365) * 5
    sim_years = int(num_sim_days / 365)
    N = np.random.multivariate_normal(mean, cov, runs)

    T = (N / 2) + 0.5

    T_all = np.zeros((runs, y))
    for i in range(0, y):
        for j in range(0, runs):
            if T[j, i] < 0:
                T_all[j,
                      i] = (np.percentile(B[:, i], q=0 * 100)) * (1 + T[j, i])

            elif T[j, i] <= 1 and T[j, i] >= 0:
                T_all[j, i] = np.percentile(B[:, i], q=T[j, i] * 100)
            else:
                T_all[j, i] = (np.percentile(B[:, i], q=1 * 100)) * T[j, i]

    Sim_total = T_all[:, :112]
    Sim_HDD_CDD = T_all[:, 112:]
    Sim_CDD = Sim_HDD_CDD[:, 15:]
    Sim_HDD = Sim_HDD_CDD[:, :15]
    ######################################
    #sns.kdeplot(annual_CDD[:,0],label='His')
    #sns.kdeplot(annual_CDD_sim[:,0],label='Syn')
    #sns.kdeplot(Sim_HDD_CDD[:,15],label='Capula')
    #plt.legend()
    #
    #sns.kdeplot(annual_HDD[:,0],label='His')
    #sns.kdeplot(annual_HDD_sim[:,0],label='Syn')
    #sns.kdeplot(Sim_HDD_CDD[:,0],label='Capula')
    #plt.legend()

    #########################################

    HDD_CDD = np.column_stack((annual_HDD_sim, annual_CDD_sim))

    year_list = np.zeros(int(num_sim_days / 365))
    Best_RMSE = 9999999999
    CHECK = np.zeros((sim_years, runs))

    for i in range(0, sim_years):
        for j in range(0, runs):
            RMSE = (np.sum(np.abs(HDD_CDD[i, :] - Sim_HDD_CDD[j, :])))
            CHECK[i, j] = RMSE
            if RMSE <= Best_RMSE:
                year_list[i] = j
                Best_RMSE = RMSE

            else:
                pass
        Best_RMSE = 9999999999

    sim_totals = np.zeros((sim_years, num_gages))
    for i in range(0, sim_years):
        sim_totals[i, :] = Sim_total[int(year_list[i]), :]

    ###################################################################################

    #C_1=np.corrcoef(sim_totals,rowvar=0)
    #C_his=np.corrcoef(A,rowvar=0)
    #import seaborn as sns; sns.set()
    #
    #grid_kws = {"height_ratios": (.9, .05), "hspace": .3}
    #fig,ax=plt.subplots()
    #plt.rcParams["font.weight"] = "bold"
    #plt.rcParams["axes.labelweight"] = "bold"
    #ax1=plt.subplot(121)
    #sns.heatmap(C_1,vmin=0,vmax=1,cbar=False)
    #plt.axis('off')
    #ax.set_title('Syn')
    #
    #
    #
    #ax2=plt.subplot(122)
    #cbar_ax = fig.add_axes([.92, .15, .03, .7])  # <-- Create a colorbar axes
    #
    #fig2=sns.heatmap(C_his,ax=ax2,cbar_ax=cbar_ax,vmin=0,vmax=1)
    #cbar=ax2.collections[0].colorbar
    #cbar.ax.tick_params(labelsize='large')
    #
    #fig2.axis('off')
    #
    #
    #
    ##################################################################################
    #plt.figure()
    #sns.kdeplot(A[:,0],label='His')
    #sns.kdeplot(sim_totals[:,0],label='Syn')
    #sns.kdeplot(Sim_total[:,0],label='Capula')
    #plt.legend()
    #
    #plt.figure()
    #sns.kdeplot(A[:,5],label='His')
    #sns.kdeplot(sim_totals[:,5],label='Syn')
    #sns.kdeplot(Sim_total[:,5],label='Capula')
    #plt.legend()
    #
    #plt.figure()
    #sns.kdeplot(A[:,52],label='His')
    #sns.kdeplot(sim_totals[:,52],label='Syn')
    #sns.kdeplot(Sim_total[:,52],label='Capula')
    #plt.legend()
    #
    #plt.figure()
    #sns.kdeplot(A[:,55],label='His')
    #sns.kdeplot(sim_totals[:,55],label='Syn')
    #sns.kdeplot(Sim_total[:,55],label='Capula')
    #plt.legend()
    #
    #plt.figure()
    #sns.kdeplot(A[:,56],label='His')
    #sns.kdeplot(sim_totals[:,56],label='Syn')
    #sns.kdeplot(Sim_total[:,56],label='Capula')
    #plt.legend()
    #
    #plt.figure()
    #sns.kdeplot(A[:,66],label='His')
    #sns.kdeplot(sim_totals[:,66],label='Syn')
    #sns.kdeplot(Sim_total[:,66],label='Capula')
    #plt.legend()
    ##################################################################################
    # impose logical constraints
    mins = np.min(df_hist_totals.loc[:, :'Hoover'], axis=0)

    for i in range(0, num_gages):
        lower_bound = mins[i]

        for j in range(0, sim_years):
            if sim_totals[j, i] < lower_bound:
                sim_totals[j, i] = lower_bound * np.random.uniform(0, 1)

    df_sim_totals = pd.DataFrame(sim_totals)
    H = list(headers)
    df_sim_totals.columns = H

    #A1=[]
    #A2=[]
    #for h in H:
    #    a1=np.average(df_hist_totals.loc[:,h])
    #    a2=np.average(df_sim_totals.loc[:,h])
    #    A1.append(a1)
    #    A2.append(a2)
    #
    #plt.plot(A1)
    #plt.plot(A2)
    #####################################################################################
    # This section selects daily fractions which are paired with
    # annual totals to arrive at daily streamflows

    # 4 cities are nearest to all 109 stream gage sites
    Fraction_calculation_cities = ['Spokane', 'Boise', 'Sacramento', 'Fresno']

    # Each is weighted by average annual flow at nearby gage sites
    Temperature_weights = pd.read_excel(
        'Synthetic_streamflows/input/city_weights.xlsx', header=0)

    # historical temperatures for those 4 cities
    fraction_hist_temp = df_temp[Fraction_calculation_cities]
    fraction_hist_temp_matrix = fraction_hist_temp.values

    # calculate daily record of weighted temperatures across 4 cities
    weighted_T = np.zeros(len(fraction_hist_temp_matrix))
    for i in range(0, len(fraction_hist_temp_matrix)):
        weighted_T[i] = fraction_hist_temp_matrix[
            i, 0] * Temperature_weights['Spokane'] + fraction_hist_temp_matrix[
                i,
                1] * Temperature_weights['Boise'] + fraction_hist_temp_matrix[
                    i, 2] * Temperature_weights[
                        'Sacramento'] + fraction_hist_temp_matrix[
                            i, 3] * Temperature_weights['Fresno']

    # synthetic temperatures for each of the cities
    fcc = list(['SPOKANE_T', 'BOISE_T', 'SACRAMENTO_T', 'FRESNO_T'])
    fraction_sim = sim_weather[fcc]
    fraction_sim_matrix = fraction_sim.values

    weighted_T_sim = np.zeros(len(fraction_sim_matrix))

    # calculate synthetic weighted temperature (in Fahrenheit)
    for i in range(0, len(fraction_sim_matrix)):
        weighted_T_sim[i] = fraction_sim_matrix[i, 0] * Temperature_weights[
            'Spokane'] + fraction_sim_matrix[i, 1] * Temperature_weights[
                'Boise'] + fraction_sim_matrix[i, 2] * Temperature_weights[
                    'Sacramento'] + fraction_sim_matrix[
                        i, 3] * Temperature_weights['Fresno']

    weighted_T_sim = (weighted_T_sim * (9 / 5)) + 32

    #Sample synthetic fractions, then combine with totals
    sim_years = int(len(fraction_sim_matrix) / 365)
    sim_T = np.zeros((365, sim_years))

    hist_years = int(len(fraction_hist_temp) / 365)
    hist_T = np.zeros((365, hist_years))

    # reshape historical and simulated weighted temperatures in new variables
    for i in range(0, hist_years):
        hist_T[:, i] = weighted_T[i * 365:365 + (i * 365)]

    for i in range(0, sim_years):
        sim_T[:, i] = weighted_T_sim[i * 365:365 + (i * 365)]

    # aggregate weighted temperatures into monthly values
    Normal_Starting = datetime(1900, 1, 1)
    datelist = pd.date_range(Normal_Starting, periods=365)
    count = 0
    m = np.zeros(365)
    for i in range(0, 365):
        m[i] = int(datelist[count].month)
        count = count + 1
        if count > 364:
            count = 0
    hist_T_monthly = np.column_stack((hist_T, m))

    monthly_hist_T = np.zeros((12, hist_years))
    for i in range(0, sim_years):
        for j in range(1, 13):
            d1 = hist_T_monthly[hist_T_monthly[:, hist_years] == j]
            d2 = d1[:, :hist_years]
            monthly_hist_T[j - 1, :] = np.sum(d2, axis=0)

    Normal_Starting = datetime(1900, 1, 1)
    datelist = pd.date_range(Normal_Starting, periods=365)
    count = 0
    m = np.zeros(365)
    for i in range(0, 365):
        m[i] = int(datelist[count].month)
        count = count + 1
        if count > 364:
            count = 0
    sim_T_monthly = np.column_stack((sim_T, m))

    monthly_sim_T = np.zeros((12, sim_years))
    for i in range(0, sim_years):
        for j in range(1, 13):
            d1 = sim_T_monthly[sim_T_monthly[:, sim_years] == j]
            d2 = d1[:, :sim_years]
            monthly_sim_T[j - 1, :] = np.sum(d2, axis=0)

    # select historical year with most similar spring and summer temperatures
    # to new simulated years
    year_list = np.zeros(sim_years)
    Best_RMSE = 9999999999
    CHECK = np.zeros((sim_years, hist_years))

    for i in range(0, sim_years):
        for j in range(0, hist_years):
            RMSE = (np.sum(
                np.abs(monthly_sim_T[3:8, i] - monthly_hist_T[3:8, j])))
            CHECK[i, j] = RMSE
            if RMSE <= Best_RMSE:
                year_list[i] = j
                Best_RMSE = RMSE

            else:
                pass
        Best_RMSE = 9999999999

    ################################################################################
    #Generate streamflow
    TDA = np.zeros((int(365 * sim_years), 2))
    totals_hist = np.zeros((num_gages, hist_years))
    fractions_hist = np.zeros((hist_years, 365, num_gages))

    totals_hist_hoover = np.zeros((1, hist_years))
    output_BPA = np.zeros((sim_years * 365, num_BPA))
    output_Hoover = np.zeros((sim_years * 365, 1))
    output_CA = np.zeros((sim_years * 365, num_CA))
    output_WI = np.zeros((sim_years * 365, num_Will))

    # historical daily flows
    x_Hoover = Hoover_streamflow.loc[:, 'Discharge'].values
    x_BPA = BPA_streamflow.loc[:, '1M':].values
    x_CA = CA_streamflow.loc[:, 'ORO_fnf':].values
    x_WI = Willamette_streamflow.loc[:, 'Albany':'COT5A'].values
    x = np.column_stack((x_BPA, x_CA, x_WI, x_Hoover))
    x = np.reshape(x, (hist_years, 365, num_gages))

    # historical daily fractions
    for i in range(0, hist_years):
        for j in range(0, num_gages):
            totals_hist[j, i] = np.sum(np.abs(x[i, :, j]))
            if totals_hist[j, i] == 0:
                fractions_hist[i, :, j] = 0
            else:
                fractions_hist[i, :, j] = x[i, :, j] / totals_hist[j, i]

    # sample simulated daily fractions
    for i in range(0, sim_years):
        for j in range(0, num_gages):

            if j <= num_BPA - 1:
                output_BPA[(i * 365):(i * 365) + 365,
                           j] = fractions_hist[int(year_list[i]), :,
                                               j] * sim_totals[i, j]
            elif j == num_gages - 1:
                output_Hoover[(i * 365):(i * 365) + 365,
                              0] = fractions_hist[int(year_list[i]), :,
                                                  j] * sim_totals[i, j]
            elif j > num_BPA - 1 and j <= num_BPA + num_CA - 1:
                output_CA[(i * 365):(i * 365) + 365,
                          j - num_BPA] = fractions_hist[int(year_list[i]), :,
                                                        j] * sim_totals[i, j]
            else:
                output_WI[(i * 365):(i * 365) + 365, j - num_BPA -
                          num_CA] = fractions_hist[int(year_list[i]), :,
                                                   j] * sim_totals[i, j]

        TDA[(i * 365):(i * 365) + 365, 0] = range(1, 366)

    # assign flows to the Dalles, OR
    TDA[:, 1] = output_BPA[:, 47]

    ###############################################################################
    # # Output
    # np.savetxt('Synthetic_streamflows/output/synthetic_streamflows_FCRPS.csv',output_BPA,delimiter=',')
    # np.savetxt('Synthetic_streamflows/output/synthetic_streamflows_TDA.csv',TDA[:,1],delimiter=',')
    # np.savetxt('Synthetic_streamflows/output/synthetic_discharge_Hoover.csv',output_Hoover,delimiter=',')
    # CA=pd.DataFrame(output_CA,columns=name_CA)
    # CA.to_csv('Synthetic_streamflows/output/synthetic_streamflows_CA.csv')
    # Willamatte_Syn=pd.DataFrame(output_WI,columns=name_Will)
    # Willamatte_Syn.to_csv('Synthetic_streamflows/output/synthetic_streamflows_Willamette.csv')

    #write CA synthetic flows to ORCA file
    leap_cycles = int(sim_years // 4)
    r = np.shape(output_CA)
    for i in range(0, leap_cycles):

        if i < 1:
            C = output_CA[0:1154, :]
            B = np.empty((1, int(r[1])))
            B[:] = np.nan
            D = output_CA[i * 1460 + 1154:i * 1460 + 1154 + 1460]
            F = np.vstack((C, B, D))
        else:
            D = output_CA[i * 1460 + 1154:i * 1460 + 1154 + 1460]
            F = np.vstack((F, B, D))

    df_leap = pd.DataFrame(F, columns=name_CA)
    df_leap.to_csv('Synthetic_streamflows/output/ORCA_forecast_flows' +
                   output_suffix + '.csv')
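For reference, a condensed, hedged sketch of the copula-style resampling step buried in run() above: transform each column to its ECDF, fit a multivariate normal to the rescaled values, sample it, and map back through empirical percentiles. The helper name is illustrative only, and clipping replaces the tail scaling used in the original.

import numpy as np
import statsmodels.distributions.empirical_distribution as edis

def copula_style_samples(B, runs, seed=None):
    rng = np.random.default_rng(seed)
    x, y = B.shape
    P = np.column_stack([edis.ECDF(B[:, i])(B[:, i]) for i in range(y)])
    Y = 2 * (P - 0.5)                            # rescale ECDF values to [-1, 1]
    N = rng.multivariate_normal(Y.mean(axis=0), np.cov(Y, rowvar=False), runs)
    T = (N / 2) + 0.5                            # back to the [0, 1] probability scale
    out = np.empty((runs, y))
    for i in range(y):
        q = np.clip(T[:, i], 0, 1) * 100         # clip instead of the tail scaling above
        out[:, i] = np.percentile(B[:, i], q)
    return out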