def test_permuted_ols_check_h0_noeffect_signswap(random_state=0):
    rng = check_random_state(random_state)
    # design parameters
    n_samples = 100
    # create dummy design with no effect
    target_var = rng.randn(n_samples, 1)
    tested_var = np.ones((n_samples, 1))
    # permuted OLS
    # We check that h0 is close to the theoretical distribution, which is
    # known for this simple design (= t(n_samples - dof)).
    perm_ranges = [10, 100, 1000]  # test various number of permutations
    all_kstest_pvals = []
    # we compute the Mean Squared Error between the empirical and theoretical
    # cumulative distribution functions as a check of the permutation
    # algorithm's consistency
    all_mse = []
    for i, n_perm in enumerate(np.repeat(perm_ranges, 10)):
        pval, orig_scores, h0 = permuted_ols(
            tested_var, target_var, model_intercept=False,
            n_perm=n_perm, two_sided_test=False, random_state=i)
        assert_equal(h0.size, n_perm)
        # Kolmogorov-Smirnov test
        kstest_pval = stats.kstest(h0, stats.t(n_samples).cdf)[1]
        all_kstest_pvals.append(kstest_pval)
        mse = np.mean(
            (stats.t(n_samples).cdf(np.sort(h0))
             - np.linspace(0, 1, h0.size + 1)[1:]) ** 2)
        all_mse.append(mse)
    all_kstest_pvals = np.array(all_kstest_pvals).reshape(
        (len(perm_ranges), -1))
    all_mse = np.array(all_mse).reshape((len(perm_ranges), -1))
    # check that the KS test does not reject that h0 follows the theoretical
    # distribution
    assert_array_less(0.01 / (len(perm_ranges) * 10.), all_kstest_pvals)
    # consistency of the algorithm: the more permutations, the smaller the MSE
    assert_array_less(np.diff(all_mse.mean(1)), 0)
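
A note on the check above: it generalizes beyond permuted_ols. The following is a minimal self-contained sketch of the same idea, assuming only numpy and scipy: draw samples from a known t distribution, confirm that stats.kstest against the matching CDF does not reject, and watch the CDF mean squared error shrink as the sample grows.

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
df = 99
for size in (10, 100, 1000):
    sample = stats.t(df).rvs(size=size, random_state=rng)
    # KS test against the exact CDF: a large p-value means "not rejected"
    pval = stats.kstest(sample, stats.t(df).cdf).pvalue
    # MSE between empirical and theoretical CDFs, as in the test above
    mse = np.mean((stats.t(df).cdf(np.sort(sample))
                   - np.linspace(0, 1, size + 1)[1:]) ** 2)
    print(f"n={size:5d}  KS p-value={pval:.3f}  CDF MSE={mse:.2e}")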
Example #2
    def _hinv(self, v, u, rotation=0, *theta):
        """!
        @brief Inverse H function (Inv Conditional distribution) of T copula.
        TODO: CHECK UU and VV ordering!
        """
        kT = self.kTau(rotation, *theta)
        kTs = kT / abs(kT)
        kTM = 1 if kTs < 0 else 0

        h1 = 1.0 - np.power(theta[0], 2.0)
        nu1 = theta[1] + 1.0
        dist1 = stats.t(df=theta[1], scale=1.0, loc=0.0)
        dist2 = stats.t(df=nu1, scale=1.0, loc=0.0)

        UU = np.array(kTM + kTs * u)  # TODO: check input bounds
        VV = np.array(v)

        # inverse CDF yields quantiles
        x = dist2.ppf(UU)
        y = dist1.ppf(VV)

        # eval H function
        uu = dist1.cdf(x * np.sqrt((theta[1] + np.power(y, 2.0)) * h1 / nu1) +
                       theta[0] * y)
        return uu
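
Because the docstring above flags the UU/VV ordering as unverified, a numerical round trip is a useful sanity check. The sketch below is self-contained and assumes the standard bivariate t-copula conditional with theta = (rho, nu); the ordering mirrors what _hinv appears to do (dist1.ppf is applied to v). Applying the forward conditional and then this inverse should recover the input.

import numpy as np
from scipy import stats

def h(u, v, rho, nu):
    # conditional CDF H(u | v) of a bivariate t copula (assumed ordering)
    z = stats.t(df=nu).ppf(u)
    y = stats.t(df=nu).ppf(v)
    s = np.sqrt((nu + y ** 2) * (1.0 - rho ** 2) / (nu + 1.0))
    return stats.t(df=nu + 1.0).cdf((z - rho * y) / s)

def hinv(w, v, rho, nu):
    # inverse conditional CDF, mirroring the body of _hinv above
    x = stats.t(df=nu + 1.0).ppf(w)
    y = stats.t(df=nu).ppf(v)
    s = np.sqrt((nu + y ** 2) * (1.0 - rho ** 2) / (nu + 1.0))
    return stats.t(df=nu).cdf(x * s + rho * y)

u, v, rho, nu = 0.3, 0.7, 0.5, 4.0
assert np.isclose(hinv(h(u, v, rho, nu), v, rho, nu), u)  # round trip holds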
Example #3
def create_probe_statistic(probe_values, fpr, verbose=0):
    # Create prediction interval statistics based on randomly permuted probe features (based on real features)
    n = len(probe_values)

    if n == 0:
        if verbose > 0:
            logging.info(
                "All probes were infeasible. All features considered relevant."
            )
        #    # If all probes were infeasible we expect an empty list
        #    # If they are infeasible it also means that only strongly relevant features were in the data
        #    # As such we just set the prediction without considering the statistics

        low_t = 0
        up_t = 0
    elif n == 1:
        val = probe_values[0]
        low_t = val
        up_t = val
    else:
        probe_values = np.asarray(probe_values)
        mean = probe_values.mean()
        s = probe_values.std()
        low_t = mean + stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
        up_t = mean - stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
    return ProbeStatistic(low_t, up_t, n)
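
A quick usage sketch for create_probe_statistic, assuming it sits in the same script as this snippet and that ProbeStatistic is a simple namedtuple (the real definition is not shown here):

import numpy as np
from collections import namedtuple
from scipy import stats

ProbeStatistic = namedtuple("ProbeStatistic", ["low_t", "up_t", "n"])  # assumed stand-in

probe_values = list(stats.norm(loc=0.0, scale=0.1).rvs(size=50, random_state=0))
stat = create_probe_statistic(probe_values, fpr=0.01)
print(stat.low_t, stat.up_t, stat.n)  # lower/upper prediction-interval bounds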
def ci_specified_risk_metrics(returns, weights, portfolio_vol, ci, d):
    """
    Calculate the Analytical VaR and Expected Shortfall for a portfolio, using both the Normal distribution and the
    T-distribution.
    :param returns: np.array([float]) - the historical portfolio returns
    :param weights: np.array([float]) - the weights of the assets in the portfolio
    :param portfolio_vol: float - the volatility of the portfolio
    :param ci: float - the confidence interval at which to take the Analytical VaR
    :param d: int - the number of degrees of freedom to use
    :return: float, float, float, float, float, float
    """

    # calculate the standard deviation of the portfolio
    sigma = np.sqrt(portfolio_vol)

    # calculate the mean return of the portfolio
    mu = np.sum(portfolio_returns(returns, weights)) / returns.shape[1]

    # invert the CDF (percent-point function) to find the Analytical Value at Risk for both Normal and t distributions
    var_level = stats.norm.ppf(ci, mu, sigma)
    t_dist_var_level = stats.t(d).ppf(ci)

    # calculate the expected shortfall for each distribution - this is the expected loss (in % daily returns) for the
    # portfolio in the worst (1 - ci) fraction of cases - it is effectively the mean of the values along the x-axis
    # from -infinity up to the VaR quantile
    es = (stats.norm.pdf(stats.norm.ppf((1 - ci))) * sigma) / (1 - ci) - mu
    t_dist_es = (stats.t(d).pdf(stats.t(d).ppf(
        (1 - ci))) * sigma * (d + (stats.t(d).ppf(
            (1 - ci)))**2)) / ((1 - ci) * (d - 1)) - mu

    return sigma, mu, var_level, t_dist_var_level, es, t_dist_es
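
The closed-form Normal expected shortfall used above, es = sigma * pdf(ppf(1 - ci)) / (1 - ci) - mu, can be checked by numerically integrating the lower tail of the return distribution. A minimal sketch, assuming Normal returns:

import numpy as np
from scipy import stats
from scipy.integrate import quad

mu, sigma, ci = 0.0005, 0.02, 0.95
alpha = 1 - ci

# closed form, as in ci_specified_risk_metrics
es_closed = stats.norm.pdf(stats.norm.ppf(alpha)) * sigma / alpha - mu

# numerical: (negative) expected return given returns below the alpha-quantile
q = stats.norm(mu, sigma).ppf(alpha)
tail_mean, _ = quad(lambda x: x * stats.norm(mu, sigma).pdf(x), -np.inf, q)
es_numeric = -(tail_mean / alpha)

assert np.isclose(es_closed, es_numeric, rtol=1e-6)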
Example #5
def risk_metrics(returns, weights, portfolio_vol, var_p, d):
    """
    Calculate the Analytical VaR and Expected Shortfall for a portfolio, using both the Normal distribution and the
    T-distribution.
    :param returns: np.array([float]) - the historical portfolio returns
    :param weights: np.array([float]) - the weights of the assets in the portfolio
    :param portfolio_vol: float - the volatility of the portfolio
    :param var_p: float - the value of the daily returns at which to take the Analytical VaR
    :param d: int - the number of degrees of freedom to use
    :return: float, float, float, float, float, float
    """

    # calculate the standard deviation of the portfolio
    sigma = np.sqrt(portfolio_vol)

    # calculate the mean return of the portfolio
    mu = np.sum(portfolio_returns(returns, weights))/returns.shape[1]

    # integrate the Probability Density Function to find the Analytical Value at Risk for both Normal and t distributions
    a_var = stats.norm(mu, sigma).cdf(var_p)
    t_dist_a_var = stats.t(d).cdf(var_p)

    # calculate the expected shortfall for each distribution - this is the expected loss (in % daily returns) for the
    # portfolio in the worst a_var% of cases - it is effectively the mean of the values along the x-axis from
    # -infinity% to a_var%
    es = (stats.norm(mu, sigma).pdf(stats.norm(mu, sigma).ppf((1 - a_var))) * sigma)/(1 - a_var) - mu
    percent_point_function = stats.t(d).ppf((1 - a_var))
    t_dist_es = -1/(1 - a_var) * (1-d)**(-1) * (d-2 + percent_point_function**2) * stats.t(d).pdf(percent_point_function)*sigma - mu

    return sigma, mu, a_var, t_dist_a_var, es, t_dist_es
Example #6
def plot_power(ax, s1, s2, xlabel='', ylabel='', title='', **options):

    x = np.linspace(-8, 8, 250)
    se = se_welch(s1, s2)
    df = welch_satterhwaithe_df(s1, s2)

    null = stats.t(df=df)
    alt = stats.t(loc=(s1.mean() - s2.mean()) / se, df=df)

    sns.lineplot(x=x, y=null.pdf(x), label='null', ax=ax)
    sns.lineplot(x=x, y=alt.pdf(x), label='alt', ax=ax)
    ax.vlines(x=null.ppf(0.975),
              ymin=0,
              ymax=0.5,
              color='#000000',
              linestyle='--',
              label='alpha',
              zorder=1)

    ax.fill_between(x,
                    alt.pdf(x),
                    where=(x >= null.ppf(0.975)),
                    color="red",
                    alpha=0.25)

    ax.set_xlabel(xlabel, fontsize=15)
    ax.set_ylabel(ylabel, fontsize=15)
    ax.tick_params(axis='both', which='major', labelsize=15)
    ax.set_title(title, fontsize=15)

    ax.legend(fontsize='xx-large', loc='upper left')
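
plot_power relies on two helpers that are not shown. Below are minimal sketches under the usual Welch definitions; these are plausible reconstructions, not the author's originals. se_welch is the standard error of the difference in means, and welch_satterhwaithe_df is the Welch-Satterthwaite degrees of freedom.

import numpy as np

def se_welch(s1, s2):
    # standard error of the difference in means under unequal variances
    return np.sqrt(s1.var(ddof=1) / len(s1) + s2.var(ddof=1) / len(s2))

def welch_satterhwaithe_df(s1, s2):
    # Welch-Satterthwaite approximation to the degrees of freedom
    a = s1.var(ddof=1) / len(s1)
    b = s2.var(ddof=1) / len(s2)
    return (a + b) ** 2 / (a ** 2 / (len(s1) - 1) + b ** 2 / (len(s2) - 1))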
Example #7
def stats(x):
    n = len(TRACTS)
    m = x.mean()
    se = x.std() / sqrt(n)
    e = se * t(n - 1).ppf(.975)
    p = 2 - 2 * t(n - 1).cdf(abs(m) / se)
    return (m, se, e, p)
Example #8
def sample_elections(plan_df, n=1000, p_seats=False):
    elec_results = plan_df[['2008', '2012', '2016']].values
    state_year_results = elec_results.mean(axis=0)
    state_vote_share_mean = state_year_results.mean()
    state_vote_t = t(df=2,
                     loc=state_vote_share_mean,
                     scale=state_year_results.std(ddof=1))

    state_vote_share_samples = state_vote_t.rvs(n)

    district_mean = elec_results.mean(axis=1)
    district_std = elec_results.std(axis=1, ddof=1)
    district_vote_t = t(df=2, loc=district_mean, scale=district_std)

    district_static_samples = district_vote_t.rvs((n, len(district_mean)))
    district_mean_samples = district_static_samples.mean(axis=1)
    district_vote_shares = (
        district_static_samples +
        (state_vote_share_samples - district_mean_samples)[:, np.newaxis])

    if p_seats:
        seat_shares = np.sum(
            1 - t.cdf(.5, df=2, loc=district_vote_shares, scale=district_std),
            axis=1)
    else:
        seat_shares = np.sum(district_vote_shares > 0.5, axis=1)
    vote_shares = np.sum(district_vote_shares, axis=1)
    return seat_shares / len(plan_df), vote_shares / len(plan_df)
Example #9
def create_probe_statistic(probe_values, fpr, verbose=0):
    # Create prediction interval statistics based on randomly permuted probe features (based on real features)
    n = len(probe_values)

    if n == 0:
        if verbose > 0:
            logging.info(
                "All probes were infeasible. All features considered relevant."
            )
        #    # If all probes were infeasible we expect an empty list
        #    # If they are infeasible it also means that only strongly relevant features were in the data
        #    # As such we just set the prediction without considering the statistics
        mean = 0
    else:
        probe_values = np.asarray(probe_values)
        mean = probe_values.mean()

    if mean == 0:
        lower_threshold, upper_threshold = mean, mean
        s = 0
    else:
        s = probe_values.std()
        lower_threshold = mean + stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
        upper_threshold = mean - stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))

    if verbose > 0:
        print(
            f"FS threshold: {lower_threshold}-{upper_threshold}, Mean:{mean}, Std:{s}, n_probes {n}"
        )

    return lower_threshold, upper_threshold
Example #10
def CalcErr(Vals, stds, Flux, FluxErr, Tj, TjErr,fitFunc,**kwargs):

    outErrs=np.zeros(len(Vals))

    npar=len(Flux)
    tvalsq=(stats.t(df=(npar-4)).ppf(0.975))**2/npar
    relmeasErrsq=np.sum(FluxErr**2)/((np.sum(Flux))**2)+np.sum(TjErr**2)/((np.sum(Tj))**2)

    if fitFunc=='EMA':
        taug=kwargs['taug']
        x0val=taug[0]
        x0std=taug[1]
        SO2Flux=taug[2]
        SO2FluxErr=taug[3]
        SO2Tj = taug[4]
        SO2TjErr = taug[5]
        nSO2par = len(SO2Flux)
        tvalsqg = (stats.t(df=(nSO2par - 4)).ppf(0.975)) ** 2 / nSO2par

        reltaugErrsq = tvalsqg * (np.sum(SO2FluxErr**2)/((np.sum(SO2Flux))**2)+np.sum(SO2TjErr**2)/((np.sum(SO2Tj))**2) + (x0std / x0val) ** 2) + 0.01 + 0.01

        for ivar in np.arange(len(Vals)):
            outErrs[ivar]=np.absolute(Vals[ivar])*np.sqrt(tvalsq*(relmeasErrsq+(stds[ivar]/Vals[ivar])**2)+0.01+0.01+0.04+reltaugErrsq)

    else:

        for ival in np.arange(len(Vals)):
            outErrs[ival]=np.absolute(Vals[ival])*np.sqrt(tvalsq*(relmeasErrsq+(stds[ival]/Vals[ival])**2)+0.01+0.01)

    return outErrs
Example #11
    def __init__(self, mean, cov, df=None, random_state=1):
        self.mean = mean
        self.cov = cov
        self.sd = sd = np.sqrt(np.diag(cov))
        if df is None:
            self.dist = stats.multivariate_normal(mean=mean, cov=cov)
            self.udist = stats.norm(loc=mean, scale=sd)
            self.std_udist = stats.norm(loc=0., scale=1.)
        else:
            sigma = cov * (df - 2) / df
            self.dist = MVT(mean=mean, sigma=sigma, df=df)
            self.udist = stats.t(loc=mean, scale=sd, df=df)
            self.std_udist = stats.t(loc=0., scale=1., df=df)
        self.dist.random_state = random_state
        self.udist.random_state = random_state
        self.std_udist.random_state = random_state

        self._chol = cholesky(self.cov)
        self._pchol = pivoted_cholesky(self.cov)

        e, v = np.linalg.eigh(self.cov)
        # To match Bastos and O'Hagan definition
        # i.e., eigenvalues ordered from largest to smallest
        e, v = e[::-1], v[:, ::-1]
        ee = np.diag(np.sqrt(e))
        self._eig = (v @ ee)
    def __init__(self, rho, nu, Law_RS, Law_RF):
        self.rho = rho            # dependence parameter
        self.nu = nu              # degrees of freedom
        self.Law_RS = Law_RS      # marginal distribution of spot
        self.Law_RF = Law_RF      # marginal distribution of futures
        self.meta_t = multivariate_t(nu=nu,                      # df
                                     Sigma=np.array([[1, rho],   # covariance
                                                     [rho, 1]]))
        self.t1 = stats.t(df=nu)  # inner t marginals
        self.t2 = stats.t(df=nu)
Example #13
    def __init__(self,
                 a,
                 b,
                 beginning=None,
                 ending=None,
                 beginning_factor=None,
                 ending_factor=None):
        """
        start and end can be in either datetime or unix time
        """
        a, b = UnixTime(a), UnixTime(b)
        assert a < b, "'b' should be greater than 'a'"
        if (beginning, ending) != (None, None):
            assert (beginning_factor, ending_factor) == (None, None), "PiecewiseTemporalEvent() only accepts " \
                                                                      "either 'beginning_factor' and 'ending_factor' " \
                                                                      "or 'beginning' and 'ending'"

        if beginning_factor is not None:
            assert beginning_factor > 0
            self.beginning_factor = beginning_factor
        if ending_factor is not None:
            assert ending_factor > 0
            self.ending_factor = ending_factor

        if (beginning, ending) != (None, None):
            beginning = UnixTime(beginning)
            ending = UnixTime(ending)
        else:
            beginning, ending = 0, 0
            while not a < beginning < ending < b:
                beginning = random_time(
                    a,
                    b,
                    probability_distribution=t(
                        # df, loc (mean), scale
                        4,
                        a + float(b - a) / self.beginning_factor,
                        float(b - a) / self.beginning_factor))

                ending = random_time(
                    a,
                    b,
                    probability_distribution=t(
                        # df, loc (mean), scale
                        4,
                        b - float(b - a) / self.ending_factor,
                        float(b - a) / self.ending_factor))
        TemporalEventPiecewiseLinear.__init__(self, {
            a: 0,
            beginning: 1
        }, {
            ending: 1,
            b: 0
        })
Example #14
    def p(self):
        n = len(self.x)
        x1 = self.x[self.x < self.peak]
        y1 = self.y[self.x < self.peak]
        se1 = sqrt(sum(square(y1 - (self.b[0] + self.b[1] * (x1 - self.peak)))) / (n - 2)) / sqrt(
            sum(square(x1 - mean(x1))))
        x2 = self.x[self.x >= self.peak]
        y2 = self.y[self.x >= self.peak]
        se2 = sqrt(sum(square(y2 - (self.b[0] + self.b[2] * (x2 - self.peak)))) / (n - 2)) / sqrt(
            sum(square(x2 - mean(x2))))
        return (2 * (1 - t(n - 2).cdf(abs(self.b[1] / se1))),
                2 * (1 - t(n - 2).cdf(abs(self.b[2] / se2))))
Example #15
    def ci_se(self, alpha, symmetric):
        if symmetric is True:
            qq = t(df=self.n).ppf(1 - alpha / 2)
            return np.array(
                [self.theta - self.se * qq, self.theta + self.se * qq])
        else:
            qq = t(df=self.n).ppf(1 - alpha)
        if symmetric == 'lower':
            return self.theta - qq * self.se
        else:
            return self.theta + qq * self.se
Example #16
def show_continuous():
    """Show a variety of continuous distributions"""

    x = linspace(-10, 10, 201)

    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4), "Normal Distribution", "Z", "P(Z)", "")

    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4), "Exponential Distribution", "X", "P(X)", "")

    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plot(x, stats.norm.pdf(x), "g")
    # (matplotlib's hold() was removed; modern axes hold by default)
    showDistribution(x, stats.t(4), stats.t(10), "T-Distribution", "X", "P(X)", ["normal", "t=4", "t=10"])

    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3, 4), stats.f(10, 15), "F-Distribution", "F", "P(F)", ["(3,4) DOF", "(10,15) DOF"])

    # Weibull distribution
    # ... with the shape parameter set to 1 and 2
    # Don't worry that in Python it is called "weibull_min": the "weibull_max" is
    # simply mirrored about the origin.
    showDistribution(
        arange(0, 5, 0.02),
        stats.weibull_min(1),
        stats.weibull_min(2),
        "Weibull Distribution",
        "X",
        "P(X)",
        ["k=1", "k=2"],
        xmin=0,
        xmax=4,
    )

    # Uniform distribution
    showDistribution(x, stats.uniform, "", "Uniform Distribution", "X", "P(X)", "")

    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic, "Logistic Distribution", "X", "P(X)", ["Normal", "Logistic"])

    # Lognormal distribution
    x = logspace(-9, 1, 1001) + 1e-9
    showDistribution(x, stats.lognorm(2), "", "Lognormal Distribution", "X", "lognorm(X)", "", xmin=-0.1)

    # The log-lin plot has to be done by hand:
    plot(log(x), stats.lognorm.pdf(x, 2))
    xlim(-10, 4)
    title("Lognormal Distribution")
    xlabel("log(X)")
    ylabel("lognorm(X)")
    show()
def probabilidade_dado_um_intervalo(
    numero_amostras, x1, x2, mu=0, sigma=1, df=10000000
):
    """Calcula a probabilidade dado um intervalo de valores."""

    t1 = (x1 - mu) / (sigma / sqrt(numero_amostras))
    t2 = (x2 - mu) / (sigma / sqrt(numero_amostras))
    p_t1 = stats.t(df=df, loc=0, scale=1).cdf(t1)
    p_t2 = stats.t(df=df, loc=0, scale=1).cdf(t2)
    probabilidade = p_t2 - p_t1

    return t1, t2, probabilidade
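
A usage sketch for the function above: with the defaults mu=0 and sigma=1, a single sample, and the very large default df, the t distribution is effectively standard Normal, so the interval (-1.96, 1.96) should carry roughly 95% probability.

t1, t2, prob = probabilidade_dado_um_intervalo(1, -1.96, 1.96)
print(prob)  # ~0.95, since t with df=10_000_000 is essentially N(0, 1)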
Example #18
    def __init__(self,
                 a,
                 b,
                 beginning=None,
                 ending=None,
                 beginning_factor=None,
                 ending_factor=None):
        """
        start and end can be in either datetime or unix time
        """
        a, b = UnixTime(a), UnixTime(b)
        assert a < b, "'b' should be greater than 'a'"
        if (beginning, ending) != (None, None):
            assert (beginning_factor, ending_factor) == (None, None), "PiecewiseTemporalEvent() only accepts " \
                                                                      "either 'beginning_factor' and 'ending_factor' " \
                                                                      "or 'beginning' and 'ending'"
            if not a < beginning < ending < b:
                raise AttributeError(
                    "The inputs should satisfy 'a < beginning < ending < b' relation"
                )

        if beginning_factor is not None:
            assert beginning_factor > 0
            self.beginning_factor = beginning_factor
        if ending_factor is not None:
            assert ending_factor > 0
            self.ending_factor = ending_factor

        if (beginning, ending) == (None, None):
            beginning, ending = 0, 0
            while not a < beginning < ending < b:
                beginning = random_time(
                    a,
                    b,
                    probability_distribution=t(
                        # df, loc (mean), scale
                        4,
                        a + float(b - a) / self.beginning_factor,
                        float(b - a) / self.beginning_factor))

                ending = random_time(
                    a,
                    b,
                    probability_distribution=t(
                        # df, loc (mean), scale
                        4,
                        b - float(b - a) / self.ending_factor,
                        float(b - a) / self.ending_factor))
        TemporalEvent.__init__(self,
                               uniform(loc=a, scale=UnixTime(beginning - a)),
                               uniform(loc=ending, scale=UnixTime(b - ending)),
                               bins=4)
Example #19
def show_continuous():
    """Show a variety of continuous distributions"""
        
    x = np.linspace(-10,10,201)
    
    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)','')
    
    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)','')
    
    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plt.plot(x, stats.norm.pdf(x), 'g-.')
    # (plt.hold() was removed from matplotlib; modern axes hold by default)
    showDistribution(x, stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)',['normal', 't=4', 't=10'])
    
    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3,4), stats.f(10,15),
                     'F-Distribution', 'F', 'P(F)',['(3,4) DOF', '(10,15) DOF'])
    
    # Weibull distribution
    # ... with the shape parameter set to 1 and 2
    # Don't worry that in Python it is called "weibull_min": the "weibull_max" is
    # simply mirrored about the origin.
    showDistribution(np.arange(0,5,0.02), stats.weibull_min(1), stats.weibull_min(2),
                     'Weibull Distribution', 'X', 'P(X)',['k=1', 'k=2'], xmin=0, xmax=4)
    
    # Uniform distribution
    showDistribution(x, stats.uniform,'' ,
                     'Uniform Distribution', 'X', 'P(X)','')
    
    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)',['Normal', 'Logistic'])
    
    # Lognormal distribution
    x = np.logspace(-9,1,1001)+1e-9
    showDistribution(x, stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)','', xmin=-0.1)
    
    # The log-lin plot has to be done by hand:
    plt.plot(np.log(x), stats.lognorm.pdf(x,2))
    plt.xlim(-10, 4)
    plt.title('Lognormal Distribution')
    plt.xlabel('log(X)')
    plt.ylabel('lognorm(X)')
    plt.show()
Example #20
def perform_analysis(model: RegressionModel) -> np.ndarray:
    """
    Calculates a set of statistics for a given linear regression model
    @param model: RegressionModel object to be studied
    @return: indices of factors which may be equal to zero (for significance level 0.1)
    """
    n, m = model.x.shape
    sum_errs2 = sum((model.y - model.y_hat)**2)
    s2 = sum_errs2 / (n - m)
    print("s^2 = {:.3f}".format(s2))

    cov_a = s2 * model.inv_xx
    print_matrix(cov_a, var_name="cov(a)")

    s_a = np.diag(cov_a)**.5
    print_matrix(s_a, var_name="standard deviation of `a`")

    ixx_diag = np.array([np.diag(model.inv_xx)])
    corr_a = model.inv_xx / np.sqrt(ixx_diag.T @ ixx_diag)
    print_matrix(corr_a, var_name="corr(a)")

    sum_y2_centered = sum((model.y - np.mean(model.y))**2)
    r2 = 1 - sum_errs2 / sum_y2_centered
    print("R^2 = {:.4f}".format(r2))
    r2n = 1 - (sum_errs2 / (n - m)) / (sum_y2_centered / (n - 1))
    print("R_n^2 = {:.4f}".format(r2n))

    gamma = .95
    quantile = stats.t(n - m).ppf((1 + gamma) / 2)
    a_confidence_intervals = np.array(
        [model.a - s_a * quantile, model.a + s_a * quantile])
    print_matrix(
        a_confidence_intervals.T,
        var_name="confidence intervals for `a` with confidence level {}".
        format(gamma))

    joint_alpha = (1 - gamma) / m
    joint_quantile = stats.t(n - m).ppf((1 + joint_alpha) / 2)
    a_joint_conf_intervals = np.array(
        [model.a - s_a * joint_quantile, model.a + s_a * joint_quantile])
    print_matrix(
        a_joint_conf_intervals.T,
        var_name="joint confidence intervals for `a` with confidence {}".
        format(gamma))

    # testing hypotheses: a_i ?= 0
    zero_hypot_statistics = np.abs(model.a) / s_a
    print_matrix(
        zero_hypot_statistics,
        var_name="statistics for hypothesis a_i = 0 (t_alpha={})".format(
            quantile))
    return (zero_hypot_statistics < quantile).nonzero()[0]
Example #21
def show_continuous():
    """Show a variety of continuous distributions"""
        
    x = linspace(-10,10,201)
    
    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)','')
    
    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)','')
    
    # Students' T-distribution
    # ... with 4, and with 10 degrees of freedom (DOF)
    plot(x, stats.norm.pdf(x), 'g-.')
    # (matplotlib's hold() was removed; modern axes hold by default)
    showDistribution(x, stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)',['normal', 't=4', 't=10'])
    
    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3,4), stats.f(10,15),
                     'F-Distribution', 'F', 'P(F)',['(3,4) DOF', '(10,15) DOF'])
    
    # Weibull distribution
    # ... with the shape parameter set to 1 and 2
    # Don't worry that in Python it is called "weibull_min": the "weibull_max" is
    # simply mirrored about the origin.
    showDistribution(arange(0,5,0.02), stats.weibull_min(1), stats.weibull_min(2),
                     'Weibull Distribution', 'X', 'P(X)',['k=1', 'k=2'], xmin=0, xmax=4)
    
    # Uniform distribution
    showDistribution(x, stats.uniform,'' ,
                     'Uniform Distribution', 'X', 'P(X)','')
    
    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)',['Normal', 'Logistic'])
    
    # Lognormal distribution
    x = logspace(-9,1,1001)+1e-9
    showDistribution(x, stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)','', xmin=-0.1)
    
    # The log-lin plot has to be done by hand:
    plot(log(x), stats.lognorm.pdf(x,2))
    xlim(-10, 4)
    title('Lognormal Distribution')
    xlabel('log(X)')
    ylabel('lognorm(X)')
    show()
Example #22
    def N_eq(N):
        # Distribution for the control group
        Fc = t(df=p * N - 2)

        # Distribution for the treatment group
        Ft = t(df=p * N - 2, loc=MDE)

        # Calculate discrepancy
        delta_N = (MDE - (Fc.ppf(1 - alpha / 2) + Ft.ppf(kappa)) * np.sqrt(
            (p * (1 - p))**(-1) * sigma2 / N))

        # Return discrepancy
        return delta_N
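
N_eq is a discrepancy function meant to be driven to zero by a root finder to obtain the required sample size. Below is a self-contained sketch of that workflow; the concrete values for the enclosing-scope variables (p, MDE, alpha, kappa, sigma2) are illustrative assumptions:

import numpy as np
from scipy.stats import t
from scipy.optimize import brentq

p, MDE, alpha, kappa, sigma2 = 0.5, 0.1, 0.05, 0.8, 1.0  # illustrative values

def N_eq(N):
    Fc = t(df=p * N - 2)            # control-group null distribution
    Ft = t(df=p * N - 2, loc=MDE)   # treatment-group distribution
    return MDE - (Fc.ppf(1 - alpha / 2) + Ft.ppf(kappa)) * np.sqrt(
        (p * (1 - p)) ** (-1) * sigma2 / N)

N_star = brentq(N_eq, 10, 1e6)  # N at which the discrepancy crosses zero
print(int(np.ceil(N_star)))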
Example #23
    def __init__(self, dofs):
        self.dofs = dofs
        if self.dofs is not None:
            if self.dofs > 0:
                self.bounds = np.array([-np.inf, np.inf])
                mean, var, skew, kurt = t.stats(df=self.dofs, moments='mvsk')
                self.parent = t(df=self.dofs)
                self.mean = mean
                self.variance = var
                self.skewness = skew
                self.kurtosis = kurt
                self.x_range_for_pdf = np.linspace(-5.0, 5.0,
                                                   RECURRENCE_PDF_SAMPLES)
Example #24
def _create_probe_statistic(probe_values, fpr):
    # Create prediction interval statistics based on randomly permuted probe features (based on real features)
    n = len(probe_values)

    if n == 1:
        val = probe_values[0]
        low_t = val
        up_t = val
    else:
        probe_values = np.asarray(probe_values)
        mean = probe_values.mean()
        s = probe_values.std()
        low_t = mean + stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
        up_t = mean - stats.t(df=n - 1).ppf(fpr) * s * np.sqrt(1 + (1 / n))
    return low_t, up_t
Example #25
def ttest_uneq(x):
  assert x.columns.get_level_values(0).isin(['mu','se','n']).all()
  idx = pd.IndexSlice
  mu_mat = x.loc[:,idx['mu']].values
  n_mat = x.loc[:,idx['n']].values
  se_mat = x.loc[:,idx['se']].values
  # Calculate t-stat
  dmu_vec = mu_mat[:,0] - mu_mat[:,1]
  var_vec = np.sum(se_mat**2 / n_mat,1)
  se_vec = np.sqrt(var_vec)
  df_vec = var_vec**2 / np.sum((se_mat**2 / n_mat)**2 / (n_mat-1),1)
  stat_vec = dmu_vec / se_vec
  pval = 2*np.minimum(stats.t(df=df_vec).cdf(stat_vec),1-stats.t(df=df_vec).cdf(stat_vec))
  res = pd.DataFrame({'stat':stat_vec, 'pval':pval, 'df':df_vec},index=x.index)
  return res
Example #26
def welchs_ttest(stats1, stats2):
    """
    SNAGGED FROM https://github.com/mozilla/datazilla-metrics/blob/master/dzmetrics/ttest.py#L56
    Execute a two-sided Welch's t-test given pre-calculated means and stddevs.

    Accepts summary data (N, stddev, and mean) for two datasets, performs
    a two-sided Welch's t-test, and returns the confidence (1 - p-value).
    """
    n1 = stats1.count
    m1 = stats1.mean
    v1 = max(stats1.variance, 1.0/12.0)

    n2 = stats2.count
    m2 = stats2.mean
    v2 = max(stats2.variance, 1.0/12.0)

    if n1 < 2 or n2 < 2:
        return {"confidence": 0, "diff": 0}

    vpooled = v1 / n1 + v2 / n2
    # 1/12 == STD OF STANDARD UNIFORM DISTRIBUTION
    # We assume test replicates (xi) are actually rounded results from
    # actual measurements somewhere in the range of (xi - 0.5, xi + 0.5),
    # which has a variance of 1/12
    tt = abs(m1 - m2) / sqrt(vpooled)

    df_numerator = vpooled ** 2
    df_denominator = ((v1 / n1) ** 2) / (n1 - 1) + ((v2 / n2) ** 2) / (n2 - 1)
    df = df_numerator / df_denominator

    # abs(x - 0.5)*2 IS AN ATTEMPT TO GIVE HIGH NUMBERS TO EITHER TAIL OF THE cdf
    return {"confidence": abs(stats.t(df).cdf(tt) - 0.5) * 2, "diff": tt}
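
The "confidence" returned above equals 1 minus the two-sided p-value, which scipy can compute directly from the same summary statistics. A sketch checking the two agree, assuming the module-level imports the snippet relies on (sqrt and scipy's stats) are present; the Summary namedtuple is just a stand-in for the caller's stats objects, and the variances are well above the 1/12 floor so the floor has no effect:

from collections import namedtuple
from scipy import stats

Summary = namedtuple("Summary", ["count", "mean", "variance"])  # stand-in
s1 = Summary(count=30, mean=10.0, variance=4.0)
s2 = Summary(count=40, mean=11.0, variance=9.0)

mine = welchs_ttest(s1, s2)
ref = stats.ttest_ind_from_stats(s1.mean, s1.variance ** 0.5, s1.count,
                                 s2.mean, s2.variance ** 0.5, s2.count,
                                 equal_var=False)  # Welch's t-test
print(mine["confidence"], 1 - ref.pvalue)  # should match closely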
Example #27
        def density(unifs):
            if np.ndim(unifs[0]) == 0:
                unifs = [[i] for i in unifs]

            v = self.par['v']
            sigma = norm_matrix(self.par['sigma'])
            std = math.sqrt(v / (v - 2))

            vecs = []
            marginal_density = []
            t_obj = stats.t(v)
            for i in range(dim):
                temp = t_obj.ppf(unifs[i])
                marginal_density.append(t_obj.pdf(temp))
                vecs.append(temp)
            res = []

            vecs = list(zip(*vecs))
            marginal_density = list(zip(*marginal_density))
            factor = spe.gamma((v + self.dim) / 2) / (
                spe.gamma(v / 2) * math.sqrt((math.pi * v) ** self.dim * np.linalg.det(sigma)))
            cov_inv = sigma ** (-1)

            for i in list(zip(*[vecs, marginal_density])):
                x = np.matrix(i[0])
                m = np.prod(i[1])
                if m > 0:
                    temp = 1 / m * factor * math.exp(
                        math.log(1 + x * cov_inv * np.transpose(x) / v) * (-(v + self.dim) / 2))
                else:
                    temp = 0
                res.append(temp)
            return res
Example #28
File: dlm.py  Project: wesm/statlib
    def mu_ci(self, alpha=0.10, prior=False):
        """
        Compute marginal confidence intervals around each parameter \theta_{ti}
        If prior is False, compute posterior
        """
        _x, _y = np.diag_indices(self.ndim)
        diags = self.mu_scale[:, _x, _y]

        # Only care about marginal scale
        delta = self.state_discount
        if isinstance(delta, np.ndarray):
            delta = np.diag(delta)

        if prior:
            df = self.df[:-1]
            mode = self.mu_mode[:-1]
            scale = np.sqrt(diags[:-1] / delta)
        else:
            df = self.df[1:]
            mode = self.mu_mode[1:]
            scale = np.sqrt(diags[1:])

        q = stats.t(df).ppf(1 - alpha / 2)
        band = (scale.T * q).T
        ci_lower = mode - band
        ci_upper = mode + band

        return mode, ci_lower, ci_upper
Example #29
    def run(self, results_x, results_z, attach=True):
        '''

        see class docstring (for now)
        '''
        if not np.allclose(results_x.model.endog, results_z.model.endog):
            raise ValueError('endogenous variables in models are not the same')
        nobs = results_x.model.endog.shape[0]
        y = results_x.model.endog
        x = results_x.model.exog
        z = results_z.model.exog
        #sigma2_x = results_x.ssr/nobs
        #sigma2_z = results_z.ssr/nobs
        yhat_x = results_x.fittedvalues
        #yhat_z = results_z.fittedvalues
        res_zx = sm.OLS(y, np.column_stack((yhat_x, z))).fit()
        self.res_zx = res_zx  #for testing
        tstat = res_zx.tvalues[0]
        pval = res_zx.pvalues[0]
        if attach:
            self.res_zx = res_zx
            self.dist = stats.t(res_zx.model.df_resid)
            self.teststat = tstat
            self.pvalue = pval

        return tstat, pval
Example #30
def linReg(X, y, intercept = False):
    #ordinary least squares linear regression
    #add intercept:
    if intercept:
        X = np.insert(X, X.shape[1], 1, axis=1)

    y = np.array([y]).T #make column

    #fit regression:
    betas = np.dot(np.dot(np.linalg.inv((np.dot(X.T,X))), X.T), y)

    #calculate p-values:
    error = y - (np.dot(X,betas))
    RSS = np.sum(error**2)
    betas = betas.flatten()
    df = float((X.shape[0] - (len(betas) - 1 if intercept else 0)) - 1)
    s2 = RSS / df
    #print s2
    beta_ses = np.sqrt(s2 / (np.sum( (X - np.mean(X,0))**2, 0)))
    #print beta_ses
    ts = [betas[j] / beta_ses[j] for j in range(len(betas))]
    pvalues = (1 - ss.t(df).cdf(np.abs(ts))) * 2 #two-tailed

    ##FOR TESTING:
    #print (betas, pvalues)#DEBUG
    #for comparison purposes:
    #results = sm.OLS(y, X).fit() #DEBUG
    #print (results.params, results.pvalues)
    return betas, pvalues
Example #31
def VaR(ts, alpha, flavour):
    if flavour == "historical":
        temp_ts = np.sort(np.asarray(ts))
        n = len(temp_ts)
        return -temp_ts[int(np.floor((1 - alpha) * n))]
            
    elif flavour == "t":
        t = stats.t
        t = stats.t( *t.fit( ts ) )
        return -t.ppf( 1-alpha )
            
    elif flavour == "normal":
        mean = ts.mean()
        std = ts.std()
        return -stats.norm.ppf( 1-alpha, mean, std )
    elif flavour == "Cornish-Fischer":
        z_c = -stats.norm.ppf( 1-alpha, 0 ,1)
        S = stats.skew(ts)
        K = stats.kurtosis(ts)
        z_cf = z_c + (z_c**2-1)*S/6 + (z_c**3- 3*z_c)*K/24 + (2*z_c**3-5*z_c)*S**2/36
        return ts.mean() - z_cf*np.sqrt( ts.std() )
        
    elif flavour == "kernel":
        kde = stats.gaussian_kde( ts )  
        print(kde.factor)

        f = lambda x: kde.integrate_box_1d(-1, x) - (1-alpha)
        return -fsolve( f, -0.05)[0]
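
A usage sketch for the flavours above on simulated fat-tailed daily returns; the "kernel" flavour is skipped since it additionally needs scipy.optimize.fsolve in scope:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
ts = pd.Series(rng.standard_t(df=5, size=2000) * 0.01)  # heavy-tailed returns

for flavour in ("historical", "t", "normal", "Cornish-Fischer"):
    print(flavour, VaR(ts, 0.99, flavour))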
Example #32
def student(mm):
    ''' Student's t distribution '''
    alfa = 0.05                 # significance level
    n = size(mm) - 1            # number of degrees of freedom
    t = stats.t(n)
    tcr = t.ppf(1 - alfa / 2)
    return round(mean(mm), 4), round(tcr * std(mm) / sqrt(size(mm)), 4)
Example #33
def test_sample_nig():
    mu_0 = 0.0
    lmbda_0 = 10.0
    alpha_0 = 10.0
    beta_0 = 10.0

    # Directly sample nig and lookg at marginals
    from pyhawkes.utils.utils import sample_nig

    mu_samples = np.array([sample_nig(mu_0, lmbda_0, alpha_0, beta_0)[0] for _ in range(10000)])

    # Plot the histogram of impulse means
    plt.figure()
    p_mu = t(df=2 * alpha_0, loc=mu_0, scale=np.sqrt(beta_0 / (alpha_0 * lmbda_0)))

    _, bins, _ = plt.hist(mu_samples, bins=50, alpha=0.5, density=True)
    bincenters = 0.5 * (bins[1:] + bins[:-1])
    plt.plot(bincenters, p_mu.pdf(bincenters), "r--", linewidth=1)
    plt.xlabel("mu")
    plt.ylabel("p(mu)")

    plt.figure()
    probplot(mu_samples, dist=p_mu, plot=plt.gca())

    plt.show()
Example #34
    def plot_mu_density(self, t, index=0, support_thresh=None):
        ix = index
        dists = {}
        weights = {}

        thresh = 0
        for i in range(self.nmodels):
            df = self.df[t]
            mode = self.mu_mode[t + 1, i, ix]
            scale = np.sqrt(self.mu_scale[t + 1, i, ix, ix])
            dist = stats.t(df, loc=mode, scale=scale)
            dists[i] = dist
            weights[i] = self.marginal_prob[t + 1, i]

            thresh = max(thresh, dist.pdf(mode))

        if support_thresh is not None:
            thresh = support_thresh
        else:
            # HACK
            thresh /= 1000

        plot_mixture(dists, weights,
                     hi=self.mu_mode[:, :, ix].max(),
                     lo=self.mu_mode[:, :, ix].min(),
                     support_thresh=thresh)
Example #35
    def plot_mu_density(self, t, index=0, support_thresh=0.1):
        """
        Plot posterior densities for single model parameter over the set of
        mixture components

        Parameters
        ----------
        t : int
            time index, relative to response variable
        index : int
            parameter index to plot

        Notes
        -----
        cf. West & Harrison Figure 12.3. Automatically annotating individual
        component curves would probably be difficult.
        """
        ix = index
        dists = {}
        for name in self.names:
            model = self.models[name]
            df = model.df[t]
            mode = model.mu_mode[t + 1, ix]
            scale = np.sqrt(model.mu_scale[t + 1, ix, ix])
            dists[name] = stats.t(df, loc=mode, scale=scale)

        plot_mixture(dists, self.get_weights(t),
                           support_thresh=support_thresh)
Example #36
def TProbabilitiesLowerTail(values, df):
    if len(values)>0 and df>0:
        
        outputStr = ""
        areas = []

        for val in values:
            outputStr += str(val)
            
            rv = stats.t(df, loc=0, scale=1)   # default loc and scale values
            area = rv.cdf(val)
            area = "{0:.5f}".format(area)
            areas.append(area)

            if len(values) >1 and values.index(val) < len(values) - 1 : 
                outputStr += ", "
            else: 
                outputStr += "" 

        outputStr += ", degrees of freedom: " + str(df)
        return outputStr, areas

    elif df <= 0:
        return False, "Degrees of freedom cannot be 0 or less."
    else:
        return False, "Valid values must be entered for the calculation."
Example #37
def PlotTDistributionDistributionFunction(df): 
    if df>0: 
        main_frame = QtGui.QWidget()
        dpi = 100
        fig = Figure((5.0, 4.0), dpi=dpi)
        canvas = FigureCanvas(fig)
        canvas.setParent(main_frame)

        axes = fig.add_subplot(111)
        mpl_toolbar = NavigationToolbar(canvas, main_frame)

        hbox = QtGui.QHBoxLayout()
        vbox = QtGui.QVBoxLayout()
        vbox.addWidget(canvas)
        vbox.addWidget(mpl_toolbar)
        vbox.addLayout(hbox)
        main_frame.setLayout(vbox)

        axes.clear()

        alpha = 0.0005  # matches the value used in R, so the results agree
        sequance = stats.t.isf(alpha, df)

        x = np.linspace(-sequance, sequance, 100)    # 100 points by default
        rv = stats.t(df)
        y = rv.cdf(x)

        axes.plot(x,y)
        canvas.draw()

        return main_frame

    else: 
        return False, "Degrees of freedom cannot be 0 or less."
Example #38
def get_normal_gamma_posterior_predictive(x, u0, k0, a0, b0):
    """
    Get posterior predictive for vector sample with unknown mean and variance.

    The returned Student-t distribution for the predictive posterior can be seen as derived in [1], [2], and [3]
    1: https://www.cs.ubc.ca/~murphyk/Teaching/CS340-Fall07/reading/NG.pdf (page 5)
    2: https://www.cs.ubc.ca/~murphyk/Papers/bayesGauss.pdf (page 9)
    3: https://en.wikipedia.org/wiki/Conjugate_prior (Normal-gamma conjugate prior section)

    :param x: Sample to estimate distribution for (any non-zero length sequence convertible to float)
    :param u0: Hyperparameter for mean of mean distribution
    :param k0: Hyperparameter for inverse variance of mean distribution
    :param a0: Hyperparameter for alpha of precision distribution (shape)
    :param b0: Hyperparameter for beta of precision distribution (rate, not scale)
    :return: T-Distribution (posterior predictive for samples)
        *Note that posterior on parameters is not returned here simply because python has no normal-gamma implementation
    """
    x = np.array(x, dtype=np.float64)
    n = len(x)
    x_bar = np.mean(x)
    u = (k0 * u0 + n * x_bar) / (k0 + n)
    k = k0 + n
    a = a0 + n / 2.
    b = b0 + .5 * np.sum((x - x_bar)**2) + (k0 * n * (x_bar - u0)**2) / (2 * (k0 + n))
    # print(u, k, a, b, (b * (k + 1))/(a * k))
    # The references give the squared scale b*(k+1)/(a*k); scipy's `scale`
    # parameter is its square root.
    predictive_dist = stats.t(df=2*a, loc=u, scale=np.sqrt((b * (k + 1)) / (a * k)))
    return predictive_dist
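
A quick sanity check of the posterior predictive above with weak hyperparameters: as the sample grows, the predictive mean should approach the sample mean and the predictive standard deviation the sample standard deviation.

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.normal(loc=5.0, scale=2.0, size=1000)

pred = get_normal_gamma_posterior_predictive(x, u0=0.0, k0=1.0, a0=1.0, b0=1.0)
print(pred.mean(), np.mean(x))  # both close to 5
print(pred.std(), np.std(x))    # both close to 2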
Example #39
def check_mean():        
    '''Data from Altman, check for significance of mean value.
    Compare average daily energy intake (kJ) over 10 days of 11 healthy women, and
    compare it to the recommended level of 7725 kJ.
    '''
    # Get data from Altman

    data = getData('altman_91.txt')

    # Watch out: by default the SD is calculated with 1/N!
    myMean = np.mean(data)
    mySD = np.std(data, ddof=1)
    print('Mean and SD: {0:4.2f} and {1:4.2f}'.format(myMean, mySD))

    # Confidence intervals
    tf = stats.t(len(data)-1)
    ci = np.mean(data) + stats.sem(data)*np.array([-1,1])*tf.isf(0.025)
    print('The confidence intervals are {0:4.2f} to {1:4.2f}.'.format(ci[0], ci[1]))

    # Check for significance
    checkValue = 7725
    t, prob = stats.ttest_1samp(data, checkValue)
    if prob < 0.05:
        print('{0:4.2f} is significantly different from the mean (p={1:5.3f}).'.format(checkValue, prob))

    # For not normally distributed data, use the Wilcoxon signed rank test
    (rank, pVal) = stats.wilcoxon(data-checkValue)
    if pVal < 0.05:
      issignificant = 'unlikely'
    else:
      issignificant = 'likely'
      
    print('It is ' + issignificant + ' that the value is {0:d}'.format(checkValue))
Example #40
def welchs_ttest(stats1, stats2):
    """
    SNAGGED FROM https://github.com/mozilla/datazilla-metrics/blob/master/dzmetrics/ttest.py#L56
    Execute a two-sided Welch's t-test given pre-calculated means and stddevs.

    Accepts summary data (N, stddev, and mean) for two datasets, performs
    a two-sided Welch's t-test, and returns the confidence as a number of nines.
    """
    n1 = stats1.count
    m1 = stats1.mean
    v1 = stats1.variance

    n2 = stats2.count
    m2 = stats2.mean
    v2 = stats2.variance

    if n1 < 2 or n2 < 2:
        return Struct(confidence=0, diff=0)

    vpooled = v1 / n1 + v2 / n2
    tt = abs(m1 - m2) / sqrt(vpooled)

    df_numerator = vpooled ** 2
    df_denominator = ((v1 / n1) ** 2) / (n1 - 1) + ((v2 / n2) ** 2) / (n2 - 1)
    df = df_numerator / df_denominator

    # RETURN NUMBER-OF-NINES OF CONFIDENCE (0.99 = 2, 0.999 = 3, etc)
    return Struct(confidence=-stats.t(df).logsf(tt) / log(10), diff=tt)
Example #41
    def cdf(
        self,
        resids: ArrayLike,
        parameters: Optional[Union[Sequence[float], ArrayLike1D]] = None,
    ) -> NDArray:
        parameters = self._check_constraints(parameters)
        scalar = isscalar(resids)
        if scalar:
            resids = array([resids])

        eta, lam = parameters

        a = self.__const_a(parameters)
        b = self.__const_b(parameters)

        var = eta / (eta - 2)
        y1 = (b * resids + a) / (1 - lam) * sqrt(var)
        y2 = (b * resids + a) / (1 + lam) * sqrt(var)
        tcdf = stats.t(eta).cdf
        resids = asarray(resids)
        p = (1 - lam) * tcdf(y1) * (resids < (-a / b))
        p += (resids >=
              (-a / b)) * ((1 - lam) / 2 + (1 + lam) * (tcdf(y2) - 0.5))
        if scalar:
            p = p[0]
        return p
    def predict(self, X, **kwargs):
        mu = self.trace_['alpha'] + self.trace_['beta'] * X[:, None]
        dist = t(df=self.trace_['nu'], loc=mu, scale=self.trace_['sigma'])
        if kwargs.get('q') is None:
            return dist, dist.mean().mean(axis=1)
        else:
            return dist, [dist.ppf(q_).mean(axis=1) for q_ in kwargs['q']]
Example #43
def oneProportion() -> float:
    """Calculate the confidence intervals of the population, based on a
    given data sample.
    The data are taken from Altman, chapter 10.2.1.
    Suppose a general practitioner chooses a random sample of 215 women from
    the patient register for her general practice, and finds that 39 of them
    have a history of suffering from asthma. What is the confidence interval
    for the prevalence of asthma?

    Returns
    -------
    ci : 95% confidence interval
    """

    # Get the data
    numTotal = 215
    numPositive = 39

    # --- >>> START stats <<< ---
    # Calculate the confidence intervals
    p = float(numPositive) / numTotal
    se = np.sqrt(p * (1 - p) / numTotal)
    td = stats.t(numTotal - 1)
    ci = p + np.array([-1, 1]) * td.isf(0.025) * se
    # --- >>> STOP stats <<< ---

    # Print them
    print('ONE PROPORTION ----------------------------------------')
    print('The confidence interval for the given sample is ' +
          f'{ci[0]:.3f} - {ci[1]:.3f}')

    return ci
Example #44
def get_p_val(y, z, a, b, muL, muR, var, ind=0, use_tdist=False):
    """
    Correct pval approach using approximations of truncated Gaussians

    Parameters
    ----------
    y: points from one cluster
    z: points from the other cluster
    a: separating hyperplane
    use_tdist: null distribution of TN statistic to use
        False for standard normal
        True for t distribution with df=len(y)+len(z)-2
    muL, muR, var: estimated using maximum likelihood
    ind: gene to test

    Returns
    ----------
    pvalue
    """
    muY, varY, muZ, varZ = get_null_truncmv_params(a,
                                                   b,
                                                   muL,
                                                   muR,
                                                   var=var,
                                                   ind=ind)
    nY, nZ = len(y), len(z)
    stat = (np.sum(z[:, ind]) - np.sum(y[:, ind]) -
            (nZ * muZ - nY * muY)) / np.sqrt(nY * varY + nZ * varZ)
    if use_tdist:
        df = len(z) + len(y) - 2
        d0 = t(df=df).cdf
    else:
        d0 = norm.cdf
    p = np.min((d0(stat), d0(-stat))) * 2
    return p
Example #45
def oneProportion():
    '''Calculate the confidence intervals of the population, based on a
    given data sample.
    The data are taken from Altman, chapter 10.2.1.
    Suppose a general practitioner chooses a random sample of 215 women from
    the patient register for her general practice, and finds that 39 of them
    have a history of suffering from asthma. What is the confidence interval
    for the prevalence of asthma?'''

    # Get the data
    numTotal = 215
    numPositive = 39

    # --- >>> START stats <<< ---
    # Calculate the confidence intervals
    p = float(numPositive)/numTotal
    se = np.sqrt(p*(1-p)/numTotal)
    td = stats.t(numTotal-1)
    ci = p + np.array([-1,1])*td.isf(0.025)*se
    # --- >>> STOP stats <<< ---

    # Print them
    print('ONE PROPORTION ----------------------------------------')
    print(('The confidence interval for the given sample is {0:5.3f} to {1:5.3f}'.format(
        ci[0], ci[1])))
    
    return ci
Example #46
def get_p_val_1D(y, z, a, muL, muR, var, use_tdist=False):
    """
    Correct pval approach using approximations of truncated Gaussians
    (1D case, so y, z, a should be scalars)

    Parameters
    ----------
    y: points from one cluster
    z: points from the other cluster
    a: threshold
    use_tdist: null distribution of TN statistic to use
        False for standard normal
        True for t distribution with df=len(y)+len(z)-2
    muL, muR, var: estimated using maximum likelihood

    Returns
    ----------
    pvalue
    """
    muY, varY, muZ, varZ = get_null_trunc_params(muL, muR, var=var, a=a)
    nY, nZ = len(y), len(z)
    stat = (np.sum(z) - np.sum(y) -
            (nZ * muZ - nY * muY)) / np.sqrt(nY * varY + nZ * varZ)
    if use_tdist:
        df = len(z) + len(y) - 2
        d0 = t(df=df).cdf
    else:
        d0 = norm.cdf
    p = np.min((d0(stat), d0(-stat))) * 2
    return p
Example #47
def sample_ar1t(
    rhos,
    n=50,
    df_t=DEFAULT_DF_T,
):
    """
    Samples t variables according to a Markov chain.
    """
    # Initial t samples
    p = rhos.shape[0] + 1
    tvars = stats.t(df=df_t).rvs(size=(n, p))

    # Initialize X
    X = np.zeros((n, p))
    scale = np.sqrt((df_t - 2) / df_t)
    X[:, 0] = scale * tvars[:, 0]

    # Loop through variables according to markov chain
    conjugates = np.sqrt(1 - rhos**2)
    for j in range(1, p):
        X[:, j] = rhos[j - 1] * X[:, j - 1] + conjugates[j - 1] * scale * tvars[:, j]

    return X
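
A usage sketch for sample_ar1t, passing df_t explicitly rather than relying on the module's DEFAULT_DF_T: adjacent columns should show roughly the requested autocorrelation.

import numpy as np

rhos = np.full(4, 0.5)
X = sample_ar1t(rhos, n=100_000, df_t=8)
print(np.corrcoef(X[:, 0], X[:, 1])[0, 1])  # approximately 0.5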
Example #48
def fun7():
    print("plot the three major sampling distributions")
    # plot the normal, chi-square, t, and F distributions
    nor_dis = stats.norm()
    chi2_dis = stats.chi2(df=eval(k_1.get()))
    t_dis = stats.t(df=eval(t_1.get()))
    f_dis = stats.f(dfn=eval(f_1.get()), dfd=eval(f_2.get()))

    x1 = np.linspace(nor_dis.ppf(0.001), nor_dis.ppf(0.999), 1000)
    x2 = np.linspace(chi2_dis.ppf(0.001), chi2_dis.ppf(0.999), 1000)
    x3 = np.linspace(t_dis.ppf(0.001), t_dis.ppf(0.999), 1000)
    x4 = np.linspace(f_dis.ppf(0.001), f_dis.ppf(0.999), 1000)
    fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    ax.plot(x1, nor_dis.pdf(x1), 'r-', lw=2, label=r'N(0, 1)')
    ax.plot(x2,
            chi2_dis.pdf(x2),
            'g-',
            lw=2,
            label=r'$\chi^2$(%d)' % eval(k_1.get()))
    ax.plot(x3, t_dis.pdf(x3), 'b-', lw=2, label='t(%d)' % eval(t_1.get()))
    ax.plot(x4,
            f_dis.pdf(x4),
            'm-',
            lw=2,
            label='F(%d, %d)' % (eval(f_1.get()), eval(f_2.get())))

    plt.xlabel("x")
    plt.ylabel('Probability')
    plt.title(r'PDF of Three Sampling Distribution')
    ax.legend(loc='best', frameon=False)
    plt.grid()
    plt.show()
Example #49
def TQuantilesLowerTail(probs, df):
    if len(probs)>0 and df>0:

        outputStr = ""
        yArray = []

        for prob in probs:
            outputStr += str(prob)

            if prob> 0 and prob<1:

                rv = stats.t(df, loc = 0, scale = 1)
                y = rv.ppf(prob)
                y = "{0:.5f}".format(y)
                yArray.append(y) 

            else:
                yArray.append("NaN") 
            
            if len(probs) >1 and probs.index(prob) < len(probs) - 1 : 
                    outputStr += ", "
            else: 
                    outputStr += "" 

        outputStr += ", degrees of freedom: " + str(df)
        return outputStr, yArray

    elif df <= 0:
        return False, "Degrees of freedom cannot be 0 or less."
    else:
        return False, "A valid probability value must be entered."
Example #50
def coherr(C,J1,J2,p=0.05,Nsp1=None,Nsp2=None):
    """
    Function to compute lower and upper confidence intervals on
    coherency (absolute value of coherence).

    C:            coherence (real or complex)
    J1,J2:        tapered fourier transforms
    p:            the target P value (default 0.05)
    Nsp1:         number of spikes in J1, used for finite size correction.
    Nsp2:         number of spikes in J2, used for finite size correction.
                  Default is None, for no correction

    Outputs:
    CI:           confidence interval for C, N x 2 array, (lower, upper)
    phi_std:      standard deviation of phi, N array
    """
    from numpy import iscomplexobj, absolute, fix, zeros, setdiff1d, real, sqrt,\
         arctanh, tanh
    from scipy.stats import t

    J1 = _combine_trials(J1)
    J2 = _combine_trials(J2)
    N,K = J1.shape
    assert J1.shape==J2.shape, "J1 and J2 must have the same dimensions."
    assert N == C.size, "S and J lengths don't match"
    if iscomplexobj(C): C = absolute(C)

    pp = 1 - p/2
    dof = 2*K
    dof1 = dof if Nsp1 is None else fix(2.*Nsp1*dof/(2.*Nsp1+dof))
    dof2 = dof if Nsp2 is None else fix(2.*Nsp2*dof/(2.*Nsp2+dof))
    dof = min(dof1,dof2)

    Cerr = zeros((N,2))
    tcrit = t(dof-1).ppf(pp).tolist()
    atanhCxyk = zeros((N,K))
    phasefactorxyk = zeros((N,K),dtype='complex128')

    for k in range(K):
        indxk = setdiff1d(range(K),[k])
        J1k = J1[:,indxk]
        J2k = J2[:,indxk]
        eJ1k = real(J1k * J1k.conj()).sum(1)
        eJ2k = real(J2k * J2k.conj()).sum(1)
        eJ12k = (J1k.conj() * J2k).sum(1)
        Cxyk = eJ12k/sqrt(eJ1k*eJ2k)
        absCxyk = absolute(Cxyk)
        atanhCxyk[:,k] = sqrt(2*K-2)*arctanh(absCxyk)
        phasefactorxyk[:,k] = Cxyk / absCxyk

    atanhC = sqrt(2*K-2)*arctanh(C)
    sigma12 = sqrt(K-1)* atanhCxyk.std(1)

    Cu = atanhC + tcrit * sigma12
    Cl = atanhC - tcrit * sigma12
    Cerr[:,0] = tanh(Cl / sqrt(2*K-2))
    Cerr[:,1] = tanh(Cu / sqrt(2*K-2))
    phistd = (2*K-2) * (1 - absolute(phasefactorxyk.mean(1)))
    return Cerr, phistd
Example #51
    def __init__(self, a, b, beginning=None, ending=None, beginning_factor=None, ending_factor=None):
        """
        start and end can be in either datetime or unix time
        """
        a, b = UnixTime(a), UnixTime(b)
        assert a < b, "'b' should be greater than 'a'"
        if (beginning, ending) != (None, None):
            assert (beginning_factor, ending_factor) == (None, None), "PiecewiseTemporalEvent() only accepts " \
                                                                      "either 'beginning_factor' and 'ending_factor' " \
                                                                      "or 'beginning' and 'ending'"
            if not (a < beginning and ending < b and (beginning < ending or almost_equals(beginning, ending))):
                raise AttributeError("The inputs should satisfy 'a < beginning <= ending < b' relation")

        if beginning_factor is not None:
            assert beginning_factor > 0
            self.beginning_factor = beginning_factor
        if ending_factor is not None:
            assert ending_factor > 0
            self.ending_factor = ending_factor

        if (beginning, ending) == (None, None):
            beginning, ending = 0, 0
            while not a < beginning < ending < b:
                beginning = random_time(
                    a,
                    b,
                    probability_distribution=t(
                        # df, loc (mean), scale
                        4,
                        a + float(b - a) / self.beginning_factor,
                        float(b - a) / self.beginning_factor
                    )
                )

                ending = random_time(
                    a,
                    b,
                    probability_distribution=t(
                        # df, loc (mean), scale
                        4,
                        b - float(b - a) / self.ending_factor,
                        float(b - a) / self.ending_factor
                    )
                )
        TemporalEvent.__init__(self, uniform(loc=a, scale=UnixTime(beginning - a)),
                               uniform(loc=ending, scale=UnixTime(b - ending)), bins=4)
Example #52
def calculate_mean_confidence_interval_small(series, confidence_interval=0.95):
    mean = series.mean()
    s = math.sqrt(series.var())
    count = series.count()
    rv = t(count - 1)
    z = rv.isf((1 - confidence_interval) / 2)
    delta = round(z * (s / math.sqrt(count)), 2)
    return FloatInterval.closed(mean - delta, mean + delta)
Example #53
def umean(bb, stdlev=0.05):
    '''calculates the uncertainty of the mean using the Student t coefficient
    '''
    from uncertainties import ufloat
    from numpy import mean, std
    from math import sqrt
    from scipy import stats
    return ufloat(mean(bb), std(bb) * stats.t(len(bb)).isf(stdlev) / sqrt(len(bb) - 1))
Example No. 54
def PosteriorParameters(Y):
    """Posterior distribution of the mean of Y under a noninformative
    prior: Student-t with n-1 dof, located at the sample mean, with
    scale s/sqrt(n).
    """
    stdY = np.std(Y, ddof=1)
    meanY = np.mean(Y)
    mu_posterior = stats.t(Y.size - 1, loc=meanY, scale=stdY / np.sqrt(Y.size))
    return mu_posterior
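# Usage sketch: the returned frozen t distribution can be queried directly,
# e.g. for a 95% credible interval on the mean.
import numpy as np

Y = np.random.normal(loc=5.0, scale=2.0, size=50)
print(PosteriorParameters(Y).interval(0.95))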
Example No. 55
    def __init__(self, a, b, beginning=None, ending=None, beginning_factor=None, ending_factor=None):
        """
        start and end can be in either datetime or unix time
        """
        a, b = UnixTime(a), UnixTime(b)
        assert a < b, "'b' should be greater than 'a'"
        if (beginning, ending) != (None, None):
            assert (beginning_factor, ending_factor) == (None, None), "PiecewiseTemporalEvent() only accepts " \
                                                                      "either 'beginning_factor' and 'ending_factor' " \
                                                                      "or 'beginning' and 'ending'"

        if beginning_factor is not None:
            assert beginning_factor > 0
            self.beginning_factor = beginning_factor
        if ending_factor is not None:
            assert ending_factor > 0
            self.ending_factor = ending_factor

        if (beginning, ending) != (None, None):
            beginning = UnixTime(beginning)
            ending = UnixTime(ending)
        else:
            beginning, ending = 0, 0
            while not a < beginning < ending < b:
                beginning = random_time(
                    a,
                    b,
                    probability_distribution=t(
                        # df, loc, scale
                        4,
                        a + float(b - a) / self.beginning_factor,
                        float(b - a) / self.beginning_factor
                    )
                )

                ending = random_time(
                    a,
                    b,
                    probability_distribution=t(
                        # df, loc, scale
                        4,
                        b - float(b - a) / self.ending_factor,
                        float(b - a) / self.ending_factor
                    )
                )
        TemporalEventPiecewiseLinear.__init__(self, [a, beginning, ending, b], [0, 1, 1, 0])
Example No. 56
def show_continuous():
    """Show a variety of continuous distributions"""
        
    x = linspace(-10,10,201)
    
    # Normal distribution
    showDistribution(x, stats.norm, stats.norm(loc=2, scale=4),
                     'Normal Distribution', 'Z', 'P(Z)','')
    
    # Exponential distribution
    showDistribution(x, stats.expon, stats.expon(loc=-2, scale=4),
                     'Exponential Distribution', 'X', 'P(X)','')
    
    # Student's t-distribution
    # ... with 4 and with 10 degrees of freedom (DOF); the normal pdf is
    # plotted first for comparison (axes hold by default in matplotlib)
    plot(x, stats.norm.pdf(x), 'g')
    showDistribution(x, stats.t(4), stats.t(10),
                     'T-Distribution', 'X', 'P(X)', ['normal', 't=4', 't=10'])
    
    # F-distribution
    # ... with (3,4) and (10,15) DOF
    showDistribution(x, stats.f(3,4), stats.f(10,15),
                     'F-Distribution', 'F', 'P(F)',['(3,4) DOF', '(10,15) DOF'])
    
    # Uniform distribution
    showDistribution(x, stats.uniform,'' ,
                     'Uniform Distribution', 'X', 'P(X)','')
    
    # Logistic distribution
    showDistribution(x, stats.norm, stats.logistic,
                     'Logistic Distribution', 'X', 'P(X)',['Normal', 'Logistic'])
    
    # Lognormal distribution
    x = logspace(-9,1,1001)+1e-9
    showDistribution(x, stats.lognorm(2), '',
                     'Lognormal Distribution', 'X', 'lognorm(X)','', xmin=-0.1)
    
    # The log-lin plot has to be done by hand:
    plot(log(x), stats.lognorm.pdf(x,2))
    xlim(-10, 4)
    title('Lognormal Distribution')
    xlabel('log(X)')
    ylabel('lognorm(X)')
    show()
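# `showDistribution` is not defined in this snippet; a plausible minimal
# stand-in (an assumption, not the original helper) that plots the pdf of
# one or two frozen distributions and decorates the axes:
def showDistribution(x, d1, d2, tTxt, xTxt, yTxt, legendTxt, xmin=None):
    plot(x, d1.pdf(x))
    if d2 != '':
        plot(x, d2.pdf(x), 'r')
    if legendTxt != '':
        legend(legendTxt)
    title(tTxt)
    xlabel(xTxt)
    ylabel(yTxt)
    if xmin is not None:
        xlim(left=xmin)
    show()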
Example No. 57
    def infer_1dgaussian(self, init_labels, output_file = None):
        """Perform inference on class labels assuming data are 1-d gaussian,
        without OpenCL acceleration.
        """
        total_time = 0
        a_time = time()

        cluster_labels = init_labels
        if self.record_best: self.auto_save_sample(cluster_labels)
        
        for i in xrange(self.niter):
            # identify existing clusters and generate a new one
            uniq_labels = np.unique(cluster_labels)
            _, _, new_cluster_label = smallest_unused_label(uniq_labels)
            uniq_labels = np.hstack((new_cluster_label, uniq_labels))

            # compute the sufficient statistics of each cluster
            logpost = np.empty((self.N, uniq_labels.shape[0]))
            
            for label_index in xrange(uniq_labels.shape[0]):
                label = uniq_labels[label_index]
                if label == new_cluster_label:
                    n, mu, var = 0, 0, 0
                else:
                    cluster_obs = self.obs[np.where(cluster_labels == label)]
                    n = cluster_obs.shape[0]
                    mu = np.mean(cluster_obs)
                    var = np.var(cluster_obs)

                k_n = self.gaussian_k0 + n
                mu_n  = (self.gaussian_k0 * self.gaussian_mu0 + n * mu) / k_n
                alpha_n = self.gamma_alpha0 + n / 2
                beta_n = self.gamma_beta0 + 0.5 * var * n + \
                    self.gaussian_k0 * n * (mu - self.gaussian_mu0) ** 2 / (2 * k_n)
                Lambda = alpha_n * k_n / (beta_n * (k_n + 1))
            
                t_frozen = t(df = 2 * alpha_n, loc = mu_n, scale = (1 / Lambda) ** 0.5)
                logpost[:,label_index] = t_frozen.logpdf(self.obs[:,0])
                logpost[:,label_index] += np.log(n/(self.N + self.alpha)) if n > 0 else np.log(self.alpha/(self.N+self.alpha))
            
            # sample and implement the changes
            temp_cluster_labels = np.empty(cluster_labels.shape, dtype=np.int32)
            for j in xrange(self.N):
                target_cluster = sample(a = uniq_labels, p = lognormalize(logpost[j]))
                temp_cluster_labels[j] = target_cluster

            if self.record_best:
                if self.auto_save_sample(temp_cluster_labels):
                    cluster_labels = temp_cluster_labels
                if self.no_improvement(500):
                    break                    
            else:
                cluster_labels = temp_cluster_labels
                if i >= self.burnin and i % self.thining == 0:
                    print(*temp_cluster_labels, file = output_file, sep=',')
                
        self.total_time += time() - a_time
        return self.gpu_time, self.total_time, Counter(cluster_labels).most_common()
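# The `t_frozen` line above is the standard Normal-Gamma posterior
# predictive: a Student-t with 2*alpha_n dof, located at mu_n, whose
# precision is Lambda = alpha_n*k_n / (beta_n*(k_n + 1)). A standalone
# sketch (the prior values are illustrative assumptions):
import numpy as np
from scipy.stats import t as student_t

def posterior_predictive(x, mu0=0.0, k0=1.0, alpha0=1.0, beta0=1.0):
    n, mu, var = len(x), np.mean(x), np.var(x)
    k_n = k0 + n
    mu_n = (k0 * mu0 + n * mu) / k_n
    alpha_n = alpha0 + n / 2.0
    beta_n = beta0 + 0.5 * n * var + k0 * n * (mu - mu0) ** 2 / (2 * k_n)
    Lam = alpha_n * k_n / (beta_n * (k_n + 1))
    return student_t(df=2 * alpha_n, loc=mu_n, scale=(1.0 / Lam) ** 0.5)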
Example No. 58
def confidence_interval_mean(std, num, confidence=0.95):
    """ calculates the half-width of the confidence interval of the mean
    given a standard deviation and a number of observations, assuming
    normally distributed data (Student-t critical value) """
    sem = std/np.sqrt(num) # estimate of the standard error of the mean

    # get confidence interval from student-t distribution
    factor = stats.t(num - 1).ppf(0.5 + 0.5*confidence)

    return factor*sem
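# Usage sketch: half-width of a 95% confidence interval for the mean of
# 25 observations with sample standard deviation 2.0.
print(confidence_interval_mean(2.0, 25))  # about 0.83 (= 2.064 * 2.0/sqrt(25))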
Example No. 59
def T(v, tag=None):
    """
    A Student-T random variate
    
    Parameters
    ----------
    v : int
        The degrees of freedom of the distribution (must be greater than one)
    """
    assert isinstance(v, int) and v>1, 'v must be an int greater than 1'
    return uv(rv=ss.t(v), tag=tag)
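# Usage sketch (`uv` and `ss` are assumed to be this module's constructor
# for uncertain variates and its scipy.stats alias, respectively):
noise = T(5, tag='measurement-noise')  # heavy-tailed uncertain variate, 5 dof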