Example no. 1
def single_mahalanobis(sim_data,
                       real_data,
                       features=FEATURE_COL,
                       klass=KLASS_COL):
    """Classify single variant using Mahalanobis distance"""
    assert real_data.shape[0] == 1, "Real data should have just one variant"
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        x = sim_data[features]
        y = sim_data[klass]
        real_x = real_data[features]

        def mahal_score(data):
            if data.shape[0] < 2:
                # Insufficient data for calculation
                return float("inf")
            robust_cov = MinCovDet(assume_centered=False).fit(data)
            return robust_cov.mahalanobis(real_x)[0]

        score = x.groupby(y).apply(mahal_score)
        assert len(score) == 3, "Missing classes in Mahalanobis distance calculation"
        pred = score.idxmin()
        with np.errstate(under="ignore"):
            prob = chi2.logsf(score, len(features))
            prob = np.exp(prob - logsumexp(prob))

        return (pred, prob, score)
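The closing lines of Example no. 1 turn the per-class squared Mahalanobis distances into normalized class probabilities in log space. A minimal standalone sketch of just that step; the distances and the feature count of 4 below are made-up illustration values:

import numpy as np
from scipy.stats import chi2
from scipy.special import logsumexp

score = np.array([3.2, 11.7, 25.4])        # hypothetical squared distances, one per class
log_sf = chi2.logsf(score, df=4)           # log tail probabilities under a chi-square(4)
prob = np.exp(log_sf - logsumexp(log_sf))  # normalize in log space to avoid underflow
print(prob, prob.sum())                    # class probabilities summing to 1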
Example no. 2
def outarray_effect(est, ses, freqs, vy):
    N_effective = vy / (2 * freqs * (1 - freqs) * np.power(ses, 2))
    Z = est / ses
    P = -log10(np.exp(1)) * chi2.logsf(np.power(Z, 2), 1)
    array_out = np.column_stack((N_effective, est, ses, Z, P))
    array_out = np.round(array_out, decimals=6)
    array_out[:, 0] = np.round(array_out[:, 0], 0)
    return array_out
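In Example no. 2, the P column is -log10 of the two-sided p-value for Z, computed through the chi-square(1) survival function of Z**2. A quick numeric check of that identity, using an illustrative Z value that is not from the original code:

import numpy as np
from scipy.stats import chi2, norm

Z = 3.0                                   # illustrative Z score
neglog10_p = -np.log10(np.e) * chi2.logsf(Z**2, 1)
p_direct = 2 * norm.sf(abs(Z))            # two-sided normal p-value
print(neglog10_p, -np.log10(p_direct))    # both approximately 2.569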
Example no. 3
def ErrorRate(n, l, q, g, sigma, t):
    div_t = 0
    avg_t = 0.
    for i in range(-2**(t - 1), 2**(t - 1) + 1):  # xrange in the original (Python 2)
        div_t += (1. * i - 0)**2
    div_t /= 2.**t + 1
    #print t, div_t
    s = sqrt(n * l * sigma**2 * (sigma**2 + div_t) + n * l * sigma**4 +
             sigma**2)
    dis = (q - 1.) / 2 - sqrt(2.) * (q * 1. / g + 1.)
    #print dis, s
    pr = chi2.logsf((dis / s)**2, 8.) / log(2) + log(n / 16., 2)
    return pr
Example no. 4
def LikRatio_test(psi, psi_null, AD, DP, GT_prob, theta, log=False):
    """Likelihood ratio test for psi vector in a null hypothesis.
    Please use the same AD, DP, and GT_prob as the fit() function.

    Parameters
    ----------
    psi: numpy.array (n_donor, )
        The fractional abundance of each donor in the mixture for alternative
        hypothesis
    psi_null: numpy.array (n_donor, )
        The psi vector in a null hypothesis
    AD: numpy.array, (n_variant, ), int
        The count vector for alternative allele in all variants
    DP: numpy.array (n_variant, ), int
        The count vector for depths in all variants (i.e., two alleles)
    GT_prob: numpy.array, (n_variants, n_donor, n_GT)
        The probability tensor for each genotype in each donor
    theta: numpy.array (n_GT, )
        The alternative allele rate in each genotype category
    log: bool
        Whether to return the p-value on a logarithm scale

    Return
    ------
    statistic: float
        The calculated chi2-statistic.
    pvalue: float
        The single-tailed p-value.
    """
    from scipy.stats import chi2

    BD = DP - AD
    theta_vct_alt = np.dot(np.dot(GT_prob, theta), psi)
    logLik_alt = np.sum(AD * np.log(theta_vct_alt) +
                        BD * np.log(1 - theta_vct_alt))

    theta_vct_null = np.dot(np.dot(GT_prob, theta), psi_null)
    logLik_null = np.sum(AD * np.log(theta_vct_null) +
                         BD * np.log(1 - theta_vct_null))

    LR = 2 * (logLik_alt - logLik_null)
    df = len(psi_null) - 1
    if log:
        pval = chi2.logsf(LR, df)
    else:
        pval = chi2.sf(LR, df)

    return LR, pval
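A minimal usage sketch for LikRatio_test as defined above. The array shapes follow the docstring, but every value is an arbitrary illustration rather than data from the original project:

import numpy as np

theta = np.array([0.01, 0.5, 0.99])          # ALT allele rate per genotype (illustrative)
GT_prob = np.array([[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]],
                    [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]],
                    [[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]]])  # (3 variants, 2 donors, 3 genotypes)
AD = np.array([5, 40, 30])                   # ALT allele counts per variant
DP = np.array([60, 60, 60])                  # depths per variant
psi = np.array([0.7, 0.3])                   # alternative mixture fractions
psi_null = np.array([0.5, 0.5])              # null mixture fractions

LR, pval = LikRatio_test(psi, psi_null, AD, DP, GT_prob, theta)
print(LR, pval)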
Example no. 5
def LR_test(LR, df=1, is_log=False):
    """Likelihood ratio test

    Args:
        LR (np.array): log-scale likelihood ratio between the alternative and
            null models, namely the difference in log-likelihoods
        df (int): degrees of freedom of the chi-square distribution, namely the
            number of additional parameters in the alternative model
        is_log (bool): whether to return the p-value at log scale

    Returns:
        np.array: p value or log(p value) for single-sided test
    """
    if is_log:
        return chi2.logsf(2 * LR, df)
    else:
        return chi2.sf(2 * LR, df)
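A short usage sketch for LR_test as defined above; it assumes chi2 has been imported at module level (as the snippet itself does), and the LR values are illustrative:

import numpy as np
from scipy.stats import chi2

LR = np.array([0.5, 2.0, 8.0])          # log-likelihood differences, alternative minus null
print(LR_test(LR, df=1))                # equivalent to chi2.sf(2 * LR, 1)
print(LR_test(LR, df=1, is_log=True))   # same test, p-values on the log scale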
Example no. 6
def neglog10pval(x, df):
    return -np.log10(np.e) * chi2.logsf(x, df)
Example no. 7
def neglog10pval(x, df):
    return -np.log10(np.e) * chi2.logsf(x, df)
Example no. 8
def p_value(x):
    k = 4
    v = chi2.logsf(-2*x, k)
    return v    
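Example no. 8 reads like Fisher's method for combining two p-values on the log scale (k = 4 corresponds to 2 p-values), under the assumption, mine rather than the source's, that x is the sum of the natural-log p-values. A hedged check with illustrative inputs:

import numpy as np
from scipy.stats import chi2            # p_value above expects chi2 at module level

p1, p2 = 0.05, 0.05                     # illustrative p-values to combine
x = np.log(p1) + np.log(p2)             # assumed meaning of the argument x
log_combined = p_value(x)               # chi2.logsf(-2 * x, 4)
print(np.exp(log_combined))             # about 0.017, Fisher's combined p-value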
Example no. 9
def dm2_to_prob(score, df=len(MAHAL_FEATURES)):
    with np.errstate(under="ignore"):
        prob = chi2.logsf(score, df)
        return np.exp(prob - logsumexp(prob))
Example no. 10
    # Brute MC
    n_samples = int(1e6)
    #ts_vals = [very_simple_ts(norm.rvs(loc=0, scale=1, size=30)) for i in range(n_samples)]
    #ts_vals_threebin = [three_bin_ts(norm.rvs(loc=0, scale=1, size=30)) for i in tqdm(range(n_samples))]
    b = brute_low_memory(very_simple_ts, very_simple_transform, 30, ts_vals_range, n=n_samples)
    b_3b = brute_low_memory(three_bin_ts, very_simple_transform, 30, ts_vals_range, n=n_samples)

    # Polychord
    res1_1b, res2_1b = pc(very_simple_ts, very_simple_transform, n_dim=30, observed=observed, n_live=100, file_root="pc_1bin", do_clustering=False, feedback=2, resume=False, ev_data=True)
    res1_3b, res2_3b = pc(three_bin_ts, very_simple_transform, n_dim=30, observed=observed, n_live=100, file_root="pc_3bin", do_clustering=False, feedback=2, resume=False, ev_data=True)

    res, test_statistic, log_x, log_x_delta = analyse_pch(root="pc_1bin")
    res_3b, test_statistic_3b, log_x_3b, log_x_3b_delta = analyse_pch(root="pc_3bin")

    # analytic in this case
    log10_local_p = chi2.logsf(ts_vals_range, df=1) / np.log(10)
    log10_global_p = np.log10(30.) + log10_local_p

    plt.plot(ts_vals_range, log10_global_p, c='red', ls='--', label="Theory")
    plt.plot(ts_vals_range, np.log10(b), c='grey', ls='--', label="Brute MC")
    plt.plot(test_statistic, np.log10(np.exp(log_x)), c='b', label="Polychord")
    plt.xlim([0,observed])
    plt.xlabel('TS')
    plt.ylabel(r'$\log_{10}(p)$')
    plt.legend(title='1-bin example')
    plt.savefig("simple_ts_onebin.pdf")
    plt.show()

    plt.plot(ts_vals_range, np.log10(b_3b), c='grey', ls='--', label="Brute MC")
    plt.plot(test_statistic_3b, np.log10(np.exp(log_x_3b)), c='b', label="Polychord")
    plt.xlim([0,observed])
Example no. 11
# - output: string "source computer \t pvalue \t mid_pvalue" obtained using Edgington, Fisher, Pearson, \
#           George, Stouffer and Tippett methods

old_key = ""
for line in sys.stdin:
    ## Obtain the edge and the pvalues
    key, pvals = line.strip().split("\t")
    if key != old_key:
        if old_key != "":
            ## Edgington p-value (normal approximation - extremely good even for n=4)
            edgington_pval = norm.logcdf(
                sqrt(12.0 / n) * (sum_pvals_edg - .5 * n))
            mid_edgington_pval = norm.logcdf(
                sqrt(12.0 / n) * (sum_mid_pvals_edg - .5 * n))
            ## Fisher p-value
            fisher_pval = chi2.logsf(-2 * sum_pvals_fisher, 2 * n)
            mid_fisher_pval = chi2.logsf(-2 * sum_mid_pvals_fisher, 2 * n)
            ## Pearson p-value *** CHANGE OF SIGN wrt Biometrika paper ***
            pearson_pval = chi2.logcdf(2 * sum_pvals_pearson, 2 * n)
            mid_pearson_pval = chi2.logcdf(2 * sum_mid_pvals_pearson, 2 * n)
            ## Mudholkar and George p-value (scaled Student's t approximation)
            george_pval = t.logcdf(
                sqrt(3.0 / n * (5.0 * n + 4.0) / (5.0 * n + 2.0)) / pi *
                (sum_pvals_pearson + sum_pvals_fisher), 5 * n + 4)
            mid_george_pval = t.logcdf(
                sqrt(3.0 / n * (5.0 * n + 4.0) / (5.0 * n + 2.0)) / pi *
                (sum_mid_pvals_fisher + sum_mid_pvals_pearson), 5 * n + 4)
            ## Stouffer's p-value
            stouffer_pval = norm.logcdf(sum_pvals_stouffer / sqrt(n))
            mid_stouffer_pval = norm.logcdf(sum_mid_pvals_stouffer / sqrt(n))
            ## Tippett's p-value