Example #1
0
"""
Performs a correlation benchmark on the Global Gut project
"""

from __future__ import division
import numpy as np
import pandas as pd
from biom import load_table
from biom import Table
from correlation import zhengr, get_corr_matrix
from composition import coverage_replacement, clr
from skbio.diversity.alpha import robbins
import matplotlib.pyplot as plt

# Inputs: the Global Gut BIOM table and its tab-separated metadata file.
meta_file = "../data/gg/GG_100nt.txt"
biom_file = "../data/gg/GG_100nt.biom"

table = load_table(biom_file)
meta_map = pd.read_table(meta_file, index_col=0)

# Dense, transposed copy of the table's sparse matrix.  `matrix_data` is the
# public accessor for the matrix that the private `_get_sparse_data()`
# helper used to be called for.
mat = table.matrix_data.toarray().T
mat = mat.astype(np.int64)
# presumably fills in zero counts using the Robbins estimate of the unseen
# mass -- confirm against `composition.coverage_replacement`.
cmat = coverage_replacement(mat, uncovered_estimator=robbins)

# Pairwise `zhengr` on the clr-transformed, coverage-corrected matrix.
rlovell_mat = get_corr_matrix(clr(cmat), zhengr)
np.savetxt('../results/gg_lovell.txt', rlovell_mat)
# NOTE(review): this call passes the transpose (`mat.T`) while the call above
# passes the untransposed orientation -- confirm that `get_corr_matrix`
# expects this layout when used with `np.corrcoef`.
pearson_mat = get_corr_matrix(mat.T, np.corrcoef)
np.savetxt('../results/real_data/gg/gg_pearson.txt', pearson_mat)

# NOTE(review): the original plotted an undefined name `data` (NameError).
# The most recently computed matrix is plotted instead -- confirm that the
# Pearson matrix is the one intended for this heatmap.
heatmap = plt.pcolor(pearson_mat)
        # NOTE(review): this is the interior of a definition whose header is
        # not visible here.  x, samp_table, cats, C, diff, alpha, metric,
        # metric_func, effect_sizes, cov_fdr, num_species, num_samps and j
        # all come from the enclosing (unseen) scope.
        y = np.zeros((num_species))
        x[0] = 0.05  # fixed proportion of first species
        # The remaining mass is split evenly over the other species.
        x[1:] = (1 - x[0])/(num_species - 1)
        # y is built like x, but with the first species shifted by `diff`.
        y[0] = x[0] + diff
        y[1:] = (1 - y[0])/(num_species - 1)
        # NOTE(review): rel_pvals is not used anywhere in the visible lines.
        rel_pvals = np.zeros((num_samps))
        # Placeholder only: cov_pvals is rebound by the test result below.
        cov_pvals = np.zeros((num_samps))
        # One row per sample pair; filled inside the loop below.
        sig_species = np.zeros(((num_samps)//2, 2))

        # Samples are drawn in pairs: row i is a multinomial draw of C counts
        # from x (category 0), row i+1 a draw from y (category 1).
        for i in range(0, num_samps, 2):
            samp_table[i, :] = np.random.multinomial(n=C, pvals=x)
            samp_table[i+1, :] = np.random.multinomial(n=C, pvals=y)
            cats[i] = 0
            cats[i+1] = 1
            # First-species counts of this pair: column 0 holds the x draw,
            # column 1 holds the y draw.
            sig_species[i//2] = samp_table[i:i+2, 0]
        cov_table = coverage_replacement(samp_table,
                                         uncovered_estimator=robbins)

        if metric != 'ancom':
            # Apply metric_func to each column of cov_table, split by
            # category.  The lambda's `x` shadows the outer proportion
            # vector only inside the lambda.
            fun = lambda x: metric_func(x[cats == 0], x[cats == 1])
            # presumably the two unpacked parts are (statistic, p-value)
            # per column -- confirm against metric_func.
            _, cov_pvals = np.apply_along_axis(fun, 0, cov_table)
        else:
            # For 'ancom', metric_func is called once with the whole table
            # and the category vector.
            _, cov_pvals = metric_func(cov_table, cats)

        # Effect size for the first species: y-draw counts (column 1)
        # against x-draw counts (column 0).
        effect_sizes[j] = effect_size(sig_species[:, 1],
                                      sig_species[:, 0])

        # False discovery rate: the share of detections (p <= alpha) that
        # fall on species other than the first one.
        # NOTE(review): only the FDR is computed in the visible lines;
        # cov_miss is not used here, and the denominator is 0 when nothing
        # is detected -- confirm how that case is handled.
        cov_detect = cov_pvals <= alpha
        cov_miss = cov_pvals > alpha
        cov_fdr[j] = (cov_detect[1:]).sum() / (cov_detect).sum()
Example #3
0
# Dense, transposed copy of the table's sparse matrix, read through the
# public `matrix_data` accessor rather than the private
# `_get_sparse_data()` helper.
mat = table.matrix_data.toarray().T

# Randomly sample simplex
num_dists = 10000
num_species = 1000
depths = [300, 3000, 30000]
# One column per sequencing depth, one row per simulated distribution.
relative_tvd = np.zeros((num_dists, len(depths)))
robbins_tvd = np.zeros((num_dists, len(depths)))
for u, depth in enumerate(depths):
    for i in range(num_dists):
        # Proportions from closed -log(uniform) draws.  `mat` above is only
        # needed for the real-data alternative, closure(mat[i, :]).
        pvals = closure(-np.log(np.random.rand(num_species)))

        samp_table = np.random.multinomial(n=depth, pvals=pvals)

        # coverage_replacement is given a 2-D input, hence atleast_2d.
        cx1 = coverage_replacement(np.atleast_2d(samp_table),
                                   uncovered_estimator=robbins)
        # Distance from the true proportions for the plain relative
        # abundances and for the Robbins-corrected estimate.
        relative_tvd[i, u] = variation_distance(closure(samp_table), pvals)
        robbins_tvd[i, u] = variation_distance(cx1, pvals)

# One panel per depth.  squeeze=False keeps `axes` indexable even when
# `depths` has a single entry.
fig, axes = plt.subplots(1, len(depths), figsize=(15, 4.5), squeeze=False)
axes = axes[0]
for u, depth in enumerate(depths):
    axes[u].hist(relative_tvd[:, u], 20, label='Relative', alpha=0.5, color='b')
    axes[u].hist(robbins_tvd[:, u], 20, label='Robbins', alpha=0.5, color='r')
    axes[u].set_title('Depth=%d' % depth)
    if u == 0:
        axes[u].set_ylabel('Counts')
    # Shared x label goes on the middle panel (index 1 for three depths).
    if u == len(depths) // 2:
        axes[u].set_xlabel('Total Variation Distance')
    axes[u].locator_params(nbins=4)
plt.legend()
fig.savefig('../results/multiple_simplicial_hists.png')
Example #4
0
# Wrap the raw table in a BIOM table, using the row and column indices as
# the two id lists, and write it out as JSON and as TSV.
bT = biom.Table(table, range(table.shape[0]), range(table.shape[1]))
biomname = '../data/tables_6_3_2015/bioms/table_1.biom'
txtname = '../data/tables_6_3_2015/txts/table_1.txt'
# Context managers make sure both files are flushed and closed; the bare
# open(...).write(...) form left the handles to the garbage collector.
with open(biomname, 'w') as biom_handle:
    biom_handle.write(bT.to_json('Jamie'))
with open(txtname, 'w') as txt_handle:
    txt_handle.write(bT.to_tsv())


def zheng(x, y):
    """Return the absolute value of ``zhengr(x, y)``."""
    return abs(zhengr(x, y))


#######################################################################
#               Uniform rarefaction correlation                       #
#######################################################################
# Normalise every row of `table` to proportions, then draw 2000 counts per
# row from the corresponding multinomial.
pvals = np.apply_along_axis(lambda x: x / x.sum(), axis=1, arr=table)
samp_table = np.apply_along_axis(
    lambda p: np.random.multinomial(n=2000, pvals=p), axis=1, arr=pvals)
# Three replacement variants of the sampled table.
# NOTE(review): mrsamp_table and lrsamp_table are not used in the visible
# lines -- presumably consumed further down the script.
mrsamp_table = multiplicative_replacement(samp_table)
lrsamp_table = coverage_replacement(samp_table)
rrsamp_table = coverage_replacement(samp_table, uncovered_estimator=robbins)

# Absolute correlation matrices between the columns of the sampled table.
pearson_corr_mat = abs(np.corrcoef(samp_table.T))
spearman_corr_mat = abs(spearmanr(samp_table)[0])
zheng_corr_mat = get_corr_matrix(samp_table, zheng)
rrzheng_corr_mat = get_corr_matrix(rrsamp_table, zheng)

# Score each matrix against `corr_mat`, which is defined outside the visible
# lines -- presumably the ground-truth correlation structure.
metric_df = confusion_evaluate(
    corr_mat,
    [pearson_corr_mat, spearman_corr_mat, zheng_corr_mat, rrzheng_corr_mat],
    ['Pearson', 'Spearman', 'Lovell', 'Robbins Corrected Lovell'])

# One line style per method, in the same order as the labels above.
roc_fig = plot_roc(metric_df, ['-ob', '-og', '-or', '-om'])
prec_fig = plot_recall(metric_df, ['-ob', '-og', '-or', '-om'])