Performs a correlation benchmark on the Global Gut project """ from __future__ import division import numpy as np import pandas as pd from biom import load_table from biom import Table from correlation import zhengr, get_corr_matrix from composition import coverage_replacement, clr from skbio.diversity.alpha import robbins import matplotlib.pyplot as plt meta_file = "../data/gg/GG_100nt.txt" biom_file = "../data/gg/GG_100nt.biom" table = load_table(biom_file) meta_map = pd.read_table(meta_file, index_col=0) mat = np.array(table._get_sparse_data().todense()).T mat = mat.astype(np.int64) cmat = coverage_replacement(mat, uncovered_estimator=robbins) # lovell_mat = get_corr_matrix(mat, zhengr) rlovell_mat = get_corr_matrix(clr(cmat), zhengr) np.savetxt('../results/gg_lovell.txt', rlovell_mat) pearson_mat = get_corr_matrix(mat.T, np.corrcoef) np.savetxt('../results/real_data/gg/gg_pearson.txt', pearson_mat) heatmap = plt.pcolor(data)
y = np.zeros((num_species)) x[0] = 0.05 # fixed proportion of first species x[1:] = (1 - x[0])/(num_species - 1) y[0] = x[0] + diff y[1:] = (1 - y[0])/(num_species - 1) rel_pvals = np.zeros((num_samps)) cov_pvals = np.zeros((num_samps)) sig_species = np.zeros(((num_samps)//2, 2)) for i in range(0, num_samps, 2): samp_table[i, :] = np.random.multinomial(n=C, pvals=x) samp_table[i+1, :] = np.random.multinomial(n=C, pvals=y) cats[i] = 0 cats[i+1] = 1 sig_species[i//2] = samp_table[i:i+2, 0] cov_table = coverage_replacement(samp_table, uncovered_estimator=robbins) if metric != 'ancom': fun = lambda x: metric_func(x[cats == 0], x[cats == 1]) _, cov_pvals = np.apply_along_axis(fun, 0, cov_table) else: _, cov_pvals = metric_func(cov_table, cats) # Calculate effect size for first species effect_sizes[j] = effect_size(sig_species[:, 1], sig_species[:, 0]) # Calculate fdr and power cov_detect = cov_pvals <= alpha cov_miss = cov_pvals > alpha cov_fdr[j] = (cov_detect[1:]).sum() / (cov_detect).sum()
mat = np.array(table._get_sparse_data().todense()).T

# Randomly sample simplex
num_dists = 10000
num_species = 1000
depths = [300, 3000, 30000]

# One row per random draw, one column per sampling depth.
tvd_shape = (num_dists, len(depths))
relative_tvd = np.zeros(tvd_shape)
robbins_tvd = np.zeros(tvd_shape)

for u, depth in enumerate(depths):
    for i in range(num_dists):
        # Random probability vector: closure() applied to -log(uniform).
        uniform_draw = np.random.rand(num_species)
        pvals = closure(-np.log(uniform_draw))
        # pvals = closure(mat[i, :])

        # A single multinomial sample of `depth` counts from that vector.
        samp_table = np.random.multinomial(n=depth, pvals=pvals)
        samp_row = np.atleast_2d(samp_table)
        cx1 = coverage_replacement(samp_row, uncovered_estimator=robbins)

        # Distance of each estimate from the probabilities sampled from.
        relative_tvd[i, u] = variation_distance(closure(samp_table), pvals)
        robbins_tvd[i, u] = variation_distance(cx1, pvals)

# One panel per depth, overlaying the two distance histograms.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4.5))
hist_series = (
    (relative_tvd, 'Relative', 'b'),
    (robbins_tvd, 'Robbins', 'r'),
)
for u, ax in enumerate(axes):
    for hist_vals, hist_label, hist_color in hist_series:
        ax.hist(hist_vals[:, u], bins=20, label=hist_label, alpha=0.5,
                color=hist_color)
    ax.set_title('Depth=%d' % depths[u])
    # Only the first panel gets a y label and only the second an x label.
    if u == 0:
        ax.set_ylabel('Counts')
    elif u == 1:
        ax.set_xlabel('Total Variation Distance')
    ax.locator_params(nbins=4)

# The legend is requested once, after all panels have been drawn.
plt.legend()
fig.savefig('../results/multiple_simplicial_hists.png')
# Wrap the raw table in a biom Table, using integer ranges over its two
# dimensions as the identifiers.
bT = biom.Table(table, range(table.shape[0]), range(table.shape[1]))
biomname = '../data/tables_6_3_2015/bioms/table_1.biom'
txtname = '../data/tables_6_3_2015/txts/table_1.txt'

# Write both serialisations inside context managers so each file is flushed
# and closed as soon as it is written, rather than whenever the anonymous
# file object happens to be garbage-collected.
with open(biomname, 'w') as biom_handle:
    biom_handle.write(bT.to_json('Jamie'))
with open(txtname, 'w') as txt_handle:
    txt_handle.write(bT.to_tsv())


def zheng(x, y):
    """Return the absolute value of ``zhengr(x, y)``."""
    return abs(zhengr(x, y))


#######################################################################
#                 Uniform rarefaction correlation                     #
#######################################################################

# Normalise every row of the table so it sums to one, then draw 2000 counts
# per row from a multinomial that uses the normalised row as probabilities.
pvals = np.apply_along_axis(lambda x: x / x.sum(), axis=1, arr=table)
samp_table = np.apply_along_axis(
    lambda p: np.random.multinomial(n=2000, pvals=p), axis=1, arr=pvals)

# Replacement variants of the sampled table.  Only rrsamp_table is used
# further down in this section; the other two are kept because code outside
# this section may still refer to them.
mrsamp_table = multiplicative_replacement(samp_table)
lrsamp_table = coverage_replacement(samp_table)
rrsamp_table = coverage_replacement(samp_table,
                                    uncovered_estimator=robbins)

# Absolute-valued matrices from each method under comparison.
pearson_corr_mat = abs(np.corrcoef(samp_table.T))
spearman_corr_mat = abs(spearmanr(samp_table)[0])
zheng_corr_mat = get_corr_matrix(samp_table, zheng)
rrzheng_corr_mat = get_corr_matrix(rrsamp_table, zheng)

# Evaluate every matrix against corr_mat (defined outside this section) and
# plot the results, one line style per method.
metric_df = confusion_evaluate(
    corr_mat,
    [pearson_corr_mat, spearman_corr_mat,
     zheng_corr_mat, rrzheng_corr_mat],
    ['Pearson', 'Spearman', 'Lovell', 'Robbins Corrected Lovell'])
roc_fig = plot_roc(metric_df, ['-ob', '-og', '-or', '-om'])
prec_fig = plot_recall(metric_df, ['-ob', '-og', '-or', '-om'])