import multiprocessing
import os
import shutil
import subprocess
import tempfile
import warnings
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from glob import glob
from os import path

import networkx as nx
import numpy as np
import pandas as pd
from biom import Table, load_table
from numpy.testing import assert_allclose
from scipy.stats import spearmanr, pearsonr, kendalltau

# Helpers such as run_fastspar, df_to_correls, p_adjust, calculate_correlation,
# pairwise_iter_wo_metadata, and the general/ca modules are defined elsewhere
# in the package and are assumed to be importable here.


def fastspar_correlation(table: Table, verbose: bool = False, calc_pvalues=False,
                         bootstraps=1000, nprocs=1, p_adjust_method='fdr_bh') -> pd.DataFrame:
    with tempfile.TemporaryDirectory(prefix='fastspar') as temp:
        # .sparse.to_dense() replaces the removed DataFrame.to_dense(); this fixes
        # AttributeError: 'DataFrame' object has no attribute 'to_dense'. See:
        # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sparse.to_dense.html
        table.to_dataframe().sparse.to_dense().to_csv(
            path.join(temp, 'otu_table.tsv'), sep='\t', index_label='#OTU ID')
        if verbose:
            stdout = None
        else:
            stdout = subprocess.DEVNULL
        run_fastspar(path.join(temp, 'otu_table.tsv'), path.join(temp, 'correl_table.tsv'),
                     path.join(temp, 'covar_table.tsv'), stdout, nprocs)
        cor = pd.read_csv(path.join(temp, 'correl_table.tsv'), sep='\t', index_col=0)
        correls = df_to_correls(cor)
        if calc_pvalues:
            # generate bootstrapped count tables
            subprocess.run(['fastspar_bootstrap',
                            '-c', path.join(temp, 'otu_table.tsv'),
                            '-n', str(bootstraps),
                            '-p', path.join(temp, 'boot'),
                            '-t', str(nprocs)],
                           stdout=stdout)
            # infer correlations for each bootstrapped count table using all requested processes
            with ThreadPoolExecutor(max_workers=nprocs) as executor:
                for i in glob(path.join(temp, 'boot*')):
                    executor.submit(run_fastspar, i, i.replace('boot', 'cor_boot'),
                                    i.replace('boot', 'cov_boot'))
            # calculate p-values for the correlation table from the bootstrap correlations
            subprocess.run(['fastspar_pvalues',
                            '-c', path.join(temp, 'otu_table.tsv'),
                            '-r', path.join(temp, 'correl_table.tsv'),
                            '-p', path.join(temp, 'cor_boot'),
                            '-t', str(nprocs),
                            '-n', str(bootstraps),
                            '-o', path.join(temp, 'pvalues.tsv')],
                           stdout=stdout)
            pvals = pd.read_csv(path.join(temp, 'pvalues.tsv'), sep='\t', index_col=0)
            pvals = df_to_correls(pvals, col_label='p')
            correls = pd.concat([correls, pvals], axis=1, join='inner')
            correls['p_adjusted'] = p_adjust(correls.p, p_adjust_method)
        correls.index = pd.MultiIndex.from_tuples([sorted(i) for i in correls.index])
        return correls
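# Usage sketch for fastspar_correlation (illustrative only, not part of the
# original module): the function shells out to the fastspar binaries, so they
# must be on PATH. The counts and IDs below are made up for demonstration.
def _example_fastspar_usage():
    data = np.array([[10, 0, 3, 1], [5, 2, 0, 9], [0, 8, 7, 2], [1, 4, 2, 6]])
    table = Table(data, observation_ids=['otu1', 'otu2', 'otu3', 'otu4'],
                  sample_ids=['s1', 's2', 's3', 's4'])
    correls = fastspar_correlation(table, calc_pvalues=True, bootstraps=100, nprocs=2)
    # index is a sorted (feature, feature) MultiIndex; columns include r, p, p_adjusted
    print(correls.head())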
def calculate_correlations(table: Table, corr_method=spearmanr,
                           p_adjustment_method: str = 'fdr_bh') -> pd.DataFrame:
    # serial implementation; a multiprocessing variant follows below
    # TODO: multiprocess this
    index = list()
    data = list()
    for (val_i, id_i, _), (val_j, id_j, _) in table.iter_pairwise(axis='observation'):
        r, p = corr_method(val_i, val_j)
        index.append((id_i, id_j))
        data.append((r, p))
    correls = pd.DataFrame(data, index=index, columns=['r', 'p'])
    correls.index = pd.MultiIndex.from_tuples(correls.index)  # turn tuple index into an actual MultiIndex
    if p_adjustment_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p, method=p_adjustment_method)
    return correls
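# Usage sketch for the serial calculate_correlations (illustrative; counts and
# IDs are made up). Each row of the result is one feature pair with its r and p:
def _example_calculate_correlations():
    data = np.array([[10, 0, 3, 1], [5, 2, 0, 9], [0, 8, 7, 2]])
    table = Table(data, observation_ids=['otu1', 'otu2', 'otu3'],
                  sample_ids=['s1', 's2', 's3', 's4'])
    correls = calculate_correlations(table, corr_method=spearmanr)
    print(correls)  # MultiIndex of (otu_i, otu_j) pairs with columns r, p, p_adjusted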
def calculate_correlations(table: Table, corr_method=spearmanr, p_adjust_method: str = 'fdr_bh',
                           nprocs=1) -> pd.DataFrame:
    # multiprocessing variant of the serial implementation above
    if nprocs > multiprocessing.cpu_count():
        warnings.warn("nprocs greater than CPU count, using all available CPUs")
        nprocs = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(nprocs)
    cor = partial(calculate_correlation, corr_method=corr_method)
    results = pool.map(cor, pairwise_iter_wo_metadata(table.iter_pairwise(axis='observation')))
    index = [i[0] for i in results]
    data = [i[1] for i in results]
    pool.close()
    pool.join()
    correls = pd.DataFrame(data, index=index, columns=['r', 'p'])
    # turn the tuple index into an actual MultiIndex, now guaranteeing that each pair is sorted
    correls.index = pd.MultiIndex.from_tuples([sorted(i) for i in correls.index])
    if p_adjust_method is not None:
        correls['p_adjusted'] = p_adjust(correls.p, method=p_adjust_method)
    return correls
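# The multiprocessing variant above relies on two module-level helpers that are
# not shown in this excerpt. A minimal sketch of what they plausibly look like,
# reconstructed from how they are called (the bodies are assumptions):
def pairwise_iter_wo_metadata(pairwise_iter):
    # strip the biom metadata element from each pair so the items pickle cheaply
    # when handed to worker processes
    for (val_i, id_i, _), (val_j, id_j, _) in pairwise_iter:
        yield (val_i, id_i), (val_j, id_j)


def calculate_correlation(pair, corr_method=spearmanr):
    # compute one correlation; returns ((id_i, id_j), (r, p)) to match how
    # results are unpacked into `index` and `data` above
    (val_i, id_i), (val_j, id_j) = pair
    r, p = corr_method(val_i, val_j)
    return (id_i, id_j), (r, p)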
def test_bh_adjust(unadj_ps):
    adj_ps = np.array([.03, .075, .5])
    bh_ps = p_adjust(unadj_ps, 'fdr_bh')
    assert isinstance(bh_ps, np.ndarray)
    assert_allclose(adj_ps, bh_ps)
def test_bonferroni_adjust(unadj_ps):
    adj_ps = np.array([.03, .15, 1])
    bon_ps = p_adjust(unadj_ps, method='b')
    assert isinstance(bon_ps, np.ndarray)
    assert_allclose(adj_ps, bon_ps)
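# Both tests above take a pytest fixture `unadj_ps` that is not shown in this
# excerpt. Working backwards from the expected values (BH: [.03, .075, .5];
# Bonferroni: [.03, .15, 1]), the fixture is consistent with raw p-values of
# .01, .05, and .5. A sketch under that assumption:
import pytest


@pytest.fixture()
def unadj_ps():
    return np.array([.01, .05, .5])


# p_adjust itself is defined elsewhere in the package. Given the method strings
# used here ('fdr_bh' and 'b'), it is plausibly a thin wrapper around
# statsmodels' multipletests; a minimal sketch under that assumption:
def p_adjust_sketch(pvalues, method='fdr_bh'):
    from statsmodels.stats.multitest import multipletests
    # multipletests returns (reject, pvals_corrected, alphacSidak, alphacBonf);
    # only the corrected p-values are used by the callers in this excerpt
    return multipletests(np.asarray(pvalues), method=method)[1]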
def between_correls(args):
    """Tables must be sorted so that samples are in the same order."""
    logger = general.Logger("SCNIC_log.txt")
    logger["SCNIC analysis type"] = "between"

    # correlation and p-value adjustment methods
    correl_methods = {'spearman': spearmanr, 'pearson': pearsonr}
    correl_method = correl_methods[args.correl_method]

    # load tables
    table1 = load_table(args.table1)
    table2 = load_table(args.table2)
    logger["input table 1"] = args.table1
    logger["input table 2"] = args.table2
    table1 = table1.sort()
    table2 = table2.sort()

    # make new output directory and change to it
    if args.force and args.output is not None:
        shutil.rmtree(args.output, ignore_errors=True)
    if args.output is not None:
        os.makedirs(args.output)
        os.chdir(args.output)
        logger["output directory"] = args.output

    # filter tables
    if args.sparcc_filter is True:
        table1 = general.sparcc_paper_filter(table1)
        table2 = general.sparcc_paper_filter(table2)
        print("Table 1 filtered: %s observations" % str(table1.shape[0]))
        print("Table 2 filtered: %s observations" % str(table2.shape[0]))
        logger["sparcc paper filter"] = True
        logger["number of observations present in table 1 after filter"] = table1.shape[0]
        logger["number of observations present in table 2 after filter"] = table2.shape[0]
    if args.min_sample is not None:
        table1 = general.filter_table(table1, args.min_sample)
        table2 = general.filter_table(table2, args.min_sample)

    if not np.array_equal(table1.ids(), table2.ids()):
        raise ValueError("Tables have different sets of samples present")

    metadata = general.get_metadata_from_table(table1)
    metadata.update(general.get_metadata_from_table(table2))

    # make correlations
    logger["correlation metric"] = args.correl_method
    logger["p adjustment method"] = args.p_adjust
    correls = ca.between_correls_from_tables(table1, table2, correl_method, nprocs=args.procs)
    correls.sort_values(correls.columns[-1], inplace=True)
    correls['p_adj'] = general.p_adjust(correls['p'])
    correls.to_csv(open('correls.txt', 'w'), sep='\t', index=True)

    # make network
    correls_filt = general.filter_correls(correls, min_p=args.min_p, min_r=args.min_r)
    net = general.correls_to_net(correls_filt, metadata=metadata)
    logger["number of nodes"] = net.number_of_nodes()
    logger["number of edges"] = net.number_of_edges()
    nx.write_gml(net, 'crossnet.gml')
    logger.output_log()
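# Hypothetical invocation sketch for between_correls: it takes an argparse-style
# namespace. The attribute names below come from the function body; the paths
# and values are made up for illustration.
from types import SimpleNamespace

example_between_args = SimpleNamespace(
    table1='table1.biom', table2='table2.biom', output='between_output', force=False,
    correl_method='spearman', p_adjust='fdr_bh', sparcc_filter=False,
    min_sample=None, min_p=None, min_r=0.35, procs=1)
# between_correls(example_between_args)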
def within_correls(args):
    logger = general.Logger("SCNIC_within_log.txt")
    logger["SCNIC analysis type"] = "within"

    # correlation and p-value adjustment methods
    correl_methods = {'spearman': spearmanr, 'pearson': pearsonr, 'kendall': kendalltau, 'sparcc': 'sparcc'}
    correl_method = correl_methods[args.correl_method.lower()]

    # get features to be correlated
    table = load_table(args.input)
    logger["input table"] = args.input
    if args.verbose:
        print("Table loaded: " + str(table.shape[0]) + " observations")
        print("")
    logger["number of samples in input table"] = table.shape[1]
    logger["number of observations in input table"] = table.shape[0]

    # make new output directory and change to it
    if args.output is not None:
        if not os.path.isdir(args.output):
            os.makedirs(args.output)
        os.chdir(args.output)
    logger["output directory"] = os.getcwd()

    # filter
    if args.sparcc_filter is True:
        table_filt = general.sparcc_paper_filter(table)
        if args.verbose:
            print("Table filtered: %s observations" % str(table_filt.shape[0]))
            print("")
        logger["sparcc paper filter"] = True
        logger["number of observations present after filter"] = table_filt.shape[0]
    elif args.min_sample is not None:
        table_filt = general.filter_table(table, args.min_sample)
        if args.verbose:
            print("Table filtered: %s observations" % str(table_filt.shape[0]))
            print("")
        logger["min samples present"] = args.min_sample
        logger["number of observations present after filter"] = table_filt.shape[0]
    else:
        table_filt = table

    logger["number of processors used"] = args.procs

    # correlate features
    if correl_method in [spearmanr, pearsonr, kendalltau]:
        if args.verbose:
            print("Correlating with %s" % args.correl_method)
        correls = ca.calculate_correlations(table_filt, correl_method)
    elif correl_method == 'sparcc':
        correls = ca.fastspar_correlation(table_filt, verbose=args.verbose)
        if args.sparcc_p is not None:
            raise NotImplementedError()  # TODO: reimplement with fastspar
    else:
        raise ValueError("How did this even happen?")
    logger["distance metric used"] = args.correl_method
    if args.verbose:
        print("Features Correlated")
        print("")

    if 'p' in correls.columns:
        correls['p_adj'] = general.p_adjust(correls['p'])
    correls.to_csv('correls.txt', sep='\t', index_label=('feature1', 'feature2'))
    if args.verbose:
        print("Correls.txt written")

    # make correlation network
    metadata = general.get_metadata_from_table(table_filt)
    net = general.correls_to_net(correls, metadata=metadata)
    nx.write_gml(net, 'correlation_network.gml')
    if args.verbose:
        print("Network made")
        print("")

    logger.output_log()
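# within_correls takes the same argparse-style namespace pattern. A minimal
# sketch of a sparcc run (attribute names come from the body above; the values
# and file path are made up; SimpleNamespace is imported above):
example_within_args = SimpleNamespace(
    input='otu_table.biom', output='within_output', verbose=True,
    correl_method='sparcc', sparcc_filter=True, min_sample=None,
    sparcc_p=None, procs=1)
# within_correls(example_within_args)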