def between_correls(args): """TABLES MUST SORT SO THAT SAMPLES ARE IN THE SAME ORDER """ logger = general.Logger("SCNIC_log.txt") logger["SCNIC analysis type"] = "between" # correlation and p-value adjustment methods correl_methods = {'spearman': spearmanr, 'pearson': pearsonr} p_methods = {'bh': general.bh_adjust, 'bon': general.bonferroni_adjust} correl_method = correl_methods[args.correl_method] if args.p_adjust is not None: p_adjust = p_methods[args.p_adjust] else: p_adjust = None # load tables table1 = load_table(args.table1) table2 = load_table(args.table2) logger["input table 1"] = args.table1 logger["input table 1"] = args.table2 table1 = table1.sort() table2 = table2.sort() if not np.array_equal(table1.ids(), table2.ids()): raise ValueError("Tables have different sets of samples present") # make new output directory and change to it if args.output is not None: os.makedirs(args.output) os.chdir(args.output) logger["output directory"] = args.output # filter tables if args.min_sample is not None: table1 = general.filter_table(table1, args.min_sample) metadata = general.get_metadata_from_table(table1) table2 = general.filter_table(table2, args.min_sample) metadata.update(general.get_metadata_from_table(table2)) else: metadata = general.get_metadata_from_table(table1) metadata.update(general.get_metadata_from_table(table2)) # make correlations logger["correlation metric"] = args.correl_method logger["p adjustment method"] = args.p_adjust correls = between_correls_from_tables(table1, table2, correl_method) correls.sort_values(correls.columns[-1], inplace=True) correls.to_csv(open('correls.txt', 'w'), sep='\t', index=False) # adjust p-values correls['p_adj'] = p_adjust(correls['p']) # make network net = general.correls_to_net(correls, metadata=metadata, min_p=args.min_p, min_r=args.min_r) logger["number of nodes"] = net.number_of_nodes() logger["number of edges"] = net.number_of_edges() nx.write_gml(net, 'crossnet.gml') logger.output_log() print '\a'
def module_maker(args):
    logger = general.Logger("SCNIC_module_log.txt")
    logger["SCNIC analysis type"] = "module"

    # read in correlations file
    correls = pd.read_table(args.input, index_col=(0, 1), sep='\t', dtype={'feature1': str, 'feature2': str})
    logger["input correls"] = args.input
    if args.verbose:
        print("correls.txt read")

    # sanity check args
    if args.min_r is not None and args.min_p is not None:
        raise ValueError("arguments min_p and min_r may not be used concurrently")
    if args.min_r is None and args.min_p is None:
        raise ValueError("argument min_p or min_r must be used")

    # read in correlations file and make distance matrix
    if args.min_r is not None:
        min_dist = ma.cor_to_dist(args.min_r)
        logger["minimum r value"] = args.min_r
        cor, labels = ma.correls_to_cor(correls)
        dist = ma.cor_to_dist(cor)
    elif args.min_p is not None:
        # TODO: This
        raise NotImplementedError()
    else:
        raise ValueError("this is prevented above")

    # read in biom table if given
    if args.table is not None:
        table = load_table(args.table)
        logger["input uncollapsed table"] = args.table
        if args.verbose:
            print("otu table read")

    # make new output directory and change to it
    if args.output is not None:
        if not os.path.isdir(args.output):
            os.makedirs(args.output)
        os.chdir(args.output)
    logger["output directory"] = os.getcwd()

    # make modules
    modules = ma.make_modules(dist, min_dist, obs_ids=labels)
    logger["number of modules created"] = len(modules)
    if args.verbose:
        print("Modules Formed")
        print("number of modules: %s" % len(modules))
        print("number of observations in modules: %s" % np.sum([len(i) for i in modules]))
        print("")
    ma.write_modules_to_file(modules)

    # collapse modules
    if args.table is not None:
        coll_table = ma.collapse_modules(table, modules)
        ma.write_modules_to_dir(table, modules)
        logger["number of observations in output table"] = coll_table.shape[0]
        if args.verbose:
            print("Table Collapsed")
            print("collapsed Table Observations: " + str(coll_table.shape[0]))
            print("")
        with biom_open('collapsed.biom', 'w') as f:
            coll_table.to_hdf5(f, 'make_modules.py')

    # make network
    if args.table is not None:
        metadata = general.get_metadata_from_table(table)
    else:
        metadata = defaultdict(dict)
    metadata = ma.add_modules_to_metadata(modules, metadata)
    correls_filter = general.filter_correls(correls, conet=True, min_p=args.min_p, min_r=args.min_r)
    net = general.correls_to_net(correls_filter, metadata=metadata)
    nx.write_gml(net, 'correlation_network.gml')
    if args.verbose:
        print("Network Generated")
        print("number of nodes: %s" % str(net.number_of_nodes()))
        print("number of edges: %s" % str(net.number_of_edges()))
    logger["number of nodes"] = net.number_of_nodes()
    logger["number of edges"] = net.number_of_edges()

    logger.output_log()

def between_correls(args): """TABLES MUST SORT SO THAT SAMPLES ARE IN THE SAME ORDER """ logger = general.Logger("SCNIC_log.txt") logger["SCNIC analysis type"] = "between" # correlation and p-value adjustment methods correl_methods = {'spearman': spearmanr, 'pearson': pearsonr} correl_method = correl_methods[args.correl_method] # load tables table1 = load_table(args.table1) table2 = load_table(args.table2) logger["input table 1"] = args.table1 logger["input table 1"] = args.table2 table1 = table1.sort() table2 = table2.sort() # make new output directory and change to it if args.force and args.output is not None: shutil.rmtree(args.output, ignore_errors=True) if args.output is not None: os.makedirs(args.output) os.chdir(args.output) logger["output directory"] = args.output # filter tables if args.sparcc_filter is True: table1 = general.sparcc_paper_filter(table1) table2 = general.sparcc_paper_filter(table2) print("Table 1 filtered: %s observations" % str(table1.shape[0])) print("Table 2 filtered: %s observations" % str(table2.shape[0])) logger["sparcc paper filter"] = True logger["number of observations present in table 1 after filter"] = table1.shape[0] logger["number of observations present in table 2 after filter"] = table2.shape[0] if args.min_sample is not None: table1 = general.filter_table(table1, args.min_sample) table2 = general.filter_table(table2, args.min_sample) if not np.array_equal(table1.ids(), table2.ids()): raise ValueError("Tables have different sets of samples present") metadata = general.get_metadata_from_table(table1) metadata.update(general.get_metadata_from_table(table2)) # make correlations logger["correlation metric"] = args.correl_method logger["p adjustment method"] = args.p_adjust correls = ca.between_correls_from_tables(table1, table2, correl_method, nprocs=args.procs) correls.sort_values(correls.columns[-1], inplace=True) correls['p_adj'] = general.p_adjust(correls['p']) correls.to_csv(open('correls.txt', 'w'), sep='\t', index=True) # make network correls_filt = general.filter_correls(correls, min_p=args.min_p, min_r=args.min_r) net = general.correls_to_net(correls_filt, metadata=metadata) logger["number of nodes"] = net.number_of_nodes() logger["number of edges"] = net.number_of_edges() nx.write_gml(net, 'crossnet.gml') logger.output_log()
def within_correls(input_loc, output_loc, correl_method='sparcc', sparcc_filter=False, min_sample=None, procs=1,
                   sparcc_p=1000, p_adjust='fdr_bh', verbose=False):
    logger = general.Logger(path.join(output_loc, "SCNIC_within_log.txt"))
    logger["SCNIC analysis type"] = "within"

    # correlation and p-value adjustment methods
    correl_methods = {'spearman': spearmanr, 'pearson': pearsonr, 'kendall': kendalltau, 'sparcc': 'sparcc'}
    correl_method = correl_methods[correl_method.lower()]

    # get features to be correlated
    table = load_table(input_loc)
    logger["input table"] = input_loc
    if verbose:
        print("Table loaded: " + str(table.shape[0]) + " observations")
        print("")
    logger["number of samples in input table"] = table.shape[1]
    logger["number of observations in input table"] = table.shape[0]

    # make new output directory
    if output_loc is not None:
        if not path.isdir(output_loc):
            os.makedirs(output_loc)
    logger["output directory"] = path.abspath(output_loc)

    # filter
    if sparcc_filter is True:
        table_filt = general.sparcc_paper_filter(table)
        if verbose:
            print("Table filtered: %s observations" % str(table_filt.shape[0]))
            print("")
        logger["sparcc paper filter"] = True
        logger["number of observations present after filter"] = table_filt.shape[0]
    elif min_sample is not None:
        table_filt = general.filter_table(table, min_sample)
        if verbose:
            print("Table filtered: %s observations" % str(table_filt.shape[0]))
            print("")
        logger["min samples present"] = min_sample
        logger["number of observations present after filter"] = table_filt.shape[0]
    else:
        table_filt = table

    logger["number of processors used"] = procs

    # correlate features
    if correl_method in [spearmanr, pearsonr, kendalltau]:
        # calculate correlations
        if verbose:
            print("Correlating with %s" % correl_method)
        # correlate feature
        correls = ca.calculate_correlations(table_filt, correl_method, nprocs=procs, p_adjust_method=p_adjust)
    elif correl_method == 'sparcc':
        if sparcc_p is None:
            correls = ca.fastspar_correlation(table_filt, verbose=verbose, nprocs=procs)
        else:
            correls = ca.fastspar_correlation(table_filt, calc_pvalues=True, bootstraps=sparcc_p,
                                              verbose=verbose, nprocs=procs, p_adjust_method=p_adjust)
    else:
        raise ValueError("How did this even happen?")
    logger["distance metric used"] = correl_method
    if verbose:
        print("Features Correlated")
        print("")

    correls.to_csv(path.join(output_loc, 'correls.txt'), sep='\t', index_label=('feature1', 'feature2'))
    if verbose:
        print("Correls.txt written")

    # make correlation network
    metadata = general.get_metadata_from_table(table_filt)
    net = general.correls_to_net(correls, metadata=metadata)
    nx.write_gml(net, path.join(output_loc, 'correlation_network.gml'))
    if verbose:
        print("Network made")
        print("")

    logger.output_log()

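# A minimal usage sketch (not part of SCNIC); the paths are hypothetical. It
# runs the within-table workflow above with Spearman correlations and writes
# correls.txt and correlation_network.gml into the output directory.
def _example_within_correls():
    within_correls('table.biom', 'within_output', correl_method='spearman',
                   min_sample=3, procs=1, p_adjust='fdr_bh', verbose=True)
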
def within_correls(args):
    logger = general.Logger("SCNIC_within_log.txt")
    logger["SCNIC analysis type"] = "within"

    # correlation and p-value adjustment methods
    correl_methods = {'spearman': spearmanr, 'pearson': pearsonr, 'kendall': kendalltau, 'sparcc': 'sparcc'}
    correl_method = correl_methods[args.correl_method.lower()]

    # get features to be correlated
    table = load_table(args.input)
    logger["input table"] = args.input
    if args.verbose:
        print("Table loaded: " + str(table.shape[0]) + " observations")
        print("")
    logger["number of samples in input table"] = table.shape[1]
    logger["number of observations in input table"] = table.shape[0]

    # make new output directory and change to it
    if args.output is not None:
        if not os.path.isdir(args.output):
            os.makedirs(args.output)
        os.chdir(args.output)
    logger["output directory"] = os.getcwd()

    # filter
    if args.sparcc_filter is True:
        table_filt = general.sparcc_paper_filter(table)
        if args.verbose:
            print("Table filtered: %s observations" % str(table_filt.shape[0]))
            print("")
        logger["sparcc paper filter"] = True
        logger["number of observations present after filter"] = table_filt.shape[0]
    elif args.min_sample is not None:
        table_filt = general.filter_table(table, args.min_sample)
        if args.verbose:
            print("Table filtered: %s observations" % str(table_filt.shape[0]))
            print("")
        logger["min samples present"] = args.min_sample
        logger["number of observations present after filter"] = table_filt.shape[0]
    else:
        table_filt = table

    logger["number of processors used"] = args.procs

    # correlate features
    if correl_method in [spearmanr, pearsonr, kendalltau]:
        # calculate correlations
        if args.verbose:
            print("Correlating with %s" % args.correl_method)
        # correlate feature
        correls = ca.calculate_correlations(table_filt, correl_method)
    elif correl_method == 'sparcc':
        correls = ca.fastspar_correlation(table_filt, verbose=args.verbose)
        if args.sparcc_p is not None:
            raise NotImplementedError()  # TODO: reimplement with fastspar
    else:
        raise ValueError("How did this even happen?")
    logger["distance metric used"] = args.correl_method
    if args.verbose:
        print("Features Correlated")
        print("")

    if 'p' in correls.columns:
        correls['p_adj'] = general.p_adjust(correls['p'])
    correls.to_csv('correls.txt', sep='\t', index_label=('feature1', 'feature2'))
    if args.verbose:
        print("Correls.txt written")

    # make correlation network
    metadata = general.get_metadata_from_table(table_filt)
    net = general.correls_to_net(correls, metadata=metadata)
    nx.write_gml(net, 'correlation_network.gml')
    if args.verbose:
        print("Network made")
        print("")

    logger.output_log()

def module_maker(input_loc, output_loc, min_p=None, min_r=None, method='naive', k_size=3, gamma=.4, table_loc=None,
                 prefix='module', verbose=False):
    logger = general.Logger(path.join(output_loc, "SCNIC_module_log.txt"))
    logger["SCNIC analysis type"] = "module"

    # read in correlations file
    correls = pd.read_csv(input_loc, index_col=(0, 1), sep='\t')
    correls.index = pd.MultiIndex.from_tuples([(str(id1), str(id2)) for id1, id2 in correls.index])
    logger["input correls"] = input_loc
    if verbose:
        print("correls.txt read")

    # sanity check args
    if min_r is not None and min_p is not None:
        raise ValueError("arguments min_p and min_r may not be used concurrently")
    if min_r is None and min_p is None:
        raise ValueError("argument min_p or min_r must be used")

    # make new output directory and change to it
    if output_loc is not None:
        if not path.isdir(output_loc):
            os.makedirs(output_loc)
    logger["output directory"] = path.abspath(output_loc)

    # make modules
    if method == 'naive':
        modules = ma.make_modules_naive(correls, min_r, min_p, prefix=prefix)
    elif method == 'k_cliques':
        modules = ma.make_modules_k_cliques(correls, min_r, min_p, k_size, prefix=prefix)
    elif method == 'louvain':
        modules = ma.make_modules_louvain(correls, min_r, min_p, gamma, prefix=prefix)
    else:
        raise ValueError('%s is not a valid module picking method' % method)
    logger["number of modules created"] = len(modules)
    if verbose:
        print("Modules Formed")
        print("number of modules: %s" % len(modules))
        print("number of observations in modules: %s" % np.sum([len(i) for i in modules]))
        print("")
    ma.write_modules_to_file(modules, path_str=path.join(output_loc, 'modules.txt'))

    # collapse modules
    if table_loc is not None:
        table = load_table(table_loc)
        logger["input uncollapsed table"] = table_loc
        if verbose:
            print("otu table read")
        coll_table = ma.collapse_modules(table, modules)
        # ma.write_modules_to_dir(table, modules)
        logger["number of observations in output table"] = coll_table.shape[0]
        if verbose:
            print("Table Collapsed")
            print("collapsed Table Observations: " + str(coll_table.shape[0]))
            print("")
        with biom_open(path.join(output_loc, 'collapsed.biom'), 'w') as f:
            coll_table.to_hdf5(f, 'make_modules.py')
        metadata = general.get_metadata_from_table(table)
    else:
        metadata = defaultdict(dict)

    # make network
    metadata = ma.add_modules_to_metadata(modules, metadata)
    correls_filter = general.filter_correls(correls, conet=True, min_p=min_p, min_r=min_r)
    net = general.correls_to_net(correls_filter, metadata=metadata)
    nx.write_gml(net, path.join(output_loc, 'correlation_network.gml'))
    if verbose:
        print("Network Generated")
        print("number of nodes: %s" % str(net.number_of_nodes()))
        print("number of edges: %s" % str(net.number_of_edges()))
    logger["number of nodes"] = net.number_of_nodes()
    logger["number of edges"] = net.number_of_edges()

    logger.output_log()

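# A minimal usage sketch (not part of SCNIC); paths and the r threshold are
# hypothetical, and it assumes correls.txt was written by within_correls above.
# Modules are picked with the naive method and the input table is collapsed.
def _example_module_maker():
    module_maker('within_output/correls.txt', 'module_output', min_r=0.35, method='naive',
                 table_loc='table.biom', prefix='module', verbose=True)
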