def main(args):
    """Score every path by sequential co-expression and save the scores.

    Reads the path list, expression matrix and GSM sample list named in
    *args*, restricts the expression matrix to the GSM samples, scores each
    path (in parallel via ``wrapper`` when ``args.n_cores > 1``, otherwise
    sequentially with ``get_seq_corr``), and writes the scores to
    ``args.out_seqcorr_file`` with ``np.savetxt``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler(),
        ])
    logging.info(args)

    paths = utils.read_paths(args.paths_file)
    expr = utils.read_expr(args.expr_file)
    gsm = utils.read_text(args.gsm_file)
    # Restrict the expression matrix to the samples of this GSM set.
    expr = expr[gsm]

    if args.n_cores > 1:
        # NOTE(review): original comment reported this branch as slower
        # "for some reason" than the sequential one.
        seqcorrs = utils.parallel_process(
            wrapper, [(path, expr) for path in paths], n_jobs=args.n_cores)
    else:
        seqcorrs = [get_seq_corr(path, expr) for path in tqdm(paths)]

    utils.create_dir_if_not_exist(dirname(args.out_seqcorr_file))
    np.savetxt(args.out_seqcorr_file, seqcorrs)
def main(args):
    """Map gene symbols from an Excel sheet to Entrez IDs and save the list.

    Symbols are mapped in parallel via ``wrap``; for duplicate Entrez IDs
    only the entry with the smallest P-value is kept.  The resulting gene
    list is written to ``args.out_genes_path``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler(),
        ])
    logging.info(args)

    raw_table = pd.read_excel(args.in_data_path)
    symbol_pval_rows = raw_table[['Gene symbol', 'P-value']].values
    mapped_rows = utils.parallel_process(wrap, symbol_pval_rows,
                                         n_jobs=args.N_cores)
    mapped_rows = [row for row in mapped_rows if row is not None]

    # Flatten and keep only mapped entries (presumably an EntrezID of -1
    # marks a symbol that could not be mapped — verify against ``wrap``).
    records = [pair for row in mapped_rows for pair in row if pair[0] != -1]
    genes = pd.DataFrame(records, columns=['EntrezID', 'P-value'])

    # Sorting by P-value first makes drop_duplicates retain the best hit.
    genes_unique = genes.sort_values('P-value')
    genes_unique.drop_duplicates('EntrezID', inplace=True)
    genes_list = genes_unique.EntrezID.tolist()

    logging.info('Num. of mapped genes: {}'.format(len(genes_list)))
    utils.write_gene_list(args.out_genes_path, genes_list)
logging.StreamHandler()]) logging.info(args) paths = utils.read_paths(args.paths_file) expr = utils.read_expr(args.expr_file) gsm = utils.read_text(args.gsm_file) expr = expr[gsm] def get_abscorr(i, j, corrdata): method = 'pearson' if not args.spearman else 'spearman' corrmatr = corrdata.loc[[i, j]].T.corr(method) return corrmatr.abs().groupby('ENTREZ_GENE_ID').apply(lambda x: x.max()).T.groupby('ENTREZ_GENE_ID').apply(lambda x: x.max()).values[0, 1] def get_seq_corr(path): if not np.all([gene in expr.index.tolist() for gene in path]): return np.nan return np.mean([get_abscorr(path[i], path[i + 1], expr) for i in range(len(path) - 1)]) if args.n_cores > 1: seqcorrs = utils.parallel_process(get_seq_corr, paths, n_jobs=args.n_cores) else: seqcorrs = [] for path in tqdm(paths): seqcorrs.append(get_seq_corr(path, expr)) #seqcorrs = list(map(wrapper, tqdm([(path, expr) for path in paths]))) utils.create_dir_if_not_exist(dirname(args.out_seqcorr_file)) np.savetxt(args.out_seqcorr_file, seqcorrs)
# NOTE(review): chunk begins inside a function body — the enclosing ``def``
# header (presumably get_abscorr(i, j, corrdata) as in the sibling scripts)
# precedes this chunk.
method = 'pearson' if not args.spearman else 'spearman'
corrmatr = corrdata.loc[[i, j]].T.corr(method)
# Collapse duplicate probes per ENTREZ_GENE_ID (max on both axes), then
# return the off-diagonal correlation.
return corrmatr.abs().groupby('ENTREZ_GENE_ID').apply(lambda x: x.max(
)).T.groupby('ENTREZ_GENE_ID').apply(lambda x: x.max()).values[0, 1]


def get_seq_corr(path):
    # Mean |correlation| over consecutive gene pairs of ``path``;
    # NaN when any gene is missing from the expression matrix index.
    if not np.all([gene in expr.index.tolist() for gene in path]):
        return np.nan
    return np.mean([
        get_abscorr(path[i], path[i + 1], expr)
        for i in range(len(path) - 1)
    ])


def get_random_coexpr(i):
    # ``i`` is an unused sample index — it only lets parallel_process map
    # this function over a range.  Picks a random source/destination pair,
    # a random shortest path between them, and scores it.
    src = random.choice(srcnodes)
    dest = random.choice(destnodes)
    path = random.choice(list(nx.all_shortest_paths(net, src, dest)))
    return get_seq_corr(path)


if args.N_cores > 1:
    seqs = utils.parallel_process(get_random_coexpr, range(args.N_samples),
                                  n_jobs=args.N_cores)
else:
    seqs = []
    for i in trange(args.N_samples):
        seqs.append(get_random_coexpr(i))

np.savetxt(args.out_seqcorr_file, seqs)
# NOTE(review): chunk begins mid-script — ``parser`` and earlier arguments
# are defined before this chunk.
parser.add_argument('--N_per_bin', type=int, default=30,
                    help='Minimum number of nodes per degree bin')
parser.add_argument('--alpha', type=int, default=1,
                    help='Alpha parameter of DIAMOnD')
parser.add_argument('--rdmseed', type=int, default=None, help='RNG seed')
args = parser.parse_args()

logging.basicConfig(level=logging.INFO,
                    format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
                    handlers=[logging.FileHandler("../logs/report.log"),
                              logging.StreamHandler()])
logging.info(args)

genes = utils.read_gene_list(args.in_genes_path)
mod = utils.read_gene_list(args.in_module_path)
net = utils.read_network(args.net_path)

# Random gene sets that preserve the degree distribution of the seed genes.
rdmseeds = gen.gen_degree_preserved_sets(genes, net, args.N_samples,
                                         args.N_per_bin,
                                         rdmseed=args.rdmseed)


def wrap(pack):
    # Grow one random seed set with DIAMOnD until it reaches the size of
    # the reference module (N_mod genes in total).
    rdmset, N_mod, net, alpha = pack
    return rdmset + [gene[0] for gene in
                     DIAMOnD.DIAMOnD(net, rdmset, N_mod - len(rdmset),
                                     alpha, progress_bar=False)]


samples = utils.parallel_process(
    wrap, utils.pack_variables(rdmseeds, len(mod), net, args.alpha),
    n_jobs=args.N_cores)
utils.write_genesets_list(args.out_samples_path, samples)
# NOTE(review): chunk begins mid-script — ``flows``, ``net``, ``args`` and
# the ``srcnodes_samples``/``destnodes_samples`` lists are defined before
# this chunk.
# Flow-conserving nodes: flow score and path count above the thresholds.
fcnodes = flows[(flows.FCS >= args.fc_thresh)
                & (flows.N_paths >= args.npath_thresh)].index.tolist()
logging.info('Num of FC nodes: {}'.format(len(fcnodes)))

seqsims = []
for srcnodes_sample, destnodes_sample in zip(srcnodes_samples,
                                             destnodes_samples):
    all_paths = []
    for src_gene in tqdm(srcnodes_sample):

        def get_sps(dest_gene):
            # All shortest paths from the current source to one destination.
            return list(nx.all_shortest_paths(net, src_gene, dest_gene))

        # sum(..., []) flattens the per-destination path lists.
        all_paths = all_paths + sum(
            utils.parallel_process(
                get_sps, destnodes_sample, n_jobs=args.N_cores), [])
    logging.info('Num of all paths: {}'.format(len(all_paths)))

    fc_paths = []
    for i in trange(len(all_paths)):
        fullpath = all_paths[i]
        if len(fullpath) > 2:
            # Interior nodes only — endpoints need not be flow-conserving.
            path = all_paths[i][1:-1]
            if np.all([node in fcnodes for node in path]):
                fc_paths.append(fullpath)
    logging.info('Num of FC paths: {}'.format(len(fc_paths)))

    # NOTE(review): indentation of the two lines below is inferred — the
    # chunk is truncated, so it is unclear whether the GO DAG is (re)loaded
    # inside this loop or once after it; confirm against the full script.
    go = obo_parser.GODag(args.obo_file)
    gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
# NOTE(review): chunk begins inside a GO-similarity function — the enclosing
# ``def`` header, the ``i_go``/``j_go`` term sets and the ``sims`` list are
# set up before this chunk.
for j_go_term in j_go:

    def wrap(i_go_term):
        # Resnik semantic similarity of one GO-term pair.
        return resnik_sim(i_go_term, j_go_term, go, termcounts)

    # Best match of this j-term against all i-terms (None = no similarity).
    simlist = [sim for sim in map(wrap, i_go) if sim is not None]
    if len(simlist):
        sims.append(max(simlist))
return np.mean(sims)


def get_path_sim(path):
    # Mean pairwise GO similarity over consecutive genes of ``path``;
    # NaN when any gene has no GO annotation in ``gene2go``.
    if np.all([gene in gene2go.keys() for gene in path]):
        sims = []
        for i in range(len(path) - 1):
            sims.append(get_sim([path[i], path[i + 1]]))
        return np.mean(sims)
    else:
        return np.nan


if args.n_cores > 1:
    sims = utils.parallel_process(get_path_sim, paths, n_jobs=args.n_cores)
else:
    sims = []
    for path in paths:
        sims.append(get_path_sim(path))

utils.create_dir_if_not_exist(dirname(args.out_sims_file))
np.savetxt(args.out_sims_file, sims)
# NOTE(review): chunk begins mid-script — ``srcnodes``, ``net`` and ``args``
# are defined before this chunk — and is truncated at the end.
destnodes = utils.read_gene_list(args.destnodes_file)
flows = utils.read_flows(args.flows_file)
# Flow-conserving nodes: flow score and path count above the thresholds.
fcnodes = flows[(flows.FCS >= args.fc_thresh)
                & (flows.N_paths >= args.npath_thresh)].index.tolist()
logging.info('Num of FC nodes: {}'.format(len(fcnodes)))

all_paths = []
for src_gene in tqdm(srcnodes):

    def get_sps(dest_gene):
        # All shortest paths from the current source to one destination.
        return list(nx.all_shortest_paths(net, src_gene, dest_gene))

    # sum(..., []) flattens the per-destination path lists.
    all_paths = all_paths + sum(
        utils.parallel_process(get_sps, destnodes, n_jobs=args.N_cores), [])
logging.info('Num of all paths: {}'.format(len(all_paths)))

fc_paths = []
for i in trange(len(all_paths)):
    fullpath = all_paths[i]
    if len(fullpath) > 2:
        # Interior nodes only — endpoints need not be flow-conserving.
        path = all_paths[i][1:-1]
        if np.all([node in fcnodes for node in path]):
            fc_paths.append(fullpath)
logging.info('Num of FC paths: {}'.format(len(fc_paths)))

rdm_paths_A = []
for i in trange(args.N_samples):
    # NOTE(review): loop body continues past the end of this chunk.