def main(args):
    """Score every path with its sequential correlation and save the scores.

    Logging is sent both to ``../logs/report.log`` and to the console. The
    expression table read from ``args.expr_file`` is indexed with the entries
    read from ``args.gsm_file``, each path from ``args.paths_file`` is scored,
    and the scores are written to ``args.out_seqcorr_file`` with
    ``np.savetxt`` (the output directory is created first).

    Args:
        args: Parsed command-line namespace. Reads ``paths_file``,
            ``expr_file``, ``gsm_file``, ``n_cores`` and
            ``out_seqcorr_file``.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers)
    logging.info(args)

    paths = utils.read_paths(args.paths_file)
    expr = utils.read_expr(args.expr_file)
    gsm = utils.read_text(args.gsm_file)
    expr = expr[gsm]

    if args.n_cores <= 1:
        # Serial scoring with a progress bar.
        seqcorrs = [get_seq_corr(path, expr) for path in tqdm(paths)]
    else:
        # NOTE: the original author observed this branch to be slower than
        # the serial one. Presumably shipping ``expr`` with every job is the
        # cost -- TODO confirm by profiling.
        jobs = [(path, expr) for path in paths]
        seqcorrs = utils.parallel_process(wrapper, jobs, n_jobs=args.n_cores)

    utils.create_dir_if_not_exist(dirname(args.out_seqcorr_file))
    np.savetxt(args.out_seqcorr_file, seqcorrs)
Esempio n. 2
0
def main(args):
    """Map the gene symbols of a spreadsheet to Entrez IDs and save them.

    The ``Gene symbol`` and ``P-value`` columns of the Excel file at
    ``args.in_data_path`` are passed row by row to ``wrap`` through
    ``utils.parallel_process``. ``None`` results are ignored, as are entries
    whose first element is ``-1`` (presumably the marker for an unmapped
    symbol -- confirm against ``wrap``). For each Entrez ID only the entry
    with the smallest P-value is kept, and the resulting ID list is written
    to ``args.out_genes_path``.

    Args:
        args: Parsed command-line namespace. Reads ``in_data_path``,
            ``N_cores`` and ``out_genes_path``.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler(),
        ])
    logging.info(args)

    table = pd.read_excel(args.in_data_path)
    symbol_pvalue_pairs = table[['Gene symbol', 'P-value']].values
    mapped = utils.parallel_process(
        wrap, symbol_pvalue_pairs, n_jobs=args.N_cores)

    # Flatten the per-row results, skipping failed rows and -1 entries.
    records = []
    for row in mapped:
        if row is None:
            continue
        for val in row:
            if val[0] != -1:
                records.append(val)

    genes = pd.DataFrame(records, columns=['EntrezID', 'P-value'])
    # Ascending sort + keep-first leaves the lowest P-value per Entrez ID.
    genes_unique = genes.sort_values('P-value').drop_duplicates('EntrezID')
    genes_list = genes_unique.EntrezID.tolist()
    logging.info('Num. of mapped genes: {}'.format(len(genes_list)))

    utils.write_gene_list(args.out_genes_path, genes_list)
                                  logging.StreamHandler()])
# Record the parsed command-line arguments in the log.
logging.info(args)

# Load the paths to score and the expression table. These module-level names
# are read by the functions defined below.
paths = utils.read_paths(args.paths_file)
expr = utils.read_expr(args.expr_file)
# Index the expression table with the entries read from the GSM file
# (presumably sample identifiers selecting columns -- TODO confirm).
gsm = utils.read_text(args.gsm_file)
expr = expr[gsm]

def get_abscorr(i, j, corrdata):
    """Return the strongest absolute correlation between genes *i* and *j*.

    The rows of ``corrdata`` labelled ``i`` or ``j`` are correlated pairwise
    (Pearson, or Spearman when the module-level ``args.spearman`` is set).
    The absolute correlations are then reduced with ``max`` over every group
    of rows, and afterwards every group of columns, that share an
    ``ENTREZ_GENE_ID`` label.

    Args:
        i: Index label of the first gene in ``corrdata``.
        j: Index label of the second gene in ``corrdata``.
        corrdata: DataFrame whose index is named ``ENTREZ_GENE_ID``. A label
            may presumably occur on several rows (e.g. several probes per
            gene) -- confirm against the expression loader.

    Returns:
        Element ``[0, 1]`` of the collapsed matrix, i.e. the entry linking
        the two distinct gene labels.
    """
    method = 'spearman' if args.spearman else 'pearson'
    abs_corr = corrdata.loc[[i, j]].T.corr(method).abs()

    def strongest(block):
        # Column-wise maximum of one group of rows.
        return block.max()

    by_row_gene = abs_corr.groupby('ENTREZ_GENE_ID').apply(strongest)
    by_gene_pair = by_row_gene.T.groupby('ENTREZ_GENE_ID').apply(strongest)
    return by_gene_pair.values[0, 1]

def get_seq_corr(path):
    """Return the mean absolute co-expression of consecutive genes on a path.

    Args:
        path: Sequence of gene identifiers, in path order.

    Returns:
        The mean of ``get_abscorr`` over every pair of consecutive genes,
        computed on the module-level ``expr`` table. ``np.nan`` is returned
        when any gene is missing from ``expr.index`` or when the path has
        fewer than two genes (there is no pair to score).
    """
    # Test membership against the index itself rather than rebuilding
    # ``expr.index.tolist()`` for every gene on the path.
    known_genes = expr.index
    if len(path) < 2 or not all(gene in known_genes for gene in path):
        return np.nan
    return np.mean([
        get_abscorr(src, dst, expr) for src, dst in zip(path[:-1], path[1:])
    ])

# Score every path, through utils.parallel_process when more than one core
# was requested and serially (with a progress bar) otherwise.
if args.n_cores > 1:
    seqcorrs = utils.parallel_process(get_seq_corr, paths, n_jobs=args.n_cores)
else:
    # get_seq_corr takes only the path and reads the module-level ``expr``;
    # passing ``expr`` as a second positional argument raised TypeError.
    seqcorrs = [get_seq_corr(path) for path in tqdm(paths)]
# Make sure the output directory exists before writing one score per line.
utils.create_dir_if_not_exist(dirname(args.out_seqcorr_file))
np.savetxt(args.out_seqcorr_file, seqcorrs)



Esempio n. 4
0
    method = 'pearson' if not args.spearman else 'spearman'
    corrmatr = corrdata.loc[[i, j]].T.corr(method)
    return corrmatr.abs().groupby('ENTREZ_GENE_ID').apply(lambda x: x.max(
    )).T.groupby('ENTREZ_GENE_ID').apply(lambda x: x.max()).values[0, 1]


def get_seq_corr(path):
    """Return the mean absolute co-expression of consecutive genes on a path.

    Args:
        path: Sequence of gene identifiers, in path order.

    Returns:
        The mean of ``get_abscorr`` over every pair of consecutive genes,
        computed on the module-level ``expr`` table. ``np.nan`` is returned
        when any gene is missing from ``expr.index`` or when the path has
        fewer than two genes (there is no pair to score).
    """
    # Test membership against the index itself rather than rebuilding
    # ``expr.index.tolist()`` for every gene on the path.
    known_genes = expr.index
    if len(path) < 2 or not all(gene in known_genes for gene in path):
        return np.nan
    return np.mean([
        get_abscorr(src, dst, expr) for src, dst in zip(path[:-1], path[1:])
    ])


def get_random_coexpr(i):
    """Score one randomly drawn shortest path between random end points.

    A source is drawn from the module-level ``srcnodes`` and a destination
    from ``destnodes``; one of the shortest paths between them in ``net`` is
    then picked at random and scored with ``get_seq_corr``.

    Args:
        i: Sample index. Unused; it only lets the function be mapped over a
            range of sample numbers.

    Returns:
        The value of ``get_seq_corr`` for the sampled path.
    """
    # Tuple elements are evaluated left to right, so the source is drawn
    # before the destination.
    endpoints = (random.choice(srcnodes), random.choice(destnodes))
    candidate_paths = list(nx.all_shortest_paths(net, *endpoints))
    return get_seq_corr(random.choice(candidate_paths))


# Draw args.N_samples random path scores: serially with a progress bar for a
# single core, otherwise through utils.parallel_process.
if args.N_cores <= 1:
    seqs = [get_random_coexpr(i) for i in trange(args.N_samples)]
else:
    seqs = utils.parallel_process(
        get_random_coexpr, range(args.N_samples), n_jobs=args.N_cores)

# One score per line.
np.savetxt(args.out_seqcorr_file, seqs)
parser.add_argument(
    '--N_per_bin',
    type=int,
    default=30,
    help='Minimum number of nodes per degree bin',
)
parser.add_argument(
    '--alpha',
    type=int,
    default=1,
    help='Alpha parameter of DIAMOnD',
)
parser.add_argument(
    '--rdmseed',
    type=int,
    default=None,
    help='RNG seed',
)

args = parser.parse_args()

# Log to ../logs/report.log and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
    handlers=[
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ],
)
logging.info(args)

# Inputs: the seed gene list, the module gene list and the network.
genes = utils.read_gene_list(args.in_genes_path)
mod = utils.read_gene_list(args.in_module_path)
net = utils.read_network(args.net_path)

# args.N_samples random gene sets generated from the seed genes and the
# network by gen.gen_degree_preserved_sets.
rdmseeds = gen.gen_degree_preserved_sets(
    genes,
    net,
    args.N_samples,
    args.N_per_bin,
    rdmseed=args.rdmseed,
)

def wrap(pack):
    """Extend one random seed set to the module size with DIAMOnD.

    Args:
        pack: Tuple ``(rdmset, N_mod, net, alpha)``: the seed gene list, the
            target module size, the network and the DIAMOnD alpha value.

    Returns:
        A new list holding the seed genes followed by the first element of
        every item returned by ``DIAMOnD.DIAMOnD``.
    """
    seed_genes, target_size, network, weight = pack
    n_to_add = target_size - len(seed_genes)
    added = DIAMOnD.DIAMOnD(
        network, seed_genes, n_to_add, weight, progress_bar=False)
    return seed_genes + [node[0] for node in added]


# Run ``wrap`` on every random seed set, each packed together with the module
# size, the network and alpha.
samples = utils.parallel_process(
    wrap,
    utils.pack_variables(rdmseeds, len(mod), net, args.alpha),
    n_jobs=args.N_cores,
)

utils.write_genesets_list(args.out_samples_path, samples)





# FC nodes: rows of ``flows`` that pass both the FCS and the N_paths threshold.
fcnodes = flows[(flows.FCS >= args.fc_thresh)
                & (flows.N_paths >= args.npath_thresh)].index.tolist()
# Set copy so the per-node membership tests below are O(1) instead of a scan
# of the whole list.
fcnode_set = set(fcnodes)

logging.info('Num of FC nodes: {}'.format(len(fcnodes)))

seqsims = []
for srcnodes_sample, destnodes_sample in zip(srcnodes_samples,
                                             destnodes_samples):
    # All shortest paths from every source to every destination of the sample.
    all_paths = []
    for src_gene in tqdm(srcnodes_sample):

        def get_sps(dest_gene):
            # Closes over the current ``src_gene``; only used in this iteration.
            return list(nx.all_shortest_paths(net, src_gene, dest_gene))

        # Extend in place: ``all_paths + sum(lists, [])`` re-copied the
        # accumulated list on every step.
        for dest_paths in utils.parallel_process(
                get_sps, destnodes_sample, n_jobs=args.N_cores):
            all_paths.extend(dest_paths)

    logging.info('Num of all paths: {}'.format(len(all_paths)))

    # Keep paths with at least one intermediate node whose intermediate
    # nodes (end points excluded) are all FC nodes.
    fc_paths = []
    for fullpath in tqdm(all_paths):
        if len(fullpath) > 2 and all(
                node in fcnode_set for node in fullpath[1:-1]):
            fc_paths.append(fullpath)

    logging.info('Num of FC paths: {}'.format(len(fc_paths)))

    go = obo_parser.GODag(args.obo_file)
    gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
    for j_go_term in j_go:

        def wrap(i_go_term):
            return resnik_sim(i_go_term, j_go_term, go, termcounts)

        simlist = [sim for sim in map(wrap, i_go) if sim is not None]
        if len(simlist):
            sims.append(max(simlist))
    return np.mean(sims)


def get_path_sim(path):
    """Return the mean similarity of consecutive genes along a path.

    Args:
        path: Sequence of gene identifiers, in path order.

    Returns:
        The mean of ``get_sim`` over every pair of consecutive genes, or
        ``np.nan`` when at least one gene is not a key of the module-level
        ``gene2go`` mapping.
    """
    annotated = gene2go.keys()
    if not all(gene in annotated for gene in path):
        return np.nan
    pair_sims = [
        get_sim([left, right]) for left, right in zip(path[:-1], path[1:])
    ]
    return np.mean(pair_sims)


# Score every path: serially for a single core, otherwise through
# utils.parallel_process.
if args.n_cores <= 1:
    sims = [get_path_sim(path) for path in paths]
else:
    sims = utils.parallel_process(get_path_sim, paths, n_jobs=args.n_cores)

# Make sure the output directory exists, then write one similarity per line.
utils.create_dir_if_not_exist(dirname(args.out_sims_file))
np.savetxt(args.out_sims_file, sims)
Esempio n. 8
0
destnodes = utils.read_gene_list(args.destnodes_file)

# FC nodes: rows of ``flows`` that pass both the FCS and the N_paths threshold.
flows = utils.read_flows(args.flows_file)
fcnodes = flows[(flows.FCS >= args.fc_thresh)
                & (flows.N_paths >= args.npath_thresh)].index.tolist()
# Set copy so the per-node membership tests below are O(1) instead of a scan
# of the whole list.
fcnode_set = set(fcnodes)

logging.info('Num of FC nodes: {}'.format(len(fcnodes)))

# All shortest paths from every source node to every destination node.
all_paths = []
for src_gene in tqdm(srcnodes):

    def get_sps(dest_gene):
        # Closes over the current ``src_gene``; only used in this iteration.
        return list(nx.all_shortest_paths(net, src_gene, dest_gene))

    # Extend in place: ``all_paths + sum(lists, [])`` re-copied the
    # accumulated list on every step.
    for dest_paths in utils.parallel_process(
            get_sps, destnodes, n_jobs=args.N_cores):
        all_paths.extend(dest_paths)

logging.info('Num of all paths: {}'.format(len(all_paths)))

# Keep paths with at least one intermediate node whose intermediate nodes
# (end points excluded) are all FC nodes.
fc_paths = []
for fullpath in tqdm(all_paths):
    if len(fullpath) > 2 and all(
            node in fcnode_set for node in fullpath[1:-1]):
        fc_paths.append(fullpath)

logging.info('Num of FC paths: {}'.format(len(fc_paths)))

rdm_paths_A = []
for i in trange(args.N_samples):