Beispiel #1
0
def main(args):
    """Read a gene list, optionally translate and filter it, and write it out.

    Genes are read from ``args.in_genes_path``. If ``args.from_symbols`` is
    truthy they are passed through ``utils.gm.symb2entrez`` (after enabling
    the mapper's agreement check) and every input/output pair is printed;
    otherwise the genes are used as they are. If ``args.net_path`` is not
    ``None``, only genes that are contained in the network read from that
    path and that differ from ``-1`` are kept. The resulting list is written
    to ``args.out_genes_path``.

    Args:
        args: Namespace-like object providing ``in_genes_path``,
            ``from_symbols``, ``net_path`` and ``out_genes_path``.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers)
    logging.info(args)

    genes = utils.read_gene_list(args.in_genes_path)

    if not args.from_symbols:
        # Input is used unchanged (presumably already Entrez IDs -- confirm).
        entrez = genes
    else:
        utils.gm.enable_agreement_check()
        entrez = utils.gm.symb2entrez(genes)

        # Echo the translation table to stdout, one pair per line.
        for symbol, entrez_id in zip(genes, entrez):
            print('{} -> {}'.format(symbol, entrez_id))

        if len(utils.gm.errors) > 0:
            logging.info('Num. of original genes: {}'.format(len(genes)))
            if len(utils.gm.get_failed_queries()) > 0:
                n_failed = len(utils.gm.get_failed_queries())
                logging.error(
                    'Num. of gene names that could not be translated: {}'.
                    format(n_failed))
            print(utils.gm.errors)

    if args.net_path is not None:
        net = utils.read_network(args.net_path)
        in_network = []
        for gene in entrez:
            # -1 is presumably the mapper's marker for a failed translation.
            if gene in net and gene != -1:
                in_network.append(gene)
        entrez = in_network
        logging.info('Num. of mapped genes: {}'.format(len(entrez)))

    utils.write_gene_list(args.out_genes_path, entrez)
def main(args):
    """Pick a module cutoff on a ranked candidate list and write the module.

    For every prefix ``candidates[:k]`` (k = 1..len(candidates)) the value
    returned by ``utils.fisher_overlap_set(prefix, ext, network_nodes)`` is
    stored (presumably a Fisher's exact test p-value for the overlap with
    the external gene set -- confirm against ``utils``). The prefix with the
    smallest value is appended to the seeds to form the module.

    Outputs:
        * ``args.out_pvals_path``: all per-prefix values (``np.savetxt``).
        * ``args.out_module_path``: seeds followed by the chosen prefix.
        * If ``args.plot`` is truthy, a semilog plot of the values is shown.

    Args:
        args: Namespace-like object providing ``in_seed_path``,
            ``in_candidate_path``, ``net_path``, ``in_extgenes_path``,
            ``out_pvals_path``, ``out_module_path`` and ``plot``.

    Raises:
        ValueError: From ``np.argmin`` when the candidate list is empty.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler()
        ])
    logging.info(args)

    seeds = utils.read_gene_list(args.in_seed_path)
    candidates = utils.read_gene_list(args.in_candidate_path)
    net = utils.read_network(args.net_path)

    # Set lookup instead of scanning the seed list once per external gene.
    seed_set = set(seeds)
    ext = [
        gene for gene in utils.read_gene_list(args.in_extgenes_path)
        if gene not in seed_set and gene in net
    ]

    # The node list does not change between iterations; build it only once
    # instead of once per candidate prefix.
    background = list(net.nodes())

    pvals = np.zeros(len(candidates))
    for i in range(len(candidates)):
        pvals[i] = utils.fisher_overlap_set(candidates[:i + 1], ext,
                                            background)

    # Index of the first minimum; the module keeps candidates up to and
    # including that position.
    i_min = np.argmin(pvals)
    module = seeds + candidates[:i_min + 1]

    np.savetxt(args.out_pvals_path, pvals)
    utils.write_gene_list(args.out_module_path, module)

    if args.plot:
        # Imported lazily so matplotlib is only needed when plotting.
        import matplotlib.pyplot as plt
        plt.semilogy(pvals)
        plt.title("Cutoff: {}, size: {}".format(i_min, len(module)))
        plt.show()
Beispiel #3
0
def main(args):
    """Run DIAMOnD on a gene list and write the genes it returns.

    Reads the input genes from ``args.in_genes_path`` and the network from
    ``args.net_path``, calls ``DIAMOnD.DIAMOnD(net, genes, args.N,
    args.alpha)`` and writes the first element of every returned entry to
    ``args.out_diamond_path``.

    Args:
        args: Namespace-like object providing ``in_genes_path``,
            ``net_path``, ``N``, ``alpha`` and ``out_diamond_path``.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers,
    )
    logging.info(args)

    input_genes = utils.read_gene_list(args.in_genes_path)
    network = utils.read_network(args.net_path)

    ranked_entries = DIAMOnD.DIAMOnD(network, input_genes, args.N, args.alpha)

    # Each returned entry is indexable; only its first field is written.
    added_genes = []
    for entry in ranked_entries:
        added_genes.append(entry[0])

    utils.write_gene_list(args.out_diamond_path, added_genes)
Beispiel #4
0
def main(args):
    """Turn an Excel table of gene symbols and P-values into an Entrez list.

    The ``'Gene symbol'`` and ``'P-value'`` columns of the spreadsheet at
    ``args.in_data_path`` are handed row by row to ``wrap`` through
    ``utils.parallel_process`` (``args.N_cores`` jobs). ``None`` results are
    dropped, the remaining results are flattened, and entries whose first
    field equals ``-1`` are discarded. The survivors are sorted by P-value,
    de-duplicated on ``EntrezID`` (first occurrence kept, i.e. the smallest
    P-value) and written to ``args.out_genes_path``.

    Args:
        args: Namespace-like object providing ``in_data_path``, ``N_cores``
            and ``out_genes_path``.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers)
    logging.info(args)

    data_raw = pd.read_excel(args.in_data_path)
    symbol_pvalue_rows = data_raw[['Gene symbol', 'P-value']].values
    results = utils.parallel_process(wrap,
                                     symbol_pvalue_rows,
                                     n_jobs=args.N_cores)
    results = [res for res in results if res is not None]

    # Flatten the per-row results, skipping entries flagged with -1
    # (presumably "no Entrez ID found" -- confirm against ``wrap``).
    records = []
    for res in results:
        for val in res:
            if val[0] != -1:
                records.append(val)
    genes = pd.DataFrame(records, columns=['EntrezID', 'P-value'])

    # Sorting first makes drop_duplicates keep the lowest P-value per gene.
    genes_unique = genes.sort_values('P-value').drop_duplicates('EntrezID')
    genes_list = genes_unique['EntrezID'].tolist()
    logging.info('Num. of mapped genes: {}'.format(len(genes_list)))

    utils.write_gene_list(args.out_genes_path, genes_list)
def main(args):
    """Build a disease module from its seed genes using DIAMOnD.

    Seed genes are the ``geneId`` values of all rows in the tab-separated
    table ``args.in_gda_path`` whose ``diseaseId`` equals
    ``args.disease_id``. DIAMOnD is called with
    ``args.module_size - len(seeds)`` as its third argument, and the seeds
    followed by the first field of every returned entry are written to
    ``args.out_module_file``.

    Args:
        args: Namespace-like object providing ``in_gda_path``,
            ``in_net_path``, ``disease_id``, ``module_size``, ``alpha`` and
            ``out_module_file``.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers)
    logging.info(args)

    gda = pd.read_csv(args.in_gda_path, sep='\t')
    net = utils.read_network(args.in_net_path)

    disease_rows = gda[gda.diseaseId == args.disease_id]
    seeds = disease_rows.geneId.tolist()

    n_seeds = len(seeds)
    n_iterations = args.module_size - n_seeds
    logging.info("Seed size: {}, DIAMOND iterations: {}".format(
        n_seeds, n_iterations))

    diamond_entries = DIAMOnD.DIAMOnD(net, seeds, n_iterations, args.alpha)
    diamond_genes = []
    for entry in diamond_entries:
        diamond_genes.append(entry[0])

    utils.write_gene_list(args.out_module_file, seeds + diamond_genes)
def main(args):
    """Select related and unrelated disease pairs and write their gene lists.

    Steps:
        1. Read gene-disease associations (``args.in_gda_path``), a DOID
           similarity table (``args.in_sims_path``) and a disease table
           indexed by ``diseaseId`` (``args.in_diseases_path``).
        2. For each disease, keep the ``|``-separated DOIDs that occur in
           the similarity table's index.
        3. Disease-disease similarity = maximum (``np.nanmax``) similarity
           over all DOID pairs of the two diseases; ``NaN`` when either
           disease has no usable DOID; 1 on the diagonal.
        4. Disease-disease overlap = number of shared ``geneId`` values.
        5. Related pairs: similarity above the ``args.perc_thresh``
           percentile of the similarities lying strictly between 0 and 1,
           similarity < 1, and overlap < ``args.overlap_thresh``.
           Unrelated pairs: similarity == 0 and overlap < 10, randomly
           down-sampled (seeded with ``args.rdmseed``) to the number of
           related pairs.
        6. Write both pair lists, the similarity matrix, and one
           ``<diseaseId>.entrezlist`` file per selected disease into
           ``args.out_genes_dir``.

    Args:
        args: Namespace-like object providing ``in_gda_path``,
            ``in_sims_path``, ``in_diseases_path``, ``perc_thresh``,
            ``overlap_thresh``, ``rdmseed``, ``out_related_pairs``,
            ``out_unrelated_pairs``, ``out_sims_path`` and
            ``out_genes_dir``.

    Raises:
        ValueError: From ``random.sample`` when there are fewer unrelated
            candidate pairs than related pairs.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler()
        ])
    logging.info(args)

    gda = pd.read_csv(args.in_gda_path, sep='\t')
    # NOTE(review): read without ``index_col``; the index used below for the
    # known DOIDs and the ``.loc`` lookups is whatever pandas infers from the
    # file layout -- confirm the file has an implicit index column.
    doid_sims = pd.read_csv(args.in_sims_path, sep='\t')
    # A set gives constant-time membership tests while filtering DOIDs.
    known_doids = set(doid_sims.index.tolist())
    diseases = pd.read_csv(args.in_diseases_path,
                           sep='\t',
                           index_col='diseaseId')
    diseases['DOID_list'] = diseases.DOID.apply(
        lambda x: [doid for doid in x.split('|') if doid in known_doids]
        if str(x) != 'nan' else [])

    dis_ids = diseases.index.tolist()
    n_dis = len(dis_ids)
    # Row-aligned with ``dis_ids``; fetched once instead of doing a ``.loc``
    # lookup for both diseases of every pair.
    doid_lists = diseases['DOID_list'].tolist()

    dis_sims = np.full([n_dis, n_dis], np.nan)
    np.fill_diagonal(dis_sims, 1)
    for i in trange(n_dis):
        doids_i = doid_lists[i]
        for j in range(i + 1, n_dis):
            doids_j = doid_lists[j]
            curr_sims = [
                doid_sims.loc[doid_i, doid_j] for doid_i in doids_i
                for doid_j in doids_j
            ]
            if len(curr_sims) > 0:
                max_sim = np.nanmax(curr_sims)
                dis_sims[i, j] = max_sim
                dis_sims[j, i] = max_sim
    dis_sims = pd.DataFrame(dis_sims, index=dis_ids, columns=dis_ids)
    sim_values = dis_sims.values
    # Only informative similarities (defined, strictly between 0 and 1)
    # feed the percentile threshold.
    dis_sims_flat = sim_values[~np.isnan(sim_values)
                               & (sim_values > 0) &
                               (sim_values < 1)]

    # One gene set per disease, built once, instead of re-filtering the
    # association table for both members of every pair.
    gene_sets = [
        set(gda[gda.diseaseId == dis_id].geneId.tolist())
        for dis_id in dis_ids
    ]
    dis_overlaps = np.zeros([n_dis, n_dis])
    for i in trange(n_dis):
        genes_i = gene_sets[i]
        for j in range(i, n_dis):
            dis_overlaps[i, j] = len(genes_i & gene_sets[j])
            dis_overlaps[j, i] = dis_overlaps[i, j]

    sim_cutoff = np.percentile(dis_sims_flat, args.perc_thresh)
    related_mask = ((sim_values > sim_cutoff) & (sim_values < 1) &
                    (dis_overlaps < args.overlap_thresh))
    # NOTE(review): fixed overlap cutoff of 10 here, whereas related pairs
    # use ``args.overlap_thresh`` -- confirm this asymmetry is intended.
    unrelated_mask = (sim_values == 0) & (dis_overlaps < 10)

    # ``np.triu`` keeps each unordered pair once (i <= j); the diagonal is
    # already excluded because its similarity is exactly 1.
    dis_pairs_related_idx = np.asarray(np.where(np.triu(related_mask))).T
    dis_pairs_unrelated_idx = np.asarray(np.where(np.triu(unrelated_mask))).T
    dis_pairs_related = [[dis_ids[i], dis_ids[j]]
                         for i, j in dis_pairs_related_idx]
    dis_pairs_unrelated = [[dis_ids[i], dis_ids[j]]
                           for i, j in dis_pairs_unrelated_idx]

    # Balance the two classes: draw as many unrelated pairs as related ones.
    random.seed(args.rdmseed)
    dis_pairs_unrelated = random.sample(dis_pairs_unrelated,
                                        len(dis_pairs_related))
    logging.info(
        "Related disease pairs: {}, unrelated disease pairs: {}".format(
            len(dis_pairs_related), len(dis_pairs_unrelated)))

    pd.DataFrame(dis_pairs_related).to_csv(args.out_related_pairs,
                                           header=False,
                                           sep='\t',
                                           index=False)
    pd.DataFrame(dis_pairs_unrelated).to_csv(args.out_unrelated_pairs,
                                             header=False,
                                             sep='\t',
                                             index=False)
    dis_sims.to_csv(args.out_sims_path, sep='\t')

    # Every disease that appears in at least one selected pair.
    all_dis = {
        dis for pair in dis_pairs_related + dis_pairs_unrelated
        for dis in pair
    }
    for dis in all_dis:
        genes = gda[gda.diseaseId == dis].geneId.unique().tolist()
        utils.write_gene_list(join(args.out_genes_dir, dis + '.entrezlist'),
                              genes)