Esempio n. 1
0
def plotGO(clusterIDs, clusters, outdir, base):
    """Plot one GO graph per cluster and GO category.

    For every cluster id and every category found under
    ``clusters[id]['go']['terms']`` a PNG named
    ``{outdir}/{base}_{id}_{category}.png`` is written with ``plot_gos``.

    When ``plot_gos`` raises ``KeyError`` (presumably for a GO id that is not
    present in the loaded ontology -- TODO confirm), the id named in the error
    is dropped from the list and the plot is retried.

    :param clusterIDs: iterable of keys into ``clusters``.
    :param clusters: mapping of cluster id to a dict holding
        ``['go']['terms']``, itself a mapping of category to GO ids.
    :param outdir: directory prefix for the written PNG files.
    :param base: file-name prefix for the written PNG files.
    :raises KeyError: if ``plot_gos`` reports a key that is not one of the
        ids being plotted (retrying could never succeed).
    """
    obodag = GODag("../../obo/go.obo")

    for cluster_id in clusterIDs:
        go_terms = clusters[cluster_id]['go']['terms']

        for category, ids in go_terms.items():
            # Work on a copy so the caller's data is not modified while
            # unknown ids are pruned.
            remaining = list(ids)
            fout = "{}/{}_{}_{}.png".format(outdir, base, cluster_id, category)

            while True:
                try:
                    plot_gos(fout, remaining, obodag)
                    break
                except KeyError as err:
                    missing = str(err).replace("'", '')
                    if missing not in remaining:
                        # Not one of our ids: removing nothing and retrying
                        # would loop forever, so surface the error.
                        raise
                    # The original called .remove() on the category dict,
                    # which has no such method; prune the id list instead.
                    remaining.remove(missing)
Esempio n. 2
0
def test_example(log=sys.stdout):
    """Run Gene Ontology Enrichment Analysis (GOEA) on Nature data.

    Runs the enrichment for the study gene list, writes the results to
    xlsx/txt files and then produces several sets of GO plots (by GO-name
    word, by the GO terms highlighted in the paper, one per significant
    term, and for hand-curated GO lists). Finally, the significant study
    genes related to the cell cycle are written to a text file.

    :param log: stream passed on to the report helpers (default: stdout).
    """
    # ------------------------------------------------------------------
    # Gene Ontology Enrichment Analysis (GOEA)
    # ------------------------------------------------------------------
    taxid = 10090  # Mouse study
    population_ids = GeneID2nt_mus.keys()
    id2symbol = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    goeaobj = get_goeaobj("fdr_bh", population_ids, taxid)

    results_all = goeaobj.run_study(id2symbol)
    # Keep only results whose Benjamini-Hochberg corrected p-value is < 0.05
    results_sig = [res for res in results_all if res.p_fdr_bh < 0.05]
    compare_results(results_all)
    sig_geneids = get_study_items(results_sig)

    # Write the GOEA results to files
    goeaobj.wr_xlsx("nbt3102.xlsx", results_sig)
    goeaobj.wr_txt("nbt3102_sig.txt", results_sig)
    goeaobj.wr_txt("nbt3102_all.txt", results_all)
    # Disabled: plots of all significant GO terms (large plots)
    #   plot_results("nbt3102_{NS}.png", results_sig)
    #   plot_results("nbt3102_{NS}_sym.png", results_sig, study_items=5,
    #                items_p_line=2, id2symbol=id2symbol)

    # ------------------------------------------------------------------
    # Further examination of GOEA results...
    # ------------------------------------------------------------------
    godag = goeaobj.obo_dag
    dpi = 150  # For review: figures can be saved in .jpg, .gif, .tif or .eps, at 150 dpi

    # ------------------------------------------------------------------
    # Item 1) Words in GO names associated with large numbers of study genes
    # ------------------------------------------------------------------
    prt_word2genecnt("nbt3102_genecnt_GOword.txt", results_sig, log)
    # Curated selection of GO words associated with large numbers of study genes
    freq_seen = [
        'RNA', 'translation', 'mitochondr',
        'ribosom',  # covers 'ribosomal' and 'ribosome'
        'adhesion', 'endoplasmic', 'nucleotide', 'apoptotic', 'myelin',
    ]
    # GO ids whose names contain one of the chosen words, grouped by namespace
    word2NS2gos = get_word2NS2gos(freq_seen, results_sig)
    go2res = {res.GO: res for res in results_sig}
    prt_word_GO_genes("nbt3102_GO_word_genes.txt", word2NS2gos, go2res, id2symbol, log)
    # One plot per (word, namespace); "RNA" is additionally saved as tif and eps
    for word, ns2gos in word2NS2gos.items():
        formats = ['png', 'tif', 'eps'] if word == "RNA" else ['png']
        for namespace in ('BP', 'MF', 'CC'):
            if namespace not in ns2gos:
                continue
            goid2goobj = {goid: go2res[goid].goterm for goid in ns2gos[namespace]}
            # dpi: 150 for review, 1200 for publication (only 150 is enabled)
            for word_dpi in (150,):
                for fmt in formats:
                    fout_img = "nbt3102_{WORD}_{NS}_dpi{DPI}.{FMT}".format(
                        WORD=word, NS=namespace, DPI=word_dpi, FMT=fmt)
                    plot_goid2goobj(
                        fout_img,
                        goid2goobj,                # source GOs and their GOTerm object
                        items_p_line=3,
                        study_items=6,             # max gene symbols printed per GO term
                        id2symbol=id2symbol,       # GeneID-to-Symbol
                        goea_results=results_all,  # pvals used for GO Term coloring
                        dpi=word_dpi)

    # ------------------------------------------------------------------
    # Item 2) Explore findings of Nature paper:
    #
    #     Gene Ontology (GO) enrichment analysis showed that the
    #     differentially expressed genes contained statistically
    #     significant enrichments of genes involved in
    #         glycolysis,
    #         cellular response to IL-4 stimulation and
    #         positive regulation of B-cell proliferation
    # ------------------------------------------------------------------
    paper_goids = [
        'GO:0006096',  # BP 4.24e-12 10 glycolytic process
        'GO:0071353',  # BP 7.45e-06  5 cellular response to interleukin-4
        'GO:0030890',  # BP 8.22e-07  7 positive regulation of B cell proliferation
    ]
    # Same source GO ids, plotted with increasing amounts of annotation
    paper_plots = (
        ("nbt3102_GOs.png", {}),
        ("nbt3102_GOs_genecnt.png", {'goea_results': results_all}),
        ("nbt3102_GOs_genelst.png",
         {'study_items': True, 'goea_results': results_all}),
        ("nbt3102_GOs_symlst.png",
         {'study_items': True, 'id2symbol': id2symbol, 'goea_results': results_all}),
        ("nbt3102_GOs_symlst_trunc.png",
         {'study_items': 5, 'id2symbol': id2symbol, 'goea_results': results_all}),
    )
    for fout_png, plot_opts in paper_plots:
        plot_gos(fout_png, paper_goids, godag, dpi=dpi, **plot_opts)
    plot_gos("nbt3102_GOs_GO0005743.png", ["GO:0005743"], godag,
             items_p_line=2, study_items=6,
             id2symbol=id2symbol, goea_results=results_all, dpi=dpi)

    # ------------------------------------------------------------------
    # Item 3) Create one GO sub-plot per significant GO term from study
    # ------------------------------------------------------------------
    for res in results_sig:
        fout_png = "nbt3102_{NS}_{GO}.png".format(GO=res.GO.replace(':', '_'), NS=res.NS)
        plot_goid2goobj(
            fout_png,
            {res.GO: res.goterm},      # source GO and its GOTerm object
            study_items=15,            # max gene symbols printed per GO term
            id2symbol=id2symbol,       # GeneID-to-Symbol
            goea_results=results_all,  # pvals used for GO Term coloring
            dpi=dpi)

    # ------------------------------------------------------------------
    # Item 4) Explore using manually curated lists of GO terms
    # ------------------------------------------------------------------
    ribosome_goids = [
        'GO:0030529',  # CC D03 intracellular ribonucleoprotein complex (42 genes)
        'GO:0015934',  # CC D05 large ribosomal subunit (4 genes)
        'GO:0015935',  # CC D05 small ribosomal subunit (13 genes)
        'GO:0022625',  # CC D06 cytosolic large ribosomal subunit (16 genes)
        'GO:0022627',  # CC D06 cytosolic small ribosomal subunit (19 genes)
        'GO:0036464',  # CC D06 cytoplasmic ribonucleoprotein granule (4 genes)
        'GO:0005840',  # CC D05 ribosome (35 genes)
        'GO:0005844',  # CC D04 polysome (6 genes)
    ]
    # Note: this plot is colored from the significant results only
    plot_gos("nbt3102_CC_ribosome.png", ribosome_goids, godag,
             study_items=6, id2symbol=id2symbol, items_p_line=3,
             goea_results=results_sig, dpi=dpi)

    rna_goids = [
        'GO:0003723',  # MF D04 RNA binding (32 genes)
        'GO:0044822',  # MF D05 poly(A) RNA binding (86 genes)
        'GO:0003729',  # MF D06 mRNA binding (11 genes)
        'GO:0019843',  # MF D05 rRNA binding (6 genes)
        'GO:0003746',  # MF D06 translation elongation factor activity (5 genes)
    ]
    plot_gos("nbt3102_MF_RNA_genecnt.png", rna_goids, godag,
             goea_results=results_all, dpi=150)
    for rna_dpi in (150, 1200):  # 150 for review, 1200 for publication
        plot_gos("nbt3102_MF_RNA_dpi{DPI}.png".format(DPI=rna_dpi),
                 rna_goids, godag,
                 study_items=6, id2symbol=id2symbol, items_p_line=3,
                 goea_results=results_all, dpi=rna_dpi)

    # ------------------------------------------------------------------
    # Item 5) Are any significant geneids related to cell cycle?
    # ------------------------------------------------------------------
    import test_genes_cell_cycle as CC
    cell_cycle_genes = CC.get_genes_cell_cycle(taxid, log=log)
    cell_cycle_sig = cell_cycle_genes.intersection(sig_geneids)
    CC.prt_genes("nbt3102_cell_cycle.txt", cell_cycle_sig, taxid, log=None)
Esempio n. 3
0
# Report how many of the GOEA results are significant.
sig_total = len(goea_quiet_sig)
print('{N} of {M:,} results were significant'.format(N=sig_total,
                                                     M=len(goea_quiet_all)))

# Split the significant results by enrichment flag ('e' enriched, 'p' purified).
enriched_total = len([r for r in goea_quiet_sig if r.enrichment == 'e'])
purified_total = len([r for r in goea_quiet_sig if r.enrichment == 'p'])
print('Significant results: {E} enriched, {P} purified'.format(
    E=enriched_total,
    P=purified_total))

# Count significant results per namespace.
ctr = cx.Counter(r.NS for r in goea_quiet_sig)
print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
    TOTAL=len(goea_quiet_sig),
    BP=ctr['BP'],  # biological_process
    MF=ctr['MF'],  # molecular_function
    CC=ctr['CC']))  # cellular_component

# Write the significant results as text (the xlsx output is disabled).
#goeaobj.wr_xlsx("CDK1_test.xlsx", goea_quiet_sig)
goeaobj.wr_txt("CDK1_test.txt", goea_quiet_sig)

# Plot a hand-picked set of molecular-function GO terms, colored by p-value.
goid_subset = [
    'GO:0003723',  # MF D04 RNA binding (32 genes)
    'GO:0044822',  # MF D05 poly(A) RNA binding (86 genes)
    'GO:0003729',  # MF D06 mRNA binding (11 genes)
    'GO:0019843',  # MF D05 rRNA binding (6 genes)
    'GO:0003746',  # MF D06 translation elongation factor activity (5 genes)
]
plot_gos("nbt3102_MF_RNA_genecnt.png",
         goid_subset,  # Source GO ids
         obodag,
         goea_results=goea_quiet_all)  # Use pvals for coloring
Esempio n. 4
0
def test_example(log=sys.stdout):
    """Run Gene Ontology Enrichment Analysis (GOEA) on Nature data.

    Steps, in order: run the enrichment on the study genes, write the
    results to xlsx/txt, then create GO plots for (1) frequently seen words
    in GO names, (2) the GO terms named in the paper, (3) every significant
    GO term, (4) curated GO lists, and (5) write the significant study genes
    that are related to the cell cycle.

    :param log: stream passed on to the report helpers (default: stdout).
    """
    # ==================================================================
    # Gene Ontology Enrichment Analysis (GOEA)
    # ==================================================================
    taxid = 10090  # Mouse study
    # Load population ids, study ids (GeneID -> symbol) and the GOEA object
    pop_ids = GeneID2nt_mus.keys()
    study_id2sym = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    goeaobj = get_goeaobj("fdr_bh", pop_ids, taxid)
    # Run GOEA on study; significant means corrected p-value below 0.05
    all_results = goeaobj.run_study(study_id2sym)
    sig_results = [rec for rec in all_results if rec.p_fdr_bh < 0.05]
    compare_results(all_results)
    study_geneids_sig = get_study_items(sig_results)
    # Print GOEA results to files
    goeaobj.wr_xlsx("nbt3102.xlsx", sig_results)
    goeaobj.wr_txt("nbt3102_sig.txt", sig_results)
    goeaobj.wr_txt("nbt3102_all.txt", all_results)
    # Disabled: plot all significant GO terms w/annotated study info (large plots)
    #   plot_results("nbt3102_{NS}.png", sig_results)
    #   plot_results("nbt3102_{NS}_sym.png", sig_results, study_items=5,
    #                items_p_line=2, id2symbol=study_id2sym)

    # ==================================================================
    # Further examination of GOEA results...
    # ==================================================================
    obo = goeaobj.obo_dag
    dpi = 150  # For review: Figures can be saved in .jpg, .gif, .tif or .eps, at 150 dpi

    # Keyword bundles reused by the plotting calls below
    color_kws = {'goea_results': all_results}  # pvals used for GO Term coloring
    symbol_kws = {'id2symbol': study_id2sym, 'goea_results': all_results}

    # ------------------------------------------------------------------
    # Item 1) Words in GO names associated with large numbers of study genes
    # ------------------------------------------------------------------
    # What GO term words are associated with the largest number of study genes?
    prt_word2genecnt("nbt3102_genecnt_GOword.txt", sig_results, log)
    # Curated selection of GO words associated with large numbers of study genes
    freq_seen = ['RNA', 'translation', 'mitochondr',
                 'ribosom',  # 'ribosomal', 'ribosome',
                 'adhesion', 'endoplasmic', 'nucleotide', 'apoptotic', 'myelin']
    # Collect the GOs which contain the chosen frequently seen words
    word2NS2gos = get_word2NS2gos(freq_seen, sig_results)
    go2res = {rec.GO: rec for rec in sig_results}
    # Print words of interest, the sig GO terms which contain that word, and study genes.
    prt_word_GO_genes("nbt3102_GO_word_genes.txt", word2NS2gos, go2res,
                      study_id2sym, log)
    # Plot each set of GOs along w/study gene info
    name_pattern = "nbt3102_{WORD}_{NS}_dpi{DPI}.{FMT}"
    for word, by_ns in word2NS2gos.items():
        is_rna = word == "RNA"
        present = [(ns, by_ns[ns]) for ns in ('BP', 'MF', 'CC') if ns in by_ns]
        for ns, goids in present:
            goid2goobj = {goid: go2res[goid].goterm for goid in goids}
            # dpi: 150 for review, 1200 for publication (only 150 is enabled)
            for item_dpi in [150]:
                for fmt in (['png', 'tif', 'eps'] if is_rna else ['png']):
                    plot_goid2goobj(
                        name_pattern.format(WORD=word, NS=ns, DPI=item_dpi, FMT=fmt),
                        goid2goobj,  # source GOs and their GOTerm object
                        items_p_line=3,
                        study_items=6,  # Max number of gene symbols to print in each GO term
                        dpi=item_dpi,
                        **symbol_kws)

    # ------------------------------------------------------------------
    # Item 2) Explore findings of Nature paper:
    #
    #     Gene Ontology (GO) enrichment analysis showed that the
    #     differentially expressed genes contained statistically
    #     significant enrichments of genes involved in
    #         glycolysis,
    #         cellular response to IL-4 stimulation and
    #         positive regulation of B-cell proliferation
    # ------------------------------------------------------------------
    goids_paper = [
        'GO:0006096',  # BP 4.24e-12 10 glycolytic process
        'GO:0071353',  # BP 7.45e-06  5 cellular response to interleukin-4
        'GO:0030890',  # BP 8.22e-07  7 positive regulation of B cell proliferation
    ]
    plot_gos("nbt3102_GOs.png", goids_paper, obo, dpi=dpi)
    plot_gos("nbt3102_GOs_genecnt.png", goids_paper, obo, dpi=dpi, **color_kws)
    plot_gos("nbt3102_GOs_genelst.png", goids_paper, obo,
             study_items=True, dpi=dpi, **color_kws)
    plot_gos("nbt3102_GOs_symlst.png", goids_paper, obo,
             study_items=True, dpi=dpi, **symbol_kws)
    plot_gos("nbt3102_GOs_symlst_trunc.png", goids_paper, obo,
             study_items=5, dpi=dpi, **symbol_kws)
    plot_gos("nbt3102_GOs_GO0005743.png", ["GO:0005743"], obo,
             items_p_line=2, study_items=6, dpi=dpi, **symbol_kws)

    # ------------------------------------------------------------------
    # Item 3) Create one GO sub-plot per significant GO term from study
    # ------------------------------------------------------------------
    for rec in sig_results:
        safe_goid = rec.GO.replace(':', '_')
        plot_goid2goobj(
            "nbt3102_{NS}_{GO}.png".format(GO=safe_goid, NS=rec.NS),
            {rec.GO: rec.goterm},  # source GOs and their GOTerm object
            study_items=15,  # Max number of gene symbols to print in each GO term
            dpi=dpi,
            **symbol_kws)

    # ------------------------------------------------------------------
    # Item 4) Explore using manually curated lists of GO terms
    # ------------------------------------------------------------------
    goids_ribosome = [
        'GO:0030529',  # CC D03 intracellular ribonucleoprotein complex (42 genes)
        'GO:0015934',  # CC D05 large ribosomal subunit (4 genes)
        'GO:0015935',  # CC D05 small ribosomal subunit (13 genes)
        'GO:0022625',  # CC D06 cytosolic large ribosomal subunit (16 genes)
        'GO:0022627',  # CC D06 cytosolic small ribosomal subunit (19 genes)
        'GO:0036464',  # CC D06 cytoplasmic ribonucleoprotein granule (4 genes)
        'GO:0005840',  # CC D05 ribosome (35 genes)
        'GO:0005844',  # CC D04 polysome (6 genes)
    ]
    # This plot takes its coloring from the significant results only
    plot_gos("nbt3102_CC_ribosome.png", goids_ribosome, obo,
             study_items=6,
             id2symbol=study_id2sym,
             items_p_line=3,
             goea_results=sig_results,
             dpi=dpi)

    goids_rna = [
        'GO:0003723',  # MF D04 RNA binding (32 genes)
        'GO:0044822',  # MF D05 poly(A) RNA binding (86 genes)
        'GO:0003729',  # MF D06 mRNA binding (11 genes)
        'GO:0019843',  # MF D05 rRNA binding (6 genes)
        'GO:0003746',  # MF D06 translation elongation factor activity (5 genes)
    ]
    plot_gos("nbt3102_MF_RNA_genecnt.png", goids_rna, obo, dpi=150, **color_kws)
    for out_dpi in [150, 1200]:  # 150 for review, 1200 for publication
        plot_gos("nbt3102_MF_RNA_dpi{DPI}.png".format(DPI=out_dpi),
                 goids_rna, obo,
                 study_items=6,
                 items_p_line=3,
                 dpi=out_dpi,
                 **symbol_kws)

    # ------------------------------------------------------------------
    # Item 5) Are any significant geneids related to cell cycle?
    # ------------------------------------------------------------------
    import test_genes_cell_cycle as CC
    all_cell_cycle = CC.get_genes_cell_cycle(taxid, log=log)
    sig_cell_cycle = all_cell_cycle.intersection(study_geneids_sig)
    CC.prt_genes("nbt3102_cell_cycle.txt", sig_cell_cycle, taxid, log=None)
def _runGOanalysis(cluster, n_genes=75):
    """Run a GO enrichment analysis for one cluster and write its reports.

    Uses names defined elsewhere in the module: ``comparison``, ``genesDf``,
    ``getEntrezIDs``, ``goeaobj``, ``goaResultDir`` and ``obodag``.

    Steps:
      1. Pick the cluster's gene list from ``genesDf`` (filtered on p < .05
         for the two ``DiffExp_*`` comparisons) and keep the first
         ``n_genes`` entries.
      2. Map the genes to Entrez IDs with ``getEntrezIDs``; genes it could
         not map are searched in the NCBI ``gene`` database.
      3. Run the enrichment study, print a summary, write an xlsx report and
         two plots into ``goaResultDir``.

    :param cluster: cluster label such as ``'Cluster3'``; the ``'Cluster'``
        prefix is stripped to obtain the number used in column names.
    :param n_genes: maximum number of genes taken from the gene list.
    :return: DataFrame with columns ``entrez_id`` and ``symbol`` for the
        genes that were resolved through the NCBI search.
    """
    clusterNum = cluster.replace('Cluster', '')
    if comparison in ('DiffExp_DownOC', 'DiffExp_UpOC'):
        # Columns are named '(1, N)_p' / '(1, N)_n' for the down comparison
        # and '(2, N)_p' / '(2, N)_n' for the up comparison.
        group = 1 if comparison == 'DiffExp_DownOC' else 2
        column = '(' + str(group) + ', ' + str(clusterNum) + ')'
        sigGenes = genesDf.loc[genesDf[column + '_p'] < .05]
        genesList = sigGenes[column + '_n'].tolist()
    else:
        # NOTE(review): this column name ends in '_n)' whereas the branches
        # above use ')_n' -- confirm it matches the data frame.
        genesList = genesDf['(' + str(clusterNum) + '_n)']

    genesList = genesList[:n_genes]

    entrezIDlist, nullGenes = getEntrezIDs(genesList)

    print(str(len(entrezIDlist)) + " Entrez IDs retrieved, searching NCBI for " + str(len(nullGenes)) + " genes")

    # Look up every unmapped gene at NCBI and keep the most relevant hit.
    newGeneList = []
    newEntrezList = []
    unfoundList = []
    for gene in nullGenes:
        search = Entrez.esearch(db='gene', term=gene, retmax=1, start=0, sort='relevance')
        try:
            record = Entrez.read(search)
        finally:
            # Release the network handle even if parsing fails.
            search.close()
        id_list = record.get('IdList')
        if id_list:
            newGeneList.append(gene)
            newEntrezList.append(id_list[0])
        else:
            unfoundList.append(gene)

#    geneIndex = pd.read_excel(os.path.join(goaResultDir, 'EntrezIndex.xlsx'), index_col=0)
    newEntrez = pd.DataFrame([newEntrezList, newGeneList]).T
    newEntrez.columns = ['entrez_id', 'symbol']
#    geneIndex = pd.concat([geneIndex, newEntrez], axis=0)
#    geneIndex.to_excel(os.path.join(goaResultDir, 'EntrezIndex.xlsx'))

    print(str(len(newEntrezList)) + " additional Entrez IDs retrieved, " + str(len(unfoundList)) + " genes with no results")

    # NOTE(review): the ids appended here come straight from the search
    # record; confirm they have the same type as those from getEntrezIDs.
    entrezIDlist.extend(newEntrezList)

    print("Proceeding with GO analysis of " + str(len(entrezIDlist)) + " genes")

    goea_results_all = goeaobj.run_study(entrezIDlist)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    print('{N} of {M:,} results were significant'.format(
        N=len(goea_results_sig),
        M=len(goea_results_all)))

    print('Significant results: {E} enriched, {P} purified'.format(
        E=sum(1 for r in goea_results_sig if r.enrichment == 'e'),
        P=sum(1 for r in goea_results_sig if r.enrichment == 'p')))

    ctr = cx.Counter([r.NS for r in goea_results_sig])
    # TOTAL must count the significant results so that it equals
    # BP + MF + CC (it previously reported the size of the full result list).
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC']))  # cellular_component

    out_prefix = os.path.join(goaResultDir, cluster + '_' + comparison)

    goeaobj.wr_xlsx(out_prefix + "_GOA_Results_" + str(n_genes) + "genes.xlsx", goea_results_sig)
#    goeaobj.wr_txt(out_prefix + "_GOA_Results.txt", goea_results_sig)

    plot_results(out_prefix + '_' + str(n_genes) + "genes_GOA.png", goea_results_sig)

    goid_subset = [
        'GO:0098984',  # CC neuron to neuron synapse
        'GO:0099699',  # CC integral component of synaptic membrane
        'GO:2000474',  # BP regulation of opioid receptor signaling
        'GO:0038003',  # BP opioid receptor signaling pathway
        'GO:0048167',  # BP regulation of synaptic plasticity
        'GO:0021897',  # BP forebrain astrocyte development
        'GO:0014003',  # BP oligodendrocyte development
        'GO:0035249',  # BP synaptic transmission, glutamatergic
        'GO:0051932',  # BP synaptic transmission, GABAergic
        'GO:0014004',  # BP microglia differentiation
    ]

    # The subset plot contains GOEA results:
    #
    #     GO terms colored by P-value:
    #         pval < 0.005 (light red)
    #         pval < 0.01 (light orange)
    #         pval < 0.05 (yellow)
    #         pval > 0.05 (grey) Study terms that are not statistically significant
    #     GO terms with study gene counts printed. e.g., "32 genes"
    plot_gos(out_prefix + "_" + str(n_genes) + "genes_GOA_Subset_Plot.png",
        goid_subset,  # Source GO ids
        obodag,
        goea_results=goea_results_all,  # use pvals for coloring
        # We can further configure the plot...
#        id2symbol=geneid2symbol, # Print study gene Symbols, not Entrez GeneIDs
        study_items=6,  # Only 6 gene Symbols max on GO terms
        items_p_line=3,  # Print 3 genes per line
        )
    return newEntrez
Esempio n. 6
0
                         for g in goea_results_sig])
# Join the two GO tables on their shared index.
df_go = df_go1.merge(df_p, left_index=True, right_index=True)

# One row per (significant GO term, study gene) pair, with the gene's symbol.
go_genes = pandas.DataFrame([{'id': g.goterm.id, 'gene': s, 'symbol': geneid2symbol[s]}
                         for g in goea_results_sig
                         for s in g.study_items])
df_go = df_go.merge(go_genes, on='id')
# The message now names the file that is actually written below
# (it previously said "namespace.csv").
print('SAVE -> name spaces of the significant GO terms in name_space.csv')
print(df_go.groupby('namespace').count())
df_go.to_csv(out_dir + "/name_space.csv")

print()
# -------------- Plot subset starting from these significant GO terms --------------
goid_subset = [g.GO for g in goea_results_sig]
plot_gos(out_dir + "/go_enrich.png",
         goid_subset,                       # Source GO ids
         obodag,
         goea_results=goea_results_all)     # Use pvals for coloring

plot_gos(out_dir + "/go_enrich_symbols.pdf",
         goid_subset,                       # Source GO ids
         obodag,
         goea_results=goea_results_all,     # use pvals for coloring
         id2symbol=geneid2symbol,           # Print study gene Symbols, not Entrez GeneIDs
         study_items=6,                     # Only 6 gene Symbols max on GO terms
         items_p_line=3)                    # Print 3 genes per line

print()
# Numeric compound ids: strip the 'CID' prefix from each drug id.
cids = [int(data.drug_idx_to_id[c].replace('CID', '')) for c in pd_idx[1]]
# Fetch each distinct compound once.
drugs = [Compound.from_cid(int(cid)) for cid in set(cids)]
drug_ids = pandas.DataFrame([[data.drug_id_to_idx['CID{}'.format(d.cid)],
                              d.cid,
Esempio n. 7
0
def enrich(gene2go: str,
           study: str,
           obo: str,
           population: str = None,
           geneid2symbol: str = None,
           correct='fdr_bh',
           alpha=0.05,
           top=20,
           goea_out=None,
           dag_out=None,
           dpi=300,
           show_gene_limit=6,
           only_plot_sig=False):
    """
    GO enrichment based on goatools.

    Runs the enrichment study, writes the result table to ``goea_out``
    (re-correcting the p-values with ``correct``) and plots one DAG figure
    per namespace (BP, MF, CC). A namespace without any term to plot is
    reported and skipped; the remaining namespaces are still plotted.

    :param gene2go: a file with two columns: gene_id <tab> go_term_id
    :param study: a file with at least one column; the first column contains
        the gene id, the optional second column the regulation direction.
        Blank lines are ignored.
    :param obo: go-basic file downloaded from GeneOntology
    :param population: a file with one gene per row; defaults to all genes in
        the gene2go file
    :param geneid2symbol: file with two columns: gene_id <tab> gene_symbol,
        used for the DAG plot
    :param correct: pvalue adjustment method:
        Method used for testing and adjustment of pvalues. Can be either the
        full name or initial letters. Available methods are:
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method  (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg  (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)
        The value ``3`` (or ``'3'``) is accepted as an alias of ``fdr_bh``.
    :param alpha: fdr cutoff, default 0.05
    :param top: number of GO terms to plot per namespace (the first ``top``
        rows of the result table)
    :param goea_out: output enrichment result file; defaults to
        ``study + '.goea.xls'``
    :param dag_out: dag figure file; defaults to ``study + '.goea.dag.svg'``.
        The namespace is inserted before the file extension.
    :param dpi: resolution of image, no effect for svg
    :param show_gene_limit: the max number of genes to show in a node
    :param only_plot_sig: only plot dag for significantly enriched terms
    :return: None
    """
    import os  # local import: only needed to split the figure file name

    if str(correct) == '3':
        correct = 'fdr_bh'

    if geneid2symbol:
        with open(geneid2symbol) as handle:
            geneid2symbol = dict(line.split()[:2] for line in handle
                                 if line.strip())
    else:
        geneid2symbol = dict()

    obo = GODag(obo, optional_attrs=['relationship', 'is_a'])
    gene2go = read_associations(gene2go)

    # Read the study file once; blank lines are skipped instead of crashing.
    with open(study) as handle:
        study_rows = [line.split() for line in handle if line.strip()]
    study_genes = [row[0] for row in study_rows]
    # Regulation direction per gene; empty when the second column is absent.
    reg_dict = {row[0]: (row[1] if len(row) > 1 else '') for row in study_rows}

    if not population:
        population = gene2go.keys()
    else:
        with open(population) as handle:
            population = [line.split()[0] for line in handle if line.strip()]

    goea_obj = GOEnrichmentStudy(population,
                                 gene2go,
                                 obo,
                                 propagate_counts=False,
                                 alpha=alpha,
                                 methods=('fdr_bh', ))

    def keep_if(result):
        # Drop GO terms that no study gene is annotated with.
        return result.ratio_in_study[0] != 0

    goea_results_all = goea_obj.run_study(study_genes, keep_if=keep_if)
    goea_out = goea_out or study + '.goea.xls'
    goea_obj.wr_tsv(goea_out, goea_results_all)

    def func(y):
        # Turn "g1, g2" into "g1|regulation|symbol;g2|regulation|symbol",
        # leaving out the parts that are unknown for a gene.
        results = []
        genes = [x.strip() for x in y.split(',')]
        for gene in genes:
            tmp = [gene]
            if gene in reg_dict:
                tmp.append(reg_dict[gene])
            if gene in geneid2symbol:
                tmp.append(geneid2symbol[gene])
            results.append('|'.join(tmp))
        return ';'.join(results)

    table = pd.read_table(goea_out, header=0, index_col=0)
    # Re-correct the p-values with the requested method and update the table.
    corrected = multipletests(table['p_uncorrected'], method=correct)[1]
    table['p_fdr_bh'] = corrected
    # Also update goea_results_all so that the plots use the new values.
    # NOTE(review): assumes the rows written by wr_tsv are in the same order
    # as goea_results_all -- TODO confirm.
    for result, p_corrected in zip(goea_results_all, corrected):
        result.p_fdr_bh = p_corrected
    table.columns = [
        x if x != 'p_fdr_bh' else 'p_corrected' for x in table.columns
    ]
    table['enrichment'] = [
        'e' if x <= alpha else 'p' for x in table['p_corrected']
    ]
    table['study_items'] = table.loc[:, 'study_items'].map(func)
    # table = table.sort_values(by=['p_corrected', 'p_uncorrected'])
    table.to_csv(goea_out, header=True, index=True, sep='\t')

    # -------------------plot dag------------------------
    dag_out = dag_out or study + '.goea.dag.svg'
    # Split on the real extension so that names such as "x.jpeg" also work.
    dag_root, dag_ext = os.path.splitext(dag_out)
    plot_table = table[table['enrichment'] == 'e'] if only_plot_sig else table
    for each in ['BP', 'MF', 'CC']:
        goea_results_sig = plot_table[plot_table['NS'] == each]
        if not goea_results_sig.shape[0]:
            print(f"No significant term to plot for {each} ")
            # Skip only this namespace; a `return` here used to cancel the
            # plots of all remaining namespaces.
            continue
        goid_subset = list(goea_results_sig.index[:top])
        plot_gos(
            dag_root + '.' + each + dag_ext,
            # Source GO ids. A node missing from the analysis results would
            # be drawn pale green, but that cannot happen here.
            goid_subset,
            obo,
            # use pvals for coloring
            goea_results=goea_results_all,
            # We can further configure the plot...
            id2symbol=geneid2symbol,  # Print study gene Symbols, not GeneIDs
            study_items=show_gene_limit,  # Max gene Symbols on GO terms
            items_p_line=3,  # Print 3 genes per line
            dpi=0 if dag_out.endswith('svg') else dpi,
            # title="Directed Graph of enriched {} terms".format(each)
        )