def run_actual_assc(self, genes_study_arg, ntdesc):
    """Run a GOEA for the user-provided study genes against the masked population.

    The study genes are de-duplicated into a set, a GOEA object is built from
    ``self.get_pop_genes_masked(...)`` and ``self.objassc.get_assc()``, and only
    results whose ``p_fdr_bh`` is below ``self.objbase.alpha`` are kept. The kept
    results are passed to ``wr_txt`` with a file name built from ``ntdesc.name``
    and the number of study genes.

    Args:
        genes_study_arg: Iterable of study gene IDs.
        ntdesc: Object whose ``name`` attribute is used in the output file name.

    Returns:
        dict with keys 'goea_results', 'genes_sig', 'genes_study' and
        'assc_desc' (always 'actual').
    """
    study = set(genes_study_arg)
    cutoff = self.objbase.alpha

    def _is_significant(nt):
        """Keep a result only if its FDR-BH p-value is below the cutoff."""
        return nt.p_fdr_bh < cutoff

    pop_masked = self.get_pop_genes_masked(study)
    goea = self.objbase.get_goeaobj(pop_masked, self.objassc.get_assc())
    results = goea.run_study(study, keep_if=_is_significant)

    # File name carries the description and the zero-padded study size
    txt_name = "goea_{DESC}_sig_{N:04}.txt".format(
        DESC=ntdesc.name, N=len(study))
    goea.wr_txt(txt_name, results)

    return {
        'goea_results': results,
        'genes_sig': get_study_items(results),
        'genes_study': study,
        'assc_desc': 'actual',
    }
Exemple #2
0
 def __init__(self, num_study_genes, num_null, pobj):
     """Build one simulated study, run its GOEA, and record the significant genes.

     Args:
         num_study_genes: Forwarded to ``mk_stochastic_goeasim_source`` and to
             the log summary.
         num_null: Forwarded to ``mk_stochastic_goeasim_source`` and to the
             log summary.
         pobj: RunParams object; its ``gene_lists`` and ``params`` are read here.
     """
     self.pobj = pobj  # RunParams object
     # Two background groups feed the simulated gene source: genes that differ
     # from the population ('study_bg') and genes that do not ('null_bg').
     backgrounds = pobj.gene_lists
     # NOTE(review): original comment suggests a list of (gene, expsig) pairs -- confirm
     self.gene_expsig_list = mk_stochastic_goeasim_source(
         num_study_genes,
         num_null,
         backgrounds['study_bg'],
         backgrounds['null_bg'])
     self.assc_geneid2gos = self._init_assc()
     self.goea_results = self._init_goea_results()
     self.genes_sig = get_study_items(self.goea_results)
     # Only write the summary when a log destination was configured
     log = self.pobj.params['log']
     if log is not None:
         self._wrlog_summary(num_study_genes, num_null)
 def run_random_assc(self, genes_study_arg, ntdesc):
     """Run a GOEA for the study genes against shuffled associations.

     The associations from ``self.objassc.get_assc()`` are shuffled through
     ``RandAssc`` before the GOEA object is built, and only results whose
     ``p_fdr_bh`` is below ``self.objbase.alpha`` are kept. The kept results
     are passed to ``wr_txt``, after which the method asserts that none were
     found.

     Args:
         genes_study_arg: Iterable of study gene IDs.
         ntdesc: Object whose ``name`` attribute is used in the output file name.

     Returns:
         dict with keys 'goea_results', 'genes_sig', 'genes_study' and
         'assc_desc' (always 'random').

     Raises:
         AssertionError: If any GO term passed the significance filter.
     """
     study = set(genes_study_arg)
     cutoff = self.objbase.alpha

     def _is_significant(nt):
         """Keep a result only if its FDR-BH p-value is below the cutoff."""
         return nt.p_fdr_bh < cutoff

     shuffled = RandAssc(self.objassc.get_assc()).get_shuffled_associations()
     goea = self.objbase.get_goeaobj(self.objassc.pop_genes, shuffled)
     results = goea.run_study(study, keep_if=_is_significant)

     # Results are written before the sanity check below, as in the original flow
     txt_name = "goea_{DESC}_rnd_{N:04}.txt".format(
         DESC=ntdesc.name, N=len(study))
     goea.wr_txt(txt_name, results)
     genes_rnd = get_study_items(results)

     assert len(results) == 0, \
         "EXPECTED NO SIGNIFICANT GO TERMS IN RANDOM SIMULATION. FOUND {N}".format(
             N=len(results))
     return {
         'goea_results': results,
         'genes_sig': genes_rnd,
         'genes_study': study,
         'assc_desc': 'random',
     }
 def _init_get_goids_tgtd(self):
     """Return the significant GO IDs that are not among the study-background GO IDs.

     A baseline GOEA is run with the whole study background
     (``self.genes['study_bg']``) as the study and ``self.genes['population']``
     as the population. Results are kept when their ``p_<method>`` attribute is
     below ``self.objbase.alpha``. Two assertions check that every study
     background gene and every GO ID in ``self.params['goids_study_bg']`` shows
     up in the significant results.

     Returns:
         set of GO IDs targeted for removal or randomization.

     Raises:
         AssertionError: If either consistency check fails.
     """
     pval_attr = "p_{METHOD}".format(METHOD=self.objbase.method)

     def _is_significant(nt):
         """Keep a result whose method-specific p-value is below alpha."""
         return getattr(nt, pval_attr) < self.objbase.alpha

     # Baseline GOEA: study == entire study gene background
     all_assc = self.objassc.objassc_all.assc_geneid2gos
     goea = self.objbase.get_goeaobj(self.genes['population'], all_assc)
     results = goea.run_study(self.genes['study_bg'], keep_if=_is_significant)

     # Every study background gene must appear in the significant results
     sig_genes = get_study_items(results)
     assert self.genes['study_bg'] == sig_genes

     # Every expected study-background GO ID must be significant
     sig_goids = {nt.GO for nt in results}
     bg_goids = self.params['goids_study_bg']
     assert sig_goids.intersection(bg_goids) == bg_goids

     # Whatever else is significant is targeted for removal or randomization
     artifacts = sig_goids.difference(bg_goids)
     log = self.params['log']
     if log is not None and results:
         self._prt_significant_artifacts(results, artifacts, log)
     return artifacts
Exemple #5
0
def test_example(log=sys.stdout):
    """Run a Gene Ontology Enrichment Analysis (GOEA) on Nature data, then explore it.

    After the enrichment run, results with ``p_fdr_bh < 0.05`` are treated as
    significant. The function then hands result tables and a series of GO
    plots (Items 1-5 below) to the writer, printer and plotting helpers.

    Args:
        log: Stream handed to the printing helpers; defaults to ``sys.stdout``.
    """
    # ====================================================================
    # Gene Ontology Enrichment Analysis (GOEA)
    # ====================================================================
    taxid = 10090  # Mouse study
    # Load ontologies, associations, and population ids
    pop_ids = GeneID2nt_mus.keys()
    id2symbol = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    goeaobj = get_goeaobj("fdr_bh", pop_ids, taxid)
    # Run GOEA on the study; filter for significance afterwards so that the
    # complete result list stays available for the plots
    results_all = goeaobj.run_study(id2symbol)
    results_sig = [res for res in results_all if res.p_fdr_bh < 0.05]
    compare_results(results_all)
    sig_geneids = get_study_items(results_sig)
    # Print GOEA results to files
    goeaobj.wr_xlsx("nbt3102.xlsx", results_sig)
    goeaobj.wr_txt("nbt3102_sig.txt", results_sig)
    goeaobj.wr_txt("nbt3102_all.txt", results_all)

    # ====================================================================
    # Further examination of GOEA results...
    # ====================================================================
    obo = goeaobj.obo_dag
    dpi = 150  # For review: Figures can be saved in .jpg, .gif, .tif or .eps, at 150 dpi

    # --------------------------------------------------------------------
    # Item 1) Words in GO names associated with large numbers of study genes
    # --------------------------------------------------------------------
    # What GO term words are associated with the largest number of study genes?
    prt_word2genecnt("nbt3102_genecnt_GOword.txt", results_sig, log)
    # Curated selection of GO words associated with large numbers of study genes
    freq_seen = [
        'RNA', 'translation', 'mitochondr',
        'ribosom',  # 'ribosomal', 'ribosome',
        'adhesion', 'endoplasmic', 'nucleotide', 'apoptotic', 'myelin',
    ]
    # Collect the GOs which contain the chosen frequently seen words
    word2NS2gos = get_word2NS2gos(freq_seen, results_sig)
    go2res = {nt.GO: nt for nt in results_sig}
    # Print words of interest, the sig GO terms which contain that word, and study genes.
    prt_word_GO_genes("nbt3102_GO_word_genes.txt", word2NS2gos, go2res,
                      id2symbol, log)
    # Plot each set of GOs along w/study gene info
    for word, ns2gos in word2NS2gos.items():
        # Only the "RNA" plots are produced in the extra formats
        fmts = ['png', 'tif', 'eps'] if word == "RNA" else ['png']
        for namespace in ('BP', 'MF', 'CC'):
            if namespace not in ns2gos:
                continue
            goid2goobj = {go: go2res[go].goterm for go in ns2gos[namespace]}
            # dpi: 150 for review, 1200 for publication. The loop variable has
            # its own name so it never rebinds the function-level `dpi`.
            for word_dpi in (150,):
                for fmt in fmts:
                    fout_img = "nbt3102_{WORD}_{NS}_dpi{DPI}.{FMT}".format(
                        WORD=word, NS=namespace, DPI=word_dpi, FMT=fmt)
                    plot_goid2goobj(
                        fout_img,
                        goid2goobj,  # source GOs and their GOTerm object
                        items_p_line=3,
                        study_items=6,  # Max number of gene symbols to print in each GO term
                        id2symbol=id2symbol,  # Contains GeneID-to-Symbol
                        goea_results=results_all,  # pvals used for GO Term coloring
                        dpi=word_dpi)

    # --------------------------------------------------------------------
    # Item 2) Explore findings of Nature paper:
    #
    #     Gene Ontology (GO) enrichment analysis showed that the
    #     differentially expressed genes contained statistically
    #     significant enrichments of genes involved in
    #         glycolysis,
    #         cellular response to IL-4 stimulation and
    #         positive regulation of B-cell proliferation
    # --------------------------------------------------------------------
    goid_subset = [
        'GO:0006096',  # BP 4.24e-12 10 glycolytic process
        'GO:0071353',  # BP 7.45e-06  5 cellular response to interleukin-4
        'GO:0030890',  # BP 8.22e-07  7 positive regulation of B cell proliferation
    ]
    plot_gos("nbt3102_GOs.png", goid_subset, obo, dpi=dpi)
    # The next plots share the GO subset and differ only in how study items
    # are annotated; each entry lists the extra keywords for one output file.
    annotated_plots = [
        ("nbt3102_GOs_genecnt.png", []),
        ("nbt3102_GOs_genelst.png", [('study_items', True)]),
        ("nbt3102_GOs_symlst.png",
         [('study_items', True), ('id2symbol', id2symbol)]),
        ("nbt3102_GOs_symlst_trunc.png",
         [('study_items', 5), ('id2symbol', id2symbol)]),
    ]
    for fout_png, extra_kws in annotated_plots:
        kws = dict(extra_kws)
        kws['goea_results'] = results_all
        kws['dpi'] = dpi
        plot_gos(fout_png, goid_subset, obo, **kws)
    plot_gos("nbt3102_GOs_GO0005743.png", ["GO:0005743"], obo,
             items_p_line=2,
             study_items=6,
             id2symbol=id2symbol,
             goea_results=results_all,
             dpi=dpi)

    # --------------------------------------------------------------------
    # Item 3) Create one GO sub-plot per significant GO term from study
    # --------------------------------------------------------------------
    for rec in results_sig:
        fout_png = "nbt3102_{NS}_{GO}.png".format(
            GO=rec.GO.replace(':', '_'), NS=rec.NS)
        plot_goid2goobj(
            fout_png,
            {rec.GO: rec.goterm},  # source GO and its GOTerm object
            study_items=15,  # Max number of gene symbols to print in each GO term
            id2symbol=id2symbol,  # Contains GeneID-to-Symbol
            goea_results=results_all,  # pvals used for GO Term coloring
            dpi=dpi)

    # --------------------------------------------------------------------
    # Item 4) Explore using manually curated lists of GO terms
    # --------------------------------------------------------------------
    goids_ribosome = [
        'GO:0030529',  # CC D03 intracellular ribonucleoprotein complex (42 genes)
        'GO:0015934',  # CC D05 large ribosomal subunit (4 genes)
        'GO:0015935',  # CC D05 small ribosomal subunit (13 genes)
        'GO:0022625',  # CC D06 cytosolic large ribosomal subunit (16 genes)
        'GO:0022627',  # CC D06 cytosolic small ribosomal subunit (19 genes)
        'GO:0036464',  # CC D06 cytoplasmic ribonucleoprotein granule (4 genes)
        'GO:0005840',  # CC D05 ribosome (35 genes)
        'GO:0005844',  # CC D04 polysome (6 genes)
    ]
    # Unlike the other plots, this one is given only the significant results
    plot_gos("nbt3102_CC_ribosome.png", goids_ribosome, obo,
             study_items=6,
             id2symbol=id2symbol,
             items_p_line=3,
             goea_results=results_sig,
             dpi=dpi)

    goids_rna = [
        'GO:0003723',  # MF D04 RNA binding (32 genes)
        'GO:0044822',  # MF D05 poly(A) RNA binding (86 genes)
        'GO:0003729',  # MF D06 mRNA binding (11 genes)
        'GO:0019843',  # MF D05 rRNA binding (6 genes)
        'GO:0003746',  # MF D06 translation elongation factor activity (5 genes)
    ]
    plot_gos("nbt3102_MF_RNA_genecnt.png", goids_rna, obo,
             goea_results=results_all,
             dpi=150)
    for rna_dpi in (150, 1200):  # 150 for review, 1200 for publication
        plot_gos("nbt3102_MF_RNA_dpi{DPI}.png".format(DPI=rna_dpi),
                 goids_rna, obo,
                 study_items=6,
                 id2symbol=id2symbol,
                 items_p_line=3,
                 goea_results=results_all,
                 dpi=rna_dpi)

    # --------------------------------------------------------------------
    # Item 5) Are any significant geneids related to cell cycle?
    # --------------------------------------------------------------------
    import test_genes_cell_cycle as CC
    cell_cycle_genes = CC.get_genes_cell_cycle(taxid, log=log)
    cell_cycle_sig = cell_cycle_genes.intersection(sig_geneids)
    # log=None is passed here (not `log`), exactly as the original did
    CC.prt_genes("nbt3102_cell_cycle.txt", cell_cycle_sig, taxid, log=None)
Exemple #6
0
def test_example(log=sys.stdout):
    """Run a Gene Ontology Enrichment Analysis (GOEA) on Nature data, then explore it.

    After the enrichment run, results with ``p_fdr_bh < 0.05`` are treated as
    significant. The function then hands result tables and a series of GO
    plots (Items 1-5 below) to the writer, printer and plotting helpers.

    Args:
        log: Stream handed to the printing helpers; defaults to ``sys.stdout``.
    """
    # ====================================================================
    # Gene Ontology Enrichment Analysis (GOEA)
    # ====================================================================
    taxid = 10090  # Mouse study
    # Load ontologies, associations, and population ids
    pop_ids = GeneID2nt_mus.keys()
    id2symbol = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    goeaobj = get_goeaobj("fdr_bh", pop_ids, taxid)
    # Run GOEA on the study; filter for significance afterwards so that the
    # complete result list stays available for the plots
    results_all = goeaobj.run_study(id2symbol)
    results_sig = [res for res in results_all if res.p_fdr_bh < 0.05]
    compare_results(results_all)
    sig_geneids = get_study_items(results_sig)
    # Print GOEA results to files
    goeaobj.wr_xlsx("nbt3102.xlsx", results_sig)
    goeaobj.wr_txt("nbt3102_sig.txt", results_sig)
    goeaobj.wr_txt("nbt3102_all.txt", results_all)

    # ====================================================================
    # Further examination of GOEA results...
    # ====================================================================
    obo = goeaobj.obo_dag
    dpi = 150  # For review: Figures can be saved in .jpg, .gif, .tif or .eps, at 150 dpi

    # --------------------------------------------------------------------
    # Item 1) Words in GO names associated with large numbers of study genes
    # --------------------------------------------------------------------
    # What GO term words are associated with the largest number of study genes?
    prt_word2genecnt("nbt3102_genecnt_GOword.txt", results_sig, log)
    # Curated selection of GO words associated with large numbers of study genes
    freq_seen = [
        'RNA', 'translation', 'mitochondr',
        'ribosom',  # 'ribosomal', 'ribosome',
        'adhesion', 'endoplasmic', 'nucleotide', 'apoptotic', 'myelin',
    ]
    # Collect the GOs which contain the chosen frequently seen words
    word2NS2gos = get_word2NS2gos(freq_seen, results_sig)
    go2res = {nt.GO: nt for nt in results_sig}
    # Print words of interest, the sig GO terms which contain that word, and study genes.
    prt_word_GO_genes("nbt3102_GO_word_genes.txt", word2NS2gos, go2res,
                      id2symbol, log)
    # Plot each set of GOs along w/study gene info
    for word, ns2gos in word2NS2gos.items():
        # Only the "RNA" plots are produced in the extra formats
        fmts = ['png', 'tif', 'eps'] if word == "RNA" else ['png']
        for namespace in ('BP', 'MF', 'CC'):
            if namespace not in ns2gos:
                continue
            goid2goobj = {go: go2res[go].goterm for go in ns2gos[namespace]}
            # dpi: 150 for review, 1200 for publication. The loop variable has
            # its own name so it never rebinds the function-level `dpi`.
            for word_dpi in (150,):
                for fmt in fmts:
                    fout_img = "nbt3102_{WORD}_{NS}_dpi{DPI}.{FMT}".format(
                        WORD=word, NS=namespace, DPI=word_dpi, FMT=fmt)
                    plot_goid2goobj(
                        fout_img,
                        goid2goobj,  # source GOs and their GOTerm object
                        items_p_line=3,
                        study_items=6,  # Max number of gene symbols to print in each GO term
                        id2symbol=id2symbol,  # Contains GeneID-to-Symbol
                        goea_results=results_all,  # pvals used for GO Term coloring
                        dpi=word_dpi)

    # --------------------------------------------------------------------
    # Item 2) Explore findings of Nature paper:
    #
    #     Gene Ontology (GO) enrichment analysis showed that the
    #     differentially expressed genes contained statistically
    #     significant enrichments of genes involved in
    #         glycolysis,
    #         cellular response to IL-4 stimulation and
    #         positive regulation of B-cell proliferation
    # --------------------------------------------------------------------
    goid_subset = [
        'GO:0006096',  # BP 4.24e-12 10 glycolytic process
        'GO:0071353',  # BP 7.45e-06  5 cellular response to interleukin-4
        'GO:0030890',  # BP 8.22e-07  7 positive regulation of B cell proliferation
    ]
    plot_gos("nbt3102_GOs.png", goid_subset, obo, dpi=dpi)
    # The next plots share the GO subset and differ only in how study items
    # are annotated; each entry lists the extra keywords for one output file.
    annotated_plots = [
        ("nbt3102_GOs_genecnt.png", []),
        ("nbt3102_GOs_genelst.png", [('study_items', True)]),
        ("nbt3102_GOs_symlst.png",
         [('study_items', True), ('id2symbol', id2symbol)]),
        ("nbt3102_GOs_symlst_trunc.png",
         [('study_items', 5), ('id2symbol', id2symbol)]),
    ]
    for fout_png, extra_kws in annotated_plots:
        kws = dict(extra_kws)
        kws['goea_results'] = results_all
        kws['dpi'] = dpi
        plot_gos(fout_png, goid_subset, obo, **kws)
    plot_gos("nbt3102_GOs_GO0005743.png", ["GO:0005743"], obo,
             items_p_line=2,
             study_items=6,
             id2symbol=id2symbol,
             goea_results=results_all,
             dpi=dpi)

    # --------------------------------------------------------------------
    # Item 3) Create one GO sub-plot per significant GO term from study
    # --------------------------------------------------------------------
    for rec in results_sig:
        fout_png = "nbt3102_{NS}_{GO}.png".format(
            GO=rec.GO.replace(':', '_'), NS=rec.NS)
        plot_goid2goobj(
            fout_png,
            {rec.GO: rec.goterm},  # source GO and its GOTerm object
            study_items=15,  # Max number of gene symbols to print in each GO term
            id2symbol=id2symbol,  # Contains GeneID-to-Symbol
            goea_results=results_all,  # pvals used for GO Term coloring
            dpi=dpi)

    # --------------------------------------------------------------------
    # Item 4) Explore using manually curated lists of GO terms
    # --------------------------------------------------------------------
    goids_ribosome = [
        'GO:0030529',  # CC D03 intracellular ribonucleoprotein complex (42 genes)
        'GO:0015934',  # CC D05 large ribosomal subunit (4 genes)
        'GO:0015935',  # CC D05 small ribosomal subunit (13 genes)
        'GO:0022625',  # CC D06 cytosolic large ribosomal subunit (16 genes)
        'GO:0022627',  # CC D06 cytosolic small ribosomal subunit (19 genes)
        'GO:0036464',  # CC D06 cytoplasmic ribonucleoprotein granule (4 genes)
        'GO:0005840',  # CC D05 ribosome (35 genes)
        'GO:0005844',  # CC D04 polysome (6 genes)
    ]
    # Unlike the other plots, this one is given only the significant results
    plot_gos("nbt3102_CC_ribosome.png", goids_ribosome, obo,
             study_items=6,
             id2symbol=id2symbol,
             items_p_line=3,
             goea_results=results_sig,
             dpi=dpi)

    goids_rna = [
        'GO:0003723',  # MF D04 RNA binding (32 genes)
        'GO:0044822',  # MF D05 poly(A) RNA binding (86 genes)
        'GO:0003729',  # MF D06 mRNA binding (11 genes)
        'GO:0019843',  # MF D05 rRNA binding (6 genes)
        'GO:0003746',  # MF D06 translation elongation factor activity (5 genes)
    ]
    plot_gos("nbt3102_MF_RNA_genecnt.png", goids_rna, obo,
             goea_results=results_all,
             dpi=150)
    for rna_dpi in (150, 1200):  # 150 for review, 1200 for publication
        plot_gos("nbt3102_MF_RNA_dpi{DPI}.png".format(DPI=rna_dpi),
                 goids_rna, obo,
                 study_items=6,
                 id2symbol=id2symbol,
                 items_p_line=3,
                 goea_results=results_all,
                 dpi=rna_dpi)

    # --------------------------------------------------------------------
    # Item 5) Are any significant geneids related to cell cycle?
    # --------------------------------------------------------------------
    import test_genes_cell_cycle as CC
    cell_cycle_genes = CC.get_genes_cell_cycle(taxid, log=log)
    cell_cycle_sig = cell_cycle_genes.intersection(sig_geneids)
    # log=None is passed here (not `log`), exactly as the original did
    CC.prt_genes("nbt3102_cell_cycle.txt", cell_cycle_sig, taxid, log=None)