def test_gosearch(log=sys.stdout):
    """Test the GoSearch class with per-taxid annotations and with none.

    Args:
        log: Stream handed to every GoSearch instance as its ``log`` argument.

    Raises:
        AssertionError: If any GoSearch instance holds 40000 or fewer entries
            in its ``obo_dag``.
    """
    taxids = [9606, 10090]
    # Download ontologies and annotations, if necessary
    fin_go_obo = os.path.join(REPO, "go-basic.obo")
    download_go_basic_obo(fin_go_obo, loading_bar=None)
    # Because get_assoc_ncbi_taxids returns id2gos, we opt to use the
    # (optional) multi-level dictionary that separates associations by taxid.
    # taxid2asscs contains both GO2GeneIDs and GeneID2GOs.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs, loading_bar=None)
    # Initialize GO-search helper object with obo and annotations(go2items)
    for taxid in taxids:
        obj = GoSearch(fin_go_obo, go2items=taxid2asscs[taxid]['GO2GeneIDs'], log=log)
        assert len(obj.obo_dag) > 40000
    # No annotations: keep a handle on the new instance so the check below
    # inspects it rather than the last object created in the loop above.
    obj_noanno = GoSearch(fin_go_obo, {}, log=log)
    assert len(obj_noanno.obo_dag) > 40000
def test_write_summary_cnts(log=sys.stdout):
    """Write level/depth count summaries for the full DAG and per taxid.

    Args:
        log: Writable text stream; receives the section headers and is also
            passed to RptLevDepth.
    """
    dag = _get_obodag()
    reporter = RptLevDepth(dag, log)

    # Summary covering every GO term in the DAG
    log.write("\nSummary for all Ontologies:\n")
    reporter.write_summary_cnts_all()

    # Human, fly, and mouse associations are collected into the (optional)
    # multi-level dictionary, which keeps them separated by taxid.
    species = [9606, 7227, 10090]
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(species, taxid2asscs=taxid2asscs)

    header = "\nSummary for Ontologies for taxid({T}):\n"
    for taxid, assc in taxid2asscs.items():
        go2genes = assc['GO2GeneIDs']
        go_ids = go2genes.keys()
        # First pass: summary driven by GO ids
        log.write(header.format(T=taxid))
        reporter.write_summary_cnts(go_ids)
        # Second pass: same header, summary driven by GO term objects
        log.write(header.format(T=taxid))
        reporter.write_summary_cnts_goobjs([dag[goid] for goid in go_ids])
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Return the items annotated to 'cell cycle' GO terms for one taxid.

    GO terms matching 'cell cycle' are collected, those that also match
    'cell cycle.independent' are removed, children are added with
    ``GoSearch.add_children_gos``, and the result is mapped to items with
    ``GoSearch.get_items``.  The file ``cell_cycle_gos_<taxid>.log`` is opened
    for writing and passed as ``prt`` to both searches.

    Args:
        taxid: Taxonomy id whose associations are searched (default 9606).
        log: Stream for a two-line summary; pass None to suppress it.

    Returns:
        The value of ``GoSearch.get_items`` for the selected GO ids
        (described in the original comments as Entrez GeneIDs).
    """
    # Fetch the ontology file only when it is not already present
    obo_path = "go-basic.obo"
    if not os.path.exists(obo_path):
        wget.download("http://geneontology.org/ontology/go-basic.obo")

    # get_assoc_ncbi_taxids returns id2gos, so the optional per-taxid
    # dictionary is supplied; it holds both GO2GeneIDs and GeneID2GOs.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=taxid2asscs)
    searcher = GoSearch(obo_path, go2items=taxid2asscs[taxid]['GO2GeneIDs'])

    pat_cc = re.compile(r'cell cycle', flags=re.IGNORECASE)
    pat_cc_indep = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)
    report_name = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)
    with open(report_name, "w") as prt:
        matched = searcher.get_matching_gos(pat_cc, prt=prt)
        # GO:0005764 (lysosome) shows up only because of the phrase
        # 'cell cycle-independent'; drop every GO matched that way.
        excluded = searcher.get_matching_gos(pat_cc_indep, gos=matched, prt=prt)
        # Keep the remaining GOs and add their children
        gos_all = searcher.add_children_gos(matched.difference(excluded))
        if log is not None:
            log.write(' taxid {TAXID:>5}\n'.format(TAXID=taxid))
            log.write(' FOUND {N:>5} GOs: {F}\n'.format(N=len(gos_all), F=report_name))
    return searcher.get_items(gos_all)
def get_goeaobj(method, geneids_pop, taxid):
    """Build a GOEnrichmentStudy for one taxid and one correction method.

    Args:
        method: Name placed, as the only entry, in the ``methods`` list.
        geneids_pop: Population gene ids passed through to the study object.
        taxid: Taxonomy id whose associations are loaded.

    Returns:
        The constructed GOEnrichmentStudy (created with
        ``propagate_counts=False`` and ``alpha=0.05``).
    """
    godag = get_godag()
    id2gos = get_assoc_ncbi_taxids([taxid])
    study_kws = {
        'propagate_counts': False,
        'alpha': 0.05,
        'methods': [method],
    }
    # The DAG passed here is the third positional argument of the study object
    return GOEnrichmentStudy(geneids_pop, id2gos, godag, **study_kws)
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Collect ``_get_pval_uncorr`` results for each p-value function name.

    Args:
        pvalfnc_names: Iterable of names, each passed as ``pvalcalc`` to a
            fresh GOEnrichmentStudy.
        prt: Stream forwarded to the download, the study-gene reader, and
            ``_get_pval_uncorr``.

    Returns:
        dict mapping each name to the value returned by
        ``GOEnrichmentStudy._get_pval_uncorr`` for the study genes.
    """
    mouse_taxid = 10090  # Mouse study
    godag = GODag(download_go_basic_obo(prt=prt))
    population = GeneID2nt_mus.keys()
    id2gos = get_assoc_ncbi_taxids([mouse_taxid])
    study = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)

    def _pvals_for(pvalcalc):
        """Run one study object configured with the given p-value function."""
        goea = GOEnrichmentStudy(
            population, id2gos, godag,
            propagate_counts=False, alpha=0.05, methods=None, pvalcalc=pvalcalc)
        return goea._get_pval_uncorr(study, prt)

    return {name: _pvals_for(name) for name in pvalfnc_names}
def test_ncbi_gene2go(log=sys.stdout):
    """Check GO associations to Entrez GeneIDs. Download if necessary.

    Associations for human (9606), mouse (10090), and fly (7227) are
    collected per taxid, one report line is written for each taxid, and the
    counts are sanity-checked.

    Args:
        log: Writable text stream receiving one report line per taxid.

    Raises:
        AssertionError: If no per-taxid associations were collected, or a
            taxid has 11000 or fewer annotated GeneIDs or 6000 or fewer GOs.
    """
    # Local import keeps this block usable on its own
    from collections import defaultdict

    taxids = [9606, 10090, 7227]
    # get_assoc_ncbi_taxids returns id2gos; the per-taxid breakdown
    # (GeneID2GOs and GO2GeneIDs) is filled into the optional multi-level
    # dictionary, so that is what must be iterated below.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs)
    # Without this guard an empty result would skip the loop and pass silently
    assert taxid2asscs, 'taxid2asscs EMPTY'
    # Report findings
    for taxid, asscs in taxid2asscs.items():
        num_gene2gos = len(asscs['GeneID2GOs'])
        num_go2genes = len(asscs['GO2GeneIDs'])
        log.write("{N:>5} GOs and {M:>5} annotated GeneIDs for tax_id: {TAXID:>6}\n".format(
            TAXID=taxid, N=num_go2genes, M=num_gene2gos))
        # Basic check to ensure gene2go was downloaded and data was returned.
        assert num_gene2gos > 11000
        assert num_go2genes > 6000
def get_goeaobj(method, geneids_pop, taxid):
    """Create a GOEnrichmentStudy from the local go-basic.obo and one taxid.

    Args:
        method: Name used as the single entry of the ``methods`` list.
        geneids_pop: Population gene ids given to the study object.
        taxid: Taxonomy id whose associations are loaded.

    Returns:
        GOEnrichmentStudy built with ``propagate_counts=False`` and
        ``alpha=0.05``.
    """
    # The ontology file is looked up in the current working directory
    obo_path = os.path.join(os.getcwd(), "go-basic.obo")
    dag = get_godag(obo_path, loading_bar=None)
    id2gos = get_assoc_ncbi_taxids([taxid], loading_bar=None)
    return GOEnrichmentStudy(
        geneids_pop,
        id2gos,
        dag,
        propagate_counts=False,
        alpha=0.05,
        methods=[method],
    )
def test_i96():
    """Attempt to reproduce issue #96; the run is expected to complete.

    The issue being chased is:
    ValueError("All values in table must be nonnegative.
    The test makes no assertions; it succeeds when ``run_study`` returns
    without raising.
    """
    # Genes: study set and population
    study = _get_geneids()
    population = GeneID2nt.keys()
    # Databases: human associations and the ontology from the working directory
    id2gos = get_assoc_ncbi_taxids([9606], loading_bar=None)
    obo_path = os.path.join(os.getcwd(), "go-basic.obo")
    dag = get_godag(obo_path, loading_bar=None)
    goea = GOEnrichmentStudy(population, id2gos, dag, methods=['fdr_bh'])
    # Run the Gene Ontology Enrichment Analysis; the results are not inspected
    goea.run_study(study)
def get_goeaobj(method, geneids_pop, taxid):
    """Assemble a GOEnrichmentStudy: ontology, associations, population.

    Args:
        method: Name wrapped in a one-element list for ``methods``.
        geneids_pop: Population gene ids forwarded to the study object.
        taxid: Taxonomy id whose associations are fetched.

    Returns:
        The new GOEnrichmentStudy instance.
    """
    ontology = get_godag()
    associations = get_assoc_ncbi_taxids([taxid])
    methods = [method]
    study_obj = GOEnrichmentStudy(
        geneids_pop, associations, ontology,
        propagate_counts=False, alpha=0.05, methods=methods)
    return study_obj
def get_genes_cell_cycle(taxid=9606, log=sys.stdout):
    """Find the items associated with 'cell cycle' GO terms for a taxid.

    Steps: search GO terms for 'cell cycle', discard the ones that match
    'cell cycle.independent', add children GOs, then look up the items for
    the final GO set.  Both searches receive the open file
    ``cell_cycle_gos_<taxid>.log`` as ``prt``.

    Args:
        taxid: Taxonomy id whose associations are used (default 9606).
        log: Stream for the two summary lines, or None for no summary.

    Returns:
        Result of ``GoSearch.get_items`` on the final GO set (Entrez GeneIDs
        according to the original comments).
    """
    # Download ontologies and annotations, if necessary
    fin_go_obo = "go-basic.obo"
    have_obo = os.path.exists(fin_go_obo)
    if not have_obo:
        wget.download("http://geneontology.org/ontology/go-basic.obo")

    # The return value of get_assoc_ncbi_taxids is id2gos; the optional
    # multi-level dictionary provides GO2GeneIDs (and GeneID2GOs) by taxid.
    by_taxid = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([taxid], taxid2asscs=by_taxid)
    go2genes = by_taxid[taxid]['GO2GeneIDs']
    srch = GoSearch(fin_go_obo, go2items=go2genes)

    include_pat = re.compile(r'cell cycle', flags=re.IGNORECASE)
    fout_allgos = "cell_cycle_gos_{TAXID}.log".format(TAXID=taxid)
    with open(fout_allgos, "w") as prt:
        # Every GO that mentions 'cell cycle'
        hits = srch.get_matching_gos(include_pat, prt=prt)
        # Review found GO:0005764 (lysosome) among the hits because of the
        # text 'cell cycle-independent'; such matches are filtered out.
        exclude_pat = re.compile(r'cell cycle.independent', flags=re.IGNORECASE)
        false_hits = srch.get_matching_gos(exclude_pat, gos=hits, prt=prt)
        kept = hits.difference(false_hits)
        # Add children GOs of cell cycle GOs
        gos_all = srch.add_children_gos(kept)
        if log is not None:
            summary = (
                ' taxid {TAXID:>5}\n'.format(TAXID=taxid),
                ' FOUND {N:>5} GOs: {F}\n'.format(N=len(gos_all), F=fout_allgos),
            )
            for line in summary:
                log.write(line)
    return srch.get_items(gos_all)
def get_goeaobj(method, geneids_pop, taxid):
    """Return a GOEnrichmentStudy wired to the working-directory ontology.

    Args:
        method: Name supplied as the only element of ``methods``.
        geneids_pop: Population gene ids handed to the study object.
        taxid: Taxonomy id for which associations are loaded.

    Returns:
        The GOEnrichmentStudy instance (``propagate_counts=False``,
        ``alpha=0.05``).
    """
    cwd = os.getcwd()
    ontology = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)
    associations = get_assoc_ncbi_taxids([taxid], loading_bar=None)
    options = dict(propagate_counts=False, alpha=0.05, methods=[method])
    return GOEnrichmentStudy(geneids_pop, associations, ontology, **options)
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Run a GOEA on the mouse study genes and return its results.

    Args:
        godag: GO DAG passed to GOEnrichmentStudy.
        propagate_counts: Forwarded unchanged to GOEnrichmentStudy.
        relationships: Forwarded unchanged to GOEnrichmentStudy.
        prt: Stream forwarded to ``run_study``.

    Returns:
        The value returned by ``GOEnrichmentStudy.run_study``.
    """
    mouse_taxid = 10090  # Mouse study
    population = set(GeneID2nt_mus.keys())
    id2gos = get_assoc_ncbi_taxids([mouse_taxid], loading_bar=None)
    study = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    study_kws = {
        'propagate_counts': propagate_counts,
        'relationships': relationships,
        'alpha': 0.05,
        'methods': ['fdr_bh'],
    }
    goea = GOEnrichmentStudy(population, id2gos, godag, **study_kws)
    return goea.run_study(study, prt=prt)
def get_goeaobj(method, geneids_pop, taxid):
    """Load: ontologies, associations, and population geneids.

    Args:
        method: Name placed, as the only entry, in the ``methods`` list.
        geneids_pop: Population gene ids passed to GOEnrichmentStudy.
        taxid: Taxonomy id whose associations are loaded.

    Returns:
        GOEnrichmentStudy built with ``propagate_counts=False`` and
        ``alpha=0.05``.
    """
    fin_obo = "go-basic.obo"
    if not os.path.isfile(fin_obo):
        # wget.download takes a bare URL; a leading "wget " (shell command
        # syntax) makes the string an invalid URL and the download fails.
        wget.download("http://geneontology.org/ontology/go-basic.obo")
    obo_dag = GODag(fin_obo)
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    goeaobj = GOEnrichmentStudy(
        geneids_pop, assoc_geneid2gos, obo_dag,
        propagate_counts=False, alpha=0.05, methods=[method])
    return goeaobj
def test_write_summary_cnts(log=sys.stdout):
    """Write level/depth summaries for the DAG, per taxid, and as LaTeX.

    Args:
        log: Writable text stream; receives the section headers, is passed
            to RptLevDepth, and is the ``prt`` target of the LaTeX table.
    """
    obo_path = os.path.join(os.getcwd(), "go-basic.obo")
    dag = get_godag(obo_path, loading_bar=None)
    reporter = RptLevDepth(dag, log)

    # Summary covering every GO term in the DAG
    log.write("\nSummary for all Ontologies:\n")
    reporter.write_summary_cnts_all()

    # Human, fly, and mouse associations, kept apart by taxid in the
    # (optional) multi-level dictionary
    species = [9606, 7227, 10090]
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(species, taxid2asscs=taxid2asscs, loading_bar=None)

    header = "\nSummary for Ontologies for taxid({T}):\n"
    for taxid, assc in taxid2asscs.items():
        go_ids = assc['GO2GeneIDs'].keys()
        log.write(header.format(T=taxid))
        reporter.write_summary_cnts(go_ids)
        log.write(header.format(T=taxid))
        # dag.get() yields None for ids absent from the DAG instead of raising
        terms = [dag.get(goid) for goid in go_ids]
        reporter.write_summary_cnts_goobjs(terms)

    # GO depth count table for the full GO DAG in LaTeX format
    reporter.prttex_summary_cnts_all(prt=log)
def test_ncbi_gene2go(log=sys.stdout):
    """Check GO associations to Entrez GeneIDs. Download if necessary.

    Associations for human (9606), mouse (10090), and fly (7227) are
    collected per taxid; each taxid gets one report line and a sanity check
    on its counts.

    Args:
        log: Writable text stream receiving one report line per taxid.

    Raises:
        AssertionError: If no per-taxid associations were collected, or a
            taxid has 11000 or fewer annotated GeneIDs or 6000 or fewer GOs.
    """
    # Local import keeps this block usable on its own
    from collections import defaultdict

    taxids = [9606, 10090, 7227]
    # The return value of get_assoc_ncbi_taxids is id2gos, so the per-taxid
    # GeneID2GOs/GO2GeneIDs data is requested through the optional
    # multi-level dictionary and read from there.
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(taxids, taxid2asscs=taxid2asscs)
    # An empty result would otherwise skip the loop and pass without checks
    assert taxid2asscs, 'taxid2asscs EMPTY'
    # Report findings
    for taxid, asscs in taxid2asscs.items():
        num_gene2gos = len(asscs['GeneID2GOs'])
        num_go2genes = len(asscs['GO2GeneIDs'])
        log.write(
            "{N:>5} GOs and {M:>5} annotated GeneIDs for tax_id: {TAXID:>6}\n".
            format(TAXID=taxid, N=num_go2genes, M=num_gene2gos))
        # Basic check to ensure gene2go was downloaded and data was returned.
        assert num_gene2gos > 11000
        assert num_go2genes > 6000
def test_write_summary_cnts(log=sys.stdout):
    """Write level/depth summaries for the DAG, per taxid, and as LaTeX.

    Args:
        log: Writable text stream; receives the section headers, is passed
            to RptLevDepth, and is the ``prt`` target of the LaTeX table.

    Raises:
        AssertionError: If no per-taxid associations were collected.
    """
    dag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    reporter = RptLevDepth(dag, log)

    # Whole-DAG summary
    log.write("\nSummary for all Ontologies:\n")
    reporter.write_summary_cnts_all()

    # Per-taxid associations for human, fly, and mouse
    species = [9606, 7227, 10090]
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids(species, taxid2asscs=taxid2asscs, loading_bar=None)
    assert taxid2asscs, 'taxid2asscs EMPTY'

    header = "\nSummary for Ontologies for taxid({T}):\n"
    for taxid, assc in taxid2asscs.items():
        # NOTE(review): the nested defaultdict returns an empty mapping for
        # an unknown key, so a wrong key name would go unnoticed -- confirm
        # that 'GO2IDs' is the key get_assoc_ncbi_taxids actually fills in.
        go_ids = assc['GO2IDs'].keys()
        log.write(header.format(T=taxid))
        reporter.write_summary_cnts(go_ids)
        log.write(header.format(T=taxid))
        terms = [dag.get(goid) for goid in go_ids]
        reporter.write_summary_cnts_goobjs(terms)

    # GO depth count table for the full GO DAG in LaTeX format
    reporter.prttex_summary_cnts_all(prt=log)
def get_goeaobj(method, geneids_pop, taxid):
    """Load: ontologies, associations, and population geneids.

    Args:
        method: Name placed, as the only entry, in the ``methods`` list.
        geneids_pop: Population gene ids passed to GOEnrichmentStudy.
        taxid: Taxonomy id whose associations are loaded.

    Returns:
        GOEnrichmentStudy built with ``propagate_counts=False`` and
        ``alpha=0.05``.
    """
    fin_obo = "go-basic.obo"
    if not os.path.isfile(fin_obo):
        # Pass only the URL: prefixing it with "wget " (as one would type in
        # a shell) produces an invalid URL for wget.download.
        wget.download("http://geneontology.org/ontology/go-basic.obo")
    obo_dag = GODag(fin_obo)
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    goeaobj = GOEnrichmentStudy(
        geneids_pop,
        assoc_geneid2gos,
        obo_dag,
        propagate_counts=False,
        alpha=0.05,
        methods=[method])
    return goeaobj
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Run one GOEA for the mouse study and hand back what it returns.

    Args:
        godag: GO DAG given to GOEnrichmentStudy.
        propagate_counts: Passed through to GOEnrichmentStudy.
        relationships: Passed through to GOEnrichmentStudy.
        prt: Stream passed to ``run_study``.

    Returns:
        Whatever ``GOEnrichmentStudy.run_study`` returns for the study genes.
    """
    taxid = 10090  # Mouse study
    pop_ids = set(GeneID2nt_mus.keys())
    associations = get_assoc_ncbi_taxids([taxid], loading_bar=None)
    study_ids = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    return GOEnrichmentStudy(
        pop_ids,
        associations,
        godag,
        propagate_counts=propagate_counts,
        relationships=relationships,
        alpha=0.05,
        methods=['fdr_bh'],
    ).run_study(study_ids, prt=prt)
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Map each p-value function name to its ``_get_pval_uncorr`` result.

    Args:
        pvalfnc_names: Iterable of names; each is used as ``pvalcalc`` for a
            separately constructed GOEnrichmentStudy.
        prt: Stream passed to the ontology download, the study-gene reader,
            and ``_get_pval_uncorr``.

    Returns:
        dict keyed by name, holding what ``_get_pval_uncorr`` returned.
    """
    taxid = 10090  # Mouse study
    dag = GODag(download_go_basic_obo(prt=prt))
    pop_ids = GeneID2nt_mus.keys()
    associations = get_assoc_ncbi_taxids([taxid])
    study_ids = _get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx", prt)
    # Settings shared by every study object; only pvalcalc varies
    shared = dict(propagate_counts=False, alpha=0.05, methods=None)
    name2pvals = {}
    for pvalcalc in pvalfnc_names:
        goea = GOEnrichmentStudy(pop_ids, associations, dag, pvalcalc=pvalcalc, **shared)
        name2pvals[pvalcalc] = goea._get_pval_uncorr(study_ids, prt)
    return name2pvals
def _get_pvals(pvalfnc_names, prt=sys.stdout):
    """Map each p-value function name to its ``get_pval_uncorr`` result.

    Args:
        pvalfnc_names: Iterable of names; each is used as ``pvalcalc`` for a
            separately constructed GOEnrichmentStudy.
        prt: Stream passed to ``get_godag`` and ``get_pval_uncorr``.

    Returns:
        dict keyed by name, holding what ``get_pval_uncorr`` returned.
    """
    mouse_taxid = 10090  # Mouse study
    obo_path = os.path.join(os.getcwd(), "go-basic.obo")
    dag = get_godag(obo_path, prt, loading_bar=None)
    pop_ids = GeneID2nt_mus.keys()
    associations = get_assoc_ncbi_taxids([mouse_taxid], loading_bar=None)
    study_ids = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")

    def _run(pvalcalc):
        """Build one study object for pvalcalc and fetch its p-values."""
        goea = GOEnrichmentStudy(
            pop_ids, associations, dag,
            propagate_counts=False, alpha=0.05, methods=None, pvalcalc=pvalcalc)
        return goea.get_pval_uncorr(study_ids, prt)

    return {name: _run(name) for name in pvalfnc_names}
def test_ncbi_gene2go(log=sys.stdout):
    """Report GO associations to Entrez GeneIDs. Download if necessary.

    Example report generated with Feb 22, 2013 download of:
    NCBI Gene tables and associations in gene2go

        49672 items found in gene2go from NCBI's ftp server

        taxid    GOs GeneIDs  Description
        -----  ------ ------- -----------
        10090  16,807  18,971 all DNA items
         7227   7,022  12,019 all DNA items
         7227   6,956  10,590 76% GO coverage of 13,919 protein-coding genes
         9606  16,299  18,680 all DNA items
         9606  16,296  18,253 87% GO coverage of 20,913 protein-coding genes

    Args:
        log: Writable text stream for the report; also given to rpt_coverage.

    Raises:
        AssertionError: If a taxid has 11000 or fewer annotated GeneIDs or
            6000 or fewer GOs.
    """
    # Human (9606), mouse (10090), and fly (7227); the multi-level dictionary
    # keeps associations separate per taxid, the return value is id2gos.
    by_taxid = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    requested = [9606, 10090, 7227]
    id2gos = get_assoc_ncbi_taxids(requested, taxid2asscs=by_taxid, loading_bar=None)
    log.write(" {N} items found in gene2go from NCBI's ftp server\n".format(
        N=len(id2gos)))
    # Taxids that additionally get a coverage report
    coverage_refs = {9606: GeneID2nt_hsa, 7227: GeneID2nt_dme}

    # Report header
    log.write(" taxid GOs GeneIDs Description\n")
    log.write(" ----- ------ ------- -----------\n")
    row = " {TAXID:>6} {N:>6,} {M:>7,} all DNA items\n"
    for taxid, asscs in by_taxid.items():
        n_genes = len(asscs['GeneID2GOs'])
        n_gos = len(asscs['GO2GeneIDs'])
        log.write(row.format(TAXID=taxid, N=n_gos, M=n_genes))
        # Basic check to ensure gene2go was downloaded and data was returned.
        assert n_genes > 11000
        assert n_gos > 6000
        if taxid in coverage_refs:
            rpt_coverage(taxid, asscs, coverage_refs[taxid], log)
def test_ncbi_gene2go(log=sys.stdout):
    """Report GO associations to Entrez GeneIDs. Download if necessary.

    Example report generated with Feb 22, 2013 download of:
    NCBI Gene tables and associations in gene2go

        49672 items found in gene2go from NCBI's ftp server

        taxid    GOs GeneIDs  Description
        -----  ------ ------- -----------
        10090  16,807  18,971 all DNA items
         7227   7,022  12,019 all DNA items
         7227   6,956  10,590 76% GO coverage of 13,919 protein-coding genes
         9606  16,299  18,680 all DNA items
         9606  16,296  18,253 87% GO coverage of 20,913 protein-coding genes

    Args:
        log: Writable text stream for the report; also given to rpt_coverage.

    Raises:
        AssertionError: If a taxid has 11000 or fewer annotated GeneIDs or
            6000 or fewer GOs.
    """
    # Per-taxid associations for human (9606), mouse (10090), and fly (7227)
    # go into the multi-level dictionary; the call itself returns id2gos.
    per_taxid = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    id2gos = get_assoc_ncbi_taxids(taxids=[9606, 10090, 7227], taxid2asscs=per_taxid)
    total = len(id2gos)
    log.write(" {N} items found in gene2go from NCBI's ftp server\n".format(N=total))
    # Reference gene tables for the taxids that get a coverage report
    taxid2ref = {9606: GeneID2nt_hsa, 7227: GeneID2nt_dme}

    # Report findings
    for header_line in (" taxid GOs GeneIDs Description\n",
                        " ----- ------ ------- -----------\n"):
        log.write(header_line)
    for taxid, asscs in per_taxid.items():
        cnt_genes = len(asscs['GeneID2GOs'])
        cnt_gos = len(asscs['GO2GeneIDs'])
        log.write(" {TAXID:>6} {N:>6,} {M:>7,} all DNA items\n".format(
            TAXID=taxid, N=cnt_gos, M=cnt_genes))
        # Basic check to ensure gene2go was downloaded and data was returned.
        assert cnt_genes > 11000
        assert cnt_gos > 6000
        if taxid not in taxid2ref:
            continue
        rpt_coverage(taxid, asscs, taxid2ref[taxid], log)