def test_gosubdag_relationships(wr_new_obo_subset=False):
    """Check GoSubDag ancestors with and without optional relationships.

    wr_new_obo_subset: when true, _wr_sub_obo is also called to write the
    subset obo file for the chosen GO ID.
    """
    # Leaf GO: viral triggering of virus induced gene silencing
    leaf_goid = 'GO:0060150'

    # Load the ontology twice: plain (r0) and with 'relationship' loaded (r1)
    obo_path = os.path.join(REPO, "go-basic.obo")
    dag_plain = get_godag(obo_path, loading_bar=None)
    dag_rels = get_godag(obo_path, loading_bar=None, optional_attrs=['relationship'])

    subset_obo = os.path.join(REPO, "tests/data/viral_gene_silence.obo")
    if wr_new_obo_subset:
        _wr_sub_obo(subset_obo, leaf_goid, dag_rels, obo_path)

    subdag_plain = GoSubDag({leaf_goid}, dag_plain)
    subdag_rels = GoSubDag({leaf_goid}, dag_rels, relationships=True)
    _run_baseline_r0(subdag_plain, subdag_rels)

    # BASELINE r1: GOTerm.get_all_upper() must equal the GoSubDag ancestors
    for goid, goterm in subdag_rels.go2obj.items():
        from_subdag = subdag_rels.rcntobj.go2parents[goid]
        assert from_subdag == goterm.get_all_upper()
def test_pc_w_rels(prt=sys.stdout):
    """Compare propagate-counts results with and without relationships.

    Both runs use the GODag loaded with the optional 'relationship'
    attribute; only the ``relationships`` flag given to _get_results differs.

    prt: stream passed to the loader and to the helper functions.
    """
    file_obo = os.path.join(REPO, "go-basic.obo")
    # A second GODag without relationships used to be loaded here but was
    # never referenced, so that extra load of the ontology file is dropped.
    godag_r1 = get_godag(file_obo, prt, loading_bar=None, optional_attrs=['relationship'])
    results_r0 = _get_results(godag_r1, propagate_counts=True, relationships=False, prt=prt)
    results_r1 = _get_results(godag_r1, propagate_counts=True, relationships=True, prt=prt)
    _chk_results(results_r0, results_r1, prt)
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from the small test association and population.

    methods: forwarded unchanged as the ``methods`` argument of
    GOEnrichmentStudy.

    Returns the GOEnrichmentStudy object.
    """
    obo_fin = "{REPO}/go-basic.obo".format(REPO=REPO)
    if not os.path.isfile(obo_fin):
        # Target the same path that is checked above and loaded below;
        # the bare "go-basic.obo" used before only matched when cwd == REPO.
        get_godag(obo_fin)
    obo_dag = GODag(obo_fin)
    assoc = read_associations(
        "{REPO}/tests/data/small_association".format(REPO=REPO), no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # Close the population file deterministically instead of leaking the handle
    with open(popul_fin) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
def test_semsim_wang(prt=stdout):
    """Check the edge weights (w_e) SsWang keeps for various relationship settings."""
    obo_path = join(REPO, 'go-basic.obo')

    # ---- GODag loaded without the optional relationships ----
    dag_isa = get_godag(obo_path, prt=prt)
    # Requesting 'part_of' on this DAG is expected to raise RuntimeError
    saw_error = False
    try:
        wang = SsWang({}, dag_isa, {'part_of'})
    except RuntimeError as err:
        assert str(err) == '**ERROR: SsWang GODag not loaded with relationships', '({})'.format(err)
        saw_error = True
    assert saw_error
    wang = SsWang({}, dag_isa)
    assert wang.w_e == {'is_a': 0.8}
    wang = SsWang({}, dag_isa, rel2scf={'is_a': 0.9, 'part_of': 0.7})
    assert wang.w_e == {'is_a': 0.9}

    # ---- GODag loaded with relationships ----
    dag_rel = get_godag(obo_path, optional_attrs=['relationship'], prt=prt)
    _chk_relationships(dag_rel)

    # 'part_of' requested: (extra keyword arguments, expected w_e)
    relationships = {'part_of'}
    # pylint: disable=line-too-long
    with_partof = [
        ({'rel2scf': {}}, {'is_a': 0.8, 'part_of': 0.6}),
        ({'rel2scf': {'is_a': 0.9, 'part_of': 0.7}}, {'is_a': 0.9, 'part_of': 0.7}),
        ({'rel2scf': {'is_a': 0.9, 'part_of': 0.7, 'regulates': 0.2}}, {'is_a': 0.9, 'part_of': 0.7}),
    ]
    for kwargs, expected in with_partof:
        wang = SsWang({}, dag_rel, relationships, **kwargs)
        assert wang.w_e == expected

    # No usable relationship requested: only 'is_a' is weighted
    isa_only = [
        ({}, {'is_a': 0.8}),
        ({'rel2scf': {'is_a': 0.9, 'part_of': 0.7}}, {'is_a': 0.9}),
        ({'rel2scf': {'is_a': 0.9, 'part_of': 0.7, 'regulates': 0.2}}, {'is_a': 0.9}),
        ({'relationships': {'mock_rel'}}, {'is_a': 0.8}),
    ]
    for kwargs, expected in isa_only:
        wang = SsWang({}, dag_rel, **kwargs)
        assert wang.w_e == expected
    print('**PASSED: Properly reported ERROR in relationship, mock_rel')

    wang = SsWang({}, dag_rel, rel2scf={'mock_rel': .7})
    assert wang.w_e == {'is_a': 0.8}
def test_find_enrichment():
    """Run GoeaCliFnc with the ArgsDict arguments and check the result counts."""
    cli_args = ArgsDict()
    # Load the ontology named in the arguments before running the CLI function
    get_godag(cli_args.namespace['obo'], loading_bar=None)
    cli_args.namespace['indent'] = True

    # Run test
    objcli = GoeaCliFnc(cli_args.ntobj(**cli_args.namespace))

    # Check results
    # (previous expectation: fdr_bh 17, sidak 5, holm 5, bonferroni 5)
    expected_cnts = {
        'fdr_bh': 19,
        'sidak': 9,
        'holm': 9,
        'bonferroni': 9,
    }
    _chk_results(objcli.results_all, expected_cnts, objcli)
    print("TEST PASSED")
def test_find_enrichment(run_all=False):
    """Run every command from _get_cmds(); does nothing unless run_all is true."""
    if not run_all:
        print('RUN THIS TEST WITH AN ARGUMENT')
        return

    # Load the ontology and fetch the human annotations before the commands run
    obo_path = join(REPO, 'go-basic.obo')
    get_godag(obo_path, optional_attrs={'relationship'}, loading_bar=None)
    gaf_path = join(REPO, 'goa_human.gaf')
    dnld_annotation(gaf_path)

    banner = '------------------- TEST {I} ------------------------------------'
    for idx, cmd in enumerate(_get_cmds()):
        print(banner.format(I=idx))
        print('CMD: {CMD}'.format(CMD=cmd))
        # Every command must exit with status 0
        assert system(cmd) == 0
    print("TEST PASSED")
def test_update_association():
    """Propagate counts three ways and compare the resulting associations.

    The GODag method, the stand-alone update_association function and the
    annotation object's get_id2gos(propagate_counts=True) are each timed;
    their results are then compared using _chk_assc.
    """
    print('\n1) READ GODAG:')
    assc_name = "goa_human.gaf"  # alternatives: gene_association.fb gene_association.mgi
    obo = os.path.join(REPO, "go-basic.obo")
    tic = timeit.default_timer()
    godag = get_godag(obo)
    tic = prt_hms(tic, "Created two GODags: One for original and one for new propagate counts")

    print('\n2) READ ANNOTATIONS:')
    fin_anno = os.path.join(REPO, assc_name)
    assc_orig = dnld_assc(fin_anno, godag)
    tic = prt_hms(tic, "Associations Read")
    objanno = get_objanno(fin_anno, 'gaf', godag=godag)
    tic = prt_hms(tic, "Associations Read")

    print('\n3) MAKE COPIES OF ASSOCIATIONS:')
    # Independent copies so each in-place update starts from the same data
    assc1 = {}
    assc2 = {}
    for gene, gos in assc_orig.items():
        assc1[gene] = set(gos)
    for gene, gos in assc_orig.items():
        assc2[gene] = set(gos)
    tic = prt_hms(tic, "Associations Copied: One for original and one for new")

    print('\n4) UPDATE ASSOCIATIONS (PROPAGATE COUNTS):')
    godag.update_association(assc1)
    tic = prt_hms(tic, "ORIG: godag.update_association(assc)")
    update_association(assc2, godag)
    tic = prt_hms(tic, "NEW SA: update_association(go2obj, assc_orig)")
    assc3 = objanno.get_id2gos(namespace='BP', propagate_counts=True)
    tic = prt_hms(tic, "NEW BASE: update_association(go2obj, assc_orig)")

    print('\n5) RUN CHECKS')
    for assc_new in (assc2, assc3):
        _chk_assc(assc1, assc_new)
    _chk_godag(godag, obo)
def test_all(prt=sys.stdout):
    """Test initialization and operation of CountRelatives for GO term branch(s) visualization.

    prt: stream used both while loading the GODag and by the depth-02
    letters check. The loader previously ignored this parameter and always
    wrote to sys.stdout; the default keeps that behavior.
    """
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), prt=prt)
    rcntobj = CountRelatives(godag)
    _wr_xlsx_d1(rcntobj)
    _run_get_letters_d1(rcntobj)
    _run_get_letters_d2(godag, rcntobj, prt)
def test_go_parents():
    """Run both GO parent checks (run_1, run_2) on one full GoSubDag."""
    godag = get_godag("go-basic.obo", prt=None)
    # None as the GO ID source; rcntobj=True is passed through to GoSubDag
    gosubdag_all = GoSubDag(None, godag, rcntobj=True)
    for run_fnc in (run_1, run_2):
        run_fnc(gosubdag_all)
def main(prt=sys.stdout):
    """Print statistics for the protein-coding mouse gene association.

    prt: stream that receives the two statistics tables and the coverage line.
    """
    godag = get_godag()
    # Population genes
    population = ensm2nt.keys()
    params = {
        'association_file': os.path.join(REPO, 'gene_association.mgi'),
        'genes_population': population,
    }
    objassc = DataAssc(params, godag)
    geneid2gos = objassc.objassc_all.assc_geneid2gos

    # Number of genes per GO in the mouse association for protein-coding genes
    go2numgenes = {}
    for goid, genes in objassc.go2genes.items():
        go2numgenes[goid] = len(genes)
    objdesc = StatsDescribe("GOs", "{:>5.0f}")
    objdesc.prt_hdr(prt, name="\nname ")
    objdesc.prt_data("# genes/GO", go2numgenes.values(), prt)

    # Number of GOs per gene in the mouse association for protein-coding genes
    gene2numgos = {}
    for gene, gos in geneid2gos.items():
        gene2numgos[gene] = len(gos)
    objdesc = StatsDescribe("genes", "{:>5.0f}")
    objdesc.prt_hdr(prt, name="\nname ")
    objdesc.prt_data("# GOs/gene", gene2numgos.values(), prt)

    # Percentage of Ensembl mouse genes covered by GO annotations
    num_pc = len(params['genes_population'])
    num_assc = len(geneid2gos)
    pattern = "{PERC:2.0f}% of {A} of {P} Mouse protein-coding genes are annotated by GO IDs.\n"
    prt.write(pattern.format(PERC=100.0 * num_assc / num_pc, P=num_pc, A=num_assc))
def __init__(self, taxid, fin_gene2go, fin_gobasic):
    """Load the gene2go annotations for one taxid and the GODag.

    taxid: taxon id passed (as a single-item list) to read_ncbi_gene2go.
    fin_gene2go: annotation file name, joined onto REPO.
    fin_gobasic: ontology file name, joined onto REPO.
    """
    anno_path = os.path.join(REPO, fin_gene2go)
    obo_path = os.path.join(REPO, fin_gobasic)
    # Annotations first: download step, then read for the requested taxid
    dnld_ncbi_gene_file(anno_path, loading_bar=None)
    self.gene2go = read_ncbi_gene2go(anno_path, [taxid])
    self.godag = get_godag(obo_path, loading_bar=None)
def test_get_children(prt=sys.stdout):
    """Compare GOTerm.get_all_children() with get_id2children (Issue #86).

    prt: stream handed to prt_hms for the timing messages.
    """
    # Load GO-DAG from the directory above this file
    here = os.path.dirname(os.path.abspath(__file__))
    godag = get_godag(os.path.join(os.path.join(here, ".."), "go-basic.obo"))
    # Keep only entries whose key is the term's own id
    go2obj = {}
    for goid, goterm in godag.items():
        if goid == goterm.id:
            go2obj[goid] = goterm

    # Children per term, one GOTerm.get_all_children() call at a time
    tic = timeit.default_timer()
    go2children_orig = {}
    go2children_empty = set()
    for goterm in go2obj.values():
        kids = goterm.get_all_children()
        if not kids:
            go2children_empty.add(goterm.id)
            continue
        go2children_orig[goterm.id] = kids
    tic = prt_hms(tic, "Get all goobj's children using GOTerm.get_all_children()", prt)

    # Children for all terms in a single get_id2children call
    go2children_fast = get_id2children(go2obj.values())
    prt_hms(tic, "Get all goobj's children using go_tasks::get_id2children", prt)

    # Compare children lists
    checker = CheckGOs('test_get_children', go2obj)
    checker.chk_a2bset(go2children_orig, go2children_fast)
def test_gpad_read(run_desc="mouse", prt=sys.stdout):
    """Read GPAD files (GOA source http://www.ebi.ac.uk/GOA) and report extensions.

    run_desc: passed to _dnld_gpad to select what is downloaded.
    prt: stream that receives the report.
    """
    species2gpad = _dnld_gpad(DnldGoa(), run_desc)
    # Annotation Extension relation counts summed over all species
    relations = cx.Counter()
    godag = get_godag()
    pat = "{N:8,} of {M:8,} {P:5.2f}% associations have Annotation Extensions in {ORG}\n"
    for org, gpad_file in sorted(species2gpad.items()):
        header = "{ORG} {GPAD}".format(ORG=org, GPAD=os.path.basename(gpad_file))
        prt.write("\n{GPAD}\n".format(GPAD=header))
        objgpad = GpadReader(gpad_file, godag=godag)
        # Assertions are present in the GPAD reader class
        for ntgpad in objgpad.associations:
            if ntgpad.Extension:
                relations += ntgpad.Extension.get_relations_cnt()
        # The Extensions field is new in GPAD
        num_ext = sum(1 for nt in objgpad.associations if nt.Extension is not None)
        percent = 100. * num_ext / objgpad.qty
        prt.write(pat.format(N=num_ext, M=objgpad.qty, P=percent, ORG=org))
        for rel, cnt in objgpad.get_relation_cnt().most_common():
            prt.write(" {C:6,} {R}\n".format(C=cnt, R=rel))
    summary = "\n{N} Annotation Extensions Relations found among all species:\n"
    prt.write(summary.format(N=len(relations)))
    for rel, cnt in relations.most_common():
        prt.write("{C:10,} {R}\n".format(C=cnt, R=rel))
def test_wr_sections_all():
    """Write sections as a text file and a Python file, then read both back.

    NOTE(review): no comparison of the re-read sections is visible in this
    block -- confirm whether the checks live elsewhere.
    """
    f_sec_rd = "data/gjoneska_pfenning/sections_in.txt"
    f_sec_wr = "tmp_test_sections_out.txt"  # Travis-CI path is cwd
    f_sec_py = "tmp_test_sections.py"

    # User GO IDs come from the GOEA results; sections from the input file
    usrgos = [nt.GO for nt in goea_results]
    sec_rd = _read_sections(f_sec_rd)

    # Preliminaries: ontology with relationships, sub-DAG, grouping defaults
    godag = get_godag("go-basic.obo", prt=None, loading_bar=False,
                      optional_attrs=['relationship'])
    gosubdag = GoSubDag(usrgos, godag, relationships=True, tcntobj=None)
    grprdflt = GrouperDflts(gosubdag)

    # The last item of sec_rd is the ungrouped "Misc." section: leave it out
    named_sections = sec_rd[:-1]
    misc_name = sec_rd[-1][0]
    hdrobj = HdrgosSections(gosubdag, grprdflt.hdrgos_dflt, named_sections)
    assert misc_name == hdrobj.secdflt, misc_name
    grprobj = Grouper("test", usrgos, hdrobj, gosubdag)

    # Create text and Python sections files
    WrSectionsTxt(grprobj).wr_txt_section_hdrgos(os.path.join(REPO, f_sec_wr))
    WrSectionsPy(grprobj).wr_py_sections(
        os.path.join(REPO, f_sec_py), sec_rd, doc=godag.version)

    # Read text and Python sections files
    sec_wr = _read_sections(f_sec_wr)
    sec_py = _read_sections(f_sec_py)
def test_find_enrichment():
    """Run an enrichment with each of the annotation file formats."""
    godag = get_godag("go-basic.obo", optional_attrs=['relationship'])
    # GO IDs related to humoral response
    gos = _get_enriched_goids('GO:0006959', godag)
    # pylint: disable=superfluous-parens
    print('- DOWNLOAD AND LOAD -----------------------------------------------')
    annoobjs = [
        _get_objanno('gene2go', taxid=10090),
        _get_objanno('gene2go', taxid=9606),
        _get_objanno('goa_human.gaf'),
        _get_objanno('goa_human.gpad', godag=godag),
        _get_objanno('data/association', anno_type='id2gos', godag=godag),
    ]
    for annoobj in annoobjs:
        ns2assc = annoobj.get_ns2assc()
        # Population: the IDs of every namespace association, chained together
        pop = list(itertools.chain.from_iterable(ns2assc.values()))
        print('{N:6,} population IDs'.format(N=len(pop)))
        # Study: IDs annotated with any of the chosen GO IDs
        enriched = set()
        for ntanno in annoobj.associations:
            if ntanno.GO_ID in gos:
                enriched.add(ntanno.DB_ID)
        # All namespaces
        results = _get_objgoeans(pop, ns2assc, godag).run_study(enriched)
        print('{N} results'.format(N=len(results)))
        # Run one branch
        results_bp = _get_objgoeans(pop, {'BP': ns2assc['BP']}, godag).run_study(enriched)
        print('{N} results'.format(N=len(results_bp)))
    print("TEST PASSED")
def test_find_enrichment():
    """Run enrichments using all annotation file formats."""
    godag = get_godag("go-basic.obo", optional_attrs=['relationship'])
    gos = _get_enriched_goids('GO:0006959', godag)  # GO IDs related to humoral response
    # pylint: disable=superfluous-parens
    print('- DOWNLOAD AND LOAD -----------------------------------------------')
    annoobjs = [
        _get_objanno('gene2go', taxid=10090),
        _get_objanno('gene2go', taxid=9606),
        _get_objanno('goa_human.gaf'),
        _get_objanno('goa_human.gpad', godag=godag),
        _get_objanno('data/association', anno_type='id2gos', godag=godag),
    ]
    results_pat = '{N} results'
    for obj in annoobjs:
        ns2assc = obj.get_ns2assc()
        chained = itertools.chain.from_iterable(ns2assc.values())
        pop = list(chained)
        print('{N:6,} population IDs'.format(N=len(pop)))
        enriched = {nt.DB_ID for nt in obj.associations if nt.GO_ID in gos}
        # Study run over every namespace
        objgoea_all = _get_objgoeans(pop, ns2assc, godag)
        results_all = objgoea_all.run_study(enriched)
        print(results_pat.format(N=len(results_all)))
        # Study run over the BP branch only
        objgoea_bp = _get_objgoeans(pop, {'BP': ns2assc['BP']}, godag)
        results_bp = objgoea_bp.run_study(enriched)
        print(results_pat.format(N=len(results_bp)))
    print("TEST PASSED")
def _get_godag():
    """Return the GODag for REPO's go-basic.obo, loaded with 'relationship'."""
    load_kws = {
        'prt': None,
        'loading_bar': False,
        'optional_attrs': ['relationship'],
    }
    return get_godag(os.path.join(REPO, 'go-basic.obo'), **load_kws)
def test_get_lowerselect(prt=sys.stdout):
    """Compare get_all_lowerselect with get_id2lowerselect for each relationship combo.

    prt: stream handed to prt_hms for the timing messages.
    """
    # Load GO-DAG (with relationships) from the directory above this file
    parent_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
    obo_path = os.path.join(parent_dir, 'go-basic.obo')
    godag = get_godag(obo_path, optional_attrs='relationship')
    run = RelationshipCombos(godag)
    run.chk_relationships_all()
    rels_combo = run.get_relationship_combos()
    print('{N} COMBINATIONS OF RELATIONSHIPS'.format(N=len(rels_combo)))
    for relidx, rels_set in enumerate(rels_combo, 1):
        print('{I}) RELATIONSHIPS[{N}]: {Rs}'.format(
            I=relidx, N=len(rels_set), Rs=' '.join(sorted(rels_set))))

        # One get_all_lowerselect call per GO term
        tic = timeit.default_timer()
        go2lowerselect_orig = {}
        for goobj in run.go2obj.values():
            go2lowerselect_orig[goobj.item_id] = get_all_lowerselect(goobj, rels_set)
        tic = prt_hms(tic, "Get all goobj's parents using get_all_lowerselect(GOTerm)", prt)

        # A single get_id2lowerselect call for all GO terms
        go2lowerselect_fast = get_id2lowerselect(run.go2obj.values(), rels_set)
        tic = prt_hms(tic, "Get all goobj's parents using go_tasks::get_id2lowerselect", prt)

        # Compare the two results: EXPECTED, ACTUAL
        CheckGOs('test_get_lower_select', godag).chk_a2bset(
            go2lowerselect_orig, go2lowerselect_fast)
        print("PASSED: get_lowerselect RELATIONSHIPS[{N}]: {Rs}".format(
            N=len(rels_set), Rs=' '.join(sorted(rels_set))))
def test_alt_id():
    """Ensure that every alternate GO ID is also a key of the GODag."""
    obo_dag = get_godag("go-basic.obo", loading_bar=None)
    alt_ids = get_altids(obo_dag)
    dag_keys = set(obo_dag.keys())
    # The IDs shared with the DAG keys must number as many as alt_ids itself
    shared = alt_ids.intersection(dag_keys)
    assert len(shared) == len(alt_ids)
def test_write_hier_bp_mf_cc():
    """Test that write hierarchy writes all of BP, MF and CC."""
    fin_anno = os.path.join(REPO, 'gene2go')
    fin_dag = os.path.join(REPO, "go-basic.obo")
    _dnld_anno(fin_anno)
    print('\nTEST STORING ONLY ONE SPECIES')
    godag = get_godag(fin_dag)
    gene2gos = read_annotations(namespace='ALL')
    # Term counts are only built when annotations were returned
    if gene2gos:
        tcntobj = TermCounts(godag, gene2gos)
    else:
        tcntobj = None
    gosubdag = GoSubDag(
        godag.keys(), godag,
        relationships=False, tcntobj=tcntobj, children=True, prt=sys.stdout)
    objwr = WrHierGO(gosubdag)
    # Line counts seen in 2020 11:
    #   594,748 GO lines under GO:0008150
    #    23,199 GO lines under GO:0003674
    #     6,259 GO lines under GO:0005575
    #   624,206 items WROTE: tmp_test_wr_hier_BP_MF_CC.txt
    minimums = (
        (['BP', 'MF', 'CC'], 600000),
        (['BP'], 500000),
        (['MF'], 20000),
        (['CC'], 5000),
    )
    for namespaces, min_items in minimums:
        assert len(_wr_hier(namespaces, gosubdag.go2nt, objwr)) > min_items
def test_all(prt=sys.stdout):
    """Exercise CountRelatives: write the depth-01 xlsx and run the letter checks.

    prt: stream passed to the depth-02 letters check only; the GODag itself
    is loaded with prt=None.
    """
    godag = get_godag("go-basic.obo", prt=None)
    relatives = CountRelatives(godag)
    for depth01_fnc in (_wr_xlsx_d1, _run_get_letters_d1):
        depth01_fnc(relatives)
    _run_get_letters_d2(godag, relatives, prt)
def cli(self, prt=sys.stdout):
    """Command-line interface: group user GO IDs into sections and write them out.

    prt: stream that receives the version lines and is used when reading the
    user GO IDs. The final user-GO count always goes to sys.stdout.
    """
    kws = self.objdoc.get_docargs(prt=None)
    ifile = kws['ifile']
    godag = get_godag(kws['obo'], prt=None, loading_bar=False,
                      optional_attrs=['relationship'])
    usrgos = GetGOs(godag, max_gos=200).get_usrgos(kws.get('GO_FILE'), prt)
    tcntobj = self._get_tcntobj(usrgos, godag, **kws)  # TermCounts or None
    self.gosubdag = GoSubDag(usrgos, godag, relationships=True,
                             tcntobj=tcntobj, prt=None)
    grprdflt = GrouperDflts(self.gosubdag, kws['slims'])
    ver_list = [godag.version, grprdflt.ver_goslims]
    prt.write("{VER}\n".format(VER="\n".join(ver_list)))
    sections = self._read_sections(ifile)
    hdrobj = HdrgosSections(self.gosubdag, grprdflt.hdrgos_dflt, sections)
    grprobj = Grouper("init", usrgos, hdrobj, self.gosubdag)

    # Write sections; the input sections file is only written when missing
    objsecwr = WrSectionsTxt(grprobj, ver_list)
    if not os.path.exists(ifile):
        objsecwr.wr_txt_section_hdrgos(ifile)
    objsecwr.wr_txt_section_hdrgos(kws['ofile'])
    objsecpy = WrSectionsPy(grprobj, ver_list)
    if 'py' in kws:
        objsecpy.wr_py_sections(kws['py'], sections, doc=godag.version)

    # Write user GO IDs in sections
    objgowr = WrXlsxSortedGos("init", Sorter(grprobj), ver_list)
    objgowr.wr_txt_gos(kws['txt'], sortby=objsecpy.fncsortnt)
    self._prt_cnt_usrgos(usrgos, sys.stdout)
def test_nb():
    """Run the notebook code: virion dcnt under three relationship settings, then a plot."""
    godag = get_godag("go-basic.obo", optional_attrs={'relationship'})
    # Leaf terms: GO terms that have no children
    go_leafs = set()
    for goterm in godag.values():
        if not goterm.children:
            go_leafs.add(goterm.item_id)
    virion = 'GO:0019012'

    # No optional relationships
    gosubdag_r0 = GoSubDag(go_leafs, godag)
    nt_virion = gosubdag_r0.go2nt[virion]
    print(nt_virion)
    print('r0 THE VALUE OF dcnt IS: {dcnt}'.format(dcnt=nt_virion.dcnt))

    # relationships=True
    gosubdag_r1 = GoSubDag(go_leafs, godag, relationships=True)
    nt_virion = gosubdag_r1.go2nt[virion]
    print(nt_virion)
    print('r1 THE VALUE OF dcnt IS: {dcnt}'.format(dcnt=nt_virion.dcnt))

    # Only the 'part_of' relationship
    gosubdag_partof = GoSubDag(go_leafs, godag, relationships={'part_of'})
    nt_virion = gosubdag_partof.go2nt[virion]
    print(nt_virion)
    print('THE VALUE OF dcnt IS: {dcnt}'.format(dcnt=nt_virion.dcnt))
    virion_descendants = gosubdag_partof.rcntobj.go2descendants[virion]
    num_desc = len(virion_descendants)
    print('{N} descendants of virion were found'.format(N=num_desc))

    # Keep the plot small: use just two virion descendants
    virion_capsid_fiber = {'GO:0098033', 'GO:0098032'}
    prtfmt = '{NS} {GO} dcnt({dcnt}) D-{depth:02} {GO_name}'
    gosubdag_partof.prt_goids(virion_capsid_fiber, prtfmt)
    # Sub-DAG built from only those two GO IDs, then plotted
    pltdag = GoSubDag(virion_capsid_fiber, godag, relationships={'part_of'})
    GoSubDagPlot(pltdag).plt_dag('virion_capsid_fiber.png')
def test_gpad_read(run_desc="mouse", prt=sys.stdout):
    """Test reading GPAD files from the GOA source http://www.ebi.ac.uk/GOA.

    run_desc: passed to _dnld_gpad to select what is downloaded.
    prt: stream that receives the per-organism and overall reports.
    """
    objdnld = DnldGoa()
    species2gpad = _dnld_gpad(objdnld, run_desc)
    godag = get_godag()
    # Running total of Annotation Extension relations across all species
    relations = cx.Counter()
    pat = "{N:8,} of {M:8,} {P:5.2f}% associations have Annotation Extensions in {ORG}\n"
    for org, gpad_file in sorted(species2gpad.items()):
        basename = os.path.basename(gpad_file)
        prt.write("\n{GPAD}\n".format(GPAD="{ORG} {GPAD}".format(ORG=org, GPAD=basename)))
        objgpad = GpadReader(gpad_file, godag=godag)
        num_ext = 0
        for ntgpad in objgpad.associations:
            # Assertions are present in the GPAD reader class
            if ntgpad.Extension:
                relations += ntgpad.Extension.get_relations_cnt()
            # The Extensions field is new in GPAD
            if ntgpad.Extension is not None:
                num_ext += 1
        total = objgpad.qty
        prt.write(pat.format(N=num_ext, M=total, P=100.*num_ext/total, ORG=org))
        for rel, cnt in objgpad.get_relation_cnt().most_common():
            prt.write(" {C:6,} {R}\n".format(C=cnt, R=rel))
    prt.write("\n{N} Annotation Extensions Relations found among all species:\n".format(
        N=len(relations)))
    for rel, cnt in relations.most_common():
        prt.write("{C:10,} {R}\n".format(C=cnt, R=rel))
def test_i177():
    """Run the code from issue #177, which reported a recursion error."""
    go_id = 'GO:0050807'
    godag = get_godag('go.obo', optional_attrs='relationship')
    gosubdag_r0 = GoSubDag([go_id], godag, prt=None)
    # Looking up the ancestors is the step being exercised
    ancestors = gosubdag_r0.rcntobj.go2ancestors[go_id]
    print('{GO} ancestors: {P}'.format(GO=go_id, P=ancestors))
def _get_gosubdag():
    """Return a GoSubDag built from REPO's go-basic.obo loaded with 'relationship'."""
    obo_path = os.path.join(REPO, 'go-basic.obo')
    godag = get_godag(
        obo_path,
        prt=sys.stdout,
        loading_bar=False,
        optional_attrs=['relationship'],
    )
    # None is given as the GO ID source
    return GoSubDag(None, godag)
def _get_grprobj():
    """Return a Grouper for USER_GOS arranged by SECTIONS."""
    godag = get_godag(
        os.path.join(REPO, "go-basic.obo"),
        prt=None, loading_bar=False, optional_attrs=['relationship'])
    gosubdag = GoSubDag(USER_GOS, godag, relationships=True, tcntobj=None)
    # Header GO IDs: defaults from GrouperDflts plus the module's SECTIONS
    dflt_hdrgos = GrouperDflts(gosubdag).hdrgos_dflt
    hdrobj = HdrgosSections(gosubdag, dflt_hdrgos, SECTIONS)
    return Grouper("wrusrgos", USER_GOS, hdrobj, gosubdag)
def test_godag(prt=sys.stdout):
    """Test downloading GO DAG files into the current working directory.

    Any local copy of each obo file is removed first, so get_godag is
    called with a path that does not exist yet.

    prt: stream passed to get_godag.
    """
    cwd = os.getcwd()
    for fin_obo in ['go-basic.obo', 'goslim_generic.obo']:
        fin_full = os.path.join(cwd, fin_obo)
        # Remove with os.remove rather than shelling out to `rm -f`: no shell
        # is needed and it also works where `rm` is unavailable. A missing
        # file is not an error, matching `rm -f`.
        if os.path.isfile(fin_full) or os.path.islink(fin_full):
            os.remove(fin_full)
        godag = get_godag(fin_full, prt, loading_bar=None)  # Get GODag object
        assert godag, "GO-DAG({OBO}) NOT PROPERLY LOADED".format(OBO=fin_obo)
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    kws: optional 'methods' entry, forwarded as the GOEnrichmentStudy
    methods argument; ['not_bonferroni'] is used when it is absent.

    Returns a 2-tuple: (GOEnrichmentStudy object, list of study IDs).
    """
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = ROOT + "association"
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # Read each ID file inside a `with` so the handle is closed right away
    with open(ROOT + "population") as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    return GOEnrichmentStudy(popul_ids, assoc, godag, methods=methods), study_ids
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from the small test association and population.

    methods: forwarded unchanged as the ``methods`` argument of
    GOEnrichmentStudy.

    Returns the GOEnrichmentStudy object.
    """
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    fin_assc = "{REPO}/tests/data/small_association".format(REPO=REPO)
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # Close the population file deterministically instead of leaking the handle
    with open(popul_fin) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
def __init__(self, gosubdag=None, goslim_filename="goslim_generic.obo", hdrgos=None):
    """Store the GoSubDag, the GO slims and the default header GO IDs.

    gosubdag: passed through self.get_gosubdag.
    goslim_filename: obo file the GO slim DAG is loaded from.
    hdrgos: header GO IDs to use; when None, self._init_hdrgos() supplies them.
    """
    self.gosubdag = self.get_gosubdag(gosubdag)
    slim_dag = get_godag(goslim_filename, prt=None, loading_bar=False)
    self.ver_goslims = slim_dag.version
    self.goslims = self._init_goslims(slim_dag)
    # goid set
    if hdrgos is None:
        self.hdrgos_dflt = self._init_hdrgos()
    else:
        self.hdrgos_dflt = hdrgos
def test_find_enrichment():
    """Check study/population overlap handling at several min_overlap values.

    The population is the set of non-blank lines of REPO's data/study file.
    For each min_overlap a study is produced by _get_studygenes and handed
    to the CLI object's get_overlap and chk_genes.
    """
    fin_genes = os.path.join(REPO, "data/study")
    # Read inside a `with` so the file handle is closed instead of leaked
    with open(fin_genes) as ifstrm:
        pop = set(line.strip() for line in ifstrm if line.strip())
    stu_orig = pop
    num_pop = len(pop)
    objtest = ArgsDict()
    get_godag(objtest.namespace['obo'], loading_bar=None)
    for min_overlap in [.25, .50, .75]:
        objtest.namespace['min_overlap'] = min_overlap
        args = objtest.ntobj(**objtest.namespace)
        objcli = GoeaCliFnc(args)
        # Number of genes requested from _get_studygenes for this min_overlap
        num_stu_in_pop = int(round(min_overlap*num_pop)) + 10
        study = _get_studygenes(stu_orig, num_stu_in_pop)
        overlap = objcli.get_overlap(study, pop)
        print("{N:3} of {M} ({OL}%) in study in pop".format(
            N=num_stu_in_pop, M=num_pop, OL=100.0*overlap))
        objcli.chk_genes(study, pop)
    print("TEST PASSED")
def main():
    """Write the LaTeX table of depth-01 GO terms with child counts."""
    godag = get_godag('go-basic.obo', optional_attrs='relationship')
    relatives = CountRelatives(godag, relationships=True, dcnt=True)
    # Output goes to gos_depth01.tex
    GoDepth1LettersWr(relatives).wr_tex("gos_depth01.tex")
def get_gosubdag(gosubdag=None):
    """Return a GoSubDag initialized for use by a Grouper object.

    With no argument, a GoSubDag is created from get_godag() with
    rcntobj=True. A supplied GoSubDag is returned as is, after calling its
    init_auxobjs() when its rcntobj is None.
    """
    if gosubdag is None:
        return GoSubDag(None, get_godag(), rcntobj=True)
    if gosubdag.rcntobj is None:
        gosubdag.init_auxobjs()
    return gosubdag
def __init__(self, **kws):
    """Load the GODag and build the GO sets, GoSubDag and grouped object.

    kws: when given, used as the keyword dictionary as is; otherwise the
    keywords come from DocOptParse.get_docargs.
    """
    _objdoc = DocOptParse(__doc__, self.kws_dict, self.kws_set)
    if kws:
        self.kws = kws
    else:
        self.kws = _objdoc.get_docargs(prt=None)
    self.godag = get_godag(
        self.kws.get('obo'),
        prt=sys.stdout, loading_bar=False, optional_attrs=['relationship'])
    initializer = _Init(self.godag)
    self.go_ntsets = initializer.get_go_ntsets(self.kws.get('GO_FILE'))
    # Union of the GO IDs of every set that was read
    go_sets = [nt.go_set for nt in self.go_ntsets]
    self.go_all = set.union(*go_sets)
    # TermCounts or None
    tcntobj = initializer.get_tcntobj(self.go_all, **self.kws)
    self.gosubdag = GoSubDag(
        self.go_all, self.godag, True, tcntobj=tcntobj, prt=sys.stdout)
    self.objgrpd = initializer.get_grouped(
        self.go_ntsets, self.go_all, self.gosubdag, **self.kws)
def test_assc_stats(prt=sys.stdout):
    """Print association statistics for human, mouse and fly.

    prt: stream that receives the statistics.
    """
    org2gaf = (
        ('hsa', 'goa_human.gaf'),  # human
        ('mus', 'mgi.gaf'),        # mouse
        ('dme', 'fb.gaf'),         # fly
    )
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    describe_go2obj(godag, prt)
    statsobj = StatsDescribe('Assc', "{:6,}")
    statsobj.prt_hdr(prt, "Assc.")
    for org, gaf_name in org2gaf:
        describe_assc(org, os.path.join(REPO, gaf_name), godag, statsobj, prt)
def get_goeaobj(method, geneids_pop, taxid):
    """Load ontologies and associations; return a GOEnrichmentStudy.

    method: the single multiple-test method, passed as a one-item list.
    geneids_pop: population gene IDs.
    taxid: taxon id passed (as a one-item list) to get_assoc_ncbi_taxids.
    """
    obo_dag = get_godag()
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    study_kws = {
        'propagate_counts': False,
        'alpha': 0.05,
        'methods': [method],
    }
    # obo_dag is also found in goeaobj.obo_dag
    return GOEnrichmentStudy(geneids_pop, assoc_geneid2gos, obo_dag, **study_kws)
def test_find_enrichment():
    """Run scripts/find_enrichment.py once per annotation file format.

    For each annotation object a population file and a study file are
    written, the script is run through os.system and must exit with 0. The
    commands are also saved to test_find_enrichment_script.sh.
    """
    godag = get_godag("go-basic.obo", optional_attrs=['relationship'])
    # GO IDs related to humoral response
    e_goids = _get_enriched_e_goids('GO:0006959', godag)
    # pylint: disable=superfluous-parens
    print('- DOWNLOAD AND LOAD -----------------------------------------------')
    annoobjs = [
        _get_objanno('gene2go', taxid=10090),
        _get_objanno('gene2go', taxid=9606),
        _get_objanno('goa_human.gaf'),
        _get_objanno('goa_human.gpad', godag=godag),
        _get_objanno('data/association', anno_type='id2gos', godag=godag),
    ]
    pat = ('python3 scripts/find_enrichment.py {STU} {POP} {ASSC} '
           '--pval=0.05 --method=fdr_bh --pval_field=fdr_bh '
           '--taxid={TAXID} {INC} {EXC} --outfile=results_{NAME}.xlsx')
    cmds = []
    for obj in annoobjs:
        ns2assc = obj.get_ns2assc()
        # Population: every ID that appears in any namespace's association
        pop = set()
        for id2gos in ns2assc.values():
            for dbid, _ in id2gos.items():
                pop.add(dbid)
        # TODO: 20,263 pop IDs 6,847 stu IDs 2,884 int IDs
        enriched = set(nt.DB_ID for nt in obj.get_associations() if nt.GO_ID in e_goids)
        # Study: enriched IDs that are also in the population
        stu = enriched.intersection(pop)
        for label, ids in (('pop', pop), ('enr', enriched), ('int', stu)):
            print('{N:6,} {LABEL} IDs: {ID}'.format(N=len(ids), LABEL=label, ID=list(ids)[:4]))
        name = obj.get_name()
        fout_pop = os.path.join(REPO, 'ids_pop_{BASE}.txt'.format(BASE=name))
        fout_stu = os.path.join(REPO, 'ids_stu_{BASE}.txt'.format(BASE=obj.get_name()))
        _wr(fout_pop, pop)
        _wr(fout_stu, list(stu)[:100])
        cmd = pat.format(
            STU=fout_stu, POP=fout_pop, ASSC=obj.filename,
            TAXID=obj.get_taxid(), NAME=obj.get_name(), INC='', EXC='')
        cmds.append(cmd)
        print('\nRUNNING {NAME}: {CMD}\n'.format(CMD=cmd, NAME=obj.get_name()))
        assert os.system(cmd) == 0

    # Save every command that was run as a shell script
    fout_scr = 'test_find_enrichment_script.sh'
    with open(fout_scr, 'w') as prt:
        print("COMANDS RUN:")
        for cmd in cmds:
            print(cmd)
            prt.write('{CMD}\n'.format(CMD=cmd))
        print(' WROTE: {SCRIPT}'.format(SCRIPT=fout_scr))
    print("TEST PASSED")
def __init__(self, obo, gaf, prt):
    """Load the ontology and GAF annotations; build TermCounts and a GoSubDag.

    obo: ontology file name, joined onto REPO/../goatools/.
    gaf: annotation file handed to dnld_gaf.
    prt: stream stored on the object and passed to GoSubDag.
    """
    self.prt = prt
    self.cwd = os.getcwd()
    # Gene Ontologies
    obo_path = os.path.join(REPO, "../goatools/", obo)
    self.go2obj_all = get_godag(obo_path)
    # Annotations (gaf is used as given, not joined onto REPO)
    gaf_path = dnld_gaf(gaf)
    print("GAF: {GAF}\n".format(GAF=gaf_path))
    self.gene2gos = read_gaf(gaf_path)
    self.tcntobj = TermCounts(self.go2obj_all, self.gene2gos)
    # GoSubDag
    self.gosubdag_all = GoSubDag(
        None, self.go2obj_all, tcntobj=self.tcntobj, prt=prt)
    self.prtfmt = self.gosubdag_all.prt_attr['fmta']
def test_alt_id():
    """Build a Grouper on the full GoSubDag; check alternate GO IDs are DAG keys."""
    obo_dag = get_godag(os.path.join(REPO, "go-basic.obo"))

    # Create/Initialize GoSubDag from every key in the DAG
    goids = _get_data0()
    grprdflt = _get_grprdflt(GoSubDag(obo_dag.keys(), obo_dag))
    gosubdag = grprdflt.gosubdag

    # Create/Initialize Grouper
    hdrobj = HdrgosSections(gosubdag, grprdflt.hdrgos_dflt, sections=None, hdrgos=None)
    Grouper("test_altid_gosubdag", goids, hdrobj, grprdflt.gosubdag, go2nt=None)

    # Every alternate GO ID must also be found among the DAG keys
    alt_ids = _get_altids(obo_dag)
    dag_keys = set(obo_dag.keys())
    assert len(alt_ids.intersection(dag_keys)) == len(alt_ids)
def get_goeaobj(method, geneids_pop, taxid):
    """Load ontologies and associations; return a GOEnrichmentStudy.

    method: the single multiple-test method, passed as a one-item list.
    geneids_pop: population gene IDs.
    taxid: taxon id passed (as a one-item list) to get_assoc_ncbi_taxids.
    """
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    geneid2gos = get_assoc_ncbi_taxids([taxid], loading_bar=None)
    # godag is also found in goeaobj.godag
    return GOEnrichmentStudy(
        geneids_pop,
        geneid2gos,
        godag,
        propagate_counts=False,
        alpha=0.05,
        methods=[method],
    )
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest.

    Returns a 2-tuple: (results of run_study on the study IDs,
    the GOEnrichmentStudy object).
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = os.path.join(REPO, "data/association")
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # Read each ID file inside a `with` so the handle is closed right away
    with open(os.path.join(REPO, "data/population")) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    with open(os.path.join(REPO, "data/study")) as ifstrm:
        study_ids = [line.rstrip() for line in ifstrm]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
def __init__(self, args=None, prt=sys.stdout):
    """Parse arguments, load the GODag and annotations, and build the GoSubDag.

    args: passed to DocOptParse.get_docargs.
    prt: stream passed to get_godag and GoSubDag.
    """
    docparser = DocOptParse(__doc__, self.kws_dct_all, self.kws_set_all)
    self.kws = docparser.get_docargs(args, intvals={'max_indent', 'dash_len'})
    # Optional obo attributes that appear among the keyword names
    opt_attrs = OboOptionalAttrs.attributes.intersection(self.kws.keys())
    godag = get_godag(self.kws['dag'], prt, optional_attrs=opt_attrs)
    self.gene2gos = read_annotations(**self.kws)
    # Term counts exist only when annotations were read
    if self.gene2gos is not None:
        self.tcntobj = TermCounts(godag, self.gene2gos)
    else:
        self.tcntobj = None
    use_relationships = 'relationship' in opt_attrs
    self.gosubdag = GoSubDag(
        godag.keys(), godag,
        relationships=use_relationships,
        tcntobj=self.tcntobj,
        children=True,
        prt=prt)
    self.goids = self._init_goids()
    self._adj_item_marks()
    self._adj_include_only()
    self._adj_for_assc()
def test_semantic_similarity():
    """Print information content and Resnik/Lin similarities for six GO IDs.

    Runs once per species in the association list (human, yeast), using
    associations loaded with namespace='MF'.
    """
    goids = [
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ]
    # (species label, annotation file) pairs
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'sgd.gaf'),
    ]
    info_pat = '{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}'
    sim_pat = '{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    for species, assc_name in associations:  # Limit test numbers for speed
        print()
        # Annotations for the current species, then the GO term counts
        fin_assc = os.path.join(REPO, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, godag, namespace='MF', prt=None)
        termcounts = TermCounts(godag, assc_gene2gos)

        # Information content of each GO term
        for goid in sorted(goids):
            infocontent = get_info_content(goid, termcounts)
            term = godag[goid]
            print(info_pat.format(
                SPECIES=species, GO=goid, INFO=infocontent,
                NS=term.namespace, NAME=term.name))

        # Semantic similarities between each pair of GO terms
        print("GO #1 GO #2 Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(sorted(goids), 2):
            # Resnik: information content of the most informative common ancestor,
            # that is, the most specific common parent-term in the GO
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            # Lin similarity score
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            print(sim_pat.format(GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l))
            assert sim_r, "FATAL RESNIK SCORE"
            assert sim_l, "FATAL LIN SCORE"
def test_get_children(prt=sys.stdout):
    """Check that get_id2children matches GOTerm.get_all_children for every GO term.

    prt -- stream handed to prt_hms for the elapsed-time messages
    """
    # Load the GO DAG from the directory above this test file
    repo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
    fin_obo = "go-basic.obo"
    godag = get_godag(os.path.join(repo, fin_obo))
    # Keep only the entries whose key equals the term's own ID
    go2obj = {}
    for goid, goterm in godag.items():
        if goid == goterm.id:
            go2obj[goid] = goterm
    # Children per GO ID, collected one term at a time
    tic = timeit.default_timer()
    go2children_orig = {o.id: o.get_all_children() for o in go2obj.values()}
    tic = prt_hms(tic, "Get all goobj's children using GOTerm.get_all_children()", prt)
    # Children per GO ID, collected in one call
    go2children_fast = get_id2children(go2obj.values())
    prt_hms(tic, "Get all goobj's children using go_tasks::get_go3children", prt)
    # Both approaches must agree
    _chk_a2bset(go2children_orig, go2children_fast)
def test_i96():
    """Run a GOEA with method 'fdr_bh' in an attempt to re-produce issue#96."""
    # Failure being chased: ValueError("All values in table must be nonnegative.")
    print('CWD', os.getcwd())
    # Study and population gene sets
    study_ids = _get_geneids()
    population_ids = GENEID2NT.keys()
    print(os.getcwd())
    # Associations: download gene2go, then read it for taxid 9606
    fin_gene2go = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(fin_gene2go, loading_bar=None)
    gene2go = read_ncbi_gene2go(fin_gene2go, [9606])
    # GO DAG
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    goeaobj = GOEnrichmentStudy(
        population_ids,
        gene2go,
        godag,
        methods=['fdr_bh'])
    # The test only requires that the study runs; the results are not inspected
    results_goeas = goeaobj.run_study(study_ids)
def test_semantic_similarity():
    """Compute semantic, information-content, Resnik, and Lin values for two GO terms.

    Uses the tair.gaf annotations. Asserts that the information content and the
    Resnik and Lin scores are truthy.
    """
    fin_obo = os.path.join(REPO, "go-basic.obo")
    godag = get_godag(fin_obo, loading_bar=None)
    # Annotations read from tair.gaf
    associations = dnld_assc(os.path.join(REPO, 'tair.gaf'), godag)

    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process

    # Semantic similarity between the two terms
    sim = semantic_similarity(go_id3, go_id4, godag)
    msg_sim = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(msg_sim.format(GO1=go_id3, GO2=go_id4, VAL=sim))
    for goid in (go_id3, go_id4):
        print(godag[goid])

    # Information content of a single term needs the per-term counts first
    termcounts = TermCounts(godag, associations)
    goid_ic = "GO:0048364"
    infocontent = get_info_content(goid_ic, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=goid_ic, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    msg_resnik = 'Resnik similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_resnik.format(GO1=go_id3, GO2=go_id4, VAL=sim_r))
    assert sim_r, "FATAL RESNIK SCORE"

    # Lin similarity for the same pair
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    msg_lin = 'Lin similarity score ({GO1}, {GO2}) = {VAL}'
    print(msg_lin.format(GO1=go_id3, GO2=go_id4, VAL=sim_l))
    assert sim_l, "FATAL LIN SCORE"
def test_write_summary_cnts(log=sys.stdout):
    """Write level/depth summaries for the full GO DAG and for three taxids.

    log -- stream that receives all report text
    """
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    rptobj = RptLevDepth(godag, log)

    # Summary over every GO term in the DAG
    log.write("\nSummary for all Ontologies:\n")
    rptobj.write_summary_cnts_all()

    # Associations for taxids 9606, 7227, and 10090, stored in a nested defaultdict
    taxid2asscs = defaultdict(lambda: defaultdict(lambda: defaultdict(set)))
    get_assoc_ncbi_taxids([9606, 7227, 10090], taxid2asscs=taxid2asscs, loading_bar=None)
    assert taxid2asscs, 'taxid2asscs EMPTY'

    for taxid, assc in taxid2asscs.items():
        header = "\nSummary for Ontologies for taxid({T}):\n".format(T=taxid)
        go_ids = assc['GO2IDs'].keys()
        # First report is driven by the GO IDs
        log.write(header)
        rptobj.write_summary_cnts(go_ids)
        # Second report is driven by the GO term objects looked up from those IDs
        log.write(header)
        rptobj.write_summary_cnts_goobjs(list(map(godag.get, go_ids)))

    # LaTeX-format table for the full GO DAG
    rptobj.prttex_summary_cnts_all(prt=log)
def test_typedef():
    """Load go-basic.obo and print its 'negatively_regulates' typedef."""
    godag = get_godag("go-basic.obo", loading_bar=None)
    # A missing typedef raises KeyError here, which fails the test
    typedef = godag.typedefs['negatively_regulates']
    print(typedef)
def get_go2obj():
    """Load go-basic.obo from the current directory and return its non-obsolete terms.

    Returns a dict mapping each GO DAG key to its term object, skipping obsolete terms.
    """
    fin_obo = os.path.join(os.getcwd(), "go-basic.obo")
    godag = get_godag(fin_obo, loading_bar=None)
    go2obj = {}
    for goid, goterm in godag.items():
        if goterm.is_obsolete:
            continue
        go2obj[goid] = goterm
    return go2obj
def __init__(self, obo):
    """Load the GO DAG named by `obo` and wrap it in a GoSubDag.

    obo -- file name joined onto REPO/../goatools/ to locate the OBO file
    """
    fin_obo = os.path.join(REPO, "../goatools/", obo)
    self.cwd = os.getcwd()
    self.go2obj_all = get_godag(fin_obo)
    self.gosubdag_all = GoSubDag(None, self.go2obj_all)
    # Print format taken from the GoSubDag's 'fmta' attribute
    prt_attr = self.gosubdag_all.prt_attr
    self.prtfmt = prt_attr['fmta']