def _get_id2gos(file_id2gos, godag, name2go):
    """Return gene-to-GO annotations, loading them from file or creating them.

    If ``file_id2gos`` exists, it is read with ``IdToGosReader`` and the
    result of ``get_id2gos('CC')`` is returned. Otherwise an artificial
    annotation set is generated from a fixed table of gene counts per named
    GO term, written to ``file_id2gos``, and returned.

    Args:
        file_id2gos: Path of the annotation file to read or to create.
        godag: GO DAG handed to ``IdToGosReader`` when reading the file.
        name2go: Mapping from the term names 'A'..'N' to GO IDs.
    """
    if os.path.exists(file_id2gos):
        reader = IdToGosReader(file_id2gos, godag=godag)
        return reader.get_id2gos('CC')

    # Number of genes to annotate directly to each named GO term
    name_qty_pairs = (
        ('A', 10), ('B', 10), ('C', 10), ('D', 10),
        ('E', 10), ('F', 10), ('G', 10), ('H', 10),
        ('I', 30), ('L', 30), ('M', 20), ('N', 30),
    )
    goid2qty = {name2go[name]: qty for name, qty in name_qty_pairs}

    # Repeat each GO ID once per gene; enumerate() then hands every
    # repetition its own consecutive gene number.
    goid_per_gene = (goid for goid, qty in goid2qty.items() for _ in range(qty))
    goid2genes = cx.defaultdict(set)
    for gene_id, goid in enumerate(goid_per_gene):
        goid2genes[goid].add(gene_id)

    # Invert GO->genes into gene->GOs and save it for later runs
    annotations = get_b2aset(goid2genes)
    IdToGosReader.wr_id2gos(file_id2gos, annotations)
    return annotations
def _get_goeaobj(methods=None):
    """Create a GOEnrichmentStudyNS object from the small test data set.

    Loads ``go-basic.obo`` from ``REPO``, reads the associations in
    ``tests/data/small_association`` and the population IDs in
    ``tests/data/small_population`` (one ID per line), and builds the
    enrichment-study object from them.

    Args:
        methods: Passed through unchanged as the ``methods`` argument of
            ``GOEnrichmentStudyNS``.

    Returns:
        The constructed ``GOEnrichmentStudyNS`` instance.
    """
    # Read GODag
    obo_fin = os.path.join(REPO, "go-basic.obo")
    obo_dag = get_godag(obo_fin, loading_bar=None)
    # Read association
    fin_assc = "{REPO}/tests/data/small_association".format(REPO=REPO)
    objanno = IdToGosReader(fin_assc, godag=obo_dag)
    ns2assc = objanno.get_ns2assc()
    # Read population IDs; the context manager closes the file handle,
    # which the bare open() inside a comprehension never did.
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    with open(popul_fin) as ifstrm:
        popul_ids = [line.rstrip() for line in ifstrm]
    goeaobj = GOEnrichmentStudyNS(popul_ids, ns2assc, obo_dag, methods=methods)
    return goeaobj
def _get_id2gos(file_id2gos, godag, name2go, name2num):
    """Return gene-to-GO annotations, loading them from file or creating them.

    If ``file_id2gos`` exists, it is read with ``IdToGosReader`` and the
    result of ``get_id2gos('CC')`` is returned. Otherwise an artificial
    annotation set is generated from ``name2num``, written to
    ``file_id2gos``, and returned.

    Args:
        file_id2gos: Path of the annotation file to read or to create.
        godag: GO DAG handed to ``IdToGosReader`` when reading the file.
        name2go: Mapping from a term name to its GO ID.
        name2num: Mapping from a term name to the number of genes to
            annotate directly to that term.
    """
    if os.path.exists(file_id2gos):
        reader = IdToGosReader(file_id2gos, godag=godag)
        return reader.get_id2gos('CC')

    # Resolve every term name to its GO ID, keeping the requested gene count
    goid_qty_pairs = [(name2go[name], qty) for name, qty in name2num.items()]

    # Repeat each GO ID once per gene; enumerate() then hands every
    # repetition its own consecutive gene number.
    goid_per_gene = (goid for goid, qty in goid_qty_pairs for _ in range(qty))
    goid2genes = cx.defaultdict(set)
    for gene_id, goid in enumerate(goid_per_gene):
        goid2genes[goid].add(gene_id)

    # Invert GO->genes into gene->GOs and save it for later runs
    annotations = get_b2aset(goid2genes)
    IdToGosReader.wr_id2gos(file_id2gos, annotations)
    return annotations
def get_objanno(fin_anno, anno_type=None, **kws):
    """Read annotations in GAF, GPAD, Entrez gene2go, or text format.

    The annotation description is determined by ``get_anno_desc`` and the
    matching reader is constructed. Only the keyword arguments named in
    that reader's ``exp_kws`` are forwarded; all others are dropped.

    Args:
        fin_anno: Annotation file name.
        anno_type: Optional annotation type hint given to ``get_anno_desc``.
        **kws: Candidate keyword arguments for the reader.

    Returns:
        A Gene2GoReader, GafReader, GpadReader, or IdToGosReader instance.

    Raises:
        RuntimeError: If the description matches none of the known formats.
    """
    # kws get_objanno: taxids hdr_only prt allow_missing_symbol
    anno_type = get_anno_desc(fin_anno, anno_type)
    if anno_type is not None:
        # Annotation description paired with the reader class that handles it
        # (Gene2GoReader kws: taxid taxids)
        desc_reader_pairs = (
            ('gene2go', Gene2GoReader),
            ('gaf', GafReader),
            ('gpad', GpadReader),
            ('id2gos', IdToGosReader),
        )
        for desc, reader_cls in desc_reader_pairs:
            if anno_type == desc:
                accepted = reader_cls.exp_kws.intersection(kws.keys())
                kws_reader = {key: kws[key] for key in accepted}
                return reader_cls(fin_anno, **kws_reader)
    raise RuntimeError('UNEXPECTED ANNOTATION FILE FORMAT: {F} {D}'.format(
        F=fin_anno, D=anno_type))
def intialize_term_counts():
    """Compute a count for every entry of the GO DAG and dump them as JSON.

    Loads ``go-basic.obo`` from ``DATA_DIR``, reads the associations in
    ``UNIPROT_ASSOCIATIONS_FILE_PATH``, builds a ``TermCounts`` object, and
    writes a ``{GO ID: count}`` dictionary to ``JSON_INDEXED_FILE_PATH``.
    """
    obo_path = os.path.join(DATA_DIR, "go-basic.obo")
    go_dag = GODag(obo_path)
    reader = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=go_dag)
    term_counts = TermCounts(go_dag, reader.get_id2gos('all'))

    # Key each count by the term's own ID
    go_freq_dict = {
        term.id: term_counts.get_count(term.id) for term in go_dag.values()
    }

    # write frequency dict to JSON file
    with open(JSON_INDEXED_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
def _precompute_term_frequencies():
    """Precompute a count for every GO term and dump them as JSON.

    Loads the GO DAG from ``GO_DAG_FILE_PATH``, reads the associations in
    ``UNIPROT_ASSOCIATIONS_FILE_PATH``, builds a ``TermCounts`` object, and
    writes a ``{GO ID: count}`` dictionary to ``FREQUENCY_COUNTS_FILE_PATH``.
    Each alternate ID of a term is stored with the same count as the term's
    primary ID.
    """
    print("Start precomputations of term frequencies...")
    # Send the loader's progress output to the null device. The handle is
    # opened in a context manager so it is closed again instead of leaking.
    # NOTE(review): assumes GODag writes to `prt` only while it is being
    # constructed -- confirm against the installed goatools version.
    with open(os.devnull, 'w') as devnull:
        go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)
    associations = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=go_dag).get_id2gos('all')
    term_counts = TermCounts(go_dag, associations)

    go_freq_dict = {}
    for term in go_dag.values():
        count = term_counts.get_count(term.id)
        go_freq_dict[term.id] = count
        # Alternate IDs share the count of the term they refer to
        for alt_id in term.alt_ids:
            go_freq_dict[alt_id] = count

    # write frequency dict to JSON file
    with open(FREQUENCY_COUNTS_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
def test_tcntobj_relationships(do_plt=False):
    """Test that TermCounts can include relationships, like part_of.

    Builds two TermCounts objects from the yangRWC fig2a ontology and
    annotations, one without and one with the ``part_of`` relationship,
    and compares the resulting counts.

    Args:
        do_plt: If true, also write a PNG of each GO sub-DAG with its nodes
            labelled by term count.
    """
    # Input and output files
    fin_obo = os.path.join(REPO, "tests/data/yangRWC/fig2a.obo")
    fin_anno = os.path.join(REPO, "tests/data/yangRWC/fig2a.anno")
    fout_png_r0 = os.path.join(REPO, 'yang_fig2a_r0.png')
    fout_png_r1 = os.path.join(REPO, 'yang_fig2a_r1.png')
    relationships = {'part_of'}

    # Ontology (with relationship attributes) and its annotations
    go2obj = GODag(fin_obo, optional_attrs=['relationship'])
    reader = IdToGosReader(fin_anno, godag=go2obj)
    assoc = reader.get_id2gos('CC')

    # r0: counts without relationships
    # r1: counts with part_of; G (GO:0000007) is part_of F (GO:0000006)
    tcntobj_r0 = TermCounts(go2obj, assoc)
    tcntobj_r1 = TermCounts(go2obj, assoc, relationships)
    cnts_r0 = tcntobj_r0.gocnts
    cnts_r1 = tcntobj_r1.gocnts

    # The total number of genes must not depend on the relationships
    assert cnts_r0['GO:0005575'] == cnts_r1['GO:0005575']
    # Without relationships
    assert cnts_r0['GO:0000002'] == 40  # GO Term B
    assert cnts_r0['GO:0000006'] == 10  # GO Term F
    # With relationships: F also counts G's 30 genes, and so does B
    assert cnts_r1['GO:0000002'] == 70  # GO Term B
    assert cnts_r1['GO:0000006'] == 40  # GO Term F

    # Optionally plot both variants so that the counts can be compared
    if do_plt:
        for tcntobj, fout_png in ((tcntobj_r0, fout_png_r0), (tcntobj_r1, fout_png_r1)):
            gosubdag = tcntobj.gosubdag
            go2txt = {nt.GO: 'tcnt={}'.format(nt.tcnt) for nt in gosubdag.go2nt.values()}
            GoSubDagPlot(gosubdag, go2txt=go2txt).plt_dag(fout_png)
uniprot_notnull['Encoding']) ]) #run gene ontology enrichment analysis # Get http://geneontology.org/ontology/go-basic.obo from goatools.base import download_go_basic_obo obo_fname = download_go_basic_obo() uniprot_df['Gene ontology IDs'] = uniprot_df['Gene ontology IDs'].str.replace( ' ', '') uniprot_df.drop(['Encoding', 'Organism', 'Protein families', 'n'], axis=1).to_csv("GOA.txt", sep='\t', header=False, index=False) from goatools.anno.idtogos_reader import IdToGosReader objanno = IdToGosReader("GOA.txt") ns2assoc = objanno.get_id2gos() from goatools.obo_parser import GODag obodag = GODag("go-basic.obo") from goatools.go_enrichment import GOEnrichmentStudy goeaobj = GOEnrichmentStudy( uniprot_df.Entry, ns2assoc, # geneid/GO associations obodag, # Ontologies propagate_counts=False, alpha=0.001, # default significance cut-off methods=['fdr_bh']) # default multipletest correction method gos = []
# Redirect stderr to the snakemake log file before anything else runs, so
# that messages emitted from here on (including by the imports) are captured.
sys.stderr = open(snakemake.log[0], "w")

import pandas as pd
import matplotlib.pyplot as plt
from goatools.obo_parser import GODag
from goatools.anno.idtogos_reader import IdToGosReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.godag_plot import plot_results#, plot_goid2goobj, plot_gos

# Directed acyclic graph of GO terms / IDs
obodag = GODag(snakemake.input.obo)

# Mapping of the input gene ids to GO terms / IDs
objanno = IdToGosReader(snakemake.input.ens_gene_to_go, godag=obodag)

# Per-namespace mapping: namespace -> {gene id -> GO ids}
ns2assoc = objanno.get_ns2assc()

# Report how many genes carry an annotation in each namespace
for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated genes".format(NS=nspc, N=len(id2gos)))

# Gene-level differential expression table
all_genes = pd.read_table(snakemake.input.diffexp)

# Keep the genes whose sleuth q-value (BH FDR) is below the configured level
fdr_level_gene = float(snakemake.params.gene_fdr)
is_significant = all_genes['qval'] < fdr_level_gene
sig_genes = all_genes[is_significant]