def gen_anno_small():
    """Generate a smaller annotation file containing 10% of the original genes.

    Loads the ``fig2a.obo`` DAG, divides every value of ``NAME2NUM`` by ten,
    passes the scaled values to ``_get_id2gos`` together with the
    ``fig2a_small.anno`` path, and finally prints the scaled values.
    """
    godag = GODag(os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.obo'))
    name2go = {o.name: o.item_id for o in godag.values()}
    file_id2gos = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a_small.anno')
    # Floor division keeps the scaled values integral; on Python 3 a plain `/`
    # would turn them into floats (presumably these are gene counts -- verify
    # against NAME2NUM's definition).
    name2num = {name: num // 10 for name, num in NAME2NUM.items()}
    _get_id2gos(file_id2gos, godag, name2go, name2num)
    print(name2num)
class _Run(object):
    """Hold GODags and GoSubDags built from the entire go-basic.obo.

    The ontology is loaded twice: once plainly (``godag_r0``) and once with the
    optional ``relationship`` attribute (``godag_r1``).  A GoSubDag covering
    every GO ID found in ``godag_r0`` is then built on top of each.
    """

    obo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../go-basic.obo")

    def __init__(self):
        download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
        self.godag_r0 = GODag(self.obo)
        self.godag_r1 = GODag(self.obo, optional_attrs={'relationship'})
        self.goids = list({o.id for o in self.godag_r0.values()})
        # GoSubDag (plain)
        tic = timeit.default_timer()
        self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
        prt_hms(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
            N=len(self.gosubdag_r0.go2obj), S=len(self.gosubdag_r0.go_sources)))
        # GoSubDag with relationships.  Restart the timer so the r1 report
        # does not also include the time spent building the r0 sub-DAG.
        tic = timeit.default_timer()
        self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
        prt_hms(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
            N=len(self.gosubdag_r1.go2obj), S=len(self.gosubdag_r1.go_sources)))

    def prt_cnts(self, cnts):
        """Print a dict mapping each key of cnts to a summary of its values."""
        k2v = {k: self.str_stats(v) for k, v in cnts.items()}
        print(k2v)

    @staticmethod
    def str_stats(vals):
        """Return a "(min max) STD=n" summary string for vals."""
        ntd = stats.describe(vals)
        # Standard deviation, rounded to the nearest integer
        std = int(round(np.sqrt(ntd.variance)))
        return "({m} {M}) STD={STD:,}".format(m=ntd.minmax[0], M=ntd.minmax[1], STD=std)

    def get_gosubdag_r0(self, goids):
        """Return a GoSubDag for goids built on the DAG loaded without relationships."""
        tic = timeit.default_timer()
        # rcntobj of the full sub-DAG is not passed in (that argument was disabled)
        gosubdag = GoSubDag(goids, self.godag_r0, relationships=None, prt=None)
        prt_hms(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
            N=len(gosubdag.go2obj), S=len(gosubdag.go_sources)))
        return gosubdag

    def get_gosubdag_r1(self, goids):
        """Return a GoSubDag for goids built on the DAG loaded with relationships."""
        tic = timeit.default_timer()
        # rcntobj of the full sub-DAG is not passed in (that argument was disabled)
        gosubdag = GoSubDag(goids, self.godag_r1, relationships=True, prt=None)
        prt_hms(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
            N=len(gosubdag.go2obj), S=len(gosubdag.go_sources)))
        return gosubdag

    def get_goids_rand(self, qty):
        """Return qty randomly chosen GO IDs.

        Note that self.goids is shuffled in place as a side effect.
        """
        shuffle(self.goids)
        return self.goids[:qty]
def test_semantic_similarity():
    """Check the TermCounts.gocnts entries of terms I, L, M and N in the fig1a DAG."""
    godag = GODag(os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo'))
    name2go = {term.name: term.item_id for term in godag.values()}
    fin_anno = os.path.join(REPO, 'tests/data/yangRWC/fig1a.anno')
    assoc = _get_id2gos(fin_anno, godag, name2go)
    tcntobj = TermCounts(godag, assoc)
    # Every one of these terms is expected to have a count of 50
    for name in ('I', 'L', 'M', 'N'):
        assert tcntobj.gocnts[name2go[name]] == 50
def intialize_term_counts():
    """Write a JSON file mapping each GO term id to its TermCounts count.

    The DAG is read from ``go-basic.obo`` under ``DATA_DIR``, the associations
    from ``UNIPROT_ASSOCIATIONS_FILE_PATH``, and the resulting dict is dumped
    to ``JSON_INDEXED_FILE_PATH``.
    """
    dag = GODag(os.path.join(DATA_DIR, "go-basic.obo"))
    reader = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=dag)
    id2gos = reader.get_id2gos('all')
    counter = TermCounts(dag, id2gos)
    freq_by_goid = {term.id: counter.get_count(term.id) for term in dag.values()}

    # Persist the frequency table as JSON
    with open(JSON_INDEXED_FILE_PATH, 'w') as handle:
        json.dump(freq_by_goid, handle)
def test_semantic_i150():
    """Check that semantic_similarity of any GO ID with itself is exactly 1.0."""
    godag = GODag(os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo'))
    # No annotations or TermCounts are loaded here; only the DAG is needed.
    unique_terms = set(godag.values())
    for term in unique_terms:
        assert semantic_similarity(term.item_id, term.item_id, godag) == 1.0
def test_semantic_similarity():
    """Verify the TermCounts.gocnts entries of terms A through G in the fig2a DAG.

    Annotations are obtained from ``_get_id2gos`` using ``fig2a.anno`` and
    ``NAME2NUM``.
    """
    fin_obo = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.obo')
    fin_anno = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.anno')
    godag = GODag(fin_obo)
    name2go = {term.name: term.item_id for term in godag.values()}
    tcntobj = TermCounts(godag, _get_id2gos(fin_anno, godag, name2go, NAME2NUM))
    # N_v: accuracy of the Python equivalent to Java's getNumberOfAnnotations,
    # i.e. unique genes annotated to a GO term PLUS genes annotated to a descendant
    expected = (
        ('A', 100),
        ('B', 40),
        ('C', 50),
        ('D', 10),
        ('E', 10),
        ('F', 10),
        ('G', 30),
    )
    for name, count in expected:
        assert tcntobj.gocnts[name2go[name]] == count, tcntobj.gocnts
def _precompute_term_frequencies():
    """Compute per-term counts and write them to a JSON file.

    The DAG is read from ``GO_DAG_FILE_PATH`` and the associations from
    ``UNIPROT_ASSOCIATIONS_FILE_PATH``.  Every term id, and each of the term's
    ``alt_ids``, is mapped to the term's ``TermCounts.get_count`` value; the
    mapping is dumped to ``FREQUENCY_COUNTS_FILE_PATH``.
    """
    print("Start precomputations of term frequencies...")
    # Send the DAG loader's output to the null device; the with-block makes
    # sure the handle is closed again instead of being leaked.
    with open(os.devnull, 'w') as devnull:
        go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)
    associations = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=go_dag).get_id2gos('all')
    term_counts = TermCounts(go_dag, associations)

    go_freq_dict = {}
    for term in go_dag.values():
        count = term_counts.get_count(term.id)
        go_freq_dict[term.id] = count
        # Alternate ids share the count of their primary term
        for alt_id in term.alt_ids:
            go_freq_dict[alt_id] = count

    # write frequency dict to JSON file
    with open(FREQUENCY_COUNTS_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
p.add_option(
    "--term",
    help="Write the parents and children of this query term",
)

opts, args = p.parse_args()

if len(args) != 1:
    # print_help() returns None, so sys.exit(p.print_help()) ended with exit
    # status 0; a usage error should report failure instead.
    p.print_help()
    sys.exit(1)

(obo_file,) = args


def description(rec):
    """Return one tab-separated report line for a GO term record.

    Columns: item id, zero-padded level ("level-NN"), "name [namespace]"
    (suffixed with " obsolete" for obsolete terms), comma-joined alt ids.
    """
    level = "level-{:>02}".format(rec.level)
    text = "{} [{}]".format(rec.name, rec.namespace)
    if rec.is_obsolete:
        text += " obsolete"
    alt_ids = ",".join(rec.alt_ids)
    return "\t".join((rec.item_id, level, text, alt_ids))


g = GODag(obo_file, prt=None)
header = "\t".join(("#id", "level", "name", "alt_ids"))
print(header)
# set() collapses records that are reachable under more than one key
for rec in sorted(set(g.values()), key=lambda x: x.item_id):
    print(description(rec))

# run a test case
if opts.term:
    rec = g.query_term(opts.term, verbose=True)
    g.draw_lineage([rec], verbose=True)
class NxMgAssembler(object):
    """Class which assembles a networkx MultiGraph based on a list of genes.

    Parameters
    ----------
    genes : list of dict
        A list of gene references based on which the graph is assembled.
    resource_manager : object, optional
        Provides ``get_go_obo()`` and ``get_goa_gaf()``. When omitted (or
        falsy) a default ``ResourceManager()`` is created.

    Attributes
    ----------
    graph : networkx.MultiGraph
        The assembled graph containing links for interactions between genes,
        GO annotations for genes, and the GO ontology.
    """
    def __init__(self, genes, resource_manager=None):
        self.genes = genes
        self.graph = nx.MultiGraph()
        if not resource_manager:
            self.resource_manager = ResourceManager()
        else:
            self.resource_manager = resource_manager
        self.go_dag = GODag(self.resource_manager.get_go_obo())
        self.goa = self._load_goa_gaf()

    def _get_go_terms_for_gene(self, gene):
        """Return the sorted, de-duplicated GO IDs annotated to a gene.

        An empty list is returned when the gene lacks a 'UP' or 'HGNC_SYMBOL'
        entry, or when its HGNC symbol is not a node of the graph.
        """
        if ('UP' not in gene) or ('HGNC_SYMBOL' not in gene):
            return []
        if gene['HGNC_SYMBOL'] not in self.graph:
            return []
        # Filter to rows with the given gene's UniProt ID
        df = self.goa[self.goa['DB_ID'] == gene['UP']]
        return sorted(set(df['GO_ID']))

    def add_go_annotations(self):
        """Add edges between gene nodes and GO nodes based on GO annotations."""
        logger.info('Adding GO annotations for genes in graph.')
        for gene in self.genes:
            for go_id in self._get_go_terms_for_gene(gene):
                # Skip annotations to IDs missing from the DAG or to obsolete terms
                if go_id not in self.go_dag:
                    continue
                go_term = self.go_dag[go_id]
                if go_term.is_obsolete:
                    continue
                self.graph.add_node(go_term.id, name=go_term.name,
                                    GO=go_term.id, domain=go_term.namespace)
                self.graph.add_edge(gene['HGNC_SYMBOL'], go_term.id,
                                    label='GO:annotation')

    def add_go_ontology(self):
        """Add edges between GO nodes based on the GO ontology."""
        logger.info('Adding GO ontology edges to graph.')
        # NOTE(review): if the DAG maps several keys to the same term object,
        # that term is visited more than once and the MultiGraph keeps the
        # repeated edges -- confirm this is intended.
        for go_term in list(self.go_dag.values()):
            if go_term.is_obsolete:
                continue
            self.graph.add_node(go_term.id, name=go_term.name,
                                GO=go_term.id, domain=go_term.namespace)
            for parent_term in go_term.parents:
                if parent_term.is_obsolete:
                    continue
                # Register the parent with its own attributes (previously the
                # child node was added a second time here).
                self.graph.add_node(parent_term.id, name=parent_term.name,
                                    GO=parent_term.id,
                                    domain=parent_term.namespace)
                self.graph.add_edge(go_term.id, parent_term.id,
                                    label='GO:is_a')

    def node2edges(self, node_key):
        """Return the edges corresponding to a node."""
        return self.graph.edges(node_key, keys=True)

    def save_graph(self, fname):
        """Save the file into a GraphML file.

        Parameters
        ----------
        fname : str
            The name of the file to save the graph into.
        """
        nx.write_graphml(self.graph, fname)

    def _load_goa_gaf(self):
        """Load the gene/GO annotations as a pandas data frame.

        Rows are sorted by DB_ID and GO_ID; rows whose qualifier starts with
        "NOT" and rows whose evidence code is not in the accepted set are
        dropped.
        """
        goa_ec = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'HTP', 'HDA',
                  'HMP', 'HGI', 'HEP', 'IBA', 'IBD'}
        # NOTE(review): skiprows=23 presumably matches the length of the GAF
        # header in the expected file -- verify if the file format changes.
        goa = pd.read_csv(self.resource_manager.get_goa_gaf(), sep='\t',
                          skiprows=23, dtype=str, header=None,
                          names=['DB', 'DB_ID', 'DB_Symbol', 'Qualifier',
                                 'GO_ID', 'DB_Reference', 'Evidence_Code',
                                 'With_From', 'Aspect', 'DB_Object_Name',
                                 'DB_Object_Synonym', 'DB_Object_Type',
                                 'Taxon', 'Date', 'Assigned',
                                 'Annotation_Extension',
                                 'Gene_Product_Form_ID'])
        goa = goa.sort_values(by=['DB_ID', 'GO_ID'])
        # Filter out all "NOT" negative evidences.  Assign the filled column
        # back: an in-place fillna on the selected column is chained
        # assignment, which does not update the frame under Copy-on-Write.
        goa['Qualifier'] = goa['Qualifier'].fillna('')
        goa = goa[~goa['Qualifier'].str.startswith('NOT')]
        # Filter to rows with evidence code corresponding to experimental
        # evidence
        goa = goa[goa['Evidence_Code'].isin(goa_ec)]
        return goa
class _Run:
    """Hold GODags and GoSubDags built from the entire go-basic.obo.

    The ontology is loaded twice: once plainly (``godag_r0``) and once with the
    optional ``relationship`` attribute (``godag_r1``).  A GoSubDag covering
    every GO ID found in ``godag_r0`` is then built on top of each.
    """

    obo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../go-basic.obo")

    def __init__(self):
        download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
        self.godag_r0 = GODag(self.obo)
        self.godag_r1 = GODag(self.obo, optional_attrs={'relationship'})
        self.goids = list({o.id for o in self.godag_r0.values()})
        # GoSubDag (plain)
        tic = timeit.default_timer()
        self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
        prt_hms(
            tic,
            "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
                N=len(self.gosubdag_r0.go2obj),
                S=len(self.gosubdag_r0.go_sources)))
        # GoSubDag with relationships.  Restart the timer so the r1 report
        # does not also include the time spent building the r0 sub-DAG.
        tic = timeit.default_timer()
        self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
        prt_hms(
            tic,
            "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
                N=len(self.gosubdag_r1.go2obj),
                S=len(self.gosubdag_r1.go_sources)))

    def prt_cnts(self, cnts):
        """Print a dict mapping each key of cnts to a summary of its values."""
        k2v = {k: self.str_stats(v) for k, v in cnts.items()}
        print(k2v)

    @staticmethod
    def str_stats(vals):
        """Return a "(min max) STD=n" summary string for vals."""
        ntd = stats.describe(vals)
        # Standard deviation, rounded to the nearest integer
        std = int(round(np.sqrt(ntd.variance)))
        return "({m} {M}) STD={STD:,}".format(m=ntd.minmax[0], M=ntd.minmax[1], STD=std)

    def get_gosubdag_r0(self, goids):
        """Return a GoSubDag for goids built on the DAG loaded without relationships."""
        tic = timeit.default_timer()
        # rcntobj of the full sub-DAG is not passed in (that argument was disabled)
        gosubdag = GoSubDag(
            goids,
            self.godag_r0,
            relationships=None,
            prt=None)
        prt_hms(
            tic,
            "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
                N=len(gosubdag.go2obj), S=len(gosubdag.go_sources)))
        return gosubdag

    def get_gosubdag_r1(self, goids):
        """Return a GoSubDag for goids built on the DAG loaded with relationships."""
        tic = timeit.default_timer()
        # rcntobj of the full sub-DAG is not passed in (that argument was disabled)
        gosubdag = GoSubDag(
            goids,
            self.godag_r1,
            relationships=True,
            prt=None)
        prt_hms(
            tic,
            "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
                N=len(gosubdag.go2obj), S=len(gosubdag.go_sources)))
        return gosubdag

    def get_goids_rand(self, qty):
        """Return qty randomly chosen GO IDs.

        Note that self.goids is shuffled in place as a side effect.
        """
        shuffle(self.goids)
        return self.goids[:qty]
def test_parents_ancestors():
    """Exercise parent/child lookups and GoSubDag ancestor/descendant sets.

    Uses the small ``viral_gene_silence.obo`` DAG and compares results obtained
    through "is_a" only with results that also follow the "regulates"
    relationships.
    """
    fin_obo = os.path.join(REPO, 'tests/data/i126/viral_gene_silence.obo')
    regulates = {'regulates', 'negatively_regulates', 'positively_regulates'}

    # DAG loaded WITHOUT the optional 'relationship' attribute: the lookups
    # are only called here, their results are not checked.
    # TODO: Add more tests for only is_a
    dag_plain = GODag(fin_obo)
    get_go2parents(dag_plain, set())
    get_go2children(dag_plain, set())

    # DAG loaded WITH the optional 'relationship' attribute
    godag = GODag(fin_obo, optional_attrs={'relationship'})
    goids = {term.item_id for term in godag.values()}

    # Parents/children through "is_a" only (no optional relationships traced)
    parents_isa = get_go2parents(godag, set())
    children_isa = get_go2children(godag, set())

    # Parents/children through "is_a" and all the "regulates" relationships
    parents_reg = get_go2parents(godag, regulates)
    children_reg = get_go2children(godag, regulates)

    goid = 'GO:0019222'  # regulation of metabolic process
    assert parents_isa[goid] == {'GO:0050789'}
    assert parents_reg[goid] == {'GO:0050789', 'GO:0008152'}

    children_exp = {'GO:0009892', 'GO:0060255'}
    assert children_isa[goid] == children_exp
    assert children_reg[goid] == children_exp

    assert children_isa['GO:0008152'] == {'GO:0071704'}
    assert children_reg['GO:0008152'] == {
        'GO:0071704', 'GO:0019222', 'GO:0009892'
    }

    # GoSubDag without user-selected relationships
    gosubdag_r0 = GoSubDag(goids, godag)
    assert gosubdag_r0.rcntobj.go2ancestors[goid] == \
        {'GO:0050789', 'GO:0065007', 'GO:0008150'}

    # GoSubDag following the "regulates" relationships
    gosubdag_r1 = GoSubDag(goids, godag, relationships=regulates)
    assert gosubdag_r1.rcntobj.go2ancestors[goid] == \
        {'GO:0050789', 'GO:0008152', 'GO:0065007', 'GO:0008150'}, \
        gosubdag_r1.rcntobj.go2ancestors[goid]

    descendants_r0 = gosubdag_r0.rcntobj.go2descendants
    assert descendants_r0['GO:0008152'] == {'GO:0071704', 'GO:0010467', 'GO:0043170'}
    assert descendants_r0['GO:0043170'] == {'GO:0010467'}

    assert gosubdag_r1.rcntobj.go2descendants['GO:0043170'] == {
        'GO:0010467', 'GO:0010468', 'GO:0010605', 'GO:0010608', 'GO:0010629',
        'GO:0016441', 'GO:0016458', 'GO:0040029', 'GO:0060147', 'GO:0060148',
        'GO:0060150', 'GO:0060255', 'GO:0060968'
    }

    # GoSubDag following only "negatively_regulates"
    gosubdag_r1n = GoSubDag(goids, godag, relationships={'negatively_regulates'})
    assert gosubdag_r1n.rcntobj.go2descendants['GO:0010467'] == \
        {'GO:0010629', 'GO:0016441', 'GO:0016458'}