def extend_gos_by_parents(GODag, gos, logger=None):
    """Extend a set of GO terms with all of their ancestor terms."""
    gos_out = set()
    for go in gos:
        gos_out.add(go)
        try:
            gos_out.update(GODag.query_term(go).get_all_parents())
        except AttributeError:  # query_term returns None for unknown terms
            if logger:
                logger.debug('Could not get parents for term {}.'.format(go))
    return gos_out
def count_children(GODag, go):
    """Return the number of descendant terms below a GO term."""
    return len(GODag.query_term(go).get_all_children())
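# A minimal usage sketch for the two helpers above, assuming a local copy of
# go-basic.obo; the GO IDs are illustrative examples, not from the original code.
from goatools.obo_parser import GODag

godag = GODag('go-basic.obo')
extended = extend_gos_by_parents(godag, {'GO:0006915', 'GO:0008219'})
print(len(extended), 'terms after adding ancestors')
print(count_children(godag, 'GO:0008219'), 'descendants of GO:0008219')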
go_accs = set(interpro_go.loc[interpro_go['Protein Accession'] == protein]['GO Accession'])
for go_acc in go_accs:
    if not pd.isnull(go_acc):
        all_go_accs_in_a_protein |= set(go_acc.split('|'))
if len(all_go_accs_in_a_protein) > 0:
    for go_term in all_go_accs_in_a_protein:
        if go_term not in go:
            continue
        if USE_SLIM:
            direct_anc, all_anc = mapslim(go_term, go, goslim)
            all_goslim_anc_accs_in_a_protein |= all_anc
            all_goslim_covered_anc |= (all_anc - direct_anc)
        query_term = go.query_term(go_term)
        output_table = output_table.append(
            pd.DataFrame({'Protein Accession': [protein],
                          'GO Category': [query_term.namespace],
                          'GO Accession': [go_term],
                          'GO Description': [query_term.name],
                          'GO Level': [query_term.level]}),
            ignore_index=True)
    if USE_SLIM:
        if ONLY_DIRECT:
            all_goslim_direct_anc_accs_in_a_protein = (
                all_goslim_anc_accs_in_a_protein - all_goslim_covered_anc)
            for goslim_term in all_goslim_direct_anc_accs_in_a_protein:
                query_goslim_term = goslim.query_term(goslim_term)
                output_slim_table = output_slim_table.append(
                    pd.DataFrame({'Protein Accession': [protein],
                                  'GO Category': [query_goslim_term.namespace],
                                  'GOSlim Accession': [goslim_term],
                                  'GOSlim Description': [query_goslim_term.name],
                                  'GOSlim Level': [query_goslim_term.level]}),
                    ignore_index=True)
        else:
            for goslim_term in all_goslim_anc_accs_in_a_protein:
                query_goslim_term = goslim.query_term(goslim_term)
                output_slim_table = output_slim_table.append(
                    pd.DataFrame({'Protein Accession': [protein],
                                  'GO Category': [query_goslim_term.namespace],
                                  'GOSlim Accession': [goslim_term],
                                  'GOSlim Description': [query_goslim_term.name],
                                  'GOSlim Level': [query_goslim_term.level]}),
                    ignore_index=True)

# write the output
if opts.is_sort:
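# Note: pandas removed DataFrame.append in pandas 2.0 (deprecated since 1.4).
# A sketch of the same row-accumulation pattern with pd.concat, reusing the
# protein/query_term/go_term variables from the snippet above:
row = pd.DataFrame({'Protein Accession': [protein],
                    'GO Category': [query_term.namespace],
                    'GO Accession': [go_term],
                    'GO Description': [query_term.name],
                    'GO Level': [query_term.level]})
output_table = pd.concat([output_table, row], ignore_index=True)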
p.add_option("--disable-draw-parents",
             action="store_false",
             dest='draw_parents',
             help="Do not draw parents of the query term")
p.add_option("--disable-draw-children",
             action="store_false",
             dest='draw_children',
             help="Do not draw children of the query term")
p.set_defaults(draw_parents=True)
p.set_defaults(draw_children=True)

opts, args = p.parse_args()

if not len(args):
    obo_file = "go-basic.obo"
else:
    obo_file = args[0]
assert os.path.exists(obo_file), "file %s not found!" % obo_file

g = GODag(obo_file)
if opts.desc:
    g.write_dag()

# run a test case
if opts.term is not None:
    rec = g.query_term(opts.term, verbose=True)
    g.draw_lineage([rec], engine=opts.engine, gml=opts.gml,
                   draw_parents=opts.draw_parents,
                   draw_children=opts.draw_children)
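# Example invocation for the CLI above, assuming it is saved as plot_go_term.py
# (the script name and GO ID are illustrative assumptions) with go-basic.obo in
# the working directory:
#
#   python plot_go_term.py --term GO:0003677 go-basic.obo
#
# With no --term the script only loads the DAG; --disable-draw-parents and
# --disable-draw-children restrict the rendered lineage.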
p.add_option(
    "--term",
    help="Write the parents and children of this query term",
)
opts, args = p.parse_args()

if len(args) != 1:
    sys.exit(p.print_help())

(obo_file, ) = args

def description(rec):
    level = "level-{:>02}".format(rec.level)
    description = "{} [{}]".format(rec.name, rec.namespace)
    if rec.is_obsolete:
        description += " obsolete"
    alt_ids = ",".join(rec.alt_ids)
    return "\t".join((rec.item_id, level, description, alt_ids))

g = GODag(obo_file, prt=None)
header = "\t".join(("#id", "level", "name", "alt_ids"))
print(header)
for rec in sorted(set(g.values()), key=lambda x: x.item_id):
    print(description(rec))

# run a test case
if opts.term:
    rec = g.query_term(opts.term, verbose=True)
    g.draw_lineage([rec], verbose=True)
from goatools.obo_parser import GODag

go_obo_file_path = ''  # path to a local go-basic.obo
go_term_name = "GO:0015979"

godag = GODag(obo_file=go_obo_file_path)
mygo = godag.query_term(go_term_name)

print(go_term_name + " Parents")
print(mygo.get_all_parents())
print(go_term_name + " Children")
print(mygo.get_all_children())
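# A small extension sketch: get_all_parents()/get_all_children() return bare GO
# IDs, so look each one up in the DAG (a GODag acts as a dict of GOTerm objects)
# to print human-readable names. Assumes godag/mygo from the snippet above.
for parent_id in sorted(mygo.get_all_parents()):
    print(parent_id, godag[parent_id].name)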
def load(args, dba, logfile, logger):
    gofile = DOWNLOAD_DIR + FILENAME
    if not args['--quiet']:
        print "\nParsing GO OBO file: {}".format(gofile)
    logger.info("Parsing GO OBO file: {}".format(gofile))
    godag = GODag(gofile)
    tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nProcessing {} TCRD targets".format(tct)
    pbar_widgets = ['Progress: ', Percentage(), ' ',
                    Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    ti_ct = 0
    notfnd = {}
    dba_err_ct = 0
    exp_codes = ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP']
    for t in dba.get_targets(idg=False, include_annotations=True):
        ct += 1
        p = t['components']['protein'][0]
        if 'goas' in p:
            lfe_goa_strs = []
            for d in p['goas']:
                if d['go_term'].startswith('C'):
                    continue  # only want MF/BP terms
                ev = d['evidence']
                if ev not in exp_codes:
                    continue  # only want experimental evidence GOAs
                gt = godag.query_term(d['go_id'])
                if not gt:
                    k = "%s:%s" % (d['go_id'], d['go_term'])
                    notfnd[k] = True
                    logger.error("GO term %s not found in GODag" % k)
                    continue
                if len(gt.children) == 0:  # if it's a leaf node
                    lfe_goa_strs.append("%s|%s|%s" % (d['go_id'], d['go_term'], ev))
            if lfe_goa_strs:
                rv = dba.ins_tdl_info({'protein_id': p['id'],
                                       'itype': 'Experimental MF/BP Leaf Term GOA',
                                       'string_value': "; ".join(lfe_goa_strs)})
                if not rv:
                    dba_err_ct += 1
                    continue
                ti_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "  Inserted {} new tdl_info rows".format(ti_ct)
    if notfnd:
        print "WARNING: {} GO terms not found in GODag. See logfile {} for details.".format(len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
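# The leaf-node test above relies on GOTerm.children being empty. A hedged
# standalone sketch of the same check (the function name is mine, not TCRD's):
def is_leaf_term(godag, go_id):
    """Return True if go_id resolves in the DAG and has no child terms."""
    term = godag.query_term(go_id)
    return term is not None and len(term.children) == 0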
def goe(genelist,
        go_file,
        goa_file,
        bg=None,
        nmin=5,
        conversion=None,
        evidence_set={
            'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'HTP', 'HDA', 'HMP', 'HGI',
            'IBA', 'IBD', 'IKR', 'IRD', 'ISS', 'ISO', 'ISA', 'ISM'
        }):
    """Finds GO enrichment with goatools (0.7.11 tested).

    **WARNING**: This method is inexact for multi-maps in gene name conversion.
    However, it has a negligible effect on top GO component removal in
    single-cell co-expression.

    Parameters
    ------------
    genelist: list of str
        Genes to search for enrichment.
    go_file: str
        File path for GO DAG (downloadable at
        http://geneontology.org/docs/download-ontology/).
    goa_file: str
        File path for GO associations. See parameter **conversion**.
    bg: list of str
        Background genes.
    nmin: int
        Minimum number of principal genes required in GO.
    conversion: tuple
        Conversion of `gene ID system <https://docs.mygene.info/en/latest/doc/data.html>`_
        from gene list to the GO annotation.

        * name_from: Gene naming system of genelist. For gene names, use 'symbol,alias'.
        * name_to: Gene naming system of goa_file. Examples:

          * Human: use 'uniprot.Swiss-Prot' (for GO annotations downloaded from
            http://geneontology.org/gene-associations/goa_human.gaf.gz).
          * Mouse: use 'MGI' (for GO annotations downloaded from
            http://current.geneontology.org/annotations/mgi.gaf.gz).

        * species: Species for gene name conversion. Examples: 'human', 'mouse'.
    evidence_set: set of str
        `GO evidences <http://geneontology.org/docs/guide-go-evidence-codes/>`_
        to include. Defaults to non-expression based results to avoid circular
        reasoning bias.

    Returns
    ----------
    goe: pandas.DataFrame
        GO enrichment.
    gotop: str
        Top enriched GO ID.
    genes: list of str or None
        Intersection list of genes in gotop and also in bg. None if bg is None.
    """
    from tempfile import NamedTemporaryFile
    from os import linesep
    from goatools.go_enrichment import GOEnrichmentStudy
    from goatools.obo_parser import GODag
    from goatools.associations import read_gaf
    from collections import defaultdict
    import itertools
    from biothings_client import get_client
    import pandas as pd
    import logging

    assert type(genelist) is list and len(genelist) > 0
    if nmin < 1:
        nmin = 1
    bg0 = bg
    # Convert gene names
    if conversion is not None:
        assert len(conversion) == 3
        name_from, name_to, species = conversion
        mg = get_client('gene')
        ans = set(genelist)
        if bg is not None:
            t1 = set(bg)
            assert len(ans - t1) == 0
            ans |= t1
        ans = list(ans)
        ans = mg.querymany(ans, scopes=name_from, fields=name_to, species=species)
        t1 = set(['query', '_score', name_to.split('.')[0]])
        ans = list(filter(lambda x: len(t1 - set(x)) == 0, ans))
        ans = sorted(ans, key=lambda x: x['_score'])
        convert = {x['query']: x for x in ans}
        for xi in name_to.split('.'):
            convert = filter(lambda x: xi in x[1], convert.items())
            convert = {x[0]: x[1][xi] for x in convert}
        convert = {x[0]: x[1] if type(x[1]) is str else x[1][0]
                   for x in convert.items()}
        genelist2 = list(set([convert[x] for x in filter(lambda x: x in convert, genelist)]))
        if bg is not None:
            bg = list(set([convert[x] for x in filter(lambda x: x in convert, bg)]))
        t1 = set(genelist)
        converti = list(filter(lambda x: x[0] in t1, convert.items()))
        t1 = defaultdict(list)
        for xi in converti:
            t1[xi[1]].append(xi[0])
        converti = dict(t1)
        t1 = defaultdict(list)
        for xi in convert.items():
            t1[xi[1]].append(xi[0])
        convertia = dict(t1)
    else:
        genelist2 = genelist
    # Load GO DAG and association files
    logging.debug('Reading GO DAG file ' + go_file)
    godag = GODag(go_file)
    logging.debug('Reading GO association file ' + goa_file)
    goa = read_gaf(goa_file,
                   evidence_set=evidence_set)
    if bg is None:
        bg = list(goa.keys())
    # Compute enrichment
    goe = GOEnrichmentStudy(bg, goa, godag)
    ans = goe.run_study(genelist2)
    # Format output
    with NamedTemporaryFile() as f:
        goe.wr_tsv(f.name, ans)
        ans = f.read()
    ans = ans.decode()
    ans = [x.split('\t') for x in ans.split(linesep)]
    if len(ans[-1]) < 2:
        ans = ans[:-1]
    if len(ans) == 0 or len(ans[0]) == 0:
        raise ValueError('No enrichment found. Check your input ID type.')
    ans[0][0] = ans[0][0].strip('# ')
    ans = pd.DataFrame(ans[1:], columns=ans[0])
    ans.drop(['NS', 'enrichment', 'study_count', 'p_sidak', 'p_holm'],
             axis=1, inplace=True)
    for xj in ['p_uncorrected', 'p_bonferroni']:
        ans[xj] = pd.to_numeric(ans[xj], errors='raise')
    ans['depth'] = pd.to_numeric(ans['depth'], errors='raise', downcast='unsigned')
    # Odds ratio column and sort column
    ans['odds_ratio'] = toratio(ans['ratio_in_study']) / toratio(ans['ratio_in_pop'])
    ans = ans[['name', 'depth', 'p_uncorrected', 'p_bonferroni', 'odds_ratio',
               'ratio_in_study', 'ratio_in_pop', 'GO', 'study_items']]
    ans['study_items'] = ans['study_items'].apply(lambda x: x.replace(' ', ''))
    # Convert back study_items
    if conversion is not None:
        ans['study_items'] = ans['study_items'].apply(
            lambda x: ','.join(list(itertools.chain.from_iterable(
                [converti[y] for y in x.split(',')]))) if len(x) > 0 else x)
    ans.sort_values('p_uncorrected', inplace=True)
    # Get top enriched GO by P-value
    gotop = ans[(ans['odds_ratio'] > 1)
                & ans['ratio_in_study'].apply(lambda x: int(x.split('/')[0]) >= nmin)]
    if len(gotop) == 0:
        raise ValueError('No GO enrichment found for given criteria.')
    gotop = str(gotop.iloc[0]['GO'])
    if bg0 is not None:
        # Children GOs
        gos = set([gotop] + list(godag.query_term(gotop).get_all_children()))
        # Look for genes
        genes = list(filter(lambda x: len(list(filter(lambda y: y in gos, goa[x]))) > 0, goa))
        if conversion is not None:
            genes = [convertia[x] for x in filter(lambda x: x in convertia, genes)]
            genes = list(set(list(itertools.chain.from_iterable(genes))))
        genes = set(genes)
        genes = list(filter(lambda x: x in genes, bg0))
    else:
        genes = None
    return (ans, gotop, genes)
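# A hedged usage sketch for goe(); the file paths and gene symbols below are
# illustrative placeholders, and conversion assumes a human GAF keyed by UniProt
# Swiss-Prot IDs as described in the docstring.
enrichment, top_go, top_genes = goe(
    ['TP53', 'MDM2', 'CDKN1A'],
    'go-basic.obo',
    'goa_human.gaf',
    conversion=('symbol,alias', 'uniprot.Swiss-Prot', 'human'))
print(top_go)
print(enrichment.head())  # top_genes is None here because bg was not given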
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                             datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
                  'logger_name': __name__}
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'GO Experimental Leaf Term Flags',
        'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'These values are calculated by the loader app and indicate that a protein is annotated with a GO leaf term in either the Molecular Function or Biological Process branch with an experimental evidence code.'})
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile {} for details.".format(logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info',
                             'where_clause': "itype = 'Experimental MF/BP Leaf Term GOA'"})
    if not rv:
        print "WARNING: Error inserting provenance. See logfile {} for details.".format(logfile)
        sys.exit(1)

    gofile = DOWNLOAD_DIR + FILENAME
    logger.info("Parsing GO OBO file: %s" % gofile)
    godag = GODag(gofile)
    pbar_widgets = ['Progress: ', Percentage(), ' ',
                    Bar(marker='#', left='[', right=']'), ' ', ETA()]
    tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nProcessing {} TCRD targets".format(tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    ti_ct = 0
    notfnd = {}
    dba_err_ct = 0
    exp_codes = ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP']
    for t in dba.get_targets(idg=False, include_annotations=True):
        ct += 1
        p = t['components']['protein'][0]
        if 'goas' in p:
            lfe_goa_strs = []
            for d in p['goas']:
                if d['go_term'].startswith('C'):
                    continue  # only want MF/BP terms
                ev = d['evidence']
                if ev not in exp_codes:
                    continue  # only want experimental evidence GOAs
                gt = godag.query_term(d['go_id'])
                if not gt:
                    k = "%s:%s" % (d['go_id'], d['go_term'])
                    notfnd[k] = True
                    logger.error("GO term %s not found in GODag" % k)
                    continue
                if len(gt.children) == 0:  # if it's a leaf node
                    lfe_goa_strs.append("%s|%s|%s" % (d['go_id'], d['go_term'], ev))
            if lfe_goa_strs:
                rv = dba.ins_tdl_info({'protein_id': p['id'],
                                       'itype': 'Experimental MF/BP Leaf Term GOA',
                                       'string_value': "; ".join(lfe_goa_strs)})
                if not rv:
                    dba_err_ct += 1
                    continue
                ti_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "  Inserted {} new tdl_info rows".format(ti_ct)
    if notfnd:
        print "WARNING: {} GO terms not found in GODag. See logfile {} for details.".format(len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def get_go_protease():
    from goatools.obo_parser import GODag
    g = GODag('/home/gstupp/go/go-basic.obo')
    go_term = g.query_term('GO:0008233')  # peptidase activity
    go_protease = go_term.get_all_children()
    return go_protease
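# Usage sketch: membership tests against the returned descendant-ID set. The GO
# ID below (GO:0004252, serine-type endopeptidase activity) is an illustrative
# example of a term under GO:0008233.
protease_terms = get_go_protease()
print('GO:0004252' in protease_terms)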
ana_DTA = '/home/gstupp/01_2015_mass_spec/H1_11082014/1108_Gly1_2014_12_15_15_29205/dtaselect_results_sfp0.01_p2/DTASelect-filter.txt'
parser = blazmass_tools.dta_select_parser(ana_DTA, small=True)
ps = [get_domains(p) for p in parser]
set_go = set(chain(*[p['set_go'] for p in ps if p['set_go'] is not None]))
for p in ps:
    if p['set_go']:
        p['go_slim'] = set(chain(*[mapslim(go_term, go_dag, goslim_meta)[0]
                                   for go_term in p['set_go'] if go_term in go_dag]))
    else:
        p['go_slim'] = None
go_slim = Counter(chain(*[p['go_slim'] for p in ps if p['go_slim']]))
labels = {go_term: go_dag.query_term(go_term).name for go_term in go_slim.keys()}
[labels[go] for (go, x) in go_slim.most_common(n=10)]

import plot_tools
cmap = plt.cm.jet
colors = cmap(np.linspace(0., 1., len(go_slim.keys())))
explode = [1 if x < 20 else 0 for x in go_slim.values()]
fig = plt.figure(figsize=(8, 8))
ax = plt.subplot(111)
patches = ax.pie(list(go_slim.values()), autopct='%1.1f', explode=explode,
class GO2Mongo(object):
    """Loads a GO (or SO) OBO ontology into MongoDB and indexes its terms."""

    def __init__(self,
                 obo_file="/data/databases/go/go.obo",
                 db="xomeq",
                 client=None,
                 go="ontologies",
                 go_index="col_ont_idx",
                 ontology_name="go",
                 slim_file=None):
        """"""
        self.obo_file = obo_file
        self.slim_file = slim_file
        self.graph = nx.DiGraph()
        self.graph_file = '/data/databases/' + ontology_name + '/' + ontology_name + '.gpickle'
        self.go_dag = None
        if isinstance(db, basestring):
            if not client:
                client = MongoClient()
            self.db = client[db]
        else:
            assert isinstance(db, Database)
            self.db = db
        self.col_go = self.db[go]
        self.col_go_index = self.db[go_index]
        self.ontology_name = ontology_name
        if self.ontology_name == "go":
            self.root_terms = GO_ROOT_TERMS
        else:
            self.root_terms = SO_ROOT_TERMS
        self.ki = KeywordIndexer()

    def init(self):
        _log.debug("Loading ontology file: " + self.obo_file)
        self.go_dag = GODag(self.obo_file)
        _log.debug("Loaded file: " + self.obo_file)
        if os.path.exists(self.graph_file):
            self.graph = nx.read_gpickle(self.graph_file)
        else:
            self._build_graph()
            nx.write_gpickle(self.graph, self.graph_file)
        _log.debug("Generated the term graph")

    def add_unknow(self):
        """
        pepe = {
            "_id" : ObjectId("591f14deaab82b7f88ef8c04"),
            "term" : "go:9999999",
            "name" : "Unknown",
            "ontology" : "go",
            "databases" : [ ],
            "description" : "Unknown",
            "keywords" : [ ],
            "parents" : [ "go:0005575", "go:0003674", "go:0008150" ],
            "children" : [ ],
            "successors" : [ ],
            "subclases" : [ ],
            "successors_relationships" : [ ],
            "database" : "biological_process"
        }
        db.getCollection('ontologies').save(pepe)
        db.proteins.update({organism:"LactoUV",ontologies:{$ne:"go:0008150"}},{$addToSet:{ontologies:"go:9999999"}},{multi:true})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$addToSet:{"subclases":"go:9999999"}})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$addToSet:{"successors":"go:9999999"}})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$addToSet:{"children":"go:9999999"}})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$pull:{"subclases":"go:9999999"}})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$pull:{"successors":"go:9999999"}})
        db.ontologies.update({ontology:"go",term:"go:0005575"},{$pull:{"children":"go:9999999"}})
        {
            "_id" : ObjectId(""),
            "_cls" : "SeqColOntologyIndex",
            "term" : "go:9999999",
            "name" : "unknown",
            "count" : 800,
            "order" : 27,
            "keywords" : [ ],
            "ontology" : "go",
            "database" : "biological_process",
            "seq_collection_name" : "LactoUV",
            "seq_collection_id" : ObjectId("591caafebe737e774090b78d")
        }
        """

    def load(self):
        self.init()
        self._load_mongo()
        _log.info("Obo %s loaded in %s collection %s and index in %s" %
                  (self.obo_file, self.db.name, self.col_go.name, self.col_go_index.name))
        _log.debug("Loading generic slim")
        self.load_slim()
        _log.info("Generic slim terms loaded")

    def load_slim(self, slim_file="/data/databases/go/goslim_generic.obo",
                  database="generic"):
        parser = GODag(slim_file)
        for ont in parser:
            try:
                go = Ontology.objects(ontology="go", term=ont.lower()).get()
                go.databases.append(database)
                go.save()
            except Exception as ex:
                _log.error(ex)
        go = Ontology.objects(ontology="go", term="root").get()
        go.databases.append(database)
        go.save()

    def _load_mongo(self):
        root = Ontology(ontology=self.ontology_name, term="root",
                        successors=self.root_terms, children=self.root_terms)
        root.save()
        for (node, data) in self.graph.nodes_iter(data=True):  # self.graph.add_node(node, **data)
            if node == "root":
                raise Exception("...")
            else:
                successors = self.graph.successors(node)
                _ancestors = self.complete_subgraph([node])
database = "biological_process" if "go:0005575" in _ancestors: database = "cellular_component" if "go:0003674" in _ancestors: database = "molecular_function" ont_doc = Ontology( ontology=self.ontology_name, term=node, name=data["name"], database=database, successors=self.all_successors(node, []), children=successors, description=self.go_dag.query_term(node.upper()).desc, # successors_relationships=self.successors_relationships(node), subclases=list( set([ x.lower() for x in self.go_dag.query_term( node.upper()).get_all_children() ]))) ont_doc.keywords = self.ki.extract_keywords( [ont_doc.description, ont_doc.name, ont_doc.term]) ont_doc.save() def _build_graph(self): assert self.go_dag, "GO terms where not loaded" self.graph.add_node("root", name="root") processed = [] for root_term in self.root_terms: # Iterates over each root root = self.go_dag.query_term(root_term) self.graph.add_node(root_term.lower(), name=root.name) self.graph.add_edge("root", root_term.lower()) self._load_branch(root, processed) def _load_branch(self, term, processed): term_id = term.id.lower() if term_id in processed: return processed.append(term_id) if term.children: # or term.relationships: for child in term.children: # + [x[1] for x in term.relationships]): child_id = child.id.lower() self.graph.add_node(child_id, name=child.name) self.graph.add_edge(term_id, child_id) self._load_branch(child, processed) def all_successors(self, node, processed): successors = self.graph.successors(node) if successors: for x in successors: if x not in processed: processed.append(x) successors = list( set(successors + self.all_successors(x, processed))) else: successors = list() return successors def successors_relationships(self, node): term = self.go_dag.query_term(node.upper()) return [[x.id.lower(), "is_a"] for x in term.children] + [[x[1].id.lower(), x[0]] for x in term.relationships] def cleanup_cellular_component_annotations(self, genome): for ont_doc in Ontology.objects(ontology=self.ontology_name, database="cellular_component", databases__ne="generic"): # self.db["proteins"].update({"organism":genome, }, {"$pull":{"ontologies":ont_doc.term, "keywords":ont_doc.term}}, multi=True) self.db["col_ont_idx"].remove( { "ontology": "go", "seq_collection_name": genome.name, "term": ont_doc.term }, multi=True) def complete_subgraph(self, ontologies): allontologies = copy.copy(ontologies) for ontology in ontologies: allontologies += self._complete_parents(ontology, []) return [x for x in set(allontologies) if x != "root"] def _complete_parents(self, ontology, walked): allontologies = [ontology] walked.append(ontology) if ontology in self.graph: for ancestor in ancestors(self.graph, ontology): if ancestor not in walked: allontologies += self._complete_parents(ancestor, walked) return allontologies else: return allontologies # def pre_build_index(self, genome, annotated_collection="proteins", annotated_collection_field="ontologies", drop=True): if drop: print( self.col_go_index.remove({ "seq_collection_id": genome.id, "ontology": self.ontology_name })) ont_succ_cache = {} for ont_doc in tqdm( Ontology.objects(ontology=self.ontology_name).no_cache(), total=Ontology.objects(ontology=self.ontology_name).count()): ont_succ_cache[ont_doc.term] = ont_doc.successors database = "" if hasattr(ont_doc, "database") and ont_doc.database: database = ont_doc.database # if hasattr(ont_doc, "databases") and ont_doc.databases: # database = ont_doc.databases[0] order = len(ont_doc["children"]) seq_ont_ont_idx = SeqColOntologyIndex( 
                term=ont_doc.term.lower(),
                name=ont_doc.name,
                count=0,
                seq_collection_name=genome.name,
                database=database,
                ontology=self.ontology_name,
                order=order,
                seq_collection_id=genome.id,
                keywords=ont_doc.keywords)
            seq_ont_ont_idx.save()
        ont_count = defaultdict(lambda: 0)
        query = {"seq_collection_id": genome.id, "ontologies.0": {"$exists": True}}
        for p in tqdm(self.db[annotated_collection].find(query, {"ontologies": 1}),
                      total=self.db[annotated_collection].count(query)):
            terms = [x for x in p["ontologies"] if x.startswith("go:")]
            terms = self.complete_subgraph(terms)
            for x in terms:
                ont_count[x] += 1
            self.db[annotated_collection].update(
                {"_id": p["_id"]},
                {"$addToSet": {annotated_collection_field: {"$each": terms}}})
        for term, count in tqdm(ont_count.items()):
            for seq_ont_ont_idx in SeqColOntologyIndex.objects(
                    seq_collection_id=genome.id,
                    ontology=self.ontology_name,
                    term=term):
                seq_ont_ont_idx.count = count
                seq_ont_ont_idx.save()
        SeqColOntologyIndex.objects(seq_collection_id=genome.id, count=0).delete()
        self.cleanup_cellular_component_annotations(genome)
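# A hedged usage sketch for GO2Mongo, assuming a local MongoDB instance, the
# mongoengine models referenced above, and the default OBO path from __init__;
# the database name is the class default and is illustrative here.
loader = GO2Mongo(obo_file="/data/databases/go/go.obo", db="xomeq")
loader.load()  # parses the OBO, builds/persists the term graph, fills the collections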