def load_whog(self, whog_path):
    """Load COG group definitions from an NCBI "whog" file into the Ontology collection.

    Groups in the file are separated by runs of underscores; the first
    non-empty line of each group is expected to look like
    ``[X] COGnnnn description ...`` where ``[X]`` holds one or more
    functional-category letters.

    :param whog_path: path to the whog flat file.
    """
    with open(whog_path, "r") as handle:
        # Groups are delimited by underscore separator lines.
        groups = handle.read().split("_______")
        for group in groups:
            glines = [g.strip() for g in group.split("\n") if g.strip()]
            if not glines:
                continue
            # Header line of the group; lower-case and split ONCE instead of
            # recomputing lower().strip().split() for every field (original
            # did this three times).
            line = glines[0].lower()
            fields = line.split(" ")
            parent = fields[0]
            term = fields[1]
            name = " ".join(fields[2:])
            ont_doc = Ontology(term=term, name=name, parent=parent, ontology="cog")
            keywords = self.ki.extract_keywords(line)
            if len(parent) > 3:
                # Multi-letter category like "[ab]": strip the brackets and
                # link this COG to each single-letter parent category,
                # accumulating the parents' keywords.
                for x in parent[1:-1]:
                    parent_ont_doc = Ontology.objects(term='[' + x + ']').get()
                    keywords = list(set(parent_ont_doc.keywords + keywords))
                    parent_ont_doc.children.append(term)
                    parent_ont_doc.save()
            else:
                # Single-letter category like "[a]": use the bracketed token as-is.
                parent_ont_doc = Ontology.objects(term=parent).get()
                parent_ont_doc.children.append(term)
                parent_ont_doc.save()
                keywords = list(set(parent_ont_doc.keywords + keywords))
            ont_doc.keywords = keywords
            ont_doc.save()
def load_children(self):
    """Populate ``children`` for every EC ontology term.

    A term's children are the terms that share its concrete prefix
    (the term with ``.-`` placeholders removed) and have exactly one
    fewer ``-`` placeholder, i.e. the next more-specific EC level.
    The pseudo-term "root" is skipped.
    """
    for ont_doc in Ontology.objects(ontology="ec"):
        if ont_doc.term == "root":
            continue
        # e.g. "ec:1.2.-.-" -> prefix "ec:1.2"
        prefix = ont_doc.term.replace(".-", "")
        ont_doc.children = [
            x["term"]
            for x in Ontology.objects(ontology="ec", term__istartswith=prefix)
            if (x.term != ont_doc.term)
            and (x.term.count("-") == (ont_doc.term.count("-") - 1))
        ]
        # NOTE: removed a discarded `ont_doc.__repr__()` call — its return
        # value was never used.
        ont_doc.save()
def pre_build_index(self, genome, annotated_collection="proteins", annotated_collection_field="ontologies",
                    drop=False):
    """Build the per-genome EC term index (SeqColOntologyIndex documents).

    Creates one zero-count index entry per EC term, counts annotated
    documents matching each term (treating ``-`` placeholders as
    wildcards), deletes entries with no hits, and finally inserts a
    synthetic "root" entry with the total count.

    :param genome: SeqCollection-like document (uses .id and .name).
    :param annotated_collection: mongo collection holding the annotations.
    :param annotated_collection_field: field queried for EC terms.
    :param drop: unused here; kept for interface compatibility.
    """
    self.col_index.remove({"ontology": "ec", "seq_collection_id": genome.id})
    _log.debug("creating empty ecs idxs")
    ec_count = Ontology.objects(ontology="ec").count()
    for ont_doc in tqdm(Ontology.objects(ontology="ec").no_cache(), total=ec_count):
        # Lower "order" = more specific term; derived from the numeric part
        # of the EC number.
        order = 9999999
        try:
            order = order / int(
                ont_doc.term.lower().replace("ec:", "").replace(".", "").replace("-", "").replace("n", ""))
        except (ValueError, ZeroDivisionError):
            # The stripped term may contain no digits (ValueError) or only
            # zeros (ZeroDivisionError); keep the default order then.
            # (Was a bare `except:` that hid every other failure.)
            pass
        seq_col_ont_idx = SeqColOntologyIndex(seq_collection_id=genome.id, term=ont_doc.term.lower(),
                                              seq_collection_name=genome.name, name=ont_doc.name,
                                              ontology="ec", order=order, count=0,
                                              keywords=ont_doc.keywords)
        seq_col_ont_idx.save()
    _log.debug("initializign idx ecs")
    terms_count = SeqColOntologyIndex.objects(ontology="ec", seq_collection_id=genome.id).count()
    for ont_doc in tqdm(SeqColOntologyIndex.objects(ontology="ec", seq_collection_id=genome.id).order_by(
            "-term").no_cache().timeout(False), total=terms_count):
        if "-" in ont_doc.term:
            # Placeholder levels match any sublevel: escape dots, turn "-"
            # into ".+" so e.g. "ec:1.2.-.-" matches "ec:1.2.3.4".
            str_term = ont_doc.term.replace(".", "\.").replace("-", ".+")
        else:
            str_term = ont_doc.term
        # NOTE(review): "$options": "-i" looks odd — MongoDB documents the
        # case-insensitive flag as "i"; confirm against the deployed server
        # version before changing.
        count = self.db[annotated_collection].count({"seq_collection_id": genome.id,
                                                     annotated_collection_field: {"$regex": '^' + str_term,
                                                                                  "$options": "-i"}})
        if count:
            ont_doc.count = count
            ont_doc.save()
        else:
            # No annotated document carries this term: drop the empty entry.
            ont_doc.delete()
    regx = re.compile("^ec:", re.IGNORECASE)
    # Synthetic root entry holding the total EC-annotated protein count.
    self.db.col_ont_idx.insert(
        {
            "_id": ObjectId(),
            "_cls": "SeqColOntologyIndex",
            "term": "root",
            "name": "root",
            "count": self.db.proteins.count({"organism": genome.name, "ontologies": regx}),
            "order": 9999999,
            "keywords": [],
            "ontology": "ec",
            "seq_collection_name": genome.name,
            "seq_collection_id": genome.id
        })
    _log.debug("ecs idxs done")
def load_slim(self, slim_file="/data/databases/go/goslim_generic.obo", database="generic"):
    """Tag every GO term present in a slim OBO file with *database*.

    Each term id found in the slim is appended to the matching Ontology
    document's ``databases`` list; lookup failures are logged and
    skipped. The GO "root" document is always tagged as well.

    :param slim_file: path to the slim OBO file.
    :param database: label to append (e.g. "generic").
    """
    dag = GODag(slim_file)
    for term_id in dag:
        try:
            ont_doc = Ontology.objects(ontology="go", term=term_id.lower()).get()
            ont_doc.databases.append(database)
            ont_doc.save()
        except Exception as ex:
            _log.error(ex)
    root_doc = Ontology.objects(ontology="go", term="root").get()
    root_doc.databases.append(database)
    root_doc.save()
def update_ont_org_idx(self):
    """Refresh cached reaction index entries from their Ontology documents.

    For each (term, index-document) pair in ``self.react_ont_idx_dict``,
    looks up the corresponding "<ontology>_reac" Ontology document and,
    when found, copies its name and merges its keywords into the index
    entry before saving. Terms without a matching document are left
    untouched.
    """
    for term, idx_doc in self.react_ont_idx_dict.items():
        matches = Ontology.objects(ontology=self.ontology_name + "_reac", term=term.lower())
        if not len(matches):
            continue
        reaction = matches.first()
        idx_doc.name = reaction.name
        idx_doc.keywords = list(set(idx_doc.keywords + reaction.keywords))
        idx_doc.save()
def cleanup_cellular_component_annotations(self, genome):
    """Remove index entries for non-generic cellular_component terms.

    For every Ontology term of this ontology that belongs to the
    "cellular_component" database but is NOT in the generic slim,
    deletes its per-genome entries from the ``col_ont_idx`` collection.

    :param genome: SeqCollection-like document (uses .name).
    """
    for ont_doc in Ontology.objects(ontology=self.ontology_name, database="cellular_component",
                                    databases__ne="generic"):
        # Use self.ontology_name here too — the remove previously
        # hard-coded "go" while the query above filtered by
        # self.ontology_name, which was inconsistent.
        self.db["col_ont_idx"].remove(
            {
                "ontology": self.ontology_name,
                "seq_collection_name": genome.name,
                "term": ont_doc.term
            }, multi=True)
def _successors(self, term, counts):
    """Return the child EC terms of *term*, or [] if the term is unknown.

    :param term: EC term identifier to look up.
    :param counts: unused; kept for interface compatibility with callers.
    """
    try:
        ec_term = Ontology.objects(ontology="ec", term=term).get()
        return ec_term.children
    except Exception:
        # .get() raises when the term is missing (or ambiguous); treat
        # that as "no successors" instead of crashing the traversal.
        # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
        return []
def index_seq_collection(db, genome, ec=True, go=True, keywords=True, organism_idx=True, pathways=True,
                         structure=False, go_obo="/data/databases/go/go.obo"):
    """Build the search indexes (EC, GO, pathways, keywords, ontology-by-organism)
    for one sequence collection.

    :param db: pymongo database handle.
    :param genome: name of the SeqCollection to index.
    :param ec/go/keywords/organism_idx/pathways/structure: toggles for each index step.
    :param go_obo: path to the GO OBO file used by the GO indexer.
    """
    collection = SeqCollection.objects(name=genome).get()
    # term -> Ontology document cache, shared by the keywords and
    # organism_idx steps. Initialized here because organism_idx reads it
    # even when the keywords step is skipped (previously a NameError when
    # keywords=False and organism_idx=True).
    cache = {}
    if ec:
        ec2mongo = EC2Mongo(db)
        _log.debug("Building EC index...")
        ec2mongo.pre_build_index(collection)
        collection.ec_index = True
        _log.debug("EC index finished")
        collection.save()
    if go:
        go2mongo = GO2Mongo(go_obo, db)
        go2mongo.init()
        _log.debug("Building GO index...")
        go2mongo.pre_build_index(collection)
        collection.go_index = True
        collection.save()
        _log.debug("GO index finished")
    if structure:
        si = StructuromeIndexer(collection)
        si.build_index()
    if pathways:
        biocyc = BioCyc(db)
        biocyc.user = BioMongoDB.demo
        _log.debug("Building Biocyc index...")
        biocyc.pre_build_index(collection)
        _log.debug("Biocyc index finished")
    if keywords:
        _log.debug("indexing by keyword...")
        ki = KeywordIndexer()
        total_p = db.proteins.count({"organism": genome})
        with tqdm(Protein.objects(organism=genome).no_cache().timeout(False), total=total_p) as pbar:
            for prot in pbar:
                pbar.set_description(prot.name)
                # Basic keywords extracted from name, description and gene names.
                current_keywords = list(
                    set([
                        x.lower().strip() for x in reduce(
                            list.__add__,
                            map(ki.extract_keywords, [prot.name, prot.description] + prot.gene))
                    ]))
                prot.keywords = current_keywords + prot.keywords
                # Ontology-derived keywords: annotated terms plus feature
                # identifiers/types.
                terms = prot.ontologies
                terms = terms + [
                    x.identifier.strip().lower() for x in prot.features if x.identifier
                ]
                # NOTE(review): this filters on x.identifier while collecting
                # x.type — possibly intended to be `if x.type`; confirm.
                terms = terms + [
                    x.type.strip().lower() for x in prot.features if x.identifier
                ]
                terms = list(set([x.lower() for x in terms]))
                for term in terms:
                    if term not in cache:
                        ont = Ontology.objects(term=str(term))
                        if len(ont):
                            cache[term] = ont.first()
                    if term in cache:
                        prot.keywords = prot.keywords + cache[term].keywords
                # SO:0001060 missense_variant
                prot.keywords = list(set(prot.keywords + terms))
                prot.save()
        _log.debug("Keyword index finished")
    if organism_idx:
        _log.debug("indexing ontology by organism")
        prots = list(
            db.proteins.find({
                "organism": genome,
                "ontologies.0": {
                    "$exists": True
                }
            }))
        for prot in tqdm(prots):
            for term in prot["ontologies"]:
                # EC/GO terms already have their own dedicated indexes.
                if (term in cache) and cache[term].ontology not in ["ec", "go"]:
                    seq_col_ont_idx = SeqColOntologyIndex(
                        seq_collection_id=collection.id,
                        term=term,
                        seq_collection_name=genome,
                        name=cache[term].name,
                        ontology=cache[term].ontology,
                        keywords=cache[term].keywords)
                    seq_col_ont_idx.save()
        SeqColOntologyIndex.objects(count=0).delete()
        _log.debug("Organism index finished")
    collection.save()
    # Lazy %-formatting; the original concatenated the genome onto the
    # format string ("indexing %s finished" + genome), leaving a literal
    # "%s" in the log message.
    _log.info("indexing %s finished", genome)
def pre_build_index(self, genome, annotated_collection="proteins", annotated_collection_field="ontologies",
                    drop=True):
    """Build the per-genome GO term index (SeqColOntologyIndex documents).

    Creates one zero-count index entry per GO term, counts proteins per
    term after closing each protein's annotation set over the GO graph
    (``complete_subgraph``), writes those counts back, deletes entries
    that stayed at zero, and finally prunes non-generic
    cellular_component entries.

    :param genome: SeqCollection-like document (uses .id and .name).
    :param annotated_collection: mongo collection holding the annotations.
    :param annotated_collection_field: field to extend with inferred terms.
    :param drop: when True, remove existing index entries for this genome first.
    """
    if drop:
        print(
            self.col_go_index.remove({
                "seq_collection_id": genome.id,
                "ontology": self.ontology_name
            }))
    # NOTE(review): ont_succ_cache is filled below but never read in this
    # method — possibly leftover, or `ont_doc.successors` is accessed for
    # a side effect; confirm before removing.
    ont_succ_cache = {}
    for ont_doc in tqdm(
            Ontology.objects(ontology=self.ontology_name).no_cache(),
            total=Ontology.objects(ontology=self.ontology_name).count()):
        ont_succ_cache[ont_doc.term] = ont_doc.successors
        database = ""
        if hasattr(ont_doc, "database") and ont_doc.database:
            database = ont_doc.database
        # if hasattr(ont_doc, "databases") and ont_doc.databases:
        #     database = ont_doc.databases[0]
        # More children = more generic term; used as a sort key.
        order = len(ont_doc["children"])
        seq_ont_ont_idx = SeqColOntologyIndex(
            term=ont_doc.term.lower(),
            name=ont_doc.name,
            count=0,
            seq_collection_name=genome.name,
            database=database,
            ontology=self.ontology_name,
            order=order,
            seq_collection_id=genome.id,
            keywords=ont_doc.keywords)
        seq_ont_ont_idx.save()
    ont_count = defaultdict(lambda: 0)
    query = {
        "seq_collection_id": genome.id,
        "ontologies.0": {
            "$exists": True
        }
    }
    # Count proteins per GO term, expanding each protein's direct
    # annotations to the full ancestor subgraph, and persist the expanded
    # term set back onto the protein document.
    for p in tqdm(self.db[annotated_collection].find(
            query, {"ontologies": 1}),
            total=self.db[annotated_collection].count(query)):
        terms = [x for x in p["ontologies"] if x.startswith("go:")]
        terms = self.complete_subgraph(terms)
        for x in terms:
            ont_count[x] += 1
        self.db[annotated_collection].update(
            {"_id": p["_id"]},
            {"$addToSet": {
                annotated_collection_field: {
                    "$each": terms
                }
            }})
    # Write the accumulated counts into the index entries.
    for term, count in tqdm(ont_count.items()):
        for seq_ont_ont_idx in SeqColOntologyIndex.objects(
                seq_collection_id=genome.id,
                ontology=self.ontology_name,
                term=term):
            seq_ont_ont_idx.count = count
            seq_ont_ont_idx.save()
    # Entries no protein matched stay at count=0 and are dropped.
    SeqColOntologyIndex.objects(seq_collection_id=genome.id, count=0).delete()
    self.cleanup_cellular_component_annotations(genome)