def create_ontology(self, ontology_db):
    """Rebuild the COG ontology documents from ``self.cog``.

    Removes any pre-existing COG entries from *ontology_db*, then saves
    one Ontology document per COG term with keywords extracted by
    ``self.ki``.
    """
    # Drop stale COG records before re-inserting.
    ontology_db.remove({"ontology": "cog"})
    for term in self.cog:
        doc = Ontology(term=term,
                       keywords=self.ki.extract_keywords(term),
                       ontology="cog")
        doc.save()
def load_children(self):
    """Populate the ``children`` field of every EC ontology document.

    A term's children are the EC terms that extend its defined prefix by
    exactly one level: they start with the prefix obtained by stripping
    the ".-" placeholders and carry one fewer "-" wildcard than the
    parent term.
    """
    for ont_doc in Ontology.objects(ontology="ec"):
        if ont_doc.term == "root":
            continue
        # Defined prefix without the undefined levels,
        # e.g. "ec:1.2.-.-" -> "ec:1.2"
        prefix = ont_doc.term.replace(".-", "")
        ont_doc.children = [
            x["term"]
            for x in Ontology.objects(ontology="ec",
                                      term__istartswith=prefix)
            # direct children only: one fewer wildcard than the parent
            if x.term != ont_doc.term
            and x.term.count("-") == ont_doc.term.count("-") - 1
        ]
        # NOTE(review): removed a stray no-op ``ont_doc.__repr__()`` call
        # whose return value was discarded.
        ont_doc.save()
def pre_build_index(self, genome, annotated_collection="proteins",
                    annotated_collection_field="ontologies", drop=False):
    """Build the per-genome EC ontology browsing index.

    Phase 1 creates one empty SeqColOntologyIndex entry per EC term.
    Phase 2 counts each term against *annotated_collection* (terms with
    "-" wildcards are matched as case-insensitive prefix regexes) and
    deletes entries with no matches. Finally a synthetic "root" entry is
    inserted with the total count of EC-annotated proteins.

    :param genome: SeqCollection-like object exposing ``id`` and ``name``.
    :param annotated_collection: mongo collection holding annotations.
    :param annotated_collection_field: document field with ontology terms.
    :param drop: unused here; kept for interface compatibility.
    """
    self.col_index.remove({"ontology": "ec", "seq_collection_id": genome.id})
    _log.debug("creating empty ecs idxs")
    ec_count = Ontology.objects(ontology="ec").count()
    for ont_doc in tqdm(Ontology.objects(ontology="ec").no_cache(), total=ec_count):
        order = 9999999
        try:
            # More fully specified ECs produce a larger divisor and thus a
            # smaller order, so they sort before wildcard terms.
            order = order / int(
                ont_doc.term.lower().replace("ec:", "").replace(".", "")
                .replace("-", "").replace("n", ""))
        except (ValueError, ZeroDivisionError):
            # Term has no usable digits (e.g. all wildcards): keep default.
            # (was a bare ``except: pass``, which also swallowed
            # KeyboardInterrupt/SystemExit)
            pass
        seq_col_ont_idx = SeqColOntologyIndex(seq_collection_id=genome.id,
                                              term=ont_doc.term.lower(),
                                              seq_collection_name=genome.name,
                                              name=ont_doc.name, ontology="ec",
                                              order=order, count=0,
                                              keywords=ont_doc.keywords)
        seq_col_ont_idx.save()
    _log.debug("initializing idx ecs")
    terms_count = SeqColOntologyIndex.objects(ontology="ec",
                                              seq_collection_id=genome.id).count()
    for ont_doc in tqdm(SeqColOntologyIndex.objects(
            ontology="ec", seq_collection_id=genome.id).order_by(
            "-term").no_cache().timeout(False), total=terms_count):
        if "-" in ont_doc.term:
            # Wildcard term -> prefix regex, e.g. "ec:1.2.-.-" matches
            # every term under "ec:1.2.".  (raw string fixes the
            # invalid "\." escape of the original)
            str_term = ont_doc.term.replace(".", r"\.").replace("-", ".+")
        else:
            str_term = ont_doc.term
        count = self.db[annotated_collection].count(
            {"seq_collection_id": genome.id,
             annotated_collection_field: {"$regex": '^' + str_term,
                                          "$options": "-i"}})
        if count:
            ont_doc.count = count
            ont_doc.save()
        else:
            # No annotated protein uses this term: drop the index entry.
            ont_doc.delete()
    regx = re.compile("^ec:", re.IGNORECASE)
    # Synthetic root entry holding the total EC-annotated protein count.
    self.db.col_ont_idx.insert(
        {
            "_id": ObjectId(),
            "_cls": "SeqColOntologyIndex",
            "term": "root",
            "name": "root",
            "count": self.db.proteins.count({"organism": genome.name,
                                             "ontologies": regx}),
            "order": 9999999,
            "keywords": [],
            "ontology": "ec",
            "seq_collection_name": genome.name,
            "seq_collection_id": genome.id
        })
    _log.debug("ecs idxs done")
def load_pathways(self, pathways_file, database):
    """Load pathway terms from a tab-separated pathways file.

    Skips comment lines ("#") and the "UNIQUE-ID" header; column 0 is
    the term (lower-cased), column 1 the display name. Each row becomes
    an Ontology document tagged with *database*.
    """
    with open(pathways_file) as handle:
        for raw in handle.readlines():
            stripped = raw.strip()
            # Skip comments and the header row.
            if stripped.startswith("#") or stripped.startswith("UNIQUE-ID"):
                continue
            # Collapse repeated tabs so column indices stay stable.
            columns = re.sub(r'\t+', '\t', raw).split("\t")
            term = columns[0].strip().lower()
            name = columns[1].strip()
            doc = Ontology(term=term, name=name,
                           ontology=self.ontology_name + "_pw",
                           keywords=self.ki.extract_keywords(name) + [term])
            doc.databases.append(database)
            doc.save()
def load_fun(self, fun_path):
    """Load the COG functional-category file (fun.txt).

    Groups are separated by CRLF blank lines; the first line of each
    group is the parent category, the remaining lines are its children.
    Child keywords are the union of the parent's keywords and those
    extracted from the child line.
    """
    with open(fun_path, "r") as handle:
        content = handle.read()
    for block in re.split(re.compile("\r\n\r\n"), content):
        lines = [l.strip() for l in block.split("\n") if l.strip()]
        parent_term = lines[0].strip().lower()
        parent = Ontology(term=parent_term, name=parent_term,
                          keywords=self.ki.extract_keywords(parent_term),
                          ontology="cog")
        parent.save()
        for child_line in lines[1:]:
            tokens = child_line.lower().strip().split(" ")
            child_term = tokens[0]
            child_name = " ".join(tokens[1:])
            child = Ontology(
                term=child_term,
                name=child_name,
                parent=parent.term,
                keywords=list(set(parent.keywords +
                                  self.ki.extract_keywords(child_line))),
                ontology="cog")
            parent.children.append(child_term)
            child.save()
def load_slim(self, slim_file="/data/databases/go/goslim_generic.obo", database="generic"):
    """Tag GO terms present in a GO-slim OBO file with *database*.

    Terms listed in the slim but absent from our GO collection are
    logged and skipped. The GO "root" document is always tagged.
    """
    dag = GODag(slim_file)
    for slim_term in dag:
        try:
            ont = Ontology.objects(ontology="go", term=slim_term.lower()).get()
            ont.databases.append(database)
            ont.save()
        except Exception as ex:
            # Slim term missing (or duplicated) in our GO collection.
            _log.error(ex)
    root = Ontology.objects(ontology="go", term="root").get()
    root.databases.append(database)
    root.save()
def update_ont_org_idx(self):
    """Enrich cached reaction index entries and persist them.

    For each cached SeqColOntologyIndex in ``self.react_ont_idx_dict``,
    copy the name and merge the keywords from the matching ``*_reac``
    Ontology document (when one exists), then save the entry.
    """
    for term, idx_entry in self.react_ont_idx_dict.items():
        matches = Ontology.objects(ontology=self.ontology_name + "_reac",
                                   term=term.lower())
        if len(matches):
            reaction = matches.first()
            idx_entry.name = reaction.name
            idx_entry.keywords = list(set(idx_entry.keywords + reaction.keywords))
        # Saved even when no reaction document was found.
        idx_entry.save()
def load_whog(self, whog_path):
    """Load the COG "whog" file into the ontology collection.

    Groups are separated by "_______" lines. The first non-empty line of
    each group looks like "[X] COGnnnn description...", where X is one or
    more functional-category letters; the bracketed token is used as the
    parent, the second token as the COG term, and the remainder as the
    name. The new term is appended to each parent's ``children`` and
    inherits the parents' keywords.
    """
    with open(whog_path, "r") as handle:
        groups = handle.read().split("_______")
        for group in groups:
            glines = [g.strip() for g in group.split("\n") if g.strip()]
            if glines:
                line = glines[0].lower()
                parent = line.lower().strip().split(" ")[0]
                term = line.lower().strip().split(" ")[1]
                name = " ".join(line.lower().strip().split(" ")[2:])
                ont_doc = Ontology(term=term, name=name, parent=parent, ontology="cog")
                keywords = self.ki.extract_keywords(line)
                if len(parent) > 3:
                    # Multi-letter category like "[ab]": register the term
                    # under each single-letter parent "[a]", "[b]", ...
                    # and merge every parent's keywords.
                    for x in parent[1:-1]:
                        parent_ont_doc = Ontology.objects(term='[' + x + ']').get()
                        keywords = list(set(parent_ont_doc.keywords + keywords))
                        parent_ont_doc.children.append(term)
                        parent_ont_doc.save()
                else:
                    # Single-letter category, e.g. "[j]": one parent lookup.
                    parent_ont_doc = Ontology.objects(term=parent).get()
                    parent_ont_doc.children.append(term)
                    parent_ont_doc.save()
                    keywords = list(set(parent_ont_doc.keywords + keywords))
                ont_doc.keywords = keywords
                ont_doc.save()
def load_enzclass(self, enzclass_file_path):
    """Load EC class/subclass names from an ENZYME "enzclass" file.

    Creates the EC root document (whose children are the six top-level
    classes) and one Ontology document per class/subclass line.
    """
    Ontology(ontology=self.ontology_name, term="root", name="ec",
             children=["ec:" + digit + ".-.-.-" for digit in "123456"]).save()
    with open(enzclass_file_path) as handle:
        for line in handle:
            # Only class/subclass lines start with a digit 1-6 and a dot.
            if not re.match(r'^[1-6][.]', line):
                continue
            name = line.split(".-")[-1].strip()
            # The term is the line minus the name, with spaces removed.
            term = "ec:" + line.replace(name, "").replace(" ", "").strip()
            doc = Ontology(ontology=self.ontology_name, term=term, name=name)
            doc.keywords = self.ki.extract_keywords(doc.name) + [doc.term]
            doc.save()
def load_enzdata(self, enzdata_file_path):
    """Load EC entries from an ENZYME "enzyme.dat" file.

    An "ID" line starts a new entry; a subsequent "DE" line supplies the
    name, triggers keyword extraction and persists the entry.
    """
    ont_doc = None
    with open(enzdata_file_path) as enzclass_handle:
        for line in enzclass_handle:
            if line.startswith("ID"):
                term = "ec:" + line.split("ID")[1].strip()
                ont_doc = Ontology(ontology=self.ontology_name, term=term)
            elif line.startswith("DE"):
                if ont_doc is None:
                    # Robustness fix: a "DE" line before the first "ID"
                    # (e.g. in the file header) used to raise
                    # AttributeError on None; skip it instead.
                    continue
                ont_doc.name = line.split("DE")[1].strip()
                ont_doc.keywords = self.ki.extract_keywords(
                    [ont_doc.description, ont_doc.name]) + [ont_doc.term]
                ont_doc.save()
def cleanup_cellular_component_annotations(self, genome):
    """Remove per-genome GO index entries for non-generic
    cellular_component terms.

    Only the browsing index (``col_ont_idx``) is cleaned; the protein
    documents keep their annotations.
    """
    cc_terms = Ontology.objects(ontology=self.ontology_name,
                                database="cellular_component",
                                databases__ne="generic")
    for ont_doc in cc_terms:
        self.db["col_ont_idx"].remove(
            {
                "ontology": "go",
                "seq_collection_name": genome.name,
                "term": ont_doc.term
            },
            multi=True)
def _load_mongo(self):
    """Persist the in-memory GO graph (``self.graph``) as Ontology documents.

    Saves a synthetic "root" document first, then one document per graph
    node, classifying each term into one of the three GO namespaces by
    checking which namespace root appears among its ancestors.

    NOTE(review): ``nodes_iter`` only exists in networkx 1.x — confirm the
    pinned networkx version before touching this.
    """
    root = Ontology(ontology=self.ontology_name, term="root",
                    successors=self.root_terms, children=self.root_terms)
    root.save()
    for (node, data) in self.graph.nodes_iter(
            data=True):  # self.graph.add_node(node, **data)
        if node == "root":
            # "root" is synthetic and must never appear as a graph node.
            raise Exception("...")
        else:
            successors = self.graph.successors(node)
            _ancestors = self.complete_subgraph([node])
            # Default namespace; overridden if a namespace root is among
            # the term's ancestors (go:0005575 = cellular_component,
            # go:0003674 = molecular_function).
            database = "biological_process"
            if "go:0005575" in _ancestors:
                database = "cellular_component"
            if "go:0003674" in _ancestors:
                database = "molecular_function"
            ont_doc = Ontology(
                ontology=self.ontology_name,
                term=node,
                name=data["name"],
                database=database,
                successors=self.all_successors(node, []),
                children=successors,
                description=self.go_dag.query_term(node.upper()).desc,
                # successors_relationships=self.successors_relationships(node),
                subclases=list(
                    set([
                        x.lower() for x in self.go_dag.query_term(
                            node.upper()).get_all_children()
                    ])))
            ont_doc.keywords = self.ki.extract_keywords(
                [ont_doc.description, ont_doc.name, ont_doc.term])
            ont_doc.save()
def _successors(self, term, counts):
    """Return the children of an EC term, or [] when lookup fails.

    :param term: EC term, e.g. "ec:1.2.-.-".
    :param counts: unused; kept for interface compatibility.
    """
    try:
        ec_term = Ontology.objects(ontology="ec", term=term).get()
        return ec_term.children  # [ x for x in counts.keys() if re.match(term.split("-")[0], x) ]
    except Exception:
        # ``.get()`` raises when the term is missing or duplicated; treat
        # both as "no children". (Was a bare ``except``, which also
        # swallowed KeyboardInterrupt/SystemExit.)
        return []
def load_dat(self, reactions_file, database, postfix):
    """Load a BioCyc-style .dat file (records separated by "//" lines).

    Each record becomes one Ontology document in the
    ``self.ontology_name + postfix`` namespace, tagged with *database*.
    Field lines have the form "FIELD - value"; recognized fields are
    UNIQUE-ID, TYPES, IN-PATHWAY, COMMON-NAME, COMMENT and EC-NUMBER.
    Records with no UNIQUE-ID are printed and skipped.

    NOTE(review): ``.strip().decode("utf-8")`` only works on Python 2
    byte-strings; on Python 3 ``str`` has no ``decode`` and this raises
    AttributeError (not the caught UnicodeDecodeError) — confirm the
    target interpreter before running this loader.
    """
    with open(reactions_file) as reactions_handle:
        lines = [
            x for x in reactions_handle.readlines() if not x.startswith("#")
        ]
        records = re.split("//\n", "\n".join(lines))
        for record in records:
            if not record.strip():
                continue
            ont_doc = Ontology(ontology=self.ontology_name + postfix)
            ont_doc.databases.append(database)
            reaction_types = []
            ec = None
            for str_record in [y for y in record.split("\n") if y]:
                if str_record.strip() and len(str_record.strip()) > 3:
                    if len(str_record.split(" - ")) > 1:
                        field = str_record.split(" - ")[0].strip()
                        try:
                            value = str_record.split(
                                " - ")[1].strip().decode("utf-8")
                        except UnicodeDecodeError:
                            # Undecodable line: skip just this field.
                            continue
                        if field == "UNIQUE-ID":
                            ont_doc.term = value.lower()
                        elif field == "TYPES":
                            reaction_types.append(value)
                        elif field == "IN-PATHWAY":
                            ont_doc.parents.append(value)
                        elif field == "COMMON-NAME":
                            ont_doc.name = value
                        elif (field == "COMMENT") and (not ont_doc.name):
                            ont_doc.description = value
                        elif (field == "EC-NUMBER") and (not ont_doc.name):
                            ec = value
            # Fallbacks: description from the TYPES list, name from the
            # EC number or the term itself.
            if not ont_doc.description:
                ont_doc.description = "|".join(reaction_types)
            if not ont_doc.name:
                if ec:
                    ont_doc.name = ec
                else:
                    ont_doc.name = ont_doc.term
            ont_doc.keywords = self.ki.extract_keywords(
                ont_doc.name) + [ont_doc.term]
            ont_doc.types = reaction_types
            if ec:
                ont_doc.keywords.append(ec)
            if not ont_doc.term:
                # No UNIQUE-ID: report the malformed record, do not save.
                print(record)
            else:
                ont_doc.save()
def index_seq_collection(db, genome, ec=True, go=True, keywords=True, organism_idx=True, pathways=True,
                         structure=False, go_obo="/data/databases/go/go.obo"):
    """Build the browsing indexes for a sequence collection.

    Depending on the flags, builds the EC, GO, structure, pathway (BioCyc),
    keyword and per-organism ontology indexes for the SeqCollection named
    *genome*.

    :param db: pymongo database handle.
    :param genome: name of the SeqCollection to index.
    :param ec/go/keywords/organism_idx/pathways/structure: which indexes to build.
    :param go_obo: path to the GO OBO file used by the GO indexer.
    """
    collection = SeqCollection.objects(name=genome).get()
    if ec:
        ec2mongo = EC2Mongo(db)
        _log.debug("Building EC index...")
        ec2mongo.pre_build_index(collection)
        collection.ec_index = True
        _log.debug("EC index finished")
        collection.save()
    if go:
        go2mongo = GO2Mongo(go_obo, db)
        go2mongo.init()
        _log.debug("Building GO index...")
        go2mongo.pre_build_index(collection)
        collection.go_index = True
        collection.save()
        _log.debug("GO index finished")
    if structure:
        si = StructuromeIndexer(collection)
        si.build_index()
    if pathways:
        biocyc = BioCyc(db)
        biocyc.user = BioMongoDB.demo
        _log.debug("Building Biocyc index...")
        biocyc.pre_build_index(collection)
        _log.debug("Biocyc index finished")
    # BUGFIX: ``cache`` was created only inside the ``keywords`` branch, so
    # calling with keywords=False and organism_idx=True raised NameError.
    # It is now always defined (term -> Ontology document).
    cache = {}
    if keywords:
        _log.debug("indexing by keyword...")
        ki = KeywordIndexer()
        total_p = db.proteins.count({"organism": genome})
        with tqdm(Protein.objects(organism=genome).no_cache().timeout(False), total=total_p) as pbar:
            for prot in pbar:
                pbar.set_description(prot.name)
                # Basic keywords from name / description / gene symbols.
                current_keywords = list(
                    set([
                        x.lower().strip() for x in reduce(
                            list.__add__,
                            map(ki.extract_keywords,
                                [prot.name, prot.description] + prot.gene))
                    ]))
                prot.keywords = current_keywords + prot.keywords
                # Ontology-derived keywords: annotation terms plus the
                # identifiers and types of identified features.
                terms = prot.ontologies
                terms = terms + [
                    x.identifier.strip().lower() for x in prot.features
                    if x.identifier
                ]
                terms = terms + [
                    x.type.strip().lower() for x in prot.features
                    if x.identifier
                ]
                terms = list(set([x.lower() for x in terms]))
                for term in terms:
                    if term not in cache:
                        ont = Ontology.objects(term=str(term))
                        if len(ont):
                            cache[term] = ont.first()
                    if term in cache:
                        prot.keywords = prot.keywords + cache[term].keywords
                # SO:0001060 missense_variant
                prot.keywords = list(set(prot.keywords + terms))
                prot.save()
        _log.debug("Keyword index finished")
    if organism_idx:
        _log.debug("indexing ontology by organism")
        prots = list(
            db.proteins.find({
                "organism": genome,
                "ontologies.0": {
                    "$exists": True
                }
            }))
        for prot in tqdm(prots):
            for term in prot["ontologies"]:
                # Only non-EC/GO ontologies: those two have their own
                # dedicated index builders above.
                if (term in cache) and cache[term].ontology not in ["ec", "go"]:
                    seq_col_ont_idx = SeqColOntologyIndex(
                        seq_collection_id=collection.id,
                        term=term,
                        seq_collection_name=genome,
                        name=cache[term].name,
                        ontology=cache[term].ontology,
                        keywords=cache[term].keywords)
                    seq_col_ont_idx.save()
        SeqColOntologyIndex.objects(count=0).delete()
        _log.debug("Organism index finished")
    collection.save()
    # BUGFIX: was ``"indexing %s finished" + genome`` (string concatenation
    # leaving a literal "%s"); use logging's lazy %-formatting instead.
    _log.info("indexing %s finished", genome)
def pre_build_index(self, genome, annotated_collection="proteins",
                    annotated_collection_field="ontologies", drop=True):
    """Build the per-genome GO ontology browsing index.

    Phase 1 creates one SeqColOntologyIndex entry (count=0) per GO term.
    Phase 2 walks the annotated proteins, completes each protein's GO
    terms with their ancestors (``complete_subgraph``), tallies term
    counts and pushes the completed term set back onto the protein
    documents. Phase 3 writes the counts into the index entries, deletes
    zero-count entries and removes non-generic cellular_component terms.

    :param genome: SeqCollection-like object exposing ``id`` and ``name``.
    :param annotated_collection: mongo collection holding annotations.
    :param annotated_collection_field: field receiving the completed terms.
    :param drop: when True, remove existing GO index entries first.
    """
    if drop:
        print(
            self.col_go_index.remove({
                "seq_collection_id": genome.id,
                "ontology": self.ontology_name
            }))
    ont_succ_cache = {}
    for ont_doc in tqdm(
            Ontology.objects(ontology=self.ontology_name).no_cache(),
            total=Ontology.objects(ontology=self.ontology_name).count()):
        ont_succ_cache[ont_doc.term] = ont_doc.successors
        database = ""
        if hasattr(ont_doc, "database") and ont_doc.database:
            database = ont_doc.database
        # if hasattr(ont_doc, "databases") and ont_doc.databases:
        #     database = ont_doc.databases[0]
        # Terms with more children sort later in the browsing UI.
        order = len(ont_doc["children"])
        seq_ont_ont_idx = SeqColOntologyIndex(
            term=ont_doc.term.lower(), name=ont_doc.name, count=0,
            seq_collection_name=genome.name, database=database,
            ontology=self.ontology_name, order=order,
            seq_collection_id=genome.id, keywords=ont_doc.keywords)
        seq_ont_ont_idx.save()
    ont_count = defaultdict(lambda: 0)
    query = {
        "seq_collection_id": genome.id,
        "ontologies.0": {
            "$exists": True
        }
    }
    for p in tqdm(self.db[annotated_collection].find(
            query, {"ontologies": 1}),
            total=self.db[annotated_collection].count(query)):
        # Only GO terms; completed with all ancestors before counting.
        terms = [x for x in p["ontologies"] if x.startswith("go:")]
        terms = self.complete_subgraph(terms)
        for x in terms:
            ont_count[x] += 1
        # Persist the completed term set on the protein document.
        self.db[annotated_collection].update(
            {"_id": p["_id"]},
            {"$addToSet": {
                annotated_collection_field: {
                    "$each": terms
                }
            }})
    for term, count in tqdm(ont_count.items()):
        for seq_ont_ont_idx in SeqColOntologyIndex.objects(
                seq_collection_id=genome.id, ontology=self.ontology_name,
                term=term):
            seq_ont_ont_idx.count = count
            seq_ont_ont_idx.save()
    # Drop unused entries and noisy cellular_component terms.
    SeqColOntologyIndex.objects(seq_collection_id=genome.id,
                                count=0).delete()
    self.cleanup_cellular_component_annotations(genome)