def _protein_keywords(self, protein):
    """Collect searchable keywords for *protein* from its reactions.

    For every reaction the term, its pathways, and the names/producers/
    consumers of its substrates and products are gathered.  As a side
    effect a per-reaction ``SeqColOntologyIndex`` entry is created (or its
    count incremented) in ``self.react_ont_idx_dict``, and the protein id
    is registered under each of its pathways in ``self.pathways``.

    :param protein: annotated protein document with a ``reactions`` list.
    :return: list of keyword strings (may contain duplicates).
    """
    collected = []
    for reac in protein.reactions:
        collected.append(reac.name)
        idx_entry = self.react_ont_idx_dict.get(reac.name)
        if idx_entry is None:
            # First time this reaction is seen: create its index entry.
            idx_entry = SeqColOntologyIndex(
                term=reac.name,
                seq_collection_name=protein.organism,
                count=1,
                seq_collection_id=protein.seq_collection_id,
                ontology=self.ontology_name + "_reac",
                keywords=[reac.name])
            self.react_ont_idx_dict[reac.name] = idx_entry
        else:
            idx_entry.count += 1
        collected.extend(reac.pathways)
        for pw in reac.pathways:
            # Track which proteins participate in each pathway.
            self.pathways.setdefault(pw, []).append(protein.id)
        for compound in reac.substrates + reac.products:
            collected.append(compound.name)
            idx_entry.keywords.append(compound.name)
            collected.extend(compound.producers)
            collected.extend(compound.consumers)
    return collected
def pre_build_index(self, genome, annotated_collection="proteins",
                    annotated_collection_field="ontologies", drop=False):
    """Build the per-genome EC-number ontology index.

    Creates one ``SeqColOntologyIndex`` document per EC term, counts the
    proteins of *genome* annotated with each term (prefix regex match so
    wildcard terms like ``ec:1.-.-.-`` aggregate their children), deletes
    zero-count entries and finally inserts a synthetic ``root`` entry with
    the total count of EC-annotated proteins.

    :param genome: SeqCollection document (``id`` and ``name`` are used).
    :param annotated_collection: raw mongo collection holding annotations.
    :param annotated_collection_field: field queried for the EC terms.
    :param drop: unused here; kept for interface parity with other indexers.
    """
    # Remove any previous EC index entries for this genome.
    self.col_index.remove({"ontology": "ec", "seq_collection_id": genome.id})
    _log.debug("creating empty ecs idxs")
    ec_count = Ontology.objects(ontology="ec").count()
    for ont_doc in tqdm(Ontology.objects(ontology="ec").no_cache(), total=ec_count):
        # Smaller `order` sorts first: the more digits the EC number has
        # (more specific class), the larger the divisor and the smaller
        # the resulting order value.
        order = 9999999
        try:
            order = order / int(
                ont_doc.term.lower().replace("ec:", "").replace(".", "")
                .replace("-", "").replace("n", ""))
        except (ValueError, ZeroDivisionError):
            # Terms like "ec:-.-.-.-" leave no digits (ValueError) or only
            # zeros (ZeroDivisionError): keep the default order.
            pass
        seq_col_ont_idx = SeqColOntologyIndex(seq_collection_id=genome.id,
                                              term=ont_doc.term.lower(),
                                              seq_collection_name=genome.name,
                                              name=ont_doc.name,
                                              ontology="ec",
                                              order=order,
                                              count=0,
                                              keywords=ont_doc.keywords)
        seq_col_ont_idx.save()

    _log.debug("initializing idx ecs")
    terms_count = SeqColOntologyIndex.objects(ontology="ec",
                                              seq_collection_id=genome.id).count()
    for ont_doc in tqdm(SeqColOntologyIndex.objects(
            ontology="ec", seq_collection_id=genome.id).order_by(
            "-term").no_cache().timeout(False), total=terms_count):
        if "-" in ont_doc.term:
            # Wildcard EC number: turn "-" into a regex that matches any
            # completion, escaping the literal dots.
            str_term = ont_doc.term.replace(".", r"\.").replace("-", ".+")
        else:
            str_term = ont_doc.term
        count = self.db[annotated_collection].count(
            {"seq_collection_id": genome.id,
             annotated_collection_field: {"$regex": '^' + str_term,
                                          "$options": "-i"}})
        if count:
            ont_doc.count = count
            ont_doc.save()
        else:
            # No protein carries this term: drop the empty index entry.
            ont_doc.delete()

    regx = re.compile("^ec:", re.IGNORECASE)
    # Synthetic root node so the UI can show the total of EC-annotated proteins.
    self.db.col_ont_idx.insert(
        {
            "_id": ObjectId(),
            "_cls": "SeqColOntologyIndex",
            "term": "root",
            "name": "root",
            "count": self.db.proteins.count({"organism": genome.name,
                                             "ontologies": regx}),
            "order": 9999999,
            "keywords": [],
            "ontology": "ec",
            "seq_collection_name": genome.name,
            "seq_collection_id": genome.id
        })
    _log.debug("ecs idxs done")
def _genome_summary(self, genome):
    """Rebuild the pathway index entries for *genome*.

    Deletes all existing ``<ontology>_pw`` index documents for the genome
    and recreates one per entry in ``genome.pathways`` (each a dict with
    ``term``, ``name`` and ``count`` keys), attaching keywords extracted
    from the pathway name plus the term itself.

    :param genome: SeqCollection document with a ``pathways`` list.
    """
    # NOTE(review): the original computed
    # {x: len(set(y)) for x, y in self.pathways.items()} here but never
    # used it; the dead computation was removed.
    SeqColOntologyIndex.objects(seq_collection_name=genome.name,
                                ontology=self.ontology_name + "_pw").delete()
    for pw in genome.pathways:
        seq_ont_ont_idx = SeqColOntologyIndex(
            term=pw["term"],
            name=pw["name"],
            seq_collection_name=genome.name,
            ontology=self.ontology_name + "_pw",
            keywords=self.ki.extract_keywords(pw["name"]) + [pw["term"]],
            count=pw["count"],
            seq_collection_id=genome.id)
        seq_ont_ont_idx.save()
def index_seq_collection(db, genome, ec=True, go=True, keywords=True, organism_idx=True,
                         pathways=True, structure=False,
                         go_obo="/data/databases/go/go.obo"):
    """Build the search indexes for the sequence collection named *genome*.

    :param db: pymongo database handle.
    :param genome: name of the SeqCollection to index.
    :param ec: build the EC-number index.
    :param go: build the GO index (reads *go_obo*).
    :param keywords: extract per-protein keywords from names, genes,
        features and ontology terms.
    :param organism_idx: build per-organism ontology index entries for
        non-EC/GO ontologies.
    :param pathways: build the BioCyc pathway index.
    :param structure: build the structurome index.
    :param go_obo: path to the GO ontology ``.obo`` file.
    """
    collection = SeqCollection.objects(name=genome).get()

    # Ontology-term cache shared by the keyword and organism passes.
    # Declared up front so the organism pass does not raise NameError when
    # keywords=False (the original created it inside the keyword branch).
    cache = {}

    if ec:
        ec2mongo = EC2Mongo(db)
        _log.debug("Building EC index...")
        ec2mongo.pre_build_index(collection)
        collection.ec_index = True
        _log.debug("EC index finished")
        collection.save()

    if go:
        go2mongo = GO2Mongo(go_obo, db)
        go2mongo.init()
        _log.debug("Building GO index...")
        go2mongo.pre_build_index(collection)
        collection.go_index = True
        collection.save()
        _log.debug("GO index finished")

    if structure:
        si = StructuromeIndexer(collection)
        si.build_index()

    if pathways:
        biocyc = BioCyc(db)
        biocyc.user = BioMongoDB.demo
        _log.debug("Building Biocyc index...")
        biocyc.pre_build_index(collection)
        _log.debug("Biocyc index finished")

    if keywords:
        _log.debug("indexing by keyword...")
        ki = KeywordIndexer()
        total_p = db.proteins.count({"organism": genome})
        with tqdm(Protein.objects(organism=genome).no_cache().timeout(False),
                  total=total_p) as pbar:
            for prot in pbar:
                pbar.set_description(prot.name)
                # Basic keywords from name, description and gene names.
                current_keywords = list(set(
                    x.lower().strip()
                    for x in reduce(list.__add__,
                                    map(ki.extract_keywords,
                                        [prot.name, prot.description] + prot.gene))))
                prot.keywords = current_keywords + prot.keywords

                # Ontology terms plus feature identifiers and types.
                terms = prot.ontologies
                terms = terms + [x.identifier.strip().lower()
                                 for x in prot.features if x.identifier]
                # NOTE(review): this filters on x.identifier (not x.type),
                # i.e. only features that carry an identifier contribute
                # their type — confirm that is intended.
                terms = terms + [x.type.strip().lower()
                                 for x in prot.features if x.identifier]
                terms = list(set(x.lower() for x in terms))
                for term in terms:
                    if term not in cache:
                        ont = Ontology.objects(term=str(term))
                        if len(ont):
                            cache[term] = ont.first()
                    if term in cache:
                        prot.keywords = prot.keywords + cache[term].keywords

                # SO:0001060 missense_variant
                prot.keywords = list(set(prot.keywords + terms))
                prot.save()
        _log.debug("Keyword index finished")

    if organism_idx:
        _log.debug("indexing ontology by organism")
        prots = list(db.proteins.find({"organism": genome,
                                       "ontologies.0": {"$exists": True}}))
        for prot in tqdm(prots):
            for term in prot["ontologies"]:
                # EC and GO already have dedicated indexes built above.
                if (term in cache) and cache[term].ontology not in ["ec", "go"]:
                    seq_col_ont_idx = SeqColOntologyIndex(
                        seq_collection_id=collection.id,
                        term=term,
                        seq_collection_name=genome,
                        name=cache[term].name,
                        ontology=cache[term].ontology,
                        keywords=cache[term].keywords)
                    seq_col_ont_idx.save()
        # NOTE(review): deletes zero-count entries globally, not only for
        # this genome — confirm that is intended.
        SeqColOntologyIndex.objects(count=0).delete()
        _log.debug("Organism index finished")

    collection.save()
    # Bug fix: the original concatenated the genome name onto the format
    # string ("indexing %s finished" + genome), leaving a literal %s in the
    # log; use lazy %-style logging arguments instead.
    _log.info("indexing %s finished", genome)
def pre_build_index(self, genome, annotated_collection="proteins",
                    annotated_collection_field="ontologies", drop=True):
    """Build the per-genome GO ontology index.

    Creates one zero-count ``SeqColOntologyIndex`` per GO term, then walks
    the annotated proteins, completes each protein's GO terms with their
    ancestors (``complete_subgraph``), pushes the completed term set back
    into the annotation field and records per-term counts.  Zero-count
    entries are deleted at the end.

    :param genome: SeqCollection document (``id`` and ``name`` are used).
    :param annotated_collection: raw mongo collection with the annotations.
    :param annotated_collection_field: field receiving the completed terms.
    :param drop: when True, remove any pre-existing GO index entries first.
    """
    if drop:
        # Idiom fix: the removal result used to be print()ed; log it via
        # the module logger like the rest of the indexers.
        removed = self.col_go_index.remove({
            "seq_collection_id": genome.id,
            "ontology": self.ontology_name
        })
        _log.debug(removed)

    # Hoist the queryset so it is not constructed twice (count + iteration).
    # NOTE(review): the original also filled an `ont_succ_cache` dict with
    # ont_doc.successors here, but never read it — the dead work was removed.
    go_terms = Ontology.objects(ontology=self.ontology_name)
    for ont_doc in tqdm(go_terms.no_cache(), total=go_terms.count()):
        database = ""
        if hasattr(ont_doc, "database") and ont_doc.database:
            database = ont_doc.database
        # More children first when sorting ascending by `order`.
        order = len(ont_doc["children"])
        seq_ont_ont_idx = SeqColOntologyIndex(
            term=ont_doc.term.lower(),
            name=ont_doc.name,
            count=0,
            seq_collection_name=genome.name,
            database=database,
            ontology=self.ontology_name,
            order=order,
            seq_collection_id=genome.id,
            keywords=ont_doc.keywords)
        seq_ont_ont_idx.save()

    ont_count = defaultdict(int)
    query = {
        "seq_collection_id": genome.id,
        "ontologies.0": {"$exists": True}
    }
    for p in tqdm(self.db[annotated_collection].find(query, {"ontologies": 1}),
                  total=self.db[annotated_collection].count(query)):
        terms = [x for x in p["ontologies"] if x.startswith("go:")]
        # Add every ancestor term so counts aggregate up the GO graph.
        terms = self.complete_subgraph(terms)
        for x in terms:
            ont_count[x] += 1
        self.db[annotated_collection].update(
            {"_id": p["_id"]},
            {"$addToSet": {annotated_collection_field: {"$each": terms}}})

    for term, count in tqdm(ont_count.items()):
        for seq_ont_ont_idx in SeqColOntologyIndex.objects(
                seq_collection_id=genome.id, ontology=self.ontology_name,
                term=term):
            seq_ont_ont_idx.count = count
            seq_ont_ont_idx.save()

    # Drop terms no protein of this genome is annotated with.
    SeqColOntologyIndex.objects(seq_collection_id=genome.id, count=0).delete()
    self.cleanup_cellular_component_annotations(genome)