コード例 #1
0
    def _protein_keywords(self, protein):
        """Collect the keywords contributed by the reactions of ``protein``.

        For every reaction the returned list receives, in order: the reaction
        name, the reaction's pathways and, for each substrate and product, the
        species name followed by its producers and consumers.

        Side effects:
          * ``self.react_ont_idx_dict`` gets a new ``SeqColOntologyIndex``
            (count 1) the first time a reaction name is seen; otherwise the
            existing entry's ``count`` is incremented.  Species names are
            appended to that entry's ``keywords``.
          * ``self.pathways[pathway]`` gets ``protein.id`` appended.

        :param protein: object exposing ``reactions``, ``organism``,
            ``seq_collection_id`` and ``id``.
        :return: list of keywords (duplicates are kept).
        """
        collected = []
        for reaction in protein.reactions:
            reaction_name = reaction.name
            collected.append(reaction_name)

            if reaction_name in self.react_ont_idx_dict:
                # Reaction already indexed: one more protein uses it.
                index_entry = self.react_ont_idx_dict[reaction_name]
                index_entry.count += 1
            else:
                index_entry = SeqColOntologyIndex(
                    term=reaction_name,
                    seq_collection_name=protein.organism,
                    count=1,
                    seq_collection_id=protein.seq_collection_id,
                    ontology=self.ontology_name + "_reac",
                    keywords=[reaction_name])
                self.react_ont_idx_dict[reaction_name] = index_entry

            collected.extend(reaction.pathways)
            for pathway in reaction.pathways:
                # Remember which proteins take part in each pathway.
                self.pathways.setdefault(pathway, []).append(protein.id)

            for specie in reaction.substrates + reaction.products:
                specie_name = specie.name
                collected.append(specie_name)
                index_entry.keywords.append(specie_name)
                collected.extend(specie.producers)
                collected.extend(specie.consumers)
        return collected
コード例 #2
0
    def pre_build_index(self, genome, annotated_collection="proteins", annotated_collection_field="ontologies",
                        drop=False):
        """Rebuild the "ec" ontology index of ``genome``.

        Steps:
          1. Remove the genome's existing "ec" documents from ``self.col_index``.
          2. Save a zero-count ``SeqColOntologyIndex`` for every "ec"
             ``Ontology`` document.
          3. For each of those entries, count the documents of
             ``annotated_collection`` whose ``annotated_collection_field``
             matches the term; store the count, or delete the entry when
             nothing matches.
          4. Insert a "root" entry counting the proteins of the organism that
             carry any ``ec:`` annotation.

        :param genome: object exposing ``id`` and ``name``.
        :param annotated_collection: name of the collection in ``self.db``
            holding the annotated documents.
        :param annotated_collection_field: field of those documents that holds
            the ontology terms.
        :param drop: not used by this implementation; the existing index
            documents are always removed.
        """
        self.col_index.remove({"ontology": "ec", "seq_collection_id": genome.id})

        _log.debug("creating empty ecs idxs")
        ec_count = Ontology.objects(ontology="ec").count()
        for ont_doc in tqdm(Ontology.objects(ontology="ec").no_cache(), total=ec_count):
            # Derive a sort key from the digits of the EC number; terms that
            # leave no usable number keep the default order.
            order = 9999999
            digits = ont_doc.term.lower().replace("ec:", "").replace(".", "").replace("-", "").replace("n", "")
            try:
                order = order / int(digits)
            except (ValueError, ZeroDivisionError):
                # Non-numeric remainder (e.g. empty string) or a term that
                # reduces to 0: deliberately fall back to the default order.
                pass

            seq_col_ont_idx = SeqColOntologyIndex(seq_collection_id=genome.id, term=ont_doc.term.lower(),
                                                  seq_collection_name=genome.name, name=ont_doc.name,
                                                  ontology="ec", order=order,
                                                  count=0, keywords=ont_doc.keywords)
            seq_col_ont_idx.save()

        _log.debug("initializign idx ecs")
        terms_count = SeqColOntologyIndex.objects(ontology="ec", seq_collection_id=genome.id).count()
        for ont_doc in tqdm(SeqColOntologyIndex.objects(ontology="ec", seq_collection_id=genome.id).order_by(
                "-term").no_cache().timeout(False), total=terms_count):
            if "-" in ont_doc.term:
                # Partial EC class (e.g. "ec:1.2.-.-"): turn each "-" into a
                # wildcard and escape the literal dots.
                str_term = ont_doc.term.replace(".", r"\.").replace("-", ".+")
            else:
                # NOTE(review): dots are not escaped here and the pattern is
                # only anchored at the start, so this is a prefix match with
                # "." acting as a wildcard -- confirm that is intended.
                str_term = ont_doc.term

            # NOTE(review): "$options" is "-i"; MongoDB documents "i" for
            # case-insensitive matching -- confirm the leading dash is wanted.
            count = self.db[annotated_collection].count({"seq_collection_id": genome.id,
                                                         annotated_collection_field: {"$regex": '^' + str_term,
                                                                                      "$options": "-i"}})
            if count:
                ont_doc.count = count
                ont_doc.save()
            else:
                ont_doc.delete()

        # Root entry: number of proteins of the organism with any EC annotation.
        regx = re.compile("^ec:", re.IGNORECASE)
        self.db.col_ont_idx.insert(
            {
                "_id": ObjectId(),
                "_cls": "SeqColOntologyIndex",
                "term": "root",
                "name": "root",
                "count": self.db.proteins.count({"organism": genome.name, "ontologies": regx}),
                "order": 9999999,
                "keywords": [
                ],
                "ontology": "ec",
                "seq_collection_name": genome.name,
                "seq_collection_id": genome.id
            })
        _log.debug("ecs idxs done")
コード例 #3
0
    def _genome_summary(self, genome):
        """Replace the pathway index entries of ``genome``.

        Deletes every ``SeqColOntologyIndex`` of the genome (matched by name)
        whose ontology is ``self.ontology_name + "_pw"`` and saves a fresh one
        for each item of ``genome.pathways``.  Each item is read through the
        keys ``"term"``, ``"name"`` and ``"count"``.

        :param genome: object exposing ``name``, ``id`` and ``pathways``.
        """
        # The previous version also built a per-pathway protein count from
        # ``self.pathways`` here, but never used it; that dead computation
        # was dropped.
        pathway_ontology = self.ontology_name + "_pw"

        SeqColOntologyIndex.objects(seq_collection_name=genome.name, ontology=pathway_ontology).delete()

        for pathway in genome.pathways:
            index_entry = SeqColOntologyIndex(term=pathway["term"], name=pathway["name"],
                                              seq_collection_name=genome.name,
                                              ontology=pathway_ontology,
                                              keywords=self.ki.extract_keywords(pathway["name"]) + [pathway["term"]],
                                              count=pathway["count"], seq_collection_id=genome.id)
            index_entry.save()
コード例 #4
0
ファイル: Index.py プロジェクト: ezequieljsosa/sndg-bio
def index_seq_collection(db,
                         genome,
                         ec=True,
                         go=True,
                         keywords=True,
                         organism_idx=True,
                         pathways=True,
                         structure=False,
                         go_obo="/data/databases/go/go.obo"):
    """Build the search indexes of the sequence collection named ``genome``.

    Each boolean flag enables one indexing step; the steps run in this order:
    EC, GO, structure, pathways (BioCyc), protein keywords, ontology-by-organism.

    :param db: database handle; ``db.proteins`` is queried directly and the
        handle is passed to the EC/GO/BioCyc indexers.
    :param genome: name of the ``SeqCollection`` to index.
    :param ec: build the EC index and set ``collection.ec_index``.
    :param go: build the GO index and set ``collection.go_index``.
    :param keywords: recompute and save ``keywords`` for every protein of the
        organism.
    :param organism_idx: save a ``SeqColOntologyIndex`` for the ontology terms
        of the organism's proteins that are neither "ec" nor "go".
    :param pathways: build the BioCyc index.
    :param structure: build the structurome index.
    :param go_obo: path of the GO OBO file handed to ``GO2Mongo``.
    """
    collection = SeqCollection.objects(name=genome).get()

    # term -> Ontology document, or None when no document has that term.
    # Bound here (not inside the keyword step) because the organism step
    # reads it too and must work when ``keywords`` is False.
    cache = {}

    def _cached_ontology(term):
        """Return the Ontology document for ``term`` (None if absent), memoised."""
        if term not in cache:
            cache[term] = Ontology.objects(term=str(term)).first()
        return cache[term]

    if ec:
        ec2mongo = EC2Mongo(db)
        _log.debug("Building EC index...")
        ec2mongo.pre_build_index(collection)
        collection.ec_index = True
        _log.debug("EC index finished")
        collection.save()

    if go:
        go2mongo = GO2Mongo(go_obo, db)
        go2mongo.init()
        _log.debug("Building GO index...")
        go2mongo.pre_build_index(collection)
        collection.go_index = True
        collection.save()
        _log.debug("GO index finished")

    if structure:
        si = StructuromeIndexer(collection)
        si.build_index()

    if pathways:
        biocyc = BioCyc(db)
        biocyc.user = BioMongoDB.demo
        _log.debug("Building Biocyc index...")
        biocyc.pre_build_index(collection)
        _log.debug("Biocyc index finished")

    if keywords:

        _log.debug("indexing by keyword...")
        ki = KeywordIndexer()
        total_p = db.proteins.count({"organism": genome})
        with tqdm(Protein.objects(organism=genome).no_cache().timeout(False),
                  total=total_p) as pbar:

            for prot in pbar:
                pbar.set_description(prot.name)
                # Basic keywords: extracted from name, description and genes,
                # normalised and de-duplicated.
                basic_keywords = {
                    kw.lower().strip()
                    for text in [prot.name, prot.description] + prot.gene
                    for kw in ki.extract_keywords(text)
                }

                prot.keywords = list(basic_keywords) + prot.keywords
                # Ontology keywords: the protein's own terms plus the
                # identifier and type of its features.
                terms = prot.ontologies
                terms = terms + [
                    x.identifier.strip().lower()
                    for x in prot.features if x.identifier
                ]
                # NOTE(review): this filters on x.identifier while reading
                # x.type -- confirm it should not be ``if x.type``.
                terms = terms + [
                    x.type.strip().lower()
                    for x in prot.features if x.identifier
                ]
                terms = list(set([x.lower() for x in terms]))

                for term in terms:
                    ont = _cached_ontology(term)
                    if ont is not None:
                        # e.g. SO:0001060 missense_variant
                        prot.keywords = prot.keywords + ont.keywords

                prot.keywords = list(set(prot.keywords + terms))
                prot.save()
        _log.debug("Keyword index finished")

    if organism_idx:
        _log.debug("indexing ontology by organism")
        prots = list(
            db.proteins.find({
                "organism": genome,
                "ontologies.0": {
                    "$exists": True
                }
            }))
        for prot in tqdm(prots):
            for term in prot["ontologies"]:
                # Looked up through the shared cache, so this step also works
                # when the keyword step was skipped.
                ont = _cached_ontology(term)
                if ont is not None and ont.ontology not in ["ec", "go"]:
                    seq_col_ont_idx = SeqColOntologyIndex(
                        seq_collection_id=collection.id,
                        term=term,
                        seq_collection_name=genome,
                        name=ont.name,
                        ontology=ont.ontology,
                        keywords=ont.keywords)
                    seq_col_ont_idx.save()
        # NOTE(review): this delete is not restricted to the current
        # collection -- it removes zero-count entries of every collection.
        SeqColOntologyIndex.objects(count=0).delete()
        _log.debug("Organism index finished")
    collection.save()
    _log.info("indexing %s finished", genome)
コード例 #5
0
    def pre_build_index(self,
                        genome,
                        annotated_collection="proteins",
                        annotated_collection_field="ontologies",
                        drop=True):
        """Rebuild the ontology index of ``genome`` for ``self.ontology_name``.

        Steps:
          1. When ``drop`` is true, remove the genome's existing index
             documents for this ontology and print the result of the removal.
          2. Save a zero-count ``SeqColOntologyIndex`` for every ``Ontology``
             document of this ontology.
          3. For every annotated document of the genome, expand its ``go:``
             terms with ``self.complete_subgraph``, add the expanded terms to
             ``annotated_collection_field`` and count one occurrence per term.
          4. Store the counts on the matching index entries and delete the
             genome's entries whose count is still 0.
          5. Run ``self.cleanup_cellular_component_annotations(genome)``.

        :param genome: object exposing ``id`` and ``name``.
        :param annotated_collection: name of the collection in ``self.db``
            holding the annotated documents.
        :param annotated_collection_field: field that receives the expanded
            terms.
        :param drop: remove the existing index documents first.
        """
        if drop:
            print(
                self.col_go_index.remove({
                    "seq_collection_id": genome.id,
                    "ontology": self.ontology_name
                }))

        for ont_doc in tqdm(
                Ontology.objects(ontology=self.ontology_name).no_cache(),
                total=Ontology.objects(ontology=self.ontology_name).count()):
            # Not every ontology document carries a ``database`` attribute.
            database = getattr(ont_doc, "database", None) or ""
            order = len(ont_doc["children"])

            seq_ont_ont_idx = SeqColOntologyIndex(
                term=ont_doc.term.lower(),
                name=ont_doc.name,
                count=0,
                seq_collection_name=genome.name,
                database=database,
                ontology=self.ontology_name,
                order=order,
                seq_collection_id=genome.id,
                keywords=ont_doc.keywords)
            seq_ont_ont_idx.save()

        # term -> number of annotated documents carrying it (after expansion).
        ont_count = defaultdict(int)
        query = {
            "seq_collection_id": genome.id,
            "ontologies.0": {
                "$exists": True
            }
        }
        for p in tqdm(self.db[annotated_collection].find(
                query, {"ontologies": 1}),
                      total=self.db[annotated_collection].count(query)):
            terms = [x for x in p["ontologies"] if x.startswith("go:")]
            terms = self.complete_subgraph(terms)
            for x in terms:
                ont_count[x] += 1
            self.db[annotated_collection].update(
                {"_id": p["_id"]},
                {"$addToSet": {
                    annotated_collection_field: {
                        "$each": terms
                    }
                }})

        for term, count in tqdm(ont_count.items()):
            for seq_ont_ont_idx in SeqColOntologyIndex.objects(
                    seq_collection_id=genome.id,
                    ontology=self.ontology_name,
                    term=term):
                seq_ont_ont_idx.count = count
                seq_ont_ont_idx.save()

        # NOTE(review): this delete is not filtered by ontology, so it also
        # removes zero-count entries of the genome's other ontologies.
        SeqColOntologyIndex.objects(seq_collection_id=genome.id,
                                    count=0).delete()

        self.cleanup_cellular_component_annotations(genome)