Example #1
0
    def load_whog(self, whog_path):
        """Load COG terms from an NCBI "whog" file into the Ontology collection.

        Groups in the file are separated by runs of underscores.  The first
        line of each group looks like ``[j] cog0001 some name``: a bracketed
        parent category (possibly several letters), the COG term, and a
        free-text name.  Each term is linked as a child of its parent
        category/categories and inherits their keywords.

        :param whog_path: path to the whog file.
        """
        with open(whog_path, "r") as handle:
            groups = handle.read().split("_______")
        for group in groups:
            glines = [g.strip() for g in group.split("\n") if g.strip()]
            if not glines:
                continue
            # Header: "<[parents]> <term> <name...>".  Lowercase once and
            # split once (the original re-lowered/re-split three times).
            line = glines[0].lower()
            fields = line.strip().split(" ")
            parent = fields[0]
            term = fields[1]
            name = " ".join(fields[2:])

            ont_doc = Ontology(term=term,
                               name=name,
                               parent=parent,
                               ontology="cog")
            keywords = self.ki.extract_keywords(line)

            if len(parent) > 3:
                # Multi-letter category like "[jkl]": register the term as a
                # child of each single-letter category and merge keywords.
                for letter in parent[1:-1]:
                    parent_ont_doc = Ontology.objects(term='[' + letter +
                                                      ']').get()
                    keywords = list(set(parent_ont_doc.keywords + keywords))
                    parent_ont_doc.children.append(term)
                    parent_ont_doc.save()
            else:
                parent_ont_doc = Ontology.objects(term=parent).get()
                parent_ont_doc.children.append(term)
                parent_ont_doc.save()
                keywords = list(set(parent_ont_doc.keywords + keywords))

            ont_doc.keywords = keywords

            ont_doc.save()
Example #2
0
 def load_children(self):
     """Populate ``children`` for every EC term.

     A term's children are the terms sharing its prefix (trailing ".-"
     removed) that have exactly one fewer "-" wildcard — i.e. one level
     more specific.  The synthetic "root" term is skipped.
     """
     for ont_doc in Ontology.objects(ontology="ec"):
         if ont_doc.term == "root":
             continue
         prefix = ont_doc.term.replace(".-", "")
         ont_doc.children = [
             child["term"]
             for child in Ontology.objects(ontology="ec",
                                           term__istartswith=prefix)
             if (child.term != ont_doc.term)
             and (child.term.count("-") == (ont_doc.term.count("-") - 1))
         ]
         # NOTE(review): a discarded ont_doc.__repr__() call was removed
         # here as dead code.
         ont_doc.save()
Example #3
0
    def pre_build_index(self, genome, annotated_collection="proteins", annotated_collection_field="ontologies",
                        drop=False):
        """Create and fill the per-genome EC term index.

        Existing EC index entries for the genome are removed, one
        zero-count entry is created per EC term (with a sort order derived
        from the EC number), counts are then computed from the annotated
        collection (entries with no hits are deleted), and finally a
        synthetic "root" entry with the total count is inserted.

        :param genome: SeqCollection document being indexed.
        :param annotated_collection: mongo collection holding annotated
            documents (default "proteins").
        :param annotated_collection_field: field listing ontology terms.
        :param drop: currently unused — entries are always removed first.
            NOTE(review): confirm whether the removal should be conditional
            on this flag, as it is in the GO indexer.
        """
        self.col_index.remove({"ontology": "ec", "seq_collection_id": genome.id})

        _log.debug("creating empty ecs idxs")
        ec_count = Ontology.objects(ontology="ec").count()
        for ont_doc in tqdm(Ontology.objects(ontology="ec").no_cache(), total=ec_count):
            # Fully specified EC numbers (large numeric part) get a small
            # order; terms with no usable numeric part keep the default.
            order = 9999999
            try:
                order = order / int(
                    ont_doc.term.lower().replace("ec:", "").replace(".", "").replace("-", "").replace("n", ""))
            except (ValueError, ZeroDivisionError):
                # Was a bare ``except:``; only a non-numeric or empty
                # remainder is expected here.
                pass

            seq_col_ont_idx = SeqColOntologyIndex(seq_collection_id=genome.id, term=ont_doc.term.lower(),
                                                  seq_collection_name=genome.name, name=ont_doc.name,
                                                  ontology="ec", order=order,
                                                  count=0, keywords=ont_doc.keywords)
            seq_col_ont_idx.save()

        _log.debug("initializing idx ecs")
        terms_count = SeqColOntologyIndex.objects(ontology="ec", seq_collection_id=genome.id).count()
        for ont_doc in tqdm(SeqColOntologyIndex.objects(ontology="ec", seq_collection_id=genome.id).order_by(
                "-term").no_cache().timeout(False), total=terms_count):
            # Wildcard ECs like "1.2.-.-" are matched as regex prefixes.
            if "-" in ont_doc.term:
                str_term = ont_doc.term.replace(".", r"\.").replace("-", ".+")
            else:
                str_term = ont_doc.term

            count = self.db[annotated_collection].count({"seq_collection_id": genome.id,
                                                         annotated_collection_field: {"$regex": '^' + str_term,
                                                                                      "$options": "-i"}})
            if count:
                ont_doc.count = count
                ont_doc.save()
            else:
                ont_doc.delete()

        # Synthetic "root" entry: total proteins carrying any EC annotation.
        regx = re.compile("^ec:", re.IGNORECASE)
        self.db.col_ont_idx.insert(
            {
                "_id": ObjectId(),
                "_cls": "SeqColOntologyIndex",
                "term": "root",
                "name": "root",
                "count": self.db.proteins.count({"organism": genome.name, "ontologies": regx}),
                "order": 9999999,
                "keywords": [
                ],
                "ontology": "ec",
                "seq_collection_name": genome.name,
                "seq_collection_id": genome.id
            })
        _log.debug("ecs idxs done")
Example #4
0
 def load_slim(self,
               slim_file="/data/databases/go/goslim_generic.obo",
               database="generic"):
     """Tag every GO term present in a slim OBO file with *database*.

     Terms from the slim that cannot be fetched from the Ontology
     collection are logged and skipped; the synthetic "root" term is
     always tagged.

     :param slim_file: path to the slim .obo file.
     :param database: label appended to each matching term's databases.
     """
     for slim_term in GODag(slim_file):
         try:
             ont_doc = Ontology.objects(ontology="go",
                                        term=slim_term.lower()).get()
             ont_doc.databases.append(database)
             ont_doc.save()
         except Exception as ex:
             _log.error(ex)
     root_doc = Ontology.objects(ontology="go", term="root").get()
     root_doc.databases.append(database)
     root_doc.save()
Example #5
0
 def update_ont_org_idx(self):
     """Copy reaction names and keywords onto the cached ontology indexes.

     For each cached SeqColOntologyIndex in ``self.react_ont_idx_dict``,
     look up the matching "<ontology>_reac" term; when found, take its
     name, merge its keywords into the index, and persist the index.
     """
     reac_ontology = self.ontology_name + "_reac"
     for term, idx_doc in self.react_ont_idx_dict.items():
         match = Ontology.objects(ontology=reac_ontology,
                                  term=term.lower()).first()
         if match is None:
             continue
         idx_doc.name = match.name
         idx_doc.keywords = list(set(idx_doc.keywords + match.keywords))
         idx_doc.save()
Example #6
0
 def cleanup_cellular_component_annotations(self, genome):
     """Remove index entries for cellular-component terms outside the slim.

     Deletes, for the given genome, every "go" entry of ``col_ont_idx``
     whose term is in the cellular_component namespace but is not tagged
     with the "generic" slim database.

     NOTE(review): the Ontology query filters on ``self.ontology_name``
     while the removal hardcodes "go" — confirm these are always equal.

     :param genome: SeqCollection whose index entries are cleaned.
     """
     for ont_doc in Ontology.objects(ontology=self.ontology_name,
                                     database="cellular_component",
                                     databases__ne="generic"):
         # self.db["proteins"].update({"organism":genome, }, {"$pull":{"ontologies":ont_doc.term, "keywords":ont_doc.term}}, multi=True)
         self.db["col_ont_idx"].remove(
             {
                 "ontology": "go",
                 "seq_collection_name": genome.name,
                 "term": ont_doc.term
             },
             multi=True)
Example #7
0
 def _successors(self, term, counts):
     """Return the child EC terms of *term*, or [] if the term is unknown.

     :param term: EC term to look up in the "ec" ontology.
     :param counts: unused; kept for interface compatibility with callers.
     """
     try:
         ec_term = Ontology.objects(ontology="ec", term=term).get()
     except Exception:
         # Term missing (or ambiguous) in the collection: treat as a leaf.
         # Was a bare ``except:``, which also swallowed KeyboardInterrupt.
         return []
     return ec_term.children
Example #8
0
def index_seq_collection(db,
                         genome,
                         ec=True,
                         go=True,
                         keywords=True,
                         organism_idx=True,
                         pathways=True,
                         structure=False,
                         go_obo="/data/databases/go/go.obo"):
    """Run the selected indexing passes over one sequence collection.

    :param db: pymongo database handle.
    :param genome: name of the SeqCollection to index.
    :param ec: build the EC number index.
    :param go: build the GO term index (uses *go_obo*).
    :param keywords: extract and store per-protein keywords.
    :param organism_idx: build per-organism indexes for the non-EC/GO
        ontologies encountered during the keyword pass.
    :param pathways: build the BioCyc pathway index.
    :param structure: build the structurome index.
    :param go_obo: path to the GO ontology .obo file.
    """
    collection = SeqCollection.objects(name=genome).get()

    if ec:
        ec2mongo = EC2Mongo(db)
        _log.debug("Building EC index...")
        ec2mongo.pre_build_index(collection)
        collection.ec_index = True
        _log.debug("EC index finished")
        collection.save()

    if go:
        go2mongo = GO2Mongo(go_obo, db)
        go2mongo.init()
        _log.debug("Building GO index...")
        go2mongo.pre_build_index(collection)
        collection.go_index = True
        collection.save()
        _log.debug("GO index finished")

    if structure:
        si = StructuromeIndexer(collection)
        si.build_index()

    if pathways:
        biocyc = BioCyc(db)
        biocyc.user = BioMongoDB.demo
        _log.debug("Building Biocyc index...")
        biocyc.pre_build_index(collection)
        _log.debug("Biocyc index finished")

    # term -> Ontology cache shared by the keyword and organism passes.
    # Hoisted out of the keywords branch: previously calling with
    # keywords=False and organism_idx=True raised NameError on ``cache``.
    cache = {}

    if keywords:

        _log.debug("indexing by keyword...")
        ki = KeywordIndexer()
        total_p = db.proteins.count({"organism": genome})
        with tqdm(Protein.objects(organism=genome).no_cache().timeout(False),
                  total=total_p) as pbar:

            for prot in pbar:
                pbar.set_description(prot.name)
                # Basic keywords from name / description / gene symbols.
                current_keywords = list(
                    set([
                        x.lower().strip() for x in reduce(
                            list.__add__,
                            map(ki.extract_keywords,
                                [prot.name, prot.description] + prot.gene))
                    ]))

                prot.keywords = current_keywords + prot.keywords
                # Ontology-derived keywords: annotated terms plus feature
                # identifiers and types.
                terms = prot.ontologies
                terms = terms + [
                    x.identifier.strip().lower()
                    for x in prot.features if x.identifier
                ]
                terms = terms + [
                    x.type.strip().lower()
                    for x in prot.features if x.identifier
                ]
                terms = list(set([x.lower() for x in terms]))

                for term in terms:
                    if term not in cache:
                        ont = Ontology.objects(term=str(term))
                        if len(ont):
                            cache[term] = ont.first()

                    if term in cache:
                        prot.keywords = prot.keywords + cache[term].keywords
                    # SO:0001060 missense_variant

                prot.keywords = list(set(prot.keywords + terms))
                prot.save()
        _log.debug("Keyword index finished")

    if organism_idx:
        _log.debug("indexing ontology by organism")
        prots = list(
            db.proteins.find({
                "organism": genome,
                "ontologies.0": {
                    "$exists": True
                }
            }))
        for prot in tqdm(prots):
            for term in prot["ontologies"]:
                # Only terms cached during the keyword pass, excluding
                # ec/go (those have dedicated indexers above).
                if (term
                        in cache) and cache[term].ontology not in ["ec", "go"]:
                    seq_col_ont_idx = SeqColOntologyIndex(
                        seq_collection_id=collection.id,
                        term=term,
                        seq_collection_name=genome,
                        name=cache[term].name,
                        ontology=cache[term].ontology,
                        keywords=cache[term].keywords)
                    seq_col_ont_idx.save()
        SeqColOntologyIndex.objects(count=0).delete()
        _log.debug("Organism index finished")
    collection.save()
    # Was "indexing %s finished" + genome: string concatenation left the
    # %s placeholder unfilled.  Use logging's lazy %-args instead.
    _log.info("indexing %s finished", genome)
Example #9
0
    def pre_build_index(self,
                        genome,
                        annotated_collection="proteins",
                        annotated_collection_field="ontologies",
                        drop=True):
        """Create and fill the per-genome GO term index.

        Steps: (1) optionally drop existing entries for the genome,
        (2) create one zero-count SeqColOntologyIndex per ontology term,
        (3) complete each annotated document's GO terms with their
        subgraph and tally term usage, (4) store the tallies, delete
        unused entries, and clean up non-slim cellular-component terms.

        :param genome: SeqCollection document being indexed.
        :param annotated_collection: mongo collection with annotations.
        :param annotated_collection_field: field holding the ontology terms.
        :param drop: when True, remove the genome's existing entries first.
        """
        if drop:
            print(
                self.col_go_index.remove({
                    "seq_collection_id": genome.id,
                    "ontology": self.ontology_name
                }))

        # NOTE(review): a never-read ``ont_succ_cache`` dict (filled with
        # ont_doc.successors for every term) was removed as dead code.
        for ont_doc in tqdm(
                Ontology.objects(ontology=self.ontology_name).no_cache(),
                total=Ontology.objects(ontology=self.ontology_name).count()):
            database = ""

            if hasattr(ont_doc, "database") and ont_doc.database:
                database = ont_doc.database
            # More children -> more generic term; used as display order.
            order = len(ont_doc["children"])

            seq_ont_ont_idx = SeqColOntologyIndex(
                term=ont_doc.term.lower(),
                name=ont_doc.name,
                count=0,
                seq_collection_name=genome.name,
                database=database,
                ontology=self.ontology_name,
                order=order,
                seq_collection_id=genome.id,
                keywords=ont_doc.keywords)
            seq_ont_ont_idx.save()

        ont_count = defaultdict(lambda: 0)
        query = {
            "seq_collection_id": genome.id,
            "ontologies.0": {
                "$exists": True
            }
        }
        for p in tqdm(self.db[annotated_collection].find(
                query, {"ontologies": 1}),
                      total=self.db[annotated_collection].count(query)):
            # NOTE(review): the "go:" prefix is hardcoded although the rest
            # of the method uses self.ontology_name — confirm they match.
            terms = [x for x in p["ontologies"] if x.startswith("go:")]
            terms = self.complete_subgraph(terms)
            for x in terms:
                ont_count[x] += 1
            # Persist the completed subgraph back onto the document.
            self.db[annotated_collection].update(
                {"_id": p["_id"]},
                {"$addToSet": {
                    annotated_collection_field: {
                        "$each": terms
                    }
                }})

        for term, count in tqdm(ont_count.items()):
            for seq_ont_ont_idx in SeqColOntologyIndex.objects(
                    seq_collection_id=genome.id,
                    ontology=self.ontology_name,
                    term=term):
                seq_ont_ont_idx.count = count
                seq_ont_ont_idx.save()

        # Entries whose term was never seen in the annotations are noise.
        SeqColOntologyIndex.objects(seq_collection_id=genome.id,
                                    count=0).delete()

        self.cleanup_cellular_component_annotations(genome)