コード例 #1
0
ファイル: COG2Mongo.py プロジェクト: ezequieljsosa/sndg-bio
 def create_ontology(self, ontology_db):
     """Drop all existing COG terms and re-create one Ontology doc per entry.

     :param ontology_db: raw mongo collection holding ontology documents.
     """
     ontology_db.remove({"ontology": "cog"})
     for term in self.cog:
         doc = Ontology(term=term,
                        keywords=self.ki.extract_keywords(term),
                        ontology="cog")
         doc.save()
コード例 #2
0
 def load_children(self):
     """Populate the ``children`` list of every EC term.

     A term's children are the terms one specificity level below it, i.e.
     those sharing the term's defined prefix and having exactly one fewer
     "-" wildcard component.
     """
     for ont_doc in Ontology.objects(ontology="ec"):
         if ont_doc.term == "root":
             continue
         # Candidates must start with the parent's defined (non "-") prefix.
         prefix = ont_doc.term.replace(".-", "")
         ont_doc.children = [x["term"] for x in Ontology.objects(
             ontology="ec", term__istartswith=prefix
         ) if (x.term != ont_doc.term) and (x.term.count("-") == (ont_doc.term.count("-") - 1))]
         # (removed a discarded `ont_doc.__repr__()` call — dead code)
         ont_doc.save()
コード例 #3
0
    def pre_build_index(self, genome, annotated_collection="proteins", annotated_collection_field="ontologies",
                        drop=False):
        """Build the per-genome EC term index (SeqColOntologyIndex documents).

        First creates an empty index entry for every EC Ontology term, then
        counts annotated proteins matching each term (treating "-" components
        as wildcards), pruning terms with no hits, and finally inserts a
        "root" entry with the total EC-annotated protein count.

        :param genome: SeqCollection whose proteins are indexed.
        :param annotated_collection: mongo collection with the annotations.
        :param annotated_collection_field: field holding ontology terms.
        :param drop: unused here; kept for interface compatibility.
        """
        self.col_index.remove({"ontology": "ec", "seq_collection_id": genome.id})

        _log.debug("creating empty ecs idxs")
        ec_count = Ontology.objects(ontology="ec").count()
        for ont_doc in tqdm(Ontology.objects(ontology="ec").no_cache(), total=ec_count):
            # More specific EC numbers leave more digits, giving a larger
            # divisor and thus a smaller sort order (specific terms rank first).
            order = 9999999
            try:
                order = order / int(
                    ont_doc.term.lower().replace("ec:", "").replace(".", "").replace("-", "").replace("n", ""))
            except (ValueError, ZeroDivisionError):
                # No digits left (ValueError) or all zeros: keep the default
                # order. Narrowed from a bare `except:` so unrelated errors
                # (KeyboardInterrupt, SystemExit, ...) propagate.
                pass

            seq_col_ont_idx = SeqColOntologyIndex(seq_collection_id=genome.id, term=ont_doc.term.lower(),
                                                  seq_collection_name=genome.name, name=ont_doc.name,
                                                  ontology="ec", order=order,
                                                  count=0, keywords=ont_doc.keywords)
            seq_col_ont_idx.save()

        _log.debug("initializign idx ecs")
        terms_count = SeqColOntologyIndex.objects(ontology="ec", seq_collection_id=genome.id).count()
        for ont_doc in tqdm(SeqColOntologyIndex.objects(ontology="ec", seq_collection_id=genome.id).order_by(
                "-term").no_cache().timeout(False), total=terms_count):
            # Turn wildcard components ("-") into a regex prefix match.
            if "-" in ont_doc.term:
                str_term = ont_doc.term.replace(".", "\.").replace("-", ".+")
            else:
                str_term = ont_doc.term

            count = self.db[annotated_collection].count({"seq_collection_id": genome.id,
                                                         annotated_collection_field: {"$regex": '^' + str_term,
                                                                                      "$options": "-i"}})
            if count:
                ont_doc.count = count
                ont_doc.save()
            else:
                # prune index entries with no annotated proteins
                ont_doc.delete()

        regx = re.compile("^ec:", re.IGNORECASE)
        self.db.col_ont_idx.insert(
            {
                "_id": ObjectId(),
                "_cls": "SeqColOntologyIndex",
                "term": "root",
                "name": "root",
                "count": self.db.proteins.count({"organism": genome.name, "ontologies": regx}),
                "order": 9999999,
                "keywords": [
                ],
                "ontology": "ec",
                "seq_collection_name": genome.name,
                "seq_collection_id": genome.id
            })
        _log.debug("ecs idxs done")
コード例 #4
0
 def load_pathways(self, pathways_file, database):
     """Load pathway term/name pairs from a tab-separated export file.

     :param pathways_file: path to the tab-separated pathways file.
     :param database: database name appended to each created document.
     """
     with open(pathways_file) as pathways_handle:
         for raw in pathways_handle.readlines():
             stripped = raw.strip()
             # skip comment lines and the header row
             if stripped.startswith("#") or stripped.startswith("UNIQUE-ID"):
                 continue
             fields = re.sub(r'\t+', '\t', raw).split("\t")
             term = fields[0].strip().lower()
             name = fields[1].strip()
             doc = Ontology(term=term,
                            name=name,
                            ontology=self.ontology_name + "_pw",
                            keywords=self.ki.extract_keywords(name) + [term])
             doc.databases.append(database)
             doc.save()
コード例 #5
0
ファイル: COG2Mongo.py プロジェクト: ezequieljsosa/sndg-bio
    def load_fun(self, fun_path):
        """Load COG functional categories from a ``fun``-style file.

        The file contains groups separated by blank CRLF lines: the first
        line of each group names the parent category, the remaining lines
        are "[X] description" entries belonging to it.

        :param fun_path: path to the fun.txt file.
        """
        with open(fun_path, "r") as handle:
            content = handle.read()
            groups = re.split(re.compile("\r\n\r\n"), content)
        for group in groups:
            glines = [ln.strip() for ln in group.split("\n") if ln.strip()]

            parent_term = glines[0].strip().lower()

            parent_ont_doc = Ontology(term=parent_term,
                                      name=parent_term,
                                      keywords=self.ki.extract_keywords(parent_term),
                                      ontology="cog")
            parent_ont_doc.save()

            for line in glines[1:]:
                tokens = line.lower().strip().split(" ")
                child_term = tokens[0]
                child_name = " ".join(tokens[1:])
                # child keywords = parent keywords + keywords from the line
                merged_keywords = list(
                    set(parent_ont_doc.keywords +
                        self.ki.extract_keywords(line)))
                child_doc = Ontology(term=child_term,
                                     name=child_name,
                                     parent=parent_ont_doc.term,
                                     keywords=merged_keywords,
                                     ontology="cog")
                parent_ont_doc.children.append(child_term)
                child_doc.save()
コード例 #6
0
 def load_slim(self,
               slim_file="/data/databases/go/goslim_generic.obo",
               database="generic"):
     """Tag every GO term present in a slim OBO file with *database*.

     :param slim_file: path to the GO slim OBO file.
     :param database: label appended to each matching term's databases.
     """
     parser = GODag(slim_file)
     for ont in parser:
         try:
             term_doc = Ontology.objects(ontology="go", term=ont.lower()).get()
             term_doc.databases.append(database)
             term_doc.save()
         except Exception as ex:
             # best-effort: slim terms missing from the DB are only logged
             _log.error(ex)
     root_doc = Ontology.objects(ontology="go", term="root").get()
     root_doc.databases.append(database)
     root_doc.save()
コード例 #7
0
 def update_ont_org_idx(self):
     """Copy name/keywords from reaction Ontology docs into cached indexes."""
     for term, idx_doc in self.react_ont_idx_dict.items():
         matches = Ontology.objects(ontology=self.ontology_name + "_reac", term=term.lower())
         if not len(matches):
             continue
         reaction = matches.first()
         idx_doc.name = reaction.name
         idx_doc.keywords = list(set(idx_doc.keywords + reaction.keywords))
         idx_doc.save()
コード例 #8
0
ファイル: COG2Mongo.py プロジェクト: ezequieljsosa/sndg-bio
    def load_whog(self, whog_path):
        """Load COG groups from a ``whog`` file (records split by "_______").

        Each record's first line looks like "[CATS] COGxxxx description";
        the term is linked as a child of each functional-category document
        and inherits those documents' keywords.

        :param whog_path: path to the whog file.
        """
        with open(whog_path, "r") as handle:
            groups = handle.read().split("_______")
        for group in groups:
            glines = [g.strip() for g in group.split("\n") if g.strip()]
            if not glines:
                continue
            header = glines[0].lower()
            tokens = header.lower().strip().split(" ")
            parent = tokens[0]
            term = tokens[1]
            name = " ".join(tokens[2:])

            ont_doc = Ontology(term=term,
                               name=name,
                               parent=parent,
                               ontology="cog")
            keywords = self.ki.extract_keywords(header)

            if len(parent) > 3:
                # multi-letter category like "[ab]": link to each letter's doc
                for letter in parent[1:-1]:
                    parent_ont_doc = Ontology.objects(term='[' + letter +
                                                      ']').get()
                    keywords = list(set(parent_ont_doc.keywords +
                                        keywords))
                    parent_ont_doc.children.append(term)
                    parent_ont_doc.save()
            else:
                parent_ont_doc = Ontology.objects(term=parent).get()
                parent_ont_doc.children.append(term)
                parent_ont_doc.save()
                keywords = list(set(parent_ont_doc.keywords + keywords))

            ont_doc.keywords = keywords

            ont_doc.save()
コード例 #9
0
    def load_enzclass(self, enzclass_file_path):
        """Load top-level EC classes/subclasses from an ``enzclass.txt`` file.

        :param enzclass_file_path: path to the ENZYME enzclass file.
        """
        # Root node with the six EC top-level classes as children.
        root = Ontology(ontology=self.ontology_name, term="root", name="ec",
                        children=["ec:%d.-.-.-" % i for i in range(1, 7)])
        root.save()

        with open(enzclass_file_path) as enzclass_handle:
            for line in enzclass_handle:
                # only lines starting like "1." are class definitions
                if not re.match(r'^[1-6][.]', line):
                    continue
                name = line.split(".-")[-1].strip()
                term = "ec:" + line.replace(name, "").replace(" ", "").strip()

                doc = Ontology(ontology=self.ontology_name, term=term, name=name)
                doc.keywords = self.ki.extract_keywords(doc.name) + [doc.term]
                doc.save()
コード例 #10
0
 def load_enzdata(self, enzdata_file_path):
     """Load EC entries from an ``enzyme.dat``-style record file.

     The file is record-oriented: an "ID" line starts a new term and a
     subsequent "DE" line carries its name, at which point the document
     is saved.

     :param enzdata_file_path: path to the ENZYME flat file.
     """
     ont_doc = None
     with open(enzdata_file_path) as enzclass_handle:
         for line in enzclass_handle:
             if line.startswith("ID"):
                 # maxsplit=1 so values containing "ID" are not truncated
                 term = "ec:" + line.split("ID", 1)[1].strip()
                 ont_doc = Ontology(ontology=self.ontology_name, term=term)
             elif line.startswith("DE"):
                 if ont_doc is None:
                     # malformed file: "DE" before any "ID" line — previously
                     # this crashed with AttributeError on None
                     continue
                 # maxsplit=1 so names containing "DE" are not truncated
                 ont_doc.name = line.split("DE", 1)[1].strip()
                 ont_doc.keywords = self.ki.extract_keywords([ont_doc.description, ont_doc.name]) + [ont_doc.term]
                 ont_doc.save()
コード例 #11
0
 def cleanup_cellular_component_annotations(self, genome):
     """Remove index entries for cellular-component terms outside the generic slim.

     :param genome: SeqCollection whose index entries are cleaned.
     """
     non_generic_cc = Ontology.objects(ontology=self.ontology_name,
                                       database="cellular_component",
                                       databases__ne="generic")
     for ont_doc in non_generic_cc:
         removal_filter = {
             "ontology": "go",
             "seq_collection_name": genome.name,
             "term": ont_doc.term
         }
         self.db["col_ont_idx"].remove(removal_filter, multi=True)
コード例 #12
0
    def _load_mongo(self):
        """Persist the parsed GO graph as Ontology documents.

        Creates a "root" document pointing at the top-level terms, then one
        document per graph node carrying its successors, children and the
        GO namespace (stored in ``database``) inferred from its ancestors.
        """
        root = Ontology(ontology=self.ontology_name,
                        term="root",
                        successors=self.root_terms,
                        children=self.root_terms)
        root.save()
        # nodes_iter suggests a NetworkX 1.x graph — TODO confirm version
        for (node, data) in self.graph.nodes_iter(
                data=True):  # self.graph.add_node(node, **data)
            if node == "root":
                # "root" is reserved for the synthetic document created above
                raise Exception("...")
            else:
                successors = self.graph.successors(node)
                _ancestors = self.complete_subgraph([node])

                # Namespace defaults to biological_process and is overridden
                # when the ancestors include the cellular_component
                # (go:0005575) or molecular_function (go:0003674) roots.
                database = "biological_process"
                if "go:0005575" in _ancestors:
                    database = "cellular_component"
                if "go:0003674" in _ancestors:
                    database = "molecular_function"

                ont_doc = Ontology(
                    ontology=self.ontology_name,
                    term=node,
                    name=data["name"],
                    database=database,
                    successors=self.all_successors(node, []),
                    children=successors,
                    # go_dag stores terms upper-cased; documents use lower-case
                    description=self.go_dag.query_term(node.upper()).desc,
                    # successors_relationships=self.successors_relationships(node),
                    subclases=list(
                        set([
                            x.lower() for x in self.go_dag.query_term(
                                node.upper()).get_all_children()
                        ])))
                ont_doc.keywords = self.ki.extract_keywords(
                    [ont_doc.description, ont_doc.name, ont_doc.term])
                ont_doc.save()
コード例 #13
0
ファイル: EC2Mongo.py プロジェクト: ezequieljsosa/sndg-bio
 def _successors(self, term, counts):
     """Return the children of an EC *term*, or [] when the term is unknown.

     :param term: EC term, e.g. "ec:1.2.-.-".
     :param counts: unused here; kept for interface compatibility.
     """
     try:
         ec_term = Ontology.objects(ontology="ec", term=term).get()
         return ec_term.children
     except Exception:
         # narrowed from bare `except:` so KeyboardInterrupt/SystemExit
         # are not swallowed; missing terms simply yield no successors
         return []
コード例 #14
0
    def load_dat(self, reactions_file, database, postfix):
        """Load BioCyc ``.dat`` records into Ontology documents.

        Records are separated by "//" lines; each record is a list of
        "FIELD - value" lines. Comment lines starting with "#" are skipped.
        Records that end up without a UNIQUE-ID are printed and discarded.

        :param reactions_file: path to the .dat file.
        :param database: database name appended to each document.
        :param postfix: suffix appended to self.ontology_name for these docs.
        """
        with open(reactions_file) as reactions_handle:
            lines = [
                x for x in reactions_handle.readlines()
                if not x.startswith("#")
            ]
            records = re.split("//\n", "\n".join(lines))
            for record in records:
                if not record.strip():
                    continue

                ont_doc = Ontology(ontology=self.ontology_name + postfix)
                ont_doc.databases.append(database)
                reaction_types = []
                ec = None
                for str_record in [y for y in record.split("\n") if y]:
                    if str_record.strip() and len(str_record.strip()) > 3:

                        if len(str_record.split(" - ")) > 1:

                            field = str_record.split(" - ")[0].strip()
                            value = str_record.split(" - ")[1].strip()
                            if isinstance(value, bytes):
                                # Python 2 str: decode, skipping undecodable
                                # values. The old unconditional .decode()
                                # raised AttributeError on Python 3 strings.
                                try:
                                    value = value.decode("utf-8")
                                except UnicodeDecodeError:
                                    continue

                            if field == "UNIQUE-ID":
                                ont_doc.term = value.lower()
                            elif field == "TYPES":
                                reaction_types.append(value)
                            elif field == "IN-PATHWAY":
                                ont_doc.parents.append(value)
                            elif field == "COMMON-NAME":
                                ont_doc.name = value
                            elif (field == "COMMENT") and (not ont_doc.name):
                                ont_doc.description = value
                            elif (field == "EC-NUMBER") and (not ont_doc.name):
                                ec = value

                # fall back to the type list / EC number / term for the
                # description and name when no explicit value was parsed
                if not ont_doc.description:
                    ont_doc.description = "|".join(reaction_types)
                if not ont_doc.name:
                    if ec:
                        ont_doc.name = ec
                    else:
                        ont_doc.name = ont_doc.term
                ont_doc.keywords = self.ki.extract_keywords(
                    ont_doc.name) + [ont_doc.term]
                ont_doc.types = reaction_types
                if ec:
                    ont_doc.keywords.append(ec)
                if not ont_doc.term:
                    print(record)
                else:
                    ont_doc.save()
コード例 #15
0
ファイル: Index.py プロジェクト: ezequieljsosa/sndg-bio
def index_seq_collection(db,
                         genome,
                         ec=True,
                         go=True,
                         keywords=True,
                         organism_idx=True,
                         pathways=True,
                         structure=False,
                         go_obo="/data/databases/go/go.obo"):
    """Build the requested search indexes for a sequence collection.

    :param db: raw pymongo database handle.
    :param genome: name of the SeqCollection to index.
    :param ec: build the EC ontology index.
    :param go: build the GO ontology index.
    :param keywords: rebuild each protein's keyword list.
    :param organism_idx: build the per-organism ontology index.
    :param pathways: build the BioCyc pathway index.
    :param structure: build the structurome index.
    :param go_obo: path to the GO OBO file used by the GO indexer.
    """
    collection = SeqCollection.objects(name=genome).get()

    if ec:
        ec2mongo = EC2Mongo(db)
        _log.debug("Building EC index...")
        ec2mongo.pre_build_index(collection)
        collection.ec_index = True
        _log.debug("EC index finished")
        collection.save()

    if go:
        go2mongo = GO2Mongo(go_obo, db)
        go2mongo.init()
        _log.debug("Building GO index...")
        go2mongo.pre_build_index(collection)
        collection.go_index = True
        collection.save()
        _log.debug("GO index finished")

    if structure:
        si = StructuromeIndexer(collection)
        si.build_index()

    if pathways:
        biocyc = BioCyc(db)
        biocyc.user = BioMongoDB.demo
        _log.debug("Building Biocyc index...")
        biocyc.pre_build_index(collection)
        _log.debug("Biocyc index finished")

    # term -> Ontology document cache, shared by the keyword and organism
    # index steps. Initialized here so organism_idx works even when
    # keywords=False (previously that combination raised NameError).
    cache = {}

    if keywords:

        _log.debug("indexing by keyword...")
        ki = KeywordIndexer()
        total_p = db.proteins.count({"organism": genome})
        with tqdm(Protein.objects(organism=genome).no_cache().timeout(False),
                  total=total_p) as pbar:

            for prot in pbar:
                pbar.set_description(prot.name)
                # Basic keywords extracted from name, description and genes
                current_keywords = list(
                    set([
                        x.lower().strip() for x in reduce(
                            list.__add__,
                            map(ki.extract_keywords,
                                [prot.name, prot.description] + prot.gene))
                    ]))

                prot.keywords = current_keywords + prot.keywords
                # ontology-derived keywords: annotated terms plus feature
                # identifiers and feature types
                terms = prot.ontologies
                terms = terms + [
                    x.identifier.strip().lower()
                    for x in prot.features if x.identifier
                ]
                terms = terms + [
                    x.type.strip().lower()
                    for x in prot.features if x.identifier
                ]
                terms = list(set([x.lower() for x in terms]))

                for term in terms:
                    if term not in cache:
                        ont = Ontology.objects(term=str(term))
                        if len(ont):
                            cache[term] = ont.first()

                    if term in cache:
                        prot.keywords = prot.keywords + cache[term].keywords
                    # SO:0001060 missense_variant

                prot.keywords = list(set(prot.keywords + terms))
                prot.save()
        _log.debug("Keyword index finished")

    if organism_idx:
        _log.debug("indexing ontology by organism")
        prots = list(
            db.proteins.find({
                "organism": genome,
                "ontologies.0": {
                    "$exists": True
                }
            }))
        for prot in tqdm(prots):
            for term in prot["ontologies"]:
                # EC and GO terms already have dedicated indexes above
                if (term
                        in cache) and cache[term].ontology not in ["ec", "go"]:
                    seq_col_ont_idx = SeqColOntologyIndex(
                        seq_collection_id=collection.id,
                        term=term,
                        seq_collection_name=genome,
                        name=cache[term].name,
                        ontology=cache[term].ontology,
                        keywords=cache[term].keywords)
                    seq_col_ont_idx.save()
        SeqColOntologyIndex.objects(count=0).delete()
        _log.debug("Organism index finished")
    collection.save()
    # fixed: the original concatenated "+ genome" onto the format string
    # instead of passing it as a lazy %-argument
    _log.info("indexing %s finished", genome)
コード例 #16
0
    def pre_build_index(self,
                        genome,
                        annotated_collection="proteins",
                        annotated_collection_field="ontologies",
                        drop=True):
        """Build the per-genome GO term index (SeqColOntologyIndex documents).

        Creates an empty index entry per GO Ontology term, counts annotated
        proteins per term (propagating each protein's terms up the GO graph
        via ``complete_subgraph``), writes the counts back, prunes zero-count
        entries and finally removes non-slim cellular-component entries.

        :param genome: SeqCollection whose proteins are indexed.
        :param annotated_collection: mongo collection with the annotations.
        :param annotated_collection_field: field holding the ontology terms.
        :param drop: when True, remove this genome's existing GO index first.
        """
        if drop:
            print(
                self.col_go_index.remove({
                    "seq_collection_id": genome.id,
                    "ontology": self.ontology_name
                }))

        # term -> successors map; filled below but not read in this method
        ont_succ_cache = {}
        for ont_doc in tqdm(
                Ontology.objects(ontology=self.ontology_name).no_cache(),
                total=Ontology.objects(ontology=self.ontology_name).count()):
            ont_succ_cache[ont_doc.term] = ont_doc.successors
            database = ""

            if hasattr(ont_doc, "database") and ont_doc.database:
                database = ont_doc.database
            #             if hasattr(ont_doc, "databases") and ont_doc.databases:
            #                 database = ont_doc.databases[0]
            # sort order: terms with more children rank later/earlier by count
            order = len(ont_doc["children"])

            seq_ont_ont_idx = SeqColOntologyIndex(
                term=ont_doc.term.lower(),
                name=ont_doc.name,
                count=0,
                seq_collection_name=genome.name,
                database=database,
                ontology=self.ontology_name,
                order=order,
                seq_collection_id=genome.id,
                keywords=ont_doc.keywords)
            seq_ont_ont_idx.save()

        # Count proteins per GO term, expanding each protein's term list to
        # include all ancestors, and persist the expanded list back.
        ont_count = defaultdict(lambda: 0)
        query = {
            "seq_collection_id": genome.id,
            "ontologies.0": {
                "$exists": True
            }
        }
        for p in tqdm(self.db[annotated_collection].find(
                query, {"ontologies": 1}),
                      total=self.db[annotated_collection].count(query)):
            terms = [x for x in p["ontologies"] if x.startswith("go:")]
            terms = self.complete_subgraph(terms)
            for x in terms:
                ont_count[x] += 1
            self.db[annotated_collection].update(
                {"_id": p["_id"]},
                {"$addToSet": {
                    annotated_collection_field: {
                        "$each": terms
                    }
                }})

        # write the computed counts into the index documents
        for term, count in tqdm(ont_count.items()):
            for seq_ont_ont_idx in SeqColOntologyIndex.objects(
                    seq_collection_id=genome.id,
                    ontology=self.ontology_name,
                    term=term):
                seq_ont_ont_idx.count = count
                seq_ont_ont_idx.save()

        # prune entries with no annotated proteins
        SeqColOntologyIndex.objects(seq_collection_id=genome.id,
                                    count=0).delete()

        self.cleanup_cellular_component_annotations(genome)