# Module-level imports required by the methods below. OntologyFactory comes
# from ontobio; the ETLHelper import path is assumed from the surrounding
# project layout.
import re

from ontobio.ontol_factory import OntologyFactory

from etl.helpers import ETLHelper


def get_generators(self, filepath, batch_size):  # noqa
    """Get Generators."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node
    go_term_list = []
    go_isas_list = []
    go_partofs_list = []
    go_synonyms_list = []
    go_regulates_list = []
    go_negatively_regulates_list = []
    go_positively_regulates_list = []
    go_altids_list = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue
        if node.get('type') == 'PROPERTY':
            continue

        # Switching id to curie form and saving URI in "uri"
        # might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        subset = []
        definition = ""
        is_obsolete = "false"
        # Default so terms lacking an OBO namespace don't raise a NameError below.
        term_type = ""

        if "meta" in node:
            meta = node.get('meta')
            # basicPropertyValues may be absent; default to an empty list.
            basic_property_values = meta.get('basicPropertyValues') or []
            for property_value_map in basic_property_values:
                pred = property_value_map['pred']
                val = property_value_map['val']
                if pred == 'OIO:hasOBONamespace':
                    term_type = val
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    go_synonym = {"primary_id": key, "synonym": synonym}
                    go_synonyms_list.append(go_synonym)
            if "basicPropertyValues" in node["meta"]:
                alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "GO:" in alt_id:
                        secondary_id = {"primary_id": key, "secondary_id": alt_id}
                        go_altids_list.append(secondary_id)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_isas_list.append(dictionary)

        partofs_without_names = all_parents_subont.parents(
            key, relations=['BFO:0000050'])
        for item in partofs_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_partofs_list.append(dictionary)

        regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
        for item in regulates:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_regulates_list.append(dictionary)

        negatively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002212'])
        for item in negatively_regulates:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_negatively_regulates_list.append(dictionary)

        positively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002213'])
        for item in positively_regulates:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_positively_regulates_list.append(dictionary)

        dict_to_append = {
            'oid': key,
            'definition': definition,
            'type': term_type,
            'name': node.get('label'),
            'subset': subset,
            'name_key': node.get('label'),
            'is_obsolete': is_obsolete,
            'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
        }
        go_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [
                go_term_list, go_isas_list, go_partofs_list, go_synonyms_list,
                go_regulates_list, go_negatively_regulates_list,
                go_positively_regulates_list, go_altids_list
            ]
            go_term_list = []
            go_isas_list = []
            go_partofs_list = []
            go_synonyms_list = []
            go_regulates_list = []
            go_negatively_regulates_list = []
            go_positively_regulates_list = []
            go_altids_list = []
            counter = 0

    if counter > 0:
        yield [
            go_term_list, go_isas_list, go_partofs_list, go_synonyms_list,
            go_regulates_list, go_negatively_regulates_list,
            go_positively_regulates_list, go_altids_list
        ]
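
# A minimal, self-contained sketch of the batching pattern used above:
# accumulate rows, yield a full batch once the count reaches batch_size,
# then flush the remainder after the loop. The helper name and the use of
# len() in place of an explicit counter are illustrative, not part of this
# module.
def _batched(rows, batch_size):
    """Yield successive lists of at most batch_size items from rows."""
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        # Mirrors the trailing `if counter > 0: yield [...]` above.
        yield batch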
def get_data(self, filepath):  # noqa
    """Get Data."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    # Iterate keys directly; .items() would yield (key, value) tuples here.
    for key in parsed_line:
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        # Code commented out with NU: at the start means it is Not Used.
        # NU: xrefs = []
        # NU: xref_urls = []
        # NU: def_links_unprocessed = []
        # NU: def_links_processed = []
        subset = []
        definition = ""
        namespace = ""
        is_obsolete = "false"
        # NU: ident = key

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
            # NU: leave the call commented out in case it is used at a later time
            # if "xrefs" in node["meta"]:
            #     o_xrefs = node["meta"].get('xrefs')
            #     self.ortho_xrefs(o_xrefs, ident, xref_urls)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                # NU: def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets
            if "basicPropertyValues" in node['meta']:
                for bpv in node['meta']['basicPropertyValues']:
                    if bpv.get('pred') == 'OIO:hasOBONamespace':
                        namespace = bpv.get('val')
                        break

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        partofs_without_names = all_parents_subont.parents(
            key, relations=['BFO:0000050'])
        regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
        negatively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002212'])
        positively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002213'])

        # NU: def_links_unprocessed = []
        # def_links = ""
        if definition is None:
            definition = ""
        # else:
        #     if definition is not None and "\"" in definition:
        #         split_definition = definition.split("\"")
        #         if len(split_definition) > 1:
        #             if len(split_definition) > 2 and "[" in split_definition[2].strip():
        #                 def_links = split_definition[2].strip()
        #                 def_links_unprocessed.append(def_links.rstrip("]").replace("[", ""))
        # NU: def_links_processed is not used later; it is commented out.
        # for def_link_str in def_links_unprocessed:
        #     def_link_str = def_link_str.replace("url:www", "http://www")
        #     def_link_str = def_link_str.replace("url:", "")
        #     def_link_str = def_link_str.replace("URL:", "")
        #     def_link_str = def_link_str.replace("\\:", ":")
        #     if "," in def_link_str:
        #         def_links = def_link_str.split(",")
        #         for link in def_links:
        #             if link.strip().startswith("http"):
        #                 def_links_processed.append(link)
        #     else:
        #         if def_link_str.strip().startswith("http"):
        #             def_links_processed.append(def_link_str)

        # NU: alt_ids = node.get('alt_id')
        # if alt_ids:
        #     if not isinstance(alt_ids, (list, tuple)):
        #         alt_ids = [alt_ids]
        # else:
        #     alt_ids = []

        dict_to_append = {
            'o_type': namespace,
            'name': node.get('label'),
            'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
            'name_key': node.get('label'),
            'oid': node['id'],
            'definition': definition,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'o_synonyms': syns,
            'isas': isas_without_names,
            'partofs': partofs_without_names,
            'regulates': regulates,
            'negatively_regulates': negatively_regulates,
            'positively_regulates': positively_regulates,
            # This data might be needed for gene descriptions.
            # Maybe it should be moved into a different method in order
            # to keep the GO/DO dicts smaller.
            # 'o_genes': [],
            # 'o_species': [],
            # 'xrefs': xrefs,
            # 'ontologyLabel': filepath,
            # TODO: fix links to not be passed for each ontology load.
            # 'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
            #             + '?species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                 + 'species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                      + 'species=Rat&x=1&acc_id=' + node['id'] + '#annot',
            # 'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                        + 'species=Human&x=1&acc_id=' + node['id'] + '#annot',
            # 'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            # 'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            # 'sgd_link': 'https://yeastgenome.org/disease/' + node['id'],
            # 'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            # 'zfin_link': 'https://zfin.org/' + node['id'],
            # 'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            # 'oPrefix': prefix,
            # 'crossReferences': xref_urls,
            # 'defText': def_text,
            # 'defLinksProcessed': def_links_processed,
            # 'oboFile': prefix,
            # 'category': 'go',
            # 'alt_ids': alt_ids,
        }

        if node['id'] == 'GO:0099616':
            self.logger.debug(dict_to_append)

        node = {**node, **dict_to_append}
        ont.graph.node[node["id"]] = node

    return ont
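
# Every method in this module normalizes subset IRIs with the same
# split("#")[-1] idiom. A hedged one-line helper capturing that conversion
# (hypothetical; the code above inlines it instead):
def _convert_subsets(subsets):
    """Strip IRI fragments, e.g.
    'http://purl.obolibrary.org/obo/go#goslim_agr' -> 'goslim_agr'."""
    return [s.split("#")[-1] if "#" in s else s for s in subsets]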
def get_data(self, filepath):
    """Get Data."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    # Iterate keys directly; .items() would yield (key, value) tuples here.
    for key in parsed_line:
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        xrefs = []
        xref_urls = []
        local_id = None
        def_links_unprocessed = []
        def_links_processed = []
        def_text = None
        subset = []
        definition = ""
        namespace = ""
        is_obsolete = "false"
        ident = key
        prefix = ident.split(":")[0]

        # Set the synonyms to an empty array if None. Necessary for Neo4j parsing.
        if syns is None:
            syns = []

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                if o_xrefs is not None:
                    if isinstance(o_xrefs, list):
                        for xref_id_dict in o_xrefs:
                            xref_id = xref_id_dict["val"]
                            if ":" in xref_id:
                                local_id = xref_id.split(":")[1].strip()
                                prefix = xref_id.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(
                                    local_id, xref_id)
                                generated_xref = ETLHelper.get_xref_dict(
                                    local_id, prefix,
                                    "ontology_provided_cross_reference",
                                    "ontology_provided_cross_reference",
                                    xref_id, complete_url,
                                    xref_id + "ontology_provided_cross_reference")
                                generated_xref["oid"] = ident
                                xref_urls.append(generated_xref)
                    else:
                        # o_xrefs arrived as a single string rather than a list.
                        if ":" in o_xrefs:
                            local_id = o_xrefs.split(":")[1].strip()
                            prefix = o_xrefs.split(":")[0].strip()
                            complete_url = ETLHelper.get_complete_url_ont(
                                local_id, o_xrefs)
                            generated_xref = ETLHelper.get_xref_dict(
                                local_id, prefix,
                                "ontology_provided_cross_reference",
                                "ontology_provided_cross_reference",
                                o_xrefs, complete_url, o_xrefs)
                            generated_xref["oid"] = ident
                            xref_urls.append(generated_xref)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets
            if "basicPropertyValues" in node['meta']:
                for bpv in node['meta']['basicPropertyValues']:
                    if bpv.get('pred') == 'OIO:hasOBONamespace':
                        namespace = bpv.get('val')
                        break

        # Set the xrefs to an empty array if None. Necessary for Neo4j parsing.
        if xrefs is None:
            xrefs = []

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        partofs_without_names = all_parents_subont.parents(
            key, relations=['BFO:0000050'])
        regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
        negatively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002212'])
        positively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002213'])

        def_links_unprocessed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduced by the last replace
            # definition = definition.replace('  ', ' ')
            if definition is not None and "\"" in definition:
                split_definition = definition.split("\"")
                if len(split_definition) > 1:
                    def_text = split_definition[1].strip()
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links_unprocessed.append(
                            def_links.rstrip("]").replace("[", ""))
            else:
                def_text = definition

        for def_link_str in def_links_unprocessed:
            def_link_str = def_link_str.replace("url:www", "http://www")
            def_link_str = def_link_str.replace("url:", "")
            def_link_str = def_link_str.replace("URL:", "")
            def_link_str = def_link_str.replace("\\:", ":")
            if "," in def_link_str:
                def_links = def_link_str.split(",")
                for link in def_links:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
                # elif "." in dl:
                #     dl = dl.split(".")
                #     for link in dl:
                #         if link.strip().startswith("http"):
                #             def_links_processed.append(link)
            else:
                if def_link_str.strip().startswith("http"):
                    def_links_processed.append(def_link_str)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # Need to have MODs add disease pages to their yaml stanzas.
        alt_ids = node.get('alt_id')
        if alt_ids:
            if not isinstance(alt_ids, (list, tuple)):
                alt_ids = [alt_ids]
        else:
            alt_ids = []

        dict_to_append = {
            'o_type': namespace,
            'name': node.get('label'),
            'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
            'name_key': node.get('label'),
            'oid': node['id'],
            'definition': definition,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'o_synonyms': syns,
            'isas': isas_without_names,
            'partofs': partofs_without_names,
            'regulates': regulates,
            'negatively_regulates': negatively_regulates,
            'positively_regulates': positively_regulates,
            # This data might be needed for gene descriptions.
            # Maybe it should be moved into a different method in order
            # to keep the GO/DO dicts smaller.
            # 'o_genes': [],
            # 'o_species': [],
            # 'xrefs': xrefs,
            # 'ontologyLabel': filepath,
            # TODO: fix links to not be passed for each ontology load.
            # 'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
            #             + '?species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                 + 'species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                      + 'species=Rat&x=1&acc_id=' + node['id'] + '#annot',
            # 'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                        + 'species=Human&x=1&acc_id=' + node['id'] + '#annot',
            # 'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            # 'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            # 'sgd_link': 'https://yeastgenome.org/disease/' + node['id'],
            # 'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            # 'zfin_link': 'https://zfin.org/' + node['id'],
            # 'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            # 'oPrefix': prefix,
            # 'crossReferences': xref_urls,
            # 'defText': def_text,
            # 'defLinksProcessed': def_links_processed,
            # 'oboFile': prefix,
            # 'category': 'go',
            # 'alt_ids': alt_ids,
        }

        if node['id'] == 'GO:0099616':
            print(dict_to_append)

        node = {**node, **dict_to_append}
        ont.graph.node[node["id"]] = node

    return ont
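
# The definition handling above assumes the OBO shape '"text" [xref, xref]'
# and splits on double quotes. A compact sketch of that parse under the same
# assumption (helper name is hypothetical):
def _split_obo_definition(definition):
    """Return (def_text, raw_xrefs) from an OBO-style definition string."""
    parts = definition.split('"')
    def_text = parts[1].strip() if len(parts) > 1 else definition
    links = []
    if len(parts) > 2 and "[" in parts[2]:
        raw = parts[2].strip().rstrip("]").replace("[", "")
        links = [link.strip() for link in raw.split(",") if link.strip()]
    return def_text, links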
def get_generators(self, filepath, batch_size):  # noqa TODO: Needs splitting up really
    """Get Generators."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node
    do_term_list = []
    do_isas_list = []
    do_synonyms_list = []
    do_alt_ids_list = []
    xrefs = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # - might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        def_links_unprocessed = []
        def_links_processed = []
        subset = []
        definition = ""
        is_obsolete = "false"
        ident = key

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    do_synonym = {"primary_id": key, "synonym": synonym}
                    do_synonyms_list.append(do_synonym)
            if "basicPropertyValues" in node["meta"]:
                alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "DOID:" in alt_id:
                        secondary_id = {"primary_id": key, "secondary_id": alt_id}
                        do_alt_ids_list.append(secondary_id)
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                self.ortho_xrefs(o_xrefs, ident, xrefs)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            do_isas_list.append(dictionary)

        def_links_processed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduced by the last replace
            # definition = definition.replace('  ', ' ')
            if definition is not None and "\"" in definition:
                split_definition = re.split(r'(?<!\\)"', definition)
                if len(split_definition) > 1:
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links = def_links.rstrip("]").replace("[", "")
                        def_links_unprocessed.append(def_links)

        for def_link in def_links_unprocessed:
            def_link = def_link.replace("url:www", "http://www")
            def_link = def_link.replace("url:", "")
            def_link = def_link.replace("URL:", "")
            def_link = def_link.replace("\\:", ":")
            def_link = def_link.replace('\\', '')
            if "," in def_link:
                def_link = def_link.split(",")
                for link in def_link:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
            else:
                if def_link.strip().startswith("http"):
                    def_links_processed.append(def_link)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # Need to have MODs add disease pages to their yaml stanzas.
        # NU: alt_ids = node.get('alt_id')
        # if alt_ids:
        #     if not isinstance(alt_ids, (list, tuple)):
        #         alt_ids = [alt_ids]
        # else:
        #     alt_ids = []

        # TODO: Need to add urls to resource Descriptors for SGD and MGI.
        # NOTE: MGI had one but has 'MGI:' at the end of the url, not required here.
        dict_to_append = {
            'oid': node['id'],
            'name': node.get('label'),
            'name_key': node.get('label'),
            'definition': definition,
            'defLinksProcessed': def_links_processed,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'oUrl': self.etlh.rdh2.return_url_from_key_value('DOID', node['id']),
            'rgd_link': self.etlh.rdh2.return_url_from_key_value(
                'RGD', node['id'], 'disease/all'),
            'rat_only_rgd_link': self.etlh.rdh2.return_url_from_key_value(
                'RGD', node['id'], 'disease/rat'),
            'human_only_rgd_link': self.etlh.rdh2.return_url_from_key_value(
                'RGD', node['id'], 'disease/human'),
            'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            'zfin_link': self.etlh.rdh2.return_url_from_key_value(
                'ZFIN', node['id'], 'disease'),
            'flybase_link': self.etlh.rdh2.return_url_from_key_value(
                'FB', node['id'], 'disease'),
            'wormbase_link': self.etlh.rdh2.return_url_from_key_value(
                'WB', node['id'], 'disease'),
            'sgd_link': 'https://yeastgenome.org/disease/' + node['id']
        }
        do_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [
                do_term_list, do_isas_list, do_synonyms_list, xrefs,
                do_alt_ids_list
            ]
            do_term_list = []
            do_isas_list = []
            do_synonyms_list = []
            do_alt_ids_list = []
            xrefs = []
            counter = 0

    if counter > 0:
        yield [
            do_term_list, do_isas_list, do_synonyms_list, xrefs,
            do_alt_ids_list
        ]
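
# The re.split call above uses a negative lookbehind so that escaped quotes
# (\") inside the definition text do not end the quoted span prematurely.
# A tiny standalone demonstration with made-up input:
def _demo_unescaped_quote_split():
    sample = r'"an \"escaped\" word" [PMID:1]'
    # Splits only on unescaped double quotes, returning
    # ['', 'an \\"escaped\\" word', ' [PMID:1]']
    return re.split(r'(?<!\\)"', sample)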
def get_generators(self, filepath, batch_size):
    """Get Generators."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node
    do_term_list = []
    do_isas_list = []
    do_synonyms_list = []
    do_alt_ids_list = []
    xrefs = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # - might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        local_id = None
        def_links_unprocessed = []
        def_links_processed = []
        subset = []
        definition = ""
        is_obsolete = "false"
        ident = key
        prefix = ident.split(":")[0]

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    do_synonym = {"primary_id": key, "synonym": synonym}
                    do_synonyms_list.append(do_synonym)
            if "basicPropertyValues" in node["meta"]:
                alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "DOID:" in alt_id:
                        secondary_id = {"primary_id": key, "secondary_id": alt_id}
                        do_alt_ids_list.append(secondary_id)
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                if o_xrefs is not None:
                    if isinstance(o_xrefs, list):
                        for xref_id_dict in o_xrefs:
                            xref_id = xref_id_dict["val"]
                            if ":" in xref_id:
                                local_id = xref_id.split(":")[1].strip()
                                prefix = xref_id.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(
                                    local_id, xref_id)
                                generated_xref = ETLHelper.get_xref_dict(
                                    local_id, prefix,
                                    "ontology_provided_cross_reference",
                                    "ontology_provided_cross_reference",
                                    xref_id, complete_url,
                                    xref_id + "ontology_provided_cross_reference")
                                generated_xref["oid"] = ident
                                xrefs.append(generated_xref)
                    else:
                        # TODO: Need to make sure this else is correct
                        # (reached when o_xrefs is a single string, not a list).
                        if ":" in o_xrefs:
                            local_id = o_xrefs.split(":")[1].strip()
                            prefix = o_xrefs.split(":")[0].strip()
                            complete_url = ETLHelper.get_complete_url_ont(
                                local_id, o_xrefs)
                            generated_xref = ETLHelper.get_xref_dict(
                                local_id, prefix,
                                "ontology_provided_cross_reference",
                                "ontology_provided_cross_reference",
                                o_xrefs, complete_url, o_xrefs)
                            generated_xref["oid"] = ident
                            xrefs.append(generated_xref)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            do_isas_list.append(dictionary)

        def_links_processed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduced by the last replace
            # definition = definition.replace('  ', ' ')
            if definition is not None and "\"" in definition:
                split_definition = re.split(r'(?<!\\)"', definition)
                if len(split_definition) > 1:
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links = def_links.rstrip("]").replace("[", "")
                        def_links_unprocessed.append(def_links)

        for def_link in def_links_unprocessed:
            def_link = def_link.replace("url:www", "http://www")
            def_link = def_link.replace("url:", "")
            def_link = def_link.replace("URL:", "")
            def_link = def_link.replace("\\:", ":")
            def_link = def_link.replace('\\', '')
            if "," in def_link:
                def_link = def_link.split(",")
                for link in def_link:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
            else:
                if def_link.strip().startswith("http"):
                    def_links_processed.append(def_link)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # Need to have MODs add disease pages to their yaml stanzas.
        alt_ids = node.get('alt_id')
        if alt_ids:
            if not isinstance(alt_ids, (list, tuple)):
                alt_ids = [alt_ids]
        else:
            alt_ids = []

        dict_to_append = {
            'oid': node['id'],
            'name': node.get('label'),
            'name_key': node.get('label'),
            'definition': definition,
            'defLinksProcessed': def_links_processed,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            'rgd_link':
                'http://rgd.mcw.edu'
                + '/rgdweb/ontology/annot.html?species=All&x=1&acc_id='
                + node['id'] + '#annot',
            'rat_only_rgd_link':
                'http://rgd.mcw.edu'
                + '/rgdweb/ontology/annot.html?species=Rat&x=1&acc_id='
                + node['id'] + '#annot',
            'human_only_rgd_link':
                'http://rgd.mcw.edu'
                + '/rgdweb/ontology/annot.html?species=Human&x=1&acc_id='
                + node['id'] + '#annot',
            'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            'zfin_link': 'https://zfin.org/' + node['id'],
            'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            'sgd_link': 'https://yeastgenome.org/disease/' + node['id']
        }
        do_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [
                do_term_list, do_isas_list, do_synonyms_list, xrefs,
                do_alt_ids_list
            ]
            do_term_list = []
            do_isas_list = []
            do_synonyms_list = []
            do_alt_ids_list = []
            xrefs = []
            counter = 0

    if counter > 0:
        yield [
            do_term_list, do_isas_list, do_synonyms_list, xrefs,
            do_alt_ids_list
        ]
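
# The replace chain above normalizes legacy 'url:'/'URL:' prefixes and
# unescapes colons before the http filter runs. The same substitutions,
# table-driven, as a hedged sketch (helper name is hypothetical; order
# matters, since '\\:' must be rewritten before bare backslashes are
# stripped):
def _clean_def_link(raw_link):
    for old, new in (("url:www", "http://www"), ("url:", ""),
                     ("URL:", ""), ("\\:", ":"), ("\\", "")):
        raw_link = raw_link.replace(old, new)
    return raw_link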