Ejemplo n.º 1
0
    def get_generators(self, filepath, batch_size):  # noqa
        """Get Generators."""
        ont = OntologyFactory().create(filepath)
        parsed_line = ont.graph.copy().node

        go_term_list = []
        go_isas_list = []
        go_partofs_list = []
        go_synonyms_list = []
        go_regulates_list = []
        go_negatively_regulates_list = []
        go_positively_regulates_list = []
        go_altids_list = []
        counter = 0

        # Convert parsed obo term into a schema-friendly AGR dictionary.
        for key, line in parsed_line.items():
            counter = counter + 1
            node = ont.graph.node[key]
            if len(node) == 0:
                continue
            if node.get('type') == 'PROPERTY':
                continue

            # Switching id to curie form and saving URI in "uri"
            # might wildly break things later on???
            node["uri"] = node["id"]
            node["id"] = key

            subset = []
            definition = ""
            is_obsolete = "false"

            if "meta" in node:
                meta = node.get('meta')
                basic_property_values = meta.get('basicPropertyValues')
                for property_value_map in basic_property_values:
                    pred = property_value_map['pred']
                    val = property_value_map['val']
                    if pred == 'OIO:hasOBONamespace':
                        term_type = val

                if "synonyms" in node["meta"]:
                    syns = [s["val"] for s in node["meta"]["synonyms"]]
                    for synonym in syns:
                        go_synonym = {"primary_id": key, "synonym": synonym}
                        go_synonyms_list.append(go_synonym)

                if "basicPropertyValues" in node["meta"]:
                    alt_ids = [
                        s["val"] for s in node["meta"]["basicPropertyValues"]
                    ]
                    for alt_id in alt_ids:
                        if "GO:" in alt_id:
                            secondary_id = {
                                "primary_id": key,
                                "secondary_id": alt_id
                            }
                            go_altids_list.append(secondary_id)

                if node["meta"].get('is_obsolete'):
                    is_obsolete = "true"
                elif node["meta"].get('deprecated'):
                    is_obsolete = "true"

                if "definition" in node["meta"]:
                    definition = node["meta"]["definition"]["val"]

                if "subsets" in node["meta"]:
                    new_subset = node['meta'].get('subsets')
                    if isinstance(new_subset, (list, tuple)):
                        subset = new_subset
                    else:
                        if new_subset is not None:
                            subset.append(new_subset)

                if len(subset) > 1:
                    converted_subsets = []
                    for subset_str in subset:
                        if "#" in subset_str:
                            subset_str = subset_str.split("#")[-1]
                        converted_subsets.append(subset_str)
                    subset = converted_subsets

            all_parents = ont.parents(key)
            all_parents.append(key)

            # Improves performance when traversing relations
            all_parents_subont = ont.subontology(all_parents)
            isas_without_names = all_parents_subont.parents(
                key, relations=['subClassOf'])
            for item in isas_without_names:
                dictionary = {"primary_id": key, "primary_id2": item}
                go_isas_list.append(dictionary)

            partofs_without_names = all_parents_subont.parents(
                key, relations=['BFO:0000050'])
            for item in partofs_without_names:
                dictionary = {"primary_id": key, "primary_id2": item}
                go_partofs_list.append(dictionary)

            regulates = all_parents_subont.parents(key,
                                                   relations=['RO:0002211'])

            for item in regulates:
                dictionary = {"primary_id": key, "primary_id2": item}
                go_regulates_list.append(dictionary)

            negatively_regulates = all_parents_subont.parents(
                key, relations=['RO:0002212'])
            for item in negatively_regulates:
                dictionary = {"primary_id": key, "primary_id2": item}
                go_negatively_regulates_list.append(dictionary)

            positively_regulates = all_parents_subont.parents(
                key, relations=['RO:0002213'])
            for item in positively_regulates:
                dictionary = {"primary_id": key, "primary_id2": item}
                go_positively_regulates_list.append(dictionary)

            dict_to_append = {
                'oid': key,
                'definition': definition,
                'type': term_type,
                'name': node.get('label'),
                'subset': subset,
                'name_key': node.get('label'),
                'is_obsolete': is_obsolete,
                'href':
                'http://amigo.geneontology.org/amigo/term/' + node['id'],
            }

            go_term_list.append(dict_to_append)

            if counter == batch_size:
                yield [
                    go_term_list, go_isas_list, go_partofs_list,
                    go_synonyms_list, go_regulates_list,
                    go_negatively_regulates_list, go_positively_regulates_list,
                    go_altids_list
                ]

                go_term_list = []
                go_isas_list = []
                go_partofs_list = []
                go_synonyms_list = []
                go_regulates_list = []
                go_negatively_regulates_list = []
                go_positively_regulates_list = []
                go_altids_list = []
                counter = 0

        if counter > 0:
            yield [
                go_term_list, go_isas_list, go_partofs_list, go_synonyms_list,
                go_regulates_list, go_negatively_regulates_list,
                go_positively_regulates_list, go_altids_list
            ]
Ejemplo n.º 2
0
    def get_data(self, filepath):  # noqa
        """Get Data."""
        ont = OntologyFactory().create(filepath)

        parsed_line = ont.graph.copy().node

        # Convert parsed obo term into a schema-friendly AGR dictionary.
        for key in parsed_line.items():
            node = ont.graph.node[key]
            if len(node) == 0:
                continue

            # Switching id to curie form and saving URI in "uri"
            # might wildly break things later on???
            node["uri"] = node["id"]
            node["id"] = key

            syns = []
            # So code commented out with NU: at start means it is Not Used.
            # NU: xrefs = []
            # NU: xref_urls = []

            # NU: def_links_unprocessed = []
            # NU: def_links_processed = []
            subset = []
            definition = ""
            namespace = ""
            is_obsolete = "false"
            # NU:ident = key

            if "meta" in node:
                if "synonyms" in node["meta"]:
                    syns = [s["val"] for s in node["meta"]["synonyms"]]
                # NU: leave in call commented out in case it is used at a later time
                # if "xrefs" in node["meta"]:
                #     o_xrefs = node["meta"].get('xrefs')
                #     self.ortho_xrefs(o_xrefs, ident, xref_urls)
                if node["meta"].get('is_obsolete'):
                    is_obsolete = "true"
                elif node["meta"].get('deprecated'):
                    is_obsolete = "true"
                if "definition" in node["meta"]:
                    definition = node["meta"]["definition"]["val"]
                    # NU: def_links_unprocessed = node["meta"]["definition"]["xrefs"]
                if "subsets" in node["meta"]:
                    new_subset = node['meta'].get('subsets')
                    if isinstance(new_subset, (list, tuple)):
                        subset = new_subset
                    else:
                        if new_subset is not None:
                            subset.append(new_subset)
                if len(subset) > 1:
                    converted_subsets = []
                    for subset_str in subset:
                        if "#" in subset_str:
                            subset_str = subset_str.split("#")[-1]
                        converted_subsets.append(subset_str)
                    subset = converted_subsets
                if "basicPropertyValues" in node['meta']:
                    for bpv in node['meta']['basicPropertyValues']:
                        if bpv.get('pred') == 'OIO:hasOBONamespace':
                            namespace = bpv.get('val')
                            break

            all_parents = ont.parents(key)
            all_parents.append(key)

            # Improves performance when traversing relations
            all_parents_subont = ont.subontology(all_parents)

            isas_without_names = all_parents_subont.parents(
                key, relations=['subClassOf'])
            partofs_without_names = all_parents_subont.parents(
                key, relations=['BFO:0000050'])
            regulates = all_parents_subont.parents(key,
                                                   relations=['RO:0002211'])
            negatively_regulates = all_parents_subont.parents(
                key, relations=['RO:0002212'])
            positively_regulates = all_parents_subont.parents(
                key, relations=['RO:0002213'])

            # NU: def_links_unprocessed = []
            # def_links = ""
            if definition is None:
                definition = ""
            # else:
            #     if definition is not None and "\"" in definition:
            #         split_definition = definition.split("\"")
            #         if len(split_definition) > 1:
            #             if len(split_definition) > 2 and "[" in split_definition[2].strip():
            #                 def_links = split_definition[2].strip()
            #                 def_links_unprocessed.append(def_links.rstrip("]").replace("[", ""))

            # NU: def_links_processed not used later, it is commented out.
            # for def_link_str in def_links_unprocessed:
            #     def_link_str = def_link_str.replace("url:www", "http://www")
            #     def_link_str = def_link_str.replace("url:", "")
            #     def_link_str = def_link_str.replace("URL:", "")
            #     def_link_str = def_link_str.replace("\\:", ":")

            #     if "," in def_link_str:
            #         def_links = def_link_str.split(",")
            #         for link in def_links:
            #             if link.strip().startswith("http"):
            #                 def_links_processed.append(link)
            #     else:
            #         if def_link_str.strip().startswith("http"):
            #             def_links_processed.append(def_link_str)

            # NU: alt_ids = node.get('alt_id')
            # if alt_ids:
            #    if not isinstance(alt_ids, (list, tuple)):
            #        alt_ids = [alt_ids]
            # else:
            #    alt_ids = []

            dict_to_append = {
                'o_type': namespace,
                'name': node.get('label'),
                'href':
                'http://amigo.geneontology.org/amigo/term/' + node['id'],
                'name_key': node.get('label'),
                'oid': node['id'],
                'definition': definition,
                'is_obsolete': is_obsolete,
                'subset': subset,
                'o_synonyms': syns,
                'isas': isas_without_names,
                'partofs': partofs_without_names,
                'regulates': regulates,
                'negatively_regulates': negatively_regulates,
                'positively_regulates': positively_regulates,

                # This data might be needed for gene descriptions
                # Maybe should be turned into a different method in order
                # to keep the go do dict's smaller
                # 'o_genes': [],
                # 'o_species': [],
                # 'xrefs': xrefs,
                # 'ontologyLabel': filepath,
                # TODO: fix links to not be passed for each ontology load.
                # 'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'\
                #              + '?species=All&x=1&acc_id='+node['id']+'#annot',
                # 'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'\
                #                   + 'species=All&x=1&acc_id=' + node['id'] + '#annot',
                # 'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'\
                #                       + 'species=Rat&x=1&acc_id=' +node['id'] + '#annot',
                # 'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'\
                #                        + 'species=Human&x=1&acc_id=' +node['id'] + '#annot',
                # 'mgi_link': 'http://www.informatics.jax.org/disease/'+node['id'],
                # 'wormbase_link': 'http://www.wormbase.org/resources/disease/'+node['id'],
                # 'sgd_link': 'https://yeastgenome.org/disease/'+node['id'],
                # 'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id='+node['id'],
                # 'zfin_link': 'https://zfin.org/'+node['id'],
                # 'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
                # 'oPrefix': prefix,
                # 'crossReferences': xref_urls,
                # 'defText': def_text,
                # 'defLinksProcessed': def_links_processed,
                # 'oboFile': prefix,
                # 'category': 'go',
                # 'alt_ids': alt_ids,
            }

            if node['id'] == 'GO:0099616':
                self.logger.debug(dict_to_append)

            node = {**node, **dict_to_append}
            ont.graph.node[node["id"]] = node

        return ont
Ejemplo n.º 3
0
    def get_data(self, filepath):
        """Get Data"""

        ont = OntologyFactory().create(filepath)

        parsed_line = ont.graph.copy().node

        #Convert parsed obo term into a schema-friendly AGR dictionary.
        for key in parsed_line.items():
            node = ont.graph.node[key]
            if len(node) == 0:
                continue

            ### Switching id to curie form and saving URI in "uri"
            # might wildly break things later on???
            node["uri"] = node["id"]
            node["id"] = key

            syns = []
            xrefs = []
            xref_urls = []

            local_id = None
            def_links_unprocessed = []
            def_links_processed = []
            def_text = None
            subset = []
            definition = ""
            namespace = ""
            is_obsolete = "false"
            ident = key
            prefix = ident.split(":")[0]
            if syns is None:
                syns = []  # Set the synonyms to an empty array if None. Necessary for Neo4j parsing

            if "meta" in node:
                if "synonyms" in node["meta"]:
                    syns = [s["val"] for s in node["meta"]["synonyms"]]
                if "xrefs" in node["meta"]:

                    o_xrefs = node["meta"].get('xrefs')
                    if o_xrefs is not None:
                        for xref_id_dict in o_xrefs:
                            xref_id = xref_id_dict["val"]
                            if ":" in xref_id:
                                local_id = xref_id.split(":")[1].strip()
                                prefix = xref_id.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(local_id, xref_id)
                                generated_xref = ETLHelper.get_xref_dict( \
                                        local_id, prefix,
                                        "ontology_provided_cross_reference",
                                        "ontology_provided_cross_reference",
                                        xref_id,
                                        complete_url,
                                        xref_id + "ontology_provided_cross_reference")
                                generated_xref["oid"] = ident
                                xref_urls.append(generated_xref)
                        else:
                            if ":" in o_xrefs:
                                local_id = o_xrefs.split(":")[1].strip()
                                prefix = o_xrefs.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(local_id, o_xrefs)
                                generated_xref = ETLHelper.get_xref_dict( \
                                        local_id,
                                        prefix,
                                        "ontology_provided_cross_reference",
                                        "ontology_provided_cross_reference",
                                        o_xrefs,
                                        complete_url,
                                        o_xrefs)
                                generated_xref["oid"] = ident
                                xref_urls.append(generated_xref)
                if node["meta"].get('is_obsolete'):
                    is_obsolete = "true"
                elif node["meta"].get('deprecated'):
                    is_obsolete = "true"
                if "definition" in node["meta"]:
                    definition = node["meta"]["definition"]["val"]
                    def_links_unprocessed = node["meta"]["definition"]["xrefs"]
                if "subsets" in node["meta"]:
                    new_subset = node['meta'].get('subsets')
                    if isinstance(new_subset, (list, tuple)):
                        subset = new_subset
                    else:
                        if new_subset is not None:
                            subset.append(new_subset)
                if len(subset) > 1:
                    converted_subsets = []
                    for subset_str in subset:
                        if "#" in subset_str:
                            subset_str = subset_str.split("#")[-1]
                        converted_subsets.append(subset_str)
                    subset = converted_subsets
                if "basicPropertyValues" in node['meta']:
                    for bpv in node['meta']['basicPropertyValues']:
                        if bpv.get('pred') == 'OIO:hasOBONamespace':
                            namespace = bpv.get('val')
                            break

            # Set the synonyms to an empty array if None. Necessary for Neo4j parsing
            if xrefs is None:
                xrefs = []

            all_parents = ont.parents(key)
            all_parents.append(key)

            # Improves performance when traversing relations
            all_parents_subont = ont.subontology(all_parents)

            isas_without_names = all_parents_subont.parents(key, relations=['subClassOf'])
            partofs_without_names = all_parents_subont.parents(key, relations=['BFO:0000050'])
            regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
            negatively_regulates = all_parents_subont.parents(key, relations=['RO:0002212'])
            positively_regulates = all_parents_subont.parents(key, relations=['RO:0002213'])

            def_links_unprocessed = []
            def_links = ""
            if definition is None:
                definition = ""
            else:
                #Remove new lines that cause this to split across two lines in the file
                #definition = definition.replace('\n', ' ')

                #Remove any extra double space that might have been introduces in the last replace
                #definition = definition.replace('  ', ' ')
                if definition is not None and "\"" in definition:
                    split_definition = definition.split("\"")
                    if len(split_definition) > 1:
                        def_text = split_definition[1].strip()
                        if len(split_definition) > 2 and "[" in split_definition[2].strip():
                            def_links = split_definition[2].strip()
                            def_links_unprocessed.append(def_links.rstrip("]").replace("[", ""))
                else:
                    def_text = definition

            for def_link_str in def_links_unprocessed:
                def_link_str = def_link_str.replace("url:www", "http://www")
                def_link_str = def_link_str.replace("url:", "")
                def_link_str = def_link_str.replace("URL:", "")
                def_link_str = def_link_str.replace("\\:", ":")

                if "," in def_link_str:
                    def_links = def_link_str.split(",")
                    for link in def_links:
                        if link.strip().startswith("http"):
                            def_links_processed.append(link)
                # elif "." in dl:
                #     dl = dl.split(".")
                #     for link in dl:
                #         if link.strip().startswith("http"):
                #             def_links_processed.append(link)
                else:
                    if def_link_str.strip().startswith("http"):
                        def_links_processed.append(def_link_str)

            # TODO: make this a generic section based on hte resourceDescriptor.yaml file.
            # need to have MODs add disease pages to their yaml stanzas


            alt_ids = node.get('alt_id')
            if alt_ids:
                if not isinstance(alt_ids, (list, tuple)):
                    alt_ids = [alt_ids]
            else:
                alt_ids = []

            dict_to_append = {

                'o_type': namespace,
                'name': node.get('label'),
                'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
                'name_key': node.get('label'),
                'oid': node['id'],
                'definition': definition,
                'is_obsolete': is_obsolete,
                'subset': subset,
                'o_synonyms': syns,
                'isas': isas_without_names,
                'partofs': partofs_without_names,
                'regulates': regulates,
                'negatively_regulates': negatively_regulates,
                'positively_regulates': positively_regulates,

                ### This data might be needed for gene descriptions
                ### Maybe should be turned into a different method in order
                ### to keep the go do dict's smaller
                #'o_genes': [],
                #'o_species': [],
                #'xrefs': xrefs,
                #'ontologyLabel': filepath,
                #TODO: fix links to not be passed for each ontology load.
                #'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'\
                #              + '?species=All&x=1&acc_id='+node['id']+'#annot',
                #'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'\
                #                   + 'species=All&x=1&acc_id=' + node['id'] + '#annot',
                #'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'\
                #                       + 'species=Rat&x=1&acc_id=' +node['id'] + '#annot',
                #'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'\
                #                        + 'species=Human&x=1&acc_id=' +node['id'] + '#annot',
                #'mgi_link': 'http://www.informatics.jax.org/disease/'+node['id'],
                #'wormbase_link': 'http://www.wormbase.org/resources/disease/'+node['id'],
                #'sgd_link': 'https://yeastgenome.org/disease/'+node['id'],
                #'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id='+node['id'],
                #'zfin_link': 'https://zfin.org/'+node['id'],
                #'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
                #'oPrefix': prefix,
                #'crossReferences': xref_urls,
                #'defText': def_text,
                #'defLinksProcessed': def_links_processed,
                #'oboFile': prefix,
                #'category': 'go',
                #'alt_ids': alt_ids,
            }

            if node['id'] == 'GO:0099616':
                print(dict_to_append)

            node = {**node, **dict_to_append}
            ont.graph.node[node["id"]] = node

        return ont
Ejemplo n.º 4
0
    def get_generators(self, filepath,
                       batch_size):  # noqa TODO:Needs splitting up really
        """Get Generators."""
        ont = OntologyFactory().create(filepath)
        parsed_line = ont.graph.copy().node

        do_term_list = []
        do_isas_list = []
        do_synonyms_list = []
        do_alt_ids_list = []
        xrefs = []
        counter = 0

        # Convert parsed obo term into a schema-friendly AGR dictionary.
        for key, line in parsed_line.items():
            counter = counter + 1
            node = ont.graph.node[key]
            if len(node) == 0:
                continue

            # Switching id to curie form and saving URI in "uri"
            # - might wildly break things later on???
            node["uri"] = node["id"]
            node["id"] = key

            syns = []

            def_links_unprocessed = []
            def_links_processed = []
            subset = []
            definition = ""
            is_obsolete = "false"
            ident = key

            if "meta" in node:
                if "synonyms" in node["meta"]:
                    syns = [s["val"] for s in node["meta"]["synonyms"]]
                    for synonym in syns:
                        do_synonym = {"primary_id": key, "synonym": synonym}
                        do_synonyms_list.append(do_synonym)

                if "basicPropertyValues" in node["meta"]:
                    alt_ids = [
                        s["val"] for s in node["meta"]["basicPropertyValues"]
                    ]
                    for alt_id in alt_ids:
                        if "DOID:" in alt_id:
                            secondary_id = {
                                "primary_id": key,
                                "secondary_id": alt_id
                            }
                            do_alt_ids_list.append(secondary_id)

                if "xrefs" in node["meta"]:
                    o_xrefs = node["meta"].get('xrefs')
                    self.ortho_xrefs(o_xrefs, ident, xrefs)

                if node["meta"].get('is_obsolete'):
                    is_obsolete = "true"
                elif node["meta"].get('deprecated'):
                    is_obsolete = "true"
                if "definition" in node["meta"]:
                    definition = node["meta"]["definition"]["val"]
                    def_links_unprocessed = node["meta"]["definition"]["xrefs"]
                if "subsets" in node["meta"]:
                    new_subset = node['meta'].get('subsets')
                    if isinstance(new_subset, (list, tuple)):
                        subset = new_subset
                    else:
                        if new_subset is not None:
                            subset.append(new_subset)
                if len(subset) > 1:
                    converted_subsets = []
                    for subset_str in subset:
                        if "#" in subset_str:
                            subset_str = subset_str.split("#")[-1]
                        converted_subsets.append(subset_str)
                    subset = converted_subsets

            all_parents = ont.parents(key)
            all_parents.append(key)

            # Improves performance when traversing relations
            all_parents_subont = ont.subontology(all_parents)
            isas_without_names = all_parents_subont.parents(
                key, relations=['subClassOf'])

            for item in isas_without_names:
                dictionary = {"primary_id": key, "primary_id2": item}

                do_isas_list.append(dictionary)

            def_links_processed = []
            def_links = ""
            if definition is None:
                definition = ""
            else:
                # Remove new lines that cause this to split across two lines in the file
                # definition = definition.replace('\n', ' ')

                # Remove any extra double space that might have been introduces in the last replace
                # definition = definition.replace('  ', ' ')

                if definition is not None and "\"" in definition:
                    split_definition = re.split(r'(?<!\\)"', definition)
                    if len(split_definition) > 1:
                        if len(split_definition
                               ) > 2 and "[" in split_definition[2].strip():
                            def_links = split_definition[2].strip()
                            def_links = def_links.rstrip("]").replace("[", "")
                            def_links_unprocessed.append(def_links)

            for def_link in def_links_unprocessed:
                def_link = def_link.replace("url:www", "http://www")
                def_link = def_link.replace("url:", "")
                def_link = def_link.replace("URL:", "")
                def_link = def_link.replace("\\:", ":")
                def_link = def_link.replace('\\', '')

                if "," in def_link:
                    def_link = def_link.split(",")
                    for link in def_link:
                        if link.strip().startswith("http"):
                            def_links_processed.append(link)
                else:
                    if def_link.strip().startswith("http"):
                        def_links_processed.append(def_link)

            # TODO: make this a generic section based on the resourceDescriptor.yaml file.
            # need to have MODs add disease pages to their yaml stanzas

            # NU: alt_ids = node.get('alt_id')
            # if alt_ids:
            #     if not isinstance(alt_ids, (list, tuple)):
            #         alt_ids = [alt_ids]
            # else:
            #     alt_ids = []

            # TODO: Need to add urls to resource Descriptis for SGD and MGI.
            # NOTE: MGI had one but has 'MGI:' at the end of the url not required here.
            dict_to_append = {
                'oid':
                node['id'],
                'name':
                node.get('label'),
                'name_key':
                node.get('label'),
                'definition':
                definition,
                'defLinksProcessed':
                def_links_processed,
                'is_obsolete':
                is_obsolete,
                'subset':
                subset,
                'oUrl':
                self.etlh.rdh2.return_url_from_key_value('DOID', node['id']),
                'rgd_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'RGD', node['id'], 'disease/all'),
                'rat_only_rgd_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'RGD', node['id'], 'disease/rat'),
                'human_only_rgd_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'RGD', node['id'], 'disease/human'),
                'mgi_link':
                'http://www.informatics.jax.org/disease/' + node['id'],
                'zfin_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'ZFIN', node['id'], 'disease'),
                'flybase_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'FB', node['id'], 'disease'),
                'wormbase_link':
                self.etlh.rdh2.return_url_from_key_value(
                    'WB', node['id'], 'disease'),
                'sgd_link':
                'https://yeastgenome.org/disease/' + node['id']
            }

            do_term_list.append(dict_to_append)

            if counter == batch_size:
                yield [
                    do_term_list, do_isas_list, do_synonyms_list, xrefs,
                    do_alt_ids_list
                ]
                do_term_list = []
                do_isas_list = []
                do_synonyms_list = []
                do_alt_ids_list = []
                xrefs = []
                counter = 0

        if counter > 0:
            yield [
                do_term_list, do_isas_list, do_synonyms_list, xrefs,
                do_alt_ids_list
            ]
Ejemplo n.º 5
0
    def get_generators(self, filepath, batch_size):
        """Get Generators"""

        ont = OntologyFactory().create(filepath)
        parsed_line = ont.graph.copy().node

        do_term_list = []
        do_isas_list = []
        do_synonyms_list = []
        do_alt_ids_list = []
        xrefs = []
        counter = 0

        # Convert parsed obo term into a schema-friendly AGR dictionary.
        for key, line in parsed_line.items():
            counter = counter + 1
            node = ont.graph.node[key]
            if len(node) == 0:
                continue

            # Switching id to curie form and saving URI in "uri"
            # - might wildly break things later on???
            node["uri"] = node["id"]
            node["id"] = key

            syns = []

            local_id = None
            def_links_unprocessed = []
            def_links_processed = []
            subset = []
            definition = ""
            is_obsolete = "false"
            ident = key
            prefix = ident.split(":")[0]

            if "meta" in node:
                if "synonyms" in node["meta"]:
                    syns = [s["val"] for s in node["meta"]["synonyms"]]
                    for synonym in syns:
                        do_synonym = {
                            "primary_id": key,
                            "synonym": synonym
                        }
                        do_synonyms_list.append(do_synonym)

                if "basicPropertyValues" in node["meta"]:
                    alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                    for alt_id in alt_ids:
                        if "DOID:" in alt_id:
                            secondary_id = {
                                "primary_id": key,
                                "secondary_id": alt_id
                            }
                            do_alt_ids_list.append(secondary_id)

                if "xrefs" in node["meta"]:
                    o_xrefs = node["meta"].get('xrefs')
                    if o_xrefs is not None:
                        for xref_id_dict in o_xrefs:
                            xref_id = xref_id_dict["val"]
                            if ":" in xref_id:
                                local_id = xref_id.split(":")[1].strip()
                                prefix = xref_id.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(local_id, xref_id)
                                generated_xref = ETLHelper.get_xref_dict(local_id, 
                                    prefix,
                                    "ontology_provided_cross_reference",
                                    "ontology_provided_cross_reference",
                                    xref_id,
                                    complete_url,
                                    xref_id + "ontology_provided_cross_reference")
                                generated_xref["oid"] = ident
                                xrefs.append(generated_xref)
                        else: #TODO Need to make sure this else is correct
                            if ":" in o_xrefs:
                                local_id = o_xrefs.split(":")[1].strip()
                                prefix = o_xrefs.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(local_id, o_xrefs)
                                generated_xref = ETLHelper.get_xref_dict(local_id,
                                        prefix,
                                        "ontology_provided_cross_reference",
                                        "ontology_provided_cross_reference",
                                        o_xrefs,
                                        complete_url,
                                        o_xrefs)
                                generated_xref["oid"] = ident
                                xrefs.append(generated_xref)
                if node["meta"].get('is_obsolete'):
                    is_obsolete = "true"
                elif node["meta"].get('deprecated'):
                    is_obsolete = "true"
                if "definition" in node["meta"]:
                    definition = node["meta"]["definition"]["val"]
                    def_links_unprocessed = node["meta"]["definition"]["xrefs"]
                if "subsets" in node["meta"]:
                    new_subset = node['meta'].get('subsets')
                    if isinstance(new_subset, (list, tuple)):
                        subset = new_subset
                    else:
                        if new_subset is not None:
                            subset.append(new_subset)
                if len(subset) > 1:
                    converted_subsets = []
                    for subset_str in subset:
                        if "#" in subset_str:
                            subset_str = subset_str.split("#")[-1]
                        converted_subsets.append(subset_str)
                    subset = converted_subsets

            all_parents = ont.parents(key)
            all_parents.append(key)

            # Improves performance when traversing relations
            all_parents_subont = ont.subontology(all_parents)
            isas_without_names = all_parents_subont.parents(key, relations=['subClassOf'])

            for item in isas_without_names:
                dictionary = {
                    "primary_id": key,
                    "primary_id2": item
                }

                do_isas_list.append(dictionary)

            def_links_processed = []
            def_links = ""
            if definition is None:
                definition = ""
            else:
                # Remove new lines that cause this to split across two lines in the file
                # definition = definition.replace('\n', ' ')

                # Remove any extra double space that might have been introduces in the last replace
                # definition = definition.replace('  ', ' ')

                if definition is not None and "\"" in definition:
                    split_definition = re.split(r'(?<!\\)"', definition)
                    if len(split_definition) > 1:
                        if len(split_definition) > 2 and "[" in split_definition[2].strip():
                            def_links = split_definition[2].strip()
                            def_links = def_links.rstrip("]").replace("[", "")
                            def_links_unprocessed.append(def_links)

            for def_link in def_links_unprocessed:
                def_link = def_link.replace("url:www", "http://www")
                def_link = def_link.replace("url:", "")
                def_link = def_link.replace("URL:", "")
                def_link = def_link.replace("\\:", ":")
                def_link = def_link.replace('\\', '')

                if "," in def_link:
                    def_link = def_link.split(",")
                    for link in def_link:
                        if link.strip().startswith("http"):
                            def_links_processed.append(link)
                else:
                    if def_link.strip().startswith("http"):
                        def_links_processed.append(def_link)

            # TODO: make this a generic section based on the resourceDescriptor.yaml file.
            # need to have MODs add disease pages to their yaml stanzas


            alt_ids = node.get('alt_id')
            if alt_ids:
                if not isinstance(alt_ids, (list, tuple)):
                    alt_ids = [alt_ids]
            else:
                alt_ids = []

            dict_to_append = {
                'oid': node['id'],
                'name': node.get('label'),
                'name_key': node.get('label'),
                'definition': definition,
                'defLinksProcessed': def_links_processed,
                'is_obsolete': is_obsolete,
                'subset': subset,
                'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
                'rgd_link': 'http://rgd.mcw.edu'
                            + '/rgdweb/ontology/annot.html?species=All&x=1&acc_id='
                            + node['id'] + '#annot',
                'rat_only_rgd_link': 'http://rgd.mcw.edu'
                                     + '/rgdweb/ontology/annot.html?species=Rat&x=1&acc_id='
                                     + node['id'] + '#annot',
                'human_only_rgd_link': 'http://rgd.mcw.edu'
                                       + '/rgdweb/ontology/annot.html?species=Human&x=1&acc_id='
                                       + node['id'] + '#annot',
                'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
                'zfin_link': 'https://zfin.org/' + node['id'],
                'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
                'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
                'sgd_link': 'https://yeastgenome.org/disease/' + node['id']
            }

            do_term_list.append(dict_to_append)

            if counter == batch_size:
                yield [do_term_list, do_isas_list, do_synonyms_list, xrefs, do_alt_ids_list]
                do_term_list = []
                do_isas_list = []
                do_synonyms_list = []
                do_alt_ids_list = []
                xrefs = []
                counter = 0

        if counter > 0:
            yield [do_term_list, do_isas_list, do_synonyms_list, xrefs, do_alt_ids_list]