# Module-level imports required by the methods below. OntologyFactory comes
# from ontobio; the ETLHelper import path is assumed from the surrounding
# project layout.
import re

from ontobio.ontol_factory import OntologyFactory

from etl.helpers import ETLHelper


def get_generators(self, filepath, batch_size):  # noqa
    """Get Generators."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node
    go_term_list = []
    go_isas_list = []
    go_partofs_list = []
    go_synonyms_list = []
    go_regulates_list = []
    go_negatively_regulates_list = []
    go_positively_regulates_list = []
    go_altids_list = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue
        if node.get('type') == 'PROPERTY':
            continue

        # Switching id to curie form and saving URI in "uri"
        # might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        subset = []
        definition = ""
        is_obsolete = "false"
        # Default so terms lacking an OBO namespace don't raise a NameError below.
        term_type = ""

        if "meta" in node:
            meta = node.get('meta')
            # basicPropertyValues may be absent; default to an empty list.
            basic_property_values = meta.get('basicPropertyValues') or []
            for property_value_map in basic_property_values:
                pred = property_value_map['pred']
                val = property_value_map['val']
                if pred == 'OIO:hasOBONamespace':
                    term_type = val
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    go_synonym = {"primary_id": key, "synonym": synonym}
                    go_synonyms_list.append(go_synonym)
            if "basicPropertyValues" in node["meta"]:
                alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "GO:" in alt_id:
                        secondary_id = {"primary_id": key, "secondary_id": alt_id}
                        go_altids_list.append(secondary_id)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_isas_list.append(dictionary)

        partofs_without_names = all_parents_subont.parents(
            key, relations=['BFO:0000050'])
        for item in partofs_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_partofs_list.append(dictionary)

        regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
        for item in regulates:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_regulates_list.append(dictionary)

        negatively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002212'])
        for item in negatively_regulates:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_negatively_regulates_list.append(dictionary)

        positively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002213'])
        for item in positively_regulates:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_positively_regulates_list.append(dictionary)

        dict_to_append = {
            'oid': key,
            'definition': definition,
            'type': term_type,
            'name': node.get('label'),
            'subset': subset,
            'name_key': node.get('label'),
            'is_obsolete': is_obsolete,
            'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
        }
        go_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [
                go_term_list, go_isas_list, go_partofs_list, go_synonyms_list,
                go_regulates_list, go_negatively_regulates_list,
                go_positively_regulates_list, go_altids_list
            ]
            go_term_list = []
            go_isas_list = []
            go_partofs_list = []
            go_synonyms_list = []
            go_regulates_list = []
            go_negatively_regulates_list = []
            go_positively_regulates_list = []
            go_altids_list = []
            counter = 0

    if counter > 0:
        yield [
            go_term_list, go_isas_list, go_partofs_list, go_synonyms_list,
            go_regulates_list, go_negatively_regulates_list,
            go_positively_regulates_list, go_altids_list
        ]
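
# A minimal, self-contained sketch of the batching pattern used above:
# accumulate rows, yield a full batch once the count reaches batch_size,
# then flush the remainder after the loop. The helper name and the use of
# len() in place of an explicit counter are illustrative, not part of this
# module.
def _batched(rows, batch_size):
    """Yield successive lists of at most batch_size items from rows."""
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        # Mirrors the trailing `if counter > 0: yield [...]` above.
        yield batch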
def get_data(self, filepath):  # noqa
    """Get Data."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    # Iterate keys directly; .items() would yield (key, value) tuples here.
    for key in parsed_line:
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        # Code commented out with NU: at the start means it is Not Used.
        # NU: xrefs = []
        # NU: xref_urls = []
        # NU: def_links_unprocessed = []
        # NU: def_links_processed = []
        subset = []
        definition = ""
        namespace = ""
        is_obsolete = "false"
        # NU: ident = key

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
            # NU: leave the call commented out in case it is used at a later time
            # if "xrefs" in node["meta"]:
            #     o_xrefs = node["meta"].get('xrefs')
            #     self.ortho_xrefs(o_xrefs, ident, xref_urls)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                # NU: def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets
            if "basicPropertyValues" in node['meta']:
                for bpv in node['meta']['basicPropertyValues']:
                    if bpv.get('pred') == 'OIO:hasOBONamespace':
                        namespace = bpv.get('val')
                        break

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        partofs_without_names = all_parents_subont.parents(
            key, relations=['BFO:0000050'])
        regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
        negatively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002212'])
        positively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002213'])

        # NU: def_links_unprocessed = []
        # def_links = ""
        if definition is None:
            definition = ""
        # else:
        #     if definition is not None and "\"" in definition:
        #         split_definition = definition.split("\"")
        #         if len(split_definition) > 1:
        #             if len(split_definition) > 2 and "[" in split_definition[2].strip():
        #                 def_links = split_definition[2].strip()
        #                 def_links_unprocessed.append(def_links.rstrip("]").replace("[", ""))
        # NU: def_links_processed is not used later; it is commented out.
        # for def_link_str in def_links_unprocessed:
        #     def_link_str = def_link_str.replace("url:www", "http://www")
        #     def_link_str = def_link_str.replace("url:", "")
        #     def_link_str = def_link_str.replace("URL:", "")
        #     def_link_str = def_link_str.replace("\\:", ":")
        #     if "," in def_link_str:
        #         def_links = def_link_str.split(",")
        #         for link in def_links:
        #             if link.strip().startswith("http"):
        #                 def_links_processed.append(link)
        #     else:
        #         if def_link_str.strip().startswith("http"):
        #             def_links_processed.append(def_link_str)

        # NU: alt_ids = node.get('alt_id')
        # if alt_ids:
        #     if not isinstance(alt_ids, (list, tuple)):
        #         alt_ids = [alt_ids]
        # else:
        #     alt_ids = []

        dict_to_append = {
            'o_type': namespace,
            'name': node.get('label'),
            'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
            'name_key': node.get('label'),
            'oid': node['id'],
            'definition': definition,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'o_synonyms': syns,
            'isas': isas_without_names,
            'partofs': partofs_without_names,
            'regulates': regulates,
            'negatively_regulates': negatively_regulates,
            'positively_regulates': positively_regulates,
            # This data might be needed for gene descriptions.
            # Maybe it should be moved into a different method in order
            # to keep the GO/DO dicts smaller.
            # 'o_genes': [],
            # 'o_species': [],
            # 'xrefs': xrefs,
            # 'ontologyLabel': filepath,
            # TODO: fix links to not be passed for each ontology load.
            # 'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
            #             + '?species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                 + 'species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                      + 'species=Rat&x=1&acc_id=' + node['id'] + '#annot',
            # 'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                        + 'species=Human&x=1&acc_id=' + node['id'] + '#annot',
            # 'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            # 'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            # 'sgd_link': 'https://yeastgenome.org/disease/' + node['id'],
            # 'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            # 'zfin_link': 'https://zfin.org/' + node['id'],
            # 'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            # 'oPrefix': prefix,
            # 'crossReferences': xref_urls,
            # 'defText': def_text,
            # 'defLinksProcessed': def_links_processed,
            # 'oboFile': prefix,
            # 'category': 'go',
            # 'alt_ids': alt_ids,
        }

        if node['id'] == 'GO:0099616':
            self.logger.debug(dict_to_append)

        node = {**node, **dict_to_append}
        ont.graph.node[node["id"]] = node

    return ont
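
# Every method in this module normalizes subset IRIs with the same
# split("#")[-1] idiom. A hedged one-line helper capturing that conversion
# (hypothetical; the code above inlines it instead):
def _convert_subsets(subsets):
    """Strip IRI fragments, e.g.
    'http://purl.obolibrary.org/obo/go#goslim_agr' -> 'goslim_agr'."""
    return [s.split("#")[-1] if "#" in s else s for s in subsets]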
def get_data(self, filepath):
    """Get Data."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    # Iterate keys directly; .items() would yield (key, value) tuples here.
    for key in parsed_line:
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        xrefs = []
        xref_urls = []
        local_id = None
        def_links_unprocessed = []
        def_links_processed = []
        def_text = None
        subset = []
        definition = ""
        namespace = ""
        is_obsolete = "false"
        ident = key
        prefix = ident.split(":")[0]

        # Set the synonyms to an empty array if None. Necessary for Neo4j parsing.
        if syns is None:
            syns = []

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                if o_xrefs is not None:
                    if isinstance(o_xrefs, list):
                        for xref_id_dict in o_xrefs:
                            xref_id = xref_id_dict["val"]
                            if ":" in xref_id:
                                local_id = xref_id.split(":")[1].strip()
                                prefix = xref_id.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(
                                    local_id, xref_id)
                                generated_xref = ETLHelper.get_xref_dict(
                                    local_id, prefix,
                                    "ontology_provided_cross_reference",
                                    "ontology_provided_cross_reference",
                                    xref_id, complete_url,
                                    xref_id + "ontology_provided_cross_reference")
                                generated_xref["oid"] = ident
                                xref_urls.append(generated_xref)
                    else:
                        # o_xrefs arrived as a single string rather than a list.
                        if ":" in o_xrefs:
                            local_id = o_xrefs.split(":")[1].strip()
                            prefix = o_xrefs.split(":")[0].strip()
                            complete_url = ETLHelper.get_complete_url_ont(
                                local_id, o_xrefs)
                            generated_xref = ETLHelper.get_xref_dict(
                                local_id, prefix,
                                "ontology_provided_cross_reference",
                                "ontology_provided_cross_reference",
                                o_xrefs, complete_url, o_xrefs)
                            generated_xref["oid"] = ident
                            xref_urls.append(generated_xref)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets
            if "basicPropertyValues" in node['meta']:
                for bpv in node['meta']['basicPropertyValues']:
                    if bpv.get('pred') == 'OIO:hasOBONamespace':
                        namespace = bpv.get('val')
                        break

        # Set the xrefs to an empty array if None. Necessary for Neo4j parsing.
        if xrefs is None:
            xrefs = []

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        partofs_without_names = all_parents_subont.parents(
            key, relations=['BFO:0000050'])
        regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
        negatively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002212'])
        positively_regulates = all_parents_subont.parents(
            key, relations=['RO:0002213'])

        def_links_unprocessed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduced by the last replace
            # definition = definition.replace('  ', ' ')
            if definition is not None and "\"" in definition:
                split_definition = definition.split("\"")
                if len(split_definition) > 1:
                    def_text = split_definition[1].strip()
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links_unprocessed.append(
                            def_links.rstrip("]").replace("[", ""))
            else:
                def_text = definition

        for def_link_str in def_links_unprocessed:
            def_link_str = def_link_str.replace("url:www", "http://www")
            def_link_str = def_link_str.replace("url:", "")
            def_link_str = def_link_str.replace("URL:", "")
            def_link_str = def_link_str.replace("\\:", ":")
            if "," in def_link_str:
                def_links = def_link_str.split(",")
                for link in def_links:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
                # elif "." in dl:
                #     dl = dl.split(".")
                #     for link in dl:
                #         if link.strip().startswith("http"):
                #             def_links_processed.append(link)
            else:
                if def_link_str.strip().startswith("http"):
                    def_links_processed.append(def_link_str)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # Need to have MODs add disease pages to their yaml stanzas.
        alt_ids = node.get('alt_id')
        if alt_ids:
            if not isinstance(alt_ids, (list, tuple)):
                alt_ids = [alt_ids]
        else:
            alt_ids = []

        dict_to_append = {
            'o_type': namespace,
            'name': node.get('label'),
            'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
            'name_key': node.get('label'),
            'oid': node['id'],
            'definition': definition,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'o_synonyms': syns,
            'isas': isas_without_names,
            'partofs': partofs_without_names,
            'regulates': regulates,
            'negatively_regulates': negatively_regulates,
            'positively_regulates': positively_regulates,
            # This data might be needed for gene descriptions.
            # Maybe it should be moved into a different method in order
            # to keep the GO/DO dicts smaller.
            # 'o_genes': [],
            # 'o_species': [],
            # 'xrefs': xrefs,
            # 'ontologyLabel': filepath,
            # TODO: fix links to not be passed for each ontology load.
            # 'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
            #             + '?species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                 + 'species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                      + 'species=Rat&x=1&acc_id=' + node['id'] + '#annot',
            # 'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                        + 'species=Human&x=1&acc_id=' + node['id'] + '#annot',
            # 'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            # 'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            # 'sgd_link': 'https://yeastgenome.org/disease/' + node['id'],
            # 'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            # 'zfin_link': 'https://zfin.org/' + node['id'],
            # 'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            # 'oPrefix': prefix,
            # 'crossReferences': xref_urls,
            # 'defText': def_text,
            # 'defLinksProcessed': def_links_processed,
            # 'oboFile': prefix,
            # 'category': 'go',
            # 'alt_ids': alt_ids,
        }

        if node['id'] == 'GO:0099616':
            print(dict_to_append)

        node = {**node, **dict_to_append}
        ont.graph.node[node["id"]] = node

    return ont
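
# The definition handling above assumes the OBO shape '"text" [xref, xref]'
# and splits on double quotes. A compact sketch of that parse under the same
# assumption (helper name is hypothetical):
def _split_obo_definition(definition):
    """Return (def_text, raw_xrefs) from an OBO-style definition string."""
    parts = definition.split('"')
    def_text = parts[1].strip() if len(parts) > 1 else definition
    links = []
    if len(parts) > 2 and "[" in parts[2]:
        raw = parts[2].strip().rstrip("]").replace("[", "")
        links = [link.strip() for link in raw.split(",") if link.strip()]
    return def_text, links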
def get_generators(self, filepath, batch_size):  # noqa TODO: Needs splitting up really
    """Get Generators."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node
    do_term_list = []
    do_isas_list = []
    do_synonyms_list = []
    do_alt_ids_list = []
    xrefs = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # - might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        def_links_unprocessed = []
        def_links_processed = []
        subset = []
        definition = ""
        is_obsolete = "false"
        ident = key

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    do_synonym = {"primary_id": key, "synonym": synonym}
                    do_synonyms_list.append(do_synonym)
            if "basicPropertyValues" in node["meta"]:
                alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "DOID:" in alt_id:
                        secondary_id = {"primary_id": key, "secondary_id": alt_id}
                        do_alt_ids_list.append(secondary_id)
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                self.ortho_xrefs(o_xrefs, ident, xrefs)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            do_isas_list.append(dictionary)

        def_links_processed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduced by the last replace
            # definition = definition.replace('  ', ' ')
            if definition is not None and "\"" in definition:
                split_definition = re.split(r'(?<!\\)"', definition)
                if len(split_definition) > 1:
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links = def_links.rstrip("]").replace("[", "")
                        def_links_unprocessed.append(def_links)

        for def_link in def_links_unprocessed:
            def_link = def_link.replace("url:www", "http://www")
            def_link = def_link.replace("url:", "")
            def_link = def_link.replace("URL:", "")
            def_link = def_link.replace("\\:", ":")
            def_link = def_link.replace('\\', '')
            if "," in def_link:
                def_link = def_link.split(",")
                for link in def_link:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
            else:
                if def_link.strip().startswith("http"):
                    def_links_processed.append(def_link)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # Need to have MODs add disease pages to their yaml stanzas.
        # NU: alt_ids = node.get('alt_id')
        # if alt_ids:
        #     if not isinstance(alt_ids, (list, tuple)):
        #         alt_ids = [alt_ids]
        # else:
        #     alt_ids = []

        # TODO: Need to add urls to resource Descriptors for SGD and MGI.
        # NOTE: MGI had one but has 'MGI:' at the end of the url, not required here.
        dict_to_append = {
            'oid': node['id'],
            'name': node.get('label'),
            'name_key': node.get('label'),
            'definition': definition,
            'defLinksProcessed': def_links_processed,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'oUrl': self.etlh.rdh2.return_url_from_key_value('DOID', node['id']),
            'rgd_link': self.etlh.rdh2.return_url_from_key_value(
                'RGD', node['id'], 'disease/all'),
            'rat_only_rgd_link': self.etlh.rdh2.return_url_from_key_value(
                'RGD', node['id'], 'disease/rat'),
            'human_only_rgd_link': self.etlh.rdh2.return_url_from_key_value(
                'RGD', node['id'], 'disease/human'),
            'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            'zfin_link': self.etlh.rdh2.return_url_from_key_value(
                'ZFIN', node['id'], 'disease'),
            'flybase_link': self.etlh.rdh2.return_url_from_key_value(
                'FB', node['id'], 'disease'),
            'wormbase_link': self.etlh.rdh2.return_url_from_key_value(
                'WB', node['id'], 'disease'),
            'sgd_link': 'https://yeastgenome.org/disease/' + node['id']
        }
        do_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [
                do_term_list, do_isas_list, do_synonyms_list, xrefs,
                do_alt_ids_list
            ]
            do_term_list = []
            do_isas_list = []
            do_synonyms_list = []
            do_alt_ids_list = []
            xrefs = []
            counter = 0

    if counter > 0:
        yield [
            do_term_list, do_isas_list, do_synonyms_list, xrefs,
            do_alt_ids_list
        ]
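
# The re.split call above uses a negative lookbehind so that escaped quotes
# (\") inside the definition text do not end the quoted span prematurely.
# A tiny standalone demonstration with made-up input:
def _demo_unescaped_quote_split():
    sample = r'"an \"escaped\" word" [PMID:1]'
    # Splits only on unescaped double quotes, returning
    # ['', 'an \\"escaped\\" word', ' [PMID:1]']
    return re.split(r'(?<!\\)"', sample)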
def get_generators(self, filepath, batch_size):
    """Get Generators."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node
    do_term_list = []
    do_isas_list = []
    do_synonyms_list = []
    do_alt_ids_list = []
    xrefs = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # - might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        local_id = None
        def_links_unprocessed = []
        def_links_processed = []
        subset = []
        definition = ""
        is_obsolete = "false"
        ident = key
        prefix = ident.split(":")[0]

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    do_synonym = {"primary_id": key, "synonym": synonym}
                    do_synonyms_list.append(do_synonym)
            if "basicPropertyValues" in node["meta"]:
                alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "DOID:" in alt_id:
                        secondary_id = {"primary_id": key, "secondary_id": alt_id}
                        do_alt_ids_list.append(secondary_id)
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                if o_xrefs is not None:
                    if isinstance(o_xrefs, list):
                        for xref_id_dict in o_xrefs:
                            xref_id = xref_id_dict["val"]
                            if ":" in xref_id:
                                local_id = xref_id.split(":")[1].strip()
                                prefix = xref_id.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(
                                    local_id, xref_id)
                                generated_xref = ETLHelper.get_xref_dict(
                                    local_id, prefix,
                                    "ontology_provided_cross_reference",
                                    "ontology_provided_cross_reference",
                                    xref_id, complete_url,
                                    xref_id + "ontology_provided_cross_reference")
                                generated_xref["oid"] = ident
                                xrefs.append(generated_xref)
                    else:
                        # TODO: Need to make sure this else is correct
                        # (reached when o_xrefs is a single string, not a list).
                        if ":" in o_xrefs:
                            local_id = o_xrefs.split(":")[1].strip()
                            prefix = o_xrefs.split(":")[0].strip()
                            complete_url = ETLHelper.get_complete_url_ont(
                                local_id, o_xrefs)
                            generated_xref = ETLHelper.get_xref_dict(
                                local_id, prefix,
                                "ontology_provided_cross_reference",
                                "ontology_provided_cross_reference",
                                o_xrefs, complete_url, o_xrefs)
                            generated_xref["oid"] = ident
                            xrefs.append(generated_xref)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(
            key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            do_isas_list.append(dictionary)

        def_links_processed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduced by the last replace
            # definition = definition.replace('  ', ' ')
            if definition is not None and "\"" in definition:
                split_definition = re.split(r'(?<!\\)"', definition)
                if len(split_definition) > 1:
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links = def_links.rstrip("]").replace("[", "")
                        def_links_unprocessed.append(def_links)

        for def_link in def_links_unprocessed:
            def_link = def_link.replace("url:www", "http://www")
            def_link = def_link.replace("url:", "")
            def_link = def_link.replace("URL:", "")
            def_link = def_link.replace("\\:", ":")
            def_link = def_link.replace('\\', '')
            if "," in def_link:
                def_link = def_link.split(",")
                for link in def_link:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
            else:
                if def_link.strip().startswith("http"):
                    def_links_processed.append(def_link)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # Need to have MODs add disease pages to their yaml stanzas.
        alt_ids = node.get('alt_id')
        if alt_ids:
            if not isinstance(alt_ids, (list, tuple)):
                alt_ids = [alt_ids]
        else:
            alt_ids = []

        dict_to_append = {
            'oid': node['id'],
            'name': node.get('label'),
            'name_key': node.get('label'),
            'definition': definition,
            'defLinksProcessed': def_links_processed,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            'rgd_link':
                'http://rgd.mcw.edu'
                + '/rgdweb/ontology/annot.html?species=All&x=1&acc_id='
                + node['id'] + '#annot',
            'rat_only_rgd_link':
                'http://rgd.mcw.edu'
                + '/rgdweb/ontology/annot.html?species=Rat&x=1&acc_id='
                + node['id'] + '#annot',
            'human_only_rgd_link':
                'http://rgd.mcw.edu'
                + '/rgdweb/ontology/annot.html?species=Human&x=1&acc_id='
                + node['id'] + '#annot',
            'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            'zfin_link': 'https://zfin.org/' + node['id'],
            'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            'sgd_link': 'https://yeastgenome.org/disease/' + node['id']
        }
        do_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [
                do_term_list, do_isas_list, do_synonyms_list, xrefs,
                do_alt_ids_list
            ]
            do_term_list = []
            do_isas_list = []
            do_synonyms_list = []
            do_alt_ids_list = []
            xrefs = []
            counter = 0

    if counter > 0:
        yield [
            do_term_list, do_isas_list, do_synonyms_list, xrefs,
            do_alt_ids_list
        ]
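
# The replace chain above normalizes legacy 'url:'/'URL:' prefixes and
# unescapes colons before the http filter runs. The same substitutions,
# table-driven, as a hedged sketch (helper name is hypothetical; order
# matters, since '\\:' must be rewritten before bare backslashes are
# stripped):
def _clean_def_link(raw_link):
    for old, new in (("url:www", "http://www"), ("url:", ""),
                     ("URL:", ""), ("\\:", ":"), ("\\", "")):
        raw_link = raw_link.replace(old, new)
    return raw_link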