def learn(resource, input, outdir, target_assocfile, target_ontology, target_root_class):
    """
    Learn association rules
    """
    logging.basicConfig(level=logging.INFO)
    afa = AssociationSetFactory()
    ofa = OntologyFactory()
    ont = ofa.create(resource)
    aset = afa.create_from_file(file=input, ontology=ont, fmt=None)
    learner = ol.OntologyLearner(assocs=aset)
    isa_ont = ont.subontology(relations=['subClassOf'])
    if target_root_class:
        learner.split_assocs(target_root_class, ontology=isa_ont)
    if target_ontology:
        learner.target_ontology = ofa.create(target_ontology)
    if target_assocfile:
        tont = ont
        if learner.target_ontology is not None:
            tont = learner.target_ontology
        learner.target_assocs = afa.create_from_file(target_assocfile, ontology=tont, fmt=None)
    with open(outdir + '/index.md', 'w') as file:
        learner.fit_all(dir=outdir, reportfile=file)
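# A hedged usage sketch for learn() above, not part of the original code: the
# resource name and file paths are hypothetical placeholders, and the call
# assumes the same ontobio objects (OntologyFactory, AssociationSetFactory,
# ontobio.ontol_learner as ol) that learn() itself relies on. If learn() is a
# decorated CLI command, it would normally be invoked via its CLI instead.
#
#     learn(resource='go',
#           input='my_associations.gaf',
#           outdir='learn-output',
#           target_assocfile=None,
#           target_ontology=None,
#           target_root_class=None)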
def test_merge():
    factory = OntologyFactory()
    print("Creating ont")
    ont = factory.create('tests/resources/lexmap_test.json')
    ont2 = Ontology()
    ont2.merge([ont])
    assert ont2.xref_graph is not None
def test_lexmap_multi():
    """
    Test lexical mapping across multiple ontologies
    """
    factory = OntologyFactory()
    print("Creating ont")
    files = ['x', 'm', 'h', 'bto']
    onts = [factory.create('tests/resources/autopod-{}.json'.format(f)) for f in files]
    lexmap = LexicalMapEngine()
    lexmap.index_ontologies(onts)
    # print(lexmap.lmap)
    # print(ont.all_synonyms())
    g = lexmap.get_xref_graph()
    for x in g.nodes():
        print("{} --> {}".format(x, lexmap.grouped_mappings(x)))
    for x, y, d in g.edges_iter(data=True):
        cl = nx.ancestors(g, x)
        print("{} '{}' <-> {} '{}' :: {} CLOSURE={}".format(
            x, lexmap.label(x), y, lexmap.label(y), d, len(cl)))
        cpr = d[lexmap.CONDITIONAL_PR]
        assert cpr > 0 and cpr <= 1.0
    unmapped = lexmap.unmapped_nodes(g)
    print('U: {}'.format(len(unmapped)))
    unmapped = lexmap.unmapped_nodes(g, rs_threshold=4)
    print('U4: {}'.format(len(unmapped)))
    cliques = lexmap.cliques(g)
    maxc = max(cliques, key=len)
    print('CLIQUES: {}'.format(cliques))
    print('MAX CLIQUES: {}'.format(maxc))
    df = lexmap.as_dataframe(g)
    print(df.to_csv(sep="\t"))
def test_subontology():
    """
    subontology
    """
    factory = OntologyFactory()
    print("Creating ont")
    ont = factory.create('go')
    assert ont.is_obsolete('GO:0000267') == True
    print("ONT NODES: {}".format(ont.nodes()))
    subont = ont.subontology(relations=['subClassOf'])
    PERM = 'GO:1990578'
    print("NODES: {}".format(subont.nodes()))
    ancs = subont.ancestors(PERM, reflexive=True)
    print(str(ancs))
    for a in ancs:
        print(" ANC: {} '{}'".format(a, subont.label(a)))
    assert len(ancs) > 0
    assert subont.is_obsolete('GO:0000267') == True
    w = GraphRenderer.create('tree')
    w.write_subgraph(ont, ancs)
    # TODO: sub-ontology does not create full metadata
    w = GraphRenderer.create('obo')
    w.write_subgraph(ont, ancs)
def test_set_ontology(self):
    ontology = OntologyFactory().create()
    for i in range(4):
        ontology.add_node(i, 'node' + str(i))
    ontology.add_parent(1, 0)
    ontology.add_parent(2, 0)
    ontology.add_parent(3, 0)
    self.df.set_ontology(ontology_type=DataType.GO, ontology=ontology, config=self.conf_parser)
    self.assertTrue(list(self.df.go_ontology.nodes()) == list(ontology.nodes()))
def test_dynamic_query():
    """
    Dynamic query
    """
    factory = OntologyFactory()
    print("Creating ont")
    ont = factory.create('pato')
    ids = ont.sparql(body="{?x rdfs:subClassOf+ " + SHAPE + "}",
                     inject_prefixes=ont.prefixes(),
                     single_column=True)
    assert Y_SHAPED in ids
    assert ABSENT not in ids
def load_from_files(self, files: List[str]) -> None:
    """
    Load an ontology from one or more obojson files, merging them into a single ontology

    :param files: list of files in obojson format
    :return:
    """
    factory = OntologyFactory()
    ont = None
    for file in files:
        if ont is None:
            ont = factory.create(file)
        else:
            ont.merge([factory.create(file)])
    self.ontology = ont
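# A hedged usage sketch for load_from_files() above, not part of the original
# code: OntologyLoader is a hypothetical stand-in for whichever class owns this
# method, and the obojson paths are placeholders.
#
#     loader = OntologyLoader()
#     loader.load_from_files(['go-basic.json', 'ro.json'])
#     print(len(list(loader.ontology.nodes())))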
def test_expand():
    factory = OntologyFactory()
    ontobj = factory.create("tests/resources/goslim_pombe.json")
    expand_tsv(INPUT, ontology=ontobj, outfile=open(OUTPUT, "w"), cols=["term"])
    reader = csv.DictReader(open(OUTPUT, "r"), delimiter='\t')
    n = 0
    for row in reader:
        if row['term'] == 'GO:0002181':
            assert row['term_label'] == 'cytoplasmic translation'
            n += 1
        if row['term'] == 'FAKE:123':
            assert row['term_label'] == ''
            n += 1
    assert n == 2
def test_lexmap_basic():
    """
    Test lexical mapping within a single ontology
    """
    factory = OntologyFactory()
    print("Creating ont")
    ont = factory.create('tests/resources/lexmap_test.json')
    lexmap = LexicalMapEngine()
    lexmap.index_ontology(ont)
    print(lexmap.lmap)
    print(ont.all_synonyms())
    g = lexmap.get_xref_graph()
    for x, y, d in g.edges_iter(data=True):
        print("{}<->{} :: {}".format(x, y, d))
    for x in g.nodes():
        print("{} --> {}".format(x, lexmap.grouped_mappings(x)))
    assert g.has_edge('Z:2', 'ZZ:2')  # roman numerals
    assert g.has_edge('Z:2', 'Y:2')   # case insensitivity
    assert g.has_edge('A:1', 'B:1')   # synonyms
    assert g.has_edge('B:1', 'A:1')   # bidirectional
    for x, y, d in g.edges_iter(data=True):
        print("{}<->{} :: {}".format(x, y, d))
        cpr = d[lexmap.CONDITIONAL_PR]
        assert cpr > 0 and cpr <= 1.0
    df = lexmap.as_dataframe(g)
    print(df.to_csv(sep="\t"))

    lexmap = LexicalMapEngine(
        config=dict(synsets=[dict(word="", synonym="ignoreme", weight=-2.0)],
                    normalized_form_confidence=0.25,
                    abbreviation_confidence=0.5,
                    meaningful_ids=True,
                    ontology_configurations=[dict(prefix='AA', normalized_form_confidence=-1000)]))
    assert len(lexmap._get_config_val('NULL', 'synsets')) == 1
    assert lexmap._normalize_label('ignoreme foo', {'ignoreme': ''}) == 'foo'
    assert lexmap._normalize_label('replaceme foo', {'replaceme': 'zz'}) == 'foo zz'
    ont.add_node('TEST:1', 'foo bar')
    ont.add_node('TEST:2', 'bar foo')
    ont.add_node('TEST:3', 'foo bar')
    ont.add_node('TEST:4', 'wiz')
    syn = Synonym('TEST:4', val='bar foo', pred='hasRelatedSynonym')
    ont.add_synonym(syn)
    ont.add_node('http://x.org/wiz#FooBar')
    ont.add_node('TEST:6', '123')
    ont.add_node('TEST:7', '123')
    ont.add_node('TEST:8', 'bar ignoreme foo')
    ont.add_node('AA:1', 'foo bar')
    ont.add_node('AA:2', 'bar foo')
    ont.add_node('ABBREV:1', 'ABCD')
    ont.add_node('ABBREV:2', 'ABCD')
    for s in ont.synonyms('TEST:4'):
        print('S={}'.format(s))
    lexmap.index_ontology(ont)
    g = lexmap.get_xref_graph()
    for x, d in g['TEST:1'].items():
        print('XREF: {} = {}'.format(x, d))
    assert g.has_edge('TEST:1', 'TEST:2')  # normalized
    logging.info('E 1-2 = {}'.format(g['TEST:1']['TEST:2']))
    assert int(g['TEST:1']['TEST:2']['score']) == 25
    assert int(g['TEST:1']['TEST:3']['score']) == 100
    assert int(g['TEST:1']['TEST:4']['score']) < 25
    assert g.has_edge('TEST:3', 'http://x.org/wiz#FooBar')  # IDs and CamelCase
    assert not g.has_edge('TEST:6', 'TEST:7')  # should omit syns with no alphanumeric
    # test exclude normalized form
    assert not g.has_edge('AA:1', 'AA:2')
    # test custom synsets are used
    assert g.has_edge('TEST:8', 'TEST:2')
    assert g.has_edge('TEST:8', 'AA:2')
    assert not g.has_edge('TEST:8', 'AA:1')  # do not normalize AAs
    assert lexmap.smap['ABBREV:1'][0].is_abbreviation()
    assert lexmap.smap['ABBREV:2'][0].is_abbreviation()
    assert g.has_edge('ABBREV:1', 'ABBREV:2')
    assert int(g['ABBREV:1']['ABBREV:2']['score']) == 25
    df = lexmap.unmapped_dataframe(g)
    print(df.to_csv())
def test_remote_sparql_pato():
    """
    Load ontology from remote SPARQL endpoint
    """
    factory = OntologyFactory()
    print("Creating ont")
    ont = factory.create('pato')
    ploidy = ont.node(PLOIDY)
    print("PLOIDY: {}".format(ploidy))
    assert ont.label(PLOIDY) == 'ploidy'

    # exact match
    search_results = ont.search('shape')
    print("SEARCH (exact): {}".format(search_results))
    assert [SHAPE] == search_results

    # implicit regexp
    search_results = ont.search('%shape%')
    print("SEARCH (re, implicit): {}".format(search_results))
    assert SHAPE in search_results
    assert len(search_results) > 10

    # explicit regexp
    search_results = ont.search('.*shape.*', is_regex=True)
    print("SEARCH (re, explicit): {}".format(search_results))
    assert SHAPE in search_results
    assert len(search_results) > 10

    # syns
    syn = 'cone-shaped'
    search_results = ont.search(syn, synonyms=False)
    print("SEARCH (no syns): {}".format(search_results))
    assert [] == search_results
    # search_results = ont.search(syn, synonyms=True)
    # print("SEARCH (with syns): {}".format(search_results))
    # assert [CONICAL] == search_results

    num_nodes = 0
    for n in ont.nodes():
        num_nodes = num_nodes + 1
    assert num_nodes > 100

    ancs = ont.ancestors(PLOIDY)
    print("ANCS ploidy (all): {}".format(ancs))
    assert QUALITY in ancs
    assert PENTAPLOID not in ancs

    ancs = ont.ancestors(PLOIDY, relations=['subClassOf'])
    print("ANCS ploidy (subClassOf): {}".format(ancs))
    assert QUALITY in ancs
    assert PENTAPLOID not in ancs

    # this is a non-use case
    ancs = ont.ancestors(SWOLLEN, relations=[HAS_PART])
    print("ANCS swollen (has_part): {}".format(ancs))
    assert INCREASED_SIZE in ancs
    assert PROTRUDING in ancs
    assert len(ancs) == 2

    ancs = ont.ancestors(SWOLLEN, relations=['subClassOf'])
    print("ANCS swollen (subClassOf): {}".format(ancs))
    assert MORPHOLOGY in ancs
    assert QUALITY in ancs
    assert PROTRUDING not in ancs

    decs = ont.descendants(PLOIDY)
    print("DECS ploidy (all): {}".format(decs))
    assert QUALITY not in decs
    assert EUPLOID in decs
    assert PENTAPLOID in decs

    # this is a non-use case
    ancs = ont.descendants(INCREASED_SIZE, relations=[HAS_PART])
    print("DECS increased size (has_part): {}".format(ancs))
    assert SWOLLEN in ancs
    assert len(ancs) == 1

    subsets = ont.subsets()
    print("SUBSETS: {}".format(subsets))

    slim = ont.extract_subset('absent_slim')
    print("SLIM: {}".format(slim))
    assert ABSENT in slim
    assert QUALITY not in slim

    syns = ont.synonyms(INCREASED_SIZE)
    print("SYNS: {}".format(syns))
    syn_vals = [syn.val for syn in syns]
    assert 'big' in syn_vals
    [bigsyn] = [syn for syn in syns if syn.val == 'big']
    # TODO xrefs
    assert not bigsyn.exact_or_label()
    assert bigsyn.scope() == 'RELATED'

    w = GraphRenderer.create('obo')
    w.write_subgraph(ont, [INCREASED_SIZE])
def get_generators(self, filepath, batch_size):
    """Get Generators"""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node
    do_term_list = []
    do_isas_list = []
    do_synonyms_list = []
    do_alt_ids_list = []
    xrefs = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # - might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        local_id = None
        def_links_unprocessed = []
        def_links_processed = []
        subset = []
        definition = ""
        is_obsolete = "false"
        ident = key
        prefix = ident.split(":")[0]

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    do_synonym = {"primary_id": key, "synonym": synonym}
                    do_synonyms_list.append(do_synonym)
            if "basicPropertyValues" in node["meta"]:
                alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "DOID:" in alt_id:
                        secondary_id = {"primary_id": key, "secondary_id": alt_id}
                        do_alt_ids_list.append(secondary_id)
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                if o_xrefs is not None:
                    for xref_id_dict in o_xrefs:
                        xref_id = xref_id_dict["val"]
                        if ":" in xref_id:
                            local_id = xref_id.split(":")[1].strip()
                            prefix = xref_id.split(":")[0].strip()
                            complete_url = ETLHelper.get_complete_url_ont(local_id, xref_id)
                            generated_xref = ETLHelper.get_xref_dict(
                                local_id,
                                prefix,
                                "ontology_provided_cross_reference",
                                "ontology_provided_cross_reference",
                                xref_id,
                                complete_url,
                                xref_id + "ontology_provided_cross_reference")
                            generated_xref["oid"] = ident
                            xrefs.append(generated_xref)
                        else:
                            # TODO Need to make sure this else is correct
                            if ":" in o_xrefs:
                                local_id = o_xrefs.split(":")[1].strip()
                                prefix = o_xrefs.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(local_id, o_xrefs)
                                generated_xref = ETLHelper.get_xref_dict(
                                    local_id,
                                    prefix,
                                    "ontology_provided_cross_reference",
                                    "ontology_provided_cross_reference",
                                    o_xrefs,
                                    complete_url,
                                    o_xrefs)
                                generated_xref["oid"] = ident
                                xrefs.append(generated_xref)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)
        isas_without_names = all_parents_subont.parents(key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            do_isas_list.append(dictionary)

        def_links_processed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduced in the last replace
            # definition = definition.replace('  ', ' ')
            if definition is not None and "\"" in definition:
                split_definition = re.split(r'(?<!\\)"', definition)
                if len(split_definition) > 1:
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links = def_links.rstrip("]").replace("[", "")
                        def_links_unprocessed.append(def_links)

        for def_link in def_links_unprocessed:
            def_link = def_link.replace("url:www", "http://www")
            def_link = def_link.replace("url:", "")
            def_link = def_link.replace("URL:", "")
            def_link = def_link.replace("\\:", ":")
            def_link = def_link.replace('\\', '')
            if "," in def_link:
                def_link = def_link.split(",")
                for link in def_link:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
            else:
                if def_link.strip().startswith("http"):
                    def_links_processed.append(def_link)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # need to have MODs add disease pages to their yaml stanzas

        alt_ids = node.get('alt_id')
        if alt_ids:
            if not isinstance(alt_ids, (list, tuple)):
                alt_ids = [alt_ids]
        else:
            alt_ids = []

        dict_to_append = {
            'oid': node['id'],
            'name': node.get('label'),
            'name_key': node.get('label'),
            'definition': definition,
            'defLinksProcessed': def_links_processed,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            'rgd_link': 'http://rgd.mcw.edu'
                        + '/rgdweb/ontology/annot.html?species=All&x=1&acc_id=' + node['id'] + '#annot',
            'rat_only_rgd_link': 'http://rgd.mcw.edu'
                                 + '/rgdweb/ontology/annot.html?species=Rat&x=1&acc_id=' + node['id'] + '#annot',
            'human_only_rgd_link': 'http://rgd.mcw.edu'
                                   + '/rgdweb/ontology/annot.html?species=Human&x=1&acc_id=' + node['id'] + '#annot',
            'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            'zfin_link': 'https://zfin.org/' + node['id'],
            'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            'sgd_link': 'https://yeastgenome.org/disease/' + node['id']
        }
        do_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [do_term_list, do_isas_list, do_synonyms_list, xrefs, do_alt_ids_list]
            do_term_list = []
            do_isas_list = []
            do_synonyms_list = []
            do_alt_ids_list = []
            xrefs = []
            counter = 0

    if counter > 0:
        yield [do_term_list, do_isas_list, do_synonyms_list, xrefs, do_alt_ids_list]
def get_data(self, filepath):  # noqa
    """Get Data."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key in parsed_line.keys():
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        # So code commented out with NU: at start means it is Not Used.
        # NU: xrefs = []
        # NU: xref_urls = []
        # NU: def_links_unprocessed = []
        # NU: def_links_processed = []
        subset = []
        definition = ""
        namespace = ""
        is_obsolete = "false"
        # NU: ident = key

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
            # NU: leave in call commented out in case it is used at a later time
            # if "xrefs" in node["meta"]:
            #     o_xrefs = node["meta"].get('xrefs')
            #     self.ortho_xrefs(o_xrefs, ident, xref_urls)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                # NU: def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets
            if "basicPropertyValues" in node['meta']:
                for bpv in node['meta']['basicPropertyValues']:
                    if bpv.get('pred') == 'OIO:hasOBONamespace':
                        namespace = bpv.get('val')
                        break

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)
        isas_without_names = all_parents_subont.parents(key, relations=['subClassOf'])
        partofs_without_names = all_parents_subont.parents(key, relations=['BFO:0000050'])
        regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
        negatively_regulates = all_parents_subont.parents(key, relations=['RO:0002212'])
        positively_regulates = all_parents_subont.parents(key, relations=['RO:0002213'])

        # NU: def_links_unprocessed = []
        # def_links = ""
        if definition is None:
            definition = ""
        # else:
        #     if definition is not None and "\"" in definition:
        #         split_definition = definition.split("\"")
        #         if len(split_definition) > 1:
        #             if len(split_definition) > 2 and "[" in split_definition[2].strip():
        #                 def_links = split_definition[2].strip()
        #                 def_links_unprocessed.append(def_links.rstrip("]").replace("[", ""))

        # NU: def_links_processed not used later, it is commented out.
        # for def_link_str in def_links_unprocessed:
        #     def_link_str = def_link_str.replace("url:www", "http://www")
        #     def_link_str = def_link_str.replace("url:", "")
        #     def_link_str = def_link_str.replace("URL:", "")
        #     def_link_str = def_link_str.replace("\\:", ":")
        #     if "," in def_link_str:
        #         def_links = def_link_str.split(",")
        #         for link in def_links:
        #             if link.strip().startswith("http"):
        #                 def_links_processed.append(link)
        #     else:
        #         if def_link_str.strip().startswith("http"):
        #             def_links_processed.append(def_link_str)

        # NU: alt_ids = node.get('alt_id')
        # if alt_ids:
        #     if not isinstance(alt_ids, (list, tuple)):
        #         alt_ids = [alt_ids]
        # else:
        #     alt_ids = []

        dict_to_append = {
            'o_type': namespace,
            'name': node.get('label'),
            'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
            'name_key': node.get('label'),
            'oid': node['id'],
            'definition': definition,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'o_synonyms': syns,
            'isas': isas_without_names,
            'partofs': partofs_without_names,
            'regulates': regulates,
            'negatively_regulates': negatively_regulates,
            'positively_regulates': positively_regulates,
            # This data might be needed for gene descriptions
            # Maybe should be turned into a different method in order
            # to keep the go do dict's smaller
            # 'o_genes': [],
            # 'o_species': [],
            # 'xrefs': xrefs,
            # 'ontologyLabel': filepath,
            # TODO: fix links to not be passed for each ontology load.
            # 'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
            #             + '?species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                 + 'species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                      + 'species=Rat&x=1&acc_id=' + node['id'] + '#annot',
            # 'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                        + 'species=Human&x=1&acc_id=' + node['id'] + '#annot',
            # 'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            # 'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            # 'sgd_link': 'https://yeastgenome.org/disease/' + node['id'],
            # 'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            # 'zfin_link': 'https://zfin.org/' + node['id'],
            # 'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            # 'oPrefix': prefix,
            # 'crossReferences': xref_urls,
            # 'defText': def_text,
            # 'defLinksProcessed': def_links_processed,
            # 'oboFile': prefix,
            # 'category': 'go',
            # 'alt_ids': alt_ids,
        }

        if node['id'] == 'GO:0099616':
            self.logger.debug(dict_to_append)

        node = {**node, **dict_to_append}
        ont.graph.node[node["id"]] = node

    return ont
def get_generators(self, filepath, batch_size):  # noqa TODO: needs splitting up really
    """Get Generators."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node
    do_term_list = []
    do_isas_list = []
    do_synonyms_list = []
    do_alt_ids_list = []
    xrefs = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        # Switching id to curie form and saving URI in "uri"
        # - might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        def_links_unprocessed = []
        def_links_processed = []
        subset = []
        definition = ""
        is_obsolete = "false"
        ident = key

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    do_synonym = {"primary_id": key, "synonym": synonym}
                    do_synonyms_list.append(do_synonym)
            if "basicPropertyValues" in node["meta"]:
                alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "DOID:" in alt_id:
                        secondary_id = {"primary_id": key, "secondary_id": alt_id}
                        do_alt_ids_list.append(secondary_id)
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                self.ortho_xrefs(o_xrefs, ident, xrefs)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)
        isas_without_names = all_parents_subont.parents(key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            do_isas_list.append(dictionary)

        def_links_processed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduced in the last replace
            # definition = definition.replace('  ', ' ')
            if definition is not None and "\"" in definition:
                split_definition = re.split(r'(?<!\\)"', definition)
                if len(split_definition) > 1:
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links = def_links.rstrip("]").replace("[", "")
                        def_links_unprocessed.append(def_links)

        for def_link in def_links_unprocessed:
            def_link = def_link.replace("url:www", "http://www")
            def_link = def_link.replace("url:", "")
            def_link = def_link.replace("URL:", "")
            def_link = def_link.replace("\\:", ":")
            def_link = def_link.replace('\\', '')
            if "," in def_link:
                def_link = def_link.split(",")
                for link in def_link:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
            else:
                if def_link.strip().startswith("http"):
                    def_links_processed.append(def_link)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # need to have MODs add disease pages to their yaml stanzas
        # NU: alt_ids = node.get('alt_id')
        # if alt_ids:
        #     if not isinstance(alt_ids, (list, tuple)):
        #         alt_ids = [alt_ids]
        # else:
        #     alt_ids = []

        # TODO: Need to add urls to resource Descriptors for SGD and MGI.
        # NOTE: MGI had one but has 'MGI:' at the end of the url not required here.
        dict_to_append = {
            'oid': node['id'],
            'name': node.get('label'),
            'name_key': node.get('label'),
            'definition': definition,
            'defLinksProcessed': def_links_processed,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'oUrl': self.etlh.rdh2.return_url_from_key_value('DOID', node['id']),
            'rgd_link': self.etlh.rdh2.return_url_from_key_value('RGD', node['id'], 'disease/all'),
            'rat_only_rgd_link': self.etlh.rdh2.return_url_from_key_value('RGD', node['id'], 'disease/rat'),
            'human_only_rgd_link': self.etlh.rdh2.return_url_from_key_value('RGD', node['id'], 'disease/human'),
            'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            'zfin_link': self.etlh.rdh2.return_url_from_key_value('ZFIN', node['id'], 'disease'),
            'flybase_link': self.etlh.rdh2.return_url_from_key_value('FB', node['id'], 'disease'),
            'wormbase_link': self.etlh.rdh2.return_url_from_key_value('WB', node['id'], 'disease'),
            'sgd_link': 'https://yeastgenome.org/disease/' + node['id']
        }
        do_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [do_term_list, do_isas_list, do_synonyms_list, xrefs, do_alt_ids_list]
            do_term_list = []
            do_isas_list = []
            do_synonyms_list = []
            do_alt_ids_list = []
            xrefs = []
            counter = 0

    if counter > 0:
        yield [do_term_list, do_isas_list, do_synonyms_list, xrefs, do_alt_ids_list]
def get_data(self, filepath):
    """Get Data"""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key in parsed_line.keys():
        node = ont.graph.node[key]
        if len(node) == 0:
            continue

        ### Switching id to curie form and saving URI in "uri"
        # might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        syns = []
        xrefs = []
        xref_urls = []
        local_id = None
        def_links_unprocessed = []
        def_links_processed = []
        def_text = None
        subset = []
        definition = ""
        namespace = ""
        is_obsolete = "false"
        ident = key
        prefix = ident.split(":")[0]

        if syns is None:
            syns = []  # Set the synonyms to an empty array if None. Necessary for Neo4j parsing

        if "meta" in node:
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
            if "xrefs" in node["meta"]:
                o_xrefs = node["meta"].get('xrefs')
                if o_xrefs is not None:
                    for xref_id_dict in o_xrefs:
                        xref_id = xref_id_dict["val"]
                        if ":" in xref_id:
                            local_id = xref_id.split(":")[1].strip()
                            prefix = xref_id.split(":")[0].strip()
                            complete_url = ETLHelper.get_complete_url_ont(local_id, xref_id)
                            generated_xref = ETLHelper.get_xref_dict(
                                local_id,
                                prefix,
                                "ontology_provided_cross_reference",
                                "ontology_provided_cross_reference",
                                xref_id,
                                complete_url,
                                xref_id + "ontology_provided_cross_reference")
                            generated_xref["oid"] = ident
                            xref_urls.append(generated_xref)
                        else:
                            if ":" in o_xrefs:
                                local_id = o_xrefs.split(":")[1].strip()
                                prefix = o_xrefs.split(":")[0].strip()
                                complete_url = ETLHelper.get_complete_url_ont(local_id, o_xrefs)
                                generated_xref = ETLHelper.get_xref_dict(
                                    local_id,
                                    prefix,
                                    "ontology_provided_cross_reference",
                                    "ontology_provided_cross_reference",
                                    o_xrefs,
                                    complete_url,
                                    o_xrefs)
                                generated_xref["oid"] = ident
                                xref_urls.append(generated_xref)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
                def_links_unprocessed = node["meta"]["definition"]["xrefs"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets
            if "basicPropertyValues" in node['meta']:
                for bpv in node['meta']['basicPropertyValues']:
                    if bpv.get('pred') == 'OIO:hasOBONamespace':
                        namespace = bpv.get('val')
                        break

        # Set the xrefs to an empty array if None. Necessary for Neo4j parsing
        if xrefs is None:
            xrefs = []

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)
        isas_without_names = all_parents_subont.parents(key, relations=['subClassOf'])
        partofs_without_names = all_parents_subont.parents(key, relations=['BFO:0000050'])
        regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
        negatively_regulates = all_parents_subont.parents(key, relations=['RO:0002212'])
        positively_regulates = all_parents_subont.parents(key, relations=['RO:0002213'])

        def_links_unprocessed = []
        def_links = ""
        if definition is None:
            definition = ""
        else:
            # Remove new lines that cause this to split across two lines in the file
            # definition = definition.replace('\n', ' ')
            # Remove any extra double space that might have been introduced in the last replace
            # definition = definition.replace('  ', ' ')
            if definition is not None and "\"" in definition:
                split_definition = definition.split("\"")
                if len(split_definition) > 1:
                    def_text = split_definition[1].strip()
                    if len(split_definition) > 2 and "[" in split_definition[2].strip():
                        def_links = split_definition[2].strip()
                        def_links_unprocessed.append(def_links.rstrip("]").replace("[", ""))
            else:
                def_text = definition

        for def_link_str in def_links_unprocessed:
            def_link_str = def_link_str.replace("url:www", "http://www")
            def_link_str = def_link_str.replace("url:", "")
            def_link_str = def_link_str.replace("URL:", "")
            def_link_str = def_link_str.replace("\\:", ":")
            if "," in def_link_str:
                def_links = def_link_str.split(",")
                for link in def_links:
                    if link.strip().startswith("http"):
                        def_links_processed.append(link)
                # elif "." in dl:
                #     dl = dl.split(".")
                #     for link in dl:
                #         if link.strip().startswith("http"):
                #             def_links_processed.append(link)
            else:
                if def_link_str.strip().startswith("http"):
                    def_links_processed.append(def_link_str)

        # TODO: make this a generic section based on the resourceDescriptor.yaml file.
        # need to have MODs add disease pages to their yaml stanzas

        alt_ids = node.get('alt_id')
        if alt_ids:
            if not isinstance(alt_ids, (list, tuple)):
                alt_ids = [alt_ids]
        else:
            alt_ids = []

        dict_to_append = {
            'o_type': namespace,
            'name': node.get('label'),
            'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
            'name_key': node.get('label'),
            'oid': node['id'],
            'definition': definition,
            'is_obsolete': is_obsolete,
            'subset': subset,
            'o_synonyms': syns,
            'isas': isas_without_names,
            'partofs': partofs_without_names,
            'regulates': regulates,
            'negatively_regulates': negatively_regulates,
            'positively_regulates': positively_regulates,
            ### This data might be needed for gene descriptions
            ### Maybe should be turned into a different method in order
            ### to keep the go do dict's smaller
            # 'o_genes': [],
            # 'o_species': [],
            # 'xrefs': xrefs,
            # 'ontologyLabel': filepath,
            # TODO: fix links to not be passed for each ontology load.
            # 'rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html'
            #             + '?species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rgd_all_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                 + 'species=All&x=1&acc_id=' + node['id'] + '#annot',
            # 'rat_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                      + 'species=Rat&x=1&acc_id=' + node['id'] + '#annot',
            # 'human_only_rgd_link': 'http://rgd.mcw.edu/rgdweb/ontology/annot.html?'
            #                        + 'species=Human&x=1&acc_id=' + node['id'] + '#annot',
            # 'mgi_link': 'http://www.informatics.jax.org/disease/' + node['id'],
            # 'wormbase_link': 'http://www.wormbase.org/resources/disease/' + node['id'],
            # 'sgd_link': 'https://yeastgenome.org/disease/' + node['id'],
            # 'flybase_link': 'http://flybase.org/cgi-bin/cvreport.html?id=' + node['id'],
            # 'zfin_link': 'https://zfin.org/' + node['id'],
            # 'oUrl': "http://www.disease-ontology.org/?id=" + node['id'],
            # 'oPrefix': prefix,
            # 'crossReferences': xref_urls,
            # 'defText': def_text,
            # 'defLinksProcessed': def_links_processed,
            # 'oboFile': prefix,
            # 'category': 'go',
            # 'alt_ids': alt_ids,
        }

        if node['id'] == 'GO:0099616':
            print(dict_to_append)

        node = {**node, **dict_to_append}
        ont.graph.node[node["id"]] = node

    return ont
def test_set_covering_with_ontology(self):
    # DAG used in this test, with information content (ic) per node:
    #
    #          0                   ic(0) = 0
    #        / | | \
    #       /  | |  \
    #      1   2 3   4             ic(1) = 0.693147181, ic(2) = 0.470003629, ic(3) = 0.980829253
    #     / \ / \ / \ /
    #    /   5   6   7             ic(5) = 0.980829253, ic(6) = 1.16315081, ic(7) = 1.16315081
    #    |  / \   \ /
    #    | 8   9   10              ic(8) = 1.049822124, ic(10) = 1.252762968
    #     \|    \ /  \
    #     11    12    13           ic(11) = 1.386294361, ic(12) = 1.386294361, ic(13) = 1.386294361
    ontology = OntologyFactory().create()
    for i in range(14):
        ontology.add_node(i, 'node' + str(i))
    ontology.add_parent(1, 0)
    ontology.add_parent(2, 0)
    ontology.add_parent(3, 0)
    ontology.add_parent(4, 0)
    ontology.add_parent(5, 1)
    ontology.add_parent(5, 2)
    ontology.add_parent(6, 2)
    ontology.add_parent(6, 3)
    ontology.add_parent(7, 3)
    ontology.add_parent(7, 4)
    ontology.add_parent(8, 5)
    ontology.add_parent(9, 5)
    ontology.add_parent(10, 6)
    ontology.add_parent(10, 7)
    ontology.add_parent(11, 1)
    ontology.add_parent(11, 8)
    ontology.add_parent(12, 8)
    ontology.add_parent(12, 9)
    ontology.add_parent(13, 10)
    subsets = [
        CommonAncestor(node_id=1, node_label="1", covered_starting_nodes={"11", "12"}),
        CommonAncestor(node_id=2, node_label="2", covered_starting_nodes={"11", "12", "13"}),
        CommonAncestor(node_id=3, node_label="3", covered_starting_nodes={"13"}),
        CommonAncestor(node_id=4, node_label="4", covered_starting_nodes={"13"}),
        CommonAncestor(node_id=5, node_label="2", covered_starting_nodes={"11", "12"}),
        CommonAncestor(node_id=6, node_label="6", covered_starting_nodes={"13"}),
        CommonAncestor(node_id=7, node_label="7", covered_starting_nodes={"13"}),
        CommonAncestor(node_id=8, node_label="8", covered_starting_nodes={"11", "12"}),
        CommonAncestor(node_id=9, node_label="9", covered_starting_nodes={"12"}),
        CommonAncestor(node_id=10, node_label="10", covered_starting_nodes={"13"}),
        CommonAncestor(node_id=11, node_label="11", covered_starting_nodes={"11"}),
        CommonAncestor(node_id=12, node_label="12", covered_starting_nodes={"12"}),
        CommonAncestor(node_id=13, node_label="13", covered_starting_nodes={"13"})
    ]
    values = [1, 1, 1, 1, 1, 1, 1, 20, 1, 1, 100, 1, 1]
    res = find_set_covering(subsets=subsets, ontology=ontology, value=values, max_num_subsets=2)
    self.assertTrue(all([sub[0] != 11 for sub in res]))
def get_generators(self, filepath, batch_size):  # noqa
    """Get Generators."""
    ont = OntologyFactory().create(filepath)
    parsed_line = ont.graph.copy().node
    go_term_list = []
    go_isas_list = []
    go_partofs_list = []
    go_synonyms_list = []
    go_regulates_list = []
    go_negatively_regulates_list = []
    go_positively_regulates_list = []
    go_altids_list = []
    counter = 0

    # Convert parsed obo term into a schema-friendly AGR dictionary.
    for key, line in parsed_line.items():
        counter = counter + 1
        node = ont.graph.node[key]
        if len(node) == 0:
            continue
        if node.get('type') == 'PROPERTY':
            continue

        # Switching id to curie form and saving URI in "uri"
        # might wildly break things later on???
        node["uri"] = node["id"]
        node["id"] = key

        subset = []
        definition = ""
        is_obsolete = "false"

        if "meta" in node:
            meta = node.get('meta')
            basic_property_values = meta.get('basicPropertyValues')
            for property_value_map in basic_property_values:
                pred = property_value_map['pred']
                val = property_value_map['val']
                if pred == 'OIO:hasOBONamespace':
                    term_type = val
            if "synonyms" in node["meta"]:
                syns = [s["val"] for s in node["meta"]["synonyms"]]
                for synonym in syns:
                    go_synonym = {"primary_id": key, "synonym": synonym}
                    go_synonyms_list.append(go_synonym)
            if "basicPropertyValues" in node["meta"]:
                alt_ids = [s["val"] for s in node["meta"]["basicPropertyValues"]]
                for alt_id in alt_ids:
                    if "GO:" in alt_id:
                        secondary_id = {"primary_id": key, "secondary_id": alt_id}
                        go_altids_list.append(secondary_id)
            if node["meta"].get('is_obsolete'):
                is_obsolete = "true"
            elif node["meta"].get('deprecated'):
                is_obsolete = "true"
            if "definition" in node["meta"]:
                definition = node["meta"]["definition"]["val"]
            if "subsets" in node["meta"]:
                new_subset = node['meta'].get('subsets')
                if isinstance(new_subset, (list, tuple)):
                    subset = new_subset
                else:
                    if new_subset is not None:
                        subset.append(new_subset)
            if len(subset) > 1:
                converted_subsets = []
                for subset_str in subset:
                    if "#" in subset_str:
                        subset_str = subset_str.split("#")[-1]
                    converted_subsets.append(subset_str)
                subset = converted_subsets

        all_parents = ont.parents(key)
        all_parents.append(key)

        # Improves performance when traversing relations
        all_parents_subont = ont.subontology(all_parents)

        isas_without_names = all_parents_subont.parents(key, relations=['subClassOf'])
        for item in isas_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_isas_list.append(dictionary)

        partofs_without_names = all_parents_subont.parents(key, relations=['BFO:0000050'])
        for item in partofs_without_names:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_partofs_list.append(dictionary)

        regulates = all_parents_subont.parents(key, relations=['RO:0002211'])
        for item in regulates:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_regulates_list.append(dictionary)

        negatively_regulates = all_parents_subont.parents(key, relations=['RO:0002212'])
        for item in negatively_regulates:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_negatively_regulates_list.append(dictionary)

        positively_regulates = all_parents_subont.parents(key, relations=['RO:0002213'])
        for item in positively_regulates:
            dictionary = {"primary_id": key, "primary_id2": item}
            go_positively_regulates_list.append(dictionary)

        dict_to_append = {
            'oid': key,
            'definition': definition,
            'type': term_type,
            'name': node.get('label'),
            'subset': subset,
            'name_key': node.get('label'),
            'is_obsolete': is_obsolete,
            'href': 'http://amigo.geneontology.org/amigo/term/' + node['id'],
        }
        go_term_list.append(dict_to_append)

        if counter == batch_size:
            yield [go_term_list,
                   go_isas_list,
                   go_partofs_list,
                   go_synonyms_list,
                   go_regulates_list,
                   go_negatively_regulates_list,
                   go_positively_regulates_list,
                   go_altids_list]
            go_term_list = []
            go_isas_list = []
            go_partofs_list = []
            go_synonyms_list = []
            go_regulates_list = []
            go_negatively_regulates_list = []
            go_positively_regulates_list = []
            go_altids_list = []
            counter = 0

    if counter > 0:
        yield [go_term_list,
               go_isas_list,
               go_partofs_list,
               go_synonyms_list,
               go_regulates_list,
               go_negatively_regulates_list,
               go_positively_regulates_list,
               go_altids_list]
def expand(tsvfile, cols, ontology, output):
    factory = OntologyFactory()
    ontobj = factory.create(ontology)
    expand_tsv(tsvfile, ontology=ontobj, outfile=output, cols=cols)
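# A hedged usage sketch for expand() above, not part of the original code: the
# TSV and ontology file names are hypothetical placeholders; the call mirrors
# the expand_tsv() usage exercised by test_expand() earlier in this section.
#
#     with open('slim_terms_expanded.tsv', 'w') as out:
#         expand('slim_terms.tsv', cols=['term'],
#                ontology='tests/resources/goslim_pombe.json', output=out)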