def load(file, olr=None, mkdir=False):
    """Load one ontology file and generate its ``{name}-dead`` companion.

    Parses *file* with rdflib (forcing turtle for ``.ttl``, otherwise
    letting rdflib auto-detect), builds a new ``{name}-dead`` ontology for
    deprecated classes via ``createOntology``, and hands both graphs to
    ``extract``.

    :param file: path to the ontology file; ``~`` is expanded.
    :param olr: passed through to ``createOntology`` as ``local_base``
        (presumably the ontology-local-repo path — confirm against
        ``createOntology``).
    :param mkdir: forwarded unchanged to ``extract``.
    :raises rdflib.plugins.parsers.notation3.BadSyntax: re-raised after
        printing a failure notice when parsing fails.
    """
    filepath = os.path.expanduser(file)
    _, ext = os.path.splitext(filepath)
    filetype = ext.strip('.')
    if filetype == 'ttl':
        infmt = 'turtle'
    else:
        infmt = None  # let rdflib guess the serialization
    print(filepath)
    graph = rdflib.Graph()
    try:
        graph.parse(filepath, format=infmt)
    except rdflib.plugins.parsers.notation3.BadSyntax as e:
        print('PARSING FAILED', filepath)
        raise e
    og = makeGraph('', graph=graph)

    # FIXME this should really just be a function :/
    curie, *prefs = kludge(filepath)

    name = os.path.splitext(os.path.basename(filepath))[0]
    if 'slim' in name:
        name = name.replace('slim', '')

    # Prefer the explicit owl:versionIRI; fall back to the ontology IRI
    # itself when no versionIRI is present.
    try:
        version = list(graph.subject_objects(owl.versionIRI))[0][1]
    except IndexError:
        version = list(graph.subjects(rdf.type, owl.Ontology))[0]

    ng = createOntology(f'{name}-dead',
                        f'NIF {curie} deprecated',
                        makePrefixes('replacedBy', 'NIFRID', curie, *prefs),
                        f'{name}dead',
                        f'Classes from {curie} with owl:deprecated true that we want rdfs:subClassOf NIFRID:birnlexRetiredClass, or classes hiding in a oboInOwl:hasAlternativeId annotation. This file was generated by pyontutils/necromancy from {version}.',
                        local_base=olr)
    extract(og, ng, curie, mkdir)
def ncbigene_make():
    """Generate the NIF NCBI Gene subset ontology (``ncbigeneslim``).

    Reads gene ids from ``resources/gene-subset-ids.txt`` (one
    ``PREFIX:id`` per line), fetches their summaries from the NCBI eutils
    esummary endpoint in batches of 100, merges the JSON results into a
    single record dict, and writes one ontology entry per gene via
    ``ncbi``.
    """
    IDS_FILE = (Path(__file__).parent / 'resources/gene-subset-ids.txt').as_posix()
    with open(IDS_FILE, 'rt') as f:  # this came from neuroNER
        ids = [l.split(':')[1].strip() for l in f.readlines()]

    # One GET per id was too slow; POST the ids in batches instead.
    #url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&retmax=5000&db=gene&id='
    #for id_ in ids:
        #data = requests.get(url + id_).json()['result'][id_]
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    data = {
        'db': 'gene',
        'retmode': 'json',
        'retmax': 5000,
        'id': None,  # filled in per batch below
    }
    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        # BUG FIX: a stray trailing comma previously made this value a
        # 1-tuple ``(','.join(idset),)``.  requests happened to encode a
        # one-element sequence identically to the bare string, so it
        # worked by accident; store the string itself.
        data['id'] = ','.join(idset)
        resp = requests.post(url, data=data).json()
        chunks.append(resp)

    # Merge every chunk's 'result' dict into the first chunk's.
    base = chunks[0]['result']
    uids = base['uids']
    for more in chunks[1:]:
        data = more['result']
        uids.extend(data['uids'])
        base.update(data)
    #base['uids'] = uids  # i mean... its just the keys
    base.pop('uids')  # drop the bookkeeping key so only gene records remain

    ng = createOntology(
        'ncbigeneslim',
        'NIF NCBI Gene subset',
        makePrefixes('ilxtr', 'NIFRID', 'NCBIGene', 'NCBITaxon', 'skos', 'owl'),
        'ncbigeneslim',
        'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in %s.' % IDS_FILE,
        remote_base='http://ontology.neuinfo.org/NIF/')

    for k, v in base.items():
        #if k != 'uids':
        ncbi(v, ng)
    ng.write()
def uri_switch(filenames, get_values):
    """Swap NIF* URIs for their NIFSTD equivalents across *filenames*.

    Builds the NIF-NIFSTD mapping ontology, asks *get_values* for the
    fragment prefixes and URI replacements, rewrites every file in
    parallel via ``do_file``/``swapUriSwitch``, then records all of the
    resulting mapping triples and writes the mapping ontology out.
    """
    mapping = createOntology(
        'NIF-NIFSTD-mapping',
        'NIF* to NIFSTD equivalents',
        makePrefixes('BIRNANN', 'BIRNOBI', 'BIRNOBO', 'NIFANN',
                     'NIFCELL', 'NIFCHEM', 'NIFDYS', 'NIFFUN',
                     'NIFGA', 'NIFGG', 'NIFINV', 'NIFMOL',
                     'NIFMOLINF', 'NIFMOLROLE', 'NIFNCBISLIM',
                     'NIFNEURBR', 'NIFNEURBR2', 'NIFNEURCIR',
                     'NIFNEURMC', 'NIFNEURMOR', 'NIFNEURNT',
                     'NIFORG', 'NIFQUAL', 'NIFRES', 'NIFRET',
                     'NIFSCID', 'NIFSUB', 'NIFUNCL', 'OBOANN',
                     'SAOCORE'))
    fragment_prefixes, ureps = get_values(mapping)
    print('Start writing')
    jobs = (delayed(do_file)(filename, swapUriSwitch, ureps, fragment_prefixes)
            for filename in filenames)
    all_trips = Parallel(n_jobs=9)(jobs)
    print('Done writing')
    # accumulate every produced triple into the mapping graph
    for trips in all_trips:
        for triple in trips:
            mapping.g.add(triple)
    mapping.write()
def main():
    """Cross-map Allen Mouse Brain Atlas (MBA) structures to UBERON terms.

    Loads mbaslim + the aba bridge, harvests MBA labels/synonyms/acronyms,
    pulls the MBA structure tree from the Allen API and UBERON neighbors
    from SciGraph (``g.getNeighbors``), matches the two via
    oboInOwl:hasDbXref, and writes an ``uberon-parcellation-mappings``
    bridge ontology.  Two helper closures (``obo_output``,
    ``make_record``) produce optional review artifacts.
    """
    # --- load MBA graphs and index labels / synonyms / acronyms ---
    abagraph = rdflib.Graph()
    abagraph.parse((gitf / 'NIF-Ontology/ttl/generated/parcellation/mbaslim.ttl').as_posix(), format='turtle')
    abagraph.parse((gitf / 'NIF-Ontology/ttl/bridge/aba-bridge.ttl').as_posix(), format='turtle')
    nses = {k: rdflib.Namespace(v) for k, v in abagraph.namespaces()}
    #nses['ABA'] = nses['MBA']  # enable quick check against the old xrefs
    syn_iri = nses['NIFRID']['synonym']
    acro_iri = nses['NIFRID']['acronym']
    abasyns = {}   # MBA curie -> list of synonyms
    abalabs = {}   # MBA curie -> rdfs:label
    abaacro = {}   # MBA curie -> list of acronyms
    ABA_PREFIX = 'MBA:'
    #ABA_PREFIX = 'ABA:'  # all bad

    for sub in abagraph.subjects(rdflib.RDF.type, rdflib.OWL.Class):
        if not sub.startswith(nses[ABA_PREFIX[:-1]]['']):
            continue
        subkey = ABA_PREFIX + sub.rsplit('/', 1)[1]
        sub = rdflib.URIRef(sub)
        abalabs[subkey] = [o for o in abagraph.objects(rdflib.URIRef(sub), rdflib.RDFS.label)][0].toPython()
        syns = []
        for s in abagraph.objects(sub, syn_iri):
            syns.append(s.toPython())
        abasyns[subkey] = syns
        abaacro[subkey] = [a.toPython() for a in abagraph.objects(sub, acro_iri)]

    # --- fetch the MBA structure tree (997 = root) from the Allen API ---
    url = 'http://api.brain-map.org/api/v2/tree_search/Structure/997.json?descendants=true'
    resp = requests.get(url).json()
    ids = set([ABA_PREFIX + str(r['id']) for r in resp['msg']])

    # --- fetch UBERON anatomical entities from SciGraph ---
    Query = namedtuple('Query', ['id', 'relationshipType', 'direction', 'depth'])
    #uberon = Query('UBERON:0000955', 'http://purl.obolibrary.org/obo/BFO_0000050', 'INCOMING', 9)
    uberon = Query('UBERON:0001062', 'subClassOf', 'INCOMING', 10)  # anatomical entity
    output = g.getNeighbors(**uberon._asdict())
    # TODO figure out the superclass that can actually get all the brain parts

    # --- match UBERON terms to MBA ids via hasDbXref ---
    meta_edge = 'http://www.geneontology.org/formats/oboInOwl#hasDbXref'
    u_a_map = {}       # UBERON curie -> list of MBA xrefs (or None)
    a_u_map = {}       # MBA curie -> UBERON curie
    uberon_syns = {}   # UBERON curie -> {syn predicate: synonyms}
    uberon_labs = {}   # UBERON curie -> label
    syn_types = {
        'http://www.geneontology.org/formats/oboInOwl#hasExactSynonym': 'Exact',
        'http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym': 'Narrow',
        'http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym': 'Related',
        'http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym': 'Broad',
    }
    for node in output['nodes']:
        curie = node['id']
        uberon_labs[curie] = node['lbl']
        uberon_syns[curie] = {}
        if 'synonym' in node['meta']:
            for stype in syn_types:
                if stype in node['meta']:
                    uberon_syns[curie][stype] = node['meta'][stype]
        if meta_edge in node['meta']:
            xrefs = node['meta'][meta_edge]
            mba_ref = [r for r in xrefs if r.startswith(ABA_PREFIX)]
            u_a_map[curie] = mba_ref
            if mba_ref:
                for mba in mba_ref:
                    a_u_map[mba] = curie
        else:
            u_a_map[curie] = None

    def obo_output():  # oh man obo_io is a terrible interface for writing obofiles :/
        # Write four cumulative synonym-review obo files
        # (Exact / +Narrow / +Related / +Broad).
        for aid in abalabs:  # set aids not in uberon to none
            if aid not in a_u_map:
                a_u_map[aid] = None
        e = OboFile()
        n = OboFile()
        r = OboFile()
        b = OboFile()
        name_order = 'Exact', 'Narrow', 'Related', 'Broad'
        rev = {v: k for k, v in syn_types.items()}  # sillyness
        syn_order = [rev[n] for n in name_order]
        files_ = {rev['Broad']: b, rev['Exact']: e,
                  rev['Narrow']: n, rev['Related']: r}
        for aid, uid in sorted(a_u_map.items()):
            id_line = 'id: ' + aid
            lines = []
            lines.append(id_line)
            lines.append('name: ' + abalabs[aid])
            if uid in uberon_syns:
                syns = uberon_syns[uid]
            else:
                syns = {}
            for syn_type in syn_order:
                f = files_[syn_type]
                if syn_types[syn_type] == 'Exact' and uid is not None:
                    syn_line = 'synonym: "' + uberon_labs[uid] + '" ' + syn_types[syn_type].upper() + ' [from label]'
                    lines.append(syn_line)
                if syn_type in syns:
                    for syn in sorted(syns[syn_type]):
                        syn_line = 'synonym: "' + syn + '" ' + syn_types[syn_type].upper() + ' []'
                        lines.append(syn_line)
            block = '\n'.join(lines)
            term = Term(block, f)
        e.filename = 'e-syns.obo'
        n.filename = 'en-syns.obo'
        r.filename = 'enr-syns.obo'
        b.filename = 'enrb-syns.obo'
        for f in files_.values():
            h = Header('format-version: 1.2\nontology: %s\n' % f.filename)
            h.append_to_obofile(f)
            f.write(f.filename)
    #embed()
    #obo_output()

    def make_record(uid, aid):  # edit this to change the format
        # Render one human-readable review record for a UBERON/MBA pair.
        to_format = ('{uberon_id: <20}{uberon_label:}\n'
                     '{aba_id: <20}{aba_label}\n'
                     '------ABA SYNS------\n'
                     '{aba_syns}\n'
                     '-----UBERON SYNS-----\n'
                     '{uberon_syns}\n')
        uberon_syn_rec = uberon_syns[uid]
        insert_uberon = []
        for edge, syns in sorted(uberon_syn_rec.items()):
            insert_uberon.append('--{abv}--\n{syns}'.format(abv=syn_types[edge], syns='\n'.join(sorted(syns))))
        kwargs = {
            'uberon_id': uid,
            'uberon_label': uberon_labs[uid],
            'aba_id': aid,
            'aba_label': abalabs[aid],
            'aba_syns': '\n'.join(sorted(abasyns[aid] + abaacro[aid])),
            'uberon_syns': '\n'.join(insert_uberon)
        }
        return to_format.format(**kwargs)

    #text = '\n\n'.join([make_record(uid, aid[0]) for uid, aid in sorted(u_a_map.items()) if aid])
    #with open('aba_uberon_syn_review.txt', 'wt') as f:
        #f.write(text)

    print('total uberon terms checked:', len(uberon_labs))
    print('total aba terms: ', len(abalabs))
    print('total uberon with aba xref:', len([a for a in u_a_map.values() if a]))

    # --- emit the bridge ontology; only the first xref per term is used ---
    ubridge = createOntology('uberon-parcellation-mappings',
                             'Uberon Parcellation Mappings',
                             makePrefixes('owl', 'ilx', 'UBERON', 'MBA'))
    for u, arefs in u_a_map.items():
        if arefs:
            # TODO check for bad assumptions here
            ubridge.add_trip(u, 'ilx:delineatedBy', arefs[0])
            ubridge.add_trip(arefs[0], 'ilx:delineates', u)
    ubridge.write()


if __name__ == '__main__':
    embed()
def main():
    """Map legacy NIFSTD/NLX identifiers to InterLex (ILX) and SciCrunch
    Registry (SCR) equivalents and produce the mapping/review ontologies.

    Outline (order matters — later sections consume earlier results):
      1. pull term/existing-id tables from the InterLex MySQL database;
      2. partition the iri->ilx pairs by source (neuinfo/sao, obolibrary,
         drugbank, t3db, neurolex);
      3. fold in the sao fixes and the SCR registry moves;
      4. build NIFSTD-ILX-mapping and NIFSTD-SCR-mapping graphs, checking
         for duplicate mappings along the way;
      5. compare against pickled historical uri sets and the nlx xrefs;
      6. build the NIFSTD-BLACKHOLE-mapping review graph for ids that do
         not resolve, plus printed accounting reports.
    """
    DB_URI = 'mysql+mysqlconnector://{user}:{password}@{host}:{port}/{db}'
    # NOTE(review): port 33060 on localhost is presumably an ssh tunnel to
    # the production mysql host — see .ssh/config per the original comment.
    if socket.gethostname() != 'orpheus':
        config = mysql_conn_helper('localhost', 'nif_eelg', 'nif_eelg_secure', 33060)  # see .ssh/config
    else:
        config = mysql_conn_helper('nif-mysql.crbs.ucsd.edu', 'nif_eelg', 'nif_eelg_secure')
    engine = create_engine(DB_URI.format(**config), echo=True)
    config = None  # drop credentials from scope as soon as possible
    del(config)

    insp = inspect(engine)
    terms = [c['name'] for c in insp.get_columns('terms')]
    term_existing_ids = [c['name'] for c in insp.get_columns('term_existing_ids')]
    #breakpoint()
    #sys.exit()

    query = engine.execute('SELECT * FROM term_existing_ids as teid JOIN terms as t ON t.id = teid.tid WHERE t.type != "cde"')
    header = term_existing_ids + terms
    data = query.fetchall()
    cdata = list(zip(*data))  # transpose rows -> columns

    def datal(head):
        # column accessor by column name
        return cdata[header.index(head)]

    ilx_labels = {ilxb[ilx_fragment]:label for ilx_fragment, label in zip(datal('ilx'), datal('label'))}

    # partition iri -> ilx pairs by id source; `done` prevents an ilx id
    # from being claimed by more than one source bucket
    mapping_no_sao = [p for p in zip(datal('iri'), datal('ilx')) if 'neuinfo' in p[0]]  # 9446
    mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'neuinfo' in p[0] or '/sao' in p[0]]  # 9883
    done = [ilx for iri, ilx in mapping]
    obo_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'obolibrary' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in obo_mapping]
    db_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'drugbank' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in db_mapping]
    t3db_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 't3db' in p[0] and p[1] not in done]
    done = done + [ilx for iri, ilx in t3db_mapping]
    wiki_mapping = [p for p in zip(datal('iri'), datal('ilx')) if 'neurolex' in p[0] and p[1] not in done]

    # old-sao-id -> corrected subject, from the nlxeol fixes file
    sao_mapping = {o.toPython():s for s, o in Graph().parse((gitf / 'nlxeol/sao-nlxwiki-fixes.ttl').as_posix(), format='ttl').subject_objects(oboInOwl.hasAlternativeId)}

    # NIFSTD uris that moved to the SciCrunch registry
    scr = Graph().parse((gitf / 'NIF-Ontology/scicrunch-registry.ttl').as_posix(), format='turtle')
    moved_to_scr = {}
    #PROBLEM = set()
    for s, o in scr.subject_objects(oboInOwl.hasDbXref):
        if 'SCR_' in o:
            print(f'WARNING Registry identifier listed as alt id! {s} hasDbXref {o}')
            continue
        uri = NIFSTD[o]
        #try:
        assert uri not in moved_to_scr, f'utoh {uri} was mapped to more than one registry entry! {s} {moved_to_scr[uri]}'
        #except AssertionError:
            #PROBLEM.add(uri)
        moved_to_scr[uri] = s

    to_scr = [(k, v) for k, v in moved_to_scr.items()
              if noneMembers(k, 'SciEx_', 'OMICS_', 'rid_', 'SciRes_',
                             'biodbcore-', 'C0085410', 'doi.org', 'C43960',
                             'doi:10.', 'GAZ:',
                             # 'birnlex_', 'nlx_',
                             'nif-')]

    replacement_graph = createOntology(filename='NIFSTD-ILX-mapping',
                                       name='NLX* to ILX equivalents',
                                       prefixes=makePrefixes('ILX'),)
    scr_rep_graph = createOntology(filename='NIFSTD-SCR-mapping',
                                   name='NLX* to SCR equivalents',
                                   prefixes=makePrefixes('SCR'),)

    # duplicate detection helpers: remember first pairing, collect any
    # later disagreement into the dupes_ dict
    _existing = {}
    def dupes(this, other, set_, dupes_):
        if this not in set_:
            set_.add(this)
            _existing[this] = other
        elif _existing[this] != other:
            dupes_[this].add(_existing[this])
            dupes_[this].add(other)

    iri_done = set()
    ilx_done = set()
    iri_dupes = defaultdict(set)
    ilx_dupes = defaultdict(set)
    def check_dupes(iri, ilx):
        dupes(iri, ilx, iri_done, iri_dupes)
        dupes(ilx, iri, ilx_done, ilx_dupes)

    BIRNLEX = Namespace(uPREFIXES['BIRNLEX'])
    trouble = [  # some are _2 issues :/
        # in interlex -- YES WE KNOW THEY DONT MATCH SOME IDIOT DID THIS IN THE PAST
        BIRNLEX['1006'],  # this one appears to be entirely novel despite a note that it was created in 2006...
        BIRNLEX['1152'],  # this was used in uberon ;_;
        BIRNLEX['2476'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2477'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2478'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2479'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2480'],  # can be owl:sameAs ed -> _2 version
        BIRNLEX['2533'],  # This is in interlex as a wiki id http://uri.interlex.org/base/ilx_0109349 since never used in the ontology, we could add it to the list of 'same as' for cosmetic purposes which will probably happen...
        BIRNLEX['3074'],  # -> CHEBI:26848  # add to slim and bridge...
        BIRNLEX['3076'],  # -> CHEBI:26195  # XXX when we go to load chebi make sure we don't dupe this...
    ]
    aaaaaaaaaaaaaaaaaaaaaaaaaaaaa = [t + '_2' for t in trouble]  # _never_ do this

    # TODO check for cases where there is an ilx and scr for the same id >_<
    sao_help = set()
    for iri, ilx_fragment in chain(mapping, to_scr):  # XXX core loop
        if iri in sao_mapping:
            uri = sao_mapping[iri]
            sao_help.add(uri)
        else:
            uri = URIRef(iri)
        if uri in trouble:
            #print('TROUBLE', iri, ilxb[ilx_fragment])
            print('TROUBLE', ilxb[ilx_fragment])
        if uri in moved_to_scr:  # TODO I think we need to have _all_ the SCR redirects here...
            s, p, o = uri, ilxtr.hasScrId, moved_to_scr[uri]
            scr_rep_graph.g.add((s, p, o))
        else:
            s, p, o = uri, ilxtr.hasIlxId, ilxb[ilx_fragment]
            #s, p, o = o, ilxtr.ilxIdFor, s
            replacement_graph.g.add((s, p, o))
        check_dupes(s, o)

    dupes = {k:v for k, v in iri_dupes.items()}  # NOTE: shadows the helper above
    idupes = {k:v for k, v in ilx_dupes.items()}
    assert not dupes, f'there are duplicate mappings for an external id {dupes}'
    #print(ilx_dupes)  # there are none yet

    ng = cull_prefixes(replacement_graph.g, prefixes=uPREFIXES)
    ng.filename = replacement_graph.filename
    sng = cull_prefixes(scr_rep_graph.g, prefixes=uPREFIXES)
    sng.filename = scr_rep_graph.filename

    _ = [print(k.toPython(), ' '.join(sorted(ng.qname(_.toPython()) for _ in v))) for k, v in idupes.items()]

    # run `resolver_uris = sorted(set(e for t in graph for e in t if 'uri.neuinfo.org' in e))` on a graph with everything loaded to get this file...
    resources = Path(__file__).resolve().absolute().parent / 'resources'
    with open((resources / 'all-uri.neuinfo.org-uris.pickle').as_posix(), 'rb') as f:
        all_uris = pickle.load(f)  # come in as URIRefs...
    with open((resources / 'all-uri.neuinfo.org-uris-old.pickle').as_posix(), 'rb') as f:
        all_uris_old = pickle.load(f)  # come in as URIRefs...
    with open((resources / 'all-uri.neuinfo.org-uris-old2.pickle').as_posix(), 'rb') as f:
        all_uris_old2 = pickle.load(f)  # come in as URIRefs...

    # --- set algebra comparing mapped uris against historical snapshots ---
    resolver_uris = set(e for t in chain(ng.g, sng.g) for e in t if 'uri.neuinfo.org' in e)
    ilx_only = resolver_uris - all_uris  # aka nlxonly
    resolver_not_ilx_only = resolver_uris - ilx_only
    problem_uris = all_uris - resolver_uris
    old_uris = all_uris_old - all_uris
    old_uris2 = all_uris_old2 - all_uris
    dold_uris = all_uris_old - all_uris_old2
    #idold_uris = all_uris_old2 - all_uris_old  # empty as expected
    #nxrefs = Graph().parse((gitf / 'NIF-Ontology/ttl/generated/nlx-xrefs.ttl').as_posix(), format='turtle')
    nxrefs = Graph().parse((gitf / 'nlxeol/nlx-xrefs.ttl').as_posix(), format='turtle')
    xrefs_uris = set(e for t in nxrefs for e in t if 'uri.neuinfo.org' in e)
    test_old_uris = old_uris2 - xrefs_uris

    diff_uris = test_old_uris - ilx_only
    #diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd/nlx_149160'))  # ORNL was included in an old bad version of the xrefs file and was pulled in in the old all-uris  # now dealt with by the scr mapping
    diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd/nlx_40280,birnlex_1731'))  # one of the doubled neurolex ids
    diff_uris.remove(URIRef('http://uri.neuinfo.org/nif/nifstd'))  # i have zero idea how this snuck in
    assert not diff_uris, 'old uris and problem uris should be identical'

    # tag ilx-only uris with their provenance (exactly one of ng/sng)
    _ilx = set(e for t in ng.g for e in t)
    _scr = set(e for t in sng.g for e in t)
    for uri in ilx_only:
        if uri in _ilx and uri in _scr:
            raise BaseException('AAAAAAAAAAAAAAAAAAAAAAAAAAAAA')
        elif uri in _ilx:
            g = ng.g
        elif uri in _scr:
            g = sng.g
        else:
            raise BaseException('????????????')
        g.add((uri, ilxtr.isDefinedBy, URIRef('http://neurolex.org')))

    # XXX write the graphs
    ng.write()
    sng.write()

    nsuris = set(uri for uri, ilx in mapping_no_sao)
    auris = set(_.toPython() for _ in all_uris)
    iuris = set(_.toPython() for _ in resolver_uris)
    #sao_missing = iuris - nsuris  # now fixed and cannot run due to addition of scr ids to resolver_uris
    #assert not sao_missing, f'whoops {sao_missing}'
    ilx_missing = auris - iuris
    all_missing = iuris - auris
    #assert not all_missing, f'all is not all! {all_missing}'  # XXX have to deal with ilx_only separately as NLX-ILX or something  # fixed
    #sao_add = {o.toPython():s.toPython() for s, p, o in ng.g if s.toPython() in sao_missing}
    #assert len(sao_add) == len(sao_missing), 'EEEEEEEEEEEEEEE'
    #with open('/tmp/please-add-these-sao-ids-as-existing-ids-to-the-listed-interlex-record.json', 'wt') as f:
        #json.dump(sao_add, f, indent=2)

    to_review = sorted(ilx_missing)

    # not relevant anymore
    #with open('thought-to-be-missing.json', 'rt') as f:
        #thought_to_be_missing = json.load(f)

    # from troy has issues
    #with open('nifext-duplicates-and-new.json', 'rt') as f:
        #nifext_data = json.load(f)
    #nifext_dupes = {v['current_nifext_id']:v['dropped_nifext_ids'][-1] if v['dropped_nifext_ids'] else None for v in nifext_data.values()}

    sgv = Vocabulary(cache=True)
    trts = [(v, (sgv.findById(v)['labels'][0]
                 if sgv.findById(v)['labels']
                 else '<--NO-LABEL-->')
             if sgv.findById(v)
             else '<------>')
            for v in to_review]

    sgg = sGraph(cache=True)
    SGG = Namespace(sgg._basePath.rstrip('/') + '/graph/')
    rg = Graph().parse((gitf / 'NIF-Ontology/ttl/unused/NIF-Retired.ttl').as_posix(), format='turtle')
    retired = set(e.toPython() for t in rg for e in t if 'uri.neuinfo.org' in e)
    retfile = '<ttl/unused/NIF-Retired.ttl>'
    help_graph = createOntology(filename='NIFSTD-BLACKHOLE-mapping',
                                name='HELPPPPPPPP!!!!',
                                prefixes=uPREFIXES,)
    def make_rt(to_review_tuples, retired=retired):
        # Build one review line per (uri, label), looking up the defining
        # source in SciGraph concurrently via Async/deferred.
        def inner(u, l, retired=retired):
            ne = sgg.getNeighbors(u, relationshipType="isDefinedBy", depth=1)
            if ne:
                curie = help_graph.qname(u)
                help_graph.g.add((URIRef(u), ilxtr.SciGraphLookup, URIRef(f'http://scigraph.olympiangods.org/scigraph/graph/{curie}')))
            if ne and ne['edges']:
                src = ' '.join([f'<{e["obj"]}>' for e in ne["edges"]])
            elif u in retired:
                src = retfile
            else:
                src = '<>'
            return f'{u:<70} {l:<50} {src}'
        out = Async(rate=3000)(deferred(inner)(u, l)
                               for u, l in sorted(to_review_tuples, key=lambda a:a[-1]))
        return '\n'.join(out)

    review_text = make_rt(trts)
    trts2 = [(u, l) for u, l in trts if 'nifext' not in u]
    not_nifext = make_rt(trts2)

    hng = cull_prefixes(help_graph.g, prefixes=uPREFIXES)
    hng.filename = help_graph.filename
    hng.write()

    ###
    #   Accounting of uri.neuinfo.org ids that do not resolve
    ###

    not_in_interlex = set(s for s, o in hng.g.subject_objects(ilxtr.SciGraphLookup))
    bh_deprecated = set(s for s in hng.g.subjects() if sgv.findById(s) and sgv.findById(s)['deprecated'])
    bh_not_deprecated = set(s for s in hng.g.subjects() if sgv.findById(s) and not sgv.findById(s)['deprecated'])
    bh_nifexts = set(s for s in bh_not_deprecated if 'nifext' in s)
    bh_readable = set(s for s in bh_not_deprecated if 'readable' in s)
    unaccounted = not_in_interlex - bh_readable - bh_nifexts - bh_deprecated
    namedinds = set(s for s in unaccounted
                    if sgv.findById(s) and
                    sgg.getNode(s)['nodes'][0]['meta']['types'] and
                    sgg.getNode(s)['nodes'][0]['meta']['types'][0] == 'NamedIndividual')
    unaccounted = unaccounted - namedinds
    ual = sorted(o for s in unaccounted for o in hng.g.objects(s, ilxtr.SciGraphLookup))
    report = (
        f'Total       {len(not_in_interlex)}\n'
        f'deprecated  {len(bh_deprecated)}\n'
        f'nd nifext   {len(bh_nifexts)}\n'
        f'nd readable {len(bh_readable)}\n'
        f'nd namedind {len(namedinds)}\n'
        f'unaccounted {len(unaccounted)}\n'
    )
    print(report)

    def reverse_report():
        # Classify InterLex classes that lack a known ontology xref.
        # NOTE(review): reads a local dump at /tmp/interlex.ttl — confirm
        # this path before rerunning.
        ilx = Graph()
        ilx.parse('/tmp/interlex.ttl', format='turtle')
        not_in_ontology = set()
        annotations = set()
        relations = set()
        drugbank = set()
        t3db = set()
        for subject in ilx.subjects(rdf.type, owl.Class):
            ok = False
            for object in ilx.objects(subject, oboInOwl.hasDbXref):
                if anyMembers(object,
                              'uri.neuinfo.org',
                              'GO_',
                              'CHEBI_',
                              'PR_',
                              'PATO_',
                              'HP_',
                              'OBI_',
                              'DOID_',
                              'COGPO_',
                              'CAO_',
                              'UBERON_',
                              'NCBITaxon_',
                              'SO_',
                              'IAO_'):
                    # FIXME doe we areally import HP?
                    ok = True
                    if (subject, rdf.type, owl.AnnotationProperty) in ilx:
                        # FIXME for troy these need to be cleared up
                        annotations.add(subject)
                    elif (subject, rdf.type, owl.ObjectProperty) in ilx:
                        relations.add(subject)
                    elif 'drugbank' in object:
                        drugbank.add(subject)
                    elif 't3db.org' in object:
                        t3db.add(subject)
            if not ok:
                not_in_ontology.add(subject)

        drugbank = drugbank & not_in_ontology
        t3db = t3db & not_in_ontology
        annotations = annotations & not_in_ontology
        relations = relations & not_in_ontology
        unaccounted = not_in_ontology - drugbank - t3db - annotations - relations
        report = (
            f'Total       {len(not_in_ontology)}\n'
            f'annotations {len(annotations)}\n'
            f'relations   {len(relations)}\n'
            f'drugbank    {len(drugbank)}\n'
            f't3db        {len(t3db)}\n'
            f'unaccounted {len(unaccounted)}\n'
        )
        print(report)
        return (not_in_ontology, drugbank, unaccounted)

    _, _, un = reverse_report()

    h_uris = set(e for t in hng.g for e in t if 'uri.neuinfo.org' in e)
    real_problems = problem_uris - h_uris

    ###
    #   Missing neurons
    ###

    with open((gitf / 'nlxeol/neuron_data_curated.csv').as_posix()) as f:
        r = csv.reader(f)
        nheader = next(r)
        rows = list(r)

    ndata = list(zip(*rows))  # transpose rows -> columns

    def datan(head):
        # column accessor by column name for the neuron csv
        return ndata[nheader.index(head)]


if __name__ == '__main__':
    breakpoint()
def main():
    """Build the chebi-bridge ontology and strip ChEBI classes from
    NIF-Chemical / NIF-Molecule.

    Order-dependent pipeline:
      1. load chebislim, chebi-dead, NIF-Chemical, NIF-Molecule into one
         combined graph, checkpointing triple counts per chebi id
         (``check_chebis``) after each load;
      2. normalize graphs (``fixIons``, ``switch_dead``, ``fixHasAltId``,
         ``fixAltIdIsURIRef``);
      3. diff the checkpoints to find NIF-only annotations on CHEBI ids
         and copy them into the chebi-bridge ontology;
      4. drop curated-out and redundant subClassOf triples, print the
         remainder for review, write the bridge;
      5. delete chebislim/chebi-dead classes from NIF-Chemical and
         NIF-Molecule and rewrite both.
    """
    olr = auth.get_path('ontology-local-repo')
    resources = auth.get_path('resources')
    if not olr.exists():
        raise FileNotFoundError(f'{olr} does not exist cannot continue')
    if not resources.exists():
        raise FileNotFoundError(f'{resources} does not exist cannot continue')

    PREFIXES = makePrefixes('definition',
                            'replacedBy',
                            'hasRole',
                            'oboInOwl',
                            'CHEBI',
                            'owl',
                            'skos',
                            'oboInOwl')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)
    file = resources / 'chebi-subset-ids.txt'
    with open(file.as_posix(), 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))
        ids = sorted(set((ug.expand(_.strip()) for _ in ids_raw)))

    def check_chebis(g):
        # snapshot: number of triples per subset id currently in g
        a = []
        for id_ in ids:
            l = sorted(g.triples((id_, None, None)))
            ll = len(l)
            a.append(ll)
        return a

    def fixIons(g):
        # there are a series of atom/ion confusions that shall be dealt with, solution is to add 'iron' as a synonym to the charged form since that is what the biologists are usually referring to...
        ng = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))
        # atom           ion
        None, 'CHEBI:29108'  # calcium is ok
        ng.replace_uriref('CHEBI:30145', 'CHEBI:49713')  # lithium
        ng.replace_uriref('CHEBI:18248', 'CHEBI:29033')  # iron
        ng.replace_uriref('CHEBI:26216', 'CHEBI:29103')  # potassium
        ng.replace_uriref('CHEBI:26708', 'CHEBI:29101')  # sodium
        None, 'CHEBI:29105'  # zinc is ok

    g = OntGraph()       # cumulative union of all four sources
    cg = OntGraph()      # chebislim
    cd = OntGraph()      # chebi-dead
    chemg = OntGraph()   # NIF-Chemical
    molg = OntGraph()    # NIF-Molecule

    cg.parse(olr / 'ttl/generated/chebislim.ttl', format='turtle')
    list(g.add(t) for t in cg)
    a1 = check_chebis(g)

    cd.parse(olr / 'ttl/generated/chebi-dead.ttl', format='turtle')
    list(g.add(t) for t in cd)
    a2 = check_chebis(g)

    chemg.parse(olr / 'ttl/NIF-Chemical.ttl', format='turtle')
    chemgg = makeGraph('NIF-Chemical', graph=chemg)
    fixIons(chemg)
    list(g.add(t) for t in chemg)
    a3 = check_chebis(g)

    molg.parse(olr / 'ttl/NIF-Molecule.ttl', format='turtle')
    molgg = makeGraph('NIF-Molecule', graph=molg)
    fixIons(molg)
    list(g.add(t) for t in molg)
    a4 = check_chebis(g)

    replacedBy = ug.expand('replacedBy:')
    deads = {s: o for s, o in cd.subject_objects(replacedBy)}

    def switch_dead(g):
        # rewrite dead ids to their replacements, keeping the old id as
        # an oboInOwl:hasAlternateId literal
        ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl'))
        for f, r in deads.items():
            ng.replace_uriref(f, r)
            ng.add_trip(r, 'oboInOwl:hasAlternateId', rdflib.Literal(f, datatype=rdflib.XSD.string))
            g.remove((r, replacedBy, r))  # in case the replaced by was already in

    switch_dead(g)
    switch_dead(cg)
    switch_dead(chemg)
    switch_dead(molg)

    def fixHasAltId(g):
        ng = makeGraph('', graph=g,
                       prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'NIFRID'))
        ng.replace_uriref('NIFCHEM:hasAlternativeId', 'oboInOwl:hasAlternativeId')
        # ng.replace_uriref('NIFRID:ChEBIid', 'oboInOwl:id')  # :id does not exist, do we need an alternative?

    list(map(fixHasAltId, (g, cg, chemg)))

    def fixAltIdIsURIRef(g):
        # hasAlternativeId objects should be string literals, not URIRefs
        hai = ug.expand('oboInOwl:hasAlternativeId')
        # i = ug.expand('oboInOwl:id')  # :id does not exist
        makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))  # amazlingly sometimes this is missing...

        def inner(s, p, o):
            if type(o) == rdflib.URIRef:
                qn = g.namespace_manager.qname(o)
                g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string)))
                if 'ns' in qn:
                    print('WARNING UNKNOWN NAMESPACE BEING SHORTENED', str(o), qn)
                g.remove((s, p, o))

        for s, o in g.subject_objects(hai):
            inner(s, hai, o)
        #for s, o in g.subject_objects(i):  # :id does not exist
            #inner(s, i, o)

    list(map(fixAltIdIsURIRef, (g, cg, chemg)))

    # diff the four checkpoints: ids whose triple count changed between
    # loads carry NIF-only annotations
    matches = [_ for _ in zip(a1, a2, a3, a4)]
    changed = [len(set(_)) != 1 for _ in matches]
    review = [(id_, m) for id_, changed, m in zip(ids, changed, matches) if changed and m[0]]
    # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_c = [set([(s, str(o.toPython())) for s, p, o in cg.triples((u, None, None))]) for u, _ in review]
    wat_a = [set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))]) for u, _ in review]
    wat_c_ = [set(cg.triples((u, None, None))) for u, _ in review]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_a_ = [set(g.triples((u, None, None))) for u, _ in review]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    diff = [a - c for a, c in zip(wat_a, wat_c)]
    diff_ = [a - c for a, c in zip(wat_a_, wat_c_)]

    cb = createOntology(
        'chebi-bridge',
        'NIF ChEBI bridge',
        makePrefixes('CHEBI', 'BFO1SNAP', 'owl', 'skos', 'dc', 'hasRole',
                     'NIFCHEM', 'oboInOwl', 'NIFMOL', 'NIFRID'),
        'chebibridge',
        ('This bridge file contains additional annotations'
         ' on top of CHEBI identifiers that were originally'
         ' included in NIF-Chemical or NIF-Molecule that have'
         ' not since been added to CHEBI upstream'),
        path='ttl/bridge/',
        #imports=('https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebislim.ttl',
                 #'https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebi-dead.ttl'))
        imports=('http://ontology.neuinfo.org/NIF/ttl/generated/chebislim.ttl',
                 'http://ontology.neuinfo.org/NIF/ttl/generated/chebi-dead.ttl'))

    out = []
    for set_ in diff:
        for sub, string in sorted(set_):
            for t in g.triples((sub, None, None)):
                # please not that this process will do things like remove hasStreenName ectasy from CHEBI:1391 since chebislim has it listed as a synonym
                py = t[-1].toPython()
                if py == string and not py.startswith('ub'):  # ignore restrictions... this is safe because nifmol and nifchem dont have any restrictions...
                    cb.add_recursive(t, g)
        cb.add_class(sub)  # only need to go at the end because sub is the same for each set

    def hasImplicitSuperclass(s, o):
        # True when o is reachable from s via rdfs:subClassOf in chebislim
        for super_ in cg.objects(s, rdflib.RDFS.subClassOf):
            if super_ == o:
                return True
            elif hasImplicitSuperclass(super_, o):
                return True

    # curation decisions after review (see outtc for full list)
    curatedOut = []

    def curateOut(*t):
        curatedOut.append(tuple(ug.expand(_) if type(_) is not rdflib.Literal else _ for _ in t))
        cb.del_trip(*t)

    curateOut('CHEBI:6887', 'rdfs:subClassOf', 'CHEBI:23367')  # defer to the chebi choice of chemical substance over molecular entity since it is classified as a racemate which doesn't quite match the mol ent def
    curateOut('CHEBI:26519', 'rdfs:subClassOf', 'CHEBI:24870')  # some ions may also be free radicals, but all free radicals are not ions!
    #natural product removal since natural product should probably be a role if anything...
    curateOut('CHEBI:18059', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:24921', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:37332', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:50906', 'rdfs:label', rdflib.Literal('Chemical role', datatype=rdflib.XSD.string))  # chebi already has a chemical role...
    curateOut('CHEBI:22586', 'rdfs:subClassOf', 'CHEBI:24432')  # antioxidant is already modelled as a chemical role instead of a biological role, the distinction is that the biological roles affect biological processes/property, not chemical processes/property
    curateOut('CHEBI:22720', 'rdfs:subClassOf', 'CHEBI:27171')  # not all children are bicyclic
    curateOut('CHEBI:23447', 'rdfs:subClassOf', 'CHEBI:17188')  # this one seems obviously flase... all cyclic nucleotides are not nucleoside 5'-monophosphate...
    curateOut('CHEBI:24922', 'rdfs:subClassOf', 'CHEBI:27171')  # not all children are bicyclic, some may be poly, therefore removing
    curateOut('CHEBI:48706', 'rdfs:subClassOf', 'CHEBI:33232')  # removing since antagonist is more incidental and pharmacological role is more appropriate (as chebi has it)
    curateOut('CHEBI:51064', 'rdfs:subClassOf', 'CHEBI:35338')  # removing since chebi models this with has part
    curateOut('CHEBI:8247', 'rdfs:subClassOf', 'CHEBI:22720')  # the structure is 'fused to' a benzo, but it is not a benzo, chebi has the correct
    #curateOut('CHEBI:9463', 'rdfs:subClassOf', 'CHEBI:50786')  # not sure what to make of this wikipedia says one thing, but chebi says another, very strange... not an anabolic agent?!??! wat no idea

    # review hold over subClassOf statements
    intc = []
    outtc = []
    for s, o in cb.g.subject_objects(rdflib.RDFS.subClassOf):
        if str(o) == 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#_birnlex_retired_class' or str(o) == 'http://ontology.neuinfo.org/nif/nifstd/readable/birnlexRetiredClass':
            # we need to remove any of the cases where deprecation was misused
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
        elif hasImplicitSuperclass(s, o):
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
            intc.append((s, rdflib.RDFS.subClassOf, o))
        else:
            outtc.append((s, rdflib.RDFS.subClassOf, o))

    def qname(trips):
        return tuple(tuple(cb.g.namespace_manager.qname(_) for _ in t) for t in trips)

    # print the surviving subClassOf statements for manual review
    for a, p, b in sorted(qname(outtc)):
        if 'NIFMOL' in b:
            continue  # not considering cases where NIFMOL/NIFCHEM ids are used, that can come later
        s = sgv.findById(a)
        o = sgv.findById(b)
        if s is None or o is None:
            print(a, '=>', s)
            print(b, '=>', o)
        else:
            print(s['labels'], s['curie'])
            print('subClassOf')
            print(o['labels'], o['curie'])
            print((a, p, b))
        print('---------------------')

    cb.write()  # re-add only the missing edges so that we can zap them from NIF-Molecule and NIF-Chemical (recurse is needed...)

    # validation
    diff2 = set(cb.g) - set(cg)
    diff3 = set(cb.g) - diff2  # should just be all the owl:Class entries
    diff4 = set(cb.g) - set(chemg) | set(cb.g) - set(molg)  # not informative
    diff5 = set(cb.g) - diff4  # not informative
    both = set(chemg) & set(molg)  # there is no overlap beyond the owl:Class declarations

    def getChebis(set_):
        return set(t for t in set_ if 'CHEBI_' in t[0])

    def nodt(graph):
        # normalize literals to plain strings so set differences ignore datatypes
        return set((s, str(o) if type(o) is rdflib.Literal else o) for s, p, o in graph)

    cmc = getChebis(((((nodt(chemg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) - nodt(curatedOut))
    cmc = sorted(t for s, o in cmc for t in chemg.triples((s, None, o)))
    mmc = getChebis(((((nodt(molg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) - nodt(curatedOut))
    mmc = sorted(t for s, o in mmc for t in molg.triples((s, None, o)))

    # remove chebi classes from nifchem and nifmol
    def remstuff(sources, targets):
        for source in sources:
            for id_ in source.subjects(rdflib.RDF.type, rdflib.OWL.Class):
                for target in targets:
                    target.del_class(id_)

    remstuff((cg, cd), (chemgg, molgg))

    chemgg.write()
    molgg.write()


if __name__ == '__main__':
    breakpoint()