def ncbigene_make():
    """Build and write the ``ncbigeneslim`` ontology from NCBI Gene summaries.

    Gene ids are read from ``resources/gene-subset-ids.txt`` (every non-blank
    line is split on ``:`` and the second field is taken as the id).  The ids
    are sent to the NCBI E-utilities ``esummary`` endpoint in batches of 100,
    the per-batch ``result`` mappings are merged into one, and every merged
    record is handed to ``ncbi`` together with the ontology before the
    ontology is written.

    Raises:
        requests.HTTPError: if a batch request is answered with an HTTP error
            status.
        IndexError: if a non-blank line of the ids file contains no ``:``, or
            if no batch was fetched at all.
    """
    IDS_FILE = 'resources/gene-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:  # this came from neuroNER
        # blank lines are skipped so a stray empty line does not crash the split
        ids = [line.split(':')[1].strip() for line in f if line.strip()]

    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    params = {
        'db': 'gene',
        'retmode': 'json',
        'retmax': 5000,
    }

    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        # Build a fresh payload per batch; the id list is a plain
        # comma-separated string (the original accidentally wrapped it in a
        # 1-tuple through a trailing comma).
        payload = dict(params, id=','.join(idset))
        resp = requests.post(url, data=payload)
        resp.raise_for_status()  # fail loudly instead of parsing an error page
        chunks.append(resp.json())

    # Merge every batch into the first batch's result mapping.
    base = chunks[0]['result']
    for more in chunks[1:]:
        base.update(more['result'])
    # 'uids' only repeats the record keys, so it is dropped before iterating.
    base.pop('uids')

    ng = createOntology(
        'ncbigeneslim',
        'NIF NCBI Gene subset',
        makePrefixes('ILXREPLACE', 'ilx', 'OBOANN', 'NCBIGene', 'NCBITaxon',
                     'skos', 'owl'),
        'ncbigeneslim',
        'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in %s.' % IDS_FILE,
        remote_base='http://ontology.neuinfo.org/NIF/')

    for record in base.values():
        ncbi(record, ng)
    ng.write()
def chebi_imp(nif_ttl_dir='/home/tom/git/NIF-Ontology/ttl'):
    """Reconcile NIF-Chemical / NIF-Molecule with the ChEBI slim and write a bridge.

    The function
      1. loads ``generated/chebislim.ttl``, ``generated/chebi-dead.ttl``,
         ``NIF-Chemical.ttl`` and ``NIF-Molecule.ttl`` from ``nif_ttl_dir`` and
         merges them into one working graph, counting the triples of every
         id from ``resources/chebi-subset-ids.txt`` after each merge;
      2. rewrites atom ids to ion ids, replaces deprecated ids by their
         ``replacedBy:`` targets and normalises alternative-id predicates;
      3. collects, for ids whose triple counts changed, the triples that are
         in the merged graph but not in the ChEBI slim and stores them in a
         new ``chebi-bridge`` ontology;
      4. removes hand-curated triples and subClassOf edges that the ChEBI slim
         already implies, prints the remaining subClassOf edges for review and
         writes the bridge;
      5. deletes every class declared in the slim/dead graphs from
         NIF-Chemical and NIF-Molecule, writes both, and calls ``embed()``.

    Many locals are never read again inside the function; they are kept on
    purpose because ``embed()`` is called at the end (presumably an
    interactive shell that exposes them -- confirm against the import).

    Args:
        nif_ttl_dir: directory holding the NIF-Ontology ``ttl`` tree.  The
            default reproduces the previously hard-coded location.
    """
    import os  # local import so the block does not depend on file-level imports

    PREFIXES = makePrefixes('definition', 'replacedBy', 'hasRole', 'oboInOwl',
                            'CHEBI', 'owl', 'skos', 'oboInOwl')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)
    with open('resources/chebi-subset-ids.txt', 'rt') as f:
        # blank lines carry no id, skip them
        ids_raw = set(line.strip() for line in f if line.strip())
    ids = sorted(set(ug.expand(_) for _ in ids_raw))

    def ttl_path(relative):
        return os.path.join(nif_ttl_dir, relative)

    def check_chebis(g):
        """Return, for every id in ``ids``, the number of triples it is the subject of."""
        return [len(list(g.triples((id_, None, None)))) for id_ in ids]

    def fixIons(g):
        # There are a series of atom/ion confusions that shall be dealt with;
        # the solution is to add e.g. 'iron' as a synonym to the charged form
        # since that is what the biologists are usually referring to...
        ng = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))
        #                 atom           ion
        # calcium is ok:  None,          'CHEBI:29108'
        ng.replace_uriref('CHEBI:30145', 'CHEBI:49713')  # lithium
        ng.replace_uriref('CHEBI:18248', 'CHEBI:29033')  # iron
        ng.replace_uriref('CHEBI:26216', 'CHEBI:29103')  # potassium
        ng.replace_uriref('CHEBI:26708', 'CHEBI:29101')  # sodium
        # zinc is ok:     None,          'CHEBI:29105'

    def merge_into(target, source):
        for triple in source:
            target.add(triple)

    g = rdflib.Graph()
    cg = rdflib.Graph()
    cd = rdflib.Graph()
    chemg = rdflib.Graph()
    molg = rdflib.Graph()

    cg.parse(ttl_path('generated/chebislim.ttl'), format='turtle')
    merge_into(g, cg)
    a1 = check_chebis(g)

    cd.parse(ttl_path('generated/chebi-dead.ttl'), format='turtle')
    merge_into(g, cd)
    a2 = check_chebis(g)

    chemg.parse(ttl_path('NIF-Chemical.ttl'), format='turtle')
    chemgg = makeGraph('NIF-Chemical', graph=chemg)
    fixIons(chemg)
    merge_into(g, chemg)
    a3 = check_chebis(g)

    molg.parse(ttl_path('NIF-Molecule.ttl'), format='turtle')
    molgg = makeGraph('NIF-Molecule', graph=molg)
    fixIons(molg)
    merge_into(g, molg)
    a4 = check_chebis(g)

    replacedBy = ug.expand('replacedBy:')
    deads = {s: o for s, o in cd.subject_objects(replacedBy)}

    def switch_dead(g):
        ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl'))
        for f, r in deads.items():
            ng.replace_uriref(f, r)
            # NOTE(review): every other place in this function uses
            # 'oboInOwl:hasAlternativeId'; 'hasAlternateId' looks like a typo
            # but is left untouched because it changes the generated output.
            ng.add_node(r, 'oboInOwl:hasAlternateId',
                        rdflib.Literal(f, datatype=rdflib.XSD.string))
            g.remove((r, replacedBy, r))  # in case the replaced by was already in

    switch_dead(g)
    switch_dead(cg)
    switch_dead(chemg)
    switch_dead(molg)

    def fixHasAltId(g):
        ng = makeGraph('', graph=g,
                       prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'BIRNANN'))
        ng.replace_uriref('NIFCHEM:hasAlternativeId', 'oboInOwl:hasAlternativeId')
        ng.replace_uriref('BIRNANN:ChEBIid', 'oboInOwl:id')

    # NOTE(review): molg is not processed here although switch_dead handles
    # it -- confirm that this is intentional.
    for graph in (g, cg, chemg):
        fixHasAltId(graph)

    def fixAltIdIsURIRef(g):
        hai = ug.expand('oboInOwl:hasAlternativeId')
        i = ug.expand('oboInOwl:id')
        # amazingly sometimes the CHEBI prefix is missing on the graph...
        makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))

        def inner(s, p, o):
            if isinstance(o, rdflib.URIRef):
                qn = g.namespace_manager.qname(o)
                g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string)))
                if 'ns' in qn:
                    print('WARNING UNKNOWN NAMESPACE BEING SHORTENED', str(o), qn)
                g.remove((s, p, o))

        for predicate in (hai, i):
            # snapshot first: inner() adds to and removes from the graph that
            # is being iterated
            for s, o in list(g.subject_objects(predicate)):
                inner(s, predicate, o)

    for graph in (g, cg, chemg):
        fixAltIdIsURIRef(graph)

    matches = list(zip(a1, a2, a3, a4))
    changed = [len(set(_)) != 1 for _ in matches]
    review = [(id_, m) for id_, did_change, m in zip(ids, changed, matches)
              if did_change and m[0]]
    # for reasons currently lost to implementation details the four lists
    # below are lists of empty collections if run from ipython
    wat_c = [set([(s, str(o.toPython())) for s, p, o in cg.triples((u, None, None))])
             for u, _ in review]
    wat_a = [set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))])
             for u, _ in review]
    wat_c_ = [set(cg.triples((u, None, None))) for u, _ in review]
    wat_a_ = [set(g.triples((u, None, None))) for u, _ in review]
    diff = [a - c for a, c in zip(wat_a, wat_c)]
    diff_ = [a - c for a, c in zip(wat_a_, wat_c_)]

    cb = createOntology(
        'chebi-bridge',
        'NIF ChEBI bridge',
        makePrefixes('CHEBI', 'BFO1SNAP', 'owl', 'skos', 'dc', 'hasRole',
                     'NIFCHEM', 'oboInOwl', 'NIFMOL', 'OBOANN', 'BIRNANN'),
        'chebibridge',
        ('This bridge file contains additional annotations'
         ' on top of CHEBI identifiers that were originally'
         ' included in NIF-Chemical or NIF-Molecule that have'
         ' not since been added to CHEBI upstream'),
        path='ttl/bridge/',
        imports=('http://ontology.neuinfo.org/NIF/ttl/generated/chebislim.ttl',
                 'http://ontology.neuinfo.org/NIF/ttl/generated/chebi-dead.ttl'))

    out = []
    for set_ in diff:
        if not set_:
            # nothing new for this id; also keeps `sub` from being undefined
            # or left over from the previous set below
            continue
        for sub, string in sorted(set_):
            for t in g.triples((sub, None, None)):
                # please note that this process will do things like remove
                # hasStreetName ecstasy from CHEBI:1391 since chebislim has it
                # listed as a synonym
                py = t[-1].toPython()
                # ignore restrictions ('ub...' blank nodes)... this is safe
                # because nifmol and nifchem dont have any restrictions...
                if py == string and not py.startswith('ub'):
                    cb.add_recursive(t, g)
        # only needs to go at the end because sub is the same for each set
        cb.add_class(sub)

    def hasImplicitSuperclass(s, o, _seen=None):
        """Return True if ``o`` is reachable from ``s`` via rdfs:subClassOf in ``cg``."""
        seen = set() if _seen is None else _seen
        for super_ in cg.objects(s, rdflib.RDFS.subClassOf):
            if super_ == o:
                return True
            if super_ in seen:
                continue  # already explored (or a cycle): do not recurse again
            seen.add(super_)
            if hasImplicitSuperclass(super_, o, seen):
                return True
        return False

    # curation decisions after review (see outtc for full list)
    curatedOut = []

    def curateOut(*t):
        curatedOut.append(
            tuple(_ if isinstance(_, rdflib.Literal) else ug.expand(_) for _ in t))
        cb.del_trip(*t)

    # defer to the chebi choice of chemical substance over molecular entity
    # since it is classified as a racemate which doesn't quite match the
    # mol ent def
    curateOut('CHEBI:6887', 'rdfs:subClassOf', 'CHEBI:23367')
    # some ions may also be free radicals, but all free radicals are not ions!
    curateOut('CHEBI:26519', 'rdfs:subClassOf', 'CHEBI:24870')
    # natural product removal since natural product should probably be a role
    # if anything...
    curateOut('CHEBI:18059', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:24921', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:37332', 'rdfs:subClassOf', 'CHEBI:33243')

    # chebi already has a chemical role...
    curateOut('CHEBI:50906', 'rdfs:label',
              rdflib.Literal('Chemical role', datatype=rdflib.XSD.string))
    # antioxidant is already modelled as a chemical role instead of a
    # biological role, the distinction is that the biological roles affect
    # biological processes/property, not chemical processes/property
    curateOut('CHEBI:22586', 'rdfs:subClassOf', 'CHEBI:24432')
    # not all children are bicyclic
    curateOut('CHEBI:22720', 'rdfs:subClassOf', 'CHEBI:27171')
    # this one seems obviously false... all cyclic nucleotides are not
    # nucleoside 5'-monophosphate...
    curateOut('CHEBI:23447', 'rdfs:subClassOf', 'CHEBI:17188')
    # not all children are bicyclic, some may be poly, therefore removing
    curateOut('CHEBI:24922', 'rdfs:subClassOf', 'CHEBI:27171')
    # removing since antagonist is more incidental and pharmacological role
    # is more appropriate (as chebi has it)
    curateOut('CHEBI:48706', 'rdfs:subClassOf', 'CHEBI:33232')
    # removing since chebi models this with has part
    curateOut('CHEBI:51064', 'rdfs:subClassOf', 'CHEBI:35338')
    # the structure is 'fused to' a benzo, but it is not a benzo, chebi has
    # the correct
    curateOut('CHEBI:8247', 'rdfs:subClassOf', 'CHEBI:22720')
    # Undecided: ('CHEBI:9463', 'rdfs:subClassOf', 'CHEBI:50786') -- wikipedia
    # says one thing, but chebi says another, very strange... not an anabolic
    # agent?  It is deliberately NOT curated out.

    # review hold over subClassOf statements
    retired_class = 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#_birnlex_retired_class'
    intc = []
    outtc = []
    # snapshot first: edges are removed from cb.g inside the loop
    for s, o in list(cb.g.subject_objects(rdflib.RDFS.subClassOf)):
        edge = (s, rdflib.RDFS.subClassOf, o)
        if str(o) == retired_class:
            # we need to remove any of the cases where deprecation was misused
            cb.g.remove(edge)
        elif hasImplicitSuperclass(s, o):
            cb.g.remove(edge)
            intc.append(edge)
        else:
            outtc.append(edge)

    def qname(trips):
        return tuple(tuple(cb.g.namespace_manager.qname(_) for _ in t)
                     for t in trips)

    for a, p, b in sorted(qname(outtc)):
        if 'NIFMOL' in b:
            # not considering cases where NIFMOL/NIFCHEM ids are used, that
            # can come later
            continue
        s = sgv.findById(a)
        o = sgv.findById(b)
        if s is None or o is None:
            print(a, '=>', s)
            print(b, '=>', o)
        else:
            print(s['labels'], s['curie'])
            print('subClassOf')
            print(o['labels'], o['curie'])
            print((a, p, b))
        print('---------------------')

    # re-add only the missing edges so that we can zap them from NIF-Molecule
    # and NIF-Chemical (recurse is needed...)
    cb.write()

    # validation (values are only inspected interactively)
    diff2 = set(cb.g) - set(cg)
    diff3 = set(cb.g) - diff2  # should just be all the owl:Class entries
    diff4 = set(cb.g) - set(chemg) | set(cb.g) - set(molg)  # not informative
    diff5 = set(cb.g) - diff4  # not informative
    both = set(chemg) & set(molg)  # there is no overlap beyond the owl:Class declarations

    def getChebis(set_):
        return set(t for t in set_ if 'CHEBI_' in t[0])

    def nodt(graph):
        """Project triples to (subject, object) pairs with literals stringified."""
        return set((s, str(o) if isinstance(o, rdflib.Literal) else o)
                   for s, p, o in graph)

    # everything already accounted for elsewhere; computed once for both graphs
    accounted = (nodt(cb.g) | nodt(cg) | nodt(cd) | nodt(intc)
                 | nodt(curatedOut))

    cmc = getChebis(nodt(chemg) - accounted)
    cmc = sorted(t for s, o in cmc for t in chemg.triples((s, None, o)))
    mmc = getChebis(nodt(molg) - accounted)
    mmc = sorted(t for s, o in mmc for t in molg.triples((s, None, o)))

    # remove chebi classes from nifchem and nifmol
    def remstuff(sources, targets):
        for source in sources:
            for id_ in source.subjects(rdflib.RDF.type, rdflib.OWL.Class):
                for target in targets:
                    target.del_class(id_)

    remstuff((cg, cd), (chemgg, molgg))

    chemgg.write()
    molgg.write()
    embed()
def chebi_make():
    """Generate the ``chebislim`` and ``chebi-dead`` ontologies from ChEBI nightly.

    The function
      1. reads the wanted ids from ``resources/chebi-subset-ids.txt``;
      2. downloads and gunzips the nightly ``chebi.owl.gz`` and parses the XML;
      3. keeps the ontology header, the object properties, the wanted classes,
         the classes that list a wanted id as ``hasAlternativeId`` and,
         transitively, every class referenced through ``subClassOf`` /
         ``someValuesFrom``; everything else is dropped from the XML tree;
      4. parses the reduced XML with rdflib and copies the triples of every
         id into ``chebislim``; ids that now only occur as an alternative id
         are recorded as deprecated in ``chebi-dead``;
      5. writes both ontologies and calls ``embed()``.

    Local names are kept stable on purpose because ``embed()`` is called at
    the end (presumably an interactive shell that exposes them -- confirm
    against the import).

    Raises:
        requests.HTTPError: if the ChEBI download is answered with an HTTP
            error status.
        ValueError: if an id has neither triples nor a replacement and is not
            listed in ``depwor``.
    """
    OWL_CLASS_TAG = '{http://www.w3.org/2002/07/owl#}Class'
    SUBCLASS_XPATH = "/*[local-name()='Class']/*[local-name()='subClassOf']"
    SOME_VALUES_XPATH = (
        "/*[local-name()='Class']/*[local-name()='subClassOf']"
        "/*[local-name()='Restriction']/*[local-name()='someValuesFrom']")

    PREFIXES = makePrefixes('definition', 'hasRole', 'BFO', 'CHEBI', 'owl',
                            'skos', 'oboInOwl')
    dPREFIXES = makePrefixes('CHEBI', 'replacedBy', 'owl', 'skos')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)

    IDS_FILE = 'resources/chebi-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:
        # blank lines carry no id, skip them
        ids_raw = set(line.strip() for line in f if line.strip())
    ids = set(ug.expand(_).toPython() for _ in ids_raw)

    gzed = requests.get(
        'http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/nightly/chebi.owl.gz')
    gzed.raise_for_status()  # do not try to gunzip an error page
    raw = BytesIO(gzip.decompress(gzed.content))
    t = etree.parse(raw)
    r = t.getroot()
    cs = list(r)  # direct children; getchildren() is deprecated
    classes = [_ for _ in cs
               if _.tag == OWL_CLASS_TAG and _.values()[0] in ids]
    ontology = t.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']")
    ops = t.xpath("/*[local-name()='RDF']/*[local-name()='ObjectProperty']")  # TODO
    wanted = [etree.ElementTree(_) for _ in classes]
    rpl_check = t.xpath(
        "/*[local-name()='RDF']/*[local-name()='Class']/*[local-name()='hasAlternativeId']")
    rpl_dict = {_.text: _.getparent() for _ in rpl_check if _.text in ids_raw}
    # we also need to have any new classes that have replaced old ids
    also_classes = list(rpl_dict.values())

    def rec(start_set, done):
        """Collect the classes referenced (transitively) by ``start_set``.

        Returns ``(elements, ids)``: the not-yet-``done`` class elements that
        are referenced, and the identifiers that were referenced.
        """
        ids_ = set()
        for c in start_set:
            tree = etree.ElementTree(c)
            for path in (SUBCLASS_XPATH, SOME_VALUES_XPATH):
                # the first attribute value of the element is used as the id
                ids_.update(_.items()[0][1] for _ in tree.xpath(path)
                            if _.items())

        supers = [_ for _ in cs
                  if _.tag == OWL_CLASS_TAG
                  and _.values()[0] in ids_
                  and _ not in done]
        if supers:
            msup, mids = rec(supers, done + supers)
            supers += msup
            ids_.update(mids)

        return supers, ids_

    a = ontology + ops + classes + also_classes
    more, mids = rec(a, a)
    all_ = set(a + more)
    r.clear()  # wipe all the stuff we don't need
    for c in all_:
        r.append(c)
    data = etree.tostring(r)

    g = rdflib.Graph()
    # now _this_ is stupidly slow (like 20 minutes of slow) might make more
    # sense to do the xml directly?
    # The format is given explicitly: the data is RDF/XML and rdflib's
    # default for unspecified data is not RDF/XML in every version.
    g.parse(data=data, format='xml')

    src_version = list(g.query(
        'SELECT DISTINCT ?match WHERE { ?temp rdf:type owl:Ontology . ?temp owl:versionIRI ?match . }'
    ))[0][0]

    new_graph = createOntology(
        'chebislim',
        'NIF ChEBI slim',
        PREFIXES,
        'chebislim',
        'This file is generated by pyontutils/slimgen from the full ChEBI nightly at versionIRI %s based on the list of terms in %s.' % (src_version, IDS_FILE),
        remote_base='http://ontology.neuinfo.org/NIF/')

    chebi_dead = createOntology(
        'chebi-dead',
        'NIF ChEBI deprecated',
        dPREFIXES,
        'chebidead',
        # a missing space ("resolvablefrom") was fixed in this description
        'This file is generated by pyontutils/slimgen to make deprecated classes resolvable from the full ChEBI nightly at versionIRI %s based on the list of terms in %s.' % (src_version, IDS_FILE),
        remote_base='http://ontology.neuinfo.org/NIF/')

    # ids that are allowed to have neither triples nor a replacement
    depwor = {
        'CHEBI:33243': 'natural product',  # FIXME remove these?
        'CHEBI:36809': 'tricyclic antidepressant',
    }

    has_alt_id = rdflib.term.URIRef(
        'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId')
    xsd_string = rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')

    for id_ in sorted(set(ids_raw)
                      | set(ug.g.namespace_manager.qname(_) for _ in mids)):
        eid = ug.expand(id_)
        trips = list(g.triples((eid, None, None)))
        if trips:
            for trip in trips:
                new_graph.add_recursive(trip, g)
            continue

        # no triples for the id itself: look for the id_ as a literal
        alts = list(g.triples(
            (None, has_alt_id, rdflib.Literal(id_, datatype=xsd_string))))
        if alts:
            replaced_by, _, __ = alts[0]
            if replaced_by.toPython() not in ids:
                # we need to add any replacement classes to the bridge
                print('REPLACED BY NEW CLASS', id_)
                for t in g.triples((replaced_by, None, None)):
                    new_graph.add_recursive(t, g)
            chebi_dead.add_class(id_)
            chebi_dead.add_node(id_, 'replacedBy:', replaced_by)
            chebi_dead.add_node(id_, rdflib.OWL.deprecated, True)
        elif id_ not in depwor:
            # was BaseException; ValueError is still caught by handlers for
            # BaseException but no longer slips past `except Exception`
            raise ValueError(
                'no triples and no hasAlternativeId replacement found for %s'
                % id_)

    # https://github.com/ebi-chebi/ChEBI/issues/3294
    madness = (new_graph.expand('oboInOwl:hasRelatedSynonym'),
               rdflib.Literal('0', datatype=rdflib.namespace.XSD.string))
    # snapshot first: triples are removed from the graph being queried
    for a in list(new_graph.g.subjects(*madness)):
        new_graph.g.remove((a,) + madness)

    new_graph.write()
    chebi_dead.write()
    embed()
kwargs = { 'uberon_id':uid, 'uberon_label':uberon_labs[uid], 'aba_id':aid, 'aba_label':abalabs[aid], 'aba_syns':'\n'.join(sorted(abasyns[aid] + abaacro[aid])), 'uberon_syns':'\n'.join(insert_uberon) } return to_format.format(**kwargs) #text = '\n\n'.join([make_record(uid, aid[0]) for uid, aid in sorted(u_a_map.items()) if aid]) #with open('aba_uberon_syn_review.txt', 'wt') as f: #f.write(text) print('total uberon terms checked:', len(uberon_labs)) print('total aba terms: ', len(abalabs)) print('total uberon with aba xref:', len([a for a in u_a_map.values() if a])) ubridge = createOntology('uberon-parcellation-mappings', 'Uberon Parcellation Mappings', makePrefixes('owl', 'ilx', 'UBERON', 'MBA')) for u, arefs in u_a_map.items(): if arefs: # TODO check for bad assumptions here ubridge.add_trip(u, 'ilx:delineatedBy', arefs[0]) ubridge.add_trip(arefs[0], 'ilx:delineates', u) ubridge.write() embed()