def fixHasAltId(g):
    ng = makeGraph('', graph=g,
                   prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'BIRNANN'))
    ng.replace_uriref('NIFCHEM:hasAlternativeId', 'oboInOwl:hasAlternativeId')
    ng.replace_uriref('BIRNANN:ChEBIid', 'oboInOwl:id')
class MBA(HBA):
    ont = OntMeta(PARC,
                  'mbaslim',
                  'Allen Mouse Brain Atlas Ontology',
                  'MBA 2011 v2',
                  'This file is automatically generated from the Allen Brain Atlas API.' + NOTICE,
                  TODAY)
    concept = PScheme(ILXREPLACE(ont.name),
                      'Allen Mouse Brain Atlas parcellation concept',
                      'NCBITaxon:10090',
                      ADULT)
    atlas = PSArtifact(ILXREPLACE(ont.name + 'atlas'),
                       'Allen Mouse Brain Atlas v2',
                       '2011 v2',
                       'November 2011',
                       'http://mouse.brain-map.org/static/atlas',
                       'http://help.brain-map.org/download/attachments/2818169/AllenReferenceAtlas_v2_2011.pdf?version=1&modificationDate=1319667383440',  # yay no doi! wat
                       tuple(),
                       tuple())
    PREFIX = 'MBA'
    PREFIXES = makePrefixes('NIFRID')
    PREFIXES[PREFIX] = 'http://api.brain-map.org/api/v2/data/Structure/'  # FIXME hack to allow both HBA and MBA
    ROOT = 997

    @classmethod
    def datamunge(cls, data):
        for node in data:
            if node['id'] == cls.ROOT:
                node['name'] = 'allen mouse brain atlas parcellation root'
                node['safe_name'] = 'allen mouse brain atlas parcellation root'
                node['acronym'] = 'mbaroot'
                break
class genericPScheme:
    ont = OntMeta
    concept = PScheme
    atlas = PSArtifact
    PREFIXES = makePrefixes('', 'ilx', 'owl', 'skos', 'BIRNLEX', 'NCBITaxon', 'ILXREPLACE')

    def __new__(cls, validate=False):
        error = 'Expected %s got %s'
        if type(cls.ont) != OntMeta:
            raise TypeError(error % (OntMeta, type(cls.ont)))
        elif type(cls.concept) != PScheme:
            raise TypeError(error % (PScheme, type(cls.concept)))
        elif type(cls.atlas) != PSArtifact:
            raise TypeError(error % (PSArtifact, type(cls.atlas)))

        ontid = cls.ont.path + cls.ont.filename + '.ttl'
        PREFIXES = {k: v for k, v in cls.PREFIXES.items()}
        PREFIXES.update(genericPScheme.PREFIXES)
        #if '' in cls.PREFIXES:  # NOT ALLOWED!
            #if PREFIXES[''] is None:
                #PREFIXES[''] = ontid + '/'
        graph = makeGraph(cls.ont.filename, PREFIXES, writeloc=WRITELOC)
        graph.add_ont(ontid, *cls.ont[2:])
        make_scheme(graph, cls.concept, cls.atlas.curie)
        data = cls.datagetter()
        cls.datamunge(data)
        cls.dataproc(graph, data)
        add_ops(graph)
        graph.write()
        if validate or getattr(cls, 'VALIDATE', False):
            cls.validate(graph)
        return ontid, cls.atlas

    @classmethod
    def datagetter(cls):
        """ example datagetter function, make any local modifications here """
        with open('myfile', 'rt') as f:
            rows = [r for r in csv.reader(f)]
        dothing = lambda _: [i for i, v in enumerate(_)]
        rows = [dothing(_) for _ in rows]
        raise NotImplementedError('You need to implement this yourself!')
        return rows

    @classmethod
    def datamunge(cls, data):
        """ in place modifier of data """
        pass

    @classmethod
    def dataproc(cls, graph, data):
        """ example dataproc function, make any local modifications here """
        for thing in data:
            graph.add_trip(*thing)
        raise NotImplementedError('You need to implement this yourself!')

    @classmethod
    def validate(cls, graph):
        """ Put any post validation here. """
        raise NotImplementedError('You need to implement this yourself!')
class CoCoMac(genericPScheme):
    ont = OntMeta(PARC,
                  'cocomacslim',
                  'CoCoMac terminology',
                  'CoCoMac',
                  ('This file is automatically generated from the CoCoMac '
                   'database on the terms from BrainMaps_BrainSiteAcronyms.' + NOTICE),
                  TODAY)
    concept = PScheme(ILXREPLACE(ont.name),
                      'CoCoMac terminology parcellation concept',
                      'NCBITaxon:9544',
                      'ilx:various')
    atlas = PSArtifact(ILXREPLACE(ont.name + 'atlas'),
                       'CoCoMac terminology',
                       None,  #'no version info',
                       None,  #'no date',
                       'http://cocomac.g-node.org',
                       'scholarly things',
                       tuple(),
                       tuple())
    PREFIXES = makePrefixes('NIFRID')
    PREFIXES['cocomac'] = 'http://cocomac.g-node.org/services/custom_sql_query.php?sql=SELECT%20*%20from%20BrainMaps_BrainSiteAcronyms%20where%20ID='  # looking for better options

    @classmethod
    def datagetter(cls):
        url = 'http://cocomac.g-node.org/services/custom_sql_query.php?sql=SELECT * from BrainMaps_BrainSiteAcronyms;&format=json'
        table = requests.get(url).json()
        fields = table['fields']
        data = [fields] + list(table['data'].values())
        return data

    @classmethod
    def dataproc(cls, graph, data):

        class cocomac(rowParse):
            def ID(self, value):
                self.identifier = 'cocomac:' + value  # safe because reset every row (ish)
                graph.add_class(self.identifier, cls.concept.curie)

            def Key(self, value):
                pass

            def Summary(self, value):
                pass

            def Acronym(self, value):
                graph.add_trip(self.identifier, ACRONYM, value)

            def FullName(self, value):
                graph.add_trip(self.identifier, rdfs.label, '(%s) ' % cls.ont.shortname + value)
                graph.add_trip(self.identifier, PARCLAB, value)

            def LegacyID(self, value):
                graph.add_trip(self.identifier, ACRONYM, value)

            def BrainInfoID(self, value):
                pass

        cocomac(data)
def switch_dead(g):
    ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl'))
    for f, r in deads.items():
        ng.replace_uriref(f, r)
        ng.add_node(r, 'oboInOwl:hasAlternateId',
                    rdflib.Literal(f, datatype=rdflib.XSD.string))
        g.remove((r, replacedBy, r))  # in case the replaced by was already in
def fixIons(g):
    # there are a series of atom/ion confusions that shall be dealt with,
    # solution is to add 'iron' as a synonym to the charged form since that
    # is what the biologists are usually referring to...
    ng = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))
    # atom          ion
    None, 'CHEBI:29108'  # calcium is ok
    ng.replace_uriref('CHEBI:30145', 'CHEBI:49713')  # lithium
    ng.replace_uriref('CHEBI:18248', 'CHEBI:29033')  # iron
    ng.replace_uriref('CHEBI:26216', 'CHEBI:29103')  # potassium
    ng.replace_uriref('CHEBI:26708', 'CHEBI:29101')  # sodium
    None, 'CHEBI:29105'  # zinc is ok
class FMRI(genericPScheme):
    PREFIXES = makePrefixes('', 'skos', 'ILXREPLACE')

    @classmethod
    def datagetter(cls):
        data = cls.DATA
        return data

    @classmethod
    def dataproc(cls, graph, data):
        for node in data:
            id_ = 'ATLAS:' + node.get('index')
            label = node.text
            display = '(%s) ' % cls.ont.shortname + label
            graph.add_class(id_, cls.concept.curie, label=display)
            graph.add_trip(id_, PARCLAB, label)
class HBA(genericPScheme):
    ont = OntMeta(PARC,
                  'hbaslim',
                  'Allen Human Brain Atlas Ontology',
                  'HBA 2013 v2',
                  'This file is automatically generated from the Allen Brain Atlas API.' + NOTICE,
                  TODAY)
    concept = PScheme(ILXREPLACE(ont.name),
                      'Allen Human Brain Atlas parcellation concept',
                      'NCBITaxon:9606',
                      ADULT)
    atlas = PSArtifact(ILXREPLACE(ont.name + 'atlas'),
                       'Allen Human Brain Atlas v2',
                       '2013 v2',
                       'October 2013',
                       'http://human.brain-map.org/',
                       'http://help.brain-map.org/download/attachments/2818165/HBA_Ontology-and-Nomenclature.pdf?version=1&modificationDate=1382051847989',
                       tuple(),
                       tuple())
    PREFIX = 'HBA'
    PREFIXES = makePrefixes('NIFRID')
    PREFIXES[PREFIX] = 'http://api.brain-map.org:80/api/v2/data/Structure/'  # FIXME hack to allow both HBA and MBA
    ROOT = 3999
    #VALIDATE = True

    @classmethod
    def datagetter(cls):
        url = 'http://api.brain-map.org/api/v2/tree_search/Structure/{root}.json?descendants=true'.format(root=cls.ROOT)
        resp = requests.get(url).json()
        return resp['msg']

    @classmethod
    def dataproc(cls, graph, data):
        for node in data:
            curie = graph.expand(cls.PREFIX + ':' + str(node['id']))
            graph.add_class(curie, cls.concept.curie)
            parent = node['parent_structure_id']
            graph.add_trip(curie, rdfs.label, '(%s) ' % cls.ont.shortname + node['name'])
            graph.add_trip(curie, PARCLAB, node['name'])
            graph.add_trip(curie, ACRONYM, node['acronym'])
            if node['safe_name'] != node['name']:
                graph.add_trip(curie, SYNONYM, node['safe_name'])
            if parent:
                pcurie = graph.expand(cls.PREFIX + ':' + str(parent))
                graph.add_hierarchy(pcurie, PARTOF, curie)

    @classmethod
    def validate(cls, graph):
        check_hierarchy(graph, cls.PREFIX + ':' + str(cls.ROOT), PARTOF, PARCLAB)
def ncbigene_make():
    IDS_FILE = 'resources/gene-subset-ids.txt'
    with open(IDS_FILE, 'rt') as f:  # this came from neuroNER
        ids = [l.split(':')[1].strip() for l in f.readlines()]

    #url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&retmax=5000&db=gene&id='
    #for id_ in ids:
        #data = requests.get(url + id_).json()['result'][id_]
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    data = {
        'db': 'gene',
        'retmode': 'json',
        'retmax': 5000,
        'id': None,
    }
    chunks = []
    for i, idset in enumerate(chunk_list(ids, 100)):
        print(i, len(idset))
        data['id'] = ','.join(idset),
        resp = requests.post(url, data=data).json()
        chunks.append(resp)

    base = chunks[0]['result']
    uids = base['uids']
    for more in chunks[1:]:
        data = more['result']
        uids.extend(data['uids'])
        base.update(data)
    #base['uids'] = uids  # i mean... its just the keys
    base.pop('uids')

    ng = createOntology('ncbigeneslim',
                        'NIF NCBI Gene subset',
                        makePrefixes('ILXREPLACE', 'ilx', 'OBOANN', 'NCBIGene', 'NCBITaxon', 'skos', 'owl'),
                        'ncbigeneslim',
                        'This subset is automatically generated from the NCBI Gene database on a subset of terms listed in %s.' % IDS_FILE,
                        remote_base='http://ontology.neuinfo.org/NIF/')

    for k, v in base.items():
        #if k != 'uids':
        ncbi(v, ng)
    ng.write()
def parcellation_schemes(ontids_atlases):
    ont = OntMeta('http://ontology.neuinfo.org/NIF/ttl/generated/',
                  'parcellation',
                  'NIF collected parcellation schemes ontology',
                  'NIF Parcellations',
                  'Brain parcellation schemes as represented by root concepts.',
                  TODAY)
    ontid = ont.path + ont.filename + '.ttl'
    PREFIXES = makePrefixes('ilx', 'owl', 'skos', 'OBOANN')
    graph = makeGraph(ont.filename, PREFIXES, writeloc='/tmp/parc/')
    graph.add_ont(ontid, *ont[2:])

    for import_id, atlas in sorted(ontids_atlases):
        graph.add_node(ontid, rdflib.OWL.imports, import_id)
        add_triples(graph, atlas, make_atlas)

    graph.add_class(PARC_SUPER[0], label=PARC_SUPER[1])
    graph.write(convert=False)
def parcellation_schemes(ontids_atlases):
    ont = OntMeta(GENERATED,
                  'parcellation',
                  'NIF collected parcellation schemes ontology',
                  'NIF Parcellations',
                  'Brain parcellation schemes as represented by root concepts.',
                  TODAY)
    ontid = ont.path + ont.filename + '.ttl'
    PREFIXES = makePrefixes('', 'ilx', 'owl', 'skos', 'NIFRID', 'ILXREPLACE')
    graph = makeGraph(ont.filename, PREFIXES, writeloc=WRITELOC)
    graph.add_ont(ontid, *ont[2:])

    for import_id, atlas in sorted(ontids_atlases):
        graph.add_trip(ontid, owl.imports, import_id)
        add_triples(graph, atlas, make_atlas)

    graph.add_class(ATLAS_SUPER, label=atname)
    graph.add_class(PARC_SUPER, label=psname)
    graph.write()
class PAXRAT6(genericPScheme):
    source = 'resources/paxinos09names.txt'
    ont = OntMeta(PARC,
                  'paxinos_r_s_6',
                  'Paxinos Rat Parcellation 6th',
                  'PAXRAT6',
                  'This file is automatically generated from ' + source + '.' + NOTICE,
                  TODAY)
    concept = PScheme(ILXREPLACE(ont.name),
                      'Paxinos Rat Stereological 6th Ed parcellation concept',
                      'NCBITaxon:10116',
                      ADULT)
    atlas = PSArtifact(ILXREPLACE(ont.name + 'atlas'),
                       'The Rat Brain in Stereotaxic Coordinates 6th Edition',
                       '6th',
                       '02-11-2006',  # d-m-y
                       None,  # the fact this is missing is very big problem :/
                       ('Paxinos, George, Charles RR Watson, and Piers C. Emson.'
                        ' "AChE-stained horizontal sections of the rat brain'
                        ' in stereotaxic coordinates." Journal of neuroscience'
                        ' methods 3, no. 2 (1980): 129-149.'),  # FIXME
                       ('Paxinos Rat 6th',),
                       tuple())
    PREFIXES = makePrefixes('NIFRID')
    PREFIXES['PAXRAT'] = interlex_namespace('paxinos/rat/labels')

    @classmethod
    def datagetter(cls):
        with open(cls.source, 'rt') as f:
            lines = [l.rsplit('#')[0].strip()
                     for l in f.readlines()
                     if not l.startswith('#')]
        return [l.rsplit(' ', 1) for l in lines]

    @classmethod
    def dataproc(cls, graph, data):
        for i, (label, abrv) in enumerate(data):
            id_ = 'PAXRAT:' + str(i + 1)
            display = '(%s) ' % cls.ont.shortname + label
            graph.add_class(id_, cls.concept.curie, label=display)
            graph.add_trip(id_, PARCLAB, label)
            graph.add_trip(id_, ACRONYM, abrv)  # FIXME these are listed as abbreviations in the text
def fixAltIdIsURIRef(g):
    hai = ug.expand('oboInOwl:hasAlternativeId')
    i = ug.expand('oboInOwl:id')
    makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))  # amazingly sometimes this is missing...

    def inner(s, p, o):
        if type(o) == rdflib.URIRef:
            qn = g.namespace_manager.qname(o)
            g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string)))
            if 'ns' in qn:
                print('WARNING UNKNOWN NAMESPACE BEING SHORTENED', str(o), qn)
            g.remove((s, p, o))

    for s, o in g.subject_objects(hai):
        inner(s, hai, o)
    for s, o in g.subject_objects(i):
        inner(s, i, o)
def chebi_imp(): PREFIXES = makePrefixes('definition', 'replacedBy', 'hasRole', 'oboInOwl', 'CHEBI', 'owl', 'skos', 'oboInOwl') ug = makeGraph('utilgraph', prefixes=PREFIXES) with open('resources/chebi-subset-ids.txt', 'rt') as f: ids_raw = set((_.strip() for _ in f.readlines())) ids = sorted(set((ug.expand(_.strip()) for _ in ids_raw))) def check_chebis(g): a = [] for id_ in ids: l = sorted(g.triples((id_, None, None))) ll = len(l) a.append(ll) return a def fixIons(g): # there are a series of atom/ion confusions that shall be dealt with, solution is to add 'iron' as a synonym to the charged form since that is what the biologists are usually referring to... ng = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI')) # atom ion None, 'CHEBI:29108' # calcium is ok ng.replace_uriref('CHEBI:30145', 'CHEBI:49713') # lithium ng.replace_uriref('CHEBI:18248', 'CHEBI:29033') # iron ng.replace_uriref('CHEBI:26216', 'CHEBI:29103') # potassium ng.replace_uriref('CHEBI:26708', 'CHEBI:29101') # sodium None, 'CHEBI:29105' # zinc is ok g = rdflib.Graph() cg = rdflib.Graph() cd = rdflib.Graph() chemg = rdflib.Graph() molg = rdflib.Graph() #g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl', format='turtle') cg.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl', format='turtle') list(g.add(t) for t in cg) a1 = check_chebis(g) #g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebi-dead.ttl', format='turtle') cd.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebi-dead.ttl', format='turtle') list(g.add(t) for t in cd) a2 = check_chebis(g) #g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl', format='turtle') chemg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl', format='turtle') chemgg = makeGraph('NIF-Chemical', graph=chemg) fixIons(chemg) list(g.add(t) for t in chemg) a3 = check_chebis(g) #g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl', format='turtle') molg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl', format='turtle') molgg = makeGraph('NIF-Molecule', graph=molg) fixIons(molg) list(g.add(t) for t in molg) a4 = check_chebis(g) replacedBy = ug.expand('replacedBy:') deads = {s: o for s, o in cd.subject_objects(replacedBy)} def switch_dead(g): ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl')) for f, r in deads.items(): ng.replace_uriref(f, r) ng.add_node(r, 'oboInOwl:hasAlternateId', rdflib.Literal(f, datatype=rdflib.XSD.string)) g.remove( (r, replacedBy, r)) # in case the replaced by was already in switch_dead(g) switch_dead(cg) switch_dead(chemg) switch_dead(molg) def fixHasAltId(g): ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'BIRNANN')) ng.replace_uriref('NIFCHEM:hasAlternativeId', 'oboInOwl:hasAlternativeId') ng.replace_uriref('BIRNANN:ChEBIid', 'oboInOwl:id') list(map(fixHasAltId, (g, cg, chemg))) def fixAltIdIsURIRef(g): hai = ug.expand('oboInOwl:hasAlternativeId') i = ug.expand('oboInOwl:id') makeGraph('', graph=g, prefixes=makePrefixes( 'CHEBI')) # amazlingly sometimes this is missing... 
def inner(s, p, o): if type(o) == rdflib.URIRef: qn = g.namespace_manager.qname(o) g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string))) if 'ns' in qn: print('WARNING UNKNOWN NAMESPACE BEING SHORTENED', str(o), qn) g.remove((s, p, o)) for s, o in g.subject_objects(hai): inner(s, hai, o) for s, o in g.subject_objects(i): inner(s, i, o) list(map(fixAltIdIsURIRef, (g, cg, chemg))) matches = [_ for _ in zip(a1, a2, a3, a4)] changed = [len(set(_)) != 1 for _ in matches] review = [(id_, m) for id_, changed, m in zip(ids, changed, matches) if changed and m[0]] # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython wat_c = [ set([(s, str(o.toPython())) for s, p, o in cg.triples((u, None, None))]) for u, _ in review ] wat_a = [ set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))]) for u, _ in review ] wat_c_ = [ set(cg.triples((u, None, None))) for u, _ in review ] # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython wat_a_ = [ set(g.triples((u, None, None))) for u, _ in review ] # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython diff = [a - c for a, c in zip(wat_a, wat_c)] diff_ = [a - c for a, c in zip(wat_a_, wat_c_)] cb = createOntology( 'chebi-bridge', 'NIF ChEBI bridge', makePrefixes('CHEBI', 'BFO1SNAP', 'owl', 'skos', 'dc', 'hasRole', 'NIFCHEM', 'oboInOwl', 'NIFMOL', 'OBOANN', 'BIRNANN'), 'chebibridge', ('This bridge file contains additional annotations' ' on top of CHEBI identifiers that were originally' ' included in NIF-Chemical or NIF-Molecule that have' ' not since been added to CHEBI upstream'), path='ttl/bridge/', #imports=('https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebislim.ttl', #'https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebi-dead.ttl')) imports=( 'http://ontology.neuinfo.org/NIF/ttl/generated/chebislim.ttl', 'http://ontology.neuinfo.org/NIF/ttl/generated/chebi-dead.ttl')) out = [] for set_ in diff: for sub, string in sorted(set_): for t in g.triples((sub, None, None)): # please not that this process will do things like remove hasStreenName ectasy from CHEBI:1391 since chebislim has it listed as a synonym py = t[-1].toPython() if py == string and not py.startswith( 'ub' ): # ignore restrictions... this is safe because nifmol and nifchem dont have any restrictions... cb.add_recursive(t, g) cb.add_class( sub ) # only need to go at the end because sub is the same for each set def hasImplicitSuperclass(s, o): for super_ in cg.objects(s, rdflib.RDFS.subClassOf): if super_ == o: return True elif hasImplicitSuperclass(super_, o): return True # curation decisions after review (see outtc for full list) curatedOut = [] def curateOut(*t): curatedOut.append( tuple( ug.expand(_) if type(_) is not rdflib.Literal else _ for _ in t)) cb.del_trip(*t) curateOut( 'CHEBI:6887', 'rdfs:subClassOf', 'CHEBI:23367' ) # defer to the chebi choice of chemical substance over molecular entity since it is classified as a racemate which doesn't quite match the mol ent def curateOut( 'CHEBI:26519', 'rdfs:subClassOf', 'CHEBI:24870' ) # some ions may also be free radicals, but all free radicals are not ions! #natural product removal since natural product should probably be a role if anything... 
curateOut('CHEBI:18059', 'rdfs:subClassOf', 'CHEBI:33243') curateOut('CHEBI:24921', 'rdfs:subClassOf', 'CHEBI:33243') curateOut('CHEBI:37332', 'rdfs:subClassOf', 'CHEBI:33243') curateOut('CHEBI:50906', 'rdfs:label', rdflib.Literal('Chemical role', datatype=rdflib.XSD.string) ) # chebi already has a chemical role... curateOut( 'CHEBI:22586', 'rdfs:subClassOf', 'CHEBI:24432' ) # antioxidant is already modelled as a chemical role instead of a biological role, the distinction is that the biological roles affect biological processes/property, not chemical processes/property curateOut('CHEBI:22720', 'rdfs:subClassOf', 'CHEBI:27171') # not all children are bicyclic curateOut( 'CHEBI:23447', 'rdfs:subClassOf', 'CHEBI:17188' ) # this one seems obviously flase... all cyclic nucleotides are not nucleoside 5'-monophosphate... curateOut( 'CHEBI:24922', 'rdfs:subClassOf', 'CHEBI:27171' ) # not all children are bicyclic, some may be poly, therefore removing curateOut( 'CHEBI:48706', 'rdfs:subClassOf', 'CHEBI:33232' ) # removing since antagonist is more incidental and pharmacological role is more appropriate (as chebi has it) curateOut('CHEBI:51064', 'rdfs:subClassOf', 'CHEBI:35338') # removing since chebi models this with has part curateOut( 'CHEBI:8247', 'rdfs:subClassOf', 'CHEBI:22720' ) # the structure is 'fused to' a benzo, but it is not a benzo, chebi has the correct #curateOut('CHEBI:9463', 'rdfs:subClassOf', 'CHEBI:50786') # not sure what to make of this wikipedia says one thing, but chebi says another, very strange... not an anabolic agent?!??! wat no idea # review hold over subClassOf statements intc = [] outtc = [] for s, o in cb.g.subject_objects(rdflib.RDFS.subClassOf): if str( o ) == 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#_birnlex_retired_class': # we need to remove any of the cases where deprecation was misused cb.g.remove((s, rdflib.RDFS.subClassOf, o)) elif hasImplicitSuperclass(s, o): cb.g.remove((s, rdflib.RDFS.subClassOf, o)) intc.append((s, rdflib.RDFS.subClassOf, o)) else: outtc.append((s, rdflib.RDFS.subClassOf, o)) def qname(trips): return tuple( tuple(cb.g.namespace_manager.qname(_) for _ in t) for t in trips) for a, p, b in sorted(qname(outtc)): if 'NIFMOL' in b: continue # not considering cases where NIFMOL/NIFCHEM ids are used, that can come later s = sgv.findById(a) o = sgv.findById(b) if s is None or o is None: print(a, '=>', s) print(b, '=>', o) else: print(s['labels'], s['curie']) print('subClassOf') print(o['labels'], o['curie']) print((a, p, b)) print('---------------------') cb.write( ) # re-add only the missing edges so that we can zap them from NIF-Molecule and NIF-Chemical (recurse is needed...) 
    # validation
    diff2 = set(cb.g) - set(cg)
    diff3 = set(cb.g) - diff2  # should just be all the owl:Class entries
    diff4 = set(cb.g) - set(chemg) | set(cb.g) - set(molg)  # not informative
    diff5 = set(cb.g) - diff4  # not informative
    both = set(chemg) & set(molg)  # there is no overlap beyond the owl:Class declarations

    def getChebis(set_):
        return set(t for t in set_ if 'CHEBI_' in t[0])

    def nodt(graph):
        return set((s, str(o) if type(o) is rdflib.Literal else o) for s, p, o in graph)

    cmc = getChebis(((((nodt(chemg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) - nodt(curatedOut))
    cmc = sorted(t for s, o in cmc for t in chemg.triples((s, None, o)))
    mmc = getChebis(((((nodt(molg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) - nodt(curatedOut))
    mmc = sorted(t for s, o in mmc for t in molg.triples((s, None, o)))

    # remove chebi classes from nifchem and nifmol
    def remstuff(sources, targets):
        for source in sources:
            for id_ in source.subjects(rdflib.RDF.type, rdflib.OWL.Class):
                for target in targets:
                    target.del_class(id_)

    remstuff((cg, cd), (chemgg, molgg))

    chemgg.write()
    molgg.write()

    embed()
def swanson(): """ not really a parcellation scheme """ source = 'resources/swanson_aligned.txt' ONT_PATH = GENERATED filename = 'swanson_hierarchies' ontid = ONT_PATH + filename + '.ttl' PREFIXES = makePrefixes('', 'ilx', 'owl', 'skos', 'NIFRID', 'ILXREPLACE') PREFIXES.update({ #'':ontid + '/', # looking for better options 'SWAN': interlex_namespace('swanson/nt/term'), 'SWAA': interlex_namespace('swanson/nt/appendix'), }) new_graph = makeGraph(filename, PREFIXES, writeloc=WRITELOC) new_graph.add_ont( ontid, 'Swanson brain partomies', 'Swanson 2014 Partonomies', 'This file is automatically generated from ' + source + '.' + NOTICE, TODAY) # FIXME citations should really go on the ... anatomy? scheme artifact definingCitation = 'Swanson, Larry W. Neuroanatomical Terminology: a lexicon of classical origins and historical foundations. Oxford University Press, USA, 2014.' definingCitationID = 'ISBN:9780195340624' new_graph.add_trip(ontid, 'NIFRID:definingCitation', definingCitation) new_graph.add_trip(ontid, 'NIFRID:definingCitationID', definingCitationID) with open(source, 'rt') as f: lines = [l.strip() for l in f.readlines()] # join header on page 794 lines[635] += ' ' + lines.pop(636) #fix for capitalization since this header is reused fixed = ' or '.join([ ' ('.join([n.capitalize() for n in _.split(' (')]) for _ in lines[635].lower().split(' or ') ]).replace('human', 'HUMAN') lines[635] = fixed data = [] for l in lines: if not l.startswith('#'): level = l.count('.' * 5) l = l.strip('.') if ' (' in l: if ') or' in l: n1, l = l.split(') or') area_name, citationP = n1.strip().split(' (') citation = citationP.rstrip(')') d = (level, area_name, citation, 'NEXT SYN') data.append(d) #print(tc.red(tc.bold(repr(d)))) area_name, citationP = l.strip().split(' (') citation = citationP.rstrip(')') else: area_name = l citation = None d = (level, area_name, citation, None) #print(d) data.append(d) results = async_getter(sgv.findByTerm, [(d[1], ) for d in data]) #results = [None] * len(data) curies = [[r['curie'] for r in _ if 'UBERON' in r['curie']] if _ else [] for _ in results] output = [_[0] if _ else None for _ in curies] header = ['Depth', 'Name', 'Citation', 'NextSyn', 'Uberon'] zoop = [header] + [r for r in zip(*zip(*data), output)] + \ [(0, 'Appendix END None', None, None, None)] # needed to add last appendix class SP(rowParse): def __init__(self): self.nodes = defaultdict(dict) self._appendix = 0 self.appendicies = {} self._last_at_level = {} self.names = defaultdict(set) self.children = defaultdict(set) self.parents = defaultdict(set) self.next_syn = False super().__init__(zoop) def Depth(self, value): if self.next_syn: self.synonym = self.next_syn else: self.synonym = False self.depth = value def Name(self, value): self.name = value def Citation(self, value): self.citation = value def NextSyn(self, value): if value: self.next_syn = self._rowind else: self.next_syn = False def Uberon(self, value): self.uberon = value def _row_post(self): # check if we are in the next appendix # may want to xref ids between appendicies as well... 
if self.depth == 0: if self.name.startswith('Appendix'): if self._appendix: self.appendicies[self._appendix]['children'] = dict( self.children) self.appendicies[self._appendix]['parents'] = dict( self.parents) self._last_at_level = {} self.children = defaultdict(set) self.parents = defaultdict(set) _, num, apname = self.name.split(' ', 2) if num == 'END': return self._appendix = int(num) self.appendicies[self._appendix] = { 'name': apname.capitalize(), 'type': self.citation.capitalize() if self.citation else None } return else: if ' [' in self.name: name, taxonB = self.name.split(' [') self.name = name self.appendicies[self._appendix][ 'taxon'] = taxonB.rstrip(']').capitalize() else: # top level is animalia self.appendicies[ self._appendix]['taxon'] = 'ANIMALIA'.capitalize() self.name = self.name.capitalize() self.citation = self.citation.capitalize() # nodes if self.synonym: self.nodes[self.synonym]['synonym'] = self.name self.nodes[self.synonym]['syn-cite'] = self.citation self.nodes[self.synonym]['syn-uberon'] = self.uberon return else: if self.citation: # Transverse Longitudinal etc all @ lvl4 self.names[self.name + ' ' + self.citation].add( self._rowind) else: self.name += str(self._appendix) + self.nodes[ self._last_at_level[self.depth - 1]]['label'] #print(level, self.name) # can't return here because they are their own level # replace with actually doing something... self.nodes[self._rowind]['label'] = self.name self.nodes[self._rowind]['citation'] = self.citation self.nodes[self._rowind]['uberon'] = self.uberon # edges self._last_at_level[self.depth] = self._rowind # TODO will need something to deal with the Lateral/ if self.depth > 0: try: parent = self._last_at_level[self.depth - 1] except: embed() self.children[parent].add(self._rowind) self.parents[self._rowind].add(parent) def _end(self): replace = {} for asdf in [ sorted(n) for k, n in self.names.items() if len(n) > 1 ]: replace_with, to_replace = asdf[0], asdf[1:] for r in to_replace: replace[r] = replace_with for r, rw in replace.items(): #print(self.nodes[rw]) o = self.nodes.pop(r) #print(o) for vals in self.appendicies.values(): children = vals['children'] parents = vals['parents'] # need reversed so children are corrected before swap for r, rw in reversed(sorted(replace.items())): if r in parents: child = r new_child = rw parent = parents.pop(child) parents[new_child] = parent parent = list(parent)[0] children[parent].remove(child) children[parent].add(new_child) if r in children: parent = r new_parent = rw childs = children.pop(parent) children[new_parent] = childs for child in childs: parents[child] = {new_parent} self.nodes = dict(self.nodes) sp = SP() tp = [ _ for _ in sorted([ '{: <50}'.format(n['label']) + n['uberon'] if n['uberon'] else n['label'] for n in sp.nodes.values() ]) ] #print('\n'.join(tp)) #print(sp.appendicies[1].keys()) #print(sp.nodes[1].keys()) nbase = PREFIXES['SWAN'] + '%s' json_ = {'nodes': [], 'edges': []} parent = ILXREPLACE('swansonBrainRegionConcept') for node, anns in sp.nodes.items(): nid = nbase % node new_graph.add_class(nid, parent, label=anns['label']) new_graph.add_trip(nid, 'NIFRID:definingCitation', anns['citation']) json_['nodes'].append({'lbl': anns['label'], 'id': 'SWA:' + str(node)}) #if anns['uberon']: #new_graph.add_trip(nid, owl.equivalentClass, anns['uberon']) # issues arrise here... 
    for appendix, data in sp.appendicies.items():
        aid = PREFIXES['SWAA'] + str(appendix)
        new_graph.add_class(aid, label=data['name'].capitalize())
        new_graph.add_trip(aid, 'ilx:hasTaxonRank', data['taxon'])  # FIXME appendix is the data artifact...
        children = data['children']
        ahp = HASPART + str(appendix)
        apo = PARTOF + str(appendix)
        new_graph.add_op(ahp, transitive=True)
        new_graph.add_op(apo, inverse=ahp, transitive=True)
        for parent, childs in children.items():  # FIXME does this give complete coverage?
            pid = nbase % parent
            for child in childs:
                cid = nbase % child
                new_graph.add_hierarchy(cid, ahp, pid)  # note hierarchy inverts direction
                new_graph.add_hierarchy(pid, apo, cid)
                json_['edges'].append({'sub': 'SWA:' + str(child),
                                       'pred': apo,
                                       'obj': 'SWA:' + str(parent)})

    new_graph.write()
    if False:
        Query = namedtuple('Query', ['root', 'relationshipType', 'direction', 'depth'])
        mapping = (1, 1, 1, 1, 30, 83, 69, 70, 74, 1)  # should generate?
        for i, n in enumerate(mapping):
            a, b = creatTree(*Query('SWA:' + str(n), 'ilx:partOf' + str(i + 1), 'INCOMING', 10), json=json_)
            print(a)
    return ontid, None
def swanson(): """ not really a parcellation scheme """ ONT_PATH = 'http://ontology.neuinfo.org/NIF/ttl/generated/' filename = 'swanson_hierarchies' ontid = ONT_PATH + filename + '.ttl' PREFIXES = makePrefixes('ilx', 'owl', 'OBOANN', 'UBERON') PREFIXES.update({ '':ontid + '/', # looking for better options 'SWAN':'http://swanson.org/node/', 'SWAA':'http://swanson.org/appendix/', }) new_graph = makeGraph(filename, PREFIXES, writeloc='/tmp/parc/') new_graph.add_ont(ontid, 'Swanson brain partomies', 'Swanson 2014 Partonomies', 'This file is automatically generated from....', TODAY) with open('resources/swanson_aligned.txt', 'rt') as f: lines = [l.strip() for l in f.readlines()] # join header on page 794 lines[635] += ' ' + lines.pop(636) #fix for capitalization since this header is reused fixed = ' or '.join([' ('.join([n.capitalize() for n in _.split(' (')]) for _ in lines[635].lower().split(' or ')]).replace('human','HUMAN') lines[635] = fixed data = [] for l in lines: if not l.startswith('#'): level = l.count('.'*5) l = l.strip('.') if ' (' in l: if ') or' in l: n1, l = l.split(') or') area_name, citationP = n1.strip().split(' (') citation = citationP.rstrip(')') d = (level, area_name, citation, 'NEXT SYN') data.append(d) #print(tc.red(tc.bold(repr(d)))) area_name, citationP = l.strip().split(' (') citation = citationP.rstrip(')') else: area_name = l citation = None d = (level, area_name, citation, None) #print(d) data.append(d) results = async_getter(sgv.findByTerm, [(d[1],) for d in data]) #results = [None] * len(data) curies = [[r['curie'] for r in _ if 'UBERON' in r['curie']] if _ else [] for _ in results] output = [_[0] if _ else None for _ in curies] header = ['Depth', 'Name', 'Citation', 'NextSyn', 'Uberon'] zoop = [header] + [r for r in zip(*zip(*data), output)] + \ [(0, 'Appendix END None', None, None, None)] # needed to add last appendix class SP(rowParse): def __init__(self): self.nodes = defaultdict(dict) self._appendix = 0 self.appendicies = {} self._last_at_level = {} self.names = defaultdict(set) self.children = defaultdict(set) self.parents = defaultdict(set) self.next_syn = False super().__init__(zoop) def Depth(self, value): if self.next_syn: self.synonym = self.next_syn else: self.synonym = False self.depth = value def Name(self, value): self.name = value def Citation(self, value): self.citation = value def NextSyn(self, value): if value: self.next_syn = self._rowind else: self.next_syn = False def Uberon(self, value): self.uberon = value def _row_post(self): # check if we are in the next appendix # may want to xref ids between appendicies as well... 
if self.depth == 0: if self.name.startswith('Appendix'): if self._appendix: self.appendicies[self._appendix]['children'] = dict(self.children) self.appendicies[self._appendix]['parents'] = dict(self.parents) self._last_at_level = {} self.children = defaultdict(set) self.parents = defaultdict(set) _, num, apname = self.name.split(' ', 2) if num == 'END': return self._appendix = int(num) self.appendicies[self._appendix] = { 'name':apname.capitalize(), 'type':self.citation.capitalize() if self.citation else None} return else: if ' [' in self.name: name, taxonB = self.name.split(' [') self.name = name self.appendicies[self._appendix]['taxon'] = taxonB.rstrip(']').capitalize() else: # top level is animalia self.appendicies[self._appendix]['taxon'] = 'ANIMALIA'.capitalize() self.name = self.name.capitalize() self.citation = self.citation.capitalize() # nodes if self.synonym: self.nodes[self.synonym]['synonym'] = self.name self.nodes[self.synonym]['syn-cite'] = self.citation self.nodes[self.synonym]['syn-uberon'] = self.uberon return else: if self.citation: # Transverse Longitudinal etc all @ lvl4 self.names[self.name + ' ' + self.citation].add(self._rowind) else: self.name += str(self._appendix) + self.nodes[self._last_at_level[self.depth - 1]]['label'] #print(level, self.name) # can't return here because they are their own level # replace with actually doing something... self.nodes[self._rowind]['label'] = self.name self.nodes[self._rowind]['citation'] = self.citation self.nodes[self._rowind]['uberon'] = self.uberon # edges self._last_at_level[self.depth] = self._rowind # TODO will need something to deal with the Lateral/ if self.depth > 0: try: parent = self._last_at_level[self.depth - 1] except: embed() self.children[parent].add(self._rowind) self.parents[self._rowind].add(parent) def _end(self): replace = {} for asdf in [sorted(n) for k,n in self.names.items() if len(n) > 1]: replace_with, to_replace = asdf[0], asdf[1:] for r in to_replace: replace[r] = replace_with for r, rw in replace.items(): #print(self.nodes[rw]) o = self.nodes.pop(r) #print(o) for vals in self.appendicies.values(): children = vals['children'] parents = vals['parents'] # need reversed so children are corrected before swap for r, rw in reversed(sorted(replace.items())): if r in parents: child = r new_child = rw parent = parents.pop(child) parents[new_child] = parent parent = list(parent)[0] children[parent].remove(child) children[parent].add(new_child) if r in children: parent = r new_parent = rw childs = children.pop(parent) children[new_parent] = childs for child in childs: parents[child] = {new_parent} self.nodes = dict(self.nodes) sp = SP() tp = [_ for _ in sorted(['{: <50}'.format(n['label']) + n['uberon'] if n['uberon'] else n['label'] for n in sp.nodes.values()])] #print('\n'.join(tp)) #print(sp.appendicies[1].keys()) #print(sp.nodes[1].keys()) nbase = 'http://swanson.org/node/%s' json_ = {'nodes':[],'edges':[]} for node, anns in sp.nodes.items(): nid = nbase % node new_graph.add_class(nid, 'ilx:swansonBrainRegionConcept', label=anns['label']) new_graph.add_node(nid, 'OBOANN:definingCitation', anns['citation']) json_['nodes'].append({'lbl':anns['label'],'id':'SWA:' + str(node)}) #if anns['uberon']: #new_graph.add_node(nid, rdflib.OWL.equivalentClass, anns['uberon']) # issues arrise here... 
    for appendix, data in sp.appendicies.items():
        aid = 'http://swanson.org/appendix/%s' % appendix
        new_graph.add_class(aid, label=data['name'].capitalize())
        new_graph.add_node(aid, 'ilx:hasTaxonRank', data['taxon'])  # FIXME appendix is the data artifact...
        children = data['children']
        ahp = HASPART + str(appendix)
        apo = PARTOF + str(appendix)
        new_graph.add_op(ahp, transitive=True)
        new_graph.add_op(apo, inverse=ahp, transitive=True)
        for parent, childs in children.items():  # FIXME does this give complete coverage?
            pid = nbase % parent
            for child in childs:
                cid = nbase % child
                new_graph.add_hierarchy(cid, ahp, pid)  # note hierarchy inverts direction
                new_graph.add_hierarchy(pid, apo, cid)
                json_['edges'].append({'sub': 'SWA:' + str(child), 'pred': apo, 'obj': 'SWA:' + str(parent)})

    new_graph.write(convert=False)
    if False:
        Query = namedtuple('Query', ['root', 'relationshipType', 'direction', 'depth'])
        mapping = (1, 1, 1, 1, 30, 83, 69, 70, 74, 1)  # should generate?
        for i, n in enumerate(mapping):
            a, b = creatTree(*Query('SWA:' + str(n), 'ilx:partOf' + str(i + 1), 'INCOMING', 10), json=json_)
            print(a)
    return ontid, None
class HCP(genericPScheme):
    source = 'resources/human_connectome_project_2016.csv'
    ont = OntMeta(PARC,
                  'hcp_parcellation',
                  ('Human Connectome Project Multi-Modal '
                   'human cortical parcellation'),
                  'HCP-MMP1.0',
                  'This file is automatically generated from ' + source + '.' + NOTICE,
                  TODAY)
    concept = PScheme(ILXREPLACE(ont.name),
                      'HCP parcellation concept',
                      'NCBITaxon:9606',
                      ADULT)
    atlas = PSArtifact(ILXREPLACE(ont.name + 'atlas'),
                       'Human Connectome Project Multi-Modal human cortical parcellation',
                       '1.0',
                       '20-07-2016',  # d-m-y
                       'awaiting...',
                       'doi:10.1038/nature18933',
                       ('Human Connectome Project Multi-Modal Parcellation',
                        'HCP Multi-Modal Parcellation',
                        'Human Connectome Project Multi-Modal Parcellation version 1.0'),
                       ('HCP_MMP', ont.shortname))  # see also https://balsa.wustl.edu/study/show/RVVG
    PREFIXES = makePrefixes('NIFRID')
    PREFIXES['HCPMMP'] = interlex_namespace('hcpmmp/labels')

    @classmethod
    def datagetter(cls):
        with open(cls.source, 'rt') as f:
            data = [r for r in csv.reader(f)]
        return data

    @classmethod
    def dataproc(cls, graph, data):

        class hcp2016(rowParse):
            def Parcellation_Index(self, value):
                self.id_ = value
                self.id_ = 'HCPMMP:' + value  # safe because reset every row (ish)
                graph.add_class(self.id_, cls.concept.curie)

            def Area_Name(self, value):
                value = value.strip()
                graph.add_trip(self.id_, ACRONYM, value)

            def Area_Description(self, value):
                value = value.strip()
                graph.add_trip(self.id_, rdfs.label, '(%s) ' % cls.ont.shortname + value)
                graph.add_trip(self.id_, PARCLAB, value)

            def Newly_Described(self, value):
                if value == 'Yes*' or value == 'Yes':
                    graph.add_trip(self.id_, 'NIFRID:definingCitation', 'Glasser and Van Essen 2016')

            def Results_Sections(self, value):
                pass

            def Other_Names(self, value):
                for name in value.split(','):
                    name = name.strip()
                    if name:
                        if len(name) <= 3:
                            graph.add_trip(self.id_, ACRONYM, name)
                        else:
                            graph.add_trip(self.id_, SYNONYM, name)

            def Key_Studies(self, value):
                for study in value.split(','):
                    study = study.strip()
                    if study:
                        graph.add_trip(self.id_, 'NIFRID:definingCitation', study)

        hcp2016(data)
    kwargs = {
        'uberon_id': uid,
        'uberon_label': uberon_labs[uid],
        'aba_id': aid,
        'aba_label': abalabs[aid],
        'aba_syns': '\n'.join(sorted(abasyns[aid] + abaacro[aid])),
        'uberon_syns': '\n'.join(insert_uberon)
    }
    return to_format.format(**kwargs)

#text = '\n\n'.join([make_record(uid, aid[0]) for uid, aid in sorted(u_a_map.items()) if aid])
#with open('aba_uberon_syn_review.txt', 'wt') as f:
    #f.write(text)

print('total uberon terms checked:', len(uberon_labs))
print('total aba terms: ', len(abalabs))
print('total uberon with aba xref:', len([a for a in u_a_map.values() if a]))

ubridge = createOntology('uberon-parcellation-mappings',
                         'Uberon Parcellation Mappings',
                         makePrefixes('owl', 'ilx', 'UBERON', 'MBA'))
for u, arefs in u_a_map.items():
    if arefs:
        # TODO check for bad assumptions here
        ubridge.add_trip(u, 'ilx:delineatedBy', arefs[0])
        ubridge.add_trip(arefs[0], 'ilx:delineates', u)
ubridge.write()
embed()
def chebi_make(): PREFIXES = makePrefixes('definition', 'hasRole', 'BFO', 'CHEBI', 'owl', 'skos', 'oboInOwl') dPREFIXES = makePrefixes('CHEBI', 'replacedBy', 'owl', 'skos') ug = makeGraph('utilgraph', prefixes=PREFIXES) IDS_FILE = 'resources/chebi-subset-ids.txt' with open(IDS_FILE, 'rt') as f: ids_raw = set((_.strip() for _ in f.readlines())) ids = set((ug.expand(_.strip()).toPython() for _ in ids_raw)) #gzed = requests.get('http://localhost:8000/chebi.owl') #raw = BytesIO(gzed.content) gzed = requests.get( 'http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/nightly/chebi.owl.gz' ) raw = BytesIO(gzip.decompress(gzed.content)) t = etree.parse(raw) r = t.getroot() cs = r.getchildren() classes = [ _ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class' and _.values()[0] in ids ] ontology = t.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']") ops = t.xpath( "/*[local-name()='RDF']/*[local-name()='ObjectProperty']") # TODO wanted = [etree.ElementTree(_) for _ in classes] rpl_check = t.xpath( "/*[local-name()='RDF']/*[local-name()='Class']/*[local-name()='hasAlternativeId']" ) rpl_dict = { _.text: _.getparent() for _ in rpl_check if _.text in ids_raw } # we also need to have any new classes that have replaced old ids also_classes = list(rpl_dict.values()) def rec(start_set, done): ids_ = set() for c in start_set: ids_.update([ _.items()[0][1] for _ in etree.ElementTree(c).xpath( "/*[local-name()='Class']/*[local-name()='subClassOf']") if _.items() ]) ids_.update([ _.items()[0][1] for _ in etree.ElementTree(c).xpath( "/*[local-name()='Class']/*[local-name()='subClassOf']/*[local-name()='Restriction']/*[local-name()='someValuesFrom']" ) if _.items() ]) supers = [ _ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class' and _.values()[0] in ids_ and _ not in done ] if supers: msup, mids = rec(supers, done + supers) supers += msup ids_.update(mids) return supers, ids_ a = ontology + ops + classes + also_classes more, mids = rec(a, a) all_ = set(a + more) r.clear() # wipe all the stuff we don't need for c in all_: r.append(c) data = etree.tostring(r) g = rdflib.Graph() g.parse( data=data ) # now _this_ is stupidly slow (like 20 minutes of slow) might make more sense to do the xml directly? src_version = list( g.query( 'SELECT DISTINCT ?match WHERE { ?temp rdf:type owl:Ontology . ?temp owl:versionIRI ?match . }' ))[0][0] new_graph = createOntology( 'chebislim', 'NIF ChEBI slim', PREFIXES, 'chebislim', 'This file is generated by pyontutils/slimgen from the full ChEBI nightly at versionIRI %s based on the list of terms in %s.' % (src_version, IDS_FILE), remote_base='http://ontology.neuinfo.org/NIF/') chebi_dead = createOntology( 'chebi-dead', 'NIF ChEBI deprecated', dPREFIXES, 'chebidead', 'This file is generated by pyontutils/slimgen to make deprecated classes resolvablefrom the full ChEBI nightly at versionIRI %s based on the list of terms in %s.' % (src_version, IDS_FILE), remote_base='http://ontology.neuinfo.org/NIF/') depwor = { 'CHEBI:33243': 'natural product', # FIXME remove these? 
'CHEBI:36809': 'tricyclic antidepressant', } for id_ in sorted( set(ids_raw) | set((ug.g.namespace_manager.qname(_) for _ in mids))): eid = ug.expand(id_) trips = list(g.triples((eid, None, None))) if not trips: #looks for the id_ as a literal alts = list( g.triples(( None, rdflib.term.URIRef( 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId' ), rdflib.Literal( id_, datatype=rdflib.term.URIRef( 'http://www.w3.org/2001/XMLSchema#string'))))) if alts: replaced_by, _, __ = alts[0] if replaced_by.toPython( ) not in ids: # we need to add any replacment classes to the bridge print('REPLACED BY NEW CLASS', id_) for t in g.triples((replaced_by, None, None)): new_graph.add_recursive(t, g) chebi_dead.add_class(id_) chebi_dead.add_node(id_, 'replacedBy:', replaced_by) chebi_dead.add_node(id_, rdflib.OWL.deprecated, True) else: if id_ not in depwor: raise BaseException('wtf error', id_) else: for trip in trips: new_graph.add_recursive(trip, g) # https://github.com/ebi-chebi/ChEBI/issues/3294 madness = new_graph.expand('oboInOwl:hasRelatedSynonym'), rdflib.Literal( '0', datatype=rdflib.namespace.XSD.string) for a in new_graph.g.subjects(*madness): new_graph.g.remove((a, ) + madness) new_graph.write() chebi_dead.write() embed()
from hierarchies import creatTree, flatten
from parcellation import OntMeta

sgg = Graph(cache=True, basePath='http://localhost:9000/scigraph')
sgv = Vocabulary(cache=True, basePath='http://localhost:9000/scigraph')

Query = namedtuple('Query', ['root', 'relationshipType', 'direction', 'depth'])

CON = 'http://www.geneontology.org/formats/oboInOwl#consider'
DBX = 'http://www.geneontology.org/formats/oboInOwl#hasDbXref'  # FIXME also behaves as objectProperty :/
AID = 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId'
IRBC = 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#isReplacedByClass'

PREFIXES = makePrefixes('UBERON',
                        'ro',
                        'owl',
                        'skos',
                        )
NIFPREFIXES = makePrefixes('NIFGA',
                           'oboInOwl',
                           'replacedBy',
                           )
NIFPREFIXES.update(PREFIXES)

nifga_path = os.path.expanduser('~/git/NIF-Ontology/ttl/NIF-GrossAnatomy.ttl')
uberon_path = os.path.expanduser('~/git/NIF-Ontology/ttl/external/uberon.owl')
uberon_bridge_path = 'http://berkeleybop.org/ontologies/uberon/bridge/uberon-bridge-to-nifstd.owl'
#bridge_path = os.path.expanduser('~/git/NIF-Ontology/ttl/uberon-bridge-to-nifstd.ttl')  # scigraph's got us
    kwargs = {
        'uberon_id': uid,
        'uberon_label': uberon_labs[uid],
        'aba_id': aid,
        'aba_label': abalabs[aid],
        'aba_syns': '\n'.join(sorted(abasyns[aid] + abaacro[aid])),
        'uberon_syns': '\n'.join(insert_uberon)
    }
    return to_format.format(**kwargs)

text = '\n\n'.join([make_record(uid, aid[0]) for uid, aid in sorted(u_a_map.items()) if aid])
with open('aba_uberon_syn_review.txt', 'wt') as f:
    f.write(text)

print('total uberon terms checked:', len(uberon_labs))
print('total aba terms: ', len(abalabs))
print('total uberon with aba xref:', len([a for a in u_a_map.values() if a]))

ubridge = makeGraph('uberon-parcellation-mappings', prefixes=makePrefixes('ilx', 'UBERON', 'MBA'))
for u, arefs in u_a_map.items():
    if arefs:
        # TODO check for bad assumptions here
        ubridge.add_node(u, 'ilx:delineatedBy', arefs[0])
        ubridge.add_node(arefs[0], 'ilx:delineates', u)
ubridge.write()
embed()
def chebi_imp(): PREFIXES = makePrefixes('definition', 'hasRole', 'CHEBI', 'owl', 'skos', 'oboInOwl') ug = makeGraph('utilgraph', prefixes=PREFIXES) with open('chebi-subset-ids.txt', 'rt') as f: ids_raw = set((_.strip() for _ in f.readlines())) ids = sorted(set((ug.expand(_.strip()) for _ in ids_raw))) def check_chebis(g): a = [] for id_ in ids: l = sorted(g.triples((id_, None, None))) ll = len(l) a.append(ll) return a g = rdflib.Graph() cg = rdflib.Graph() chemg = rdflib.Graph() molg = rdflib.Graph() g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl', format='turtle') cg.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebislim.ttl', format='turtle') a1 = check_chebis(g) g.parse('/home/tom/git/NIF-Ontology/ttl/generated/chebi-dead.ttl', format='turtle') a2 = check_chebis(g) g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl', format='turtle') chemg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Chemical.ttl', format='turtle') a3 = check_chebis(g) g.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl', format='turtle') molg.parse('/home/tom/git/NIF-Ontology/ttl/NIF-Molecule.ttl', format='turtle') a4 = check_chebis(g) matches = [_ for _ in zip(a1, a2, a3, a4)] changed = [len(set(_)) != 1 for _ in matches] review = [(id_, m) for id_, changed, m in zip(ids, changed, matches) if changed and m[0]] # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython wat_c = [set([(s, str(o.toPython())) for s, p, o in cg.triples((u, None, None))]) for u, _ in review] wat_a = [set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))]) for u, _ in review] wat_c_ = [set(cg.triples((u, None, None))) for u, _ in review] # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython wat_a_ = [set(g.triples((u, None, None))) for u, _ in review] # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython diff = [a - c for a, c in zip(wat_a, wat_c)] diff_ = [a - c for a, c in zip(wat_a_, wat_c_)] cb = makeGraph('chebi-bridge', makePrefixes('CHEBI', 'owl', 'skos', 'dc', 'hasRole', 'NIFCHEM', 'NIFMOL', 'OBOANN', 'BIRNANN')) out = [] for set_ in diff: for sub, string in sorted(set_): for t in g.triples((sub, None, None)): py = t[-1].toPython() if py == string and not py.startswith('ub'): # ignore restrictions... this is safe because nifmol and nifchem dont have any restrictions... cb.add_recursive(t, g) cb.add_class(sub) # only need to go at the end because sub is the same for each set cb.write() # re-add only the missing edges so that we can zap them from NIF-Molecule and NIF-Chemical (recurse is needed...) embed()
def chebi_make(): PREFIXES = makePrefixes('definition', 'hasRole', 'CHEBI', 'owl', 'skos', 'oboInOwl') dPREFIXES = makePrefixes('CHEBI','replacedBy','owl','skos') ug = makeGraph('utilgraph', prefixes=PREFIXES) IDS_FILE = 'chebi-subset-ids.txt' with open(IDS_FILE, 'rt') as f: ids_raw = set((_.strip() for _ in f.readlines())) ids = set((ug.expand(_.strip()).toPython() for _ in ids_raw)) #gzed = requests.get('http://localhost:8000/chebi.owl') #raw = BytesIO(gzed.content) gzed = requests.get('http://ftp.ebi.ac.uk/pub/databases/chebi/ontology/nightly/chebi.owl.gz') raw = BytesIO(gzip.decompress(gzed.content)) t = etree.parse(raw) r = t.getroot() cs = r.getchildren() classes = [_ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class' and _.values()[0] in ids] ontology = t.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']") ops = t.xpath("/*[local-name()='RDF']/*[local-name()='ObjectProperty']") # TODO wanted = [etree.ElementTree(_) for _ in classes] rpl_check = t.xpath("/*[local-name()='RDF']/*[local-name()='Class']/*[local-name()='hasAlternativeId']") rpl_dict = {_.text:_.getparent() for _ in rpl_check if _.text in ids_raw } # we also need to have any new classes that have replaced old ids also_classes = list(rpl_dict.values()) def rec(start_set, done): ids_ = set() for c in start_set: ids_.update([_.items()[0][1] for _ in etree.ElementTree(c).xpath("/*[local-name()='Class']/*[local-name()='subClassOf']") if _.items()]) ids_.update([_.items()[0][1] for _ in etree.ElementTree(c).xpath("/*[local-name()='Class']/*[local-name()='subClassOf']/*[local-name()='Restriction']/*[local-name()='someValuesFrom']") if _.items()]) supers = [_ for _ in cs if _.tag == '{http://www.w3.org/2002/07/owl#}Class' and _.values()[0] in ids_ and _ not in done] if supers: msup, mids = rec(supers, done + supers) supers += msup ids_.update(mids) return supers, ids_ a = ontology + ops + classes + also_classes more, mids = rec(a, a) all_ = set(a + more) r.clear() # wipe all the stuff we don't need for c in all_: r.append(c) data = etree.tostring(r) g = rdflib.Graph() g.parse(data=data) # now _this_ is stupidly slow (like 20 minutes of slow) might make more sense to do the xml directly? src_version = list(g.query('SELECT DISTINCT ?match WHERE { ?temp rdf:type owl:Ontology . ?temp owl:versionIRI ?match . }'))[0][0] ont = OntMeta('http://ontology.neuinfo.org/NIF/ttl/generated/', 'chebislim', 'NIF ChEBI slim', 'chebislim', 'This file is generated by pyontutils/slimgen from the full ChEBI nightly at versionIRI %s based on the list of terms in %s.' % (src_version, IDS_FILE), TODAY) dont = OntMeta('http://ontology.neuinfo.org/NIF/ttl/generated/', 'chebi-dead', 'NIF ChEBI deprecated', 'chebidead', 'This file is generated by pyontutils/slimgen to make deprecated classes resolvablefrom the full ChEBI nightly at versionIRI %s based on the list of terms in %s.' % (src_version, IDS_FILE), TODAY) new_graph = makeGraph(ont.filename, PREFIXES) ontid = ont.path + ont.filename + '.ttl' new_graph.add_ont(ontid, *ont[2:]) chebi_dead = makeGraph(dont.filename, dPREFIXES) dontid = dont.path + dont.filename + '.ttl' chebi_dead.add_ont(dontid, *dont[2:]) depwor = {'CHEBI:33243':'natural product', # FIXME remove these? 
'CHEBI:36809':'tricyclic antidepressant', } for id_ in sorted(set(ids_raw) | set((ug.g.namespace_manager.qname(_) for _ in mids))): eid = ug.expand(id_) trips = list(g.triples((eid, None, None))) if not trips: #looks for the id_ as a literal alts = list(g.triples((None, rdflib.term.URIRef('http://www.geneontology.org/formats/oboInOwl#hasAlternativeId'), rdflib.Literal(id_, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))))) if alts: replaced_by, _, __ = alts[0] if replaced_by.toPython() not in ids: # we need to add any replacment classes to the bridge print('REPLACED BY NEW CLASS', id_) for t in g.triples((replaced_by, None, None)): new_graph.add_recursive(t, g) chebi_dead.add_class(id_) chebi_dead.add_node(id_, 'replacedBy:', replaced_by) chebi_dead.add_node(id_, rdflib.OWL.deprecated, True) else: if id_ not in depwor: raise BaseException('wtf error', id_) else: for trip in trips: new_graph.add_recursive(trip, g) new_graph.write() chebi_dead.write() embed()
#!/usr/bin/env python3.5
import rdflib
from utils import makePrefixes, makeGraph

PREFIXES = makePrefixes('NIFGA', 'NIFSTD', 'owl')

g = rdflib.Graph()
g.parse('http://purl.obolibrary.org/obo/uberon/bridge/uberon-bridge-to-nifstd.owl',
        format='xml')
name = 'NIFGA-Equivs'
ng = makeGraph(name, PREFIXES)
[ng.g.add(t) for t in
 ((rdflib.URIRef(PREFIXES['NIFGA'] + o.rsplit('/', 1)[-1]), p, o)
  for s, p, o in g.triples((None, rdflib.OWL.equivalentClass, None)))]
ng.add_ont('http://ontology.neuinfo.org/NIF/ttl/generated/' + name + '.ttl',
           'NIFGA to NIFSTD mappings')
ng.write()
#!/usr/bin/env python3.5
import os
from glob import glob
from rdflib.namespace import SKOS
from parcellation import OntMeta, TODAY
from utils import makeGraph, makePrefixes

PREFIXES = makePrefixes(
    "SCR", "MBA", "NIFMOL", "NIFNEURON", "NIFCELL", "NIFGA",
    "UBERON", "PR", "NIFNEURMOR", "skos", "owl"
)

ont = OntMeta(
    "http://ontology.neuinfo.org/NIF/ttl/generated/",
    "ksdesc-defs",
    "Knolwedge Space Defs",
    "KSDEFS",
    "Definitions from knowledge space descriptions. Generated by pyontutils/ksdesc_bridge.py",
    TODAY,
)
ontid = ont.path + ont.filename + ".ttl"

g = makeGraph(ont.filename, prefixes=PREFIXES)
g.add_ont(ontid, *ont[2:])

top_level = glob(os.path.expanduser("~/git/ksdesc/") + "*")

for putative_dir in top_level:
    if os.path.isdir(putative_dir):
        for putative_md in glob(putative_dir + "/*.md"):
            ident = os.path.split(putative_dir)[-1] + ":" + os.path.splitext(os.path.split(putative_md)[-1])[0]
#!/usr/bin/env python3.5
import rdflib
from utils import makePrefixes, makeGraph

PREFIXES = makePrefixes('NIFGA', 'NIFSTD', 'owl')

g = rdflib.Graph()
g.parse('http://purl.obolibrary.org/obo/uberon/bridge/uberon-bridge-to-nifstd.owl', format='xml')
name = 'NIFGA-Equivs'
ng = makeGraph(name, PREFIXES)
[ng.g.add(t) for t in ((rdflib.URIRef(PREFIXES['NIFGA'] + o.rsplit('/', 1)[-1]), p, o)
                       for s, p, o in g.triples((None, rdflib.OWL.equivalentClass, None)))]
ng.add_ont('http://ontology.neuinfo.org/NIF/ttl/generated/' + name + '.ttl', 'NIFGA to NIFSTD mappings')
ng.write()
        'uberon_id': uid,
        'uberon_label': uberon_labs[uid],
        'aba_id': aid,
        'aba_label': abalabs[aid],
        'aba_syns': '\n'.join(sorted(abasyns[aid] + abaacro[aid])),
        'uberon_syns': '\n'.join(insert_uberon)
    }
    return to_format.format(**kwargs)

text = '\n\n'.join(
    [make_record(uid, aid[0]) for uid, aid in sorted(u_a_map.items()) if aid])
with open('aba_uberon_syn_review.txt', 'wt') as f:
    f.write(text)

print('total uberon terms checked:', len(uberon_labs))
print('total aba terms: ', len(abalabs))
print('total uberon with aba xref:', len([a for a in u_a_map.values() if a]))

ubridge = makeGraph('uberon-parcellation-mappings',
                    prefixes=makePrefixes('ilx', 'UBERON', 'MBA'))
for u, arefs in u_a_map.items():
    if arefs:
        # TODO check for bad assumptions here
        ubridge.add_node(u, 'ilx:delineatedBy', arefs[0])
        ubridge.add_node(arefs[0], 'ilx:delineates', u)
ubridge.write()
embed()
expression_edge = 'ilx:hasExpressionPhenotype'
expression_defined = 'ilx:ExpressionClassifiedNeuron'
NIFCELL_NEURON = 'NIFCELL:sao1417703748'

syntax = '{region}{layer_or_subregion}{expression}{ephys}{molecular}{morph}{cellOrNeuron}'
ilx_base = 'ILX:{:0>7}'

PREFIXES = makePrefixes('ilx',
                        'ILX',
                        'ILXREPLACE',
                        'skos',
                        'owl',
                        'dc',
                        'nsu',
                        'NCBITaxon',
                        'oboInOwl',
                        'NIFRID',
                        'NIFQUAL',
                        'NIFCELL',
                        'NIFMOL',
                        'UBERON',
                        'PR',
                        )


def replace_object(find, replace, graph):  # note that this is not a sed 's/find/replace/g'
    find = graph.expand(find)
    for s, p, o in graph.g.triples((None, None, find)):
        graph.add_trip(s, p, replace)
#!/usr/bin/env python3.6
import os
from glob import glob
from rdflib.namespace import SKOS
from parcellation import OntMeta
from utils import TODAY, makeGraph, makePrefixes

PREFIXES = makePrefixes('SCR', 'MBA', 'NIFMOL', 'NIFNEURON', 'NIFCELL', 'NIFGA',
                        'UBERON', 'PR', 'NIFNEURMOR', 'skos', 'owl')

ont = OntMeta('http://ontology.neuinfo.org/NIF/ttl/generated/',
              'ksdesc-defs',
              'Knolwedge Space Defs',
              'KSDEFS',
              'Definitions from knowledge space descriptions. Generated by pyontutils/ksdesc_bridge.py',
              TODAY)
ontid = ont.path + ont.filename + '.ttl'

g = makeGraph(ont.filename, prefixes=PREFIXES)
g.add_ont(ontid, *ont[2:])

top_level = glob(os.path.expanduser('~/git/ksdesc/') + '*')

for putative_dir in top_level:
    if os.path.isdir(putative_dir):
        for putative_md in glob(putative_dir + '/*.md'):
            ident = os.path.split(putative_dir)[-1] + ':' + os.path.splitext(os.path.split(putative_md)[-1])[0]
            print(ident)
            with open(putative_md, 'rt') as f:
                def_ = f.read()
from hierarchies import creatTree, flatten
from parcellation import OntMeta, TODAY

sgg = Graph(cache=True, basePath='http://localhost:9000/scigraph')
sgv = Vocabulary(cache=True, basePath='http://localhost:9000/scigraph')

Query = namedtuple('Query', ['root', 'relationshipType', 'direction', 'depth'])

CON = 'http://www.geneontology.org/formats/oboInOwl#consider'
DBX = 'http://www.geneontology.org/formats/oboInOwl#hasDbXref'  # FIXME also behaves as objectProperty :/
AID = 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId'
IRBC = 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#isReplacedByClass'

PREFIXES = makePrefixes('UBERON',
                        'ro',
                        'owl',
                        'skos',
                        )
NIFPREFIXES = makePrefixes('NIFGA',
                           'oboInOwl',
                           'replacedBy',
                           )
NIFPREFIXES.update(PREFIXES)

nifga_path = os.path.expanduser('~/git/NIF-Ontology/ttl/NIF-GrossAnatomy.ttl')
uberon_path = os.path.expanduser('~/git/NIF-Ontology/ttl/external/uberon.owl')
uberon_bridge_path = 'http://berkeleybop.org/ontologies/uberon/bridge/uberon-bridge-to-nifstd.owl'
#bridge_path = os.path.expanduser('~/git/NIF-Ontology/ttl/uberon-bridge-to-nifstd.ttl')  # scigraph's got us

#uberon_obsolete = {'UBERON:0022988',  # obsolete regional part of thalamaus