def main():
    """Generate ttl/generated/doidslim.ttl.

    Extracts the DOID subtree rooted at DOID:4 ('disease') from the
    upstream doid.owl, keeping only named (non-bnode) superclasses,
    and writes the slim into the local ontology repo.
    """
    olr = auth.get_path('ontology-local-repo')
    ori = OntResIri('http://purl.obolibrary.org/obo/doid.owl')
    g = ori.graph
    # NOTE: whitespace inside the SPARQL string is not significant
    query = """
    SELECT DISTINCT ?s ?o ?l
    WHERE {
        ?s a owl:Class .
        ?s rdfs:subClassOf* <http://purl.obolibrary.org/obo/DOID_4> .
        ?s rdfs:subClassOf ?o .
        ?s rdfs:label ?l .
    }"""
    res = list(g.query(query))
    # drop rows whose superclass is an anonymous class (restriction etc.)
    filt = [r for r in res if not isinstance(r[1], rdflib.BNode)]
    spath = 'ttl/generated/doidslim.ttl'
    go = OntGraph(path=olr / spath)
    # TODO prov record like the one we have for chebi
    go.bind('DOID', 'http://purl.obolibrary.org/obo/DOID_')
    s = rdflib.URIRef('http://ontology.neuinfo.org/NIF/' + spath)
    # ontology header triples
    go.populate_from_triples(
        ((s, p, o) for p, o in
         ((rdf.type, owl.Ontology),
          (rdfs.label, rdflib.Literal("NIF DOID slim")),)))
    # the root class is not produced by the query, add it explicitly
    ds = rdflib.URIRef('http://purl.obolibrary.org/obo/DOID_4')
    go.add((ds, rdf.type, owl.Class))
    go.add((ds, rdfs.label, rdflib.Literal('disease')))
    go.populate_from_triples(
        (t for s, o, l in filt
         for t in ((s, rdf.type, owl.Class),
                   (s, rdfs.subClassOf, o),
                   (s, rdfs.label, l))))
    go.write()
def sneechReviewGraph(self, source_graph, namespaces, sneech_file=None,
                      path_out=None):
    """Build and return the review graph for a sneech run.

    Parameters
    ----------
    source_graph : graph to be sneeched
    namespaces : namespaces used to partition identifiers
    sneech_file : optional file whose metadata is added to the review graph
    path_out : optional output path for the review graph

    Returns
    -------
    (review_graph, maybe_sneeches) tuple.
    """
    # TODO cache
    (already, cannot, maybe, sneeches,
     maybe_sneeches) = self.preSneech(source_graph, namespaces)
    # TODO not entirely sure about the best place to put this ...
    self.reView(source_graph, maybe_sneeches)  # FIXME dump and commit
    review_graph = OntGraph(path=path_out)
    oq.OntCuries.populate(review_graph)
    review_graph.bind('snchn', str(snchn))  # FIXME -> curies probably
    review_graph.bind('sncho', str(sncho))  # FIXME -> curies probably
    review_graph.bind('h', str(sghashes))  # FIXME -> curies probably
    if sneech_file:
        sneech_file.populate(review_graph)

    gen = self.triples_review(already, cannot, maybe, sneeches, sneech_file)
    # plain loop: the former list comprehension built a throwaway list
    for t in gen:
        review_graph.add(t)

    # TODO hasReport -> maybe_sneeches report / reView
    # TODO snchn predicate ordering
    return review_graph, maybe_sneeches
def build_instances(self, instances, dids):
    """Write allen-cell-instances.ttl next to the configured out graph.

    Parameters
    ----------
    instances : iterable of triples describing the instances
    dids : distinct individuals, wrapped in an owl:AllDifferent axiom
    """
    folder = Path(self.config.out_graph_path()).parent
    # WOW do I need to implement the new/better way of
    # managing writing collections of neurons to graphs
    neuron_uri = next(NeuronACT.out_graph[:rdf.type:owl.Ontology])
    name = 'allen-cell-instances.ttl'
    base, _ = neuron_uri.rsplit('/', 1)
    uri = rdflib.URIRef(base + '/' + name)
    metadata = ((uri, rdf.type, owl.Ontology),)
    instance_graph = OntGraph(path=folder / name)
    instance_graph.bind('AIBSSPEC', AIBSSPEC)
    instance_graph.bind('npokb', npokb)
    # plain loops: the former list comprehensions built throwaway lists
    for triple_source in (metadata,
                          instances,
                          allDifferent(None, distinctMembers(*dids))):
        for t in triple_source:
            instance_graph.add(t)

    instance_graph.write()
def test_part_of(self):
    """UBERON:0008933 must produce at least one partOf triple via
    triples_simple."""
    eeeee = self.OntTerm('UBERON:0008933',
                         label='primary somatosensory cortex')
    # materialize once: the original iterated triples_simple twice, which
    # silently yields nothing on the second pass if it is a generator
    trips = list(eeeee.triples_simple)
    g = OntGraph()
    for t in trips:
        g.add(t)

    g.debug()
    po = [t for t in trips if partOf in t]
    assert po, 'sadness'
def normalize_prefixes(graph, curies):
    """Return a new OntGraph containing all triples of ``graph`` with
    curie prefixes normalized from ``curies``.

    Note: mutates ``curies`` (drops the empty prefix, forces rdf/rdfs).
    """
    new_graph = OntGraph()
    oc = OntCuries.new()
    curies.pop('', None)  # the empty prefix breaks curie handling
    curies['rdf'] = str(rdf)
    curies['rdfs'] = str(rdfs)
    oc(curies)
    oc.populate(new_graph)
    # plain loop: the former list comprehension built a throwaway list
    for t in graph:
        new_graph.add(t)

    return new_graph
def populate(self, graph=None):
    """Populate a graph, or if no graph is provided
    populate a new empty graph from the current
    content. (Also useful for debug)

    Returns the populated graph.
    """
    if graph is None:
        graph = OntGraph()

    # plain loop: the former list comprehension built a throwaway list
    for t in self.triples:
        graph.add(t)

    OntCuries.populate(graph)
    return graph
def make_import_chain(self, ontology='nif.ttl'):
    """Compute the import tree for ``ontology`` from this loader's
    import triples.

    Returns (tree, extra), or (None, None) when ``ontology`` does not
    appear as a subject in the import triples. Also caches the result
    on self.tree / self.extra.
    """
    itrips = self.get_itrips()
    if not any(ontology in t[0] for t in itrips):
        return None, None

    ontologies = ontology,  # hack around bad code in ontload
    import_graph = OntGraph()
    # plain loop: the former list comprehension built a throwaway list
    for t in itrips:
        import_graph.add(t)

    self.tree, self.extra = next(import_tree(import_graph, ontologies))
    return self.tree, self.extra
def main():
    """Aggregate dandi yaml term files into dandi-raw.ttl, then expand
    literal curie objects into URIRefs and write dandi.ttl."""
    dandi_terms_path = aug.LocalPath.cwd()
    g = OntGraph()
    for p in dandi_terms_path.rglob('*.yaml'):
        populateFromJsonLd(g, path_yaml(p))

    g.write('dandi-raw.ttl')
    # objects of these predicates arrive as string literals holding curies;
    # replace each with the expanded URIRef
    remove = [(s, p, o)
              for p in (schema.domainIncludes,
                        schema.rangeIncludes,
                        rdfs.subClassOf,
                        rdf.type)
              for s, o in g[:p:]]
    add = [(s, p, (g.namespace_manager.expand(o.toPython())
                   if isinstance(o, rdflib.Literal) else o))
           for s, p, o in remove]
    for t in remove:
        g.remove(t)

    for t in add:
        g.add(t)

    # TODO ontology metadata header section
    g.write('dandi.ttl')
def swanson():
    """ not really a parcellation scheme
        NOTE: the defining information up here is now deprecated
        it is kept around to keep the code further down happy

        Parses resources/swanson_aligned.txt (Swanson 2014 partonomies)
        into a makeGraph ontology of brain regions plus per-appendix
        partonomy relations, and writes a Swanson -> UBERON mapping
        graph as a side effect. Returns the populated makeGraph.
    """
    source = auth.get_path('resources') / 'swanson_aligned.txt'
    ONT_PATH = 'http://ontology.neuinfo.org/NIF/ttl/generated/'
    filename = 'swanson_hierarchies'
    ontid = ONT_PATH + filename + '.ttl'
    PREFIXES = SwansonLabels.prefixes
    new_graph = makeGraph(filename, PREFIXES, writeloc='/tmp/')
    new_graph.add_ont(ontid,
                      'Swanson brain partomies',
                      'Swanson 2014 Partonomies',
                      'This file is automatically generated from ' + source.as_posix() + '.' + '**FIXME**',
                      'now')
    # FIXME citations should really go on the ... anatomy? scheme artifact
    definingCitation = 'Swanson, Larry W. Neuroanatomical Terminology: a lexicon of classical origins and historical foundations. Oxford University Press, USA, 2014.'
    definingCitationID = 'ISBN:9780195340624'
    new_graph.add_trip(ontid, 'NIFRID:definingCitation', definingCitation)
    new_graph.add_trip(ontid, 'NIFRID:definingCitationID', definingCitationID)

    with open(source, 'rt') as f:
        lines = [l.strip() for l in f.readlines()]

    # join header on page 794
    lines[635] += ' ' + lines.pop(636)
    #fix for capitalization since this header is reused
    fixed = ' or '.join([' ('.join([n.capitalize() for n in _.split(' (')]) for _ in lines[635].lower().split(' or ')]).replace('human','HUMAN')
    lines[635] = fixed

    # parse each non-comment line into (level, area_name, citation, next-syn flag)
    # level = nesting depth, encoded in the source as runs of 5 dots
    data = []
    for l in lines:
        if not l.startswith('#'):
            level = l.count('.'*5)
            l = l.strip('.')
            if ' (' in l:
                # ') or' marks a synonym pair: emit the first name with a
                # 'NEXT SYN' marker, then fall through to parse the second
                if ') or' in l:
                    n1, l = l.split(') or')
                    area_name, citationP = n1.strip().split(' (')
                    citation = citationP.rstrip(')')
                    d = (level, area_name, citation, 'NEXT SYN')
                    data.append(d)
                    #print(tc.red(tc.bold(repr(d))))

                area_name, citationP = l.strip().split(' (')
                citation = citationP.rstrip(')')
            else:
                area_name = l
                citation = None

            d = (level, area_name, citation, None)
            #print(d)
            data.append(d)

    # look up candidate UBERON curies for every area name via scigraph
    results = Async()(deferred(sgv.findByTerm)(d[1]) for d in data)
    #results = [None] * len(data)
    curies = [[r['curie'] for r in _ if 'curie' in r and 'UBERON' in r['curie']] if _ else [] for _ in results]
    output = [_[0] if _ else None for _ in curies]

    header = ['Depth', 'Name', 'Citation', 'NextSyn', 'Uberon']
    zoop = [header] + [r for r in zip(*zip(*data), output)] + \
        [(0, 'Appendix END None', None, None, None)]  # needed to add last appendix

    # TODO annotate the appendicies and the classes with these
    appendix_root_mapping = (1, 1, 1, 1, 30, 83, 69, 70, 74, 1)  # should generate?

    class SP(rowParse):
        """Row-callback parser: one method per column of ``zoop``;
        _row_post accumulates nodes/children/parents per appendix."""

        def __init__(self):
            self.nodes = defaultdict(dict)
            self._appendix = 0
            self.appendicies = {}
            self._last_at_level = {}
            self.names = defaultdict(set)
            self.children = defaultdict(set)
            self.parents = defaultdict(set)
            self.next_syn = False
            super().__init__(zoop)

        def Depth(self, value):
            # a truthy next_syn from the previous row means this row is
            # a synonym of that row, not a new node
            if self.next_syn:
                self.synonym = self.next_syn
            else:
                self.synonym = False
            self.depth = value

        def Name(self, value):
            self.name = value

        def Citation(self, value):
            self.citation = value

        def NextSyn(self, value):
            if value:
                self.next_syn = self._rowind
            else:
                self.next_syn = False

        def Uberon(self, value):
            self.uberon = value

        def _row_post(self):
            # check if we are in the next appendix
            # may want to xref ids between appendicies as well...
            if self.depth == 0:
                if self.name.startswith('Appendix'):
                    # close out the previous appendix before starting a new one
                    if self._appendix:
                        self.appendicies[self._appendix]['children'] = dict(self.children)
                        self.appendicies[self._appendix]['parents'] = dict(self.parents)
                        self._last_at_level = {}
                        self.children = defaultdict(set)
                        self.parents = defaultdict(set)
                    _, num, apname = self.name.split(' ', 2)
                    if num == 'END':
                        return
                    self._appendix = int(num)
                    self.appendicies[self._appendix] = {
                        'name':apname.capitalize(),
                        'type':self.citation.capitalize() if self.citation else None}
                    return
                else:
                    if ' [' in self.name:
                        name, taxonB = self.name.split(' [')
                        self.name = name
                        self.appendicies[self._appendix]['taxon'] = taxonB.rstrip(']').capitalize()
                    else:  # top level is animalia
                        self.appendicies[self._appendix]['taxon'] = 'ANIMALIA'.capitalize()

                    self.name = self.name.capitalize()
                    self.citation = self.citation.capitalize()
            # nodes
            if self.synonym:
                self.nodes[self.synonym]['synonym'] = self.name
                self.nodes[self.synonym]['syn-cite'] = self.citation
                self.nodes[self.synonym]['syn-uberon'] = self.uberon
                return
            else:
                if self.citation:  # Transverse Longitudinal etc all @ lvl4
                    # same name+citation at multiple rows -> candidates for
                    # deduplication in _end
                    self.names[self.name + ' ' + self.citation].add(self._rowind)
                else:
                    # disambiguate citation-less names with appendix + parent label
                    self.name += str(self._appendix) + self.nodes[self._last_at_level[self.depth - 1]]['label']
                    #print(level, self.name)
                # can't return here because they are their own level
            # replace with actually doing something...
            self.nodes[self._rowind]['label'] = self.name
            self.nodes[self._rowind]['citation'] = self.citation
            self.nodes[self._rowind]['uberon'] = self.uberon
            # edges
            self._last_at_level[self.depth] = self._rowind
            # TODO will need something to deal with the Lateral/
            if self.depth > 0:
                try:
                    parent = self._last_at_level[self.depth - 1]
                except:
                    # NOTE(review): debug leftover — a missing parent level
                    # drops into the debugger rather than failing
                    breakpoint()
                self.children[parent].add(self._rowind)
                self.parents[self._rowind].add(parent)

        def _end(self):
            # deduplicate: rows that share a name+citation collapse onto the
            # lowest row index; rewrite child/parent links accordingly
            replace = {}
            for asdf in [sorted(n) for k,n in self.names.items() if len(n) > 1]:
                replace_with, to_replace = asdf[0], asdf[1:]
                for r in to_replace:
                    replace[r] = replace_with

            for r, rw in replace.items():
                #print(self.nodes[rw])
                o = self.nodes.pop(r)
                #print(o)

            for vals in self.appendicies.values():
                children = vals['children']
                parents = vals['parents']
                # need reversed so children are corrected before swap
                for r, rw in reversed(sorted(replace.items())):
                    if r in parents:
                        child = r
                        new_child = rw
                        parent = parents.pop(child)
                        parents[new_child] = parent
                        parent = list(parent)[0]
                        children[parent].remove(child)
                        children[parent].add(new_child)
                    if r in children:
                        parent = r
                        new_parent = rw
                        childs = children.pop(parent)
                        children[new_parent] = childs
                        for child in childs:
                            parents[child] = {new_parent}

            self.nodes = dict(self.nodes)

    sp = SP()
    tp = [_ for _ in sorted(['{: <50}'.format(n['label']) + n['uberon'] if n['uberon'] else n['label'] for n in sp.nodes.values()])]
    #print('\n'.join(tp))
    #print(sp.appendicies[1].keys())
    #print(sp.nodes[1].keys())
    nbase = PREFIXES['SWAN'] + '%s'
    json_ = {'nodes':[],'edges':[]}
    parent = ilxtr.swansonBrainRegionConcept

    # emit one class per parsed node and collect UBERON mapping triples
    og = OntGraph()
    for node, anns in sp.nodes.items():
        nid = nbase % node
        new_graph.add_class(nid, parent, label=anns['label'])
        new_graph.add_trip(nid, 'NIFRID:definingCitation', anns['citation'])
        json_['nodes'].append({'lbl':anns['label'],'id':'SWA:' + str(node)})
        #if anns['uberon']:
            #new_graph.add_trip(nid, owl.equivalentClass, anns['uberon'])
        # issues arrise here...
        [og.add(t) for t in map_term(rdflib.URIRef(nid), anns['label'], prefix='UBERON')]

    og.write(auth.get_path('ontology-local-repo') / 'ttl/generated/swanson-uberon-mapping.ttl')

    #hrm = [(anns['label'], gn(anns['label'])) for node, anns in sp.nodes.items()]
    #ok = [(h, test, term_source(h, test)) for h, test in hrm if test]
    #notok = [h for h, test in hrm if not test]

    # per-appendix partonomy: appendix class + hasPartN/partOfN object
    # properties + restrictions for every parent/child pair
    for appendix, data in sp.appendicies.items():
        aid = PREFIXES['SWAA'] + str(appendix)
        new_graph.add_class(aid, label=data['name'].capitalize())
        new_graph.add_trip(aid, 'ilxtr:hasTaxonRank', data['taxon'])  # FIXME appendix is the data artifact...
        children = data['children']
        ahp = 'swanr:hasPart' + str(appendix)
        apo = 'swanr:partOf' + str(appendix)
        new_graph.add_op(ahp, transitive=True)
        new_graph.add_op(apo, inverse=ahp, transitive=True)
        for parent, childs in children.items():  # FIXME does this give complete coverage?
            pid = nbase % parent
            for child in childs:
                cid = nbase % child
                new_graph.add_restriction(pid, ahp, cid)  # note hierarhcy inverts direction
                new_graph.add_restriction(cid, apo, pid)
                json_['edges'].append({'sub':'SWA:' + str(child),'pred':apo,'obj':'SWA:' + str(parent)})

    return new_graph
def run(args):
    """Dispatch for the ontload CLI.

    ``args`` is a docopt-style dict; exactly one of the mode flags
    (graph / scigraph / config / imports / chain / extra / patch)
    selects the branch to run. May write zips, configs, and
    import-closure html files as side effects.
    """
    # modes
    graph = args['graph']
    scigraph = args['scigraph']
    config = args['config']
    imports = args['imports']
    chain = args['chain']
    extra = args['extra']

    # required
    repo_name = args['<repo>']
    remote_base = args['<remote_base>']
    ontologies = args['<ontologies>']

    # options
    git_remote = args['--git-remote']
    git_local = Path(args['--git-local']).resolve()
    zip_location = Path(args['--zip-location']).resolve()
    graphload_config = Path(args['--graphload-config']).resolve()
    graphload_config_template = graphload_config  # NOTE XXX
    if args['--graphload-ontologies'] is not None:
        graphload_ontologies = Path(args['--graphload-ontologies']).resolve()
    else:
        graphload_ontologies = None

    org = args['--org']
    branch = args['--branch']
    commit = args['--commit']
    scp = args['--scp-loc']  # NOTE(review): read but unused in this function
    sorg = args['--scigraph-org']
    sbranch = args['--scigraph-branch']
    scommit = args['--scigraph-commit']
    sscp = args['--scigraph-scp-loc']  # NOTE(review): read but unused in this function
    scigraph_quiet = args['--scigraph-quiet']
    patch_config = args['--patch-config']
    curies_location = args['--curies']
    patch = args['--patch']
    check_built = args['--check-built']
    debug = args['--debug']
    log = args['--logfile']  # TODO
    fix_imports_only = args['--fix-imports-only']

    load_base = 'scigraph-load -c {config_path}'  # now _this_ is easier

    if args['--view-defaults']:
        for k, v in defaults.items():
            print(f'{k:<22} {v}')
        return

    # post parse mods
    if remote_base == 'NIF':
        remote_base = 'http://ontology.neuinfo.org/NIF'

    itrips = None
    if repo_name is not None:
        local_base = jpth(git_local, repo_name)

    if graph:
        if args['--path-build-scigraph']:  # path-build-scigraph
            path_build_scigraph = Path(args['--path-build-scigraph'])
            (scigraph_commit, services_zip,
             scigraph_reset_state) = scigraph_build(path_build_scigraph, git_remote, sorg,
                                                    path_build_scigraph, sbranch, scommit,
                                                    check_built=check_built,
                                                    cleanup_later=True,
                                                    quiet=scigraph_quiet)
        else:
            # no scigraph build requested: stub out the build products
            scigraph_commit = 'dev-9999'
            services_zip = 'None'
            scigraph_reset_state = lambda: None

        # NOTE(review): assuming only the load itself needs the
        # reset-state guard — confirm the intended extent of this block
        with execute_regardless(scigraph_reset_state):
            rl = ReproLoader(zip_location,
                             git_remote, org, git_local,
                             repo_name, branch, commit,
                             remote_base, load_base,
                             graphload_config_template,
                             graphload_ontologies,
                             patch_config, patch,
                             scigraph_commit,
                             fix_imports_only=fix_imports_only,
                             check_built=check_built,)

        if not fix_imports_only:
            # repoint the LATEST symlink at the freshly built zip
            FILE_NAME_ZIP = Path(rl.zip_path).name
            LATEST = Path(zip_location) / 'LATEST'
            if LATEST.exists() and LATEST.is_symlink():
                LATEST.unlink()
            LATEST.symlink_to(FILE_NAME_ZIP)

            itrips, config = rl.itrips, rl.config

            if not ontologies:
                ontologies = rl.ontologies

            print(services_zip)
            print(rl.zip_path)
            if '--local' in args:
                return

    elif scigraph:
        (scigraph_commit, services_zip,
         _) = scigraph_build(zip_location, git_remote, sorg, git_local,
                             sbranch, scommit, check_built=check_built,
                             quiet=scigraph_quiet)
        print(services_zip)
        if '--local' in args:
            return

    elif config:
        #graph_path = Path(args['<graph_path>']).resolve()
        config_path = Path(args['--graph-config-out']).resolve()
        #local_base = Path(git_local, repo_name).resolve()
        date_today = TODAY()
        ReproLoader.make_graphload_config(graphload_config_template,
                                          graphload_ontologies,
                                          zip_location, date_today, config_path)

    elif imports:
        # TODO mismatch between import name and file name needs a better fix
        itrips = local_imports(remote_base, local_base, ontologies)
    elif chain:
        itrips = local_imports(remote_base, local_base, ontologies, readonly=True)
    elif extra:
        from nifstd_tools.utils import memoryCheck
        curies = getCuries(curies_location)
        curie_prefixes = set(curies.values())
        memoryCheck(2665488384)
        graph = loadall(git_local, repo_name)
        new_graph = normalize_prefixes(graph, curies)
        for_burak(new_graph)
        debug = True
    elif patch:
        local_base = jpth(git_local, repo_name)
        local_versions = tuple(do_patch(patch_config, local_base))
    else:
        raise BaseException('How did we possibly get here docopt?')

    if itrips:
        # write one import-closure html report per import tree
        import_graph = OntGraph()
        [import_graph.add(t) for t in itrips]
        for tree, extra in import_tree(import_graph, ontologies):
            name = Path(next(iter(tree.keys()))).name
            with open(jpth(zip_location, f'{name}-import-closure.html'), 'wt') as f:
                f.write(extra.html.replace('NIFTTL:', ''))  # much more readable

    if debug:
        breakpoint()
def inner(local_filepath, remote=False):
    """Recursively localize owl:imports for one ontology file.

    Closure over enclosing scope: reads ``bigleaves``, ``dobig``, ``p``
    (the imports predicate), ``revert``, ``local_base``, ``remote_base``,
    ``readonly``; accumulates into ``triples``, ``done`` and
    ``imported_iri_vs_ontology_iri``. When not readonly, rewrites the
    file on disk with imports repointed to file:// paths.
    """
    if noneMembers(local_filepath, *bigleaves) or dobig:
        ext = os.path.splitext(local_filepath)[-1]
        if ext == '.ttl':
            infmt = 'turtle'
        else:
            log.info((ext, local_filepath))
            infmt = None
        if remote:
            resp = requests.get(local_filepath)  # TODO nonblocking pull these out, fetch, run inner again until done
            raw = resp.text.encode()
        else:
            try:
                with open(local_filepath, 'rb') as f:
                    raw = f.read()
            except FileNotFoundError as e:
                if local_filepath.startswith('file://'):
                    log.info(f'local_imports has already been run, skipping {local_filepath}')
                    return
                    #raise ValueError('local_imports has already been run') from e
                else:
                    log.exception(e)  # TODO raise a warning if the file cannot be matched
                    # seems like good practice to have any imported ontology under
                    # version control so all imports are guaranteed to have good
                    # provenance and not split the prior informaiton between the
                    # scigraph config and the repository, the repository remains
                    # the source of truth, load.yaml files can then pick a subset
                    # of the properly tracked files to load as they see fit, but
                    # not add to them (at least in pyontutils land)
                    raw = b''
        if oo in raw:  # we only care if there are imports or an ontology iri
            scratch = OntGraph()
            if infmt == 'turtle':
                # the turtle files keep a '###'-delimited comment section;
                # split it off so it can be reattached verbatim on write
                data, rest = raw.split(b'###', 1)
            elif infmt == None:  # assume xml
                # NOTE(review): this branch leaves ``rest`` unbound; the
                # write-back below would fail for non-ttl files when not
                # readonly — presumably only ttl files are rewritten; confirm
                xml_tree = etree.parse(BytesIO(raw))
                xml_root = xml_tree.getroot()
                # keep only the Ontology element: imports live there
                xml_ontology = xml_tree.xpath("/*[local-name()='RDF']/*[local-name()='Ontology']")
                xml_root.clear()
                xml_root.append(xml_ontology[0])
                data = etree.tostring(xml_root)
            scratch.parse(data=data, format=infmt)
            for s in scratch.subjects(rdf.type, owl.Ontology):
                triples.add((s, owl.sameAs, rdflib.URIRef(local_filepath)))
                # somehow this breaks computing the chain
                #for p in (rdfs.comment, skos.definition, definition, dc.title, rdfs.label):
                    #for o in scratch[s:p]:
                        #triples.add((s, p, o))
            for s, o in sorted(scratch.subject_objects(p)):
                if revert:
                    raise NotImplementedError('TODO')
                nlfp = o.replace(remote_base, local_base)
                triples.add((s, p, o))
                if 'http://' in local_filepath or 'external' in local_filepath:  # FIXME what to do about https used inconsistently :/
                    if 'external' in local_filepath:
                        imported_iri = rdflib.URIRef(local_filepath.replace(local_base, remote_base))  # inefficient
                    else:
                        imported_iri = rdflib.URIRef(local_filepath)
                    if s != imported_iri:
                        imported_iri_vs_ontology_iri[imported_iri] = s  # kept for the record
                        triples.add((imported_iri, p, s))  # bridge imported != ontology iri
                if local_base in nlfp and 'file://' not in o:  # FIXME file:// should not be slipping through here...
                    scratch.add((s, p, rdflib.URIRef('file://' + nlfp)))
                    scratch.remove((s, p, o))
                if nlfp not in done:
                    # ``done`` prevents infinite recursion on import cycles
                    done.append(nlfp)
                    if local_base in nlfp and 'external' not in nlfp:  # skip externals TODO
                        inner(nlfp)
                    elif readonly:  # read external imports
                        if 'external' in nlfp:
                            inner(nlfp)
                        else:
                            inner(nlfp, remote=True)
            if not readonly:
                # write the rewritten imports back, preserving the original
                # '###' comment tail split off above
                _orp = CustomTurtleSerializer.roundtrip_prefixes  # FIXME awful hack :/
                CustomTurtleSerializer.roundtrip_prefixes = True
                ttl = scratch.serialize(format='nifttl', encoding='utf-8')
                CustomTurtleSerializer.roundtrip_prefixes = _orp
                ndata, comment = ttl.split(b'###', 1)
                out = ndata + b'###' + rest
                with open(local_filepath, 'wb') as f:
                    f.write(out)
def new_index(self, referenceIndex, *, commit=True):
    """ reference hosts have a single incrementing primary key index
        to which everything is mapped

        in theory these indexes could also be per 'prefix' aka
        the sandboxed uri path or external uri path to which
        something is mapped I don't see any reason not to do this
        for this kind of implementation since a regular pattern
        can be develop

        Creates the index graph file for ``referenceIndex``, optionally
        committing it; raises FileExistsError if it already exists.
        Returns the path to the new index.
    """
    '''
        QUESTION: do we force a remapping of external id sequences
        into uris/ first? this seems like a bad idea? or rather,
        it is actually a good idea, but it will have to be done with
        a pattern based redirect instead of an actual materialization
        the alternative is to do what ontobee does and pass the
        external iri as a query parameter ... hrm tradoffs, well we
        certainly can't make a nice /uberon/uris/obo/{UBERON_}
        folder if we include the whole uri ... so this seems a
        reasonable tradeoff

        http://purl.obolibrary.org/obo/ can wind up being mapped
        into multiple uri spaces ... /obo/uris/obo/ would seem to
        make more sense but how to indicate that other
        organizations/projects map there ...

        /uberon/uris/obo/UBERON_ could indicate the latest sequence

        ah, and of course in theory this gets us out of the very
        annoying situation where /uberon/uris/obo/UBERON_ really IS
        different than /doid/uris/obo/UBERON_ for some identifiers
        (sigh) and if they are all mapped and masking based on
        presence then we can detect the issues HOWEVER how do we
        enforce that in reality the _mapping_ is all to
        /obo/uris/obo/ ??
    '''
    path = self.path_index(referenceIndex)
    rrp = path.repo_relative_path
    s = sncho[rrp.with_suffix('').as_posix()]  # TODO check ownership

    # only the index file itself must be new
    if path.exists():
        raise FileExistsError(path)

    g = OntGraph(path=path)
    OntCuries.populate(g)
    # TODO these are really identified by the follow:
    # base/readable/
    # {group}/uris/
    # base/ontologies/
    # {group}/ontologies/uris/
    pos = (
        (rdf.type, snchn.IndexGraph),
        (rdfs.label, rdflib.Literal(f'IndexGraph for {referenceIndex}')),
        (snchn.referenceIndex, rdflib.Literal(referenceIndex)),  # TODO HRM
        #(snchn.indexRemote, )
        )
    for po in pos:
        g.add((s, *po))  # FIXME

    # exist_ok: the parent directory may already exist (e.g. a sibling
    # index was created earlier); plain mkdir(parents=True) raised here
    g.path.parent.mkdir(parents=True, exist_ok=True)
    g.write()
    if commit:
        path.commit(f'add new index for {referenceIndex}')

    return path
def main():
    """Build ttl/bridge/chebi-bridge.ttl.

    Diffs the CHEBI annotations carried in NIF-Chemical / NIF-Molecule
    against chebislim + chebi-dead, re-adds only the annotations missing
    upstream (minus curated exclusions), then strips the now-redundant
    CHEBI classes out of NIF-Chemical and NIF-Molecule and rewrites them.
    """
    olr = auth.get_path('ontology-local-repo')
    resources = auth.get_path('resources')
    if not olr.exists():
        raise FileNotFoundError(f'{olr} does not exist cannot continue')
    if not resources.exists():
        raise FileNotFoundError(f'{resources} does not exist cannot continue')

    PREFIXES = makePrefixes('definition',
                            'replacedBy',
                            'hasRole',
                            'oboInOwl',
                            'CHEBI',
                            'owl',
                            'skos',
                            'oboInOwl')
    ug = makeGraph('utilgraph', prefixes=PREFIXES)
    file = resources / 'chebi-subset-ids.txt'
    with open(file.as_posix(), 'rt') as f:
        ids_raw = set((_.strip() for _ in f.readlines()))
        ids = sorted(set((ug.expand(_.strip()) for _ in ids_raw)))

    def check_chebis(g):
        # triple count per tracked chebi id; snapshots (a1..a4) are
        # compared later to find ids whose content changed between sources
        a = []
        for id_ in ids:
            l = sorted(g.triples((id_, None, None)))
            ll = len(l)
            a.append(ll)
        return a

    def fixIons(g):
        # there are a series of atom/ion confusions that shall be dealt with, solution is to add 'iron' as a synonym to the charged form since that is what the biologists are usually referring to...
        ng = makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))
        # atom           ion
        None, 'CHEBI:29108'  # calcium is ok
        ng.replace_uriref('CHEBI:30145', 'CHEBI:49713')  # lithium
        ng.replace_uriref('CHEBI:18248', 'CHEBI:29033')  # iron
        ng.replace_uriref('CHEBI:26216', 'CHEBI:29103')  # potassium
        ng.replace_uriref('CHEBI:26708', 'CHEBI:29101')  # sodium
        None, 'CHEBI:29105'  # zinc is ok

    # g accumulates all four sources; the others are kept separate for diffing
    g = OntGraph()
    cg = OntGraph()
    cd = OntGraph()
    chemg = OntGraph()
    molg = OntGraph()

    cg.parse(olr / 'ttl/generated/chebislim.ttl', format='turtle')
    list(g.add(t) for t in cg)
    a1 = check_chebis(g)

    cd.parse(olr / 'ttl/generated/chebi-dead.ttl', format='turtle')
    list(g.add(t) for t in cd)
    a2 = check_chebis(g)

    chemg.parse(olr / 'ttl/NIF-Chemical.ttl', format='turtle')
    chemgg = makeGraph('NIF-Chemical', graph=chemg)
    fixIons(chemg)
    list(g.add(t) for t in chemg)
    a3 = check_chebis(g)

    molg.parse(olr / 'ttl/NIF-Molecule.ttl', format='turtle')
    molgg = makeGraph('NIF-Molecule', graph=molg)
    fixIons(molg)
    list(g.add(t) for t in molg)
    a4 = check_chebis(g)

    # map deprecated ids -> their replacements from chebi-dead
    replacedBy = ug.expand('replacedBy:')
    deads = {s: o for s, o in cd.subject_objects(replacedBy)}

    def switch_dead(g):
        ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl'))
        for f, r in deads.items():
            ng.replace_uriref(f, r)
            ng.add_trip(r, 'oboInOwl:hasAlternateId', rdflib.Literal(f, datatype=rdflib.XSD.string))
            g.remove((r, replacedBy, r))  # in case the replaced by was already in

    switch_dead(g)
    switch_dead(cg)
    switch_dead(chemg)
    switch_dead(molg)

    def fixHasAltId(g):
        ng = makeGraph('', graph=g, prefixes=makePrefixes('oboInOwl', 'NIFCHEM', 'NIFRID'))
        ng.replace_uriref('NIFCHEM:hasAlternativeId', 'oboInOwl:hasAlternativeId')
        # ng.replace_uriref('NIFRID:ChEBIid', 'oboInOwl:id')  # :id does not exist, do we need an alternative?

    list(map(fixHasAltId, (g, cg, chemg)))

    def fixAltIdIsURIRef(g):
        # hasAlternativeId objects should be string literals, not URIRefs
        hai = ug.expand('oboInOwl:hasAlternativeId')
        # i = ug.expand('oboInOwl:id')  # :id does not exist
        makeGraph('', graph=g, prefixes=makePrefixes('CHEBI'))  # amazlingly sometimes this is missing...

        def inner(s, p, o):
            if type(o) == rdflib.URIRef:
                qn = g.namespace_manager.qname(o)
                g.add((s, p, rdflib.Literal(qn, datatype=rdflib.XSD.string)))
                if 'ns' in qn:
                    print('WARNING UNKNOWN NAMESPACE BEING SHORTENED', str(o), qn)
                g.remove((s, p, o))

        for s, o in g.subject_objects(hai):
            inner(s, hai, o)
        #for s, o in g.subject_objects(i):  # :id does not exist
            #inner(s, i, o)

    list(map(fixAltIdIsURIRef, (g, cg, chemg)))

    # ids whose triple count changed between source snapshots need review
    matches = [_ for _ in zip(a1, a2, a3, a4)]
    changed = [len(set(_)) != 1 for _ in matches]
    review = [(id_, m) for id_, changed, m in zip(ids, changed, matches) if changed and m[0]]
    # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_c = [set([(s, str(o.toPython())) for s, p, o in cg.triples((u, None, None))]) for u, _ in review]
    wat_a = [set([(s, str(o.toPython())) for s, p, o in g.triples((u, None, None))]) for u, _ in review]
    wat_c_ = [set(cg.triples((u, None, None))) for u, _ in review]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    wat_a_ = [set(g.triples((u, None, None))) for u, _ in review]  # for reasons currently lost to implementation details this returns a list of empty lists if run from ipython
    diff = [a - c for a, c in zip(wat_a, wat_c)]
    diff_ = [a - c for a, c in zip(wat_a_, wat_c_)]

    cb = createOntology('chebi-bridge',
                        'NIF ChEBI bridge',
                        makePrefixes('CHEBI',
                                     'BFO1SNAP',
                                     'owl',
                                     'skos',
                                     'dc',
                                     'hasRole',
                                     'NIFCHEM',
                                     'oboInOwl',
                                     'NIFMOL',
                                     'NIFRID'),
                        'chebibridge',
                        ('This bridge file contains additional annotations'
                         ' on top of CHEBI identifiers that were originally'
                         ' included in NIF-Chemical or NIF-Molecule that have'
                         ' not since been added to CHEBI upstream'),
                        path='ttl/bridge/',
                        #imports=('https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebislim.ttl',
                                 #'https://raw.githubusercontent.com/SciCrunch/NIF-Ontology/master/ttl/generated/chebi-dead.ttl'))
                        imports=('http://ontology.neuinfo.org/NIF/ttl/generated/chebislim.ttl',
                                 'http://ontology.neuinfo.org/NIF/ttl/generated/chebi-dead.ttl'))

    # copy the annotations present in the aggregate but absent upstream
    out = []
    for set_ in diff:
        for sub, string in sorted(set_):
            for t in g.triples((sub, None, None)):
                # please not that this process will do things like remove hasStreenName ectasy from CHEBI:1391 since chebislim has it listed as a synonym
                py = t[-1].toPython()
                if py == string and not py.startswith('ub'):  # ignore restrictions... this is safe because nifmol and nifchem dont have any restrictions...
                    cb.add_recursive(t, g)
        cb.add_class(sub)  # only need to go at the end because sub is the same for each set

    def hasImplicitSuperclass(s, o):
        # True when o is reachable from s via chebislim subClassOf*
        for super_ in cg.objects(s, rdflib.RDFS.subClassOf):
            if super_ == o:
                return True
            elif hasImplicitSuperclass(super_, o):
                return True

    # curation decisions after review (see outtc for full list)
    curatedOut = []

    def curateOut(*t):
        curatedOut.append(tuple(ug.expand(_) if type(_) is not rdflib.Literal else _ for _ in t))
        cb.del_trip(*t)

    curateOut('CHEBI:6887', 'rdfs:subClassOf', 'CHEBI:23367')  # defer to the chebi choice of chemical substance over molecular entity since it is classified as a racemate which doesn't quite match the mol ent def
    curateOut('CHEBI:26519', 'rdfs:subClassOf', 'CHEBI:24870')  # some ions may also be free radicals, but all free radicals are not ions!
    #natural product removal since natural product should probably be a role if anything...
    curateOut('CHEBI:18059', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:24921', 'rdfs:subClassOf', 'CHEBI:33243')
    curateOut('CHEBI:37332', 'rdfs:subClassOf', 'CHEBI:33243')

    curateOut('CHEBI:50906', 'rdfs:label', rdflib.Literal('Chemical role', datatype=rdflib.XSD.string))  # chebi already has a chemical role...
    curateOut('CHEBI:22586', 'rdfs:subClassOf', 'CHEBI:24432')  # antioxidant is already modelled as a chemical role instead of a biological role, the distinction is that the biological roles affect biological processes/property, not chemical processes/property
    curateOut('CHEBI:22720', 'rdfs:subClassOf', 'CHEBI:27171')  # not all children are bicyclic
    curateOut('CHEBI:23447', 'rdfs:subClassOf', 'CHEBI:17188')  # this one seems obviously flase... all cyclic nucleotides are not nucleoside 5'-monophosphate...
    curateOut('CHEBI:24922', 'rdfs:subClassOf', 'CHEBI:27171')  # not all children are bicyclic, some may be poly, therefore removing
    curateOut('CHEBI:48706', 'rdfs:subClassOf', 'CHEBI:33232')  # removing since antagonist is more incidental and pharmacological role is more appropriate (as chebi has it)
    curateOut('CHEBI:51064', 'rdfs:subClassOf', 'CHEBI:35338')  # removing since chebi models this with has part
    curateOut('CHEBI:8247', 'rdfs:subClassOf', 'CHEBI:22720')  # the structure is 'fused to' a benzo, but it is not a benzo, chebi has the correct
    #curateOut('CHEBI:9463', 'rdfs:subClassOf', 'CHEBI:50786')  # not sure what to make of this wikipedia says one thing, but chebi says another, very strange... not an anabolic agent?!??! wat no idea

    # review hold over subClassOf statements
    intc = []
    outtc = []
    for s, o in cb.g.subject_objects(rdflib.RDFS.subClassOf):
        if str(o) == 'http://ontology.neuinfo.org/NIF/Backend/BIRNLex_annotation_properties.owl#_birnlex_retired_class' or str(o) == 'http://ontology.neuinfo.org/nif/nifstd/readable/birnlexRetiredClass':
            # we need to remove any of the cases where deprecation was misused
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
        elif hasImplicitSuperclass(s, o):
            # already entailed by chebislim; drop but record for the diffs below
            cb.g.remove((s, rdflib.RDFS.subClassOf, o))
            intc.append((s, rdflib.RDFS.subClassOf, o))
        else:
            outtc.append((s, rdflib.RDFS.subClassOf, o))

    def qname(trips):
        return tuple(tuple(cb.g.namespace_manager.qname(_) for _ in t) for t in trips)

    # print the surviving subClassOf statements with labels for manual review
    for a, p, b in sorted(qname(outtc)):
        if 'NIFMOL' in b:
            continue  # not considering cases where NIFMOL/NIFCHEM ids are used, that can come later
        s = sgv.findById(a)
        o = sgv.findById(b)
        if s is None or o is None:
            print(a, '=>', s)
            print(b, '=>', o)
        else:
            print(s['labels'], s['curie'])
            print('subClassOf')
            print(o['labels'], o['curie'])
            print((a, p, b))
        print('---------------------')

    cb.write()  # re-add only the missing edges so that we can zap them from NIF-Molecule and NIF-Chemical (recurse is needed...)

    # validation
    diff2 = set(cb.g) - set(cg)
    diff3 = set(cb.g) - diff2  # should just be all the owl:Class entries
    diff4 = set(cb.g) - set(chemg) | set(cb.g) - set(molg)  # not informative
    diff5 = set(cb.g) - diff4  # not informative
    both = set(chemg) & set(molg)  # there is no overlap beyond the owl:Class declarations

    def getChebis(set_):
        return set(t for t in set_ if 'CHEBI_' in t[0])

    def nodt(graph):
        # normalize literals to plain strings so datatype differences
        # do not produce spurious diffs
        return set((s, str(o) if type(o) is rdflib.Literal else o) for s, p, o in graph)

    cmc = getChebis(((((nodt(chemg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) - nodt(curatedOut))
    cmc = sorted(t for s, o in cmc for t in chemg.triples((s, None, o)))
    mmc = getChebis(((((nodt(molg) - nodt(cb.g)) - nodt(cg)) - nodt(cd)) - nodt(intc)) - nodt(curatedOut))
    mmc = sorted(t for s, o in mmc for t in molg.triples((s, None, o)))

    # remove chebi classes from nifchem and nifmol
    def remstuff(sources, targets):
        for source in sources:
            for id_ in source.subjects(rdflib.RDF.type, rdflib.OWL.Class):
                for target in targets:
                    target.del_class(id_)

    remstuff((cg, cd), (chemgg, molgg))

    chemgg.write()
    molgg.write()


if __name__ == '__main__':
    # NOTE(review): drops into the debugger instead of calling main() —
    # presumably intentional for interactive use; confirm
    breakpoint()