def main():
    #InterLexSneechenator()
    test()
    return  # testing
    index_graph.bind('ILX', ILX)
    #[index_graph.add((npokb[str(i)], rdf.type, owl.Class)) for i in range(1, 11)]
    #[index_graph.add((npokb[str(i)], ilxtr.hasTemporaryId, TEMP[str(i)])) for i in range(1, 11)]
    ios = []
    for eff in ('phenotype-core.ttl', 'phenotypes.ttl'):
        path = auth.get_path('ontology-local-repo') / eff
        input_graph = OntGraph(path=path)
        input_graph.parse()
        output_graph = input_graph.mapTempToIndex(index_graph, ILX, ilxtr)
        ios.append((input_graph, output_graph))

    input_graph, output_graph = ios[0]
    a, r, c = output_graph.subjectsChanged(input_graph)
    index_graph.write()
    # [o.write() for i, o, in ios]  # when ready
    #from sparcur.paths import Path
    #Path(index_graph.path).xopen()
    breakpoint()
class TestOntGraph(unittest.TestCase):
    ts1 = ((ilxtr.a, ilxtr.b, ilxtr.c),)
    ts2 = ((ilxtr.a, ilxtr.b, ilxtr.d),)

    def populate(self, graph, triples):
        [graph.add(t) for t in triples]

    def setUp(self):
        self.graph1 = OntGraph()
        self.graph2 = OntGraph()

    def test_subjectsChanged(self):
        self.populate(self.graph1, self.ts1)
        self.populate(self.graph2, self.ts2)
        d = a, r, c = self.graph1.subjectsChanged(self.graph2)
        assert not a, d
        assert not r, d
        assert c, d

    def test_not_subjectsChanged(self):
        self.populate(self.graph1, self.ts1)
        self.populate(self.graph2, self.ts1)
        d = a, r, c = self.graph1.subjectsChanged(self.graph2)
        assert not a, d
        assert not r, d
        assert not c, d
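# Hedged usage sketch: judging from the assertions above, subjectsChanged
# appears to return an (added, removed, changed) triple of subject
# collections, truthy only where the two graphs differ.
def _example_subjectsChanged():
    g_old = OntGraph()
    g_new = OntGraph()
    g_old.add((ilxtr.a, ilxtr.b, ilxtr.c))
    g_new.add((ilxtr.a, ilxtr.b, ilxtr.d))
    added, removed, changed = g_new.subjectsChanged(g_old)
    assert not added and not removed and changed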
def load_header(filepath, remote=False):
    oo = b'owl:Ontology'
    path = Path(filepath)
    if path.suffix == '.ttl':
        infmt = 'turtle'
    else:
        infmt = 'xml'  # FIXME assumption

    if remote:
        resp = requests.get(filepath)  # TODO nonblocking pull these out, fetch, run inner again until done
        raw = resp.text.encode()
    else:
        with open(filepath, 'rb') as f:  # do not catch FileNotFoundErrors
            raw = f.read()

    if oo in raw:  # we only care if there are imports or an ontology iri
        scratch = OntGraph()
        if infmt == 'turtle':
            # keep only the prologue before the first '### ' section comment
            data, rest = raw.split(b'###', 1)
        else:  # assume xml
            xml_tree = etree.parse(BytesIO(raw))
            xml_root = xml_tree.getroot()
            xml_ontology = xml_tree.xpath(
                "/*[local-name()='RDF']/*[local-name()='Ontology']")
            xml_root.clear()
            xml_root.append(xml_ontology[0])
            data = etree.tostring(xml_root)

        scratch.parse(data=data, format=infmt)
        return scratch
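# Hedged usage sketch for load_header: the path is a placeholder; per the
# logic above the function returns None when the file contains no
# owl:Ontology header.
def _example_load_header():
    scratch = load_header('ttl/nif.ttl')
    if scratch is not None:
        for s, o in scratch.subject_objects(owl.imports):
            print(s, o)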
def _populate_published(curation_export, graphs):
    # datasets = [list(g[:rdf.type:sparc.Dataset]) for g in graphs]
    published_graphs = [
        g for g, doi in
        [(g, list(g[ds:TEMP.hasDoi]))
         for g in graphs
         for ds in g[:rdf.type:sparc.Dataset]]
        if doi]
    merged = OntGraph()
    for g in published_graphs:
        merged.namespace_manager.populate_from(
            {k: v for k, v in dict(g.namespace_manager).items()
             if k not in ('contributor', 'sample', 'subject')})
        merged.populate_from_triples(g.data)  # g.data excludes the owl:Ontology section

    # TODO switch the rdf:type of metadata section on combination to preserve export related metadata
    mg = curation_export.metadata().graph
    mg.namespace_manager.populate(merged)
    new_bi = rdflib.URIRef(
        mg.boundIdentifier.replace('ontologies/', 'ontologies/published/'))
    new_vi = rdflib.URIRef(
        mg.versionIdentifier.replace('ontologies/', 'ontologies/published/'))
    replace_pairs = (
        (rdflib.Literal("SPARC Consortium curation export published graph"),
         rdflib.Literal("SPARC Consortium curation export graph")),
        (new_bi, mg.boundIdentifier),
        (new_vi, mg.versionIdentifier))
    new_meta = mg.replaceIdentifiers(replace_pairs)
    new_meta.populate(merged)
    return merged
def main(g=None, ce_g=None, protcur_export_path=None, curation_export_path=None):
    if g is None:
        if not protcur_export_path:
            ori = OntResIri('https://cassava.ucsd.edu/sparc/preview/exports/protcur.ttl')
            g = ori.graph
        else:
            g = OntGraph().parse(protcur_export_path)

    pids = list(g[:rdf.type:sparc.Protocol])

    if ce_g is None:
        if not curation_export_path:
            ce_ori = OntResIri('https://cassava.ucsd.edu/sparc/preview/exports/curation-export.ttl')
            ce_g = ce_ori.graph
        else:
            ce_g = OntGraph().parse(curation_export_path)

    ce_pids = list(ce_g[:rdf.type:sparc.Protocol])
    ap = [(p, d, list(ce_g[d:TEMP.hasDoi:]))
          for p in ce_pids
          for d in ce_g[:TEMP.hasProtocol:p]
          if list(ce_g[d:TEMP.hasDoi:])]
    with_published_dataset = {p: dois[0] for p, d, dois in ap}
    graphs = make_graphs(g, pids, with_published_dataset)
    write_graphs(graphs, path=None)
def test_part_of(self):
    eeeee = self.OntTerm('UBERON:0008933', label='primary somatosensory cortex')
    g = OntGraph()
    [g.add(t) for t in eeeee.triples_simple]
    g.debug()
    po = [t for t in eeeee.triples_simple if partOf in t]
    assert po, 'sadness'
def make_graphs(g, pids, published):
    sgs = []
    for i in pids:
        ng = OntGraph()
        ng.namespace_manager.populate_from(g)
        ng.namespace_manager.bind(
            'spjl', 'https://uilx.org/tgbugs/u/sparcur-protcur-json-ld/')
        ng.populate_from_triples(tobn(g.subjectGraphClosure(i), published))
        sgs.append(ng)

    return sgs
def normalize_prefixes(graph, curies):
    new_graph = OntGraph()
    oc = OntCuries.new()
    curies.pop('', None)
    curies['rdf'] = str(rdf)
    curies['rdfs'] = str(rdfs)
    oc(curies)
    oc.populate(new_graph)
    [new_graph.add(t) for t in graph]
    return new_graph
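# Hedged usage sketch for normalize_prefixes: the file name and the ILX
# namespace value are illustrative placeholders, not values from this repo.
def _example_normalize_prefixes():
    old_graph = OntGraph().parse('some.ttl', format='turtle')
    new_graph = normalize_prefixes(old_graph, {'ILX': 'http://uri.interlex.org/base/ilx_'})
    new_graph.write('some-normalized.ttl')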
def test_new_index(self):
    rp = temp_path / 'sneechenator'
    wrangler = snch.SneechWrangler(rp)
    path_index = wrangler.new_index('uri.interlex.org')
    assert path_index.exists(), 'wat'
    g = OntGraph(path=path_index).parse()
    try:
        next(g[:rdf.type:snch.snchn.IndexGraph])
    except StopIteration:
        assert False, g.debug()
def populate(self, graph=None):
    """ Populate a graph, or if no graph is provided
        populate a new empty graph from the current
        content. (Also useful for debug) """

    if graph is None:
        graph = OntGraph()

    [graph.add(t) for t in self.triples]
    OntCuries.populate(graph)
    return graph
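# Hedged usage sketch: `thing` stands in for any instance whose class defines
# the populate method above; both call styles come from its docstring.
def _example_populate(thing):
    fresh = thing.populate()   # build and return a new OntGraph
    shared = OntGraph()
    thing.populate(shared)     # add thing's triples to an existing graph
    return fresh, shared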
def make_import_chain(self, ontology='nif.ttl'):
    itrips = self.get_itrips()
    if not any(ontology in t[0] for t in itrips):
        return None, None

    ontologies = ontology,  # hack around bad code in ontload
    import_graph = OntGraph()
    [import_graph.add(t) for t in itrips]

    self.tree, self.extra = next(import_tree(import_graph, ontologies))
    return self.tree, self.extra
def triples(self):
    for blob in self.data['identifier_metadata']:
        id = blob['id']
        if not isinstance(id, idlib.Stream):
            id = idlib.Auto(id)

        s = id.asType(rdflib.URIRef)
        if 'source' in blob:
            source = blob['source']  # FIXME we need to wrap this in our normalized representation
            if source == 'Crossref':  # FIXME CrossrefConvertor etc. OR put it in idlib as an alternate ttl
                pos = (
                    (rdf.type, owl.NamedIndividual),
                    (rdf.type, TEMP[blob['type']]),
                    (dc.publisher, blob['publisher']),
                    #(dc.type, blob['type']),  # FIXME semantify
                    (dc.title, blob['title']),
                    (dc.date, self.published_online(blob)),  # FIXME .... dangerzone
                )
                g = OntGraph()
                doi = idlib.Doi(id) if not isinstance(id, idlib.Doi) else id  # FIXME idlib streams need to recognize their own type in __new__
                g.parse(data=doi.ttl(), format='ttl')  # FIXME network bad
                _their_record_s = [
                    s for s, p, o in g
                    if p == rdflib.term.URIRef(
                        'http://prismstandard.org/namespaces/basic/2.1/doi')][0]
                yield s, owl.sameAs, _their_record_s
                yield from g
            else:
                msg = f'dont know what to do with {source}'
                log.error(msg)
                #raise NotImplementedError(msg)
                return
        else:
            msg = f'dont know what to do with {blob} for {id.identifier}'
            log.error(msg)
            #raise NotImplementedError(msg)
            return

        for p, oraw in pos:
            if oraw is not None:
                o = rdflib.Literal(oraw) if not isinstance(oraw, rdflib.URIRef) else oraw
                yield s, p, o
def export_rdf(self, dump_path, latest_path, dataset_blobs):
    dataset_dump_path = dump_path / 'datasets'
    dataset_dump_path.mkdir()
    suffix = '.ttl'
    mode = 'wb'
    teds = []
    for dataset_blob in dataset_blobs:
        filename = dataset_blob['id']
        filepath = dataset_dump_path / filename
        filepsuf = filepath.with_suffix(suffix)
        lfilepath = latest_path / filename
        lfilepsuf = lfilepath.with_suffix(suffix)

        ted = ex.TriplesExportDataset(dataset_blob)
        teds.append(ted)

        if self.latest and lfilepsuf.exists():
            filepsuf.copy_from(lfilepsuf)
            graph = OntGraph(path=lfilepsuf).parse()
            ted._graph = graph
        else:
            ted.graph.write(filepsuf)  # yay OntGraph defaults

        loge.info(f'dataset graph exported to {filepsuf}')

    return teds
def fromYaml(cls, in_path):
    in_path = aug.RepoPath(in_path).resolve()
    with open(in_path, 'rt') as f:
        blob = yaml.safe_load(f)

    if 'include' in blob:
        orgs = [OntResGit(path=aug.RepoPath(subblob['path']), ref=subblob['ref'])
                for subblob in blob['include']]
    else:
        orgs = [OntResGit(path=aug.RepoPath(subblob['path']), ref=subblob['ref'])
                for subblob in blob['paths']]

    if not orgs:
        raise ValueError(f'orgs is empty for {in_path}')

    referenceIndex = blob['referenceIndex']
    namespaces = blob['namespaces']
    if isinstance(namespaces, str):
        namespaces = namespaces.split(' ')

    snchf = cls(orgs=orgs, namespaces=namespaces, referenceIndex=referenceIndex)
    return cls(graph=snchf.populate(OntGraph()))
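# Hedged sketch of the yaml this parses, reconstructed from the keys read
# above; every value is a placeholder:
#
#   referenceIndex: uri.interlex.org
#   namespaces: ILX ilxtr
#   paths:
#     - path: ttl/nif.ttl
#       ref: HEAD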
def build_instances(self, instances, dids):
    folder = Path(self.config.out_graph_path()).parent
    # WOW do I need to implement the new/better way of
    # managing writing collections of neurons to graphs
    neuron_uri = next(NeuronACT.out_graph[:rdf.type:owl.Ontology])
    name = 'allen-cell-instances.ttl'
    base, _ = neuron_uri.rsplit('/', 1)
    uri = rdflib.URIRef(base + '/' + name)
    metadata = ((uri, rdf.type, owl.Ontology),)
    instance_graph = OntGraph(path=folder / name)
    instance_graph.bind('AIBSSPEC', AIBSSPEC)
    instance_graph.bind('npokb', npokb)
    [instance_graph.add(t) for t in metadata]
    [instance_graph.add(t) for t in instances]
    [instance_graph.add(t) for t in allDifferent(None, distinctMembers(*dids))]
    instance_graph.write()
def graph(self):
    if not hasattr(self, '_graph'):
        self._graph = populateFromJsonLd(OntGraph(), self.asJsonLd())
        OntCuries.populate(self._graph)
        self.populateHeader(self._graph)

    return self._graph
def debug(self, target=None):
    if target is None:
        target = self.asOwl

    OntGraph(namespace_manager=dict(OntCuries._dict)).populate_from_triples(
        target()).debug()
def graph(self):
    """ you can populate other graphs, but this one runs once """
    if not hasattr(self, '_graph'):
        graph = OntGraph()
        self.populate(graph)
        self._graph = graph

    return self._graph
def sneechReviewGraph(self, source_graph, namespaces, sneech_file=None,
                      path_out=None):
    # TODO cache
    (already, cannot, maybe, sneeches,
     maybe_sneeches) = self.preSneech(source_graph, namespaces)
    # TODO not entirely sure about the best place to put this ...
    self.reView(source_graph, maybe_sneeches)  # FIXME dump and commit

    review_graph = OntGraph(path=path_out)
    oq.OntCuries.populate(review_graph)
    review_graph.bind('snchn', str(snchn))  # FIXME -> curies probably
    review_graph.bind('sncho', str(sncho))  # FIXME -> curies probably
    review_graph.bind('h', str(sghashes))  # FIXME -> curies probably
    if sneech_file:
        sneech_file.populate(review_graph)

    gen = self.triples_review(already, cannot, maybe, sneeches, sneech_file)
    [review_graph.add(t) for t in gen]
    # TODO hasReport -> maybe_sneeches report / reView
    # TODO snchn predicate ordering
    return review_graph, maybe_sneeches
def setUpClass(cls):
    iri = 'https://cassava.ucsd.edu/sparc/preview/exports/protcur.ttl'
    cls.graph = OntGraph().parse(iri, format='ttl')
    cls.nsm = cls.graph.namespace_manager
    cls.spaql_templates = SparqlQueries(cls.nsm)
    cls._q_protocol_aspects = cls.spaql_templates.protocol_aspects()
    cls._q_protocol_inputs = cls.spaql_templates.protocol_inputs()
    cls._q_protocol_species_dose = cls.spaql_templates.protocol_species_dose()
def graphFromGithub(link, verbose=False):
    # mmmm no validation
    # also caching probably
    if verbose:
        log.info(link)

    g = OntGraph().parse(f'{link}?raw=true', format='turtle')
    OntCuries.populate(g)
    return g
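# Hedged usage sketch: the link is a placeholder for a github blob URL; the
# ?raw=true query above only resolves to raw turtle on github-hosted files.
def _example_graphFromGithub():
    link = 'https://github.com/SciCrunch/NIF-Ontology/blob/master/ttl/nif.ttl'
    g = graphFromGithub(link, verbose=True)
    return len(g)  # triple count, as a sanity check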
def loadall(git_local, repo_name, local=False, dobig=False):
    local_base = jpth(git_local, repo_name)
    lb_ttl = os.path.realpath(jpth(local_base, 'ttl'))

    #match = (rdflib.term.URIRef('http://purl.org/dc/elements/1.1/member'),  # iao.owl
             #rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
             #rdflib.term.URIRef('http://www.w3.org/2002/07/owl#AnnotationProperty'))

    done = []
    filenames = [f for g in ('*', '*/*', '*/*/*')
                 for f in glob(lb_ttl + '/' + g + '.ttl')]
    graph = OntGraph()
    for f in filenames:
        print(f)
        done.append(os.path.basename(f))
        graph.parse(f, format='turtle')
        #if match in graph:
            #raise BaseException('Evil file found %s' % f)

    def repeat(dobig=dobig):  # we don't really know when to stop, so just adjust
        for s, o in graph.subject_objects(owl.imports):
            if os.path.basename(o) not in done and o not in done:
                #if (o, rdf.type, owl.Ontology) not in graph:
                print(o)
                done.append(o)
                ext = os.path.splitext(o)[1]
                fmt = 'turtle' if ext == '.ttl' else 'xml'
                if noneMembers(o, *bigleaves) or dobig:
                    graph.parse(o, format=fmt)
                    #if match in graph:
                        #raise BaseException('Evil file found %s' % o)

    #if local:
        #repeat(False)
    #else:
    if not local:
        for i in range(10):
            repeat(True)

    return graph
def export_protcur(self, dump_path, *hypothesis_groups, no_network=False):
    # FIXME no_network passed in here is dumb
    #if (self.latest and  # FIXME NOTE this only points to the latest integrated release
        #self.latest_protcur_path.exists()):
        #blob_protcur = self.latest_protocols
    #else:
    pipeline = pipes.ProtcurPipeline(*hypothesis_groups, no_network=no_network)
    # FIXME NOTE this does not do the identifier expansion pass
    protcur = pipeline.data
    context = {
        **sc.base_context,
        **sc.protcur_context,
    }
    for f in ('meta', 'subjects', 'samples', 'contributors'):
        context.pop(f)  # FIXME HACK meta @graph for datasets

    ontology_header = {  # FIXME should probably not be added here since it is obscure ...
        '@id': 'https://cassava.ucsd.edu/sparc/ontologies/protcur.ttl',
        '@type': 'owl:Ontology',
    }
    protcur.append(ontology_header)

    blob_protcur = {  # FIXME this should not be defined here, so confusing that it is not with the pipeline ...
        '@context': context,
        'meta': {'count': len(protcur)},  # FIXME adjust to structure
        'prov': {'timestamp_export_start': self.timestamp,
                 'export_system_identifier': Path.sysid,
                 'export_hostname': gethostname(),},
        '@graph': protcur,  # FIXME regularize elements ?
    }

    dump_path.mkdir(parents=True, exist_ok=True)
    # FIXME TODO make these latest paths accessible
    # probably by splitting protcur export out into
    # its own class
    latest_path = dump_path.parent / 'LATEST'
    latest_partial_path = dump_path.parent / 'LATEST_PARTIAL'
    fn = dump_path / 'protcur.json'
    with open(fn, 'wt') as f:
        json.dump(blob_protcur, f, sort_keys=True, indent=2, cls=JEncode)

    symlink_latest(dump_path, latest_partial_path)

    g = populateFromJsonLd(OntGraph(), fn).write(fn.with_suffix('.ttl'))
    symlink_latest(dump_path, latest_path)

    return blob_protcur
def simplify(collapse, blob):
    to_remove = []
    for coll in collapse:
        exclude = set(p for p in coll)
        candidates = [e for e in blob['edges'] if e['pred'] in exclude]
        for c in candidates:
            # make sure we can remove the edges later
            # if they have meta the match will fail
            if 'meta' in c:
                c.pop('meta')

        if candidates:
            edges = [Edge.fromOboGraph(c) for c in candidates]
            g = OntGraph().populate_from_triples(e.asRdf() for e in edges)
            nxg = egl.rdflib_to_networkx_multidigraph(g)
            connected = list(nx.weakly_connected_components(nxg))  # FIXME may not be minimal
            ends = [e.asRdf()[-1] for e in edges if e.p == coll[-1]]
            for c in connected:
                #log.debug('\n' + pformat(c))
                nxgt = nx.MultiDiGraph()
                nxgt.add_edges_from(nxg.edges(c, keys=True))
                ordered_nodes = list(nx.topological_sort(nxgt))
                paths = [p
                         for n in nxgt.nodes()
                         for e in ends
                         for p in list(nx.all_simple_paths(nxgt, n, e))
                         if len(p) == len(coll) + 1]

                for path in sorted(paths):
                    ordered_edges = nxgt.edges(path, keys=True)
                    oe2 = [Edge.fromNx(e) for e in ordered_edges
                           if all([n in path for n in e[:2]])]
                    predicates = [e.p for e in oe2]
                    #log.debug('\n' + pformat(oe2))
                    if predicates == coll:  #in collapse:
                        to_remove.extend(zap(path, predicates, oe2, blob))
                    else:  # have to retain this branch to handle cases where the end predicate is duplicated
                        log.error('\n' + pformat(predicates) + '\n' + pformat(coll))
                        for preds in [coll]:
                            sublist_start = listIn(predicates, preds)
                            if sublist_start is not None:
                                i = sublist_start
                                j = i + len(preds)
                                npath = path[i:j + 1]  # + 1 to include final node
                                oe2 = oe2[i:j]
                                predicates = predicates[i:j]
                                to_remove.extend(zap(npath, predicates, oe2, blob))

    for r in to_remove:
        if r in blob['edges']:
            blob['edges'].remove(r)

    #log.debug('\n' + pformat(blob['edges']))
    return blob  # note that this is in place modification so sort of superfluous
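# Hedged illustration of simplify's inputs: `collapse` is a sequence of
# predicate chains to fold, and blob['edges'] is an obographs-style edge
# list; the predicate and node names here are invented for the example.
def _example_simplify():
    collapse = [['annotates', 'hasPart']]
    blob = {'edges': [{'sub': 'a', 'pred': 'annotates', 'obj': 'b'},
                      {'sub': 'b', 'pred': 'hasPart', 'obj': 'c'}]}
    return simplify(collapse, blob)  # blob is modified in place and returned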
def default(self):
    g = OntGraph().parse(
        auth.get_path('ontology-local-repo') / 'ttl/stimulation.ttl')
    preds = sorted(set(g.qname(p) for p in g.predicates()))
    header = [['id', 'rdf:type', 'rdfs:domain', 'rdfs:range', 'rdfs:label',
               'NIFRID:synonym', 'NIFRID:abbrev', 'definition:',
               'editorNote:', 'rdfs:comment']]
    _rows = []
    for type_ in (owl.ObjectProperty, owl.Class,):
        for s in sorted(g[:rdf.type:type_], key=natsort):
            if isinstance(s, rdflib.URIRef):
                row = [g.qname(s), g.qname(type_)] + [fun(g, s) for fun in funs]
                _rows.append(row)

    rows = header + _rows
    defs = Defs(readonly=not self.options.update)
    if self.options.update:
        defs.upsert(*rows)  # FIXME upsert broken on header reordering ?
        defs.commit()

    return rows
def main():
    url = ('http://data.bioontology.org/ontologies/'
           f'PCL/submissions/7/download?apikey={auth.user_config.secrets("bioportal")}')
    g = OntGraph().parse(url, format='application/rdf+xml')
    g = fixns(g)
    og = OntGraph()
    g.namespace_manager.populate(og)
    og.populate_from_triples(to_ebm(g))
    og.write('/tmp/pCL.ttl')
def graph(self):
    g = OntGraph()
    OntCuries.populate(g)
    self.populate(g)
    g.bind('local', self.context)
    g.bind('apinatomy', readable)  # FIXME populate from store
    g.bind('elements', elements)
    return g
def asOwl(self, identifier_function=lambda self: rdflib.BNode()):
    # FIXME maybe just use the render function? and control all
    # of this as part of the renderer?
    s = identifier_function(self)
    for type in (PhenotypeCollection, PhenotypeBase):
        combinators = (phenotype.combinator for phenotype in self
                       if isinstance(phenotype, type))
        combinators = list(combinators)
        if type == PhenotypeCollection and combinators:
            asdf = list(self.combinator(s, *combinators))
            OntGraph().populate_from_triples(asdf).debug()
            breakpoint()  # XXX

        yield from self.combinator(s, *combinators)
def loadData(cls):
    """ corresponds to the list of FMA ids from organParts
        for all organs in the sparc organsList """
    g = OntGraph()
    g.namespace_manager.populate_from(uPREFIXES)  # cls._ghead except fma doesn't define FMA:
    ol = cls.sgd.prod_sparc_organList()
    top_ids = [n['id'] for n in ol['nodes']]
    res = Async()(deferred(by_organ)(i, cls.sgd) for i in top_ids)
    #res = [by_organ(i, cls.sgd) for i in top_ids]
    #res_stats(res)  # why are there dupes? now we know!
    nodes = [n for o, r in res for n in r['nodes']]
    ids_raw = set(n['id'] for n in nodes
                  if not n['id'].startswith('_:') and n['id'] != 'owl:Nothing')
    ids = set(g.namespace_manager.expand(id).toPython() for id in ids_raw)
    return ids_raw, ids
def COMMENCE(self, *, namespaces=tuple(), orgs=tuple(),
             sneech_file=None, path_out=None, **kwargs):
    if sneech_file is not None and not orgs:
        return sneech_file.COMMENCE(self, path_out)

    if not orgs:
        raise TypeError('orgs cannot be empty!')

    source_graph = OntGraph()
    for org in orgs:
        org.populate(source_graph)

    #derp = g.namespace_manager.store.namespace
    #namespaces = [derp(p) for p in prefixes]  # FIXME prefix vs namespace
    rg, maybe_sneeches = self.sneechReviewGraph(source_graph, namespaces,
                                                sneech_file, path_out)