def test_delete_from_ctx(self, store):
    '''
    Delete triples from a named graph and from the default graph.
    '''
    gr_uri = URIRef('urn:bogus:graph#a')
    gr2_uri = URIRef('urn:bogus:graph#b')

    # Count distinct matches for a pattern, optionally within a context.
    def cnt(*args):
        return len(set(store.triples(*args)))

    # Wipe the second named graph; the first one must be untouched.
    with TxnManager(store, True):
        store.remove((None, None, None), gr2_uri)
        assert cnt((None, None, None), gr2_uri) == 0
        assert cnt((None, None, None), gr_uri) == 3

    # A context-less removal deletes the triple from all contexts.
    with TxnManager(store, True):
        store.remove((URIRef('urn:s:1'), None, None))
        assert cnt((None, None, None), gr_uri) == 2
        assert cnt((None, None, None)) == 3

    # Removal scoped to the default graph.
    with TxnManager(store, True):
        store.remove(
                (URIRef('urn:s:4'), None, None), RDFLIB_DEFAULT_GRAPH_URI)
        assert cnt((None, None, None)) == 2

    # Full wipe.
    with TxnManager(store, True):
        store.remove((None, None, None))
        assert cnt((None, None, None)) == 0
        assert cnt((None, None, None), gr_uri) == 0
        assert len(store) == 0
def test_ctx_mgr(self, store):
    '''
    Test enclosing a transaction in a context.
    '''
    # A clean exit from the context must close the transaction.
    with TxnManager(store):
        pass
    assert not store.is_txn_open

    # An exception propagating out of the context must close it too.
    try:
        with TxnManager(store):
            raise RuntimeError
    except RuntimeError:
        assert not store.is_txn_open
def test_rollback(self, store):
    '''
    Test rolling back a transaction.
    '''
    trp = (
        URIRef('urn:nogo:s'), URIRef('urn:nogo:p'), URIRef('urn:nogo:o'))
    try:
        with TxnManager(store, True):
            store.add(trp)
            raise RuntimeError  # This should roll back the transaction.
    except RuntimeError:
        pass

    # The aborted write must not be visible.
    with TxnManager(store):
        assert len(set(store.triples((None, None, None)))) == 0
def test_add_trp_to_ctx(self, store):
    '''
    Test adding triples to a graph.
    '''
    gr_uri = URIRef('urn:bogus:graph#a')   # From previous test
    gr2_uri = URIRef('urn:bogus:graph#b')  # Never created before
    trp1, trp2, trp3, trp4 = (
        (
            URIRef('urn:s:{}'.format(i)), URIRef('urn:p:{}'.format(i)),
            URIRef('urn:o:{}'.format(i)))
        for i in range(1, 5))

    with TxnManager(store, True):
        store.add(trp1, gr_uri)
        store.add(trp2, gr_uri)
        store.add(trp2, gr_uri)  # Duplicate; dropped.
        store.add(trp2, None)    # Goes to the default graph.
        store.add(trp3, gr2_uri)
        store.add(trp3, gr_uri)
        store.add(trp4)          # Goes to the default graph.

        # Count distinct matches, optionally within a context.
        def cnt(*ctx):
            return len(set(store.triples((None, None, None), *ctx)))

        # Bare triples matched, optionally within a context.
        def match(*ctx):
            return _clean(store.triples((None, None, None), *ctx))

        assert cnt() == 4
        assert cnt(RDFLIB_DEFAULT_GRAPH_URI) == 2
        assert cnt(gr_uri) == 3
        assert cnt(gr2_uri) == 1
        assert gr2_uri in {gr.identifier for gr in store.contexts()}
        assert trp1 in match()
        assert trp1 not in match(RDFLIB_DEFAULT_GRAPH_URI)
        assert trp2 in match(gr_uri)
        assert trp2 in match()
        assert trp3 in match(gr2_uri)
        assert trp3 not in match(RDFLIB_DEFAULT_GRAPH_URI)
def test_pfx2ns(self, store, bindings):
    '''
    Test looking up a prefix from its bound namespace.
    '''
    with TxnManager(store, True):
        for pfx, ns in bindings:
            assert store.prefix(ns) == pfx
def test_ns2pfx(self, store, bindings):
    '''
    Test prefix to namespace conversion.
    '''
    # NOTE: the original docstring said "namespace to prefix conversion",
    # copy-pasted from the sibling test; ``store.namespace(pfx)`` looks up
    # the namespace bound to a prefix.
    with TxnManager(store, True):
        for pfx, ns in bindings:
            assert store.namespace(pfx) == ns
def test_add_graph(self, store):
    '''
    Test creating an empty and a non-empty graph.
    '''
    gr_uri = URIRef('urn:bogus:graph#a')
    with TxnManager(store, True):
        store.add_graph(gr_uri)
        ctx_ids = {gr.identifier for gr in store.contexts()}
        assert gr_uri in ctx_ids
def count_rsrc(self):
    """
    Return a count of first-class resources, subdivided in "live" and
    historic snapshots.
    """
    with TxnManager(self.ds.store):
        # Subjects of foaf:primaryTopic statements in the metadata and
        # history graphs respectively.
        live = set(self.ds.graph(META_GR_URI)[:nsc['foaf'].primaryTopic:])
        snap = set(self.ds.graph(HIST_GR_URI)[:nsc['foaf'].primaryTopic:])

        return {'main': len(live), 'hist': len(snap)}
def stats():
    '''
    Get repository statistics.

    @return dict Store statistics, resource statistics.
    '''
    # Resource counts do not need the explicit transaction below.
    rsrc_stats = env.app_globals.rdfly.count_rsrc()
    with TxnManager(env.app_globals.rdf_store):
        store_stats = env.app_globals.rdf_store.stats()

    return {'rsrc_stats': rsrc_stats, 'store_stats': store_stats}
def test_empty_context(self, store):
    '''
    Test creating and deleting empty contexts.
    '''
    gr_uri = URIRef('urn:bogus:empty#a')

    # Context identifiers currently known to the store.
    def ctx_ids():
        return {gr.identifier for gr in store.contexts()}

    with TxnManager(store, True):
        store.add_graph(gr_uri)
        assert gr_uri in ctx_ids()
        store.remove_graph(gr_uri)
        assert gr_uri not in ctx_ids()
def stats():
    """
    Get repository statistics.

    :rtype: dict
    :return: Store statistics, resource statistics.
    """
    # NOTE(review): presumably imported for its environment-setup side
    # effects — confirm.
    import lakesuperior.env_setup

    rdf_store = env.app_globals.rdf_store
    repo_stats = {'rsrc_stats': env.app_globals.rdfly.count_rsrc()}
    with TxnManager(rdf_store):
        repo_stats['store_stats'] = rdf_store.stats()

    return repo_stats
def _wrapper(*args, **kwargs):
    """
    Run the wrapped function inside a store transaction.

    Sets the transaction begin timestamp on ``env`` (used for create and
    update timestamps on resources), executes the wrapped function within
    a transaction, then spawns a background thread to process the event
    queue if anything was added to the changelog.
    """
    # Mark transaction begin timestamp. This is used for create and
    # update timestamps on resources.
    env.timestamp = arrow.utcnow()
    env.timestamp_term = Literal(env.timestamp, datatype=XSD.dateTime)
    try:
        with TxnManager(env.app_globals.rdf_store, write=write) as txn:
            ret = fn(*args, **kwargs)

        if env.app_globals.changelog:
            job = Thread(target=_process_queue)
            job.start()
    finally:
        # Clean up even if the wrapped call raises, so a stale timestamp
        # does not leak into the next request. (The original only deleted
        # these attributes on the success path.)
        delattr(env, 'timestamp')
        delattr(env, 'timestamp_term')

    return ret
def test_triple_match_1bound(self, store):
    '''
    Test triple patterns matching one bound term.
    '''
    expected = {(
        URIRef('urn:test:s'), URIRef('urn:test:p'), URIRef('urn:test:o'))}
    with TxnManager(store):
        by_s = set(store.triples((URIRef('urn:test:s'), None, None)))
        by_p = set(store.triples((None, URIRef('urn:test:p'), None)))
        by_o = set(store.triples((None, None, URIRef('urn:test:o'))))
        # All three one-bound patterns must hit the same single triple.
        assert _clean(by_s) == expected
        assert _clean(by_p) == _clean(by_s)
        assert _clean(by_o) == _clean(by_p)
def test_ns(self, store, bindings):
    '''
    Test namespace bindings.
    '''
    with TxnManager(store, True):
        for b in bindings:
            store.bind(*b)

        nslist = list(store.namespaces())
        assert len(nslist) == len(bindings)
        # Bindings must come back pairwise identical, in order.
        for stored, expected in zip(nslist, bindings):
            assert stored == expected
def integrity_check(config_dir=None):
    """
    Check integrity of the data set.

    At the moment this is limited to referential integrity. Other checks can
    be added and triggered by different argument flags.
    """
    if config_dir:
        env.config = parse_config(config_dir)[0]
        env.app_globals = AppGlobals(env.config)
    else:
        # Fall back to the default environment setup.
        import lakesuperior.env_setup

    with TxnManager(env.app_globals.rdfly.store):
        return set(env.app_globals.rdfly.find_refint_violations())
def test_remove(self, store):
    '''
    Test removing one or more triples.
    '''
    with TxnManager(store, True):
        # Remove one fully bound triple.
        store.remove((
            URIRef('urn:test:s3'), URIRef('urn:test:p3'),
            URIRef('urn:test:o3')))
        assert len(set(store.triples((None, None, None)))) == 2

        # Remove every triple with the given subject.
        store.remove((URIRef('urn:test:s'), None, None))
        assert len(set(store.triples((None, None, None)))) == 0
def bootstrap(self):
    '''
    Delete all graphs and insert the basic triples.
    '''
    logger.info('Deleting all data from the graph store.')
    store = self.ds.store
    # Abort any dangling transaction before destroying the store.
    if getattr(store, 'is_txn_open', False):
        store.rollback()
    store.destroy(store.path)

    logger.info('Initializing the graph store with system data.')
    store.open()
    with TxnManager(store, True), \
            open('data/bootstrap/rsrc_centric_layout.sparql', 'r') as f:
        self.ds.update(f.read())
def test_create_triple(self, store):
    '''
    Test creation of a single triple.
    '''
    trp = (
        URIRef('urn:test:s'), URIRef('urn:test:p'), URIRef('urn:test:o'))
    with TxnManager(store, True):
        store.add(trp)

        all_res = set(store.triples((None, None, None)))
        exact_res = set(store.triples(trp))
        assert len(all_res) == 1
        assert len(exact_res) == 1
        # The triple must show up in both the wildcard and exact match.
        assert trp in _clean(all_res) & _clean(exact_res)
def sparql_query(qry_str, fmt):
    '''
    Send a SPARQL query to the triplestore.

    @param qry_str (str) SPARQL query string. SPARQL 1.1 Query Language
    (https://www.w3.org/TR/sparql11-query/) is supported.
    @param fmt(string) Serialization format. This varies depending on the
    query type (SELECT, ASK, CONSTRUCT, etc.). [@TODO Add reference to
    RDFLib serialization formats]

    @return BytesIO
    '''
    with TxnManager(rdf_store):
        # Serialize within the transaction; the stream outlives it.
        out_stream = BytesIO(rdfly.raw_query(qry_str).serialize(format=fmt))

    return out_stream
def sparql_query(qry_str, fmt):
    """
    Send a SPARQL query to the triplestore.

    :param str qry_str: SPARQL query string. SPARQL 1.1 Query Language
        (https://www.w3.org/TR/sparql11-query/) is supported.
    :param str fmt: Serialization format. This varies depending on the
        query type (SELECT, ASK, CONSTRUCT, etc.). [TODO Add reference to
        RDFLib serialization formats]

    :rtype: BytesIO
    :return: Serialized SPARQL results.
    """
    with TxnManager(rdf_store):
        # Serialize within the transaction; the stream outlives it.
        out_stream = BytesIO(rdfly.raw_query(qry_str).serialize(format=fmt))

    return out_stream
def test_triple_no_match(self, store):
    '''
    Test various mismatches.
    '''
    with TxnManager(store, True):
        store.add((
            URIRef('urn:test:s'),
            URIRef('urn:test:p2'), URIRef('urn:test:o2')))
        store.add((
            URIRef('urn:test:s3'),
            URIRef('urn:test:p3'), URIRef('urn:test:o3')))
        assert len(set(store.triples((None, None, None)))) == 3

        # Each pattern combines terms from different stored triples and
        # must therefore match nothing.
        mismatches = (
            (URIRef('urn:test:s2'), URIRef('urn:test:p'), None),
            (URIRef('urn:test:s3'), None, URIRef('urn:test:o')),
            (None, URIRef('urn:test:p3'), URIRef('urn:test:o2')),
        )
        for pattern in mismatches:
            assert len(set(store.triples(pattern))) == 0
def bootstrap():
    """
    Bootstrap binary and graph stores.

    This script will parse configuration files and initialize a filesystem
    and triplestore with an empty FCREPO repository.
    It is used in test suites and on a first run.

    Additional scaffolding files may be parsed to create initial contents.
    """
    import lakesuperior.env_setup

    rdfly = env.app_globals.rdfly
    nonrdfly = env.app_globals.nonrdfly

    # Destructive operation: require explicit confirmation.
    warning = click.style(
            'WARNING: This operation will WIPE ALL YOUR DATA.\n',
            bold=True, fg='red')
    click.echo(
            warning + 'Are you sure? (Please type `yes` to continue) > ',
            nl=False)
    if input().lower() != 'yes':
        click.echo('Aborting.')
        sys.exit(1)

    click.echo('Initializing graph store at {}'.format(rdfly.store.path))
    with TxnManager(env.app_globals.rdf_store, write=True):
        rdfly.bootstrap()
        rdfly.store.close()
    click.echo('Graph store initialized.')

    click.echo('Initializing binary store at {}'.format(nonrdfly.root))
    nonrdfly.bootstrap()
    click.echo('Binary store initialized.')

    click.echo('Repository successfully set up. Go to town.')
def _crawl(self, uid):
    """
    Get the contents of a resource and its relationships recursively.

    This method recurses into itself each time a reference to a resource
    managed by the repository is encountered.

    :param str uid: The path relative to the source server webroot
        pointing to the resource to crawl, effectively the resource UID.
    """
    ibase = str(nsc['fcres'])
    # Public URI of source repo.
    uri = self.src + uid
    # Internal URI of destination.
    iuri = ibase + uid

    # Probe headers first; the body is fetched separately below.
    rsp = requests.head(uri)
    if not self.skip_errors:
        rsp.raise_for_status()
    elif rsp.status_code > 399:
        # Best-effort mode: report the error and continue.
        print('Error retrieving resource {} headers: {} {}'.format(
            uri, rsp.status_code, rsp.text))

    # Determine LDP type from the Link headers; anything that does not
    # advertise an RDF source or container is treated as an LDP-NR.
    ldp_type = 'ldp_nr'
    try:
        for link in requests.utils.parse_header_links(
                rsp.headers.get('link')):
            if (
                    link.get('rel') == 'type'
                    and (
                        link.get('url') == str(nsc['ldp'].RDFSource)
                        or link.get('url') == str(nsc['ldp'].Container))):
                # Resource is an LDP-RS.
                ldp_type = 'ldp_rs'
                break
    except TypeError:
        # No Link header at all: parse_header_links got None.
        ldp_type = 'ldp_rs'
        #raise ValueError('URI {} is not an LDP resource.'.format(uri))

    # Get the whole RDF document now because we have to know all outbound
    # links. For an LDP-NR the RDF lives at the fcr:metadata endpoint.
    get_uri = (
            uri if ldp_type == 'ldp_rs' else '{}/fcr:metadata'.format(uri))
    get_rsp = requests.get(get_uri)
    if not self.skip_errors:
        get_rsp.raise_for_status()
    elif get_rsp.status_code > 399:
        print('Error retrieving resource {} body: {} {}'.format(
            uri, get_rsp.status_code, get_rsp.text))

    # Rewrite source-repo URIs into internal destination URIs.
    # NOTE(review): byte-level replace assumes the payload is UTF-8
    # Turtle — confirm for non-ASCII source webroots.
    data = get_rsp.content.replace(
            self.src.encode('utf-8'), ibase.encode('utf-8'))
    gr = Graph(identifier=iuri).parse(data=data, format='turtle')

    # Store raw graph data. No checks.
    with TxnManager(self.rdfly.store, True):
        self.rdfly.modify_rsrc(uid, add_trp=set(gr))

    # Grab binary and set new resource parameters.
    if ldp_type == 'ldp_nr':
        provided_imr = gr.resource(URIRef(iuri))
        if self.zero_binaries:
            # Write an empty payload instead of downloading the binary.
            data = b''
        else:
            bin_rsp = requests.get(uri)
            if not self.skip_errors:
                bin_rsp.raise_for_status()
            elif bin_rsp.status_code > 399:
                print('Error retrieving resource {} body: {} {}'.format(
                    uri, bin_rsp.status_code, bin_rsp.text))
            data = bin_rsp.content
        #import pdb; pdb.set_trace()
        # Derive the storage key from the message digest URN
        # (everything after the last ':').
        uuid = str(gr.value(
            URIRef(iuri), nsc['premis'].hasMessageDigest)).split(':')[-1]
        fpath = self.nonrdfly.local_path(
                self.nonrdfly.config['path'], uuid)
        makedirs(path.dirname(fpath), exist_ok=True)
        with open(fpath, 'wb') as fh:
            fh.write(data)

    # Progress indicator.
    self._ct += 1
    if self._ct % 10 == 0:
        print('{} resources processed so far.'.format(self._ct))

    # Now, crawl through outbound links.
    # LDP-NR fcr:metadata must be checked too.
    for pred, obj in gr.predicate_objects():
        #import pdb; pdb.set_trace()
        obj_uid = obj.replace(ibase, '')
        with TxnManager(self.rdfly.store, True):
            # Only follow in-repo URIs that have not been migrated yet.
            conditions = bool(
                isinstance(obj, URIRef)
                and obj.startswith(iuri)
                # Avoid ∞ loop with fragment URIs.
                and str(urldefrag(obj).url) != str(iuri)
                # Avoid ∞ loop with circular references.
                and not self.rdfly.ask_rsrc_exists(obj_uid)
                and pred not in self.ignored_preds)
        if conditions:
            print('Object {} will be crawled.'.format(obj_uid))
            self._crawl(urldefrag(obj_uid).url)
def __init__(
        self, src, dest, zero_binaries=False, compact_uris=False,
        skip_errors=False):
    """
    Set up base paths and clean up existing directories.

    :param rdflib.URIRef src: Webroot of source repository. This must
        correspond to the LDP root node (for Fedora it can be e.g.
        ``http://localhost:8080fcrepo/rest/``) and is used to determine if
        URIs retrieved are managed by this repository.
    :param str dest: Destination repository path. If the location exists
        it must be a writable directory. It will be deleted and recreated.
        If it does not exist, it will be created along with its parents if
        missing.
    :param bool zero_binaries: Whether to write zero-byte placeholder
        files instead of downloading binary contents.
    :param bool compact_uris: NOT IMPLEMENTED. Whether the process should
        attempt to compact URIs generated with broken up path segments. If
        the UID matches a pattern such as ``/12/34/56/123456...`` it is
        converted to ``/123456...``. This would remove a lot of cruft
        caused by the pairtree segments. Note that this will change the
        publicly exposed URIs. If durability is a concern, a rewrite
        directive can be added to the HTTP server that proxies the WSGI
        endpoint.
    :param bool skip_errors: Whether HTTP errors from the source
        repository are reported and skipped rather than raised.
    """
    # Set up repo folder structure and copy default configuration to
    # destination file.
    cur_dir = path.dirname(path.dirname(path.abspath(__file__)))
    self.dbpath = '{}/data/ldprs_store'.format(dest)
    self.fpath = '{}/data/ldpnr_store'.format(dest)
    self.config_dir = '{}/etc'.format(dest)
    # Destructive: wipes any pre-existing destination tree.
    shutil.rmtree(dest, ignore_errors=True)
    shutil.copytree(
            '{}/etc.defaults'.format(cur_dir), self.config_dir)

    # Modify and overwrite destination configuration.
    orig_config, _ = parse_config(self.config_dir)
    orig_config['application']['store']['ldp_rs']['location'] = self.dbpath
    orig_config['application']['store']['ldp_nr']['path'] = self.fpath
    with open('{}/application.yml'.format(self.config_dir), 'w') \
            as config_file:
        config_file.write(yaml.dump(orig_config['application']))

    # Reload the environment from the rewritten configuration.
    env.config = parse_config(self.config_dir)[0]
    env.app_globals = AppGlobals(env.config)

    self.rdfly = env.app_globals.rdfly
    self.nonrdfly = env.app_globals.nonrdfly

    # Initialize both stores from scratch.
    with TxnManager(env.app_globals.rdf_store, write=True) as txn:
        self.rdfly.bootstrap()
        self.rdfly.store.close()
    env.app_globals.nonrdfly.bootstrap()

    self.src = src.rstrip('/')
    self.zero_binaries = zero_binaries
    self.skip_errors = skip_errors