def integrity_check(config_dir=None):
    """
    Check integrity of the data set.

    For now the only check performed is referential integrity. Further
    checks may be added later and selected through additional arguments.

    :param str config_dir: Directory to read the configuration from. When
        given, the configuration is parsed from it and both ``env.config``
        and ``env.app_globals`` are replaced. When empty or ``None``,
        ``lakesuperior.env_setup`` is imported instead.

    :rtype: set
    :return: Everything yielded by the RDF layout's
        ``find_refint_violations()``, collected while the store transaction
        is still open.
    """
    if not config_dir:
        # The import itself is what populates ``env``.
        import lakesuperior.env_setup
    else:
        env.config = parse_config(config_dir)[0]
        env.app_globals = AppGlobals(env.config)

    rdfly = env.app_globals.rdfly
    with TxnManager(rdfly.store):
        # Materialize the results before the transaction is closed.
        return set(rdfly.find_refint_violations())
def check_refint(config_folder=None, output=None):
    """
    Check referential integrity.

    This command scans the graph store to verify that all references to
    resources within the repository are effectively pointing to existing
    resources. For repositories set up with the `referential_integrity` option
    (the default), this is a pre-condition for a consistent data set.

    If inconsistencies are found, a report is generated in CSV format with the
    following columns: `s`, `p`, `o` (respectively the terms of the triple
    containing the dangling relationship) and `missing` which indicates which
    term is the missing URI (currently always set to `o`).

    Note: this check can be run regardless of whether the repository enforces
    referential integrity.

    :param str config_folder: Alternative configuration folder. If not set,
        the default environment set-up is used.
    :param str output: Path of the CSV report. ``.csv`` is appended if the
        name does not already end with it. If not set, a timestamp-named file
        is created in the current working directory.
    """
    # The configuration folder is handed to the API call, which sets up the
    # environment from it (or from the defaults when it is ``None``).
    # Previously this function tried to do the set-up itself through an
    # undefined name (``config_dir``) and then never forwarded the folder,
    # so passing ``config_folder`` raised ``NameError``.
    check_results = admin_api.integrity_check(config_folder)

    click.echo('Integrity check results:')
    if not check_results:
        click.echo(
            click.style('Clean. ', fg='green', bold=True)
            + 'No inconsistency found. No report generated.')
        return

    click.echo(click.style('Inconsistencies found!', fg='red', bold=True))
    if not output:
        output = path.join(
            getcwd(), 'refint_report-{}.csv'.format(
                arrow.utcnow().format('YYYY-MM-DDTHH:mm:ss.S')))
    elif not output.endswith('.csv'):
        output += '.csv'

    # ``newline=''`` lets the csv module control line endings itself.
    with open(output, 'w', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(('s', 'p', 'o', 'missing'))
        for trp in check_results:
            # ``o`` is always hardcoded for now.
            writer.writerow([t.n3() for t in trp[0]] + ['o'])

    click.echo('Report generated at {}'.format(output))
from lakesuperior.config_parser import config
from lakesuperior.globals import AppGlobals
from lakesuperior.env import env

__doc__ = """
Default configuration.

Import this module to initialize the configuration for a production setup::

    >>> from lakesuperior import env_setup

Will load the default configuration.
"""

# These assignments run at import time: importing this module is what
# installs the default configuration and the application globals on ``env``.
env.config = config
env.app_globals = AppGlobals(config)
import sys

import pytest

# Make the current working directory importable before the project imports
# below are resolved.
sys.path.append('.')
from lakesuperior.config_parser import test_config
from lakesuperior.globals import AppGlobals
from lakesuperior.env import env

# Point the shared environment at the test configuration *before* the
# application modules below are imported.
env.config = test_config
env.app_globals = AppGlobals(test_config)
from lakesuperior.app import create_app
from lakesuperior.util.generators import random_image

# NOTE(review): ``env.config`` is assigned a second time after the imports
# above, presumably because one of them can reset it. Confirm before removing.
env.config = test_config


@pytest.fixture(scope='module')
def app():
    '''
    Provide an application built from the ``application`` configuration
    section, once per test module.
    '''
    app = create_app(env.config['application'])

    yield app


@pytest.fixture(scope='module')
def db(app):
    '''
    Set up and tear down test triplestore.
    '''
    # NOTE(review): only the bootstrap step is visible here; no ``yield`` or
    # teardown follows, so the fixture body may continue beyond this excerpt.
    rdfly = env.app_globals.rdfly
    rdfly.bootstrap()
def __init__(
        self, src, dest, src_auth=(None, None), clear=False,
        zero_binaries=False, compact_uris=False, skip_errors=False):
    """
    Set up base paths and, if requested, clean up existing directories.

    :param rdflib.URIRef src: Webroot of source repository. This must
        correspond to the LDP root node (for Fedora it can be e.g.
        ``http://localhost:8080/fcrepo/rest/``) and is used to determine if
        URIs retrieved are managed by this repository. Trailing slashes are
        stripped.
    :param str dest: Destination repository path. If ``clear`` is set it is
        deleted first. If its ``etc`` folder does not exist, the default
        configuration is copied there.
    :param tuple src_auth: if the source repo needs HTTP authentication,
        specify here username and password as a 2-tuple of strings.
    :param bool clear: Whether to clear any pre-existing data at the
        locations indicated. When set, the destination is removed, the
        application configuration is rewritten to point at the destination
        stores, and both stores are bootstrapped.
    :param bool zero_binaries: Whether to create zero-byte binary files
        rather than copy the sources.
    :param bool compact_uris: NOT IMPLEMENTED. Whether the process should
        attempt to compact URIs generated with broken up path segments. If
        the UID matches a pattern such as ``/12/34/56/123456...`` it is
        converted to ``/123456...``. This would remove a lot of cruft caused
        by the pairtree segments. Note that this will change the publicly
        exposed URIs. If durability is a concern, a rewrite directive can be
        added to the HTTP server that proxies the WSGI endpoint.
    :param bool skip_errors: Stored on the instance as ``skip_errors``; not
        used by the constructor itself.
    """
    # Derive the repository layout from the destination root.
    root = '{}'.format(dest)
    self.dbpath = root + '/data/ldprs_store'
    self.fpath = root + '/data/ldpnr_store'
    self.config_dir = root + '/etc'
    self.auth = src_auth

    if clear:
        shutil.rmtree(dest, ignore_errors=True)
    if not path.isdir(self.config_dir):
        defaults_dir = '{}/etc.defaults'.format(basedir)
        shutil.copytree(defaults_dir, self.config_dir)

    # Point both stores at the destination.
    # NOTE(review): when ``clear`` is false these overrides are never written
    # back, so the configuration already on disk is what gets loaded below.
    # Confirm this is intended.
    orig_config = parse_config(self.config_dir)
    store_conf = orig_config['application']['store']
    store_conf['ldp_rs']['location'] = self.dbpath
    store_conf['ldp_nr']['path'] = self.fpath

    if clear:
        app_config_path = '{}/application.yml'.format(self.config_dir)
        with open(app_config_path, 'w') as config_file:
            serialized = yaml.dump(orig_config['application'])
            config_file.write(serialized)

    # Re-read the configuration from disk and install it globally.
    app_globals = AppGlobals(parse_config(self.config_dir))
    env.app_globals = app_globals

    self.rdfly = app_globals.rdfly
    self.nonrdfly = app_globals.nonrdfly

    if clear:
        # ``True`` is passed straight through to the store's ``txn_mgr``.
        with app_globals.rdf_store.txn_mgr(True):
            self.rdfly.bootstrap()
            self.rdfly.store.close()
        self.nonrdfly.bootstrap()

    self.src = src.rstrip('/')
    self.zero_binaries = zero_binaries
    self.skip_errors = skip_errors
def __init__(
        self, src, dest, zero_binaries=False, compact_uris=False,
        skip_errors=False):
    """
    Set up base paths and clean up existing directories.

    :param rdflib.URIRef src: Webroot of source repository. This must
        correspond to the LDP root node (for Fedora it can be e.g.
        ``http://localhost:8080/fcrepo/rest/``) and is used to determine if
        URIs retrieved are managed by this repository. Trailing slashes are
        stripped.
    :param str dest: Destination repository path. If the location exists it
        must be a writable directory. It will be deleted and recreated. If it
        does not exist, it will be created along with its parents if missing.
    :param bool zero_binaries: Stored on the instance as ``zero_binaries``;
        not used by the constructor itself.
    :param bool compact_uris: NOT IMPLEMENTED. Whether the process should
        attempt to compact URIs generated with broken up path segments. If
        the UID matches a pattern such as ``/12/34/56/123456...`` it is
        converted to ``/123456...``. This would remove a lot of cruft caused
        by the pairtree segments. Note that this will change the publicly
        exposed URIs. If durability is a concern, a rewrite directive can be
        added to the HTTP server that proxies the WSGI endpoint.
    :param bool skip_errors: Stored on the instance as ``skip_errors``; not
        used by the constructor itself.
    """
    # The default configuration lives two levels above this file.
    pkg_root = path.dirname(path.dirname(path.abspath(__file__)))
    root = '{}'.format(dest)
    self.dbpath = root + '/data/ldprs_store'
    self.fpath = root + '/data/ldpnr_store'
    self.config_dir = root + '/etc'

    # Start from a clean destination seeded with the default configuration.
    shutil.rmtree(dest, ignore_errors=True)
    defaults_dir = '{}/etc.defaults'.format(pkg_root)
    shutil.copytree(defaults_dir, self.config_dir)

    # Point both stores at the destination and persist the change.
    orig_config, _ = parse_config(self.config_dir)
    store_conf = orig_config['application']['store']
    store_conf['ldp_rs']['location'] = self.dbpath
    store_conf['ldp_nr']['path'] = self.fpath

    app_config_path = '{}/application.yml'.format(self.config_dir)
    with open(app_config_path, 'w') as config_file:
        serialized = yaml.dump(orig_config['application'])
        config_file.write(serialized)

    # Re-read the configuration just written and install it globally.
    env.config = parse_config(self.config_dir)[0]
    app_globals = AppGlobals(env.config)
    env.app_globals = app_globals

    self.rdfly = app_globals.rdfly
    self.nonrdfly = app_globals.nonrdfly

    with TxnManager(app_globals.rdf_store, write=True):
        self.rdfly.bootstrap()
        self.rdfly.store.close()
    self.nonrdfly.bootstrap()

    self.src = src.rstrip('/')
    self.zero_binaries = zero_binaries
    self.skip_errors = skip_errors