Exemple #1
0
def integrity_check(config_dir=None):
    """
    Run integrity checks against the data set and collect the violations.

    Only referential integrity is verified at present. Further checks may be
    added later and selected through additional arguments.

    :param str config_dir: Directory holding the configuration to use. If
        given, the configuration is parsed from it and both ``env.config`` and
        ``env.app_globals`` are replaced. If omitted or empty,
        ``lakesuperior.env_setup`` is imported instead.

    :rtype: set
    :return: The items produced by ``find_refint_violations()``. The set is
        empty if nothing was reported.
    """
    if not config_dir:
        # Imported purely for its import-time side effects.
        import lakesuperior.env_setup
    else:
        env.config = parse_config(config_dir)[0]
        env.app_globals = AppGlobals(env.config)

    with TxnManager(env.app_globals.rdfly.store):
        # Materialize the results while the transaction is still open.
        return set(env.app_globals.rdfly.find_refint_violations())
Exemple #2
0
def check_refint(config_folder=None, output=None):
    """
    Check referential integrity.

    This command scans the graph store to verify that all references to
    resources within the repository are effectively pointing to existing
    resources. For repositories set up with the `referential_integrity` option
    (the default), this is a pre-condition for a consistent data set.

    If inconsistencies are found, a report is generated in CSV format with the
    following columns: `s`, `p`, `o` (respectively the terms of the
    triple containing the dangling relationship) and `missing` which
    indicates which term is the missing URI (currently always set to `o`).

    Note: this check can be run regardless of whether the repository enforces
    referential integrity.
    """
    # NOTE: the docstring above is kept verbatim because it may double as the
    # user-facing help text of the command.
    #
    # config_folder: optional folder to load the configuration from. When
    #     empty, ``lakesuperior.env_setup`` is imported instead.
    # output: optional path of the CSV report. When empty, a timestamped file
    #     name in the current working directory is used. A ``.csv`` suffix is
    #     appended if the given path lacks one.
    if config_folder:
        # Use the folder passed by the caller; the name ``config_dir`` does
        # not exist in this scope and raised NameError.
        env.app_globals = AppGlobals(parse_config(config_folder))
    else:
        import lakesuperior.env_setup

    # NOTE(review): the check is invoked without arguments, so it relies on
    # the environment prepared above - confirm that the callee does not reset
    # it to the default configuration.
    check_results = admin_api.integrity_check()

    click.echo('Integrity check results:')
    if len(check_results):
        click.echo(click.style('Inconsistencies found!', fg='red', bold=True))
        if not output:
            output = path.join(
                getcwd(), 'refint_report-{}.csv'.format(
                    arrow.utcnow().format('YYYY-MM-DDTHH:mm:ss.S')))
        elif not output.endswith('.csv'):
            output += '.csv'

        # ``newline=''`` lets the csv module control line endings itself.
        with open(output, 'w', newline='') as fh:
            writer = csv.writer(fh)
            writer.writerow(('s', 'p', 'o', 'missing'))
            for trp in check_results:
                # ``o`` is always hardcoded for now.
                writer.writerow([t.n3() for t in trp[0]] + ['o'])

        click.echo('Report generated at {}'.format(output))
    else:
        click.echo(
            click.style('Clean. ', fg='green', bold=True) +
            'No inconsistency found. No report generated.')
Exemple #3
0
from lakesuperior.config_parser import config
from lakesuperior.globals import AppGlobals
from lakesuperior.env import env

# Module documentation, assigned explicitly to ``__doc__``.
__doc__ = """
Default configuration.

Import this module to initialize the configuration for a production setup::

    >>>from lakesuperior import env_setup

Will load the default configuration.
"""

# Import-time side effects: publish the default configuration and the
# application globals built from it on the shared ``env`` object.
# NOTE(review): the configuration is assigned before ``AppGlobals`` is built;
# confirm whether that order matters before rearranging these lines.
env.config = config
env.app_globals = AppGlobals(config)
Exemple #4
0
import sys

import pytest

# Add the current working directory to the module search path before the
# project imports below.
sys.path.append('.')
from lakesuperior.config_parser import test_config
from lakesuperior.globals import AppGlobals
from lakesuperior.env import env

# Point the shared environment at the test configuration.
env.config = test_config
env.app_globals = AppGlobals(test_config)
# NOTE(review): these imports are placed after the environment setup above,
# presumably because the imported modules read ``env`` at import time -
# confirm before moving them to the top of the file.
from lakesuperior.app import create_app
from lakesuperior.util.generators import random_image

# NOTE(review): repeats the assignment made above; looks redundant unless the
# imports in between modify ``env.config`` - confirm before removing.
env.config = test_config


@pytest.fixture(scope='module')
def app():
    """
    Provide the application under test, created once per test module.

    The application is built by ``create_app`` from the ``application``
    section of ``env.config``.
    """
    yield create_app(env.config['application'])


@pytest.fixture(scope='module')
def db(app):
    """
    Bootstrap the test triplestore once per test module.

    Requesting the ``app`` fixture ensures the application has been created
    before the store is bootstrapped.

    NOTE(review): the body shown here only bootstraps the store; no yielded
    value or tear-down step is visible - confirm whether that is intended.
    """
    env.app_globals.rdfly.bootstrap()
Exemple #5
0
    def __init__(self,
                 src,
                 dest,
                 src_auth=(None, None),
                 clear=False,
                 zero_binaries=False,
                 compact_uris=False,
                 skip_errors=False):
        """
        Prepare the destination repository layout and configuration.

        :param rdflib.URIRef src: Webroot of the source repository. This must
            correspond to the LDP root node (for Fedora it can be e.g.
            ``http://localhost:8080/fcrepo/rest/``) and is used to determine
            if URIs retrieved are managed by this repository. Trailing
            slashes are stripped before it is stored.
        :param str dest: Destination repository path. Data stores are placed
            under ``<dest>/data`` and configuration under ``<dest>/etc``. The
            location is removed beforehand only if ``clear`` is set. If the
            configuration folder is missing, it is copied from the defaults.
        :param tuple src_auth: If the source repo needs HTTP authentication,
            specify here username and password as a 2-tuple of strings.
        :param bool clear: Whether to clear any pre-existing data at the
            locations indicated. When set, the destination is removed, the
            adjusted configuration is written out and both stores are
            bootstrapped.
        :param bool zero_binaries: Whether to create zero-byte binary files
            rather than copy the sources. Only stored by this method.
        :param bool compact_uris: NOT IMPLEMENTED. Accepted but not used by
            this method.
        :param bool skip_errors: Stored on the instance; not used by this
            method.
        """
        # Destination layout: RDF store, binary store and configuration.
        self.dbpath = '{}/data/ldprs_store'.format(dest)
        self.fpath = '{}/data/ldpnr_store'.format(dest)
        self.config_dir = '{}/etc'.format(dest)
        self.auth = src_auth

        if clear:
            shutil.rmtree(dest, ignore_errors=True)
        if not path.isdir(self.config_dir):
            # Seed the destination with the default configuration.
            defaults_dir = '{}/etc.defaults'.format(basedir)
            shutil.copytree(defaults_dir, self.config_dir)

        # Point the store locations of the parsed configuration at the
        # destination paths.
        dest_config = parse_config(self.config_dir)
        store_config = dest_config['application']['store']
        store_config['ldp_rs']['location'] = self.dbpath
        store_config['ldp_nr']['path'] = self.fpath

        if clear:
            # Persist the adjusted application section only on a fresh setup.
            app_config_path = '{}/application.yml'.format(self.config_dir)
            with open(app_config_path, 'w') as config_fh:
                config_fh.write(yaml.dump(dest_config['application']))

        # Re-read the configuration from disk and publish the globals.
        env.app_globals = AppGlobals(parse_config(self.config_dir))

        self.rdfly = env.app_globals.rdfly
        self.nonrdfly = env.app_globals.nonrdfly

        if clear:
            # Bootstrap both stores; the RDF one inside a transaction.
            with env.app_globals.rdf_store.txn_mgr(True):
                self.rdfly.bootstrap()
                self.rdfly.store.close()
            env.app_globals.nonrdfly.bootstrap()

        self.src = src.rstrip('/')
        self.zero_binaries = zero_binaries
        self.skip_errors = skip_errors
Exemple #6
0
    def __init__(self,
                 src,
                 dest,
                 zero_binaries=False,
                 compact_uris=False,
                 skip_errors=False):
        """
        Recreate the destination repository and bootstrap its stores.

        :param rdflib.URIRef src: Webroot of the source repository. This must
            correspond to the LDP root node (for Fedora it can be e.g.
            ``http://localhost:8080/fcrepo/rest/``) and is used to determine
            if URIs retrieved are managed by this repository. Trailing
            slashes are stripped before it is stored.
        :param str dest: Destination repository path. If the location exists
            it must be a writable directory. It is deleted and recreated with
            data stores under ``<dest>/data`` and configuration, copied from
            the defaults, under ``<dest>/etc``.
        :param bool zero_binaries: Stored on the instance; not used by this
            method.
        :param bool compact_uris: NOT IMPLEMENTED. Accepted but not used by
            this method.
        :param bool skip_errors: Stored on the instance; not used by this
            method.
        """
        # The default configuration sits one level above this module's
        # folder.
        module_dir = path.dirname(path.abspath(__file__))
        parent_dir = path.dirname(module_dir)

        # Destination layout: RDF store, binary store and configuration.
        self.dbpath = '{}/data/ldprs_store'.format(dest)
        self.fpath = '{}/data/ldpnr_store'.format(dest)
        self.config_dir = '{}/etc'.format(dest)

        # Start from scratch, then seed the default configuration.
        shutil.rmtree(dest, ignore_errors=True)
        defaults_dir = '{}/etc.defaults'.format(parent_dir)
        shutil.copytree(defaults_dir, self.config_dir)

        # Point the store locations of the parsed configuration at the
        # destination paths and write the application section back.
        dest_config, _ = parse_config(self.config_dir)
        store_config = dest_config['application']['store']
        store_config['ldp_rs']['location'] = self.dbpath
        store_config['ldp_nr']['path'] = self.fpath

        app_config_path = '{}/application.yml'.format(self.config_dir)
        with open(app_config_path, 'w') as config_fh:
            config_fh.write(yaml.dump(dest_config['application']))

        # Re-read the configuration from disk and publish it.
        env.config = parse_config(self.config_dir)[0]
        env.app_globals = AppGlobals(env.config)

        self.rdfly = env.app_globals.rdfly
        self.nonrdfly = env.app_globals.nonrdfly

        # Bootstrap both stores; the RDF one inside a write transaction.
        with TxnManager(env.app_globals.rdf_store, write=True):
            self.rdfly.bootstrap()
            self.rdfly.store.close()
        env.app_globals.nonrdfly.bootstrap()

        self.src = src.rstrip('/')
        self.zero_binaries = zero_binaries
        self.skip_errors = skip_errors