Example #1
    def __call__(self, fcrepo, args):
        if args.notransactions:
            try:
                collection = pcdm.Collection()
                collection.title = args.name
                collection.create_object(fcrepo)
                collection.update_object(fcrepo)

            except RESTAPIException as e:
                logger.error(f'Error in collection creation: {e}')
                raise FailureException()
        else:
            with Transaction(fcrepo) as txn:
                try:
                    collection = pcdm.Collection()
                    collection.title = args.name
                    collection.create_object(fcrepo)
                    collection.update_object(fcrepo)
                    txn.commit()

                except RESTAPIException as e:
                    logger.error(f'Error in collection creation: {e}')
                    raise FailureException()

        if args.batch is not None:
            with open(args.batch, 'r') as batchconfig:
                batch = yaml.safe_load(batchconfig)
                batch['COLLECTION'] = str(collection.uri)
            with open(args.batch, 'w') as batchconfig:
                yaml.dump(batch, batchconfig, default_flow_style=False)
Example #2
    def __call__(self, fcrepo, args):
        fieldnames = ['uri', 'timestamp']

        # read the log of completed items
        try:
            completed = util.ItemLog('logs/annotated.csv', fieldnames, 'uri')
        except Exception as e:
            logger.error('Non-standard map file specified: {0}'.format(e))
            raise FailureException()

        logger.info('Found {0} completed items'.format(len(completed)))

        if args.ignore is not None:
            try:
                ignored = util.ItemLog(args.ignore, fieldnames, 'uri')
            except Exception as e:
                logger.error('Non-standard ignore file specified: {0}'.format(e))
                raise FailureException()
        else:
            ignored = []

        skipfile = 'logs/skipped.extractocr.{0}.csv'.format(now)
        skipped = util.ItemLog(skipfile, fieldnames, 'uri')

        with fcrepo.at_path('/annotations'):
            for line in sys.stdin:
                uri = line.rstrip('\n')
                if uri in completed:
                    continue
                elif uri in ignored:
                    logger.debug('Ignoring {0}'.format(uri))
                    continue

                try:
                    is_extracted = extract(fcrepo, uri)
                except RESTAPIException:
                    logger.error(
                        "Unable to commit or rollback transaction, aborting"
                    )
                    raise FailureException()

                row = {
                    'uri': uri,
                    'timestamp': str(datetime.utcnow())
                }

                if is_extracted:
                    completed.writerow(row)
                else:
                    skipped.writerow(row)
Example #3
    def request(self, method, url, headers=None, **kwargs):
        if headers is None:
            headers = {}

        # make sure the transaction keep-alive thread hasn't failed
        if self.in_transaction() and self.transaction.keep_alive.failed.is_set():
            raise FailureException(
                'Transaction keep-alive failed'
            ) from self.transaction.keep_alive.exception

        target_uri = self._insert_transaction_uri(url)

        if self.is_forwarded():
            # Reverse forward
            target_uri = self.undo_forward(target_uri)

        self.logger.debug("%s %s", method, target_uri)
        if self.ua_string is not None:
            headers['User-Agent'] = self.ua_string
        if self.delegated_user is not None:
            headers['On-Behalf-Of'] = self.delegated_user

        self.auth.refresh_auth(self.session)

        response = self.session.request(method,
                                        target_uri,
                                        headers=headers,
                                        **kwargs)
        self.logger.debug("%s %s", response.status_code, response.reason)
        return response
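
A hypothetical call site, assuming the surrounding class wraps a requests.Session as the snippet suggests; the URL is an illustrative value:

response = repo.request('HEAD', 'http://localhost:8080/fcrepo/rest/obj/1')
if response.status_code == 200:
    # inspect a response header, as Example #22 does after a HEAD request
    print(response.headers.get('Last-Modified'))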
Example #4
def parse_message(message):
    access = message.args.get('access')
    message.body = message.body.encode('utf-8').decode('utf-8-sig')
    if access is not None:
        try:
            access_uri = uri_or_curie(access)
        except ArgumentTypeError as e:
            raise FailureException(f'PlastronArg-access {e}')
    else:
        access_uri = None
    return Namespace(
        model=message.args.get('model'),
        limit=message.args.get('limit', None),
        percentage=message.args.get('percent', None),
        validate_only=message.args.get('validate-only', False),
        resume=message.args.get('resume', False),
        import_file=io.StringIO(message.body),
        template_file=None,
        access=access_uri,
        member_of=message.args.get('member-of'),
        binaries_location=message.args.get('binaries-location'),
        container=message.args.get('container', None),
        extract_text_types=message.args.get('extract-text', None),
        job_id=message.job_id,
        structure=message.args.get('structure', None),
        relpath=message.args.get('relpath', None)
    )
Example #5
    def __call__(self, message: PlastronCommandMessage,
                 progress_topic: Destination):
        # determine which command to load to process the message
        command = self.get_command(message.command)

        if message.job_id is None:
            raise FailureException('Expecting a PlastronJobId header')

        logger.info(f'Received message to initiate job {message.job_id}')

        args = command.parse_message(message)

        cmd_repo_config = command.repo_config(self.repo_config, args)

        repo = Repository(config=cmd_repo_config,
                          ua_string=f'plastron/{version}',
                          on_behalf_of=message.args.get('on-behalf-of'))

        if repo.delegated_user is not None:
            logger.info(
                f'Running repository operations on behalf of {repo.delegated_user}'
            )

        for status in (command.execute(repo, args) or []):
            progress_topic.send(
                PlastronMessage(job_id=message.job_id, body=status))

        logger.info(f'Job {message.job_id} complete')

        # default message state is "Done"
        return message.response(state=command.result.get('type', 'Done'),
                                body=command.result)
Example #6
def get_command_class(command_name: str):
    module_name = command_name
    if command_name == 'import':
        # special case for the import command, to avoid conflict
        # with the "import" keyword
        module_name += 'command'
    try:
        command_module = import_module('plastron.commands.' + module_name)
    except ModuleNotFoundError as e:
        raise FailureException(
            f'Unable to load a command with the name {command_name}') from e
    command_class = getattr(command_module, 'Command', None)
    if command_class is None:
        raise FailureException(
            f'Command class not found in module {command_module}')

    return command_class
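
A minimal usage sketch, assuming a command module such as plastron.commands.export exists; the per-command config dict follows the pattern in Example #14:

command_class = get_command_class('export')  # resolves plastron.commands.export.Command
command = command_class(config={})           # instantiate with per-command configuration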
Example #7
    def __call__(self, repo: Repository, args: Namespace) -> None:
        csv_file = csv.DictReader(args.source_file)
        if csv_file.fieldnames is None:
            logger.error(f'No fields found in {csv_file}. Exiting.')
            sys.exit(1)

        if args.output_file is not None:
            output_file = open(args.output_file, 'w')
        else:
            output_file = sys.stdout
        csv_writer = csv.DictWriter(output_file, fieldnames=csv_file.fieldnames)

        write_csv_header(csv_file, args, csv_writer)

        for n, row in enumerate(csv_file, start=1):
            identifier = row[args.identifier_column]
            source = get_source(row[args.binary_column])
            if not source:
                logger.warning(f'No source found for {identifier}; skipping')
                csv_writer.writerow(row)
                continue

            item = Item(identifier=identifier, title=f'Stub for {identifier}')
            file = File()
            file.source = source
            item.add_file(file)
            if args.member_of is not None:
                item.member_of = URIRef(args.member_of)
            if args.access is not None:
                item.rdf_type.append(args.access)
                file.rdf_type.append(args.access)
            try:
                with Transaction(repo) as txn:
                    try:
                        item.create(repo, container_path=args.container_path)
                        item.update(repo)
                        # update the CSV with the new URI
                        row[args.binary_column] = file.uri
                        csv_writer.writerow(row)
                        txn.commit()
                    except (RESTAPIException, FileNotFoundError) as e:
                        # if anything fails during item creation or while
                        # committing the transaction, attempt to roll back the
                        # current transaction; failures here will be caught by the
                        # main loop's exception handler and should trigger a
                        # system exit
                        logger.error(f'{item.identifier} not created: {e}')
                        txn.rollback()
                    except KeyboardInterrupt:
                        logger.warning("Load interrupted")
                        txn.rollback()
                        raise

            except RESTAPIException as e:
                raise FailureException(f'Transaction rollback failed: {e}') from e

        if output_file is not sys.stdout:
            output_file.close()
Example #8
    def model_class(self):
        if self._model_class is None:
            # retrieve the model class to use for validation
            try:
                self._model_class = getattr(
                    importlib.import_module("plastron.models"), self.model)
            except AttributeError as e:
                raise FailureException(
                    f'Unable to load model "{self.model}"') from e
        return self._model_class
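
The same lookup as a standalone sketch; Item is a model that plastron.models provides (Example #22 uses it as a fallback class), while a bad name raises the AttributeError that the property above converts into a FailureException:

import importlib

models = importlib.import_module('plastron.models')
model_class = getattr(models, 'Item')  # getattr(models, 'NoSuchModel') would raise AttributeError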
Example #9
    def __exit__(self, exc_type, exc_val, exc_tb):
        # when we leave the transaction context, always
        # set the stop flag on the keep-alive ping
        self.keep_alive.stop()
        # on an exception, roll back the transaction
        if exc_type is not None:
            if exc_type is TransactionError:
                raise FailureException(
                    f'Transaction failed: {exc_val}') from exc_val
            self.rollback()
            # return False to propagate the exception upward
            return False
Example #10
    def start(self):
        """
        Sets the timestamp for this run and creates its log directory.

        :return: this run (self), so calls can be chained
        """
        if self.dir is not None:
            raise FailureException('Run completed, cannot start again')
        self.timestamp = datetimestamp()
        self.dir = self.job.dir / self.timestamp
        os.makedirs(self.dir)
        return self
Example #11
    def load(self, timestamp: str):
        """
        Load an existing import run by its timestamp.

        :param timestamp: should be 14 digits expressing YYYYMMDDHHMMSS
        :return: this run (self)
        """
        self.timestamp = timestamp
        self.dir = self.job.dir / self.timestamp
        if not self.dir.is_dir():
            raise FailureException(f'Import run {self.timestamp} not found')
        return self
Example #12
    def __call__(self, fcrepo, args):
        logger.warning(
            'The "mkcol" command is deprecated and will be removed in a future release.'
        )
        logger.warning(
            f'Use: plastron create --container "{fcrepo.relpath}" --collection "{args.name}"'
        )

        if args.notransactions:
            try:
                collection = pcdm.Collection()
                collection.title = args.name
                collection.create(fcrepo, recursive=False)
                collection.update(fcrepo, recursive=False)

            except RESTAPIException as e:
                logger.error(f'Error in collection creation: {e}')
                raise FailureException()
        else:
            with Transaction(fcrepo) as txn:
                try:
                    collection = pcdm.Collection()
                    collection.title = args.name
                    collection.create(fcrepo, recursive=False)
                    collection.update(fcrepo, recursive=False)
                    txn.commit()

                except RESTAPIException as e:
                    logger.error(f'Error in collection creation: {e}')
                    raise FailureException()

        if args.batch is not None:
            with open(args.batch, 'r') as batchconfig:
                batch = yaml.safe_load(batchconfig)
                batch['COLLECTION'] = str(collection.uri)
            with open(args.batch, 'w') as batchconfig:
                yaml.dump(batch, batchconfig, default_flow_style=False)
Example #13
    def process(self, method, use_transaction=True, traverse=None):
        self.use_transaction = use_transaction
        if traverse is not None:
            predicate_list = ', '.join(p.n3() for p in traverse)
            logger.info(
                f"{method.__name__} will traverse the following predicates: {predicate_list}"
            )

        if use_transaction:
            # set up a temporary ItemLog that will be copied to the real item log upon completion of the transaction
            self.completed_buffer = ItemLog(NamedTemporaryFile().name,
                                            ['uri', 'title', 'timestamp'],
                                            'uri',
                                            header=False)
            with Transaction(self.repository, keep_alive=90) as transaction:
                for resource, graph in self.get_resources(traverse=traverse):
                    try:
                        method(resource, graph)
                    except RESTAPIException as e:
                        logger.error(
                            f'{method.__name__} failed for {resource}: {e}: {e.response.text}'
                        )
                        # if anything fails while processing the list of URIs,
                        # attempt to roll back the transaction. Failures here will
                        # be caught by the main loop's exception handler and
                        # should trigger a system exit
                        try:
                            transaction.rollback()
                            logger.warning('Transaction rolled back.')
                            return False
                        except RESTAPIException:
                            logger.error(
                                'Unable to roll back transaction, aborting')
                            raise FailureException()
                transaction.commit()
                if self.completed and self.completed.filename:
                    shutil.copyfile(self.completed_buffer.filename,
                                    self.completed.filename)
                return True
        else:
            for resource, graph in self.get_resources(traverse=traverse):
                try:
                    method(resource, graph)
                except RESTAPIException as e:
                    logger.error(
                        f'{method.__name__} failed for {resource}: {e}: {e.response.text}'
                    )
                    logger.warning(
                        f'Continuing {method.__name__} with next item')
            return True
Example #14
    def get_command(self, command_name: str):
        if command_name not in self.commands:
            # get the configuration options for this command
            config = self.command_config.get(command_name.upper(), {})

            command_class = get_command_class(command_name)
            if getattr(command_class, 'parse_message', None) is None:
                raise FailureException(
                    f'Command class {command_class} does not support message processing'
                )

            # cache an instance of this command
            self.commands[command_name] = command_class(config)

        return self.commands[command_name]
Example #15
def get_ssh_client(sftp_uri, **kwargs):
    if isinstance(sftp_uri, str):
        sftp_uri = urlsplit(sftp_uri)
    if not isinstance(sftp_uri, urllib.parse.SplitResult):
        raise TypeError('Expects a str or a urllib.parse.SplitResult')
    ssh_client = SSHClient()
    ssh_client.load_system_host_keys()
    ssh_client.set_missing_host_key_policy(AutoAddPolicy)
    try:
        ssh_client.connect(hostname=sftp_uri.hostname,
                           username=sftp_uri.username,
                           port=sftp_uri.port or SSH_PORT,
                           **kwargs)
        return ssh_client
    except SSHException as e:
        raise FailureException(str(e)) from e
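
A hypothetical call with illustrative host and key values; the SFTPClient.from_transport follow-up is the same step Example #22 performs:

from paramiko import SFTPClient

ssh_client = get_ssh_client('sftp://user@files.example.net/exports/bag.zip',
                            key_filename='/home/user/.ssh/id_rsa')
sftp_client = SFTPClient.from_transport(ssh_client.get_transport())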
Example #16
    def execute(self, fcrepo, args):
        self.repository = fcrepo
        self.repository.test_connection()
        self.dry_run = args.dry_run
        self.validate = args.validate
        self.model = args.model
        self.stats = {
            'updated': [],
            'invalid': defaultdict(list),
            'errors': defaultdict(list)
        }

        if self.validate and not self.model:
            raise FailureException(
                "Model must be provided when performing validation")

        self.sparql_update = args.update_file.read().encode('utf-8')

        logger.debug(f'SPARQL Update query:\n'
                     f'====BEGIN====\n'
                     f'{self.sparql_update.decode()}\n'
                     f'=====END=====')

        if self.dry_run:
            logger.info('Dry run enabled, no actual updates will take place')

        self.resources = ResourceList(repository=self.repository,
                                      uri_list=args.uris,
                                      file=args.file,
                                      completed_file=args.completed)
        self.resources.process(method=self.update_item,
                               traverse=parse_predicate_list(args.recursive),
                               use_transaction=args.use_transactions)
        if len(self.stats['errors']) == 0 and len(self.stats['invalid']) == 0:
            state = 'update_complete'
        else:
            state = 'update_incomplete'

        self.result = {'type': state, 'stats': self.stats}
        logger.debug(self.stats)
Example #17
    def __init__(self,
                 repository,
                 uri_list=None,
                 file=None,
                 completed_file=None):
        self.repository = repository
        self.uri_list = uri_list
        self.file = file
        self.use_transaction = True
        if completed_file is not None:
            logger.info(
                f'Reading the completed items log from {completed_file}')
            # read the log of completed items
            fieldnames = ['uri', 'title', 'timestamp']
            try:
                self.completed = ItemLog(completed_file, fieldnames, 'uri')
                logger.info(f'Found {len(self.completed)} completed item(s)')
            except Exception as e:
                logger.error(f"Non-standard map file specified: {e}")
                raise FailureException()
        else:
            self.completed = None
        self.completed_buffer = None
Example #18
    def __call__(self, fcrepo, args):
        try:
            fcrepo.test_connection()
        except Exception as e:
            raise FailureException(str(e)) from e
Example #19
    def execute(self, repo, args):
        """
        Performs the import

        :param repo: the Repository to operate on
        :param args: the command-line arguments
        """
        start_time = datetime.now().timestamp()

        if args.resume and args.job_id is None:
            raise FailureException('Resuming a job requires a job id')

        if args.job_id is None:
            # TODO: generate a more unique id? add in user and hostname?
            args.job_id = f"import-{datetimestamp()}"

        job: ImportJob = Command.create_import_job(args.job_id, jobs_dir=self.jobs_dir)
        logger.debug(f'Job directory is {job.dir}')

        if args.resume and not job.dir_exists:
            raise FailureException(f'Cannot resume job {job.id}: no such job directory found in {self.jobs_dir}')

        # load or create config
        if args.resume:
            logger.info(f'Resuming saved job {job.id}')
            # load stored config from the previous run of this job
            try:
                job.load_config()
            except FileNotFoundError:
                raise FailureException(f'Cannot resume job {job.id}: no config.yml found in {job.dir}')
        else:
            if args.model is None:
                raise FailureException('A model is required unless resuming an existing job')
            job.save_config({
                'model': args.model,
                'access': args.access,
                'member_of': args.member_of,
                # Use "repo.relpath" as default for "container",
                # but allow it to be overridden by args
                'container': args.container or repo.relpath,
                'binaries_location': args.binaries_location
            })

        if args.template_file is not None:
            if not hasattr(job.model_class, 'HEADER_MAP'):
                logger.error(f'{job.model_class.__name__} has no HEADER_MAP, cannot create template')
                raise FailureException()
            logger.info(f'Writing template for the {job.model_class.__name__} model to {args.template_file.name}')
            writer = csv.writer(args.template_file)
            writer.writerow(list(job.model_class.HEADER_MAP.values()) + ['FILES'])
            return

        if args.import_file is None and not args.resume:
            raise FailureException('An import file is required unless resuming an existing job')

        if args.percentage:
            logger.info(f'Loading {args.percentage}% of the total items')
        if args.validate_only:
            logger.info('Validation-only mode, skipping imports')

        # if an import file was provided, save that as the new CSV metadata file
        if args.import_file is not None:
            job.store_metadata_file(args.import_file)

        try:
            metadata = job.metadata(limit=args.limit, percentage=args.percentage)
        except ModelClassNotFoundError as e:
            raise FailureException(f'Model class {e.model_name} not found') from e
        except JobError as e:
            raise FailureException(str(e)) from e

        if metadata.has_binaries and job.binaries_location is None:
            raise ConfigError('Must specify --binaries-location if the metadata has a FILES column')

        initial_completed_item_count = len(job.completed_log)
        logger.info(f'Found {initial_completed_item_count} completed items')

        updated_uris = []
        created_uris = []
        import_run = job.new_run().start()
        for row in metadata:
            repo_changeset = create_repo_changeset(repo, metadata, row)
            item = repo_changeset.item

            # count the number of files referenced in this row
            metadata.files += len(row.filenames)

            try:
                report = validate(item)
            except ValidationError as e:
                raise FailureException(f'Unable to run validation: {e}') from e

            metadata.validation_reports.append({
                'line': row.line_reference,
                'is_valid': report.is_valid(),
                'passed': [outcome for outcome in report.passed()],
                'failed': [outcome for outcome in report.failed()]
            })

            missing_files = [
                name for name in row.filenames if not self.get_source(job.binaries_location, name).exists()
            ]
            if len(missing_files) > 0:
                logger.warning(f'{len(missing_files)} file(s) for "{item}" not found')

            if report.is_valid() and len(missing_files) == 0:
                metadata.valid += 1
                logger.info(f'"{item}" is valid')
            else:
                # drop invalid items
                metadata.invalid += 1
                logger.warning(f'"{item}" is invalid, skipping')
                reasons = [' '.join(str(f) for f in outcome) for outcome in report.failed()]
                if len(missing_files) > 0:
                    reasons.extend(f'Missing file: {f}' for f in missing_files)
                import_run.drop_invalid(
                    item=item,
                    line_reference=row.line_reference,
                    reason=f'Validation failures: {"; ".join(reasons)}'
                )
                continue

            if args.validate_only:
                # validation-only mode
                continue

            try:
                self.update_repo(args, job, repo, metadata, row, repo_changeset,
                                 created_uris, updated_uris)
            except FailureException as e:
                metadata.errors += 1
                logger.error(f'{item} import failed: {e}')
                import_run.drop_failed(item, row.line_reference, reason=str(e))

            # update the status
            now = datetime.now().timestamp()
            yield {
                'time': {
                    'started': start_time,
                    'now': now,
                    'elapsed': now - start_time
                },
                'count': metadata.stats()
            }

        logger.info(f'Skipped {metadata.skipped} items')
        logger.info(f'Completed {len(job.completed_log) - initial_completed_item_count} items')
        logger.info(f'Dropped {len(import_run.invalid_items)} invalid items')
        logger.info(f'Dropped {len(import_run.failed_items)} failed items')

        logger.info(f"Found {metadata.valid} valid items")
        logger.info(f"Found {metadata.invalid} invalid items")
        logger.info(f"Found {metadata.errors} errors")
        if not args.validate_only:
            logger.info(f"{metadata.unchanged} of {metadata.total} items remained unchanged")
            logger.info(f"Created {metadata.created} of {metadata.total} items")
            logger.info(f"Updated {metadata.updated} of {metadata.total} items")

        if args.validate_only:
            # validate phase
            if metadata.invalid == 0:
                result_type = 'validate_success'
            else:
                result_type = 'validate_failed'
        else:
            # import phase
            if len(job.completed_log) == metadata.total:
                result_type = 'import_complete'
            else:
                result_type = 'import_incomplete'

        self.result = {
            'type': result_type,
            'validation': metadata.validation_reports,
            'count': metadata.stats()
        }
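
Because execute() is a generator, nothing runs until the caller iterates it; Example #5 drives it while relaying each status message to a progress topic. A minimal local sketch of that loop:

for status in command.execute(repo, args) or []:
    # each yielded dict carries the 'time' and 'count' sections built above
    print(f"elapsed: {status['time']['elapsed']:.1f}s, count: {status['count']}")
print(command.result)  # populated once the generator is exhausted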
Example #20
    def __call__(self, fcrepo, args):
        # Load batch configuration
        try:
            batch_config = BatchConfig(args.batch)
        except ConfigException as e:
            logger.error(e.message)
            logger.error(
                f'Failed to load batch configuration from {args.batch}')
            raise FailureException(e.message)

        logger.info(f'Loaded batch configuration from {args.batch}')

        if not os.path.isdir(batch_config.log_dir):
            os.makedirs(batch_config.log_dir)

        fcrepo.load_binaries = args.load_binaries

        # Define the data_handler function for the data being loaded
        logger.info("Initializing data handler")
        module_name = batch_config.handler
        handler = import_module('plastron.handlers.' + module_name)
        logger.info('Loaded "{0}" handler'.format(module_name))

        # "--no-binaries" implies "--no-annotations"
        if not args.load_binaries:
            logger.info("Setting --no-binaries implies --no-annotations")
            args.create_annotations = False

        try:
            batch = handler.Batch(fcrepo, batch_config)
        except (ConfigException, DataReadException) as e:
            logger.error(e.message)
            logger.error('Failed to initialize batch')
            raise FailureException(e.message)

        if not args.dry_run:
            fcrepo.test_connection()

            # read the log of completed items
            fieldnames = ['number', 'timestamp', 'title', 'path', 'uri']
            try:
                completed = ItemLog(batch_config.mapfile, fieldnames, 'path')
            except Exception as e:
                logger.error(f"Non-standard map file specified: {e}")
                raise FailureException()

            logger.info(f"Found {len(completed)} completed items")

            if args.ignore is not None:
                try:
                    ignored = ItemLog(args.ignore, fieldnames, 'path')
                except Exception as e:
                    logger.error(f"Non-standard ignore file specified: {e}")
                    raise FailureException()
            else:
                ignored = []

            skipfile = os.path.join(batch_config.log_dir,
                                    'skipped.load.{0}.csv'.format(now))
            skipped = ItemLog(skipfile, fieldnames, 'path')

            load_set = get_load_set(batch, args.percent)

            # create all batch objects in repository
            for n, item in enumerate(batch):
                is_loaded = False

                if n not in load_set:
                    logger.info(f"Loading {args.percent}, skipping item {n}")
                    continue

                # handle load limit parameter
                if args.limit is not None and n >= args.limit:
                    logger.info(f"Stopping after {args.limit} item(s)")
                    break
                elif item.path in completed:
                    continue
                elif item.path in ignored:
                    logger.debug(f"Ignoring {item.path}")
                    continue

                logger.info(f"Processing item {n + 1}/{batch.length}...")

                try:
                    logger.info(f"Loading item {n + 1}")
                    is_loaded = load_item(fcrepo,
                                          item,
                                          args,
                                          extra=batch_config.extra)
                except RESTAPIException:
                    logger.error(
                        "Unable to commit or rollback transaction, aborting")
                    raise FailureException()
                except DataReadException as e:
                    logger.error(f"Skipping item {n + 1}: {e.message}")

                row = {
                    'number': n + 1,
                    'path': item.path,
                    'timestamp': getattr(item, 'creation_timestamp',
                                         str(datetime.utcnow())),
                    'title': getattr(item, 'title', 'N/A'),
                    'uri': getattr(item, 'uri', 'N/A')
                }

                # write item details to relevant summary CSV
                if is_loaded:
                    completed.writerow(row)
                else:
                    skipped.writerow(row)

                if args.wait:
                    logger.info("Pausing {0} seconds".format(args.wait))
                    sleep(int(args.wait))
Example #21
    def update_repo(self, args, job, repo, metadata, row, repo_changeset, created_uris, updated_uris):
        """
        Updates the repository with the given RepoChangeSet

        :param args: the arguments from the command-line
        :param job: The ImportJob
        :param repo: the Repository to operate on
        :param metadata: A plastron.jobs.MetadataRows object representing the
                          CSV file being imported
        :param row: A single plastron.jobs.Row object representing the row
                     being imported
        :param repo_changeset: The RepoChangeSet object describing the changes
                                 to make to the repository.
        :param created_uris: Accumulator storing a list of created URIs. This
                              variable is MODIFIED by this method.
        :param updated_uris: Accumulator storing a list of updated URIs. This
                              variable is MODIFIED by this method.
        """
        item = repo_changeset.item

        if not item.created:
            # if an item is new, don't construct a SPARQL Update query
            # instead, just create and update normally
            # create new item in the repo
            logger.debug('Creating a new item')
            # add the access class
            if job.access is not None:
                item.rdf_type.append(URIRef(job.access))
            # add the collection membership
            if job.member_of is not None:
                item.member_of = URIRef(job.member_of)

            if row.has_files:
                create_pages = bool(strtobool(row.get('CREATE_PAGES', 'True')))
                logger.debug('Adding pages and files to new item')
                self.add_files(
                    item,
                    build_file_groups(row['FILES']),
                    base_location=job.binaries_location,
                    access=job.access,
                    create_pages=create_pages
                )

            if args.extract_text_types is not None:
                annotate_from_files(item, args.extract_text_types.split(','))

            logger.debug(f"Creating resources in container: {job.container}")

            try:
                with Transaction(repo) as txn:
                    item.create(repo, container_path=job.container)
                    item.update(repo)
                    txn.commit()
            except Exception as e:
                raise FailureException(f'Creating item failed: {e}') from e

            job.complete(item, row.line_reference, ImportedItemStatus.CREATED)
            metadata.created += 1
            created_uris.append(item.uri)

        elif repo_changeset:
            # construct the SPARQL Update query if there are any deletions or insertions
            # then do a PATCH update of an existing item
            logger.info(f'Sending update for {item}')
            sparql_update = repo_changeset.build_sparql_update(repo)
            logger.debug(sparql_update)
            try:
                item.patch(repo, sparql_update)
            except RESTAPIException as e:
                raise FailureException(f'Updating item failed: {e}') from e

            job.complete(item, row.line_reference, ImportedItemStatus.MODIFIED)
            metadata.updated += 1
            updated_uris.append(item.uri)

        else:
            job.complete(item, row.line_reference, ImportedItemStatus.UNCHANGED)
            metadata.unchanged += 1
            logger.info(f'No changes found for "{item}" ({row.uri}); skipping')
            metadata.skipped += 1
Example #22
    def execute(self, fcrepo, args):
        start_time = datetime.now().timestamp()
        count = 0
        errors = 0
        total = len(args.uris)
        try:
            serializer_class = SERIALIZER_CLASSES[args.format]
        except KeyError:
            logger.error(f'Unknown format: {args.format}')
            raise FailureException()

        if args.export_binaries and args.binary_types is not None:
            # filter files by their MIME type
            def mime_type_filter(file):
                return str(file.mimetype) in args.binary_types.split(',')
        else:
            # default filter is None; in this case filter() will return
            # all items that evaluate to true
            mime_type_filter = None

        logger.info(f'Export destination: {args.output_dest}')

        # create a bag in a temporary directory to hold exported items
        temp_dir = TemporaryDirectory()
        bag = make_bag(temp_dir.name)

        export_dir = os.path.join(temp_dir.name, 'data')
        serializer = serializer_class(directory=export_dir,
                                      public_uri_template=args.uri_template)
        for uri in args.uris:
            try:
                logger.info(f'Exporting item {count + 1}/{total}: {uri}')

                # derive an item-level directory name from the URI
                # currently this is hard-coded to look for a UUID
                # TODO: expand to other types of unique ids?
                match = UUID_REGEX.search(uri)
                if match is None:
                    raise DataReadException(f'No UUID found in {uri}')
                item_dir = match[0]

                graph = fcrepo.get_graph(uri)
                model_class = detect_resource_class(graph, uri, fallback=Item)
                obj = model_class.from_graph(graph, uri)

                if args.export_binaries:
                    logger.info(f'Gathering binaries for {uri}')
                    binaries = list(
                        filter(mime_type_filter, obj.gather_files(fcrepo)))
                    total_size = sum(int(file.size[0]) for file in binaries)
                    size, unit = format_size(total_size)
                    logger.info(
                        f'Total size of binaries: {round(size, 2)} {unit}')
                else:
                    binaries = None

                serializer.write(obj, files=binaries, binaries_dir=item_dir)

                if binaries is not None:
                    binaries_dir = os.path.join(export_dir, item_dir)
                    os.makedirs(binaries_dir, exist_ok=True)
                    for file in binaries:
                        response = fcrepo.head(file.uri)
                        accessed = parsedate(response.headers['Date'])
                        modified = parsedate(response.headers['Last-Modified'])

                        binary_filename = os.path.join(binaries_dir,
                                                       str(file.filename))
                        with open(binary_filename, mode='wb') as binary:
                            with file.source as stream:
                                for chunk in stream:
                                    binary.write(chunk)

                        # update the atime and mtime of the file to reflect the time of the
                        # HTTP request and the resource's last-modified time in the repo
                        os.utime(binary_filename,
                                 times=(mktime(accessed), mktime(modified)))
                        logger.debug(f'Copied {file.uri} to {binary.name}')

                count += 1

            except DataReadException as e:
                # log the failure, but continue to attempt to export the rest of the URIs
                logger.error(f'Export of {uri} failed: {e}')
                errors += 1
            except (RESTAPIException, ConnectionError) as e:
                # log the failure, but continue to attempt to export the rest of the URIs
                logger.error(f'Unable to retrieve {uri}: {e}')
                errors += 1

            # update the status
            now = datetime.now().timestamp()
            yield {
                'time': {
                    'started': start_time,
                    'now': now,
                    'elapsed': now - start_time
                },
                'count': {
                    'total': total,
                    'exported': count,
                    'errors': errors
                }
            }

        try:
            serializer.finish()
        except EmptyItemListError:
            logger.error("No items could be exported; skipping writing file")

        logger.info(f'Exported {count} of {total} items')

        # save the BagIt bag to send to the output destination
        bag.save(manifests=True)

        # parse the output destination to determine where to send the export
        if args.output_dest.startswith('sftp:'):
            # send over SFTP to a remote host
            sftp_uri = urlsplit(args.output_dest)
            ssh_client = get_ssh_client(sftp_uri, key_filename=args.key)
            try:
                sftp_client = SFTPClient.from_transport(
                    ssh_client.get_transport())
                root, ext = splitext(basename(sftp_uri.path))
                destination = sftp_client.open(sftp_uri.path, mode='w')
            except SSHException as e:
                raise FailureException(str(e)) from e
        else:
            # send to a local file
            zip_filename = args.output_dest
            root, ext = splitext(basename(zip_filename))
            destination = zip_filename

        # write out a single ZIP file of the whole bag
        compress_bag(bag, destination, root)

        self.result = {
            'type': 'export_complete' if count == total else 'partial_export',
            'content_type': serializer.content_type,
            'file_extension': serializer.file_extension,
            'count': {
                'total': total,
                'exported': count,
                'errors': errors
            }
        }
Example #23
    def __call__(self, fcrepo, args):
        try:
            fcrepo.test_connection()
        except ConnectionError as e:
            raise FailureException(str(e)) from e
Example #24
def main():
    """Parse args and handle options."""

    parser = ArgumentParser(prog='plastron',
                            description='Batch operation tool for Fedora 4.')
    parser.set_defaults(cmd_name=None)

    common_required = parser.add_mutually_exclusive_group(required=True)
    common_required.add_argument('-r',
                                 '--repo',
                                 help='Path to repository configuration file.',
                                 action='store')
    common_required.add_argument('-c',
                                 '--config',
                                 help='Path to configuration file.',
                                 action='store',
                                 dest='config_file',
                                 type=FileType('r'))
    common_required.add_argument('-V',
                                 '--version',
                                 help='Print version and exit.',
                                 action='version',
                                 version=version)

    parser.add_argument('-v',
                        '--verbose',
                        help='increase the verbosity of the status output',
                        action='store_true')
    parser.add_argument('-q',
                        '--quiet',
                        help='decrease the verbosity of the status output',
                        action='store_true')
    parser.add_argument('--on-behalf-of',
                        help='delegate repository operations to this username',
                        dest='delegated_user',
                        action='store')

    subparsers = parser.add_subparsers(title='commands')

    command_modules = load_commands(subparsers)

    # parse command line args
    args = parser.parse_args()

    # if no subcommand was selected, display the help
    if args.cmd_name is None:
        parser.print_help()
        sys.exit(0)

    if args.config_file is not None:
        # new-style, combined config file (a la plastron.daemon)
        config = envsubst(yaml.safe_load(args.config_file))
        repo_config = config['REPOSITORY']
        broker_config = config.get('MESSAGE_BROKER', None)
        command_config = config.get('COMMANDS', {})
    else:
        # old-style, repository-only config file
        with open(args.repo, 'r') as repo_config_file:
            repo_config = yaml.safe_load(repo_config_file)
        broker_config = None
        command_config = {}

    fcrepo = Repository(repo_config,
                        ua_string=f'plastron/{version}',
                        on_behalf_of=args.delegated_user)

    if broker_config is not None:
        broker = Broker(broker_config)
    else:
        broker = None

    # get basic logging options
    if 'LOGGING_CONFIG' in repo_config:
        with open(repo_config['LOGGING_CONFIG'], 'r') as logging_config_file:
            logging_options = yaml.safe_load(logging_config_file)
    else:
        logging_options = DEFAULT_LOGGING_OPTIONS

    # log file configuration
    log_dirname = repo_config.get('LOG_DIR')
    if not os.path.isdir(log_dirname):
        os.makedirs(log_dirname)
    log_filename = 'plastron.{0}.{1}.log'.format(args.cmd_name, now)
    logfile = os.path.join(log_dirname, log_filename)
    logging_options['handlers']['file']['filename'] = logfile

    # manipulate console verbosity
    if args.verbose:
        logging_options['handlers']['console']['level'] = 'DEBUG'
    elif args.quiet:
        logging_options['handlers']['console']['level'] = 'WARNING'

    # configure logging
    logging.config.dictConfig(logging_options)

    # get the selected subcommand
    command_module = command_modules[args.cmd_name]

    try:
        if hasattr(command_module, 'Command'):
            command = command_module.Command(
                config=command_config.get(args.cmd_name.upper()))
            command.repo = fcrepo
            command.broker = broker
        else:
            raise FailureException(
                f'Unable to execute command {args.cmd_name}')
        # dispatch to the selected subcommand
        print_header(args)
        logger.info(
            f'Loaded repo configuration from {args.repo or args.config_file.name}'
        )
        if args.delegated_user is not None:
            logger.info(
                f'Running repository operations on behalf of {args.delegated_user}'
            )
        command(fcrepo, args)
        print_footer(args)
    except FailureException as e:
        # something failed, exit with non-zero status
        logger.error(str(e))
        sys.exit(1)
    except KeyboardInterrupt:
        # aborted due to Ctrl+C
        sys.exit(2)
Example #25
    def __init__(self,
                 job: ImportJob,
                 limit: int = None,
                 percentage: int = None):
        self.job = job
        self.limit = limit
        self.metadata_file = None

        try:
            self.metadata_file = open(job.metadata_filename, 'r')
        except FileNotFoundError as e:
            raise MetadataError(
                job, f'Cannot read source file "{job.metadata_filename}": {e}'
            ) from e

        self.csv_file = csv.DictReader(self.metadata_file)

        try:
            self.fields = build_fields(self.fieldnames, self.model_class)
        except DataReadException as e:
            raise FailureException(str(e)) from e

        self.validation_reports: List[Mapping] = []
        self.skipped = 0
        self.subset_to_load = None

        self.total = None
        self.rows = 0
        self.errors = 0
        self.valid = 0
        self.invalid = 0
        self.created = 0
        self.updated = 0
        self.unchanged = 0
        self.files = 0

        if self.metadata_file.seekable():
            # get the row count of the file, then rewind the CSV file
            self.total = sum(1 for _ in self.csv_file)
            self._rewind_csv_file()
        else:
            # file is not seekable, so we can't get a row count in advance
            self.total = None

        if percentage is not None:
            if not self.metadata_file.seekable():
                raise FailureException(
                    'Cannot execute a percentage load using a non-seekable file'
                )
            identifier_column = self.model_class.HEADER_MAP['identifier']
            identifiers = [
                row[identifier_column] for row in self.csv_file
                if row[identifier_column] not in job.completed_log
            ]
            self._rewind_csv_file()

            if len(identifiers) == 0:
                logger.info('No items remaining to load')
                self.subset_to_load = []
            else:
                target_count = int(((percentage / 100) * self.total))
                logger.info(
                    f'Attempting to load {target_count} items ({percentage}% of {self.total})'
                )
                if len(identifiers) > target_count:
                    # evenly space the items to load among the remaining items
                    step_size = int(
                        (100 * (1 - (len(job.completed_log) / self.total))) /
                        percentage)
                else:
                    # load all remaining items
                    step_size = 1
                self.subset_to_load = identifiers[::step_size]
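
A worked example of the percentage arithmetic above, using assumed numbers:

# assumed: a 200-row metadata file, 50 rows already completed, percentage = 25
total, completed_count, percentage = 200, 50, 25

target_count = int((percentage / 100) * total)                       # 50 items
step_size = int((100 * (1 - completed_count / total)) / percentage)  # int(75 / 25) = 3
# 150 identifiers remain; identifiers[::3] selects every 3rd of them,
# yielding 50 items (25% of the total), evenly spaced among the remainder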
Example #26
    def __enter__(self):
        try:
            self.begin()
        except TransactionError as e:
            raise FailureException(f'Transaction failed: {e}') from e
        return self
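
Paired with the __exit__ hook in Example #9, this gives Transaction its context-manager behavior: entering begins the transaction, an uncaught exception inside the block triggers a rollback, and the caller commits explicitly on success, as Examples #1 and #7 show. A sketch (obj stands in for any repository object):

with Transaction(fcrepo) as txn:   # __enter__ calls begin()
    obj.create(fcrepo)
    obj.update(fcrepo)
    txn.commit()                   # success path: commit before leaving the block
# an uncaught exception inside the block reaches __exit__, which calls rollback()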