Ejemplo n.º 1
0
def main(job, file_path, file_uuid, sip_uuid, shared_path, file_type):
    """Run the policy checker on one file and return its result.

    Calls ``setup_dicts`` with the MCP client settings, builds a
    ``PolicyChecker`` from the arguments (passed through in the same
    order) and returns whatever its ``check()`` method returns.
    """
    setup_dicts(mcpclient_settings)

    checker_args = (job, file_path, file_uuid, sip_uuid, shared_path,
                    file_type)
    return PolicyChecker(*checker_args).check()
def main(task_uuid, file_uuid):
    """Transcribe a file by running every transcription rule found for it.

    :param task_uuid: UUID of the calling task; not used by this function.
    :param file_uuid: UUID of the ``File`` row to transcribe.
    :returns: 0 when the file was skipped (not an original, or no rules
        were found) or when every command exited with status 0; 1 when at
        least one command exited with a non-zero status.
    :raises File.DoesNotExist: if no ``File`` with ``file_uuid`` exists
        (raised by ``File.objects.get``).
    """
    setup_dicts(mcpclient_settings)

    succeeded = True

    file_ = File.objects.get(uuid=file_uuid)

    # Normally we don't transcribe derivatives (access copies, preservation copies);
    # however, some useful transcription tools can't handle some formats that
    # are common as the primary copies. For example, tesseract can't handle JPEG2000.
    # If there are no rules for the primary format passed in, try to look at each
    # derivative until a transcribable derivative is found.
    #
    # Skip derivatives to avoid double-scanning them; only look at them as a fallback.
    if file_.filegrpuse != "original":
        print('{} is not an original; not transcribing'.format(file_uuid), file=sys.stderr)
        return 0

    rules = fetch_rules_for(file_)
    if not rules:
        # Fallback: ``file_`` is rebound to whichever file the helper returns
        # together with its rules.
        file_, rules = fetch_rules_for_derivatives(file_)

    if not rules:
        print('No rules found for file {} and its derivatives; not transcribing'.format(file_uuid), file=sys.stderr)
        return 0

    if file_.filegrpuse == "original":
        noun = "original"
    else:
        noun = file_.filegrpuse + " derivative"
    print('Transcribing {} {}'.format(noun, file_.uuid), file=sys.stderr)

    rd = ReplacementDict.frommodel(file_=file_, type_='file')

    for rule in rules:
        script = rule.command.command
        if rule.command.script_type in ('bashScript', 'command'):
            # Shell-style commands get their placeholders substituted inline
            # and take no separate arguments.
            script, = rd.replace(script)
            args = []
        else:
            # Bug fix: ``to_gnu_options`` is a method and must be called;
            # previously the bound method object itself was passed as the
            # argument list.
            args = rd.to_gnu_options()

        exitstatus, stdout, stderr = executeOrRun(rule.command.script_type,
                                                  script, arguments=args)
        if exitstatus != 0:
            # Remember the failure but keep running the remaining rules.
            succeeded = False

        output_path = rd.replace(rule.command.output_location)[0]
        # Store the path with the SIP directory swapped back to its placeholder.
        relative_path = output_path.replace(rd['%SIPDirectory%'], '%SIPDirectory%')
        # Events and output files are recorded against the UUID passed in,
        # even when a derivative was the file actually transcribed.
        event = insert_transcription_event(exitstatus, file_uuid, rule, relative_path)

        # Only register an output file if the command actually produced one.
        if os.path.isfile(output_path):
            insert_file_into_database(file_uuid, rd['%SIPUUID%'], event, rule, output_path, relative_path)

    return 0 if succeeded else 1
Ejemplo n.º 3
0
def main(file_path, file_uuid, sip_uuid, shared_path, file_type):
    """Run the policy checker on one file and return its result.

    Calls ``setup_dicts`` with the MCP client settings, then constructs a
    ``PolicyChecker`` from the arguments and returns the value of its
    ``check()`` method.
    """
    setup_dicts(mcpclient_settings)

    checker = PolicyChecker(file_path, file_uuid, sip_uuid, shared_path,
                            file_type)
    outcome = checker.check()
    return outcome
Ejemplo n.º 4
0
def main(job, file_path, file_uuid, sip_uuid):
    """Characterize a file by running the FPR commands that apply to it.

    The active "characterization" rules for the file's format version are
    used; when the format version is not found or has no such rules, the
    active "default_characterization" rules are used instead. Each
    command's stdout and stderr are written to ``job``. Output of commands
    whose output format is fmt/101 is parsed as XML and, if valid, saved
    with ``insertIntoFPCommandOutput``.

    Returns 0 if ``FPCommandOutput`` rows already exist for the file or if
    nothing failed; returns 255 if any command exited non-zero or produced
    invalid XML.
    """
    setup_dicts(mcpclient_settings)

    # Check to see whether the file has already been characterized; don't try
    # to characterize it a second time if so.
    if FPCommandOutput.objects.filter(file_id=file_uuid).count() > 0:
        return 0

    try:
        format_version = FormatVersion.active.get(
            fileformatversion__file_uuid=file_uuid)
    except FormatVersion.DoesNotExist:
        rules = None
    else:
        rules = FPRule.active.filter(format=format_version.uuid,
                                     purpose="characterization")

    # Characterization always occurs - if nothing is specified, get one or more
    # defaults specified in the FPR.
    if not rules:
        rules = FPRule.active.filter(purpose="default_characterization")

    any_failure = False

    for rule in rules:
        command = rule.command

        if command.script_type in ("bashScript", "command"):
            # Placeholders are substituted directly into the command text.
            arguments = []
            to_run = replace_string_values(command.command,
                                           file_=file_uuid,
                                           sip=sip_uuid,
                                           type_="file")
        else:
            # Other script types receive the replacements as options.
            replacements = ReplacementDict.frommodel(file_=file_uuid,
                                                     sip=sip_uuid,
                                                     type_="file")
            arguments = replacements.to_gnu_options()
            to_run = command.command

        exitstatus, stdout, stderr = executeOrRun(
            command.script_type,
            to_run,
            arguments=arguments,
            capture_output=True,
        )

        job.write_output(stdout)
        job.write_error(stderr)

        if exitstatus != 0:
            job.write_error(
                "Command {} failed with exit status {}; stderr:".format(
                    command.description, exitstatus))
            any_failure = True
            continue

        # fmt/101 is XML - we want to collect and package any XML output, while
        # allowing other commands to execute without actually collecting their
        # output in the event that they are writing their output to disk.
        # FPCommandOutput can have multiple rows for a given file,
        # distinguished by the rule that produced it.
        output_format = command.output_format
        if not (output_format and output_format.pronom_id == "fmt/101"):
            job.write_error(
                'Tool output for command "{}" ({}) is not XML; not saving to database'
                .format(command.description, command.uuid))
            continue

        try:
            # Parsed only to validate; the raw stdout is what gets stored.
            etree.fromstring(stdout)
            insertIntoFPCommandOutput(file_uuid, stdout, rule.uuid)
            job.write_output(
                'Saved XML output for command "{}" ({})'.format(
                    command.description, command.uuid))
        except etree.XMLSyntaxError:
            any_failure = True
            job.write_error(
                'XML output for command "{}" ({}) was not valid XML; not saving to database'
                .format(command.description, command.uuid))

    return 255 if any_failure else 0
Ejemplo n.º 5
0
def main(job, file_path, file_uuid, sip_uuid, shared_path, file_type):
    """Validate one file and return the validator's result.

    Calls ``setup_dicts`` with the MCP client settings, then constructs a
    ``Validator`` from the arguments and returns the value of its
    ``validate()`` method.
    """
    setup_dicts(mcpclient_settings)

    validator_args = (job, file_path, file_uuid, sip_uuid, shared_path,
                      file_type)
    return Validator(*validator_args).validate()
Ejemplo n.º 6
0
def main(opts):
    """Find and execute normalization commands on input file.

    Attributes of ``opts`` read directly in this function are
    ``file_uuid``, ``file_path``, ``sip_uuid``, ``purpose`` and
    ``normalize_file_grp_use``; ``opts`` is also handed on to
    ``check_manual_normalization``, ``get_replacement_dict`` and
    ``transcoder.CommandLinker``.

    Returns ``NO_RULE_FOUND`` when the file is not in the database or when
    neither a format-specific nor a default rule exists for
    ``opts.purpose``; ``SUCCESS`` when the file is skipped, was manually
    normalized, or the last executed command exited with status 0;
    ``RULE_FAILED`` when the last executed command exited non-zero.
    """
    # TODO fix for maildir working only on attachments

    setup_dicts(mcpclient_settings)

    # Find the file and its FormatVersion (file identification)
    try:
        file_ = File.objects.get(uuid=opts.file_uuid)
    except File.DoesNotExist:
        print('File with uuid',
              opts.file_uuid,
              'does not exist in database.',
              file=sys.stderr)
        return NO_RULE_FOUND
    print('File found:', file_.uuid, file_.currentlocation)

    # Unless normalization file group use is submissionDocumentation, skip the
    # submissionDocumentation directory
    if opts.normalize_file_grp_use != "submissionDocumentation" and file_.currentlocation.startswith(
            '%SIPDirectory%objects/submissionDocumentation'):
        print('File', os.path.basename(opts.file_path),
              'in objects/submissionDocumentation, skipping')
        return SUCCESS

    # Only normalize files where the file's group use and normalize group use match
    if file_.filegrpuse != opts.normalize_file_grp_use:
        print(os.path.basename(opts.file_path), 'is file group usage',
              file_.filegrpuse, 'instead of ', opts.normalize_file_grp_use,
              ' - skipping')
        return SUCCESS

    # For re-ingest: clean up old derivations
    # If the file already has a Derivation with the same purpose, remove it and mark the derived file as deleted
    derivatives = Derivation.objects.filter(
        source_file=file_, derived_file__filegrpuse=opts.purpose)
    for derivative in derivatives:
        print(opts.purpose, 'derivative', derivative.derived_file_id,
              'already exists, marking as deleted')
        File.objects.filter(uuid=derivative.derived_file_id).update(
            filegrpuse='deleted')
        # Don't create events for thumbnail files
        if opts.purpose != 'thumbnail':
            databaseFunctions.insertIntoEvents(
                fileUUID=derivative.derived_file_id,
                eventType='deletion',
            )
    # Remove the Derivation rows themselves once the derived files are marked.
    derivatives.delete()

    # If a file has been manually normalized for this purpose, skip it
    manually_normalized_file = check_manual_normalization(opts)
    if manually_normalized_file:
        print(os.path.basename(opts.file_path),
              'was already manually normalized into',
              manually_normalized_file.currentlocation)
        if 'preservation' in opts.purpose:
            # Add derivation link and associated event
            insert_derivation_event(
                original_uuid=opts.file_uuid,
                output_uuid=manually_normalized_file.uuid,
                derivation_uuid=str(uuid.uuid4()),
                event_detail_output="manual normalization",
                outcome_detail_note=None,
            )
        return SUCCESS

    # ``do_fallback`` is set when the format is known but has no rule for
    # this purpose; ``format_id`` is None when the file was not identified.
    do_fallback = False
    format_id = get_object_or_None(FileFormatVersion, file_uuid=opts.file_uuid)

    # Look up the normalization command in the FPR
    if format_id:
        print('File format:', format_id.format_version)
        try:
            rule = FPRule.active.get(format=format_id.format_version,
                                     purpose=opts.purpose)
        except FPRule.DoesNotExist:
            do_fallback = True

    # Try with default rule if no format_id or rule was found
    if format_id is None or do_fallback:
        try:
            rule = get_default_rule(opts.purpose)
            print(os.path.basename(file_.currentlocation),
                  "not identified or without rule",
                  "- Falling back to default", opts.purpose, "rule")
        except FPRule.DoesNotExist:
            print('Not normalizing', os.path.basename(file_.currentlocation),
                  ' - No rule or default rule found to normalize for',
                  opts.purpose)
            return NO_RULE_FOUND

    print('Format Policy Rule:', rule)
    command = rule.command
    print('Format Policy Command', command.description)

    # The same replacement dict is reused if a fallback command is run below.
    replacement_dict = get_replacement_dict(opts)
    cl = transcoder.CommandLinker(rule, command, replacement_dict, opts,
                                  once_normalized)
    exitstatus = cl.execute()

    # If the access/thumbnail normalization command has errored AND a
    # derivative was NOT created, then we run the default access/thumbnail
    # rule. Note that we DO need to check if the derivative file exists. Even
    # when a verification command exists for the normalization command, the
    # transcoder.py::Command.execute method will only run the verification
    # command if the normalization command returns a 0 exit code.
    # Errored thumbnail normalization also needs to result in default thumbnail
    # normalization; if not, then a transfer with a single file that failed
    # thumbnail normalization will result in a failed SIP at "Prepare DIP: Copy
    # thumbnails to DIP directory"
    if (exitstatus != 0 and opts.purpose in ('access', 'thumbnail')
            and cl.commandObject.output_location
            and (not os.path.isfile(cl.commandObject.output_location))):
        # Fall back to default rule
        try:
            fallback_rule = get_default_rule(opts.purpose)
            print(opts.purpose,
                  'normalization failed, falling back to default',
                  opts.purpose, 'rule')
        except FPRule.DoesNotExist:
            print('Not retrying normalizing for',
                  os.path.basename(file_.currentlocation),
                  ' - No default rule found to normalize for', opts.purpose)
            fallback_rule = None
        # Don't re-run the same command
        if fallback_rule and fallback_rule.command != command:
            print('Fallback Format Policy Rule:', fallback_rule)
            # ``command``, ``cl`` and ``exitstatus`` are rebound so the
            # thumbnail copy and final status below reflect the fallback run.
            command = fallback_rule.command
            print('Fallback Format Policy Command', command.description)

            # Use existing replacement dict
            cl = transcoder.CommandLinker(fallback_rule, command,
                                          replacement_dict, opts,
                                          once_normalized)
            exitstatus = cl.execute()

    # Store thumbnails locally for use during AIP searches
    # TODO is this still needed, with the storage service?
    # NOTE(review): this copy runs regardless of ``exitstatus``; if the
    # command failed and no output file exists, ``shutil.copyfile`` below
    # would presumably raise -- confirm whether that is intended.
    if 'thumbnail' in opts.purpose:
        thumbnail_filepath = cl.commandObject.output_location
        thumbnail_storage_dir = os.path.join(
            mcpclient_settings.SHARED_DIRECTORY,
            'www',
            'thumbnails',
            opts.sip_uuid,
        )
        try:
            os.makedirs(thumbnail_storage_dir)
        except OSError as e:
            # An already-existing directory is fine; anything else is re-raised.
            if e.errno == errno.EEXIST and os.path.isdir(
                    thumbnail_storage_dir):
                pass
            else:
                raise
        # The stored copy is named after the file UUID and keeps the
        # extension of the generated thumbnail.
        thumbnail_basename, thumbnail_extension = os.path.splitext(
            thumbnail_filepath)
        thumbnail_storage_file = os.path.join(
            thumbnail_storage_dir,
            opts.file_uuid + thumbnail_extension,
        )

        shutil.copyfile(thumbnail_filepath, thumbnail_storage_file)

    if not exitstatus == 0:
        print('Command', command.description, 'failed!', file=sys.stderr)
        return RULE_FAILED
    else:
        print('Successfully normalized ', os.path.basename(opts.file_path),
              'for', opts.purpose)
        return SUCCESS