Example #1
0
 def getReplacementDic(self, target):
     """Build the replacement dict for this unit.

     Starts from ``ReplacementDict.frommodel`` for a "sip"-type unit
     identified by ``self.UUID``, then adds the %AIPFilename%, %unitType%
     and %SIPType% entries from this object's attributes.  ``target`` is
     accepted but not used by this implementation.
     """
     replacements = ReplacementDict.frommodel(type_="sip", sip=self.UUID)
     unit_values = (
         ("%AIPFilename%", self.aipFilename),
         ("%unitType%", self.unitType),
         ("%SIPType%", self.sipType),
     )
     for placeholder, value in unit_values:
         replacements[placeholder] = value
     return replacements
Example #2
0
 def getReplacementDic(self, target):
     """Build the replacement dict for this transfer unit.

     The dict comes from ``ReplacementDict.frommodel`` with
     ``type_='transfer'`` and ``self.UUID`` as the ``sip`` argument; the
     %unitType% entry is then set from ``self.unitType``.  ``target`` is
     accepted but not used by this implementation.
     """
     replacements = ReplacementDict.frommodel(type_='transfer', sip=self.UUID)
     replacements["%unitType%"] = self.unitType
     return replacements
Example #3
0
def test_replacementdict_model_constructor_file_only():
    """Check the file-level placeholders produced from a file alone."""
    replacements = ReplacementDict.frommodel(type_='file', file_=FILE)

    # Placeholder -> value expected from the FILE fixture.  Note that
    # %relativeLocation% is expected to equal the file's current location.
    expected = (
        ('%fileUUID%', FILE.uuid),
        ('%originalLocation%', FILE.originallocation),
        ('%currentLocation%', FILE.currentlocation),
        ('%relativeLocation%', FILE.currentlocation),
        ('%fileGrpUse%', FILE.filegrpuse),
    )
    for placeholder, value in expected:
        assert replacements[placeholder] == value
def main(task_uuid, file_uuid):
    """Run the transcription rules for one original file.

    Only files whose ``filegrpuse`` is "original" are processed.  Rules are
    looked up for the file itself first; when there are none, the file's
    derivatives are consulted and the first one with rules is transcribed
    instead.

    For every rule the configured command is executed, a transcription
    event is recorded against ``file_uuid`` (whatever the exit status), and
    the output file is registered in the database if it exists on disk.

    :param task_uuid: UUID of the current task (not used by this function).
    :param file_uuid: UUID of the file to transcribe.
    :returns: 0 when nothing had to be done or every command exited with
        status 0; 1 when at least one command exited with a non-zero status.
    """
    setup_dicts(mcpclient_settings)

    file_ = File.objects.get(uuid=file_uuid)

    # Normally we don't transcribe derivatives (access copies, preservation copies);
    # however, some useful transcription tools can't handle some formats that
    # are common as the primary copies. For example, tesseract can't handle JPEG2000.
    # If there are no rules for the primary format passed in, try to look at each
    # derivative until a transcribable derivative is found.
    #
    # Skip derivatives to avoid double-scanning them; only look at them as a fallback.
    if file_.filegrpuse != "original":
        print('{} is not an original; not transcribing'.format(file_uuid), file=sys.stderr)
        return 0

    rules = fetch_rules_for(file_)
    if not rules:
        # From here on ``file_`` may refer to a derivative, not the original.
        file_, rules = fetch_rules_for_derivatives(file_)

    if not rules:
        print('No rules found for file {} and its derivatives; not transcribing'.format(file_uuid), file=sys.stderr)
        return 0

    if file_.filegrpuse == "original":
        noun = "original"
    else:
        noun = file_.filegrpuse + " derivative"
    print('Transcribing {} {}'.format(noun, file_.uuid), file=sys.stderr)

    rd = ReplacementDict.frommodel(file_=file_, type_='file')

    succeeded = True
    for rule in rules:
        script = rule.command.command
        if rule.command.script_type in ('bashScript', 'command'):
            # Shell-style commands get their %placeholders% substituted
            # directly into the script text and take no extra arguments.
            script, = rd.replace(script)
            args = []
        else:
            # Other script types receive the replacement values as
            # command-line options.  The method has to be *called*: passing
            # the bound method itself would hand executeOrRun a non-list.
            args = rd.to_gnu_options()

        exitstatus, stdout, stderr = executeOrRun(rule.command.script_type,
                                                  script, arguments=args)
        if exitstatus != 0:
            succeeded = False

        output_path = rd.replace(rule.command.output_location)[0]
        # Store the path with the SIP directory turned back into its placeholder.
        relative_path = output_path.replace(rd['%SIPDirectory%'], '%SIPDirectory%')
        # The event and the new file are linked to the UUID passed in, even
        # when a derivative was the file actually transcribed.
        event = insert_transcription_event(exitstatus, file_uuid, rule, relative_path)

        if os.path.isfile(output_path):
            insert_file_into_database(file_uuid, rd['%SIPUUID%'], event, rule, output_path, relative_path)

    return 0 if succeeded else 1
Example #5
0
 def getReplacementDic(self, target=None):
     """Return the replacement dict for this file.

     When a ``target`` is given and the file has an owning unit, the owning
     unit's replacement dict (for the unit's current path) is returned.
     Otherwise the dict is built from the database via
     ``ReplacementDict.frommodel`` unless ``self.UUID`` is the string
     "None", in which case it is assembled by hand.
     """
     if target is not None and self.owningUnit:
         unit = self.owningUnit
         return unit.getReplacementDic(unit.currentPath)

     if self.UUID == "None":
         # If no UUID has been assigned yet, we can't use the
         # ReplacementDict.frommodel constructor; fall back to the
         # old style of manual construction.
         manual_values = {
             "%relativeLocation%": self.currentPath,
             "%fileUUID%": self.UUID,
             "%fileGrpUse%": self.fileGrpUse
         }
         return ReplacementDict(manual_values)

     return ReplacementDict.frommodel(type_='file', file_=self.UUID)
Example #6
0
def get_replacement_dict(job, opts):
    """Generate values for all known %var% replacement variables.

    The output location and the prefix/postfix added to the output file
    name depend on ``opts.purpose``:

    * contains "preservation": output next to the input file, name
      suffixed with "-" plus the task UUID;
    * contains "access": output under ``<sip_path>/DIP/objects/``, name
      prefixed with the file UUID plus "-";
    * contains "thumbnail": output under ``<sip_path>/thumbnails/``, name
      suffixed with the file UUID.

    Any other purpose is reported through ``job.print_error`` and ``None``
    is returned.  Otherwise the result is the ``ReplacementDict`` built
    for the file, extended with the %outputDirectory%, %prefix%,
    %postfix%, %outputFileName% and %outputFilePath% entries.
    """
    # All paths should have trailing /
    source_dir = os.path.dirname(opts.file_path) + os.path.sep
    # File name without directory or extension.
    stem = os.path.splitext(os.path.basename(opts.file_path))[0]

    if "preservation" in opts.purpose:
        prefix = ""
        postfix = "-" + opts.task_uuid
        output_dir = source_dir
    elif "access" in opts.purpose:
        prefix = opts.file_uuid + "-"
        postfix = ""
        output_dir = os.path.join(opts.sip_path, "DIP", "objects") + os.path.sep
    elif "thumbnail" in opts.purpose:
        prefix = ""
        postfix = opts.file_uuid
        output_dir = os.path.join(opts.sip_path, "thumbnails") + os.path.sep
    else:
        job.print_error("Unsupported command purpose", opts.purpose)
        return None

    # Populates the standard set of unit variables, so,
    # e.g., %fileUUID% is available
    replacements = ReplacementDict.frommodel(type_="file", file_=opts.file_uuid)

    # Neither the file name nor the file path includes the extension.
    output_filename = prefix + stem + postfix
    output_values = {
        "%outputDirectory%": output_dir,
        "%prefix%": prefix,
        "%postfix%": postfix,
        "%outputFileName%": output_filename,
        "%outputFilePath%": os.path.join(output_dir, output_filename),
    }
    replacements.update(output_values)
    return replacements
Example #7
0
def test_replacementdict_model_constructor_sip():
    """Check the placeholders produced for a SIP together with a file.

    All SIP path placeholders are expected to derive from
    ``SIP.currentpath``, and no %transferDirectory% entry may be present
    for a "sip"-type dict.
    """
    rd = ReplacementDict.frommodel(sip=SIP, file_=FILE, type_='sip')
    sip_path = SIP.currentpath

    # SIP-specific variables
    assert rd['%SIPUUID%'] == SIP.uuid
    assert rd['%relativeLocation%'] == sip_path
    assert rd['%currentPath%'] == sip_path
    assert rd['%SIPDirectory%'] == sip_path
    # "not in" is the membership operator; "not x in y" only works because
    # of operator precedence and reads as if it negated the key.
    assert '%transferDirectory%' not in rd
    assert rd['%SIPDirectoryBasename%'] == os.path.basename(sip_path)
    assert rd['%SIPLogsDirectory%'] == os.path.join(sip_path, 'logs/')
    assert rd['%SIPObjectsDirectory%'] == os.path.join(sip_path, 'objects/')

    # File-specific variables
    assert rd['%fileUUID%'] == FILE.uuid
    assert rd['%originalLocation%'] == FILE.originallocation
    assert rd['%currentLocation%'] == FILE.currentlocation
    assert rd['%fileGrpUse%'] == FILE.filegrpuse
Example #8
0
def test_replacementdict_model_constructor_transfer():
    """Check the placeholders produced for a transfer together with a file."""
    replacements = ReplacementDict.frommodel(type_='transfer', sip=TRANSFER, file_=FILE)
    transfer_path = TRANSFER.currentlocation

    expected = [
        # Transfer-specific variables
        ('%SIPUUID%', TRANSFER.uuid),
        # no, not actually relative
        ('%relativeLocation%', transfer_path),
        ('%currentPath%', transfer_path),
        ('%SIPDirectory%', transfer_path),
        ('%transferDirectory%', transfer_path),
        ('%SIPDirectoryBasename%', os.path.basename(transfer_path)),
        ('%SIPLogsDirectory%', os.path.join(transfer_path, 'logs/')),
        ('%SIPObjectsDirectory%', os.path.join(transfer_path, 'objects/')),
        # File-specific variables
        ('%fileUUID%', FILE.uuid),
        ('%originalLocation%', FILE.originallocation),
        ('%currentLocation%', FILE.currentlocation),
        ('%fileGrpUse%', FILE.filegrpuse),
    ]
    for placeholder, value in expected:
        assert replacements[placeholder] == value
Example #9
0
def test_replacementdict_model_constructor_sip():
    """Check the placeholders produced for a SIP together with a file."""
    replacements = ReplacementDict.frommodel(type_="sip", sip=SIP, file_=FILE)
    sip_path = SIP.currentpath

    # A "sip"-type dict must not carry the transfer-only placeholder.
    assert "%transferDirectory%" not in replacements

    expected = [
        # SIP-specific variables
        ("%SIPUUID%", SIP.uuid),
        ("%relativeLocation%", sip_path),
        ("%currentPath%", sip_path),
        ("%SIPDirectory%", sip_path),
        ("%SIPDirectoryBasename%", os.path.basename(sip_path)),
        ("%SIPLogsDirectory%", os.path.join(sip_path, "logs/")),
        ("%SIPObjectsDirectory%", os.path.join(sip_path, "objects/")),
        # File-specific variables
        ("%fileUUID%", FILE.uuid),
        ("%originalLocation%", FILE.originallocation),
        ("%currentLocation%", FILE.currentlocation),
        ("%fileGrpUse%", FILE.filegrpuse),
    ]
    for placeholder, value in expected:
        assert replacements[placeholder] == value
Example #10
0
    def getReplacementDic(self, target):
        """Return the replacement dict for this DIP.

        The dict is first populated by ``ReplacementDict.frommodel`` for a
        "sip"-type unit identified by ``self.UUID``; the directory entries
        are then overridden with values derived from ``self.currentPath``
        and ``target``, with the shared directory replaced by the
        ``%sharedPath%`` placeholder.

        :param target: path used to compute the %relativeLocation% entry.
        :returns: the populated ``ReplacementDict``.
        """
        ret = ReplacementDict.frommodel(type_="sip", sip=self.UUID)

        # augment the dict here, because DIP is a special case whose paths are
        # not entirely based on data from the database - the locations need to
        # be overridden.
        shared_directory = django_settings.SHARED_DIRECTORY
        sip_directory = self.currentPath.replace(shared_directory,
                                                 "%sharedPath%")
        relative_directory_location = target.replace(shared_directory,
                                                     "%sharedPath%")

        ret["%SIPLogsDirectory%"] = os.path.join(sip_directory, "logs", "")
        ret["%SIPObjectsDirectory%"] = os.path.join(sip_directory, "objects",
                                                    "")
        ret["%SIPDirectory%"] = sip_directory
        # The key needs its closing "%": without it the existing
        # %SIPDirectoryBasename% entry is never overridden and a malformed
        # extra key is left in the dict.
        ret["%SIPDirectoryBasename%"] = os.path.basename(
            os.path.abspath(sip_directory))
        # Only the first occurrence of the current path inside target is
        # swapped for the placeholder form of target.
        ret["%relativeLocation%"] = target.replace(
            self.currentPath, relative_directory_location, 1)
        ret["%unitType%"] = "DIP"
        return ret
Example #11
0
    def getReplacementDic(self, target):
        """Return the replacement dict for this DIP.

        The dict is first populated by ``ReplacementDict.frommodel`` for a
        "sip"-type unit identified by ``self.UUID``; the directory entries
        are then overridden with values derived from ``self.currentPath``
        and ``target``, with the MCPServer ``sharedDirectory`` setting
        replaced by the ``%sharedPath%`` placeholder.

        :param target: path used to compute the %relativeLocation% entry.
        :returns: the populated ``ReplacementDict``.
        """
        ret = ReplacementDict.frommodel(type_='sip', sip=self.UUID)

        # augment the dict here, because DIP is a special case whose paths are
        # not entirely based on data from the database - the locations need to
        # be overridden.
        shared_directory = archivematicaMCP.config.get('MCPServer',
                                                       "sharedDirectory")
        sip_directory = self.currentPath.replace(shared_directory,
                                                 "%sharedPath%")
        relative_directory_location = target.replace(shared_directory,
                                                     "%sharedPath%")

        ret["%SIPLogsDirectory%"] = os.path.join(sip_directory, "logs", "")
        ret["%SIPObjectsDirectory%"] = os.path.join(sip_directory, "objects",
                                                    "")
        ret["%SIPDirectory%"] = sip_directory
        # The key needs its closing "%": without it the existing
        # %SIPDirectoryBasename% entry is never overridden and a malformed
        # extra key is left in the dict.
        ret["%SIPDirectoryBasename%"] = os.path.basename(
            os.path.abspath(sip_directory))
        # Only the first occurrence of the current path inside target is
        # swapped for the placeholder form of target.
        ret["%relativeLocation%"] = target.replace(
            self.currentPath, relative_directory_location, 1)
        ret["%unitType%"] = "DIP"
        return ret
Example #12
0
def main(job, file_path, file_uuid, sip_uuid):
    """Run the characterization commands for one file.

    Files that already have an ``FPCommandOutput`` row are skipped.  The
    rules come from the file's active format version (purpose
    "characterization"), falling back to the "default_characterization"
    rules when the format is unknown or has no rules of its own.

    Each command's stdout and stderr are forwarded to ``job``.  Output of
    commands whose output format is fmt/101 is validated as XML and stored
    in the database.

    :returns: 0 on success (or when the file was already characterized);
        255 when any command exited with a non-zero status or produced
        invalid XML.
    """
    setup_dicts(mcpclient_settings)

    # Check to see whether the file has already been characterized; don't try
    # to characterize it a second time if so.
    existing_outputs = FPCommandOutput.objects.filter(file_id=file_uuid).count()
    if existing_outputs > 0:
        return 0

    try:
        format_version = FormatVersion.active.get(
            fileformatversion__file_uuid=file_uuid)
    except FormatVersion.DoesNotExist:
        format_version = None
        rules = None
    else:
        if format_version:
            rules = FPRule.active.filter(format=format_version.uuid,
                                         purpose="characterization")

    # Characterization always occurs - if nothing is specified, get one or more
    # defaults specified in the FPR.
    if not rules:
        rules = FPRule.active.filter(purpose="default_characterization")

    failed = False
    for rule in rules:
        command = rule.command

        if command.script_type in ("bashScript", "command"):
            # Placeholders are substituted into the script text itself.
            args = []
            command_to_execute = replace_string_values(command.command,
                                                       file_=file_uuid,
                                                       sip=sip_uuid,
                                                       type_="file")
        else:
            # Replacement values are handed over as command-line options.
            replacements = ReplacementDict.frommodel(file_=file_uuid,
                                                     sip=sip_uuid,
                                                     type_="file")
            args = replacements.to_gnu_options()
            command_to_execute = command.command

        exitstatus, stdout, stderr = executeOrRun(
            command.script_type,
            command_to_execute,
            arguments=args,
            capture_output=True,
        )

        job.write_output(stdout)
        job.write_error(stderr)

        if exitstatus != 0:
            job.write_error(
                "Command {} failed with exit status {}; stderr:".format(
                    command.description, exitstatus))
            failed = True
            continue

        # fmt/101 is XML - we want to collect and package any XML output, while
        # allowing other commands to execute without actually collecting their
        # output in the event that they are writing their output to disk.
        # FPCommandOutput can have multiple rows for a given file,
        # distinguished by the rule that produced it.
        output_format = command.output_format
        if not (output_format and output_format.pronom_id == "fmt/101"):
            job.write_error(
                'Tool output for command "{}" ({}) is not XML; not saving to database'
                .format(command.description, command.uuid))
            continue

        try:
            # Parsing is only a validity check; the raw stdout is what is stored.
            etree.fromstring(stdout)
            insertIntoFPCommandOutput(file_uuid, stdout, rule.uuid)
            job.write_output(
                'Saved XML output for command "{}" ({})'.format(
                    command.description, command.uuid))
        except etree.XMLSyntaxError:
            failed = True
            job.write_error(
                'XML output for command "{}" ({}) was not valid XML; not saving to database'
                .format(command.description, command.uuid))

    return 255 if failed else 0