Example #1
0
def xmlCreateFileAssociationBetween(originalFileFullPath, outputFromNormalizationFileFullPath, SIPFullPath, sipUUID, eventDetailText, eventOutcomeDetailNote, outputFileUUID=""):
    #assign file UUID

    date = databaseInterface.getUTCDate()
    if outputFileUUID == "":
        outputFileUUID = uuid.uuid4().__str__()

    originalFilePathRelativeToSIP = originalFileFullPath.replace(SIPFullPath,"%SIPDirectory%", 1)
    sql = "SELECT Files.fileUUID FROM Files WHERE removedTime = 0 AND Files.currentLocation = '" + MySQLdb.escape_string(originalFilePathRelativeToSIP) + "' AND Files.sipUUID = '" + sipUUID + "';"
    print sql
    rows = databaseInterface.queryAllSQL(sql)
    print rows
    fileUUID = rows[0][0]


    filePathRelativeToSIP = outputFromNormalizationFileFullPath.replace(SIPFullPath,"%SIPDirectory%", 1)
    addFileToSIP(filePathRelativeToSIP, outputFileUUID, sipUUID, uuid.uuid4().__str__(), date, sourceType="creation", use="preservation")
    updateSizeAndChecksum(outputFileUUID, outputFromNormalizationFileFullPath, date, uuid.uuid4().__str__())

    taskUUID = uuid.uuid4().__str__()
    insertIntoEvents(fileUUID=fileUUID, \
               eventIdentifierUUID=taskUUID, \
               eventType="normalization", \
               eventDateTime=date, \
               eventDetail=eventDetailText, \
               eventOutcome="", \
               eventOutcomeDetailNote=eventOutcomeDetailNote)

    insertIntoDerivations(sourceFileUUID=fileUUID, derivedFileUUID=outputFileUUID, relatedEventUUID=taskUUID)
def insert_file_into_database(file_uuid, sip_uuid, event_uuid, rule, output_path, relative_path):
    """Record a newly created "text/ocr" file and link it to its source.

    A fresh UUID is generated for the file at ``relative_path``; the file is
    added to the SIP, its size and checksum are updated from ``output_path``,
    and a derivation from ``file_uuid`` to the new file is inserted with
    ``event_uuid`` as the related event.

    ``rule`` is accepted but not used in this function.
    """
    # NOTE(review): task_uuid is neither a parameter nor a local here; it is
    # presumably a module-level global set by the script entry point -- confirm.
    new_file_uuid = str(uuid4())
    created_at = timezone.now()

    fileOperations.addFileToSIP(
        relative_path, new_file_uuid, sip_uuid, task_uuid, created_at,
        sourceType="creation", use="text/ocr")

    # The checksum event gets its own, separate UUID.
    checksum_event_uuid = str(uuid4())
    fileOperations.updateSizeAndChecksum(
        new_file_uuid, output_path, created_at, checksum_event_uuid)

    databaseFunctions.insertIntoDerivations(
        sourceFileUUID=file_uuid,
        derivedFileUUID=new_file_uuid,
        relatedEventUUID=event_uuid)
def onceNormalized(command, opts, replacementDic):
    """Record the outputs of a finished normalization command.

    Collects the file at command.outputLocation, or every file found under it
    when it is a directory. For each collected file, when
    opts["commandClassifications"] is "preservation", the file is added to the
    SIP, a "normalization" event is inserted for opts["fileUUID"], the new
    file's size and checksum are updated and a derivation row is inserted.
    replacementDic then receives a fresh "%outputFileUUID%" and the matching
    "%postfix%".

    Side effects on command: a falsy outputLocation is replaced by "", and
    exitCode is set to -2 when a non-empty outputLocation does not exist.
    """
    if not command.outputLocation:
        command.outputLocation = ""
    location = command.outputLocation

    outputs = []
    if os.path.isfile(location):
        outputs.append(location)
    elif os.path.isdir(location):
        for root, _subdirs, names in os.walk(location):
            for name in names:
                candidate = os.path.join(root, name)
                if os.path.isfile(candidate):
                    outputs.append(candidate)
    elif location:
        print >> sys.stderr, command
        print >> sys.stderr, "Error - output file does not exist [" + location + "]"
        command.exitCode = -2

    # One event UUID is shared by every output of this command.
    derivationEventUUID = str(uuid.uuid4())
    detailCommand = command.eventDetailCommand
    eventDetail = detailCommand.stdOut if detailCommand is not None else ""

    for outputPath in outputs:
        if opts["commandClassifications"] != "preservation":
            continue

        outputFileUUID = replacementDic["%outputFileUUID%"]
        filePathRelativeToSIP = outputPath.replace(opts["sipPath"], "%SIPDirectory%", 1)

        # Add the new file to the SIP
        addFileToSIP(
            filePathRelativeToSIP,
            outputFileUUID,
            opts["sipUUID"],
            str(uuid.uuid4()),
            opts["date"],
            sourceType="creation",
            use="preservation",
        )

        # Add event information to the source file
        insertIntoEvents(
            fileUUID=opts["fileUUID"],
            eventIdentifierUUID=derivationEventUUID,
            eventType="normalization",
            eventDateTime=opts["date"],
            eventDetail=eventDetail,
            eventOutcome="",
            eventOutcomeDetailNote=filePathRelativeToSIP,
        )

        updateSizeAndChecksum(outputFileUUID, outputPath, opts["date"], str(uuid.uuid4()))

        # Add linking information between files
        insertIntoDerivations(
            sourceFileUUID=opts["fileUUID"],
            derivedFileUUID=outputFileUUID,
            relatedEventUUID=derivationEventUUID,
        )

        # Prepare a fresh UUID (and file-name postfix) for the next output.
        nextFileUUID = str(uuid.uuid4())
        replacementDic["%outputFileUUID%"] = nextFileUUID
        replacementDic["%postfix%"] = "-" + nextFileUUID
def onceNormalized(command):
    """Record the outputs of a finished normalization command.

    Collects the file at command.outputLocation, or every file found under it
    when it is a directory. For each collected file, when
    opts.commandClassifications is "preservation", the file is added to the
    SIP, a "normalization" event is inserted for opts.fileUUID, the new
    file's size and checksum are updated and a derivation row is inserted.

    Uses the module-level globals opts, replacementDic and outputFileUUID;
    outputFileUUID is rebound to a fresh UUID after each recorded file and
    replacementDic["%postfix%"] is updated to match.

    Side effects on command: exitCode is set to -2 when a non-empty
    outputLocation does not exist.
    """
    global outputFileUUID
    global replacementDic
    global opts

    transcodedFiles = []
    if not command.outputLocation:
        command.outputLocation = ""
    elif os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        for path, directories, files in os.walk(command.outputLocation):
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        print >>sys.stderr, command
        print >>sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2

    derivationEventUUID = str(uuid.uuid4())
    # A command without an event-detail command yields an empty detail
    # instead of failing on None.stdOut.
    eventDetail = ""
    if command.eventDetailCommand is not None:
        eventDetail = command.eventDetailCommand.stdOut

    for ef in transcodedFiles:
        if opts.commandClassifications == "preservation":
            #Add the new file to the sip
            filePathRelativeToSIP = ef.replace(opts.sipPath, "%SIPDirectory%", 1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP, outputFileUUID, opts.sipUUID, str(uuid.uuid4()), opts.date, sourceType="creation", use="preservation")
            #Calculate new file checksum
            print >>sys.stderr, "TODO: calculate new file checksum"
            #Add event information to current file
            insertIntoEvents(fileUUID=opts.fileUUID,
                             eventIdentifierUUID=derivationEventUUID,
                             eventType="normalization",
                             eventDateTime=opts.date,
                             eventDetail=eventDetail,
                             eventOutcome="",
                             eventOutcomeDetailNote=filePathRelativeToSIP)

            updateSizeAndChecksum(outputFileUUID, ef, opts.date, str(uuid.uuid4()))

            #Add linking information between files
            insertIntoDerivations(sourceFileUUID=opts.fileUUID, derivedFileUUID=outputFileUUID, relatedEventUUID=derivationEventUUID)

            # Fresh UUID (and file-name postfix) for the next output.
            outputFileUUID = str(uuid.uuid4())
            replacementDic["%postfix%"] = "-" + outputFileUUID
def preservation():
    """Add one new file entry to the SIP per item in ``files``.

    Each iteration generates a fresh UUID, registers it with addFileToSIP and
    then calls updateSizeAndChecksum with the values held in ``opts``.
    """
    # NOTE(review): files, filePathRelativeToSIP and opts are not defined in
    # this function; presumably module-level globals -- confirm.
    # NOTE(review): the loop item is never used, and updateSizeAndChecksum is
    # given opts.fileUUID rather than the UUID generated here -- confirm this
    # is intended.
    for _unused in files:
        newFileUUID = str(uuid.uuid4())
        addFileToSIP(
            filePathRelativeToSIP,
            newFileUUID,
            opts.sipUUID,
            opts.eventIdentifierUUID,
            opts.date,
            use=opts.use,
        )
        updateSizeAndChecksum(
            opts.fileUUID,
            opts.filePath,
            opts.date,
            opts.eventIdentifierUUID,
        )
def onceNormalized(command, opts, replacementDic):
    """Record the outputs of a finished normalization command.

    Collects the file at command.outputLocation, or every file found under it
    when it is a directory. For each collected file, when
    opts["commandClassifications"] is "preservation", the file is added to the
    SIP, a "normalization" event (whose detail starts with the command's pk)
    is inserted for opts["fileUUID"], size and checksum are updated, a
    derivation row is inserted and a FilesIDs row is written with
    command.outputFormat. replacementDic then receives a fresh
    "%outputFileUUID%" and the matching "%postfix%".

    Side effects on command: a falsy outputLocation is replaced by "", and
    exitCode is set to -2 when a non-empty outputLocation does not exist.
    """
    if not command.outputLocation:
        command.outputLocation = ""
    location = command.outputLocation

    outputs = []
    if os.path.isfile(location):
        outputs.append(location)
    elif os.path.isdir(location):
        for root, _subdirs, names in os.walk(location):
            for name in names:
                candidate = os.path.join(root, name)
                if os.path.isfile(candidate):
                    outputs.append(candidate)
    elif location:
        print >>sys.stderr, command
        print >>sys.stderr, "Error - output file does not exist [" + location + "]"
        command.exitCode = -2

    # One event UUID is shared by every output of this command.
    derivationEventUUID = str(uuid.uuid4())
    eventDetail = "ArchivematicaFPRCommandID=\"%s\"" % (command.pk)
    detailCommand = command.eventDetailCommand
    if detailCommand is not None:
        eventDetail = '%s; %s' % (eventDetail, detailCommand.stdOut)

    for outputPath in outputs:
        if opts["commandClassifications"] != "preservation":
            continue

        # TODO Add manual normalization for files of same name mapping
        outputFileUUID = replacementDic["%outputFileUUID%"]
        filePathRelativeToSIP = outputPath.replace(opts["sipPath"], "%SIPDirectory%", 1)

        # Add the new file to the sip
        addFileToSIP(
            filePathRelativeToSIP,
            outputFileUUID,
            opts["sipUUID"],
            str(uuid.uuid4()),
            opts["date"],
            sourceType="creation",
            use="preservation",
        )

        # Add event information to the source file
        insertIntoEvents(
            fileUUID=opts["fileUUID"],
            eventIdentifierUUID=derivationEventUUID,
            eventType="normalization",
            eventDateTime=opts["date"],
            eventDetail=eventDetail,
            eventOutcome="",
            eventOutcomeDetailNote=filePathRelativeToSIP,
        )

        updateSizeAndChecksum(outputFileUUID, outputPath, opts["date"], str(uuid.uuid4()))

        # Add linking information between files
        insertIntoDerivations(
            sourceFileUUID=opts["fileUUID"],
            derivedFileUUID=outputFileUUID,
            relatedEventUUID=derivationEventUUID,
        )

        # NOTE(review): values are interpolated into the SQL text unescaped;
        # confirm command.outputFormat can never contain a quote.
        sql = "INSERT INTO FilesIDs (fileUUID, formatName, formatVersion, formatRegistryName, formatRegistryKey) VALUES ('%s', '%s', NULL, NULL, NULL);" % (outputFileUUID, command.outputFormat)
        databaseInterface.runSQL(sql)

        # Prepare a fresh UUID (and file-name postfix) for the next output.
        nextFileUUID = str(uuid.uuid4())
        replacementDic["%outputFileUUID%"] = nextFileUUID
        replacementDic["%postfix%"] = "-" + nextFileUUID
Example #7
0
def preservation():
    """Register a new SIP file entry for every item in ``files``.

    For each item a fresh UUID is generated and passed to addFileToSIP;
    updateSizeAndChecksum is then called with the values held in ``opts``.
    """
    # NOTE(review): files, filePathRelativeToSIP and opts are not defined in
    # this function; presumably module-level globals -- confirm.
    # NOTE(review): updateSizeAndChecksum receives opts.fileUUID, not the UUID
    # generated in the loop, and the loop item itself is unused -- confirm
    # this is intended.
    for _item in files:
        # create an entry for the file
        entryUUID = str(uuid.uuid4())
        addFileToSIP(filePathRelativeToSIP, entryUUID, opts.sipUUID,
                     opts.eventIdentifierUUID, opts.date, use=opts.use)
        updateSizeAndChecksum(opts.fileUUID, opts.filePath, opts.date,
                              opts.eventIdentifierUUID)
Example #8
0
def xmlCreateFileAssociationBetween(originalFileFullPath,
                                    outputFromNormalizationFileFullPath,
                                    SIPFullPath,
                                    sipUUID,
                                    eventDetailText,
                                    eventOutcomeDetailNote,
                                    outputFileUUID=""):
    #assign file UUID

    date = databaseInterface.getUTCDate()
    if outputFileUUID == "":
        outputFileUUID = uuid.uuid4().__str__()

    originalFilePathRelativeToSIP = originalFileFullPath.replace(
        SIPFullPath, "%SIPDirectory%", 1)
    sql = "SELECT Files.fileUUID FROM Files WHERE removedTime = 0 AND Files.currentLocation = '" + MySQLdb.escape_string(
        originalFilePathRelativeToSIP
    ) + "' AND Files.sipUUID = '" + sipUUID + "';"
    print sql
    rows = databaseInterface.queryAllSQL(sql)
    print rows
    fileUUID = rows[0][0]

    filePathRelativeToSIP = outputFromNormalizationFileFullPath.replace(
        SIPFullPath, "%SIPDirectory%", 1)
    addFileToSIP(filePathRelativeToSIP,
                 outputFileUUID,
                 sipUUID,
                 uuid.uuid4().__str__(),
                 date,
                 sourceType="creation",
                 use="preservation")
    updateSizeAndChecksum(outputFileUUID, outputFromNormalizationFileFullPath,
                          date,
                          uuid.uuid4().__str__())

    taskUUID = uuid.uuid4().__str__()
    insertIntoEvents(fileUUID=fileUUID, \
               eventIdentifierUUID=taskUUID, \
               eventType="normalization", \
               eventDateTime=date, \
               eventDetail=eventDetailText, \
               eventOutcome="", \
               eventOutcomeDetailNote=eventOutcomeDetailNote)

    insertIntoDerivations(sourceFileUUID=fileUUID,
                          derivedFileUUID=outputFileUUID,
                          relatedEventUUID=taskUUID)
def update_files(sip_uuid, files):
    """
    Write file information for a SIP to the DB.

    :param sip_uuid: UUID of the SIP the files are added to.
    :param files: List of dicts containing file info. The keys read here are
        'original_path', 'uuid', 'use', 'checksum', 'checksumtype', 'size',
        'current_path', 'format_version' and 'derivation'.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    # Pass 1: file rows, reingestion events and format information
    for entry in files:
        fileOperations.addFileToSIP(
            filePathRelativeToSIP=entry['original_path'],
            fileUUID=entry['uuid'],
            sipUUID=sip_uuid,
            taskUUID=str(uuid.uuid4()),  # one new event ID per file
            date=timestamp,
            sourceType="reingestion",
            use=entry['use'],
        )

        # updateSizeAndChecksum is not used because currentlocation has to be
        # updated in the same step
        file_rows = models.File.objects.filter(uuid=entry['uuid'])
        file_rows.update(
            checksum=entry['checksum'],
            checksumtype=entry['checksumtype'],
            size=entry['size'],
            currentlocation=entry['current_path'],
        )

        format_version = entry['format_version']
        if format_version:
            # Add Format ID
            models.FileFormatVersion.objects.create(
                file_uuid_id=entry['uuid'],
                format_version=format_version,
            )

    # Pass 2: derivations. Kept as a separate loop because the derived file
    # may not be in the DB until every file has been added above.
    # May not need to be parsed, if Derivation info can be roundtripped in
    # METS Reader/Writer
    for entry in files:
        derived_uuid = entry['derivation']
        if derived_uuid is not None:
            databaseFunctions.insertIntoDerivations(
                sourceFileUUID=entry['uuid'],
                derivedFileUUID=derived_uuid,
            )
Example #10
0
def main(file_uuid=None, file_path='', date='', event_uuid=None, sip_directory='', sip_uuid=None, transfer_uuid=None, use='original', update_use=True):
    """Assign a UUID to a file and register it with a transfer or a SIP.

    When ``file_uuid`` is already set (and is not the string "None") nothing
    is registered; its ``filegrpuse`` is updated if ``update_use`` is true.
    Otherwise exactly one of ``sip_uuid`` and ``transfer_uuid`` must be given.
    For a transfer of type 'Archivematica AIP', the UUID, use and original
    path come from get_file_info_from_mets and the current location is
    updated afterwards.

    Returns 0 on success and 2 when both or neither unit UUID is given.
    """
    if file_uuid == "None":
        file_uuid = None
    if file_uuid:
        logger.error('File already has UUID: %s', file_uuid)
        if update_use:
            File.objects.filter(uuid=file_uuid).update(filegrpuse=use)
        return 0

    # Stop if both or neither of them are used
    if bool(sip_uuid) == bool(transfer_uuid):
        logger.error('SIP exclusive-or Transfer UUID must be defined')
        return 2

    # Ingest (exactly one UUID is set, so no transfer means a SIP)
    if not transfer_uuid:
        new_uuid = str(uuid.uuid4())
        relative_path = file_path.replace(sip_directory, "%SIPDirectory%", 1)
        addFileToSIP(relative_path, new_uuid, sip_uuid, event_uuid, date, use=use)
        return 0

    # Transfer
    relative_path = file_path.replace(sip_directory, '%transferDirectory%', 1)
    transfer = Transfer.objects.get(uuid=transfer_uuid)
    is_reingest = transfer.type == 'Archivematica AIP'
    event_type = 'ingestion'
    if is_reingest:
        # For reingest, parse information from the METS
        info = get_file_info_from_mets(sip_directory, relative_path)
        event_type = 'reingestion'
        file_uuid = info.get('uuid', file_uuid)
        use = info.get('filegrpuse', use)
        relative_path = info.get('original_path', relative_path)
    if not file_uuid:
        file_uuid = str(uuid.uuid4())
        logger.info('Generated UUID for this file: %s.', file_uuid)
    addFileToTransfer(relative_path, file_uuid, transfer_uuid, event_uuid, date, use=use, sourceType=event_type)
    if is_reingest:
        # The original location was parsed from the METS; update the current
        # location to reflect what's on disk
        print('updating current location for', file_uuid, 'with', info)
        File.objects.filter(uuid=file_uuid).update(
            currentlocation=info['current_path']
        )
    return 0
def main(
    job,
    file_uuid=None,
    file_path="",
    date="",
    event_uuid=None,
    sip_directory="",
    sip_uuid=None,
    transfer_uuid=None,
    use="original",
    update_use=True,
):
    """Assign a UUID to a file and register it with a transfer or a SIP.

    When ``file_uuid`` is already set (and is not the string "None") nothing
    is registered; its ``filegrpuse`` is updated if ``update_use`` is true.
    Otherwise exactly one of ``sip_uuid`` and ``transfer_uuid`` must be given.
    For a transfer of type "Archivematica AIP", the UUID, use and original
    path come from get_file_info_from_mets and the current location is
    updated afterwards, with a message sent to ``job.print_output``.

    Returns 0 on success and 2 when both or neither unit UUID is given.
    """
    if file_uuid == "None":
        file_uuid = None
    if file_uuid:
        logger.error("File already has UUID: %s", file_uuid)
        if update_use:
            File.objects.filter(uuid=file_uuid).update(filegrpuse=use)
        return 0

    # Stop if both or neither of them are used
    if bool(sip_uuid) == bool(transfer_uuid):
        logger.error("SIP exclusive-or Transfer UUID must be defined")
        return 2

    # Ingest (exactly one UUID is set, so no transfer means a SIP)
    if not transfer_uuid:
        new_uuid = str(uuid.uuid4())
        relative_path = file_path.replace(sip_directory, "%SIPDirectory%", 1)
        addFileToSIP(relative_path, new_uuid, sip_uuid, event_uuid, date,
                     use=use)
        return 0

    # Transfer
    relative_path = file_path.replace(sip_directory, "%transferDirectory%", 1)
    transfer = Transfer.objects.get(uuid=transfer_uuid)
    is_reingest = transfer.type == "Archivematica AIP"
    event_type = "ingestion"
    if is_reingest:
        # For reingest, parse information from the METS
        info = get_file_info_from_mets(job, sip_directory, relative_path)
        event_type = "reingestion"
        file_uuid = info.get("uuid", file_uuid)
        use = info.get("filegrpuse", use)
        relative_path = info.get("original_path", relative_path)
    if not file_uuid:
        file_uuid = str(uuid.uuid4())
        logger.info("Generated UUID for this file: %s.", file_uuid)
    addFileToTransfer(
        relative_path,
        file_uuid,
        transfer_uuid,
        event_uuid,
        date,
        use=use,
        sourceType=event_type,
    )
    if is_reingest:
        # The original location was parsed from the METS; update the current
        # location to reflect what's on disk
        job.print_output("updating current location for", file_uuid,
                         "with", info)
        File.objects.filter(uuid=file_uuid).update(
            currentlocation=info["current_path"])
    return 0
def onceNormalized(command, opts, replacementDic):
    """Record the outputs of a finished normalization command.

    Collects the file at command.outputLocation, or every file found under it
    when it is a directory. For each collected file, when
    opts["commandClassifications"] is "preservation", the file is added to the
    SIP, a "normalization" event (whose detail starts with the command's pk)
    is inserted for opts["fileUUID"], size and checksum are updated, a
    derivation row is inserted and a FilesIDs row is written with
    command.outputFormat. replacementDic then receives a fresh
    "%outputFileUUID%" and the matching "%postfix%".

    Side effects on command: a falsy outputLocation is replaced by "", and
    exitCode is set to -2 when a non-empty outputLocation does not exist.
    """
    if not command.outputLocation:
        command.outputLocation = ""
    location = command.outputLocation

    found = []
    if os.path.isfile(location):
        found.append(location)
    elif os.path.isdir(location):
        for dirpath, _dirnames, filenames in os.walk(location):
            for filename in filenames:
                fullpath = os.path.join(dirpath, filename)
                if os.path.isfile(fullpath):
                    found.append(fullpath)
    elif location:
        print >> sys.stderr, command
        print >> sys.stderr, "Error - output file does not exist [" + location + "]"
        command.exitCode = -2

    # One event UUID is shared by every output of this command.
    derivationEventUUID = str(uuid.uuid4())
    eventDetail = "ArchivematicaFPRCommandID=\"%s\"" % (command.pk)
    detailCommand = command.eventDetailCommand
    if detailCommand is not None:
        eventDetail = '%s; %s' % (eventDetail, detailCommand.stdOut)

    for transcoded in found:
        if opts["commandClassifications"] != "preservation":
            continue

        # TODO Add manual normalization for files of same name mapping
        currentUUID = replacementDic["%outputFileUUID%"]
        filePathRelativeToSIP = transcoded.replace(opts["sipPath"],
                                                   "%SIPDirectory%", 1)

        # Add the new file to the sip
        addFileToSIP(filePathRelativeToSIP,
                     currentUUID,
                     opts["sipUUID"],
                     str(uuid.uuid4()),
                     opts["date"],
                     sourceType="creation",
                     use="preservation")

        # Add event information to the source file
        insertIntoEvents(fileUUID=opts["fileUUID"],
                         eventIdentifierUUID=derivationEventUUID,
                         eventType="normalization",
                         eventDateTime=opts["date"],
                         eventDetail=eventDetail,
                         eventOutcome="",
                         eventOutcomeDetailNote=filePathRelativeToSIP)

        updateSizeAndChecksum(currentUUID, transcoded, opts["date"],
                              str(uuid.uuid4()))

        # Add linking information between files
        insertIntoDerivations(sourceFileUUID=opts["fileUUID"],
                              derivedFileUUID=currentUUID,
                              relatedEventUUID=derivationEventUUID)

        # NOTE(review): values are interpolated into the SQL text unescaped;
        # confirm command.outputFormat can never contain a quote.
        sql = "INSERT INTO FilesIDs (fileUUID, formatName, formatVersion, formatRegistryName, formatRegistryKey) VALUES ('%s', '%s', NULL, NULL, NULL);" % (
            currentUUID, command.outputFormat)
        databaseInterface.runSQL(sql)

        # Prepare a fresh UUID (and file-name postfix) for the next output.
        upcomingUUID = str(uuid.uuid4())
        replacementDic["%outputFileUUID%"] = upcomingUUID
        replacementDic["%postfix%"] = "-" + upcomingUUID
Example #13
0
def create_mets_file(aic, aips):
    """ Create AIC METS file with AIP information. """

    # Prepare constants
    nsmap = {
        'mets': ns.metsNS,
        'xlink': ns.xlinkNS,
        'xsi': ns.xsiNS,
    }
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")

    # Set up structure
    E = ElementMaker(namespace=ns.metsNS, nsmap=nsmap)
    mets = (
        E.mets(
            E.metsHdr(CREATEDATE=now),
            E.dmdSec(
                E.mdWrap(
                    E.xmlData(),
                    MDTYPE="DC",  # mdWrap
                ),
                ID='dmdSec_1',  # dmdSec
            ),
            E.fileSec(E.fileGrp(), ),
            E.structMap(
                E.div(
                    TYPE="Archival Information Collection",
                    DMDID="dmdSec_1",
                ),
                TYPE='logical',  # structMap
            ),
        ))
    mets.attrib['{{{ns}}}schemaLocation'.format(
        ns=nsmap['xsi']
    )] = "http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd"

    # Add Dublin Core info
    xml_data = mets.find('mets:dmdSec/mets:mdWrap/mets:xmlData',
                         namespaces=ns.NSMAP)
    dublincore = archivematicaCreateMETS2.getDublinCore(
        archivematicaCreateMETS2.SIPMetadataAppliesToType, aic['uuid'])
    # Add <extent> with number of AIPs
    extent = etree.SubElement(dublincore, ns.dctermsBNS + 'extent')
    extent.text = "{} AIPs".format(len(aips))
    xml_data.append(dublincore)

    # Add elements for each AIP
    file_grp = mets.find('mets:fileSec/mets:fileGrp', namespaces=ns.NSMAP)
    struct_div = mets.find('mets:structMap/mets:div', namespaces=ns.NSMAP)
    for aip in aips:
        file_id = '{name}-{uuid}'.format(name=aip['name'], uuid=aip['uuid'])
        etree.SubElement(file_grp, ns.metsBNS + 'file', ID=file_id)

        label = aip['label'] or aip['name']
        div = etree.SubElement(struct_div, ns.metsBNS + 'div', LABEL=label)
        etree.SubElement(div, ns.metsBNS + 'fptr', FILEID=file_id)

    print etree.tostring(mets, pretty_print=True)

    # Write out the file
    file_uuid = str(uuid.uuid4())
    basename = os.path.join('metadata', "METS.{}.xml".format(file_uuid))
    filename = os.path.join(aic['dir'], basename)
    with open(filename, 'w') as f:
        f.write(etree.tostring(mets, pretty_print=True))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP='%SIPDirectory%' + basename,
        fileUUID=file_uuid,
        sipUUID=aic['uuid'],
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
        use='metadata')
    # To make this work with the createMETS2 (for SIPs)
    databaseFunctions.insertIntoDerivations(file_uuid, file_uuid)

    # Insert the count of AIPs in the AIC into UnitVariables, so it can be
    # indexed later
    UnitVariable.objects.create(unittype="SIP",
                                unituuid=aic['uuid'],
                                variable="AIPsinAIC",
                                variablevalue=str(len(aips)))
    # Command-line options for registering one file with a transfer or a SIP.
    # NOTE(review): the parser construction and the option that defines
    # opts.fileUUID are not among the lines shown here -- presumably declared
    # just above; confirm.
    parser.add_option("-p",  "--filePath",          action="store", dest="filePath", default="")
    parser.add_option("-d",  "--date",              action="store", dest="date", default="")
    parser.add_option("-u",  "--eventIdentifierUUID", action="store", dest="eventIdentifierUUID", default="")
    parser.add_option("-s",  "--sipDirectory", action="store", dest="sipDirectory", default="")
    parser.add_option("-S",  "--sipUUID", action="store", dest="sipUUID", default="")
    parser.add_option("-T",  "--transferUUID", action="store", dest="transferUUID", default="")
    parser.add_option("-e",  "--use", action="store", dest="use", default="original")


    (opts, args) = parser.parse_args()
    # opts2 is only referenced by the disabled decoding loop below.
    opts2 = vars(opts)
#    for key, value in opts2.iteritems():
#        print type(key), key, type(value), value
#        exec 'opts.' + key + ' = value.decode("utf-8")'
    # Generate a UUID when none was supplied (empty or the literal "None").
    fileUUID = opts.fileUUID
    if not fileUUID or fileUUID == "None":
        fileUUID = uuid.uuid4().__str__()


    # Exactly one of --sipUUID / --transferUUID must be non-empty.
    # Transfer only: path is made relative to "%transferDirectory%".
    if opts.sipUUID == "" and opts.transferUUID != "":
        filePathRelativeToSIP = opts.filePath.replace(opts.sipDirectory,"%transferDirectory%", 1)
        addFileToTransfer(filePathRelativeToSIP, fileUUID, opts.transferUUID, opts.eventIdentifierUUID, opts.date, use=opts.use)

    # SIP only: path is made relative to "%SIPDirectory%".
    elif opts.sipUUID != "" and opts.transferUUID == "":
        filePathRelativeToSIP = opts.filePath.replace(opts.sipDirectory,"%SIPDirectory%", 1)
        addFileToSIP(filePathRelativeToSIP, fileUUID, opts.sipUUID, opts.eventIdentifierUUID, opts.date, use=opts.use)

    # Both or neither given: report on stderr and exit with status 2.
    else:
        print >>sys.stderr, "SIP exclusive-or Transfer uuid must be defined"
        exit(2)
Example #15
0
def once_normalized(job, command, opts, replacement_dict):
    """ Record normalization outputs in the database.

    Callback from transcoder.Command

    Every file found at command.output_location (a single file, or all files
    under a directory) is added to the SIP with opts.task_uuid as its UUID
    and has its size and checksum updated, unless opts.purpose contains
    "thumbnails". When opts.purpose contains "preservation" a derivation
    event is inserted; otherwise a Derivation row without an event is saved.
    Format information from command.fpcommand.output_format is stored as a
    FileFormatVersion and a FileID.

    Side effects on command: a falsy output_location is replaced by "", and
    exit_code is set to -2 when a non-empty output_location does not exist
    (also reported through job.print_error). replacement_dict is not used.
    """
    if not command.output_location:
        command.output_location = ""
    location = command.output_location

    produced = []
    if os.path.isfile(location):
        produced.append(location)
    elif os.path.isdir(location):
        for root, _, names in os.walk(location):
            for name in names:
                candidate = os.path.join(root, name)
                if os.path.isfile(candidate):
                    produced.append(candidate)
    elif location:
        job.print_error("Error - output file does not exist [",
                        location, "]")
        command.exit_code = -2

    # One derivation event UUID is shared by every output of this command.
    derivation_event_uuid = str(uuid.uuid4())
    event_detail_output = 'ArchivematicaFPRCommandID="{}"'.format(
        command.fpcommand.uuid)
    detail_command = command.event_detail_command
    if detail_command is not None:
        event_detail_output += '; {}'.format(detail_command.std_out)

    for produced_path in produced:
        if "thumbnails" in opts.purpose:
            continue

        now = timezone.now()
        new_file_uuid = opts.task_uuid  # Match the UUID on disk
        # TODO Add manual normalization for files of same name mapping?
        relative_path = produced_path.replace(opts.sip_path,
                                              "%SIPDirectory%", 1)

        # Add the new file to the SIP
        fileOperations.addFileToSIP(
            relative_path,
            new_file_uuid,
            opts.sip_uuid,
            opts.task_uuid,
            now,
            sourceType="creation",
            use=opts.purpose,
        )

        # Calculate new file checksum; the event gets a UUID of its own
        fileOperations.updateSizeAndChecksum(
            new_file_uuid, produced_path, now, str(uuid.uuid4()))

        if "preservation" in opts.purpose:
            # Preservation copies get both the event and the Derivations row
            insert_derivation_event(
                original_uuid=opts.file_uuid,
                output_uuid=new_file_uuid,
                derivation_uuid=derivation_event_uuid,
                event_detail_output=event_detail_output,
                outcome_detail_note=relative_path,
                today=now,
            )
        else:
            # Other derivatives go into the Derivations table, but don't get
            # added to the PREMIS Events because they will not appear in the
            # METS.
            Derivation(source_file_id=opts.file_uuid,
                       derived_file_id=new_file_uuid,
                       event=None).save()

        # Use the format info from the normalization command to save
        # identification into the DB
        output_format = command.fpcommand.output_format
        FileFormatVersion(file_uuid_id=new_file_uuid,
                          format_version=output_format).save()

        FileID.objects.create(
            file_id=new_file_uuid,
            format_name=output_format.format.description)
def create_mets_file(aic, aips, job):
    """Create the AIC METS file and register it in the database.

    Builds a METS document with a Dublin Core ``dmdSec``, one ``file`` entry
    and one logical ``structMap`` div per AIP, writes it to
    ``metadata/METS.<uuid>.xml`` below ``aic["dir"]``, registers that file
    against the AIC and stores the number of AIPs as a unit variable.

    :param aic: mapping read for its ``"uuid"`` and ``"dir"`` keys.
    :param aips: sized collection of mappings, each read for its ``"name"``,
        ``"uuid"`` and ``"label"`` keys.
    :param job: object whose ``pyprint`` method receives the serialized METS.
    """

    # Prepare constants
    nsmap = {"mets": ns.metsNS, "xlink": ns.xlinkNS, "xsi": ns.xsiNS}
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")
    aip_count = len(aips)

    # Set up structure
    E = ElementMaker(namespace=ns.metsNS, nsmap=nsmap)
    mets = E.mets(
        E.metsHdr(CREATEDATE=now),
        E.dmdSec(E.mdWrap(E.xmlData(), MDTYPE="DC"),
                 ID="dmdSec_1"),  # mdWrap  # dmdSec
        E.fileSec(E.fileGrp()),
        E.structMap(
            E.div(TYPE="Archival Information Collection", DMDID="dmdSec_1"),
            TYPE="logical",  # structMap
        ),
    )
    # Namespaced attributes cannot be passed as keyword arguments, so the
    # xsi:schemaLocation attribute is set after construction.
    mets.attrib["{{{ns}}}schemaLocation".format(
        ns=nsmap["xsi"]
    )] = "http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version1121/mets.xsd"

    # Add Dublin Core info
    xml_data = mets.find("mets:dmdSec/mets:mdWrap/mets:xmlData",
                         namespaces=ns.NSMAP)
    dublincore = create_mets_v2.getDublinCore(
        create_mets_v2.SIPMetadataAppliesToType, aic["uuid"])
    # Add <extent> with number of AIPs
    extent = etree.SubElement(dublincore, ns.dctermsBNS + "extent")
    extent.text = "{} AIPs".format(aip_count)
    xml_data.append(dublincore)

    # Add elements for each AIP
    file_grp = mets.find("mets:fileSec/mets:fileGrp", namespaces=ns.NSMAP)
    struct_div = mets.find("mets:structMap/mets:div", namespaces=ns.NSMAP)
    for aip in aips:
        file_id = "{name}-{uuid}".format(name=aip["name"], uuid=aip["uuid"])
        etree.SubElement(file_grp, ns.metsBNS + "file", ID=file_id)

        # Fall back to the AIP name when no label was supplied.
        label = aip["label"] or aip["name"]
        div = etree.SubElement(struct_div, ns.metsBNS + "div", LABEL=label)
        etree.SubElement(div, ns.metsBNS + "fptr", FILEID=file_id)

    job.pyprint(etree.tostring(mets, pretty_print=True))

    # Write out the file
    file_uuid = str(uuid.uuid4())
    basename = os.path.join("metadata", "METS.{}.xml".format(file_uuid))
    filename = os.path.join(aic["dir"], basename)
    # etree.tostring() with an explicit byte encoding returns a byte string,
    # so the file is opened in binary mode: this avoids a TypeError on
    # Python 3 and any platform newline translation of the encoded bytes.
    with open(filename, "wb") as f:
        f.write(
            etree.tostring(mets,
                           pretty_print=True,
                           xml_declaration=True,
                           encoding="utf-8"))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP="%SIPDirectory%" + basename,
        fileUUID=file_uuid,
        sipUUID=aic["uuid"],
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
        use="metadata",
    )
    # To make this work with the createMETS2 (for SIPs)
    databaseFunctions.insertIntoDerivations(file_uuid, file_uuid)

    # Insert the count of AIPs in the AIC into UnitVariables, so it can be
    # indexed later
    UnitVariable.objects.create(
        unittype="SIP",
        unituuid=aic["uuid"],
        variable="AIPsinAIC",
        variablevalue=str(aip_count),
    )
    # --use sets the file group use recorded for the file (default
    # "original"); --disable-update-filegrpuse turns opts.update_use off.
    parser.add_option("-e",  "--use", action="store", dest="use", default="original")
    parser.add_option("--disable-update-filegrpuse", action="store_false", dest="update_use", default=True)


    (opts, args) = parser.parse_args()
    # NOTE(review): opts2 is only referenced by the commented-out decoding
    # loop below in the code visible here.
    opts2 = vars(opts)
#    for key, value in opts2.iteritems():
#        print type(key), key, type(value), value
#        exec 'opts.' + key + ' = value.decode("utf-8")'
    fileUUID = opts.fileUUID
    # A missing UUID (empty or the literal string "None") means the file is
    # new and gets a freshly generated UUID.
    if not fileUUID or fileUUID == "None":
        fileUUID = uuid.uuid4().__str__()
    else:
        # The file is already known: optionally refresh its file group use
        # and stop with a success status without registering it again.
        print >>sys.stderr, "File already has UUID:", fileUUID
        if opts.update_use:
            File.objects.filter(uuid=fileUUID).update(filegrpuse=opts.use)
        exit(0) 


    # Exactly one of the SIP UUID and the transfer UUID must be non-empty.
    # The leading directory (first occurrence only) is swapped for the
    # matching placeholder before the file is registered.
    if opts.sipUUID == "" and opts.transferUUID != "":
        # Transfer case; note the directory prefix still comes from
        # opts.sipDirectory.
        filePathRelativeToSIP = opts.filePath.replace(opts.sipDirectory,"%transferDirectory%", 1)
        addFileToTransfer(filePathRelativeToSIP, fileUUID, opts.transferUUID, opts.eventIdentifierUUID, opts.date, use=opts.use)

    elif opts.sipUUID != "" and opts.transferUUID == "":
        # SIP case.
        filePathRelativeToSIP = opts.filePath.replace(opts.sipDirectory,"%SIPDirectory%", 1)
        addFileToSIP(filePathRelativeToSIP, fileUUID, opts.sipUUID, opts.eventIdentifierUUID, opts.date, use=opts.use)

    else:
        # Both or neither UUID supplied: report the error and exit with
        # status 2.
        print >>sys.stderr, "SIP exclusive-or Transfer uuid must be defined"
        exit(2)
Example #18
0
def main(aip_uuid, aip_name, compression, sip_dir, aip_filename):
    """Write a METS pointer file (``pointer.xml``) for a compressed AIP.

    The pointer file records the AIP's size, checksum, format and the
    transformations needed to decompress it, and is written next to the AIP
    file and registered against the AIP in the database.

    :param aip_uuid: UUID of the AIP.
    :param aip_name: name of the AIP; combined with the UUID to form the
        METS file ID.
    :param compression: ``"<program>-<algorithm>"`` string; the program is
        one of ``7z``, ``pbzip2`` or ``None``.
    :param sip_dir: directory containing the AIP file.
    :param aip_filename: file name of the AIP inside ``sip_dir``.
    :returns: 0 on success or when no pointer file is needed (uncompressed
        AIP); -1 when the compression program or file extension is not
        supported, or the AIP file cannot be read.
    """
    # Get archive tool and version.  partition() tolerates a value with no
    # '-' separator (the algorithm is then empty) instead of raising.
    program, _, algorithm = compression.partition('-')

    # Pointer files are not written for uncompressed AIPs;
    # the purpose of the pointer file is primarily to provide information
    # on how to read a compressed AIP file, so there isn't anything for
    # it to do when pointing at an uncompressed AIP.
    if program == 'None':
        return 0

    if program == '7z':
        archive_tool = '7-Zip'
        archive_tool_version = '9.20'  # TODO get this dynamically
    elif program == 'pbzip2':
        archive_tool = program
        archive_tool_version = '1.1.6'  # TODO get this dynamically
    else:
        # Fail early with a clear message rather than with an unbound local
        # while the PREMIS object is being built.
        print("Unsupported compression program {}.  Aborting.".format(program), file=sys.stderr)
        return -1

    # Format / file extension
    _, extension = os.path.splitext(aip_filename)
    # PRONOM ID and PRONOM name for each file extension
    pronom_conversion = {
        '.7z': {'puid': 'fmt/484', 'name': '7Zip format'},
        '.bz2': {'puid': 'x-fmt/268', 'name': 'BZIP2 Compressed Archive'},
    }
    try:
        pronom_format = pronom_conversion[extension]
    except KeyError:
        print("Unsupported AIP file extension {}.  Aborting.".format(extension), file=sys.stderr)
        return -1

    num_files = 1
    aip_path = os.path.join(sip_dir, aip_filename)
    # Get size
    try:
        aip_size = os.path.getsize(aip_path)
    except os.error:
        print("File {} does not exist or is inaccessible.  Aborting.".format(aip_path), file=sys.stderr)
        return -1

    # Prep work
    mets_schema_location = 'http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd'
    premis_schema_location = 'info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-2.xsd'
    # Datetime format string from http://docs.python.org/2/library/datetime.html
    # %Y = 4 digit year, %m = 2 digit month, %d = 2 digit day
    # %H = 24-hour hour, %M = 2-digit minute, %S = 2 digit second
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")
    aip_identifier = aip_name+'-'+aip_uuid

    # Calculate checksum
    checksum_algorithm = get_setting('checksum_type', 'sha256')
    checksum = get_file_checksum(aip_path, checksum_algorithm)
    # Get package type (AIP, AIC)
    sip_metadata_uuid = '3e48343d-e2d2-4956-aaa3-b54d26eb9761'

    try:
        dc = DublinCore.objects.get(metadataappliestotype_id=sip_metadata_uuid,
                                    metadataappliestoidentifier=aip_uuid)
    except DublinCore.DoesNotExist:
        package_type = "Archival Information Package"
    else:
        package_type = dc.type

    # Namespaces
    nsmap = {
        # Default, unprefixed namespace
        'mets': namespaces.metsNS,
        'xsi': namespaces.xsiNS,
        'xlink': namespaces.xlinkNS,
    }
    # Set up structure
    E = ElementMaker(namespace=namespaces.metsNS, nsmap=nsmap)
    E_P = ElementMaker(namespace=namespaces.premisNS, nsmap={'premis': namespaces.premisNS})

    root = (
        E.mets(
            E.metsHdr(CREATEDATE=now),
            # amdSec goes here
            E.fileSec(
                E.fileGrp(USE='Archival Information Package'),
            ),
            E.structMap(
                TYPE='physical'
            ),
        )
    )
    # Namespaced attributes have to be added separately - don't know how to do
    # inline with E
    root.attrib[namespaces.xsiBNS+'schemaLocation'] = mets_schema_location

    add_amdsec_after = root.find('mets:metsHdr', namespaces=namespaces.NSMAP)
    filegrp = root.find('.//mets:fileGrp', namespaces=namespaces.NSMAP)
    structmap = root.find('.//mets:structMap', namespaces=namespaces.NSMAP)
    # For each file, add amdSec, file, fptr
    for admin_id in range(1, num_files+1):

        # amdSec
        amdsec_id = 'amdSec_{}'.format(admin_id)
        amdsec = E.amdSec(
            E.techMD(
                E.mdWrap(
                    E.xmlData(
                    ),
                    MDTYPE='PREMIS:OBJECT',  # mdWrap
                ),
                ID='techMD_1',  # techMD
            ),
            ID=amdsec_id,  # amdSec
        )
        # Add PREMIS:OBJECT
        obj = E_P.object(
            E_P.objectIdentifier(
                E_P.objectIdentifierType('UUID'),
                E_P.objectIdentifierValue(aip_uuid),
            ),
            E_P.objectCharacteristics(
                E_P.compositionLevel('1'),
                E_P.fixity(
                    E_P.messageDigestAlgorithm(checksum_algorithm),
                    E_P.messageDigest(checksum),
                ),
                E_P.size(str(aip_size)),
                E_P.format(
                    E_P.formatDesignation(
                        E_P.formatName(
                            pronom_format['name']),
                        E_P.formatVersion(),
                    ),
                    E_P.formatRegistry(
                        E_P.formatRegistryName('PRONOM'),
                        E_P.formatRegistryKey(
                            pronom_format['puid'])
                    ),
                ),
                E_P.creatingApplication(
                    E_P.creatingApplicationName(archive_tool),
                    E_P.creatingApplicationVersion(archive_tool_version),
                    E_P.dateCreatedByApplication(now),
                ),
            ),
            version='2.2',
        )
        obj.attrib[namespaces.xsiBNS+'type'] = 'premis:file'
        obj.attrib[namespaces.xsiBNS+'schemaLocation'] = premis_schema_location

        # Add as child of xmldata
        amdsec.find('.//mets:mdWrap[@MDTYPE="PREMIS:OBJECT"]/mets:xmlData', namespaces=namespaces.NSMAP).append(obj)

        # Add PREMIS:EVENT for compression & PREMIS:AGENTs
        # use archivematicaCreateMETS2 code
        elements = archivematicaCreateMETS2.createDigiprovMD(aip_uuid)
        for element in elements:
            amdsec.append(element)

        # add amdSec after previous amdSec (or metsHdr if first one)
        add_amdsec_after.addnext(amdsec)
        add_amdsec_after = amdsec

        # fileGrp
        file_ = E.file(
            E.FLocat(
                LOCTYPE="OTHER",
                OTHERLOCTYPE="SYSTEM",
            ),
            ID=aip_identifier
        )
        filegrp.append(file_)
        flocat = file_.find('mets:FLocat', namespaces=namespaces.NSMAP)
        flocat.attrib['{{{ns}}}href'.format(ns=namespaces.xlinkNS)] = aip_path

        # compression - 7z or tar.bz2
        if extension == '.7z':
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                TRANSFORMORDER='1',
                TRANSFORMTYPE='decompression',
                TRANSFORMALGORITHM=algorithm)
        elif extension == '.bz2':
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                TRANSFORMORDER='1',
                TRANSFORMTYPE='decompression',
                TRANSFORMALGORITHM='bzip2')
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                TRANSFORMORDER='2',
                TRANSFORMTYPE='decompression',
                TRANSFORMALGORITHM='tar')

        # structMap
        div = etree.SubElement(structmap, namespaces.metsBNS+'div', ADMID=amdsec_id, TYPE=package_type)
        etree.SubElement(div, namespaces.metsBNS+'fptr', FILEID=aip_identifier)

    print(etree.tostring(root, pretty_print=True))

    # Write out pointer.xml
    xml_filename = 'pointer.xml'
    filename = os.path.join(os.path.dirname(aip_path), xml_filename)
    # etree.tostring() with a byte encoding returns a byte string, so the
    # file is opened in binary mode to avoid a TypeError on Python 3.
    with open(filename, 'wb') as f:
        f.write(etree.tostring(root, pretty_print=True, xml_declaration=True, encoding='utf-8'))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP='%SIPDirectory%'+xml_filename,
        fileUUID=str(uuid.uuid4()),
        sipUUID=aip_uuid,
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
    )
    return 0
def onceNormalized(command):
    """Record the output of a finished normalization command.

    Collects the files found at ``command.outputLocation`` and, when the
    command classification is "preservation", registers each one with the
    SIP, logs a normalization event on the source file, updates the new
    file's size and checksum and links the source to the derived file.

    Relies on the module-level ``opts``, ``outputFileUUID`` and
    ``replacementDic``; the last two are updated after every processed file.
    On a non-empty output location that is neither a file nor a directory,
    an error is printed and ``command.exitCode`` is set to -2.
    """
    transcodedFiles = []
    # Work out which files the command produced.
    if not command.outputLocation:
        command.outputLocation = ""
    elif os.path.isfile(command.outputLocation):
        transcodedFiles.append(command.outputLocation)
    elif os.path.isdir(command.outputLocation):
        # A directory output: collect every regular file below it.
        for w in os.walk(command.outputLocation):
            path, directories, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcodedFiles.append(p)
    elif command.outputLocation:
        # An output location was given but nothing exists there.
        print >> sys.stderr, command
        print >> sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]"
        command.exitCode = -2

    # Generated once, so every output file of this command is recorded
    # against the same event UUID.
    derivationEventUUID = uuid.uuid4().__str__()
    for ef in transcodedFiles:
        global outputFileUUID
        global replacementDic
        global opts
        if opts.commandClassifications == "preservation":
            # NOTE(review): `old` holds the text of a former xmlNormalize
            # call and is not read anywhere in the code visible here.
            old = """xmlNormalize(outputFileUUID, \
                     ef, \
                     command.eventDetailCommand.stdOut, \
                     opts.fileUUID, \
                     opts.objectsDirectory, \
                     opts.taskUUID, \
                     opts.date, \
                     opts.logsDirectory, \
                     ) #    {normalized; not normalized}"""

            # Add the new file to the SIP, with its path expressed relative
            # to the SIP directory placeholder (first occurrence only).
            filePathRelativeToSIP = ef.replace(opts.sipPath, "%SIPDirectory%",
                                               1)
            # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"):
            addFileToSIP(filePathRelativeToSIP,
                         outputFileUUID,
                         opts.sipUUID,
                         uuid.uuid4().__str__(),
                         opts.date,
                         sourceType="creation",
                         use="preservation")
            # Calculate new file checksum
            # NOTE(review): this TODO message is still printed although
            # updateSizeAndChecksum is called further down.
            print >> sys.stderr, "TODO: calculate new file checksum"
            # Add event information to the source (original) file
            insertIntoEvents(fileUUID=opts.fileUUID, \
               eventIdentifierUUID=derivationEventUUID, \
               eventType="normalization", \
               eventDateTime=opts.date, \
               eventDetail=command.eventDetailCommand.stdOut, \
               eventOutcome="", \
               eventOutcomeDetailNote=filePathRelativeToSIP)

            # Record size and checksum of the newly created file.
            updateSizeAndChecksum(outputFileUUID, ef, opts.date,
                                  uuid.uuid4().__str__())

            # Add linking information between files
            insertIntoDerivations(sourceFileUUID=opts.fileUUID,
                                  derivedFileUUID=outputFileUUID,
                                  relatedEventUUID=derivationEventUUID)

            # Prepare a fresh UUID (and matching "%postfix%" replacement)
            # for the next output file.
            outputFileUUID = uuid.uuid4().__str__()
            replacementDic["%postfix%"] = "-" + outputFileUUID