def xmlCreateFileAssociationBetween(originalFileFullPath, outputFromNormalizationFileFullPath, SIPFullPath, sipUUID, eventDetailText, eventOutcomeDetailNote, outputFileUUID=""):
    # Assign file UUID
    date = databaseInterface.getUTCDate()
    if outputFileUUID == "":
        outputFileUUID = uuid.uuid4().__str__()

    originalFilePathRelativeToSIP = originalFileFullPath.replace(SIPFullPath, "%SIPDirectory%", 1)
    sql = "SELECT Files.fileUUID FROM Files WHERE removedTime = 0 AND Files.currentLocation = '" + MySQLdb.escape_string(originalFilePathRelativeToSIP) + "' AND Files.sipUUID = '" + sipUUID + "';"
    print sql
    rows = databaseInterface.queryAllSQL(sql)
    print rows
    fileUUID = rows[0][0]

    filePathRelativeToSIP = outputFromNormalizationFileFullPath.replace(SIPFullPath, "%SIPDirectory%", 1)
    addFileToSIP(filePathRelativeToSIP, outputFileUUID, sipUUID,
                 uuid.uuid4().__str__(), date,
                 sourceType="creation", use="preservation")
    updateSizeAndChecksum(outputFileUUID, outputFromNormalizationFileFullPath,
                          date, uuid.uuid4().__str__())

    taskUUID = uuid.uuid4().__str__()
    insertIntoEvents(fileUUID=fileUUID,
                     eventIdentifierUUID=taskUUID,
                     eventType="normalization",
                     eventDateTime=date,
                     eventDetail=eventDetailText,
                     eventOutcome="",
                     eventOutcomeDetailNote=eventOutcomeDetailNote)
    insertIntoDerivations(sourceFileUUID=fileUUID,
                          derivedFileUUID=outputFileUUID,
                          relatedEventUUID=taskUUID)
def insert_file_into_database(file_uuid, sip_uuid, event_uuid, rule, output_path, relative_path):
    transcription_uuid = str(uuid4())
    today = timezone.now()
    fileOperations.addFileToSIP(
        relative_path,
        transcription_uuid,
        sip_uuid,
        event_uuid,  # task UUID; the original referenced an undefined `task_uuid`
        today,
        sourceType="creation",
        use="text/ocr",
    )
    fileOperations.updateSizeAndChecksum(
        transcription_uuid,
        output_path,
        today,
        str(uuid4()),
    )
    # Link the transcription back to the file it was derived from
    databaseFunctions.insertIntoDerivations(
        sourceFileUUID=file_uuid,
        derivedFileUUID=transcription_uuid,
        relatedEventUUID=event_uuid,
    )
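# A minimal, hypothetical sketch (not from the original source) of how a caller
# might use insert_file_into_database() after an OCR/transcription command has
# written its output. `source_file_uuid`, `sip_uuid`, `sip_path`, `rule`, and
# `output_path` are placeholder names, not part of the original code.
def example_record_transcription(source_file_uuid, sip_uuid, sip_path, rule, output_path):
    event_uuid = str(uuid4())  # event that produced the transcription
    relative_path = output_path.replace(sip_path, "%SIPDirectory%", 1)
    insert_file_into_database(source_file_uuid, sip_uuid, event_uuid,
                              rule, output_path, relative_path)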
def onceNormalized(command, opts, replacementDic): transcodedFiles = [] if not command.outputLocation: command.outputLocation = "" if os.path.isfile(command.outputLocation): transcodedFiles.append(command.outputLocation) elif os.path.isdir(command.outputLocation): for w in os.walk(command.outputLocation): path, directories, files = w for p in files: p = os.path.join(path, p) if os.path.isfile(p): transcodedFiles.append(p) elif command.outputLocation: print >> sys.stderr, command print >> sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]" command.exitCode = -2 derivationEventUUID = uuid.uuid4().__str__() eventDetail = "" if command.eventDetailCommand != None: eventDetail = eventDetail = command.eventDetailCommand.stdOut for ef in transcodedFiles: if opts["commandClassifications"] == "preservation": # Add the new file to the sip filePathRelativeToSIP = ef.replace(opts["sipPath"], "%SIPDirectory%", 1) # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"): addFileToSIP( filePathRelativeToSIP, replacementDic["%outputFileUUID%"], opts["sipUUID"], uuid.uuid4().__str__(), opts["date"], sourceType="creation", use="preservation", ) # Calculate new file checksum # Add event information to current file insertIntoEvents( fileUUID=opts["fileUUID"], eventIdentifierUUID=derivationEventUUID, eventType="normalization", eventDateTime=opts["date"], eventDetail=eventDetail, eventOutcome="", eventOutcomeDetailNote=filePathRelativeToSIP, ) updateSizeAndChecksum(replacementDic["%outputFileUUID%"], ef, opts["date"], uuid.uuid4().__str__()) # Add linking information between files insertIntoDerivations( sourceFileUUID=opts["fileUUID"], derivedFileUUID=replacementDic["%outputFileUUID%"], relatedEventUUID=derivationEventUUID, ) replacementDic["%outputFileUUID%"] = uuid.uuid4().__str__() replacementDic["%postfix%"] = "-" + replacementDic["%outputFileUUID%"]
def onceNormalized(command): transcodedFiles = [] if not command.outputLocation: command.outputLocation = "" elif os.path.isfile(command.outputLocation): transcodedFiles.append(command.outputLocation) elif os.path.isdir(command.outputLocation): for w in os.walk(command.outputLocation): path, directories, files = w for p in files: p = os.path.join(path, p) if os.path.isfile(p): transcodedFiles.append(p) elif command.outputLocation: print >>sys.stderr, command print >>sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]" command.exitCode = -2 derivationEventUUID = uuid.uuid4().__str__() for ef in transcodedFiles: global outputFileUUID global replacementDic global opts if opts.commandClassifications == "preservation": old = """xmlNormalize(outputFileUUID, \ ef, \ command.eventDetailCommand.stdOut, \ opts.fileUUID, \ opts.objectsDirectory, \ opts.taskUUID, \ opts.date, \ opts.logsDirectory, \ ) # {normalized; not normalized}""" #Add the new file to the sip filePathRelativeToSIP = ef.replace(opts.sipPath, "%SIPDirectory%", 1) # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"): addFileToSIP(filePathRelativeToSIP, outputFileUUID, opts.sipUUID, uuid.uuid4().__str__(), opts.date, sourceType="creation", use="preservation") #Calculate new file checksum print >>sys.stderr, "TODO: calculate new file checksum" #Add event information to current file insertIntoEvents(fileUUID=opts.fileUUID, \ eventIdentifierUUID=derivationEventUUID, \ eventType="normalization", \ eventDateTime=opts.date, \ eventDetail=command.eventDetailCommand.stdOut, \ eventOutcome="", \ eventOutcomeDetailNote=filePathRelativeToSIP) updateSizeAndChecksum(outputFileUUID, ef, opts.date, uuid.uuid4().__str__()) #Add linking information between files insertIntoDerivations(sourceFileUUID=opts.fileUUID, derivedFileUUID=outputFileUUID, relatedEventUUID=derivationEventUUID) outputFileUUID = uuid.uuid4().__str__() replacementDic["%postfix%"] = "-" + outputFileUUID
def preservation(files, opts):
    # `files` (full file paths) and `opts` (SIP context) are assumed
    # parameters; the original referenced both without defining them.
    for filePath in files:
        # Create a database entry for the file
        fileUUID = uuid.uuid4().__str__()
        filePathRelativeToSIP = filePath.replace(opts.sipDirectory, "%SIPDirectory%", 1)
        addFileToSIP(filePathRelativeToSIP, fileUUID, opts.sipUUID,
                     opts.eventIdentifierUUID, opts.date, use=opts.use)
        updateSizeAndChecksum(fileUUID,  # was opts.fileUUID, which ignored the new entry
                              filePath,
                              opts.date,
                              opts.eventIdentifierUUID)
def onceNormalized(command, opts, replacementDic): transcodedFiles = [] if not command.outputLocation: command.outputLocation = "" if os.path.isfile(command.outputLocation): transcodedFiles.append(command.outputLocation) elif os.path.isdir(command.outputLocation): for w in os.walk(command.outputLocation): path, directories, files = w for p in files: p = os.path.join(path, p) if os.path.isfile(p): transcodedFiles.append(p) elif command.outputLocation: print >>sys.stderr, command print >>sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]" command.exitCode = -2 derivationEventUUID = uuid.uuid4().__str__() eventDetail = "ArchivematicaFPRCommandID=\"%s\"" % (command.pk) if command.eventDetailCommand != None: eventDetail = '%s; %s' % (eventDetail, command.eventDetailCommand.stdOut) for ef in transcodedFiles: if opts["commandClassifications"] == "preservation": # TODO Add manual normalization for files of same name mapping #Add the new file to the sip filePathRelativeToSIP = ef.replace(opts["sipPath"], "%SIPDirectory%", 1) # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"): addFileToSIP(filePathRelativeToSIP, replacementDic["%outputFileUUID%"], opts["sipUUID"], uuid.uuid4().__str__(), opts["date"], sourceType="creation", use="preservation") #Calculate new file checksum #Add event information to current file insertIntoEvents(fileUUID=opts["fileUUID"], \ eventIdentifierUUID=derivationEventUUID, \ eventType="normalization", \ eventDateTime=opts["date"], \ eventDetail=eventDetail, \ eventOutcome="", \ eventOutcomeDetailNote=filePathRelativeToSIP) updateSizeAndChecksum(replacementDic["%outputFileUUID%"], ef, opts["date"], uuid.uuid4().__str__()) #Add linking information between files insertIntoDerivations(sourceFileUUID=opts["fileUUID"], derivedFileUUID=replacementDic["%outputFileUUID%"], relatedEventUUID=derivationEventUUID) sql = "INSERT INTO FilesIDs (fileUUID, formatName, formatVersion, formatRegistryName, formatRegistryKey) VALUES ('%s', '%s', NULL, NULL, NULL);" % (replacementDic["%outputFileUUID%"], command.outputFormat) databaseInterface.runSQL(sql) replacementDic["%outputFileUUID%"] = uuid.uuid4().__str__() replacementDic["%postfix%"] = "-" + replacementDic["%outputFileUUID%"]
def update_files(sip_uuid, files):
    """
    Update file information in the DB.

    :param sip_uuid: UUID of the SIP to parse the metadata for.
    :param files: List of dicts containing file info.
    """
    now = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    # Add information to the DB
    for file_info in files:
        # Add file & reingest event
        event_id = str(uuid.uuid4())
        fileOperations.addFileToSIP(
            filePathRelativeToSIP=file_info['original_path'],
            fileUUID=file_info['uuid'],
            sipUUID=sip_uuid,
            taskUUID=event_id,
            date=now,
            sourceType="reingestion",
            use=file_info['use'],
        )
        # Update other file info
        # This doesn't use updateSizeAndChecksum because it also updates currentlocation
        models.File.objects.filter(uuid=file_info['uuid']).update(
            checksum=file_info['checksum'],
            checksumtype=file_info['checksumtype'],
            size=file_info['size'],
            currentlocation=file_info['current_path']
        )
        if file_info['format_version']:
            # Add Format ID
            models.FileFormatVersion.objects.create(
                file_uuid_id=file_info['uuid'],
                format_version=file_info['format_version']
            )

    # Derivation info
    # Has to be a separate loop, as the derived file may not be in the DB otherwise
    # May not need to be parsed, if Derivation info can be roundtripped in METS Reader/Writer
    for file_info in files:
        if file_info['derivation'] is None:
            continue
        databaseFunctions.insertIntoDerivations(
            sourceFileUUID=file_info['uuid'],
            derivedFileUUID=file_info['derivation'],
        )
def main(file_uuid=None, file_path='', date='', event_uuid=None, sip_directory='',
         sip_uuid=None, transfer_uuid=None, use='original', update_use=True):
    if file_uuid == "None":
        file_uuid = None
    if file_uuid:
        logger.error('File already has UUID: %s', file_uuid)
        if update_use:
            File.objects.filter(uuid=file_uuid).update(filegrpuse=use)
        return 0

    # Stop if both or neither of them are used
    if all([sip_uuid, transfer_uuid]) or not any([sip_uuid, transfer_uuid]):
        logger.error('SIP exclusive-or Transfer UUID must be defined')
        return 2

    # Transfer
    if transfer_uuid:
        file_path_relative_to_sip = file_path.replace(sip_directory, '%transferDirectory%', 1)
        transfer = Transfer.objects.get(uuid=transfer_uuid)
        event_type = 'ingestion'
        # For reingest, parse information from the METS
        if transfer.type == 'Archivematica AIP':
            info = get_file_info_from_mets(sip_directory, file_path_relative_to_sip)
            event_type = 'reingestion'
            file_uuid = info.get('uuid', file_uuid)
            use = info.get('filegrpuse', use)
            file_path_relative_to_sip = info.get('original_path', file_path_relative_to_sip)
        if not file_uuid:
            file_uuid = str(uuid.uuid4())
            logger.info('Generated UUID for this file: %s.', file_uuid)
        addFileToTransfer(file_path_relative_to_sip, file_uuid, transfer_uuid,
                          event_uuid, date, use=use, sourceType=event_type)
        # For reingest, the original location was parsed from the METS
        # Update the current location to reflect what's on disk
        if transfer.type == 'Archivematica AIP':
            print('updating current location for', file_uuid, 'with', info)
            File.objects.filter(uuid=file_uuid).update(
                currentlocation=info['current_path']
            )
        return 0

    # Ingest
    if sip_uuid:
        file_uuid = str(uuid.uuid4())
        file_path_relative_to_sip = file_path.replace(sip_directory, "%SIPDirectory%", 1)
        addFileToSIP(file_path_relative_to_sip, file_uuid, sip_uuid, event_uuid, date, use=use)
        return 0
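# A hypothetical invocation of main() above (not from the original source),
# assigning a UUID to a newly arrived file in a transfer. The paths and the
# transfer UUID are placeholders; exactly one of sip_uuid / transfer_uuid
# may be set, per the exclusive-or check in the function.
rc = main(
    file_path='/var/archivematica/transfer/objects/report.pdf',
    date='2015-01-01T00:00:00',
    event_uuid=str(uuid.uuid4()),
    sip_directory='/var/archivematica/transfer/',
    transfer_uuid='11111111-2222-3333-4444-555555555555',
    use='original',
)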
def main(
    job,
    file_uuid=None,
    file_path="",
    date="",
    event_uuid=None,
    sip_directory="",
    sip_uuid=None,
    transfer_uuid=None,
    use="original",
    update_use=True,
):
    if file_uuid == "None":
        file_uuid = None
    if file_uuid:
        logger.error("File already has UUID: %s", file_uuid)
        if update_use:
            File.objects.filter(uuid=file_uuid).update(filegrpuse=use)
        return 0

    # Stop if both or neither of them are used
    if all([sip_uuid, transfer_uuid]) or not any([sip_uuid, transfer_uuid]):
        logger.error("SIP exclusive-or Transfer UUID must be defined")
        return 2

    # Transfer
    if transfer_uuid:
        file_path_relative_to_sip = file_path.replace(sip_directory, "%transferDirectory%", 1)
        transfer = Transfer.objects.get(uuid=transfer_uuid)
        event_type = "ingestion"
        # For reingest, parse information from the METS
        if transfer.type == "Archivematica AIP":
            info = get_file_info_from_mets(job, sip_directory, file_path_relative_to_sip)
            event_type = "reingestion"
            file_uuid = info.get("uuid", file_uuid)
            use = info.get("filegrpuse", use)
            file_path_relative_to_sip = info.get("original_path", file_path_relative_to_sip)
        if not file_uuid:
            file_uuid = str(uuid.uuid4())
            logger.info("Generated UUID for this file: %s.", file_uuid)
        addFileToTransfer(
            file_path_relative_to_sip,
            file_uuid,
            transfer_uuid,
            event_uuid,
            date,
            use=use,
            sourceType=event_type,
        )
        # For reingest, the original location was parsed from the METS
        # Update the current location to reflect what's on disk
        if transfer.type == "Archivematica AIP":
            job.print_output("updating current location for", file_uuid, "with", info)
            File.objects.filter(uuid=file_uuid).update(
                currentlocation=info["current_path"])
        return 0

    # Ingest
    if sip_uuid:
        file_uuid = str(uuid.uuid4())
        file_path_relative_to_sip = file_path.replace(sip_directory, "%SIPDirectory%", 1)
        addFileToSIP(file_path_relative_to_sip, file_uuid, sip_uuid, event_uuid, date, use=use)
        return 0
def onceNormalized(command, opts, replacementDic): transcodedFiles = [] if not command.outputLocation: command.outputLocation = "" if os.path.isfile(command.outputLocation): transcodedFiles.append(command.outputLocation) elif os.path.isdir(command.outputLocation): for w in os.walk(command.outputLocation): path, directories, files = w for p in files: p = os.path.join(path, p) if os.path.isfile(p): transcodedFiles.append(p) elif command.outputLocation: print >> sys.stderr, command print >> sys.stderr, "Error - output file does not exist [" + command.outputLocation + "]" command.exitCode = -2 derivationEventUUID = uuid.uuid4().__str__() eventDetail = "ArchivematicaFPRCommandID=\"%s\"" % (command.pk) if command.eventDetailCommand != None: eventDetail = '%s; %s' % (eventDetail, command.eventDetailCommand.stdOut) for ef in transcodedFiles: if opts["commandClassifications"] == "preservation": # TODO Add manual normalization for files of same name mapping #Add the new file to the sip filePathRelativeToSIP = ef.replace(opts["sipPath"], "%SIPDirectory%", 1) # addFileToSIP(filePathRelativeToSIP, fileUUID, sipUUID, taskUUID, date, sourceType="ingestion"): addFileToSIP(filePathRelativeToSIP, replacementDic["%outputFileUUID%"], opts["sipUUID"], uuid.uuid4().__str__(), opts["date"], sourceType="creation", use="preservation") #Calculate new file checksum #Add event information to current file insertIntoEvents(fileUUID=opts["fileUUID"], \ eventIdentifierUUID=derivationEventUUID, \ eventType="normalization", \ eventDateTime=opts["date"], \ eventDetail=eventDetail, \ eventOutcome="", \ eventOutcomeDetailNote=filePathRelativeToSIP) updateSizeAndChecksum(replacementDic["%outputFileUUID%"], ef, opts["date"], uuid.uuid4().__str__()) #Add linking information between files insertIntoDerivations( sourceFileUUID=opts["fileUUID"], derivedFileUUID=replacementDic["%outputFileUUID%"], relatedEventUUID=derivationEventUUID) sql = "INSERT INTO FilesIDs (fileUUID, formatName, formatVersion, formatRegistryName, formatRegistryKey) VALUES ('%s', '%s', NULL, NULL, NULL);" % ( replacementDic["%outputFileUUID%"], command.outputFormat) databaseInterface.runSQL(sql) replacementDic["%outputFileUUID%"] = uuid.uuid4().__str__() replacementDic[ "%postfix%"] = "-" + replacementDic["%outputFileUUID%"]
def create_mets_file(aic, aips):
    """
    Create AIC METS file with AIP information.
    """
    # Prepare constants
    nsmap = {
        'mets': ns.metsNS,
        'xlink': ns.xlinkNS,
        'xsi': ns.xsiNS,
    }
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")

    # Set up structure
    E = ElementMaker(namespace=ns.metsNS, nsmap=nsmap)
    mets = (
        E.mets(
            E.metsHdr(CREATEDATE=now),
            E.dmdSec(
                E.mdWrap(
                    E.xmlData(),
                    MDTYPE="DC",  # mdWrap
                ),
                ID='dmdSec_1',  # dmdSec
            ),
            E.fileSec(
                E.fileGrp(),
            ),
            E.structMap(
                E.div(
                    TYPE="Archival Information Collection",
                    DMDID="dmdSec_1",
                ),
                TYPE='logical',  # structMap
            ),
        )
    )
    mets.attrib['{{{ns}}}schemaLocation'.format(ns=nsmap['xsi'])] = \
        "http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd"

    # Add Dublin Core info
    xml_data = mets.find('mets:dmdSec/mets:mdWrap/mets:xmlData', namespaces=ns.NSMAP)
    dublincore = archivematicaCreateMETS2.getDublinCore(
        archivematicaCreateMETS2.SIPMetadataAppliesToType, aic['uuid'])
    # Add <extent> with number of AIPs
    extent = etree.SubElement(dublincore, ns.dctermsBNS + 'extent')
    extent.text = "{} AIPs".format(len(aips))
    xml_data.append(dublincore)

    # Add elements for each AIP
    file_grp = mets.find('mets:fileSec/mets:fileGrp', namespaces=ns.NSMAP)
    struct_div = mets.find('mets:structMap/mets:div', namespaces=ns.NSMAP)
    for aip in aips:
        file_id = '{name}-{uuid}'.format(name=aip['name'], uuid=aip['uuid'])
        etree.SubElement(file_grp, ns.metsBNS + 'file', ID=file_id)
        label = aip['label'] or aip['name']
        div = etree.SubElement(struct_div, ns.metsBNS + 'div', LABEL=label)
        etree.SubElement(div, ns.metsBNS + 'fptr', FILEID=file_id)

    print etree.tostring(mets, pretty_print=True)

    # Write out the file
    file_uuid = str(uuid.uuid4())
    basename = os.path.join('metadata', "METS.{}.xml".format(file_uuid))
    filename = os.path.join(aic['dir'], basename)
    with open(filename, 'w') as f:
        f.write(etree.tostring(mets, pretty_print=True))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP='%SIPDirectory%' + basename,
        fileUUID=file_uuid,
        sipUUID=aic['uuid'],
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
        use='metadata')
    # To make this work with the createMETS2 (for SIPs)
    databaseFunctions.insertIntoDerivations(file_uuid, file_uuid)

    # Insert the count of AIPs in the AIC into UnitVariables, so it can be
    # indexed later
    UnitVariable.objects.create(unittype="SIP",
                                unituuid=aic['uuid'],
                                variable="AIPsinAIC",
                                variablevalue=str(len(aips)))
parser.add_option("-p", "--filePath", action="store", dest="filePath", default="") parser.add_option("-d", "--date", action="store", dest="date", default="") parser.add_option("-u", "--eventIdentifierUUID", action="store", dest="eventIdentifierUUID", default="") parser.add_option("-s", "--sipDirectory", action="store", dest="sipDirectory", default="") parser.add_option("-S", "--sipUUID", action="store", dest="sipUUID", default="") parser.add_option("-T", "--transferUUID", action="store", dest="transferUUID", default="") parser.add_option("-e", "--use", action="store", dest="use", default="original") (opts, args) = parser.parse_args() opts2 = vars(opts) # for key, value in opts2.iteritems(): # print type(key), key, type(value), value # exec 'opts.' + key + ' = value.decode("utf-8")' fileUUID = opts.fileUUID if not fileUUID or fileUUID == "None": fileUUID = uuid.uuid4().__str__() if opts.sipUUID == "" and opts.transferUUID != "": filePathRelativeToSIP = opts.filePath.replace(opts.sipDirectory,"%transferDirectory%", 1) addFileToTransfer(filePathRelativeToSIP, fileUUID, opts.transferUUID, opts.eventIdentifierUUID, opts.date, use=opts.use) elif opts.sipUUID != "" and opts.transferUUID == "": filePathRelativeToSIP = opts.filePath.replace(opts.sipDirectory,"%SIPDirectory%", 1) addFileToSIP(filePathRelativeToSIP, fileUUID, opts.sipUUID, opts.eventIdentifierUUID, opts.date, use=opts.use) else: print >>sys.stderr, "SIP exclusive-or Transfer uuid must be defined" exit(2)
def once_normalized(job, command, opts, replacement_dict):
    """
    Updates the database if normalization completed successfully.

    Callback from transcoder.Command

    For preservation files, adds a normalization event, and derivation, as
    well as updating the size and checksum for the new file in the DB. Adds
    format information for use in the METS file to FilesIDs.
    """
    transcoded_files = []
    if not command.output_location:
        command.output_location = ""
    if os.path.isfile(command.output_location):
        transcoded_files.append(command.output_location)
    elif os.path.isdir(command.output_location):
        for w in os.walk(command.output_location):
            path, _, files = w
            for p in files:
                p = os.path.join(path, p)
                if os.path.isfile(p):
                    transcoded_files.append(p)
    elif command.output_location:
        job.print_error("Error - output file does not exist [", command.output_location, "]")
        command.exit_code = -2

    derivation_event_uuid = str(uuid.uuid4())
    event_detail_output = 'ArchivematicaFPRCommandID="{}"'.format(command.fpcommand.uuid)
    if command.event_detail_command is not None:
        event_detail_output += '; {}'.format(command.event_detail_command.std_out)

    for ef in transcoded_files:
        if "thumbnails" in opts.purpose:
            continue

        today = timezone.now()
        output_file_uuid = opts.task_uuid  # Match the UUID on disk
        # TODO Add manual normalization for files of same name mapping?
        # Add the new file to the SIP
        path_relative_to_sip = ef.replace(opts.sip_path, "%SIPDirectory%", 1)
        fileOperations.addFileToSIP(
            path_relative_to_sip,
            output_file_uuid,  # File UUID
            opts.sip_uuid,  # SIP UUID
            opts.task_uuid,  # Task UUID
            today,  # Current date
            sourceType="creation",
            use=opts.purpose,
        )
        # Calculate new file checksum
        fileOperations.updateSizeAndChecksum(
            output_file_uuid,  # File UUID, same as task UUID for preservation
            ef,  # File path
            today,  # Date
            str(uuid.uuid4()),  # Event UUID, new UUID
        )

        # Add derivation link and associated event
        #
        # Track both events and insert into Derivations table for
        # preservation copies
        if "preservation" in opts.purpose:
            insert_derivation_event(
                original_uuid=opts.file_uuid,
                output_uuid=output_file_uuid,
                derivation_uuid=derivation_event_uuid,
                event_detail_output=event_detail_output,
                outcome_detail_note=path_relative_to_sip,
                today=today,
            )
        # Other derivatives go into the Derivations table, but
        # don't get added to the PREMIS Events because they will
        # not appear in the METS.
        else:
            d = Derivation(source_file_id=opts.file_uuid,
                           derived_file_id=output_file_uuid,
                           event=None)
            d.save()

        # Use the format info from the normalization command
        # to save identification into the DB
        ffv = FileFormatVersion(file_uuid_id=output_file_uuid,
                                format_version=command.fpcommand.output_format)
        ffv.save()

        FileID.objects.create(
            file_id=output_file_uuid,
            format_name=command.fpcommand.output_format.format.description)
def create_mets_file(aic, aips, job):
    """
    Create AIC METS file with AIP information.
    """
    # Prepare constants
    nsmap = {"mets": ns.metsNS, "xlink": ns.xlinkNS, "xsi": ns.xsiNS}
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")

    # Set up structure
    E = ElementMaker(namespace=ns.metsNS, nsmap=nsmap)
    mets = E.mets(
        E.metsHdr(CREATEDATE=now),
        E.dmdSec(
            E.mdWrap(E.xmlData(), MDTYPE="DC"),  # mdWrap
            ID="dmdSec_1",  # dmdSec
        ),
        E.fileSec(E.fileGrp()),
        E.structMap(
            E.div(TYPE="Archival Information Collection", DMDID="dmdSec_1"),
            TYPE="logical",  # structMap
        ),
    )
    mets.attrib["{{{ns}}}schemaLocation".format(ns=nsmap["xsi"])] = \
        "http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version1121/mets.xsd"

    # Add Dublin Core info
    xml_data = mets.find("mets:dmdSec/mets:mdWrap/mets:xmlData", namespaces=ns.NSMAP)
    dublincore = create_mets_v2.getDublinCore(
        create_mets_v2.SIPMetadataAppliesToType, aic["uuid"])
    # Add <extent> with number of AIPs
    extent = etree.SubElement(dublincore, ns.dctermsBNS + "extent")
    extent.text = "{} AIPs".format(len(aips))
    xml_data.append(dublincore)

    # Add elements for each AIP
    file_grp = mets.find("mets:fileSec/mets:fileGrp", namespaces=ns.NSMAP)
    struct_div = mets.find("mets:structMap/mets:div", namespaces=ns.NSMAP)
    for aip in aips:
        file_id = "{name}-{uuid}".format(name=aip["name"], uuid=aip["uuid"])
        etree.SubElement(file_grp, ns.metsBNS + "file", ID=file_id)
        label = aip["label"] or aip["name"]
        div = etree.SubElement(struct_div, ns.metsBNS + "div", LABEL=label)
        etree.SubElement(div, ns.metsBNS + "fptr", FILEID=file_id)

    job.pyprint(etree.tostring(mets, pretty_print=True))

    # Write out the file
    file_uuid = str(uuid.uuid4())
    basename = os.path.join("metadata", "METS.{}.xml".format(file_uuid))
    filename = os.path.join(aic["dir"], basename)
    with open(filename, "w") as f:
        f.write(etree.tostring(mets, pretty_print=True,
                               xml_declaration=True, encoding="utf-8"))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP="%SIPDirectory%" + basename,
        fileUUID=file_uuid,
        sipUUID=aic["uuid"],
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
        use="metadata",
    )
    # To make this work with the createMETS2 (for SIPs)
    databaseFunctions.insertIntoDerivations(file_uuid, file_uuid)

    # Insert the count of AIPs in the AIC into UnitVariables, so it can be
    # indexed later
    UnitVariable.objects.create(
        unittype="SIP",
        unituuid=aic["uuid"],
        variable="AIPsinAIC",
        variablevalue=str(len(aips)),
    )
parser.add_option("-e", "--use", action="store", dest="use", default="original") parser.add_option("--disable-update-filegrpuse", action="store_false", dest="update_use", default=True) (opts, args) = parser.parse_args() opts2 = vars(opts) # for key, value in opts2.iteritems(): # print type(key), key, type(value), value # exec 'opts.' + key + ' = value.decode("utf-8")' fileUUID = opts.fileUUID if not fileUUID or fileUUID == "None": fileUUID = uuid.uuid4().__str__() else: print >>sys.stderr, "File already has UUID:", fileUUID if opts.update_use: File.objects.filter(uuid=fileUUID).update(filegrpuse=opts.use) exit(0) if opts.sipUUID == "" and opts.transferUUID != "": filePathRelativeToSIP = opts.filePath.replace(opts.sipDirectory,"%transferDirectory%", 1) addFileToTransfer(filePathRelativeToSIP, fileUUID, opts.transferUUID, opts.eventIdentifierUUID, opts.date, use=opts.use) elif opts.sipUUID != "" and opts.transferUUID == "": filePathRelativeToSIP = opts.filePath.replace(opts.sipDirectory,"%SIPDirectory%", 1) addFileToSIP(filePathRelativeToSIP, fileUUID, opts.sipUUID, opts.eventIdentifierUUID, opts.date, use=opts.use) else: print >>sys.stderr, "SIP exclusive-or Transfer uuid must be defined" exit(2)
def main(aip_uuid, aip_name, compression, sip_dir, aip_filename):
    # Prep work
    mets_schema_location = 'http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd'
    premis_schema_location = 'info:lc/xmlns/premis-v2 http://www.loc.gov/standards/premis/v2/premis-v2-2.xsd'
    # Datetime format string from http://docs.python.org/2/library/datetime.html
    # %Y = 4 digit year, %m = 2 digit month, %d = 2 digit day
    # %H = 24-hour hour, %M = 2-digit minute, %S = 2 digit second
    now = timezone.now().strftime("%Y-%m-%dT%H:%M:%S")
    aip_identifier = aip_name + '-' + aip_uuid
    aip_path = os.path.join(sip_dir, aip_filename)
    # Get archive tool and version
    program, algorithm = compression.split('-')

    # Pointer files are not written for uncompressed AIPs;
    # the purpose of the pointer file is primarily to provide information
    # on how to read a compressed AIP file, so there isn't anything for
    # it to do when pointing at an uncompressed AIP.
    if program == 'None':
        return 0

    if program == '7z':
        archive_tool = '7-Zip'
        archive_tool_version = '9.20'  # TODO get this dynamically
    elif program == 'pbzip2':
        archive_tool = program
        archive_tool_version = '1.1.6'  # TODO get this dynamically

    # Format / file extension
    _, extension = os.path.splitext(aip_filename)
    # PRONOM ID and PRONOM name for each file extension
    pronom_conversion = {
        '.7z': {'puid': 'fmt/484', 'name': '7Zip format'},
        '.bz2': {'puid': 'x-fmt/268', 'name': 'BZIP2 Compressed Archive'},
    }
    num_files = 1
    # Get size
    try:
        aip_size = os.path.getsize(aip_path)
    except os.error:
        print("File {} does not exist or is inaccessible. Aborting.".format(aip_path), file=sys.stderr)
        return -1
    # Calculate checksum
    checksum_algorithm = get_setting('checksum_type', 'sha256')
    checksum = get_file_checksum(aip_path, checksum_algorithm)

    # Get package type (AIP, AIC)
    sip_metadata_uuid = '3e48343d-e2d2-4956-aaa3-b54d26eb9761'
    try:
        dc = DublinCore.objects.get(metadataappliestotype_id=sip_metadata_uuid,
                                    metadataappliestoidentifier=aip_uuid)
    except DublinCore.DoesNotExist:
        package_type = "Archival Information Package"
    else:
        package_type = dc.type

    # Namespaces
    nsmap = {
        # Default, unprefixed namespace
        'mets': namespaces.metsNS,
        'xsi': namespaces.xsiNS,
        'xlink': namespaces.xlinkNS,
    }
    # Set up structure
    E = ElementMaker(namespace=namespaces.metsNS, nsmap=nsmap)
    E_P = ElementMaker(namespace=namespaces.premisNS, nsmap={'premis': namespaces.premisNS})

    root = (
        E.mets(
            E.metsHdr(CREATEDATE=now),
            # amdSec goes here
            E.fileSec(
                E.fileGrp(USE='Archival Information Package'),
            ),
            E.structMap(
                TYPE='physical'
            ),
        )
    )
    # Namespaced attributes have to be added separately - don't know how to do
    # inline with E
    root.attrib[namespaces.xsiBNS + 'schemaLocation'] = mets_schema_location

    add_amdsec_after = root.find('mets:metsHdr', namespaces=namespaces.NSMAP)
    filegrp = root.find('.//mets:fileGrp', namespaces=namespaces.NSMAP)
    structmap = root.find('.//mets:structMap', namespaces=namespaces.NSMAP)

    # For each file, add amdSec, file, fptr
    for admin_id in range(1, num_files + 1):
        # amdSec
        amdsec_id = 'amdSec_{}'.format(admin_id)
        amdsec = E.amdSec(
            E.techMD(
                E.mdWrap(
                    E.xmlData(),
                    MDTYPE='PREMIS:OBJECT',  # mdWrap
                ),
                ID='techMD_1',  # techMD
            ),
            ID=amdsec_id,  # amdSec
        )
        # Add PREMIS:OBJECT
        obj = E_P.object(
            E_P.objectIdentifier(
                E_P.objectIdentifierType('UUID'),
                E_P.objectIdentifierValue(aip_uuid),
            ),
            E_P.objectCharacteristics(
                E_P.compositionLevel('1'),
                E_P.fixity(
                    E_P.messageDigestAlgorithm(checksum_algorithm),
                    E_P.messageDigest(checksum),
                ),
                E_P.size(str(aip_size)),
                E_P.format(
                    E_P.formatDesignation(
                        E_P.formatName(pronom_conversion[extension]['name']),
                        E_P.formatVersion(),
                    ),
                    E_P.formatRegistry(
                        E_P.formatRegistryName('PRONOM'),
                        E_P.formatRegistryKey(pronom_conversion[extension]['puid'])
                    ),
                ),
                E_P.creatingApplication(
                    E_P.creatingApplicationName(archive_tool),
                    E_P.creatingApplicationVersion(archive_tool_version),
                    E_P.dateCreatedByApplication(now),
                ),
            ),
            version='2.2',
        )
        obj.attrib[namespaces.xsiBNS + 'type'] = 'premis:file'
        obj.attrib[namespaces.xsiBNS + 'schemaLocation'] = premis_schema_location
        # Add as child of xmldata
        amdsec.find('.//mets:mdWrap[@MDTYPE="PREMIS:OBJECT"]/mets:xmlData',
                    namespaces=namespaces.NSMAP).append(obj)

        # Add PREMIS:EVENT for compression & PREMIS:AGENTs
        # use archivematicaCreateMETS2 code
        elements = archivematicaCreateMETS2.createDigiprovMD(aip_uuid)
        for element in elements:
            amdsec.append(element)

        # add amdSec after previous amdSec (or metsHdr if first one)
        add_amdsec_after.addnext(amdsec)
        add_amdsec_after = amdsec

        # fileGrp
        file_ = E.file(
            E.FLocat(
                LOCTYPE="OTHER",
                OTHERLOCTYPE="SYSTEM",
            ),
            ID=aip_identifier
        )
        filegrp.append(file_)
        flocat = file_.find('mets:FLocat', namespaces=namespaces.NSMAP)
        flocat.attrib['{{{ns}}}href'.format(ns=namespaces.xlinkNS)] = aip_path

        # compression - 7z or tar.bz2
        if extension == '.7z':
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                             TRANSFORMORDER='1',
                             TRANSFORMTYPE='decompression',
                             TRANSFORMALGORITHM=algorithm)
        elif extension == '.bz2':
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                             TRANSFORMORDER='1',
                             TRANSFORMTYPE='decompression',
                             TRANSFORMALGORITHM='bzip2')
            etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                             TRANSFORMORDER='2',
                             TRANSFORMTYPE='decompression',
                             TRANSFORMALGORITHM='tar')

        # structMap
        div = etree.SubElement(structmap, namespaces.metsBNS + 'div',
                               ADMID=amdsec_id, TYPE=package_type)
        etree.SubElement(div, namespaces.metsBNS + 'fptr', FILEID=aip_identifier)

    print(etree.tostring(root, pretty_print=True))

    # Write out pointer.xml
    xml_filename = 'pointer.xml'
    filename = os.path.join(os.path.dirname(aip_path), xml_filename)
    with open(filename, 'w') as f:
        f.write(etree.tostring(root, pretty_print=True,
                               xml_declaration=True, encoding='utf-8'))
    fileOperations.addFileToSIP(
        filePathRelativeToSIP='%SIPDirectory%' + xml_filename,
        fileUUID=str(uuid.uuid4()),
        sipUUID=aip_uuid,
        taskUUID=str(uuid.uuid4()),  # Unsure what should go here
        date=now,
        sourceType="aip creation",
    )
    return 0