def verifyMetsFileSecChecksums(metsFile, date, taskUUID, transferDirectory, transferUUID, relativeDirectory="./"):
    """Move the license files listed in a METS file into DspaceLicenses.

    Every ``FLocat`` that is a direct child of a ``file`` element inside a
    ``fileSec/fileGrp`` with ``USE="LICENSE"`` is moved (via ``renameAsSudo``)
    to ``<transferDirectory>/metadata/submissionDocumentation/DspaceLicenses``
    and the move is recorded with ``updateFileLocation``.

    Args:
        metsFile: path of the METS XML file to parse.
        date: event date forwarded to ``updateFileLocation``.
        taskUUID: accepted for interface compatibility; not used here.
        transferDirectory: root directory of the transfer.
        transferUUID: transfer identifier forwarded to ``updateFileLocation``.
        relativeDirectory: directory the ``xlink:href`` values are relative to.

    Returns:
        Always 0; failures surface as exceptions from the helpers.
    """
    print(metsFile)
    DspaceLicenses = "metadata/submissionDocumentation/DspaceLicenses"
    path = os.path.join(transferDirectory, DspaceLicenses)
    try:
        # makedirs (not mkdir) so missing parent directories are created too.
        if not os.path.isdir(path):
            os.makedirs(path)
    except OSError:
        # Best effort: report and carry on, the move below will fail loudly
        # if the directory really is unusable.
        print("error creating DspaceLicenses directory.")

    exitCode = 0
    metsNS = "{http://www.loc.gov/METS/}"
    hrefAttribute = "{http://www.w3.org/1999/xlink}href"

    tree = etree.parse(metsFile)
    root = tree.getroot()
    for fileGrp in root.findall(metsNS + "fileSec/" + metsNS + "fileGrp"):
        if fileGrp.get("USE") != "LICENSE":
            continue
        # Only direct file/FLocat children are considered, as before.
        for fLocat in fileGrp.findall(metsNS + "file/" + metsNS + "FLocat"):
            fileLocation = fLocat.get(hrefAttribute)
            fileFullPath = os.path.join(relativeDirectory, fileLocation)
            dest = os.path.join(path, os.path.basename(fileLocation))
            renameAsSudo(fileFullPath, dest)

            # Locations are stored with the transfer root replaced by a
            # placeholder.
            src = fileFullPath.replace(transferDirectory, "%transferDirectory%")
            dst = dest.replace(transferDirectory, "%transferDirectory%")
            eventDetail = ""
            eventOutcomeDetailNote = "moved from=\"" + src + "\"; moved to=\"" + dst + "\""
            updateFileLocation(src, dst, "movement", date, eventDetail, transferUUID=transferUUID, eventOutcomeDetailNote=eventOutcomeDetailNote)
    return exitCode
def verifyMetsFileSecChecksums(metsFile, date, taskUUID, transferDirectory, transferUUID, relativeDirectory="./"):
    """Move a DSpace METS file into the transfer's DSpaceMets directory.

    The file is renamed to ``mets.xml`` under
    ``<transferDirectory>/metadata/submissionDocumentation/DSpaceMets``. When
    the METS file's parent directory is not named ``DSpace_export``, a
    sub-directory named after that parent is used. The move is recorded with
    ``updateFileLocation``.

    Args:
        metsFile: path of the METS file to move.
        date: event date forwarded to ``updateFileLocation``.
        taskUUID: accepted for interface compatibility; not used here.
        transferDirectory: root directory of the transfer.
        transferUUID: transfer identifier forwarded to ``updateFileLocation``.
        relativeDirectory: accepted for interface compatibility; not used here.

    Returns:
        Always 0; failures surface as exceptions from the helpers.
    """
    print(metsFile)
    DSpaceMets = "metadata/submissionDocumentation/DSpaceMets"
    # Computed outside the try so it is always bound for the code below.
    path = os.path.join(transferDirectory, DSpaceMets)
    try:
        # makedirs (not mkdir) so missing parent directories are created too.
        if not os.path.isdir(path):
            os.makedirs(path)
    except OSError:
        print("error creating DSpaceMets directory.")

    exitCode = 0
    metsDirectory = os.path.basename(os.path.dirname(metsFile))
    if metsDirectory == "DSpace_export":
        outputDirectory = path
    else:
        outputDirectory = os.path.join(path, metsDirectory)
        if not os.path.isdir(outputDirectory):
            os.makedirs(outputDirectory)

    dest = os.path.join(outputDirectory, "mets.xml")
    renameAsSudo(metsFile, dest)

    # Locations are stored with the transfer root replaced by a placeholder.
    src = metsFile.replace(transferDirectory, "%transferDirectory%")
    dst = dest.replace(transferDirectory, "%transferDirectory%")
    eventDetail = ""
    eventOutcomeDetailNote = "moved from=\"" + src + "\"; moved to=\"" + dst + "\""
    updateFileLocation(src, dst, "movement", date, eventDetail, transferUUID=transferUUID, eventOutcomeDetailNote=eventOutcomeDetailNote)
    return exitCode
def something(SIPDirectory, accessDirectory, objectsDirectory, DIPDirectory, SIPUUID, date, copy=False):
    """Move access files into the DIP, named after their matching object.

    For every file under ``accessDirectory`` that has an extension, look up
    the ``Files`` rows of this SIP whose ``currentLocation`` starts with the
    corresponding path in ``objectsDirectory`` (extension stripped, trailing
    dot kept). Each matching row whose remaining suffix contains no further
    dot causes the access file to be moved to
    ``<DIPDirectory>/objects/<fileUUID>-<access file name>`` and the move to
    be recorded with ``updateFileLocation``.

    Args:
        SIPDirectory: SIP root; replaced by ``%SIPDirectory%`` in stored paths.
        accessDirectory: directory tree holding the access files.
        objectsDirectory: directory tree holding the original objects.
        DIPDirectory: DIP root that receives the moved files.
        SIPUUID: SIP identifier used in the query and in the events.
        date: event date forwarded to ``updateFileLocation``.
        copy: when true nothing is moved; only a TODO notice is printed.

    Returns:
        179 normally, or 1 if at least one access file had no matching row.
    """
    exitCode = 179
    print(SIPDirectory)
    # For every file, try to find the matching file in the objects directory.
    for (path, dirs, files) in os.walk(accessDirectory):
        for fileName in files:
            accessPath = os.path.join(path, fileName)
            objectPath = accessPath.replace(accessDirectory, objectsDirectory, 1)
            objectName = os.path.basename(objectPath)
            objectNameExtensionIndex = objectName.rfind(".")
            if objectNameExtensionIndex == -1:
                # Files without an extension are not processed.
                continue

            # Keep the trailing dot so "a." does not also match "ab.tif".
            objectName = objectName[:objectNameExtensionIndex + 1]
            objectNameLike = os.path.join(os.path.dirname(objectPath), objectName).replace(SIPDirectory, "%SIPDirectory%", 1)

            # The statement is concatenated rather than %-formatted because
            # the literal '%' of the LIKE pattern breaks %-formatting.
            # NOTE(review): values are interpolated unescaped, and '%'/'_' in
            # the path act as LIKE wildcards; switch to a parameterized query
            # if databaseInterface.querySQL supports one.
            sql = "SELECT fileUUID, currentLocation FROM Files WHERE currentLocation LIKE '" + objectNameLike + "%' AND removedTime = 0 AND SIPUUID = '" + SIPUUID + "'"

            update = []
            c, sqlLock = databaseInterface.querySQL(sql)
            try:
                row = c.fetchone()
                if not row:
                    sys.stderr.write("No corresponding object for: %s\n" % accessPath.replace(SIPDirectory, "%SIPDirectory%", 1))
                    exitCode = 1
                while row is not None:
                    objectUUID = row[0]
                    objectPath = row[1]
                    objectExtension = objectPath.replace(objectNameLike, "", 1)
                    # Same debug trail the print statements used to produce.
                    debugPrefix = "%s %s \t" % (objectName[objectNameExtensionIndex + 1:], objectExtension)
                    if objectExtension.find(".") != -1:
                        # Remaining suffix has another dot: a different object.
                        print(debugPrefix)
                        row = c.fetchone()
                        continue
                    print(debugPrefix + debugPrefix + str(row))
                    dipPath = os.path.join(DIPDirectory, "objects", "%s-%s" % (objectUUID, os.path.basename(accessPath)))
                    if copy:
                        print("TODO - copy not supported yet")
                    else:
                        dest = dipPath
                        renameAsSudo(accessPath, dest)
                        src = accessPath.replace(SIPDirectory, "%SIPDirectory%")
                        dst = dest.replace(SIPDirectory, "%SIPDirectory%")
                        update.append((src, dst))
                    row = c.fetchone()
            finally:
                # Always give the database lock back, even if a move fails.
                sqlLock.release()

            # Events are written only after the lock has been released.
            for src, dst in update:
                eventDetail = ""
                eventOutcomeDetailNote = "moved from=\"" + src + "\"; moved to=\"" + dst + "\""
                updateFileLocation(src, dst, "movement", date, eventDetail, sipUUID=SIPUUID, eventOutcomeDetailNote=eventOutcomeDetailNote)
    return exitCode
def verifyMetsFileSecChecksums(
    job,
    metsFile,
    date,
    taskUUID,
    transferDirectory,
    transferUUID,
    relativeDirectory="./",
):
    """Move a DSpace METS file into the transfer's DSpaceMets directory.

    The file is renamed to ``mets.xml`` under
    ``<transferDirectory>/metadata/submissionDocumentation/DSpaceMets``. When
    the METS file's parent directory is not named ``DSpace_export``, a
    sub-directory named after that parent is used. A successful move is
    recorded with ``updateFileLocation``.

    Args:
        job: job object whose ``pyprint`` receives the output.
        metsFile: path of the METS file to move.
        date: event date forwarded to ``updateFileLocation``.
        taskUUID: accepted for interface compatibility; not used here.
        transferDirectory: root directory of the transfer.
        transferUUID: transfer identifier forwarded to ``updateFileLocation``.
        relativeDirectory: accepted for interface compatibility; not used here.

    Returns:
        The status returned by ``rename`` when it is truthy, otherwise 0.
    """
    job.pyprint(metsFile)
    DSpaceMets = "metadata/submissionDocumentation/DSpaceMets"
    # Computed outside the try so it is always bound for the code below.
    path = os.path.join(transferDirectory, DSpaceMets)
    try:
        # makedirs (not mkdir) so missing parent directories are created too.
        if not os.path.isdir(path):
            os.makedirs(path)
    except OSError:
        job.pyprint("error creating DSpaceMets directory.")

    exitCode = 0
    metsDirectory = os.path.basename(os.path.dirname(metsFile))
    if metsDirectory == "DSpace_export":
        outputDirectory = path
    else:
        outputDirectory = os.path.join(path, metsDirectory)
        if not os.path.isdir(outputDirectory):
            os.makedirs(outputDirectory)

    dest = os.path.join(outputDirectory, "mets.xml")
    rename_status = rename(metsFile, dest, printfn=job.pyprint, should_exit=False)
    if rename_status:
        # Do not record a movement event for a move that did not succeed.
        return rename_status

    # Locations are stored with the transfer root replaced by a placeholder.
    src = metsFile.replace(transferDirectory, "%transferDirectory%")
    dst = dest.replace(transferDirectory, "%transferDirectory%")
    eventDetail = ""
    eventOutcomeDetailNote = 'moved from="' + src + '"; moved to="' + dst + '"'
    updateFileLocation(
        src,
        dst,
        "movement",
        date,
        eventDetail,
        transferUUID=transferUUID,
        eventOutcomeDetailNote=eventOutcomeDetailNote,
    )
    return exitCode
def something(SIPDirectory, accessDirectory, objectsDirectory, DIPDirectory, SIPUUID, date, copy=False):
    """Move access files into the DIP, named after their matching object.

    Each file under ``accessDirectory`` that has an extension is matched
    against the non-removed ``File`` rows of the SIP whose ``currentlocation``
    starts with the corresponding objects path (extension stripped, trailing
    dot kept). For every match whose remaining suffix contains no further dot,
    the access file is moved to
    ``<DIPDirectory>/objects/<uuid>-<access file name>`` and the move is
    recorded with ``updateFileLocation``.

    Returns 179, or 1 if at least one access file had no matching row.
    """
    exitCode = 179
    print(SIPDirectory)
    # Walk the access tree and pair every file with its object counterpart.
    for (path, dirs, files) in os.walk(accessDirectory):
        for access_name in files:
            accessPath = os.path.join(path, access_name)
            objectPath = accessPath.replace(accessDirectory, objectsDirectory, 1)
            objectName = os.path.basename(objectPath)
            dot_index = objectName.rfind(".")
            if dot_index == -1:
                # No extension: nothing to match against.
                continue

            # Keep the trailing dot as part of the prefix.
            objectName = objectName[:dot_index + 1]
            prefix_path = os.path.join(os.path.dirname(objectPath), objectName)
            objectNameLike = prefix_path.replace(SIPDirectory, "%SIPDirectory%", 1)

            matches = File.objects.filter(
                removedtime__isnull=True,
                currentlocation__startswith=objectNameLike,
                sip_id=SIPUUID,
            )
            if not matches.exists():
                print("No corresponding object for:", accessPath.replace(SIPDirectory, "%SIPDirectory%", 1), file=sys.stderr)
                exitCode = 1

            update = []
            for objectUUID, location in matches.values_list('uuid', 'currentlocation'):
                objectExtension = location.replace(objectNameLike, "", 1)
                print(objectName[dot_index + 1:], objectExtension, "\t", end=' ')
                if "." in objectExtension:
                    # Another dot in the suffix means a different object.
                    continue
                print(objectName[dot_index + 1:], objectExtension, "\t", end=' ')
                dipPath = os.path.join(
                    DIPDirectory,
                    "objects",
                    "%s-%s" % (objectUUID, os.path.basename(accessPath)),
                )
                if copy:
                    print("TODO - copy not supported yet")
                    continue
                rename(accessPath, dipPath)
                update.append((
                    accessPath.replace(SIPDirectory, "%SIPDirectory%"),
                    dipPath.replace(SIPDirectory, "%SIPDirectory%"),
                ))

            # Record the moves once all matches have been handled.
            for src, dst in update:
                note = "moved from=\"" + src + "\"; moved to=\"" + dst + "\""
                updateFileLocation(src, dst, "movement", date, "", sipUUID=SIPUUID, eventOutcomeDetailNote=note)
    return exitCode
sipPath, groupType, 1) #"%SIPDirectory%objects/" sanitizations = sanitizeNames.sanitizeRecursively(objectsDirectory) eventDetail = "program=\"sanitizeNames\"; version=\"" + sanitizeNames.VERSION + "\"" for oldfile, newfile in sanitizations: if os.path.isfile(newfile): oldfile = oldfile.replace(objectsDirectory, relativeReplacement, 1) newfile = newfile.replace(objectsDirectory, relativeReplacement, 1) print oldfile, " -> ", newfile if groupType == "%SIPDirectory%": updateFileLocation(oldfile, newfile, "name cleanup", date, "prohibited characters removed:" + eventDetail, fileUUID=None, sipUUID=sipUUID) elif groupType == "%transferDirectory%": updateFileLocation(oldfile, newfile, "name cleanup", date, "prohibited characters removed:" + eventDetail, fileUUID=None, transferUUID=sipUUID) else: print >> sys.stderr, "bad group type", groupType exit(3)
def sanitize_object_names(job, objectsDirectory, sipUUID, date, groupType, groupSQL, sipPath):
    """Sanitize names under ``objectsDirectory`` and update stored locations.

    Runs ``sanitize_names.sanitizeRecursively`` on disk, then checks every
    non-removed ``File`` of the unit, plus the transfer's ``Directory`` rows
    when ``groupSQL`` is ``'transfer_id'`` and the transfer has ``diruuids``
    set. Files whose path changed get a "name cleanup" event through
    ``updateFileLocation``; directories have ``currentlocation`` saved directly.

    Returns 0, or 3 when a changed path is found and ``groupType`` is neither
    ``%SIPDirectory%`` nor ``%transferDirectory%``.
    """
    # e.g. "%SIPDirectory%objects/"
    relativeReplacement = objectsDirectory.replace(sipPath, groupType, 1)

    # Directory rows exist only for transfers that asked for them.
    directory_mdls = []
    if groupSQL == 'transfer_id':
        transfer_mdl = Transfer.objects.get(uuid=sipUUID)
        if transfer_mdl.diruuids:
            directory_mdls = Directory.objects.filter(transfer=transfer_mdl).all()

    # Rename on disk first; the result maps old path -> new path.
    sanitizations = sanitize_names.sanitizeRecursively(job, objectsDirectory)
    for before, after in sanitizations.items():
        logger.info('sanitizations: %s -> %s', before, after)

    eventDetail = 'program="sanitize_names"; version="' + sanitize_names.VERSION + '"'

    file_filter = {groupSQL: sipUUID, "removedtime__isnull": True}
    file_mdls = File.objects.filter(**file_filter)

    for model in chain(file_mdls, directory_mdls):
        normalized = unicodedata.normalize('NFC', model.currentlocation)
        current_location = unicodeToStr(normalized).replace(groupType, sipPath)
        sanitized_location = unicodeToStr(current_location)
        logger.info('Checking %s', current_location)

        # Sanitization keys look like sanitized/sanitized/unsanitized, so walk
        # up from the full path; after each hit restart from the rewritten
        # path so the next unsanitized component can be found.
        # TODO This should be checked for a more efficient solution
        candidate = sanitized_location
        while objectsDirectory in candidate:  # Stay within unit
            if candidate in sanitizations:
                sanitized_location = sanitized_location.replace(candidate, sanitizations[candidate])
                candidate = sanitized_location
            else:
                candidate = os.path.dirname(candidate)

        if current_location == sanitized_location:
            logger.info('No sanitization for %s', current_location)
            job.pyprint('No sanitization found for', current_location)
            continue

        old_location = current_location.replace(objectsDirectory, relativeReplacement, 1)
        new_location = sanitized_location.replace(objectsDirectory, relativeReplacement, 1)
        event_kwargs = {
            'src': old_location,
            'dst': new_location,
            'eventType': 'name cleanup',
            'eventDateTime': date,
            'eventDetail': "prohibited characters removed:" + eventDetail,
            'fileUUID': None,
        }
        if groupType == "%SIPDirectory%":
            event_kwargs['sipUUID'] = sipUUID
        elif groupType == "%transferDirectory%":
            event_kwargs['transferUUID'] = sipUUID
        else:
            job.pyprint("bad group type", groupType, file=sys.stderr)
            return 3

        logger.info('Sanitized name: %s -> %s', old_location, new_location)
        job.pyprint('Sanitized name:', old_location, " -> ", new_location)
        if isinstance(model, File):
            updateFileLocation(**event_kwargs)
        else:
            model.currentlocation = new_location
            model.save()

    return 0
def something(SIPDirectory, accessDirectory, objectsDirectory, DIPDirectory, SIPUUID, date, copy=False):
    """Move access files into the DIP, named after their matching object.

    For every file under ``accessDirectory`` that has an extension, look up
    the ``Files`` rows of this SIP whose ``currentLocation`` starts with the
    corresponding path in ``objectsDirectory`` (extension stripped, trailing
    dot kept). Each matching row whose remaining suffix contains no further
    dot causes the access file to be moved to
    ``<DIPDirectory>/objects/<fileUUID>-<access file name>`` and the move to
    be recorded with ``updateFileLocation``.

    Args:
        SIPDirectory: SIP root; replaced by ``%SIPDirectory%`` in stored paths.
        accessDirectory: directory tree holding the access files.
        objectsDirectory: directory tree holding the original objects.
        DIPDirectory: DIP root that receives the moved files.
        SIPUUID: SIP identifier used in the query and in the events.
        date: event date forwarded to ``updateFileLocation``.
        copy: when true nothing is moved; only a TODO notice is printed.

    Returns:
        179 normally, or 1 if at least one access file had no matching row.
    """
    exitCode = 179
    print(SIPDirectory)
    # For every file, try to find the matching file in the objects directory.
    for (path, dirs, files) in os.walk(accessDirectory):
        for fileName in files:
            accessPath = os.path.join(path, fileName)
            objectPath = accessPath.replace(accessDirectory, objectsDirectory, 1)
            objectName = os.path.basename(objectPath)
            objectNameExtensionIndex = objectName.rfind(".")
            if objectNameExtensionIndex == -1:
                # Files without an extension are not processed.
                continue

            # Keep the trailing dot so "a." does not also match "ab.tif".
            objectName = objectName[:objectNameExtensionIndex + 1]
            objectNameLike = os.path.join(os.path.dirname(objectPath), objectName).replace(SIPDirectory, "%SIPDirectory%", 1)

            # The statement is concatenated rather than %-formatted because
            # the literal '%' of the LIKE pattern breaks %-formatting.
            # NOTE(review): values are interpolated unescaped, and '%'/'_' in
            # the path act as LIKE wildcards; switch to a parameterized query
            # if databaseInterface.querySQL supports one.
            sql = "SELECT fileUUID, currentLocation FROM Files WHERE currentLocation LIKE '" + objectNameLike + "%' AND removedTime = 0 AND SIPUUID = '" + SIPUUID + "'"

            update = []
            c, sqlLock = databaseInterface.querySQL(sql)
            try:
                row = c.fetchone()
                if not row:
                    sys.stderr.write("No corresponding object for: %s\n" % accessPath.replace(SIPDirectory, "%SIPDirectory%", 1))
                    exitCode = 1
                while row is not None:
                    objectUUID = row[0]
                    objectPath = row[1]
                    objectExtension = objectPath.replace(objectNameLike, "", 1)
                    # Same debug trail the print statements used to produce.
                    debugPrefix = "%s %s \t" % (objectName[objectNameExtensionIndex + 1:], objectExtension)
                    if objectExtension.find(".") != -1:
                        # Remaining suffix has another dot: a different object.
                        print(debugPrefix)
                        row = c.fetchone()
                        continue
                    print(debugPrefix + debugPrefix + str(row))
                    dipPath = os.path.join(DIPDirectory, "objects", "%s-%s" % (objectUUID, os.path.basename(accessPath)))
                    if copy:
                        print("TODO - copy not supported yet")
                    else:
                        dest = dipPath
                        renameAsSudo(accessPath, dest)
                        src = accessPath.replace(SIPDirectory, "%SIPDirectory%")
                        dst = dest.replace(SIPDirectory, "%SIPDirectory%")
                        update.append((src, dst))
                    row = c.fetchone()
            finally:
                # Always give the database lock back, even if a move fails.
                sqlLock.release()

            # Events are written only after the lock has been released.
            for src, dst in update:
                eventDetail = ""
                eventOutcomeDetailNote = "moved from=\"" + src + "\"; moved to=\"" + dst + "\""
                updateFileLocation(src, dst, "movement", date, eventDetail, sipUUID=SIPUUID, eventOutcomeDetailNote=eventOutcomeDetailNote)
    return exitCode
quit(2) eventDetail= "program=\"sanitizeNames\"; version=\"" + version + "\"" for line in lines: detoxfiles = line.split(" -> ") if len(detoxfiles) > 1 : oldfile = detoxfiles[0].split('\n',1)[0] newfile = detoxfiles[1] #print "line: ", line if os.path.isfile(newfile): oldfile = oldfile.replace(objectsDirectory, relativeReplacement, 1) newfile = newfile.replace(objectsDirectory, relativeReplacement, 1) print oldfile, " -> ", newfile if groupType == "%SIPDirectory%": updateFileLocation(oldfile, newfile, "name cleanup", date, "prohibited characters removed:" + eventDetail, fileUUID=None, sipUUID=sipUUID) elif groupType == "%transferDirectory%": updateFileLocation(oldfile, newfile, "name cleanup", date, "prohibited characters removed:" + eventDetail, fileUUID=None, transferUUID=sipUUID) else: print >>sys.stderr, "bad group type", groupType exit(3) elif os.path.isdir(newfile): oldfile = oldfile.replace(objectsDirectory, relativeReplacement, 1) + "/" newfile = newfile.replace(objectsDirectory, relativeReplacement, 1) + "/" directoryContents = [] sql = "SELECT fileUUID, currentLocation FROM Files WHERE Files.removedTime = 0 AND Files.currentLocation LIKE '" + MySQLdb.escape_string(oldfile.replace("\\", "\\\\")).replace("%","\%") + "%' AND " + groupSQL + " = '" + groupID + "';" c, sqlLock = databaseInterface.querySQL(sql) row = c.fetchone()
if len(detoxfiles) > 1: oldfile = detoxfiles[0].split('\n', 1)[0] newfile = detoxfiles[1] #print "line: ", line if os.path.isfile(newfile): oldfile = oldfile.replace(objectsDirectory, relativeReplacement, 1) newfile = newfile.replace(objectsDirectory, relativeReplacement, 1) print oldfile, " -> ", newfile if groupType == "%SIPDirectory%": updateFileLocation(oldfile, newfile, "name cleanup", date, "prohibited characters removed:" + eventDetail, fileUUID=None, sipUUID=sipUUID) elif groupType == "%transferDirectory%": updateFileLocation(oldfile, newfile, "name cleanup", date, "prohibited characters removed:" + eventDetail, fileUUID=None, transferUUID=sipUUID) else: print >> sys.stderr, "bad group type", groupType exit(3)
def sanitize_object_names(objectsDirectory, sipUUID, date, groupType, groupSQL, sipPath):
    """Sanitize names under ``objectsDirectory`` and record the renames.

    Runs ``sanitizeNames.sanitizeRecursively`` on disk, then checks every
    non-removed ``File`` of the unit and emits a "name cleanup" event through
    ``updateFileLocation`` for each file whose path, or any parent directory,
    was renamed.

    Exits the process with status 3 when a changed path is found and
    ``groupType`` is neither ``%SIPDirectory%`` nor ``%transferDirectory%``.
    """
    # e.g. "%SIPDirectory%objects/"
    relativeReplacement = objectsDirectory.replace(sipPath, groupType, 1)

    # Rename on disk first; the result maps old path -> new path.
    sanitizations = sanitizeNames.sanitizeRecursively(objectsDirectory)
    for before, after in sanitizations.items():
        logger.info('sanitizations: %s -> %s', before, after)

    eventDetail = 'program="sanitizeNames"; version="' + sanitizeNames.VERSION + '"'

    file_filter = {groupSQL: sipUUID, "removedtime__isnull": True}
    for f in File.objects.filter(**file_filter):
        normalized = unicodedata.normalize('NFC', f.currentlocation)
        current_location = unicodeToStr(normalized).replace(groupType, sipPath)
        sanitized_location = unicodeToStr(current_location)
        logger.info('Checking %s', current_location)

        # Sanitization keys look like sanitized/sanitized/unsanitized, so walk
        # up from the full path; after each hit restart from the rewritten
        # path so the next unsanitized component can be found.
        # TODO This should be checked for a more efficient solution
        candidate = sanitized_location
        while objectsDirectory in candidate:  # Stay within unit
            if candidate in sanitizations:
                sanitized_location = sanitized_location.replace(candidate, sanitizations[candidate])
                candidate = sanitized_location
            else:
                candidate = os.path.dirname(candidate)

        if current_location == sanitized_location:
            logger.info('No sanitization for %s', current_location)
            print('No sanitization found for', current_location)
            continue

        oldfile = current_location.replace(objectsDirectory, relativeReplacement, 1)
        newfile = sanitized_location.replace(objectsDirectory, relativeReplacement, 1)
        event_kwargs = {
            'src': oldfile,
            'dst': newfile,
            'eventType': 'name cleanup',
            'eventDateTime': date,
            'eventDetail': "prohibited characters removed:" + eventDetail,
            'fileUUID': None,
        }
        if groupType == "%SIPDirectory%":
            event_kwargs['sipUUID'] = sipUUID
        elif groupType == "%transferDirectory%":
            event_kwargs['transferUUID'] = sipUUID
        else:
            print("bad group type", groupType, file=sys.stderr)
            sys.exit(3)

        logger.info('Sanitized name: %s -> %s', oldfile, newfile)
        print('Sanitized name:', oldfile, " -> ", newfile)
        updateFileLocation(**event_kwargs)