def addFileToTransfer(
    filePathRelativeToSIP,
    fileUUID,
    transferUUID,
    taskUUID,
    date,
    sourceType="ingestion",
    eventDetail="",
    use="original",
    originalLocation=None,
):
    if not originalLocation:
        originalLocation = filePathRelativeToSIP
    file_obj = insertIntoFiles(
        fileUUID,
        filePathRelativeToSIP,
        date,
        transferUUID=transferUUID,
        use=use,
        originalLocation=originalLocation,
    )
    insertIntoEvents(
        fileUUID=fileUUID,
        eventType=sourceType,
        eventDateTime=date,
        eventDetail=eventDetail,
        eventOutcome="",
        eventOutcomeDetailNote="",
    )
    addAccessionEvent(fileUUID, transferUUID, date)
    return file_obj

def insert_derivation_event(
    original_uuid,
    output_uuid,
    derivation_uuid,
    event_detail_output,
    outcome_detail_note,
    today=None,
):
    """Add the derivation link for preservation files and the event."""
    if today is None:
        today = timezone.now()
    # Add event information to current file
    databaseFunctions.insertIntoEvents(
        fileUUID=original_uuid,
        eventIdentifierUUID=derivation_uuid,
        eventType="normalization",
        eventDateTime=today,
        eventDetail=event_detail_output,
        eventOutcome="",
        eventOutcomeDetailNote=outcome_detail_note or "",
    )
    # Add linking information between files
    databaseFunctions.insertIntoDerivations(
        sourceFileUUID=original_uuid,
        derivedFileUUID=output_uuid,
        relatedEventUUID=derivation_uuid,
    )

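# Illustrative usage sketch (not part of the source): how insert_derivation_event()
# might be called after normalization has produced a preservation copy. The UUIDs,
# tool string, and output path below are hypothetical placeholders; the function and
# its dependencies are assumed to be importable from the surrounding module context.
import uuid

original_uuid = "11111111-1111-1111-1111-111111111111"  # hypothetical original File UUID
output_uuid = "22222222-2222-2222-2222-222222222222"  # hypothetical derived File UUID
derivation_uuid = str(uuid.uuid4())  # fresh PREMIS event identifier

insert_derivation_event(
    original_uuid=original_uuid,
    output_uuid=output_uuid,
    derivation_uuid=derivation_uuid,
    event_detail_output='program="ffmpeg"; version="4.4"',  # hypothetical eventDetail text
    outcome_detail_note="objects/example-22222222.mkv",  # hypothetical derived file path
)
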
def test_insert_into_events(self):
    assert (
        Event.objects.filter(
            event_id="6a671050-81ec-11ea-b337-8f27e380aa54"
        ).count()
        == 0
    )
    databaseFunctions.insertIntoEvents(
        fileUUID="88c8f115-80bc-4da4-a1e6-0158f5df13b9",
        eventIdentifierUUID="6a671050-81ec-11ea-b337-8f27e380aa54",
    )
    assert (
        Event.objects.filter(
            event_id="6a671050-81ec-11ea-b337-8f27e380aa54"
        ).count()
        == 1
    )

def test_insert_into_event_fetches_correct_agent_from_file(self):
    databaseFunctions.insertIntoEvents(
        fileUUID="88c8f115-80bc-4da4-a1e6-0158f5df13b9",
        eventIdentifierUUID="00e46dbc-81ec-11ea-bf23-eb8a0da7ab13",
    )
    agents = Event.objects.get(
        event_id="00e46dbc-81ec-11ea-bf23-eb8a0da7ab13"
    ).agents
    assert agents.count() == 2
    assert agents.get(id=1)
    assert agents.get(id=2)

def addAccessionEvent(fileUUID, transferUUID, date):
    transfer = Transfer.objects.get(uuid=transferUUID)
    if transfer.accessionid:
        eventOutcomeDetailNote = f"accession#{transfer.accessionid}"
        insertIntoEvents(
            fileUUID=fileUUID,
            eventType="registration",
            eventDateTime=date,
            eventDetail="",
            eventOutcome="",
            eventOutcomeDetailNote=eventOutcomeDetailNote,
        )

def call(jobs):
    event_queue = []

    for job in jobs:
        with job.JobContext(logger=logger):
            if not mcpclient_settings.VIRUS_SCANNING_ENABLED:
                job.set_status(0)
                continue
            job.set_status(scan_file(event_queue, *job.args[1:]))

    with transaction.atomic():
        for e in event_queue:
            insertIntoEvents(**e)

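# Illustrative sketch (an assumption, not from the source): scan_file() is expected to
# append keyword-argument dicts for insertIntoEvents() to event_queue, so that all
# events are written in a single transaction once every job has been scanned. One
# queued entry might look roughly like this; the UUID and tool details are hypothetical.
example_event = {
    "fileUUID": "88c8f115-80bc-4da4-a1e6-0158f5df13b9",  # hypothetical file UUID
    "eventType": "virus check",
    "eventDetail": 'program="ClamAV"; version="0.103"',  # hypothetical tool info
    "eventOutcome": "Pass",
    "eventOutcomeDetailNote": "",
}
# event_queue.append(example_event)  # later flushed via insertIntoEvents(**example_event)
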
def write_premis_event(job, sip_uuid, checksum_type, event_outcome, event_outcome_detail_note):
    """Write the AIP-level "fixity check" PREMIS event."""
    try:
        databaseFunctions.insertIntoEvents(
            fileUUID=sip_uuid,
            eventType="fixity check",
            eventDetail='program="python, bag"; module="hashlib.{}()"'.format(
                checksum_type
            ),
            eventOutcome=event_outcome,
            eventOutcomeDetailNote=event_outcome_detail_note,
        )
    except Exception as err:
        job.pyprint(f"Failed to write PREMIS event to database. Error: {err}")
    else:
        return event_outcome_detail_note

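# Illustrative usage sketch (not from the source): recording the AIP-level fixity
# check after bag verification. `job` is the client-script job object; the SIP UUID
# and outcome note are hypothetical example values.
write_premis_event(
    job,
    sip_uuid="d87d5845-bd07-4200-b1a4-928e0cb6e1e4",  # hypothetical SIP UUID
    checksum_type="sha256",
    event_outcome="Pass",
    event_outcome_detail_note="All checksums verified successfully",  # example note
)
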
def insert_transcription_event(status, file_uuid, rule, relative_location):
    outcome = "transcribed" if status == 0 else "not transcribed"
    tool = rule.command.tool
    event_detail = 'program={}; version={}; command="{}"'.format(
        tool.description, tool.version, rule.command.command.replace('"', r"\"")
    )
    event_uuid = str(uuid4())
    databaseFunctions.insertIntoEvents(
        fileUUID=file_uuid,
        eventIdentifierUUID=event_uuid,
        eventType="transcription",
        eventDetail=event_detail,
        eventOutcome=outcome,
        eventOutcomeDetailNote=relative_location,
    )
    return event_uuid

def addFileToSIP(
    filePathRelativeToSIP,
    fileUUID,
    sipUUID,
    taskUUID,
    date,
    sourceType="ingestion",
    use="original",
):
    insertIntoFiles(fileUUID, filePathRelativeToSIP, date, sipUUID=sipUUID, use=use)
    insertIntoEvents(
        fileUUID=fileUUID,
        eventType=sourceType,
        eventDateTime=date,
        eventDetail="",
        eventOutcome="",
        eventOutcomeDetailNote="",
    )

def write_identification_event(file_uuid, puid=None, success=True):
    event_detail_text = 'program="{}"; version="{}"'.format(
        TOOL_DESCRIPTION, TOOL_VERSION
    )
    if success:
        event_outcome_text = "Positive"
    else:
        event_outcome_text = "Not identified"

    if not puid or puid == "UNKNOWN":
        puid = "No Matching Format"

    date = getUTCDate()
    insertIntoEvents(
        fileUUID=file_uuid,
        eventIdentifierUUID=str(uuid.uuid4()),
        eventType="format identification",
        eventDateTime=date,
        eventDetail=event_detail_text,
        eventOutcome=event_outcome_text,
        eventOutcomeDetailNote=puid,
    )

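# Illustrative sketch (not from the source) of the two outcomes this function records;
# the file UUID and PUID are hypothetical example values.
write_identification_event(
    "88c8f115-80bc-4da4-a1e6-0158f5df13b9", puid="fmt/43", success=True
)  # eventOutcome="Positive", eventOutcomeDetailNote="fmt/43"
write_identification_event(
    "88c8f115-80bc-4da4-a1e6-0158f5df13b9", success=False
)  # eventOutcome="Not identified", eventOutcomeDetailNote="No Matching Format"
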
def updateSizeAndChecksum(
    fileUUID,
    filePath,
    date,
    eventIdentifierUUID,
    fileSize=None,
    checksum=None,
    checksumType=None,
    add_event=True,
):
    """
    Update a File with its size, checksum and checksum type. These are
    parameters that can be either generated or provided via keywords.

    Finally, insert the corresponding Event. This behavior can be cancelled
    using the boolean keyword 'add_event'.
    """
    if not fileSize:
        fileSize = os.path.getsize(filePath)
    if not checksumType:
        checksumType = django_settings.DEFAULT_CHECKSUM_ALGORITHM
    if not checksum:
        checksum = get_file_checksum(filePath, checksumType)

    File.objects.filter(uuid=fileUUID).update(
        size=fileSize, checksum=checksum, checksumtype=checksumType
    )

    if add_event:
        insertIntoEvents(
            fileUUID=fileUUID,
            eventType="message digest calculation",
            eventDateTime=date,
            eventDetail=f'program="python"; module="hashlib.{checksumType}()"',
            eventOutcomeDetailNote=checksum,
        )

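# Illustrative usage sketch (not from the source): recording size and checksum for a
# newly added file. The UUID and path are hypothetical; omitting fileSize/checksum/
# checksumType lets the function compute them from the file on disk and settings.
import uuid

from django.utils import timezone

updateSizeAndChecksum(
    fileUUID="88c8f115-80bc-4da4-a1e6-0158f5df13b9",  # hypothetical file UUID
    filePath="/var/archivematica/sharedDirectory/objects/example.tif",  # hypothetical path
    date=timezone.now(),
    eventIdentifierUUID=str(uuid.uuid4()),
    add_event=True,  # also writes the "message digest calculation" event
)
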
def compress_aip(
    job, compression, compression_level, sip_directory, sip_name, sip_uuid
):
    """Compresses AIP according to compression algorithm and level.

    compression = AIP compression algorithm, format: <program>-<algorithm>,
        eg. 7z-lzma, pbzip2-
    compression_level = AIP compression level, integer between 1 and 9 inclusive
    sip_directory = Absolute path to the directory where the SIP is
    sip_name = User-provided name of the SIP
    sip_uuid = SIP UUID

    Example inputs:
    compressAIP.py
        7z-lzma
        5
        %sharedDirectory%/watchedDirectories/workFlowDecisions/compressionAIPDecisions/ep-d87d5845-bd07-4200-b1a4-928e0cb6e1e4/
        ep
        d87d5845-bd07-4200-b1a4-928e0cb6e1e4
    """
    if compression_level == "0":
        compression_level = "1"

    # Default is uncompressed.
    compression = int(compression)
    ProcessingConfig.AIPCompressionAlgorithm.Name(compression)
    if compression == ProcessingConfig.AIP_COMPRESSION_ALGORITHM_UNSPECIFIED:
        compression = ProcessingConfig.AIP_COMPRESSION_ALGORITHM_UNCOMPRESSED

    # Translation to make compress_aip happy.
    mapping = {
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_UNCOMPRESSED: ("None", ""),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_TAR: (
            "gzip",
            "tar.gzip",
        ),  # A3M-TODO: support
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_TAR_BZIP2: ("pbzip2", "pbzip2"),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_TAR_GZIP: ("gzip", "tar.gzip"),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_S7_COPY: ("7z", "copy"),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_S7_BZIP2: ("7z", "bzip2"),
        ProcessingConfig.AIP_COMPRESSION_ALGORITHM_S7_LZMA: ("7z", "lzma"),
    }
    try:
        program, compression_algorithm = mapping[compression]
    except KeyError:
        msg = f"Invalid program-compression algorithm: {compression}"
        job.pyprint(msg, file=sys.stderr)
        return 255

    archive_path = f"{sip_name}-{sip_uuid}"
    uncompressed_location = sip_directory + archive_path

    # Even though no actual compression is taking place,
    # the location still needs to be set in the unit to ensure that the
    # %AIPFilename% variable is set appropriately.
    # Setting it to an empty string ensures the common
    # "%SIPDirectory%%AIPFilename%" pattern still points at the right thing.
if program == "None": update_unit(sip_uuid, uncompressed_location) return 0 job.pyprint("Compressing {} with {}, algorithm {}, level {}".format( uncompressed_location, program, compression_algorithm, compression_level)) if program == "7z": compressed_location = uncompressed_location + ".7z" command = '/usr/bin/7z a -bd -t7z -y -m0={algorithm} -mx={level} -mta=on -mtc=on -mtm=on -mmt=on "{compressed_location}" "{uncompressed_location}"'.format( algorithm=compression_algorithm, level=compression_level, uncompressed_location=uncompressed_location, compressed_location=compressed_location, ) tool_info_command = ( r'echo program="7z"\; ' r'algorithm="{}"\; ' 'version="`7z | grep Version`"'.format(compression_algorithm)) elif program == "pbzip2": compressed_location = uncompressed_location + ".tar.bz2" command = '/bin/tar -c --directory "{sip_directory}" "{archive_path}" | /usr/bin/pbzip2 --compress -{level} > "{compressed_location}"'.format( sip_directory=sip_directory, archive_path=archive_path, level=compression_level, compressed_location=compressed_location, ) tool_info_command = ( r'echo program="pbzip2"\; ' r'algorithm="{}"\; ' 'version="$((pbzip2 -V) 2>&1)"'.format(compression_algorithm)) elif program == "gzip": compressed_location = uncompressed_location + ".tar.gz" command = '/bin/tar -c --directory "{sip_directory}" "{archive_path}" | /bin/gzip -{level} > "{compressed_location}"'.format( sip_directory=sip_directory, archive_path=archive_path, level=compression_level, compressed_location=compressed_location, ) tool_info_command = ( r'echo program="gzip"\; ' r'algorithm="{}"\; ' 'version="$((gzip -V) 2>&1)"'.format(compression_algorithm)) else: msg = f"Program {program} not recognized, exiting script prematurely." job.pyprint(msg, file=sys.stderr) return 255 job.pyprint("Executing command:", command) exit_code, std_out, std_err = executeOrRun("bashScript", command, capture_output=True) job.write_output(std_out) job.write_error(std_err) # Add new AIP File file_uuid = sip_uuid databaseFunctions.insertIntoFiles( fileUUID=file_uuid, filePath=compressed_location.replace(sip_directory, "%SIPDirectory%", 1), sipUUID=sip_uuid, use="aip", ) # Add compression event job.pyprint("Tool info command:", tool_info_command) _, tool_info, tool_info_err = executeOrRun("bashScript", tool_info_command, capture_output=True) job.write_output(tool_info) job.write_error(tool_info_err) tool_output = f'Standard Output="{std_out}"; Standard Error="{std_err}"' databaseFunctions.insertIntoEvents( eventType="compression", eventDetail=tool_info, eventOutcomeDetailNote=tool_output, fileUUID=file_uuid, ) update_unit(sip_uuid, compressed_location) return exit_code
def _execute_rule_command(self, rule):
    """Execute the FPR command of FPR rule ``rule`` against the file passed
    in to this client script. The output of that command determines what we
    print to stdout and stderr, and the nature of the validation event that
    we save to the db. We also copy the MediaConch policy file to the logs/
    directory of the AIP if it has not already been copied there.
    """
    result = "passed"
    command_to_execute, args = self._get_command_to_execute(rule)
    self.job.pyprint("Running", rule.command.description)
    exitstatus, stdout, stderr = executeOrRun(
        rule.command.script_type,
        command_to_execute,
        arguments=args,
        printing=False,
        capture_output=True,
    )
    try:
        output = json.loads(stdout)
    except ValueError:
        logger.exception(
            "Unable to load an object from the malformed JSON: \n%s", stderr
        )
        raise
    if self.file_type in ("preservation", "original"):
        self._save_to_logs_dir(output)
    if exitstatus == 0:
        self.job.pyprint(
            "Command {} completed with output {}".format(
                rule.command.description, stdout
            )
        )
    else:
        self.job.print_error(
            "Command {} failed with exit status {}; stderr:".format(
                rule.command.description, exitstatus
            ),
            stderr,
        )
        return "failed"
    event_detail = (
        'program="{tool.description}";'
        ' version="{tool.version}"'.format(tool=rule.command.tool)
    )
    if output.get("eventOutcomeInformation") != "pass":
        self.job.print_error(
            "Command {descr} returned a non-pass outcome "
            "for the policy check;\n\noutcome: "
            "{outcome}\n\ndetails: {details}.".format(
                descr=rule.command.description,
                outcome=output.get("eventOutcomeInformation"),
                details=output.get("eventOutcomeDetailNote"),
            )
        )
        result = "failed"
    self.job.pyprint(
        "Creating policy checking event for {} ({})".format(
            self.file_path, self.file_uuid
        )
    )
    # Manually-normalized access derivatives have no file UUID so we can't
    # create a validation event for them. TODO/QUESTION: should we use the
    # UUID that was assigned to the manually normalized derivative during
    # transfer, i.e., the one that we retrieve in
    # ``_get_manually_normalized_access_derivative_file_uuid`` above?
    if not self.is_manually_normalized_access_derivative:
        databaseFunctions.insertIntoEvents(
            fileUUID=self.file_uuid,
            eventType="validation",  # From PREMIS controlled vocab.
            eventDetail=event_detail,
            eventOutcome=output.get("eventOutcomeInformation"),
            eventOutcomeDetailNote=output.get("eventOutcomeDetailNote"),
        )
    return result

def main(job):
    # "%SIPUUID%" "%SIPName%" "%SIPDirectory%" "%fileUUID%" "%filePath%"
    # job.args[2] (SIPName) is unused.
    SIPUUID = job.args[1]
    SIPDirectory = job.args[3]
    fileUUID = job.args[4]
    filePath = job.args[5]
    date = job.args[6]

    # Search for original file associated with preservation file given in filePath
    filePathLike = filePath.replace(
        os.path.join(SIPDirectory, "objects", "manualNormalization", "preservation"),
        "%SIPDirectory%objects",
        1,
    )
    i = filePathLike.rfind(".")
    k = os.path.basename(filePath).rfind(".")
    if i != -1 and k != -1:
        filePathLike = filePathLike[:i + 1]
        # Matches "path/to/file/filename." Includes . so it doesn't false match
        # foobar.txt when we wanted foo.txt
        filePathLike1 = filePathLike
        # Matches the exact filename. For files with no extension.
        filePathLike2 = filePathLike[:-1]

    try:
        path_condition = Q(currentlocation__startswith=filePathLike1) | Q(
            currentlocation=filePathLike2
        )
        original_file = File.objects.get(
            path_condition,
            removedtime__isnull=True,
            filegrpuse="original",
            sip_id=SIPUUID,
        )
    except (File.DoesNotExist, File.MultipleObjectsReturned) as e:
        # Original file was not found, or there is more than one original file
        # with the same filename (differing extensions)
        # Look for a CSV that will specify the mapping
        csv_path = os.path.join(
            SIPDirectory, "objects", "manualNormalization", "normalization.csv"
        )
        if os.path.isfile(csv_path):
            try:
                preservation_file = filePath[
                    filePath.index("manualNormalization/preservation/"):
                ]
            except ValueError:
                job.print_error(f"{filePath} not in manualNormalization directory")
                return 4
            original = fileOperations.findFileInNormalizationCSV(
                csv_path,
                "preservation",
                preservation_file,
                SIPUUID,
                printfn=job.pyprint,
            )
            if original is None:
                if isinstance(e, File.DoesNotExist):
                    job.print_error(
                        "No matching file for: {}".format(
                            filePath.replace(SIPDirectory, "%SIPDirectory%")
                        )
                    )
                    return 3
                else:
                    job.print_error(
                        "Could not find {preservation_file} in {filename}".format(
                            preservation_file=preservation_file, filename=csv_path
                        )
                    )
                    return 2
            # If we found the original file, retrieve it from the DB
            original_file = File.objects.get(
                removedtime__isnull=True,
                filegrpuse="original",
                originallocation__endswith=original,
                sip_id=SIPUUID,
            )
        else:
            if isinstance(e, File.DoesNotExist):
                job.print_error(
                    "No matching file for: ",
                    filePath.replace(SIPDirectory, "%SIPDirectory%", 1),
                )
                return 3
            elif isinstance(e, File.MultipleObjectsReturned):
                job.print_error(
                    "Too many possible files for: ",
                    filePath.replace(SIPDirectory, "%SIPDirectory%", 1),
                )
                return 2

    # We found the original file somewhere above
    job.print_output(
        "Matched original file %s (%s) to preservation file %s (%s)"
        % (original_file.currentlocation, original_file.uuid, filePath, fileUUID)
    )

    # Generate the new preservation path: path/to/original/filename-uuid.ext
    basename = os.path.basename(filePath)
    i = basename.rfind(".")
    dstFile = basename[:i] + "-" + fileUUID + basename[i:]
    dstDir = os.path.dirname(
        original_file.currentlocation.replace("%SIPDirectory%", SIPDirectory, 1)
    )
    dst = os.path.join(dstDir, dstFile)
    dstR = dst.replace(SIPDirectory, "%SIPDirectory%", 1)

    if os.path.exists(dst):
        job.print_error("already exists:", dstR)
        return 2

    # Rename the preservation file
    job.print_output("Renaming preservation file", filePath, "to", dst)
    os.rename(filePath, dst)
    # Update the preservation file's location
    File.objects.filter(uuid=fileUUID).update(currentlocation=dstR)

    try:
        # Normalization event already exists, so just update it
        # fileUUID, eventIdentifierUUID, eventType, eventDateTime, eventDetail
        # probably already correct, and we only set eventOutcomeDetailNote here
        # Not using .filter().update() because that doesn't generate an exception
        event = Event.objects.get(event_type="normalization", file_uuid=original_file)
        event.event_outcome_detail = dstR
        event.save()
        job.print_output(
            "Updated the eventOutcomeDetailNote of an existing normalization"
            " Event for file {}. Not creating a Derivation object".format(fileUUID)
        )
    except Event.DoesNotExist:
        # No normalization event was created in normalize.py - probably manually
        # normalized during Ingest
        derivationEventUUID = str(uuid.uuid4())
        databaseFunctions.insertIntoEvents(
            fileUUID=original_file.uuid,
            eventIdentifierUUID=derivationEventUUID,
            eventType="normalization",
            eventDateTime=date,
            eventDetail="manual normalization",
            eventOutcome="",
            eventOutcomeDetailNote=dstR,
        )
        job.print_output(
            "Created a manual normalization Event for file {}.".format(
                original_file.uuid
            )
        )

        # Add linking information between files
        # Assuming that if an event already exists, then the derivation does as well
        databaseFunctions.insertIntoDerivations(
            sourceFileUUID=original_file.uuid,
            derivedFileUUID=fileUUID,
            relatedEventUUID=derivationEventUUID,
        )
        job.print_output(
            "Created a Derivation for original file {}, derived file {}, and"
            " event {}".format(original_file.uuid, fileUUID, derivationEventUUID)
        )

    return 0

def _execute_rule_command(self, rule):
    """Run the command against the file and return either 'passed' or 'failed'.

    If the command errors or determines that the file is invalid, return
    'failed'. Non-errors will result in the creation of an Event model in the
    db. Preservation derivative validation will result in the stdout from the
    command being saved to disk within the unit (i.e., SIP).
    """
    result = "passed"
    if rule.command.script_type in ("bashScript", "command"):
        command_to_execute = replace_string_values(
            rule.command.command,
            file_=self.file_uuid,
            sip=self.sip_uuid,
            type_="file",
        )
        args = []
    else:
        command_to_execute = rule.command.command
        args = [self.file_path]
    self.job.print_output("Running", rule.command.description)
    exitstatus, stdout, stderr = executeOrRun(
        type=rule.command.script_type,
        text=command_to_execute,
        printing=False,
        arguments=args,
    )
    if exitstatus != 0:
        self.job.print_error(
            "Command {description} failed with exit status {status};"
            " stderr:".format(description=rule.command.description, status=exitstatus)
        )
        return "failed"
    # Parse output and generate an Event
    # TODO: Evaluating a python string from a user-definable script seems
    # insecure practice; should be JSON.
    output = ast.literal_eval(stdout)
    event_detail = (
        'program="{tool.description}";'
        ' version="{tool.version}"'.format(tool=rule.command.tool)
    )
    # If the FPR command has not errored but the actual validation
    # determined that the file is not valid, then we want to both create a
    # validation event in the db and set ``failed`` to ``True`` because we
    # want the micro-service in the dashboard GUI to indicate "Failed".
    # NOTE: this requires that the stdout of all validation FPR commands be
    # a dict (preferably a JSON object) with an ``eventOutcomeInformation``
    # boolean attribute.
    if output.get("eventOutcomeInformation") == "pass":
        self.job.print_output(
            f'Command "{rule.command.description}" was successful'
        )
    elif output.get("eventOutcomeInformation") == "partial pass":
        self.job.print_output(
            f'Command "{rule.command.description}" was partially successful'
        )
    else:
        self.job.pyprint(
            "Command {cmd_description} indicated failure with this"
            " output:\n\n{output}".format(
                cmd_description=rule.command.description, output=pformat(stdout)
            ),
            file=sys.stderr,
        )
        result = "failed"
    if self.file_type == "preservation":
        self._save_stdout_to_logs_dir(output)
    self.job.print_output(
        "Creating {purpose} event for {file_path} ({file_uuid})".format(
            purpose=self.purpose, file_path=self.file_path, file_uuid=self.file_uuid
        )
    )
    databaseFunctions.insertIntoEvents(
        fileUUID=self.file_uuid,
        eventType="validation",  # From PREMIS controlled vocab.
        eventDetail=event_detail,
        eventOutcome=output.get("eventOutcomeInformation"),
        eventOutcomeDetailNote=output.get("eventOutcomeDetailNote"),
    )
    return result