def AddChecksums(report):
    """
    Fill in size and checksum information for the cmsRun output files.

    Walks report['steps']['cmsRun']['output'] and, for every file entry that
    does not already carry a 'checksums' key, stores the on-disk size under
    'size' and the adler32/cksum pair under 'checksums'.  An entry without a
    'pfn' falls back to its 'fileName'; an entry with neither is skipped.

    The report is modified in place and nothing is returned.  If any of the
    'steps' / 'cmsRun' / 'output' levels is missing the function is a no-op.
    """
    # Descend steps -> cmsRun -> output, bailing out as soon as a level is absent
    section = report
    for key in ('steps', 'cmsRun', 'output'):
        if key not in section:
            return
        section = section[key]

    for moduleFiles in section.values():
        for entry in moduleFiles:
            # Already processed, nothing to do
            if 'checksums' in entry:
                continue

            # We need a physical path to stat and checksum
            if 'pfn' not in entry:
                if 'fileName' not in entry:
                    continue
                entry['pfn'] = entry['fileName']

            entry['size'] = os.stat(entry['pfn']).st_size

            print("==== Checksum computation STARTING at %s ====" % time.asctime(time.gmtime()))
            (adler32, cksum) = calculateChecksums(entry['pfn'])
            print("==== Checksum FINISHED at %s ====" % time.asctime(time.gmtime()))

            sizeInMB = float(entry['size']) / (1024 * 1024)
            print("== FileName: %s - FileAdler32: %s - FileSize: %.3f MBytes" %
                  (entry['pfn'], adler32, sizeInMB))

            entry['checksums'] = {'adler32': adler32, 'cksum': cksum}
def processFile(self, filename, fileReport, step, outputModule):
    """
    Attach checksum, size, LFN and dataset information to a file report.

    :param filename: path of the file on disk; it is checksummed and stat'ed
    :param fileReport: report object for the file; updated in place
    :param step: step spec; the output module settings are read from
                 step.output.modules and the CMSSW version from
                 step.application.setup.cmsswVersion
    :param outputModule: name of the attribute to look up on step.output.modules
    :return: the same fileReport instance that was passed in
    :raises Exception: if the LFN has to be rebuilt from the GUID (neither
                       fixedLFN nor disableGUID is set) but the report has no GUID
    """
    # Get checksum
    (adler32, cksum) = calculateChecksums(filename)

    # Get info from spec
    output = getattr(step.output.modules, outputModule)
    disableGUID = getattr(output, 'disableGUID', False)
    fixedLFN = getattr(output, 'fixedLFN', False)
    primaryDataset = output.primaryDataset
    processedDataset = output.processedDataset
    dataTier = output.dataTier

    # Get other file information
    size = os.stat(filename).st_size

    # Get info from file
    mergedLFNBase = getattr(fileReport, 'MergedLFNBase', None)
    mergedBySize = getattr(fileReport, 'MergedBySize', False)
    lfn = fileReport.lfn

    # Do LFN manipulation
    if not fixedLFN and not disableGUID:
        # Standard case: the LFN file name has to match the GUID
        guid = getattr(fileReport, 'guid', None)
        if not guid:
            msg = "No GUID for file %s" % (lfn)
            logging.error(msg)
            raise Exception(msg)
        guidFileName = '%s.root' % (str(guid))
        fileReport.lfn = os.path.join(os.path.dirname(lfn), guidFileName)
    elif not fixedLFN and mergedBySize and mergedLFNBase:
        # Then we better do the merge stuff
        # Not tested for now
        # str.rstrip returns a new string, so the result must be kept; fall
        # back to '/' so a base made only of slashes still yields an absolute LFN
        strippedBase = mergedLFNBase.rstrip('/') or '/'
        fileReport.lfn = os.path.join(strippedBase, os.path.basename(lfn))

    # Attach values
    fileReport.checksums = {'adler32': adler32, 'cksum': cksum}
    fileReport.size = size
    fileReport.dataset = {"applicationName": "cmsRun",
                          "applicationVersion": step.application.setup.cmsswVersion,
                          "primaryDataset": primaryDataset,
                          "processedDataset": processedDataset,
                          "dataTier": dataTier}

    return fileReport
def execute(self, emulator=None):
    """
    _execute_

    Collect the job's log files into logArchive.tar.gz, hand the tarball to
    the stage-out manager, record it in the job report and finally call
    self.sendLogToEOS.

    :param emulator: optional object; when given, the result of its
                     emulate(step, job) call is returned and nothing else runs
    :return: the emulator result when emulating, the empty list of log files
             when nothing was found to archive, otherwise None
    :raises WMExecutionFailure: (60404) when the stage-out alarm expires
    :raises WMException: on any other failure inside the stage-out block
    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    logging.info("Steps.Executors.%s.execute called", self.__class__.__name__)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()
    logging.info("Using the following overrides: %s ", overrides)
    # Find alternate stageout location
    self.altLFN = overrides.get('altLFN', None)
    self.failedPreviousStep = overrides.get('previousCmsRunFailure', False)

    logging.info("Step configuration is: %s", self.step)
    # Wait timeout for stageOut
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    matchFiles = [
        ".log$",  # matches the scram, wmagent and cmsRun logs
        "FrameworkJobReport.xml",
        "Report.pkl",
        "^PSet.py$",
        "^PSet.pkl$",
        "_condor_std*",  # condor wrapper logs at the pilot top level
    ]

    ignoredDirs = ['Utils', 'WMCore', 'WMSandbox']

    # Okay, we need a stageOut Manager
    # NOTE(review): the step attribute is spelled 'newStageout' while the
    # override key is 'newStageOut' - confirm the different casing is intended
    useNewStageOutCode = bool(getattr(self.step, 'newStageout', False) or
                              overrides.get('newStageOut'))
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr.StageOutMgr(**overrides)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.info("LOGARCHIVE IS USING NEW STAGEOUT CODE")
        manager = WMCore.Storage.FileManager.StageOutMgr(retryPauseTime=self.step.retryDelay,
                                                         numberOfRetries=self.step.retryCount,
                                                         **overrides)

    # Now we need to find all the reports
    # The log search follows this structure: ~pilotArea/jobArea/WMTaskSpaceArea/StepsArea
    # Start looking at the pilot scratch area first, such that we find the condor logs
    # Then look at the job area in order to find the wmagentJob log
    # Finally, at the taskspace area to find the cmsRun/FWJR/PSet files
    pilotScratchDir = os.path.join(self.stepSpace.taskSpace.location, '../../')
    logFilesToArchive = self.findFilesInDirectory(pilotScratchDir, matchFiles, ignoredDirs)

    # What if it's empty?
    if not logFilesToArchive:
        msg = "Couldn't find any log files in the job"
        logging.error(msg)
        return logFilesToArchive

    # Now that we've gone through all the steps, we have to tar it out
    tarName = 'logArchive.tar.gz'
    tarBallLocation = os.path.join(self.stepSpace.location, tarName)
    with tarfile.open(tarBallLocation, 'w:gz') as tarBall:
        for fName in logFilesToArchive:
            # Store the files relative to the pilot scratch area
            altName = fName.replace(pilotScratchDir, '', 1)
            tarBall.add(name=fName, arcname=altName)

    fileInfo = {'LFN': self.getLFN(tarName),
                'PFN': tarBallLocation,
                'PNN': None,
                'GUID': None}

    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        try:
            manager(fileInfo)
            self.report.addOutputModule(moduleName="logArchive")
            (adler32, cksum) = calculateChecksums(tarBallLocation)
            reportFile = {"lfn": fileInfo["LFN"],
                          "pfn": fileInfo["PFN"],
                          "location": fileInfo["PNN"],
                          "module_label": "logArchive",
                          "events": 0,
                          "size": 0,
                          "merged": False,
                          "checksums": {'adler32': adler32, 'cksum': cksum}}
            self.report.addOutputFile(outputModule="logArchive", aFile=reportFile)
        finally:
            # Disarm the alarm on every exit path, so that a failure here
            # cannot leave a pending SIGALRM firing in unrelated code later on
            signal.alarm(0)
    except Alarm:
        msg = "Indefinite hang during stageOut of logArchive"
        logging.error(msg)
        self.report.addError(self.stepName, 60404, "LogArchiveTimeout", msg)
        self.saveReport()
        raise WMExecutionFailure(60404, "LogArchiveTimeout", msg)
    except WMException as ex:
        self.report.addError(self.stepName, 60307, "LogArchiveFailure", str(ex))
        self.saveReport()
        raise
    except Exception as ex:
        self.report.addError(self.stepName, 60405, "LogArchiveFailure", str(ex))
        self.saveReport()
        msg = "Failure in transferring logArchive tarball\n"
        logging.exception(msg)
        raise WMException("LogArchiveFailure", message=str(ex))

    # The EOS copy gets its own timeout window, again always disarmed
    signal.alarm(waitTime)
    try:
        self.sendLogToEOS(overrides, tarBallLocation, useNewStageOutCode)
    finally:
        signal.alarm(0)

    return
def execute(self, emulator=None, **overrides):
    """
    _execute_

    Collect the job's log files into logArchive.tar.gz, hand the tarball to
    the stage-out manager, record it in the job report and finally call
    self.sendLogToEOS.

    :param emulator: optional object; when given, the result of its
                     emulate(step, job) call is returned and nothing else runs
    :param overrides: accepted for interface compatibility only; the keyword
                      values are discarded and the overrides are taken from
                      self.step.override instead (see the TODO below)
    :return: the emulator result when emulating, the empty list of log files
             when nothing was found to archive, otherwise None
    :raises WMExecutionFailure: (60404) when the stage-out alarm expires
    :raises WMException: on any other failure inside the stage-out block
    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    # TODO need to set override using addOverride method in WMStep
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Find alternate stageout location
    self.altLFN = overrides.get('altLFN', None)

    logging.info("Beginning Steps.Executors.LogArchive.Execute")
    logging.info("Using the following overrides: %s ", overrides)
    logging.info("Step is: %s", self.step)
    # Wait timeout for stageOut
    waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount))

    matchFiles = [
        ".log$",  # matches the scram, wmagent and cmsRun logs
        "FrameworkJobReport.xml",
        "Report.pkl",
        "^PSet.py$",
        "^PSet.pkl$",
        "_condor_std*",  # condor wrapper logs at the pilot top level
    ]

    ignoredDirs = ['Utils', 'WMCore', 'WMSandbox']

    # Okay, we need a stageOut Manager
    # NOTE(review): the step attribute is spelled 'newStageout' while the
    # override key is 'newStageOut' - confirm the different casing is intended
    useNewStageOutCode = bool(getattr(self.step, 'newStageout', False) or
                              overrides.get('newStageOut'))
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr.StageOutMgr(**overrides)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.info("LOGARCHIVE IS USING NEW STAGEOUT CODE")
        manager = WMCore.Storage.FileManager.StageOutMgr(
            retryPauseTime=self.step.retryDelay,
            numberOfRetries=self.step.retryCount,
            **overrides)

    # Now we need to find all the reports
    # The log search follows this structure: ~pilotArea/jobArea/WMTaskSpaceArea/StepsArea
    # Start looking at the pilot scratch area first, such that we find the condor logs
    # Then look at the job area in order to find the wmagentJob log
    # Finally, at the taskspace area to find the cmsRun/FWJR/PSet files
    pilotScratchDir = os.path.join(self.stepSpace.taskSpace.location, '../../')
    logFilesToArchive = self.findFilesInDirectory(pilotScratchDir, matchFiles, ignoredDirs)

    # What if it's empty?
    if not logFilesToArchive:
        msg = "Couldn't find any log files in the job"
        logging.error(msg)
        return logFilesToArchive

    # Now that we've gone through all the steps, we have to tar it out
    tarName = 'logArchive.tar.gz'
    tarBallLocation = os.path.join(self.stepSpace.location, tarName)
    # The context manager closes the tarball even if adding a file fails
    with tarfile.open(tarBallLocation, 'w:gz') as tarBall:
        for fName in logFilesToArchive:
            # Store the files relative to the task space location
            arcName = fName.replace(self.stepSpace.taskSpace.location, '', 1).lstrip('/')
            tarBall.add(name=fName, arcname=arcName)

    fileInfo = {'LFN': self.getLFN(tarName),
                'PFN': tarBallLocation,
                'PNN': None,
                'GUID': None}

    signal.signal(signal.SIGALRM, alarmHandler)
    signal.alarm(waitTime)
    try:
        try:
            manager(fileInfo)
            self.report.addOutputModule(moduleName="logArchive")
            (adler32, cksum) = calculateChecksums(tarBallLocation)
            reportFile = {"lfn": fileInfo["LFN"],
                          "pfn": fileInfo["PFN"],
                          "location": fileInfo["PNN"],
                          "module_label": "logArchive",
                          "events": 0,
                          "size": 0,
                          "merged": False,
                          "checksums": {'adler32': adler32, 'cksum': cksum}}
            self.report.addOutputFile(outputModule="logArchive", aFile=reportFile)
        finally:
            # Disarm the alarm on every exit path, so that a failure here
            # cannot leave a pending SIGALRM firing in unrelated code later on
            signal.alarm(0)
    except Alarm:
        msg = "Indefinite hang during stageOut of logArchive"
        logging.error(msg)
        self.report.addError(self.stepName, 60404, "LogArchiveTimeout", msg)
        self.saveReport()
        raise WMExecutionFailure(60404, "LogArchiveTimeout", msg)
    except WMException as ex:
        self.report.addError(self.stepName, 60307, "LogArchiveFailure", str(ex))
        self.saveReport()
        raise
    except Exception as ex:
        self.report.addError(self.stepName, 60405, "LogArchiveFailure", str(ex))
        self.saveReport()
        msg = "Failure in transferring logArchive tarball\n"
        logging.exception(msg)
        raise WMException("LogArchiveFailure", message=str(ex))

    # The EOS copy gets its own timeout window, again always disarmed
    signal.alarm(waitTime)
    try:
        self.sendLogToEOS(overrides, tarBallLocation, useNewStageOutCode)
    finally:
        signal.alarm(0)

    return