def pre(self, emulator=None):
    """
    _pre_

    Pre execution checks.

    :param emulator: optional emulator; when given, pre-execution is
        delegated to ``emulator.emulatePre(self.step)`` and its result
        returned.
    :returns: the emulator result, or None when running for real.
    """
    # PEP 8: compare against None with identity, not equality
    if emulator is not None:
        return emulator.emulatePre(self.step)
    logging.info("Pre-executing CMSSW step")
    if hasattr(self.step.application.configuration, 'configCacheUrl'):
        # means we have a configuration & tweak in the sandbox
        psetFile = self.step.application.command.configuration
        psetTweak = self.step.application.command.psetTweak
        self.stepSpace.getFromSandbox(psetFile)
        if psetTweak:
            self.stepSpace.getFromSandbox(psetTweak)

    if hasattr(self.step, "pileup"):
        self.stepSpace.getFromSandbox("pileupconf.json")

    # if step is multicore enabled, add the multicore setup
    # that generates the input file manifest and per process splitting
    stepHelper = WMStepHelper(self.step)
    typeHelper = stepHelper.getTypeHelper()
    if typeHelper.multicoreEnabled():
        self.step.runtime.scramPreScripts.append("SetupCMSSWMulticore")

    # add in the scram env PSet manip script whatever happens
    self.step.runtime.scramPreScripts.append("SetupCMSSWPset")
    return None
def pre(self, emulator=None):
    """
    _pre_

    Pre execution checks.

    :param emulator: optional emulator; when given, pre-execution is
        delegated to it and its result returned.
    :returns: the emulator result, or None when running for real.
    """
    # PEP 8: identity check against None, and no spaces around kwarg '='
    if emulator is not None:
        return emulator.emulatePre(self.step)
    logging.info("Pre-executing CMSSW step")
    if hasattr(self.step.application.configuration, 'configCacheUrl'):
        # means we have a configuration & tweak in the sandbox
        psetFile = self.step.application.command.configuration
        psetTweak = self.step.application.command.psetTweak
        self.stepSpace.getFromSandbox(psetFile)
        if psetTweak:
            self.stepSpace.getFromSandbox(psetTweak)

    if hasattr(self.step, "pileup"):
        self.stepSpace.getFromSandbox("pileupconf.json")

    # if step is multicore enabled, add the multicore setup
    # that generates the input file manifest and per process splitting
    stepHelper = WMStepHelper(self.step)
    typeHelper = stepHelper.getTypeHelper()
    if typeHelper.multicoreEnabled():
        self.step.runtime.scramPreScripts.append("SetupCMSSWMulticore")

    # add in the scram env PSet manip script whatever happens
    self.step.runtime.scramPreScripts.append("SetupCMSSWPset")
    return None
def emulateAll(self):
    """
    _emulateAll_

    Traverse all Steps and load up the default Emulator based on type.
    """
    for node in self.task.steps().nodeIterator():
        nodeHelper = WMStepHelper(node)
        emulatorType = nodeHelper.stepType()
        emulatorName = nodeHelper.name()
        self.addEmulator(emulatorName, emulatorType)
def setStep(self, wmStep):
    """set topStep to be the step instance provided"""
    # Accept either a raw WMStep or an already-wrapped WMStepHelper
    if isinstance(wmStep, WMStepHelper):
        stepHelper = wmStep
        stepData = wmStep.data
    else:
        stepHelper = WMStepHelper(wmStep)
        stepData = wmStep

    stepName = stepHelper.name()
    stepHelper.setTopOfTree()
    setattr(self.data.steps, stepName, stepData)
    setattr(self.data.steps, "topStepName", stepName)
    return
def execute(self):
    """
    _emulate_

    Record the emulated execution status and build the fake report.
    """
    self.step.section_("execution")
    self.step.execution.exitStatus = 0
    self.step.emulator.emulatedBy = str(self.__class__.__name__)
    cmsswStep = WMStepHelper(self.step).getTypeHelper()
    makeReport = ReportEmu(Job=self.job, WMStep=cmsswStep)
    self.executor.report = makeReport()
def execute(self):
    """
    _emulate_

    Mark the step as emulated with success status and attach a
    generated report to the executor.
    """
    self.step.section_("execution")
    self.step.execution.exitStatus = 0
    self.step.emulator.emulatedBy = str(self.__class__.__name__)
    stepHelper = WMStepHelper(self.step)
    reportMaker = ReportEmu(Job=self.job, WMStep=stepHelper.getTypeHelper())
    self.executor.report = reportMaker()
def pre(self, emulator=None):
    """
    _pre_

    Pre execution checks.

    :param emulator: optional emulator; when given, pre-execution is
        delegated to it and its result returned.
    :returns: the emulator result, or None when running for real.
    """
    # PEP 8: identity check against None
    if emulator is not None:
        return emulator.emulatePre(self.step)
    logging.info("Pre-executing CMSSW step")
    if hasattr(self.step.application.configuration, 'configCacheUrl'):
        # means we have a configuration & tweak in the sandbox
        psetFile = self.step.application.command.configuration
        psetTweak = self.step.application.command.psetTweak
        self.stepSpace.getFromSandbox(psetFile)
        if psetTweak:
            self.stepSpace.getFromSandbox(psetTweak)

    if hasattr(self.step, "pileup"):
        self.stepSpace.getFromSandbox("pileupconf.json")

    stepHelper = WMStepHelper(self.step)

    # add in the scram env PSet manip script whatever happens
    self.step.runtime.scramPreScripts.append("SetupCMSSWPset")
    return None
def stepStart(self, step):
    """
    _stepStart_

    Fill with the step-based information. If it is the first step, report
    that the job started its execution.
    """
    helper = WMStepHelper(step)
    self.stepCount += 1

    data = None
    if not self.jobStarted:
        #It's the first step so let's send the exe that started and where
        #That's what they request
        data = {'MessageType': 'jobRuntime-jobStart',
                'MessageTS': time.strftime(self.tsFormat, time.gmtime()),
                'taskId': self.taskName,
                'jobId': self.jobName,
                'GridName': getUserProxyDN(),
                'ExeStart': helper.name(),
                'SyncCE': getSyncCE(),
                'WNHostName': socket.gethostname()}
        self.publish(data=data)
        self.jobStarted = True

    #Now let's send the step information
    tmp = {'jobStart': data}
    data = {'MessageType': 'jobRuntime-stepStart',
            'MessageTS': time.strftime(self.tsFormat, time.gmtime()),
            'taskId': self.taskName,
            'jobId': self.jobName,
            '%d_stepName' % self.stepCount: helper.name(),
            '%d_ExeStart' % self.stepCount: helper.name()}
    self.publish(data=data)
    data.update(tmp)
    return data
def stepStart(self, step):
    """
    _stepStart_

    Fill with the step-based information. If it is the first step, report
    that the job started its execution.
    """
    helper = WMStepHelper(step)
    self.stepCount += 1
    timestamp = time.strftime(self.tsFormat, time.gmtime())

    data = None
    if not self.jobStarted:
        # First step: announce the job start, the executable and the host
        data = {}
        data['MessageType'] = 'jobRuntime-jobStart'
        data['MessageTS'] = timestamp
        data['taskId'] = self.taskName
        data['jobId'] = self.jobName
        data['GridName'] = getUserProxyDN()
        data['ExeStart'] = helper.name()
        data['SyncCE'] = getSyncCE()
        data['WNHostName'] = socket.gethostname()
        self.publish(data=data)
        self.jobStarted = True

    # Follow up with the per-step record
    tmp = {'jobStart': data}
    data = {}
    data['MessageType'] = 'jobRuntime-stepStart'
    data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['%d_stepName' % self.stepCount] = helper.name()
    data['%d_ExeStart' % self.stepCount] = helper.name()
    self.publish(data=data)

    data.update(tmp)
    return data
def stepEnd(self, step, stepReport):
    """
    _stepEnd_

    Fill with step-ending information
    """
    helper = WMStepHelper(step)
    stepSuccess = stepReport.getStepExitCode(stepName = helper.name())
    # First non-zero step exit code becomes the job exit code
    if self.jobSuccess == 0:
        self.jobSuccess = int(stepSuccess)
    if int(stepSuccess) != 0:
        self.failedStep = helper.name()

    data = {}
    data['MessageType'] = 'jobRuntime-stepEnd'
    data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['ExeEnd'] = helper.name()
    data['ExeExitCode'] = stepReport.getStepExitCode(
        stepName = helper.name())
    if helper.name() == 'StageOut':
        data['StageOutExitStatus'] = int(stepReport.stepSuccessful(
            stepName = helper.name()))

    times = stepReport.getTimes(stepName = helper.name())
    data['ExeWCTime'] = times['stopTime'] - times['startTime']

    step = stepReport.retrieveStep(step = helper.name())
    if hasattr(step, 'performance'):
        if hasattr(step.performance, 'cpu'):
            data['ExeCPUTime'] = getattr(step.performance.cpu,
                                         'TotalJobCPU', 0)
            # NOTE(review): CPU time is only accumulated when the report
            # carries performance.cpu; 'ExeCPUTime' is absent otherwise
            self.WrapperCPUTime += float(data['ExeCPUTime'])
    self.WrapperWCTime += data['ExeWCTime']
    self.lastStep = helper.name()

    self.publish(data = data)
    return data
def makeStep(self, stepName):
    """
    _makeStep_

    create a new WMStep instance, install it as the top step and
    return the reference to the new step wrapped in a StepHelper
    """
    topStep = WMStep(stepName)
    self.setStep(topStep)
    return WMStepHelper(topStep)
def stepStart(self, step):
    """
    _stepStart_

    Assure that the monitor is pointing at the right step
    """
    self.stepHelper = WMStepHelper(step)
    self.currentStepName = getStepName(step)
    self.currentStepSpace = None
    stepType = self.stepHelper.stepType()
    if stepType in self.watchStepTypes:
        logging.debug("Beginning PeformanceMonitor step Initialization")
        self.disableStep = False
    else:
        # Step type is not monitored: switch the monitor off for this step
        self.disableStep = True
        logging.debug("PerformanceMonitor ignoring step of type %s",
                      stepType)
    return
def stepStart(self, step):
    """
    _stepStart_

    Fill with the step-based information
    """
    helper = WMStepHelper(step)
    data = {'MessageType': 'jobRuntime',
            'MessageTS': time.time(),
            'taskId': self.taskName,
            'jobId': self.jobName,
            'retryCount': self.job['retry_count'],
            'ExeStart': helper.name()}
    self.publish(data=data)
    return data
def stepStart(self, step):
    """
    _stepStart_

    Assure that the monitor is pointing at the right step
    """
    stepHelper = WMStepHelper(step)
    self.currentStepName = getStepName(step)
    self.currentStepSpace = getStepSpace(stepHelper.name())
    # 'not in' rather than 'not x in'; lazy %-args so the message is only
    # rendered when DEBUG logging is actually enabled
    if stepHelper.stepType() not in self.watchStepTypes:
        self.disableStep = True
        logging.debug("PerformanceMonitor ignoring step of type %s",
                      stepHelper.stepType())
        return
    else:
        logging.debug("Beginning PeformanceMonitor step Initialization")
        self.disableStep = False
    return
def stepKilled(self, step):
    """
    _stepKilled_

    Fill with step-ending information assuming utter failure
    """
    helper = WMStepHelper(step)
    data = {'MessageType': 'jobRuntime-stepKilled',
            'MessageTS': time.strftime(self.tsFormat, time.gmtime()),
            'taskId': self.taskName,
            'jobId': self.jobName,
            'ExeEnd': helper.name()}
    self.lastStep = helper.name()
    self.publish(data=data)
    return data
def stepKilled(self, step):
    """
    _stepKilled_

    Fill with step-ending information assuming utter failure
    """
    helper = WMStepHelper(step)
    data = {'MessageType': 'jobRuntime-stepKilled',
            'MessageTS': time.strftime(self.tsFormat, time.gmtime()),
            'taskId': self.taskName,
            'jobId': self.jobName,
            '%d_ExeEnd' % self.stepCount: helper.name()}
    self.lastStep = helper.name()
    self.publish(data=data)
    return data
def stepEnd(self, step, stepReport):
    """
    _stepEnd_

    Fill with step-ending information
    """
    helper = WMStepHelper(step)
    stepExitCode = stepReport.getStepExitCode(stepName=helper.name())
    # First non-zero step exit code becomes the job exit code
    if self.jobSuccess == 0:
        self.jobSuccess = int(stepExitCode)

    data = {'MessageType': 'jobRuntime',
            'MessageTS': time.time(),
            'taskId': self.taskName,
            'jobId': self.jobName,
            'retryCount': self.job['retry_count'],
            'ExeEnd': helper.name(),
            'ExeExitCode': stepReport.getStepExitCode(stepName=helper.name())}
    if helper.name() == 'StageOut':
        data['StageOutExitStatus'] = stepReport.stepSuccessful(stepName=helper.name())
    self.publish(data=data)
    return
def toStepDirectory(self, step):
    """
    _toStepDirectory_

    Switch current working directory to the step location
    within WMTaskSpace
    """
    stepName = WMStepHelper(step).name()
    # Runtime-only import: WMTaskSpace exists only in the job sandbox
    from WMTaskSpace import taskSpace
    os.chdir(taskSpace.stepSpace(stepName).location)
def stepKilled(self, step):
    """
    _stepKilled_

    Fill with step-ending information assuming utter failure
    """
    helper = WMStepHelper(step)
    data = {'MessageType': 'jobRuntime',
            'MessageTS': time.time(),
            'taskId': self.taskName,
            'jobId': self.jobName,
            'retryCount': self.job['retry_count'],
            'ExeEnd': helper.name()}
    self.publish(data=data)
    return
#!/usr/bin/env python """ _WMRuntimeMonitor_ This is the base class for monitors """ import threading import os from WMCore.WMException import WMException from WMCore.WMSpec.Steps.Executor import getStepSpace from WMCore.WMSpec.WMStep import WMStepHelper getStepName = lambda step: WMStepHelper(step).name() class WMRuntimeMonitorException(WMException): """ _StepFactortyException_ It's like an exception class that does nothing """ pass class WMRuntimeMonitor: def __init__(self): self.currentStep = None self.currentStepName = None
class ExecuteMaster:
    """
    _ExecuteMaster_

    Traverse the given task and invoke the execute framework
    If an emulator is provided, then invoke the appropriate emulator
    instead of the executor
    """

    def __init__(self):
        pass

    def __call__(self, task, wmbsJob):
        """
        _operator(task)_

        Load and run executors for all steps in Task, if an emulator is
        available for that step, use it instead.
        """
        # Bug fix: currentThread must be *called*; without the parentheses
        # myThread is the function object and every watchdogMonitor access
        # below would raise AttributeError.
        myThread = threading.currentThread()
        try:
            myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
            myThread.watchdogMonitor.notifyJobStart(task)
        except WMException:
            self.toTaskDirectory()
            raise
        except Exception as ex:  # py3-compatible 'as' syntax
            msg = "Encountered unhandled exception while starting monitors:\n"
            msg += str(ex) + '\n'
            msg += str(traceback.format_exc()) + '\n'
            logging.error(msg)
            self.toTaskDirectory()
            raise WMExecutionFailure(msg)

        skipToStep = None
        for step in task.steps().nodeIterator():
            try:
                helper = WMStepHelper(step)
                stepType = helper.stepType()
                stepName = helper.name()
                if skipToStep and skipToStep != stepName:
                    # Then we continue until we get to the required step
                    continue
                skipToStep = None  # Reset this when we get to the right step
                executor = StepFactory.getStepExecutor(stepType)
                result = self.doExecution(executor, step, wmbsJob)
                # identity comparison rather than 'not result == None'
                if result is not None:
                    skipToStep = result
            except WMException:
                self.toTaskDirectory()
                break
            except Exception as ex:
                msg = "Encountered error while running ExecuteMaster:\n"
                msg += str(ex) + "\n"
                msg += str(traceback.format_exc()) + "\n"
                self.toTaskDirectory()
                logging.error(msg)
                break
""" from __future__ import absolute_import import json import logging import os import subprocess import sys from Utils.FileTools import getFullPath from Utils.Utilities import zipEncodeStr from WMCore.FwkJobReport.Report import Report from WMCore.WMSpec.Steps.StepFactory import getStepEmulator from WMCore.WMSpec.WMStep import WMStepHelper getStepName = lambda step: WMStepHelper(step).name() getStepErrorDestination = lambda step: WMStepHelper(step).getErrorDestinationStep() def getStepSpace(stepName): """ _getStepSpace_ Util to get the runtime step space. This imports dynamic runtime libraries so be careful how you use it """ modName = "WMTaskSpace" if modName in sys.modules.keys(): taskspace = sys.modules[modName]
def execute(self, emulator=None):
    """
    _execute_

    Run the CMSSW step: bootstrap SCRAM, run pre-scripts, launch cmsRun
    through a wrapper shell script, then parse the framework job report
    and decorate it with workload metadata.

    :param emulator: optional emulator; when given, execution is delegated
        to ``emulator.emulate(self.step, self.job)``.
    :raises WMExecutionFailure: on SCRAM, pre-script, cmsRun or job-report
        parsing failures.
    """
    stepModule = "WMTaskSpace.%s" % self.stepName
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    # write the wrapper script to a temporary location
    # I don't pass it directly through os.system because I don't
    # trust that there won't be shell-escape shenanigans with
    # arbitrary input files
    scramSetup = self.step.application.setup.softwareEnvironment
    scramCommand = self.step.application.setup.scramCommand
    scramProject = self.step.application.setup.scramProject
    scramArch = self.step.application.setup.scramArch
    cmsswVersion = self.step.application.setup.cmsswVersion
    jobReportXML = self.step.output.jobReport
    cmsswCommand = self.step.application.command.executable
    cmsswConfig = self.step.application.command.configuration
    cmsswArguments = self.step.application.command.arguments
    userTarball = ','.join(self.step.user.inputSandboxes)
    userFiles = ','.join(self.step.user.userFiles)
    logging.info('User files are %s', userFiles)
    logging.info('User sandboxes are %s', userTarball)

    scramArch = getSingleScramArch(scramArch)

    multicoreSettings = self.step.application.multicore
    try:
        logging.info("CMSSW configured for %s cores and %s event streams",
                     multicoreSettings.numberOfCores,
                     multicoreSettings.eventStreams)
    except AttributeError:
        logging.info(
            "No value set for multicore numberOfCores or eventStreams")

    logging.info("Executing CMSSW step")

    #
    # set any global environment variables
    #
    try:
        os.environ['FRONTIER_ID'] = 'wmagent_%s' % (
            self.report.data.workload)
    except Exception as ex:
        logging.error('Have critical error in setting FRONTIER_ID: %s',
                      str(ex))
        logging.error(
            'Continuing, as this is not a critical function yet.')

    #
    # scram bootstrap
    #
    scram = Scram(
        command=scramCommand,
        version=cmsswVersion,
        initialise=self.step.application.setup.softwareEnvironment,
        directory=self.step.builder.workingDir,
        architecture=scramArch,
    )

    logging.info("Runing SCRAM")
    try:
        projectOutcome = scram.project()
    except Exception as ex:
        msg = "Exception raised while running scram.\n"
        msg += str(ex)
        logging.critical("Error running SCRAM")
        logging.critical(msg)
        raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    if projectOutcome > 0:
        msg = scram.diagnostic()
        logging.critical("Error running SCRAM")
        logging.critical(msg)
        raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
    runtimeOutcome = scram.runtime()
    if runtimeOutcome > 0:
        msg = scram.diagnostic()
        logging.critical("Error running SCRAM")
        logging.critical(msg)
        raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

    #
    # pre scripts
    #
    logging.info("Running PRE scripts")
    for script in self.step.runtime.preScripts:
        # TODO: Exception handling and error handling & logging
        scriptProcess = subprocess.Popen(
            ["/bin/bash"], shell=True, cwd=self.step.builder.workingDir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.PIPE,
        )
        # BADPYTHON
        scriptProcess.stdin.write(
            "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH\n")
        invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (
            sys.executable, stepModule, script)
        logging.info(" Invoking command: %s", invokeCommand)
        scriptProcess.stdin.write(invokeCommand)
        stdout, stderr = scriptProcess.communicate()
        retCode = scriptProcess.returncode
        if retCode > 0:
            msg = "Error running command\n%s\n" % invokeCommand
            msg += "%s\n %s\n %s\n" % (retCode, stdout, stderr)
            logging.critical("Error running command")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "PreScriptFailure", msg)

    #
    # pre scripts with scram
    #
    logging.info("RUNNING SCRAM SCRIPTS")
    for script in self.step.runtime.scramPreScripts:
        # invoke scripts with scram()
        runtimeDir = getattr(self.step.runtime, 'scramPreDir', None)
        invokeCommand = self.step.runtime.invokeCommand if hasattr(self.step.runtime, 'invokeCommand') else \
            "%s -m WMCore.WMRuntime.ScriptInvoke %s" % (sys.executable, stepModule)
        invokeCommand += " %s \n" % script
        retCode = scram(invokeCommand, runtimeDir=runtimeDir)
        if retCode > 0:
            msg = "Error running command\n%s\n" % invokeCommand
            msg += "%s\n " % retCode
            msg += scram.diagnostic()
            logging.critical(msg)
            raise WMExecutionFailure(50513, "PreScriptScramFailure", msg)

    configPath = "%s/%s-main.sh" % (self.step.builder.workingDir,
                                    self.stepName)
    handle = open(configPath, 'w')
    handle.write(CONFIG_BLOB)
    handle.close()

    # spawn this new process
    # the script looks for:
    # <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE> <CONFIG>
    # open the output files
    stdoutHandle = open(self.step.output.stdout, 'w')
    stderrHandle = open(self.step.output.stderr, 'w')
    args = ['/bin/bash', configPath, scramSetup, scramArch, scramCommand,
            scramProject, cmsswVersion, jobReportXML, cmsswCommand,
            cmsswConfig, userTarball, userFiles, cmsswArguments]
    logging.info("Executing CMSSW. args: %s", args)

    # possibly needed environment overrides for CMSSW call go here
    envOverride = {}
    # work around problem with GSI authentication plugin and EOS at CERN
    if socket.getfqdn().endswith("cern.ch"):
        envOverride['XRD_LOADBALANCERTTL'] = "86400"
    # some libraries linked with CMSSW need HOME in the environment
    if 'HOME' not in os.environ:
        envOverride['HOME'] = os.environ.get('PWD', "/")
    os.environ.update(envOverride)

    returncode = subprocess.call(args, stdout=stdoutHandle,
                                 stderr=stderrHandle)
    self.setCondorChirpAttrDelayed('Chirp_WMCore_cmsRun_ExitCode',
                                   returncode)
    self.setCondorChirpAttrDelayed(
        'Chirp_WMCore_%s_ExitCode' % self.stepName, returncode)

    stdoutHandle.close()
    stderrHandle.close()

    self.step.execution.exitStatus = returncode
    argsDump = {'arguments': args}

    if returncode != 0:
        msg = "Error running cmsRun\n%s\n" % argsDump
        msg += "Return code: %s\n" % returncode
        logging.critical(msg)
        raise WMExecutionFailure(returncode, "CmsRunFailure", msg)

    try:
        self.report.parse(jobReportXML, stepName=self.stepName)
    except Exception as ex:
        # Catch it if something goes wrong
        raise WMExecutionFailure(50115, "BadJobReportXML", str(ex))

    stepHelper = WMStepHelper(self.step)
    typeHelper = stepHelper.getTypeHelper()

    # Gather workload/task metadata to stamp onto the report
    acquisitionEra = self.task.getAcquisitionEra()
    processingVer = self.task.getProcessingVersion()
    processingStr = self.task.getProcessingString()
    validStatus = self.workload.getValidStatus()
    inputPath = self.task.getInputDatasetPath()
    globalTag = typeHelper.getGlobalTag()
    prepID = self.task.getPrepID()
    campaign = self.workload.getCampaign()
    cacheUrl, cacheDB, configID = stepHelper.getConfigInfo()

    self.report.setValidStatus(validStatus=validStatus)
    self.report.setGlobalTag(globalTag=globalTag)
    self.report.setCampaign(campaign)
    self.report.setPrepID(prepID)
    self.report.setInputDataset(inputPath=inputPath)
    self.report.setAcquisitionProcessing(acquisitionEra=acquisitionEra,
                                         processingVer=processingVer,
                                         processingStr=processingStr)
    self.report.setConfigURL(configURL="%s;;%s;;%s" % (cacheUrl,
                                                       cacheDB,
                                                       configID))

    # Attach info to files
    self.report.addInfoToOutputFilesForStep(stepName=self.stepName,
                                            step=self.step)

    self.report.checkForOutputFiles(stepName=self.stepName)
    self.report.checkForAdlerChecksum(stepName=self.stepName)
    self.report.checkForRunLumiInformation(stepName=self.stepName)

    if self.step.output.keep != True:
        self.report.killOutput()
    else:
        # Check that we only keep the desired output
        for module in stepHelper.getIgnoredOutputModules():
            self.report.deleteOutputModuleForStep(stepName=self.stepName,
                                                  moduleName=module)

    # Add stageout LFN to existing TFileService files
    reportAnalysisFiles = self.report.getAnalysisFilesFromStep(
        self.stepName)
    for reportAnalysisFile in reportAnalysisFiles:
        newLFN = analysisFileLFN(reportAnalysisFile.fileName,
                                 self.step.user.lfnBase, self.job)
        addAttributesToFile(reportAnalysisFile,
                            pfn=reportAnalysisFile.fileName,
                            lfn=newLFN, validate=False)

    # Add analysis file entries for additional files listed in workflow
    for fileName in stepHelper.listAnalysisFiles():
        analysisFile = stepHelper.getAnalysisFile(fileName)
        if os.path.isfile(analysisFile.fileName):
            newLFN = analysisFileLFN(analysisFile.fileName,
                                     analysisFile.lfnBase, self.job)
            self.report.addAnalysisFile(analysisFile.fileName,
                                        lfn=newLFN,
                                        Source='UserDefined',
                                        pfn=os.path.join(
                                            os.getcwd(),
                                            analysisFile.fileName),
                                        validate=False)

    return
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data
    regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        # ps template: filled twice with the step PID (-p filter + grep)
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww | grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None
        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []
        self.maxRSS = None
        self.maxVSize = None
        self.softTimeout = None
        self.hardTimeout = None
        self.logPath = None
        self.startTime = None
        self.watchStepTypes = []
        self.disableStep = False
        WMRuntimeMonitor.__init__(self)
        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.
        """
        args = args or {}
        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])
        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)
        self.numOfCores = args.get('cores', None)
        self.logPath = os.path.join(logPath)
        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()
        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        if not self.stepHelper.stepType() in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s",
                          self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False
        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        # NOTE(review): this guard looks inverted — it returns early for
        # *watched* steps and only clears state for ignored ones; confirm
        # the intended polarity of disableStep here.
        if not self.disableStep:
            # No information to correlate
            return
        self.currentStepName = None
        self.currentStepSpace = None
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.
        """
        killProc = False
        killHard = False
        reason = ''
        # Maps the kill reason to the exit code recorded in the report
        errorCodeLookup = {'RSS': 50660,
                           'VSZ': 50661,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)
        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return

        # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
        if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
            # then workload value is still in KiB (old way)
            rss = int(output[2])
            vsize = int(output[3])
        else:
            rss = int(output[2]) // 1024  # convert it to MiB
            vsize = int(output[3]) // 1024  # convert it to MiB
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                     output[2], output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        # Check the resource limits in priority order; first hit wins
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
            reason = 'RSS'
        elif self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True
            reason = 'VSZ'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                # Past the hard timeout: escalate to SIGTERM below
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    # Soft kill first: SIGUSR2 lets CMSSW shut down cleanly
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return
def __call__(self, task, wmbsJob):
    """
    _operator(task)_

    Load and run executors for all steps in Task, if an emulator is
    available for that step, use it instead.
    """
    # Bug fix: currentThread must be *called*; without the parentheses
    # myThread is the function object and every watchdogMonitor access
    # below would raise AttributeError.
    myThread = threading.currentThread()
    try:
        myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
        myThread.watchdogMonitor.notifyJobStart(task)
    except WMException:
        self.toTaskDirectory()
        raise
    except Exception as ex:
        msg = "Encountered unhandled exception while starting monitors:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()
        raise WMExecutionFailure(msg)

    failureUpstream = False
    for step in task.steps().nodeIterator():
        try:
            helper = WMStepHelper(step)
            stepType = helper.stepType()
            stepName = helper.name()
            if failureUpstream:
                # for chained steps, don't execute further steps if a
                # failure has already happened
                helper.addOverride("previousCmsRunFailure", True)

            executor = StepFactory.getStepExecutor(stepType)
            result = self.doExecution(executor, step, wmbsJob)
            logging.info("StepName: %s, StepType: %s, with result: %r",
                         stepName, stepType, result)
            if result:  # can be either None, or the step exit code
                failureUpstream = True
        except WMException as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            logging.error(msg)
            self.toTaskDirectory()
            break
        except Exception as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            msg += str(traceback.format_exc()) + "\n"
            self.toTaskDirectory()
            logging.error(msg)
            break

    try:
        myThread.watchdogMonitor.notifyJobEnd(task)
    except WMException:
        self.toTaskDirectory()
    except Exception as ex:
        msg = "Encountered unhandled exception while ending the job:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()

    return
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data
    regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        # ps template: filled twice with the step PID (-p filter + grep)
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww | grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None
        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []
        self.maxRSS = None
        self.maxVSize = None
        self.softTimeout = None
        self.hardTimeout = None
        self.logPath = None
        self.startTime = None
        self.watchStepTypes = []
        self.disableStep = False
        WMRuntimeMonitor.__init__(self)
        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.

        Bug fix: args used to default to a shared mutable dict ({});
        use None and fall back to a fresh empty dict per call.
        """
        args = args or {}
        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])
        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)
        self.logPath = os.path.join(logPath)
        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()
        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        if self.stepHelper.stepType() not in self.watchStepTypes:
            self.disableStep = True
            # lazy %-args: only rendered when DEBUG logging is enabled
            logging.debug("PerformanceMonitor ignoring step of type %s",
                          self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False
        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        # NOTE(review): this guard returns early for *watched* steps and only
        # clears state for ignored ones; confirm the intended polarity.
        if not self.disableStep:
            # No information to correlate
            return
        self.currentStepName = None
        self.currentStepSpace = None
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Samples ps for the step process; kills the step (SIGUSR2, or
        SIGTERM past the hard timeout) when RSS/VSize/wallclock limits
        are exceeded, recording a PerformanceError step in the report.
        """
        killProc = False
        killHard = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)
        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return
        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        # lazy %-args instead of eager string interpolation
        logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                     output[2], output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        # Let's check the running time
        currentTime = time.time()
        if self.hardTimeout is not None and self.softTimeout is not None:
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                msg += "Job has been running for more than: %s\n" % str(
                    self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime -
                                                              self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug(
                        "Found pre-existant error report in PerformanceMonitor termination."
                    )
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError",
                                exitCode=99900,
                                errorType="PerformanceKill",
                                errorDetails=msg)
                report.save(logPath)
            except Exception as ex:  # py3-compatible 'as' syntax
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return
def __call__(self, task, wmbsJob): """ _operator(task)_ Load and run executors for all steps in Task, if an emulator is available for that step, use it instead. """ myThread = threading.currentThread try: myThread.watchdogMonitor.setupMonitors(task, wmbsJob) myThread.watchdogMonitor.notifyJobStart(task) except WMException: self.toTaskDirectory() raise except Exception as ex: msg = "Encountered unhandled exception while starting monitors:\n" msg += str(ex) + '\n' msg += str(traceback.format_exc()) + '\n' logging.error(msg) self.toTaskDirectory() raise WMExecutionFailure(msg) skipToStep = None for step in task.steps().nodeIterator(): try: helper = WMStepHelper(step) stepType = helper.stepType() stepName = helper.name() if skipToStep and skipToStep != stepName: # Then we continue until we get to the required step continue skipToStep = None # Reset this when we get to the right step executor = StepFactory.getStepExecutor(stepType) result = self.doExecution(executor, step, wmbsJob) if result is not None: skipToStep = result except WMException as ex: msg = "Encountered error while running ExecuteMaster:\n" msg += str(ex) + "\n" logging.error(msg) self.toTaskDirectory() break except Exception as ex: msg = "Encountered error while running ExecuteMaster:\n" msg += str(ex) + "\n" msg += str(traceback.format_exc()) + "\n" self.toTaskDirectory() logging.error(msg) break try: myThread.watchdogMonitor.notifyJobEnd(task) except WMException: self.toTaskDirectory() except Exception as ex: msg = "Encountered unhandled exception while ending the job:\n" msg += str(ex) + '\n' msg += str(traceback.format_exc()) + '\n' logging.error(msg) self.toTaskDirectory() return
def stepEnd(self, step, stepReport): """ _stepEnd_ Fill with step-ending information """ helper = WMStepHelper(step) stepSuccess = stepReport.getStepExitCode(stepName=helper.name()) stepReport.setStepCounter(stepName=helper.name(), counter=self.stepCount) if self.jobSuccess == 0: self.jobSuccess = int(stepSuccess) if int(stepSuccess) != 0: self.failedStep = helper.name() data = {} data['MessageType'] = 'jobRuntime-stepEnd' data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime()) data['taskId'] = self.taskName data['jobId'] = self.jobName data['%d_ExeEnd' % self.stepCount] = helper.name() data['%d_ExeExitCode' % self.stepCount] = stepReport.getStepExitCode( stepName=helper.name()) data['%d_NCores' % self.stepCount] = helper.getNumberOfCores() if helper.name() == 'StageOut': data['%d_StageOutExitStatus' % self.stepCount] = int( stepReport.stepSuccessful(stepName=helper.name())) times = stepReport.getTimes(stepName=helper.name()) if times['stopTime'] is not None and times['startTime'] is not None: data['%d_ExeWCTime' % self.stepCount] = times['stopTime'] - times['startTime'] else: logging.error('Failed to retrieve start/end step time: %s', times) data['%d_ExeWCTime' % self.stepCount] = 0 self.WrapperWCTime += data['%d_ExeWCTime' % self.stepCount] step = stepReport.retrieveStep(step=helper.name()) try: data['%d_ExeCPUTime' % self.stepCount] = getattr( step.performance.cpu, 'TotalJobCPU', 0) except AttributeError: msg = "Failed to retrieve cpu performance for step %s. Defaulting to 0" % helper.name( ) logging.warn(msg) data['%d_ExeCPUTime' % self.stepCount] = 0 self.WrapperCPUTime += float(data['%d_ExeCPUTime' % self.stepCount]) self.lastStep = helper.name() self.publish(data=data) return data
def stepEnd(self, step, stepReport): """ _stepEnd_ Fill with step-ending information """ helper = WMStepHelper(step) stepSuccess = stepReport.getStepExitCode(stepName=helper.name()) stepReport.setStepCounter(stepName=helper.name(), counter=self.stepCount) if self.jobSuccess == 0: self.jobSuccess = int(stepSuccess) if int(stepSuccess) != 0: self.failedStep = helper.name() data = {} data['MessageType'] = 'jobRuntime-stepEnd' data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime()) data['taskId'] = self.taskName data['jobId'] = self.jobName data['%d_ExeEnd' % self.stepCount] = helper.name() data['%d_ExeExitCode' % self.stepCount] = stepReport.getStepExitCode( stepName=helper.name()) data['%d_NCores' % self.stepCount] = helper.getNumberOfCores() if helper.name() == 'StageOut': data['%d_StageOutExitStatus' % self.stepCount] = int( stepReport.stepSuccessful(stepName=helper.name())) times = stepReport.getTimes(stepName=helper.name()) if times['stopTime'] is not None and times['startTime'] is not None: data['%d_ExeWCTime' % self.stepCount] = times['stopTime'] - times['startTime'] else: logging.error('Failed to retrieve start/end step time: %s', times) data['%d_ExeWCTime' % self.stepCount] = 0 self.WrapperWCTime += data['%d_ExeWCTime' % self.stepCount] step = stepReport.retrieveStep(step=helper.name()) try: data['%d_ExeCPUTime' % self.stepCount] = getattr(step.performance.cpu, 'TotalJobCPU', 0) except AttributeError: msg = "Failed to retrieve cpu performance for step %s. Defaulting to 0" % helper.name() logging.warn(msg) data['%d_ExeCPUTime' % self.stepCount] = 0 self.WrapperCPUTime += float(data['%d_ExeCPUTime' % self.stepCount]) self.lastStep = helper.name() self.maxCores = max(self.maxCores, helper.getNumberOfCores()) self.publish(data=data) return data
def steps(self): """get WMStep structure""" if self.data.steps.topStepName == None: return None step = getattr(self.data.steps, self.data.steps.topStepName, None) return WMStepHelper(step)
raise WMExecutionFailure(spawnedChild.returncode, "CmsRunFailure", msg) try: self.report.parse(jobReportXML, stepName=self.stepName) except Exception, ex: # Catch it if something goes wrong raise WMExecutionFailure(50115, "BadJobReportXML", str(ex)) # # If multicore is enabled, merged the output files and reports # if multicoreEnabled: self.multicoreMerge(scram, applicationStart) stepHelper = WMStepHelper(self.step) typeHelper = stepHelper.getTypeHelper() acquisitionEra = self.task.getAcquisitionEra() processingVer = self.task.getProcessingVersion() processingStr = self.task.getProcessingString() validStatus = self.workload.getValidStatus() inputPath = self.task.getInputDatasetPath() globalTag = typeHelper.getGlobalTag() prepID = self.workload.getPrepID() cacheUrl, cacheDB, configID = stepHelper.getConfigInfo() self.report.setValidStatus(validStatus=validStatus) self.report.setGlobalTag(globalTag=globalTag) self.report.setPrepID(prepID) self.report.setInputDataset(inputPath=inputPath)
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data
    regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization
        in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        #self.grabCommand = "ps -u %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd |grep %s |grep -v grep"
        # ps sampling template; interpolated with the step PID twice below
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww |grep %i"
        # NOTE(review): monitorCommand is never assigned after this, yet it
        # is interpolated into the ps-failure message in periodicUpdate and
        # always logs None -- confirm intent
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        # Sample histories (not appended to in this class's periodicUpdate)
        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []

        # Kill thresholds; None disables the corresponding check
        self.maxRSS = None
        self.maxVSize = None
        self.logPath = None

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args = {}):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.
        """
        # NOTE(review): mutable default argument for `args` -- safe only as
        # long as nothing mutates it; confirm callers
        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)

        self.logPath = os.path.join(logPath)

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        if not self.stepHelper.stepType() in self.watchStepTypes:
            # Step type not in the watch list: disable monitoring for it
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s" % self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        # NOTE(review): this condition looks inverted -- it returns early for
        # steps that WERE monitored and only clears state for ignored ones;
        # confirm against the intended between-steps behavior
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Samples RSS/VSize of the current step's process via ps, compares
        them to maxRSS/maxVSize, and on breach writes a PerformanceError
        into the job report and kills the step process.
        """
        killProc = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName == None:
            # We're between steps
            return

        if self.currentStepSpace == None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID == None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)
        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            # NOTE(review): logs self.monitorCommand (always None here);
            # `cmd` is what was actually run -- confirm intent
            msg += "command = %s\n" % self.monitorCommand
            logging.error(msg)
            return

        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s" % (output[2], output[3], output[4], output[5]))

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS != None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize != None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in DashboardMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step = "PerformanceError"):
                    report.addStep(reportname = "PerformanceError")

                report.addError(stepName = "PerformanceError", exitCode = 99900,
                                errorType = "PerformanceKill", errorDetails = msg)
                report.save(logPath)
            except Exception, ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                # Soft kill first; fall back to SIGTERM if that raises
                logging.error("Attempting to kill job using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            except Exception:
                os.kill(stepPID, signal.SIGTERM)

        return
"CmsRunFailure", msg) try: self.report.parse(jobReportXML, stepName = self.stepName) except Exception, ex: # Catch it if something goes wrong raise WMExecutionFailure(50115, "BadJobReportXML", str(ex)) # # If multicore is enabled, merged the output files and reports # if multicoreEnabled: self.multicoreMerge(scram,applicationStart) stepHelper = WMStepHelper(self.step) typeHelper = stepHelper.getTypeHelper() acquisitionEra = self.task.getAcquisitionEra() processingVer = self.task.getProcessingVersion() processingStr = self.task.getProcessingString() validStatus = self.workload.getValidStatus() inputPath = self.task.getInputDatasetPath() globalTag = typeHelper.getGlobalTag() prepID = self.workload.getPrepID() cacheUrl, cacheDB, configID = stepHelper.getConfigInfo() self.report.setValidStatus(validStatus = validStatus) self.report.setGlobalTag(globalTag = globalTag) self.report.setPrepID(prepID) self.report.setInputDataset(inputPath = inputPath)
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data
    regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization
        in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        # ps sampling template (RSS, %CPU, %MEM); filled with the step PID twice
        self.monitorBase = "ps -p %i -o pid,ppid,rss,pcpu,pmem,cmd -ww | grep %i"
        # Sums the Pss fields (kB) of /proc/<pid>/smaps for the step PID
        self.pssMemoryCommand = "awk '/^Pss/ {pss += $2} END {print pss}' /proc/%i/smaps"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        self.rss = []
        self.pcpu = []
        self.pmem = []

        # Limits / timeouts; None disables the corresponding check
        self.maxPSS = None
        self.softTimeout = None
        self.hardTimeout = None
        self.logPath = None
        self.startTime = None
        self.killRetry = False  # will trigger a hard (SIGTERM) instead of soft kill

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.
        """
        args = args or {}
        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        # Fall back to the legacy maxRSS setting when maxPSS is absent
        self.maxPSS = args.get('maxPSS', args.get('maxRSS'))
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)
        self.numOfCores = args.get('cores', None)

        self.logPath = os.path.join(logPath)

        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        if not self.stepHelper.stepType() in self.watchStepTypes:
            # Step type not in the watch list: disable monitoring for it
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s", self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PeformanceMonitor step Initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        # NOTE(review): this condition looks inverted -- it returns early for
        # steps that WERE monitored and only clears state for ignored ones;
        # confirm against the intended between-steps behavior
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Samples PSS (from /proc/<pid>/smaps) plus RSS/%CPU/%MEM (from ps)
        for the current step, compares PSS to maxPSS and elapsed time to
        the soft/hard timeouts, writes a PerformanceError into the job
        report once, and kills the step (SIGUSR2 first, SIGTERM on retry
        or hard timeout).
        """
        killProc = False
        killHard = False
        reason = ''
        # Maps the kill reason to the exit code written into the report
        errorCodeLookup = {'PSS': 50660,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring
            # on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the ps monitor command and collate the data
        # Gathers RSS, %CPU and %MEM statistics from ps
        ps_cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd)
        ps_output = stdout.split()

        if not len(ps_output) > 6:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % ps_output
            msg += "command = %s\n" % ps_cmd
            logging.error(msg)
            return

        # run the command to gather PSS memory statistics from /proc/<pid>/smaps
        smaps_cmd = self.pssMemoryCommand % (stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd)
        smaps_output = stdout.split()

        if not len(smaps_output) == 1:
            # Then something went wrong in getting the smaps data
            msg = "Error when grabbing output from smaps\n"
            msg += "output = %s\n" % smaps_output
            msg += "command = %s\n" % smaps_cmd
            logging.error(msg)
            return

        # smaps also returns data in kiloBytes, let's make it megaBytes
        # I'm also confused with these megabytes and mebibytes...
        pss = int(smaps_output[0]) // 1000

        logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s", smaps_output[0], ps_output[2], ps_output[3], ps_output[4])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxPSS is not None and pss >= self.maxPSS:
            msg += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
            msg += "Job has PSS: %s MB\n" % pss
            killProc = True
            reason = 'PSS'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if not killProc:
            # then job is behaving well, there is nothing to do
            return

        # make sure we persist the performance error only once
        if not self.killRetry:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find existant job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existant error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")

                report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += "Error: %s" % str(ex)
                logging.exception(msg2)

        try:
            if not killHard and not self.killRetry:
                # First attempt and no hard timeout: try the soft kill signal
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
        finally:
            # Any subsequent periodicUpdate escalates straight to SIGTERM
            self.killRetry = True

        return
    def execute(self, emulator=None):
        """
        _execute_

        Run the CMSSW step: bootstrap scram, run pre-scripts, spawn the
        wrapper shell script that launches cmsRun, then parse the produced
        job report XML and decorate it with workload/task metadata.
        """
        stepModule = "WMTaskSpace.%s" % self.stepName
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        # write the wrapper script to a temporary location
        # I don't pass it directly through os.system because I don't
        # trust that there won't be shell-escape shenanigans with
        # arbitrary input files
        scramSetup = self.step.application.setup.softwareEnvironment
        scramCommand = self.step.application.setup.scramCommand
        scramProject = self.step.application.setup.scramProject
        scramArch = self.step.application.setup.scramArch
        cmsswVersion = self.step.application.setup.cmsswVersion
        jobReportXML = self.step.output.jobReport
        cmsswCommand = self.step.application.command.executable
        cmsswConfig = self.step.application.command.configuration
        cmsswArguments = self.step.application.command.arguments
        userTarball = ",".join(self.step.user.inputSandboxes)
        userFiles = ",".join(self.step.user.userFiles)
        logging.info("User files are %s", userFiles)
        logging.info("User sandboxes are %s", userTarball)

        multicoreSettings = self.step.application.multicore
        try:
            logging.info("CMSSW configured for %s cores", multicoreSettings.numberOfCores)
        except AttributeError:
            # numberOfCores not set; informational only
            logging.info("No value set for multicore.numberOfCores")

        logging.info("Executing CMSSW step")

        #
        # set any global environment variables
        #
        try:
            os.environ["FRONTIER_ID"] = "wmagent_%s" % (self.report.data.workload)
        except Exception as ex:
            # Deliberately best-effort: job proceeds without FRONTIER_ID
            logging.error("Have critical error in setting FRONTIER_ID: %s", str(ex))
            logging.error("Continuing, as this is not a critical function yet.")

        #
        # scram bootstrap
        #
        scram = Scram(
            command=scramCommand,
            version=cmsswVersion,
            initialise=self.step.application.setup.softwareEnvironment,
            directory=self.step.builder.workingDir,
            architecture=scramArch,
        )

        logging.info("Runing SCRAM")
        try:
            projectOutcome = scram.project()
        except Exception as ex:
            msg = "Exception raised while running scram.\n"
            msg += str(ex)
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        if projectOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)
        runtimeOutcome = scram.runtime()
        if runtimeOutcome > 0:
            msg = scram.diagnostic()
            logging.critical("Error running SCRAM")
            logging.critical(msg)
            raise WMExecutionFailure(50513, "ScramSetupFailure", msg)

        #
        # pre scripts
        #
        logging.info("Running PRE scripts")
        for script in self.step.runtime.preScripts:
            # TODO: Exception handling and error handling & logging
            scriptProcess = subprocess.Popen(
                ["/bin/bash"],
                shell=True,
                cwd=self.step.builder.workingDir,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
            )
            # BADPYTHON
            scriptProcess.stdin.write("export LD_LIBRARY_PATH=$LD_LIBRARY_PATH\n")
            invokeCommand = "%s -m WMCore.WMRuntime.ScriptInvoke %s %s \n" % (sys.executable, stepModule, script)
            logging.info(" Invoking command: %s", invokeCommand)
            scriptProcess.stdin.write(invokeCommand)
            stdout, stderr = scriptProcess.communicate()
            retCode = scriptProcess.returncode
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n %s\n %s\n" % (retCode, stdout, stderr)
                logging.critical("Error running command")
                logging.critical(msg)
                raise WMExecutionFailure(50513, "PreScriptFailure", msg)

        #
        # pre scripts with scram
        #
        logging.info("RUNNING SCRAM SCRIPTS")
        for script in self.step.runtime.scramPreScripts:
            # invoke scripts with scram()
            runtimeDir = getattr(self.step.runtime, "scramPreDir", None)
            invokeCommand = (
                self.step.runtime.invokeCommand
                if hasattr(self.step.runtime, "invokeCommand")
                else "%s -m WMCore.WMRuntime.ScriptInvoke %s" % (sys.executable, stepModule)
            )
            invokeCommand += " %s \n" % script
            retCode = scram(invokeCommand, runtimeDir=runtimeDir)
            if retCode > 0:
                msg = "Error running command\n%s\n" % invokeCommand
                msg += "%s\n " % retCode
                msg += scram.diagnostic()
                logging.critical(msg)
                raise WMExecutionFailure(50513, "PreScriptScramFailure", msg)

        # Write the wrapper script that the subprocess below executes
        configPath = "%s/%s-main.sh" % (self.step.builder.workingDir, self.stepName)
        handle = open(configPath, "w")
        handle.write(CONFIG_BLOB)
        handle.close()

        # spawn this new process
        # the script looks for:
        # <SCRAM_COMMAND> <SCRAM_PROJECT> <CMSSW_VERSION> <JOB_REPORT> <EXECUTABLE> <CONFIG>
        # open the output files
        stdoutHandle = open(self.step.output.stdout, "w")
        stderrHandle = open(self.step.output.stderr, "w")
        args = [
            "/bin/bash",
            configPath,
            scramSetup,
            scramArch,
            scramCommand,
            scramProject,
            cmsswVersion,
            jobReportXML,
            cmsswCommand,
            cmsswConfig,
            userTarball,
            userFiles,
            cmsswArguments,
        ]
        logging.info("Executing CMSSW. args: %s", args)

        # at CERN override the environment to work around problem with GSI authentication plugin and EOS
        if socket.getfqdn().endswith("cern.ch"):
            returncode = subprocess.call(
                args, stdout=stdoutHandle, stderr=stderrHandle, env=dict(os.environ, XRD_LOADBALANCERTTL="86400")
            )
        else:
            returncode = subprocess.call(args, stdout=stdoutHandle, stderr=stderrHandle)

        self.setCondorChirpAttrDelayed("Chirp_WMCore_cmsRun_ExitCode", returncode)
        self.setCondorChirpAttrDelayed("Chirp_WMCore_%s_ExitCode" % self.stepName, returncode)

        stdoutHandle.close()
        stderrHandle.close()

        self.step.execution.exitStatus = returncode
        argsDump = {"arguments": args}

        if returncode != 0:
            msg = "Error running cmsRun\n%s\n" % argsDump
            msg += "Return code: %s\n" % returncode
            logging.critical(msg)
            raise WMExecutionFailure(returncode, "CmsRunFailure", msg)

        try:
            self.report.parse(jobReportXML, stepName=self.stepName)
        except Exception as ex:
            # Catch it if something goes wrong
            raise WMExecutionFailure(50115, "BadJobReportXML", str(ex))

        stepHelper = WMStepHelper(self.step)
        typeHelper = stepHelper.getTypeHelper()

        # Collect workload/task metadata to stamp onto the report
        acquisitionEra = self.task.getAcquisitionEra()
        processingVer = self.task.getProcessingVersion()
        processingStr = self.task.getProcessingString()
        validStatus = self.workload.getValidStatus()
        inputPath = self.task.getInputDatasetPath()
        globalTag = typeHelper.getGlobalTag()
        prepID = self.task.getPrepID()
        cacheUrl, cacheDB, configID = stepHelper.getConfigInfo()

        self.report.setValidStatus(validStatus=validStatus)
        self.report.setGlobalTag(globalTag=globalTag)
        self.report.setPrepID(prepID)
        self.report.setInputDataset(inputPath=inputPath)
        self.report.setAcquisitionProcessing(
            acquisitionEra=acquisitionEra, processingVer=processingVer, processingStr=processingStr
        )
        self.report.setConfigURL(configURL="%s;;%s;;%s" % (cacheUrl, cacheDB, configID))

        # Attach info to files
        self.report.addInfoToOutputFilesForStep(stepName=self.stepName, step=self.step)

        self.report.checkForOutputFiles(stepName=self.stepName)
        self.report.checkForAdlerChecksum(stepName=self.stepName)
        self.report.checkForRunLumiInformation(stepName=self.stepName)

        if self.step.output.keep != True:
            self.report.killOutput()
        else:
            # Check that we only keep the desired output
            for module in stepHelper.getIgnoredOutputModules():
                self.report.deleteOutputModuleForStep(stepName=self.stepName, moduleName=module)

        # Add stageout LFN to existing TFileService files
        reportAnalysisFiles = self.report.getAnalysisFilesFromStep(self.stepName)
        for reportAnalysisFile in reportAnalysisFiles:
            newLFN = analysisFileLFN(reportAnalysisFile.fileName, self.step.user.lfnBase, self.job)
            addAttributesToFile(reportAnalysisFile, pfn=reportAnalysisFile.fileName, lfn=newLFN, validate=False)

        # Add analysis file entries for additional files listed in workflow
        for fileName in stepHelper.listAnalysisFiles():
            analysisFile = stepHelper.getAnalysisFile(fileName)
            if os.path.isfile(analysisFile.fileName):
                newLFN = analysisFileLFN(analysisFile.fileName, analysisFile.lfnBase, self.job)
                self.report.addAnalysisFile(
                    analysisFile.fileName,
                    lfn=newLFN,
                    Source="UserDefined",
                    pfn=os.path.join(os.getcwd(), analysisFile.fileName),
                    validate=False,
                )

        return
def stepEnd(self, step, stepReport): """ _stepEnd_ Fill with step-ending information """ helper = WMStepHelper(step) stepSuccess = stepReport.getStepExitCode(stepName=helper.name()) stepReport.setStepCounter(stepName=helper.name(), counter=self.stepCount) if self.jobSuccess == 0: self.jobSuccess = int(stepSuccess) if int(stepSuccess) != 0: self.failedStep = helper.name() data = {} data['MessageType'] = 'jobRuntime-stepEnd' data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime()) data['taskId'] = self.taskName data['jobId'] = self.jobName data['%d_ExeEnd' % self.stepCount] = helper.name() data['%d_ExeExitCode' % self.stepCount] = stepReport.getStepExitCode( stepName=helper.name()) if helper.name() == 'StageOut': data['%d_StageOutExitStatus' % self.stepCount] = int( stepReport.stepSuccessful(stepName=helper.name())) times = stepReport.getTimes(stepName=helper.name()) if times['stopTime'] != None and times['startTime'] != None: data['%d_ExeWCTime' % self.stepCount] = \ times['stopTime'] - times['startTime'] data['%d_NCores' % self.stepCount] = helper.getNumberOfCores() self.maxCores = max(self.maxCores, helper.getNumberOfCores()) step = stepReport.retrieveStep(step=helper.name()) if hasattr(step, 'performance'): if hasattr(step.performance, 'cpu'): data['%d_ExeCPUTime' % self.stepCount] = \ getattr(step.performance.cpu, 'TotalJobCPU', 0) self.WrapperCPUTime += float(data['%d_ExeCPUTime' % self.stepCount]) self.WrapperWCTime += data['%d_ExeWCTime' % self.stepCount] self.lastStep = helper.name() self.publish(data=data) return data