def stepEnd(self, step, stepReport):
    """
    _stepEnd_

    Fill with step-ending information
    """
    helper = WMStepHelper(step)
    stepSuccess = stepReport.getStepExitCode(stepName=helper.name())
    if self.jobSuccess == 0:
        self.jobSuccess = int(stepSuccess)

    data = {}
    data['MessageType'] = 'jobRuntime'
    data['MessageTS'] = time.time()
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['retryCount'] = self.job['retry_count']
    data['ExeEnd'] = helper.name()
    data['ExeExitCode'] = stepReport.getStepExitCode(stepName=helper.name())
    if helper.name() == 'StageOut':
        data['StageOutExitStatus'] = stepReport.stepSuccessful(stepName=helper.name())

    self.publish(data=data)
    return

def stepEnd(self, step, stepReport):
    """
    _stepEnd_

    Fill with step-ending information
    """
    helper = WMStepHelper(step)
    stepSuccess = stepReport.getStepExitCode(stepName=helper.name())
    stepReport.setStepCounter(stepName=helper.name(), counter=self.stepCount)
    if self.jobSuccess == 0:
        self.jobSuccess = int(stepSuccess)
    if int(stepSuccess) != 0:
        self.failedStep = helper.name()

    data = {}
    data['MessageType'] = 'jobRuntime-stepEnd'
    data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['%d_ExeEnd' % self.stepCount] = helper.name()
    data['%d_ExeExitCode' % self.stepCount] = stepReport.getStepExitCode(stepName=helper.name())
    data['%d_NCores' % self.stepCount] = helper.getNumberOfCores()
    if helper.name() == 'StageOut':
        data['%d_StageOutExitStatus' % self.stepCount] = int(stepReport.stepSuccessful(stepName=helper.name()))

    times = stepReport.getTimes(stepName=helper.name())
    if times['stopTime'] is not None and times['startTime'] is not None:
        data['%d_ExeWCTime' % self.stepCount] = times['stopTime'] - times['startTime']
    else:
        logging.error('Failed to retrieve start/end step time: %s', times)
        data['%d_ExeWCTime' % self.stepCount] = 0
    self.WrapperWCTime += data['%d_ExeWCTime' % self.stepCount]

    step = stepReport.retrieveStep(step=helper.name())
    try:
        data['%d_ExeCPUTime' % self.stepCount] = getattr(step.performance.cpu, 'TotalJobCPU', 0)
    except AttributeError:
        msg = "Failed to retrieve cpu performance for step %s. Defaulting to 0" % helper.name()
        logging.warn(msg)
        data['%d_ExeCPUTime' % self.stepCount] = 0
    self.WrapperCPUTime += float(data['%d_ExeCPUTime' % self.stepCount])

    self.lastStep = helper.name()
    self.maxCores = max(self.maxCores, helper.getNumberOfCores())

    self.publish(data=data)
    return data

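# Minimal sketch (not WMCore code) of the per-step key convention used by the
# stepEnd message above: every metric is prefixed with the step counter, so a
# single flat dict can carry several steps' worth of data. Step name, exit
# code, and timing values below are hypothetical.
import time

def buildStepEndMessage(stepCount, stepName, exitCode, wcTime,
                        tsFormat="%Y-%m-%d %H:%M:%S"):
    """Build a dashboard-style stepEnd message keyed by the step counter."""
    return {
        'MessageType': 'jobRuntime-stepEnd',
        'MessageTS': time.strftime(tsFormat, time.gmtime()),
        '%d_ExeEnd' % stepCount: stepName,
        '%d_ExeExitCode' % stepCount: exitCode,
        '%d_ExeWCTime' % stepCount: wcTime,
    }

print(buildStepEndMessage(1, 'cmsRun1', 0, 123.4))
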
def stepEnd(self, step, stepReport):
    """
    _stepEnd_

    Fill with step-ending information
    """
    helper = WMStepHelper(step)
    stepSuccess = stepReport.getStepExitCode(stepName=helper.name())
    stepReport.setStepCounter(stepName=helper.name(), counter=self.stepCount)
    if self.jobSuccess == 0:
        self.jobSuccess = int(stepSuccess)
    if int(stepSuccess) != 0:
        self.failedStep = helper.name()

    data = {}
    data['MessageType'] = 'jobRuntime-stepEnd'
    data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['%d_ExeEnd' % self.stepCount] = helper.name()
    data['%d_ExeExitCode' % self.stepCount] = stepReport.getStepExitCode(stepName=helper.name())
    if helper.name() == 'StageOut':
        data['%d_StageOutExitStatus' % self.stepCount] = int(stepReport.stepSuccessful(stepName=helper.name()))

    times = stepReport.getTimes(stepName=helper.name())
    if times['stopTime'] is not None and times['startTime'] is not None:
        data['%d_ExeWCTime' % self.stepCount] = times['stopTime'] - times['startTime']

    data['%d_NCores' % self.stepCount] = helper.getNumberOfCores()
    self.maxCores = max(self.maxCores, helper.getNumberOfCores())

    step = stepReport.retrieveStep(step=helper.name())
    if hasattr(step, 'performance'):
        if hasattr(step.performance, 'cpu'):
            data['%d_ExeCPUTime' % self.stepCount] = getattr(step.performance.cpu, 'TotalJobCPU', 0)
            self.WrapperCPUTime += float(data['%d_ExeCPUTime' % self.stepCount])

    # Guard against a missing ExeWCTime entry when step times were unavailable
    self.WrapperWCTime += data.get('%d_ExeWCTime' % self.stepCount, 0)
    self.lastStep = helper.name()

    self.publish(data=data)
    return data

def stepStart(self, step):
    """
    _stepStart_

    Fill with the step-based information. If it is the first step, report
    that the job started its execution.
    """
    helper = WMStepHelper(step)
    self.stepCount += 1
    data = None
    if not self.jobStarted:
        # It's the first step so let's send the exe that started and where
        # That's what they request
        data = {}
        data['MessageType'] = 'jobRuntime-jobStart'
        data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
        data['taskId'] = self.taskName
        data['jobId'] = self.jobName
        data['GridName'] = getUserProxyDN()
        data['ExeStart'] = helper.name()
        data['SyncCE'] = getSyncCE()
        data['WNHostName'] = socket.gethostname()
        self.publish(data=data)
        self.jobStarted = True

    # Now let's send the step information
    tmp = {'jobStart': data}
    data = {}
    data['MessageType'] = 'jobRuntime-stepStart'
    data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['%d_stepName' % self.stepCount] = helper.name()
    data['%d_ExeStart' % self.stepCount] = helper.name()
    self.publish(data=data)
    data.update(tmp)
    return data

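# Minimal sketch (hypothetical classes, not the WMCore API) of the behavior
# above: only the first stepStart call publishes a jobStart record, which is
# then nested under 'jobStart' in the returned dict; subsequent calls return
# 'jobStart': None.
class FakeStepMonitor:
    def __init__(self):
        self.jobStarted = False
        self.stepCount = 0
        self.published = []

    def publish(self, data):
        self.published.append(data)

    def stepStart(self, stepName):
        self.stepCount += 1
        data = None
        if not self.jobStarted:
            data = {'MessageType': 'jobRuntime-jobStart', 'ExeStart': stepName}
            self.publish(data)
            self.jobStarted = True
        tmp = {'jobStart': data}
        data = {'MessageType': 'jobRuntime-stepStart',
                '%d_ExeStart' % self.stepCount: stepName}
        self.publish(data)
        data.update(tmp)
        return data

mon = FakeStepMonitor()
print(mon.stepStart('cmsRun1'))   # carries a non-None 'jobStart' payload
print(mon.stepStart('StageOut'))  # 'jobStart' is None on later steps
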
def stepKilled(self, step):
    """
    _stepKilled_

    Fill with step-ending information assuming utter failure
    """
    helper = WMStepHelper(step)

    data = {}
    data['MessageType'] = 'jobRuntime-stepKilled'
    data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['%d_ExeEnd' % self.stepCount] = helper.name()

    self.lastStep = helper.name()
    self.publish(data=data)
    return data

def stepKilled(self, step):
    """
    _stepKilled_

    Fill with step-ending information assuming utter failure
    """
    helper = WMStepHelper(step)

    data = {}
    data['MessageType'] = 'jobRuntime-stepKilled'
    data['MessageTS'] = time.strftime(self.tsFormat, time.gmtime())
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['ExeEnd'] = helper.name()

    self.lastStep = helper.name()
    self.publish(data=data)
    return data

def emulateAll(self):
    """
    _emulateAll_

    Traverse all Steps and load up the default Emulator based on type.
    """
    for step in self.task.steps().nodeIterator():
        helper = WMStepHelper(step)
        stepType = helper.stepType()
        stepName = helper.name()
        self.addEmulator(stepName, stepType)

def setStep(self, wmStep):
    """set topStep to be the step instance provided"""
    stepData = wmStep
    if isinstance(wmStep, WMStepHelper):
        stepData = wmStep.data
        stepHelper = wmStep
    else:
        stepHelper = WMStepHelper(wmStep)

    stepName = stepHelper.name()
    stepHelper.setTopOfTree()
    setattr(self.data.steps, stepName, stepData)
    setattr(self.data.steps, "topStepName", stepName)
    return

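# Hypothetical sketch (not the WMCore API) of the dual-input convention that
# setStep follows: callers may pass either the bare step data or its helper
# wrapper, and the method recovers both views before proceeding.
class FakeStepData(object):
    name = 'cmsRun1'

class FakeStepHelper(object):
    def __init__(self, data):
        self.data = data

def unwrapStep(wmStep):
    """Return (helper, data) regardless of which form the caller passed."""
    if isinstance(wmStep, FakeStepHelper):
        return wmStep, wmStep.data
    return FakeStepHelper(wmStep), wmStep

helper, data = unwrapStep(FakeStepData())
print(type(helper).__name__, type(data).__name__)
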
def stepStart(self, step):
    """
    _stepStart_

    Fill with the step-based information
    """
    helper = WMStepHelper(step)

    data = {}
    data['MessageType'] = 'jobRuntime'
    data['MessageTS'] = time.time()
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['retryCount'] = self.job['retry_count']
    data['ExeStart'] = helper.name()

    self.publish(data=data)
    return data

def stepStart(self, step):
    """
    _stepStart_

    Assure that the monitor is pointing at the right step
    """
    stepHelper = WMStepHelper(step)
    self.currentStepName = getStepName(step)
    self.currentStepSpace = getStepSpace(stepHelper.name())

    if stepHelper.stepType() not in self.watchStepTypes:
        self.disableStep = True
        logging.debug("PerformanceMonitor ignoring step of type %s" % stepHelper.stepType())
        return
    else:
        logging.debug("Beginning PerformanceMonitor step initialization")
        self.disableStep = False

    return

def stepKilled(self, step):
    """
    _stepKilled_

    Fill with step-ending information assuming utter failure
    """
    helper = WMStepHelper(step)

    data = {}
    data['MessageType'] = 'jobRuntime'
    data['MessageTS'] = time.time()
    data['taskId'] = self.taskName
    data['jobId'] = self.jobName
    data['retryCount'] = self.job['retry_count']
    data['ExeEnd'] = helper.name()

    self.publish(data=data)
    return

class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data
    regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww | grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []

        self.maxRSS = None
        self.maxVSize = None
        self.softTimeout = None
        self.hardTimeout = None

        self.logPath = None
        self.startTime = None

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.
        """
        args = args or {}  # avoid a mutable default argument

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)

        self.logPath = os.path.join(logPath)

        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None

        if self.stepHelper.stepType() not in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s" % self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PerformanceMonitor step initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.
        """
        killProc = False
        killHard = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % cmd
            logging.error(msg)
            return

        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s"
                     % (output[2], output[3], output[4], output[5]))

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        # Let's check the running time
        currentTime = time.time()
        if self.hardTimeout is not None and self.softTimeout is not None:
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find a pre-existing job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existing error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=99900,
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return

def __call__(self, task, wmbsJob):
    """
    _operator(task)_

    Load and run executors for all steps in Task, if an emulator is
    available for that step, use it instead.
    """
    myThread = threading.currentThread()

    try:
        myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
        myThread.watchdogMonitor.notifyJobStart(task)
    except WMException:
        self.toTaskDirectory()
        raise
    except Exception as ex:
        msg = "Encountered unhandled exception while starting monitors:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()
        raise WMExecutionFailure(msg)

    skipToStep = None

    for step in task.steps().nodeIterator():
        try:
            helper = WMStepHelper(step)
            stepType = helper.stepType()
            stepName = helper.name()
            if skipToStep and skipToStep != stepName:
                # Then we continue until we get to the required step
                continue
            skipToStep = None  # Reset this when we get to the right step
            executor = StepFactory.getStepExecutor(stepType)
            result = self.doExecution(executor, step, wmbsJob)
            if result is not None:
                skipToStep = result
        except WMException as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            logging.error(msg)
            self.toTaskDirectory()
            break
        except Exception as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            msg += str(traceback.format_exc()) + "\n"
            self.toTaskDirectory()
            logging.error(msg)
            break

    try:
        myThread.watchdogMonitor.notifyJobEnd(task)
    except WMException:
        self.toTaskDirectory()
    except Exception as ex:
        msg = "Encountered unhandled exception while ending the job:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()

    return

def __call__(self, task, wmbsJob):
    """
    _operator(task)_

    Load and run executors for all steps in Task, if an emulator is
    available for that step, use it instead.
    """
    myThread = threading.currentThread()

    try:
        myThread.watchdogMonitor.setupMonitors(task, wmbsJob)
        myThread.watchdogMonitor.notifyJobStart(task)
    except WMException:
        self.toTaskDirectory()
        raise
    except Exception as ex:
        msg = "Encountered unhandled exception while starting monitors:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()
        raise WMExecutionFailure(msg)

    failureUpstream = False
    for step in task.steps().nodeIterator():
        try:
            helper = WMStepHelper(step)
            stepType = helper.stepType()
            stepName = helper.name()
            if failureUpstream:
                # for chained steps, don't execute further steps if a
                # failure has already happened
                helper.addOverride("previousCmsRunFailure", True)

            executor = StepFactory.getStepExecutor(stepType)
            result = self.doExecution(executor, step, wmbsJob)
            logging.info("StepName: %s, StepType: %s, with result: %r", stepName, stepType, result)
            if result:  # can be either None, or the step exit code
                failureUpstream = True
        except WMException as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            logging.error(msg)
            self.toTaskDirectory()
            break
        except Exception as ex:
            msg = "Encountered error while running ExecuteMaster:\n"
            msg += str(ex) + "\n"
            msg += str(traceback.format_exc()) + "\n"
            self.toTaskDirectory()
            logging.error(msg)
            break

    try:
        myThread.watchdogMonitor.notifyJobEnd(task)
    except WMException:
        self.toTaskDirectory()
    except Exception as ex:
        msg = "Encountered unhandled exception while ending the job:\n"
        msg += str(ex) + '\n'
        msg += str(traceback.format_exc()) + '\n'
        logging.error(msg)
        self.toTaskDirectory()

    return

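# Minimal sketch (not WMCore code) of the failure-chaining policy above:
# after the first failing step, later steps still run their executors but
# carry a "previousCmsRunFailure" override so they can bail out internally.
# Step names and exit codes are hypothetical.
def collectOverrides(steps):
    """steps is a list of (stepName, exitCode); None or 0 means success."""
    failureUpstream = False
    overrides = {}
    for name, exitCode in steps:
        if failureUpstream:
            overrides[name] = {'previousCmsRunFailure': True}
        if exitCode:
            failureUpstream = True
    return overrides

print(collectOverrides([('cmsRun1', 0), ('cmsRun2', 8001), ('stageOut1', 0)]))
# -> {'stageOut1': {'previousCmsRunFailure': True}}
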
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data
    regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        self.monitorBase = "ps -p %i -o pid,ppid,rss,pcpu,pmem,cmd -ww | grep %i"
        self.pssMemoryCommand = "awk '/^Pss/ {pss += $2} END {print pss}' /proc/%i/smaps"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        self.rss = []
        self.pcpu = []
        self.pmem = []

        self.maxPSS = None
        self.softTimeout = None
        self.hardTimeout = None

        self.logPath = None
        self.startTime = None
        self.killRetry = False  # will trigger a hard (SIGTERM) instead of soft kill

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.
        """
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxPSS = args.get('maxPSS', args.get('maxRSS'))
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)

        self.numOfCores = args.get('cores', None)

        self.logPath = os.path.join(logPath)

        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None

        if self.stepHelper.stepType() not in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s", self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PerformanceMonitor step initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.
        """
        killProc = False
        killHard = False
        reason = ''
        errorCodeLookup = {'PSS': 50660,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the ps monitor command and collate the data
        # Gathers RSS, %CPU and %MEM statistics from ps
        ps_cmd = self.monitorBase % (stepPID, stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(ps_cmd)

        ps_output = stdout.split()
        if not len(ps_output) > 6:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % ps_output
            msg += "command = %s\n" % ps_cmd
            logging.error(msg)
            return

        # run the command to gather PSS memory statistics from /proc/<pid>/smaps
        smaps_cmd = self.pssMemoryCommand % (stepPID)
        stdout, _stderr, _retcode = subprocessAlgos.runCommand(smaps_cmd)

        smaps_output = stdout.split()
        if not len(smaps_output) == 1:
            # Then something went wrong in getting the smaps data
            msg = "Error when grabbing output from smaps\n"
            msg += "output = %s\n" % smaps_output
            msg += "command = %s\n" % smaps_cmd
            logging.error(msg)
            return

        # smaps also returns data in kiloBytes, let's make it megaBytes
        # I'm also confused with these megabytes and mebibytes...
        pss = int(smaps_output[0]) // 1000

        logging.info("PSS: %s; RSS: %s; PCPU: %s; PMEM: %s",
                     smaps_output[0], ps_output[2], ps_output[3], ps_output[4])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxPSS is not None and pss >= self.maxPSS:
            msg += "Job has exceeded maxPSS: %s MB\n" % self.maxPSS
            msg += "Job has PSS: %s MB\n" % pss
            killProc = True
            reason = 'PSS'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if not killProc:
            # then job is behaving well, there is nothing to do
            return

        # make sure we persist the performance error only once
        if not self.killRetry:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find a pre-existing job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existing error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")

                report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += "Error: %s" % str(ex)
                logging.exception(msg2)

        try:
            if not killHard and not self.killRetry:
                logging.error("Attempting to kill step using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            else:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)
        except Exception:
            logging.error("Attempting to kill step using SIGTERM")
            os.kill(stepPID, signal.SIGTERM)
        finally:
            self.killRetry = True

        return

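# Minimal pure-Python equivalent (for illustration only) of the awk one-liner
# used above to total PSS from /proc/<pid>/smaps; Linux-only, and values in
# smaps are reported in kB. Note the awk pattern /^Pss/ would also match the
# Pss_Anon/Pss_File/Pss_Shmem breakdown lines present on newer kernels, so
# this sketch pins to the exact "Pss:" field instead.
def totalPssKB(pid):
    """Sum the Pss fields (kB) across all mappings of the given process."""
    total = 0
    with open('/proc/%d/smaps' % pid) as fh:
        for line in fh:
            if line.startswith('Pss:'):
                total += int(line.split()[1])
    return total
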
class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data
    regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        # self.grabCommand = "ps -u %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd |grep %s |grep -v grep"
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww |grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []

        self.maxRSS = None
        self.maxVSize = None
        self.logPath = None

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.
        """
        args = args or {}  # avoid a mutable default argument

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)

        self.logPath = os.path.join(logPath)

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None

        if self.stepHelper.stepType() not in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s" % self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PerformanceMonitor step initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.
        """
        killProc = False

        if self.disableStep:
            # Then we aren't doing CPU monitoring on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % cmd
            logging.error(msg)
            return

        rss = float(output[2])
        vsize = float(output[3])
        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s"
                     % (output[2], output[3], output[4], output[5]))

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
        if self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find a pre-existing job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existing error report in DashboardMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=99900,
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                logging.error("Attempting to kill job using SIGUSR2")
                os.kill(stepPID, signal.SIGUSR2)
            except Exception:
                os.kill(stepPID, signal.SIGTERM)

        return

class PerformanceMonitor(WMRuntimeMonitor):
    """
    _PerformanceMonitor_

    Monitors the performance by pinging ps and recording data
    regarding the current step
    """

    def __init__(self):
        """
        Actual variable initialization in initMonitor
        """
        self.pid = None
        self.uid = os.getuid()
        self.monitorBase = "ps -p %i -o pid,ppid,rss,vsize,pcpu,pmem,cmd -ww | grep %i"
        self.monitorCommand = None
        self.currentStepSpace = None
        self.currentStepName = None

        self.rss = []
        self.vsize = []
        self.pcpu = []
        self.pmem = []

        self.maxRSS = None
        self.maxVSize = None
        self.softTimeout = None
        self.hardTimeout = None

        self.logPath = None
        self.startTime = None

        self.watchStepTypes = []

        self.disableStep = False

        WMRuntimeMonitor.__init__(self)

        return

    def initMonitor(self, task, job, logPath, args=None):
        """
        _initMonitor_

        Puts together the information needed for the monitoring
        to actually find everything.
        """
        args = args or {}

        # Set the steps we want to watch
        self.watchStepTypes = args.get('WatchStepTypes', ['CMSSW', 'PerfTest'])

        self.maxRSS = args.get('maxRSS', None)
        self.maxVSize = args.get('maxVSize', None)
        self.softTimeout = args.get('softTimeout', None)
        self.hardTimeout = args.get('hardTimeout', None)

        self.numOfCores = args.get('cores', None)

        self.logPath = os.path.join(logPath)

        return

    def jobStart(self, task):
        """
        _jobStart_

        Acknowledge that the job has started and initialize the time
        """
        self.startTime = time.time()

        return

    def stepStart(self, step):
        """
        _stepStart_

        Assure that the monitor is pointing at the right step
        """
        self.stepHelper = WMStepHelper(step)
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None

        if self.stepHelper.stepType() not in self.watchStepTypes:
            self.disableStep = True
            logging.debug("PerformanceMonitor ignoring step of type %s", self.stepHelper.stepType())
            return
        else:
            logging.debug("Beginning PerformanceMonitor step initialization")
            self.disableStep = False

        return

    def stepEnd(self, step, stepReport):
        """
        _stepEnd_

        Package the information and send it off
        """
        if not self.disableStep:
            # No information to correlate
            return

        self.currentStepName = None
        self.currentStepSpace = None

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.
        """
        killProc = False
        killHard = False
        reason = ''
        errorCodeLookup = {'RSS': 50660,
                           'VSZ': 50661,
                           'Wallclock time': 50664,
                           '': 99999}

        if self.disableStep:
            # Then we aren't doing CPU monitoring on this step
            return

        if self.currentStepName is None:
            # We're between steps
            return

        if self.currentStepSpace is None:
            # Then build the step space
            self.currentStepSpace = getStepSpace(self.stepHelper.name())

        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        if stepPID is None:
            # Then we have no step PID, we can do nothing
            return

        # Now we run the monitor command and collate the data
        cmd = self.monitorBase % (stepPID, stepPID)
        stdout, stderr, retcode = subprocessAlgos.runCommand(cmd)

        output = stdout.split()
        if not len(output) > 7:
            # Then something went wrong in getting the ps data
            msg = "Error when grabbing output from process ps\n"
            msg += "output = %s\n" % output
            msg += "command = %s\n" % cmd
            logging.error(msg)
            return

        # FIXME: making it backwards compatible. Keep only the "else" block in HG1801
        if self.maxRSS is not None and self.maxRSS >= (1024 * 1024):
            # then workload value is still in KiB (old way)
            rss = int(output[2])
            vsize = int(output[3])
        else:
            rss = int(output[2]) // 1024  # convert it to MiB
            vsize = int(output[3]) // 1024  # convert it to MiB

        logging.info("Retrieved following performance figures:")
        logging.info("RSS: %s; VSize: %s; PCPU: %s; PMEM: %s",
                     output[2], output[3], output[4], output[5])

        msg = 'Error in CMSSW step %s\n' % self.currentStepName
        msg += 'Number of Cores: %s\n' % self.numOfCores

        if self.maxRSS is not None and rss >= self.maxRSS:
            msg += "Job has exceeded maxRSS: %s\n" % self.maxRSS
            msg += "Job has RSS: %s\n" % rss
            killProc = True
            reason = 'RSS'
        elif self.maxVSize is not None and vsize >= self.maxVSize:
            msg += "Job has exceeded maxVSize: %s\n" % self.maxVSize
            msg += "Job has VSize: %s\n" % vsize
            killProc = True
            reason = 'VSZ'
        elif self.hardTimeout is not None and self.softTimeout is not None:
            currentTime = time.time()
            if (currentTime - self.startTime) > self.softTimeout:
                killProc = True
                reason = 'Wallclock time'
                msg += "Job has been running for more than: %s\n" % str(self.softTimeout)
                msg += "Job has been running for: %s\n" % str(currentTime - self.startTime)
            if (currentTime - self.startTime) > self.hardTimeout:
                killHard = True
                msg += "Job exceeded soft timeout"

        if killProc:
            logging.error(msg)
            report = Report.Report()
            # Find the global report
            logPath = os.path.join(self.currentStepSpace.location,
                                   '../../../',
                                   os.path.basename(self.logPath))
            try:
                if os.path.isfile(logPath):
                    # We should be able to find a pre-existing job report.
                    # If not, we're in trouble
                    logging.debug("Found pre-existing error report in PerformanceMonitor termination.")
                    report.load(logPath)
                # Create a new step that won't be overridden by an exiting CMSSW
                if not report.retrieveStep(step="PerformanceError"):
                    report.addStep(reportname="PerformanceError")
                report.addError(stepName="PerformanceError", exitCode=errorCodeLookup[reason],
                                errorType="PerformanceKill", errorDetails=msg)
                report.save(logPath)
            except Exception as ex:
                # Basically, we can't write a log report and we're hosed
                # Kill anyway, and hope the logging file gets written out
                msg2 = "Exception while writing out jobReport.\n"
                msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
                msg2 += str(ex)
                msg2 += str(traceback.format_exc()) + '\n'
                logging.error(msg2)

            try:
                if not killHard:
                    logging.error("Attempting to kill step using SIGUSR2")
                    os.kill(stepPID, signal.SIGUSR2)
                else:
                    logging.error("Attempting to kill step using SIGTERM")
                    os.kill(stepPID, signal.SIGTERM)
            except Exception:
                logging.error("Attempting to kill step using SIGTERM")
                os.kill(stepPID, signal.SIGTERM)

        return

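# Minimal sketch of the backwards-compatibility heuristic in the FIXME block
# above: thresholds of at least 1024*1024 are assumed to be legacy KiB values,
# so the raw ps reading (in KiB) is compared as-is; otherwise both sides are
# compared in MiB. Function name and inputs are illustrative only.
def normaliseRss(rawRssKiB, maxRSS):
    """Return the RSS value in the same unit the configured threshold uses."""
    if maxRSS is not None and maxRSS >= (1024 * 1024):
        return rawRssKiB  # legacy: threshold already expressed in KiB
    return rawRssKiB // 1024  # modern: compare in MiB

print(normaliseRss(2048000, 2300000))  # legacy threshold, stays in KiB
print(normaliseRss(2048000, 2000))     # modern threshold, converted to MiB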