def testAFailedJobMonitoring(self):
    """
    _TestAFailedJobMonitoring_

    Walk a job whose report is created with outcome=1 through job start,
    step start, step end and job end, and check the packet DashboardInfo
    returns at each of those stages.
    """
    # Fixtures: job, task and a report built with outcome=1
    jobName = 'testB'
    testJob = self.createTestJob()
    testWorkload = self.createWorkload()
    testTask = testWorkload.getTask(taskName="DataProcessing")
    failedReport = self.createReport(outcome=1)

    # Fill the job environment, then build the dashboard object
    self.setupJobEnvironment(name=jobName)
    dashboard = DashboardInfo(job=testJob, task=testTask,
                              dashboardUrl='127.0.0.1:8884')

    # Packet returned at job start
    jobStartPacket = dashboard.jobStart()
    for key, expected in (('MessageType', 'JobStatus'),
                          ('StatusValue', 'running'),
                          ('StatusDestination', "T1_US_FNAL"),
                          ('taskId', 'wmagent_Tier1ReReco')):
        self.assertEqual(jobStartPacket[key], expected)

    # The step under test
    cmsRun = testTask.getStep(stepName="cmsRun1")

    # Step start: the first step's packet embeds the job-start information
    stepStartPacket = dashboard.stepStart(step=cmsRun.data)
    embeddedJobStart = stepStartPacket['jobStart']
    self.assertNotEqual(embeddedJobStart, None)
    self.assertEqual(embeddedJobStart['ExeStart'], cmsRun.name())
    self.assertEqual(embeddedJobStart['WNHostName'], socket.gethostname())
    self.assertEqual(stepStartPacket['1_ExeStart'], cmsRun.name())

    # Step end: a non-zero exit code is expected for the failed report
    stepEndPacket = dashboard.stepEnd(step=cmsRun.data, stepReport=failedReport)
    self.assertEqual(stepEndPacket['1_ExeEnd'], cmsRun.name())
    self.assertNotEqual(stepEndPacket['1_ExeExitCode'], 0)
    self.assertTrue(stepEndPacket['1_ExeWCTime'] >= 0)
    self.assertEqual(failedReport.retrieveStep("cmsRun1").counter, 1)

    # Job end: the exit reason must mention the failing step
    jobEndPacket = dashboard.jobEnd()
    self.assertEqual(jobEndPacket['ExeEnd'], "cmsRun1")
    self.assertNotEqual(jobEndPacket['JobExitCode'], 0)
    self.assertEqual(jobEndPacket['WrapperCPUTime'], 0)
    self.assertTrue(jobEndPacket['WrapperWCTime'] >= 0)
    self.assertNotEqual(jobEndPacket['JobExitReason'].find('cmsRun1'), -1)
def testAFailedJobMonitoring(self):
    """
    _TestAFailedJobMonitoring_

    Run a job whose report is created with outcome=1 through the full
    dashboard sequence and verify the data returned by every call.
    """
    # Fixtures: job, task and a report built with outcome=1
    jobName = 'testB'
    testJob = self.createTestJob()
    testWorkload = self.createWorkload()
    testTask = testWorkload.getTask(taskName="DataProcessing")
    failedReport = self.createReport(outcome=1)

    # Fill the job environment
    self.setupJobEnvironment(name=jobName)

    # Build the dashboard object and register a destination
    dashboard = DashboardInfo(job=testJob, task=testTask)
    dashboard.addDestination('127.0.0.1', 8884)

    # Packet returned at job start
    jobStartPacket = dashboard.jobStart()
    for key, expected in (('MessageType', 'JobStatus'),
                          ('StatusValue', 'running'),
                          ('StatusDestination', "T1_US_FNAL"),
                          ('taskId', 'wmagent_Tier1ReReco')):
        self.assertEqual(jobStartPacket[key], expected)

    # The step under test
    cmsRun = testTask.getStep(stepName="cmsRun1")

    # Step start: the first step's packet embeds the job-start information
    stepStartPacket = dashboard.stepStart(step=cmsRun.data)
    embeddedJobStart = stepStartPacket['jobStart']
    self.assertNotEqual(embeddedJobStart, None)
    self.assertEqual(embeddedJobStart['ExeStart'], cmsRun.name())
    self.assertEqual(embeddedJobStart['WNHostName'], socket.gethostname())
    self.assertEqual(stepStartPacket['1_ExeStart'], cmsRun.name())

    # Step end: a non-zero exit code is expected for the failed report
    stepEndPacket = dashboard.stepEnd(step=cmsRun.data, stepReport=failedReport)
    self.assertEqual(stepEndPacket['1_ExeEnd'], cmsRun.name())
    self.assertNotEqual(stepEndPacket['1_ExeExitCode'], 0)
    self.assertTrue(stepEndPacket['1_ExeWCTime'] >= 0)
    self.assertEqual(failedReport.retrieveStep("cmsRun1").counter, 1)

    # Job end: the exit reason must mention the failing step
    jobEndPacket = dashboard.jobEnd()
    self.assertEqual(jobEndPacket['ExeEnd'], "cmsRun1")
    self.assertNotEqual(jobEndPacket['JobExitCode'], 0)
    self.assertEqual(jobEndPacket['WrapperCPUTime'], 0)
    self.assertTrue(jobEndPacket['WrapperWCTime'] >= 0)
    self.assertNotEqual(jobEndPacket['JobExitReason'].find('cmsRun1'), -1)
def testA_createDashboardInfo(self):
    """
    _createDashboardInfo_

    Can we create the dashboardInfo and fill it with local information?

    Builds a DashboardInfo without any destination, checks a few default
    fields, then runs one step through stepStart/stepEnd and finishes the
    job, checking the data returned by each call.
    """
    # Get the necessary objects
    name = 'testA'
    job = self.createTestJob()
    workload = self.createWorkload()
    task = workload.getTask(taskName="DataProcessing")
    report = self.createReport()

    # Fill the job environment
    self.setupJobEnvironment(name=name)

    # Instantiate DBInfo
    dbInfo = DashboardInfo(job=job, task=task)

    # Check some defaults
    self.assertEqual(dbInfo.get('TaskType', None), task.taskType())
    self.assertEqual(dbInfo.get('User', None), '*****@*****.**')
    self.assertEqual(dbInfo.get('JSTool', None), 'WMAgent')

    # This shouldn't add anything,
    # but we have to make sure it doesn't fail.
    dbInfo.jobStart()

    # Do a step
    step = task.getStep(stepName="cmsRun1")

    # Do the step start
    data = dbInfo.stepStart(step=step.data)
    self.assertEqual(data.get('ExeStart', None), step.name())
    self.assertEqual(data.get('taskId', None), 'wmagent_Tier1ReReco')

    # Do the step end
    data = dbInfo.stepEnd(step=step.data, stepReport=report)
    self.assertEqual(data.get('ExeEnd', None), step.name())
    self.assertEqual(data.get('ExeExitCode', None), 0)

    # End the job!
    data = dbInfo.jobEnd()
    # Identity test against the None singleton (not ==), so a value with a
    # permissive __eq__ cannot mask a missing timestamp.
    self.assertFalse(data.get('MessageTS', None) is None,
                     'Did not assign finish time in jobEnd()')
    return
def testMultithreadedApplication(self):
    """
    _testMultithreadedApplication_

    Set the number of cores on the cmsRun1 step and verify that the value
    shows up as NCores in the job-start, step-end and job-end packets.
    """
    expectedCores = 8

    # Fixtures: job, task and report
    jobName = 'testMT'
    testJob = self.createTestJob()
    testWorkload = self.createWorkload()
    testTask = testWorkload.getTask(taskName="DataProcessing")
    jobReport = self.createReport()

    # Fill the job environment, then build the dashboard object
    self.setupJobEnvironment(name=jobName)
    dashboard = DashboardInfo(job=testJob, task=testTask,
                              dashboardUrl='127.0.0.1:8884')

    # Configure the step with the core count under test
    cmsRun = testTask.getStep(stepName="cmsRun1")
    cmsRun.getTypeHelper().setNumberOfCores(expectedCores)

    # Job start must already report the core count
    jobStartPacket = dashboard.jobStart()
    self.assertEqual(jobStartPacket['NCores'], expectedCores)

    # Fetch the step again and run it through start/end
    cmsRun = testTask.getStep(stepName="cmsRun1")
    dashboard.stepStart(step=cmsRun.data)
    stepEndPacket = dashboard.stepEnd(step=cmsRun.data, stepReport=jobReport)
    self.assertEqual(stepEndPacket['1_NCores'], expectedCores)
    self.assertEqual(jobReport.retrieveStep("cmsRun1").counter, 1)

    # The final packet carries the core count as well
    jobEndPacket = dashboard.jobEnd()
    self.assertEqual(jobEndPacket['NCores'], expectedCores)
def testMultithreadedApplication(self):
    """
    _testMultithreadedApplication_

    Set the number of cores on the cmsRun1 step and confirm that NCores is
    reported in the job-start, step-end and job-end packets.
    """
    expectedCores = 8

    # Fixtures: job, task and report
    jobName = 'testMT'
    testJob = self.createTestJob()
    testWorkload = self.createWorkload()
    testTask = testWorkload.getTask(taskName="DataProcessing")
    jobReport = self.createReport()

    # Fill the job environment
    self.setupJobEnvironment(name=jobName)

    # Build the dashboard object and register a destination
    dashboard = DashboardInfo(job=testJob, task=testTask)
    dashboard.addDestination('127.0.0.1', 8884)

    # Configure the step with the core count under test
    cmsRun = testTask.getStep(stepName="cmsRun1")
    cmsRun.getTypeHelper().setNumberOfCores(expectedCores)

    # Job start must already report the core count
    jobStartPacket = dashboard.jobStart()
    self.assertEqual(jobStartPacket['NCores'], expectedCores)

    # Fetch the step again and run it through start/end
    cmsRun = testTask.getStep(stepName="cmsRun1")
    dashboard.stepStart(step=cmsRun.data)
    stepEndPacket = dashboard.stepEnd(step=cmsRun.data, stepReport=jobReport)
    self.assertEqual(stepEndPacket['1_NCores'], expectedCores)
    self.assertEqual(jobReport.retrieveStep("cmsRun1").counter, 1)

    # The final packet carries the core count as well
    jobEndPacket = dashboard.jobEnd()
    self.assertEqual(jobEndPacket['NCores'], expectedCores)
def testASuccessfulJobMonitoring(self):
    """
    _testASuccessfulJobMonitoring_

    Run the cmsRun1 step twice through a successful job and check the
    packets returned at job start, at each step start/end and at job end.
    """
    # Fixtures: job, task and report
    jobName = 'testA'
    testJob = self.createTestJob()
    testWorkload = self.createWorkload()
    testTask = testWorkload.getTask(taskName="DataProcessing")
    jobReport = self.createReport()

    # Fill the job environment
    self.setupJobEnvironment(name=jobName)

    # Build the dashboard object and register a destination
    dashboard = DashboardInfo(job=testJob, task=testTask)
    dashboard.addDestination('127.0.0.1', 8884)

    # Packet returned at job start
    jobStartPacket = dashboard.jobStart()
    for key, expected in (('MessageType', 'JobStatus'),
                          ('StatusValue', 'running'),
                          ('StatusDestination', "T1_US_FNAL"),
                          ('taskId', 'wmagent_Tier1ReReco')):
        self.assertEqual(jobStartPacket[key], expected)

    # ---- first pass over the step: keys are prefixed with "1_" ----
    cmsRun = testTask.getStep(stepName="cmsRun1")

    firstStart = dashboard.stepStart(step=cmsRun.data)
    embeddedJobStart = firstStart['jobStart']
    self.assertNotEqual(embeddedJobStart, None)
    self.assertEqual(embeddedJobStart['ExeStart'], cmsRun.name())
    self.assertEqual(embeddedJobStart['WNHostName'], socket.gethostname())
    self.assertEqual(firstStart['1_ExeStart'], cmsRun.name())

    firstEnd = dashboard.stepEnd(step=cmsRun.data, stepReport=jobReport)
    self.assertEqual(firstEnd['1_ExeEnd'], cmsRun.name())
    self.assertEqual(firstEnd['1_ExeExitCode'], 0)
    self.assertTrue(firstEnd['1_ExeWCTime'] >= 0)
    self.assertEqual(jobReport.retrieveStep("cmsRun1").counter, 1)

    # ---- second pass: keys are prefixed with "2_", no job-start info ----
    cmsRun = testTask.getStep(stepName="cmsRun1")

    secondStart = dashboard.stepStart(step=cmsRun.data)
    self.assertEqual(secondStart['jobStart'], None)
    self.assertEqual(secondStart['2_ExeStart'], cmsRun.name())

    secondEnd = dashboard.stepEnd(step=cmsRun.data, stepReport=jobReport)
    self.assertEqual(secondEnd['2_ExeEnd'], cmsRun.name())
    self.assertEqual(secondEnd['2_ExeExitCode'], 0)
    self.assertTrue(secondEnd['2_ExeWCTime'] >= 0)
    self.assertEqual(jobReport.retrieveStep("cmsRun1").counter, 2)

    # Job end
    jobEndPacket = dashboard.jobEnd()
    self.assertEqual(jobEndPacket['ExeEnd'], "cmsRun1")
    self.assertEqual(jobEndPacket['JobExitCode'], 0)
    self.assertEqual(jobEndPacket['WrapperCPUTime'], 0)
    self.assertTrue(jobEndPacket['WrapperWCTime'] >= 0)
    self.assertNotEqual(jobEndPacket['JobExitReason'], "")
def testASuccessfulJobMonitoring(self):
    """
    _testASuccessfulJobMonitoring_

    Run the cmsRun1 step twice through a successful job and check the
    packets returned at job start, at each step start/end and at job end.
    """
    # Fixtures: job, task and report
    jobName = 'testA'
    testJob = self.createTestJob()
    testWorkload = self.createWorkload()
    testTask = testWorkload.getTask(taskName="DataProcessing")
    jobReport = self.createReport()

    # Fill the job environment
    self.setupJobEnvironment(name=jobName)

    # Build the dashboard object and register a destination
    dashboard = DashboardInfo(job=testJob, task=testTask)
    dashboard.addDestination('127.0.0.1', 8884)

    # Packet returned at job start
    jobStartPacket = dashboard.jobStart()
    for key, expected in (('MessageType', 'JobStatus'),
                          ('StatusValue', 'running'),
                          ('StatusDestination', "T1_US_FNAL"),
                          ('taskId', 'wmagent_Tier1ReReco')):
        self.assertEqual(jobStartPacket[key], expected)

    # ---- first pass: the step-start packet embeds the job-start info ----
    cmsRun = testTask.getStep(stepName="cmsRun1")

    firstStart = dashboard.stepStart(step=cmsRun.data)
    embeddedJobStart = firstStart['jobStart']
    self.assertNotEqual(embeddedJobStart, None)
    self.assertEqual(embeddedJobStart['ExeStart'], cmsRun.name())
    self.assertEqual(embeddedJobStart['WNHostName'], socket.gethostname())
    self.assertEqual(firstStart['ExeStart'], cmsRun.name())

    firstEnd = dashboard.stepEnd(step=cmsRun.data, stepReport=jobReport)
    self.assertEqual(firstEnd['ExeEnd'], cmsRun.name())
    self.assertEqual(firstEnd['ExeExitCode'], 0)
    self.assertTrue(firstEnd['ExeWCTime'] >= 0)

    # ---- second pass: no job-start info this time ----
    cmsRun = testTask.getStep(stepName="cmsRun1")

    secondStart = dashboard.stepStart(step=cmsRun.data)
    self.assertEqual(secondStart['jobStart'], None)
    self.assertEqual(secondStart['ExeStart'], cmsRun.name())

    secondEnd = dashboard.stepEnd(step=cmsRun.data, stepReport=jobReport)
    self.assertEqual(secondEnd['ExeEnd'], cmsRun.name())
    self.assertEqual(secondEnd['ExeExitCode'], 0)
    self.assertTrue(secondEnd['ExeWCTime'] >= 0)

    # Job end
    jobEndPacket = dashboard.jobEnd()
    self.assertEqual(jobEndPacket['ExeEnd'], "cmsRun1")
    self.assertEqual(jobEndPacket['JobExitCode'], 0)
    self.assertEqual(jobEndPacket['WrapperCPUTime'], 0)
    self.assertTrue(jobEndPacket['WrapperWCTime'] >= 0)
    self.assertNotEqual(jobEndPacket['JobExitReason'], "")
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Runtime monitor that relays job and step notifications to a
    DashboardInfo instance.  Exceptions raised by the DashboardInfo calls
    are logged (message plus traceback) and not re-raised.
    """

    def __init__(self):
        # State of the step currently being tracked
        self.startTime = None
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        # Filled in by initMonitor()
        self.task = None
        self.job = None
        self.dashboardInfo = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args={}):
        """
        Remember the task and job and create the DashboardInfo object.

        The dashboard URL is built as "<destinationHost>:<destinationPort>"
        from args, and args['cores'] (default 0) is handed over as
        overrideCores.  logPath is accepted but not used.
        """
        logging.info("In DashboardMonitor.initMonitor")

        self.task, self.job = task, job

        host = args.get('destinationHost', None)
        port = args.get('destinationPort', None)
        url = '%s:%s' % (host, str(port))
        overrideCores = args.get('cores', 0)

        self.dashboardInfo = DashboardInfo(task, job,
                                           dashboardUrl=url,
                                           overrideCores=overrideCores)

    def jobStart(self, task):
        """
        Forward the job start notification.
        """
        try:
            self.dashboardInfo.jobStart()
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def jobEnd(self, task):
        """
        Forward the job end notification.
        """
        try:
            self.dashboardInfo.jobEnd()
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def stepStart(self, step):
        """
        Record the step that is starting and forward the notification.
        """
        self.currentStep = step
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        self.startTime = time.time()

        try:
            self.dashboardInfo.stepStart(step=step)
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def stepEnd(self, step, stepReport):
        """
        Clear the current-step state and forward the step end notification.
        """
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None

        try:
            self.dashboardInfo.stepEnd(step=step, stepReport=stepReport)
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def stepKilled(self, step):
        """
        Clear the current step and its name, then forward the step killed
        notification.
        """
        self.currentStep = None
        self.currentStepName = None

        try:
            self.dashboardInfo.stepKilled(step=step)
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def jobKilled(self, task):
        """
        Forward the job killed notification.
        """
        try:
            self.dashboardInfo.jobKilled()
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def periodicUpdate(self):
        """
        Called on the defined intervals; asks the dashboard info to run its
        periodic update.
        """
        try:
            self.dashboardInfo.periodicUpdate()
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Run in the background and pass information to the DashboardInfo
    instance.  If the running step exceeds the configured soft timeout,
    record a JobTimeout error in the job report and kill the step process.
    """

    def __init__(self):
        # State of the step currently being tracked
        self.startTime = None
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        # Timeout configuration (seconds); filled in by initMonitor()
        self.softTimeOut = None
        self.hardTimeOut = None
        # True once a soft kill signal has been sent
        self.killFlag = False
        self.cmsswFile = None
        # Set by initMonitor(); initialised here so the attribute always exists
        self.logPath = None
        self.task = None
        self.job = None
        self.dashboardInfo = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args=None):
        """
        Handles the monitor initiation.

        Stores task, job and logPath, reads 'softTimeOut' / 'hardTimeOut'
        from args and creates the DashboardInfo object.  A destination is
        only registered when both 'destinationHost' and 'destinationPort'
        are present and truthy in args.
        """
        if args is None:
            # Avoid a shared mutable default argument
            args = {}

        logging.info("In DashboardMonitor.initMonitor")

        self.task = task
        self.job = job
        self.logPath = logPath

        self.softTimeOut = args.get('softTimeOut', None)
        self.hardTimeOut = args.get('hardTimeOut', None)

        destHost = args.get('destinationHost', None)
        destPort = args.get('destinationPort', None)

        self.dashboardInfo = DashboardInfo(task=task, job=job)

        if destHost and destPort:
            logging.info("About to set destination to %s:%s", destHost, destPort)
            self.dashboardInfo.addDestination(host=destHost, port=destPort)

    def jobStart(self, task):
        """
        Job start notifier.
        """
        self.dashboardInfo.jobStart()
        return

    def jobEnd(self, task):
        """
        Job End notification
        """
        self.dashboardInfo.jobEnd()
        return

    def stepStart(self, step):
        """
        Step start notification; also starts the timeout clock.
        """
        self.currentStep = step
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        self.startTime = time.time()
        self.dashboardInfo.stepStart(step=step)
        return

    def stepEnd(self, step, stepReport):
        """
        Step end notification
        """
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        self.dashboardInfo.stepEnd(step=step, stepReport=stepReport)
        return

    def stepKilled(self, step):
        """
        Step killed notification
        """
        self.currentStep = None
        self.currentStepName = None
        self.dashboardInfo.stepKilled(step=step)

    def jobKilled(self, task):
        """
        Killed job notification
        """
        self.dashboardInfo.jobKilled()
        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Forwards the update to the dashboard info and, when a soft timeout
        is configured and the current step has been running longer than
        that, writes a timeout error to the job report and kills the step.
        """
        if not self.currentStep:
            # We're probably between steps
            return

        self.dashboardInfo.periodicUpdate()

        # Check for events
        if self.cmsswFile:
            # Pass the configured file (the original passed the builtin
            # name `file` by mistake).
            run, event = searchForEvent(self.cmsswFile)
            if run and event:
                # Then we actually found something, otherwise do nothing.
                # Nothing is done with the result yet.
                pass

        # Do timeout
        if not self.softTimeOut:
            return
        if time.time() - self.startTime <= self.softTimeOut:
            return

        # The soft timeout has expired: we have to kill the process.
        # If our stepName is None, we're in between steps. Nothing to kill!
        if self.currentStepName is None:
            return

        # If our stepName is valid, then we may need the stepSpace
        if self.currentStepSpace is None:
            self.currentStepSpace = getStepSpace(self.currentStepName)

        # First, get the PID
        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        # Now kill it!
        msg = ""
        msg += "Start Time: %s\n" % self.startTime
        msg += "Time Now: %s\n" % time.time()
        msg += "Timeout: %s\n" % self.softTimeOut
        msg += "Killing Job...\n"
        msg += "Process ID is: %s\n" % stepPID

        # If possible, write a FWJR
        self._recordTimeoutError(msg)

        if stepPID is None or stepPID == os.getpid():
            # Then we are supposed to kill things
            # that don't exist in separate processes:
            # Self-terminate
            msg += "WARNING: No separate process. Watchdog will attempt self-termination."
            logging.error(msg)
            os.abort()

        self._killStep(stepPID, msg)
        return

    def _recordTimeoutError(self, msg):
        """
        Best effort: add a JobTimeout error (exit code 99901) to the job
        report next to the step space; failures are logged, never raised.
        """
        report = Report.Report()
        try:
            self.logPath = os.path.join(self.currentStepSpace.location,
                                        '../../../',
                                        os.path.basename(self.logPath))
            if os.path.isfile(self.logPath):
                # We should be able to find existant job report.
                # If not, we're in trouble
                logging.debug("Found pre-existant error report in DashboardMonitor termination.")
                report.load(self.logPath)
            report.addError(stepName=self.currentStepName, exitCode=99901,
                            errorType="JobTimeout", errorDetails=msg)
            report.save(self.logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 = "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

    def _killStep(self, stepPID, msg):
        """
        Send SIGUSR2 as a soft kill; once a soft kill has been sent and the
        hard timeout has passed, escalate to SIGTERM and then SIGKILL.
        """
        elapsed = time.time() - self.startTime
        # A missing hard timeout counts as already expired.  This keeps the
        # Python 2 outcome of `elapsed < None` (always False) without the
        # TypeError that comparison raises on Python 3.
        hardExpired = self.hardTimeOut is None or elapsed >= self.hardTimeOut

        if not hardExpired or not self.killFlag:
            msg += "WARNING: Soft Kill Timeout has Expired:"
            logging.error(msg)
            os.kill(stepPID, signal.SIGUSR2)
            self.killFlag = True
            return

        msg += "WARNING: Hard Kill Timeout has Expired:"
        logging.error(msg)
        os.kill(stepPID, signal.SIGTERM)
        killedpid, stat = os.waitpid(stepPID, os.WNOHANG)
        if killedpid == 0:
            # Still there after SIGTERM: escalate
            os.kill(stepPID, signal.SIGKILL)
            killedpid, stat = os.waitpid(stepPID, os.WNOHANG)
            if killedpid == 0:
                # Panic! It's unkillable!
                logging.error("Can't kill job. Out of options. Waiting for system reboot.")
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Background monitor that hands job and step notifications over to a
    DashboardInfo instance.  Any Exception raised by a DashboardInfo call
    is written to the log (message and traceback) and then dropped.
    """

    def __init__(self):
        # Tracking of the step in progress
        self.startTime = None
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        # Assigned in initMonitor()
        self.task = None
        self.job = None
        self.dashboardInfo = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args={}):
        """
        Keep references to task and job and instantiate DashboardInfo.

        args supplies 'destinationHost' and 'destinationPort', which are
        joined into the "<host>:<port>" dashboard URL, and 'cores'
        (default 0), which is passed as overrideCores.  logPath is not
        used by this method.
        """
        logging.info("In DashboardMonitor.initMonitor")

        self.task, self.job = task, job

        host = args.get('destinationHost', None)
        port = args.get('destinationPort', None)
        url = '%s:%s' % (host, str(port))
        overrideCores = args.get('cores', 0)

        self.dashboardInfo = DashboardInfo(task, job,
                                           dashboardUrl=url,
                                           overrideCores=overrideCores)

    def jobStart(self, task):
        """
        Pass the job start on to the dashboard info.
        """
        try:
            self.dashboardInfo.jobStart()
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def jobEnd(self, task):
        """
        Pass the job end on to the dashboard info.
        """
        try:
            self.dashboardInfo.jobEnd()
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def stepStart(self, step):
        """
        Remember the starting step and its start time, then notify.
        """
        self.currentStep = step
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        self.startTime = time.time()

        try:
            self.dashboardInfo.stepStart(step=step)
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def stepEnd(self, step, stepReport):
        """
        Reset the current-step state, then notify about the step end.
        """
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None

        try:
            self.dashboardInfo.stepEnd(step=step, stepReport=stepReport)
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def stepKilled(self, step):
        """
        Reset the current step and its name, then notify about the kill.
        """
        self.currentStep = None
        self.currentStepName = None

        try:
            self.dashboardInfo.stepKilled(step=step)
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def jobKilled(self, task):
        """
        Pass the job killed notification on to the dashboard info.
        """
        try:
            self.dashboardInfo.jobKilled()
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))

    def periodicUpdate(self):
        """
        Invoked on the defined intervals; triggers the dashboard info's
        periodic update.
        """
        try:
            self.dashboardInfo.periodicUpdate()
        except Exception as err:
            logging.error(str(err))
            logging.error(str(traceback.format_exc()))
def testA_createDashboardInfo(self):
    """
    _createDashboardInfo_

    Can we create the dashboardInfo and fill it with local information?

    Builds a DashboardInfo, checks its default fields, then runs
    jobStart, one step (stepStart/stepEnd) and jobEnd, reading the
    resulting values back from the DashboardInfo object itself.
    """
    # Get the necessary objects
    name = 'testA'
    job = self.createTestJob()
    workload = self.createWorkload()
    task = workload.getTask(taskName="DataProcessing")
    report = self.createReport()

    # Fill the job environment
    self.setupJobEnvironment(name=name)

    # Instantiate DBInfo
    dbInfo = DashboardInfo(job=job, task=task)

    # Check some defaults
    self.assertEqual(dbInfo.get('TaskType', None), task.taskType())
    self.assertEqual(dbInfo.get('User', None), '*****@*****.**')
    self.assertEqual(dbInfo.get('JSTool', None), 'WMAgent')
    self.assertEqual(dbInfo.get('jobName', None), 'WMAgent_1_0_ThisIsASillyName')
    self.assertEqual(dbInfo.get('taskName', None),
                     'ProdAgent_-Tier1ReReco-DataProcessing_WMAgentPrimary')

    # After jobStart both IDs are expected to equal the environment name
    dbInfo.jobStart()
    self.assertEqual(dbInfo.get('GridJobID', None), name)
    self.assertEqual(dbInfo.get('SyncCE', None), name)

    # Do a step
    step = task.getStep(stepName="cmsRun1")

    # Do the step start
    dbInfo.stepStart(step=step.data)
    self.assertEqual(dbInfo.get('ExeStart', None), step.name())
    self.assertEqual(dbInfo.get('ApplicationVersion', None), 'CMSSW_3_5_8')

    # Do the step end
    dbInfo.stepEnd(step=step.data, stepReport=report)
    self.assertEqual(dbInfo.get('ExeEnd', None), step.name())
    self.assertEqual(dbInfo.get('ExeExitStatus', None), False)

    # End the job!
    dbInfo.jobEnd()
    # Identity test against the None singleton (not ==), so a value with a
    # permissive __eq__ cannot mask a missing finish time.
    self.assertFalse(dbInfo.get('JobFinished', None) is None,
                     'Did not assign finish time in jobEnd()')
    return