Esempio n. 1
0
    def testA_createDashboardInfo(self):
        """
        _createDashboardInfo_

        Build a DashboardInfo for a test job and walk it through the
        jobStart -> stepStart -> stepEnd -> jobEnd sequence, checking the
        defaults it exposes and the data returned by each call.
        """

        # Get the necessary objects
        name = 'testA'
        job = self.createTestJob()
        workload = self.createWorkload()
        task = workload.getTask(taskName="DataProcessing")
        report = self.createReport()

        # Fill the job environment
        self.setupJobEnvironment(name=name)

        # Instantiate DBInfo
        dbInfo = DashboardInfo(job=job, task=task)

        # Check some defaults
        self.assertEqual(dbInfo.get('TaskType', None), task.taskType())
        self.assertEqual(dbInfo.get('User', None), '*****@*****.**')
        self.assertEqual(dbInfo.get('JSTool', None), 'WMAgent')

        # Nothing is asserted on the result here; the call only has to
        # complete without raising.
        dbInfo.jobStart()

        # Do a step
        step = task.getStep(stepName="cmsRun1")

        # Do the step start
        data = dbInfo.stepStart(step=step.data)
        self.assertEqual(data.get('ExeStart', None), step.name())
        self.assertEqual(data.get('taskId', None), 'wmagent_Tier1ReReco')

        # Do the step end
        data = dbInfo.stepEnd(step=step.data, stepReport=report)
        self.assertEqual(data.get('ExeEnd', None), step.name())
        self.assertEqual(data.get('ExeExitCode', None), 0)

        # End the job!
        data = dbInfo.jobEnd()
        # Identity check instead of "== None": the result must not depend
        # on how the stored value implements equality.
        self.assertFalse(data.get('MessageTS', None) is None,
                         'Did not assign finish time in jobEnd()')

        return
Esempio n. 2
0
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Runtime monitor that passes job information to a DashboardInfo
    instance.

    This class body only implements monitor set-up and the job-start
    notification; it contains no timeout handling of its own.
    """

    def __init__(self):
        # Step bookkeeping; nothing in this class body changes these
        # after construction.
        self.startTime = None
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        # Set by initMonitor
        self.task = None
        self.job = None
        self.dashboardInfo = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args={}):
        """
        Handles the monitor initiation

        :param task: task object, stored and handed to DashboardInfo
        :param job: job object, stored and handed to DashboardInfo
        :param logPath: accepted but not used by this method
        :param args: monitor options; "destinationHost" and
            "destinationPort" are read.  The dict is never modified here,
            so the shared default is harmless.
        """
        logging.info("In DashboardMonitor.initMonitor")

        self.task = task
        self.job = job

        destHost = args.get("destinationHost", None)
        destPort = args.get("destinationPort", None)

        self.dashboardInfo = DashboardInfo(task=task, job=job)

        # A destination is only registered when both parts are set
        if destHost and destPort:
            logging.info("About to set destination to %s:%s", destHost, destPort)
            self.dashboardInfo.addDestination(host=destHost, port=destPort)

    def jobStart(self, task):
        """
        Job start notifier.

        Any exception raised by the dashboard call is logged (message
        and traceback) and not propagated.

        :param task: not used by this method
        """
        try:
            self.dashboardInfo.jobStart()
        except Exception as ex:
            logging.error(str(ex))
            logging.error(str(traceback.format_exc()))

        return
Esempio n. 3
0
    def testAFailedJobMonitoring(self):
        """
        _TestAFailedJobMonitoring_

        Run a job whose step report is created with outcome = 1 through
        jobStart, stepStart, stepEnd and jobEnd, and check the data
        returned by each call.
        """

        # Fixtures for the test
        envName = 'testB'
        testJob = self.createTestJob()
        testWorkload = self.createWorkload()
        procTask = testWorkload.getTask(taskName="DataProcessing")
        failedReport = self.createReport(outcome=1)

        # Populate the job environment
        self.setupJobEnvironment(name=envName)

        # Dashboard object under test, with one destination registered
        dashboard = DashboardInfo(job=testJob, task=procTask)
        dashboard.addDestination('127.0.0.1', 8884)

        # jobStart packet
        packet = dashboard.jobStart()
        self.assertEqual(packet['MessageType'], 'JobStatus')
        self.assertEqual(packet['StatusValue'], 'running')
        self.assertEqual(packet['StatusDestination'], "T1_US_FNAL")
        self.assertEqual(packet['taskId'], 'wmagent_Tier1ReReco')

        # The only step exercised by this test
        cmsRunStep = procTask.getStep(stepName="cmsRun1")

        # stepStart packet: for the first step it must also carry the
        # job-start information
        packet = dashboard.stepStart(step=cmsRunStep.data)
        jobStartInfo = packet['jobStart']
        self.assertNotEqual(jobStartInfo, None)
        self.assertEqual(jobStartInfo['ExeStart'], cmsRunStep.name())
        self.assertEqual(jobStartInfo['WNHostName'], socket.gethostname())
        self.assertEqual(packet['1_ExeStart'], cmsRunStep.name())

        # stepEnd packet: a non-zero exit code is expected
        packet = dashboard.stepEnd(step=cmsRunStep.data, stepReport=failedReport)
        self.assertEqual(packet['1_ExeEnd'], cmsRunStep.name())
        self.assertNotEqual(packet['1_ExeExitCode'], 0)
        self.assertTrue(packet['1_ExeWCTime'] >= 0)
        self.assertEqual(failedReport.retrieveStep("cmsRun1").counter, 1)

        # jobEnd packet: the exit reason has to mention the step name
        packet = dashboard.jobEnd()
        self.assertEqual(packet['ExeEnd'], "cmsRun1")
        self.assertNotEqual(packet['JobExitCode'], 0)
        self.assertEqual(packet['WrapperCPUTime'], 0)
        self.assertTrue(packet['WrapperWCTime'] >= 0)
        self.assertNotEqual(packet['JobExitReason'].find('cmsRun1'), -1)
Esempio n. 4
0
    def testMultithreadedApplication(self):
        """
        _testMultithreadedApplication_

        Set the number of cores on the cmsRun1 step to 8 and verify that
        the jobStart, stepEnd and jobEnd packets all report that value.
        """

        # Fixtures for the test
        envName = 'testMT'
        testJob = self.createTestJob()
        testWorkload = self.createWorkload()
        procTask = testWorkload.getTask(taskName="DataProcessing")
        stepReport = self.createReport()

        # Populate the job environment
        self.setupJobEnvironment(name=envName)

        # Dashboard object under test, with one destination registered
        dashboard = DashboardInfo(job=testJob, task=procTask)
        dashboard.addDestination('127.0.0.1', 8884)

        # Configure the step as an 8-core application
        cmsRunStep = procTask.getStep(stepName="cmsRun1")
        cmsRunStep.getTypeHelper().setNumberOfCores(8)

        # The core count must already show up at job start
        packet = dashboard.jobStart()
        self.assertEqual(packet['NCores'], 8)

        # Fetch the step again before running it
        cmsRunStep = procTask.getStep(stepName="cmsRun1")

        # Step start (nothing is asserted on the returned data)
        packet = dashboard.stepStart(step=cmsRunStep.data)

        # Step end carries the per-step core count
        packet = dashboard.stepEnd(step=cmsRunStep.data, stepReport=stepReport)
        self.assertEqual(packet['1_NCores'], 8)
        self.assertEqual(stepReport.retrieveStep("cmsRun1").counter, 1)

        # Final job packet reports the core count as well
        packet = dashboard.jobEnd()
        self.assertEqual(packet['NCores'], 8)
Esempio n. 5
0
    def testASuccessfulJobMonitoring(self):
        """
        _testASuccessfulJobMonitoring_

        Run a successful job with two step executions through the
        DashboardInfo calls and check the data returned at every stage.
        """

        # Fixtures for the test
        envName = 'testA'
        testJob = self.createTestJob()
        testWorkload = self.createWorkload()
        procTask = testWorkload.getTask(taskName="DataProcessing")
        stepReport = self.createReport()

        # Populate the job environment
        self.setupJobEnvironment(name=envName)

        # Dashboard object under test, with one destination registered
        dashboard = DashboardInfo(job=testJob, task=procTask)
        dashboard.addDestination('127.0.0.1', 8884)

        # jobStart packet
        packet = dashboard.jobStart()
        self.assertEqual(packet['MessageType'], 'JobStatus')
        self.assertEqual(packet['StatusValue'], 'running')
        self.assertEqual(packet['StatusDestination'], "T1_US_FNAL")
        self.assertEqual(packet['taskId'], 'wmagent_Tier1ReReco')

        # First execution of the step
        cmsRunStep = procTask.getStep(stepName="cmsRun1")

        # stepStart packet: for the first step it must also carry the
        # job-start information
        packet = dashboard.stepStart(step=cmsRunStep.data)
        jobStartInfo = packet['jobStart']
        self.assertNotEqual(jobStartInfo, None)
        self.assertEqual(jobStartInfo['ExeStart'], cmsRunStep.name())
        self.assertEqual(jobStartInfo['WNHostName'], socket.gethostname())
        self.assertEqual(packet['ExeStart'], cmsRunStep.name())

        # stepEnd packet for the first execution
        packet = dashboard.stepEnd(step=cmsRunStep.data, stepReport=stepReport)
        self.assertEqual(packet['ExeEnd'], cmsRunStep.name())
        self.assertEqual(packet['ExeExitCode'], 0)
        self.assertTrue(packet['ExeWCTime'] >= 0)

        # Second execution (the same step is fetched again)
        cmsRunStep = procTask.getStep(stepName="cmsRun1")

        # This is no longer the first step, so no job-start information
        packet = dashboard.stepStart(step=cmsRunStep.data)
        self.assertEqual(packet['jobStart'], None)
        self.assertEqual(packet['ExeStart'], cmsRunStep.name())

        # stepEnd packet for the second execution
        packet = dashboard.stepEnd(step=cmsRunStep.data, stepReport=stepReport)
        self.assertEqual(packet['ExeEnd'], cmsRunStep.name())
        self.assertEqual(packet['ExeExitCode'], 0)
        self.assertTrue(packet['ExeWCTime'] >= 0)

        # jobEnd packet
        packet = dashboard.jobEnd()
        self.assertEqual(packet['ExeEnd'], "cmsRun1")
        self.assertEqual(packet['JobExitCode'], 0)
        self.assertEqual(packet['WrapperCPUTime'], 0)
        self.assertTrue(packet['WrapperWCTime'] >= 0)
        self.assertNotEqual(packet['JobExitReason'], "")
Esempio n. 6
0
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Run in the background and pass information to
    the DashboardInterface instance.

    If the current step runs past the configured soft timeout, a
    JobTimeout error is added to the job report and the step process is
    signalled.  A later update escalates to SIGTERM/SIGKILL once the soft
    kill has been sent and the hard timeout (if any) has passed.
    """

    def __init__(self):
        # State of the step currently being executed
        self.startTime        = None
        self.currentStep      = None
        self.currentStepName  = None
        self.currentStepSpace = None
        # Timeout configuration, filled in by initMonitor
        self.softTimeOut      = None
        self.hardTimeOut      = None
        # True once the soft kill signal has been sent
        self.killFlag         = False
        # Only assigned here; nothing in this class sets it to a real path
        self.cmsswFile        = None
        self.task             = None
        self.job              = None
        self.dashboardInfo    = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args = {}):
        """
        Handles the monitor initiation

        :param task: task object, stored and handed to DashboardInfo
        :param job: job object, stored and handed to DashboardInfo
        :param logPath: path of the job report; its basename is reused
            when a timeout error is written in periodicUpdate
        :param args: monitor options.  'softTimeOut' and 'hardTimeOut'
            are compared against elapsed time.time() differences, i.e.
            they are in seconds; 'destinationHost' and 'destinationPort'
            select the dashboard destination.  The dict is only read,
            so the shared default is harmless.
        """
        logging.info("In DashboardMonitor.initMonitor")

        self.task    = task
        self.job     = job
        self.logPath = logPath

        self.softTimeOut = args.get('softTimeOut', None)
        self.hardTimeOut = args.get('hardTimeOut', None)

        destHost = args.get('destinationHost', None)
        destPort = args.get('destinationPort', None)

        self.dashboardInfo = DashboardInfo(task = task, job = job)

        # A destination is only registered when both parts are set
        if destHost and destPort:
            logging.info("About to set destination to %s:%s", destHost, destPort)
            self.dashboardInfo.addDestination(host = destHost,
                                              port = destPort)

    def jobStart(self, task):
        """
        Job start notifier.

        :param task: not used by this method
        """

        self.dashboardInfo.jobStart()

        return

    def jobEnd(self, task):
        """
        Job End notification

        :param task: not used by this method
        """

        self.dashboardInfo.jobEnd()

        return

    def stepStart(self, step):
        """
        Step start notification

        Records the step as current, restarts the timeout clock and
        forwards the notification to the dashboard.
        """
        self.currentStep      = step
        self.currentStepName  = getStepName(step)
        self.currentStepSpace = None
        self.startTime        = time.time()
        self.dashboardInfo.stepStart(step = step)

        return

    def stepEnd(self, step, stepReport):
        """
        Step end notification

        Clears the current-step state and forwards the notification,
        with the step report, to the dashboard.
        """
        self.currentStep      = None
        self.currentStepName  = None
        self.currentStepSpace = None
        self.dashboardInfo.stepEnd(step = step,
                                   stepReport = stepReport)
        return

    def stepKilled(self, step):
        """
        Step killed notification

        Clears the current step and its name and forwards the
        notification to the dashboard.
        """

        self.currentStep     = None
        self.currentStepName = None
        self.dashboardInfo.stepKilled(step = step)

    def jobKilled(self, task):
        """
        Killed job notification

        :param task: not used by this method
        """

        self.dashboardInfo.jobKilled()

        return

    def periodicUpdate(self):
        """
        Run on the defined intervals.

        Does nothing between steps.  Otherwise forwards the update to
        the dashboard and, when a soft timeout is configured and the
        current step has exceeded it, writes a JobTimeout error to the
        job report and signals the step process.
        """

        if not self.currentStep:
            # We're probably between steps
            return

        self.dashboardInfo.periodicUpdate()

        # Check for events
        if self.cmsswFile:
            # Search the configured file.  The bare name "file" that was
            # passed here before is the Python 2 builtin type, not a path.
            run, event = searchForEvent(self.cmsswFile)
            if run and event:
                # Then we actually found something, otherwise do nothing
                # Right now I don't know what to do
                pass

        # Do timeout
        if not self.softTimeOut:
            return

        if not (time.time() - self.startTime > self.softTimeOut):
            # Still within the soft timeout
            return

        # From here on we have to kill the process

        # If our stepName is None, we're inbetween steps.  Nothing to kill!
        if self.currentStepName is None:
            return

        # If our stepName is valid, then we may need the stepSpace
        if self.currentStepSpace is None:
            self.currentStepSpace = getStepSpace(self.currentStepName)

        # First, get the PID
        stepPID = getStepPID(self.currentStepSpace, self.currentStepName)

        # Now kill it!
        msg = ""
        msg += "Start Time: %s\n" % self.startTime
        msg += "Time Now: %s\n" % time.time()
        msg += "Timeout: %s\n" % self.softTimeOut
        msg += "Killing Job...\n"
        msg += "Process ID is: %s\n" % stepPID

        # If possible, write a FWJR
        report = Report.Report()
        try:
            # Look for the report, by its original file name, three
            # directories above the step space location
            self.logPath = os.path.join(self.currentStepSpace.location,
                                        '../../../', os.path.basename(self.logPath))
            if os.path.isfile(self.logPath):
                # We should be able to find existant job report.
                # If not, we're in trouble
                logging.debug("Found pre-existant error report in DashboardMonitor termination.")
                report.load(self.logPath)
            report.addError(stepName = self.currentStepName, exitCode = 99901,
                            errorType = "JobTimeout", errorDetails = msg)
            report.save(self.logPath)
        except Exception as ex:
            # Basically, we can't write a log report and we're hosed
            # Kill anyway, and hope the logging file gets written out
            msg2 =  "Exception while writing out jobReport.\n"
            msg2 += "Aborting job anyway: unlikely you'll get any error report.\n"
            msg2 += str(ex)
            msg2 += str(traceback.format_exc()) + '\n'
            logging.error(msg2)

        if stepPID is None or stepPID == os.getpid():
            # Then we are supposed to kill things
            # that don't exist in separate processes:
            # Self-terminate
            msg += "WARNING: No separate process.  Watchdog will attempt self-termination."
            logging.error(msg)
            os.abort()

        # With no hard timeout configured this is False, which matches
        # the Python 2 result of comparing a number with None and avoids
        # a TypeError on Python 3.
        beforeHardTimeOut = (self.hardTimeOut is not None and
                             time.time() - self.startTime < self.hardTimeOut)

        if beforeHardTimeOut or not self.killFlag:
            msg += "WARNING: Soft Kill Timeout has Expired:"
            logging.error(msg)
            os.kill(stepPID, signal.SIGUSR2)
            self.killFlag = True
        else:
            # Soft kill already sent and the hard timeout is not (or no
            # longer) holding us back
            msg += "WARNING: Hard Kill Timeout has Expired:"
            logging.error(msg)
            os.kill(stepPID, signal.SIGTERM)
            # NOTE(review): WNOHANG right after the signal probably
            # reports 0 before the process had time to exit - confirm
            # whether a short wait is intended.
            killedpid, stat = os.waitpid(stepPID, os.WNOHANG)
            if killedpid == 0:
                os.kill(stepPID, signal.SIGKILL)
                killedpid, stat = os.waitpid(stepPID, os.WNOHANG)
                if killedpid == 0:
                    # Panic!  It's unkillable!
                    logging.error("Can't kill job.  Out of options.  Waiting for system reboot.")

        return
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Relays job and step notifications to a DashboardInfo object.

    Every notification is best effort: an exception raised by the
    DashboardInfo call is logged (message and traceback) and swallowed.
    """

    def __init__(self):
        # Step currently being executed and when it started
        self.startTime = None
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        # Filled in by initMonitor
        self.task = None
        self.job = None
        self.dashboardInfo = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args = {}):
        """
        Store the task and job and build the DashboardInfo object.

        The dashboard URL is "<destinationHost>:<destinationPort>" as
        read from args; the 'cores' entry (default 0) is passed on as
        overrideCores.  logPath is not used by this method.
        """
        logging.info("In DashboardMonitor.initMonitor")

        self.task = task
        self.job = job

        host = args.get('destinationHost', None)
        port = args.get('destinationPort', None)
        url = '%s:%s' % (host, str(port))
        coreOverride = args.get('cores', 0)

        self.dashboardInfo = DashboardInfo(task, job,
                                           dashboardUrl=url,
                                           overrideCores=coreOverride)

    def jobStart(self, task):
        """
        Tell the dashboard that the job has started.
        """
        try:
            self.dashboardInfo.jobStart()
        except Exception as err:
            logging.error(str(err))
            logging.error(traceback.format_exc())

    def jobEnd(self, task):
        """
        Tell the dashboard that the job has ended.
        """
        try:
            self.dashboardInfo.jobEnd()
        except Exception as err:
            logging.error(str(err))
            logging.error(traceback.format_exc())

    def stepStart(self, step):
        """
        Record the step as current, note its start time and tell the
        dashboard that it has started.
        """
        self.currentStep = step
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        self.startTime = time.time()
        try:
            self.dashboardInfo.stepStart(step=step)
        except Exception as err:
            logging.error(str(err))
            logging.error(traceback.format_exc())

    def stepEnd(self, step, stepReport):
        """
        Clear the current-step state and hand the step report to the
        dashboard.
        """
        self.currentStep = self.currentStepName = self.currentStepSpace = None
        try:
            self.dashboardInfo.stepEnd(step=step, stepReport=stepReport)
        except Exception as err:
            logging.error(str(err))
            logging.error(traceback.format_exc())

    def stepKilled(self, step):
        """
        Clear the current step and its name and tell the dashboard that
        the step was killed.
        """
        self.currentStep = self.currentStepName = None
        try:
            self.dashboardInfo.stepKilled(step=step)
        except Exception as err:
            logging.error(str(err))
            logging.error(traceback.format_exc())

    def jobKilled(self, task):
        """
        Tell the dashboard that the job was killed.
        """
        try:
            self.dashboardInfo.jobKilled()
        except Exception as err:
            logging.error(str(err))
            logging.error(traceback.format_exc())

    def periodicUpdate(self):
        """
        Called on the defined intervals; asks the dashboard info to run
        its periodic update.
        """
        try:
            self.dashboardInfo.periodicUpdate()
        except Exception as err:
            logging.error(str(err))
            logging.error(traceback.format_exc())
Esempio n. 8
0
class DashboardMonitor(WMRuntimeMonitor):
    """
    _DashboardMonitor_

    Background monitor that hands job and step events over to a
    DashboardInfo object.

    A failing DashboardInfo call never propagates: its message and
    traceback are written to the error log instead.
    """
    def __init__(self):
        # Bookkeeping for the step in progress
        self.startTime = None
        self.currentStep = None
        self.currentStepName = None
        self.currentStepSpace = None
        # Assigned in initMonitor
        self.task = None
        self.job = None
        self.dashboardInfo = None
        WMRuntimeMonitor.__init__(self)

    def initMonitor(self, task, job, logPath, args={}):
        """
        Keep the task and job and create the DashboardInfo object.

        The dashboard URL is built as
        "<destinationHost>:<destinationPort>" from args.  logPath is
        not used by this method.
        """
        logging.info("In DashboardMonitor.initMonitor")

        self.task = task
        self.job = job

        host = args.get('destinationHost', None)
        port = args.get('destinationPort', None)
        url = '%s:%s' % (host, str(port))

        self.dashboardInfo = DashboardInfo(task=task, job=job, dashboardUrl=url)

    def jobStart(self, task):
        """
        Forward the job-start event to the dashboard.
        """
        try:
            self.dashboardInfo.jobStart()
        except Exception as failure:
            logging.error(str(failure))
            logging.error(traceback.format_exc())

    def jobEnd(self, task):
        """
        Forward the job-end event to the dashboard.
        """
        try:
            self.dashboardInfo.jobEnd()
        except Exception as failure:
            logging.error(str(failure))
            logging.error(traceback.format_exc())

    def stepStart(self, step):
        """
        Mark the step as current, remember when it started and forward
        the step-start event to the dashboard.
        """
        self.currentStep = step
        self.currentStepName = getStepName(step)
        self.currentStepSpace = None
        self.startTime = time.time()
        try:
            self.dashboardInfo.stepStart(step=step)
        except Exception as failure:
            logging.error(str(failure))
            logging.error(traceback.format_exc())

    def stepEnd(self, step, stepReport):
        """
        Reset the current-step state and forward the step-end event,
        together with the step report, to the dashboard.
        """
        self.currentStep = self.currentStepName = self.currentStepSpace = None
        try:
            self.dashboardInfo.stepEnd(step=step, stepReport=stepReport)
        except Exception as failure:
            logging.error(str(failure))
            logging.error(traceback.format_exc())

    def stepKilled(self, step):
        """
        Reset the current step and its name and forward the step-killed
        event to the dashboard.
        """
        self.currentStep = self.currentStepName = None
        try:
            self.dashboardInfo.stepKilled(step=step)
        except Exception as failure:
            logging.error(str(failure))
            logging.error(traceback.format_exc())

    def jobKilled(self, task):
        """
        Forward the job-killed event to the dashboard.
        """
        try:
            self.dashboardInfo.jobKilled()
        except Exception as failure:
            logging.error(str(failure))
            logging.error(traceback.format_exc())

    def periodicUpdate(self):
        """
        Invoked on the defined intervals; lets the dashboard info run
        its periodic update.
        """
        try:
            self.dashboardInfo.periodicUpdate()
        except Exception as failure:
            logging.error(str(failure))
            logging.error(traceback.format_exc())
Esempio n. 9
0
    def testASuccessfulJobMonitoring(self):
        """
        _testASuccessfulJobMonitoring_

        Run a successful job with two step executions through the
        DashboardInfo calls and check the data returned at every stage,
        including the per-execution "1_" / "2_" prefixed keys.
        """

        # Fixtures for the test
        envName = 'testA'
        testJob = self.createTestJob()
        testWorkload = self.createWorkload()
        procTask = testWorkload.getTask(taskName="DataProcessing")
        stepReport = self.createReport()

        # Populate the job environment
        self.setupJobEnvironment(name=envName)

        # Dashboard object under test, with one destination registered
        dashboard = DashboardInfo(job=testJob, task=procTask)
        dashboard.addDestination('127.0.0.1', 8884)

        # jobStart packet
        packet = dashboard.jobStart()
        self.assertEqual(packet['MessageType'], 'JobStatus')
        self.assertEqual(packet['StatusValue'], 'running')
        self.assertEqual(packet['StatusDestination'], "T1_US_FNAL")
        self.assertEqual(packet['taskId'], 'wmagent_Tier1ReReco')

        # First execution of the step
        cmsRunStep = procTask.getStep(stepName="cmsRun1")

        # stepStart packet: for the first step it must also carry the
        # job-start information
        packet = dashboard.stepStart(step=cmsRunStep.data)
        jobStartInfo = packet['jobStart']
        self.assertNotEqual(jobStartInfo, None)
        self.assertEqual(jobStartInfo['ExeStart'], cmsRunStep.name())
        self.assertEqual(jobStartInfo['WNHostName'], socket.gethostname())
        self.assertEqual(packet['1_ExeStart'], cmsRunStep.name())

        # stepEnd packet for the first execution
        packet = dashboard.stepEnd(step=cmsRunStep.data, stepReport=stepReport)
        self.assertEqual(packet['1_ExeEnd'], cmsRunStep.name())
        self.assertEqual(packet['1_ExeExitCode'], 0)
        self.assertTrue(packet['1_ExeWCTime'] >= 0)
        self.assertEqual(stepReport.retrieveStep("cmsRun1").counter, 1)

        # Second execution (the same step is fetched again)
        cmsRunStep = procTask.getStep(stepName="cmsRun1")

        # This is no longer the first step, so no job-start information
        packet = dashboard.stepStart(step=cmsRunStep.data)
        self.assertEqual(packet['jobStart'], None)
        self.assertEqual(packet['2_ExeStart'], cmsRunStep.name())

        # stepEnd packet for the second execution
        packet = dashboard.stepEnd(step=cmsRunStep.data, stepReport=stepReport)
        self.assertEqual(packet['2_ExeEnd'], cmsRunStep.name())
        self.assertEqual(packet['2_ExeExitCode'], 0)
        self.assertTrue(packet['2_ExeWCTime'] >= 0)
        self.assertEqual(stepReport.retrieveStep("cmsRun1").counter, 2)

        # jobEnd packet
        packet = dashboard.jobEnd()
        self.assertEqual(packet['ExeEnd'], "cmsRun1")
        self.assertEqual(packet['JobExitCode'], 0)
        self.assertEqual(packet['WrapperCPUTime'], 0)
        self.assertTrue(packet['WrapperWCTime'] >= 0)
        self.assertNotEqual(packet['JobExitReason'], "")
Esempio n. 10
0
    def testA_createDashboardInfo(self):
        """
        _createDashboardInfo_

        Create a DashboardInfo for a test job and check the values it
        exposes through get() at creation and after each of jobStart,
        stepStart, stepEnd and jobEnd.
        """

        # Get the necessary objects
        name = 'testA'
        job = self.createTestJob()
        workload = self.createWorkload()
        task = workload.getTask(taskName="DataProcessing")
        report = self.createReport()

        # Fill the job environment
        self.setupJobEnvironment(name=name)

        # Instantiate DBInfo
        dbInfo = DashboardInfo(job=job, task=task)

        # Check some defaults
        self.assertEqual(dbInfo.get('TaskType', None), task.taskType())
        self.assertEqual(dbInfo.get('User', None), '*****@*****.**')
        self.assertEqual(dbInfo.get('JSTool', None), 'WMAgent')
        self.assertEqual(dbInfo.get('jobName', None),
                         'WMAgent_1_0_ThisIsASillyName')
        self.assertEqual(dbInfo.get('taskName', None),
                         'ProdAgent_-Tier1ReReco-DataProcessing_WMAgentPrimary')

        dbInfo.jobStart()

        # After jobStart both values are expected to equal the name that
        # was given to setupJobEnvironment
        self.assertEqual(dbInfo.get('GridJobID', None), name)
        self.assertEqual(dbInfo.get('SyncCE', None), name)

        # Do a step
        step = task.getStep(stepName="cmsRun1")

        # Do the step start
        dbInfo.stepStart(step=step.data)
        self.assertEqual(dbInfo.get('ExeStart', None), step.name())
        self.assertEqual(dbInfo.get('ApplicationVersion', None),
                         'CMSSW_3_5_8')

        # Do the step end
        dbInfo.stepEnd(step=step.data, stepReport=report)
        self.assertEqual(dbInfo.get('ExeEnd', None), step.name())
        self.assertEqual(dbInfo.get('ExeExitStatus', None), False)

        # End the job!
        dbInfo.jobEnd()
        # Identity check instead of "== None": the result must not depend
        # on how the stored value implements equality.
        self.assertFalse(dbInfo.get('JobFinished', None) is None,
                         'Did not assign finish time in jobEnd()')

        return