Example 1
    def reportWorkflowToDashboard(self, dashboardActivity):
        """
        _reportWorkflowToDashboard_
        Gathers workflow information from the arguments and reports it to the
        dashboard
        """
        try:
            #Create a fake config
            conf = ConfigSection()
            conf.section_('DashboardReporter')
            conf.DashboardReporter.dashboardHost = self.dashboardHost
            conf.DashboardReporter.dashboardPort = self.dashboardPort

            #Create the reporter
            reporter = DashboardReporter(conf)

            #Assemble the info
            workflow = {}
            workflow['name'] = self.workloadName
            workflow['application'] = self.frameworkVersion
            workflow['TaskType'] = dashboardActivity
            #Let's try to build information about the inputDataset
            dataset = 'DoesNotApply'
            if hasattr(self, 'inputDataset'):
                dataset = self.inputDataset
            workflow['datasetFull'] = dataset
            workflow['user'] = '******'

            #Send the workflow info
            reporter.addTask(workflow)
        except Exception:
            #This is not critical, if it fails just leave it be
            logging.error("There was an error with dashboard reporting")
Example 2
    def __init__(self, config, couchDbName=None):
        WMObject.__init__(self, config)
        WMConnectionBase.__init__(self, "WMCore.WMBS")

        if couchDbName is None:
            self.dbname = getattr(self.config.JobStateMachine, "couchDBName")
        else:
            self.dbname = couchDbName

        self.jobsdatabase = None
        self.fwjrdatabase = None
        self.jsumdatabase = None
        self.statsumdatabase = None

        self.couchdb = CouchServer(self.config.JobStateMachine.couchurl)
        self._connectDatabases()

        try:
            self.dashboardReporter = DashboardReporter(config)
        except Exception as ex:
            logging.error("Error setting up the dashboard reporter: %s", str(ex))
            raise

        self.getCouchDAO = self.daofactory("Jobs.GetCouchID")
        self.setCouchDAO = self.daofactory("Jobs.SetCouchID")
        self.incrementRetryDAO = self.daofactory("Jobs.IncrementRetry")
        self.workflowTaskDAO = self.daofactory("Jobs.GetWorkflowTask")
        self.jobTypeDAO = self.daofactory("Jobs.GetType")
        self.updateLocationDAO = self.daofactory("Jobs.UpdateLocation")
        self.getWorkflowSpecDAO = self.daofactory("Workflow.GetSpecAndNameFromTask")

        self.maxUploadedInputFiles = getattr(self.config.JobStateMachine, 'maxFWJRInputFiles', 1000)
        self.workloadCache = {}
        return
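
For orientation, a sketch of the minimal configuration this constructor reads;
the attribute names are the ones the code above accesses, while the
Configuration import path and all values are assumptions:

    from WMCore.Configuration import Configuration

    config = Configuration()
    config.section_("JobStateMachine")
    config.JobStateMachine.couchurl = "http://localhost:5984"     # placeholder
    config.JobStateMachine.couchDBName = "wmagent_jobdump"        # placeholder
    config.JobStateMachine.jobSummaryDBName = "wmagent_summary"   # placeholder
    config.JobStateMachine.summaryStatsDBName = "stat_summary"    # placeholder

    changer = ChangeState(config)   # couchDbName=None falls back to couchDBName
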
Example 3
    def reportWorkflowToDashboard(self, dashboardActivity):
        """
        _reportWorkflowToDashboard_
        Gathers workflow information from the arguments and reports it to the
        dashboard
        """
        try:
            #Create a fake config
            conf = ConfigSection()
            conf.section_('DashboardReporter')
            conf.DashboardReporter.dashboardHost = self.dashboardHost
            conf.DashboardReporter.dashboardPort = self.dashboardPort

            #Create the reporter
            reporter = DashboardReporter(conf)

            #Assemble the info
            workflow = {}
            workflow['name'] = self.workloadName
            workflow['application'] = self.frameworkVersion
            workflow['TaskType'] = dashboardActivity
            #Let's try to build information about the inputDataset
            dataset = 'DoesNotApply'
            if hasattr(self, 'inputDataset'):
                dataset = self.inputDataset
            workflow['datasetFull'] = dataset
            workflow['user'] = '******'

            #Send the workflow info
            reporter.addTask(workflow)
        except Exception:
            #This is not critical, if it fails just leave it be
            logging.error("There was an error with dashboard reporting")
Example 4
    def setUp(self):
        """
        _setUp_

        Setup a dashboard reporter
        """
        self.reporter = DashboardReporter(config=None)
        self.processingReport = ProcessingSample.report
        self.mergeReport = MergeSample.report
        self.errorReport = ErrorSample.report
        return
Example 5
class ChangeState(WMObject, WMConnectionBase):
    """
    Propagate the state of a job through the JSM.
    """
    def __init__(self, config, couchDbName=None):
        WMObject.__init__(self, config)
        WMConnectionBase.__init__(self, "WMCore.WMBS")

        if couchDbName is None:
            self.dbname = getattr(self.config.JobStateMachine, "couchDBName")
        else:
            self.dbname = couchDbName

        try:
            self.couchdb = CouchServer(self.config.JobStateMachine.couchurl)
            self.jobsdatabase = self.couchdb.connectDatabase("%s/jobs" %
                                                             self.dbname,
                                                             size=250)
            self.fwjrdatabase = self.couchdb.connectDatabase("%s/fwjrs" %
                                                             self.dbname,
                                                             size=250)
            self.jsumdatabase = self.couchdb.connectDatabase(getattr(
                self.config.JobStateMachine, 'jobSummaryDBName'),
                                                             size=250)
        except Exception as ex:
            logging.error("Error connecting to couch: %s", str(ex))
            self.jobsdatabase = None
            self.fwjrdatabase = None
            self.jsumdatabase = None

        try:
            self.dashboardReporter = DashboardReporter(config)
        except Exception as ex:
            logging.error("Error setting up the dashboard reporter: %s", str(ex))
Example 6
    def __init__(self, config, couchDbName=None):
        WMObject.__init__(self, config)
        WMConnectionBase.__init__(self, "WMCore.WMBS")

        if couchDbName is None:
            self.dbname = getattr(self.config.JobStateMachine, "couchDBName")
        else:
            self.dbname = couchDbName
            
        self.jobsdatabase = None
        self.fwjrdatabase = None
        self.jsumdatabase = None
        self.statsumdatabase = None
        
        self.couchdb = CouchServer(self.config.JobStateMachine.couchurl)
        self._connectDatabases()

        try:
            self.dashboardReporter = DashboardReporter(config)
        except Exception as ex:
            logging.error("Error setting up the \
-                          dashboard reporter: %s" % str(ex))
            raise

        self.getCouchDAO = self.daofactory("Jobs.GetCouchID")
        self.setCouchDAO = self.daofactory("Jobs.SetCouchID")
        self.incrementRetryDAO = self.daofactory("Jobs.IncrementRetry")
        self.workflowTaskDAO = self.daofactory("Jobs.GetWorkflowTask")
        self.jobTypeDAO = self.daofactory("Jobs.GetType")
        self.updateLocationDAO = self.daofactory("Jobs.UpdateLocation")

        self.maxUploadedInputFiles = getattr(self.config.JobStateMachine, 'maxFWJRInputFiles', 1000)
        return
Example 7
    def setUp(self):
        """
        _setUp_

        Setup a dashboard reporter
        """
        self.reporter = DashboardReporter(config=None)
        self.processingReport = ProcessingSample.report
        self.mergeReport = MergeSample.report
        self.errorReport = ErrorSample.report
        self.fallbackReport = FallbackSample.report

        self.twoFileFallbackXmlPath = os.path.join(
            getTestBase(), "WMCore_t/FwkJobReport_t/CMSSWTwoFileRemote.xml")
        self.pileupXmlPath = os.path.join(
            getTestBase(), "WMCore_t/FwkJobReport_t/CMSSWPileup.xml")

        return
Example 8
    def setUp(self):
        """
        _setUp_

        Setup a dashboard reporter
        """
        self.reporter = DashboardReporter(config=None)
        self.processingReport = ProcessingSample.report
        self.mergeReport = MergeSample.report
        self.errorReport = ErrorSample.report
        return
Example 9
    def reportWorkflowToDashboard(self, dashboardActivity):
        """
        _reportWorkflowToDashboard_
        Gathers workflow information from the arguments and reports it to the
        dashboard
        """
        try:
            # Create a fake config
            conf = ConfigSection()
            conf.section_("DashboardReporter")
            conf.DashboardReporter.dashboardHost = self.dashboardHost
            conf.DashboardReporter.dashboardPort = self.dashboardPort

            # Create the reporter
            reporter = DashboardReporter(conf)

            # Assemble the info
            workflow = {}
            workflow["name"] = self.workloadName
            workflow["application"] = self.frameworkVersion
            workflow["scheduler"] = "BossAir"
            workflow["TaskType"] = dashboardActivity
            # Let's try to build information about the inputDataset
            dataset = "DoesNotApply"
            if hasattr(self, "inputDataset"):
                dataset = self.inputDataset
            workflow["datasetFull"] = dataset
            workflow["user"] = "******"

            # These two are not reported for now
            workflow["GridName"] = "NotAvailable"
            workflow["nevtJob"] = "NotAvailable"

            # Send the workflow info
            reporter.addTask(workflow)
        except Exception:
            # This is not critical, if it fails just leave it be
            logging.error("There was an error with dashboard reporting")
Example 10
    def setUp(self):
        """
        _setUp_

        Setup a dashboard reporter
        """
        self.reporter = DashboardReporter(config=None)
        self.processingReport = ProcessingSample.report
        self.mergeReport = MergeSample.report
        self.errorReport = ErrorSample.report
        self.fallbackReport = FallbackSample.report

        self.twoFileFallbackXmlPath = os.path.join(getTestBase(), "WMCore_t/FwkJobReport_t/CMSSWTwoFileRemote.xml")
        self.pileupXmlPath = os.path.join(getTestBase(), "WMCore_t/FwkJobReport_t/CMSSWPileup.xml")

        return
Example 11
    def __init__(self, config, couchDbName=None):
        WMObject.__init__(self, config)
        WMConnectionBase.__init__(self, "WMCore.WMBS")

        if couchDbName is None:
            self.dbname = getattr(self.config.JobStateMachine, "couchDBName")
        else:
            self.dbname = couchDbName

        self.couchdb = CouchServer(self.config.JobStateMachine.couchurl)
        self._connectDatabases()

        try:
            self.dashboardReporter = DashboardReporter(config)
        except Exception as ex:
            logging.error("Error setting up the dashboard reporter: %s", str(ex))
            raise
Example 12
class DashboardReporterTest(unittest.TestCase):
    """
    _DashboardReporterTest_

    Unit tests for the dashboard reporter class.
    """
    def setUp(self):
        """
        _setUp_

        Setup a dashboard reporter
        """
        self.reporter = DashboardReporter(config=None)
        self.processingReport = ProcessingSample.report
        self.mergeReport = MergeSample.report
        self.errorReport = ErrorSample.report
        self.fallbackReport = FallbackSample.report

        self.twoFileFallbackXmlPath = os.path.join(getTestBase(), "WMCore_t/FwkJobReport_t/CMSSWTwoFileRemote.xml")
        self.pileupXmlPath = os.path.join(getTestBase(), "WMCore_t/FwkJobReport_t/CMSSWPileup.xml")

        return

    def tearDown(self):
        """
        _tearDown_

        Just get out
        """
        pass

    def trimNoneValues(self, package):
        """
        _trimNoneValues_

        Simple utility to trim the None values of a dictionary
        """
        trimmed = {}
        for key in package:
            if package[key] is not None:
                trimmed[key] = package[key]
        return trimmed
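        # Usage sketch: trimNoneValues({'a': 1, 'b': None}) returns {'a': 1}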

    def createTestJob(self, fwjr):
        """
        _createTestJob_

        Creates a minimal job to report
        """
        job = Job('finishedJob')
        job['retry_count'] = 1
        job['workflow'] = 'testWorkflow'
        job['fwjr'] = fwjr

        return job

    def testHandleSteps(self):
        """
        _testHandleSteps_

        Check that we can extract the information from a completed job
        and report it

        """
        job = self.createTestJob(self.processingReport)
        self.reporter.handleSteps(job)
        job = self.createTestJob(self.mergeReport)
        self.reporter.handleSteps(job)
        job = self.createTestJob(self.errorReport)
        self.reporter.handleSteps(job)

    def testPerformanceReport(self):
        """
        _testPerformanceReport_

        Check that the performance information is extracted correctly for
        different reports
        """
        step = self.processingReport.retrieveStep('cmsRun1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(len(self.trimNoneValues(perfInfo)), 21,
                         'Found less information than expected')
        self.assertEqual(perfInfo['PeakValueRss'], '891.617',
                         'Values do not match')
        self.assertEqual(perfInfo['readCachePercentageOps'], 0.995779157341,
                         'Values do not match')
        self.assertEqual(perfInfo['MaxEventTime'], '3.32538',
                         'Values do not match')

        step = self.processingReport.retrieveStep('logArch1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'logArch1 performance info is not empty')

        step = self.processingReport.retrieveStep('stageOut1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'stageOut1 performance info is not empty')

        step = self.errorReport.retrieveStep('cmsRun1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'cmsRun1 performance info is not empty')

        step = self.errorReport.retrieveStep('logArch1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'logArch1 performance info is not empty')

        step = self.errorReport.retrieveStep('stageOut1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'stageOut1 performance info is not empty')


    def testEventInformationReport(self):
        """
        _testEventInformationReport_

        Check that the event information is extracted correctly for
        different reports
        """
        eventInfo = self.reporter.getEventInformation('cmsRun1',
                                                      self.processingReport)
        self.assertEqual(eventInfo['inputEvents'], 18192,
                         'Input events do not match')
        self.assertEqual(eventInfo['OutputEventInfo'].count('Run2012B-WElectron-PromptSkim-v1:USER:1603'), 1)
        self.assertEqual(eventInfo['OutputEventInfo'].count('Run2012B-LogErrorMonitor-PromptSkim-v1:USER:137'), 1)
        self.assertEqual(eventInfo['OutputEventInfo'].count('Run2012B-LogError-PromptSkim-v1:RAW-RECO:66'), 1)
        self.assertEqual(eventInfo['OutputEventInfo'].count('Run2012B-TOPElePlusJets-PromptSkim-v1:AOD:2320'), 1)
        self.assertEqual(eventInfo['OutputEventInfo'].count('Run2012B-HighMET-PromptSkim-v1:RAW-RECO:8'), 1)
        self.assertEqual(eventInfo['OutputEventInfo'].count('Run2012B-DiTau-PromptSkim-v1:RAW-RECO:192'), 1)

        eventInfo = self.reporter.getEventInformation('stageOut1',
                                                      self.processingReport)
        self.assertEqual(eventInfo, {},
                         'stageOut1 event info is not empty')

        eventInfo = self.reporter.getEventInformation('logArch1',
                                                      self.processingReport)
        self.assertEqual(eventInfo, {},
                         'logArch1 event info is not empty')

        eventInfo = self.reporter.getEventInformation('cmsRun1',
                                                      self.mergeReport)
        self.assertEqual(eventInfo['inputEvents'], 0,
                         'Input events do not match')
        self.assertEqual(eventInfo['OutputEventInfo'].count('Run2012B-LogError-PromptSkim-v1:RAW-RECO:0'), 1)

        eventInfo = self.reporter.getEventInformation('cmsRun1',
                                                      self.errorReport)
        self.assertEqual(eventInfo, {},
                         'Error report event info is not empty')

    def testFileInformation(self):
        """
        _testFileInformation_

        Check that the file information is extracted correctly for
        different reports
        """

        # First test the processingReport

        step = self.processingReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(2, len(fileReports))

        # Format is LFN, Status, Type (EDM), Local/Remote, Count
        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')
        report1 = (fileInfo['inputFiles'].split(';'))[1].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('1', report1[1])
        self.assertEqual('Local', report0[3])
        self.assertEqual('Local', report1[3])
        self.assertEqual('1', report0[4])
        self.assertEqual('2', report1[4])

        step = self.processingReport.retrieveStep('logArch1')
        fileInfo = self.reporter.getInputFilesInformation(step)
        self.assertEqual(self.trimNoneValues(fileInfo), {},
                         'logArch1 file info is not empty')

        step = self.processingReport.retrieveStep('stageOut1')
        fileInfo = self.reporter.getInputFilesInformation(step)
        self.assertEqual(self.trimNoneValues(fileInfo), {},
                         'stageOut1 file info is not empty')

        # Now shorter test on mergeReport

        step = self.mergeReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(1, len(fileReports))

        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('Local', report0[3])
        self.assertEqual('1', report0[4])

        # Now shorter test on errorReport
        step = self.errorReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(2, len(fileReports))

        # Format is LFN, Status, Type (EDM), Local/Remote, Count
        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')
        report1 = (fileInfo['inputFiles'].split(';'))[1].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('0', report1[1])
        self.assertEqual('Local', report0[3])
        self.assertEqual('Local', report1[3])
        self.assertEqual('1', report0[4])
        self.assertEqual('2', report1[4])

        # And tests on the fallback report

        step = self.fallbackReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(1, len(fileReports))

        # Format is LFN, Status, Type (EDM), Local/Remote, Count
        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('Remote', report0[3])
        self.assertEqual('1', report0[4])

        # And tests on a report of two fallback files

        twoReport = Report("cmsRun1")
        twoReport.parse(self.twoFileFallbackXmlPath)
        step = twoReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(2, len(fileReports))

        # Format is LFN, Status, Type (EDM), Local/Remote, Count
        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')
        report1 = (fileInfo['inputFiles'].split(';'))[1].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('1', report1[1])
        self.assertEqual('Remote', report0[3])
        self.assertEqual('Remote', report1[3])
        self.assertEqual('1', report0[4])
        self.assertEqual('2', report1[4])

        pileupReport = Report("cmsRun1")
        pileupReport.parse(self.pileupXmlPath)
        step = pileupReport.retrieveStep('cmsRun1')

        localCount = 0
        remoteCount = 0
        for report in self.reporter.getInputFilesInformation(step)['inputFiles'].split(';'):
            if (report.split('::'))[3] == 'Remote':
                remoteCount += 1
            elif (report.split('::'))[3] == 'Local':
                localCount += 1

        self.assertEqual(1, remoteCount)
        self.assertEqual(13, localCount)
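
The assertions above decode a compact encoding: input files are joined with
';' and each file's fields with '::', in the order LFN, Status, Type (EDM),
Local/Remote, Count. A hypothetical two-file value illustrating what the
indexing pulls out (the LFNs are placeholders):

    inputFiles = ('/store/data/a.root::1::EDM::Local::1;'
                  '/store/data/b.root::1::EDM::Local::2')
    for entry in inputFiles.split(';'):
        lfn, status, edmType, access, count = entry.split('::')
        # e.g. status == '1' and access == 'Local' for both files
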
Example 13
class DashboardReporterTest(unittest.TestCase):
    """
    _DashboardReporterTest_

    Unit tests for the dashboard reporter class.
    """
    def setUp(self):
        """
        _setUp_

        Setup a dashboard reporter
        """
        self.reporter = DashboardReporter(config=None)
        self.processingReport = ProcessingSample.report
        self.mergeReport = MergeSample.report
        self.errorReport = ErrorSample.report
        self.fallbackReport = FallbackSample.report

        self.twoFileFallbackXmlPath = os.path.join(
            getTestBase(), "WMCore_t/FwkJobReport_t/CMSSWTwoFileRemote.xml")
        self.pileupXmlPath = os.path.join(
            getTestBase(), "WMCore_t/FwkJobReport_t/CMSSWPileup.xml")

        return

    def tearDown(self):
        """
        _tearDown_

        Just get out
        """
        pass

    def trimNoneValues(self, package):
        """
        _trimNoneValues_

        Simple utility to trim the None values of a dictionary
        """
        trimmed = {}
        for key in package:
            if package[key] is not None:
                trimmed[key] = package[key]
        return trimmed

    def createTestJob(self, fwjr):
        """
        _createTestJob_

        Creates a minimal job to report
        """
        job = Job('finishedJob')
        job['retry_count'] = 1
        job['workflow'] = 'testWorkflow'
        job['fwjr'] = fwjr

        return job

    def testHandleSteps(self):
        """
        _testHandleSteps_

        Check that we can extract the information from a completed job
        and report it

        """
        job = self.createTestJob(self.processingReport)
        self.reporter.handleSteps(job)
        job = self.createTestJob(self.mergeReport)
        self.reporter.handleSteps(job)
        job = self.createTestJob(self.errorReport)
        self.reporter.handleSteps(job)

    def testPerformanceReport(self):
        """
        _testPerformanceReport_

        Check that the performance information is extracted correctly for
        different reports
        """
        step = self.processingReport.retrieveStep('cmsRun1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(len(self.trimNoneValues(perfInfo)), 21,
                         'Found less information than expected')
        self.assertEqual(perfInfo['PeakValueRss'], '891.617',
                         'Values do not match')
        self.assertEqual(perfInfo['readCachePercentageOps'], 0.995779157341,
                         'Values do not match')
        self.assertEqual(perfInfo['MaxEventTime'], '3.32538',
                         'Values do not match')

        step = self.processingReport.retrieveStep('logArch1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'logArch1 performance info is not empty')

        step = self.processingReport.retrieveStep('stageOut1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'stageOut1 performance info is not empty')

        step = self.errorReport.retrieveStep('cmsRun1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'cmsRun1 performance info is not empty')

        step = self.errorReport.retrieveStep('logArch1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'logArch1 performance info is not empty')

        step = self.errorReport.retrieveStep('stageOut1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'stageOut1 performance info is not empty')

    def testEventInformationReport(self):
        """
        _testEventInformationReport_

        Check that the event information is extracted correctly for
        different reports
        """
        eventInfo = self.reporter.getEventInformation('cmsRun1',
                                                      self.processingReport)
        self.assertEqual(eventInfo['inputEvents'], 18192,
                         'Input events do not match')
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-WElectron-PromptSkim-v1:USER:1603'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-LogErrorMonitor-PromptSkim-v1:USER:137'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-LogError-PromptSkim-v1:RAW-RECO:66'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-TOPElePlusJets-PromptSkim-v1:AOD:2320'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-HighMET-PromptSkim-v1:RAW-RECO:8'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-DiTau-PromptSkim-v1:RAW-RECO:192'), 1)

        eventInfo = self.reporter.getEventInformation('stageOut1',
                                                      self.processingReport)
        self.assertEqual(eventInfo, {}, 'stageOut1 event info is not empty')

        eventInfo = self.reporter.getEventInformation('logArch1',
                                                      self.processingReport)
        self.assertEqual(eventInfo, {}, 'logArch1 event info is not empty')

        eventInfo = self.reporter.getEventInformation('cmsRun1',
                                                      self.mergeReport)
        self.assertEqual(eventInfo['inputEvents'], 0,
                         'Input events do not match')
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-LogError-PromptSkim-v1:RAW-RECO:0'), 1)

        eventInfo = self.reporter.getEventInformation('cmsRun1',
                                                      self.errorReport)
        self.assertEqual(eventInfo, {}, 'Error report event info is not empty')

    def testFileInformation(self):
        """
        _testFileInformation_

        Check that the file information is extracted correctly for
        different reports
        """

        # First test the processingReport

        step = self.processingReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(2, len(fileReports))

        # Format is LFN, Status, Type (EDM), Local/Remote, Count
        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')
        report1 = (fileInfo['inputFiles'].split(';'))[1].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('1', report1[1])
        self.assertEqual('Local', report0[3])
        self.assertEqual('Local', report1[3])
        self.assertEqual('1', report0[4])
        self.assertEqual('2', report1[4])

        step = self.processingReport.retrieveStep('logArch1')
        fileInfo = self.reporter.getInputFilesInformation(step)
        self.assertEqual(self.trimNoneValues(fileInfo), {},
                         'logArch1 file info is not empty')

        step = self.processingReport.retrieveStep('stageOut1')
        fileInfo = self.reporter.getInputFilesInformation(step)
        self.assertEqual(self.trimNoneValues(fileInfo), {},
                         'stageOut1 file info is not empty')

        # Now shorter test on mergeReport

        step = self.mergeReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(1, len(fileReports))

        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('Local', report0[3])
        self.assertEqual('1', report0[4])

        # Now shorter test on errorReport
        step = self.errorReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(2, len(fileReports))

        # Format is LFN, Status, Type (EDM), Local/Remote, Count
        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')
        report1 = (fileInfo['inputFiles'].split(';'))[1].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('0', report1[1])
        self.assertEqual('Local', report0[3])
        self.assertEqual('Local', report1[3])
        self.assertEqual('1', report0[4])
        self.assertEqual('2', report1[4])

        # And tests on the fallback report

        step = self.fallbackReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(1, len(fileReports))

        # Format is LFN, Status, Type (EDM), Local/Remote, Count
        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('Remote', report0[3])
        self.assertEqual('1', report0[4])

        # And tests on a report of two fallback files

        twoReport = Report("cmsRun1")
        twoReport.parse(self.twoFileFallbackXmlPath)
        step = twoReport.retrieveStep('cmsRun1')
        fileInfo = self.reporter.getInputFilesInformation(step)

        fileReports = fileInfo['inputFiles'].split(';')
        self.assertEqual(2, len(fileReports))

        # Format is LFN, Status, Type (EDM), Local/Remote, Count
        report0 = (fileInfo['inputFiles'].split(';'))[0].split('::')
        report1 = (fileInfo['inputFiles'].split(';'))[1].split('::')

        self.assertEqual('1', report0[1])
        self.assertEqual('1', report1[1])
        self.assertEqual('Remote', report0[3])
        self.assertEqual('Remote', report1[3])
        self.assertEqual('1', report0[4])
        self.assertEqual('2', report1[4])

        pileupReport = Report("cmsRun1")
        pileupReport.parse(self.pileupXmlPath)
        step = pileupReport.retrieveStep('cmsRun1')

        localCount = 0
        remoteCount = 0
        fileInfo = self.reporter.getInputFilesInformation(step)
        for report in fileInfo['inputFiles'].split(';'):
            if (report.split('::'))[3] == 'Remote':
                remoteCount += 1
            elif (report.split('::'))[3] == 'Local':
                localCount += 1

        self.assertEqual(1, remoteCount)
        self.assertEqual(13, localCount)
Example 14
class ChangeState(WMObject, WMConnectionBase):
    """
    Propagate the state of a job through the JSM.
    """

    def __init__(self, config, couchDbName=None):
        WMObject.__init__(self, config)
        WMConnectionBase.__init__(self, "WMCore.WMBS")

        if couchDbName is None:
            self.dbname = getattr(self.config.JobStateMachine, "couchDBName")
        else:
            self.dbname = couchDbName

        self.jobsdatabase = None
        self.fwjrdatabase = None
        self.jsumdatabase = None
        self.statsumdatabase = None

        self.couchdb = CouchServer(self.config.JobStateMachine.couchurl)
        self._connectDatabases()

        try:
            self.dashboardReporter = DashboardReporter(config)
        except Exception as ex:
            logging.error("Error setting up the dashboard reporter: %s", str(ex))
            raise

        self.getCouchDAO = self.daofactory("Jobs.GetCouchID")
        self.setCouchDAO = self.daofactory("Jobs.SetCouchID")
        self.incrementRetryDAO = self.daofactory("Jobs.IncrementRetry")
        self.workflowTaskDAO = self.daofactory("Jobs.GetWorkflowTask")
        self.jobTypeDAO = self.daofactory("Jobs.GetType")
        self.updateLocationDAO = self.daofactory("Jobs.UpdateLocation")
        self.getWorkflowSpecDAO = self.daofactory("Workflow.GetSpecAndNameFromTask")

        self.maxUploadedInputFiles = getattr(self.config.JobStateMachine, 'maxFWJRInputFiles', 1000)
        self.workloadCache = {}
        return

    def _connectDatabases(self):
        """
        Try connecting to the couchdbs
        """
        if not hasattr(self, 'jobsdatabase') or self.jobsdatabase is None:
            try:
                self.jobsdatabase = self.couchdb.connectDatabase("%s/jobs" % self.dbname, size=250)
            except Exception as ex:
                logging.error("Error connecting to couch db '%s/jobs': %s", self.dbname, str(ex))
                self.jobsdatabase = None
                return False

        if not hasattr(self, 'fwjrdatabase') or self.fwjrdatabase is None:
            try:
                self.fwjrdatabase = self.couchdb.connectDatabase("%s/fwjrs" % self.dbname, size=250)
            except Exception as ex:
                logging.error("Error connecting to couch db '%s/fwjrs': %s", self.dbname, str(ex))
                self.fwjrdatabase = None
                return False

        if not hasattr(self, 'jsumdatabase') or self.jsumdatabase is None:
            dbname = getattr(self.config.JobStateMachine, 'jobSummaryDBName')
            try:
                self.jsumdatabase = self.couchdb.connectDatabase(dbname, size=250)
            except Exception as ex:
                logging.error("Error connecting to couch db '%s': %s", dbname, str(ex))
                self.jsumdatabase = None
                return False

        if not hasattr(self, 'statsumdatabase') or self.statsumdatabase is None:
            dbname = getattr(self.config.JobStateMachine, 'summaryStatsDBName')
            try:
                self.statsumdatabase = self.couchdb.connectDatabase(dbname, size=250)
            except Exception as ex:
                logging.error("Error connecting to couch db '%s': %s", dbname, str(ex))
                self.statsumdatabase = None
                return False

        return True

    def propagate(self, jobs, newstate, oldstate, updatesummary=False):
        """
        Move the job from a state to another. Book keep the change to CouchDB.
        Report the information to the Dashboard.
        Take a list of job objects (dicts) and the desired state change.
        Return the jobs back; raise an AssertionError if the state change is
        not allowed, and other exceptions as appropriate.
        """
        if not isinstance(jobs, list):
            jobs = [jobs]

        if len(jobs) == 0:
            return

        # 1. Is the state transition allowed?
        self.check(newstate, oldstate)

        # 2. Load workflow/task information into the jobs
        self.loadExtraJobInformation(jobs)

        # 3. Make the state transition
        self.persist(jobs, newstate, oldstate)

        # 4. Report the job transition to the dashboard
        try:
            self.reportToDashboard(jobs, newstate, oldstate)
        except Exception as ex:
            logging.error("Error reporting to the dashboard: %s", str(ex))
            logging.error(traceback.format_exc())

        # 5. Document the state transition in couch
        try:
            self.recordInCouch(jobs, newstate, oldstate, updatesummary)
        except UnicodeDecodeError as ex:
            msg = "A critical error happened! Report it to developers. Error: %s" % str(ex)
            logging.exception(msg)
            raise
        except Exception as ex:
            logging.error("Error updating job in couch: %s", str(ex))
            logging.error(traceback.format_exc())

        return

    def check(self, newstate, oldstate):
        """
        Check that the requested transition is allowed; raise an
        AssertionError if it is not.
        """
        newstate = newstate.lower()
        oldstate = oldstate.lower()

        # Check for wrong transitions
        transitions = Transitions()
        assert newstate in transitions[oldstate], \
            "Illegal state transition requested: %s -> %s" % (oldstate, newstate)

    def recordInCouch(self, jobs, newstate, oldstate, updatesummary=False):
        """
        _recordInCouch_

        Record relevant job information in couch. If the job does not yet exist
        in couch it will be saved as a separate document.  If the job has a FWJR
        attached that will be saved as a separate document.
        """
        if not self._connectDatabases():
            logging.error('Databases not connected properly')
            return

        timestamp = int(time.time())
        couchRecordsToUpdate = []

        for job in jobs:
            couchDocID = job.get("couch_record", None)

            if newstate == "new":
                oldstate = "none"

            if job.get("site_cms_name", None):
                if newstate == "executing":
                    jobLocation = job["site_cms_name"]
                else:
                    jobLocation = "Agent"
            else:
                jobLocation = "Agent"

            if couchDocID is None:
                jobDocument = {}
                jobDocument["_id"] = str(job["id"])
                job["couch_record"] = jobDocument["_id"]
                jobDocument["jobid"] = job["id"]
                jobDocument["workflow"] = job["workflow"]
                jobDocument["task"] = job["task"]
                jobDocument["owner"] = job["owner"]

                jobDocument["inputfiles"] = []
                for inputFile in job["input_files"]:
                    docInputFile = inputFile.json()

                    docInputFile["parents"] = []
                    for parent in inputFile["parents"]:
                        docInputFile["parents"].append({"lfn": parent["lfn"]})

                    jobDocument["inputfiles"].append(docInputFile)

                jobDocument["states"] = {"0": {"oldstate": oldstate,
                                               "newstate": newstate,
                                               "location": jobLocation,
                                               "timestamp": timestamp}}

                jobDocument["jobgroup"] = job["jobgroup"]
                jobDocument["mask"] = {"FirstEvent": job["mask"]["FirstEvent"],
                                       "LastEvent": job["mask"]["LastEvent"],
                                       "FirstLumi": job["mask"]["FirstLumi"],
                                       "LastLumi": job["mask"]["LastLumi"],
                                       "FirstRun": job["mask"]["FirstRun"],
                                       "LastRun": job["mask"]["LastRun"]}

                if job['mask']['runAndLumis'] != {}:
                    # Then we have to save the mask runAndLumis
                    jobDocument['mask']['runAndLumis'] = {}
                    for key in job['mask']['runAndLumis']:
                        jobDocument['mask']['runAndLumis'][str(key)] = job['mask']['runAndLumis'][key]

                jobDocument["name"] = job["name"]
                jobDocument["type"] = "job"
                jobDocument["user"] = job.get("user", None)
                jobDocument["group"] = job.get("group", None)
                jobDocument["taskType"] = job.get("taskType", "Unknown")
                jobDocument["jobType"] = job.get("jobType", "Unknown")

                couchRecordsToUpdate.append({"jobid": job["id"],
                                             "couchid": jobDocument["_id"]})
                self.jobsdatabase.queue(jobDocument, callback=discardConflictingDocument)
            else:
                # We send a PUT request to the stateTransition update handler.
                # Couch expects the parameters to be passed as arguments in
                # the URI while the Requests class will only encode arguments
                # this way for GET requests.  Changing the Requests class to
                # encode PUT arguments as couch expects broke a bunch of code so
                # we'll just do our own encoding here.
                updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/stateTransition/" + couchDocID
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    newstate,
                                                                                    jobLocation,
                                                                                    timestamp)
                self.jobsdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)

            # updating the status of the summary doc only when it is explicitly requested
            # doc is already in couch
            if updatesummary:
                jobSummaryId = job["name"]
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobSummaryState/" + jobSummaryId
                # map retrydone state to jobfailed state for monitoring
                if newstate == "retrydone":
                    monitorState = "jobfailed"
                else:
                    monitorState = newstate
                updateUri += "?newstate=%s&timestamp=%s" % (monitorState, timestamp)
                self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
                logging.debug("Updated job summary status for job %s", jobSummaryId)

                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobStateTransition/" + jobSummaryId
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    monitorState,
                                                                                    job["location"],
                                                                                    timestamp)
                self.jsumdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
                logging.debug("Updated job summary state history for job %s", jobSummaryId)

            if job.get("fwjr", None):

                cachedByWorkflow = self.workloadCache.setdefault(job['workflow'],
                                                                 getDataFromSpecFile(
                                                                     self.getWorkflowSpecDAO.execute(job['task'])[
                                                                         job['task']]['spec']))
                job['fwjr'].setCampaign(cachedByWorkflow.get('Campaign', ''))
                job['fwjr'].setPrepID(cachedByWorkflow.get(job['task'], ''))
                # If there are too many input files, strip them out
                # of the FWJR, as they should already
                # be in the database
                # This is not critical
                try:
                    if len(job['fwjr'].getAllInputFiles()) > self.maxUploadedInputFiles:
                        job['fwjr'].stripInputFiles()
                except Exception as ex:
                    logging.error("Error while trying to strip input files from FWJR.  Ignoring. : %s", str(ex))

                if newstate == "retrydone":
                    jobState = "jobfailed"
                else:
                    jobState = newstate

                # There is a race condition between updating the couch record
                # location and job completion; a fast-failing job could miss
                # the location update.
                job["location"] = job["fwjr"].getSiteName() or job.get("location", "Unknown")
                # complete fwjr document
                job["fwjr"].setTaskName(job["task"])
                jsonFWJR = job["fwjr"].__to_json__(None)

                # Don't archive cleanup job report
                if job["jobType"] == "Cleanup":
                    archStatus = "skip"
                else:
                    archStatus = "ready"

                fwjrDocument = {"_id": "%s-%s" % (job["id"], job["retry_count"]),
                                "jobid": job["id"],
                                "jobtype": job["jobType"],
                                "jobstate": jobState,
                                "retrycount": job["retry_count"],
                                "archivestatus": archStatus,
                                "fwjr": jsonFWJR,
                                "type": "fwjr"}
                self.fwjrdatabase.queue(fwjrDocument, timestamp=True, callback=discardConflictingDocument)

                updateSummaryDB(self.statsumdatabase, job)

                # TODO: can add config switch to switch on and off
                # if self.config.JobSateMachine.propagateSuccessJobs or (job["retry_count"] > 0) or (newstate != 'success'):
                if (job["retry_count"] > 0) or (newstate != 'success'):
                    jobSummaryId = job["name"]
                    # building a summary of fwjr
                    logging.debug("Pushing job summary for job %s", jobSummaryId)
                    errmsgs = {}
                    inputs = []
                    if "steps" in fwjrDocument["fwjr"]:
                        for step in fwjrDocument["fwjr"]["steps"]:
                            if "errors" in fwjrDocument["fwjr"]["steps"][step]:
                                errmsgs[step] = [error for error in fwjrDocument["fwjr"]["steps"][step]["errors"]]
                            if "input" in fwjrDocument["fwjr"]["steps"][step] and "source" in \
                                    fwjrDocument["fwjr"]["steps"][step]["input"]:
                                inputs.extend(
                                    [source["runs"] for source in fwjrDocument["fwjr"]['steps'][step]["input"]["source"]
                                     if "runs" in source])

                    outputs = []
                    outputDataset = None
                    for singlestep in job["fwjr"].listSteps():
                        for singlefile in job["fwjr"].getAllFilesFromStep(step=singlestep):
                            if singlefile:
                                if len(singlefile.get('locations', set())) > 1:
                                    locations = list(singlefile.get('locations'))
                                elif singlefile.get('locations'):
                                    locations = singlefile['locations'].pop()
                                else:
                                    locations = set()
                                if CMSSTEP.match(singlestep):
                                    outType = 'output'
                                else:
                                    outType = singlefile.get('module_label', None)
                                outputs.append({'type': outType,
                                                'lfn': singlefile.get('lfn', None),
                                                'location': locations,
                                                'checksums': singlefile.get('checksums', {}),
                                                'size': singlefile.get('size', None)})
                                # it should have one output dataset for all the files
                                outputDataset = singlefile.get('dataset', None) if not outputDataset else outputDataset
                    inputFiles = []
                    for inputFileStruct in job["fwjr"].getAllInputFiles():
                        # check if inputFileSummary needs to be extended
                        inputFileSummary = {}
                        inputFileSummary["lfn"] = inputFileStruct["lfn"]
                        inputFileSummary["input_type"] = inputFileStruct["input_type"]
                        inputFiles.append(inputFileSummary)

                    # Don't record intermediate jobfailed status in the jobsummary;
                    # change it to jobcooloff, which will be overwritten by the error handler anyway
                    if (job["retry_count"] > 0) and (newstate == 'jobfailed'):
                        summarystate = 'jobcooloff'
                    else:
                        summarystate = newstate

                    jobSummary = {"_id": jobSummaryId,
                                  "wmbsid": job["id"],
                                  "type": "jobsummary",
                                  "retrycount": job["retry_count"],
                                  "workflow": job["workflow"],
                                  "task": job["task"],
                                  "jobtype": job["jobType"],
                                  "state": summarystate,
                                  "site": job.get("location", None),
                                  "cms_location": job["fwjr"].getSiteName(),
                                  "exitcode": job["fwjr"].getExitCode(),
                                  "eos_log_url": job["fwjr"].getLogURL(),
                                  "worker_node_info": job["fwjr"].getWorkerNodeInfo(),
                                  "errors": errmsgs,
                                  "lumis": inputs,
                                  "outputdataset": outputDataset,
                                  "inputfiles": inputFiles,
                                  "acdc_url": "%s/%s" % (
                                  sanitizeURL(self.config.ACDC.couchurl)['url'], self.config.ACDC.database),
                                  "agent_name": self.config.Agent.hostName,
                                  "output": outputs}
                    if couchDocID is not None:
                        try:
                            currentJobDoc = self.jsumdatabase.document(id=jobSummaryId)
                            jobSummary['_rev'] = currentJobDoc['_rev']
                            jobSummary['state_history'] = currentJobDoc.get('state_history', [])
                            # record final status transition
                            if newstate == 'success':
                                finalStateDict = {'oldstate': oldstate,
                                                  'newstate': newstate,
                                                  'location': job["location"],
                                                  'timestamp': timestamp}
                                jobSummary['state_history'].append(finalStateDict)

                            noEmptyList = ["inputfiles", "lumis"]
                            for prop in noEmptyList:
                                jobSummary[prop] = jobSummary[prop] if jobSummary[prop] else currentJobDoc.get(prop, [])
                        except CouchNotFoundError:
                            pass
                    self.jsumdatabase.queue(jobSummary, timestamp=True)

        if len(couchRecordsToUpdate) > 0:
            self.setCouchDAO.execute(bulkList=couchRecordsToUpdate,
                                     conn=self.getDBConn(),
                                     transaction=self.existingTransaction())

        self.jobsdatabase.commit(callback=discardConflictingDocument)
        self.fwjrdatabase.commit(callback=discardConflictingDocument)
        self.jsumdatabase.commit()
        return

    def persist(self, jobs, newstate, oldstate):
        """
        _persist_

        Update the job state in the database.
        """

        if newstate == "killed":
            self.incrementRetryDAO.execute(jobs, increment=99999,
                                           conn=self.getDBConn(),
                                           transaction=self.existingTransaction())
        elif oldstate == "submitcooloff" or oldstate == "jobcooloff" or oldstate == "createcooloff":
            self.incrementRetryDAO.execute(jobs,
                                           conn=self.getDBConn(),
                                           transaction=self.existingTransaction())
        for job in jobs:
            job['state'] = newstate
            job['oldstate'] = oldstate

        dao = self.daofactory(classname="Jobs.ChangeState")
        dao.execute(jobs, conn=self.getDBConn(),
                    transaction=self.existingTransaction())

    def reportToDashboard(self, jobs, newstate, oldstate):
        """
        _reportToDashboard_

        Report job information to the dashboard, completes the job dictionaries
        with any additional information needed
        """

        # If the new state is created, the jobs could have come from 4 places:
        # JobCreator in that case it comes with all the needed info
        # ErrorHandler comes with the standard information of a WMBSJob
        # RetryManager comes with the standard information of a WMBSJob
        # Unpause script comes with the standard information of a WMBSJob
        # For those last 3 cases we need to fill the gaps
        if newstate == 'created':
            incrementRetry = 'cooloff' in oldstate
            self.completeCreatedJobsInformation(jobs, incrementRetry)
            self.dashboardReporter.handleCreated(jobs)
        # If the new state is executing, only the JobSubmitter does that;
        # it sends jobs with partial information, which is nevertheless enough
        elif newstate == 'executing':
            statusMessage = 'Job was successfully submitted'
            self.dashboardReporter.handleJobStatusChange(jobs, 'submitted',
                                                         statusMessage)
        # If the new state is success, then the JobAccountant sent the jobs.
        # Jobs come with all the standard information of a WMBSJob plus FWJR
        elif newstate == 'success':
            statusMessage = 'Job has completed successfully'
            self.dashboardReporter.handleJobStatusChange(jobs, 'succeeded',
                                                         statusMessage)
        elif newstate == 'jobfailed':
            # If it failed after being in complete state, then the JobAccountant
            # sent the jobs; these come with all the standard information of a
            # WMBSJob plus FWJR
            if oldstate == 'complete':
                statusMessage = 'Job failed at the site'
            # If it failed while executing then it timed out in BossAir.
            # The JobTracker should send the jobs with the required information
            elif oldstate == 'executing':
                statusMessage = 'Job timed out in the agent'
            self.dashboardReporter.handleJobStatusChange(jobs, 'failed',
                                                         statusMessage)
        # In this case either a paused job was killed or the workqueue is killing
        # a workflow; in both cases a WMBSJob with all the info should come
        elif newstate == 'killed':
            if oldstate == 'jobpaused':
                statusMessage = 'A paused job was killed, maybe it is beyond repair'
            else:
                statusMessage = 'The whole workflow is being killed'
            self.dashboardReporter.handleJobStatusChange(jobs, 'killed',
                                                         statusMessage)
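
        # The branches above boil down to this status map (written out here
        # only for reference; it is not a structure in the original source):
        #
        #   {'executing': 'submitted',
        #    'success':   'succeeded',
        #    'jobfailed': 'failed',
        #    'killed':    'killed'}
        #
        # plus the special 'created' branch, which completes the job
        # information and calls handleCreated() instead of a plain status
        # change.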

    def loadExtraJobInformation(self, jobs):
        # This is needed for both couch and dashboard
        jobIDsToCheck = []
        jobTasksToCheck = []
        # This is for mapping ids to the position in the list
        jobMap = {}
        for idx, job in enumerate(jobs):
            if job["couch_record"] is None:
                jobIDsToCheck.append(job["id"])
            if job.get("task", None) is None or job.get("workflow", None) is None \
                    or job.get("taskType", None) is None or job.get("jobType", None) is None:
                jobTasksToCheck.append(job["id"])
            jobMap[job["id"]] = idx

        if len(jobIDsToCheck) > 0:
            couchIDs = self.getCouchDAO.execute(jobID=jobIDsToCheck,
                                                conn=self.getDBConn(),
                                                transaction=self.existingTransaction())
            for couchID in couchIDs:
                idx = jobMap[couchID["jobid"]]
                jobs[idx]["couch_record"] = couchID["couch_record"]
        if len(jobTasksToCheck) > 0:
            jobTasks = self.workflowTaskDAO.execute(jobIDs=jobTasksToCheck,
                                                    conn=self.getDBConn(),
                                                    transaction=self.existingTransaction())
            for jobTask in jobTasks:
                idx = jobMap[jobTask["id"]]
                jobs[idx]["task"] = jobTask["task"]
                jobs[idx]["workflow"] = jobTask["name"]
                jobs[idx]["taskType"] = jobTask["type"]
                jobs[idx]["jobType"] = jobTask["subtype"]

    def completeCreatedJobsInformation(self, jobs, incrementRetry=False):
        for job in jobs:
            # If there's no jobID in the mask then it's not loaded
            if "jobID" not in job["mask"]:
                # Make sure the daofactory was not stripped
                if getattr(job["mask"], "daofactory", None):
                    job["mask"].load(jobID=job["id"])
            # If the mask is event based, then we have info to report
            if job["mask"]['inclusivemask'] and job["mask"]["LastEvent"] is not None and \
                            job["mask"]["FirstEvent"] is not None:
                job["nEventsToProc"] = int(job["mask"]["LastEvent"] -
                                           job["mask"]["FirstEvent"] + 1)
            # Increment retry when commanded
            if incrementRetry:
                job["retry_count"] += 1

    def recordLocationChange(self, jobs):
        """
        _recordLocationChange_

        Record a location change in couch and WMBS. This expects a list of
        dictionaries with jobid and location keys, which represent the job id
        in WMBS and the new location respectively.
        """
        if not self._connectDatabases():
            logging.error('Databases not connected properly')
            return

        # First update safely in WMBS
        self.updateLocationDAO.execute(jobs, conn=self.getDBConn(),
                                       transaction=self.existingTransaction())
        # Now try couch, this can fail and we don't require it to succeed
        try:
            jobIDs = [x['jobid'] for x in jobs]
            couchIDs = self.getCouchDAO.execute(jobIDs, conn=self.getDBConn(),
                                                transaction=self.existingTransaction())
            locationCache = dict((x['jobid'], x['location']) for x in jobs)
            for entry in couchIDs:
                couchRecord = entry['couch_record']
                location = locationCache[entry['jobid']]
                updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/locationTransition/" + couchRecord
                updateUri += "?location=%s" % (location)
                self.jobsdatabase.makeRequest(uri=updateUri, type="PUT", decode=False)
        except Exception as ex:
            logging.error("Error updating job in couch: %s", str(ex))
            logging.error(traceback.format_exc())
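
A minimal call sketch for recordLocationChange (hypothetical job ids and site
names; only the 'jobid' and 'location' keys are read by the code above, and
config is assumed to carry the JobStateMachine section used in __init__):

    changer = ChangeState(config)
    changer.recordLocationChange([{'jobid': 1234, 'location': 'T1_US_FNAL'},
                                  {'jobid': 1235, 'location': 'T2_CH_CERN'}])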
Esempio n. 15
0
class DashboardReporterTest(unittest.TestCase):
    """
    _DashboardReporterTest_

    Unit tests for the dashboard reporter class.
    """
    def setUp(self):
        """
        _setUp_

        Setup a dashboard reporter
        """
        self.reporter = DashboardReporter(config=None)
        self.processingReport = ProcessingSample.report
        self.mergeReport = MergeSample.report
        self.errorReport = ErrorSample.report
        return

    def tearDown(self):
        """
        _tearDown_

        Just get out
        """
        pass

    def trimNoneValues(self, package):
        """
        _trimNoneValues_

        Simple utility to trim the None values of a dictionary
        """
        trimmed = {}
        for key in package:
            if package[key] is not None:
                trimmed[key] = package[key]
        return trimmed

    def createTestJob(self, fwjr):
        """
        _createTestJob_

        Creates a minimal job to report
        """
        job = Job('finishedJob')
        job['retry_count'] = 1
        job['workflow'] = 'testWorkflow'
        job['fwjr'] = fwjr

        return job

    def testHandleSteps(self):
        """
        _testHandleSteps_

        Check that we can extract the information from a completed job
        and report it

        """
        job = self.createTestJob(self.processingReport)
        self.reporter.handleSteps(job)
        job = self.createTestJob(self.mergeReport)
        self.reporter.handleSteps(job)
        job = self.createTestJob(self.errorReport)
        self.reporter.handleSteps(job)

    def testPerformanceReport(self):
        """
        _testPerformanceReport_

        Check that the performance information is extracted correctly for
        different reports
        """
        step = self.processingReport.retrieveStep('cmsRun1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(len(self.trimNoneValues(perfInfo)), 21,
                         'Found less information than expected')
        self.assertEqual(perfInfo['PeakValueRss'], '891.617',
                         'Values do not match')
        self.assertEqual(perfInfo['readCachePercentageOps'], 0.995779157341,
                         'Values do not match')
        self.assertEqual(perfInfo['MaxEventTime'], '3.32538',
                         'Values do not match')

        step = self.processingReport.retrieveStep('logArch1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'logArch1 performance info is not empty')

        step = self.processingReport.retrieveStep('stageOut1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'stageOut1 performance info is not empty')

        step = self.errorReport.retrieveStep('cmsRun1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'cmsRun1 performance info is not empty')

        step = self.errorReport.retrieveStep('logArch1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'logArch1 performance info is not empty')

        step = self.errorReport.retrieveStep('stageOut1')
        perfInfo = self.reporter.getPerformanceInformation(step)
        self.assertEqual(self.trimNoneValues(perfInfo), {},
                         'stageOut1 performance info is not empty')

    def testEventInformationReport(self):
        """
        _testEventInformationReport_

        Check that the event information is extracted correctly for
        different reports
        """
        eventInfo = self.reporter.getEventInformation('cmsRun1',
                                                      self.processingReport)
        self.assertEqual(eventInfo['inputEvents'], 18192,
                         'Input events do not match')
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-WElectron-PromptSkim-v1:USER:1603'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-LogErrorMonitor-PromptSkim-v1:USER:137'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-LogError-PromptSkim-v1:RAW-RECO:66'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-TOPElePlusJets-PromptSkim-v1:AOD:2320'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-HighMET-PromptSkim-v1:RAW-RECO:8'), 1)
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-DiTau-PromptSkim-v1:RAW-RECO:192'), 1)

        eventInfo = self.reporter.getEventInformation('stageOut1',
                                                      self.processingReport)
        self.assertEqual(eventInfo, {}, 'stageOut1 event info is not empty')

        eventInfo = self.reporter.getEventInformation('logArch1',
                                                      self.processingReport)
        self.assertEqual(eventInfo, {}, 'logArch1 event info is not empty')

        eventInfo = self.reporter.getEventInformation('cmsRun1',
                                                      self.mergeReport)
        self.assertEqual(eventInfo['inputEvents'], 0,
                         'Input events do not match')
        self.assertEqual(
            eventInfo['OutputEventInfo'].count(
                'Run2012B-LogError-PromptSkim-v1:RAW-RECO:0'), 1)

        eventInfo = self.reporter.getEventInformation('cmsRun1',
                                                      self.errorReport)
        self.assertEqual(eventInfo, {}, 'Error report event info is not empty')
Esempio n. 16
0
class ChangeState(WMObject, WMConnectionBase):
    """
    Propagate the state of a job through the JSM.
    """
    def __init__(self, config, couchDbName = None):
        WMObject.__init__(self, config)
        WMConnectionBase.__init__(self, "WMCore.WMBS")

        if couchDbName is None:
            self.dbname = getattr(self.config.JobStateMachine, "couchDBName")
        else:
            self.dbname = couchDbName
            
        self.jobsdatabase = None
        self.fwjrdatabase = None
        self.jsumdatabase = None
        self.statsumdatabase = None
        
        self.couchdb = CouchServer(self.config.JobStateMachine.couchurl)
        self._connectDatabases()

        try:
            self.dashboardReporter = DashboardReporter(config)
        except Exception as ex:
            logging.error("Error setting up the \
-                          dashboard reporter: %s" % str(ex))
            raise

        self.getCouchDAO = self.daofactory("Jobs.GetCouchID")
        self.setCouchDAO = self.daofactory("Jobs.SetCouchID")
        self.incrementRetryDAO = self.daofactory("Jobs.IncrementRetry")
        self.workflowTaskDAO = self.daofactory("Jobs.GetWorkflowTask")
        self.jobTypeDAO = self.daofactory("Jobs.GetType")
        self.updateLocationDAO = self.daofactory("Jobs.UpdateLocation")

        self.maxUploadedInputFiles = getattr(self.config.JobStateMachine, 'maxFWJRInputFiles', 1000)
        return

    def _connectDatabases(self):
        """
        Try connecting to the couchdbs
        """
        if not hasattr(self, 'jobsdatabase') or self.jobsdatabase is None:
            try:
                self.jobsdatabase = self.couchdb.connectDatabase("%s/jobs" % self.dbname, size = 250)
            except Exception as ex:
                logging.error("Error connecting to couch db '%s/jobs': %s" % (self.dbname, str(ex)))
                self.jobsdatabase = None
                return False

        if not hasattr(self, 'fwjrdatabase') or self.fwjrdatabase is None:
            try:
                self.fwjrdatabase = self.couchdb.connectDatabase("%s/fwjrs" % self.dbname, size = 250)
            except Exception as ex:
                logging.error("Error connecting to couch db '%s/fwjrs': %s" % (self.dbname, str(ex)))
                self.fwjrdatabase = None
                return False

        if not hasattr(self, 'jsumdatabase') or self.jsumdatabase is None:
            dbname = getattr(self.config.JobStateMachine, 'jobSummaryDBName')
            try:
                self.jsumdatabase = self.couchdb.connectDatabase(dbname, size = 250)
            except Exception as ex:
                logging.error("Error connecting to couch db '%s': %s" % (dbname, str(ex)))
                self.jsumdatabase = None
                return False
        
        if not hasattr(self, 'statsumdatabase') or self.statsumdatabase is None:
            dbname = getattr(self.config.JobStateMachine, 'summaryStatsDBName')
            try:
                self.statsumdatabase = self.couchdb.connectDatabase(dbname, size = 250)
            except Exception as ex:
                logging.error("Error connecting to couch db '%s': %s" % (dbname, str(ex)))
                self.statsumdatabase = None
                return False

        return True
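
    # Callers re-invoke _connectDatabases() before every couch write; once all
    # four handles exist the method short-circuits and returns True. The guard
    # pattern used by the write methods below (illustration):
    #
    #   if not self._connectDatabases():
    #       logging.error('Databases not connected properly')
    #       return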
    
    def propagate(self, jobs, newstate, oldstate, updatesummary = False):
        """
        Move the job from one state to another. Bookkeep the change in CouchDB
        and report the information to the Dashboard.
        Takes a list of job objects (dicts) and the desired state change.
        Returns the jobs back; raises an AssertionError if the state change is
        not allowed, and other exceptions as appropriate.
        """
        if not isinstance(jobs, list):
            jobs = [jobs]

        if len(jobs) == 0:
            return

        # 1. Is the state transition allowed?
        self.check(newstate, oldstate)

        # 2. Load workflow/task information into the jobs
        self.loadExtraJobInformation(jobs)
        
        # 3. Make the state transition
        self.persist(jobs, newstate, oldstate)

        # 4. Document the state transition in couch
        try:
            self.recordInCouch(jobs, newstate, oldstate, updatesummary)
        except Exception as ex:
            logging.error("Error updating job in couch: %s" % str(ex))
            logging.error(traceback.format_exc())

        # 5. Report the job transition to the dashboard
        try:
            self.reportToDashboard(jobs, newstate, oldstate)
        except Exception as ex:
            logging.error("Error reporting to the dashboard: %s" % str(ex))
            logging.error(traceback.format_exc())

        return
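
    # Usage sketch (hypothetical objects; config is assumed to carry the
    # JobStateMachine section used in __init__):
    #
    #   changer = ChangeState(config)
    #   changer.propagate(jobs, 'executing', 'created')
    #
    # A single job dict is also accepted; an illegal transition raises an
    # AssertionError from check().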

    def check(self, newstate, oldstate):
        """
        Check that the transition is allowed; raise an AssertionError if it
        is not.
        """
        newstate = newstate.lower()
        oldstate = oldstate.lower()

        # Check for wrong transitions
        transitions = Transitions()
        assert newstate in transitions[oldstate], \
               "Illegal state transition requested: %s -> %s" % (oldstate, newstate)

    def recordInCouch(self, jobs, newstate, oldstate, updatesummary = False):
        """
        _recordInCouch_

        Record relevant job information in couch. If the job does not yet exist
        in couch it will be saved as a separate document. If the job has a FWJR
        attached, that will be saved as a separate document.
        """
        if not self._connectDatabases():
            logging.error('Databases not connected properly')
            return

        timestamp = int(time.time())
        couchRecordsToUpdate = []

        for job in jobs:
            couchDocID = job.get("couch_record", None)

            if newstate == "new":
                oldstate = "none"

            if job.get("site_cms_name", None):
                if newstate == "executing":
                    jobLocation = job["site_cms_name"]
                else:
                    jobLocation = "Agent"
            else:
                jobLocation = "Agent"

            if couchDocID is None:
                jobDocument = {}
                jobDocument["_id"] = str(job["id"])
                job["couch_record"] = jobDocument["_id"]
                jobDocument["jobid"] = job["id"]
                jobDocument["workflow"] = job["workflow"]
                jobDocument["task"] = job["task"]
                jobDocument["owner"] = job["owner"]

                jobDocument["inputfiles"] = []
                for inputFile in job["input_files"]:
                    docInputFile = inputFile.json()

                    docInputFile["parents"] = []
                    for parent in inputFile["parents"]:
                        docInputFile["parents"].append({"lfn": parent["lfn"]})

                    jobDocument["inputfiles"].append(docInputFile)

                jobDocument["states"] = {"0": {"oldstate": oldstate,
                                               "newstate": newstate,
                                               "location": jobLocation,
                                               "timestamp": timestamp}}

                jobDocument["jobgroup"] = job["jobgroup"]
                jobDocument["mask"] = {"FirstEvent": job["mask"]["FirstEvent"],
                                       "LastEvent": job["mask"]["LastEvent"],
                                       "FirstLumi": job["mask"]["FirstLumi"],
                                       "LastLumi": job["mask"]["LastLumi"],
                                       "FirstRun": job["mask"]["FirstRun"],
                                       "LastRun": job["mask"]["LastRun"]}

                if job['mask']['runAndLumis'] != {}:
                    # Then we have to save the mask runAndLumis
                    jobDocument['mask']['runAndLumis'] = {}
                    for key in job['mask']['runAndLumis'].keys():
                        jobDocument['mask']['runAndLumis'][str(key)] = job['mask']['runAndLumis'][key]

                jobDocument["name"] = job["name"]
                jobDocument["type"] = "job"
                jobDocument["user"] = job.get("user", None)
                jobDocument["group"] = job.get("group", None)
                jobDocument["taskType"] = job.get("taskType", "Unknown")
                jobDocument["jobType"] = job.get("jobType", "Unknown")

                couchRecordsToUpdate.append({"jobid": job["id"],
                                             "couchid": jobDocument["_id"]})
                self.jobsdatabase.queue(jobDocument, callback = discardConflictingDocument)
            else:
                # We send a PUT request to the stateTransition update handler.
                # Couch expects the parameters to be passed as arguments in
                # the URI while the Requests class will only encode arguments
                # this way for GET requests.  Changing the Requests class to
                # encode PUT arguments as couch expects broke a bunch of code so
                # we'll just do our own encoding here.
                updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/stateTransition/" + couchDocID
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    newstate,
                                                                                    jobLocation,
                                                                                    timestamp)
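                # The resulting update-handler URI has the form (values here
                # are illustrative):
                #   /<jobs db>/_design/JobDump/_update/stateTransition/<couchDocID>
                #       ?oldstate=created&newstate=executing&location=Agent&timestamp=1400000000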
                self.jobsdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)

            # updating the status of the summary doc only when it is explicitly
            # requested; the doc is already in couch
            if updatesummary:
                jobSummaryId = job["name"]
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobSummaryState/" + jobSummaryId
                # map retrydone state to jobfailed state for monitoring
                if newstate == "retrydone":
                    monitorState = "jobfailed"
                else:
                    monitorState = newstate
                updateUri += "?newstate=%s&timestamp=%s" % (monitorState, timestamp)
                self.jsumdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)
                logging.debug("Updated job summary status for job %s" % jobSummaryId)
                
                updateUri = "/" + self.jsumdatabase.name + "/_design/WMStatsAgent/_update/jobStateTransition/" + jobSummaryId
                updateUri += "?oldstate=%s&newstate=%s&location=%s&timestamp=%s" % (oldstate,
                                                                                    monitorState,
                                                                                    job["location"],
                                                                                    timestamp)
                self.jsumdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)
                logging.debug("Updated job summary state history for job %s" % jobSummaryId)

            if job.get("fwjr", None):

                # If there are too many input files, strip them out
                # of the FWJR, as they should already
                # be in the database
                # This is not critical
                try:
                    if len(job['fwjr'].getAllInputFiles()) > self.maxUploadedInputFiles:
                        job['fwjr'].stripInputFiles()
                except Exception:
                    logging.error("Error while trying to strip input files from FWJR. Ignoring.")

                # complete fwjr document
                job["fwjr"].setTaskName(job["task"])
                jsonFWJR = job["fwjr"].__to_json__(None)
                fwjrDocument = {"_id": "%s-%s" % (job["id"], job["retry_count"]),
                                "jobid": job["id"],
                                "retrycount": job["retry_count"],
                                "archivestatus": "ready",
                                "fwjr": jsonFWJR,
                                "type": "fwjr"}
                self.fwjrdatabase.queue(fwjrDocument, timestamp = True, callback = discardConflictingDocument)
                
                updateSummaryDB(self.statsumdatabase, job)

                #TODO: could add a config switch to turn this on and off
                # if self.config.JobStateMachine.propagateSuccessJobs or (job["retry_count"] > 0) or (newstate != 'success'):
                if (job["retry_count"] > 0) or (newstate != 'success'):
                    jobSummaryId = job["name"]
                    # building a summary of fwjr
                    logging.debug("Pushing job summary for job %s" % jobSummaryId)
                    errmsgs = {}
                    inputs = []
                    if "steps" in fwjrDocument["fwjr"]:
                        for step in fwjrDocument["fwjr"]["steps"]:
                            if "errors" in fwjrDocument["fwjr"]["steps"][step]:
                                errmsgs[step] = [error for error in fwjrDocument["fwjr"]["steps"][step]["errors"]]
                            if "input" in fwjrDocument["fwjr"]["steps"][step] and "source" in fwjrDocument["fwjr"]["steps"][step]["input"]:
                                inputs.extend( [source["runs"] for source in fwjrDocument["fwjr"]['steps'][step]["input"]["source"] if "runs" in source] )
    
                    outputs = []
                    outputDataset = None
                    for singlestep in job["fwjr"].listSteps():
                        for singlefile in job["fwjr"].getAllFilesFromStep(step=singlestep):
                            if singlefile:
                                outputs.append({'type': 'output' if CMSSTEP.match(singlestep) else singlefile.get('module_label', None),
                                                'lfn': singlefile.get('lfn', None),
                                                'location': list(singlefile.get('locations', set([]))) if len(singlefile.get('locations', set([]))) > 1
                                                                                                       else singlefile['locations'].pop(),
                                                'checksums': singlefile.get('checksums', {}),
                                                'size': singlefile.get('size', None) })
                                #it should have one output dataset for all the files
                                outputDataset = singlefile.get('dataset', None) if not outputDataset else outputDataset
                    inputFiles = []
                    for inputFileStruct in job["fwjr"].getAllInputFiles():
                        # check if inputFileSummary needs to be extended
                        inputFileSummary = {}
                        inputFileSummary["lfn"] = inputFileStruct["lfn"]
                        inputFileSummary["input_type"] = inputFileStruct["input_type"]
                        inputFiles.append(inputFileSummary)
                    
                    # Don't record intermediate jobfailed status in the jobsummary
                    # change to jobcooloff which will be overwritten by error handler anyway
                    if (job["retry_count"] > 0) and (newstate == 'jobfailed'):
                        summarystate = 'jobcooloff'
                    else:
                        summarystate = newstate
                           
                    jobSummary = {"_id": jobSummaryId,
                                  "wmbsid": job["id"],
                                  "type": "jobsummary",
                                  "retrycount": job["retry_count"],
                                  "workflow": job["workflow"],
                                  "task": job["task"],
                                  "jobtype": job["jobType"],
                                  "state": summarystate,
                                  "site": job.get("location", None),
                                  "cms_location": job["fwjr"].getSiteName(),
                                  "exitcode": job["fwjr"].getExitCode(),
                                  "errors": errmsgs,
                                  "lumis": inputs,
                                  "outputdataset": outputDataset,
                                  "inputfiles": inputFiles,
                                  "acdc_url": "%s/%s" % (sanitizeURL(self.config.ACDC.couchurl)['url'], self.config.ACDC.database),
                                  "agent_name": self.config.Agent.hostName,
                                  "output": outputs }
                    if couchDocID is not None:
                        try:
                            currentJobDoc = self.jsumdatabase.document(id = jobSummaryId)
                            jobSummary['_rev'] = currentJobDoc['_rev']
                            jobSummary['state_history'] = currentJobDoc.get('state_history', [])
                            # record final status transition
                            if newstate == 'success':
                                finalStateDict = {'oldstate': oldstate,
                                                  'newstate': newstate,
                                                  'location': job["location"],
                                                  'timestamp': timestamp}
                                jobSummary['state_history'].append(finalStateDict)
                                
                            noEmptyList = ["inputfiles", "lumis"]
                            for prop in noEmptyList:
                                jobSummary[prop] = jobSummary[prop] if jobSummary[prop] else currentJobDoc.get(prop, [])
                        except CouchNotFoundError:
                            pass
                    self.jsumdatabase.queue(jobSummary, timestamp = True)

        if len(couchRecordsToUpdate) > 0:
            self.setCouchDAO.execute(bulkList = couchRecordsToUpdate,
                                     conn = self.getDBConn(),
                                     transaction = self.existingTransaction())

        self.jobsdatabase.commit(callback = discardConflictingDocument)
        self.fwjrdatabase.commit(callback = discardConflictingDocument)
        self.jsumdatabase.commit()
        return

    def persist(self, jobs, newstate, oldstate):
        """
        _persist_

        Update the job state in the database.
        """
        
        if newstate == "killed":
            self.incrementRetryDAO.execute(jobs, increment = 99999,
                                           conn = self.getDBConn(),
                                           transaction = self.existingTransaction())
        elif oldstate == "submitcooloff" or oldstate == "jobcooloff" or oldstate == "createcooloff" :
            self.incrementRetryDAO.execute(jobs,
                                           conn = self.getDBConn(),
                                           transaction = self.existingTransaction())
        for job in jobs:
            job['state'] = newstate
            job['oldstate'] = oldstate

        dao = self.daofactory(classname = "Jobs.ChangeState")
        dao.execute(jobs, conn = self.getDBConn(),
                    transaction = self.existingTransaction())

    def reportToDashboard(self, jobs, newstate, oldstate):
        """
        _reportToDashboard_

        Report job information to the dashboard, completes the job dictionaries
        with any additional information needed
        """

        #If the new state is created, it possibly came from one of 4 sources:
        #JobCreator, in which case it comes with all the needed info
        #ErrorHandler comes with the standard information of a WMBSJob
        #RetryManager comes with the standard information of a WMBSJob
        #Unpause script comes with the standard information of a WMBSJob
        #For the last 3 cases we need to fill the gaps
        if newstate == 'created':
            incrementRetry = 'cooloff' in oldstate
            self.completeCreatedJobsInformation(jobs, incrementRetry)
            self.dashboardReporter.handleCreated(jobs)
        #If the new state is executing, the transition was made only by the
        #JobSubmitter; it sends jobs with selected information, which is
        #nevertheless enough
        elif newstate == 'executing':
            statusMessage = 'Job was successfully submitted'
            self.dashboardReporter.handleJobStatusChange(jobs, 'submitted',
                                                         statusMessage)
        #If the new state is success, then the JobAccountant sent the jobs.
        #Jobs come with all the standard information of a WMBSJob plus FWJR
        elif newstate == 'success':
            statusMessage = 'Job has completed successfully'
            self.dashboardReporter.handleJobStatusChange(jobs, 'succeeded',
                                                         statusMessage)
        elif newstate == 'jobfailed':
            #If it failed after being in complete state, then the JobAccountant
            #sent the jobs, these come with all the standard information of a WMBSJob
            #plus FWJR
            if oldstate == 'complete':
                statusMessage = 'Job failed at the site'
            #If it failed while executing then it timed out in BossAir
            #The JobTracker should send the jobs with the required information
            elif oldstate == 'executing':
                statusMessage = 'Job timed out in the agent'
            #Guard: make sure statusMessage is always defined
            else:
                statusMessage = 'Job failed'
            self.dashboardReporter.handleJobStatusChange(jobs, 'failed',
                                                         statusMessage)
        #In this case either a paused job was killed or the workqueue is killing
        #a workflow, in both cases a WMBSJob with all the info should come
        elif newstate == 'killed':
            if oldstate == 'jobpaused':
                statusMessage = 'A paused job was killed, maybe it is beyond repair'
            else:
                statusMessage = 'The whole workflow is being killed'
            self.dashboardReporter.handleJobStatusChange(jobs, 'killed',
                                                         statusMessage)

    def loadExtraJobInformation(self, jobs):
        #This is needed for both couch and dashboard
        jobIDsToCheck = []
        jobTasksToCheck = []
        #This is for mapping ids to the position in the list
        jobMap = {}
        for idx, job in enumerate(jobs):
            if job["couch_record"] == None:
                jobIDsToCheck.append(job["id"])
            if job.get("task", None) == None or job.get("workflow", None) == None \
                or job.get("taskType", None) == None or job.get("jobType", None) == None:
                jobTasksToCheck.append(job["id"])
            jobMap[job["id"]] = idx

        if len(jobIDsToCheck) > 0:
            couchIDs = self.getCouchDAO.execute(jobID = jobIDsToCheck,
                                                conn = self.getDBConn(),
                                                transaction = self.existingTransaction())
            for couchID in couchIDs:
                idx = jobMap[couchID["jobid"]]
                jobs[idx]["couch_record"] = couchID["couch_record"]
        if len(jobTasksToCheck) > 0:
            jobTasks = self.workflowTaskDAO.execute(jobIDs = jobTasksToCheck,
                                                    conn = self.getDBConn(),
                                                    transaction = self.existingTransaction())
            for jobTask in jobTasks:
                idx = jobMap[jobTask["id"]]
                jobs[idx]["task"] = jobTask["task"]
                jobs[idx]["workflow"] = jobTask["name"]
                jobs[idx]["taskType"] = jobTask["type"]
                jobs[idx]["jobType"]  = jobTask["subtype"]

    def completeCreatedJobsInformation(self, jobs, incrementRetry = False):
        for job in jobs:
            #If there's no jobID in the mask then it's not loaded
            if "jobID" not in job["mask"]:
                #Make sure the daofactory was not stripped
                if getattr(job["mask"], "daofactory", None):
                    job["mask"].load(jobID = job["id"])
            #If the mask is event based, then we have info to report
            if job["mask"]["LastEvent"] is not None and \
               job["mask"]["FirstEvent"] is not None and job["mask"]['inclusivemask']:
                #The event range is inclusive, hence the + 1
                job["nEventsToProc"] = int(job["mask"]["LastEvent"] -
                                           job["mask"]["FirstEvent"] + 1)
            #Increment retry when commanded
            if incrementRetry:
                job["retry_count"] += 1

    def recordLocationChange(self, jobs):
        """
        _recordLocationChange_

        Record a location change in couch and WMBS. This expects a list of
        dictionaries with jobid and location keys, which represent the job id
        in WMBS and the new location respectively.
        """
        if not self._connectDatabases():
            logging.error('Databases not connected properly')
            return

        # First update safely in WMBS
        self.updateLocationDAO.execute(jobs, conn = self.getDBConn(),
                                       transaction = self.existingTransaction())
        # Now try couch, this can fail and we don't require it to succeed
        try:
            jobIDs = [x['jobid'] for x in jobs]
            couchIDs = self.getCouchDAO.execute(jobIDs, conn = self.getDBConn(),
                                                transaction = self.existingTransaction())
            locationCache = dict((x['jobid'], x['location']) for x in jobs)
            for entry in couchIDs:
                couchRecord = entry['couch_record']
                location = locationCache[entry['jobid']]
                updateUri = "/" + self.jobsdatabase.name + "/_design/JobDump/_update/locationTransition/" + couchRecord
                updateUri += "?location=%s" % (location)
                self.jobsdatabase.makeRequest(uri = updateUri, type = "PUT", decode = False)
        except Exception as ex:
            logging.error("Error updating job in couch: %s" % str(ex))
            logging.error(traceback.format_exc())