Example #1
0
    def testOutputFiles(self):
        """
        _testOutputFiles_

        Parse the XML report and check the output file records returned
        for the "cmsRun1" step.
        """
        report = Report("cmsRun1")
        report.parse(self.xmlPath)

        outputFiles = report.getAllFilesFromStep(step = "cmsRun1")
        recoFile = outputFiles[0]
        alcaFile = outputFiles[1]

        # The first two records are checked against fixed module/PFN pairs.
        expectedIdentity = ((recoFile, 'outputRECORECO', 'outputRECORECO.root'),
                            (alcaFile, 'outputALCARECORECO', 'outputALCARECORECO.root'))
        for fileInfo, moduleName, pfn in expectedIdentity:
            self.assertEqual(fileInfo['outputModule'], moduleName)
            self.assertEqual(fileInfo['pfn'], pfn)

        # Every record must carry the same values for these keys.
        commonValues = (('events', 2),
                        ('configURL', None),
                        ('merged', False),
                        ('validStatus', None),
                        ('first_event', 0))
        for fileInfo in outputFiles:
            for key, expected in commonValues:
                self.assertEqual(fileInfo[key], expected)

        return
Example #2
0
    def testA_testSubmit(self):
        """
        _testSubmit_

        Attach a parsed performance report to every job of a test job group,
        move the jobs through several state changes and run the dashboard
        reporter's algorithm after them.  There are no assertions: the test
        only passes if none of the calls raises.
        """
        testJobGroup = self.createTestJobGroup()
        config = self.getConfig()

        reportFile = os.path.join(WMCore.WMBase.getTestBase(),
                                  "WMCore_t/FwkJobReport_t/PerformanceReport.xml")
        fwjr = Report("cmsRun1")
        fwjr.parse(reportFile)

        stateChanger = ChangeState(config)
        for testJob in testJobGroup.jobs:
            testJob['fwjr'] = fwjr
        for newState, oldState in (("complete", "executing"), ("success", "complete")):
            stateChanger.propagate(testJobGroup.jobs, newState, oldState)

        poller = DashboardReporterPoller(config = config)
        poller.algorithm()

        # Second pass, after the jobs have been moved to 'jobfailed'
        stateChanger.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
        poller.algorithm()

        return
Example #3
0
    def testOutputFiles(self):
        """
        _testOutputFiles_

        Check module name, PFN and the shared metadata of the output files
        that the parsed report lists for step "cmsRun1".
        """
        stepReport = Report("cmsRun1")
        stepReport.parse(self.xmlPath)
        stepFiles = stepReport.getAllFilesFromStep(step="cmsRun1")

        first, second = stepFiles[0], stepFiles[1]

        self.assertEqual(first["outputModule"], "outputRECORECO")
        self.assertEqual(first["pfn"], "outputRECORECO.root")
        self.assertEqual(second["outputModule"], "outputALCARECORECO")
        self.assertEqual(second["pfn"], "outputALCARECORECO.root")

        # Fields that must hold the same value in every file record
        sharedExpectations = [("events", 2),
                              ("configURL", None),
                              ("merged", False),
                              ("validStatus", None),
                              ("first_event", 0)]
        for entry in stepFiles:
            for field, value in sharedExpectations:
                self.assertEqual(entry[field], value)

        return
Example #4
0
    def generateCreateFailedReports(self, createFailedJobs):
        """
        _generateCreateFailedReports_

        Create and store a FWJR for each job that failed on creation,
        recording the reason for the failure in the report.

        For every job a new Report with a "CreationFailure" error (code 99305)
        is saved as "Report.0.pkl" in the job cache and attached to the job
        under the "fwjr" key.  The paths of the reports that could be saved
        are then registered through self.setFWJRPath.

        :param createFailedJobs: collection of failed jobs; nothing is done
            when it is empty or None
        :return: None
        """
        if not createFailedJobs:
            return

        # Fallback reason, used when the job carries no "failedReason"
        defaultMsg = "There is a condition which assures that this job will fail if it's submitted"

        fjrsToSave = []
        for failedJob in createFailedJobs:
            report = Report()
            report.addError("CreationFailure", 99305, "CreationFailure", failedJob.get("failedReason", defaultMsg))
            jobCache = failedJob.getCache()
            try:
                fjrPath = os.path.join(jobCache, "Report.0.pkl")
                report.save(fjrPath)
                fjrsToSave.append({"jobid": failedJob["id"], "fwjrpath": fjrPath})
                failedJob["fwjr"] = report
            except Exception:
                # Best effort: a report that cannot be saved must not stop the
                # remaining jobs, but the traceback is kept for diagnosis.
                logging.exception("Something went wrong while saving the report for  job %s", failedJob["id"])

        myThread = threading.current_thread()
        self.setFWJRPath.execute(binds = fjrsToSave, conn = myThread.transaction.conn, transaction = True)

        return
Example #5
0
    def testC_ExecuteSegfault(self):
        """
        _ExecuteSegfault_

        Run "test.sh" through the CMSSW step executor with an empty
        FrameworkJobReport.xml in the working directory (per the original
        note, this is what CMSSW leaves behind after a SEGFAULT/ABRT) and
        check that the failure is diagnosed as exit code 50115, both in the
        executor's report and in the "Report.pkl" loaded from disk.
        """
        self.step.application.command.executable = "test.sh"
        workDir = self.step.builder.workingDir
        # A zero-byte FWJR stands in for the one left behind by the crash
        with open(os.path.join(workDir, "FrameworkJobReport.xml"), "w"):
            pass
        try:
            os.chdir(workDir)
            cmsswExecutor = StepFactory.getStepExecutor("CMSSW")
            cmsswExecutor.initialise(self.step, self.job)
            cmsswExecutor.pre()
            cmsswExecutor.step.runtime.scramPreScripts.remove("SetupCMSSWPset")
            try:
                cmsswExecutor.execute()
                self.fail("An exception should have been raised")
            except WMExecutionFailure as failure:
                cmsswExecutor.diagnostic(failure.code, cmsswExecutor, ExceptionInstance=failure)
                self.assertEqual(50115, cmsswExecutor.report.getExitCode())
                savedReport = Report()
                savedReport.load("Report.pkl")
                self.assertEqual(50115, savedReport.getExitCode())
        except Exception as error:
            self.fail("Failure encountered, %s" % str(error))
        finally:
            # Always go back to the directory the test started from
            os.chdir(self.oldCwd)
        return
Example #6
0
    def testB_ExecuteNonZeroExit(self):
        """
        _ExecuteNonZeroExit_

        Run "brokenCmsRun.py" through the CMSSW step executor with the
        CMSSWFailReport.xml fixture copied in as FrameworkJobReport.xml, and
        check that the failure is diagnosed as exit code 8001, both in the
        executor's report and in the "Report.pkl" loaded from disk.
        """
        self.step.application.command.executable = "brokenCmsRun.py"
        failReportFixture = os.path.join(getTestBase(),
                                         "WMCore_t/FwkJobReport_t/CMSSWFailReport.xml")
        jobReportTarget = os.path.join(self.step.builder.workingDir, "FrameworkJobReport.xml")
        shutil.copy(failReportFixture, jobReportTarget)
        try:
            os.chdir(self.step.builder.workingDir)
            cmsswExecutor = StepFactory.getStepExecutor("CMSSW")
            cmsswExecutor.initialise(self.step, self.job)
            cmsswExecutor.pre()
            cmsswExecutor.step.runtime.scramPreScripts.remove("SetupCMSSWPset")
            try:
                cmsswExecutor.execute()
                self.fail("An exception should have been raised")
            except WMExecutionFailure as failure:
                cmsswExecutor.diagnostic(failure.code, cmsswExecutor, ExceptionInstance=failure)
                self.assertEqual(8001, cmsswExecutor.report.getExitCode())
                savedReport = Report()
                savedReport.load("Report.pkl")
                self.assertEqual(8001, savedReport.getExitCode())
        except Exception as error:
            self.fail("Failure encountered, %s" % str(error))
        finally:
            # Always go back to the directory the test started from
            os.chdir(self.oldCwd)
        return
Example #7
0
    def testPerformanceJSON(self):
        """
        _testPerformanceJSON_

        Verify that the performance section of the report is correctly converted
        to JSON.

        Membership is tested with the ``in`` operator because
        ``dict.has_key()`` does not exist on Python 3.
        """
        xmlPath = os.path.join(getTestBase(),
                               "WMCore_t/FwkJobReport_t/PerformanceReport.xml")

        myReport = Report("cmsRun1")
        myReport.parse(xmlPath)

        perfSection = myReport.__to_json__(thunker = None)["steps"]["cmsRun1"]["performance"]

        self.assertTrue("storage" in perfSection,
                        "Error: Storage section is missing.")
        self.assertTrue("memory" in perfSection,
                        "Error: Memory section is missing.")
        self.assertTrue("cpu" in perfSection,
                        "Error: CPU section is missing.")

        # The cpu and memory values are compared as strings, the storage
        # values as floats, exactly as the JSON conversion delivers them.
        self.assertEqual(perfSection["cpu"]["AvgEventCPU"], "0.626105",
                         "Error: AvgEventCPU is wrong.")
        self.assertEqual(perfSection["cpu"]["TotalJobTime"], "23.5703",
                         "Error: TotalJobTime is wrong.")
        self.assertEqual(perfSection["storage"]["readTotalMB"], 39.6166,
                         "Error: readTotalMB is wrong.")
        self.assertEqual(perfSection["storage"]["readMaxMSec"], 320.653,
                         "Error: readMaxMSec is wrong")
        self.assertEqual(perfSection["memory"]["PeakValueRss"], "492.293",
                         "Error: PeakValueRss is wrong.")
        self.assertEqual(perfSection["memory"]["PeakValueVsize"], "643.281",
                         "Error: PeakValueVsize is wrong.")
        return
Example #8
0
    def testJSONEncoding(self):
        """
        _testJSONEncoding_

        Verify that turning the FWJR into a JSON object works correctly.

        unittest assertion methods are used instead of bare ``assert``
        statements so that the checks still run under ``python -O``.
        """
        xmlPath = os.path.join(getTestBase(),
                               "WMCore_t/FwkJobReport_t/CMSSWProcessingReport.xml")
        myReport = Report("cmsRun1")
        myReport.parse(xmlPath)

        jsonReport = myReport.__to_json__(None)

        self.assertTrue("task" in jsonReport,
                        "Error: Task name missing from report.")

        self.assertEqual(len(jsonReport["steps"]), 1,
                         "Error: Wrong number of steps in report.")
        self.assertTrue("cmsRun1" in jsonReport["steps"],
                        "Error: Step missing from json report.")

        cmsRunStep = jsonReport["steps"]["cmsRun1"]

        # Sections every step of the JSON report is expected to contain
        jsonReportSections = ["status", "errors", "logs", "parameters", "site",
                              "analysis", "cleanup", "input", "output", "start"]
        for jsonReportSection in jsonReportSections:
            self.assertTrue(jsonReportSection in cmsRunStep,
                            "Error: missing section: %s" % jsonReportSection)

        return
Example #9
0
    def testAbortedState(self):
        """
        _testAbortedState_

        Register two sites with their thresholds, create jobs, set
        "testSite1" to "Aborted" and check that the report found in the work
        directory carries exit code 71301.
        ### Original note: this test is no longer needed, as running jobs
        ### are not killed any more
        """
        self.tempDir = self.testInit.generateWorkDir()
        config = self.createConfig()
        resourceControl = ResourceControl(config)

        siteDefinitions = (
            ("testSite1", 10, 20, "testSE1", "testCE1", "T1_US_FNAL", "MockPlugin"),
            ("testSite2", 20, 40, "testSE2", "testCE2", "T1_IT_CNAF", "MockPlugin"),
        )
        for siteArgs in siteDefinitions:
            resourceControl.insertSite(*siteArgs)

        thresholdDefinitions = (
            ("testSite1", "Processing", 20, 10),
            ("testSite1", "Merge", 200, 100),
            ("testSite2", "Processing", 50, 25),
            ("testSite2", "Merge", 135, 65),
        )
        for thresholdArgs in thresholdDefinitions:
            resourceControl.insertThreshold(*thresholdArgs)

        self.createJobs()

        resourceControl.changeSiteState("testSite1", "Aborted")

        ## The FWJR of the killed job is expected in the temporary directory
        killedJobReport = Report()
        killedJobReport.load(os.path.join(self.tempDir, "Report.0.pkl"))
        self.assertEqual(killedJobReport.getExitCode(), 71301)
        return
Example #10
0
def thrashCouch():
    """
    _thrashCouch_

    Load test: 500 times over, create a batch of jobs and record each of
    their state transitions (new -> created -> executing -> complete ->
    success -> cleanout) in couch.  The same pre-loaded job report is
    attached to the jobs for the "success" transition and detached again
    before "cleanout".

    The agent configuration is read from the file named by the
    WMAGENT_CONFIG environment variable.
    """
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    changeState = ChangeState(config)

    myReport = Report()
    myReport.unpersist(os.path.join(getWMBASE(), "test/python/WMComponent_t/JobAccountant_t/fwjrs/LoadTest00.pkl"))

    for _ in range(500):
        jobs = createJobs()
        changeState.recordInCouch(jobs, "created", "new")
        changeState.recordInCouch(jobs, "executing", "created")
        changeState.recordInCouch(jobs, "complete", "executing")

        # Attach the report only for the transition into "success"
        for job in jobs:
            job["fwjr"] = myReport

        changeState.recordInCouch(jobs, "success", "complete")

        for job in jobs:
            job["fwjr"] = None

        changeState.recordInCouch(jobs, "cleanout", "success")
    return
Example #11
0
    def loadJobReport(self, parameters):
        """
        _loadJobReport_

        Given a framework job report on disk, load it and return a
        FwkJobReport instance.  If the path is missing, does not exist, points
        to an empty file or cannot be loaded, the result of
        self.createMissingFWKJR() is returned instead, with an error code that
        identifies the problem (99999, 99998 or 99997).

        :param parameters: mapping that carries the report location under
            the "fwjr_path" key
        :return: the loaded Report, or the replacement built by
            self.createMissingFWKJR()
        """
        # The jobReportPath may be prefixed with "file://" which needs to be
        # removed so it doesn't confuse the FwkJobReport() parser.
        jobReportPath = parameters.get("fwjr_path", None)
        if not jobReportPath:
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty")

        jobReportPath = jobReportPath.replace("file://","")
        if not os.path.exists(jobReportPath):
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999, 'Cannot find file in jobReport path: %s' % jobReportPath)

        if os.path.getsize(jobReportPath) == 0:
            logging.error("Empty FwkJobReport: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99998, 'jobReport of size 0: %s ' % jobReportPath)

        jobReport = Report()

        try:
            jobReport.load(jobReportPath)
        except Exception as ex:
            # "except ... as" is valid on both Python 2.6+ and Python 3
            msg =  "Error loading jobReport %s\n" % jobReportPath
            msg += str(ex)
            logging.error(msg)
            logging.debug("Failing job: %s\n" % parameters)
            return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport')

        # Hand the successfully loaded report back to the caller
        return jobReport
Example #12
0
    def testMultipleInputs(self):
        """
        _testMultipleInputs_

        Verify that parsing XML reports with multiple inputs works correctly.

        unittest assertion methods are used instead of bare ``assert``
        statements so that the checks still run under ``python -O``.
        """
        xmlPath = os.path.join(getTestBase(),
                               "WMCore_t/FwkJobReport_t/CMSSWMultipleInput.xml")
        myReport = Report("cmsRun1")
        myReport.parse(xmlPath)

        self.assertTrue(hasattr(myReport.data.cmsRun1.input, "source"),
                        "Error: Report missing input source.")

        inputFiles = myReport.getInputFilesFromStep("cmsRun1")

        self.assertEqual(len(inputFiles), 2,
                         "Error: Wrong number of input files.")

        for inputFile in inputFiles:
            # Properties shared by both input files
            self.assertEqual(inputFile["input_type"], "primaryFiles",
                             "Error: Wrong input type.")
            self.assertEqual(inputFile["module_label"], "source",
                             "Error: Module label is wrong")
            self.assertEqual(inputFile["catalog"], "trivialcatalog_file:/uscmst1/prod/sw/cms/SITECONF/T1_US_FNAL/PhEDEx/storage.xml?protocol=dcap",
                             "Error: Catalog is wrong.")
            self.assertEqual(inputFile["events"], 2,
                             "Error: Wrong number of events.")
            self.assertEqual(inputFile["input_source_class"], "PoolSource",
                             "Error: Wrong input source class.")

            # Per-file expectations, selected by GUID
            if inputFile["guid"] == "F0875ECD-3347-DF11-9FE0-003048678A80":
                expectedLfn = "/store/backfill/2/unmerged/WMAgentCommissioining10/MinimumBias/RECO/rereco_GR10_P_V4_All_v1/0000/F0875ECD-3347-DF11-9FE0-003048678A80.root"
                expectedPfn = "dcap://cmsdca3.fnal.gov:24142/pnfs/fnal.gov/usr/cms/WAX/11/store/backfill/2/unmerged/WMAgentCommissioining10/MinimumBias/RECO/rereco_GR10_P_V4_All_v1/0000/F0875ECD-3347-DF11-9FE0-003048678A80.root"
                expectedLumi = 1
            else:
                self.assertEqual(inputFile["guid"], "626D74CE-3347-DF11-9363-0030486790C0",
                                 "Error: Wrong guid.")
                expectedLfn = "/store/backfill/2/unmerged/WMAgentCommissioining10/MinimumBias/RECO/rereco_GR10_P_V4_All_v1/0000/626D74CE-3347-DF11-9363-0030486790C0.root"
                expectedPfn = "dcap://cmsdca3.fnal.gov:24142/pnfs/fnal.gov/usr/cms/WAX/11/store/backfill/2/unmerged/WMAgentCommissioining10/MinimumBias/RECO/rereco_GR10_P_V4_All_v1/0000/626D74CE-3347-DF11-9363-0030486790C0.root"
                expectedLumi = 2

            self.assertEqual(inputFile["lfn"], expectedLfn,
                             "Error: Input LFN is wrong.")
            self.assertEqual(inputFile["pfn"], expectedPfn,
                             "Error: Input PFN is wrong.")
            self.assertEqual(len(inputFile["runs"]), 1,
                             "Error: Wrong number of runs.")
            onlyRun = list(inputFile["runs"])[0]
            self.assertEqual(onlyRun.run, 124216,
                             "Error: Wrong run number.")
            self.assertTrue(expectedLumi in onlyRun,
                            "Error: Wrong lumi sections in input file.")

        return
Example #13
0
    def testB_EmulatorTest(self):
        """
        _EmulatorTest_

        Create the test workload, build its WMBS components, unpack them,
        run the jobs and then validate the job report that was written to
        the task space.
        """
        # The set-up helpers are taken to work; the original notes that
        # they are exercised by testA
        workloadName = 'basicWorkload'
        workload = self.createTestWorkload(workloadName = workloadName)
        self.createWMBSComponents(workload = workload)
        self.unpackComponents(workload = workload)
        self.runJobs(workload = workload)

        # Load the report produced by the job
        reportFile = os.path.join(self.testDir, 'unpack/ReReco/job/WMTaskSpace', 'Report.0.pkl')
        jobReport = Report()
        jobReport.load(reportFile)
        reportData = jobReport.data
        cmsRunSection = reportData.cmsRun1

        # Site and host information
        localHost = socket.gethostname()
        self.assertEqual(reportData.ceName, localHost)
        self.assertEqual(reportData.seName, 'cmssrm.fnal.gov')
        self.assertEqual(reportData.siteName, 'T1_US_FNAL')
        self.assertEqual(reportData.hostName, localHost)
        self.assertTrue(reportData.completed)

        # Status 0 is expected for the emulator job
        self.assertEqual(cmsRunSection.status, 0)

        # Exactly one output module
        self.assertEqual(cmsRunSection.outputModules, ['TestOutputModule'])

        # One file each for input and output
        self.assertEqual(cmsRunSection.input.PoolSource.files.fileCount, 1)
        self.assertEqual(cmsRunSection.output.TestOutputModule.files.fileCount, 1)

        return
Example #14
0
    def createReport(self):
        """
        Build a fresh report that holds a single 'cmsRun1' step

        """
        testReport = Report()
        testReport.addStep('cmsRun1')
        return testReport
Example #15
0
 def testASONoNameChange(self):
     """
     With the fake transfer worker's fail probability set to 0 and
     preserveLFN enabled, every file reference of step 'cmsRun1' must still
     contain 'store/temp' in its LFN after the round trip.
     """
     AsyncStageOut_t.FakeTransferWorker.setFailProbability(0)
     roundTrippedJob = self.roundtripHelper(preserveLFN = True)
     jobReport = Report('cmsRun1')
     jobReport.unpersist(roundTrippedJob['fwjr_path'])
     fileRefs = jobReport.getAllFileRefsFromStep(step = 'cmsRun1')
     for fileRef in fileRefs:
         # str.find() yields -1 when the fragment is absent
         position = fileRef.lfn.find('store/temp')
         self.assertNotEqual(position, -1,
                             "The lfn should still have store/temp: %s" % fileRef.lfn)
Example #16
0
    def isReady(self, job, cooloffType):
        """
        Decide whether a job sitting in a cooloff state may be retried now.

        The job is ready once more than
        ``baseTimeout * retry_count ** 2`` seconds have passed since
        ``job['state_time']``.  When that time is up but the pause count has
        been reached, the job is moved to the matching paused state instead
        and False is returned.

        :param job: job mapping; 'jobType', 'state', 'retry_count' and
            'state_time' are read, plus 'cache_dir' when exit codes are
            configured for a job in 'jobcooloff'
        :param cooloffType: cooloff type; its lower-cased form selects the
            base timeout (10 when not configured)
        :return: True if the job should be retried, False otherwise
        """
        # Default pause count for this job type
        pauseCount = self.getAlgoParam(job['jobType'], param='pauseCount', defaultReturn=3)

        pauseMap = {
            'createcooloff': 'createpaused',
            'submitcooloff': 'submitpaused',
            'jobcooloff': 'jobpaused'
            }

        # A pause count configured for the job's exit code takes precedence
        if job['state'] == 'jobcooloff':
            exitCodes = self.getAlgoParam(job['jobType'], 'retryErrorCodes', {})
            if exitCodes:
                report = Report()
                reportPath = os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count'])
                try:
                    report.load(reportPath)
                    jobExitCode = report.getExitCode()
                    # NOTE(review): the original also set retryByTimeOut = True
                    # here, but that value was always overwritten by the
                    # timeout check below before being read, so the dead
                    # store was removed.  Confirm that an immediate retry
                    # was not the intent.
                    if jobExitCode in exitCodes:
                        pauseCount = exitCodes[jobExitCode]
                except IOError as ex:
                    msg = "Error loading report %s\n" % (reportPath)
                    msg += str(ex)
                    logging.warning(msg)

        # Squared back-off: the wait grows with the square of the retry count
        baseTimeoutDict = self.getAlgoParam(job['jobType'])
        baseTimeout = baseTimeoutDict.get(cooloffType.lower(), 10)
        cooloffTime = baseTimeout * pow(job['retry_count'], 2)
        if self.timestamp() - job['state_time'] <= cooloffTime:
            # Still cooling off
            return False

        # pauseCount == 0 is tested first so the modulo never divides by zero
        reachedPauseCount = pauseCount == 0 or (
            job['retry_count'] > 0 and not job['retry_count'] % pauseCount)
        if reachedPauseCount:
            # Pause the job instead of retrying it
            self.changer.propagate(job, pauseMap[job['state']], job['state'], updatesummary=True)
            return False

        return True
Example #17
0
    def createMissingFWKJR(self, errorCode=999, errorDescription='Failure of unknown type'):
        """
        _createMissingFWJR_

        Build a replacement FWJR for the case where the real report cannot
        be found at its path: a new Report with one error recorded on
        "cmsRun1" and that step's status set to "Failed".
        """
        replacement = Report()
        replacement.addError("cmsRun1", 84, errorCode, errorDescription)
        replacement.data.cmsRun1.status = "Failed"
        return replacement
def something():
    """
    Load the MergeSuccess.pkl report from the JobAccountant test fixtures
    and set its acquisition/processing information, global tag and valid
    status.  The modified report is neither saved nor returned.
    """
    fixturePath = os.path.join(WMCore.WMBase.getTestBase(),
                               "WMComponent_t/JobAccountant_t/fwjrs",
                               "MergeSuccess.pkl")
    mergeReport = Report()
    mergeReport.load(fixturePath)
    mergeReport.setAcquisitionProcessing("IansMagicMushroomSoup", 9, "T0Test-AnalyzeThisAndGetAFreePhD-PreScaleThingy10")
    mergeReport.setGlobalTag("GT:Super")
    mergeReport.setValidStatus("Production")
Example #19
0
 def testCPBackendStageOutAgainstReportFailedStepOld(self):
     """
     Rewrite the cmsRun1 report with status 1, run the StageOut executor
     and check that neither 'hosts' file exists under the test directory.
     """
     reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')
     stepReport = Report('cmsRun1')
     stepReport.unpersist(reportPath)
     stepReport.data.cmsRun1.status = 1
     stepReport.persist(reportPath)

     stageOut = StageOutExecutor.StageOut()
     stageOut.initialise(self.stepdata, self.job)
     self.setLocalOverride(self.stepdata)
     stageOut.step = self.stepdata
     stageOut.execute()

     unexpectedFiles = (os.path.join(self.testDir, 'hosts'),
                        os.path.join(self.testDir, 'test1', 'hosts'))
     for unexpectedFile in unexpectedFiles:
         self.assertFalse(os.path.exists(unexpectedFile))
Example #20
0
 def testCPBackendLogArchiveAgainstReportNew(self):
     """
     Rewrite the cmsRun1 report with status 0, run the LogArchive executor
     with the newLogArchive override enabled and check that both 'hosts'
     files exist under the test directory.
     """
     reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')
     stepReport = Report()
     stepReport.unpersist(reportPath)
     stepReport.data.cmsRun1.status = 0
     stepReport.persist(reportPath)

     logArchive = LogArchiveExecutor.LogArchive()
     logArchive.initialise(self.stepdata, self.job)
     self.setLocalOverride(self.stepdata)
     self.stepdata.override.newLogArchive = True
     logArchive.step = self.stepdata
     logArchive.execute()

     expectedFiles = (os.path.join(self.testDir, 'hosts'),
                      os.path.join(self.testDir, 'test1', 'hosts'))
     for expectedFile in expectedFiles:
         self.assertTrue(os.path.exists(expectedFile))
    def isReady(self, job, cooloffType):
        """
        Decide whether a job in cooloff may be retried.

        This implementation always answers True.  Its effect is on the
        job's retry count: when the job report shows a run time above
        self.maxRunTime, or an exit code listed in self.exitCodes, the
        retry count is raised to at least self.maxRetries - 1 and the job
        is saved, so the job is only allowed one more attempt.

        :param job: job object; 'cache_dir' and 'retry_count' are read,
            'retry_count' may be updated and job.save() called
        :param cooloffType: cooloff type; 'create' and 'submit' are
            resubmitted without inspecting the report
        :return: True
        """
        if cooloffType in ('create', 'submit'):
            # Can't really do anything with these: resubmit
            return True

        # Load the job report to get at the errors of the actual job
        try:
            report     = Report()
            reportPath = os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count'])
            report.load(reportPath)
        except Exception:
            # The FWJR could not be read (e.g. it does not exist).
            # Give up and run the job again.  "except Exception" rather
            # than a bare "except" so that KeyboardInterrupt and SystemExit
            # are not swallowed.
            return True

        # Set when the job is only allowed one more attempt
        oneMore = False

        # Find startTime, stopTime
        times = report.getFirstStartLastStop()
        startTime = times['startTime']
        stopTime  = times['stopTime']

        if startTime is None or stopTime is None:
            # Something is wrong with this job, but there is no way to tell
            # what.  Rerun, and hope the times get written the next time.
            logging.error("No start, stop times for steps")
            return True

        if stopTime - startTime > self.maxRunTime:
            logging.error("Job only allowed to run one more time due to ProcessingAlgo.maxRunTime")
            oneMore = True

        if report.getExitCode() in self.exitCodes:
            logging.error("Job only allowed to run one more time due to ProcessingAlgo.exitCodes")
            oneMore = True

        # Raise the retry count so that only one attempt is left
        if oneMore:
            job['retry_count'] = max(self.maxRetries - 1, job['retry_count'])
            # The job is modified in place; save() is called to store it
            job.save()

        return True
Example #22
0
    def testXMLParsing(self):
        """
        _testParsing_

        Parse the CMSSW XML report at self.xmlPath and run the input, RECO
        output and ALCA output verifications on the result.
        """
        parsedReport = Report("cmsRun1")
        parsedReport.parse(self.xmlPath)

        for verify in (self.verifyInputData, self.verifyRecoOutput, self.verifyAlcaOutput):
            verify(parsedReport)

        return
Example #23
0
    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout.

        For every job the pickled report of the current retry is loaded and
        attached under the "fwjr" key; when it cannot be loaded, a new report
        with a "NoJobReport" error (code 99303) is written in its place.  The
        report paths are then registered and the jobs moved from 'executing'
        to 'jobfailed' inside one transaction.

        :param failedJobs: collection of failed jobs; nothing is done when
            it is empty
        :return: None
        """
        if not failedJobs:
            return

        jrBinds = []
        for job in failedJobs:
            # Make sure the job object goes packed with fwjr_path to be persisted in couch
            jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle
                logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
                # Start from a clean report rather than one a failed load may have touched
                fwjr = Report()
                msg = "The job failed due to a timeout, unfortunately the original job report was lost"
                fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
                fwjr.save(jrPath)
            job["fwjr"] = fwjr

        # current_thread() replaces the deprecated currentThread() alias
        myThread = threading.current_thread()
        myThread.transaction.begin()
        self.setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True)
        self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return
Example #24
0
    def testWithEventsXMLParsing(self):
        """
        _testParsing_

        Parse the CMSSW XML report at self.withEventsXmlPath and run the
        input verification plus the RECO and ALCA output verifications, the
        latter two with hasEventCounts enabled.
        """
        parsedReport = Report("cmsRun1")
        parsedReport.parse(self.withEventsXmlPath)

        self.verifyInputData(parsedReport)
        for verifyOutput in (self.verifyRecoOutput, self.verifyAlcaOutput):
            verifyOutput(parsedReport, hasEventCounts=True)

        return
Example #25
0
    def testBadXMLParsing(self):
        """
        _testBadXMLParsing_

        Parsing the malformed XML at self.badxmlPath must raise a
        FwkJobReportException, and the report must afterwards hold a
        'BadFWJRXML' error with exit code 50115 for step "cmsRun1".
        """
        from WMCore.FwkJobReport.Report import FwkJobReportException

        brokenReport = Report("cmsRun1")
        self.assertRaises(FwkJobReportException, brokenReport.parse, self.badxmlPath)

        recordedType = brokenReport.getStepErrors("cmsRun1")['error0'].type
        self.assertEqual(recordedType, 'BadFWJRXML')
        recordedExitCode = brokenReport.getStepErrors("cmsRun1")['error0'].exitCode
        self.assertEqual(recordedExitCode, 50115)
        return
Example #26
0
    def testExecutorDoesntDetonate(self):
        """
        Rewrite the cmsRun1 report with status 1, run the LogArchive
        executor and check that neither 'hosts' file exists under the test
        directory.
        """
        reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')
        stepReport = Report()
        stepReport.unpersist(reportPath)
        stepReport.data.cmsRun1.status = 1
        stepReport.persist(reportPath)

        logArchive = LogArchiveExecutor.LogArchive()
        logArchive.initialise(self.stepdata, self.job)
        self.setLocalOverride(self.stepdata)
        logArchive.step = self.stepdata
        logArchive.execute()

        unexpectedFiles = (os.path.join(self.testDir, 'hosts'),
                           os.path.join(self.testDir, 'test1', 'hosts'))
        for unexpectedFile in unexpectedFiles:
            self.assertFalse(os.path.exists(unexpectedFile))
        return
Example #27
0
    def testErrorReporting(self):
        """
        _testErrorReporting_

        Verify that errors are correctly transfered from the XML report to the
        python report.
        """
        cmsException = \
"""cms::Exception caught in cmsRun
---- EventProcessorFailure BEGIN
EventProcessingStopped
---- ScheduleExecutionFailure BEGIN
ProcessingStopped
---- NoRecord BEGIN
No "CastorDbRecord" record found in the EventSetup.
 Please add an ESSource or ESProducer that delivers such a record.
cms::Exception going through module CastorRawToDigi/castorDigis run: 121849 lumi: 1 event: 23
---- NoRecord END
Exception going through path raw2digi_step
---- ScheduleExecutionFailure END
an exception occurred during current event processing
cms::Exception caught in EventProcessor and rethrown
---- EventProcessorFailure END"""

        xmlPath = os.path.join(getTestBase(),
                               "WMCore_t/FwkJobReport_t/CMSSWFailReport.xml")

        myReport = Report("cmsRun1")
        myReport.parse(xmlPath)

        assert hasattr(myReport.data.cmsRun1, "errors"), \
               "Error: Error section missing."
        assert getattr(myReport.data.cmsRun1.errors, "errorCount") == 1, \
               "Error: Error count is wrong."
        assert hasattr(myReport.data.cmsRun1.errors, "error0"), \
               "Error: Error0 section is missing."
        assert myReport.data.cmsRun1.errors.error0.type == "CMSException", \
               "Error: Wrong error type."
        assert myReport.data.cmsRun1.errors.error0.exitCode == "8001", \
               "Error: Wrong exit code."
        assert myReport.data.cmsRun1.errors.error0.details == cmsException, \
               "Error: Error details are wrong:\n|%s|\n|%s|" % (myReport.data.cmsRun1.errors.error0.details,
                                                               cmsException)

        # Test getStepErrors
        self.assertEqual(myReport.getStepErrors("cmsRun1")['error0'].type, "CMSException")

        return
Example #28
0
 def makeReport(self, fileName):
     """
     Build a report named 'oneitem' with a 'stageOut1' step, two output
     modules and three output files, and persist it to fileName.
     """
     stageOutReport = Report('oneitem')
     stageOutReport.addStep('stageOut1')
     for moduleName in ('module1', 'module2'):
         stageOutReport.addOutputModule(moduleName)
     # (output module, LFN) for each file; size and events are always 1
     outputFiles = (('module1', 'FILE1'),
                    ('module2', 'FILE2'),
                    ('module2', 'FILE3'))
     for moduleName, lfn in outputFiles:
         stageOutReport.addOutputFile(moduleName, {'lfn': lfn, 'size' : 1, 'events' : 1})
     stageOutReport.persist( fileName )
Example #29
0
    def loadJobReport(self, parameters):
        """
        _loadJobReport_

        Load the framework job report named by parameters["fwjr_path"] and
        return it.  When the path is empty, does not exist, points to an
        empty file, cannot be loaded or yields a report without steps, the
        result of self.createMissingFWKJR() is returned instead, with an
        error code that identifies the problem (99999, 99998 or 99997).
        """
        rawPath = parameters.get("fwjr_path", None)
        if not rawPath:
            logging.error("Bad FwkJobReport Path: %s" % rawPath)
            return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty")

        # A leading "file://" would confuse the report loader, so drop it
        reportPath = rawPath.replace("file://", "")

        if not os.path.exists(reportPath):
            logging.error("Bad FwkJobReport Path: %s" % reportPath)
            notFound = 'Cannot find file in jobReport path: %s' % reportPath
            return self.createMissingFWKJR(parameters, 99999, notFound)

        if os.path.getsize(reportPath) == 0:
            logging.error("Empty FwkJobReport: %s" % reportPath)
            emptyFile = 'jobReport of size 0: %s ' % reportPath
            return self.createMissingFWKJR(parameters, 99998, emptyFile)

        loadedReport = Report()
        try:
            loadedReport.load(reportPath)
        except Exception as loadError:
            logging.error(("Error loading jobReport %s\n" % reportPath) + str(loadError))
            logging.debug("Failing job: %s\n" % parameters)
            return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport')

        if len(loadedReport.listSteps()) == 0:
            logging.error("FwkJobReport with no steps: %s" % reportPath)
            noSteps = 'jobReport with no steps: %s ' % reportPath
            return self.createMissingFWKJR(parameters, 99997, noSteps)

        return loadedReport
Example #30
0
    def testErrorReporting(self):
        """
        _testErrorReporting_

        Verify that errors are correctly transferred from the XML report to
        the python report.
        """
        # Expected error details, one entry per line of the message
        cmsException = "\n".join([
            "cms::Exception caught in cmsRun",
            "---- EventProcessorFailure BEGIN",
            "EventProcessingStopped",
            "---- ScheduleExecutionFailure BEGIN",
            "ProcessingStopped",
            "---- NoRecord BEGIN",
            'No "CastorDbRecord" record found in the EventSetup.',
            " Please add an ESSource or ESProducer that delivers such a record.",
            "cms::Exception going through module CastorRawToDigi/castorDigis run: 121849 lumi: 1 event: 23",
            "---- NoRecord END",
            "Exception going through path raw2digi_step",
            "---- ScheduleExecutionFailure END",
            "an exception occurred during current event processing",
            "cms::Exception caught in EventProcessor and rethrown",
            "---- EventProcessorFailure END",
        ])

        xmlPath = os.path.join(getTestBase(),
                               "WMCore_t/FwkJobReport_t/CMSSWFailReport.xml")

        myReport = Report("cmsRun1")
        myReport.parse(xmlPath)

        # unittest assertions are used instead of bare assert statements so
        # the checks are not stripped when python runs with -O.
        self.assertTrue(hasattr(myReport.data.cmsRun1, "errors"),
                        "Error: Error section missing.")
        errors = myReport.data.cmsRun1.errors
        self.assertEqual(errors.errorCount, 1,
                         "Error: Error count is wrong.")
        self.assertTrue(hasattr(errors, "error0"),
                        "Error: Error0 section is missing.")
        error0 = errors.error0
        self.assertEqual(error0.type, "CMSException",
                         "Error: Wrong error type.")
        self.assertEqual(error0.exitCode, 8001,
                         "Error: Wrong exit code.")
        self.assertEqual(error0.details, cmsException,
                         "Error: Error details are wrong:\n|%s|\n|%s|" % (error0.details,
                                                                          cmsException))

        # Test getStepErrors
        self.assertEqual(myReport.getStepErrors("cmsRun1")['error0'].type, "CMSException")

        return
Example #31
0
    def testTaskJobID(self):
        """
        _testTaskJobID_

        A fresh report exposes neither a task name nor a job ID; once they
        are set, the getters hand the same values back.
        """
        taskName = 'silly'
        jobID = 100

        report = Report('fake')

        # Nothing has been set yet
        self.assertEqual(report.getTaskName(), None)
        self.assertEqual(report.getJobID(), None)

        # Set both values and read them back
        report.setTaskName(taskName)
        report.setJobID(jobID)
        self.assertEqual(report.getTaskName(), taskName)
        self.assertEqual(report.getJobID(), jobID)

        return
Example #32
0
    def testExitCode(self):
        """
        _testExitCode_

        Check the exit code reported by a report, both overall and for a
        single step.

        Note: this test expects an error added without an exit code to be
        reported as 99999.
        """
        stepName = "cmsRun1"
        report = Report(stepName)

        # No errors yet
        self.assertEqual(report.getExitCode(), 0)

        # An error that carries no exit code
        report.addError(stepName=stepName, exitCode=None,
                        errorType="test", errorDetails="test")
        self.assertEqual(report.getExitCode(), 99999)
        self.assertEqual(report.getStepExitCode(stepName=stepName), 99999)

        # An error whose exit code is given as a string
        report.addError(stepName=stepName, exitCode='12345',
                        errorType="test", errorDetails="test")
        self.assertEqual(report.getExitCode(), 12345)
        self.assertEqual(report.getStepExitCode(stepName=stepName), 12345)
Example #33
0
    def testTaskSuccessful(self):
        """
        _testTaskSuccessful_

        Check the task success flag of a parsed report, with and without
        ignoring the cmsRun steps.
        """
        parsedReport = Report("cmsRun1")
        parsedReport.parse(self.xmlPath)

        # Taken as a whole, the task is expected to be unsuccessful
        overallOutcome = parsedReport.taskSuccessful()
        self.assertFalse(overallOutcome)

        # With steps matching 'cmsRun' ignored, the task is expected
        # to be successful
        outcomeIgnoringCmsRun = parsedReport.taskSuccessful(ignoreString='cmsRun')
        self.assertTrue(outcomeIgnoringCmsRun)
        return
Example #34
0
    def execute(self, emulator=None):
        """
        _execute_

        Walk over the reports of the other steps in the task space and
        upload (via self.httpPost) every analysis file whose FileClass is
        "DQM".  Steps without a Report.pkl, and steps that were not
        successful, are skipped.

        If an emulator is given, the work is delegated to
        emulator.emulate(self.step, self.job) and its result is returned.
        """
        # Are we using emulators again?
        if emulator is not None:
            return emulator.emulate(self.step, self.job)

        if self.step.upload.proxy:
            try:
                self.stepSpace.getFromSandbox(self.step.upload.proxy)
            except Exception as ex:
                # Best effort: if it wasn't in the sandbox it must be
                # somewhere else.  Carry on, but leave a trace in the log
                # instead of discarding the exception silently.
                logging.debug("Could not get proxy %s from the sandbox: %s",
                              self.step.upload.proxy, ex)

        # Search through steps for analysis files
        for step in self.stepSpace.taskSpace.stepSpaces():
            if step == self.stepName:
                # Don't try to parse your own report; it's not there yet
                continue
            stepLocation = os.path.join(self.stepSpace.taskSpace.location,
                                        step)
            logging.info("Beginning report processing for step %s", step)
            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s",
                              step, stepLocation)
                continue

            # First, get everything from a file and 'unpersist' it
            stepReport = Report()
            stepReport.unpersist(reportLocation, step)

            # Don't upload nor stage out files from bad steps.
            if not stepReport.stepSuccessful(step):
                continue

            # Pulling out the analysis files from each step
            analysisFiles = stepReport.getAnalysisFilesFromStep(step)

            # Working on analysis files
            for analysisFile in analysisFiles:
                # only deal with DQM files
                if analysisFile.FileClass == "DQM":
                    # uploading file to the server
                    self.httpPost(
                        os.path.join(stepLocation,
                                     os.path.basename(analysisFile.fileName)))

            # Done with the report: write it back to where it came from
            stepReport.persist(reportLocation)

        return
Example #35
0
    def submit(self, jobs, info=None):
        """
        _submit_

        Hand the given jobs to the condor schedd in batches of
        self.jobsPerSubmit.

        Returns a (successfulJobs, failedJobs) tuple.  Jobs of a batch whose
        submission raised get an error report attached under 'fwjr'; jobs of
        a batch that went through get their 'gridid' and 'status' set.
        """
        submitted = []
        rejected = []

        if not len(jobs):
            # Nothing to submit
            return submitted, rejected

        schedd = htcondor.Schedd()

        for batch in grouper(jobs, self.jobsPerSubmit):

            clusterAd = self.getClusterAd()
            procAds = self.getProcAds(batch)

            logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
            try:
                # The 4th argument must be None, otherwise HTCondor leaks
                # the result ads through it (as of 8.7.x), see WMCore/#8729
                clusterId = schedd.submitMany(clusterAd, procAds, False, None)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.exception(str(ex))
                logging.error("Moving on the the next batch of jobs and/or cycle....")

                # One error report is shared by every job of the failed batch
                errorReport = Report()
                errorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
                for failedJob in batch:
                    failedJob['fwjr'] = errorReport
                    rejected.append(failedJob)
            else:
                logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
                for procId, goodJob in enumerate(batch):
                    goodJob['gridid'] = "%s.%s" % (clusterId, procId)
                    goodJob['status'] = 'Idle'
                    submitted.append(goodJob)

        # Callers expect both the submitted and the failed jobs back
        logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
        return submitted, rejected
Example #36
0
    def submit(self, jobs, info=None):
        """
        _submit_

        Hand the given jobs to the condor schedd in batches of
        self.jobsPerSubmit, each batch being queued inside its own schedd
        transaction.

        Returns a (successfulJobs, failedJobs) tuple.  Jobs of a batch whose
        submission raised get an error report attached under 'fwjr'; jobs of
        a batch that went through get their 'gridid' and 'status' set.
        """
        submitted = []
        rejected = []

        if not len(jobs):
            # Nothing to submit
            return submitted, rejected

        schedd = htcondor.Schedd()

        for batch in grouper(jobs, self.jobsPerSubmit):

            sub, jobParams = self.createSubmitRequest(batch)

            logging.debug("Start: Submitting %d jobs using Condor Python Submit", len(jobParams))
            try:
                with schedd.transaction() as txn:
                    queued = sub.queue_with_itemdata(txn, 1, iter(jobParams))
                    clusterId = queued.cluster()
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.exception(str(ex))
                logging.error("Moving on the the next batch of jobs and/or cycle....")

                # One error report is shared by every job of the failed batch
                errorReport = Report()
                errorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
                for failedJob in batch:
                    failedJob['fwjr'] = errorReport
                    rejected.append(failedJob)
            else:
                logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
                for procId, goodJob in enumerate(batch):
                    goodJob['gridid'] = "%s.%s" % (clusterId, procId)
                    goodJob['status'] = 'Idle'
                    submitted.append(goodJob)

        # Callers expect both the submitted and the failed jobs back
        logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
        return submitted, rejected
Example #37
0
    def testPileupFiles(self):
        """
        _testPileupFiles_

        Test that all the pileup files end up in the report
        """
        pileupReport = Report("cmsRun1")
        pileupReport.parse(self.pileupXmlPath)
        self.assertEqual(len(pileupReport.getAllInputFiles()), 14)

        # Tally the input files per input type; other types are not counted
        tally = {'primaryFiles': 0, 'secondaryFiles': 0, 'mixingFiles': 0}
        for inputFile in pileupReport.getAllInputFiles():
            inputType = inputFile['input_type']
            if inputType in tally:
                tally[inputType] += 1

        self.assertEqual(tally['primaryFiles'], 1)
        self.assertEqual(tally['secondaryFiles'], 0)
        self.assertEqual(tally['mixingFiles'], 13)
        self.assertEqual(len(pileupReport.getAllFallbackFiles()), 1)

        return
Example #38
0
    def submit(self, jobs, info=None):
        """
        _submit_

        Submit jobs for one subscription to the condor schedd, in batches
        of self.jobsPerSubmit.

        Returns a (successfulJobs, failedJobs) tuple.  Jobs of a batch whose
        submission raised get an error report attached under 'fwjr'; jobs of
        a batch that went through get their 'gridid' and 'status' set.
        """
        successfulJobs = []
        failedJobs = []

        if len(jobs) == 0:
            # Then we have nothing to do
            return successfulJobs, failedJobs

        schedd = htcondor.Schedd()

        # Submit the jobs
        for jobsReady in grouper(jobs, self.jobsPerSubmit):

            clusterAd = self.getClusterAd()
            procAds = self.getProcAds(jobsReady)

            logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
            try:
                # Pass spool=False and ad_results=None explicitly: relying on
                # the default for the 4th argument lets HTCondor leak the
                # result ads through it (seen as of 8.7.x, WMCore/#8729).
                clusterId = schedd.submitMany(clusterAd, procAds, False, None)
            except Exception as ex:
                logging.error("SimpleCondorPlugin job submission failed.")
                logging.error("Moving on the the next batch of jobs and/or cycle....")
                logging.exception(ex)

                # One error report is shared by every job of the failed batch
                condorErrorReport = Report()
                condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
                for job in jobsReady:
                    job['fwjr'] = condorErrorReport
                    failedJobs.append(job)
            else:
                logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
                for index, job in enumerate(jobsReady):
                    job['gridid'] = "%s.%s" % (clusterId, index)
                    job['status'] = 'Idle'
                    successfulJobs.append(job)

        # We must return a list of jobs successfully submitted and a list of jobs failed
        logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
        return successfulJobs, failedJobs
Example #39
0
    def testBadXMLParsing(self):
        """
        _testBadXMLParsing_

        Parsing a malformed CMSSW XML report must raise a
        FwkJobReportException and leave a 'BadFWJRXML' error with exit code
        50115 in the cmsRun1 step of the report.
        """
        myReport = Report("cmsRun1")
        from WMCore.FwkJobReport.Report import FwkJobReportException

        with self.assertRaises(FwkJobReportException):
            myReport.parse(self.badxmlPath)

        # The failed parse is expected to have recorded an error in the step
        recordedError = myReport.getStepErrors("cmsRun1")['error0']
        self.assertEqual(recordedError.type, 'BadFWJRXML')
        recordedError = myReport.getStepErrors("cmsRun1")['error0']
        self.assertEqual(recordedError.exitCode, 50115)
        return
Example #40
0
    def testOutputCheck(self):
        """
        _testOutputCheck_

        A report without output files must be flagged as failed by
        checkForOutputFiles, with exit code 60450.
        """
        stepName = "cmsRun1"

        badReport = Report(stepName)
        badReport.parse(self.skippedAllFilesxmlPath)
        badReport.checkForOutputFiles(stepName)

        stepOutcome = badReport.stepSuccessful(stepName=stepName)
        self.assertFalse(stepOutcome)
        self.assertEqual(badReport.getExitCode(), 60450)
        return
Example #41
0
    def testFallbackFiles(self):
        """
        _testFallback_

        Check that files read through a fallback show up in the report
        """
        expectedFallback = [
            '/store/data/Run2012D/SingleElectron/AOD/PromptReco-v1/000/207/279/D43A5B72-1831-E211-895D-001D09F24763.root'
        ]

        # Negative control: a good report lists no fallback files
        goodReport = Report("cmsRun1")
        goodReport.parse(self.xmlPath)
        self.assertEqual(goodReport.getAllFallbackFiles(), [])

        # A report in which the file was read through a fallback
        badReport = Report("cmsRun1")
        badReport.parse(self.fallbackXmlPath)
        fallbackFiles = sorted(badReport.getAllFallbackFiles())
        self.assertEqual(fallbackFiles, expectedFallback)

        return
Example #42
0
    def testDeleteOutputModule(self):
        """
        _testDeleteOutputModule_

        Delete an output module from a step and check that it is gone and
        that the step holds exactly one output module less than before.
        """
        stepName = "cmsRun1"
        moduleName = "outputALCARECORECO"

        originalReport = Report(stepName)
        originalReport.parse(self.xmlPath)

        # Sanity check: the module must be there to begin with
        self.assertTrue(originalReport.getOutputModule(stepName, moduleName),
                        "Error: Report XML doesn't have the module for the test, invalid test")

        modulesBefore = len(originalReport.retrieveStep(stepName).outputModules)
        originalReport.deleteOutputModuleForStep(stepName, moduleName)

        self.assertFalse(originalReport.getOutputModule(stepName, moduleName),
                         "Error: The output module persists after deletion")
        modulesAfter = len(originalReport.retrieveStep(stepName).outputModules)
        self.assertEqual(modulesAfter, modulesBefore - 1,
                         "Error: The number of output modules is incorrect after deletion")
Example #43
0
    def testFallbackFilesJSON(self):
        """
        _testFallbackFilesJSON_

        Check that fallback files make it into the JSON form of the report
        """
        # Negative control: the JSON of a good report lists no fallback files
        goodReport = Report("cmsRun1")
        goodReport.parse(self.xmlPath)
        goodFallback = goodReport.__to_json__(None)['fallbackFiles']
        self.assertEqual(goodFallback, [])

        # The JSON of the fallback report lists exactly one fallback file
        badReport = Report("cmsRun1")
        badReport.parse(self.fallbackXmlPath)
        badFallback = badReport.__to_json__(None)['fallbackFiles']
        self.assertEqual(len(badFallback), 1)

        return
Example #44
0
    def mergeReport(self):
        """
        _mergeReport_

        Read the merge report file found in the working directory into a
        new Report for this step and return it.
        """
        mergedReport = Report(self.stepName)
        reportPath = os.path.join(self.workingDir, self.merge_report_file)
        ReportReader.xmlToJobReport(mergedReport, reportPath)
        return mergedReport
    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        __handleSubmitFailedJobs_

        Form a default job report for the exitCode and register it in each
        job.  The report is also saved in the job cache directory, and the
        failure is propagated to the JobStateMachine.
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            report = Report()
            job['fwjr'] = report

            # Work out the error details first, then record them once
            if exitCode in (71102, 71104):
                details = WM_JOB_ERROR_CODES[exitCode] + ', '.join(job['possibleLocations'])
            elif exitCode in (71101,):
                # there is no possible site
                if "fileLocations" in job:
                    details = (WM_JOB_ERROR_CODES[exitCode] +
                               ": file locations: " + ', '.join(job['fileLocations']) +
                               ": site white list: " + ', '.join(job['siteWhitelist']) +
                               ": site black list: " + ', '.join(job['siteBlacklist']))
                else:
                    # Temporary path for agents patched in place: jobs
                    # created before the patch have no fileLocations set.
                    # TODO. remove this later for new agent
                    details = (WM_JOB_ERROR_CODES[exitCode] +
                               ": Job is created before this patch. Please check this input for the jobs: %s "
                               % report.getAllInputFiles())
            else:
                details = WM_JOB_ERROR_CODES[exitCode]
            report.addError("JobSubmit", exitCode, "SubmitFailed", details)

            reportName = 'Report.%d.pkl' % int(job['retry_count'])
            fwjrPath = os.path.join(job['cache_dir'], reportName)
            report.setJobID(job['id'])
            try:
                report.save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                # The job is still propagated below, just without a bind
                logging.error(
                    "Failed to write FWJR for submit failed job %d, message: %s",
                    job['id'], str(ioer))

        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return
Example #46
0
    def post(self, emulator=None):
        """
        _post_

        Post execution checkpointing: for every other successful step in
        the task space, tag the file references that carry an lfn, a
        location and a guid with the user settings taken from self.step,
        then persist the step report again.

        If an emulator is given, emulator.emulatePost(self.step) is
        returned instead.
        """
        # Another emulator check
        if emulator is not None:
            return emulator.emulatePost(self.step)

        logging.info("Steps.Executors.%s.post called", self.__class__.__name__)

        requiredAttrs = ('lfn', 'location', 'guid')

        for step in self.stepSpace.taskSpace.stepSpaces():

            if step == self.stepName:
                # Our own report does not exist yet, so skip it
                continue

            stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
            logging.info("Beginning report processing for step %s", step)

            reportLocation = os.path.join(stepLocation, 'Report.pkl')
            if not os.path.isfile(reportLocation):
                logging.error("Cannot find report for step %s in space %s",
                              step, stepLocation)
                continue

            # Read the step report back from its pickle
            stepReport = Report(step)
            stepReport.unpersist(reportLocation)

            # Files from bad steps are left alone
            if not stepReport.stepSuccessful(step):
                continue

            for fileRef in stepReport.getAllFileRefsFromStep(step=step):
                if not all(hasattr(fileRef, attr) for attr in requiredAttrs):
                    continue
                fileRef.user_dn = getattr(self.step, "userDN", None)
                fileRef.async_dest = getattr(self.step, "asyncDest", None)
                fileRef.user_vogroup = getattr(self.step, "owner_vogroup", '')
                fileRef.user_vorole = getattr(self.step, "owner_vorole", '')

            stepReport.persist(reportLocation)

        return None
Example #47
0
 def makeReport(self, fileName):
     """
     _makeReport_

     Build a report named 'oneitem' holding a 'stageOut1' step, two output
     modules and three output files, then persist it to fileName.
     """
     outputFiles = (
         ('module1', 'FILE1'),
         ('module2', 'FILE2'),
         ('module2', 'FILE3'),
     )

     myReport = Report('oneitem')
     myReport.addStep('stageOut1')
     for moduleName in ('module1', 'module2'):
         myReport.addOutputModule(moduleName)
     # Every file has the same size and event count; only the lfn differs
     for moduleName, lfn in outputFiles:
         myReport.addOutputFile(moduleName, {'lfn': lfn, 'size': 1, 'events': 1})
     myReport.persist(fileName)
Example #48
0
    def failJobs(self, failedJobs):
        """
        _failJobs_

        Dump those jobs that have failed due to timeout.

        For each job the framework job report is loaded from the job cache
        and attached under 'fwjr'; if it cannot be loaded, a new report
        carrying error 99303 is created and saved in its place.  The report
        paths are then registered and the jobs moved from 'executing' to
        'jobfailed' inside one transaction of the current thread.
        """
        if not failedJobs:
            return

        jrBinds = []
        for job in failedJobs:
            # Make sure the job object goes packed with fwjr_path to be persisted in couch
            jrPath = os.path.join(job.getCache(),
                                  'Report.%i.pkl' % (job['retry_count']))
            jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

            fwjr = Report()
            try:
                fwjr.load(jrPath)
            except Exception:
                # Something went wrong reading the pickle
                logging.error(
                    "The pickle in %s could not be loaded, generating a new one",
                    jrPath)
                fwjr = Report()
                fwjr.addError("NoJobReport", 99303, "NoJobReport",
                              WM_JOB_ERROR_CODES[99303])
                fwjr.save(jrPath)
            job["fwjr"] = fwjr

        # current_thread() replaces the deprecated currentThread() alias
        myThread = threading.current_thread()
        myThread.transaction.begin()
        self.setFWJRAction.execute(binds=jrBinds,
                                   conn=myThread.transaction.conn,
                                   transaction=True)
        self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
        logging.info("Failed %i jobs", len(failedJobs))
        myThread.transaction.commit()

        return
Example #49
0
    def loadJobReport(self, jobReportPath):
        """
        _loadJobReport_

        Load the framework job report found at jobReportPath and hand it
        back.  Whenever the path is empty, does not exist, points to an
        empty file, or the report cannot be loaded or has no steps, the
        outcome of self.createMissingFWKJR() is handed back instead.
        """
        if not jobReportPath:
            logging.error("Bad FwkJobReport Path: %s", jobReportPath)
            return self.createMissingFWKJR(99999, "FWJR path is empty")

        # Any "file://" in the path is removed so that it doesn't confuse
        # the FwkJobReport() parser.
        localPath = jobReportPath.replace("file://", "")

        if not os.path.exists(localPath):
            logging.error("Bad FwkJobReport Path: %s", localPath)
            notFound = 'Cannot find file in jobReport path: %s' % localPath
            return self.createMissingFWKJR(99999, notFound)

        if os.path.getsize(localPath) == 0:
            logging.error("Empty FwkJobReport: %s", localPath)
            emptyFile = 'jobReport of size 0: %s ' % localPath
            return self.createMissingFWKJR(99998, emptyFile)

        report = Report()
        try:
            report.load(localPath)
        except UnicodeDecodeError:
            logging.error("Hit UnicodeDecodeError exception while loading jobReport: %s", localPath)
            undecodable = 'Found undecodable data in jobReport: {}'.format(localPath)
            return self.createMissingFWKJR(99997, undecodable)
        except Exception as err:
            logging.error("Error loading jobReport: {}\nDetails: {}".format(localPath, str(err)))
            return self.createMissingFWKJR(99997, 'Cannot load jobReport')

        if not report.listSteps():
            logging.error("FwkJobReport with no steps: %s", localPath)
            noSteps = 'jobReport with no steps: %s ' % localPath
            return self.createMissingFWKJR(99997, noSteps)

        return report
Example #50
0
def thrashCouch():
    """
    _thrashCouch_

    Load-test helper: run 500 rounds in which a fresh batch of jobs from
    createJobs() is recorded in couch through the state sequence
    new -> created -> executing -> complete -> success -> cleanout.

    The agent configuration is read from the file named by the
    WMAGENT_CONFIG environment variable (a KeyError is raised if it is not
    set).  A single report, unpersisted once from LoadTest00.pkl, is attached
    to every job as job["fwjr"] for the "success" transition and detached
    again before the "cleanout" transition.
    """
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    changeState = ChangeState(config)

    myReport = Report()
    myReport.unpersist(
        os.path.join(
            getWMBASE(),
            "test/python/WMComponent_t/JobAccountant_t/fwjrs/LoadTest00.pkl"))

    for _ in range(500):
        jobs = createJobs()
        # NOTE(review): arguments look like (jobs, newState, oldState) --
        # confirm against ChangeState.recordInCouch.
        changeState.recordInCouch(jobs, "created", "new")
        changeState.recordInCouch(jobs, "executing", "created")
        changeState.recordInCouch(jobs, "complete", "executing")

        # Attach the shared report only for the "success" transition.
        for job in jobs:
            job["fwjr"] = myReport

        changeState.recordInCouch(jobs, "success", "complete")

        for job in jobs:
            job["fwjr"] = None

        changeState.recordInCouch(jobs, "cleanout", "success")
    return
Example #51
0
    def testUnitTestBackendNew(self):
        """
        _testUnitTestBackendNew_

        Set the stored cmsRun1 report status to 1, run the StageOut executor
        with the 'test-win' overrides and the new stage-out override enabled,
        and verify that neither <testDir>/hosts nor <testDir>/test1/hosts
        exists afterwards.
        """
        reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace',
                                  'cmsRun1', 'Report.pkl')
        storedReport = Report()
        storedReport.unpersist(reportPath)
        storedReport.data.cmsRun1.status = 1
        storedReport.persist(reportPath)

        executor = StageOutExecutor.StageOut()
        helper = StageOutTemplate.StageOutStepHelper(self.stepdata)
        overrides = (
            ('command', 'test-win'),
            ('option', ''),
            ('phedex-node', 'charlie.sheen.biz'),
            ('lfn-prefix', 'test-win'),
        )
        for overrideName, overrideValue in overrides:
            helper.addOverride(override=overrideName,
                               overrideValue=overrideValue)
        helper.setNewStageoutOverride(True)

        executor.initialise(self.stepdata, self.job)
        self.setLocalOverride(self.stepdata)
        executor.step = self.stepdata
        executor.execute()

        # Nothing may have been staged out into the test directory.
        for relativePath in (('hosts',), ('test1', 'hosts')):
            target = os.path.join(self.testDir, *relativePath)
            self.assertFalse(os.path.exists(target))
Example #52
0
    def initialise(self, step, job):
        """
        _initialise_

        Initialise the executor attributes for the given step and job.

        Resolves the step name, step space, task and workload, creates a new
        Report for the step (tagged with task name, workload name and job id),
        adds an "execution" section to the step holding the exit status and
        the report location, and sets the step status in the report to 1.
        If the step's emulator section names an emulator, that emulator is
        loaded, initialised with this executor, and emulation mode is enabled.

        :param step: the step to execute; must provide section_() and an
            `emulator` attribute
        :param job: mapping-like job object; only job['id'] is read here
        """
        self.step = step
        self.job = job
        self.stepName = getStepName(self.step)
        self.stepSpace = getStepSpace(self.stepName)
        self.task = self.stepSpace.getWMTask()
        self.workload = self.stepSpace.taskSpace.workload
        self.report = Report(self.stepName)
        self.report.data.task = self.task.name()
        self.report.data.workload = self.stepSpace.taskSpace.workloadName()
        self.report.data.id = job['id']
        self.errorDestination = getStepErrorDestination(self.step)

        self.step.section_("execution")
        self.step.execution.exitStatus = 0
        self.step.execution.reportLocation = "%s/Report.pkl" % (
            self.stepSpace.location,
        )

        # Set overall step status to 1 (failed)
        # NOTE(review): presumably reset once the step succeeds -- that
        # happens outside this method, confirm in the executors.
        self.report.setStepStatus(stepName=self.stepName, status=1)

        #  //
        # //  Does the step contain settings for an emulator?
        # //   If so, load it up

        emulatorName = getattr(self.step.emulator, "emulatorName", None)
        if emulatorName is not None:
            self.emulator = getStepEmulator(emulatorName)
            self.emulator.initialise(self)
            self.emulationMode = True

        return
Example #53
0
    def testUnitTestBackend(self):
        """
        _testUnitTestBackend_

        Set the stored cmsRun1 report status to 1, run the LogArchive
        executor with the 'test-win' overrides, and verify that neither
        <testDir>/hosts nor <testDir>/test1/hosts exists afterwards.
        """
        reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace',
                                  'cmsRun1', 'Report.pkl')
        storedReport = Report()
        storedReport.unpersist(reportPath)
        storedReport.data.cmsRun1.status = 1
        storedReport.persist(reportPath)

        executor = LogArchiveExecutor.LogArchive()
        helper = LogArchiveTemplate.LogArchiveStepHelper(self.stepdata)
        overrides = (
            ('command', 'test-win'),
            ('option', ''),
            ('se-name', 'charlie.sheen.biz'),
            ('lfn-prefix', 'test-win'),
        )
        for overrideName, overrideValue in overrides:
            helper.addOverride(override=overrideName,
                               overrideValue=overrideValue)

        executor.initialise(self.stepdata, self.job)
        self.setLocalOverride(self.stepdata)
        executor.step = self.stepdata
        executor.execute()

        # Nothing may have been archived into the test directory.
        for relativePath in (('hosts',), ('test1', 'hosts')):
            target = os.path.join(self.testDir, *relativePath)
            self.assertFalse(os.path.exists(target))
Example #54
0
    def generateCreateFailedReports(self, createFailedJobs):
        """
        _generateCreateFailedReports_

        Create and store FWJR for the jobs that failed on creation
        leaving meaningful information about what happened with them.

        For every job a Report carrying a "CreationFailure" error (exit code
        99305) is built, using the job's "failedReason" when present, and is
        saved as Report.0.pkl inside the job cache directory.  Jobs whose
        report was saved get it attached as job["fwjr"] and have their report
        path registered through self.setFWJRPath.  Jobs whose report could
        not be saved are logged and skipped (best effort).

        :param createFailedJobs: iterable of job objects providing get(),
            getCache() and item access; nothing is done when it is empty
            or None
        :return: None
        """
        if not createFailedJobs:
            return

        # Fallback reason, used when the job carries no "failedReason".
        defaultMsg = "There is a condition which assures that this job will fail if it's submitted"

        fjrsToSave = []
        for failedJob in createFailedJobs:
            report = Report()
            report.addError("CreationFailure", 99305, "CreationFailure", failedJob.get("failedReason", defaultMsg))
            jobCache = failedJob.getCache()
            try:
                fjrPath = os.path.join(jobCache, "Report.0.pkl")
                report.save(fjrPath)
                fjrsToSave.append({"jobid": failedJob["id"], "fwjrpath": fjrPath})
                failedJob["fwjr"] = report
            except Exception:
                # Deliberately broad: one unsaved report must not stop the
                # remaining jobs.  logging.exception keeps the traceback so
                # the cause is not lost.
                logging.exception("Something went wrong while saving the report for  job %s", failedJob["id"])

        myThread = threading.current_thread()
        self.setFWJRPath.execute(binds=fjrsToSave, conn=myThread.transaction.conn, transaction=True)

        return
Example #55
0
    def _handleSubmitFailedJobs(self, badJobs, exitCode):
        """
        _handleSubmitFailedJobs_

        Build a default job report for the given exitCode, register it in
        each job and preserve it on disk as Report.<retry_count>.pkl in the
        job cache directory.  Finally propagate the failure to the
        JobStateMachine and record the report paths that were written.

        The error details depend on the exit code:
          71102 / 71103 - standard message followed by the possible sites
                          (the site list is also passed as the fifth
                          addError argument)
          71101         - standard message plus the file locations and the
                          site white/black lists, or a note that
                          fileLocations is empty
          anything else - the standard message only
        """
        fwjrBinds = []
        for job in badJobs:
            job['couch_record'] = None
            report = Report()
            job['fwjr'] = report

            details = WM_JOB_ERROR_CODES[exitCode]
            if exitCode in (71102, 71103):
                siteList = ', '.join(job['possibleSites'])
                report.addError("JobSubmit", exitCode, "SubmitFailed",
                                details + siteList, siteList)
            else:
                if exitCode == 71101:
                    # there is no possible site
                    if job.get("fileLocations"):
                        details = (details + ": file locations: " +
                                   ', '.join(job['fileLocations']) +
                                   ": site white list: " +
                                   ', '.join(job['siteWhitelist']) +
                                   ": site black list: " +
                                   ', '.join(job['siteBlacklist']))
                    else:
                        details = details + ', and empty fileLocations'
                report.addError("JobSubmit", exitCode, "SubmitFailed",
                                details)

            reportName = 'Report.%d.pkl' % int(job['retry_count'])
            fwjrPath = os.path.join(job['cache_dir'], reportName)
            report.setJobID(job['id'])
            try:
                report.save(fwjrPath)
                fwjrBinds.append({"jobid": job["id"], "fwjrpath": fwjrPath})
            except IOError as ioer:
                logging.error(
                    "Failed to write FWJR for submit failed job %d, message: %s",
                    job['id'], str(ioer))

        self.changeState.propagate(badJobs, "submitfailed", "created")
        self.setFWJRPathAction.execute(binds=fwjrBinds)
        return
Example #56
0
    def testBadXMLParsing(self):
        """
        _testBadXMLParsing_

        Verify how a malformed CMSSW XML report is handled: parsing the file
        at self.badxmlPath must raise a FwkJobReportException (which, in
        CMSSW, will be caught).
        """
        from WMCore.FwkJobReport.Report import FwkJobReportException

        brokenReport = Report("cmsRun1")
        with self.assertRaises(FwkJobReportException):
            brokenReport.parse(self.badxmlPath)
        return
Example #57
0
    def testPerformanceJSON(self):
        """
        _testPerformanceJSON_

        Verify that the performance section of the report is correctly converted
        to JSON: the "storage", "memory" and "cpu" sub-sections must be present
        and a handful of their values must match PerformanceReport.xml.
        """
        xmlPath = os.path.join(
            WMCore.WMBase.getTestBase(),
            "WMCore_t/FwkJobReport_t/PerformanceReport.xml")

        myReport = Report("cmsRun1")
        myReport.parse(xmlPath)

        perfSection = myReport.__to_json__(
            thunker=None)["steps"]["cmsRun1"]["performance"]

        # Membership is tested with `in` (via assertIn): dict.has_key() no
        # longer exists on Python 3.
        self.assertIn("storage", perfSection,
                      "Error: Storage section is missing.")
        self.assertIn("memory", perfSection,
                      "Error: Memory section is missing.")
        self.assertIn("cpu", perfSection,
                      "Error: CPU section is missing.")

        # cpu and memory values are compared as strings, storage values as
        # floats, matching what the JSON conversion returns for them.
        self.assertEqual(perfSection["cpu"]["AvgEventCPU"], "0.626105",
                         "Error: AvgEventCPU is wrong.")
        self.assertEqual(perfSection["cpu"]["TotalJobTime"], "23.5703",
                         "Error: TotalJobTime is wrong.")
        self.assertEqual(perfSection["storage"]["readTotalMB"], 39.6166,
                         "Error: readTotalMB is wrong.")
        self.assertEqual(perfSection["storage"]["readMaxMSec"], 320.653,
                         "Error: readMaxMSec is wrong")
        self.assertEqual(perfSection["memory"]["PeakValueRss"], "492.293",
                         "Error: PeakValueRss is wrong.")
        self.assertEqual(perfSection["memory"]["PeakValueVsize"], "643.281",
                         "Error: PeakValueVsize is wrong.")
        return
Example #58
0
    def testCPBackendStageOutAgainstReportOld(self):
        """
        _testCPBackendStageOutAgainstReportOld_

        Run the StageOut executor twice against the stored cmsRun1 report
        (status set to 0):

        1. with the report as stored, execute() is expected to raise an
           AssertionError because the output LFNs do not match any of the
           accepted regular expressions;
        2. after rewriting both output LFNs to valid /store/mc/... names,
           execute() must succeed and create the FEVT and ALCARECO
           directories under the test directory.
        """
        reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace',
                                  'cmsRun1', 'Report.pkl')
        storedReport = Report()
        storedReport.unpersist(reportPath)
        storedReport.data.cmsRun1.status = 0
        storedReport.persist(reportPath)

        executor = StageOutExecutor.StageOut()
        executor.initialise(self.stepdata, self.job)
        self.setLocalOverride(self.stepdata)
        executor.step = self.stepdata

        # First pass: the stored LFNs ('hosts' and '/test1/hosts') fail with
        # AssertionError: LFN candidate: hosts doesn't match any of the
        # following regular expressions
        with self.assertRaises(AssertionError):
            executor.execute()

        # Second pass: fix the output file names so that they pass the
        # Lexicon check, then execute again.
        storedReport.unpersist(reportPath)
        outputs = storedReport.data.cmsRun1.output
        outputs.FEVT.files.file0.lfn = "/store/mc/acqera/pd/FEVT/procstr/abc123.root"
        outputs.ALCARECOStreamCombined.files.file0.lfn = "/store/mc/acqera/pd/ALCARECO/procstr/abc123.root"
        storedReport.persist(reportPath)
        executor.execute()

        for dataTier in ("FEVT", "ALCARECO"):
            stagedDir = os.path.join(self.testDir, "store", "mc", "acqera",
                                     "pd", dataTier)
            self.assertTrue(os.path.exists(stagedDir))
        return
Example #59
0
    def testPerformanceSummary(self):
        """
        _testPerformanceSummary_

        Parse PerformanceReport.xml (a Timing/SimpleMemoryCheck jobReport)
        and spot-check values from the memory, cpu and storage parts of the
        cmsRun1 performance section.
        """
        reportFile = os.path.join(
            getTestBase(),
            "WMCore_t/FwkJobReport_t/PerformanceReport.xml")

        perfReport = Report("cmsRun1")
        perfReport.parse(reportFile)

        performance = perfReport.data.cmsRun1.performance

        # memory and cpu values are compared as strings
        self.assertEqual(performance.memory.PeakValueRss, '492.293')
        self.assertEqual(performance.cpu.TotalJobCPU, '9.16361')

        # storage values are compared as floats
        storage = performance.storage
        self.assertEqual(storage.writeTotalMB, 5.22226)
        self.assertEqual(storage.writeTotalSecs, 60317.4)
        self.assertEqual(storage.readPercentageOps, 0.98585512216030857)

        return
Example #60
0
 def testCPBackendLogArchiveAgainstReportNew(self):
     """
     _testCPBackendLogArchiveAgainstReportNew_

     Set the stored cmsRun1 report status to 0, run the LogArchive executor
     with the newLogArchive override enabled, and verify that both
     <testDir>/hosts and <testDir>/test1/hosts exist afterwards.
     """
     reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')
     storedReport = Report()
     storedReport.unpersist(reportPath)
     storedReport.data.cmsRun1.status = 0
     storedReport.persist(reportPath)

     executor = LogArchiveExecutor.LogArchive()
     executor.initialise(self.stepdata, self.job)
     self.setLocalOverride(self.stepdata)
     self.stepdata.override.newLogArchive = True
     executor.step = self.stepdata
     executor.execute()

     for relativePath in (('hosts',), ('test1', 'hosts')):
         self.assertTrue(os.path.exists(os.path.join(self.testDir, *relativePath)))