def testOutputFiles(self):
    """
    _testOutputFiles_

    Parse the XML report and check the output file records of step
    "cmsRun1": module name and PFN of the first two files, plus the
    fields every file is expected to share.
    """
    parsedReport = Report("cmsRun1")
    parsedReport.parse(self.xmlPath)
    outputFiles = parsedReport.getAllFilesFromStep(step = "cmsRun1")

    # Fetch both entries up front so a short list fails before any comparison.
    recoFile = outputFiles[0]
    alcaFile = outputFiles[1]

    perFileExpectations = (
        (recoFile, 'outputRECORECO', 'outputRECORECO.root'),
        (alcaFile, 'outputALCARECORECO', 'outputALCARECORECO.root'),
    )
    for outputFile, moduleName, pfn in perFileExpectations:
        self.assertEqual(outputFile['outputModule'], moduleName)
        self.assertEqual(outputFile['pfn'], pfn)

    # Fields that must hold the same value in every output file.
    sharedExpectations = (
        ('events', 2),
        ('configURL', None),
        ('merged', False),
        ('validStatus', None),
        ('first_event', 0),
    )
    for outputFile in outputFiles:
        for key, expected in sharedExpectations:
            self.assertEqual(outputFile[key], expected)

    return
def testA_testSubmit(self):
    """
    _testSubmit_

    Attach a parsed performance report to every job of a test job group,
    move the jobs through complete/success and then to jobfailed, and run
    the dashboard reporter poller after each stage.
    """
    testJobGroup = self.createTestJobGroup()
    config = self.getConfig()

    reportXml = os.path.join(WMCore.WMBase.getTestBase(),
                             "WMCore_t/FwkJobReport_t/PerformanceReport.xml")
    perfReport = Report("cmsRun1")
    perfReport.parse(reportXml)

    stateChanger = ChangeState(config)
    for job in testJobGroup.jobs:
        job['fwjr'] = perfReport
    stateChanger.propagate(testJobGroup.jobs, "complete", "executing")
    stateChanger.propagate(testJobGroup.jobs, "success", "complete")

    poller = DashboardReporterPoller(config = config)
    poller.algorithm()

    # Nothing is asserted: the test only exercises the poller for
    # successful and then failed jobs and relies on it not raising.
    stateChanger.propagate(testJobGroup.jobs, 'jobfailed', 'executing')
    poller.algorithm()

    return
def testOutputFiles(self):
    """
    _testOutputFiles_

    Check the output files reported for step "cmsRun1" after parsing the
    XML report: the first two files' module names and PFNs, then the
    values common to all files.
    """
    stepReport = Report("cmsRun1")
    stepReport.parse(self.xmlPath)
    stepFiles = stepReport.getAllFilesFromStep(step="cmsRun1")

    # Index both files before asserting anything about either of them.
    firstFile = stepFiles[0]
    secondFile = stepFiles[1]

    self.assertEqual(firstFile["outputModule"], "outputRECORECO")
    self.assertEqual(firstFile["pfn"], "outputRECORECO.root")
    self.assertEqual(secondFile["outputModule"], "outputALCARECORECO")
    self.assertEqual(secondFile["pfn"], "outputALCARECORECO.root")

    # Keys checked on every file, paired with the value each must hold.
    commonKeys = ("events", "configURL", "merged", "validStatus", "first_event")
    commonValues = (2, None, False, None, 0)
    for stepFile in stepFiles:
        for key, expected in zip(commonKeys, commonValues):
            self.assertEqual(stepFile[key], expected)

    return
def generateCreateFailedReports(self, createFailedJobs):
    """
    _generateCreateFailedReports_

    Create and store a FWJR for each job that failed on creation, leaving
    meaningful information about what happened to it.

    For every job a report with a "CreationFailure" error (exit code 99305)
    is saved as "Report.0.pkl" in the job cache directory, attached to the
    job under the "fwjr" key, and its path is recorded through
    self.setFWJRPath. Nothing is done when createFailedJobs is empty.

    A job whose report cannot be saved is logged and skipped, so it gets
    neither a "fwjr" entry nor a path binding.
    """
    if not createFailedJobs:
        return

    defaultMsg = "There is a condition which assures that this job will fail if it's submitted"
    fjrsToSave = []
    for failedJob in createFailedJobs:
        report = Report()
        report.addError("CreationFailure", 99305, "CreationFailure",
                        failedJob.get("failedReason", defaultMsg))
        jobCache = failedJob.getCache()
        try:
            fjrPath = os.path.join(jobCache, "Report.0.pkl")
            report.save(fjrPath)
            fjrsToSave.append({"jobid": failedJob["id"], "fwjrpath": fjrPath})
            failedJob["fwjr"] = report
        except Exception:
            # Best effort: keep processing the remaining jobs, but record
            # the traceback so the cause of the failed save is not lost.
            logging.exception("Something went wrong while saving the report for job %s",
                              failedJob["id"])

    # current_thread() replaces the deprecated currentThread() alias.
    myThread = threading.current_thread()
    self.setFWJRPath.execute(binds = fjrsToSave, conn = myThread.transaction.conn, transaction = True)

    return
def testC_ExecuteSegfault(self):
    """
    _ExecuteSegfault_

    Run the CMSSW executor with "test.sh" as executable and an empty
    FrameworkJobReport.xml, and check that the resulting
    WMExecutionFailure is diagnosed as exit code 50115, both in the
    executor's report and in the Report.pkl it leaves behind.

    Per the original notes, the script raises an ABRT signal, which is the
    normal CMSSW response to a SEGFAULT.
    """
    self.step.application.command.executable = "test.sh"
    workDir = self.step.builder.workingDir

    # CMSSW leaves an empty FWJR when a SEGFAULT is present
    with open(os.path.join(workDir, "FrameworkJobReport.xml"), "w"):
        pass

    try:
        os.chdir(workDir)
        cmsswExecutor = StepFactory.getStepExecutor("CMSSW")
        cmsswExecutor.initialise(self.step, self.job)
        cmsswExecutor.pre()
        cmsswExecutor.step.runtime.scramPreScripts.remove("SetupCMSSWPset")
        try:
            cmsswExecutor.execute()
            self.fail("An exception should have been raised")
        except WMExecutionFailure as failure:
            cmsswExecutor.diagnostic(failure.code, cmsswExecutor, ExceptionInstance=failure)
            self.assertEqual(50115, cmsswExecutor.report.getExitCode())
            pickledReport = Report()
            pickledReport.load("Report.pkl")
            self.assertEqual(50115, pickledReport.getExitCode())
    except Exception as ex:
        self.fail("Failure encountered, %s" % str(ex))
    finally:
        # Always go back to the directory the test started in.
        os.chdir(self.oldCwd)

    return
def testB_ExecuteNonZeroExit(self):
    """
    _ExecuteNonZeroExit_

    Run the CMSSW executor with "brokenCmsRun.py" as executable and the
    CMSSWFailReport.xml fixture as job report, and check that the
    resulting WMExecutionFailure is diagnosed as exit code 8001, both in
    the executor's report and in the Report.pkl it leaves behind.
    """
    self.step.application.command.executable = "brokenCmsRun.py"
    failReportXml = os.path.join(getTestBase(), "WMCore_t/FwkJobReport_t/CMSSWFailReport.xml")
    workDir = self.step.builder.workingDir
    shutil.copy(failReportXml, os.path.join(workDir, "FrameworkJobReport.xml"))

    try:
        os.chdir(workDir)
        cmsswExecutor = StepFactory.getStepExecutor("CMSSW")
        cmsswExecutor.initialise(self.step, self.job)
        cmsswExecutor.pre()
        cmsswExecutor.step.runtime.scramPreScripts.remove("SetupCMSSWPset")
        try:
            cmsswExecutor.execute()
            self.fail("An exception should have been raised")
        except WMExecutionFailure as failure:
            cmsswExecutor.diagnostic(failure.code, cmsswExecutor, ExceptionInstance=failure)
            self.assertEqual(8001, cmsswExecutor.report.getExitCode())
            pickledReport = Report()
            pickledReport.load("Report.pkl")
            self.assertEqual(8001, pickledReport.getExitCode())
    except Exception as ex:
        self.fail("Failure encountered, %s" % str(ex))
    finally:
        # Always go back to the directory the test started in.
        os.chdir(self.oldCwd)

    return
def testPerformanceJSON(self):
    """
    _testPerformanceJSON_

    Verify that the performance section of the report is correctly
    converted to JSON: the storage, memory and cpu sections must be
    present and a few values from each are compared with the fixture.
    """
    xmlPath = os.path.join(getTestBase(),
                           "WMCore_t/FwkJobReport_t/PerformanceReport.xml")

    myReport = Report("cmsRun1")
    myReport.parse(xmlPath)

    perfSection = myReport.__to_json__(thunker = None)["steps"]["cmsRun1"]["performance"]

    # Membership assertions instead of dict.has_key(), which does not
    # exist on Python 3 dictionaries.
    self.assertIn("storage", perfSection, "Error: Storage section is missing.")
    self.assertIn("memory", perfSection, "Error: Memory section is missing.")
    self.assertIn("cpu", perfSection, "Error: CPU section is missing.")

    # cpu and memory values are compared as strings, storage values as floats.
    self.assertEqual(perfSection["cpu"]["AvgEventCPU"], "0.626105",
                     "Error: AvgEventCPU is wrong.")
    self.assertEqual(perfSection["cpu"]["TotalJobTime"], "23.5703",
                     "Error: TotalJobTime is wrong.")
    self.assertEqual(perfSection["storage"]["readTotalMB"], 39.6166,
                     "Error: readTotalMB is wrong.")
    self.assertEqual(perfSection["storage"]["readMaxMSec"], 320.653,
                     "Error: readMaxMSec is wrong")
    self.assertEqual(perfSection["memory"]["PeakValueRss"], "492.293",
                     "Error: PeakValueRss is wrong.")
    self.assertEqual(perfSection["memory"]["PeakValueVsize"], "643.281",
                     "Error: PeakValueVsize is wrong.")
    return
def testJSONEncoding(self):
    """
    _testJSONEncoding_

    Verify that turning the FWJR into a JSON object works correctly: the
    result must carry a task entry, exactly one step ("cmsRun1"), and
    that step must contain all the expected sections.
    """
    xmlPath = os.path.join(getTestBase(),
                           "WMCore_t/FwkJobReport_t/CMSSWProcessingReport.xml")
    myReport = Report("cmsRun1")
    myReport.parse(xmlPath)

    jsonReport = myReport.__to_json__(None)

    # unittest assertions rather than bare asserts: they are not stripped
    # when Python runs with -O and report the offending values on failure.
    self.assertIn("task", jsonReport, "Error: Task name missing from report.")
    self.assertEqual(len(jsonReport["steps"]), 1,
                     "Error: Wrong number of steps in report.")
    self.assertIn("cmsRun1", jsonReport["steps"],
                  "Error: Step missing from json report.")

    cmsRunStep = jsonReport["steps"]["cmsRun1"]

    jsonReportSections = ["status", "errors", "logs", "parameters", "site",
                          "analysis", "cleanup", "input", "output", "start"]
    for jsonReportSection in jsonReportSections:
        self.assertIn(jsonReportSection, cmsRunStep,
                      "Error: missing section: %s" % jsonReportSection)

    return
def testAbortedState(self):
    """
    _testAbortedState_

    Check that we can kill jobs when a site is set to aborted: two sites
    with thresholds are registered, jobs are created, "testSite1" is
    switched to "Aborted", and the report found in the work directory
    must carry exit code 71301.

    ### We no longer need this test as we are not killing jobs that are running
    """
    self.tempDir = self.testInit.generateWorkDir()
    config = self.createConfig()

    resourceControl = ResourceControl(config)

    siteDefinitions = (
        ("testSite1", 10, 20, "testSE1", "testCE1", "T1_US_FNAL", "MockPlugin"),
        ("testSite2", 20, 40, "testSE2", "testCE2", "T1_IT_CNAF", "MockPlugin"),
    )
    for siteArgs in siteDefinitions:
        resourceControl.insertSite(*siteArgs)

    thresholdDefinitions = (
        ("testSite1", "Processing", 20, 10),
        ("testSite1", "Merge", 200, 100),
        ("testSite2", "Processing", 50, 25),
        ("testSite2", "Merge", 135, 65),
    )
    for thresholdArgs in thresholdDefinitions:
        resourceControl.insertThreshold(*thresholdArgs)

    self.createJobs()

    resourceControl.changeSiteState("testSite1", "Aborted")

    # Now check the tempDir for a FWJR for the killed job
    killedReport = Report()
    killedReport.load(os.path.join(self.tempDir, "Report.0.pkl"))
    self.assertEqual(killedReport.getExitCode(), 71301)
    return
def thrashCouch():
    """
    _thrashCouch_

    Load test: 500 times, create a batch of jobs and record every state
    transition from "new" to "cleanout" in couch through ChangeState.
    The LoadTest00.pkl report is attached to the jobs for the
    complete -> success transition and removed again before cleanout.

    The agent configuration is read from the file named by the
    WMAGENT_CONFIG environment variable.
    """
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    changeState = ChangeState(config)

    myReport = Report()
    myReport.unpersist(os.path.join(getWMBASE(),
                                    "test/python/WMComponent_t/JobAccountant_t/fwjrs/LoadTest00.pkl"))

    # The previous version initialised `jobs` with a dict of empty sets
    # that was overwritten on the first iteration; it has been dropped.
    for _ in range(500):
        jobs = createJobs()
        changeState.recordInCouch(jobs, "created", "new")
        changeState.recordInCouch(jobs, "executing", "created")
        changeState.recordInCouch(jobs, "complete", "executing")

        for job in jobs:
            job["fwjr"] = myReport

        changeState.recordInCouch(jobs, "success", "complete")

        for job in jobs:
            job["fwjr"] = None

        changeState.recordInCouch(jobs, "cleanout", "success")

    return
def loadJobReport(self, parameters):
    """
    _loadJobReport_

    Given a framework job report on disk, load it and return a
    FwkJobReport instance.

    parameters is a dict-like object; its "fwjr_path" entry names the
    report file. If the path is empty, the file is missing or empty, or
    loading fails, the result of self.createMissingFWKJR() is returned
    instead, with error code 99999 (bad or missing path), 99998 (empty
    file) or 99997 (load failure).
    """
    # The jobReportPath may be prefixed with "file://" which needs to be
    # removed so it doesn't confuse the FwkJobReport() parser.
    jobReportPath = parameters.get("fwjr_path", None)
    if not jobReportPath:
        logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
        return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty")

    jobReportPath = jobReportPath.replace("file://", "")
    if not os.path.exists(jobReportPath):
        logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
        return self.createMissingFWKJR(parameters, 99999,
                                       'Cannot find file in jobReport path: %s' % jobReportPath)

    if os.path.getsize(jobReportPath) == 0:
        logging.error("Empty FwkJobReport: %s" % jobReportPath)
        return self.createMissingFWKJR(parameters, 99998,
                                       'jobReport of size 0: %s ' % jobReportPath)

    jobReport = Report()

    try:
        jobReport.load(jobReportPath)
    except Exception as ex:
        # "except Exception as ex" is valid on Python 2.6+ and Python 3,
        # unlike the comma form used previously.
        msg = "Error loading jobReport %s\n" % jobReportPath
        msg += str(ex)
        logging.error(msg)
        logging.debug("Failing job: %s\n" % parameters)
        return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport')

    # Hand the loaded report back; previously the success path fell off
    # the end of the function and returned None.
    return jobReport
def testMultipleInputs(self):
    """
    _testMultipleInputs_

    Verify that parsing XML reports with multiple inputs works correctly.

    The fixture must yield two primary input files from module "source".
    Every file is checked for the shared attributes, then for the LFN,
    PFN, run number and lumi section that belong to its GUID.
    """
    xmlPath = os.path.join(getTestBase(),
                           "WMCore_t/FwkJobReport_t/CMSSWMultipleInput.xml")
    myReport = Report("cmsRun1")
    myReport.parse(xmlPath)

    # unittest assertions rather than bare asserts: they are not stripped
    # when Python runs with -O and report the offending values on failure.
    self.assertTrue(hasattr(myReport.data.cmsRun1.input, "source"),
                    "Error: Report missing input source.")

    inputFiles = myReport.getInputFilesFromStep("cmsRun1")

    self.assertEqual(len(inputFiles), 2, "Error: Wrong number of input files.")

    catalog = "trivialcatalog_file:/uscmst1/prod/sw/cms/SITECONF/T1_US_FNAL/PhEDEx/storage.xml?protocol=dcap"
    lfnDir = "/store/backfill/2/unmerged/WMAgentCommissioining10/MinimumBias/RECO/rereco_GR10_P_V4_All_v1/0000/"
    pfnPrefix = "dcap://cmsdca3.fnal.gov:24142/pnfs/fnal.gov/usr/cms/WAX/11"

    # GUID of each expected input file -> lumi section its run must contain.
    expectedLumiByGuid = {
        "F0875ECD-3347-DF11-9FE0-003048678A80": 1,
        "626D74CE-3347-DF11-9363-0030486790C0": 2,
    }

    for inputFile in inputFiles:
        self.assertEqual(inputFile["input_type"], "primaryFiles",
                         "Error: Wrong input type.")
        self.assertEqual(inputFile["module_label"], "source",
                         "Error: Module label is wrong")
        self.assertEqual(inputFile["catalog"], catalog,
                         "Error: Catalog is wrong.")
        self.assertEqual(inputFile["events"], 2,
                         "Error: Wrong number of events.")
        self.assertEqual(inputFile["input_source_class"], "PoolSource",
                         "Error: Wrong input source class.")

        guid = inputFile["guid"]
        self.assertIn(guid, expectedLumiByGuid, "Error: Wrong guid.")

        # Both files live in the same directory and are named after their
        # GUID; the PFN is the LFN behind the dcap door prefix.
        expectedLfn = lfnDir + guid + ".root"
        self.assertEqual(inputFile["lfn"], expectedLfn,
                         "Error: Input LFN is wrong.")
        self.assertEqual(inputFile["pfn"], pfnPrefix + expectedLfn,
                         "Error: Input PFN is wrong.")

        self.assertEqual(len(inputFile["runs"]), 1,
                         "Error: Wrong number of runs.")
        firstRun = list(inputFile["runs"])[0]
        self.assertEqual(firstRun.run, 124216, "Error: Wrong run number.")
        self.assertIn(expectedLumiByGuid[guid], firstRun,
                      "Error: Wrong lumi sections in input file.")

    return
def testB_EmulatorTest(self):
    """
    _EmulatorTest_

    Unpack a test workload, run its jobs (per the original notes: locally,
    in emulator mode) and validate the report the job leaves in its task
    space: site and host information, completion flag, step status,
    output modules and input/output file counts.
    """
    # Creation and unpacking are assumed to work; testA covers them.
    name = 'basicWorkload'
    testWorkload = self.createTestWorkload(workloadName = name)
    self.createWMBSComponents(workload = testWorkload)
    self.unpackComponents(workload = testWorkload)
    self.runJobs(workload = testWorkload)

    # Load the report written by the job
    taskSpaceDir = os.path.join(self.testDir, 'unpack/ReReco/job/WMTaskSpace')
    jobReport = Report()
    jobReport.load(os.path.join(taskSpaceDir, 'Report.0.pkl'))
    cmsRunSection = jobReport.data.cmsRun1

    # Site and host level information
    self.assertEqual(jobReport.data.ceName, socket.gethostname())
    self.assertEqual(jobReport.data.seName, 'cmssrm.fnal.gov')
    self.assertEqual(jobReport.data.siteName, 'T1_US_FNAL')
    self.assertEqual(jobReport.data.hostName, socket.gethostname())
    self.assertTrue(jobReport.data.completed)

    # Status 0 is expected for the emulator job
    self.assertEqual(cmsRunSection.status, 0)

    # Exactly one output module
    self.assertEqual(cmsRunSection.outputModules, ['TestOutputModule'])

    # One file on the input side and one on the output side
    self.assertEqual(cmsRunSection.input.PoolSource.files.fileCount, 1)
    self.assertEqual(cmsRunSection.output.TestOutputModule.files.fileCount, 1)

    return
def createReport(self):
    """
    Build and return a fresh test report that contains a single
    'cmsRun1' step.
    """
    testReport = Report()
    testReport.addStep('cmsRun1')
    return testReport
def testASONoNameChange(self):
    """
    With the fake transfer worker's fail probability set to 0 and
    preserveLFN requested, every file reference of step 'cmsRun1' in the
    job's report must still contain 'store/temp' in its LFN.
    """
    AsyncStageOut_t.FakeTransferWorker.setFailProbability(0)
    roundtripJob = self.roundtripHelper(preserveLFN = True)

    stepReport = Report('cmsRun1')
    stepReport.unpersist(roundtripJob['fwjr_path'])

    fileRefs = stepReport.getAllFileRefsFromStep(step = 'cmsRun1')
    for fileRef in fileRefs:
        # str.find() returns -1 when the substring is absent
        tempPosition = fileRef.lfn.find('store/temp')
        self.assertNotEqual(tempPosition, -1,
                            "The lfn should still have store/temp: %s" % fileRef.lfn)
def isReady(self, job, cooloffType):
    """
    _isReady_

    Decide whether a job in cooloff may be retried now.

    The job is a candidate for retry once it has spent more than
    baseTimeout * retry_count ** 2 in its current state, where
    baseTimeout comes from the algorithm parameters for the job type
    (key: lower-cased cooloffType, default 10).

    A candidate is paused instead of retried when the pause count is 0,
    or when retry_count is a positive multiple of the pause count. The
    pause count is the 'pauseCount' parameter (default 3); for jobs in
    'jobcooloff' it is overridden by the 'retryErrorCodes' entry matching
    the exit code of the job's last report, when there is one.

    Returns True when the job should be retried. Returns False when it
    must keep waiting or has been moved to the paused state through
    self.changer.propagate().
    """
    pauseCount = self.getAlgoParam(job['jobType'], param='pauseCount', defaultReturn=3)

    pauseMap = {
        'createcooloff': 'createpaused',
        'submitcooloff': 'submitpaused',
        'jobcooloff': 'jobpaused'
    }

    # Setting a pauseCount depending on job exit code
    if job['state'] == 'jobcooloff':
        exitCodes = self.getAlgoParam(job['jobType'], 'retryErrorCodes', {})
        if exitCodes:
            report = Report()
            reportPath = os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count'])
            try:
                report.load(reportPath)
                jobExitCode = report.getExitCode()
                # If the jobExitCode is configured, set the respective pauseCount for the job.
                # NOTE(review): the previous code also set retryByTimeOut = True
                # here, but the value was unconditionally overwritten by the
                # cooloff check below, so it never had any effect. If bypassing
                # the cooloff for configured exit codes was intended, that needs
                # a deliberate change.
                if jobExitCode in exitCodes:
                    pauseCount = exitCodes[jobExitCode]
            except IOError as ex:
                logging.warning("Error loading report %s\n%s", reportPath, ex)

    # Here introduces the SquaredAlgo logic :
    baseTimeoutDict = self.getAlgoParam(job['jobType'])
    baseTimeout = baseTimeoutDict.get(cooloffType.lower(), 10)
    cooloffTime = baseTimeout * pow(job['retry_count'], 2)
    currentTime = self.timestamp()
    if currentTime - job['state_time'] <= cooloffTime:
        # Still cooling off
        return False

    # If reached the pauseCount, we want the job to pause instead of retrying.
    # pauseCount == 0 is tested first, which also protects the modulo below.
    reachedPauseCount = pauseCount == 0 or (
        job['retry_count'] > 0 and not job['retry_count'] % pauseCount)
    if reachedPauseCount:
        self.changer.propagate(job, pauseMap[job['state']], job['state'], updatesummary=True)
        return False

    return True
def createMissingFWKJR(self, errorCode=999, errorDescription='Failure of unknown type'):
    """
    _createMissingFWJR_

    Build a stand-in FWJR for the case where the real report cannot be
    found at its expected location. The stand-in gets one error on step
    "cmsRun1", built from errorCode and errorDescription, and the step
    status is set to "Failed".
    """
    placeholderReport = Report()
    # NOTE(review): errorCode is passed in the third position, after the
    # literal 84 - confirm against Report.addError's parameter order.
    placeholderReport.addError("cmsRun1", 84, errorCode, errorDescription)
    placeholderReport.data.cmsRun1.status = "Failed"
    return placeholderReport
def something():
    """
    Load the MergeSuccess.pkl job report from the JobAccountant test
    fixtures and set its acquisition/processing information, global tag
    and valid status. The modified report is neither saved nor returned.
    """
    fixturePath = os.path.join(WMCore.WMBase.getTestBase(),
                               "WMComponent_t/JobAccountant_t/fwjrs",
                               "MergeSuccess.pkl")
    mergeReport = Report()
    mergeReport.load(fixturePath)
    mergeReport.setAcquisitionProcessing("IansMagicMushroomSoup", 9,
                                         "T0Test-AnalyzeThisAndGetAFreePhD-PreScaleThingy10")
    mergeReport.setGlobalTag("GT:Super")
    mergeReport.setValidStatus("Production")
def testCPBackendStageOutAgainstReportFailedStepOld(self):
    """
    Set the cmsRun1 status in the stored report to 1, run the StageOut
    executor with the local override, and check that neither 'hosts' nor
    'test1/hosts' exists under the test directory afterwards.
    """
    reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')

    # Rewrite the stored report with the modified step status
    storedReport = Report('cmsRun1')
    storedReport.unpersist(reportPath)
    storedReport.data.cmsRun1.status = 1
    storedReport.persist(reportPath)

    stageOut = StageOutExecutor.StageOut()
    stageOut.initialise(self.stepdata, self.job)
    self.setLocalOverride(self.stepdata)
    stageOut.step = self.stepdata
    stageOut.execute()

    for relativeParts in (('hosts',), ('test1', 'hosts')):
        self.assertFalse(os.path.exists(os.path.join(self.testDir, *relativeParts)))
def testCPBackendLogArchiveAgainstReportNew(self):
    """
    Set the cmsRun1 status in the stored report to 0, run the LogArchive
    executor with the local override and newLogArchive enabled, and check
    that both 'hosts' and 'test1/hosts' exist under the test directory
    afterwards.
    """
    reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')

    # Rewrite the stored report with the modified step status
    storedReport = Report()
    storedReport.unpersist(reportPath)
    storedReport.data.cmsRun1.status = 0
    storedReport.persist(reportPath)

    logArchive = LogArchiveExecutor.LogArchive()
    logArchive.initialise(self.stepdata, self.job)
    self.setLocalOverride(self.stepdata)
    self.stepdata.override.newLogArchive = True
    logArchive.step = self.stepdata
    logArchive.execute()

    for relativeParts in (('hosts',), ('test1', 'hosts')):
        self.assertTrue(os.path.exists(os.path.join(self.testDir, *relativeParts)))
def isReady(self, job, cooloffType):
    """
    _isReady_

    Decide whether a job in cooloff may be retried. This algorithm always
    answers True; what it may change is how many retries remain.

    For 'create' and 'submit' cooloffs nothing is inspected. Otherwise
    the job's last report is loaded from its cache directory, and if the
    job ran longer than self.maxRunTime or ended with an exit code listed
    in self.exitCodes, its retry_count is raised to at least
    self.maxRetries - 1 and the job is saved, so it gets one more try.
    """
    if cooloffType in ('create', 'submit'):
        # Can't really do anything with these: resubmit
        return True

    # Run this to get the errors in the actual job
    try:
        report = Report()
        reportPath = os.path.join(job['cache_dir'], "Report.%i.pkl" % job['retry_count'])
        report.load(reportPath)
    except Exception:
        # If we're here, then the FWJR doesn't exist (or can't be read).
        # Give up, run it again. Only Exception is caught so that
        # KeyboardInterrupt and SystemExit still propagate.
        return True

    # Find startTime, stopTime
    times = report.getFirstStartLastStop()
    startTime = times['startTime']
    stopTime = times['stopTime']

    if startTime is None or stopTime is None:
        # There is something very wrong with this job, but we don't know
        # what. Rerun, and hope the times get written the next time around.
        logging.error("No start, stop times for steps")
        return True

    oneMore = False

    if stopTime - startTime > self.maxRunTime:
        logging.error("Job only allowed to run one more time due to ProcessingAlgo.maxRunTime")
        oneMore = True

    if report.getExitCode() in self.exitCodes:
        logging.error("Job only allowed to run one more time due to ProcessingAlgo.exitCodes")
        oneMore = True

    # Reset the retry time
    if oneMore:
        job['retry_count'] = max(self.maxRetries - 1, job['retry_count'])
        job.save()

    # The job object is modified in place, so the caller sees the new retry_count
    return True
def testXMLParsing(self):
    """
    _testParsing_

    Parse the CMSSW XML report and run the input, RECO output and ALCA
    output verifications on the result.
    """
    parsedReport = Report("cmsRun1")
    parsedReport.parse(self.xmlPath)

    self.verifyInputData(parsedReport)
    self.verifyRecoOutput(parsedReport)
    self.verifyAlcaOutput(parsedReport)

    return
def failJobs(self, failedJobs):
    """
    _failJobs_

    Dump those jobs that have failed due to timeout.

    For every job the report "Report.<retry_count>.pkl" in the job cache
    is loaded and attached under the "fwjr" key. If it cannot be loaded,
    a new report carrying a "NoJobReport" error (exit code 99303) is
    saved in its place. The report paths are then recorded through
    self.setFWJRAction and the jobs moved from 'executing' to
    'jobfailed', both inside one transaction of the current thread.

    Nothing is done when failedJobs is empty.
    """
    if not failedJobs:
        return

    jrBinds = []
    for job in failedJobs:
        # Make sure the job object goes packed with fwjr_path to be persisted in couch
        jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count']))
        jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

        fwjr = Report()
        try:
            fwjr.load(jrPath)
        except Exception:
            # Something went wrong reading the pickle
            logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
            fwjr = Report()
            msg = "The job failed due to a timeout, unfortunately the original job report was lost"
            fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
            fwjr.save(jrPath)
        job["fwjr"] = fwjr

    # current_thread() replaces the deprecated currentThread() alias.
    myThread = threading.current_thread()
    myThread.transaction.begin()
    self.setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True)
    self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
    logging.info("Failed %i jobs", len(failedJobs))
    myThread.transaction.commit()

    return
def testWithEventsXMLParsing(self):
    """
    _testParsing_

    Parse the CMSSW XML report found at self.withEventsXmlPath and run the
    input, RECO output and ALCA output verifications, with event counts
    expected on both outputs.
    """
    parsedReport = Report("cmsRun1")
    parsedReport.parse(self.withEventsXmlPath)

    self.verifyInputData(parsedReport)
    self.verifyRecoOutput(parsedReport, hasEventCounts=True)
    self.verifyAlcaOutput(parsedReport, hasEventCounts=True)

    return
def testBadXMLParsing(self):
    """
    _testBadXMLParsing_

    Parsing a malformed CMSSW XML report must raise
    FwkJobReportException and leave a 'BadFWJRXML' error with exit code
    50115 on step "cmsRun1".
    """
    brokenReport = Report("cmsRun1")
    from WMCore.FwkJobReport.Report import FwkJobReportException

    with self.assertRaises(FwkJobReportException):
        brokenReport.parse(self.badxmlPath)

    # The parse failure must have been recorded on the step
    self.assertEqual(brokenReport.getStepErrors("cmsRun1")['error0'].type, 'BadFWJRXML')
    self.assertEqual(brokenReport.getStepErrors("cmsRun1")['error0'].exitCode, 50115)

    return
def testExecutorDoesntDetonate(self):
    """
    Set the cmsRun1 status in the stored report to 1, run the LogArchive
    executor with the local override, and check that neither 'hosts' nor
    'test1/hosts' exists under the test directory afterwards.
    """
    reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')

    # Rewrite the stored report with the modified step status
    storedReport = Report()
    storedReport.unpersist(reportPath)
    storedReport.data.cmsRun1.status = 1
    storedReport.persist(reportPath)

    logArchive = LogArchiveExecutor.LogArchive()
    logArchive.initialise(self.stepdata, self.job)
    self.setLocalOverride(self.stepdata)
    logArchive.step = self.stepdata
    logArchive.execute()

    for relativeParts in (('hosts',), ('test1', 'hosts')):
        self.assertFalse(os.path.exists(os.path.join(self.testDir, *relativeParts)))
    return
def testErrorReporting(self):
    """
    _testErrorReporting_

    Verify that errors are correctly transferred from the XML report to
    the python report: the CMSSWFailReport.xml fixture must produce an
    "errors" section on step cmsRun1 holding exactly one error
    ("error0") of type "CMSException", whose exit code and details match
    the expected values below.
    """
    # Expected error details.
    # NOTE(review): this literal is reproduced exactly as it appears in the
    # source under review, on a single line. It presumably spanned several
    # lines originally - confirm its line breaks against the fixture.
    cmsException = \
        """cms::Exception caught in cmsRun ---- EventProcessorFailure BEGIN EventProcessingStopped ---- ScheduleExecutionFailure BEGIN ProcessingStopped ---- NoRecord BEGIN No "CastorDbRecord" record found in the EventSetup. Please add an ESSource or ESProducer that delivers such a record. cms::Exception going through module CastorRawToDigi/castorDigis run: 121849 lumi: 1 event: 23 ---- NoRecord END Exception going through path raw2digi_step ---- ScheduleExecutionFailure END an exception occurred during current event processing cms::Exception caught in EventProcessor and rethrown ---- EventProcessorFailure END"""

    xmlPath = os.path.join(getTestBase(),
                           "WMCore_t/FwkJobReport_t/CMSSWFailReport.xml")

    myReport = Report("cmsRun1")
    myReport.parse(xmlPath)

    # The step must carry an errors section with exactly one entry
    assert hasattr(myReport.data.cmsRun1, "errors"), \
        "Error: Error section missing."
    assert getattr(myReport.data.cmsRun1.errors, "errorCount") == 1, \
        "Error: Error count is wrong."
    assert hasattr(myReport.data.cmsRun1.errors, "error0"), \
        "Error: Error0 section is missing."
    assert myReport.data.cmsRun1.errors.error0.type == "CMSException", \
        "Error: Wrong error type."
    # The exit code is compared against the string "8001" here
    assert myReport.data.cmsRun1.errors.error0.exitCode == "8001", \
        "Error: Wrong exit code."
    assert myReport.data.cmsRun1.errors.error0.details == cmsException, \
        "Error: Error details are wrong:\n|%s|\n|%s|" % (myReport.data.cmsRun1.errors.error0.details,
                                                       cmsException)

    # Test getStepErrors
    self.assertEqual(myReport.getStepErrors("cmsRun1")['error0'].type, "CMSException")

    return
def makeReport(self, fileName):
    """
    Build a report named 'oneitem' with a 'stageOut1' step, two output
    modules and three output files (one in 'module1', two in 'module2'),
    and persist it to fileName.
    """
    stageOutReport = Report('oneitem')
    stageOutReport.addStep('stageOut1')

    for moduleName in ('module1', 'module2'):
        stageOutReport.addOutputModule(moduleName)

    # (output module, LFN) for each file; size and event count are always 1
    outputFiles = (('module1', 'FILE1'), ('module2', 'FILE2'), ('module2', 'FILE3'))
    for moduleName, lfn in outputFiles:
        stageOutReport.addOutputFile(moduleName, {'lfn': lfn, 'size' : 1, 'events' : 1})

    stageOutReport.persist(fileName)
def loadJobReport(self, parameters):
    """
    _loadJobReport_

    Load the framework job report named by parameters["fwjr_path"] and
    return it.

    Whenever the report cannot be used, the result of
    self.createMissingFWKJR() is returned instead, with one of these
    error codes:
      99999 - the path is empty or the file does not exist
      99998 - the file is empty
      99997 - loading raised an exception, or the report has no steps
    """
    reportPath = parameters.get("fwjr_path", None)
    if not reportPath:
        logging.error("Bad FwkJobReport Path: %s" % reportPath)
        return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty")

    # A leading "file://" would confuse the report loader, so drop it.
    reportPath = reportPath.replace("file://", "")

    if not os.path.exists(reportPath):
        logging.error("Bad FwkJobReport Path: %s" % reportPath)
        notFoundMsg = 'Cannot find file in jobReport path: %s' % reportPath
        return self.createMissingFWKJR(parameters, 99999, notFoundMsg)

    if os.path.getsize(reportPath) == 0:
        logging.error("Empty FwkJobReport: %s" % reportPath)
        emptyMsg = 'jobReport of size 0: %s ' % reportPath
        return self.createMissingFWKJR(parameters, 99998, emptyMsg)

    loadedReport = Report()
    try:
        loadedReport.load(reportPath)
    except Exception as ex:
        logging.error("Error loading jobReport %s\n" % reportPath + str(ex))
        logging.debug("Failing job: %s\n" % parameters)
        return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport')

    if len(loadedReport.listSteps()) == 0:
        logging.error("FwkJobReport with no steps: %s" % reportPath)
        noStepsMsg = 'jobReport with no steps: %s ' % reportPath
        return self.createMissingFWKJR(parameters, 99997, noStepsMsg)

    return loadedReport
def testErrorReporting(self):
    """
    _testErrorReporting_

    Verify that errors are correctly transferred from the XML report to
    the python report: the CMSSWFailReport.xml fixture must produce an
    "errors" section on step cmsRun1 holding exactly one error
    ("error0") of type "CMSException" with exit code 8001 and the
    expected details text.
    """
    # Expected error details, one literal per line of the message
    cmsException = (
        "cms::Exception caught in cmsRun\n"
        "---- EventProcessorFailure BEGIN\n"
        "EventProcessingStopped\n"
        "---- ScheduleExecutionFailure BEGIN\n"
        "ProcessingStopped\n"
        "---- NoRecord BEGIN\n"
        'No "CastorDbRecord" record found in the EventSetup.\n'
        " Please add an ESSource or ESProducer that delivers such a record.\n"
        "cms::Exception going through module CastorRawToDigi/castorDigis run: 121849 lumi: 1 event: 23\n"
        "---- NoRecord END\n"
        "Exception going through path raw2digi_step\n"
        "---- ScheduleExecutionFailure END\n"
        "an exception occurred during current event processing\n"
        "cms::Exception caught in EventProcessor and rethrown\n"
        "---- EventProcessorFailure END"
    )

    xmlPath = os.path.join(getTestBase(),
                           "WMCore_t/FwkJobReport_t/CMSSWFailReport.xml")

    myReport = Report("cmsRun1")
    myReport.parse(xmlPath)

    # unittest assertions rather than bare asserts: they are not stripped
    # when Python runs with -O and report the offending values on failure.
    self.assertTrue(hasattr(myReport.data.cmsRun1, "errors"),
                    "Error: Error section missing.")
    self.assertEqual(myReport.data.cmsRun1.errors.errorCount, 1,
                     "Error: Error count is wrong.")
    self.assertTrue(hasattr(myReport.data.cmsRun1.errors, "error0"),
                    "Error: Error0 section is missing.")

    firstError = myReport.data.cmsRun1.errors.error0
    self.assertEqual(firstError.type, "CMSException", "Error: Wrong error type.")
    self.assertEqual(firstError.exitCode, 8001, "Error: Wrong exit code.")
    self.assertEqual(firstError.details, cmsException,
                     "Error: Error details are wrong:\n|%s|\n|%s|" % (firstError.details,
                                                                    cmsException))

    # Test getStepErrors
    self.assertEqual(myReport.getStepErrors("cmsRun1")['error0'].type, "CMSException")

    return
def testTaskJobID(self):
    """
    _testTaskJobID_

    A fresh report has neither a task name nor a job ID; once they are
    set, the getters must return the values that were set.
    """
    fakeReport = Report('fake')

    # Nothing is set on a new report
    self.assertEqual(fakeReport.getTaskName(), None)
    self.assertEqual(fakeReport.getJobID(), None)

    fakeReport.setTaskName('silly')
    fakeReport.setJobID(100)

    # The getters hand back what the setters stored
    self.assertEqual(fakeReport.getTaskName(), 'silly')
    self.assertEqual(fakeReport.getJobID(), 100)

    return
def testExitCode(self):
    """
    _testExitCode_

    Check the exit code reported for a step: 0 without errors, 99999
    after an error with no exit code is added, and 12345 after a second
    error with exit code '12345' is added.

    Note: Errors without a return code return 99999
    """
    stepReport = Report("cmsRun1")
    self.assertEqual(stepReport.getExitCode(), 0)

    # (exit code passed to addError, exit code expected afterwards)
    errorCases = ((None, 99999), ('12345', 12345))
    for rawExitCode, expectedExitCode in errorCases:
        stepReport.addError(stepName = "cmsRun1", exitCode = rawExitCode,
                            errorType = "test", errorDetails = "test")
        self.assertEqual(stepReport.getExitCode(), expectedExitCode)
        self.assertEqual(stepReport.getStepExitCode(stepName = "cmsRun1"), expectedExitCode)
def testTaskSuccessful(self):
    """
    _testTaskSuccessful_

    The parsed report must not count as a successful task, unless steps
    matching 'cmsRun' are ignored.
    """
    parsedReport = Report("cmsRun1")
    parsedReport.parse(self.xmlPath)

    # Taken as a whole the task is reported as failed ...
    self.assertFalse(parsedReport.taskSuccessful())

    # ... but it succeeds once cmsRun is left out of the evaluation
    self.assertTrue(parsedReport.taskSuccessful(ignoreString = 'cmsRun'))

    return
def execute(self, emulator=None):
    """
    _execute_

    Upload the DQM analysis files produced by the other steps of the task.

    If an emulator is given, its emulate(step, job) result is returned
    and nothing else happens. Otherwise every other step in the task
    space is visited: its Report.pkl is unpersisted, steps that are not
    successful are skipped, every analysis file whose FileClass is "DQM"
    is sent through self.httpPost(), and the report is persisted again.

    Returns None in the non-emulator case.
    """
    # Are we using emulators again?
    if emulator is not None:
        return emulator.emulate(self.step, self.job)

    if self.step.upload.proxy:
        try:
            self.stepSpace.getFromSandbox(self.step.upload.proxy)
        except Exception as ex:
            # Still best effort - it wasn't in the sandbox, so it must be
            # somewhere else - but the failure is now recorded instead of
            # being discarded without a trace.
            logging.debug("Could not get proxy %s from the sandbox: %s",
                          self.step.upload.proxy, ex)

    # Search through steps for analysis files
    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            # Don't try to parse your own report; it's not there yet
            continue

        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report()
        stepReport.unpersist(reportLocation, step)

        # Don't upload nor stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        # Pulling out the analysis files from each step
        analysisFiles = stepReport.getAnalysisFilesFromStep(step)

        for analysisFile in analysisFiles:
            # only deal with DQM files
            if analysisFile.FileClass == "DQM":
                # uploading file to the server
                self.httpPost(os.path.join(stepLocation,
                                           os.path.basename(analysisFile.fileName)))

        # Done with this report: persist it
        stepReport.persist(reportLocation)

    return
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs to the condor queue in batches of self.jobsPerSubmit,
    using the schedd's submitMany call.

    Returns a tuple (successfulJobs, failedJobs). Jobs of a batch that
    was accepted get a 'gridid' of "<clusterId>.<index in batch>" and
    status 'Idle'. Jobs of a batch whose submission raised share one
    error report (JobSubmit, exit code 61202) under 'fwjr'.
    """
    successfulJobs, failedJobs = [], []

    if len(jobs) == 0:
        # Nothing to submit
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    for batch in grouper(jobs, self.jobsPerSubmit):
        clusterAd = self.getClusterAd()
        procAds = self.getProcAds(batch)

        logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
        try:
            # 4th argument has to be None otherwise HTCondor leaks the result ads
            # through it (as of 8.7.x). More info in WMCore/#8729
            clusterId = schedd.submitMany(clusterAd, procAds, False, None)
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.exception(str(ex))
            logging.error("Moving on the the next batch of jobs and/or cycle....")

            # The whole batch failed: every job shares the same error report
            batchErrorReport = Report()
            batchErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in batch:
                job['fwjr'] = batchErrorReport
                failedJobs.append(job)
            continue

        logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
        for procIndex, job in enumerate(batch):
            job['gridid'] = "%s.%s" % (clusterId, procIndex)
            job['status'] = 'Idle'
            successfulJobs.append(job)

    # Callers expect both lists back: submitted jobs and failed jobs
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def submit(self, jobs, info=None):
    """
    _submit_

    Submits jobs to the condor queue, in batches of ``self.jobsPerSubmit``.

    Each batch is turned into a submit request by
    ``self.createSubmitRequest`` and queued inside a schedd transaction.
    A batch whose submission raises is marked as failed: every job in it
    gets the same error report (exit code 61202) under ``job['fwjr']``.
    Jobs in a successful batch get ``gridid`` ("<clusterId>.<index>") and
    ``status`` 'Idle'.

    :param jobs: sequence of job dictionaries; empty or None means no work
    :param info: not used by this implementation
    :return: tuple (successfulJobs, failedJobs)
    """
    successfulJobs = []
    failedJobs = []

    if not jobs:
        # Then we have nothing to do (also covers jobs=None)
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):
        (sub, jobParams) = self.createSubmitRequest(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python Submit", len(jobParams))
        try:
            with schedd.transaction() as txn:
                submitRes = sub.queue_with_itemdata(txn, 1, iter(jobParams))
                clusterId = submitRes.cluster()
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.exception(str(ex))
            logging.error("Moving on the the next batch of jobs and/or cycle....")

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def testPileupFiles(self):
    """
    _testPileupFiles_

    Parse the pileup report and check how many input files of each
    input type it lists, and how many fallback files.
    """
    pileupReport = Report("cmsRun1")
    pileupReport.parse(self.pileupXmlPath)
    self.assertEqual(len(pileupReport.getAllInputFiles()), 14)

    # Tally the entries per input type; other types are ignored
    tally = {'mixingFiles': 0, 'primaryFiles': 0, 'secondaryFiles': 0}
    for inputFile in pileupReport.getAllInputFiles():
        inputType = inputFile['input_type']
        if inputType in tally:
            tally[inputType] += 1

    self.assertEqual(tally['primaryFiles'], 1)
    self.assertEqual(tally['secondaryFiles'], 0)
    self.assertEqual(tally['mixingFiles'], 13)
    self.assertEqual(len(pileupReport.getAllFallbackFiles()), 1)
    return
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription, in batches of ``self.jobsPerSubmit``.

    A batch whose submission raises is marked as failed: every job in it
    gets the same error report (exit code 61202) under ``job['fwjr']``.
    Jobs in a successful batch get ``gridid`` ("<clusterId>.<index>") and
    ``status`` 'Idle'.

    :param jobs: sequence of job dictionaries; empty or None means no work
    :param info: not used by this implementation
    :return: tuple (successfulJobs, failedJobs)
    """
    successfulJobs = []
    failedJobs = []

    if not jobs:
        # Then we have nothing to do (also covers jobs=None)
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):
        clusterAd = self.getClusterAd()
        procAds = self.getProcAds(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
        try:
            # Pass spool=False and the result-ads argument as None explicitly:
            # otherwise HTCondor leaks the result ads through the 4th
            # argument (as of 8.7.x). More info in WMCore/#8729
            clusterId = schedd.submitMany(clusterAd, procAds, False, None)
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.error("Moving on the the next batch of jobs and/or cycle....")
            logging.exception(ex)

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def testBadXMLParsing(self):
    """
    _testBadXMLParsing_

    Parsing a malformed CMSSW XML report must raise a
    FwkJobReportException and leave a 'BadFWJRXML' error with exit code
    50115 recorded for the step.
    """
    from WMCore.FwkJobReport.Report import FwkJobReportException

    brokenReport = Report("cmsRun1")
    with self.assertRaises(FwkJobReportException):
        brokenReport.parse(self.badxmlPath)

    recordedType = brokenReport.getStepErrors("cmsRun1")['error0'].type
    self.assertEqual(recordedType, 'BadFWJRXML')
    recordedCode = brokenReport.getStepErrors("cmsRun1")['error0'].exitCode
    self.assertEqual(recordedCode, 50115)
    return
def testOutputCheck(self):
    """
    _testOutputCheck_

    A report without output files must be flagged by
    checkForOutputFiles: the step is no longer successful and the exit
    code becomes 60450.
    """
    noOutputReport = Report("cmsRun1")
    noOutputReport.parse(self.skippedAllFilesxmlPath)
    noOutputReport.checkForOutputFiles("cmsRun1")

    stepIsGood = noOutputReport.stepSuccessful(stepName="cmsRun1")
    self.assertFalse(stepIsGood)
    exitCode = noOutputReport.getExitCode()
    self.assertEqual(exitCode, 60450)
    return
def testFallbackFiles(self):
    """
    _testFallback_

    Fallback files must show up in the report, and only when there are any.
    """
    # Negative control: a regular report carries no fallback files
    regularReport = Report("cmsRun1")
    regularReport.parse(self.xmlPath)
    self.assertEqual(regularReport.getAllFallbackFiles(), [])

    # A report whose input file was read through a fallback
    expectedFallback = [
        '/store/data/Run2012D/SingleElectron/AOD/PromptReco-v1/000/207/279/D43A5B72-1831-E211-895D-001D09F24763.root'
    ]
    fallbackReport = Report("cmsRun1")
    fallbackReport.parse(self.fallbackXmlPath)
    observedFallback = sorted(fallbackReport.getAllFallbackFiles())
    self.assertEqual(observedFallback, expectedFallback)
    return
def testDeleteOutputModule(self):
    """
    _testDeleteOutputModule_

    Deleting an existing output module removes it from the step and
    lowers the number of output modules by exactly one.
    """
    stepName = "cmsRun1"
    moduleName = "outputALCARECORECO"

    report = Report(stepName)
    report.parse(self.xmlPath)

    # Sanity check: the module has to be there before we delete it
    self.assertTrue(report.getOutputModule(stepName, moduleName),
                    "Error: Report XML doesn't have the module for the test, invalid test")
    modulesBefore = len(report.retrieveStep(stepName).outputModules)

    report.deleteOutputModuleForStep(stepName, moduleName)

    self.assertFalse(report.getOutputModule(stepName, moduleName),
                     "Error: The output module persists after deletion")
    modulesAfter = len(report.retrieveStep(stepName).outputModules)
    self.assertEqual(modulesAfter, modulesBefore - 1,
                     "Error: The number of output modules is incorrect after deletion")
def testFallbackFilesJSON(self):
    """
    _testFallbackFilesJSON_

    Fallback files must be carried over into the JSON form of the report.
    """
    # Negative control: a regular report has an empty fallback list
    regularReport = Report("cmsRun1")
    regularReport.parse(self.xmlPath)
    regularFallback = regularReport.__to_json__(None)['fallbackFiles']
    self.assertEqual(regularFallback, [])

    # A report with one fallback file
    fallbackReport = Report("cmsRun1")
    fallbackReport.parse(self.fallbackXmlPath)
    fallbackCount = len(fallbackReport.__to_json__(None)['fallbackFiles'])
    self.assertEqual(fallbackCount, 1)
    return
def mergeReport(self):
    """
    _mergeReport_

    Fill a new Report for this step from the merge report file located
    in the working directory, and return it.
    """
    stepReport = Report(self.stepName)
    mergeReportPath = os.path.join(self.workingDir, self.merge_report_file)
    ReportReader.xmlToJobReport(stepReport, mergeReportPath)
    return stepReport
def _handleSubmitFailedJobs(self, badJobs, exitCode):
    """
    __handleSubmitFailedJobs_

    Build a default job report carrying exitCode for every job in
    badJobs, attach it to the job and save it in the job cache directory.
    Then propagate the "submitfailed" state and register the paths of
    the reports that could be written.
    """
    savedReports = []
    for job in badJobs:
        report = Report()
        job['couch_record'] = None
        job['fwjr'] = report

        baseMessage = WM_JOB_ERROR_CODES[exitCode]
        if exitCode in (71102, 71104):
            details = baseMessage + ', '.join(job['possibleLocations'])
        elif exitCode == 71101:
            # there is no possible site
            if "fileLocations" in job:
                details = ''.join([baseMessage,
                                   ": file locations: ", ', '.join(job['fileLocations']),
                                   ": site white list: ", ', '.join(job['siteWhitelist']),
                                   ": site black list: ", ', '.join(job['siteBlacklist'])])
            else:
                # Temporary addition for agents patched in place: jobs created
                # before the patch have no fileLocations set.
                # TODO. remove this later for new agent
                inputNote = ": Job is created before this patch. Please check this input for the jobs: %s " % \
                            report.getAllInputFiles()
                details = baseMessage + inputNote
        else:
            details = baseMessage
        report.addError("JobSubmit", exitCode, "SubmitFailed", details)

        reportPath = os.path.join(job['cache_dir'], 'Report.%d.pkl' % int(job['retry_count']))
        report.setJobID(job['id'])
        try:
            report.save(reportPath)
            savedReports.append({"jobid": job["id"], "fwjrpath": reportPath})
        except IOError as ioer:
            logging.error("Failed to write FWJR for submit failed job %d, message: %s",
                          job['id'], str(ioer))

    self.changeState.propagate(badJobs, "submitfailed", "created")
    self.setFWJRPathAction.execute(binds=savedReports)
    return
def post(self, emulator=None):
    """
    _post_

    Post execution checkpointing: for every other successful step with a
    report on disk, copy the user DN, async destination, VO group and VO
    role from this step onto each file reference that has an lfn, a
    location and a guid, then persist the report again.
    """
    # Another emulator check
    if emulator is not None:
        return emulator.emulatePost(self.step)

    logging.info("Steps.Executors.%s.post called", self.__class__.__name__)

    requiredAttrs = ('lfn', 'location', 'guid')
    # (attribute set on the file, attribute read from the step, default)
    userFields = (('user_dn', "userDN", None),
                  ('async_dest', "asyncDest", None),
                  ('user_vogroup', "owner_vogroup", ''),
                  ('user_vorole', "owner_vorole", ''))

    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            # Don't try to parse your own report; it's not there yet
            continue

        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)

        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # Load the persisted report of that step
        stepReport = Report(step)
        stepReport.unpersist(reportLocation)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        for fileRef in stepReport.getAllFileRefsFromStep(step=step):
            if not all(hasattr(fileRef, attr) for attr in requiredAttrs):
                continue
            for target, source, default in userFields:
                setattr(fileRef, target, getattr(self.step, source, default))

        stepReport.persist(reportLocation)

    return None
def makeReport(self, fileName):
    """
    _makeReport_

    Build a small report ('oneitem') with a 'stageOut1' step, two output
    modules and three one-event output files, and persist it to fileName.

    :param fileName: path handed to ``Report.persist``
    """
    myReport = Report('oneitem')
    myReport.addStep('stageOut1')

    # The return values of the add* calls are not needed here
    myReport.addOutputModule('module1')
    myReport.addOutputModule('module2')
    myReport.addOutputFile('module1', {'lfn': 'FILE1', 'size': 1, 'events': 1})
    myReport.addOutputFile('module2', {'lfn': 'FILE2', 'size': 1, 'events': 1})
    myReport.addOutputFile('module2', {'lfn': 'FILE3', 'size': 1, 'events': 1})

    myReport.persist(fileName)
def failJobs(self, failedJobs):
    """
    _failJobs_

    Dump those jobs that have failed due to timeout.

    For each job the framework job report is loaded from its cache
    directory; if that fails, a new report with a NoJobReport error
    (99303) is generated and saved instead. The report paths are stored
    and the jobs are moved to 'jobfailed' inside one transaction of the
    current thread.

    :param failedJobs: list of job objects; empty or None means no work
    """
    if not failedJobs:
        return

    jrBinds = []
    for job in failedJobs:
        # Make sure the job object goes packed with fwjr_path to be persisted in couch
        jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count']))
        jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})

        fwjr = Report()
        try:
            fwjr.load(jrPath)
        except Exception:
            # Something went wrong reading the pickle
            logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
            fwjr = Report()
            fwjr.addError("NoJobReport", 99303, "NoJobReport", WM_JOB_ERROR_CODES[99303])
            fwjr.save(jrPath)
        job["fwjr"] = fwjr

    # current_thread() replaces the deprecated currentThread() alias
    myThread = threading.current_thread()
    myThread.transaction.begin()
    self.setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True)
    self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
    logging.info("Failed %i jobs", len(failedJobs))
    myThread.transaction.commit()

    return
def loadJobReport(self, jobReportPath):
    """
    _loadJobReport_

    Load the framework job report stored at jobReportPath and return it
    as a Report instance.

    If the path is empty, missing on disk, points to an empty file,
    cannot be loaded, or the loaded report has no steps, the problem is
    logged and the result of self.createMissingFWKJR() is returned
    instead.
    """
    if not jobReportPath:
        logging.error("Bad FwkJobReport Path: %s", jobReportPath)
        return self.createMissingFWKJR(99999, "FWJR path is empty")

    # The path may carry a "file://" marker, which would confuse the
    # report loader, so strip it.
    localPath = jobReportPath.replace("file://", "")

    if not os.path.exists(localPath):
        logging.error("Bad FwkJobReport Path: %s", localPath)
        missingMsg = 'Cannot find file in jobReport path: %s' % localPath
        return self.createMissingFWKJR(99999, missingMsg)

    if not os.path.getsize(localPath):
        logging.error("Empty FwkJobReport: %s", localPath)
        emptyMsg = 'jobReport of size 0: %s ' % localPath
        return self.createMissingFWKJR(99998, emptyMsg)

    loadedReport = Report()
    try:
        loadedReport.load(localPath)
    except UnicodeDecodeError:
        logging.error("Hit UnicodeDecodeError exception while loading jobReport: %s", localPath)
        return self.createMissingFWKJR(99997, 'Found undecodable data in jobReport: {}'.format(localPath))
    except Exception as ex:
        logging.error("Error loading jobReport: {}\nDetails: {}".format(localPath, str(ex)))
        return self.createMissingFWKJR(99997, 'Cannot load jobReport')

    if loadedReport.listSteps():
        return loadedReport

    logging.error("FwkJobReport with no steps: %s", localPath)
    return self.createMissingFWKJR(99997, 'jobReport with no steps: %s ' % localPath)
def thrashCouch():
    """
    _thrashCouch_

    Stress the couch recording of state changes: 500 times, create a
    batch of jobs with createJobs() and record a chain of state changes
    for it through ChangeState.recordInCouch. A report loaded once from
    LoadTest00.pkl is attached to every job before the "success" record
    and detached again before the "cleanout" record.

    Reads the agent configuration from the WMAGENT_CONFIG environment
    variable.
    """
    config = loadConfigurationFile(os.environ["WMAGENT_CONFIG"])
    changeState = ChangeState(config)

    myReport = Report()
    myReport.unpersist(os.path.join(getWMBASE(),
                                    "test/python/WMComponent_t/JobAccountant_t/fwjrs/LoadTest00.pkl"))

    for _ in range(500):
        jobs = createJobs()
        changeState.recordInCouch(jobs, "created", "new")
        changeState.recordInCouch(jobs, "executing", "created")
        changeState.recordInCouch(jobs, "complete", "executing")

        for job in jobs:
            job["fwjr"] = myReport

        changeState.recordInCouch(jobs, "success", "complete")

        for job in jobs:
            job["fwjr"] = None

        changeState.recordInCouch(jobs, "cleanout", "success")

    return
def testUnitTestBackendNew(self):
    """
    _testUnitTestBackendNew_

    Set the cmsRun1 status to 1 in its persisted report, run the
    StageOut executor with overrides and the new stage out enabled, and
    check that neither 'hosts' nor 'test1/hosts' exists under testDir.
    """
    cmsRunReport = Report()
    reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')
    cmsRunReport.unpersist(reportPath)
    cmsRunReport.data.cmsRun1.status = 1
    cmsRunReport.persist(reportPath)

    executor = StageOutExecutor.StageOut()
    helper = StageOutTemplate.StageOutStepHelper(self.stepdata)
    overrides = (('command', 'test-win'),
                 ('option', ''),
                 ('phedex-node', 'charlie.sheen.biz'),
                 ('lfn-prefix', 'test-win'))
    for overrideName, overrideValue in overrides:
        helper.addOverride(override=overrideName, overrideValue=overrideValue)
    helper.setNewStageoutOverride(True)

    executor.initialise(self.stepdata, self.job)
    self.setLocalOverride(self.stepdata)
    executor.step = self.stepdata
    executor.execute()

    for relativePath in (('hosts',), ('test1', 'hosts')):
        self.assertFalse(os.path.exists(os.path.join(self.testDir, *relativePath)))
def initialise(self, step, job):
    """
    _initialise_

    Initialise the executor attributes for the given step and job:
    step name and space, task, workload, a fresh Report for the step, the
    error destination and the "execution" section of the step. The step
    status in the report starts at 1 (failed). If the step names an
    emulator, it is loaded and emulation mode is switched on.

    :param step: step configuration object
    :param job: job dictionary; ``job['id']`` is stored in the report
    """
    self.step = step
    self.job = job
    self.stepName = getStepName(self.step)
    self.stepSpace = getStepSpace(self.stepName)
    self.task = self.stepSpace.getWMTask()
    self.workload = self.stepSpace.taskSpace.workload
    self.report = Report(self.stepName)
    self.report.data.task = self.task.name()
    self.report.data.workload = self.stepSpace.taskSpace.workloadName()
    self.report.data.id = job['id']
    self.errorDestination = getStepErrorDestination(self.step)

    self.step.section_("execution")
    self.step.execution.exitStatus = 0
    self.step.execution.reportLocation = "%s/Report.pkl" % (self.stepSpace.location,)

    # Set overall step status to 1 (failed)
    self.report.setStepStatus(stepName=self.stepName, status=1)

    # Does the step contain settings for an emulator?
    # If so, load it up
    emulatorName = getattr(self.step.emulator, "emulatorName", None)
    if emulatorName is not None:
        self.emulator = getStepEmulator(emulatorName)
        self.emulator.initialise(self)
        self.emulationMode = True

    return
def testUnitTestBackend(self):
    """
    _testUnitTestBackend_

    Set the cmsRun1 status to 1 in its persisted report, run the
    LogArchive executor with overrides, and check that neither 'hosts'
    nor 'test1/hosts' exists under testDir.
    """
    cmsRunReport = Report()
    reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')
    cmsRunReport.unpersist(reportPath)
    cmsRunReport.data.cmsRun1.status = 1
    cmsRunReport.persist(reportPath)

    executor = LogArchiveExecutor.LogArchive()
    helper = LogArchiveTemplate.LogArchiveStepHelper(self.stepdata)
    overrides = (('command', 'test-win'),
                 ('option', ''),
                 ('se-name', 'charlie.sheen.biz'),
                 ('lfn-prefix', 'test-win'))
    for overrideName, overrideValue in overrides:
        helper.addOverride(override=overrideName, overrideValue=overrideValue)

    executor.initialise(self.stepdata, self.job)
    self.setLocalOverride(self.stepdata)
    executor.step = self.stepdata
    executor.execute()

    for relativePath in (('hosts',), ('test1', 'hosts')):
        self.assertFalse(os.path.exists(os.path.join(self.testDir, *relativePath)))
def generateCreateFailedReports(self, createFailedJobs):
    """
    _generateCreateFailedReports_

    Create and store FWJR for the jobs that failed on creation, leaving
    meaningful information about what happened with them.

    Each job gets a report with a CreationFailure error (99305) whose
    message is the job's "failedReason" when present. The report is saved
    as Report.0.pkl in the job cache; the paths of the reports that could
    be saved are then registered through ``self.setFWJRPath``.

    :param createFailedJobs: list of job objects; empty or None means no work
    """
    if not createFailedJobs:
        return

    defaultMsg = "There is a condition which assures that this job will fail if it's submitted"

    fjrsToSave = []
    for failedJob in createFailedJobs:
        report = Report()
        report.addError("CreationFailure", 99305, "CreationFailure",
                        failedJob.get("failedReason", defaultMsg))
        jobCache = failedJob.getCache()
        try:
            fjrPath = os.path.join(jobCache, "Report.0.pkl")
            report.save(fjrPath)
            fjrsToSave.append({"jobid": failedJob["id"], "fwjrpath": fjrPath})
            failedJob["fwjr"] = report
        except Exception:
            # Keep going with the other jobs, but record why this one failed
            logging.exception("Something went wrong while saving the report for job %s",
                              failedJob["id"])

    # current_thread() replaces the deprecated currentThread() alias
    myThread = threading.current_thread()
    self.setFWJRPath.execute(binds=fjrsToSave, conn=myThread.transaction.conn, transaction=True)

    return
def _handleSubmitFailedJobs(self, badJobs, exitCode):
    """
    __handleSubmitFailedJobs_

    Build a default job report carrying exitCode for every job in
    badJobs, attach it to the job and save it in the job cache directory.
    Then propagate the "submitfailed" state and register the paths of
    the reports that could be written.
    """
    savedReports = []
    for job in badJobs:
        report = Report()
        job['couch_record'] = None
        job['fwjr'] = report

        baseMessage = WM_JOB_ERROR_CODES[exitCode]
        if exitCode in (71102, 71103):
            # The site list goes both into the message and as an extra argument
            errorArgs = (baseMessage + ', '.join(job['possibleSites']),
                         ', '.join(job['possibleSites']))
        elif exitCode == 71101:
            # there is no possible site
            if job.get("fileLocations"):
                errorArgs = (''.join([baseMessage,
                                      ": file locations: ", ', '.join(job['fileLocations']),
                                      ": site white list: ", ', '.join(job['siteWhitelist']),
                                      ": site black list: ", ', '.join(job['siteBlacklist'])]),)
            else:
                errorArgs = (baseMessage + ', and empty fileLocations',)
        else:
            errorArgs = (baseMessage,)
        report.addError("JobSubmit", exitCode, "SubmitFailed", *errorArgs)

        reportPath = os.path.join(job['cache_dir'], 'Report.%d.pkl' % int(job['retry_count']))
        report.setJobID(job['id'])
        try:
            report.save(reportPath)
            savedReports.append({"jobid": job["id"], "fwjrpath": reportPath})
        except IOError as ioer:
            logging.error("Failed to write FWJR for submit failed job %d, message: %s",
                          job['id'], str(ioer))

    self.changeState.propagate(badJobs, "submitfailed", "created")
    self.setFWJRPathAction.execute(binds=savedReports)
    return
def testBadXMLParsing(self):
    """
    _testBadXMLParsing_

    Parsing a malformed CMSSW XML report must raise a
    FwkJobReportException.
    """
    from WMCore.FwkJobReport.Report import FwkJobReportException

    brokenReport = Report("cmsRun1")
    with self.assertRaises(FwkJobReportException):
        brokenReport.parse(self.badxmlPath)
    return
def testPerformanceJSON(self):
    """
    _testPerformanceJSON_

    Verify that the performance section of the report is correctly
    converted to JSON: the storage, memory and cpu sections exist and a
    few of their values match the PerformanceReport.xml fixture.
    """
    xmlPath = os.path.join(WMCore.WMBase.getTestBase(),
                           "WMCore_t/FwkJobReport_t/PerformanceReport.xml")

    myReport = Report("cmsRun1")
    myReport.parse(xmlPath)

    perfSection = myReport.__to_json__(thunker=None)["steps"]["cmsRun1"]["performance"]

    # dict.has_key() does not exist in Python 3; assertIn works everywhere
    self.assertIn("storage", perfSection, "Error: Storage section is missing.")
    self.assertIn("memory", perfSection, "Error: Memory section is missing.")
    self.assertIn("cpu", perfSection, "Error: CPU section is missing.")

    self.assertEqual(perfSection["cpu"]["AvgEventCPU"], "0.626105",
                     "Error: AvgEventCPU is wrong.")
    self.assertEqual(perfSection["cpu"]["TotalJobTime"], "23.5703",
                     "Error: TotalJobTime is wrong.")
    self.assertEqual(perfSection["storage"]["readTotalMB"], 39.6166,
                     "Error: readTotalMB is wrong.")
    self.assertEqual(perfSection["storage"]["readMaxMSec"], 320.653,
                     "Error: readMaxMSec is wrong")
    self.assertEqual(perfSection["memory"]["PeakValueRss"], "492.293",
                     "Error: PeakValueRss is wrong.")
    self.assertEqual(perfSection["memory"]["PeakValueVsize"], "643.281",
                     "Error: PeakValueVsize is wrong.")
    return
def testCPBackendStageOutAgainstReportOld(self):
    """
    _testCPBackendStageOutAgainstReportOld_

    Run the StageOut executor against the persisted cmsRun1 report: the
    first execution must raise an AssertionError, and after the output
    LFNs are rewritten the second one must create the FEVT and ALCARECO
    directories under testDir.
    """
    cmsRunReport = Report()
    reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')
    cmsRunReport.unpersist(reportPath)
    cmsRunReport.data.cmsRun1.status = 0
    cmsRunReport.persist(reportPath)

    executor = StageOutExecutor.StageOut()
    executor.initialise(self.stepdata, self.job)
    self.setLocalOverride(self.stepdata)
    executor.step = self.stepdata

    # It should fail with:
    # AssertionError: LFN candidate: hosts doesn't match any of the following regular expressions:
    self.assertRaises(AssertionError, executor.execute)

    # Now fix the output file names so they pass the Lexicon check
    # (they were 'hosts' and '/test1/hosts'), and execute it again
    cmsRunReport.unpersist(reportPath)
    outputs = cmsRunReport.data.cmsRun1.output
    outputs.FEVT.files.file0.lfn = "/store/mc/acqera/pd/FEVT/procstr/abc123.root"
    outputs.ALCARECOStreamCombined.files.file0.lfn = "/store/mc/acqera/pd/ALCARECO/procstr/abc123.root"
    cmsRunReport.persist(reportPath)
    executor.execute()

    for dataTier in ("FEVT", "ALCARECO"):
        tierDir = os.path.join(self.testDir, "store", "mc", "acqera", "pd", dataTier)
        self.assertTrue(os.path.exists(tierDir))
    return
def testPerformanceSummary(self):
    """
    _testPerformanceSummary_

    Parse PerformanceReport.xml and check a few values in the memory,
    cpu and storage sections of the cmsRun1 performance data.
    """
    reportXml = os.path.join(getTestBase(), "WMCore_t/FwkJobReport_t/PerformanceReport.xml")

    perfReport = Report("cmsRun1")
    perfReport.parse(reportXml)

    # Do a brief check of the three sections: (section, metric, expected)
    performance = perfReport.data.cmsRun1.performance
    expectations = (("memory", "PeakValueRss", '492.293'),
                    ("cpu", "TotalJobCPU", '9.16361'),
                    ("storage", "writeTotalMB", 5.22226),
                    ("storage", "writeTotalSecs", 60317.4),
                    ("storage", "readPercentageOps", 0.98585512216030857))
    for section, metric, expected in expectations:
        self.assertEqual(getattr(getattr(performance, section), metric), expected)
    return
def testCPBackendLogArchiveAgainstReportNew(self):
    """
    _testCPBackendLogArchiveAgainstReportNew_

    Set the cmsRun1 status to 0 in its persisted report, run the
    LogArchive executor with the new log archive override enabled, and
    check that 'hosts' and 'test1/hosts' exist under testDir.
    """
    cmsRunReport = Report()
    reportPath = os.path.join(self.testDir, 'UnitTests', 'WMTaskSpace', 'cmsRun1', 'Report.pkl')
    cmsRunReport.unpersist(reportPath)
    cmsRunReport.data.cmsRun1.status = 0
    cmsRunReport.persist(reportPath)

    executor = LogArchiveExecutor.LogArchive()
    executor.initialise(self.stepdata, self.job)
    self.setLocalOverride(self.stepdata)
    self.stepdata.override.newLogArchive = True
    executor.step = self.stepdata
    executor.execute()

    for relativePath in (('hosts',), ('test1', 'hosts')):
        self.assertTrue(os.path.exists(os.path.join(self.testDir, *relativePath)))