def testFailedJobsUniqueWf(self):
    """
    Performance test of failedJobs with all failed jobs belonging
    to the same workflow and the same task name
    """
    loadList = []
    for i in range(1, 5000):
        loadList.append(self.jobConfig('wf1', '/wf1/task1', i, 'lfn1'))

    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database="wmcore-acdc-datacollectionsvc")
    dcs.failedJobs(loadList)
    return
def createResubmitSpec(self, serverUrl, couchDB):
    """
    _createResubmitSpec_

    Create a bogus resubmit workload.
    """
    self.site = "cmssrm.fnal.gov"
    workload = WMWorkloadHelper(WMWorkload("TestWorkload"))
    reco = workload.newTask("reco")
    workload.setOwnerDetails(name = "evansde77", group = "DMWM")

    # first task uses the input dataset
    reco.addInputDataset(primary = "PRIMARY",
                         processed = "processed-v1",
                         tier = "TIER1")
    reco.data.input.splitting.algorithm = "File"
    reco.setTaskType("Processing")
    cmsRunReco = reco.makeStep("cmsRun1")
    cmsRunReco.setStepType("CMSSW")
    reco.applyTemplates()
    cmsRunRecoHelper = cmsRunReco.getTypeHelper()
    cmsRunRecoHelper.addOutputModule("outputRECO",
                                     primaryDataset = "PRIMARY",
                                     processedDataset = "processed-v2",
                                     dataTier = "TIER2",
                                     lfnBase = "/store/dunkindonuts",
                                     mergedLFNBase = "/store/kfc")

    dcs = DataCollectionService(url = serverUrl, database = couchDB)

    def getJob(workload):
        job = Job()
        job["task"] = workload.getTask("reco").getPathName()
        job["workflow"] = workload.name()
        job["location"] = self.site
        job["owner"] = "evansde77"
        job["group"] = "DMWM"
        return job

    testFileA = WMFile(lfn = makeUUID(), size = 1024, events = 1024)
    testFileA.setLocation([self.site])
    testFileA.addRun(Run(1, 1, 2))
    testFileB = WMFile(lfn = makeUUID(), size = 1024, events = 1024)
    testFileB.setLocation([self.site])
    testFileB.addRun(Run(1, 3, 4))

    testJobA = getJob(workload)
    testJobA.addFile(testFileA)
    testJobA.addFile(testFileB)

    dcs.failedJobs([testJobA])

    topLevelTask = workload.getTopLevelTask()[0]
    workload.truncate("Resubmit_TestWorkload", topLevelTask.getPathName(),
                      serverUrl, couchDB)

    return workload
def testFailedJobsScrambledWf(self):
    """
    Performance test of failedJobs where jobs belong to 10 different
    workflows and 3 different tasks
    """
    loadList = []
    for i in range(1, 5000):
        wfName = "wf%d" % (i % 10)
        taskName = "/wf%d/task%d" % (i % 10, i % 3)
        loadList.append(self.jobConfig(wfName, taskName, i, '/file/name/lfn1'))

    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database="wmcore-acdc-datacollectionsvc")
    dcs.failedJobs(loadList)
    return
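# --- Illustrative helper (assumption, not part of the original module) ---
# Both performance tests above call self.jobConfig(workflow, task, jobid, lfn),
# which is defined elsewhere in this test class and not shown in this excerpt.
# A minimal sketch of what such a helper could return so that
# DataCollectionService.failedJobs() accepts it: a Job-like object carrying a
# workflow, task, location, owner/group and at least one input file.  All names,
# the site, and the run/lumi choices below are assumptions for illustration only.
def jobConfig(self, wf, task, jobid, lfn):
    from WMCore.DataStructs.Job import Job
    from WMCore.DataStructs.File import File
    from WMCore.DataStructs.Run import Run

    job = Job()
    job["task"] = task
    job["workflow"] = wf
    job["location"] = "T1_US_FNAL_Disk"   # assumed site name
    job["owner"] = "unknown"
    job["group"] = "unknown"
    job["id"] = jobid

    testFile = File(lfn=lfn, size=1024, events=1024)
    testFile.setLocation(["T1_US_FNAL_Disk"])
    testFile.addRun(Run(1, jobid))        # one run/lumi per job keeps the test cheap
    job.addFile(testFile)
    return job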
def testC_ACDCTest(self):
    """
    _ACDCTest_

    Test whether we can get a goodRunList out of ACDC
    and process it correctly.
    """
    workload = self.createTestWorkload()
    dcs = DataCollectionService(url=self.testInit.couchUrl,
                                database=self.testInit.couchDbName)

    testFileA = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileA.addRun(Run(1, 1, 2))
    testFileA.create()
    testFileB = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileB.addRun(Run(1, 3))
    testFileB.create()
    testJobA = getJob(workload)
    testJobA.addFile(testFileA)
    testJobA.addFile(testFileB)

    testFileC = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileC.addRun(Run(1, 4, 6))
    testFileC.create()
    testJobB = getJob(workload)
    testJobB.addFile(testFileC)

    testFileD = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileD.addRun(Run(1, 7))
    testFileD.create()
    testJobC = getJob(workload)
    testJobC.addFile(testFileD)

    testFileE = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileE.addRun(Run(1, 11, 12))
    testFileE.create()
    testJobD = getJob(workload)
    testJobD.addFile(testFileE)

    testFileF = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileF.addRun(Run(2, 5, 6, 7))
    testFileF.create()
    testJobE = getJob(workload)
    testJobE.addFile(testFileF)

    testFileG = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileG.addRun(Run(2, 10, 11, 12))
    testFileG.create()
    testJobF = getJob(workload)
    testJobF.addFile(testFileG)

    testFileH = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileH.addRun(Run(2, 15))
    testFileH.create()
    testJobG = getJob(workload)
    testJobG.addFile(testFileH)

    testFileI = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileI.addRun(Run(3, 20))
    testFileI.create()
    testJobH = getJob(workload)
    testJobH.addFile(testFileI)

    testFileJ = File(lfn=makeUUID(), size=1024, events=1024,
                     locations="T1_US_FNAL_Disk")
    testFileJ.addRun(Run(1, 9))
    testFileJ.create()
    testJobI = getJob(workload)
    testJobI.addFile(testFileJ)

    # dcs.failedJobs([testJobA, testJobB, testJobC, testJobD, testJobE,
    #                 testJobF, testJobG, testJobH, testJobI])
    dcs.failedJobs([testJobA, testJobD, testJobH])

    baseName = makeUUID()

    testFileset = Fileset(name=baseName)
    testFileset.create()
    testFileset.addFile(testFileA)
    testFileset.addFile(testFileB)
    testFileset.addFile(testFileC)
    testFileset.addFile(testFileD)
    testFileset.addFile(testFileE)
    testFileset.addFile(testFileF)
    testFileset.addFile(testFileG)
    testFileset.addFile(testFileH)
    testFileset.addFile(testFileI)
    testFileset.addFile(testFileJ)
    testFileset.commit()

    testSubscription = Subscription(fileset=testFileset,
                                    workflow=self.testWorkflow,
                                    split_algo="LumiBased",
                                    type="Processing")
    testSubscription.create()

    splitter = SplitterFactory()
    jobFactory = splitter(package="WMCore.WMBS",
                          subscription=testSubscription)

    jobGroups = jobFactory(lumis_per_job=100,
                           halt_job_on_file_boundaries=False,
                           splitOnRun=True,
                           collectionName=workload.name(),
                           filesetName=workload.getTask("reco").getPathName(),
                           owner="evansde77",
                           group="DMWM",
                           couchURL=self.testInit.couchUrl,
                           couchDB=self.testInit.couchDbName,
                           performance=self.performanceParams)

    self.assertEqual(jobGroups[0].jobs[0]['mask'].getRunAndLumis(),
                     {1: [[1, 2], [3, 3], [11, 12]]})
    self.assertEqual(jobGroups[0].jobs[1]['mask'].getRunAndLumis(),
                     {3: [[20, 20]]})
    return
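# --- Illustrative helper (assumption, not part of the original module) ---
# testC_ACDCTest above relies on a module-level getJob(workload) helper that is
# not shown in this excerpt.  Judging from the nested getJob() inside
# createResubmitSpec, it presumably builds a Job bound to the "reco" task of the
# given workload; the sketch below mirrors that and is an assumption, not the
# original code (it reuses the Job class already imported by this test module).
def getJob(workload):
    job = Job()
    job["task"] = workload.getTask("reco").getPathName()
    job["workflow"] = workload.name()
    job["location"] = "T1_US_FNAL_Disk"   # matches the file locations used in the test
    job["owner"] = "evansde77"
    job["group"] = "DMWM"
    return job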
class AccountantWorker(WMConnectionBase):
    """
    Class that actually does the work of parsing FWJRs for the Accountant
    Run through ProcessPool
    """

    def __init__(self, config):
        """
        __init__

        Create all DAO objects that are used by this class.
        """
        WMConnectionBase.__init__(self, "WMCore.WMBS")
        myThread = threading.currentThread()
        self.dbsDaoFactory = DAOFactory(package="WMComponent.DBS3Buffer",
                                        logger=myThread.logger,
                                        dbinterface=myThread.dbi)

        self.getOutputMapAction = self.daofactory(classname="Jobs.GetOutputMap")
        self.bulkAddToFilesetAction = self.daofactory(classname="Fileset.BulkAddByLFN")
        self.bulkParentageAction = self.daofactory(classname="Files.AddBulkParentage")
        self.getJobTypeAction = self.daofactory(classname="Jobs.GetType")
        self.getParentInfoAction = self.daofactory(classname="Files.GetParentInfo")
        self.setParentageByJob = self.daofactory(classname="Files.SetParentageByJob")
        self.setParentageByMergeJob = self.daofactory(classname="Files.SetParentageByMergeJob")
        self.setFileRunLumi = self.daofactory(classname="Files.AddRunLumi")
        self.setFileLocation = self.daofactory(classname="Files.SetLocationByLFN")
        self.setFileAddChecksum = self.daofactory(classname="Files.AddChecksumByLFN")
        self.addFileAction = self.daofactory(classname="Files.Add")
        self.jobCompleteInput = self.daofactory(classname="Jobs.CompleteInput")
        self.setBulkOutcome = self.daofactory(classname="Jobs.SetOutcomeBulk")
        self.getWorkflowSpec = self.daofactory(classname="Workflow.GetSpecAndNameFromTask")
        self.getJobInfoByID = self.daofactory(classname="Jobs.LoadFromID")
        self.getFullJobInfo = self.daofactory(classname="Jobs.LoadForErrorHandler")
        self.getJobTaskNameAction = self.daofactory(classname="Jobs.GetFWJRTaskName")
        self.pnn_to_psn = self.daofactory(classname="Locations.GetPNNtoPSNMapping").execute()

        self.dbsStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.SetStatus")
        self.dbsParentStatusAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetParentStatus")
        self.dbsChildrenAction = self.dbsDaoFactory(classname="DBSBufferFiles.GetChildren")
        self.dbsCreateFiles = self.dbsDaoFactory(classname="DBSBufferFiles.Add")
        self.dbsSetLocation = self.dbsDaoFactory(classname="DBSBufferFiles.SetLocationByLFN")
        self.dbsInsertLocation = self.dbsDaoFactory(classname="DBSBufferFiles.AddLocation")
        self.dbsSetChecksum = self.dbsDaoFactory(classname="DBSBufferFiles.AddChecksumByLFN")
        self.dbsSetRunLumi = self.dbsDaoFactory(classname="DBSBufferFiles.AddRunLumi")
        self.dbsGetWorkflow = self.dbsDaoFactory(classname="ListWorkflow")
        self.dbsLFNHeritage = self.dbsDaoFactory(classname="DBSBufferFiles.BulkHeritageParent")

        self.stateChanger = ChangeState(config)

        # Decide whether or not to attach jobReport to returned value
        self.returnJobReport = getattr(config.JobAccountant, 'returnReportFromWorker', False)

        # Store location for the specs for DBS
        self.specDir = getattr(config.JobAccountant, 'specDir', None)

        # maximum RAW EDM size for Repack output before data is put into
        # Error dataset and skips PromptReco
        self.maxAllowedRepackOutputSize = getattr(config.JobAccountant,
                                                  'maxAllowedRepackOutputSize',
                                                  12 * 1024 * 1024 * 1024)

        # ACDC service
        self.dataCollection = DataCollectionService(url=config.ACDC.couchurl,
                                                    database=config.ACDC.database)

        jobDBurl = sanitizeURL(config.JobStateMachine.couchurl)['url']
        jobDBName = config.JobStateMachine.couchDBName
        jobCouchdb = CouchServer(jobDBurl)
        self.fwjrCouchDB = jobCouchdb.connectDatabase("%s/fwjrs" % jobDBName)
        self.localWMStats = WMStatsWriter(config.TaskArchiver.localWMStatsURL,
                                          appName="WMStatsAgent")

        # Hold data for later committal
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        self.count = 0
        self.datasetAlgoID = collections.deque(maxlen=1000)
        self.datasetAlgoPaths = collections.deque(maxlen=1000)
        self.dbsLocations = set()
        self.workflowIDs = collections.deque(maxlen=1000)
        self.workflowPaths = collections.deque(maxlen=1000)

        self.phedex = PhEDEx()
        self.locLists = self.phedex.getNodeMap()

        return

    def reset(self):
        """
        _reset_

        Reset all global vars between runs.
        """
        self.dbsFilesToCreate = []
        self.wmbsFilesToBuild = []
        self.wmbsMergeFilesToBuild = []
        self.fileLocation = None
        self.mergedOutputFiles = []
        self.listOfJobsToSave = []
        self.listOfJobsToFail = []
        self.filesetAssoc = []
        self.parentageBinds = []
        self.parentageBindsForMerge = []
        self.jobsWithSkippedFiles = {}
        gc.collect()
        return

    def loadJobReport(self, parameters):
        """
        _loadJobReport_

        Given a framework job report on disk, load it and return a
        FwkJobReport instance.  If there is any problem loading or parsing
        the framework job report return None.
        """
        # The jobReportPath may be prefixed with "file://" which needs to be
        # removed so it doesn't confuse the FwkJobReport() parser.
        jobReportPath = parameters.get("fwjr_path", None)
        if not jobReportPath:
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999, "FWJR path is empty")

        jobReportPath = jobReportPath.replace("file://", "")
        if not os.path.exists(jobReportPath):
            logging.error("Bad FwkJobReport Path: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99999,
                                           'Cannot find file in jobReport path: %s' % jobReportPath)

        if os.path.getsize(jobReportPath) == 0:
            logging.error("Empty FwkJobReport: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99998,
                                           'jobReport of size 0: %s ' % jobReportPath)

        jobReport = Report()

        try:
            jobReport.load(jobReportPath)
        except Exception as ex:
            msg = "Error loading jobReport %s\n" % jobReportPath
            msg += str(ex)
            logging.error(msg)
            logging.debug("Failing job: %s\n" % parameters)
            return self.createMissingFWKJR(parameters, 99997, 'Cannot load jobReport')

        if len(jobReport.listSteps()) == 0:
            logging.error("FwkJobReport with no steps: %s" % jobReportPath)
            return self.createMissingFWKJR(parameters, 99997,
                                           'jobReport with no steps: %s ' % jobReportPath)

        return jobReport

    def isTaskExistInFWJR(self, jobReport, jobStatus):
        """
        If taskName is not available in the FWJR, then tries to
        recover it getting data from the SQL database.
        """
        if not jobReport.getTaskName():
            logging.warning("Trying to recover a corrupted FWJR for a %s job with job id %s" %
                            (jobStatus, jobReport.getJobID()))
            jobInfo = self.getJobTaskNameAction.execute(jobId=jobReport.getJobID(),
                                                        conn=self.getDBConn(),
                                                        transaction=self.existingTransaction())

            jobReport.setTaskName(jobInfo['taskName'])
            jobReport.save(jobInfo['fwjr_path'])
            if not jobReport.getTaskName():
                msg = "Report to developers. Failed to recover corrupted fwjr for %s job id %s" % \
                      (jobStatus, jobReport.getJobID())
                raise AccountantWorkerException(msg)
            else:
                logging.info("TaskName '%s' successfully recovered and added to fwjr id %s." %
                             (jobReport.getTaskName(), jobReport.getJobID()))

        return

    def __call__(self, parameters):
        """
        __call__

        Handle a completed job.  The parameters dictionary will contain the
        job ID and the path to the framework job report.
        """
        returnList = []
        self.reset()

        for job in parameters:
            logging.info("Handling %s" % job["fwjr_path"])

            # Load the job and set the ID
            fwkJobReport = self.loadJobReport(job)
            fwkJobReport.setJobID(job['id'])

            jobSuccess = self.handleJob(jobID=job["id"], fwkJobReport=fwkJobReport)

            if self.returnJobReport:
                returnList.append({'id': job["id"], 'jobSuccess': jobSuccess,
                                   'jobReport': fwkJobReport})
            else:
                returnList.append({'id': job["id"], 'jobSuccess': jobSuccess})

            self.count += 1

        self.beginTransaction()

        # Now things done at the end of the job
        # Do what we can with WMBS files
        self.handleWMBSFiles(self.wmbsFilesToBuild, self.parentageBinds)

        # handle merge files separately since parentage needs to be set
        # separately to support robust merge
        self.handleWMBSFiles(self.wmbsMergeFilesToBuild, self.parentageBindsForMerge)

        # Create DBSBufferFiles
        self.createFilesInDBSBuffer()

        # Handle filesetAssoc
        if len(self.filesetAssoc) > 0:
            self.bulkAddToFilesetAction.execute(binds=self.filesetAssoc,
                                                conn=self.getDBConn(),
                                                transaction=self.existingTransaction())

        # Move successful jobs to successful
        if len(self.listOfJobsToSave) > 0:
            idList = [x['id'] for x in self.listOfJobsToSave]
            outcomeBinds = [{'jobid': x['id'], 'outcome': x['outcome']}
                            for x in self.listOfJobsToSave]
            self.setBulkOutcome.execute(binds=outcomeBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.jobCompleteInput.execute(id=idList,
                                          lfnsToSkip=self.jobsWithSkippedFiles,
                                          conn=self.getDBConn(),
                                          transaction=self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToSave, "success", "complete")

        # If we have failed jobs, fail them
        if len(self.listOfJobsToFail) > 0:
            outcomeBinds = [{'jobid': x['id'], 'outcome': x['outcome']}
                            for x in self.listOfJobsToFail]
            self.setBulkOutcome.execute(binds=outcomeBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())
            self.stateChanger.propagate(self.listOfJobsToFail, "jobfailed", "complete")

        # Arrange WMBS parentage
        if len(self.parentageBinds) > 0:
            self.setParentageByJob.execute(binds=self.parentageBinds,
                                           conn=self.getDBConn(),
                                           transaction=self.existingTransaction())
        if len(self.parentageBindsForMerge) > 0:
            self.setParentageByMergeJob.execute(binds=self.parentageBindsForMerge,
                                                conn=self.getDBConn(),
                                                transaction=self.existingTransaction())

        # Straighten out DBS Parentage
        if len(self.mergedOutputFiles) > 0:
            self.handleDBSBufferParentage()

        if len(self.jobsWithSkippedFiles) > 0:
            self.handleSkippedFiles()

        self.commitTransaction(existingTransaction=False)

        return returnList

    def outputFilesetsForJob(self, outputMap, merged, moduleLabel):
        """
        _outputFilesetsForJob_

        Determine if the file should be placed in any other fileset.  Note that
        this will not return the JobGroup output fileset as all jobs will have
        their output placed there.
        """
        if moduleLabel not in outputMap:
            logging.info("Output module label missing from output map.")
            return []

        outputFilesets = []
        for outputFileset in outputMap[moduleLabel]:
            if merged == False and outputFileset["output_fileset"] != None:
                outputFilesets.append(outputFileset["output_fileset"])
            else:
                if outputFileset["merged_output_fileset"] != None:
                    outputFilesets.append(outputFileset["merged_output_fileset"])

        return outputFilesets

    def addFileToDBS(self, jobReportFile, task, errorDataset=False):
        """
        _addFileToDBS_

        Add a file that was output from a job to the DBS buffer.
        """
        datasetInfo = jobReportFile["dataset"]

        dbsFile = DBSBufferFile(lfn=jobReportFile["lfn"],
                                size=jobReportFile["size"],
                                events=jobReportFile["events"],
                                checksums=jobReportFile["checksums"],
                                status="NOTUPLOADED")
        dbsFile.setAlgorithm(appName=datasetInfo["applicationName"],
                             appVer=datasetInfo["applicationVersion"],
                             appFam=jobReportFile["module_label"],
                             psetHash="GIBBERISH",
                             configContent=jobReportFile.get('configURL'))

        if errorDataset:
            dbsFile.setDatasetPath("/%s/%s/%s" % (datasetInfo["primaryDataset"] + "-Error",
                                                  datasetInfo["processedDataset"],
                                                  datasetInfo["dataTier"]))
        else:
            dbsFile.setDatasetPath("/%s/%s/%s" % (datasetInfo["primaryDataset"],
                                                  datasetInfo["processedDataset"],
                                                  datasetInfo["dataTier"]))

        dbsFile.setValidStatus(validStatus=jobReportFile.get("validStatus", None))
        dbsFile.setProcessingVer(ver=jobReportFile.get('processingVer', None))
        dbsFile.setAcquisitionEra(era=jobReportFile.get('acquisitionEra', None))
        dbsFile.setGlobalTag(globalTag=jobReportFile.get('globalTag', None))
        # TODO need to find where to get the prep id
        dbsFile.setPrepID(prep_id=jobReportFile.get('prep_id', None))
        dbsFile['task'] = task

        for run in jobReportFile["runs"]:
            newRun = Run(runNumber=run.run)
            newRun.extend(run.lumis)
            dbsFile.addRun(newRun)

        dbsFile.setLocation(pnn=list(jobReportFile["locations"])[0], immediateSave=False)

        self.dbsFilesToCreate.append(dbsFile)

        return

    def findDBSParents(self, lfn):
        """
        _findDBSParents_

        Find the parent of the file in DBS.
        This is meant to be called recursively.
        """
        parentsInfo = self.getParentInfoAction.execute([lfn],
                                                       conn=self.getDBConn(),
                                                       transaction=self.existingTransaction())
        newParents = set()
        for parentInfo in parentsInfo:
            # This will catch straight to merge files that do not have redneck
            # parents.  We will mark the straight to merge file from the job
            # as a child of the merged parent.
            if int(parentInfo["merged"]) == 1:
                newParents.add(parentInfo["lfn"])

            elif parentInfo['gpmerged'] == None:
                continue

            # Handle the files that result from merge jobs that aren't redneck
            # children.  We have to setup parentage and then check on whether or
            # not this file has any redneck children and update their parentage
            # information.
            elif int(parentInfo["gpmerged"]) == 1:
                newParents.add(parentInfo["gplfn"])

            # If that didn't work, we've reached the great-grandparents
            # And we have to work via recursion
            else:
                parentSet = self.findDBSParents(lfn=parentInfo['gplfn'])
                for parent in parentSet:
                    newParents.add(parent)

        return newParents

    def addFileToWMBS(self, jobType, fwjrFile, jobMask, task, jobID=None):
        """
        _addFileToWMBS_

        Add a file that was produced in a job to WMBS.
        """
        fwjrFile["first_event"] = jobMask["FirstEvent"]

        if fwjrFile["first_event"] == None:
            fwjrFile["first_event"] = 0

        if jobType == "Merge" and fwjrFile["module_label"] != "logArchive":
            setattr(fwjrFile["fileRef"], 'merged', True)
            fwjrFile["merged"] = True

        wmbsFile = self.createFileFromDataStructsFile(file=fwjrFile, jobID=jobID)

        if jobType == "Merge":
            self.wmbsMergeFilesToBuild.append(wmbsFile)
        else:
            self.wmbsFilesToBuild.append(wmbsFile)

        if fwjrFile["merged"]:
            self.addFileToDBS(fwjrFile, task,
                              jobType == "Repack" and fwjrFile["size"] > self.maxAllowedRepackOutputSize)

        return wmbsFile

    def _mapLocation(self, fwkJobReport):
        for file in fwkJobReport.getAllFileRefs():
            if file and hasattr(file, 'location'):
                file.location = self.phedex.getBestNodeName(file.location, self.locLists)

    def handleJob(self, jobID, fwkJobReport):
        """
        _handleJob_

        Figure out if a job was successful or not, handle it appropriately
        (parse FWJR, update WMBS) and return the success status as a boolean.
        """
        jobSuccess = fwkJobReport.taskSuccessful()

        outputMap = self.getOutputMapAction.execute(jobID=jobID,
                                                    conn=self.getDBConn(),
                                                    transaction=self.existingTransaction())

        jobType = self.getJobTypeAction.execute(jobID=jobID,
                                                conn=self.getDBConn(),
                                                transaction=self.existingTransaction())

        if jobSuccess:
            fileList = fwkJobReport.getAllFiles()

            # consistency check comparing outputMap to fileList
            # they should match except for some limited special cases
            outputModules = set([])
            for fwjrFile in fileList:
                outputModules.add(fwjrFile['outputModule'])
            if set(outputMap.keys()) == outputModules:
                pass
            elif jobType == "LogCollect" and len(outputMap.keys()) == 0 and \
                    outputModules == set(['LogCollect']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set(['Merged', 'MergedError', 'logArchive']) and \
                    outputModules == set(['Merged', 'logArchive']):
                pass
            elif jobType == "Merge" and set(outputMap.keys()) == set(['Merged', 'MergedError', 'logArchive']) and \
                    outputModules == set(['MergedError', 'logArchive']):
                pass
            elif jobType == "Express" and \
                    set(outputMap.keys()).difference(outputModules) == set(['write_RAW']):
                pass
            else:
                failJob = True
                if jobType in ["Processing", "Production"]:
                    cmsRunSteps = 0
                    for step in fwkJobReport.listSteps():
                        if step.startswith("cmsRun"):
                            cmsRunSteps += 1
                    if cmsRunSteps > 1:
                        failJob = False

                if failJob:
                    jobSuccess = False
                    logging.error("Job %d , list of expected outputModules does not match job report, failing job", jobID)
                    logging.debug("Job %d , expected outputModules %s", jobID, sorted(outputMap.keys()))
                    logging.debug("Job %d , fwjr outputModules %s", jobID, sorted(outputModules))
                    fileList = fwkJobReport.getAllFilesFromStep(step='logArch1')
                else:
                    logging.debug("Job %d , list of expected outputModules does not match job report, accepted for multi-step CMSSW job", jobID)
        else:
            fileList = fwkJobReport.getAllFilesFromStep(step='logArch1')

        if jobSuccess:
            logging.info("Job %d , handle successful job", jobID)
        else:
            logging.warning("Job %d , bad jobReport, failing job", jobID)

        # make sure the task name is present in FWJR (recover from WMBS if needed)
        if len(fileList) > 0:
            if jobSuccess:
                self.isTaskExistInFWJR(fwkJobReport, "success")
            else:
                self.isTaskExistInFWJR(fwkJobReport, "failed")

        # special check for LogCollect jobs
        skipLogCollect = False
        if jobSuccess and jobType == "LogCollect":
            for fwjrFile in fileList:
                try:
                    # this assumes there is only one file for LogCollect jobs,
                    # not sure what happens if that changes
                    self.associateLogCollectToParentJobsInWMStats(fwkJobReport,
                                                                  fwjrFile["lfn"],
                                                                  fwkJobReport.getTaskName())
                except Exception as ex:
                    skipLogCollect = True
                    logging.error("Error occurred: associating log collect location, will try again\n %s" % str(ex))
                    break

        # now handle the job (unless the special LogCollect check failed)
        if not skipLogCollect:

            wmbsJob = Job(id=jobID)
            wmbsJob.load()
            outputID = wmbsJob.loadOutputID()
            wmbsJob.getMask()

            wmbsJob["fwjr"] = fwkJobReport

            if jobSuccess:
                wmbsJob["outcome"] = "success"
            else:
                wmbsJob["outcome"] = "failure"

            for fwjrFile in fileList:

                logging.debug("Job %d , register output %s", jobID, fwjrFile["lfn"])

                wmbsFile = self.addFileToWMBS(jobType, fwjrFile, wmbsJob["mask"],
                                              jobID=jobID, task=fwkJobReport.getTaskName())
                merged = fwjrFile['merged']
                moduleLabel = fwjrFile["module_label"]

                if merged:
                    self.mergedOutputFiles.append(wmbsFile)

                self.filesetAssoc.append({"lfn": wmbsFile["lfn"], "fileset": outputID})

                # LogCollect jobs have no output fileset
                if jobType == "LogCollect":
                    pass
                # Repack jobs that wrote too large merged output skip output filesets
                elif jobType == "Repack" and merged and \
                        wmbsFile["size"] > self.maxAllowedRepackOutputSize:
                    pass
                else:
                    outputFilesets = self.outputFilesetsForJob(outputMap, merged, moduleLabel)
                    for outputFileset in outputFilesets:
                        self.filesetAssoc.append({"lfn": wmbsFile["lfn"],
                                                  "fileset": outputFileset})

            # Check if the job had any skipped files, put them in ACDC containers
            # We assume full file processing (no job masks)
            if jobSuccess:
                skippedFiles = fwkJobReport.getAllSkippedFiles()
                if skippedFiles and jobType not in ['LogCollect', 'Cleanup']:
                    self.jobsWithSkippedFiles[jobID] = skippedFiles

            # Only save once job is done, and we're sure we made it through okay
            self._mapLocation(wmbsJob['fwjr'])
            if jobSuccess:
                self.listOfJobsToSave.append(wmbsJob)
            else:
                self.listOfJobsToFail.append(wmbsJob)

        return jobSuccess

    def associateLogCollectToParentJobsInWMStats(self, fwkJobReport, logAchiveLFN, task):
        """
        _associateLogCollectToParentJobsInWMStats_

        Associate a logArchive output to its parent job
        """
        inputFileList = fwkJobReport.getAllInputFiles()
        requestName = task.split('/')[1]
        keys = []
        for inputFile in inputFileList:
            keys.append([requestName, inputFile["lfn"]])
        resultRows = self.fwjrCouchDB.loadView("FWJRDump", 'jobsByOutputLFN',
                                               options={"stale": "update_after"},
                                               keys=keys)['rows']
        if len(resultRows) > 0:
            # get data from wmbs
            parentWMBSJobIDs = []
            for row in resultRows:
                parentWMBSJobIDs.append({"jobid": row["value"]})
            # update Job doc in wmstats
            results = self.getJobInfoByID.execute(parentWMBSJobIDs)
            parentJobNames = []

            if isinstance(results, list):
                for jobInfo in results:
                    parentJobNames.append(jobInfo['name'])
            else:
                parentJobNames.append(results['name'])

            self.localWMStats.updateLogArchiveLFN(parentJobNames, logAchiveLFN)
        else:
            # TODO: if the couch db is consistent with DB this should be removed (checking resultRow > 0)
            # It needs to be failed and retried.
            logging.error("job report is missing for updating log archive mapping\n Input file list\n %s" % inputFileList)

        return

    def createMissingFWKJR(self, parameters, errorCode=999,
                           errorDescription='Failure of unknown type'):
        """
        _createMissingFWJR_

        Create a missing FWJR if the report can't be found by the code in the
        path location.
        """
        report = Report()
        report.addError("cmsRun1", 84, errorCode, errorDescription)
        report.data.cmsRun1.status = "Failed"
        return report

    def createFilesInDBSBuffer(self):
        """
        _createFilesInDBSBuffer_

        It does the actual job of creating things in DBSBuffer.
        WARNING: This assumes all files in a job have the same final location.
        """
        if len(self.dbsFilesToCreate) == 0:
            # Whoops, nothing to do!
            return

        dbsFileTuples = []
        dbsFileLoc = []
        dbsCksumBinds = []
        runLumiBinds = []
        selfChecksums = None
        jobLocations = set()

        for dbsFile in self.dbsFilesToCreate:
            # Append a tuple in the format specified by DBSBufferFiles.Add
            # Also run insertDatasetAlgo

            assocID = None
            datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % (dbsFile['datasetPath'],
                                                           dbsFile["appName"],
                                                           dbsFile["appVer"],
                                                           dbsFile["appFam"],
                                                           dbsFile["psetHash"],
                                                           dbsFile['processingVer'],
                                                           dbsFile['acquisitionEra'],
                                                           dbsFile['globalTag'])
            # First, check if this is in the cache
            if datasetAlgoPath in self.datasetAlgoPaths:
                for da in self.datasetAlgoID:
                    if da['datasetAlgoPath'] == datasetAlgoPath:
                        assocID = da['assocID']
                        break

            if not assocID:
                # Then we have to get it ourselves
                try:
                    assocID = dbsFile.insertDatasetAlgo()
                    self.datasetAlgoPaths.append(datasetAlgoPath)
                    self.datasetAlgoID.append({'datasetAlgoPath': datasetAlgoPath,
                                               'assocID': assocID})
                except WMException:
                    raise
                except Exception as ex:
                    msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath
                    msg += str(ex)
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            # Associate the workflow to the file using the taskPath and the requestName
            # TODO: debug why it happens and then drop/recover these cases automatically
            taskPath = dbsFile.get('task')
            if not taskPath:
                msg = "Can't do workflow association, report this error to a developer.\n"
                msg += "DbsFile : %s" % str(dbsFile)
                raise AccountantWorkerException(msg)
            workflowName = taskPath.split('/')[1]
            workflowPath = '%s:%s' % (workflowName, taskPath)
            if workflowPath in self.workflowPaths:
                for wf in self.workflowIDs:
                    if wf['workflowPath'] == workflowPath:
                        workflowID = wf['workflowID']
                        break
            else:
                result = self.dbsGetWorkflow.execute(workflowName, taskPath,
                                                     conn=self.getDBConn(),
                                                     transaction=self.existingTransaction())
                workflowID = result['id']

            self.workflowPaths.append(workflowPath)
            self.workflowIDs.append({'workflowPath': workflowPath,
                                     'workflowID': workflowID})

            lfn = dbsFile['lfn']
            selfChecksums = dbsFile['checksums']
            jobLocation = dbsFile.getLocations()[0]
            jobLocations.add(jobLocation)
            dbsFileTuples.append((lfn, dbsFile['size'], dbsFile['events'],
                                  assocID, dbsFile['status'], workflowID))

            dbsFileLoc.append({'lfn': lfn, 'pnn': jobLocation})
            if dbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']})

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    dbsCksumBinds.append({'lfn': lfn,
                                          'cksum': selfChecksums[entry],
                                          'cktype': entry})

        try:
            diffLocation = jobLocations.difference(self.dbsLocations)

            for jobLocation in diffLocation:
                self.dbsInsertLocation.execute(siteName=jobLocation,
                                               conn=self.getDBConn(),
                                               transaction=self.existingTransaction())
                self.dbsLocations.add(jobLocation)

            self.dbsCreateFiles.execute(files=dbsFileTuples,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.dbsSetLocation.execute(binds=dbsFileLoc,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            self.dbsSetChecksum.execute(bulkList=dbsCksumBinds,
                                        conn=self.getDBConn(),
                                        transaction=self.existingTransaction())

            if len(runLumiBinds) > 0:
                self.dbsSetRunLumi.execute(file=runLumiBinds,
                                           conn=self.getDBConn(),
                                           transaction=self.existingTransaction())
        except WMException:
            raise
        except Exception as ex:
            msg = "Got exception while inserting files into DBSBuffer!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Listing binds:")
            logging.debug("jobLocation: %s\n" % jobLocation)
            logging.debug("dbsFiles: %s\n" % dbsFileTuples)
            logging.debug("dbsFileLoc: %s\n" % dbsFileLoc)
            logging.debug("Checksum binds: %s\n" % dbsCksumBinds)
            logging.debug("RunLumi binds: %s\n" % runLumiBinds)
            raise AccountantWorkerException(msg)

        # Now that we've created those files, clear the list
        self.dbsFilesToCreate = []
        return

    def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds):
        """
        _handleWMBSFiles_

        Do what can be done in bulk, in bulk.
        """
        if len(wmbsFilesToBuild) == 0:
            # Nothing to do
            return

        runLumiBinds = []
        fileCksumBinds = []
        fileLocations = []
        fileCreate = []

        for wmbsFile in wmbsFilesToBuild:
            lfn = wmbsFile['lfn']
            if lfn == None:
                continue

            selfChecksums = wmbsFile['checksums']
            # by jobType add to different parentage relation
            # if it is the merge job, don't include the parentage on failed input files.
            # otherwise parentage is set for all input files.
            parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']})

            if wmbsFile['runs']:
                runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']})

            if len(wmbsFile.getLocations()) > 0:
                outpnn = wmbsFile.getLocations()[0]
                if self.pnn_to_psn.get(outpnn, None):
                    fileLocations.append({'lfn': lfn, 'location': outpnn})
                else:
                    msg = "PNN doesn't exist in wmbs_location_sename table: %s (investigate)" % outpnn
                    logging.error(msg)
                    raise AccountantWorkerException(msg)

            if selfChecksums:
                # If we have checksums we have to create a bind
                # For each different checksum
                for entry in selfChecksums.keys():
                    fileCksumBinds.append({'lfn': lfn,
                                           'cksum': selfChecksums[entry],
                                           'cktype': entry})

            fileCreate.append([lfn,
                               wmbsFile['size'],
                               wmbsFile['events'],
                               None,
                               wmbsFile["first_event"],
                               wmbsFile['merged']])

        if len(fileCreate) == 0:
            return

        try:
            self.addFileAction.execute(files=fileCreate,
                                       conn=self.getDBConn(),
                                       transaction=self.existingTransaction())

            if runLumiBinds:
                self.setFileRunLumi.execute(file=runLumiBinds,
                                            conn=self.getDBConn(),
                                            transaction=self.existingTransaction())

            self.setFileAddChecksum.execute(bulkList=fileCksumBinds,
                                            conn=self.getDBConn(),
                                            transaction=self.existingTransaction())

            self.setFileLocation.execute(lfn=fileLocations,
                                         location=self.fileLocation,
                                         conn=self.getDBConn(),
                                         transaction=self.existingTransaction())
        except WMException:
            raise
        except Exception as ex:
            msg = "Error while adding files to WMBS!\n"
            msg += str(ex)
            logging.error(msg)
            logging.debug("Printing binds: \n")
            logging.debug("FileCreate binds: %s\n" % fileCreate)
            logging.debug("Runlumi binds: %s\n" % runLumiBinds)
            logging.debug("Checksum binds: %s\n" % fileCksumBinds)
            logging.debug("FileLocation binds: %s\n" % fileLocations)
            raise AccountantWorkerException(msg)

        # Clear out finished files
        wmbsFilesToBuild = []
        return

    def createFileFromDataStructsFile(self, file, jobID):
        """
        _createFileFromDataStructsFile_

        This function will create a WMBS File given a DataStructs file
        """
        wmbsFile = File()
        wmbsFile.update(file)

        if isinstance(file["locations"], set):
            pnn = list(file["locations"])[0]
        elif isinstance(file["locations"], list):
            if len(file['locations']) > 1:
                logging.error("Have more than one location for a file in job %i" % (jobID))
                logging.error("Choosing location %s" % (file['locations'][0]))
            pnn = file["locations"][0]
        else:
            pnn = file["locations"]

        wmbsFile["locations"] = set()

        if pnn != None:
            wmbsFile.setLocation(pnn=pnn, immediateSave=False)
        wmbsFile['jid'] = jobID

        return wmbsFile

    def handleDBSBufferParentage(self):
        """
        _handleDBSBufferParentage_

        Handle all the DBSBuffer Parentage in bulk if you can
        """
        outputLFNs = [f['lfn'] for f in self.mergedOutputFiles]
        bindList = []
        for lfn in outputLFNs:
            newParents = self.findDBSParents(lfn=lfn)
            for parentLFN in newParents:
                bindList.append({'child': lfn, 'parent': parentLFN})

        # Now all the parents should exist
        # Commit them to DBSBuffer
        logging.info("About to commit all DBSBuffer Heritage information")
        logging.info(len(bindList))

        if len(bindList) > 0:
            try:
                self.dbsLFNHeritage.execute(binds=bindList,
                                            conn=self.getDBConn(),
                                            transaction=self.existingTransaction())
            except WMException:
                raise
            except Exception as ex:
                msg = "Error while trying to handle the DBS LFN heritage\n"
                msg += str(ex)
                msg += "BindList: %s" % bindList
                logging.error(msg)
                raise AccountantWorkerException(msg)
        return

    def handleSkippedFiles(self):
        """
        _handleSkippedFiles_

        Handle all the skipped files in bulk.  The way it handles the skipped
        files imposes an important restriction: skipped files should have been
        processed by a single job in the task and no job mask exists in it.
        This is suitable for jobs using ParentlessMergeBySize/FileBased/MinFileBased
        splitting algorithms.  Here ACDC records are created and the files are
        moved to wmbs_sub_files_failed from completed.
        """
        jobList = self.getFullJobInfo.execute([{'jobid': x} for x in self.jobsWithSkippedFiles.keys()],
                                              fileSelection=self.jobsWithSkippedFiles,
                                              conn=self.getDBConn(),
                                              transaction=self.existingTransaction())
        self.dataCollection.failedJobs(jobList, useMask=False)
        return
class ErrorHandlerPoller(BaseWorkerThread):
    """
    Polls for Error Conditions, handles them
    """

    def __init__(self, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.config = config

        myThread = threading.currentThread()

        self.daoFactory = DAOFactory(package = "WMCore.WMBS",
                                     logger = myThread.logger,
                                     dbinterface = myThread.dbi)

        self.changeState = ChangeState(self.config)

        self.maxRetries = self.config.ErrorHandler.maxRetries
        if type(self.maxRetries) != dict:
            self.maxRetries = {'default' : self.maxRetries}
        if 'default' not in self.maxRetries:
            raise ErrorHandlerException('Max retries for the default job type must be specified')

        self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250)
        self.exitCodes = getattr(self.config.ErrorHandler, 'failureExitCodes', [])
        self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600)
        self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False)
        self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', [])

        self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs")
        self.idLoad = self.daoFactory(classname = "Jobs.LoadFromIDWithType")
        self.loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler")

        self.dataCollection = DataCollectionService(url = config.ACDC.couchurl,
                                                    database = config.ACDC.database)

        # initialize the alert framework (if available - config.Alert present)
        # self.sendAlert will then be available
        self.initAlerts(compName = "ErrorHandler")
        return

    def setup(self, parameters = None):
        """
        Load DB objects required for queries
        """
        # For now, does nothing
        return

    def terminate(self, params):
        """
        _terminate_

        Do one pass, then commit suicide
        """
        logging.debug("terminating. doing one more pass before we die")
        self.algorithm(params)

    def exhaustJobs(self, jobList):
        """
        _exhaustJobs_

        Actually do the jobs exhaustion
        """
        self.changeState.propagate(jobList, 'exhausted', 'retrydone')

        # Remove all the files in the exhausted jobs.
        logging.debug("About to fail input files for exhausted jobs")
        for job in jobList:
            job.failInputFiles()

        # Do not build ACDC for utilitarian job types
        jobList = [job for job in jobList if job['type'] not in ['LogCollect', 'Cleanup']]

        self.handleACDC(jobList)
        return

    def processRetries(self, jobList, state):
        """
        _processRetries_

        Actually do the retries
        """
        logging.info("Processing retries for %d failed jobs of type %sfailed" % (len(jobList), state))
        retrydoneJobs = []
        cooloffJobs = []
        passJobs = []

        # Retries < max retry count
        for job in jobList:
            allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default'])
            # Retries < allowed max retry count
            if job['retry_count'] < allowedRetries and state != 'create':
                cooloffJobs.append(job)
            # Check if Retries >= allowed max retry count
            elif job['retry_count'] >= allowedRetries or state == 'create':
                retrydoneJobs.append(job)
                msg = "Stopping retries for job %d" % job['id']
                logging.debug(msg)
                self.sendAlert(4, msg = msg)
                logging.debug("JobInfo: %s" % job)

        if self.readFWJR:
            # Then we have to check each FWJR for exit status
            cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs)
            retrydoneJobs.extend(retrydoneFWJRJobs)

        # Now to actually do something.
        logging.debug("About to propagate jobs")
        if len(retrydoneJobs) > 0:
            self.changeState.propagate(retrydoneJobs, 'retrydone',
                                       '%sfailed' % state, updatesummary = True)
        if len(cooloffJobs) > 0:
            self.changeState.propagate(cooloffJobs, '%scooloff' % state,
                                       '%sfailed' % state, updatesummary = True)
        if len(passJobs) > 0:
            # Overwrite the transition states and move directly to created
            self.changeState.propagate(passJobs, 'created', 'new')

        return

    def handleACDC(self, jobList):
        """
        _handleACDC_

        Do the ACDC creation and hope it works
        """
        idList = [x['id'] for x in jobList]
        logging.info("Starting to build ACDC with %i jobs" % len(idList))
        logging.info("This operation will take some time...")
        loadList = self.loadJobsFromListFull(idList)
        for job in loadList:
            job.getMask()
        self.dataCollection.failedJobs(loadList)
        return

    def readFWJRForErrors(self, jobList):
        """
        _readFWJRForErrors_

        Check the FWJRs of the failed jobs and determine those that can be
        retried and which must be retried without going through cooloff.
        Returns a triplet with cooloff, passed and exhausted jobs.
        """
        cooloffJobs = []
        passJobs = []
        exhaustJobs = []
        for job in jobList:
            report = Report()
            reportPath = job['fwjr_path']
            if reportPath is None:
                logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff." % job['id'])
                cooloffJobs.append(job)
                continue
            if not os.path.isfile(reportPath):
                logging.error("Failed to find FWJR for job %i in location %s.\n Passing it to cooloff." % (job['id'], reportPath))
                cooloffJobs.append(job)
                continue
            try:
                report.load(reportPath)
                # First let's check the time conditions
                times = report.getFirstStartLastStop()
                startTime = None
                stopTime = None
                if times is not None:
                    startTime = times['startTime']
                    stopTime = times['stopTime']

                if startTime == None or stopTime == None:
                    # We have no information to make a decision, keep going.
                    logging.debug("No start, stop times for steps for job %i" % job['id'])
                elif stopTime - startTime > self.maxFailTime:
                    msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime)
                    logging.debug(msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.exitCodes]):
                    msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    logging.error(msg)
                    self.sendAlert(4, msg = msg)
                    exhaustJobs.append(job)
                    continue

                if len([x for x in report.getExitCodes() if x in self.passCodes]):
                    msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'], str(report.getExitCodes()))
                    passJobs.append(job)
                    continue

                cooloffJobs.append(job)

            except Exception as ex:
                logging.warning("Exception while trying to check jobs for failures!")
                logging.warning(str(ex))
                logging.warning("Ignoring and sending job to cooloff")
                cooloffJobs.append(job)

        return cooloffJobs, passJobs, exhaustJobs

    def handleRetryDoneJobs(self, jobList):
        """
        _handleRetryDoneJobs_

        """
        myThread = threading.currentThread()
        logging.debug("About to process %d retry done jobs" % len(jobList))
        myThread.transaction.begin()
        self.exhaustJobs(jobList)
        myThread.transaction.commit()

        return

    def handleFailedJobs(self, jobList, state):
        """
        _handleFailedJobs_

        """
        myThread = threading.currentThread()
        logging.debug("About to process %d failures" % len(jobList))
        myThread.transaction.begin()
        self.processRetries(jobList, state)
        myThread.transaction.commit()

        return

    def handleErrors(self):
        """
        Queries WMBS for failed jobs in the created, submitted and executed
        states, as well as jobs that are done retrying, and processes them
        in chunks.
        """
        # Run over created, submitted and executed job failures
        failure_states = ['create', 'submit', 'job']
        for state in failure_states:
            idList = self.getJobs.execute(state = "%sfailed" % state)
            logging.info("Found %d failed jobs in state %sfailed" % (len(idList), state))
            while len(idList) > 0:
                tmpList = idList[:self.maxProcessSize]
                idList = idList[self.maxProcessSize:]
                jobList = self.loadJobsFromList(tmpList)
                self.handleFailedJobs(jobList, state)

        # Run over jobs done with retries
        idList = self.getJobs.execute(state = 'retrydone')
        logging.info("Found %d jobs done with all retries" % len(idList))
        while len(idList) > 0:
            tmpList = idList[:self.maxProcessSize]
            idList = idList[self.maxProcessSize:]
            jobList = self.loadJobsFromList(tmpList)
            self.handleRetryDoneJobs(jobList)

        return

    def loadJobsFromList(self, idList):
        """
        _loadJobsFromList_

        Load jobs in bulk
        """
        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})
        results = self.idLoad.execute(jobID = binds)

        # You have to have a list
        if type(results) == dict:
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id = entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def loadJobsFromListFull(self, idList):
        """
        _loadJobsFromListFull_

        Load jobs in bulk.  Include the full metadata.
        """
        binds = []
        for jobID in idList:
            binds.append({"jobid": jobID})

        results = self.loadAction.execute(jobID = binds)

        # You have to have a list
        if type(results) == dict:
            results = [results]

        listOfJobs = []
        for entry in results:
            # One job per entry
            tmpJob = Job(id = entry['id'])
            tmpJob.update(entry)
            listOfJobs.append(tmpJob)

        return listOfJobs

    def algorithm(self, parameters = None):
        """
        Performs the handleErrors method, looking for each type of failure
        and dealing with it as desired.
        """
        logging.debug("Running error handling algorithm")
        myThread = threading.currentThread()
        try:
            self.handleErrors()
        except WMException as ex:
            try:
                myThread.transaction.rollback()
            except:
                pass
            raise
        except CouchConnectionError as ex:
            msg = "Caught CouchConnectionError exception in ErrorHandler\n"
            msg += "transactions postponed until the next polling cycle\n"
            msg += str(ex)
            logging.exception(msg)
        except Exception as ex:
            msg = "Caught exception in ErrorHandler\n"
            msg += str(ex)
            msg += str(traceback.format_exc())
            msg += "\n\n"
            logging.error(msg)
            self.sendAlert(6, msg = msg)
            if getattr(myThread, 'transaction', None) != None \
               and getattr(myThread.transaction, 'transaction', None) != None:
                myThread.transaction.rollback()
            raise ErrorHandlerException(msg)
occurred: associating log collect location, will try again\n %s" % str(ex)) break # now handle the job (unless the special LogCollect check failed) if not skipLogCollect: wmbsJob = Job(id = jobID) wmbsJob.load() outputID = wmbsJob.loadOutputID() wmbsJob.getMask() wmbsJob["fwjr"] = fwkJobReport if jobSuccess: wmbsJob["outcome"] = "success" else: wmbsJob["outcome"] = "failure" for fwjrFile in fileList: logging.debug("Job %d , register output %s", jobID, fwjrFile["lfn"]) wmbsFile = self.addFileToWMBS(jobType, fwjrFile, wmbsJob["mask"], jobID = jobID, task = fwkJobReport.getTaskName()) merged = fwjrFile['merged'] moduleLabel = fwjrFile["module_label"] if merged: self.mergedOutputFiles.append(wmbsFile) self.filesetAssoc.append({"lfn": wmbsFile["lfn"], "fileset": outputID}) # LogCollect jobs have no output fileset if jobType != "LogCollect": outputFilesets = self.outputFilesetsForJob(outputMap, merged, moduleLabel) for outputFileset in outputFilesets: self.filesetAssoc.append({"lfn": wmbsFile["lfn"], "fileset": outputFileset}) # Check if the job had any skipped files, put them in ACDC containers # We assume full file processing (no job masks) if jobSuccess: skippedFiles = fwkJobReport.getAllSkippedFiles() if skippedFiles: self.jobsWithSkippedFiles[jobID] = skippedFiles # Only save once job is done, and we're sure we made it through okay self._mapLocation(wmbsJob['fwjr']) if jobSuccess: self.listOfJobsToSave.append(wmbsJob) else: self.listOfJobsToFail.append(wmbsJob) return jobSuccess def associateLogCollectToParentJobsInWMStats(self, fwkJobReport, logAchiveLFN, task): """ _associateLogCollectToParentJobsInWMStats_ Associate a logArchive output to its parent job """ inputFileList = fwkJobReport.getAllInputFiles() requestName = task.split('/')[1] keys = [] for inputFile in inputFileList: keys.append([requestName, inputFile["lfn"]]) resultRows = self.fwjrCouchDB.loadView("FWJRDump", 'jobsByOutputLFN', options = {"stale": "update_after"}, keys = keys)['rows'] if len(resultRows) > 0: #get data from wmbs parentWMBSJobIDs = [] for row in resultRows: parentWMBSJobIDs.append({"jobid": row["value"]}) #update Job doc in wmstats results = self.getJobInfoByID.execute(parentWMBSJobIDs) parentJobNames = [] if isinstance(results, list): for jobInfo in results: parentJobNames.append(jobInfo['name']) else: parentJobNames.append(results['name']) self.localWMStats.updateLogArchiveLFN(parentJobNames, logAchiveLFN) else: #TODO: if the couch db is consistent with DB this should be removed (checking resultRow > 0) #It need to be failed and retried. logging.error("job report is missing for updating log archive mapping\n Input file list\n %s" % inputFileList) return def createMissingFWKJR(self, parameters, errorCode = 999, errorDescription = 'Failure of unknown type'): """ _createMissingFWJR_ Create a missing FWJR if the report can't be found by the code in the path location. """ report = Report() report.addError("cmsRun1", 84, errorCode, errorDescription) report.data.cmsRun1.status = "Failed" return report def createFilesInDBSBuffer(self): """ _createFilesInDBSBuffer_ It does the actual job of creating things in DBSBuffer WARNING: This assumes all files in a job have the same final location """ if len(self.dbsFilesToCreate) == 0: # Whoops, nothing to do! 
return dbsFileTuples = [] dbsFileLoc = [] dbsCksumBinds = [] runLumiBinds = [] selfChecksums = None jobLocations = set() for dbsFile in self.dbsFilesToCreate: # Append a tuple in the format specified by DBSBufferFiles.Add # Also run insertDatasetAlgo assocID = None datasetAlgoPath = '%s:%s:%s:%s:%s:%s:%s:%s' % (dbsFile['datasetPath'], dbsFile["appName"], dbsFile["appVer"], dbsFile["appFam"], dbsFile["psetHash"], dbsFile['processingVer'], dbsFile['acquisitionEra'], dbsFile['globalTag']) # First, check if this is in the cache if datasetAlgoPath in self.datasetAlgoPaths: for da in self.datasetAlgoID: if da['datasetAlgoPath'] == datasetAlgoPath: assocID = da['assocID'] break if not assocID: # Then we have to get it ourselves try: assocID = dbsFile.insertDatasetAlgo() self.datasetAlgoPaths.append(datasetAlgoPath) self.datasetAlgoID.append({'datasetAlgoPath': datasetAlgoPath, 'assocID': assocID}) except WMException: raise except Exception as ex: msg = "Unhandled exception while inserting datasetAlgo: %s\n" % datasetAlgoPath msg += str(ex) logging.error(msg) raise AccountantWorkerException(msg) # Associate the workflow to the file using the taskPath and the requestName # TODO: debug why it happens and then drop/recover these cases automatically taskPath = dbsFile.get('task') if not taskPath: msg = "Can't do workflow association, report this error to a developer.\n" msg += "DbsFile : %s" % str(dbsFile) raise AccountantWorkerException(msg) workflowName = taskPath.split('/')[1] workflowPath = '%s:%s' % (workflowName, taskPath) if workflowPath in self.workflowPaths: for wf in self.workflowIDs: if wf['workflowPath'] == workflowPath: workflowID = wf['workflowID'] break else: result = self.dbsGetWorkflow.execute(workflowName, taskPath, conn = self.getDBConn(), transaction = self.existingTransaction()) workflowID = result['id'] self.workflowPaths.append(workflowPath) self.workflowIDs.append({'workflowPath': workflowPath, 'workflowID': workflowID}) lfn = dbsFile['lfn'] selfChecksums = dbsFile['checksums'] jobLocation = dbsFile.getLocations()[0] jobLocations.add(jobLocation) dbsFileTuples.append((lfn, dbsFile['size'], dbsFile['events'], assocID, dbsFile['status'], workflowID)) dbsFileLoc.append({'lfn': lfn, 'sename' : jobLocation}) if dbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': dbsFile['runs']}) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): dbsCksumBinds.append({'lfn': lfn, 'cksum' : selfChecksums[entry], 'cktype' : entry}) try: diffLocation = jobLocations.difference(self.dbsLocations) for jobLocation in diffLocation: self.dbsInsertLocation.execute(siteName = jobLocation, conn = self.getDBConn(), transaction = self.existingTransaction()) self.dbsLocations.add(jobLocation) self.dbsCreateFiles.execute(files = dbsFileTuples, conn = self.getDBConn(), transaction = self.existingTransaction()) self.dbsSetLocation.execute(binds = dbsFileLoc, conn = self.getDBConn(), transaction = self.existingTransaction()) self.dbsSetChecksum.execute(bulkList = dbsCksumBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) if len(runLumiBinds) > 0: self.dbsSetRunLumi.execute(file = runLumiBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Got exception while inserting files into DBSBuffer!\n" msg += str(ex) logging.error(msg) logging.debug("Listing binds:") logging.debug("jobLocation: %s\n" % jobLocation) logging.debug("dbsFiles: 
%s\n" % dbsFileTuples) logging.debug("dbsFileLoc: %s\n" %dbsFileLoc) logging.debug("Checksum binds: %s\n" % dbsCksumBinds) logging.debug("RunLumi binds: %s\n" % runLumiBinds) raise AccountantWorkerException(msg) # Now that we've created those files, clear the list self.dbsFilesToCreate = [] return def handleWMBSFiles(self, wmbsFilesToBuild, parentageBinds): """ _handleWMBSFiles_ Do what can be done in bulk in bulk """ if len(wmbsFilesToBuild) == 0: # Nothing to do return runLumiBinds = [] fileCksumBinds = [] fileLocations = [] fileCreate = [] for wmbsFile in wmbsFilesToBuild: lfn = wmbsFile['lfn'] if lfn == None: continue selfChecksums = wmbsFile['checksums'] # by jobType add to different parentage relation # if it is the merge job, don't include the parentage on failed input files. # otherwise parentage is set for all input files. parentageBinds.append({'child': lfn, 'jobid': wmbsFile['jid']}) if wmbsFile['runs']: runLumiBinds.append({'lfn': lfn, 'runs': wmbsFile['runs']}) if len(wmbsFile.getLocations()) > 0: fileLocations.append({'lfn': lfn, 'location': wmbsFile.getLocations()[0]}) if selfChecksums: # If we have checksums we have to create a bind # For each different checksum for entry in selfChecksums.keys(): fileCksumBinds.append({'lfn': lfn, 'cksum' : selfChecksums[entry], 'cktype' : entry}) fileCreate.append([lfn, wmbsFile['size'], wmbsFile['events'], None, wmbsFile["first_event"], wmbsFile['merged']]) if len(fileCreate) == 0: return try: self.addFileAction.execute(files = fileCreate, conn = self.getDBConn(), transaction = self.existingTransaction()) if runLumiBinds: self.setFileRunLumi.execute(file = runLumiBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) self.setFileAddChecksum.execute(bulkList = fileCksumBinds, conn = self.getDBConn(), transaction = self.existingTransaction()) self.setFileLocation.execute(lfn = fileLocations, location = self.fileLocation, conn = self.getDBConn(), transaction = self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while adding files to WMBS!\n" msg += str(ex) logging.error(msg) logging.debug("Printing binds: \n") logging.debug("FileCreate binds: %s\n" % fileCreate) logging.debug("Runlumi binds: %s\n" % runLumiBinds) logging.debug("Checksum binds: %s\n" % fileCksumBinds) logging.debug("FileLocation binds: %s\n" % fileLocations) raise AccountantWorkerException(msg) # Clear out finished files wmbsFilesToBuild = [] return def createFileFromDataStructsFile(self, file, jobID): """ _createFileFromDataStructsFile_ This function will create a WMBS File given a DataStructs file """ wmbsFile = File() wmbsFile.update(file) if isinstance(file["locations"], set): pnn = list(file["locations"])[0] elif isinstance(file["locations"], list): if len(file['locations']) > 1: logging.error("Have more then one location for a file in job %i" % (jobID)) logging.error("Choosing location %s" % (file['locations'][0])) pnn = file["locations"][0] else: pnn = file["locations"] wmbsFile["locations"] = set() if pnn != None: wmbsFile.setLocation(pnn = pnn, immediateSave = False) wmbsFile['jid'] = jobID return wmbsFile def handleDBSBufferParentage(self): """ _handleDBSBufferParentage_ Handle all the DBSBuffer Parentage in bulk if you can """ outputLFNs = [f['lfn'] for f in self.mergedOutputFiles] bindList = [] for lfn in outputLFNs: newParents = self.findDBSParents(lfn = lfn) for parentLFN in newParents: bindList.append({'child': lfn, 'parent': parentLFN}) # Now all the parents should exist # Commit them to DBSBuffer 
logging.info("About to commit all DBSBuffer Heritage information") logging.info(len(bindList)) if len(bindList) > 0: try: self.dbsLFNHeritage.execute(binds = bindList, conn = self.getDBConn(), transaction = self.existingTransaction()) except WMException: raise except Exception as ex: msg = "Error while trying to handle the DBS LFN heritage\n" msg += str(ex) msg += "BindList: %s" % bindList logging.error(msg) raise AccountantWorkerException(msg) return def handleSkippedFiles(self): """ _handleSkippedFiles_ Handle all the skipped files in bulk, the way it handles the skipped files imposes an important restriction: Skipped files should have been processed by a single job in the task and no job mask exists in it. This is suitable for jobs using ParentlessMergeBySize/FileBased/MinFileBased splitting algorithms. Here ACDC records and created and the file are moved to wmbs_sub_files_failed from completed. """ jobList = self.getFullJobInfo.execute([{'jobid' : x} for x in self.jobsWithSkippedFiles.keys()], fileSelection = self.jobsWithSkippedFiles, conn = self.getDBConn(), transaction = self.existingTransaction()) self.dataCollection.failedJobs(jobList, useMask = False) return
def testChunking(self): """ _testChunking_ Insert a workload and files that have several distinct sets of locations. Verify that the chunks are created correctly and that they only groups files that have the same set of locations. Also verify that the chunks are pulled out of ACDC correctly. """ dcs = DataCollectionService(url = self.testInit.couchUrl, database = "wmcore-acdc-datacollectionsvc") def getJob(): job = Job() job["task"] = "/ACDCTest/reco" job["workflow"] = "ACDCTest" job["location"] = "cmssrm.fnal.gov" job["owner"] = "cmsdataops" job["group"] = "cmsdataops" return job testFileA = File(lfn = makeUUID(), size = 1024, events = 1024) testFileA.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileA.addRun(Run(1, 1, 2)) testFileB = File(lfn = makeUUID(), size = 1024, events = 1024) testFileB.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileB.addRun(Run(1, 3, 4)) testFileC = File(lfn = makeUUID(), size = 1024, events = 1024) testFileC.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileC.addRun(Run(1, 5, 6)) testJobA = getJob() testJobA.addFile(testFileA) testJobA.addFile(testFileB) testJobA.addFile(testFileC) testFileD = File(lfn = makeUUID(), size = 1024, events = 1024) testFileD.setLocation(["cmssrm.fnal.gov"]) testFileD.addRun(Run(2, 1, 2)) testFileE = File(lfn = makeUUID(), size = 1024, events = 1024) testFileE.setLocation(["cmssrm.fnal.gov"]) testFileE.addRun(Run(2, 3, 4)) testJobB = getJob() testJobB.addFile(testFileD) testJobB.addFile(testFileE) testFileF = File(lfn = makeUUID(), size = 1024, events = 1024, parents = set(["/some/parent/F"])) testFileF.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"]) testFileF.addRun(Run(3, 1, 2)) testFileG = File(lfn = makeUUID(), size = 1024, events = 1024, parents = set(["/some/parent/G"])) testFileG.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"] ) testFileG.addRun(Run(3, 3, 4)) testFileH = File(lfn = makeUUID(), size = 1024, events = 1024, parents = set(["/some/parent/H"])) testFileH.setLocation(["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"]) testFileH.addRun(Run(3, 5, 6)) testJobC = getJob() testJobC.addFile(testFileF) testJobC.addFile(testFileG) testJobC.addFile(testFileH) testFileI = File(lfn = makeUUID(), size = 1024, events = 1024, merged = True) testFileI.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileI.addRun(Run(4, 1, 2)) testFileJ = File(lfn = makeUUID(), size = 1024, events = 1024, merged = True) testFileJ.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"] ) testFileJ.addRun(Run(4, 3, 4)) testFileK = File(lfn = makeUUID(), size = 1024, events = 1024, merged = True) testFileK.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileK.addRun(Run(4, 5, 6)) testJobD = getJob() testJobD.addFile(testFileI) testJobD.addFile(testFileJ) testJobD.addFile(testFileK) dcs.failedJobs([testJobA, testJobB, testJobC, testJobD]) chunks = dcs.chunkFileset("ACDCTest", "/ACDCTest/reco", chunkSize = 5) self.assertEqual(len(chunks), 4, "Error: There should be four chunks: %s" % len(chunks)) goldenMetaData = {1: {"lumis": 2, "locations": ["castor.cern.ch", "cmssrm.fnal.gov"], "events": 1024}, 2: {"lumis": 4, "locations": ["cmssrm.fnal.gov"], "events": 2048}, 3: {"lumis": 6, "locations": ["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"], "events": 3072}, 5: {"lumis": 10, "locations": ["castor.cern.ch", "cmssrm.fnal.gov"], "events": 5120}} testFiles =[testFileA, testFileB, testFileC, testFileI, testFileJ, testFileK] lastFile = testFileA for testFile in testFiles: if 
lastFile["lfn"] < testFile["lfn"]: lastFile = testFile testFiles.remove(lastFile) goldenFiles = {1: [lastFile], 2: [testFileD, testFileE], 3: [testFileF, testFileG, testFileH], 5: testFiles} for chunk in chunks: chunkMetaData = dcs.getChunkInfo("ACDCTest", "/ACDCTest/reco", chunk["offset"], chunk["files"]) self.assertEqual(chunkMetaData["files"], chunk["files"], "Error: Metadata doesn't match.") self.assertEqual(chunkMetaData["lumis"], chunk["lumis"], "Error: Metadata doesn't match.") self.assertEqual(chunkMetaData["events"], chunk["events"], "Error: Metadata doesn't match.") self.assertEqual(chunkMetaData["locations"], chunk["locations"], "Error: Metadata doesn't match.") self.assertTrue(chunk["files"] in goldenMetaData.keys(), "Error: Extra chunk found.") self.assertEqual(chunk["lumis"], goldenMetaData[chunk["files"]]["lumis"], "Error: Lumis in chunk is wrong.") self.assertEqual(chunk["locations"], goldenMetaData[chunk["files"]]["locations"], "Error: Locations in chunk is wrong.") self.assertEqual(chunk["events"], goldenMetaData[chunk["files"]]["events"], "Error: Events in chunk is wrong.") del goldenMetaData[chunk["files"]] chunkFiles = dcs.getChunkFiles("ACDCTest", "/ACDCTest/reco", chunk["offset"], chunk["files"]) self.assertTrue(chunk["files"] in goldenFiles.keys(), "Error: Extra chunk found.") goldenChunkFiles = goldenFiles[chunk["files"]] self.assertEqual(len(chunkFiles), len(goldenChunkFiles)) for chunkFile in chunkFiles: foundFile = None for goldenChunkFile in goldenChunkFiles: if chunkFile["lfn"] == goldenChunkFile["lfn"]: foundFile = goldenChunkFile break self.assertTrue(foundFile != None, "Error: Missing chunk file: %s, %s" % (chunkFiles, goldenChunkFiles)) self.assertEqual(foundFile["parents"], chunkFile["parents"], "Error: File parents should match.") self.assertEqual(foundFile["merged"], chunkFile["merged"], "Error: File merged status should match.") self.assertEqual(foundFile["locations"], chunkFile["locations"], "Error: File locations should match.") self.assertEqual(foundFile["events"], chunkFile["events"], "Error: File locations should match: %s" % chunk["files"]) self.assertEqual(foundFile["size"], chunkFile["size"], "Error: File locations should match.") self.assertEqual(len(foundFile["runs"]), len(chunkFile["runs"]), "Error: Wrong number of runs.") for run in foundFile["runs"]: runMatch = False for chunkRun in chunkFile["runs"]: if chunkRun.run == run.run and chunkRun.lumis == run.lumis: runMatch = True break self.assertTrue(runMatch, "Error: Run information is wrong.") del goldenFiles[chunk["files"]] singleChunk = dcs.singleChunkFileset("ACDCTest", "/ACDCTest/reco") self.assertEqual(singleChunk, {"offset" : 0, "files" : 11, "events" : 11264, "lumis" : 22, "locations" : set(["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"])}, "Error: Single chunk metadata is wrong") return
class ErrorHandlerPoller(BaseWorkerThread): """ Polls for Error Conditions, handles them """ def __init__(self, config): """ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config myThread = threading.currentThread() self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.changeState = ChangeState(self.config) if hasattr(self.config, "Tier0Feeder"): self.reqAuxDB = None self.maxRetries = self.config.ErrorHandler.maxRetries else: self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) self.exitCodesNoRetry = [] self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250) self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600) self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False) self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', []) self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs") self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType") self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler") self.dataCollection = DataCollectionService(url=config.ACDC.couchurl, database=config.ACDC.database) self.setupComponentParam() return def setupComponentParam(self): """ Initialize (and update every cycle) some of the component's parameters, according to the agent type (T0/Production) and agent config. """ if self.reqAuxDB: agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName) self.exitCodesNoRetry = agentConfig.get("NoRetryExitCodes", []) if agentConfig.get("UserDrainMode") and agentConfig.get("SpeedDrainMode") \ and agentConfig.get("SpeedDrainConfig")['NoJobRetries']['Enabled']: logging.info("Agent is in speed drain mode. Not retrying any jobs.") self.maxRetries = 0 else: self.maxRetries = agentConfig.get("MaxRetries") if not isinstance(self.maxRetries, dict): self.maxRetries = {'default': self.maxRetries} if 'default' not in self.maxRetries: raise ErrorHandlerException('Max retries for the default job type must be specified') def setup(self, parameters=None): """ Load DB objects required for queries """ # For now, does nothing return def terminate(self, params): """ _terminate_ Do one pass, then commit suicide """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params) def exhaustJobs(self, jobList): """ _exhaustJobs_ Actually do the jobs exhaustion """ # Remove all the files in the exhausted jobs. 
logging.debug("About to fail input files for exhausted jobs") for job in jobList: job.failInputFiles() # Do not build ACDC for utilitarian job types acdcJobList = [job for job in jobList if job['type'] not in ['LogCollect', 'Cleanup']] self.handleACDC(acdcJobList) self.changeState.propagate(jobList, 'exhausted', 'retrydone') return def processRetries(self, jobList, state): """ _processRetries_ Actually do the retries """ logging.info("Processing retries for %d failed jobs of type %sfailed", len(jobList), state) retrydoneJobs = [] cooloffJobs = [] passJobs = [] if not isinstance(self.maxRetries, dict): self.maxRetries = {'default': self.maxRetries} if 'default' not in self.maxRetries: raise ErrorHandlerException('Max retries for the default job type must be specified') # Retries < max retry count for job in jobList: allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default']) # Retries < allowed max retry count if job['retry_count'] < allowedRetries and state != 'create': cooloffJobs.append(job) # Check if Retries >= allowed max retry count elif job['retry_count'] >= allowedRetries or state == 'create': retrydoneJobs.append(job) msg = "Stopping retries for job %d" % job['id'] logging.debug(msg) logging.debug("JobInfo: %s", job) if self.readFWJR: # Then we have to check each FWJR for exit status cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs) retrydoneJobs.extend(retrydoneFWJRJobs) # Now to actually do something. logging.debug("About to propagate jobs") if len(retrydoneJobs) > 0: self.changeState.propagate(retrydoneJobs, 'retrydone', '%sfailed' % state, updatesummary=True) if len(cooloffJobs) > 0: self.changeState.propagate(cooloffJobs, '%scooloff' % state, '%sfailed' % state, updatesummary=True) if len(passJobs) > 0: # Overwrite the transition states and move directly to created self.changeState.propagate(passJobs, 'created', 'new') return def handleACDC(self, jobList): """ _handleACDC_ Do the ACDC creation and hope it works """ idList = [x['id'] for x in jobList] logging.info("Starting to build ACDC with %i jobs", len(idList)) logging.info("This operation will take some time...") loadList = self.loadJobsFromListFull(idList) for job in loadList: job.getMask() self.dataCollection.failedJobs(loadList) return def readFWJRForErrors(self, jobList): """ _readFWJRForErrors_ Check the FWJRs of the failed jobs and determine those that can be retried and which must be retried without going through cooloff. Returns a triplet with cooloff, passed and exhausted jobs. """ cooloffJobs = [] passJobs = [] exhaustJobs = [] for job in jobList: report = Report() reportPath = job['fwjr_path'] if reportPath is None: logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.", job['id']) cooloffJobs.append(job) continue if not os.path.isfile(reportPath): logging.error( "Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.", job['id'], reportPath) cooloffJobs.append(job) continue try: report.load(reportPath) # First let's check the time conditions times = report.getFirstStartLastStop() startTime = None stopTime = None if times is not None: startTime = times['startTime'] stopTime = times['stopTime'] # correct the location if the original location is different from recorded in wmbs # WARNING: we are not updating job location in wmbs only updating in couchdb by doing this. # If location in wmbs needs to be updated, it should happen in JobAccountant. 
locationFromFWJR = report.getSiteName() if locationFromFWJR: job["location"] = locationFromFWJR job["site_cms_name"] = locationFromFWJR if startTime is None or stopTime is None: # We have no information to make a decision, keep going. logging.debug("No start, stop times for steps for job %i", job['id']) elif stopTime - startTime > self.maxFailTime: msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime) logging.debug(msg) exhaustJobs.append(job) continue if len([x for x in report.getExitCodes() if x in self.exitCodesNoRetry]): msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes())) logging.debug(msg) exhaustJobs.append(job) continue if len([x for x in report.getExitCodes() if x in self.passCodes]): msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'], str(report.getExitCodes())) logging.debug(msg) passJobs.append(job) continue cooloffJobs.append(job) except Exception as ex: logging.warning("Exception while trying to check jobs for failures!") logging.warning(str(ex)) logging.warning("Ignoring and sending job to cooloff") cooloffJobs.append(job) return cooloffJobs, passJobs, exhaustJobs def handleRetryDoneJobs(self, jobList): """ _handleRetryDoneJobs_ """ myThread = threading.currentThread() logging.info("About to process %d retry done jobs", len(jobList)) myThread.transaction.begin() self.exhaustJobs(jobList) myThread.transaction.commit() return def handleFailedJobs(self, jobList, state): """ _handleFailedJobs_ """ myThread = threading.currentThread() logging.info("About to process %d failures", len(jobList)) myThread.transaction.begin() self.processRetries(jobList, state) myThread.transaction.commit() return def handleErrors(self): """ Queries DB for all watched filesets, if matching filesets become available, create the subscriptions """ # Run over created, submitted and executed job failures failure_states = ['create', 'submit', 'job'] for state in failure_states: idList = self.getJobs.execute(state="%sfailed" % state) logging.info("Found %d failed jobs in state %sfailed", len(idList), state) for jobSlice in grouper(idList, self.maxProcessSize): jobList = self.loadJobsFromList(jobSlice) self.handleFailedJobs(jobList, state) # Run over jobs done with retries idList = self.getJobs.execute(state='retrydone') logging.info("Found %d jobs done with all retries", len(idList)) for jobSlice in grouper(idList, self.maxProcessSize): jobList = self.loadJobsFromList(jobSlice) self.handleRetryDoneJobs(jobList) return def loadJobsFromList(self, idList): """ _loadJobsFromList_ Load jobs in bulk """ binds = [] for jobID in idList: binds.append({"jobid": jobID}) results = self.idLoad.execute(jobID=binds) # You have to have a list if isinstance(results, dict): results = [results] listOfJobs = [] for entry in results: # One job per entry tmpJob = Job(id=entry['id']) tmpJob.update(entry) listOfJobs.append(tmpJob) return listOfJobs def loadJobsFromListFull(self, idList): """ _loadJobsFromList_ Load jobs in bulk. Include the full metadata. 
""" binds = [] for jobID in idList: binds.append({"jobid": jobID}) results = self.loadAction.execute(jobID=binds) # You have to have a list if isinstance(results, dict): results = [results] listOfJobs = [] for entry in results: # One job per entry tmpJob = Job(id=entry['id']) tmpJob.update(entry) listOfJobs.append(tmpJob) return listOfJobs @timeFunction def algorithm(self, parameters=None): """ Performs the handleErrors method, looking for each type of failure And deal with it as desired. """ logging.debug("Running error handling algorithm") self.setupComponentParam() try: myThread = threading.currentThread() self.handleErrors() except (CouchConnectionError, HTTPException) as ex: if getattr(myThread, 'transaction', None) is not None: myThread.transaction.rollback() msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler. " msg += "Transactions postponed until the next polling cycle\n" msg += str(ex) logging.error(msg) except Exception as ex: if getattr(myThread, 'transaction', None) is not None: myThread.transaction.rollback() msg = "Caught unexpected exception in ErrorHandler:\n" msg += str(ex) logging.exception(msg) raise ErrorHandlerException(msg)
def testChunking(self): """ _testChunking_ Insert a workload and files that have several distinct sets of locations. Verify that the chunks are created correctly and that they only groups files that have the same set of locations. Also verify that the chunks are pulled out of ACDC correctly. """ dcs = DataCollectionService(url=self.testInit.couchUrl, database="wmcore-acdc-datacollectionsvc") testFileA = File(lfn=makeUUID(), size=1024, events=1024) testFileA.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileA.addRun(Run(1, 1, 2)) testFileB = File(lfn=makeUUID(), size=1024, events=1024) testFileB.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileB.addRun(Run(1, 3, 4)) testFileC = File(lfn=makeUUID(), size=1024, events=1024) testFileC.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileC.addRun(Run(1, 5, 6)) testJobA = self.getMinimalJob() testJobA.addFile(testFileA) testJobA.addFile(testFileB) testJobA.addFile(testFileC) testFileD = File(lfn=makeUUID(), size=1024, events=1024) testFileD.setLocation(["cmssrm.fnal.gov"]) testFileD.addRun(Run(2, 1, 2)) testFileE = File(lfn=makeUUID(), size=1024, events=1024) testFileE.setLocation(["cmssrm.fnal.gov"]) testFileE.addRun(Run(2, 3, 4)) testJobB = self.getMinimalJob() testJobB.addFile(testFileD) testJobB.addFile(testFileE) testFileF = File(lfn=makeUUID(), size=1024, events=1024, parents={"/some/parent/F"}) testFileF.setLocation( ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"]) testFileF.addRun(Run(3, 1, 2)) testFileG = File(lfn=makeUUID(), size=1024, events=1024, parents={"/some/parent/G"}) testFileG.setLocation( ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"]) testFileG.addRun(Run(3, 3, 4)) testFileH = File(lfn=makeUUID(), size=1024, events=1024, parents={"/some/parent/H"}) testFileH.setLocation( ["cmssrm.fnal.gov", "castor.cern.ch", "srm.ral.uk"]) testFileH.addRun(Run(3, 5, 6)) testJobC = self.getMinimalJob() testJobC.addFile(testFileF) testJobC.addFile(testFileG) testJobC.addFile(testFileH) testFileI = File(lfn=makeUUID(), size=1024, events=1024, merged=True) testFileI.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileI.addRun(Run(4, 1, 2)) testFileJ = File(lfn=makeUUID(), size=1024, events=1024, merged=True) testFileJ.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileJ.addRun(Run(4, 3, 4)) testFileK = File(lfn=makeUUID(), size=1024, events=1024, merged=True) testFileK.setLocation(["cmssrm.fnal.gov", "castor.cern.ch"]) testFileK.addRun(Run(4, 5, 6)) testJobD = self.getMinimalJob() testJobD.addFile(testFileI) testJobD.addFile(testFileJ) testJobD.addFile(testFileK) dcs.failedJobs([testJobA, testJobB, testJobC, testJobD]) chunks = dcs.chunkFileset("ACDCTest", "/ACDCTest/reco", chunkSize=5) self.assertEqual( len(chunks), 4, "Error: There should be four chunks: %s" % len(chunks)) goldenMetaData = { 1: { "lumis": 2, "locations": ["castor.cern.ch", "cmssrm.fnal.gov"], "events": 1024 }, 2: { "lumis": 4, "locations": ["cmssrm.fnal.gov"], "events": 2048 }, 3: { "lumis": 6, "locations": ["castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"], "events": 3072 }, 5: { "lumis": 10, "locations": ["castor.cern.ch", "cmssrm.fnal.gov"], "events": 5120 } } testFiles = [ testFileA, testFileB, testFileC, testFileI, testFileJ, testFileK ] lastFile = testFileA for testFile in testFiles: if lastFile["lfn"] < testFile["lfn"]: lastFile = testFile testFiles.remove(lastFile) goldenFiles = { 1: [lastFile], 2: [testFileD, testFileE], 3: [testFileF, testFileG, testFileH], 5: testFiles } for chunk in chunks: chunkMetaData = 
dcs.getChunkInfo("ACDCTest", "/ACDCTest/reco", chunk["offset"], chunk["files"]) self.assertEqual(chunkMetaData["files"], chunk["files"]) self.assertEqual(chunkMetaData["lumis"], chunk["lumis"]) self.assertEqual(chunkMetaData["events"], chunk["events"]) self.assertEqual(chunkMetaData["locations"], chunk["locations"]) self.assertTrue(chunk["files"] in goldenMetaData.keys(), "Error: Extra chunk found.") self.assertEqual(chunk["lumis"], goldenMetaData[chunk["files"]]["lumis"], "Error: Lumis in chunk is wrong.") self.assertEqual(chunk["locations"], goldenMetaData[chunk["files"]]["locations"], "Error: Locations in chunk is wrong.") self.assertEqual(chunk["events"], goldenMetaData[chunk["files"]]["events"], "Error: Events in chunk is wrong.") del goldenMetaData[chunk["files"]] chunkFiles = dcs.getChunkFiles("ACDCTest", "/ACDCTest/reco", chunk["offset"], chunk["files"]) self.assertTrue(chunk["files"] in goldenFiles.keys(), "Error: Extra chunk found.") goldenChunkFiles = goldenFiles[chunk["files"]] self.assertEqual(len(chunkFiles), len(goldenChunkFiles)) for chunkFile in chunkFiles: foundFile = None for goldenChunkFile in goldenChunkFiles: if chunkFile["lfn"] == goldenChunkFile["lfn"]: foundFile = goldenChunkFile break self.assertIsNotNone( foundFile, "Error: Missing chunk file: %s, %s" % (chunkFiles, goldenChunkFiles)) self.assertEqual(set(foundFile["parents"]), chunkFile["parents"], "Error: File parents should match.") self.assertEqual(foundFile["merged"], chunkFile["merged"], "Error: File merged status should match.") self.assertEqual(foundFile["locations"], chunkFile["locations"], "Error: File locations should match.") self.assertEqual(foundFile["events"], chunkFile["events"]) self.assertEqual(foundFile["size"], chunkFile["size"]) self.assertEqual(len(foundFile["runs"]), len(chunkFile["runs"]), "Error: Wrong number of runs.") for run in foundFile["runs"]: runMatch = False for chunkRun in chunkFile["runs"]: if chunkRun.run == run.run and chunkRun.lumis == run.lumis: runMatch = True break self.assertTrue(runMatch, "Error: Run information is wrong.") del goldenFiles[chunk["files"]] singleChunk = dcs.singleChunkFileset("ACDCTest", "/ACDCTest/reco") self.assertEqual( singleChunk, { "offset": 0, "files": 11, "events": 11264, "lumis": 22, "locations": {"castor.cern.ch", "cmssrm.fnal.gov", "srm.ral.uk"} }, "Error: Single chunk metadata is wrong") return
def testGetLumiWhitelist(self): """ _testGetLumiWhitelist_ Verify that the ACDC whitelist generation code works correctly. We'll add jobs with the following lumi info: # Run 1, lumis [1, 2, 3], [4, 6], [7], [9], [11, 12] # Run 2, lumis [5, 6, 7], [10, 11, 12], [15] # Run 3, lumis [20] And should get out a whitelist that looks like this: {"1": [[1, 4], [6, 7], [9, 9], [11, 12]], "2": [[5, 7], [10, 12], [15, 15]], "3": [[20, 20]]} """ dcs = DataCollectionService(url=self.testInit.couchUrl, database="wmcore-acdc-datacollectionsvc") testFileA = File(lfn=makeUUID(), size=1024, events=1024) testFileA.addRun(Run(1, 1, 2)) testFileB = File(lfn=makeUUID(), size=1024, events=1024) testFileB.addRun(Run(1, 3)) testJobA = self.getMinimalJob() testJobA.addFile(testFileA) testJobA.addFile(testFileB) testFileC = File(lfn=makeUUID(), size=1024, events=1024) testFileC.addRun(Run(1, 4, 6)) testJobB = self.getMinimalJob() testJobB.addFile(testFileC) testFileD = File(lfn=makeUUID(), size=1024, events=1024) testFileD.addRun(Run(1, 7)) testJobC = self.getMinimalJob() testJobC.addFile(testFileD) testFileE = File(lfn=makeUUID(), size=1024, events=1024) testFileE.addRun(Run(1, 11, 12)) testJobD = self.getMinimalJob() testJobD.addFile(testFileE) testFileF = File(lfn=makeUUID(), size=1024, events=1024) testFileF.addRun(Run(2, 5, 6, 7)) testJobE = self.getMinimalJob() testJobE.addFile(testFileF) testFileG = File(lfn=makeUUID(), size=1024, events=1024) testFileG.addRun(Run(2, 10, 11, 12)) testJobF = self.getMinimalJob() testJobF.addFile(testFileG) testFileH = File(lfn=makeUUID(), size=1024, events=1024) testFileH.addRun(Run(2, 15)) testJobG = self.getMinimalJob() testJobG.addFile(testFileH) testFileI = File(lfn=makeUUID(), size=1024, events=1024) testFileI.addRun(Run(3, 20)) testJobH = self.getMinimalJob() testJobH.addFile(testFileI) testFileJ = File(lfn=makeUUID(), size=1024, events=1024) testFileJ.addRun(Run(1, 9)) testJobI = self.getMinimalJob() testJobI.addFile(testFileJ) dcs.failedJobs([ testJobA, testJobB, testJobC, testJobD, testJobE, testJobF, testJobG, testJobH, testJobI ]) whiteList = dcs.getLumiWhitelist("ACDCTest", "/ACDCTest/reco") self.assertEqual(len(whiteList.keys()), 3, "Error: There should be 3 runs.") self.assertEqual(whiteList["1"], [[1, 4], [6, 7], [9, 9], [11, 12]], "Error: Whitelist for run 1 is wrong.") self.assertEqual(whiteList["2"], [[5, 7], [10, 12], [15, 15]], "Error: Whitelist for run 2 is wrong.") self.assertEqual(whiteList["3"], [[20, 20]], "Error: Whitelist for run 3 is wrong.") correctLumiList = LumiList( compactList={ "1": [[1, 4], [6, 7], [9, 9], [11, 12]], "2": [[5, 7], [10, 12], [15, 15]], "3": [[20, 20]] }) testLumiList = dcs.getLumilistWhitelist("ACDCTest", "/ACDCTest/reco") self.assertEqual(correctLumiList.getCMSSWString(), testLumiList.getCMSSWString()) return
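# Small stand-alone illustration (a reimplementation for clarity, not the
# service's code) of the range compaction that the whitelist assertions above
# rely on: a set of lumi numbers is collapsed into the inclusive [first, last]
# pairs returned by getLumiWhitelist().
def compactLumis(lumis):
    ranges = []
    for lumi in sorted(set(lumis)):
        if ranges and lumi == ranges[-1][1] + 1:
            ranges[-1][1] = lumi          # extend the current contiguous range
        else:
            ranges.append([lumi, lumi])   # start a new range
    return ranges

# Run 1 in the test covers lumis 1, 2, 3, 4, 6, 7, 9, 11 and 12:
assert compactLumis([1, 2, 3, 4, 6, 7, 9, 11, 12]) == [[1, 4], [6, 7], [9, 9], [11, 12]]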
class ErrorHandlerPoller(BaseWorkerThread): """ Polls for Error Conditions, handles them """ def __init__(self, config): """ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config myThread = threading.currentThread() self.daoFactory = DAOFactory(package = "WMCore.WMBS", logger = myThread.logger, dbinterface = myThread.dbi) self.changeState = ChangeState(self.config) self.maxRetries = self.config.ErrorHandler.maxRetries if type(self.maxRetries) != dict: self.maxRetries = {'default' : self.maxRetries} if 'default' not in self.maxRetries: raise ErrorHandlerException('Max retries for the default job type must be specified') self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250) self.exitCodes = getattr(self.config.ErrorHandler, 'failureExitCodes', []) self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600) self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False) self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', []) self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs") self.idLoad = self.daoFactory(classname = "Jobs.LoadFromIDWithType") self.loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler") self.dataCollection = DataCollectionService(url = config.ACDC.couchurl, database = config.ACDC.database) # initialize the alert framework (if available - config.Alert present) # self.sendAlert will be then be available self.initAlerts(compName = "ErrorHandler") # Some exit codes imply an immediate failure, non-configurable self.exitCodes.extend(WMJobPermanentSystemErrors) return def setup(self, parameters = None): """ Load DB objects required for queries """ # For now, does nothing return def terminate(self, params): """ _terminate_ Do one pass, then commit suicide """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params) def exhaustJobs(self, jobList): """ _exhaustJobs_ Actually do the jobs exhaustion """ self.changeState.propagate(jobList, 'exhausted', 'retrydone') # Remove all the files in the exhausted jobs. logging.debug("About to fail input files for exhausted jobs") for job in jobList: job.failInputFiles() self.handleACDC(jobList) return def processRetries(self, jobList, state): """ _processRetries_ Actually do the retries """ logging.info("Processing retries for %d failed jobs of type %sfailed" % (len(jobList), state)) retrydoneJobs = [] cooloffJobs = [] passJobs = [] # Retries < max retry count for job in jobList: allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default']) # Retries < allowed max retry count if job['retry_count'] < allowedRetries and state != 'create': cooloffJobs.append(job) # Check if Retries >= allowed max retry count elif job['retry_count'] >= allowedRetries or state == 'create': retrydoneJobs.append(job) msg = "Stopping retries for job %d" % job['id'] logging.debug(msg) self.sendAlert(4, msg = msg) logging.debug("JobInfo: %s" % job) if self.readFWJR: # Then we have to check each FWJR for exit status cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors(cooloffJobs) retrydoneJobs.extend(retrydoneFWJRJobs) # Now to actually do something. 
logging.debug("About to propagate jobs") if len(retrydoneJobs) > 0: self.changeState.propagate(retrydoneJobs, 'retrydone', '%sfailed' % state, updatesummary = True) if len(cooloffJobs) > 0: self.changeState.propagate(cooloffJobs, '%scooloff' % state, '%sfailed' % state, updatesummary = True) if len(passJobs) > 0: # Overwrite the transition states and move directly to created self.changeState.propagate(passJobs, 'created', 'new') return def handleACDC(self, jobList): """ _handleACDC_ Do the ACDC creation and hope it works """ idList = [x['id'] for x in jobList] logging.info("Starting to build ACDC with %i jobs" % len(idList)) logging.info("This operation will take some time...") loadList = self.loadJobsFromListFull(idList) for job in loadList: job.getMask() self.dataCollection.failedJobs(loadList) return def readFWJRForErrors(self, jobList): """ _readFWJRForErrors_ Check the FWJRs of the failed jobs and determine those that can be retried and which must be retried without going through cooloff. Returns a triplet with cooloff, passed and exhausted jobs. """ cooloffJobs = [] passJobs = [] exhaustJobs = [] for job in jobList: report = Report() reportPath = job['fwjr_path'] if reportPath is None: logging.error("No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff." % job['id']) cooloffJobs.append(job) continue if not os.path.isfile(reportPath): logging.error("Failed to find FWJR for job %i in location %s.\n Passing it to cooloff." % (job['id'], reportPath)) cooloffJobs.append(job) continue try: report.load(reportPath) # First let's check the time conditions times = report.getFirstStartLastStop() startTime = None stopTime = None if times is not None: startTime = times['startTime'] stopTime = times['stopTime'] if startTime == None or stopTime == None: # We have no information to make a decision, keep going. logging.debug("No start, stop times for steps for job %i" % job['id']) elif stopTime - startTime > self.maxFailTime: msg = "Job %i exhausted after running on node for %i seconds" % (job['id'], stopTime - startTime) logging.debug(msg) exhaustJobs.append(job) continue if len([x for x in report.getExitCodes() if x in self.exitCodes]): msg = "Job %i exhausted due to a bad exit code (%s)" % (job['id'], str(report.getExitCodes())) logging.error(msg) self.sendAlert(4, msg = msg) exhaustJobs.append(job) continue if len([x for x in report.getExitCodes() if x in self.passCodes]): msg = "Job %i restarted immediately due to an exit code (%s)" % (job['id'], str(report.getExitCodes())) passJobs.append(job) continue cooloffJobs.append(job) except Exception, ex: logging.warning("Exception while trying to check jobs for failures!") logging.warning(str(ex)) logging.warning("Ignoring and sending job to cooloff") cooloffJobs.append(job) return cooloffJobs, passJobs, exhaustJobs
class ErrorHandlerPoller(BaseWorkerThread): """ Polls for Error Conditions, handles them """ def __init__(self, config): """ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config myThread = threading.currentThread() self.daoFactory = DAOFactory(package = "WMCore.WMBS", logger = myThread.logger, dbinterface = myThread.dbi) self.changeState = ChangeState(self.config) self.maxRetries = self.config.ErrorHandler.maxRetries self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250) self.getJobs = self.daoFactory(classname = "Jobs.GetAllJobs") self.dataCollection = DataCollectionService(url = config.ACDC.couchurl, database = config.ACDC.database) # initialize the alert framework (if available - config.Alert present) # self.sendAlert will be then be available self.initAlerts(compName = "ErrorHandler") return def setup(self, parameters): """ Load DB objects required for queries """ # For now, does nothing return def terminate(self, params): """ _terminate_ Do one pass, then commit suicide """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params) def processRetries(self, jobs, jobType): """ Actually do the retries """ logging.info("Processing retries for %i failed jobs of type %s." % (len(jobs), jobType)) exhaustJobs = [] cooloffJobs = [] # Retries < max retry count for ajob in jobs: # Retries < max retry count if ajob['retry_count'] < self.maxRetries: cooloffJobs.append(ajob) # Check if Retries >= max retry count elif ajob['retry_count'] >= self.maxRetries: exhaustJobs.append(ajob) msg = "Exhausting job %i" % ajob['id'] logging.error(msg) self.sendAlert(6, msg = msg) logging.debug("JobInfo: %s" % ajob) else: logging.debug("Job %i had %s retries remaining" \ % (ajob['id'], str(ajob['retry_count']))) #Now to actually do something. logging.debug("About to propagate jobs") self.changeState.propagate(exhaustJobs, 'exhausted', \ '%sfailed' %(jobType)) self.changeState.propagate(cooloffJobs, '%scooloff' %(jobType), \ '%sfailed' %(jobType)) # Remove all the files in the exhausted jobs. 
logging.debug("About to fail input files for exhausted jobs") for job in exhaustJobs: job.failInputFiles() return exhaustJobs def handleACDC(self, jobList): """ _handleACDC_ Do the ACDC creation and hope it works """ logging.debug("Entering ACDC with %i jobs" % len(jobList)) for job in jobList: job.getMask() self.dataCollection.failedJobs(jobList) return def splitJobList(self, jobList, jobType): """ _splitJobList_ Split up list of jobs into more manageable chunks if necessary """ if len(jobList) < 1: # Nothing to do return myThread = threading.currentThread() while len(jobList) > 0: # Loop over the list and handle it one chunk at a time tmpList = jobList[:self.maxProcessSize] jobList = jobList[self.maxProcessSize:] logging.debug("About to process %i errors" % len(tmpList)) myThread.transaction.begin() exhaustList = self.processRetries(tmpList, jobType) self.handleACDC(jobList = exhaustList) myThread.transaction.commit() return def handleErrors(self): """ Queries DB for all watched filesets, if matching filesets become available, create the subscriptions """ createList = [] submitList = [] jobList = [] # Run over created jobs idList = self.getJobs.execute(state = 'CreateFailed') logging.info("Found %s failed jobs failed during creation" \ % len(idList)) if len(idList) > 0: createList = self.loadJobsFromList(idList = idList) # Run over submitted jobs idList = self.getJobs.execute(state = 'SubmitFailed') logging.info("Found %s failed jobs failed during submit" \ % len(idList)) if len(idList) > 0: submitList = self.loadJobsFromList(idList = idList) # Run over executed jobs idList = self.getJobs.execute(state = 'JobFailed') logging.info("Found %s failed jobs failed during execution" \ % len(idList)) if len(idList) > 0: jobList = self.loadJobsFromList(idList = idList) self.splitJobList(jobList = createList, jobType = 'create') self.splitJobList(jobList = submitList, jobType = 'submit') self.splitJobList(jobList = jobList, jobType = 'job') return def loadJobsFromList(self, idList): """ _loadJobsFromList_ Load jobs in bulk """ loadAction = self.daoFactory(classname = "Jobs.LoadForErrorHandler") binds = [] for jobID in idList: binds.append({"jobid": jobID}) results = loadAction.execute(jobID = binds) # You have to have a list if type(results) == dict: results = [results] listOfJobs = [] for entry in results: # One job per entry tmpJob = Job(id = entry['id']) tmpJob.update(entry) listOfJobs.append(tmpJob) return listOfJobs def algorithm(self, parameters = None): """ Performs the handleErrors method, looking for each type of failure And deal with it as desired. """ logging.debug("Running error handling algorithm") myThread = threading.currentThread() try: self.handleErrors() except WMException, ex: try: myThread.transaction.rollback() except: pass raise except Exception, ex: msg = "Caught exception in ErrorHandler\n" msg += str(ex) msg += str(traceback.format_exc()) msg += "\n\n" logging.error(msg) self.sendAlert(6, msg = msg) if getattr(myThread, 'transaction', None) != None \ and getattr(myThread.transaction, 'transaction', None) != None: myThread.transaction.rollback() raise ErrorHandlerException(msg)
class ErrorHandlerPoller(BaseWorkerThread): """ Polls for Error Conditions, handles them """ def __init__(self, config): """ Initialise class members """ BaseWorkerThread.__init__(self) self.config = config myThread = threading.currentThread() self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=myThread.logger, dbinterface=myThread.dbi) self.changeState = ChangeState(self.config) self.maxRetries = self.config.ErrorHandler.maxRetries if not isinstance(self.maxRetries, dict): self.maxRetries = {'default': self.maxRetries} if 'default' not in self.maxRetries: raise ErrorHandlerException( 'Max retries for the default job type must be specified') self.exitCodesNoRetry = [] self.maxProcessSize = getattr(self.config.ErrorHandler, 'maxProcessSize', 250) self.maxFailTime = getattr(self.config.ErrorHandler, 'maxFailTime', 32 * 3600) self.readFWJR = getattr(self.config.ErrorHandler, 'readFWJR', False) self.passCodes = getattr(self.config.ErrorHandler, 'passExitCodes', []) self.getJobs = self.daoFactory(classname="Jobs.GetAllJobs") self.idLoad = self.daoFactory(classname="Jobs.LoadFromIDWithType") self.loadAction = self.daoFactory(classname="Jobs.LoadForErrorHandler") self.dataCollection = DataCollectionService( url=config.ACDC.couchurl, database=config.ACDC.database) if hasattr(self.config, "Tier0Feeder"): self.reqAuxDB = None else: self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL) return def setup(self, parameters=None): """ Load DB objects required for queries """ # For now, does nothing return def terminate(self, params): """ _terminate_ Do one pass, then commit suicide """ logging.debug("terminating. doing one more pass before we die") self.algorithm(params) def exhaustJobs(self, jobList): """ _exhaustJobs_ Actually do the jobs exhaustion """ self.changeState.propagate(jobList, 'exhausted', 'retrydone') # Remove all the files in the exhausted jobs. logging.debug("About to fail input files for exhausted jobs") for job in jobList: job.failInputFiles() # Do not build ACDC for utilitarian job types jobList = [ job for job in jobList if job['type'] not in ['LogCollect', 'Cleanup'] ] self.handleACDC(jobList) return def processRetries(self, jobList, state): """ _processRetries_ Actually do the retries """ logging.info("Processing retries for %d failed jobs of type %sfailed", len(jobList), state) retrydoneJobs = [] cooloffJobs = [] passJobs = [] # Retries < max retry count for job in jobList: allowedRetries = self.maxRetries.get(job['type'], self.maxRetries['default']) # Retries < allowed max retry count if job['retry_count'] < allowedRetries and state != 'create': cooloffJobs.append(job) # Check if Retries >= allowed max retry count elif job['retry_count'] >= allowedRetries or state == 'create': retrydoneJobs.append(job) msg = "Stopping retries for job %d" % job['id'] logging.debug(msg) logging.debug("JobInfo: %s", job) if self.readFWJR: # Then we have to check each FWJR for exit status cooloffJobs, passJobs, retrydoneFWJRJobs = self.readFWJRForErrors( cooloffJobs) retrydoneJobs.extend(retrydoneFWJRJobs) # Now to actually do something. 
logging.debug("About to propagate jobs") if len(retrydoneJobs) > 0: self.changeState.propagate(retrydoneJobs, 'retrydone', '%sfailed' % state, updatesummary=True) if len(cooloffJobs) > 0: self.changeState.propagate(cooloffJobs, '%scooloff' % state, '%sfailed' % state, updatesummary=True) if len(passJobs) > 0: # Overwrite the transition states and move directly to created self.changeState.propagate(passJobs, 'created', 'new') return def handleACDC(self, jobList): """ _handleACDC_ Do the ACDC creation and hope it works """ idList = [x['id'] for x in jobList] logging.info("Starting to build ACDC with %i jobs", len(idList)) logging.info("This operation will take some time...") loadList = self.loadJobsFromListFull(idList) for job in loadList: job.getMask() self.dataCollection.failedJobs(loadList) return def readFWJRForErrors(self, jobList): """ _readFWJRForErrors_ Check the FWJRs of the failed jobs and determine those that can be retried and which must be retried without going through cooloff. Returns a triplet with cooloff, passed and exhausted jobs. """ cooloffJobs = [] passJobs = [] exhaustJobs = [] if self.reqAuxDB: self.exitCodesNoRetry = self.reqAuxDB.getWMAgentConfig( self.config.Agent.hostName).get("NoRetryExitCodes", []) for job in jobList: report = Report() reportPath = job['fwjr_path'] if reportPath is None: logging.error( "No FWJR in job %i, ErrorHandler can't process it.\n Passing it to cooloff.", job['id']) cooloffJobs.append(job) continue if not os.path.isfile(reportPath): logging.error( "Failed to find FWJR for job %i in location %s.\n Passing it to cooloff.", job['id'], reportPath) cooloffJobs.append(job) continue try: report.load(reportPath) # First let's check the time conditions times = report.getFirstStartLastStop() startTime = None stopTime = None if times is not None: startTime = times['startTime'] stopTime = times['stopTime'] # correct the location if the original location is different from recorded in wmbs # WARNING: we are not updating job location in wmbs only updating in couchdb by doing this. # If location in wmbs needs to be updated, it should happen in JobAccountant. locationFromFWJR = report.getSiteName() if locationFromFWJR: job["location"] = locationFromFWJR job["site_cms_name"] = locationFromFWJR if startTime is None or stopTime is None: # We have no information to make a decision, keep going. 
logging.debug("No start, stop times for steps for job %i", job['id']) elif stopTime - startTime > self.maxFailTime: msg = "Job %i exhausted after running on node for %i seconds" % ( job['id'], stopTime - startTime) logging.debug(msg) exhaustJobs.append(job) continue if len([ x for x in report.getExitCodes() if x in self.exitCodesNoRetry ]): msg = "Job %i exhausted due to a bad exit code (%s)" % ( job['id'], str(report.getExitCodes())) logging.error(msg) exhaustJobs.append(job) continue if len( [x for x in report.getExitCodes() if x in self.passCodes]): msg = "Job %i restarted immediately due to an exit code (%s)" % ( job['id'], str(report.getExitCodes())) logging.debug(msg) passJobs.append(job) continue cooloffJobs.append(job) except Exception as ex: logging.warning( "Exception while trying to check jobs for failures!") logging.warning(str(ex)) logging.warning("Ignoring and sending job to cooloff") cooloffJobs.append(job) return cooloffJobs, passJobs, exhaustJobs def handleRetryDoneJobs(self, jobList): """ _handleRetryDoneJobs_ """ myThread = threading.currentThread() logging.info("About to process %d retry done jobs", len(jobList)) myThread.transaction.begin() self.exhaustJobs(jobList) myThread.transaction.commit() return def handleFailedJobs(self, jobList, state): """ _handleFailedJobs_ """ myThread = threading.currentThread() logging.info("About to process %d failures", len(jobList)) myThread.transaction.begin() self.processRetries(jobList, state) myThread.transaction.commit() return def handleErrors(self): """ Queries DB for all watched filesets, if matching filesets become available, create the subscriptions """ # Run over created, submitted and executed job failures failure_states = ['create', 'submit', 'job'] for state in failure_states: idList = self.getJobs.execute(state="%sfailed" % state) logging.info("Found %d failed jobs in state %sfailed", len(idList), state) while len(idList) > 0: tmpList = idList[:self.maxProcessSize] idList = idList[self.maxProcessSize:] jobList = self.loadJobsFromList(tmpList) self.handleFailedJobs(jobList, state) # Run over jobs done with retries idList = self.getJobs.execute(state='retrydone') logging.info("Found %d jobs done with all retries", len(idList)) while len(idList) > 0: tmpList = idList[:self.maxProcessSize] idList = idList[self.maxProcessSize:] jobList = self.loadJobsFromList(tmpList) self.handleRetryDoneJobs(jobList) return def loadJobsFromList(self, idList): """ _loadJobsFromList_ Load jobs in bulk """ binds = [] for jobID in idList: binds.append({"jobid": jobID}) results = self.idLoad.execute(jobID=binds) # You have to have a list if isinstance(results, dict): results = [results] listOfJobs = [] for entry in results: # One job per entry tmpJob = Job(id=entry['id']) tmpJob.update(entry) listOfJobs.append(tmpJob) return listOfJobs def loadJobsFromListFull(self, idList): """ _loadJobsFromList_ Load jobs in bulk. Include the full metadata. """ binds = [] for jobID in idList: binds.append({"jobid": jobID}) results = self.loadAction.execute(jobID=binds) # You have to have a list if isinstance(results, dict): results = [results] listOfJobs = [] for entry in results: # One job per entry tmpJob = Job(id=entry['id']) tmpJob.update(entry) listOfJobs.append(tmpJob) return listOfJobs @timeFunction def algorithm(self, parameters=None): """ Performs the handleErrors method, looking for each type of failure And deal with it as desired. 
""" logging.debug("Running error handling algorithm") try: myThread = threading.currentThread() self.handleErrors() except (CouchConnectionError, HTTPException) as ex: if getattr(myThread, 'transaction', None) is not None: myThread.transaction.rollback() msg = "Caught CouchConnectionError/HTTPException exception in ErrorHandler. " msg += "Transactions postponed until the next polling cycle\n" msg += str(ex) logging.error(msg) except Exception as ex: if getattr(myThread, 'transaction', None) is not None: myThread.transaction.rollback() msg = "Caught unexpected exception in ErrorHandler:\n" msg += str(ex) logging.exception(msg) raise ErrorHandlerException(msg)
    def testGetLumiWhitelist(self):
        """
        _testGetLumiWhitelist_

        Verify that the ACDC whitelist generation code works correctly.
        We'll add jobs with the following lumi info:
          # Run 1, lumis [1, 2, 3], [4, 6], [7], [9], [11, 12]
          # Run 2, lumis [5, 6, 7], [10, 11, 12], [15]
          # Run 3, lumis [20]

        And should get out a whitelist that looks like this:
          {"1": [[1, 4], [6, 7], [9, 9], [11, 12]],
           "2": [[5, 7], [10, 12], [15, 15]],
           "3": [[20, 20]]}
        """
        dcs = DataCollectionService(url=self.testInit.couchUrl,
                                    database="wmcore-acdc-datacollectionsvc")

        def getJob():
            job = Job()
            job["task"] = "/ACDCTest/reco"
            job["workflow"] = "ACDCTest"
            job["location"] = "cmssrm.fnal.gov"
            job["owner"] = "cmsdataops"
            job["group"] = "cmsdataops"
            return job

        testFileA = File(lfn=makeUUID(), size=1024, events=1024)
        testFileA.addRun(Run(1, 1, 2))
        testFileB = File(lfn=makeUUID(), size=1024, events=1024)
        testFileB.addRun(Run(1, 3))
        testJobA = getJob()
        testJobA.addFile(testFileA)
        testJobA.addFile(testFileB)

        testFileC = File(lfn=makeUUID(), size=1024, events=1024)
        testFileC.addRun(Run(1, 4, 6))
        testJobB = getJob()
        testJobB.addFile(testFileC)

        testFileD = File(lfn=makeUUID(), size=1024, events=1024)
        testFileD.addRun(Run(1, 7))
        testJobC = getJob()
        testJobC.addFile(testFileD)

        testFileE = File(lfn=makeUUID(), size=1024, events=1024)
        testFileE.addRun(Run(1, 11, 12))
        testJobD = getJob()
        testJobD.addFile(testFileE)

        testFileF = File(lfn=makeUUID(), size=1024, events=1024)
        testFileF.addRun(Run(2, 5, 6, 7))
        testJobE = getJob()
        testJobE.addFile(testFileF)

        testFileG = File(lfn=makeUUID(), size=1024, events=1024)
        testFileG.addRun(Run(2, 10, 11, 12))
        testJobF = getJob()
        testJobF.addFile(testFileG)

        testFileH = File(lfn=makeUUID(), size=1024, events=1024)
        testFileH.addRun(Run(2, 15))
        testJobG = getJob()
        testJobG.addFile(testFileH)

        testFileI = File(lfn=makeUUID(), size=1024, events=1024)
        testFileI.addRun(Run(3, 20))
        testJobH = getJob()
        testJobH.addFile(testFileI)

        testFileJ = File(lfn=makeUUID(), size=1024, events=1024)
        testFileJ.addRun(Run(1, 9))
        testJobI = getJob()
        testJobI.addFile(testFileJ)

        dcs.failedJobs([testJobA, testJobB, testJobC, testJobD, testJobE,
                        testJobF, testJobG, testJobH, testJobI])

        whiteList = dcs.getLumiWhitelist("ACDCTest", "/ACDCTest/reco")
        self.assertEqual(len(whiteList.keys()), 3,
                         "Error: There should be 3 runs.")
        self.assertEqual(whiteList["1"], [[1, 4], [6, 7], [9, 9], [11, 12]],
                         "Error: Whitelist for run 1 is wrong.")
        self.assertEqual(whiteList["2"], [[5, 7], [10, 12], [15, 15]],
                         "Error: Whitelist for run 2 is wrong.")
        self.assertEqual(whiteList["3"], [[20, 20]],
                         "Error: Whitelist for run 3 is wrong.")
        return
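# --------------------------------------------------------------------------
# Illustrative sketch, not the DataCollectionService implementation: the
# expected whitelist above is simply each run's set of failed lumis collapsed
# into inclusive [first, last] ranges. The helper below (a hypothetical name
# for this example) reproduces that collapsing for the run 1 lumis in the test.

def collapseLumis(lumis):
    """Collapse a collection of lumi numbers into sorted [start, end] ranges."""
    ranges = []
    for lumi in sorted(set(lumis)):
        if ranges and lumi == ranges[-1][1] + 1:
            ranges[-1][1] = lumi          # extend the current contiguous range
        else:
            ranges.append([lumi, lumi])   # start a new range
    return ranges

# Run 1 lumis from the test: [1, 2, 3], [4, 6], [7], [9], [11, 12]
assert collapseLumis([1, 2, 3, 4, 6, 7, 9, 11, 12]) == [[1, 4], [6, 7], [9, 9], [11, 12]]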