def generateCreateFailedReports(self, createFailedJobs):
    """
    _generateCreateFailedReports_

    Create and store FWJR for the jobs that failed on creation
    leaving meaningful information about what happened with them
    """
    if not createFailedJobs:
        return

    fjrsToSave = []
    for failedJob in createFailedJobs:
        report = Report()
        defaultMsg = "There is a condition which assures that this job will fail if it's submitted"
        report.addError("CreationFailure", 99305, "CreationFailure",
                        failedJob.get("failedReason", defaultMsg))
        jobCache = failedJob.getCache()
        try:
            fjrPath = os.path.join(jobCache, "Report.0.pkl")
            report.save(fjrPath)
            fjrsToSave.append({"jobid": failedJob["id"], "fwjrpath": fjrPath})
            failedJob["fwjr"] = report
        except Exception:
            logging.error("Something went wrong while saving the report for job %s", failedJob["id"])

    myThread = threading.currentThread()
    self.setFWJRPath.execute(binds=fjrsToSave,
                             conn=myThread.transaction.conn,
                             transaction=True)
    return
def failJobs(self, failedJobs):
    """
    _failJobs_

    Dump those jobs that have failed due to timeout
    """
    if len(failedJobs) == 0:
        return

    jrBinds = []
    for job in failedJobs:
        # Make sure the job object goes packed with fwjr_path to be persisted in couch
        jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count']))
        jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})
        fwjr = Report()
        try:
            fwjr.load(jrPath)
        except Exception:
            # Something went wrong reading the pickle
            logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
            fwjr = Report()
            msg = "The job failed due to a timeout, unfortunately the original job report was lost"
            fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
            fwjr.save(jrPath)
        job["fwjr"] = fwjr

    myThread = threading.currentThread()
    myThread.transaction.begin()
    self.setFWJRAction.execute(binds=jrBinds,
                               conn=myThread.transaction.conn,
                               transaction=True)
    self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
    logging.info("Failed %i jobs", len(failedJobs))
    myThread.transaction.commit()
    return
def generateCreateFailedReports(self, createFailedJobs):
    """
    _generateCreateFailedReports_

    Create and store FWJR for the jobs that failed on creation
    leaving meaningful information about what happened with them
    """
    if not createFailedJobs:
        return

    fjrsToSave = []
    for failedJob in createFailedJobs:
        report = Report()
        defaultMsg = "There is a condition which assures that this job will fail if it's submitted"
        report.addError("CreationFailure", 99305, "CreationFailure",
                        failedJob.get("failedReason", defaultMsg))
        jobCache = failedJob.getCache()
        try:
            fjrPath = os.path.join(jobCache, "Report.0.pkl")
            report.save(fjrPath)
            fjrsToSave.append({"jobid": failedJob["id"], "fwjrpath": fjrPath})
            failedJob["fwjr"] = report
        except Exception:
            logging.error("Something went wrong while saving the report for job %s" % failedJob["id"])

    myThread = threading.currentThread()
    self.setFWJRPath.execute(binds = fjrsToSave, conn = myThread.transaction.conn, transaction = True)
    return
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile
    """
    for job in jobs:
        if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
            # Then we can't do anything
            logging.error("Can't find this job's cache_dir or retry count: %s", job)
            continue

        reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # everything in order, move on
            continue
        elif os.path.isdir(reportName):
            # Then something weird has happened. Report error, do nothing
            logging.error("The job report for job with id %s and gridid %s is a directory",
                          job['id'], job['gridid'])
            logging.error("Ignoring this, but this is very strange")
        else:
            logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

            if os.path.isfile(reportName):
                os.remove(reportName)

            # create a report from scratch
            condorReport = Report()
            logOutput = 'Could not find jobReport\n'

            if os.path.isdir(job['cache_dir']):
                condorOut = "condor.%s.out" % job['gridid']
                condorErr = "condor.%s.err" % job['gridid']
                condorLog = "condor.%s.log" % job['gridid']
                for condorFile in [condorOut, condorErr, condorLog]:
                    condorFilePath = os.path.join(job['cache_dir'], condorFile)
                    if os.path.isfile(condorFilePath):
                        logTail = BasicAlgos.tail(condorFilePath, 50)
                        logOutput += 'Adding end of %s to error message:\n' % condorFile
                        logOutput += '\n'.join(logTail)
                condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)
            else:
                msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                msg += "Creating a new cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)

            condorReport.save(filename=reportName)

            logging.debug("Created failed job report for job with id %s and gridid %s",
                          job['id'], job['gridid'])
    return
def createMissingFWKJR(self, errorCode=999, errorDescription='Failure of unknown type'):
    """
    _createMissingFWJR_

    Create a missing FWJR if the report can't be found by the code in the
    path location.
    """
    report = Report()
    report.addError("cmsRun1", errorCode, "MissingJobReport", errorDescription)
    report.data.cmsRun1.status = "Failed"
    return report
def createMissingFWKJR(self, errorCode=999, errorDescription='Failure of unknown type'):
    """
    _createMissingFWJR_

    Create a missing FWJR if the report can't be found by the code in the
    path location.
    """
    report = Report()
    report.addError("cmsRun1", 84, errorCode, errorDescription)
    report.data.cmsRun1.status = "Failed"
    return report
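A minimal, illustrative sketch of how such a missing-report FWJR could be built and inspected on its own, using only the Report calls that appear in the snippets here (addError, getExitCode, save). The import path and the output file name are assumptions for the example, not taken from this section.

# Assumed import path for the Report class used throughout these snippets
from WMCore.FwkJobReport.Report import Report

report = Report()
# Same pattern as createMissingFWKJR(): record a single error on the cmsRun1 step
report.addError("cmsRun1", 999, "MissingJobReport", "Failure of unknown type")
report.data.cmsRun1.status = "Failed"
# getExitCode() returns the first valid non-zero code, so it reports 999 here
print(report.getExitCode())
# Persist it the same way the pollers do, so downstream code can load() it later
report.save("Report.0.pkl")  # hypothetical path chosen for the example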
def submit(self, jobs, info=None):
    """
    _submit_

    Submits jobs to the condor queue
    """
    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):
        (sub, jobParams) = self.createSubmitRequest(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python Submit", len(jobParams))
        try:
            with schedd.transaction() as txn:
                submitRes = sub.queue_with_itemdata(txn, 1, iter(jobParams))
                clusterId = submitRes.cluster()
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.exception(str(ex))
            logging.error("Moving on to the next batch of jobs and/or cycle....")

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def submit(self, jobs, info=None):
    """
    _submit_

    Submits jobs to the condor queue
    """
    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):
        clusterAd = self.getClusterAd()
        procAds = self.getProcAds(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
        try:
            # 4th argument has to be None otherwise HTCondor leaks the result ads
            # through it (as of 8.7.x). More info in WMCore/#8729
            clusterId = schedd.submitMany(clusterAd, procAds, False, None)
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.exception(str(ex))
            logging.error("Moving on to the next batch of jobs and/or cycle....")

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Job submission to condor succeeded, clusterId is %s", clusterId)
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):
        clusterAd = self.getClusterAd()
        procAds = self.getProcAds(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany", len(procAds))
        try:
            clusterId = schedd.submitMany(clusterAd, procAds)
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.error("Moving on to the next batch of jobs and/or cycle....")
            logging.exception(ex)

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def failJobs(self, failedJobs):
    """
    _failJobs_

    Dump those jobs that have failed due to timeout
    """
    if len(failedJobs) == 0:
        return

    myThread = threading.currentThread()

    # Load DAOs
    setFWJRAction = self.daoFactory(classname = "Jobs.SetFWJRPath")
    loadAction = self.daoFactory(classname = "Jobs.LoadFromID")

    jrBinds = []
    for job in failedJobs:
        jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count']))
        jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})
        # Make sure the job object goes packed with fwjr_path so it
        # can be persisted in couch
        fwjr = Report()
        try:
            fwjr.load(jrPath)
        except Exception:
            # Something went wrong reading the pickle
            logging.error("The pickle in %s could not be loaded, generating a new one" % jrPath)
            fwjr = Report()
            msg = "The job failed due to a timeout, unfortunately the original job report was lost"
            fwjr.addError("NoJobReport", 99303, "NoJobReport", msg)
            fwjr.save(jrPath)
        job["fwjr"] = fwjr

    # Set all paths at once
    myThread.transaction.begin()
    setFWJRAction.execute(binds = jrBinds)
    self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
    logging.info("Failed %i jobs" % (len(failedJobs)))
    myThread.transaction.commit()
    return
def createReport(self, outcome = 0):
    """
    Create a test report
    """
    jobReport = Report()
    jobReport.addStep('cmsRun1')
    jobReport.setStepStartTime(stepName = 'cmsRun1')
    jobReport.setStepStopTime(stepName = 'cmsRun1')

    if outcome:
        jobReport.addError('cmsRun1', 200, 'FakeError', 'FakeError')

    return jobReport
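A brief, hypothetical usage of the helper above inside a test case; the assertions rely on the getExitCode() behaviour demonstrated by the testExitCode snippets elsewhere in this section (the first valid non-zero code wins). The test method name is an assumption for illustration.

def testCreateReport(self):
    """
    Hypothetical companion test for createReport()
    """
    failedReport = self.createReport(outcome = 1)
    self.assertEqual(failedReport.getExitCode(), 200)  # the FakeError recorded above
    successReport = self.createReport()
    self.assertEqual(successReport.getExitCode(), 0)   # no errors recorded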
def testExitCode(self):
    """
    _testExitCode_

    Test and see if we can get an exit code out of a report

    Note: Errors without a return code return 99999
    """
    report = Report("cmsRun1")
    self.assertEqual(report.getExitCode(), 0)
    report.addError(stepName = "cmsRun1", exitCode = None, errorType = "test", errorDetails = "test")
    self.assertEqual(report.getExitCode(), 99999)
    self.assertEqual(report.getStepExitCode(stepName = "cmsRun1"), 99999)
    report.addError(stepName = "cmsRun1", exitCode = '12345', errorType = "test", errorDetails = "test")
    self.assertEqual(report.getExitCode(), 12345)
    self.assertEqual(report.getStepExitCode(stepName = "cmsRun1"), 12345)
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    schedd = htcondor.Schedd()

    # Submit the jobs
    for jobsReady in grouper(jobs, self.jobsPerSubmit):
        cluster_ad = self.getClusterAd()
        proc_ads = self.getProcAds(jobsReady)

        logging.debug("Start: Submitting %d jobs using Condor Python SubmitMany" % len(proc_ads))
        try:
            clusterId = schedd.submitMany(cluster_ad, proc_ads)
        except Exception as ex:
            logging.error("SimpleCondorPlugin job submission failed.")
            logging.error("Moving on to the next batch of jobs and/or cycle....")
            logging.exception(ex)

            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", str(ex))
            for job in jobsReady:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
        else:
            logging.debug("Finish: Submitting jobs using Condor Python SubmitMany")
            for index, job in enumerate(jobsReady):
                job['gridid'] = "%s.%s" % (clusterId, index)
                job['status'] = 'Idle'
                successfulJobs.append(job)

    # We must return a list of jobs successfully submitted and a list of jobs failed
    logging.info("Done submitting jobs for this cycle in SimpleCondorPlugin")
    return successfulJobs, failedJobs
def failJobs(self, failedJobs):
    """
    _failJobs_

    Dump those jobs that have failed due to timeout
    """
    if len(failedJobs) == 0:
        return

    jrBinds = []
    for job in failedJobs:
        # Make sure the job object goes packed with fwjr_path to be persisted in couch
        jrPath = os.path.join(job.getCache(), 'Report.%i.pkl' % (job['retry_count']))
        jrBinds.append({'jobid': job['id'], 'fwjrpath': jrPath})
        fwjr = Report()
        try:
            fwjr.load(jrPath)
        except Exception:
            # Something went wrong reading the pickle
            logging.error("The pickle in %s could not be loaded, generating a new one", jrPath)
            fwjr = Report()
            fwjr.addError("NoJobReport", 99303, "NoJobReport", WM_JOB_ERROR_CODES[99303])
            fwjr.save(jrPath)
        job["fwjr"] = fwjr

    myThread = threading.currentThread()
    myThread.transaction.begin()
    self.setFWJRAction.execute(binds=jrBinds, conn=myThread.transaction.conn, transaction=True)
    self.changeState.propagate(failedJobs, 'jobfailed', 'executing')
    logging.info("Failed %i jobs", len(failedJobs))
    myThread.transaction.commit()
    return
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile
    """
    for job in jobs:
        if job.get("cache_dir", None) is None or job.get("retry_count", None) is None:
            # Then we can't do anything
            logging.error("Can't find this job's cache_dir in CondorPlugin.complete")
            logging.error("cache_dir: %s" % job.get("cache_dir", "Missing"))
            logging.error("retry_count: %s" % job.get("retry_count", "Missing"))
            continue

        reportName = os.path.join(job["cache_dir"], "Report.%i.pkl" % job["retry_count"])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # Then we have a real report.
            # Do nothing
            continue

        if os.path.isdir(reportName):
            # Then something weird has happened.
            # File error, do nothing
            logging.error("Went to check on error report for job %i. Found a directory instead.\n" % job["id"])
            logging.error("Ignoring this, but this is very strange.\n")

        # If we're still here, we must not have a real error report
        logOutput = "Could not find jobReport"
        logPath = os.path.join(job["cache_dir"], "condor.log")
        if os.path.isfile(logPath):
            logTail = BasicAlgos.tail(logPath, 50)
            logOutput += "Adding end of condor.log to error message:\n"
            logOutput += logTail
        condorReport = Report()
        condorReport.addError("NoJobReport", 61303, "NoJobReport", logOutput)
        condorReport.save(filename=reportName)
        logging.debug("No returning job report for job %i" % job["id"])
    return
    queueError = True
    continue

if not exitCode == 0:
    logging.error("Condor returned non-zero. Printing out command stderr")
    logging.error(error)
    errorCheck, errorMsg = parseError(error = error)
    logging.error("Processing failed jobs and proceeding to the next jobs.")
    logging.error("Do not restart component.")
else:
    errorCheck = None

if errorCheck:
    self.errorCount += 1
    condorErrorReport = Report()
    condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
    for jobID in idList:
        for job in jobs:
            if job.get('id', None) == jobID:
                job['fwjr'] = condorErrorReport
                failedJobs.append(job)
                break
else:
    if self.errorCount > 0:
        self.errorCount -= 1
    for jobID in idList:
        for job in jobs:
            if job.get('id', None) == jobID:
                successfulJobs.append(job)
                break
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile
    """
    for job in jobs:
        if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
            # Then we can't do anything
            logging.error("Can't find this job's cache_dir or retry count: %s", job)
            continue

        reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # everything in order, move on
            continue
        elif os.path.isdir(reportName):
            # Then something weird has happened. Report error, do nothing
            logging.error("The job report for job with id %s and gridid %s is a directory",
                          job['id'], job['gridid'])
            logging.error("Ignoring this, but this is very strange")
        else:
            logging.error("No job report for job with id %s and gridid %s", job['id'], job['gridid'])

            if os.path.isfile(reportName):
                os.remove(reportName)

            # create a report from scratch
            condorReport = Report()
            logOutput = 'Could not find jobReport\n'

            if os.path.isdir(job['cache_dir']):
                condorErr = "condor.%s.err" % job['gridid']
                condorOut = "condor.%s.out" % job['gridid']
                condorLog = "condor.%s.log" % job['gridid']

                exitCode = 99303
                exitType = "NoJobReport"

                for condorFile in [condorErr, condorOut, condorLog]:
                    condorFilePath = os.path.join(job['cache_dir'], condorFile)
                    logOutput += "\n========== %s ==========\n" % condorFile
                    if os.path.isfile(condorFilePath):
                        logTail = BasicAlgos.tail(condorFilePath, 50)
                        logOutput += 'Adding end of %s to error message:\n\n' % condorFile
                        logOutput += logTail
                        logOutput += '\n\n'

                        if condorFile == condorLog:
                            # for condor log, search for the information
                            for matchObj in getIterMatchObjectOnRegexp(condorFilePath, CONDOR_LOG_FILTER_REGEXP):
                                condorReason = matchObj.group("Reason")
                                if condorReason:
                                    logOutput += condorReason
                                    if "SYSTEM_PERIODIC_REMOVE" in condorReason or "via condor_rm" in condorReason:
                                        exitCode = 99400
                                        exitType = "RemovedByGLIDEIN"
                                    else:
                                        exitCode = 99401

                                siteName = matchObj.group("Site")
                                if siteName:
                                    condorReport.data.siteName = siteName
                                else:
                                    condorReport.data.siteName = "NoReportedSite"
                        else:
                            for matchObj in getIterMatchObjectOnRegexp(condorFilePath, WMEXCEPTION_REGEXP):
                                errMsg = matchObj.group('WMException')
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg

                                errMsg = matchObj.group('ERROR')
                                if errMsg:
                                    logOutput += "\n\n%s\n" % errMsg

                logOutput += '\n\n'
                condorReport.addError(exitType, exitCode, exitType, logOutput)
            else:
                msg = "Serious Error in Completing condor job with id %s!\n" % job['id']
                msg += "Could not find jobCache directory %s\n" % job['cache_dir']
                msg += "Creating a new cache_dir for failed job report\n"
                logging.error(msg)
                os.makedirs(job['cache_dir'])
                condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)

            condorReport.save(filename=reportName)

            logging.debug("Created failed job report for job with id %s and gridid %s",
                          job['id'], job['gridid'])
    return
def complete(self, jobs):
    """
    Do any completion work required

    In this case, look for a returned logfile
    """
    for job in jobs:
        if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
            # Then we can't do anything
            logging.error("Can't find this job's cache_dir in CondorPlugin.complete")
            logging.error("cache_dir: %s" % job.get('cache_dir', 'Missing'))
            logging.error("retry_count: %s" % job.get('retry_count', 'Missing'))
            continue

        reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
        if os.path.isfile(reportName) and os.path.getsize(reportName) > 0:
            # Then we have a real report.
            # Do nothing
            continue

        if os.path.isdir(reportName):
            # Then something weird has happened.
            # File error, do nothing
            logging.error("Went to check on error report for job %i. Found a directory instead.\n" % job['id'])
            logging.error("Ignoring this, but this is very strange.\n")

        # If we're still here, we must not have a real error report
        logOutput = 'Could not find jobReport\n'

        # But we don't know exactly the condor id, so it will append
        # the last lines of the latest condor log in cache_dir
        genLogPath = os.path.join(job['cache_dir'], 'condor.*.*.log')
        logPaths = glob.glob(genLogPath)
        errLog = None
        if len(logPaths):
            errLog = max(logPaths, key = lambda path: os.stat(path).st_mtime)
        if errLog is not None and os.path.isfile(errLog):
            logTail = BasicAlgos.tail(errLog, 50)
            logOutput += 'Adding end of condor.log to error message:\n'
            logOutput += '\n'.join(logTail)

        if not os.path.isdir(job['cache_dir']):
            msg = "Serious Error in Completing condor job with id %s!\n" % job.get('id', 'unknown')
            msg += "Could not find jobCache directory - directory deleted under job: %s\n" % job['cache_dir']
            msg += "Creating artificial cache_dir for failed job report\n"
            logging.error(msg)
            os.makedirs(job['cache_dir'])
            logOutput += msg
            condorReport = Report()
            condorReport.addError("NoJobReport", 99304, "NoCacheDir", logOutput)
            condorReport.save(filename = reportName)
            continue

        condorReport = Report()
        condorReport.addError("NoJobReport", 99303, "NoJobReport", logOutput)

        if os.path.isfile(reportName):
            # Then we have a file already there.  It should be zero size due
            # to the if statements above, but we should remove it.
            if os.path.getsize(reportName) > 0:
                # This should never happen.  If it does, ignore it
                msg = "Critical strange problem.  FWJR changed size while being processed."
                logging.error(msg)
            else:
                try:
                    os.remove(reportName)
                    condorReport.save(filename = reportName)
                except Exception as ex:
                    logging.error("Cannot remove and replace empty report %s" % reportName)
                    logging.error("Report continuing without error!")
        else:
            condorReport.save(filename = reportName)

        # Debug message to end loop
        logging.debug("No returning job report for job %i" % job['id'])
    return
def submit(self, jobs, info=None):
    """
    _submit_

    Submit jobs for one subscription
    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.queue = self.config.JobSubmitter.LsfPluginQueue
    self.resourceReq = getattr(self.config.JobSubmitter, 'LsfPluginResourceReq', None)
    self.jobGroup = self.config.JobSubmitter.LsfPluginJobGroup
    self.batchOutput = getattr(self.config.JobSubmitter, 'LsfPluginBatchOutput', None)

    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    # Now assume that what we get is the following; a mostly
    # unordered list of jobs with random sandboxes.
    # We intend to sort them by sandbox.
    submitDict = {}
    for job in jobs:
        sandbox = job['sandbox']
        if not sandbox in submitDict.keys():
            submitDict[sandbox] = []
        submitDict[sandbox].append(job)

    # Now submit the bastards
    for sandbox in submitDict.keys():
        jobList = submitDict.get(sandbox, [])
        while len(jobList) > 0:
            jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
            jobList = jobList[self.config.JobSubmitter.jobsPerWorker:]

            for job in jobsReady:
                if job == {}:
                    # Then I don't know how we got here either
                    logging.error("Was passed a nonexistent job. Ignoring")
                    continue

                submitScript = self.makeSubmit(job)
                if not submitScript:
                    # Then we got nothing
                    logging.error("No submit script made!")
                    return {'NoResult': [0]}

                submitScriptFile = os.path.join(job['cache_dir'], "submit.sh")
                handle = open(submitScriptFile, 'w')
                handle.writelines(submitScript)
                handle.close()

                # make reasonable job name
                jobName = "WMAgentJob"
                regExpParser = re.compile('.*/JobCreator/JobCache/([^/]+)/[^/]+/.*')
                match = regExpParser.match(job['cache_dir'])
                if match != None:
                    jobName = "%s-%s" % (match.group(1), job['id'])

                # //
                # // Submit LSF job
                # //
                command = 'bsub'
                command += ' -q %s' % self.queue

                if self.resourceReq != None:
                    command += ' -R "%s"' % self.resourceReq

                command += ' -g %s' % self.jobGroup
                command += ' -J %s' % jobName

                lsfLogDir = self.batchOutput
                if lsfLogDir != None:
                    now = datetime.datetime.today()
                    lsfLogDir += '/%s' % now.strftime("%Y%m%d%H")
                    try:
                        os.mkdir(lsfLogDir)
                        logging.debug("Created directory %s" % lsfLogDir)
                    except OSError as err:
                        # suppress LSF log unless it's about an already existing directory
                        if err.errno != errno.EEXIST or not os.path.isdir(lsfLogDir):
                            logging.error("Can't create directory %s, turning off LSF log" % lsfLogDir)
                            lsfLogDir = None

                if lsfLogDir == None:
                    command += ' -oo /dev/null'
                else:
                    command += ' -oo %s/%s.%%J.out' % (lsfLogDir, jobName)

                command += ' < %s' % submitScriptFile

                logging.info("Submitting LSF job: %s" % command)

                p = subprocess.Popen(command, shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.STDOUT)
                stdout = p.communicate()[0]
                returncode = p.returncode

                if returncode == 0:
                    # check for correct naming convention in PFN
                    regExpParser = re.compile('Job <([0-9]+)> is submitted to queue')
                    match = regExpParser.match(stdout)
                    if match != None:
                        job['gridid'] = match.group(1)
                        successfulJobs.append(job)
                        logging.info("LSF Job ID : %s" % job['gridid'])
                        continue
                    else:
                        logging.error("bsub didn't return a valid Job ID. Job is not submitted")
                        logging.error(stdout)

                lsfErrorReport = Report()
                lsfErrorReport.addError("JobSubmit", 61202, "LsfError", stdout)
                job['fwjr'] = lsfErrorReport
                failedJobs.append(job)
def submit(self, jobs, info):
    """
    _submit_

    Submit jobs for one subscription
    """
    if len(self.pool) == 0:
        # Starting things up
        # This is obviously a submit API
        for x in range(self.nProcess):
            p = multiprocessing.Process(target=submitWorker, args=(self.input, self.result))
            p.start()
            self.pool.append(p)

    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.submitDir = self.config.JobSubmitter.submitDir
    timeout = getattr(self.config.JobSubmitter, "getTimeout", 300)

    if not os.path.exists(self.submitDir):
        os.makedirs(self.submitDir)

    successfulJobs = []
    failedJobs = []
    jdlFiles = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    # Now assume that what we get is the following; a mostly
    # unordered list of jobs with random sandboxes.
    # We intend to sort them by sandbox.
    submitDict = {}
    nSubmits = 0
    for job in jobs:
        sandbox = job["sandbox"]
        if not sandbox in submitDict.keys():
            submitDict[sandbox] = []
        submitDict[sandbox].append(job)

    # Now submit the bastards
    for sandbox in submitDict.keys():
        jobList = submitDict.get(sandbox, [])
        idList = [x["jobid"] for x in jobList]
        while len(jobList) > 0:
            jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
            jobList = jobList[self.config.JobSubmitter.jobsPerWorker:]
            idList = [x["id"] for x in jobsReady]
            jdlList = self.makeSubmit(jobList=jobsReady)
            if not jdlList or jdlList == []:
                # Then we got nothing
                logging.error("No JDL file made!")
                return {"NoResult": [0]}
            jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0])
            handle = open(jdlFile, "w")
            handle.writelines(jdlList)
            handle.close()
            jdlFiles.append(jdlFile)

            # Now submit them
            logging.info("About to submit %i jobs" % (len(jobsReady)))
            command = "condor_submit %s" % jdlFile
            self.input.put({"command": command, "idList": idList})
            nSubmits += 1

    # Now we should have sent all jobs to be submitted
    # Going to do the rest of it now
    for n in range(nSubmits):
        res = self.result.get(block=True, timeout=timeout)
        output = res["stdout"]
        error = res["stderr"]
        idList = res["idList"]

        if not error == "":
            logging.error("Printing out command stderr")
            logging.error(error)

        errorCheck, errorMsg = parseError(error=error)

        if errorCheck:
            condorErrorReport = Report()
            condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg)
            for jobID in idList:
                for job in jobs:
                    if job.get("id", None) == jobID:
                        job["fwjr"] = condorErrorReport
                        failedJobs.append(job)
                        break
        else:
            for jobID in idList:
                for job in jobs:
                    if job.get("id", None) == jobID:
                        successfulJobs.append(job)
                        break

    # Remove JDL files unless commanded otherwise
    if getattr(self.config.JobSubmitter, "deleteJDLFiles", True):
        for f in jdlFiles:
            os.remove(f)

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    return successfulJobs, failedJobs
def kill(self, jobs, killMsg = None, errorCode = 61300):
    """
    _kill_

    Kill jobs using plugin functions:

    Only active jobs (status = 1) will be killed.

    An optional killMsg can be sent; this will be written into the job FWJR.
    The errorCode will be the one specified and if no killMsg is provided
    then a standard message associated with the exit code will be used.
    If a previous FWJR exists, this error will be appended to it.
    """
    if not len(jobs):
        # Nothing to do here
        return

    self.check()

    jobsToKill = {}

    # Now get a list of which jobs are in the batch system
    # only kill jobs present there
    loadedJobs = self._buildRunningJobs(wmbsJobs = jobs)

    for runningJob in loadedJobs:
        plugin = runningJob['plugin']
        if not plugin in jobsToKill.keys():
            jobsToKill[plugin] = []
        jobsToKill[plugin].append(runningJob)

    for plugin in jobsToKill.keys():
        if not plugin in self.plugins.keys():
            msg = "Jobs tracking with non-existent plugin %s\n" % (plugin)
            msg += "They were submitted but can't be tracked?\n"
            msg += "That's too strange to continue\n"
            logging.error(msg)
            raise BossAirException(msg)
        else:
            # Then we send them to the plugins
            try:
                pluginInst = self.plugins[plugin]
                pluginInst.kill(jobs = jobsToKill[plugin])
                # Register the killed jobs
                for job in jobsToKill[plugin]:
                    if job.get('cache_dir', None) is None or job.get('retry_count', None) is None:
                        continue
                    # Try to save an error report as the jobFWJR
                    if not os.path.isdir(job['cache_dir']):
                        # Then we have a bad cache directory
                        logging.error("Could not write a kill FWJR due to non-existent cache_dir for job %i\n" % job['id'])
                        logging.debug("cache_dir: %s\n" % job['cache_dir'])
                        continue
                    reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count'])
                    errorReport = Report()
                    if os.path.exists(reportName) and os.path.getsize(reportName) > 0:
                        # Then there's already a report there.  Add messages
                        errorReport.load(reportName)
                    # Build a better job message
                    if killMsg:
                        reportedMsg = killMsg
                        reportedMsg += '\n Job last known status was: %s' % job.get('globalState', 'Unknown')
                    else:
                        reportedMsg = WMJobErrorCodes[errorCode]
                        reportedMsg += '\n Job last known status was: %s' % job.get('globalState', 'Unknown')
                    errorReport.addError("JobKilled", errorCode, "JobKilled", reportedMsg)
                    try:
                        errorReport.save(filename = reportName)
                    except IOError as ioe:
                        logging.warning('Cannot write report %s because of %s' % (reportName, ioe))
            except WMException:
                raise
            except Exception as ex:
                msg = "Unhandled exception while calling kill method for plugin %s\n" % plugin
                msg += str(ex)
                logging.error(msg)
                logging.debug("Interrupted while killing following jobs: %s\n" % jobsToKill[plugin])
                raise BossAirException(msg)
            finally:
                # Even if kill fails, complete the jobs
                self._complete(jobs = jobsToKill[plugin])
    return
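For context, a hedged usage sketch of the kill() entry point above: the caller only supplies the job list and, optionally, a message and error code, while the method itself loads or creates the FWJR and appends the "JobKilled" error. The bossAir variable and the message text are assumptions for illustration, not taken from this section.

# bossAir is assumed to be an instance of the BossAir-like class that owns kill()
# jobs is a list of job dictionaries carrying at least 'cache_dir' and 'retry_count'
bossAir.kill(jobs, killMsg = "Workflow aborted by operator", errorCode = 61300)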
def execute(self, emulator = None):
    """
    _execute_

    """
    # Are we using emulators again?
    if emulator != None:
        return emulator.emulate(self.step, self.job)

    overrides = {}
    if hasattr(self.step, 'override'):
        overrides = self.step.override.dictionary_()

    # Set wait to two hours per retry
    # this alarm leaves a subprocess behind that may cause trouble, see #6273
    waitTime = overrides.get('waitTime', 7200 * self.step.retryCount)

    logging.info("StageOut override is: %s ", self.step)

    # Pull out StageOutMgr Overrides
    # switch between old stageOut behavior and new, fancy stage out behavior
    useNewStageOutCode = False
    if getattr(self.step, 'newStageout', False) or \
            ('newStageOut' in overrides and overrides.get('newStageOut')):
        useNewStageOutCode = True

    stageOutCall = {}
    if "command" in overrides and "option" in overrides \
            and "phedex-node" in overrides \
            and "lfn-prefix" in overrides:
        logging.critical('using override in StageOut')
        stageOutCall['command'] = overrides.get('command')
        stageOutCall['option'] = overrides.get('option')
        stageOutCall['phedex-node'] = overrides.get('phedex-node')
        stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix')

    # naw man, this is real
    # iterate over all the incoming files
    if not useNewStageOutCode:
        # old style
        manager = StageOutMgr(**stageOutCall)
        manager.numberOfRetries = self.step.retryCount
        manager.retryPauseTime = self.step.retryDelay
    else:
        # new style
        logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE")
        print("STAGEOUT IS USING NEW STAGEOUT CODE")
        manager = FMStageOutMgr(retryPauseTime = self.step.retryDelay,
                                numberOfRetries = self.step.retryCount,
                                **stageOutCall)

    # We need to find a list of steps in our task
    # And eventually a list of jobReports for out steps

    # Search through steps for report files
    filesTransferred = []

    for step in self.stepSpace.taskSpace.stepSpaces():
        if step == self.stepName:
            # Don't try to parse your own report; it's not there yet
            continue

        stepLocation = os.path.join(self.stepSpace.taskSpace.location, step)
        logging.info("Beginning report processing for step %s", step)
        reportLocation = os.path.join(stepLocation, 'Report.pkl')
        if not os.path.isfile(reportLocation):
            logging.error("Cannot find report for step %s in space %s", step, stepLocation)
            continue

        # First, get everything from a file and 'unpersist' it
        stepReport = Report()
        stepReport.unpersist(reportLocation, step)

        # Don't stage out files from bad steps.
        if not stepReport.stepSuccessful(step):
            continue

        # Okay, time to start using stuff
        # Now I'm a bit confused about this; each report should ONLY
        # Have the results of that particular step in it,
        # So getting all the files should get ONLY the files
        # for that step; or so I hope
        files = stepReport.getAllFileRefsFromStep(step = step)
        for fileName in files:
            # make sure the file information is consistent
            if hasattr(fileName, 'pfn') and (not hasattr(fileName, 'lfn') or not hasattr(fileName, 'module_label')):
                msg = "Not a valid file: %s" % fileName
                logging.error(msg)
                continue

            # Figuring out if we should do straight to merge
            # - should we do straight to merge at all ?
            # - is straight to merge disabled for this output ?
            # - are we over the size threshold ?
            # - are we over the event threshold ?
            straightToMerge = False
            if not getattr(fileName, 'merged', False) and hasattr(self.step.output, 'minMergeSize'):
                if fileName.module_label not in getattr(self.step.output, 'forceUnmergedOutputs', []):
                    if getattr(fileName, 'size', 0) >= self.step.output.minMergeSize:
                        straightToMerge = True
                    if getattr(fileName, 'events', 0) >= getattr(self.step.output, 'maxMergeEvents', sys.maxsize):
                        straightToMerge = True

            if straightToMerge:
                try:
                    fileName = self.handleLFNForMerge(mergefile = fileName, step = step)
                except Exception as ex:
                    logging.info("minMergeSize: %s", getattr(self.step.output, 'minMergeSize', None))
                    logging.info("maxMergeEvents: %s", getattr(self.step.output, 'maxMergeEvents', None))
                    logging.error("Encountered error while handling LFN for merge %s", fileName)
                    logging.error(str(ex))
                    manager.cleanSuccessfulStageOuts()
                    stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex))

            # Save the input PFN in case we need it
            # Undecided whether to move fileName.pfn to the output PFN
            fileName.InputPFN = fileName.pfn
            lfn = getattr(fileName, 'lfn')
            fileSource = getattr(fileName, 'Source', None)
            if fileSource in ['TFileService', 'UserDefined']:
                userLfnRegEx(lfn)
            else:
                lfnRegEx(lfn)

            fileForTransfer = {'LFN': lfn,
                               'PFN': getattr(fileName, 'pfn'),
                               'PNN': None,
                               'StageOutCommand': None,
                               'Checksums': getattr(fileName, 'checksums', None)}
            signal.signal(signal.SIGALRM, alarmHandler)
            signal.alarm(waitTime)
            try:
                manager(fileForTransfer)
                # Afterwards, the file should have updated info.
                filesTransferred.append(fileForTransfer)
                fileName.StageOutCommand = fileForTransfer['StageOutCommand']
                fileName.location = fileForTransfer['PNN']
                fileName.OutputPFN = fileForTransfer['PFN']
            except Alarm:
                msg = "Indefinite hang during stageOut of logArchive"
                logging.error(msg)
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg)
                stepReport.setStepStatus(self.stepName, 1)
                # well, if it fails for one file, it fails for the whole job...
                break
            except Exception as ex:
                manager.cleanSuccessfulStageOuts()
                stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex))
                stepReport.setStepStatus(self.stepName, 1)
                stepReport.persist(reportLocation)
                raise

            signal.alarm(0)

        # Am DONE with report. Persist it
        stepReport.persist(reportLocation)

    # Done with all steps, and should have a list of
    # stagedOut files in fileForTransfer
    logging.info("Transferred %i files", len(filesTransferred))
    return
def testExitCode(self):
    """
    _testExitCode_

    Test and see if we can get an exit code out of a report

    Note: Errors without a return code return 99999

    getStepExitCode: returns the first valid and non-zero exit code
    getExitCode: uses the method above to get an exit code
    getStepExitCodes: returns a set of all exit codes within the step
    """
    report = Report("cmsRun1")
    self.assertEqual(report.getExitCode(), 0)
    self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 0)
    self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {})
    self.assertItemsEqual(report.getStepErrors(stepName="cmsRun1"), {})

    report.addError(stepName="cmsRun1", exitCode=None, errorType="test", errorDetails="test")
    # None is not a valid exitCode, but it will get mapped to 99999
    self.assertEqual(report.getExitCode(), 99999)
    self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 99999)
    self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999})
    self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 1)

    report.addError(stepName="cmsRun1", exitCode=12345, errorType="test", errorDetails="test")
    self.assertEqual(report.getExitCode(), 12345)
    self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
    self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 12345})
    self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 2)

    report.addError(stepName="cmsRun1", exitCode=123, errorType="test", errorDetails="test")
    self.assertEqual(report.getExitCode(), 12345)
    self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
    self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 12345, 123})
    self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 3)

    # now try to record the same exit code once again
    report.addError(stepName="cmsRun1", exitCode=12345, errorType="test", errorDetails="test")
    self.assertEqual(report.getExitCode(), 12345)
    self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
    self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 12345, 123})
    self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 3)

    # and once again, but different type and details (which does not matter)
    report.addError(stepName="cmsRun1", exitCode=12345, errorType="testAA", errorDetails="testAA")
    self.assertEqual(report.getExitCode(), 12345)
    self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 12345)
    self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 12345, 123})
    self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 3)
def submit(self, jobs, info = None):
    """
    _submit_

    Submit jobs for one subscription
    """
    # If we're here, then we have submitter components
    self.scriptFile = self.config.JobSubmitter.submitScript
    self.queue = self.config.JobSubmitter.LsfPluginQueue
    self.resourceReq = getattr(self.config.JobSubmitter, 'LsfPluginResourceReq', None)
    self.jobGroup = self.config.JobSubmitter.LsfPluginJobGroup
    self.batchOutput = getattr(self.config.JobSubmitter, 'LsfPluginBatchOutput', None)

    successfulJobs = []
    failedJobs = []

    if len(jobs) == 0:
        # Then we have nothing to do
        return successfulJobs, failedJobs

    # Now assume that what we get is the following; a mostly
    # unordered list of jobs with random sandboxes.
    # We intend to sort them by sandbox.
    submitDict = {}
    for job in jobs:
        sandbox = job['sandbox']
        if not sandbox in submitDict.keys():
            submitDict[sandbox] = []
        submitDict[sandbox].append(job)

    # Now submit the bastards
    for sandbox in submitDict.keys():
        jobList = submitDict.get(sandbox, [])
        while len(jobList) > 0:
            jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker]
            jobList = jobList[self.config.JobSubmitter.jobsPerWorker:]

            for job in jobsReady:
                if job == {}:
                    # Then I don't know how we got here either
                    logging.error("Was passed a nonexistent job. Ignoring")
                    continue

                submitScript = self.makeSubmit(job)
                if not submitScript:
                    # Then we got nothing
                    logging.error("No submit script made!")
                    return {'NoResult': [0]}

                submitScriptFile = os.path.join(job['cache_dir'], "submit.sh")
                handle = open(submitScriptFile, 'w')
                handle.writelines(submitScript)
                handle.close()

                # //
                # // Submit LSF job
                # //
                command = 'bsub'
                command += ' -q %s' % self.queue

                if self.resourceReq != None:
                    command += ' -R "%s"' % self.resourceReq

                command += ' -g %s' % self.jobGroup
                command += ' -J %s' % "WMAgentJob"

                if self.batchOutput == None:
                    command += ' -oo /dev/null'
                else:
                    command += ' -oo %s' % self.batchOutput

                command += ' < %s' % submitScriptFile

                logging.info("Submitting LSF job: %s" % command)

                p = subprocess.Popen(command, shell = True,
                                     stdout = subprocess.PIPE,
                                     stderr = subprocess.STDOUT)
                stdout = p.communicate()[0]
                returncode = p.returncode

                if returncode == 0:
                    # check for correct naming convention in PFN
                    regExpParser = re.compile('Job <([0-9]+)> is submitted to queue')
                    match = regExpParser.match(stdout)
                    if match != None:
                        job['gridid'] = match.group(1)
                        successfulJobs.append(job)
                        continue

                lsfErrorReport = Report()
                lsfErrorReport.addError("JobSubmit", 61202, "LsfError", stdout)
                job['fwjr'] = lsfErrorReport
                failedJobs.append(job)

    # We must return a list of jobs successfully submitted,
    # and a list of jobs failed
    return successfulJobs, failedJobs
def submit(self, jobs, info): """ _submit_ Submit jobs for one subscription """ # If we're here, then we have submitter components self.scriptFile = self.config.JobSubmitter.submitScript self.submitDir = self.config.JobSubmitter.submitDir timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400) successfulJobs = [] failedJobs = [] jdlFiles = [] if len(jobs) == 0: # Then was have nothing to do return successfulJobs, failedJobs if len(self.pool) == 0: # Starting things up # This is obviously a submit API logging.info("Starting up CondorPlugin worker pool") self.input = multiprocessing.Queue() self.result = multiprocessing.Queue() for x in range(self.nProcess): p = multiprocessing.Process(target = submitWorker, args = (self.input, self.result, timeout)) p.start() self.pool.append(p) if not os.path.exists(self.submitDir): os.makedirs(self.submitDir) # Now assume that what we get is the following; a mostly # unordered list of jobs with random sandboxes. # We intend to sort them by sandbox. submitDict = {} nSubmits = 0 for job in jobs: sandbox = job['sandbox'] if not sandbox in submitDict.keys(): submitDict[sandbox] = [] submitDict[sandbox].append(job) # Now submit the bastards queueError = False for sandbox in submitDict.keys(): jobList = submitDict.get(sandbox, []) idList = [x['jobid'] for x in jobList] if queueError: # If the queue has failed, then we must not process # any more jobs this cycle. continue while len(jobList) > 0: jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker] jobList = jobList[self.config.JobSubmitter.jobsPerWorker:] idList = [x['id'] for x in jobsReady] jdlList = self.makeSubmit(jobList = jobsReady) if not jdlList or jdlList == []: # Then we got nothing logging.error("No JDL file made!") return {'NoResult': [0]} jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0]) handle = open(jdlFile, 'w') handle.writelines(jdlList) handle.close() jdlFiles.append(jdlFile) # Now submit them logging.info("About to submit %i jobs" %(len(jobsReady))) if self.glexecPath: command = 'CS=`which condor_submit`; ' if self.glexecWrapScript: command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile if self.glexecUnwrapScript: command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile) else: command += '%s $CS %s' % (self.glexecPath, jdlFile) else: command = "condor_submit %s" % jdlFile try: self.input.put({'command': command, 'idList': idList}) except AssertionError as ex: msg = "Critical error: input pipeline probably closed.\n" msg += str(ex) msg += "Error Procedure: Something critical has happened in the worker process\n" msg += "We will now proceed to pull all useful data from the queue (if it exists)\n" msg += "Then refresh the worker pool\n" logging.error(msg) queueError = True break nSubmits += 1 # Now we should have sent all jobs to be submitted # Going to do the rest of it now for n in range(nSubmits): try: res = self.result.get(block = True, timeout = timeout) except Queue.Empty: # If the queue was empty go to the next submit # Those jobs have vanished logging.error("Queue.Empty error received!") logging.error("This could indicate a critical condor error!") logging.error("However, no information of any use was obtained due to process failure.") 
logging.error("Either process failed, or process timed out after %s seconds." % timeout) queueError = True continue except AssertionError as ex: msg = "Found Assertion error while retrieving output from worker process.\n" msg += str(ex) msg += "This indicates something critical happened to a worker process" msg += "We will recover what jobs we know were submitted, and resubmit the rest" msg += "Refreshing worker pool at end of loop" logging.error(msg) queueError = True continue try: output = res['stdout'] error = res['stderr'] idList = res['idList'] exitCode = res['exitCode'] except KeyError as ex: msg = "Error in finding key from result pipe\n" msg += "Something has gone crticially wrong in the worker\n" try: msg += "Result: %s\n" % str(res) except: pass msg += str(ex) logging.error(msg) queueError = True continue if not exitCode == 0: logging.error("Condor returned non-zero. Printing out command stderr") logging.error(error) errorCheck, errorMsg = parseError(error = error) logging.error("Processing failed jobs and proceeding to the next jobs.") logging.error("Do not restart component.") else: errorCheck = None if errorCheck: self.errorCount += 1 condorErrorReport = Report() condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg) for jobID in idList: for job in jobs: if job.get('id', None) == jobID: job['fwjr'] = condorErrorReport failedJobs.append(job) break else: if self.errorCount > 0: self.errorCount -= 1 for jobID in idList: for job in jobs: if job.get('id', None) == jobID: successfulJobs.append(job) break # If we get a lot of errors in a row it's probably time to # report this to the operators. if self.errorCount > self.errorThreshold: try: msg = "Exceeded errorThreshold while submitting to condor. Check condor status." logging.error(msg) logging.error("Reporting to Alert system and continuing to process jobs") from WMCore.Alerts import API as alertAPI preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName = "BossAirCondorPlugin") sendAlert = alertAPI.getSendAlert(sender = sender, preAlert = preAlert) sendAlert(6, msg = msg) sender.unregister() self.errorCount = 0 except: # There's nothing we can really do here pass # Remove JDL files unless commanded otherwise if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True): for f in jdlFiles: os.remove(f) # When we're finished, clean up the queue workers in order # to free up memory (in the midst of the process, the forked # memory space shouldn't be touched, so it should still be # shared, but after this point any action by the Submitter will # result in memory duplication). logging.info("Purging worker pool to clean up memory") self.close() # We must return a list of jobs successfully submitted, # and a list of jobs failed logging.info("Done submitting jobs for this cycle in CondorPlugin") return successfulJobs, failedJobs
def submit(self, jobs, info=None): """ _submit_ Submit jobs for one subscription """ # If we're here, then we have submitter components self.scriptFile = self.config.JobSubmitter.submitScript self.submitDir = self.config.JobSubmitter.submitDir timeout = getattr(self.config.JobSubmitter, 'getTimeout', 400) successfulJobs = [] failedJobs = [] jdlFiles = [] if len(jobs) == 0: # Then was have nothing to do return successfulJobs, failedJobs if len(self.pool) == 0: # Starting things up # This is obviously a submit API logging.info("Starting up CondorPlugin worker pool") self.input = multiprocessing.Queue() self.result = multiprocessing.Queue() for x in range(self.nProcess): p = multiprocessing.Process(target = submitWorker, args = (self.input, self.result, timeout)) p.start() self.pool.append(p) if not os.path.exists(self.submitDir): os.makedirs(self.submitDir) # Now assume that what we get is the following; a mostly # unordered list of jobs with random sandboxes. # We intend to sort them by sandbox. submitDict = {} nSubmits = 0 for job in jobs: sandbox = job['sandbox'] if not sandbox in submitDict.keys(): submitDict[sandbox] = [] submitDict[sandbox].append(job) # Now submit the bastards queueError = False for sandbox in submitDict.keys(): jobList = submitDict.get(sandbox, []) idList = [x['jobid'] for x in jobList] if queueError: # If the queue has failed, then we must not process # any more jobs this cycle. continue while len(jobList) > 0: jobsReady = jobList[:self.config.JobSubmitter.jobsPerWorker] jobList = jobList[self.config.JobSubmitter.jobsPerWorker:] idList = [x['id'] for x in jobsReady] jdlList = self.makeSubmit(jobList = jobsReady) if not jdlList or jdlList == []: # Then we got nothing logging.error("No JDL file made!") return {'NoResult': [0]} jdlFile = "%s/submit_%i_%i.jdl" % (self.submitDir, os.getpid(), idList[0]) handle = open(jdlFile, 'w') handle.writelines(jdlList) handle.close() jdlFiles.append(jdlFile) # Now submit them logging.info("About to submit %i jobs" %(len(jobsReady))) if self.glexecPath: command = 'CS=`which condor_submit`; ' if self.glexecWrapScript: command += 'export GLEXEC_ENV=`%s 2>/dev/null`; ' % self.glexecWrapScript command += 'export GLEXEC_CLIENT_CERT=%s; ' % self.glexecProxyFile command += 'export GLEXEC_SOURCE_PROXY=%s; ' % self.glexecProxyFile command += 'export X509_USER_PROXY=%s; ' % self.glexecProxyFile command += 'export GLEXEC_TARGET_PROXY=%s; ' % self.jdlProxyFile if self.glexecUnwrapScript: command += '%s %s -- $CS %s' % (self.glexecPath, self.glexecUnwrapScript, jdlFile) else: command += '%s $CS %s' % (self.glexecPath, jdlFile) else: command = "condor_submit %s" % jdlFile try: self.input.put({'command': command, 'idList': idList}) except AssertionError as ex: msg = "Critical error: input pipeline probably closed.\n" msg += str(ex) msg += "Error Procedure: Something critical has happened in the worker process\n" msg += "We will now proceed to pull all useful data from the queue (if it exists)\n" msg += "Then refresh the worker pool\n" logging.error(msg) queueError = True break nSubmits += 1 # Now we should have sent all jobs to be submitted # Going to do the rest of it now for n in range(nSubmits): try: res = self.result.get(block = True, timeout = timeout) except Queue.Empty: # If the queue was empty go to the next submit # Those jobs have vanished logging.error("Queue.Empty error received!") logging.error("This could indicate a critical condor error!") logging.error("However, no information of any use was obtained due to process failure.") 
logging.error("Either process failed, or process timed out after %s seconds." % timeout) queueError = True continue except AssertionError as ex: msg = "Found Assertion error while retrieving output from worker process.\n" msg += str(ex) msg += "This indicates something critical happened to a worker process" msg += "We will recover what jobs we know were submitted, and resubmit the rest" msg += "Refreshing worker pool at end of loop" logging.error(msg) queueError = True continue try: output = res['stdout'] error = res['stderr'] idList = res['idList'] exitCode = res['exitCode'] except KeyError as ex: msg = "Error in finding key from result pipe\n" msg += "Something has gone critically wrong in the worker\n" try: msg += "Result: %s\n" % str(res) except: pass msg += str(ex) logging.error(msg) queueError = True continue if not exitCode == 0: logging.error("Condor returned non-zero. Printing out command stderr") logging.error(error) errorCheck, errorMsg = parseError(error = error) logging.error("Processing failed jobs and proceeding to the next jobs.") logging.error("Do not restart component.") else: errorCheck = None if errorCheck: self.errorCount += 1 condorErrorReport = Report() condorErrorReport.addError("JobSubmit", 61202, "CondorError", errorMsg) for jobID in idList: for job in jobs: if job.get('id', None) == jobID: job['fwjr'] = condorErrorReport failedJobs.append(job) break else: if self.errorCount > 0: self.errorCount -= 1 for jobID in idList: for job in jobs: if job.get('id', None) == jobID: successfulJobs.append(job) break # If we get a lot of errors in a row it's probably time to # report this to the operators. if self.errorCount > self.errorThreshold: try: msg = "Exceeded errorThreshold while submitting to condor. Check condor status." logging.error(msg) logging.error("Reporting to Alert system and continuing to process jobs") from WMCore.Alerts import API as alertAPI preAlert, sender = alertAPI.setUpAlertsMessaging(self, compName = "BossAirCondorPlugin") sendAlert = alertAPI.getSendAlert(sender = sender, preAlert = preAlert) sendAlert(6, msg = msg) sender.unregister() self.errorCount = 0 except: # There's nothing we can really do here pass # Remove JDL files unless commanded otherwise if getattr(self.config.JobSubmitter, 'deleteJDLFiles', True): for f in jdlFiles: os.remove(f) # When we're finished, clean up the queue workers in order # to free up memory (in the midst of the process, the forked # memory space shouldn't be touched, so it should still be # shared, but after this point any action by the Submitter will # result in memory duplication). logging.info("Purging worker pool to clean up memory") self.close() # We must return a list of jobs successfully submitted, # and a list of jobs failed logging.info("Done submitting jobs for this cycle in CondorPlugin") return successfulJobs, failedJobs
def kill(self, jobs, workflowName=None, killMsg=None, errorCode=71300): """ _kill_ Kill jobs using plugin functions: Only active jobs (status = 1) will be killed. If workflowName is given, then kill all its jobs in one shot. An optional killMsg can be sent; this will be written into the job FWJR. The errorCode will be the one specified and if no killMsg is provided then a standard message associated with the exit code will be used. If a previous FWJR exists, this error will be appended to it. """ if not jobs: return jobsToKill = {} # Now get a list of which jobs are in the batch system # only kill jobs present there loadedJobs = self._buildRunningJobs(wmbsJobs=jobs) for runningJob in loadedJobs: plugin = runningJob['plugin'] jobsToKill.setdefault(plugin, []) jobsToKill[plugin].append(runningJob) for plugin in jobsToKill.keys(): if plugin not in self.plugins.keys(): msg = "Jobs tracking with non-existant plugin %s\n" % (plugin) msg += "They were submitted but can't be tracked?\n" msg += "That's too strange to continue\n" logging.error(msg) raise BossAirException(msg) else: # Then we send them to the plugins try: pluginInst = self.plugins[plugin] if workflowName: # jobs are completed regardless whether the kill succeeded or not self._completeKill(jobs=jobsToKill[plugin]) pluginInst.killWorkflowJobs(workflow=workflowName) else: # raise an exception if it fails to kill jobs, such that the same # jobs are retried again in the next cycle pluginInst.kill(jobs=jobsToKill[plugin], raiseEx=True) self._completeKill(jobs=jobsToKill[plugin]) # Register the killed jobs for job in jobsToKill[plugin]: if job.get('cache_dir') is None or job.get('retry_count') is None: continue # Try to save an error report as the jobFWJR if not os.path.isdir(job['cache_dir']): # Then we have a bad cache directory logging.error("Could not write a kill FWJR due to non-existant cache_dir for job %i\n", job['id']) logging.debug("cache_dir: %s\n", job['cache_dir']) continue reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count']) errorReport = Report() if os.path.exists(reportName) and os.path.getsize(reportName) > 0: # Then there's already a report there. Add messages errorReport.load(reportName) # Build a better job message if killMsg: reportedMsg = killMsg else: reportedMsg = WM_JOB_ERROR_CODES[errorCode] reportedMsg += '\n Job last known status was: %s' % job.get('globalState', 'Unknown') errorReport.addError("JobKilled", errorCode, "JobKilled", reportedMsg) try: errorReport.save(filename=reportName) except IOError as ioe: logging.warning('Cannot write report %s because of %s', reportName, ioe) except RuntimeError: logging.warning("Plugin failed to remove jobs. It will be retried in the next cycle.") except WMException: raise except Exception as ex: msg = "Unhandled exception while calling kill method for plugin %s\n" % plugin msg += str(ex) logging.error(msg) logging.debug("Interrupted while killing following jobs: %s\n", jobsToKill[plugin]) raise BossAirException(msg) return
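# kill() above appends a "JobKilled" error to an existing FWJR when one is present
# and otherwise starts from an empty Report. A condensed sketch of that
# load-or-create pattern, using only the Report calls that appear in this file
# (load/addError/save); the helper name and default error code are assumptions,
# and os, logging and Report are assumed imported as in the code above.
def addKillErrorToReport(reportPath, killMsg, errorCode=71300):
    """
    Append a JobKilled error to the FWJR at reportPath, starting from an empty
    Report when no usable report exists yet. Sketch only.
    """
    errorReport = Report()
    if os.path.exists(reportPath) and os.path.getsize(reportPath) > 0:
        errorReport.load(reportPath)
    errorReport.addError("JobKilled", errorCode, "JobKilled", killMsg)
    try:
        errorReport.save(filename=reportPath)
    except IOError as ioe:
        logging.warning("Cannot write report %s because of %s", reportPath, ioe)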
def testExitCode(self): """ _testExitCode_ Test and see if we can get an exit code out of a report Note: Errors without a return code return 99999 getStepExitCode: returns the first valid and non-zero exit code getExitCode: uses the method above to get an exit code getStepExitCodes: returns a set of all exit codes within the step """ report = Report("cmsRun1") self.assertEqual(report.getExitCode(), 0) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 0) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {}) self.assertItemsEqual(report.getStepErrors(stepName="cmsRun1"), {}) report.addError(stepName="cmsRun1", exitCode=None, errorType="test", errorDetails="test") # None is not a valid exitCode, but it will get mapped to 99999 self.assertEqual(report.getExitCode(), 99999) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 99999) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 1) report.addError(stepName="cmsRun1", exitCode=102, errorType="test", errorDetails="test") self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 2) report.addError(stepName="cmsRun1", exitCode=103, errorType="test", errorDetails="test") self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 3) # now try to record the same exit code once again report.addError(stepName="cmsRun1", exitCode=104, errorType="test", errorDetails="test") self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 4) # and once again, but different type and details (which does not matter) report.addError(stepName="cmsRun1", exitCode=105, errorType="testEE", errorDetails="testAA") self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 5) # and once again, but different type and details - testing unicode handling report.addError(stepName="cmsRun1", exitCode=106, errorType="test", errorDetails="1 тℯṧт") self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 6) # and once again, but different type and details - testing unicode handling report.addError(stepName="cmsRun1", exitCode=107, errorType="test", errorDetails="2 тℯṧт \x95") self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 7) # and once again, but 
different type and details - testing unicode handling report.addError(stepName="cmsRun1", exitCode=108, errorType="test", errorDetails=encodeUnicodeToBytes("3 тℯṧт")) self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107, 108}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 8) # and once again, but different type and details - testing unicode handling report.addError(stepName="cmsRun1", exitCode=109, errorType="test", errorDetails=decodeBytesToUnicode("4 тℯṧт")) self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107, 108, 109}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 9) # and once again, but different type and details - testing unicode handling report.addError(stepName="cmsRun1", exitCode=110, errorType="test", errorDetails={"нεʟʟ◎": 3.14159}) self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107, 108, 109, 110}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 10) # and once again, but different type and details - testing unicode handling report.addError(stepName="cmsRun1", exitCode=111, errorType="test", errorDetails={"нεʟʟ◎ \x95": "ẘøґℓ∂ \x95"}) self.assertEqual(report.getExitCode(), 102) self.assertEqual(report.getStepExitCode(stepName="cmsRun1"), 102) self.assertItemsEqual(report.getStepExitCodes(stepName="cmsRun1"), {99999, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111}) self.assertEqual(report.getStepErrors(stepName="cmsRun1")['errorCount'], 11)
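# testExitCode above exercises the precedence rule described in its docstring:
# errors recorded without a return code map to the 99999 placeholder, and
# getExitCode() keeps returning the first real non-zero exit code even as more
# errors are added. The standalone helper below is only my reading of that
# selection rule as shown by the assertions, not the actual Report implementation.
def firstMeaningfulExitCode(exitCodes, noReportCode=99999):
    """
    Return the first recorded non-zero exit code that is not the 99999
    placeholder; fall back to the placeholder when that is all there is,
    and to 0 when no errors were recorded.
    """
    fallback = 0
    for code in exitCodes:
        if not code:  # 0 or None: not an error exit code
            continue
        if code == noReportCode:
            fallback = noReportCode
            continue
        return code
    return fallback

# For example, recording [None -> 99999, 102, 103] yields 102, matching the
# assertions in the test above.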
def execute(self, emulator = None): """ _execute_ """ #Are we using emulators again? if (emulator != None): return emulator.emulate( self.step, self.job ) overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() # Set wait to over an hour waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount)) logging.info("StageOut override is: %s " % self.step) # Pull out StageOutMgr Overrides # switch between old stageOut behavior and new, fancy stage out behavior useNewStageOutCode = False if getattr(self.step, 'newStageout', False) or \ ('newStageOut' in overrides and overrides.get('newStageOut')): useNewStageOutCode = True stageOutCall = {} if "command" in overrides and "option" in overrides \ and "se-name" in overrides and "phedex-node" in overrides \ and"lfn-prefix" in overrides: logging.critical('using override in StageOut') stageOutCall['command'] = overrides.get('command') stageOutCall['option'] = overrides.get('option') stageOutCall['se-name'] = overrides.get('se-name') stageOutCall['phedex-node']= overrides.get('phedex-node') stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix') # naw man, this is real # iterate over all the incoming files if not useNewStageOutCode: # old style manager = StageOutMgr.StageOutMgr(**stageOutCall) manager.numberOfRetries = self.step.retryCount manager.retryPauseTime = self.step.retryDelay else: # new style logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE") print "STAGEOUT IS USING NEW STAGEOUT CODE" manager = WMCore.Storage.FileManager.StageOutMgr( retryPauseTime = self.step.retryDelay, numberOfRetries = self.step.retryCount, **stageOutCall) # We need to find a list of steps in our task # And eventually a list of jobReports for out steps # Search through steps for report files filesTransferred = [] for step in self.stepSpace.taskSpace.stepSpaces(): if step == self.stepName: #Don't try to parse your own report; it's not there yet continue stepLocation = os.path.join(self.stepSpace.taskSpace.location, step) logging.info("Beginning report processing for step %s" % (step)) reportLocation = os.path.join(stepLocation, 'Report.pkl') if not os.path.isfile(reportLocation): logging.error("Cannot find report for step %s in space %s" \ % (step, stepLocation)) continue # First, get everything from a file and 'unpersist' it stepReport = Report() stepReport.unpersist(reportLocation, step) taskID = getattr(stepReport.data, 'id', None) # Don't stage out files from bad steps. 
if not stepReport.stepSuccessful(step): continue # Okay, time to start using stuff # Now I'm a bit confused about this; each report should ONLY # Have the results of that particular step in it, # So getting all the files should get ONLY the files # for that step; or so I hope files = stepReport.getAllFileRefsFromStep(step = step) for file in files: if not hasattr(file, 'lfn') and hasattr(file, 'pfn'): # Then we're truly hosed on this file; ignore it msg = "Not a file: %s" % file logging.error(msg) continue # Support direct-to-merge # This requires pulling a bunch of stuff from everywhere # First check if it's needed if hasattr(self.step.output, 'minMergeSize') \ and hasattr(file, 'size') \ and not getattr(file, 'merged', False): # We need both of those to continue, and we don't # direct-to-merge if getattr(self.step.output, 'doNotDirectMerge', False): # Then we've been told explicitly not to do direct-to-merge continue if file.size >= self.step.output.minMergeSize: # Then this goes direct to merge try: file = self.handleLFNForMerge(mergefile = file, step = step) except Exception as ex: logging.error("Encountered error while handling LFN for merge due to size.\n") logging.error(str(ex)) logging.debug(file) logging.debug("minMergeSize: %s" % self.step.output.minMergeSize) manager.cleanSuccessfulStageOuts() stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex)) elif getattr(self.step.output, 'maxMergeEvents', None) != None\ and getattr(file, 'events', None) != None\ and not getattr(file, 'merged', False): # Then direct-to-merge due to events if # the file is large enough: if file.events >= self.step.output.maxMergeEvents: # straight to merge try: file = self.handleLFNForMerge(mergefile = file, step = step) except Exception as ex: logging.error("Encountered error while handling LFN for merge due to events.\n") logging.error(str(ex)) logging.debug(file) logging.debug("maxMergeEvents: %s" % self.step.output.maxMergeEvents) manager.cleanSuccessfulStageOuts() stepReport.addError(self.stepName, 60402, "DirectToMergeFailure", str(ex)) # Save the input PFN in case we need it # Undecided whether to move file.pfn to the output PFN file.InputPFN = file.pfn lfn = getattr(file, 'lfn') fileSource = getattr(file, 'Source', None) if fileSource in ['TFileService', 'UserDefined']: userLfnRegEx(lfn) else: lfnRegEx(lfn) fileForTransfer = {'LFN': lfn, 'PFN': getattr(file, 'pfn'), 'SEName' : None, 'PNN' : None, 'StageOutCommand': None, 'Checksums' : getattr(file, 'checksums', None)} signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: manager(fileForTransfer) #Afterwards, the file should have updated info. 
filesTransferred.append(fileForTransfer) file.StageOutCommand = fileForTransfer['StageOutCommand'] # file.location = fileForTransfer['SEName'] file.location = fileForTransfer['PNN'] file.OutputPFN = fileForTransfer['PFN'] except Alarm: msg = "Indefinite hang during stageOut of logArchive" logging.error(msg) manager.cleanSuccessfulStageOuts() stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg) stepReport.persist("Report.pkl") except Exception as ex: manager.cleanSuccessfulStageOuts() stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex)) stepReport.setStepStatus(self.stepName, 1) stepReport.persist("Report.pkl") raise signal.alarm(0) # Am DONE with report # Persist it stepReport.persist(reportLocation) #Done with all steps, and should have a list of #stagedOut files in fileForTransfer logging.info("Transferred %i files" %(len(filesTransferred))) return
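# The stage-out loop above bounds each transfer with signal.alarm(waitTime) and an
# Alarm exception raised from alarmHandler, both defined elsewhere in WMCore. Below
# is a minimal, self-contained sketch of the same watchdog pattern under its own
# names (WatchdogAlarm/watchdogHandler/runWithTimeout are assumptions of this
# sketch, not WMCore APIs). SIGALRM is Unix-only and must be set from the main thread.
import signal

class WatchdogAlarm(Exception):
    """Raised when the watchdog timer fires."""
    pass

def watchdogHandler(signum, frame):
    raise WatchdogAlarm()

def runWithTimeout(operation, seconds, *args, **kwargs):
    """
    Run operation(*args, **kwargs); raise WatchdogAlarm if it takes longer
    than 'seconds'.
    """
    signal.signal(signal.SIGALRM, watchdogHandler)
    signal.alarm(seconds)
    try:
        return operation(*args, **kwargs)
    finally:
        signal.alarm(0)  # always clear the pending alarm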
# Try to save an error report as the jobFWJR if not os.path.isdir(job['cache_dir']): # Then we have a bad cache directory logging.error("Could not write a kill FWJR due to non-existent cache_dir for job %i\n" % job['id']) logging.debug("cache_dir: %s\n" % job['cache_dir']) continue reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count']) if os.path.exists(reportName) and os.path.getsize(reportName) > 0: # Then there's already a report there. Ignore this. logging.debug("Not writing report due to pre-existing report for job %i.\n" % job['id']) logging.debug("ReportPath: %s\n" % reportName) continue else: condorErrorReport = Report() condorErrorReport.addError("JobKilled", 61302, "JobKilled", killMsg) condorErrorReport.save(filename = reportName) return def update(self, jobs): """ _update_ Overwrite the database with whatever you put into this function. """ runJobs = self._buildRunningJobs(wmbsJobs = jobs)
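# The fragment above only writes a kill FWJR when the job has a usable cache_dir
# and no non-empty report has been written yet. The same guard, pulled out into a
# small predicate for clarity (sketch only; the helper name is an assumption and
# os is assumed imported as in the code above):
def shouldWriteKillReport(cacheDir, reportPath):
    """
    True only when cacheDir exists and reportPath is absent or empty.
    """
    if not os.path.isdir(cacheDir):
        return False
    return not (os.path.exists(reportPath) and os.path.getsize(reportPath) > 0)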
def execute(self, emulator = None): """ _execute_ """ #Are we using emulators again? if (emulator != None): return emulator.emulate( self.step, self.job ) overrides = {} if hasattr(self.step, 'override'): overrides = self.step.override.dictionary_() # Set wait to over an hour waitTime = overrides.get('waitTime', 3600 + (self.step.retryDelay * self.step.retryCount)) logging.info("StageOut override is: %s " % self.step) # Pull out StageOutMgr Overrides # switch between old stageOut behavior and new, fancy stage out behavior useNewStageOutCode = False if overrides.has_key('newStageOut') and overrides.get('newStageOut'): useNewStageOutCode = True stageOutCall = {} if overrides.has_key("command") and overrides.has_key("option") \ and overrides.has_key("se-name") and overrides.has_key("lfn-prefix"): logging.critical('using override in StageOut') stageOutCall['command'] = overrides.get('command') stageOutCall['option'] = overrides.get('option') stageOutCall['se-name'] = overrides.get('se-name') stageOutCall['lfn-prefix'] = overrides.get('lfn-prefix') # naw man, this is real # iterate over all the incoming files if not useNewStageOutCode: # old style manager = StageOutMgr.StageOutMgr(**stageOutCall) manager.numberOfRetries = self.step.retryCount manager.retryPauseTime = self.step.retryDelay else: # new style logging.critical("STAGEOUT IS USING NEW STAGEOUT CODE") print "STAGEOUT IS USING NEW STAGEOUT CODE" manager = WMCore.Storage.FileManager.StageOutMgr( retryPauseTime = self.step.retryDelay, numberOfRetries = self.step.retryCount, **stageOutCall) # We need to find a list of steps in our task # And eventually a list of jobReports for out steps # Search through steps for report files filesTransferred = [] for step in self.stepSpace.taskSpace.stepSpaces(): if step == self.stepName: #Don't try to parse your own report; it's not there yet continue stepLocation = os.path.join(self.stepSpace.taskSpace.location, step) logging.info("Beginning report processing for step %s" % (step)) reportLocation = os.path.join(stepLocation, 'Report.pkl') if not os.path.isfile(reportLocation): logging.error("Cannot find report for step %s in space %s" \ % (step, stepLocation)) continue # First, get everything from a file and 'unpersist' it stepReport = Report() stepReport.unpersist(reportLocation, step) taskID = getattr(stepReport.data, 'id', None) # Don't stage out files from bad steps. 
if not stepReport.stepSuccessful(step): continue # Okay, time to start using stuff # Now I'm a bit confused about this; each report should ONLY # Have the results of that particular step in it, # So getting all the files should get ONLY the files # for that step; or so I hope files = stepReport.getAllFileRefsFromStep(step = step) for file in files: if not hasattr(file, 'lfn') and hasattr(file, 'pfn'): # Then we're truly hosed on this file; ignore it msg = "Not a file: %s" % file logging.error(msg) continue # Support direct-to-merge # This requires pulling a bunch of stuff from everywhere # First check if it's needed if hasattr(self.step.output, 'minMergeSize') \ and hasattr(file, 'size') \ and not getattr(file, 'merged', False): # We need both of those to continue, and we don't # direct-to-merge if getattr(self.step.output, 'doNotDirectMerge', False): # Then we've been told explicitly not to do direct-to-merge continue if file.size >= self.step.output.minMergeSize: # Then this goes direct to merge try: file = self.handleLFNForMerge(mergefile = file, step = step) except Exception, ex: logging.error("Encountered error while handling LFN for merge due to size.\n") logging.error(str(ex)) logging.debug(file) logging.debug("minMergeSize: %s" % self.step.output.minMergeSize) manager.cleanSuccessfulStageOuts() stepReport.addError(self.stepName, 60401, "DirectToMergeFailure", str(ex)) elif getattr(self.step.output, 'maxMergeEvents', None) != None\ and getattr(file, 'events', None) != None\ and not getattr(file, 'merged', False): # Then direct-to-merge due to events if # the file is large enough: if file.events >= self.step.output.maxMergeEvents: # straight to merge try: file = self.handleLFNForMerge(mergefile = file, step = step) except Exception, ex: logging.error("Encountered error while handling LFN for merge due to events.\n") logging.error(str(ex)) logging.debug(file) logging.debug("maxMergeEvents: %s" % self.step.output.maxMergeEvents) manager.cleanSuccessfulStageOuts() stepReport.addError(self.stepName, 60402, "DirectToMergeFailure", str(ex)) # Save the input PFN in case we need it # Undecided whether to move file.pfn to the output PFN file.InputPFN = file.pfn lfn = getattr(file, 'lfn') fileSource = getattr(file, 'Source', None) if fileSource in ['TFileService', 'UserDefined']: userLfnRegEx(lfn) else: lfnRegEx(lfn) fileForTransfer = {'LFN': lfn, 'PFN': getattr(file, 'pfn'), 'SEName' : None, 'StageOutCommand': None} signal.signal(signal.SIGALRM, alarmHandler) signal.alarm(waitTime) try: manager(fileForTransfer) #Afterwards, the file should have updated info. filesTransferred.append(fileForTransfer) file.StageOutCommand = fileForTransfer['StageOutCommand'] file.location = fileForTransfer['SEName'] file.OutputPFN = fileForTransfer['PFN'] except Alarm: msg = "Indefinite hang during stageOut of logArchive" logging.error(msg) manager.cleanSuccessfulStageOuts() stepReport.addError(self.stepName, 60403, "StageOutTimeout", msg) stepReport.persist("Report.pkl") except Exception, ex: manager.cleanSuccessfulStageOuts() stepReport.addError(self.stepName, 60307, "StageOutFailure", str(ex)) stepReport.setStepStatus(self.stepName, 1) stepReport.persist("Report.pkl") raise
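# Both versions of execute() decide whether an output file bypasses normal merging:
# direct-to-merge applies to unmerged files that exceed minMergeSize (by size) or
# maxMergeEvents (by event count). The standalone helper below is a simplified
# sketch of that decision; unlike the code above, which only honours
# doNotDirectMerge on the size branch, the flag is folded over both branches here,
# and the function name and argument list are assumptions of this sketch.
def goesDirectToMerge(fileSize, fileEvents, minMergeSize=None,
                      maxMergeEvents=None, alreadyMerged=False,
                      doNotDirectMerge=False):
    """
    True when an unmerged output file is big enough (by size or by events)
    to be promoted straight to merged output.
    """
    if alreadyMerged or doNotDirectMerge:
        return False
    if minMergeSize is not None and fileSize is not None and fileSize >= minMergeSize:
        return True
    if maxMergeEvents is not None and fileEvents is not None and fileEvents >= maxMergeEvents:
        return True
    return False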
def kill(self, jobs, workflowName=None, killMsg=None, errorCode=71300): """ _kill_ Kill jobs using plugin functions: Only active jobs (status = 1) will be killed. If workflowName is given, then kill all its jobs in one shot. An optional killMsg can be sent; this will be written into the job FWJR. The errorCode will be the one specified and if no killMsg is provided then a standard message associated with the exit code will be used. If a previous FWJR exists, this error will be appended to it. """ if not jobs: return jobsToKill = {} # Now get a list of which jobs are in the batch system # only kill jobs present there loadedJobs = self._buildRunningJobs(wmbsJobs=jobs) for runningJob in loadedJobs: plugin = runningJob['plugin'] jobsToKill.setdefault(plugin, []) jobsToKill[plugin].append(runningJob) for plugin in jobsToKill.keys(): if plugin not in self.plugins.keys(): msg = "Jobs tracking with non-existant plugin %s\n" % (plugin) msg += "They were submitted but can't be tracked?\n" msg += "That's too strange to continue\n" logging.error(msg) raise BossAirException(msg) else: # Then we send them to the plugins try: pluginInst = self.plugins[plugin] if workflowName: pluginInst.killWorkflowJobs(workflow=workflowName) else: pluginInst.kill(jobs=jobsToKill[plugin]) # Register the killed jobs for job in jobsToKill[plugin]: if job.get('cache_dir') is None or job.get( 'retry_count') is None: continue # Try to save an error report as the jobFWJR if not os.path.isdir(job['cache_dir']): # Then we have a bad cache directory logging.error( "Could not write a kill FWJR due to non-existant cache_dir for job %i\n", job['id']) logging.debug("cache_dir: %s\n", job['cache_dir']) continue reportName = os.path.join( job['cache_dir'], 'Report.%i.pkl' % job['retry_count']) errorReport = Report() if os.path.exists(reportName) and os.path.getsize( reportName) > 0: # Then there's already a report there. Add messages errorReport.load(reportName) # Build a better job message if killMsg: reportedMsg = killMsg else: reportedMsg = WM_JOB_ERROR_CODES[errorCode] reportedMsg += '\n Job last known status was: %s' % job.get( 'globalState', 'Unknown') errorReport.addError("JobKilled", errorCode, "JobKilled", reportedMsg) try: errorReport.save(filename=reportName) except IOError as ioe: logging.warning( 'Cannot write report %s because of %s', reportName, ioe) except WMException: raise except Exception as ex: msg = "Unhandled exception while calling kill method for plugin %s\n" % plugin msg += str(ex) logging.error(msg) logging.debug( "Interrupted while killing following jobs: %s\n", jobsToKill[plugin]) raise BossAirException(msg) finally: # Even if kill fails, complete the jobs self._completeKill(jobs=jobsToKill[plugin]) return
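# Both kill() variants first bucket the loaded running jobs by the plugin that
# submitted them and then hand each bucket to that plugin. The grouping step in
# isolation, as a sketch (the helper name is an assumption; the 'plugin' key is
# the one read above):
def groupJobsByPlugin(runningJobs):
    """
    Map plugin name -> list of the running jobs handled by that plugin.
    """
    jobsByPlugin = {}
    for runningJob in runningJobs:
        jobsByPlugin.setdefault(runningJob['plugin'], []).append(runningJob)
    return jobsByPlugin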
logging.error( "Could not write a kill FWJR due to non-existant cache_dir for job %i\n" % job['id']) logging.debug("cache_dir: %s\n" % job['cache_dir']) continue reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count']) condorErrorReport = Report() if os.path.exists( reportName) and os.path.getsize(reportName) > 0: # Then there's already a report there. Add messages condorErrorReport.load(reportName) #Build a better job message reportedMsg = killMsg + '\n Job last known status was: %s' % job.get( 'globalState', 'Unknown') condorErrorReport.addError("JobKilled", 61302, "JobKilled", reportedMsg) try: condorErrorReport.save(filename=reportName) except IOError, ioe: logging.warning('Cannot write report %s because of %s' % (reportName, ioe)) return def update(self, jobs): """ _update_ Overwrite the database with whatever you put into this function. """
continue # Try to save an error report as the jobFWJR if not os.path.isdir(job['cache_dir']): # Then we have a bad cache directory logging.error("Could not write a kill FWJR due to non-existent cache_dir for job %i\n" % job['id']) logging.debug("cache_dir: %s\n" % job['cache_dir']) continue reportName = os.path.join(job['cache_dir'], 'Report.%i.pkl' % job['retry_count']) condorErrorReport = Report() if os.path.exists(reportName) and os.path.getsize(reportName) > 0: # Then there's already a report there. Add messages condorErrorReport.load(reportName) # Build a better job message reportedMsg = killMsg + '\n Job last known status was: %s' % job.get('globalState', 'Unknown') condorErrorReport.addError("JobKilled", 61302, "JobKilled", reportedMsg) condorErrorReport.save(filename = reportName) return def update(self, jobs): """ _update_ Overwrite the database with whatever you put into this function. """ runJobs = self._buildRunningJobs(wmbsJobs = jobs)