def complete(self, jobs):
    """
    Make sure every job in the batch ends up with a job report on disk.

    The report is expected at <cache_dir>/Report.<retry_count>.pkl.
    A report file that exists and is non-empty is left untouched.  If that
    path is a directory, the situation is only logged.  Otherwise a failure
    Report is assembled from the last 50 lines of the condor err/out/log
    files in the cache directory and saved at the report path.  When the
    cache directory itself is missing it is created first and the failure
    is recorded with exit code 99304.

    :param jobs: iterable of dict-like jobs; 'cache_dir' and 'retry_count'
                 are needed to locate the report, 'id' and 'gridid' are
                 read for logging and for the condor file names
    :return: None
    """
    for job in jobs:
        if any(job.get(key, None) is None for key in ('cache_dir', 'retry_count')):
            # The report cannot be located without both values
            logging.error("Can't find this job's cache_dir or retry count: %s", job)
            continue

        cacheDir = job['cache_dir']
        reportPath = os.path.join(cacheDir, 'Report.%i.pkl' % job['retry_count'])

        if os.path.isfile(reportPath) and os.path.getsize(reportPath) > 0:
            # A non-empty report is already there, nothing to do
            continue

        if os.path.isdir(reportPath):
            # Unexpected state: log it and write no report
            logging.error("The job report for job with id %s and gridid %s is a directory",
                          job['id'], job['gridid'])
            logging.error("Ignoring this, but this is very strange")
            continue

        jobId, gridId = job['id'], job['gridid']
        logging.error("No job report for job with id %s and gridid %s", jobId, gridId)

        # An empty report file is removed before a new one is saved
        if os.path.isfile(reportPath):
            os.remove(reportPath)

        # Build the replacement report from scratch
        failureReport = Report()
        details = 'Could not find jobReport\n'

        if not os.path.isdir(cacheDir):
            problem = (
                "Serious Error in Completing condor job with id %s!\n" % jobId
                + "Could not find jobCache directory %s\n" % cacheDir
                + "Creating a new cache_dir for failed job report\n"
            )
            logging.error(problem)
            os.makedirs(cacheDir)
            failureReport.addError("NoJobReport", 99304, "NoCacheDir", details)
        else:
            exitCode, exitType = 99303, "NoJobReport"
            namePrefix = "condor.%s." % gridId

            for suffix in ("err", "out", "log"):
                fileName = namePrefix + suffix
                filePath = os.path.join(cacheDir, fileName)
                details += "\n========== %s ==========\n" % fileName
                if not os.path.isfile(filePath):
                    continue

                tailText = BasicAlgos.tail(filePath, 50)
                details += 'Adding end of %s to error message:\n\n' % fileName
                details += tailText
                details += '\n\n'

                if suffix == "log":
                    # The condor log is scanned for a removal reason and a site
                    for match in getIterMatchObjectOnRegexp(filePath, CONDOR_LOG_FILTER_REGEXP):
                        reason = match.group("Reason")
                        if reason:
                            details += reason
                            removalMarkers = ("SYSTEM_PERIODIC_REMOVE", "via condor_rm")
                            if any(marker in reason for marker in removalMarkers):
                                exitCode, exitType = 99400, "RemovedByGLIDEIN"
                            else:
                                exitCode = 99401

                        failureReport.data.siteName = match.group("Site") or "NoReportedSite"
                else:
                    # stderr/stdout are scanned for exception and error text
                    for match in getIterMatchObjectOnRegexp(filePath, WMEXCEPTION_REGEXP):
                        for groupName in ('WMException', 'ERROR'):
                            excerpt = match.group(groupName)
                            if excerpt:
                                details += "\n\n%s\n" % excerpt

            details += '\n\n'
            failureReport.addError(exitType, exitCode, exitType, details)

        failureReport.save(filename=reportPath)
        logging.debug("Created failed job report for job with id %s and gridid %s", jobId, gridId)
def complete(self, jobs):
    """
    Post-process jobs by checking for the report file each should have.

    For every job the file <cache_dir>/Report.<retry_count>.pkl is looked
    up.  Jobs lacking 'cache_dir' or 'retry_count' are logged and skipped,
    as are jobs whose report file exists with a size above zero.  A report
    path that turns out to be a directory is logged and otherwise ignored.
    In the remaining case a new Report carrying an error entry is saved at
    the report path; its message collects the tail (50 lines) of each
    condor.<gridid>.err/.out/.log file present in the cache directory.

    Error entries use 99303/"NoJobReport" by default, 99400/
    "RemovedByGLIDEIN" when the condor log reason mentions
    SYSTEM_PERIODIC_REMOVE or "via condor_rm", 99401 for any other reason,
    and 99304 when the cache directory had to be created.

    :param jobs: iterable of dict-like job records
    :return: None
    """
    for job in jobs:
        hasCacheDir = job.get('cache_dir', None) is not None
        if not (hasCacheDir and job.get('retry_count', None) is not None):
            # Without both values there is no way to find the report
            logging.error("Can't find this job's cache_dir or retry count: %s", job)
            continue

        workDir = job['cache_dir']
        pklName = 'Report.%i.pkl' % job['retry_count']
        pklPath = os.path.join(workDir, pklName)

        if os.path.isfile(pklPath) and os.path.getsize(pklPath) > 0:
            # Report present and non-empty: move on to the next job
            continue

        if os.path.isdir(pklPath):
            # Odd situation; it is reported and nothing else is done
            logging.error("The job report for job with id %s and gridid %s is a directory",
                          job['id'], job['gridid'])
            logging.error("Ignoring this, but this is very strange")
            continue

        jobId = job['id']
        gridId = job['gridid']
        logging.error("No job report for job with id %s and gridid %s", jobId, gridId)

        if os.path.isfile(pklPath):
            # Only an empty file can reach this point; it gets replaced
            os.remove(pklPath)

        # Start a brand new report for the failure
        report = Report()
        message = 'Could not find jobReport\n'

        if os.path.isdir(workDir):
            errName = "condor.%s.err" % gridId
            outName = "condor.%s.out" % gridId
            logName = "condor.%s.log" % gridId
            code = 99303
            kind = "NoJobReport"

            for name in (errName, outName, logName):
                path = os.path.join(workDir, name)
                message += "\n========== %s ==========\n" % name
                if os.path.isfile(path):
                    lastLines = BasicAlgos.tail(path, 50)
                    message += 'Adding end of %s to error message:\n\n' % name
                    message += lastLines
                    message += '\n\n'

                    if name != logName:
                        # err/out files: pull exception and error excerpts
                        for found in getIterMatchObjectOnRegexp(path, WMEXCEPTION_REGEXP):
                            wmException = found.group('WMException')
                            if wmException:
                                message += "\n\n%s\n" % wmException

                            errorText = found.group('ERROR')
                            if errorText:
                                message += "\n\n%s\n" % errorText
                    else:
                        # condor log: pull the removal reason and the site
                        for found in getIterMatchObjectOnRegexp(path, CONDOR_LOG_FILTER_REGEXP):
                            why = found.group("Reason")
                            if why:
                                message += why
                                if "SYSTEM_PERIODIC_REMOVE" in why or "via condor_rm" in why:
                                    code = 99400
                                    kind = "RemovedByGLIDEIN"
                                else:
                                    code = 99401

                            site = found.group("Site")
                            report.data.siteName = site if site else "NoReportedSite"

            message += '\n\n'
            report.addError(kind, code, kind, message)
        else:
            complaint = "Serious Error in Completing condor job with id %s!\n" % jobId
            complaint += "Could not find jobCache directory %s\n" % workDir
            complaint += "Creating a new cache_dir for failed job report\n"
            logging.error(complaint)
            os.makedirs(workDir)
            report.addError("NoJobReport", 99304, "NoCacheDir", message)

        report.save(filename=pklPath)

        logging.debug("Created failed job report for job with id %s and gridid %s", jobId, gridId)

    return None