def test_getTimeLeft( self ): # for batch, retValue in [( 'LSF', LSF_ReturnValue ), ( 'SGE', SGE_ReturnValue )]: for batch, retValue in [( 'LSF', LSF_ReturnValue )]: self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" ) rcMock = MagicMock() rcMock.return_value = S_OK( retValue ) self.tl.runCommand = rcMock timeMock = MagicMock() tl = TimeLeft() # res = tl.getTimeLeft() # self.assertEqual( res['OK'], True ) batchSystemName = '%sTimeLeft' % batch batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName, globals(), locals(), [batchSystemName] ) batchStr = 'batchPlugin.%s()' % ( batchSystemName ) tl.batchPlugin = eval( batchStr ) tl.scaleFactor = 10.0 tl.normFactor = 10.0 tl.batchPlugin.bin = '/usr/bin' tl.batchPlugin.hostNorm = 10.0 tl.batchPlugin.cpuLimit = 1000 tl.batchPlugin.wallClockLimit = 1000 with patch( "DIRAC.Core.Utilities.TimeLeft.LSFTimeLeft.runCommand", new=rcMock ): with patch( "DIRAC.Core.Utilities.TimeLeft.LSFTimeLeft.time", new=timeMock ): res = tl.getTimeLeft() self.assertEqual( res['OK'], True, res.get('Message', '') ) for batch, retValue in [( 'SGE', SGE_ReturnValue )]: self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft" ) rcMock = MagicMock() rcMock.return_value = S_OK( retValue ) self.tl.runCommand = rcMock tl = TimeLeft() # res = tl.getTimeLeft() # self.assertFalse( res['OK'] ) batchSystemName = '%sTimeLeft' % batch batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName, globals(), locals(), [batchSystemName] ) batchStr = 'batchPlugin.%s()' % ( batchSystemName ) tl.batchPlugin = eval( batchStr ) tl.scaleFactor = 10.0 tl.normFactor = 10.0 tl.batchPlugin.bin = '/usr/bin' tl.batchPlugin.hostNorm = 10.0 tl.batchPlugin.cpuLimit = 1000 tl.batchPlugin.wallClockLimit = 1000 res = tl.getTimeLeft() self.assert_( res['OK'] ) self.assertEqual( res['Value'], 9400.0 )
def test_getTimeLeft(self): # for batch, retValue in [( 'LSF', LSF_ReturnValue ), ( 'SGE', SGE_ReturnValue )]: for batch, retValue in [('LSF', LSF_ReturnValue)]: self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft") rcMock = MagicMock() rcMock.return_value = S_OK(retValue) self.tl.runCommand = rcMock tl = TimeLeft() # res = tl.getTimeLeft() # self.assertEqual( res['OK'], True ) batchSystemName = '%sTimeLeft' % batch batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName, globals(), locals(), [batchSystemName]) batchStr = 'batchPlugin.%s()' % (batchSystemName) tl.batchPlugin = eval(batchStr) tl.scaleFactor = 10.0 tl.normFactor = 10.0 tl.batchPlugin.bin = '/usr/bin' tl.batchPlugin.hostNorm = 10.0 tl.batchPlugin.cpuLimit = 1000 tl.batchPlugin.wallClockLimit = 1000 res = tl.getTimeLeft() self.assertEqual(res['OK'], True) for batch, retValue in [('SGE', SGE_ReturnValue)]: self.tl = importlib.import_module( "DIRAC.Core.Utilities.TimeLeft.TimeLeft") rcMock = MagicMock() rcMock.return_value = S_OK(retValue) self.tl.runCommand = rcMock tl = TimeLeft() # res = tl.getTimeLeft() # self.assertFalse( res['OK'] ) batchSystemName = '%sTimeLeft' % batch batchPlugin = __import__( 'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName, globals(), locals(), [batchSystemName]) batchStr = 'batchPlugin.%s()' % (batchSystemName) tl.batchPlugin = eval(batchStr) tl.scaleFactor = 10.0 tl.normFactor = 10.0 tl.batchPlugin.bin = '/usr/bin' tl.batchPlugin.hostNorm = 10.0 tl.batchPlugin.cpuLimit = 1000 tl.batchPlugin.wallClockLimit = 1000 res = tl.getTimeLeft() self.assert_(res['OK']) self.assertEqual(res['Value'], 9400.0)
class Watchdog(object): ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, systemFlag='linux', jobArgs={}): """ Constructor, takes system flag as argument. """ self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800)) # 30 minutes self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800)) # 30 minutes self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2)) # SIGINT self.stopSigRegex = jobArgs.get('StopSigRegex', None) self.stopSigSent = False self.log = gLogger.getSubLogger("Watchdog") self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUTime = jobCPUTime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.profiler = Profiler(pid) self.checkError = '' self.currentStats = {} self.initialized = False self.count = 0 # defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.wallClockCheckSeconds = 5 * 60 # 5 minutes self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.wallClockCheckCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False self.scaleFactor = 1.0 self.processors = processors ############################################################################# def initialize(self, loops=0): """ Watchdog initialization. """ if self.initialized: self.log.info('Watchdog already initialized') return S_OK() else: self.initialized = True setup = gConfig.getValue('/DIRAC/Setup', '') if not setup: return S_ERROR('Can not get the DIRAC Setup value') wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR('Can not get the WorkloadManagement system instance') self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance self.maxcount = loops self.log.verbose('Watchdog initialization') self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag)) # Test control flags self.testWallClock = gConfig.getValue(self.section + '/CheckWallClockFlag', 1) self.testDiskSpace = gConfig.getValue(self.section + '/CheckDiskSpaceFlag', 1) self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag', 1) self.testCPUConsumed = gConfig.getValue(self.section + '/CheckCPUConsumedFlag', 1) self.testCPULimit = gConfig.getValue(self.section + '/CheckCPULimitFlag', 0) self.testMemoryLimit = gConfig.getValue(self.section + '/CheckMemoryLimitFlag', 0) self.testTimeLeft = gConfig.getValue(self.section + '/CheckTimeLeftFlag', 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + '/PollingTime', 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + '/CheckingTime', 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue(self.section + '/MinCheckingTime', 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue(self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag', 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace', 10) # MB self.loadAvgLimit = gConfig.getValue(self.section + '/LoadAverageLimit', 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime', 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue(self.section + '/JobCPULimitMargin', 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue(self.section + '/MinCPUWallClockRatio', 5) # ratio %age # After 5 sample times return null CPU consumption kill job self.nullCPULimit = gConfig.getValue(self.section + '/NullCPUCountLimit', 5) if self.checkingTime < self.minCheckingTime: self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % (self.checkingTime, self.minCheckingTime)) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue(self.section + '/TimeLeftLimit', 150 * self.pollingTime) self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 1.0) return S_OK() def run(self): """ The main watchdog execution method """ result = self.initialize() if not result['OK']: self.log.always('Can not start watchdog for the following reason') self.log.always(result['Message']) return result try: while True: self.log.debug('Starting watchdog loop # %d' % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result['OK']: self.log.error("Watchdog error during execution", result['Message']) break elif result['Value'] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: self.log.exception() return S_ERROR('Exception') ############################################################################# def execute(self): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): self.__getUsageSummary() self.log.info('Process to monitor has completed, Watchdog will exit.') return S_OK("Ended") # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL if not self.stopSigSent and self.stopSigRegex is not None and ( time.time() - self.initialValues['StartTime']) > self.wallClockCheckSeconds * self.wallClockCheckCount: self.wallClockCheckCount += 1 self._performWallClockChecks() if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = 'Job has reached the CPU limit of the queue' self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues['StartTime']) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result['OK']: self.log.warn('Problem during recent checks') self.log.warn(result['Message']) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performWallClockChecks(self): """Watchdog performs the wall clock checks based on MJF. Signals are sent to processes if we need to stop, but function always returns S_OK() """ mjf = MJF.MJF() try: wallClockSecondsLeft = mjf.getWallClockSecondsLeft() except Exception as e: # Just stop if we can't get the wall clock seconds left return S_OK() jobstartSeconds = mjf.getIntJobFeature('jobstart_secs') if jobstartSeconds is None: # Just stop if we don't know when the job started return S_OK() if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and \ (wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds): # Need to send the signal! Assume it works to avoid sending the signal more than once self.log.info('Sending signal %d to JobWrapper children' % self.stopSigNumber) self.stopSigSent = True try: for childPid in getChildrenPIDs(self.wrapperPID): try: cmdline = open('/proc/%d/cmdline' % childPid, 'r').read().replace('\0', ' ').strip() except IOError: # Process gone away? Not running on Linux? Skip anyway continue if re.search(self.stopSigRegex, cmdline) is not None: self.log.info( 'Sending signal %d to process ID %d, cmdline = "%s"' % (self.stopSigNumber, childPid, cmdline)) os.kill(childPid, self.stopSigNumber) except Exception as e: self.log.error('Failed to send signals to JobWrapper children! (%s)' % str(e)) return S_OK() ############################################################################# def _performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose('------------------------------------') self.log.verbose('Checking loop starts for Watchdog') heartBeatDict = {} msg = '' loadAvg = float(os.getloadavg()[0]) msg += 'LoadAvg: %d ' % loadAvg heartBeatDict['LoadAverage'] = loadAvg if 'LoadAverage' not in self.parameters: self.parameters['LoadAverage'] = [] self.parameters['LoadAverage'].append(loadAvg) memoryUsed = self.getMemoryUsed() msg += 'MemUsed: %.1f kb ' % (memoryUsed) heartBeatDict['MemoryUsed'] = memoryUsed if 'MemoryUsed' not in self.parameters: self.parameters['MemoryUsed'] = [] self.parameters['MemoryUsed'].append(memoryUsed) result = self.profiler.getAllProcessData(withChildren=True) if result['OK']: vsize = result['Value']['stats']['vSizeUsage'] * 1024. rss = result['Value']['stats']['memoryUsage'] * 1024. heartBeatDict['Vsize'] = vsize heartBeatDict['RSS'] = rss self.parameters.setdefault('Vsize', []) self.parameters['Vsize'].append(vsize) self.parameters.setdefault('RSS', []) self.parameters['RSS'].append(rss) msg += "Job Vsize: %.1f kb " % vsize msg += "Job RSS: %.1f kb " % rss result = self.getDiskSpace() if not result['OK']: self.log.warn("Could not establish DiskSpace", result['Message']) else: msg += 'DiskSpace: %.1f MB ' % (result['Value']) if 'DiskSpace' not in self.parameters: self.parameters['DiskSpace'] = [] if result['OK']: self.parameters['DiskSpace'].append(result['Value']) heartBeatDict['AvailableDiskSpace'] = result['Value'] cpu = self.__getCPU() if not cpu['OK']: msg += 'CPU: ERROR ' hmsCPU = 0 else: cpu = cpu['Value'] msg += 'CPU: %s (h:m:s) ' % (cpu) if 'CPUConsumed' not in self.parameters: self.parameters['CPUConsumed'] = [] self.parameters['CPUConsumed'].append(cpu) hmsCPU = cpu rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: heartBeatDict['CPUConsumed'] = rawCPU['Value'] result = self.__getWallClockTime() if not result['OK']: self.log.warn("Failed determining wall clock time", result['Message']) else: msg += 'WallClock: %.2f s ' % (result['Value']) self.parameters.setdefault('WallClockTime', list()).append(result['Value']) heartBeatDict['WallClockTime'] = result['Value'] self.log.info(msg) result = self._checkProgress() if not result['OK']: self.checkError = result['Message'] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) self.log.info('Last %s lines of available application output:' % (size)) self.log.info('================START================') for line in outputList: self.log.info(line) self.log.info('=================END=================') self.__killRunningThread() return S_OK() recentStdOut = 'None' if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % (size, Time.dateTime()) border = '=' * len(recentStdOut) cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % (hmsCPU) if self.timeLeft: cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft recentStdOut = '\n%s\n%s\n%s\n%s\n' % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + '\n' else: recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread' self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn('Turning off job peeking for remainder of execution') if 'JOBID' not in os.environ: self.log.info('Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] staticParamDict = {'StandardOutput': recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK('Watchdog checking cycle complete') ############################################################################# def __getCPU(self): """Uses os.times() to get CPU time and returns HH:MM:SS after conversion. """ try: result = self.profiler.getAllProcessData(withChildren=True) if not result['OK']: self.log.warn('Problem while checking consumed CPU') return result cpuTime = result['Value'] if cpuTime: cpuTimeTotal = cpuTime['stats']['cpuUsageSystem'] + cpuTime['stats']['cpuUsageUser'] self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTimeTotal)) return self.__getCPUHMS(cpuTimeTotal) else: self.log.error("CPU time consumed found to be 0") return S_ERROR() except Exception as e: self.log.warn('Could not determine CPU time consumed with exception') self.log.exception(e) return S_ERROR("Could not determine CPU time consumed with exception") ############################################################################# def __getCPUHMS(self, cpuTime): mins, secs = divmod(cpuTime, 60) hours, mins = divmod(mins, 60) humanTime = '%02d:%02d:%02d' % (hours, mins, secs) self.log.verbose('Human readable CPU time is: %s' % humanTime) return S_OK(humanTime) ############################################################################# def __interpretControlSignal(self, signalDict): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info('Received control signal') if isinstance(signalDict, dict): if 'Kill' in signalDict: self.log.info('Received Kill signal, stopping job via control signal') self.checkError = 'Received Kill signal' self.__killRunningThread() else: self.log.info('The following control signal was sent but not understood by the watchdog:') self.log.info(signalDict) else: self.log.info('Expected dictionary for control signal, received:\n%s' % (signalDict)) return S_OK() ############################################################################# def _checkProgress(self): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = '' if self.testWallClock: result = self.__checkWallClockTime() if not result['OK']: self.log.warn(result['Message']) return result report += 'WallClock: OK, ' else: report += 'WallClock: NA,' if self.testDiskSpace: result = self.__checkDiskSpace() if not result['OK']: self.log.warn(result['Message']) return result report += 'DiskSpace: OK, ' else: report += 'DiskSpace: NA,' if self.testLoadAvg: result = self.__checkLoadAverage() if not result['OK']: self.log.warn("Check of load average failed, but won't fail because of that: %s" % result['Message']) report += 'LoadAverage: ERROR, ' return S_OK() report += 'LoadAverage: OK, ' else: report += 'LoadAverage: NA,' if self.testCPUConsumed: result = self.__checkCPUConsumed() if not result['OK']: return result report += 'CPUConsumed: OK, ' else: report += 'CPUConsumed: NA, ' if self.testCPULimit: result = self.__checkCPULimit() if not result['OK']: self.log.warn(result['Message']) return result report += 'CPULimit OK, ' else: report += 'CPULimit: NA, ' if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += 'TimeLeft: OK' else: report += 'TimeLeft: NA' if self.testMemoryLimit: result = self.__checkMemoryLimit() if not result['OK']: self.log.warn(result['Message']) return result report += 'MemoryLimit OK, ' else: report += 'MemoryLimit: NA, ' self.log.info(report) return S_OK('All enabled checks passed') ############################################################################# def __checkCPUConsumed(self): """ Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info("Checking CPU Consumed") if 'WallClockTime' not in self.parameters: return S_ERROR('Missing WallClockTime info') if 'CPUConsumed' not in self.parameters: return S_ERROR('Missing CPUConsumed info') wallClockTime = self.parameters['WallClockTime'][-1] if wallClockTime < self.sampleCPUTime: self.log.info("Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % ( wallClockTime, self.sampleCPUTime)) return S_OK() intervals = max(1, int(self.sampleCPUTime / self.checkingTime)) if len(self.parameters['CPUConsumed']) < intervals + 1: self.log.info("Not enough snapshots to calculate, there are %s and we need %s" % (len(self.parameters['CPUConsumed']), intervals + 1)) return S_OK() wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters['WallClockTime'][-1 - intervals] try: cpuTime = self.__convertCPUTime(self.parameters['CPUConsumed'][-1])['Value'] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime(self.parameters['CPUConsumed'][-1 - intervals])['Value'] if cpuTime < 0: self.log.warn('Consumed CPU time negative, something wrong may have happened, ignore') return S_OK() if wallClockTime <= 0: self.log.warn('Wallclock time should not be negative or zero, Ignore') return S_OK() ratio = (cpuTime / wallClockTime) * 100. self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'): self.log.info('N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload') return S_OK() self.log.info("Job is stalled!") return S_ERROR('Watchdog identified this job as stalled') except Exception as e: self.log.error("Cannot convert CPU consumed from string to int", str(e)) return S_OK() ############################################################################# def __convertCPUTime(self, cputime): """ Method to convert the CPU time as returned from the Watchdog instances to the equivalent DIRAC normalized CPU time to be compared to the Job CPU requirement. """ cpuValue = 0 cpuHMS = cputime.split(':') # for i in xrange( len( cpuHMS ) ): # cpuHMS[i] = cpuHMS[i].replace( '00', '0' ) try: hours = float(cpuHMS[0]) * 60 * 60 mins = float(cpuHMS[1]) * 60 secs = float(cpuHMS[2]) cpuValue = float(hours + mins + secs) except Exception as x: self.log.warn(str(x)) return S_ERROR('Could not calculate CPU time') # Normalization to be implemented normalizedCPUValue = cpuValue result = S_OK() result['Value'] = normalizedCPUValue self.log.debug('CPU value %s converted to %s' % (cputime, normalizedCPUValue)) return result ############################################################################# def __checkCPULimit(self): """ Checks that the job has consumed more than the job CPU requirement (plus a configurable margin) and kills them as necessary. """ consumedCPU = 0 if 'CPUConsumed' in self.parameters: consumedCPU = self.parameters['CPUConsumed'][-1] consumedCPUDict = self.__convertCPUTime(consumedCPU) if consumedCPUDict['OK']: currentCPU = consumedCPUDict['Value'] else: return S_OK('Not possible to determine current CPU consumed') if consumedCPU: limit = self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100) cpuConsumed = float(currentCPU) if cpuConsumed > limit: self.log.info( 'Job has consumed more than the specified CPU limit with an additional %s%% margin' % (self.jobCPUMargin)) return S_ERROR('Job has exceeded maximum CPU time limit') else: return S_OK('Job within CPU limit') elif not currentCPU: self.log.verbose('Both initial and current CPU consumed are null') return S_OK('CPU consumed is not measurable yet') else: return S_OK('Not possible to determine CPU consumed') def __checkMemoryLimit(self): """ Checks that the job memory consumption is within a limit """ if 'Vsize' in self.parameters: vsize = self.parameters['Vsize'][-1] if vsize and self.memoryLimit: if vsize > self.memoryLimit: vsize = vsize # Just a warning for the moment self.log.warn("Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit)) return S_OK() ############################################################################# def __checkDiskSpace(self): """Checks whether the CS defined minimum disk space is available. """ if 'DiskSpace' in self.parameters: availSpace = self.parameters['DiskSpace'][-1] if availSpace >= 0 and availSpace < self.minDiskSpace: self.log.info('Not enough local disk space for job to continue, defined in CS as %s MB' % (self.minDiskSpace)) return S_ERROR('Job has insufficient disk space to continue') else: return S_OK('Job has enough disk space available') else: return S_ERROR('Available disk space could not be established') ############################################################################# def __checkWallClockTime(self): """Checks whether the job has been running for the CS defined maximum wall clock time. """ if 'StartTime' in self.initialValues: startTime = self.initialValues['StartTime'] if time.time() - startTime > self.maxWallClockTime: self.log.info('Job has exceeded maximum wall clock time of %s seconds' % (self.maxWallClockTime)) return S_ERROR('Job has exceeded maximum wall clock time') else: return S_OK('Job within maximum wall clock time') else: return S_ERROR('Job start time could not be established') ############################################################################# def __checkLoadAverage(self): """Checks whether the CS defined maximum load average is exceeded. """ if 'LoadAverage' in self.parameters: loadAvg = self.parameters['LoadAverage'][-1] if loadAvg > float(self.loadAvgLimit): self.log.info('Maximum load average exceeded, defined in CS as %s ' % (self.loadAvgLimit)) return S_ERROR('Job exceeded maximum load average') else: return S_OK('Job running with normal load average') else: return S_ERROR('Job load average not established') ############################################################################# def __peek(self): """ Uses ExecutionThread.getOutput() method to obtain standard output from running thread via subprocess callback function. """ result = self.exeThread.getOutput() if not result['OK']: self.log.warn('Could not obtain output from running application thread') self.log.warn(result['Message']) return result ############################################################################# def calibrate(self): """ The calibrate method obtains the initial values for system memory and load and calculates the margin for error for the rest of the Watchdog cycle. """ self.__getWallClockTime() self.parameters['WallClockTime'] = [] cpuConsumed = self.__getCPU() if not cpuConsumed['OK']: self.log.warn("Could not establish CPU consumed, setting to 0.0") cpuConsumed = 0.0 else: cpuConsumed = cpuConsumed['Value'] self.initialValues['CPUConsumed'] = cpuConsumed self.parameters['CPUConsumed'] = [] self.initialValues['LoadAverage'] = float(os.getloadavg()[0]) self.parameters['LoadAverage'] = [] memUsed = self.getMemoryUsed() self.initialValues['MemoryUsed'] = memUsed self.parameters['MemoryUsed'] = [] result = self.profiler.getAllProcessData(withChildren=True) self.log.verbose('Job Memory: %s' % (result['Value'])) if not result['OK']: self.log.warn('Could not get job memory usage') self.initialValues['Vsize'] = result['Value']['stats']['vSizeUsage'] * 1024. self.initialValues['RSS'] = result['Value']['stats']['memoryUsage'] * 1024. self.parameters['Vsize'] = [] self.parameters['RSS'] = [] result = self.getDiskSpace() self.log.verbose('DiskSpace: %s' % (result)) if not result['OK']: self.log.warn("Could not establish DiskSpace") self.initialValues['DiskSpace'] = result['Value'] self.parameters['DiskSpace'] = [] result = self.getNodeInformation() self.log.verbose('NodeInfo: %s' % (result)) if not result['OK']: self.log.warn("Could not establish static system information") if 'LSB_JOBID' in os.environ: result['LocalJobID'] = os.environ['LSB_JOBID'] if 'PBS_JOBID' in os.environ: result['LocalJobID'] = os.environ['PBS_JOBID'] if 'QSUB_REQNAME' in os.environ: result['LocalJobID'] = os.environ['QSUB_REQNAME'] if 'JOB_ID' in os.environ: result['LocalJobID'] = os.environ['JOB_ID'] self.__reportParameters(result, 'NodeInformation', True) self.__reportParameters(self.initialValues, 'InitialValues') return S_OK() def __timeLeft(self): """ return Normalized CPU time left in the batch system 0 if not available update self.timeLeft and self.littleTimeLeft """ # Get CPU time left in the batch system result = self.timeLeftUtil.getTimeLeft(0.0) if not result['OK']: # Could not get CPU time left, we might need to wait for the first loop # or the Utility is not working properly for this batch system # or we are in a batch system timeLeft = 0 else: timeLeft = result['Value'] self.timeLeft = timeLeft if not self.littleTimeLeft: if timeLeft and timeLeft < self.grossTimeLeftLimit: self.log.info('TimeLeft bellow %s, now checking with higher frequency' % timeLeft) self.littleTimeLeft = True # TODO: better configurable way of doing this to be coded self.littleTimeLeftCount = 15 else: if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit: timeLeft = -1 return timeLeft ############################################################################# def __getUsageSummary(self): """ Returns average load, memory etc. over execution of job thread """ summary = {} # CPUConsumed if 'CPUConsumed' in self.parameters: cpuList = self.parameters['CPUConsumed'] if cpuList: hmsCPU = cpuList[-1] rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: summary['LastUpdateCPU(s)'] = rawCPU['Value'] else: summary['LastUpdateCPU(s)'] = 'Could not be estimated' # DiskSpace if 'DiskSpace' in self.parameters: space = self.parameters['DiskSpace'] if space: value = abs(float(space[-1]) - float(self.initialValues['DiskSpace'])) if value < 0: value = 0 summary['DiskSpace(MB)'] = value else: summary['DiskSpace(MB)'] = 'Could not be estimated' # MemoryUsed if 'MemoryUsed' in self.parameters: memory = self.parameters['MemoryUsed'] if memory: summary['MemoryUsed(kb)'] = abs(float(memory[-1]) - float(self.initialValues['MemoryUsed'])) else: summary['MemoryUsed(kb)'] = 'Could not be estimated' # LoadAverage if 'LoadAverage' in self.parameters: laList = self.parameters['LoadAverage'] if laList: summary['LoadAverage'] = float(sum(laList)) / float(len(laList)) else: summary['LoadAverage'] = 'Could not be estimated' result = self.__getWallClockTime() if not result['OK']: self.log.warn("Failed determining wall clock time", result['Message']) summary['WallClockTime(s)'] = 0 summary['ScaledCPUTime(s)'] = 0 else: wallClock = result['Value'] summary['WallClockTime(s)'] = wallClock summary['ScaledCPUTime(s)'] = wallClock * self.scaleFactor * self.processors self.__reportParameters(summary, 'UsageSummary', True) self.currentStats = summary ############################################################################# def __reportParameters(self, params, title=None, report=False): """Will report parameters for job. """ try: parameters = [] self.log.info('==========================================================') if title: self.log.info('Watchdog will report %s' % (title)) else: self.log.info('Watchdog will report parameters') self.log.info('==========================================================') vals = params if 'Value' in params: if vals['Value']: vals = params['Value'] for k, v in vals.items(): if v: self.log.info(str(k) + ' = ' + str(v)) parameters.append((k, v)) if report: self.__setJobParamList(parameters) self.log.info('==========================================================') except Exception as x: self.log.warn('Problem while reporting parameters') self.log.warn(str(x)) ############################################################################# def __getWallClockTime(self): """ Establishes the Wall Clock time spent since the Watchdog initialization""" result = S_OK() if 'StartTime' in self.initialValues: currentTime = time.time() wallClock = currentTime - self.initialValues['StartTime'] result['Value'] = wallClock else: self.initialValues['StartTime'] = time.time() result['Value'] = 0.0 return result ############################################################################# def __killRunningThread(self): """ Will kill the running thread process and any child processes.""" self.log.info('Sending kill signal to application PID %s' % (self.spObject.getChildPID())) result = self.spObject.killChild() self.applicationKilled = True self.log.info('Subprocess.killChild() returned:%s ' % (result)) return S_OK('Thread killed') ############################################################################# def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict): """ Sends sign of life 'heartbeat' signal and triggers control signal interpretation. """ result = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict) if not result['OK']: self.log.warn('Problem sending sign of life') self.log.warn(result) if result['OK'] and result['Value']: self.__interpretControlSignal(result['Value']) return result ############################################################################# def __setJobParamList(self, value): """Wraps around setJobParameters of state update client """ # job wrapper template sets the jobID variable if 'JOBID' not in os.environ: self.log.info('Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value) self.log.verbose('setJobParameters(%s,%s)' % (jobID, value)) if not jobParam['OK']: self.log.warn(jobParam['Message']) return jobParam ############################################################################# def getNodeInformation(self): """ Attempts to retrieve all static system information, should be overridden in a subclass""" methodName = 'getNodeInformation' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass') ############################################################################# def getMemoryUsed(self): """Obtains the memory used. """ mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss return float(mem) ############################################################################# def getDiskSpace(self): """ Attempts to get the available disk space, should be overridden in a subclass""" methodName = 'getDiskSpace' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass')
class Watchdog( object ): ############################################################################# def __init__( self, pid, exeThread, spObject, jobCPUtime, memoryLimit = 0, systemFlag = 'linux2.4' ): """ Constructor, takes system flag as argument. """ self.log = gLogger.getSubLogger( "Watchdog" ) self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUtime = jobCPUtime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.processMonitor = ProcessMonitor() self.checkError = '' self.currentStats = {} self.initialized = False self.count = 0 #defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False ############################################################################# def initialize( self, loops = 0 ): """ Watchdog initialization. """ if self.initialized: self.log.info( 'Watchdog already initialized' ) return S_OK() else: self.initialized = True setup = gConfig.getValue( '/DIRAC/Setup', '' ) if not setup: return S_ERROR( 'Can not get the DIRAC Setup value' ) wms_instance = getSystemInstance( "WorkloadManagement" ) if not wms_instance: return S_ERROR( 'Can not get the WorkloadManagement system instance' ) self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance self.maxcount = loops self.log.verbose( 'Watchdog initialization' ) self.log.info( 'Attempting to Initialize Watchdog for: %s' % ( self.systemFlag ) ) # Test control flags self.testWallClock = gConfig.getValue( self.section + '/CheckWallClockFlag', 1 ) self.testDiskSpace = gConfig.getValue( self.section + '/CheckDiskSpaceFlag', 1 ) self.testLoadAvg = gConfig.getValue( self.section + '/CheckLoadAvgFlag', 1 ) self.testCPUConsumed = gConfig.getValue( self.section + '/CheckCPUConsumedFlag', 1 ) self.testCPULimit = gConfig.getValue( self.section + '/CheckCPULimitFlag', 0 ) self.testMemoryLimit = gConfig.getValue( self.section + '/CheckMemoryLimitFlag', 0 ) self.testTimeLeft = gConfig.getValue( self.section + '/CheckTimeLeftFlag', 1 ) # Other parameters self.pollingTime = gConfig.getValue( self.section + '/PollingTime', 10 ) # 10 seconds self.checkingTime = gConfig.getValue( self.section + '/CheckingTime', 30 * 60 ) # 30 minute period self.minCheckingTime = gConfig.getValue( self.section + '/MinCheckingTime', 20 * 60 ) # 20 mins self.maxWallClockTime = gConfig.getValue( self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60 ) # e.g. 4 days self.jobPeekFlag = gConfig.getValue( self.section + '/JobPeekFlag', 1 ) # on / off self.minDiskSpace = gConfig.getValue( self.section + '/MinDiskSpace', 10 ) # MB self.loadAvgLimit = gConfig.getValue( self.section + '/LoadAverageLimit', 1000 ) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue( self.section + '/CPUSampleTime', 30 * 60 ) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue( self.section + '/JobCPULimitMargin', 20 ) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue( self.section + '/MinCPUWallClockRatio', 5 ) # ratio %age self.nullCPULimit = gConfig.getValue( self.section + '/NullCPUCountLimit', 5 ) # After 5 sample times return null CPU consumption kill job if self.checkingTime < self.minCheckingTime: self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % ( self.checkingTime, self.minCheckingTime ) ) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue( self.section + '/TimeLeftLimit', 150 * self.pollingTime ) return S_OK() def run( self ): """ The main watchdog execution method """ result = self.initialize() if not result['OK']: self.log.always( 'Can not start watchdog for the following reason' ) self.log.always( result['Message'] ) return result try: while True: self.log.debug( 'Starting watchdog loop # %d' % self.count ) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result[ 'OK' ]: self.log.error( "Watchdog error during execution", result[ 'Message' ] ) break elif result['Value'] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep( self.pollingTime - exec_cycle_time ) return S_OK() except Exception: self.log.exception() return S_ERROR( 'Exception' ) ############################################################################# def execute( self ): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): self.__getUsageSummary() self.log.info( 'Process to monitor has completed, Watchdog will exit.' ) return S_OK( "Ended" ) if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = 'Job has reached the CPU limit of the queue' self.log.error( self.checkError, self.timeLeft ) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if ( time.time() - self.initialValues['StartTime'] ) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result['OK']: self.log.warn( 'Problem during recent checks' ) self.log.warn( result['Message'] ) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performChecks( self ): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose( '------------------------------------' ) self.log.verbose( 'Checking loop starts for Watchdog' ) heartBeatDict = {} msg = '' loadAvg = self.getLoadAverage() if not loadAvg['OK']: msg += 'LoadAvg: ERROR' else: loadAvg = loadAvg['Value'] msg += 'LoadAvg: %d ' % loadAvg heartBeatDict['LoadAverage'] = loadAvg if not self.parameters.has_key( 'LoadAverage' ): self.parameters['LoadAverage'] = [] self.parameters['LoadAverage'].append( loadAvg ) memoryUsed = self.getMemoryUsed() if not memoryUsed['OK']: msg += 'MemUsed: ERROR ' else: memoryUsed = memoryUsed['Value'] msg += 'MemUsed: %.1f kb ' % ( memoryUsed ) heartBeatDict['MemoryUsed'] = memoryUsed if not self.parameters.has_key( 'MemoryUsed' ): self.parameters['MemoryUsed'] = [] self.parameters['MemoryUsed'].append( memoryUsed ) result = self.processMonitor.getMemoryConsumed( self.wrapperPID ) if result['OK']: vsize = result['Value']['Vsize']/1024. rss = result['Value']['RSS']/1024. heartBeatDict['Vsize'] = vsize heartBeatDict['RSS'] = rss self.parameters.setdefault( 'Vsize', [] ) self.parameters['Vsize'].append( vsize ) self.parameters.setdefault( 'RSS', [] ) self.parameters['RSS'].append( rss ) msg += "Job Vsize: %.1f kb " % vsize msg += "Job RSS: %.1f kb " % rss result = self.getDiskSpace() if not result['OK']: self.log.warn( "Could not establish DiskSpace", result['Message'] ) else: msg += 'DiskSpace: %.1f MB ' % ( result['Value'] ) if not self.parameters.has_key( 'DiskSpace' ): self.parameters['DiskSpace'] = [] if result['OK']: self.parameters['DiskSpace'].append( result['Value'] ) heartBeatDict['AvailableDiskSpace'] = result['Value'] cpu = self.__getCPU() if not cpu['OK']: msg += 'CPU: ERROR ' hmsCPU = 0 else: cpu = cpu['Value'] msg += 'CPU: %s (h:m:s) ' % ( cpu ) if not self.parameters.has_key( 'CPUConsumed' ): self.parameters['CPUConsumed'] = [] self.parameters['CPUConsumed'].append( cpu ) hmsCPU = cpu rawCPU = self.__convertCPUTime( hmsCPU ) if rawCPU['OK']: heartBeatDict['CPUConsumed'] = rawCPU['Value'] result = self.__getWallClockTime() if not result['OK']: self.log.warn( "Failed determining wall clock time", result['Message'] ) else: msg += 'WallClock: %.2f s ' % ( result['Value'] ) self.parameters.setdefault( 'WallClockTime', list() ).append( result['Value'] ) heartBeatDict['WallClockTime'] = result['Value'] self.log.info( msg ) result = self._checkProgress() if not result['OK']: self.checkError = result['Message'] self.log.warn( self.checkError ) if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len( outputList ) self.log.info( 'Last %s lines of available application output:' % ( size ) ) self.log.info( '================START================' ) for line in outputList: self.log.info( line ) self.log.info( '=================END=================' ) self.__killRunningThread() return S_OK() recentStdOut = 'None' if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len( outputList ) recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % ( size, Time.dateTime() ) border = '=' * len( recentStdOut ) cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % ( hmsCPU ) if self.timeLeft: cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft recentStdOut = '\n%s\n%s\n%s\n%s\n' % ( border, recentStdOut, cpuTotal, border ) self.log.info( recentStdOut ) for line in outputList: self.log.info( line ) recentStdOut += line + '\n' else: recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread' self.log.info( recentStdOut ) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn( 'Turning off job peeking for remainder of execution' ) if not os.environ.has_key( 'JOBID' ): self.log.info( 'Running without JOBID so parameters will not be reported' ) return S_OK() jobID = os.environ['JOBID'] staticParamDict = {'StandardOutput':recentStdOut} self.__sendSignOfLife( int( jobID ), heartBeatDict, staticParamDict ) return S_OK( 'Watchdog checking cycle complete' ) ############################################################################# def __getCPU( self ): """Uses os.times() to get CPU time and returns HH:MM:SS after conversion. """ try: cpuTime = self.processMonitor.getCPUConsumed( self.wrapperPID ) if not cpuTime['OK']: self.log.warn( 'Problem while checking consumed CPU' ) return cpuTime cpuTime = cpuTime['Value'] if cpuTime: self.log.verbose( "Raw CPU time consumed (s) = %s" % ( cpuTime ) ) return self.__getCPUHMS( cpuTime ) else: self.log.error( "CPU time consumed found to be 0" ) return S_ERROR() except Exception as e: self.log.warn( 'Could not determine CPU time consumed with exception' ) self.log.exception( e ) return S_ERROR( "Could not determine CPU time consumed with exception" ) ############################################################################# def __getCPUHMS( self, cpuTime ): mins, secs = divmod( cpuTime, 60 ) hours, mins = divmod( mins, 60 ) humanTime = '%02d:%02d:%02d' % ( hours, mins, secs ) self.log.verbose( 'Human readable CPU time is: %s' % humanTime ) return S_OK( humanTime ) ############################################################################# def __interpretControlSignal( self, signalDict ): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info( 'Received control signal' ) if type( signalDict ) == type( {} ): if signalDict.has_key( 'Kill' ): self.log.info( 'Received Kill signal, stopping job via control signal' ) self.checkError = 'Received Kill signal' self.__killRunningThread() else: self.log.info( 'The following control signal was sent but not understood by the watchdog:' ) self.log.info( signalDict ) else: self.log.info( 'Expected dictionary for control signal, received:\n%s' % ( signalDict ) ) return S_OK() ############################################################################# def _checkProgress( self ): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = '' if self.testWallClock: result = self.__checkWallClockTime() if not result['OK']: self.log.warn( result['Message'] ) return result report += 'WallClock: OK, ' else: report += 'WallClock: NA,' if self.testDiskSpace: result = self.__checkDiskSpace() if not result['OK']: self.log.warn( result['Message'] ) return result report += 'DiskSpace: OK, ' else: report += 'DiskSpace: NA,' if self.testLoadAvg: result = self.__checkLoadAverage() if not result['OK']: self.log.warn( "Check of load average failed, but won't fail because of that: %s" % result['Message'] ) report += 'LoadAverage: ERROR, ' return S_OK() report += 'LoadAverage: OK, ' else: report += 'LoadAverage: NA,' if self.testCPUConsumed: result = self.__checkCPUConsumed() if not result['OK']: return result report += 'CPUConsumed: OK, ' else: report += 'CPUConsumed: NA, ' if self.testCPULimit: result = self.__checkCPULimit() if not result['OK']: self.log.warn( result['Message'] ) return result report += 'CPULimit OK, ' else: report += 'CPULimit: NA, ' if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += 'TimeLeft: OK' else: report += 'TimeLeft: NA' if self.testMemoryLimit: result = self.__checkMemoryLimit() if not result['OK']: self.log.warn( result['Message'] ) return result report += 'MemoryLimit OK, ' else: report += 'MemoryLimit: NA, ' self.log.info( report ) return S_OK( 'All enabled checks passed' ) ############################################################################# def __checkCPUConsumed( self ): """ Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info( "Checking CPU Consumed" ) if 'WallClockTime' not in self.parameters: return S_ERROR( 'Missing WallClockTime info' ) if 'CPUConsumed' not in self.parameters: return S_ERROR( 'Missing CPUConsumed info' ) wallClockTime = self.parameters['WallClockTime'][-1] if wallClockTime < self.sampleCPUTime: self.log.info( "Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % ( wallClockTime, self.sampleCPUTime ) ) return S_OK() intervals = max( 1, int( self.sampleCPUTime / self.checkingTime ) ) if len( self.parameters['CPUConsumed'] ) < intervals + 1: self.log.info( "Not enough snapshots to calculate, there are %s and we need %s" % ( len( self.parameters['CPUConsumed'] ), intervals + 1 ) ) return S_OK() wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters['WallClockTime'][-1 - intervals ] try: cpuTime = self.__convertCPUTime( self.parameters['CPUConsumed'][-1] )['Value'] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime( self.parameters['CPUConsumed'][-1 - intervals ] )['Value'] if cpuTime < 0: self.log.warn( 'Consumed CPU time negative, something wrong may have happened, ignore' ) return S_OK() if wallClockTime <= 0: self.log.warn( 'Wallclock time should not be negative or zero, Ignore' ) return S_OK() ratio = ( cpuTime / wallClockTime ) * 100. self.log.info( "CPU/Wallclock ratio is %.2f%%" % ratio ) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if os.path.exists( 'DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK' ): self.log.info( 'N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload' ) return S_OK() self.log.info( "Job is stalled!" ) return S_ERROR( 'Watchdog identified this job as stalled' ) except Exception as e: self.log.error( "Cannot convert CPU consumed from string to int", str( e ) ) return S_OK() ############################################################################# def __convertCPUTime( self, cputime ): """ Method to convert the CPU time as returned from the Watchdog instances to the equivalent DIRAC normalized CPU time to be compared to the Job CPU requirement. """ cpuValue = 0 cpuHMS = cputime.split( ':' ) # for i in xrange( len( cpuHMS ) ): # cpuHMS[i] = cpuHMS[i].replace( '00', '0' ) try: hours = float( cpuHMS[0] ) * 60 * 60 mins = float( cpuHMS[1] ) * 60 secs = float( cpuHMS[2] ) cpuValue = float( hours + mins + secs ) except Exception as x: self.log.warn( str( x ) ) return S_ERROR( 'Could not calculate CPU time' ) # Normalization to be implemented normalizedCPUValue = cpuValue result = S_OK() result['Value'] = normalizedCPUValue self.log.debug( 'CPU value %s converted to %s' % ( cputime, normalizedCPUValue ) ) return result ############################################################################# def __checkCPULimit( self ): """ Checks that the job has consumed more than the job CPU requirement (plus a configurable margin) and kills them as necessary. """ consumedCPU = 0 if self.parameters.has_key( 'CPUConsumed' ): consumedCPU = self.parameters['CPUConsumed'][-1] consumedCPUDict = self.__convertCPUTime( consumedCPU ) if consumedCPUDict['OK']: currentCPU = consumedCPUDict['Value'] else: return S_OK( 'Not possible to determine current CPU consumed' ) if consumedCPU: limit = self.jobCPUtime + self.jobCPUtime * ( self.jobCPUMargin / 100 ) cpuConsumed = float( currentCPU ) if cpuConsumed > limit: self.log.info( 'Job has consumed more than the specified CPU limit with an additional %s%% margin' % ( self.jobCPUMargin ) ) return S_ERROR( 'Job has exceeded maximum CPU time limit' ) else: return S_OK( 'Job within CPU limit' ) elif not currentCPU: self.log.verbose( 'Both initial and current CPU consumed are null' ) return S_OK( 'CPU consumed is not measurable yet' ) else: return S_OK( 'Not possible to determine CPU consumed' ) def __checkMemoryLimit( self ): """ Checks that the job memory consumption is within a limit """ if self.parameters.has_key( 'Vsize' ): vsize = self.parameters['Vsize'][-1] if vsize and self.memoryLimit: if vsize > self.memoryLimit: vsize = vsize # Just a warning for the moment self.log.warn( "Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % ( vsize, self.memoryLimit ) ) return S_OK() ############################################################################# def __checkDiskSpace( self ): """Checks whether the CS defined minimum disk space is available. """ if self.parameters.has_key( 'DiskSpace' ): availSpace = self.parameters['DiskSpace'][-1] if availSpace >= 0 and availSpace < self.minDiskSpace: self.log.info( 'Not enough local disk space for job to continue, defined in CS as %s MB' % ( self.minDiskSpace ) ) return S_ERROR( 'Job has insufficient disk space to continue' ) else: return S_OK( 'Job has enough disk space available' ) else: return S_ERROR( 'Available disk space could not be established' ) ############################################################################# def __checkWallClockTime( self ): """Checks whether the job has been running for the CS defined maximum wall clock time. """ if self.initialValues.has_key( 'StartTime' ): startTime = self.initialValues['StartTime'] if time.time() - startTime > self.maxWallClockTime: self.log.info( 'Job has exceeded maximum wall clock time of %s seconds' % ( self.maxWallClockTime ) ) return S_ERROR( 'Job has exceeded maximum wall clock time' ) else: return S_OK( 'Job within maximum wall clock time' ) else: return S_ERROR( 'Job start time could not be established' ) ############################################################################# def __checkLoadAverage( self ): """Checks whether the CS defined maximum load average is exceeded. """ if self.parameters.has_key( 'LoadAverage' ): loadAvg = self.parameters['LoadAverage'][-1] if loadAvg > float( self.loadAvgLimit ): self.log.info( 'Maximum load average exceeded, defined in CS as %s ' % ( self.loadAvgLimit ) ) return S_ERROR( 'Job exceeded maximum load average' ) else: return S_OK( 'Job running with normal load average' ) else: return S_ERROR( 'Job load average not established' ) ############################################################################# def __peek( self ): """ Uses ExecutionThread.getOutput() method to obtain standard output from running thread via subprocess callback function. """ result = self.exeThread.getOutput() if not result['OK']: self.log.warn( 'Could not obtain output from running application thread' ) self.log.warn( result['Message'] ) return result ############################################################################# def calibrate( self ): """ The calibrate method obtains the initial values for system memory and load and calculates the margin for error for the rest of the Watchdog cycle. """ self.__getWallClockTime() self.parameters['WallClockTime'] = [] cpuConsumed = self.__getCPU() if not cpuConsumed['OK']: self.log.warn( "Could not establish CPU consumed, setting to 0.0" ) cpuConsumed = 0.0 else: cpuConsumed = cpuConsumed['Value'] self.initialValues['CPUConsumed'] = cpuConsumed self.parameters['CPUConsumed'] = [] loadAvg = self.getLoadAverage() if not loadAvg['OK']: self.log.warn( "Could not establish LoadAverage, setting to 0" ) loadAvg = 0 else: loadAvg = loadAvg['Value'] self.initialValues['LoadAverage'] = loadAvg self.parameters['LoadAverage'] = [] memUsed = self.getMemoryUsed() if not memUsed['OK']: self.log.warn( "Could not establish MemoryUsed, setting to 0" ) memUsed = 0 else: memUsed = memUsed['Value'] self.initialValues['MemoryUsed'] = memUsed self.parameters['MemoryUsed'] = [] result = self.processMonitor.getMemoryConsumed( self.wrapperPID ) self.log.verbose( 'Job Memory: %s' % ( result['Value'] ) ) if not result['OK']: self.log.warn( 'Could not get job memory usage' ) self.initialValues['Vsize'] = result['Value']['Vsize']/1024. self.initialValues['RSS'] = result['Value']['RSS']/1024. self.parameters['Vsize'] = [] self.parameters['RSS'] = [] result = self.getDiskSpace() self.log.verbose( 'DiskSpace: %s' % ( result ) ) if not result['OK']: self.log.warn( "Could not establish DiskSpace" ) self.initialValues['DiskSpace'] = result['Value'] self.parameters['DiskSpace'] = [] result = self.getNodeInformation() self.log.verbose( 'NodeInfo: %s' % ( result ) ) if not result['OK']: self.log.warn( "Could not establish static system information" ) if os.environ.has_key( 'LSB_JOBID' ): result['LocalJobID'] = os.environ['LSB_JOBID'] if os.environ.has_key( 'PBS_JOBID' ): result['LocalJobID'] = os.environ['PBS_JOBID'] if os.environ.has_key( 'QSUB_REQNAME' ): result['LocalJobID'] = os.environ['QSUB_REQNAME'] if os.environ.has_key( 'JOB_ID' ): result['LocalJobID'] = os.environ['JOB_ID'] self.__reportParameters( result, 'NodeInformation', True ) self.__reportParameters( self.initialValues, 'InitialValues' ) return S_OK() def __timeLeft( self ): """ return Normalized CPU time left in the batch system 0 if not available update self.timeLeft and self.littleTimeLeft """ # Get CPU time left in the batch system result = self.timeLeftUtil.getTimeLeft( 0.0 ) if not result['OK']: # Could not get CPU time left, we might need to wait for the first loop # or the Utility is not working properly for this batch system # or we are in a batch system timeLeft = 0 else: timeLeft = result['Value'] self.timeLeft = timeLeft if not self.littleTimeLeft: if timeLeft and timeLeft < self.grossTimeLeftLimit: self.log.info( 'TimeLeft bellow %s, now checking with higher frequency' % timeLeft ) self.littleTimeLeft = True # TODO: better configurable way of doing this to be coded self.littleTimeLeftCount = 15 else: if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit: timeLeft = -1 return timeLeft ############################################################################# def __getUsageSummary( self ): """ Returns average load, memory etc. over execution of job thread """ summary = {} # CPUConsumed if self.parameters.has_key( 'CPUConsumed' ): cpuList = self.parameters['CPUConsumed'] if cpuList: hmsCPU = cpuList[-1] rawCPU = self.__convertCPUTime( hmsCPU ) if rawCPU['OK']: summary['LastUpdateCPU(s)'] = rawCPU['Value'] else: summary['LastUpdateCPU(s)'] = 'Could not be estimated' # DiskSpace if self.parameters.has_key( 'DiskSpace' ): space = self.parameters['DiskSpace'] if space: value = abs( float( space[-1] ) - float( self.initialValues['DiskSpace'] ) ) if value < 0: value = 0 summary['DiskSpace(MB)'] = value else: summary['DiskSpace(MB)'] = 'Could not be estimated' # MemoryUsed if self.parameters.has_key( 'MemoryUsed' ): memory = self.parameters['MemoryUsed'] if memory: summary['MemoryUsed(kb)'] = abs( float( memory[-1] ) - float( self.initialValues['MemoryUsed'] ) ) else: summary['MemoryUsed(kb)'] = 'Could not be estimated' # LoadAverage if self.parameters.has_key( 'LoadAverage' ): laList = self.parameters['LoadAverage'] if laList: summary['LoadAverage'] = float( sum( laList ) ) / float( len( laList ) ) else: summary['LoadAverage'] = 'Could not be estimated' result = self.__getWallClockTime() if not result['OK']: self.log.warn( "Failed determining wall clock time", result['Message'] ) summary['WallClockTime(s)'] = 0 else: wallClock = result['Value'] summary['WallClockTime(s)'] = wallClock self.__reportParameters( summary, 'UsageSummary', True ) self.currentStats = summary ############################################################################# def __reportParameters( self, params, title = None, report = False ): """Will report parameters for job. """ try: parameters = [] self.log.info( '==========================================================' ) if title: self.log.info( 'Watchdog will report %s' % ( title ) ) else: self.log.info( 'Watchdog will report parameters' ) self.log.info( '==========================================================' ) vals = params if params.has_key( 'Value' ): if vals['Value']: vals = params['Value'] for k, v in vals.items(): if v: self.log.info( str( k ) + ' = ' + str( v ) ) parameters.append( ( k, v ) ) if report: self.__setJobParamList( parameters ) self.log.info( '==========================================================' ) except Exception as x: self.log.warn( 'Problem while reporting parameters' ) self.log.warn( str( x ) ) ############################################################################# def __getWallClockTime( self ): """ Establishes the Wall Clock time spent since the Watchdog initialization""" result = S_OK() if self.initialValues.has_key( 'StartTime' ): currentTime = time.time() wallClock = currentTime - self.initialValues['StartTime'] result['Value'] = wallClock else: self.initialValues['StartTime'] = time.time() result['Value'] = 0.0 return result ############################################################################# def __killRunningThread( self ): """ Will kill the running thread process and any child processes.""" self.log.info( 'Sending kill signal to application PID %s' % ( self.spObject.getChildPID() ) ) result = self.spObject.killChild() self.applicationKilled = True self.log.info( 'Subprocess.killChild() returned:%s ' % ( result ) ) return S_OK( 'Thread killed' ) ############################################################################# def __sendSignOfLife( self, jobID, heartBeatDict, staticParamDict ): """ Sends sign of life 'heartbeat' signal and triggers control signal interpretation. """ jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate', timeout = 120 ) result = jobReport.sendHeartBeat( jobID, heartBeatDict, staticParamDict ) if not result['OK']: self.log.warn( 'Problem sending sign of life' ) self.log.warn( result ) if result['OK'] and result['Value']: self.__interpretControlSignal( result['Value'] ) return result ############################################################################# def __setJobParamList( self, value ): """Wraps around setJobParameters of state update client """ # job wrapper template sets the jobID variable if not os.environ.has_key( 'JOBID' ): self.log.info( 'Running without JOBID so parameters will not be reported' ) return S_OK() jobID = os.environ['JOBID'] jobReport = RPCClient( 'WorkloadManagement/JobStateUpdate', timeout = 120 ) jobParam = jobReport.setJobParameters( int( jobID ), value ) self.log.verbose( 'setJobParameters(%s,%s)' % ( jobID, value ) ) if not jobParam['OK']: self.log.warn( jobParam['Message'] ) return jobParam ############################################################################# def getNodeInformation( self ): """ Attempts to retrieve all static system information, should be overridden in a subclass""" methodName = 'getNodeInformation' self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) ############################################################################# def getLoadAverage( self ): """ Attempts to get the load average, should be overridden in a subclass""" methodName = 'getLoadAverage' self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) ############################################################################# def getMemoryUsed( self ): """ Attempts to get the memory used, should be overridden in a subclass""" methodName = 'getMemoryUsed' self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) ############################################################################# def getDiskSpace( self ): """ Attempts to get the available disk space, should be overridden in a subclass""" methodName = 'getDiskSpace' self.log.warn( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' ) return S_ERROR( 'Watchdog: ' + methodName + ' method should be implemented in a subclass' )
class JobAgent( AgentModule ): """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration. """ ############################################################################# def initialize( self, loops = 0 ): """Sets default parameters and creates CE instance """ # Disable monitoring self.am_setOption( 'MonitoringEnabled', False ) # self.log.setLevel('debug') #temporary for debugging self.am_setOption( 'MaxCycles', loops ) ceType = self.am_getOption( 'CEType', 'InProcess' ) localCE = gConfig.getValue( '/LocalSite/LocalCE', '' ) if localCE: self.log.info( 'Defining CE from local configuration = %s' % localCE ) ceType = localCE ceFactory = ComputingElementFactory() self.ceName = ceType ceInstance = ceFactory.getCE( ceType ) if not ceInstance['OK']: self.log.warn( ceInstance['Message'] ) return ceInstance self.initTimes = os.times() self.computingElement = ceInstance['Value'] #Localsite options self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' ) self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' ) self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 ) #Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 ) self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 ) self.fillingMode = self.am_getOption( 'FillingModeFlag', False ) self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', True ) self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', 10 ) self.jobCount = 0 self.matchFailedCount = 0 self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', '' ) #Timeleft self.timeLeftUtil = TimeLeft() self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0 ) self.timeLeftError = '' self.scaledCPUTime = 0.0 self.pilotInfoReportedFlag = False return S_OK() ############################################################################# def execute( self ): """The JobAgent execution method. """ if self.jobCount: #Only call timeLeft utility after a job has been picked up self.log.info( 'Attempting to check CPU time left for filling mode' ) if self.fillingMode: if self.timeLeftError: self.log.warn( self.timeLeftError ) return self.__finish( self.timeLeftError ) self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) ) # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft ) if not result['OK']: return self.__finish( result['Message'] ) # Update local configuration to be used by submitted job wrappers localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join( '.', self.extraOptions ) else: localConfigFile = os.path.join( rootPath, "etc", "dirac.cfg" ) localCfg.loadFromFile( localConfigFile ) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption( '/LocalSite/CPUTimeLeft', self.timeLeft ) localCfg.writeToFile( localConfigFile ) else: return self.__finish( 'Filling Mode is Disabled' ) self.log.verbose( 'Job Agent execution loop' ) available = self.computingElement.available() if not available['OK'] or not available['Value']: self.log.info( 'Resource is not available' ) self.log.info( available['Message'] ) return self.__finish( 'CE Not Available' ) self.log.info( available['Message'] ) result = self.computingElement.getDescription() if not result['OK']: return result ceDict = result['Value'] # Add pilot information gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' ) if gridCE != 'Unknown': ceDict['GridCE'] = gridCE if not 'PilotReference' in ceDict: ceDict['PilotReference'] = str( self.pilotReference ) ceDict['PilotBenchmark'] = self.cpuFactor ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict( '/AgentJobRequirements' ) if result['OK']: requirementsDict = result['Value'] ceDict.update( requirementsDict ) self.log.verbose( ceDict ) start = time.time() jobRequest = self.__requestJob( ceDict ) matchTime = time.time() - start self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) ) self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches ) if not jobRequest['OK']: if re.search( 'No match found', jobRequest['Message'] ): self.log.notice( 'Job request OK: %s' % ( jobRequest['Message'] ) ) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches ) return S_OK( jobRequest['Message'] ) elif jobRequest['Message'].find( "seconds timeout" ) != -1: self.log.error( jobRequest['Message'] ) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches ) return S_OK( jobRequest['Message'] ) elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 : self.log.error( jobRequest['Message'] ) return S_ERROR( jobRequest['Message'] ) else: self.log.notice( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) ) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches ) return S_OK( jobRequest['Message'] ) # Reset the Counter self.matchFailedCount = 0 matcherInfo = jobRequest['Value'] jobID = matcherInfo['JobID'] if not self.pilotInfoReportedFlag: # Check the flag after the first access to the Matcher self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False ) matcherParams = ['JDL', 'DN', 'Group'] for param in matcherParams: if not matcherInfo.has_key( param ): self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) ) return self.__finish( 'Matcher Failed' ) elif not matcherInfo[param]: self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) ) return self.__finish( 'Matcher Failed' ) else: self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) ) jobJDL = matcherInfo['JDL'] jobGroup = matcherInfo['Group'] ownerDN = matcherInfo['DN'] optimizerParams = {} for key in matcherInfo.keys(): if not key in matcherParams: value = matcherInfo[key] optimizerParams[key] = value parameters = self.__getJDLParameters( jobJDL ) if not parameters['OK']: self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' ) self.log.warn( parameters['Message'] ) return self.__finish( 'JDL Problem' ) params = parameters['Value'] if not params.has_key( 'JobID' ): msg = 'Job has not JobID defined in JDL parameters' self.__report( jobID, 'Failed', msg ) self.log.warn( msg ) return self.__finish( 'JDL Problem' ) else: jobID = params['JobID'] if not params.has_key( 'JobType' ): self.log.warn( 'Job has no JobType defined in JDL parameters' ) jobType = 'Unknown' else: jobType = params['JobType'] if not params.has_key( 'CPUTime' ): self.log.warn( 'Job has no CPU requirement defined in JDL parameters' ) if self.extraOptions: params['Arguments'] = params['Arguments'] + ' ' + self.extraOptions params['ExtraOptions'] = self.extraOptions self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) ) self.log.info( 'Received JobID=%s, JobType=%s' % ( jobID, jobType ) ) self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) ) self.jobCount += 1 try: jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName ) jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False ) if os.environ.has_key( 'BOINC_JOB_ID' ): # Report BOINC environment for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']: jobReport.setJobParameter( p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown' ), sendFlag = False ) jobReport.setJobStatus( 'Matched', 'Job Received by Agent' ) result = self.__setupProxy( ownerDN, jobGroup ) if not result[ 'OK' ]: return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure ) if 'Value' in result and result[ 'Value' ]: proxyChain = result[ 'Value' ] # Save the job jdl for external monitoring self.__saveJobJDLRequest( jobID, jobJDL ) software = self.__checkInstallSoftware( jobID, params, ceDict ) if not software['OK']: self.log.error( 'Failed to install software for job %s' % ( jobID ) ) errorMsg = software['Message'] if not errorMsg: errorMsg = 'Failed software installation' return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure ) self.log.debug( 'Before %sCE submitJob()' % ( self.ceName ) ) submission = self.__submitJob( jobID, params, ceDict, optimizerParams, proxyChain ) if not submission['OK']: self.__report( jobID, 'Failed', submission['Message'] ) return self.__finish( submission['Message'] ) elif 'PayloadFailed' in submission: # Do not keep running and do not overwrite the Payload error return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'], self.stopOnApplicationFailure ) self.log.debug( 'After %sCE submitJob()' % ( self.ceName ) ) except Exception: self.log.exception() return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure ) currentTimes = list( os.times() ) for i in range( len( currentTimes ) ): currentTimes[i] -= self.initTimes[i] utime, stime, cutime, cstime, _elapsed = currentTimes cpuTime = utime + stime + cutime + cstime result = self.timeLeftUtil.getTimeLeft( cpuTime ) if result['OK']: self.timeLeft = result['Value'] else: if result['Message'] != 'Current batch system is not supported': self.timeLeftError = result['Message'] else: if self.cpuFactor: # if the batch system is not defined used the CPUNormalizationFactor # defined locally self.timeLeft = self.__getCPUTimeLeft() scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value'] self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) ) self.scaledCPUTime = scaledCPUTime return S_OK( 'Job Agent cycle complete' ) ############################################################################# def __saveJobJDLRequest( self, jobID, jobJDL ): """Save job JDL local to JobAgent. """ classAdJob = ClassAd( jobJDL ) classAdJob.insertAttributeString( 'LocalCE', self.ceName ) jdlFileName = jobID + '.jdl' jdlFile = open( jdlFileName, 'w' ) jdl = classAdJob.asJDL() jdlFile.write( jdl ) jdlFile.close() ############################################################################# def __getCPUTimeLeft( self ): """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config. """ utime, stime, cutime, _cstime, _elapsed = os.times() cpuTime = utime + stime + cutime self.log.info( 'Current raw CPU time consumed is %s' % cpuTime ) timeleft = self.timeLeft - cpuTime * self.cpuFactor return timeleft ############################################################################# def __changeProxy( self, oldProxy, newProxy ): """Can call glexec utility here to set uid or simply log the changeover of a proxy. """ self.log.verbose( 'Log proxy change (to be instrumented)' ) return S_OK() ############################################################################# def __setupProxy( self, ownerDN, ownerGroup ): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue( '/DIRAC/Security/UseServerCertificate' , False ): proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup ) if not proxyResult['OK']: self.log.error( 'Invalid Proxy', proxyResult['Message'] ) return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] ) return S_OK( proxyResult['Value'] ) else: ret = getProxyInfo( disableVOMS = True ) if not ret['OK']: self.log.error( 'Invalid Proxy', ret['Message'] ) return S_ERROR( 'Invalid Proxy' ) proxyChain = ret['Value']['chain'] if not 'groupProperties' in ret['Value']: print ret['Value'] print proxyChain.dumpAllToString() self.log.error( 'Invalid Proxy', 'Group has no properties defined' ) return S_ERROR( 'Proxy has no group properties defined' ) groupProps = ret['Value']['groupProperties'] if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps: proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup ) if not proxyResult['OK']: self.log.error( 'Invalid Proxy', proxyResult['Message'] ) return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] ) proxyChain = proxyResult['Value'] return S_OK( proxyChain ) ############################################################################# def __requestProxyFromProxyManager( self, ownerDN, ownerGroup ): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info( "Requesting proxy for %s@%s" % ( ownerDN, ownerGroup ) ) token = gConfig.getValue( "/Security/ProxyToken", "" ) if not token: self.log.info( "No token defined. Trying to download proxy without token" ) token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup, self.defaultProxyLength, token ) if not retVal[ 'OK' ]: self.log.error( 'Could not retrieve proxy' ) self.log.warn( retVal ) os.system( 'dirac-proxy-info' ) sys.stdout.flush() return S_ERROR( 'Error retrieving proxy' ) chain = retVal[ 'Value' ] return S_OK( chain ) ############################################################################# def __checkInstallSoftware( self, jobID, jobParams, resourceParams ): """Checks software requirement of job and whether this is already present before installing software locally. """ if not jobParams.has_key( 'SoftwareDistModule' ): msg = 'Job has no software installation requirement' self.log.verbose( msg ) return S_OK( msg ) self.__report( jobID, 'Matched', 'Installing Software' ) softwareDist = jobParams['SoftwareDistModule'] self.log.verbose( 'Found VO Software Distribution module: %s' % ( softwareDist ) ) argumentsDict = {'Job':jobParams, 'CE':resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule( softwareDist, argumentsDict ) if not moduleInstance['OK']: return moduleInstance module = moduleInstance['Value'] return module.execute() ############################################################################# def __submitJob( self, jobID, jobParams, resourceParams, optimizerParams, proxyChain ): """Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ logLevel = self.am_getOption( 'DefaultLogLevel', 'INFO' ) defaultWrapperLocation = self.am_getOption( 'JobWrapperTemplate', 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py' ) result = createJobWrapper( jobID, jobParams, resourceParams, optimizerParams, extraOptions = self.extraOptions, defaultWrapperLocation = defaultWrapperLocation, log = self.log, logLevel = logLevel ) if not result['OK']: return result wrapperFile = result['Value'] self.__report( jobID, 'Matched', 'Submitted To CE' ) self.log.info( 'Submitting %s to %sCE' % ( os.path.basename( wrapperFile ), self.ceName ) ) #Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy['OK']: self.log.error( proxy ) return S_ERROR( 'Payload Proxy Not Found' ) payloadProxy = proxy['Value'] # FIXME: how can we set the batchID before we submit, this makes no sense batchID = 'dc%s' % ( jobID ) submission = self.computingElement.submitJob( wrapperFile, payloadProxy ) ret = S_OK( 'Job submitted' ) if submission['OK']: batchID = submission['Value'] self.log.info( 'Job %s submitted as %s' % ( jobID, batchID ) ) self.log.verbose( 'Set JobParameter: Local batch ID %s' % ( batchID ) ) self.__setJobParam( jobID, 'LocalBatchID', str( batchID ) ) if 'PayloadFailed' in submission: ret['PayloadFailed'] = submission['PayloadFailed'] return ret time.sleep( self.jobSubmissionDelay ) else: self.log.error( 'Job submission failed', jobID ) self.__setJobParam( jobID, 'ErrorMessage', '%s CE Submission Error' % ( self.ceName ) ) if 'ReschedulePayload' in submission: rescheduleFailedJob( jobID, submission['Message'], self.__report ) else: if 'Value' in submission: self.log.error( 'Error in DIRAC JobWrapper:', 'exit code = %s' % ( str( submission['Value'] ) ) ) # make sure the Job is declared Failed self.__report( jobID, 'Failed', submission['Message'] ) return S_ERROR( '%s CE Submission Error: %s' % ( self.ceName, submission['Message'] ) ) return ret ############################################################################# def __requestJob( self, ceDict ): """Request a single job from the matcher service. """ try: matcher = RPCClient( 'WorkloadManagement/Matcher', timeout = 600 ) result = matcher.requestJob( ceDict ) return result except Exception, x: self.log.exception( lException = x ) return S_ERROR( 'Job request to matcher service failed with exception' )
class JobAgent( AgentModule ): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ ############################################################################# def initialize( self, loops = 0 ): """Sets default parameters and creates CE instance """ #Disable monitoring self.am_setOption( 'MonitoringEnabled', False ) # self.log.setLevel('debug') #temporary for debugging self.am_setOption( 'MaxCycles', loops ) ceUniqueID = self.am_getOption( 'CEUniqueID', 'InProcess' ) localCE = gConfig.getValue( '/LocalSite/LocalCE', '' ) if localCE: self.log.info( 'Defining CE from local configuration = %s' % localCE ) ceUniqueID = localCE ceFactory = ComputingElementFactory() self.ceName = ceUniqueID ceInstance = ceFactory.getCE( ceUniqueID ) if not ceInstance['OK']: self.log.warn( ceInstance['Message'] ) return ceInstance self.computingElement = ceInstance['Value'] self.diracRoot = os.path.dirname( os.path.dirname( os.path.dirname( os.path.dirname( __file__ ) ) ) ) #Localsite options self.siteRoot = gConfig.getValue( '/LocalSite/Root', os.getcwd() ) self.siteName = gConfig.getValue( '/LocalSite/Site', 'Unknown' ) self.pilotReference = gConfig.getValue( '/LocalSite/PilotReference', 'Unknown' ) self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5 ) #Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue( '/LocalSite/CPUNormalizationFactor', 0.0 ) defaultWrapperLocation = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py' self.jobWrapperTemplate = os.path.join( self.diracRoot, self.am_getOption( 'JobWrapperTemplate', defaultWrapperLocation ) ) self.jobSubmissionDelay = self.am_getOption( 'SubmissionDelay', 10 ) self.defaultLogLevel = self.am_getOption( 'DefaultLogLevel', 'info' ) self.fillingMode = self.am_getOption( 'FillingModeFlag', False ) self.jobCount = 0 #Timeleft self.timeLeftUtil = TimeLeft() self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0 ) self.gridCEQueue = gConfig.getValue( '/Resources/Computing/CEDefaults/GridCEQueue', '' ) self.timeLeftError = '' self.scaledCPUTime = 0.0 return S_OK() ############################################################################# def execute( self ): """The JobAgent execution method. """ if self.jobCount: #Only call timeLeft utility after a job has been picked up self.log.info( 'Attempting to check CPU time left for filling mode' ) if self.fillingMode: if self.timeLeftError: self.log.warn( self.timeLeftError ) return self.__finish( self.timeLeftError ) self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) ) # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft ) if not result['OK']: return self.__finish( result['Message'] ) ceJDL = self.computingElement.getJDL() resourceJDL = ceJDL['Value'] else: return self.__finish( 'Filling Mode is Disabled' ) self.log.verbose( 'Job Agent execution loop' ) available = self.computingElement.available() if not available['OK'] or not available['Value']: self.log.info( 'Resource is not available' ) self.log.info( available['Message'] ) return self.__finish( 'CE Not Available' ) self.log.info( available['Message'] ) ceJDL = self.computingElement.getJDL() resourceJDL = ceJDL['Value'] self.log.verbose( resourceJDL ) start = time.time() jobRequest = self.__requestJob( resourceJDL ) matchTime = time.time() - start self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) ) if not jobRequest['OK']: if re.search( 'No work available', jobRequest['Message'] ): self.log.info( 'Job request OK: %s' % ( jobRequest['Message'] ) ) return S_OK( jobRequest['Message'] ) elif jobRequest['Message'].find( "seconds timeout" ) != -1: self.log.error( jobRequest['Message'] ) return S_OK( jobRequest['Message'] ) elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 : self.log.error( jobRequest['Message'] ) return S_ERROR( jobRequest['Message'] ) else: self.log.info( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) ) return S_OK( jobRequest['Message'] ) matcherInfo = jobRequest['Value'] jobID = matcherInfo['JobID'] matcherParams = ['JDL', 'DN', 'Group'] for param in matcherParams: if not matcherInfo.has_key( param ): self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) ) return self.__finish( 'Matcher Failed' ) elif not matcherInfo[param]: self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) ) return self.__finish( 'Matcher Failed' ) else: self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) ) jobJDL = matcherInfo['JDL'] jobGroup = matcherInfo['Group'] ownerDN = matcherInfo['DN'] optimizerParams = {} for key in matcherInfo.keys(): if not key in matcherParams: value = matcherInfo[key] optimizerParams[key] = value parameters = self.__getJDLParameters( jobJDL ) if not parameters['OK']: self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' ) self.log.warn( parameters['Message'] ) return self.__finish( 'JDL Problem' ) params = parameters['Value'] if not params.has_key( 'JobID' ): msg = 'Job has not JobID defined in JDL parameters' self.__report( jobID, 'Failed', msg ) self.log.warn( msg ) return self.__finish( 'JDL Problem' ) else: jobID = params['JobID'] if not params.has_key( 'JobType' ): self.log.warn( 'Job has no JobType defined in JDL parameters' ) jobType = 'Unknown' else: jobType = params['JobType'] if not params.has_key( 'SystemConfig' ): self.log.warn( 'Job has no system configuration defined in JDL parameters' ) systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' ) self.log.info( 'Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig ) if not systemConfig: self.log.warn( '/LocalSite/Architecture is not defined' ) params['SystemConfig'] = systemConfig else: systemConfig = params['SystemConfig'] if systemConfig.lower() == 'any': systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' ) self.log.info( 'Setting SystemConfig = /LocalSite/Architecture =', '"%s" since it was set to "ANY" in the job description' % systemConfig ) if not systemConfig: self.log.warn( '/LocalSite/Architecture is not defined' ) params['SystemConfig'] = systemConfig if not params.has_key( 'MaxCPUTime' ): self.log.warn( 'Job has no CPU requirement defined in JDL parameters' ) self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) ) self.log.info( 'Received JobID=%s, JobType=%s, SystemConfig=%s' % ( jobID, jobType, systemConfig ) ) self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) ) self.jobCount += 1 try: self.__setJobParam( jobID, 'MatcherServiceTime', str( matchTime ) ) if self.gridCEQueue: self.__setJobParam( jobID, 'GridCEQueue', self.gridCEQueue ) self.__report( jobID, 'Matched', 'Job Received by Agent' ) self.__setJobSite( jobID, self.siteName ) self.__reportPilotInfo( jobID ) result = self.__setupProxy( ownerDN, jobGroup ) if not result[ 'OK' ]: return self.__rescheduleFailedJob( jobID, result[ 'Message' ] ) if 'Value' in result and result[ 'Value' ]: proxyChain = result[ 'Value' ] # Is this necessary at all? saveJDL = self.__saveJobJDLRequest( jobID, jobJDL ) #self.__report(jobID,'Matched','Job Prepared to Submit') resourceParameters = self.__getJDLParameters( resourceJDL ) if not resourceParameters['OK']: return resourceParameters resourceParams = resourceParameters['Value'] software = self.__checkInstallSoftware( jobID, params, resourceParams ) if not software['OK']: self.log.error( 'Failed to install software for job %s' % ( jobID ) ) errorMsg = software['Message'] if not errorMsg: errorMsg = 'Failed software installation' return self.__rescheduleFailedJob( jobID, errorMsg ) self.log.verbose( 'Before %sCE submitJob()' % ( self.ceName ) ) submission = self.__submitJob( jobID, params, resourceParams, optimizerParams, jobJDL, proxyChain ) if not submission['OK']: self.__report( jobID, 'Failed', submission['Message'] ) return self.__finish( submission['Message'] ) elif 'PayloadFailed' in submission: # Do not keep running and do not overwrite the Payload error return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'] ) self.log.verbose( 'After %sCE submitJob()' % ( self.ceName ) ) except Exception: self.log.exception() return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception' ) result = self.timeLeftUtil.getTimeLeft( 0.0 ) if result['OK']: self.timeLeft = result['Value'] else: if result['Message'] != 'Current batch system is not supported': self.timeLeftError = result['Message'] else: if self.cpuFactor: # if the batch system is not defined used the CPUNormalizationFactor # defined locally self.timeLeft = self.__getCPUTimeLeft() scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value'] self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) ) self.scaledCPUTime = scaledCPUTime return S_OK( 'Job Agent cycle complete' ) ############################################################################# def __getCPUTimeLeft( self ): """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config. """ utime, stime, cutime, cstime, elapsed = os.times() cpuTime = utime + stime + cutime self.log.info( 'Current raw CPU time consumed is %s' % cpuTime ) timeleft = self.timeLeft - cpuTime * self.cpuFactor return timeleft ############################################################################# def __changeProxy( self, oldProxy, newProxy ): """Can call glexec utility here to set uid or simply log the changeover of a proxy. """ self.log.verbose( 'Log proxy change (to be instrumented)' ) return S_OK() ############################################################################# def __setupProxy( self, ownerDN, ownerGroup ): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue( '/DIRAC/Security/UseServerCertificate' , False ): proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup ) if not proxyResult['OK']: self.log.error( 'Invalid Proxy', proxyResult['Message'] ) return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] ) return S_OK( proxyResult['Value'] ) else: ret = getProxyInfo( disableVOMS = True ) if not ret['OK']: self.log.error( 'Invalid Proxy', ret['Message'] ) return S_ERROR( 'Invalid Proxy' ) proxyChain = ret['Value']['chain'] if not 'groupProperties' in ret['Value']: print ret['Value'] print proxyChain.dumpAllToString() self.log.error( 'Invalid Proxy', 'Group has no properties defined' ) return S_ERROR( 'Proxy has no group properties defined' ) if Properties.GENERIC_PILOT in ret['Value']['groupProperties']: proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup ) if not proxyResult['OK']: self.log.error( 'Invalid Proxy', proxyResult['Message'] ) return S_ERROR( 'Failed to setup proxy: %s' % proxyResult[ 'Message' ] ) proxyChain = proxyResult['Value'] return S_OK( proxyChain ) ############################################################################# def __requestProxyFromProxyManager( self, ownerDN, ownerGroup ): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info( "Requesting proxy for %s@%s" % ( ownerDN, ownerGroup ) ) token = gConfig.getValue( "/Security/ProxyToken", "" ) if not token: self.log.info( "No token defined. Trying to download proxy without token" ) token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup, self.defaultProxyLength, token ) if not retVal[ 'OK' ]: self.log.error( 'Could not retrieve proxy' ) self.log.warn( retVal ) os.system( 'dirac-proxy-info' ) sys.stdout.flush() return S_ERROR( 'Error retrieving proxy' ) chain = retVal[ 'Value' ] return S_OK( chain ) ############################################################################# def __checkInstallSoftware( self, jobID, jobParams, resourceParams ): """Checks software requirement of job and whether this is already present before installing software locally. """ if not jobParams.has_key( 'SoftwareDistModule' ): msg = 'Job has no software installation requirement' self.log.verbose( msg ) return S_OK( msg ) self.__report( jobID, 'Matched', 'Installing Software' ) softwareDist = jobParams['SoftwareDistModule'] #HACK: Delete when svn repo is in production! softwareDist = softwareDist.replace( "DIRAC.LHCbSystem.", "LHCbDIRAC.Core." ) #END OF HACK self.log.verbose( 'Found VO Software Distribution module: %s' % ( softwareDist ) ) argumentsDict = {'Job':jobParams, 'CE':resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule( softwareDist, argumentsDict ) if not moduleInstance['OK']: return moduleInstance module = moduleInstance['Value'] result = module.execute() return result ############################################################################# def __submitJob( self, jobID, jobParams, resourceParams, optimizerParams, jobJDL, proxyChain ): """Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ result = self.__createJobWrapper( jobID, jobParams, resourceParams, optimizerParams ) if not result['OK']: return result wrapperFile = result['Value'] self.__report( jobID, 'Matched', 'Submitted To CE' ) wrapperName = os.path.basename( wrapperFile ) self.log.info( 'Submitting %s to %sCE' % ( wrapperName, self.ceName ) ) #Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy['OK']: self.log.error( proxy ) return S_ERROR( 'Payload Proxy Not Found' ) payloadProxy = proxy['Value'] # FIXME: how can we set the batchID before we submit, this makes no sense batchID = 'dc%s' % ( jobID ) submission = self.computingElement.submitJob( wrapperFile, payloadProxy ) ret = S_OK( 'Job submitted' ) if submission['OK']: batchID = submission['Value'] self.log.info( 'Job %s submitted as %s' % ( jobID, batchID ) ) self.log.verbose( 'Set JobParameter: Local batch ID %s' % ( batchID ) ) self.__setJobParam( jobID, 'LocalBatchID', str( batchID ) ) if 'PayloadFailed' in submission: ret['PayloadFailed'] = submission['PayloadFailed'] return ret time.sleep( self.jobSubmissionDelay ) else: self.log.error( 'Job submission failed', jobID ) self.__setJobParam( jobID, 'ErrorMessage', '%s CE Submission Error' % ( self.ceName ) ) if 'ReschedulePayload' in submission: rescheduleFailedJob( jobID, submission['Message'], self.__report ) return S_ERROR( '%s CE Submission Error: %s' % ( self.ceName, submission['Message'] ) ) return ret ############################################################################# def __createJobWrapper( self, jobID, jobParams, resourceParams, optimizerParams ): """This method creates a job wrapper filled with the CE and Job parameters to executed the job. """ arguments = {'Job':jobParams, 'CE':resourceParams, 'Optimizer':optimizerParams} self.log.verbose( 'Job arguments are: \n %s' % ( arguments ) ) workingDir = gConfig.getValue( '/LocalSite/WorkingDirectory', self.siteRoot ) if not os.path.exists( '%s/job/Wrapper' % ( workingDir ) ): try: os.makedirs( '%s/job/Wrapper' % ( workingDir ) ) except Exception: self.log.exception() return S_ERROR( 'Could not create directory for wrapper script' ) jobWrapperFile = '%s/job/Wrapper/Wrapper_%s' % ( workingDir, jobID ) if os.path.exists( jobWrapperFile ): self.log.verbose( 'Removing existing Job Wrapper for %s' % ( jobID ) ) os.remove( jobWrapperFile ) fd = open( self.jobWrapperTemplate, 'r' ) wrapperTemplate = fd.read() fd.close() dateStr = time.strftime( "%Y-%m-%d", time.localtime( time.time() ) ) timeStr = time.strftime( "%H:%M", time.localtime( time.time() ) ) date_time = '%s %s' % ( dateStr, timeStr ) signature = __RCSID__ dPython = sys.executable systemConfig = '' if jobParams.has_key( 'SystemConfig' ): systemConfig = jobParams['SystemConfig'] self.log.verbose( 'Job system configuration requirement is %s' % ( systemConfig ) ) if resourceParams.has_key( 'Root' ): jobPython = '%s/%s/bin/python' % ( resourceParams['Root'], systemConfig ) if os.path.exists( jobPython ): self.log.verbose( 'Found local python for job:\n%s' % ( jobPython ) ) dPython = jobPython else: if systemConfig == 'ANY': self.log.verbose( 'Using standard available python %s for job' % ( dPython ) ) else: self.log.warn( 'Job requested python \n%s\n but this is not available locally' % ( jobPython ) ) else: self.log.warn( 'No LocalSite/Root defined' ) else: self.log.warn( 'Job has no system configuration requirement' ) if not systemConfig or systemConfig.lower() == 'any': systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' ) if not systemConfig: self.log.warn( 'Could not establish SystemConfig' ) logLevel = self.defaultLogLevel if jobParams.has_key( 'LogLevel' ): logLevel = jobParams['LogLevel'] self.log.info( 'Found Job LogLevel JDL parameter with value: %s' % ( logLevel ) ) else: self.log.info( 'Applying default LogLevel JDL parameter with value: %s' % ( logLevel ) ) realPythonPath = os.path.realpath( dPython ) self.log.debug( 'Real python path after resolving links is:' ) self.log.debug( realPythonPath ) dPython = realPythonPath siteRootPython = self.siteRoot self.log.debug( 'DIRACPython is:\n%s' % dPython ) self.log.debug( 'SiteRootPythonDir is:\n%s' % siteRootPython ) wrapperTemplate = wrapperTemplate.replace( "@SIGNATURE@", str( signature ) ) wrapperTemplate = wrapperTemplate.replace( "@JOBID@", str( jobID ) ) wrapperTemplate = wrapperTemplate.replace( "@DATESTRING@", str( date_time ) ) wrapperTemplate = wrapperTemplate.replace( "@JOBARGS@", str( arguments ) ) wrapperTemplate = wrapperTemplate.replace( "@SITEPYTHON@", str( siteRootPython ) ) wrapper = open ( jobWrapperFile, "w" ) wrapper.write( wrapperTemplate ) wrapper.close () jobExeFile = '%s/job/Wrapper/Job%s' % ( workingDir, jobID ) jobFileContents = \ """#!/bin/sh %s %s -o LogLevel=%s """ % ( dPython, jobWrapperFile, logLevel ) jobFile = open( jobExeFile, 'w' ) jobFile.write( jobFileContents ) jobFile.close() return S_OK( jobExeFile ) ############################################################################# def __saveJobJDLRequest( self, jobID, jobJDL ): """Save job JDL local to JobAgent. """ classAdJob = ClassAd( jobJDL ) classAdJob.insertAttributeString( 'LocalCE', self.ceName ) jdlFileName = jobID + '.jdl' jdlFile = open( jdlFileName, 'w' ) jdl = classAdJob.asJDL() jdlFile.write( jdl ) jdlFile.close() return S_OK( jdlFileName ) ############################################################################# def __requestJob( self, resourceJDL ): """Request a single job from the matcher service. """ try: matcher = RPCClient( 'WorkloadManagement/Matcher', timeout = 600 ) result = matcher.requestJob( resourceJDL ) return result except Exception, x: self.log.exception( lException = x ) return S_ERROR( 'Job request to matcher service failed with exception' )
class JobAgent(AgentModule): """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration. """ ############################################################################# def initialize(self, loops=0): """Sets default parameters and creates CE instance """ # Disable monitoring self.am_setOption('MonitoringEnabled', False) # self.log.setLevel('debug') #temporary for debugging self.am_setOption('MaxCycles', loops) ceType = self.am_getOption('CEType', 'InProcess') localCE = gConfig.getValue('/LocalSite/LocalCE', '') if localCE: self.log.info('Defining CE from local configuration = %s' % localCE) ceType = localCE # Create backend Computing Element ceFactory = ComputingElementFactory() self.ceName = ceType ceInstance = ceFactory.getCE(ceType) if not ceInstance['OK']: self.log.warn(ceInstance['Message']) return ceInstance self.computingElement = ceInstance['Value'] result = self.computingElement.getDescription() if not result['OK']: self.log.warn("Can not get the CE description") return result ceDict = result['Value'] self.timeLeft = ceDict.get('CPUTime', 0.0) self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft) self.initTimes = os.times() # Localsite options self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown') self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', 'Unknown') self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5) # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0) self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10) self.fillingMode = self.am_getOption('FillingModeFlag', False) self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', 1000) self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', True) self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', 10) self.jobCount = 0 self.matchFailedCount = 0 self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', '') # Timeleft self.timeLeftUtil = TimeLeft() self.timeLeftError = '' self.scaledCPUTime = 0.0 self.pilotInfoReportedFlag = False return S_OK() ############################################################################# def execute(self): """The JobAgent execution method. """ if self.jobCount: # Only call timeLeft utility after a job has been picked up self.log.info('Attempting to check CPU time left for filling mode') if self.fillingMode: if self.timeLeftError: self.log.warn(self.timeLeftError) return self.__finish(self.timeLeftError) self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft)) if self.timeLeft <= self.minimumTimeLeft: return self.__finish('No more time left') # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft( cpuTimeLeft=self.timeLeft) if not result['OK']: return self.__finish(result['Message']) # Update local configuration to be used by submitted job wrappers localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join('.', self.extraOptions) else: localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg") localCfg.loadFromFile(localConfigFile) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft) localCfg.writeToFile(localConfigFile) else: return self.__finish('Filling Mode is Disabled') self.log.verbose('Job Agent execution loop') available = self.computingElement.available() if not available['OK'] or not available['Value']: self.log.info('Resource is not available') self.log.info(available['Message']) return self.__finish('CE Not Available') self.log.info(available['Message']) result = self.computingElement.getDescription() if not result['OK']: return result ceDict = result['Value'] # Add pilot information gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown') if gridCE != 'Unknown': ceDict['GridCE'] = gridCE if not 'PilotReference' in ceDict: ceDict['PilotReference'] = str(self.pilotReference) ceDict['PilotBenchmark'] = self.cpuFactor ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict('/AgentJobRequirements') if result['OK']: requirementsDict = result['Value'] ceDict.update(requirementsDict) self.log.verbose(ceDict) start = time.time() jobRequest = self.__requestJob(ceDict) matchTime = time.time() - start self.log.info('MatcherTime = %.2f (s)' % (matchTime)) self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches) if not jobRequest['OK']: if re.search('No match found', jobRequest['Message']): self.log.notice('Job request OK: %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find("seconds timeout") != -1: self.log.error('Timeout while requesting job', jobRequest['Message']) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find( "Pilot version does not match") != -1: errorMsg = 'Pilot version does not match the production version' self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, '')) return S_ERROR(jobRequest['Message']) else: self.log.notice('Failed to get jobs: %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) # Reset the Counter self.matchFailedCount = 0 matcherInfo = jobRequest['Value'] if not self.pilotInfoReportedFlag: # Check the flag after the first access to the Matcher self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False) jobID = matcherInfo['JobID'] matcherParams = ['JDL', 'DN', 'Group'] for param in matcherParams: if param not in matcherInfo: self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param)) return self.__finish('Matcher Failed') elif not matcherInfo[param]: self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param)) return self.__finish('Matcher Failed') else: self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param])) jobJDL = matcherInfo['JDL'] jobGroup = matcherInfo['Group'] ownerDN = matcherInfo['DN'] optimizerParams = {} for key in matcherInfo: if key not in matcherParams: optimizerParams[key] = matcherInfo[key] parameters = self.__getJDLParameters(jobJDL) if not parameters['OK']: self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters') self.log.warn(parameters['Message']) return self.__finish('JDL Problem') params = parameters['Value'] if 'JobID' not in params: msg = 'Job has not JobID defined in JDL parameters' self.__report(jobID, 'Failed', msg) self.log.warn(msg) return self.__finish('JDL Problem') else: jobID = params['JobID'] if 'JobType' not in params: self.log.warn('Job has no JobType defined in JDL parameters') jobType = 'Unknown' else: jobType = params['JobType'] if 'CPUTime' not in params: self.log.warn( 'Job has no CPU requirement defined in JDL parameters') if self.extraOptions: params['Arguments'] += ' ' + self.extraOptions params['ExtraOptions'] = self.extraOptions self.log.verbose('Job request successful: \n', jobRequest['Value']) self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType)) self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup)) self.jobCount += 1 try: jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName) jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False) if 'BOINC_JOB_ID' in os.environ: # Report BOINC environment for p in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'): jobReport.setJobParameter(p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown'), sendFlag=False) jobReport.setJobStatus('Matched', 'Job Received by Agent') result = self.__setupProxy(ownerDN, jobGroup) if not result['OK']: return self.__rescheduleFailedJob( jobID, result['Message'], self.stopOnApplicationFailure) proxyChain = result.get('Value') # Save the job jdl for external monitoring self.__saveJobJDLRequest(jobID, jobJDL) software = self.__checkInstallSoftware(jobID, params, ceDict) if not software['OK']: self.log.error('Failed to install software for job', '%s' % (jobID)) errorMsg = software['Message'] if not errorMsg: errorMsg = 'Failed software installation' return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure) self.log.debug('Before %sCE submitJob()' % (self.ceName)) submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain) if not submission['OK']: self.__report(jobID, 'Failed', submission['Message']) return self.__finish(submission['Message']) elif 'PayloadFailed' in submission: # Do not keep running and do not overwrite the Payload error message = 'Payload execution failed with error code %s' % submission[ 'PayloadFailed'] if self.stopOnApplicationFailure: return self.__finish(message, self.stopOnApplicationFailure) else: self.log.info(message) self.log.debug('After %sCE submitJob()' % (self.ceName)) except Exception: self.log.exception() return self.__rescheduleFailedJob( jobID, 'Job processing failed with exception', self.stopOnApplicationFailure) # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?) cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1]) result = self.timeLeftUtil.getTimeLeft(cpuTime) if result['OK']: self.timeLeft = result['Value'] else: if result['Message'] != 'Current batch system is not supported': self.timeLeftError = result['Message'] else: # if the batch system is not defined, use the process time and the CPU normalization defined locally self.timeLeft = self.__getCPUTimeLeft() scaledCPUTime = self.timeLeftUtil.getScaledCPU() self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime)) self.scaledCPUTime = scaledCPUTime return S_OK('Job Agent cycle complete') ############################################################################# def __saveJobJDLRequest(self, jobID, jobJDL): """Save job JDL local to JobAgent. """ classAdJob = ClassAd(jobJDL) classAdJob.insertAttributeString('LocalCE', self.ceName) jdlFileName = jobID + '.jdl' jdlFile = open(jdlFileName, 'w') jdl = classAdJob.asJDL() jdlFile.write(jdl) jdlFile.close() ############################################################################# def __getCPUTimeLeft(self): """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config. """ cpuTime = sum(os.times()[:-1]) self.log.info('Current raw CPU time consumed is %s' % cpuTime) timeleft = self.timeLeft if self.cpuFactor: timeleft -= cpuTime * self.cpuFactor return timeleft ############################################################################# def __changeProxy(self, oldProxy, newProxy): """Can call glexec utility here to set uid or simply log the changeover of a proxy. """ self.log.verbose('Log proxy change (to be instrumented)') return S_OK() ############################################################################# def __setupProxy(self, ownerDN, ownerGroup): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False): proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Failed to setup proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) return S_OK(proxyResult['Value']) else: ret = getProxyInfo(disableVOMS=True) if not ret['OK']: self.log.error('Invalid Proxy', ret['Message']) return S_ERROR('Invalid Proxy') proxyChain = ret['Value']['chain'] if not 'groupProperties' in ret['Value']: print ret['Value'] print proxyChain.dumpAllToString() self.log.error('Invalid Proxy', 'Group has no properties defined') return S_ERROR('Proxy has no group properties defined') groupProps = ret['Value']['groupProperties'] if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps: proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Invalid Proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) proxyChain = proxyResult['Value'] return S_OK(proxyChain) ############################################################################# def __requestProxyFromProxyManager(self, ownerDN, ownerGroup): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup)) token = gConfig.getValue("/Security/ProxyToken", "") if not token: self.log.info( "No token defined. Trying to download proxy without token") token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup, self.defaultProxyLength, token) if not retVal['OK']: self.log.error('Could not retrieve payload proxy', retVal['Message']) self.log.warn(retVal) os.system('dirac-proxy-info') sys.stdout.flush() return S_ERROR('Error retrieving proxy') chain = retVal['Value'] return S_OK(chain) ############################################################################# def __checkInstallSoftware(self, jobID, jobParams, resourceParams): """Checks software requirement of job and whether this is already present before installing software locally. """ if 'SoftwareDistModule' not in jobParams: msg = 'Job has no software installation requirement' self.log.verbose(msg) return S_OK(msg) self.__report(jobID, 'Matched', 'Installing Software') softwareDist = jobParams['SoftwareDistModule'] self.log.verbose('Found VO Software Distribution module: %s' % (softwareDist)) argumentsDict = {'Job': jobParams, 'CE': resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict) if not moduleInstance['OK']: return moduleInstance module = moduleInstance['Value'] return module.execute() ############################################################################# def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, proxyChain): """Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ logLevel = self.am_getOption('DefaultLogLevel', 'INFO') defaultWrapperLocation = self.am_getOption( 'JobWrapperTemplate', 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py') result = createJobWrapper( jobID, jobParams, resourceParams, optimizerParams, extraOptions=self.extraOptions, defaultWrapperLocation=defaultWrapperLocation, log=self.log, logLevel=logLevel) if not result['OK']: return result wrapperFile = result['Value'] self.__report(jobID, 'Matched', 'Submitted To CE') self.log.info('Submitting %s to %sCE' % (os.path.basename(wrapperFile), self.ceName)) # Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy['OK']: self.log.error('Invalid proxy', proxy) return S_ERROR('Payload Proxy Not Found') payloadProxy = proxy['Value'] # FIXME: how can we set the batchID before we submit, this makes no sense batchID = 'dc%s' % (jobID) submission = self.computingElement.submitJob(wrapperFile, payloadProxy) ret = S_OK('Job submitted') if submission['OK']: batchID = submission['Value'] self.log.info('Job %s submitted as %s' % (jobID, batchID)) self.log.verbose('Set JobParameter: Local batch ID %s' % (batchID)) self.__setJobParam(jobID, 'LocalBatchID', str(batchID)) if 'PayloadFailed' in submission: ret['PayloadFailed'] = submission['PayloadFailed'] return ret time.sleep(self.jobSubmissionDelay) else: self.log.error('Job submission failed', jobID) self.__setJobParam(jobID, 'ErrorMessage', '%s CE Submission Error' % (self.ceName)) if 'ReschedulePayload' in submission: rescheduleFailedJob(jobID, submission['Message']) return S_OK( ) # Without this job is marked as failed at line 265 above else: if 'Value' in submission: self.log.error( 'Error in DIRAC JobWrapper:', 'exit code = %s' % (str(submission['Value']))) # make sure the Job is declared Failed self.__report(jobID, 'Failed', submission['Message']) return S_ERROR('%s CE Submission Error: %s' % (self.ceName, submission['Message'])) return ret ############################################################################# def __requestJob(self, ceDict): """Request a single job from the matcher service. """ try: matcher = RPCClient('WorkloadManagement/Matcher', timeout=600) result = matcher.requestJob(ceDict) return result except Exception as x: self.log.exception(lException=x) return S_ERROR( "Job request to matcher service failed with exception") ############################################################################# def __getJDLParameters(self, jdl): """Returns a dictionary of JDL parameters. """ try: parameters = {} # print jdl if not re.search('\[', jdl): jdl = '[' + jdl + ']' classAdJob = ClassAd(jdl) paramsDict = classAdJob.contents for param, value in paramsDict.items(): if value.strip().startswith('{'): self.log.debug('Found list type parameter %s' % (param)) rawValues = value.replace('{', '').replace('}', '').replace( '"', '').split() valueList = [] for val in rawValues: if re.search(',$', val): valueList.append(val[:-1]) else: valueList.append(val) parameters[param] = valueList else: parameters[param] = value.replace('"', '').replace( '{', '"{').replace('}', '}"') self.log.debug('Found standard parameter %s: %s' % (param, parameters[param])) return S_OK(parameters) except Exception, x: self.log.exception(lException=x) return S_ERROR('Exception while extracting JDL parameters for job')
class JobAgent(AgentModule): """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration. """ def __init__(self, agentName, loadName, baseAgentName=False, properties=None): """ Just defines some default parameters """ if not properties: properties = {} super(JobAgent, self).__init__(agentName, loadName, baseAgentName, properties) self.ceName = 'InProcess' self.computingElement = None self.timeLeft = 0.0 self.initTimes = os.times() # Localsite options self.siteName = 'Unknown' self.pilotReference = 'Unknown' self.defaultProxyLength = 86400 * 5 # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = 0.0 self.jobSubmissionDelay = 10 self.fillingMode = False self.minimumTimeLeft = 1000 self.stopOnApplicationFailure = True self.stopAfterFailedMatches = 10 self.jobCount = 0 self.matchFailedCount = 0 self.extraOptions = '' # Timeleft self.timeLeftUtil = None self.timeLeftError = '' self.pilotInfoReportedFlag = False ############################################################################# def initialize(self, loops=0): """Sets default parameters and creates CE instance """ # Disable monitoring, logLevel INFO, limited cycles self.am_setOption('MonitoringEnabled', False) self.am_setOption('MaxCycles', loops) ceType = self.am_getOption('CEType', 'InProcess') localCE = gConfig.getValue('/LocalSite/LocalCE', '') if localCE: self.log.info('Defining CE from local configuration = %s' % localCE) ceType = localCE # Create backend Computing Element ceFactory = ComputingElementFactory() self.ceName = ceType ceInstance = ceFactory.getCE(ceType) if not ceInstance['OK']: self.log.warn(ceInstance['Message']) return ceInstance self.computingElement = ceInstance['Value'] result = self.computingElement.getDescription() if not result['OK']: self.log.warn("Can not get the CE description") return result ceDict = result['Value'] self.timeLeft = ceDict.get('CPUTime', self.timeLeft) self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft) self.initTimes = os.times() # Localsite options self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName) self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', self.pilotReference) self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime', self.defaultProxyLength) # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', self.cpuFactor) self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', self.jobSubmissionDelay) self.fillingMode = self.am_getOption('FillingModeFlag', self.fillingMode) self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', self.minimumTimeLeft) self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure', self.stopOnApplicationFailure) self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches) self.extraOptions = gConfig.getValue('/AgentJobRequirements/ExtraOptions', self.extraOptions) # Timeleft self.timeLeftUtil = TimeLeft() return S_OK() ############################################################################# def execute(self): """The JobAgent execution method. """ if self.jobCount: # Temporary mechanism to pass a shutdown message to the agent if os.path.exists('/var/lib/dirac_drain'): return self.__finish('Node is being drained by an operator') # Only call timeLeft utility after a job has been picked up self.log.info('Attempting to check CPU time left for filling mode') if self.fillingMode: if self.timeLeftError: self.log.warn(self.timeLeftError) return self.__finish(self.timeLeftError) self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft)) if self.timeLeft <= self.minimumTimeLeft: return self.__finish('No more time left') # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft) if not result['OK']: return self.__finish(result['Message']) # Update local configuration to be used by submitted job wrappers localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join('.', self.extraOptions) else: localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg") localCfg.loadFromFile(localConfigFile) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft) localCfg.writeToFile(localConfigFile) else: return self.__finish('Filling Mode is Disabled') self.log.verbose('Job Agent execution loop') result = self.computingElement.available() if not result['OK']: self.log.info('Resource is not available') self.log.info(result['Message']) return self.__finish('CE Not Available') self.log.info(result['Message']) ceInfoDict = result['CEInfoDict'] runningJobs = ceInfoDict.get("RunningJobs") availableSlots = result['Value'] if not availableSlots: if runningJobs: self.log.info('No available slots with %d running jobs' % runningJobs) return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs) else: self.log.info('CE is not available') return self.__finish('CE Not Available') result = self.computingElement.getDescription() if not result['OK']: return result ceDict = result['Value'] # Add pilot information gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown') if gridCE != 'Unknown': ceDict['GridCE'] = gridCE if 'PilotReference' not in ceDict: ceDict['PilotReference'] = str(self.pilotReference) ceDict['PilotBenchmark'] = self.cpuFactor ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict('/AgentJobRequirements') if result['OK']: requirementsDict = result['Value'] ceDict.update(requirementsDict) self.log.info('Requirements:', requirementsDict) self.log.verbose(ceDict) start = time.time() jobRequest = MatcherClient().requestJob(ceDict) matchTime = time.time() - start self.log.info('MatcherTime = %.2f (s)' % (matchTime)) self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches) if not jobRequest['OK']: if re.search('No match found', jobRequest['Message']): self.log.notice('Job request OK: %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find("seconds timeout") != -1: self.log.error('Timeout while requesting job', jobRequest['Message']) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find("Pilot version does not match") != -1: errorMsg = 'Pilot version does not match the production version' self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, '')) return S_ERROR(jobRequest['Message']) else: self.log.notice('Failed to get jobs: %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) # Reset the Counter self.matchFailedCount = 0 matcherInfo = jobRequest['Value'] if not self.pilotInfoReportedFlag: # Check the flag after the first access to the Matcher self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False) jobID = matcherInfo['JobID'] matcherParams = ['JDL', 'DN', 'Group'] for param in matcherParams: if param not in matcherInfo: self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param)) return self.__finish('Matcher Failed') elif not matcherInfo[param]: self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param)) return self.__finish('Matcher Failed') else: self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param])) jobJDL = matcherInfo['JDL'] jobGroup = matcherInfo['Group'] ownerDN = matcherInfo['DN'] optimizerParams = {} for key in matcherInfo: if key not in matcherParams: optimizerParams[key] = matcherInfo[key] parameters = self.__getJDLParameters(jobJDL) if not parameters['OK']: self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters') self.log.warn(parameters['Message']) return self.__finish('JDL Problem') params = parameters['Value'] if 'JobID' not in params: msg = 'Job has not JobID defined in JDL parameters' self.__report(jobID, 'Failed', msg) self.log.warn(msg) return self.__finish('JDL Problem') else: jobID = params['JobID'] if 'JobType' not in params: self.log.warn('Job has no JobType defined in JDL parameters') jobType = 'Unknown' else: jobType = params['JobType'] if 'CPUTime' not in params: self.log.warn('Job has no CPU requirement defined in JDL parameters') # Job requirement for a number of processors processors = int(params.get('NumberOfProcessors', 1)) wholeNode = 'WholeNode' in params if self.extraOptions: params['Arguments'] += ' ' + self.extraOptions params['ExtraOptions'] = self.extraOptions self.log.verbose('Job request successful: \n', jobRequest['Value']) self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType)) self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup)) self.jobCount += 1 try: jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName) jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False) if 'BOINC_JOB_ID' in os.environ: # Report BOINC environment for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'): jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False) jobReport.setJobStatus('Matched', 'Job Received by Agent') result = self.__setupProxy(ownerDN, jobGroup) if not result['OK']: return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure) proxyChain = result.get('Value') # Save the job jdl for external monitoring self.__saveJobJDLRequest(jobID, jobJDL) software = self.__checkInstallSoftware(jobID, params, ceDict) if not software['OK']: self.log.error('Failed to install software for job', '%s' % (jobID)) errorMsg = software['Message'] if not errorMsg: errorMsg = 'Failed software installation' return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure) self.log.debug('Before %sCE submitJob()' % (self.ceName)) result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode) if not result['OK']: self.__report(jobID, 'Failed', result['Message']) return self.__finish(result['Message']) elif 'PayloadFailed' in result: # Do not keep running and do not overwrite the Payload error message = 'Payload execution failed with error code %s' % result['PayloadFailed'] if self.stopOnApplicationFailure: return self.__finish(message, self.stopOnApplicationFailure) else: self.log.info(message) self.log.debug('After %sCE submitJob()' % (self.ceName)) except Exception as subExcept: # pylint: disable=broad-except self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True) return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure) # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?) cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1]) result = self.timeLeftUtil.getTimeLeft(cpuTime, processors) if result['OK']: self.timeLeft = result['Value'] else: if result['Message'] != 'Current batch system is not supported': self.timeLeftError = result['Message'] else: # if the batch system is not defined, use the process time and the CPU normalization defined locally self.timeLeft = self.__getCPUTimeLeft() return S_OK('Job Agent cycle complete') ############################################################################# def __saveJobJDLRequest(self, jobID, jobJDL): """Save job JDL local to JobAgent. """ classAdJob = ClassAd(jobJDL) classAdJob.insertAttributeString('LocalCE', self.ceName) jdlFileName = jobID + '.jdl' jdlFile = open(jdlFileName, 'w') jdl = classAdJob.asJDL() jdlFile.write(jdl) jdlFile.close() ############################################################################# def __getCPUTimeLeft(self): """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config. """ cpuTime = sum(os.times()[:-1]) self.log.info('Current raw CPU time consumed is %s' % cpuTime) timeleft = self.timeLeft if self.cpuFactor: timeleft -= cpuTime * self.cpuFactor return timeleft ############################################################################# def __setupProxy(self, ownerDN, ownerGroup): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False): proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Failed to setup proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) return S_OK(proxyResult['Value']) else: ret = getProxyInfo(disableVOMS=True) if not ret['OK']: self.log.error('Invalid Proxy', ret['Message']) return S_ERROR('Invalid Proxy') proxyChain = ret['Value']['chain'] if 'groupProperties' not in ret['Value']: print ret['Value'] print proxyChain.dumpAllToString() self.log.error('Invalid Proxy', 'Group has no properties defined') return S_ERROR('Proxy has no group properties defined') groupProps = ret['Value']['groupProperties'] if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps: proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Invalid Proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) proxyChain = proxyResult['Value'] return S_OK(proxyChain) ############################################################################# def __requestProxyFromProxyManager(self, ownerDN, ownerGroup): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup)) token = gConfig.getValue("/Security/ProxyToken", "") if not token: self.log.info("No token defined. Trying to download proxy without token") token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup, self.defaultProxyLength, token) if not retVal['OK']: self.log.error('Could not retrieve payload proxy', retVal['Message']) self.log.warn(retVal) os.system('dirac-proxy-info') sys.stdout.flush() return S_ERROR('Error retrieving proxy') chain = retVal['Value'] return S_OK(chain) ############################################################################# def __checkInstallSoftware(self, jobID, jobParams, resourceParams): """Checks software requirement of job and whether this is already present before installing software locally. """ if 'SoftwareDistModule' not in jobParams: msg = 'Job has no software installation requirement' self.log.verbose(msg) return S_OK(msg) self.__report(jobID, 'Matched', 'Installing Software') softwareDist = jobParams['SoftwareDistModule'] self.log.verbose('Found VO Software Distribution module: %s' % (softwareDist)) argumentsDict = {'Job': jobParams, 'CE': resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict) if not moduleInstance['OK']: return moduleInstance module = moduleInstance['Value'] return module.execute() ############################################################################# def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, proxyChain, processors, wholeNode=False): """ Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ logLevel = self.am_getOption('DefaultLogLevel', 'INFO') defaultWrapperLocation = self.am_getOption('JobWrapperTemplate', 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py') jobDesc = {"jobID": jobID, "jobParams": jobParams, "resourceParams": resourceParams, "optimizerParams": optimizerParams, "extraOptions": self.extraOptions, "defaultWrapperLocation": defaultWrapperLocation} result = createJobWrapper(log=self.log, logLevel=logLevel, **jobDesc) if not result['OK']: return result wrapperFile = result['Value'] self.__report(jobID, 'Matched', 'Submitted To CE') self.log.info('Submitting JobWrapper %s to %sCE' % (os.path.basename(wrapperFile), self.ceName)) # Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy['OK']: self.log.error('Invalid proxy', proxy) return S_ERROR('Payload Proxy Not Found') payloadProxy = proxy['Value'] submission = self.computingElement.submitJob(wrapperFile, payloadProxy, numberOfProcessors=processors, wholeNode=wholeNode, jobDesc=jobDesc, log=self.log, logLevel=logLevel) ret = S_OK('Job submitted') if submission['OK']: batchID = submission['Value'] self.log.info('Job %s submitted as %s' % (jobID, batchID)) if 'PayloadFailed' in submission: ret['PayloadFailed'] = submission['PayloadFailed'] return ret time.sleep(self.jobSubmissionDelay) else: self.log.error('Job submission failed', jobID) self.__setJobParam(jobID, 'ErrorMessage', '%s CE Submission Error' % (self.ceName)) if 'ReschedulePayload' in submission: rescheduleFailedJob(jobID, submission['Message']) return S_OK() # Without this, the job is marked as failed else: if 'Value' in submission: self.log.error('Error in DIRAC JobWrapper:', 'exit code = %s' % (str(submission['Value']))) return S_ERROR('%s CE Error: %s' % (self.ceName, submission['Message'])) return ret ############################################################################# def __getJDLParameters(self, jdl): """Returns a dictionary of JDL parameters. """ try: parameters = {} # print jdl if not re.search(r'\[', jdl): jdl = '[' + jdl + ']' classAdJob = ClassAd(jdl) paramsDict = classAdJob.contents for param, value in paramsDict.items(): if value.strip().startswith('{'): self.log.debug('Found list type parameter %s' % (param)) rawValues = value.replace('{', '').replace('}', '').replace('"', '').split() valueList = [] for val in rawValues: if re.search(',$', val): valueList.append(val[:-1]) else: valueList.append(val) parameters[param] = valueList else: parameters[param] = value.replace('"', '').replace('{', '"{').replace('}', '}"') self.log.debug('Found standard parameter %s: %s' % (param, parameters[param])) return S_OK(parameters) except Exception as x: self.log.exception(lException=x) return S_ERROR('Exception while extracting JDL parameters for job') ############################################################################# def __report(self, jobID, status, minorStatus): """Wraps around setJobStatus of state update client """ jobStatus = JobStateUpdateClient().setJobStatus(int(jobID), status, minorStatus, 'JobAgent@%s' % self.siteName) self.log.verbose('setJobStatus(%s,%s,%s,%s)' % (jobID, status, minorStatus, 'JobAgent@%s' % self.siteName)) if not jobStatus['OK']: self.log.warn(jobStatus['Message']) return jobStatus ############################################################################# def __setJobParam(self, jobID, name, value): """Wraps around setJobParameter of state update client """ jobParam = JobStateUpdateClient().setJobParameter(int(jobID), str(name), str(value)) self.log.verbose('setJobParameter(%s,%s,%s)' % (jobID, name, value)) if not jobParam['OK']: self.log.warn(jobParam['Message']) return jobParam ############################################################################# def __finish(self, message, stop=True): """Force the JobAgent to complete gracefully. """ if stop: self.log.info('JobAgent will stop with message "%s", execution complete.' % message) self.am_stopExecution() return S_ERROR(message) else: return S_OK(message) ############################################################################# def __rescheduleFailedJob(self, jobID, message, stop=True): """ Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager """ self.log.warn('Failure during %s' % (message)) jobReport = JobReport(int(jobID), 'JobAgent@%s' % self.siteName) # Setting a job parameter does not help since the job will be rescheduled, # instead set the status with the cause and then another status showing the # reschedule operation. jobReport.setJobStatus(status='Rescheduled', application=message, sendFlag=True) self.log.info('Job will be rescheduled') result = JobManagerClient().rescheduleJob(jobID) if not result['OK']: self.log.error('Failed to reschedule job', result['Message']) return self.__finish('Problem Rescheduling Job', stop) self.log.info('Job Rescheduled %s' % (jobID)) return self.__finish('Job Rescheduled', stop) ############################################################################# def finalize(self): """ Job Agent finalization method """ gridCE = gConfig.getValue('/LocalSite/GridCE', '') queue = gConfig.getValue('/LocalSite/CEQueue', '') wmsAdmin = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdmin.setPilotStatus(str(self.pilotReference), 'Done', gridCE, 'Report from JobAgent', self.siteName, queue) if not result['OK']: self.log.warn(result['Message']) return S_OK()
class JobAgent(AgentModule): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ ############################################################################# def initialize(self, loops=0): """Sets default parameters and creates CE instance """ #Disable monitoring self.am_setOption('MonitoringEnabled', False) # self.log.setLevel('debug') #temporary for debugging self.am_setOption('MaxCycles', loops) ceType = self.am_getOption('CEType', 'InProcess') localCE = gConfig.getValue('/LocalSite/LocalCE', '') if localCE: self.log.info('Defining CE from local configuration = %s' % localCE) ceType = localCE ceFactory = ComputingElementFactory() self.ceName = ceType ceInstance = ceFactory.getCE(ceType) if not ceInstance['OK']: self.log.warn(ceInstance['Message']) return ceInstance self.initTimes = os.times() self.computingElement = ceInstance['Value'] self.diracRoot = os.path.dirname( os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) #Localsite options self.siteRoot = gConfig.getValue('/LocalSite/Root', os.getcwd()) self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown') self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', 'Unknown') self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', 86400 * 5) #Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0) defaultWrapperLocation = 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py' self.jobWrapperTemplate = os.path.join( self.diracRoot, self.am_getOption('JobWrapperTemplate', defaultWrapperLocation)) self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10) self.defaultLogLevel = self.am_getOption('DefaultLogLevel', 'info') self.fillingMode = self.am_getOption('FillingModeFlag', False) self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', True) self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', 10) self.jobCount = 0 self.matchFailedCount = 0 #Timeleft self.timeLeftUtil = TimeLeft() self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', 0.0) self.gridCEQueue = gConfig.getValue( '/Resources/Computing/CEDefaults/GridCEQueue', '') self.timeLeftError = '' self.scaledCPUTime = 0.0 self.pilotInfoReportedFlag = False return S_OK() ############################################################################# def execute(self): """The JobAgent execution method. """ if self.jobCount: #Only call timeLeft utility after a job has been picked up self.log.info('Attempting to check CPU time left for filling mode') if self.fillingMode: if self.timeLeftError: self.log.warn(self.timeLeftError) return self.__finish(self.timeLeftError) self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft)) # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft( cpuTimeLeft=self.timeLeft) if not result['OK']: return self.__finish(result['Message']) else: return self.__finish('Filling Mode is Disabled') self.log.verbose('Job Agent execution loop') available = self.computingElement.available() if not available['OK'] or not available['Value']: self.log.info('Resource is not available') self.log.info(available['Message']) return self.__finish('CE Not Available') self.log.info(available['Message']) result = self.computingElement.getDescription() if not result['OK']: return result ceDict = result['Value'] # Add pilot information gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown') if gridCE != 'Unknown': ceDict['GridCE'] = gridCE if not 'PilotReference' in ceDict: ceDict['PilotReference'] = str(self.pilotReference) ceDict['PilotBenchmark'] = self.cpuFactor ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict('/AgentJobRequirements') if result['OK']: requirementsDict = result['Value'] ceDict.update(requirementsDict) self.log.verbose(ceDict) start = time.time() jobRequest = self.__requestJob(ceDict) matchTime = time.time() - start self.log.info('MatcherTime = %.2f (s)' % (matchTime)) self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches) if not jobRequest['OK']: if re.search('No match found', jobRequest['Message']): self.log.notice('Job request OK: %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find("seconds timeout") != -1: self.log.error(jobRequest['Message']) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find( "Pilot version does not match") != -1: self.log.error(jobRequest['Message']) return S_ERROR(jobRequest['Message']) else: self.log.notice('Failed to get jobs: %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) # Reset the Counter self.matchFailedCount = 0 matcherInfo = jobRequest['Value'] jobID = matcherInfo['JobID'] self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False) matcherParams = ['JDL', 'DN', 'Group'] for param in matcherParams: if not matcherInfo.has_key(param): self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param)) return self.__finish('Matcher Failed') elif not matcherInfo[param]: self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param)) return self.__finish('Matcher Failed') else: self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param])) jobJDL = matcherInfo['JDL'] jobGroup = matcherInfo['Group'] ownerDN = matcherInfo['DN'] optimizerParams = {} for key in matcherInfo.keys(): if not key in matcherParams: value = matcherInfo[key] optimizerParams[key] = value parameters = self.__getJDLParameters(jobJDL) if not parameters['OK']: self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters') self.log.warn(parameters['Message']) return self.__finish('JDL Problem') params = parameters['Value'] if not params.has_key('JobID'): msg = 'Job has not JobID defined in JDL parameters' self.__report(jobID, 'Failed', msg) self.log.warn(msg) return self.__finish('JDL Problem') else: jobID = params['JobID'] if not params.has_key('JobType'): self.log.warn('Job has no JobType defined in JDL parameters') jobType = 'Unknown' else: jobType = params['JobType'] if not params.has_key('SystemConfig'): self.log.warn( 'Job has no system configuration defined in JDL parameters') systemConfig = gConfig.getValue('/LocalSite/Architecture', '') self.log.info( 'Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig) if not systemConfig: self.log.warn('/LocalSite/Architecture is not defined') params['SystemConfig'] = systemConfig else: systemConfig = params['SystemConfig'] if systemConfig.lower() == 'any': systemConfig = gConfig.getValue('/LocalSite/Architecture', '') self.log.info( 'Setting SystemConfig = /LocalSite/Architecture =', '"%s" since it was set to "ANY" in the job description' % systemConfig) if not systemConfig: self.log.warn('/LocalSite/Architecture is not defined') params['SystemConfig'] = systemConfig if not params.has_key('CPUTime'): self.log.warn( 'Job has no CPU requirement defined in JDL parameters') self.log.verbose('Job request successful: \n %s' % (jobRequest['Value'])) self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' % (jobID, jobType, systemConfig)) self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup)) self.jobCount += 1 try: jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName) jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False) if self.gridCEQueue: jobReport.setJobParameter('GridCEQueue', self.gridCEQueue, sendFlag=False) if os.environ.has_key('BOINC_JOB_ID'): # Report BOINC environment for p in [ 'BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName' ]: jobReport.setJobParameter(p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown'), sendFlag=False) jobReport.setJobStatus('Matched', 'Job Received by Agent') # self.__setJobSite( jobID, self.siteName ) if not self.pilotInfoReportedFlag: self.__reportPilotInfo(jobID) result = self.__setupProxy(ownerDN, jobGroup) if not result['OK']: return self.__rescheduleFailedJob( jobID, result['Message'], self.stopOnApplicationFailure) if 'Value' in result and result['Value']: proxyChain = result['Value'] # Is this necessary at all? saveJDL = self.__saveJobJDLRequest(jobID, jobJDL) #self.__report(jobID,'Matched','Job Prepared to Submit') #resourceParameters = self.__getJDLParameters( resourceJDL ) #if not resourceParameters['OK']: # return resourceParameters #resourceParams = resourceParameters['Value'] software = self.__checkInstallSoftware(jobID, params, ceDict) if not software['OK']: self.log.error('Failed to install software for job %s' % (jobID)) errorMsg = software['Message'] if not errorMsg: errorMsg = 'Failed software installation' return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure) self.log.verbose('Before %sCE submitJob()' % (self.ceName)) submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain) if not submission['OK']: self.__report(jobID, 'Failed', submission['Message']) return self.__finish(submission['Message']) elif 'PayloadFailed' in submission: # Do not keep running and do not overwrite the Payload error return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'], self.stopOnApplicationFailure) self.log.verbose('After %sCE submitJob()' % (self.ceName)) except Exception: self.log.exception() return self.__rescheduleFailedJob( jobID, 'Job processing failed with exception', self.stopOnApplicationFailure) currentTimes = list(os.times()) for i in range(len(currentTimes)): currentTimes[i] -= self.initTimes[i] utime, stime, cutime, cstime, elapsed = currentTimes cpuTime = utime + stime + cutime + cstime result = self.timeLeftUtil.getTimeLeft(cpuTime) if result['OK']: self.timeLeft = result['Value'] else: if result['Message'] != 'Current batch system is not supported': self.timeLeftError = result['Message'] else: if self.cpuFactor: # if the batch system is not defined used the CPUNormalizationFactor # defined locally self.timeLeft = self.__getCPUTimeLeft() scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value'] self.__setJobParam(jobID, 'ScaledCPUTime', str(scaledCPUTime - self.scaledCPUTime)) self.scaledCPUTime = scaledCPUTime return S_OK('Job Agent cycle complete') ############################################################################# def __getCPUTimeLeft(self): """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config. """ utime, stime, cutime, cstime, elapsed = os.times() cpuTime = utime + stime + cutime self.log.info('Current raw CPU time consumed is %s' % cpuTime) timeleft = self.timeLeft - cpuTime * self.cpuFactor return timeleft ############################################################################# def __changeProxy(self, oldProxy, newProxy): """Can call glexec utility here to set uid or simply log the changeover of a proxy. """ self.log.verbose('Log proxy change (to be instrumented)') return S_OK() ############################################################################# def __setupProxy(self, ownerDN, ownerGroup): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False): proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Invalid Proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) return S_OK(proxyResult['Value']) else: ret = getProxyInfo(disableVOMS=True) if not ret['OK']: self.log.error('Invalid Proxy', ret['Message']) return S_ERROR('Invalid Proxy') proxyChain = ret['Value']['chain'] if not 'groupProperties' in ret['Value']: print ret['Value'] print proxyChain.dumpAllToString() self.log.error('Invalid Proxy', 'Group has no properties defined') return S_ERROR('Proxy has no group properties defined') groupProps = ret['Value']['groupProperties'] if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps: proxyResult = self.__requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Invalid Proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) proxyChain = proxyResult['Value'] return S_OK(proxyChain) ############################################################################# def __requestProxyFromProxyManager(self, ownerDN, ownerGroup): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup)) token = gConfig.getValue("/Security/ProxyToken", "") if not token: self.log.info( "No token defined. Trying to download proxy without token") token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup, self.defaultProxyLength, token) if not retVal['OK']: self.log.error('Could not retrieve proxy') self.log.warn(retVal) os.system('dirac-proxy-info') sys.stdout.flush() return S_ERROR('Error retrieving proxy') chain = retVal['Value'] return S_OK(chain) ############################################################################# def __checkInstallSoftware(self, jobID, jobParams, resourceParams): """Checks software requirement of job and whether this is already present before installing software locally. """ if not jobParams.has_key('SoftwareDistModule'): msg = 'Job has no software installation requirement' self.log.verbose(msg) return S_OK(msg) self.__report(jobID, 'Matched', 'Installing Software') softwareDist = jobParams['SoftwareDistModule'] self.log.verbose('Found VO Software Distribution module: %s' % (softwareDist)) argumentsDict = {'Job': jobParams, 'CE': resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict) if not moduleInstance['OK']: return moduleInstance module = moduleInstance['Value'] result = module.execute() return result ############################################################################# def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, jobJDL, proxyChain): """Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ result = self.__createJobWrapper(jobID, jobParams, resourceParams, optimizerParams) if not result['OK']: return result wrapperData = result['Value'] wrapperFile = wrapperData['execFile'] self.__report(jobID, 'Matched', 'Submitted To CE') wrapperName = os.path.basename(wrapperFile) self.log.info('Submitting %s to %sCE' % (wrapperName, self.ceName)) #Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy['OK']: self.log.error(proxy) return S_ERROR('Payload Proxy Not Found') payloadProxy = proxy['Value'] # FIXME: how can we set the batchID before we submit, this makes no sense batchID = 'dc%s' % (jobID) submission = self.computingElement.submitJob(wrapperFile, payloadProxy, wrapperData) ret = S_OK('Job submitted') if submission['OK']: batchID = submission['Value'] self.log.info('Job %s submitted as %s' % (jobID, batchID)) self.log.verbose('Set JobParameter: Local batch ID %s' % (batchID)) self.__setJobParam(jobID, 'LocalBatchID', str(batchID)) if 'PayloadFailed' in submission: ret['PayloadFailed'] = submission['PayloadFailed'] return ret time.sleep(self.jobSubmissionDelay) else: self.log.error('Job submission failed', jobID) self.__setJobParam(jobID, 'ErrorMessage', '%s CE Submission Error' % (self.ceName)) if 'ReschedulePayload' in submission: rescheduleFailedJob(jobID, submission['Message'], self.__report) else: if 'Value' in submission: self.log.error( 'Error in DIRAC JobWrapper:', 'exit code = %s' % (str(submission['Value']))) # make sure the Job is declared Failed self.__report(jobID, 'Failed', submission['Message']) return S_ERROR('%s CE Submission Error: %s' % (self.ceName, submission['Message'])) return ret ############################################################################# def __createJobWrapper(self, jobID, jobParams, resourceParams, optimizerParams): """This method creates a job wrapper filled with the CE and Job parameters to executed the job. """ arguments = { 'Job': jobParams, 'CE': resourceParams, 'Optimizer': optimizerParams } self.log.verbose('Job arguments are: \n %s' % (arguments)) workingDir = gConfig.getValue('/LocalSite/WorkingDirectory', self.siteRoot) if not os.path.exists('%s/job/Wrapper' % (workingDir)): try: os.makedirs('%s/job/Wrapper' % (workingDir)) except Exception: self.log.exception() return S_ERROR('Could not create directory for wrapper script') jobWrapperFile = '%s/job/Wrapper/Wrapper_%s' % (workingDir, jobID) if os.path.exists(jobWrapperFile): self.log.verbose('Removing existing Job Wrapper for %s' % (jobID)) os.remove(jobWrapperFile) fd = open(self.jobWrapperTemplate, 'r') wrapperTemplate = fd.read() fd.close() dateStr = time.strftime("%Y-%m-%d", time.localtime(time.time())) timeStr = time.strftime("%H:%M", time.localtime(time.time())) date_time = '%s %s' % (dateStr, timeStr) signature = __RCSID__ dPython = sys.executable systemConfig = '' if jobParams.has_key('SystemConfig'): systemConfig = jobParams['SystemConfig'] self.log.verbose('Job system configuration requirement is %s' % (systemConfig)) if resourceParams.has_key('Root'): jobPython = '%s/%s/bin/python' % (resourceParams['Root'], systemConfig) if os.path.exists(jobPython): self.log.verbose('Found local python for job:\n%s' % (jobPython)) dPython = jobPython else: if systemConfig == 'ANY': self.log.verbose( 'Using standard available python %s for job' % (dPython)) else: self.log.warn( 'Job requested python \n%s\n but this is not available locally' % (jobPython)) else: self.log.warn('No LocalSite/Root defined') else: self.log.warn('Job has no system configuration requirement') if not systemConfig or systemConfig.lower() == 'any': systemConfig = gConfig.getValue('/LocalSite/Architecture', '') if not systemConfig: self.log.warn('Could not establish SystemConfig') logLevel = self.defaultLogLevel if jobParams.has_key('LogLevel'): logLevel = jobParams['LogLevel'] self.log.info('Found Job LogLevel JDL parameter with value: %s' % (logLevel)) else: self.log.info( 'Applying default LogLevel JDL parameter with value: %s' % (logLevel)) realPythonPath = os.path.realpath(dPython) self.log.debug('Real python path after resolving links is:') self.log.debug(realPythonPath) dPython = realPythonPath siteRootPython = self.siteRoot self.log.debug('DIRACPython is:\n%s' % dPython) self.log.debug('SiteRootPythonDir is:\n%s' % siteRootPython) wrapperTemplate = wrapperTemplate.replace("@SIGNATURE@", str(signature)) wrapperTemplate = wrapperTemplate.replace("@JOBID@", str(jobID)) wrapperTemplate = wrapperTemplate.replace("@DATESTRING@", str(date_time)) wrapperTemplate = wrapperTemplate.replace("@JOBARGS@", str(arguments)) wrapperTemplate = wrapperTemplate.replace("@SITEPYTHON@", str(siteRootPython)) wrapper = open(jobWrapperFile, "w") wrapper.write(wrapperTemplate) wrapper.close() jobExeFile = '%s/job/Wrapper/Job%s' % (workingDir, jobID) jobFileContents = \ """#!/bin/sh %s %s -o LogLevel=%s -o /DIRAC/Security/UseServerCertificate=no """ % ( dPython, jobWrapperFile, logLevel ) jobFile = open(jobExeFile, 'w') jobFile.write(jobFileContents) jobFile.close() data = { 'execFile': jobExeFile, 'wrapperTemplate': self.jobWrapperTemplate, 'signature': signature, 'wrapperFile': jobWrapperFile, 'logLevel': logLevel, 'jid': jobID, 'datetime': date_time, 'jobArgs': arguments, 'sitePython': siteRootPython } return S_OK(data) ############################################################################# def __saveJobJDLRequest(self, jobID, jobJDL): """Save job JDL local to JobAgent. """ classAdJob = ClassAd(jobJDL) classAdJob.insertAttributeString('LocalCE', self.ceName) jdlFileName = jobID + '.jdl' jdlFile = open(jdlFileName, 'w') jdl = classAdJob.asJDL() jdlFile.write(jdl) jdlFile.close() return S_OK(jdlFileName) ############################################################################# def __requestJob(self, ceDict): """Request a single job from the matcher service. """ try: matcher = RPCClient('WorkloadManagement/Matcher', timeout=600) result = matcher.requestJob(ceDict) return result except Exception, x: self.log.exception(lException=x) return S_ERROR( 'Job request to matcher service failed with exception')
class JobAgent(AgentModule): """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration. """ def __init__(self, agentName, loadName, baseAgentName=False, properties=None): """ Just defines some default parameters """ if not properties: properties = {} super(JobAgent, self).__init__(agentName, loadName, baseAgentName, properties) self.ceName = 'InProcess' self.computingElement = None self.timeLeft = 0.0 self.initTimes = os.times() # Localsite options self.siteName = 'Unknown' self.pilotReference = 'Unknown' self.defaultProxyLength = 86400 * 5 # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = 0.0 self.jobSubmissionDelay = 10 self.fillingMode = False self.minimumTimeLeft = 1000 self.stopOnApplicationFailure = True self.stopAfterFailedMatches = 10 self.jobCount = 0 self.matchFailedCount = 0 self.extraOptions = '' # Timeleft self.timeLeftUtil = None self.timeLeftError = '' self.pilotInfoReportedFlag = False ############################################################################# def initialize(self, loops=0): """Sets default parameters and creates CE instance """ # Disable monitoring, logLevel INFO, limited cycles self.am_setOption('MonitoringEnabled', False) self.am_setOption('MaxCycles', loops) ceType = self.am_getOption('CEType', self.ceName) localCE = gConfig.getValue('/LocalSite/LocalCE', '') if localCE: self.log.info('Defining CE from local configuration', '= %s' % localCE) ceType = localCE # Create backend Computing Element ceFactory = ComputingElementFactory() self.ceName = ceType ceInstance = ceFactory.getCE(ceType) if not ceInstance['OK']: self.log.warn("Can't instantiate a CE", ceInstance['Message']) return ceInstance self.computingElement = ceInstance['Value'] result = self.computingElement.getDescription() if not result['OK']: self.log.warn("Can not get the CE description") return result if isinstance(result['Value'], list): ceDict = result['Value'][0] else: ceDict = result['Value'] self.timeLeft = ceDict.get('CPUTime', self.timeLeft) self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft) self.initTimes = os.times() # Localsite options self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName) self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', self.pilotReference) self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', self.defaultProxyLength) # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', self.cpuFactor) self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', self.jobSubmissionDelay) self.fillingMode = self.am_getOption('FillingModeFlag', self.fillingMode) self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', self.minimumTimeLeft) self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', self.stopOnApplicationFailure) self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches) self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', self.extraOptions) # Timeleft self.timeLeftUtil = TimeLeft() return S_OK() ############################################################################# def execute(self): """The JobAgent execution method. """ if self.jobCount: # Temporary mechanism to pass a shutdown message to the agent if os.path.exists('/var/lib/dirac_drain'): return self.__finish('Node is being drained by an operator') # Only call timeLeft utility after a job has been picked up self.log.info('Attempting to check CPU time left for filling mode') if self.fillingMode: if self.timeLeftError: self.log.warn( "Disabling filling mode as errors calculating time left", self.timeLeftError) return self.__finish(self.timeLeftError) self.log.info('normalized CPU units remaining in slot', self.timeLeft) if self.timeLeft <= self.minimumTimeLeft: return self.__finish('No more time left') # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft( cpuTimeLeft=self.timeLeft) if not result['OK']: return self.__finish(result['Message']) # Update local configuration to be used by submitted job wrappers localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join('.', self.extraOptions) else: localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg") localCfg.loadFromFile(localConfigFile) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft) localCfg.writeToFile(localConfigFile) else: return self.__finish('Filling Mode is Disabled') self.log.verbose('Job Agent execution loop') result = self.computingElement.available() if not result['OK']: self.log.info('Resource is not available', result['Message']) return self.__finish('CE Not Available') ceInfoDict = result['CEInfoDict'] runningJobs = ceInfoDict.get("RunningJobs") availableSlots = result['Value'] if not availableSlots: if runningJobs: self.log.info('No available slots', '%d running jobs' % runningJobs) return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs) else: self.log.info('CE is not available') return self.__finish('CE Not Available') result = self.computingElement.getDescription() if not result['OK']: return result # We can have several prioritized job retrieval strategies if isinstance(result['Value'], dict): ceDictList = [result['Value']] elif isinstance(result['Value'], list): ceDictList = result['Value'] for ceDict in ceDictList: # Add pilot information gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown') if gridCE != 'Unknown': ceDict['GridCE'] = gridCE if 'PilotReference' not in ceDict: ceDict['PilotReference'] = str(self.pilotReference) ceDict['PilotBenchmark'] = self.cpuFactor ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict('/AgentJobRequirements') if result['OK']: requirementsDict = result['Value'] ceDict.update(requirementsDict) self.log.info('Requirements:', requirementsDict) self.log.verbose('CE dict', ceDict) start = time.time() jobRequest = MatcherClient().requestJob(ceDict) matchTime = time.time() - start self.log.info('MatcherTime', '= %.2f (s)' % (matchTime)) if jobRequest['OK']: break self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches) if not jobRequest['OK']: if re.search('No match found', jobRequest['Message']): self.log.notice('Job request OK, but no match found', ': %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find("seconds timeout") != -1: self.log.error('Timeout while requesting job', jobRequest['Message']) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find( "Pilot version does not match") != -1: errorMsg = 'Pilot version does not match the production version' self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, '')) return S_ERROR(jobRequest['Message']) else: self.log.notice('Failed to get jobs', ': %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) # Reset the Counter self.matchFailedCount = 0 matcherInfo = jobRequest['Value'] if not self.pilotInfoReportedFlag: # Check the flag after the first access to the Matcher self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False) jobID = matcherInfo['JobID'] matcherParams = ['JDL', 'DN', 'Group'] for param in matcherParams: if param not in matcherInfo: self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param)) return self.__finish('Matcher Failed') elif not matcherInfo[param]: self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param)) return self.__finish('Matcher Failed') else: self.log.verbose('Matcher returned', '%s = %s ' % (param, matcherInfo[param])) jobJDL = matcherInfo['JDL'] jobGroup = matcherInfo['Group'] ownerDN = matcherInfo['DN'] optimizerParams = {} for key in matcherInfo: if key not in matcherParams: optimizerParams[key] = matcherInfo[key] parameters = self._getJDLParameters(jobJDL) if not parameters['OK']: self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters') self.log.warn('Could Not Extract JDL Parameters', parameters['Message']) return self.__finish('JDL Problem') params = parameters['Value'] if 'JobID' not in params: msg = 'Job has not JobID defined in JDL parameters' self.__report(jobID, 'Failed', msg) self.log.warn(msg) return self.__finish('JDL Problem') else: jobID = params['JobID'] if 'JobType' not in params: self.log.warn('Job has no JobType defined in JDL parameters') jobType = 'Unknown' else: jobType = params['JobType'] if 'CPUTime' not in params: self.log.warn( 'Job has no CPU requirement defined in JDL parameters') # Job requirement for a number of processors processors = int(params.get('NumberOfProcessors', 1)) wholeNode = 'WholeNode' in params if self.extraOptions: params['Arguments'] += ' ' + self.extraOptions params['ExtraOptions'] = self.extraOptions self.log.verbose('Job request successful: \n', jobRequest['Value']) self.log.info( 'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' % (jobID, jobType, ownerDN, jobGroup)) self.jobCount += 1 try: jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName) jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False) if 'BOINC_JOB_ID' in os.environ: # Report BOINC environment for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'): jobReport.setJobParameter(thisp, gConfig.getValue( '/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False) jobReport.setJobStatus('Matched', 'Job Received by Agent') result = self._setupProxy(ownerDN, jobGroup) if not result['OK']: return self._rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure) proxyChain = result.get('Value') # Save the job jdl for external monitoring self.__saveJobJDLRequest(jobID, jobJDL) software = self._checkInstallSoftware(jobID, params, ceDict) if not software['OK']: self.log.error('Failed to install software for job', '%s' % (jobID)) errorMsg = software['Message'] if not errorMsg: errorMsg = 'Failed software installation' return self._rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure) self.log.debug('Before %sCE submitJob()' % (self.ceName)) result = self._submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode) if not result['OK']: self.__report(jobID, 'Failed', result['Message']) return self.__finish(result['Message']) elif 'PayloadFailed' in result: # Do not keep running and do not overwrite the Payload error message = 'Payload execution failed with error code %s' % result[ 'PayloadFailed'] if self.stopOnApplicationFailure: return self.__finish(message, self.stopOnApplicationFailure) else: self.log.info(message) self.log.debug('After %sCE submitJob()' % (self.ceName)) except Exception as subExcept: # pylint: disable=broad-except self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True) return self._rescheduleFailedJob( jobID, 'Job processing failed with exception', self.stopOnApplicationFailure) # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?) cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1]) result = self.timeLeftUtil.getTimeLeft(cpuTime, processors) if result['OK']: self.timeLeft = result['Value'] else: if result['Message'] != 'Current batch system is not supported': self.timeLeftError = result['Message'] else: # if the batch system is not defined, use the process time and the CPU normalization defined locally self.timeLeft = self._getCPUTimeLeft() return S_OK('Job Agent cycle complete') ############################################################################# def __saveJobJDLRequest(self, jobID, jobJDL): """Save job JDL local to JobAgent. """ classAdJob = ClassAd(jobJDL) classAdJob.insertAttributeString('LocalCE', self.ceName) jdlFileName = jobID + '.jdl' jdlFile = open(jdlFileName, 'w') jdl = classAdJob.asJDL() jdlFile.write(jdl) jdlFile.close() ############################################################################# def _getCPUTimeLeft(self): """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config. """ cpuTime = sum(os.times()[:-1]) self.log.info('Current raw CPU time consumed is %s' % cpuTime) timeleft = self.timeLeft if self.cpuFactor: timeleft -= cpuTime * self.cpuFactor return timeleft ############################################################################# def _setupProxy(self, ownerDN, ownerGroup): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False): proxyResult = self._requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Failed to setup proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) return S_OK(proxyResult['Value']) else: ret = getProxyInfo(disableVOMS=True) if not ret['OK']: self.log.error('Invalid Proxy', ret['Message']) return S_ERROR('Invalid Proxy') proxyChain = ret['Value']['chain'] if 'groupProperties' not in ret['Value']: print(ret['Value']) print(proxyChain.dumpAllToString()) self.log.error('Invalid Proxy', 'Group has no properties defined') return S_ERROR('Proxy has no group properties defined') groupProps = ret['Value']['groupProperties'] if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps: proxyResult = self._requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Invalid Proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) proxyChain = proxyResult['Value'] return S_OK(proxyChain) ############################################################################# def _requestProxyFromProxyManager(self, ownerDN, ownerGroup): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info("Requesting proxy', 'for %s@%s" % (ownerDN, ownerGroup)) token = gConfig.getValue("/Security/ProxyToken", "") if not token: self.log.info( "No token defined. Trying to download proxy without token") token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup, self.defaultProxyLength, token) if not retVal['OK']: self.log.error('Could not retrieve payload proxy', retVal['Message']) os.system('dirac-proxy-info') sys.stdout.flush() return S_ERROR('Error retrieving proxy') chain = retVal['Value'] return S_OK(chain) ############################################################################# def _checkInstallSoftware(self, jobID, jobParams, resourceParams): """Checks software requirement of job and whether this is already present before installing software locally. """ if 'SoftwareDistModule' not in jobParams: msg = 'Job has no software installation requirement' self.log.verbose(msg) return S_OK(msg) self.__report(jobID, 'Matched', 'Installing Software') softwareDist = jobParams['SoftwareDistModule'] self.log.verbose('Found VO Software Distribution module', ': %s' % (softwareDist)) argumentsDict = {'Job': jobParams, 'CE': resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict) if not moduleInstance['OK']: return moduleInstance module = moduleInstance['Value'] return module.execute() ############################################################################# def _submitJob(self, jobID, jobParams, resourceParams, optimizerParams, proxyChain, processors, wholeNode=False): """ Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ logLevel = self.am_getOption('DefaultLogLevel', 'INFO') defaultWrapperLocation = self.am_getOption( 'JobWrapperTemplate', 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py') jobDesc = { "jobID": jobID, "jobParams": jobParams, "resourceParams": resourceParams, "optimizerParams": optimizerParams, "extraOptions": self.extraOptions, "defaultWrapperLocation": defaultWrapperLocation } result = createJobWrapper(log=self.log, logLevel=logLevel, **jobDesc) if not result['OK']: return result wrapperFile = result['Value'] self.__report(jobID, 'Matched', 'Submitted To CE') self.log.info( 'Submitting JobWrapper', '%s to %sCE' % (os.path.basename(wrapperFile), self.ceName)) # Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy['OK']: self.log.error('Invalid proxy', proxy) return S_ERROR('Payload Proxy Not Found') payloadProxy = proxy['Value'] submission = self.computingElement.submitJob( wrapperFile, payloadProxy, numberOfProcessors=processors, wholeNode=wholeNode, jobDesc=jobDesc, log=self.log, logLevel=logLevel) ret = S_OK('Job submitted') if submission['OK']: batchID = submission['Value'] self.log.info('Job submitted', '%s as %s' % (jobID, batchID)) if 'PayloadFailed' in submission: ret['PayloadFailed'] = submission['PayloadFailed'] return ret time.sleep(self.jobSubmissionDelay) else: self.log.error('Job submission failed', jobID) self.__setJobParam(jobID, 'ErrorMessage', '%s CE Submission Error' % (self.ceName)) if 'ReschedulePayload' in submission: rescheduleFailedJob(jobID, submission['Message']) return S_OK() # Without this, the job is marked as failed else: if 'Value' in submission: self.log.error( 'Error in DIRAC JobWrapper:', 'exit code = %s' % (str(submission['Value']))) return S_ERROR('%s CE Error: %s' % (self.ceName, submission['Message'])) return ret ############################################################################# def _getJDLParameters(self, jdl): """Returns a dictionary of JDL parameters. """ try: parameters = {} # print jdl if not re.search(r'\[', jdl): jdl = '[' + jdl + ']' classAdJob = ClassAd(jdl) paramsDict = classAdJob.contents for param, value in paramsDict.items(): if value.strip().startswith('{'): self.log.debug('Found list type parameter %s' % (param)) rawValues = value.replace('{', '').replace('}', '').replace( '"', '').split() valueList = [] for val in rawValues: if re.search(',$', val): valueList.append(val[:-1]) else: valueList.append(val) parameters[param] = valueList else: parameters[param] = value.replace('"', '').replace( '{', '"{').replace('}', '}"') self.log.debug('Found standard parameter %s: %s' % (param, parameters[param])) return S_OK(parameters) except Exception as x: self.log.exception(lException=x) return S_ERROR('Exception while extracting JDL parameters for job') ############################################################################# def __report(self, jobID, status, minorStatus): """Wraps around setJobStatus of state update client """ jobStatus = JobStateUpdateClient().setJobStatus( int(jobID), status, minorStatus, 'JobAgent@%s' % self.siteName) self.log.verbose( 'Setting job status', 'setJobStatus(%s,%s,%s,%s)' % (jobID, status, minorStatus, 'JobAgent@%s' % self.siteName)) if not jobStatus['OK']: self.log.warn('Issue setting the job status', jobStatus['Message']) return jobStatus ############################################################################# def __setJobParam(self, jobID, name, value): """Wraps around setJobParameter of state update client """ jobParam = JobStateUpdateClient().setJobParameter( int(jobID), str(name), str(value)) self.log.verbose('Setting job parameter', 'setJobParameter(%s,%s,%s)' % (jobID, name, value)) if not jobParam['OK']: self.log.warn('Issue setting the job parameter', jobParam['Message']) return jobParam ############################################################################# def __finish(self, message, stop=True): """Force the JobAgent to complete gracefully. """ if stop: self.log.info('JobAgent will stop', 'with message "%s", execution complete.' % message) self.am_stopExecution() return S_ERROR(message) else: return S_OK(message) ############################################################################# def _rescheduleFailedJob(self, jobID, message, stop=True): """ Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager """ self.log.warn('Failure ==> rescheduling', '(during %s)' % (message)) jobReport = JobReport(int(jobID), 'JobAgent@%s' % self.siteName) # Setting a job parameter does not help since the job will be rescheduled, # instead set the status with the cause and then another status showing the # reschedule operation. jobReport.setJobStatus(status='Rescheduled', application=message, sendFlag=True) self.log.info('Job will be rescheduled') result = JobManagerClient().rescheduleJob(jobID) if not result['OK']: self.log.error('Failed to reschedule job', result['Message']) return self.__finish('Problem Rescheduling Job', stop) self.log.info('Job Rescheduled', jobID) return self.__finish('Job Rescheduled', stop) ############################################################################# def finalize(self): """ Job Agent finalization method """ gridCE = gConfig.getValue('/LocalSite/GridCE', '') queue = gConfig.getValue('/LocalSite/CEQueue', '') result = PilotManagerClient().setPilotStatus(str(self.pilotReference), 'Done', gridCE, 'Report from JobAgent', self.siteName, queue) if not result['OK']: self.log.warn('Issue setting the pilot status', result['Message']) return S_OK()
class JobAgent(AgentModule): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ ############################################################################# def initialize(self, loops=0): """Sets default parameters and creates CE instance """ # Disable monitoring self.am_setOption("MonitoringEnabled", False) # self.log.setLevel('debug') #temporary for debugging self.am_setOption("MaxCycles", loops) ceType = self.am_getOption("CEType", "InProcess") localCE = gConfig.getValue("/LocalSite/LocalCE", "") if localCE: self.log.info("Defining CE from local configuration = %s" % localCE) ceType = localCE ceFactory = ComputingElementFactory() self.ceName = ceType ceInstance = ceFactory.getCE(ceType) if not ceInstance["OK"]: self.log.warn(ceInstance["Message"]) return ceInstance self.initTimes = os.times() self.computingElement = ceInstance["Value"] self.diracRoot = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))) # Localsite options self.siteRoot = gConfig.getValue("/LocalSite/Root", os.getcwd()) self.siteName = gConfig.getValue("/LocalSite/Site", "Unknown") self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", "Unknown") self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", 86400 * 5) # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0) defaultWrapperLocation = "DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py" self.jobWrapperTemplate = os.path.join( self.diracRoot, self.am_getOption("JobWrapperTemplate", defaultWrapperLocation) ) self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", 10) self.defaultLogLevel = self.am_getOption("DefaultLogLevel", "info") self.fillingMode = self.am_getOption("FillingModeFlag", False) self.stopOnApplicationFailure = self.am_getOption("StopOnApplicationFailure", True) self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", 10) self.jobCount = 0 self.matchFailedCount = 0 # Timeleft self.timeLeftUtil = TimeLeft() self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", 0.0) self.gridCEQueue = gConfig.getValue("/Resources/Computing/CEDefaults/GridCEQueue", "") self.timeLeftError = "" self.scaledCPUTime = 0.0 self.pilotInfoReportedFlag = False return S_OK() ############################################################################# def execute(self): """The JobAgent execution method. """ if self.jobCount: # Only call timeLeft utility after a job has been picked up self.log.info("Attempting to check CPU time left for filling mode") if self.fillingMode: if self.timeLeftError: self.log.warn(self.timeLeftError) return self.__finish(self.timeLeftError) self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft)) # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft) if not result["OK"]: return self.__finish(result["Message"]) else: return self.__finish("Filling Mode is Disabled") self.log.verbose("Job Agent execution loop") available = self.computingElement.available() if not available["OK"] or not available["Value"]: self.log.info("Resource is not available") self.log.info(available["Message"]) return self.__finish("CE Not Available") self.log.info(available["Message"]) result = self.computingElement.getDescription() if not result["OK"]: return result ceDict = result["Value"] # Add pilot information gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown") if gridCE != "Unknown": ceDict["GridCE"] = gridCE if not "PilotReference" in ceDict: ceDict["PilotReference"] = str(self.pilotReference) ceDict["PilotBenchmark"] = self.cpuFactor ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict("/AgentJobRequirements") if result["OK"]: requirementsDict = result["Value"] ceDict.update(requirementsDict) self.log.verbose(ceDict) start = time.time() jobRequest = self.__requestJob(ceDict) matchTime = time.time() - start self.log.info("MatcherTime = %.2f (s)" % (matchTime)) self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches) if not jobRequest["OK"]: if re.search("No match found", jobRequest["Message"]): self.log.notice("Job request OK: %s" % (jobRequest["Message"])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches) return S_OK(jobRequest["Message"]) elif jobRequest["Message"].find("seconds timeout") != -1: self.log.error(jobRequest["Message"]) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches) return S_OK(jobRequest["Message"]) elif jobRequest["Message"].find("Pilot version does not match") != -1: self.log.error(jobRequest["Message"]) return S_ERROR(jobRequest["Message"]) else: self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches) return S_OK(jobRequest["Message"]) # Reset the Counter self.matchFailedCount = 0 matcherInfo = jobRequest["Value"] jobID = matcherInfo["JobID"] self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False) matcherParams = ["JDL", "DN", "Group"] for param in matcherParams: if not matcherInfo.has_key(param): self.__report(jobID, "Failed", "Matcher did not return %s" % (param)) return self.__finish("Matcher Failed") elif not matcherInfo[param]: self.__report(jobID, "Failed", "Matcher returned null %s" % (param)) return self.__finish("Matcher Failed") else: self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param])) jobJDL = matcherInfo["JDL"] jobGroup = matcherInfo["Group"] ownerDN = matcherInfo["DN"] optimizerParams = {} for key in matcherInfo.keys(): if not key in matcherParams: value = matcherInfo[key] optimizerParams[key] = value parameters = self.__getJDLParameters(jobJDL) if not parameters["OK"]: self.__report(jobID, "Failed", "Could Not Extract JDL Parameters") self.log.warn(parameters["Message"]) return self.__finish("JDL Problem") params = parameters["Value"] if not params.has_key("JobID"): msg = "Job has not JobID defined in JDL parameters" self.__report(jobID, "Failed", msg) self.log.warn(msg) return self.__finish("JDL Problem") else: jobID = params["JobID"] if not params.has_key("JobType"): self.log.warn("Job has no JobType defined in JDL parameters") jobType = "Unknown" else: jobType = params["JobType"] if not params.has_key("SystemConfig"): self.log.warn("Job has no system configuration defined in JDL parameters") systemConfig = gConfig.getValue("/LocalSite/Architecture", "") self.log.info( "Setting system config to /LocalSite/Architecture = %s since it was not specified" % systemConfig ) if not systemConfig: self.log.warn("/LocalSite/Architecture is not defined") params["SystemConfig"] = systemConfig else: systemConfig = params["SystemConfig"] if systemConfig.lower() == "any": systemConfig = gConfig.getValue("/LocalSite/Architecture", "") self.log.info( "Setting SystemConfig = /LocalSite/Architecture =", '"%s" since it was set to "ANY" in the job description' % systemConfig, ) if not systemConfig: self.log.warn("/LocalSite/Architecture is not defined") params["SystemConfig"] = systemConfig if not params.has_key("CPUTime"): self.log.warn("Job has no CPU requirement defined in JDL parameters") self.log.verbose("Job request successful: \n %s" % (jobRequest["Value"])) self.log.info("Received JobID=%s, JobType=%s, SystemConfig=%s" % (jobID, jobType, systemConfig)) self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup)) self.jobCount += 1 try: jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName) jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False) if self.gridCEQueue: jobReport.setJobParameter("GridCEQueue", self.gridCEQueue, sendFlag=False) if os.environ.has_key("BOINC_JOB_ID"): # Report BOINC environment for p in ["BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"]: jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False) jobReport.setJobStatus("Matched", "Job Received by Agent") # self.__setJobSite( jobID, self.siteName ) if not self.pilotInfoReportedFlag: self.__reportPilotInfo(jobID) result = self.__setupProxy(ownerDN, jobGroup) if not result["OK"]: return self.__rescheduleFailedJob(jobID, result["Message"], params, self.stopOnApplicationFailure) if "Value" in result and result["Value"]: proxyChain = result["Value"] software = self.__checkInstallSoftware(jobID, params, ceDict) if not software["OK"]: self.log.error("Failed to install software for job %s" % (jobID)) errorMsg = software["Message"] if not errorMsg: errorMsg = "Failed software installation" return self.__rescheduleFailedJob(jobID, errorMsg, params, self.stopOnApplicationFailure) self.log.verbose("Before %sCE submitJob()" % (self.ceName)) submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain) if not submission["OK"]: self.__report(jobID, "Failed", submission["Message"]) return self.__finish(submission["Message"]) elif "PayloadFailed" in submission: # Do not keep running and do not overwrite the Payload error return self.__finish( "Payload execution failed with error code %s" % submission["PayloadFailed"], self.stopOnApplicationFailure, ) self.log.verbose("After %sCE submitJob()" % (self.ceName)) except Exception: self.log.exception() return self.__rescheduleFailedJob( jobID, "Job processing failed with exception", params, self.stopOnApplicationFailure ) currentTimes = list(os.times()) for i in range(len(currentTimes)): currentTimes[i] -= self.initTimes[i] utime, stime, cutime, cstime, elapsed = currentTimes cpuTime = utime + stime + cutime + cstime result = self.timeLeftUtil.getTimeLeft(cpuTime) if result["OK"]: self.timeLeft = result["Value"] else: if result["Message"] != "Current batch system is not supported": self.timeLeftError = result["Message"] else: if self.cpuFactor: # if the batch system is not defined used the CPUNormalizationFactor # defined locally self.timeLeft = self.__getCPUTimeLeft() scaledCPUTime = self.timeLeftUtil.getScaledCPU()["Value"] self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime)) self.scaledCPUTime = scaledCPUTime return S_OK("Job Agent cycle complete") ############################################################################# def __getCPUTimeLeft(self): """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config. """ utime, stime, cutime, cstime, elapsed = os.times() cpuTime = utime + stime + cutime self.log.info("Current raw CPU time consumed is %s" % cpuTime) timeleft = self.timeLeft - cpuTime * self.cpuFactor return timeleft ############################################################################# def __changeProxy(self, oldProxy, newProxy): """Can call glexec utility here to set uid or simply log the changeover of a proxy. """ self.log.verbose("Log proxy change (to be instrumented)") return S_OK() ############################################################################# def __setupProxy(self, ownerDN, ownerGroup): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue("/DIRAC/Security/UseServerCertificate", False): proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup) if not proxyResult["OK"]: self.log.error("Invalid Proxy", proxyResult["Message"]) return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"]) return S_OK(proxyResult["Value"]) else: ret = getProxyInfo(disableVOMS=True) if not ret["OK"]: self.log.error("Invalid Proxy", ret["Message"]) return S_ERROR("Invalid Proxy") proxyChain = ret["Value"]["chain"] if not "groupProperties" in ret["Value"]: print ret["Value"] print proxyChain.dumpAllToString() self.log.error("Invalid Proxy", "Group has no properties defined") return S_ERROR("Proxy has no group properties defined") groupProps = ret["Value"]["groupProperties"] if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps: proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup) if not proxyResult["OK"]: self.log.error("Invalid Proxy", proxyResult["Message"]) return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"]) proxyChain = proxyResult["Value"] return S_OK(proxyChain) ############################################################################# def __requestProxyFromProxyManager(self, ownerDN, ownerGroup): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup)) token = gConfig.getValue("/Security/ProxyToken", "") if not token: self.log.info("No token defined. Trying to download proxy without token") token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup, self.defaultProxyLength, token) if not retVal["OK"]: self.log.error("Could not retrieve proxy") self.log.warn(retVal) os.system("dirac-proxy-info") sys.stdout.flush() return S_ERROR("Error retrieving proxy") chain = retVal["Value"] return S_OK(chain) ############################################################################# def __checkInstallSoftware(self, jobID, jobParams, resourceParams): """Checks software requirement of job and whether this is already present before installing software locally. """ if not jobParams.has_key("SoftwareDistModule"): msg = "Job has no software installation requirement" self.log.verbose(msg) return S_OK(msg) self.__report(jobID, "Matched", "Installing Software") softwareDist = jobParams["SoftwareDistModule"] self.log.verbose("Found VO Software Distribution module: %s" % (softwareDist)) argumentsDict = {"Job": jobParams, "CE": resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict) if not moduleInstance["OK"]: return moduleInstance module = moduleInstance["Value"] result = module.execute() return result ############################################################################# def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, jobJDL, proxyChain): """Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ result = self.__createJobWrapper(jobID, jobParams, resourceParams, optimizerParams) if not result["OK"]: return result wrapperData = result["Value"] wrapperFile = wrapperData["execFile"] self.__report(jobID, "Matched", "Submitted To CE") wrapperName = os.path.basename(wrapperFile) self.log.info("Submitting %s to %sCE" % (wrapperName, self.ceName)) # Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy["OK"]: self.log.error(proxy) return S_ERROR("Payload Proxy Not Found") payloadProxy = proxy["Value"] # FIXME: how can we set the batchID before we submit, this makes no sense batchID = "dc%s" % (jobID) submission = self.computingElement.submitJob(wrapperFile, payloadProxy, wrapperData) ret = S_OK("Job submitted") if submission["OK"]: batchID = submission["Value"] self.log.info("Job %s submitted as %s" % (jobID, batchID)) self.log.verbose("Set JobParameter: Local batch ID %s" % (batchID)) self.__setJobParam(jobID, "LocalBatchID", str(batchID)) if "PayloadFailed" in submission: ret["PayloadFailed"] = submission["PayloadFailed"] return ret time.sleep(self.jobSubmissionDelay) else: self.log.error("Job submission failed", jobID) self.__setJobParam(jobID, "ErrorMessage", "%s CE Submission Error" % (self.ceName)) if "ReschedulePayload" in submission: rescheduleFailedJob(jobID, submission["Message"], self.__report) else: if "Value" in submission: self.log.error("Error in DIRAC JobWrapper:", "exit code = %s" % (str(submission["Value"]))) # make sure the Job is declared Failed self.__report(jobID, "Failed", submission["Message"]) return S_ERROR("%s CE Submission Error: %s" % (self.ceName, submission["Message"])) return ret ############################################################################# def __createJobWrapper(self, jobID, jobParams, resourceParams, optimizerParams): """This method creates a job wrapper filled with the CE and Job parameters to executed the job. """ arguments = {"Job": jobParams, "CE": resourceParams, "Optimizer": optimizerParams} self.log.verbose("Job arguments are: \n %s" % (arguments)) workingDir = gConfig.getValue("/LocalSite/WorkingDirectory", self.siteRoot) if not os.path.exists("%s/job/Wrapper" % (workingDir)): try: os.makedirs("%s/job/Wrapper" % (workingDir)) except Exception: self.log.exception() return S_ERROR("Could not create directory for wrapper script") jobWrapperFile = "%s/job/Wrapper/Wrapper_%s" % (workingDir, jobID) if os.path.exists(jobWrapperFile): self.log.verbose("Removing existing Job Wrapper for %s" % (jobID)) os.remove(jobWrapperFile) fd = open(self.jobWrapperTemplate, "r") wrapperTemplate = fd.read() fd.close() dateStr = time.strftime("%Y-%m-%d", time.localtime(time.time())) timeStr = time.strftime("%H:%M", time.localtime(time.time())) date_time = "%s %s" % (dateStr, timeStr) signature = __RCSID__ dPython = sys.executable systemConfig = "" if jobParams.has_key("SystemConfig"): systemConfig = jobParams["SystemConfig"] self.log.verbose("Job system configuration requirement is %s" % (systemConfig)) if resourceParams.has_key("Root"): jobPython = "%s/%s/bin/python" % (resourceParams["Root"], systemConfig) if os.path.exists(jobPython): self.log.verbose("Found local python for job:\n%s" % (jobPython)) dPython = jobPython else: if systemConfig == "ANY": self.log.verbose("Using standard available python %s for job" % (dPython)) else: self.log.warn("Job requested python \n%s\n but this is not available locally" % (jobPython)) else: self.log.warn("No LocalSite/Root defined") else: self.log.warn("Job has no system configuration requirement") if not systemConfig or systemConfig.lower() == "any": systemConfig = gConfig.getValue("/LocalSite/Architecture", "") if not systemConfig: self.log.warn("Could not establish SystemConfig") logLevel = self.defaultLogLevel if jobParams.has_key("LogLevel"): logLevel = jobParams["LogLevel"] self.log.info("Found Job LogLevel JDL parameter with value: %s" % (logLevel)) else: self.log.info("Applying default LogLevel JDL parameter with value: %s" % (logLevel)) realPythonPath = os.path.realpath(dPython) self.log.debug("Real python path after resolving links is:") self.log.debug(realPythonPath) dPython = realPythonPath siteRootPython = self.siteRoot self.log.debug("DIRACPython is:\n%s" % dPython) self.log.debug("SiteRootPythonDir is:\n%s" % siteRootPython) wrapperTemplate = wrapperTemplate.replace("@SIGNATURE@", str(signature)) wrapperTemplate = wrapperTemplate.replace("@JOBID@", str(jobID)) wrapperTemplate = wrapperTemplate.replace("@DATESTRING@", str(date_time)) wrapperTemplate = wrapperTemplate.replace("@JOBARGS@", str(arguments)) wrapperTemplate = wrapperTemplate.replace("@SITEPYTHON@", str(siteRootPython)) wrapper = open(jobWrapperFile, "w") wrapper.write(wrapperTemplate) wrapper.close() jobExeFile = "%s/job/Wrapper/Job%s" % (workingDir, jobID) jobFileContents = """#!/bin/sh %s %s -o LogLevel=%s -o /DIRAC/Security/UseServerCertificate=no """ % ( dPython, jobWrapperFile, logLevel, ) jobFile = open(jobExeFile, "w") jobFile.write(jobFileContents) jobFile.close() data = { "execFile": jobExeFile, "wrapperTemplate": self.jobWrapperTemplate, "signature": signature, "wrapperFile": jobWrapperFile, "logLevel": logLevel, "jid": jobID, "datetime": date_time, "jobArgs": arguments, "sitePython": siteRootPython, } return S_OK(data) ############################################################################# def __requestJob(self, ceDict): """Request a single job from the matcher service. """ try: matcher = RPCClient("WorkloadManagement/Matcher", timeout=600) result = matcher.requestJob(ceDict) return result except Exception, x: self.log.exception(lException=x) return S_ERROR("Job request to matcher service failed with exception")
class JobAgent(AgentModule): """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration. """ ############################################################################# def initialize(self, loops=0): """Sets default parameters and creates CE instance """ # Disable monitoring self.am_setOption("MonitoringEnabled", False) # self.log.setLevel('debug') #temporary for debugging self.am_setOption("MaxCycles", loops) ceType = self.am_getOption("CEType", "InProcess") localCE = gConfig.getValue("/LocalSite/LocalCE", "") if localCE: self.log.info("Defining CE from local configuration = %s" % localCE) ceType = localCE # Create backend Computing Element ceFactory = ComputingElementFactory() self.ceName = ceType ceInstance = ceFactory.getCE(ceType) if not ceInstance["OK"]: self.log.warn(ceInstance["Message"]) return ceInstance self.computingElement = ceInstance["Value"] result = self.computingElement.getDescription() if not result["OK"]: self.log.warn("Can not get the CE description") return result ceDict = result["Value"] self.timeLeft = ceDict.get("CPUTime", 0.0) self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", self.timeLeft) self.initTimes = os.times() # Localsite options self.siteName = gConfig.getValue("/LocalSite/Site", "Unknown") self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", "Unknown") self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", 86400 * 5) # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0) self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", 10) self.fillingMode = self.am_getOption("FillingModeFlag", False) self.minimumTimeLeft = self.am_getOption("MinimumTimeLeft", 1000) self.stopOnApplicationFailure = self.am_getOption("StopOnApplicationFailure", True) self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", 10) self.jobCount = 0 self.matchFailedCount = 0 self.extraOptions = gConfig.getValue("/AgentJobRequirements/ExtraOptions", "") # Timeleft self.timeLeftUtil = TimeLeft() self.timeLeftError = "" self.scaledCPUTime = 0.0 self.pilotInfoReportedFlag = False return S_OK() ############################################################################# def execute(self): """The JobAgent execution method. """ if self.jobCount: # Only call timeLeft utility after a job has been picked up self.log.info("Attempting to check CPU time left for filling mode") if self.fillingMode: if self.timeLeftError: self.log.warn(self.timeLeftError) return self.__finish(self.timeLeftError) self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft)) if self.timeLeft <= self.minimumTimeLeft: return self.__finish("No more time left") # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft) if not result["OK"]: return self.__finish(result["Message"]) # Update local configuration to be used by submitted job wrappers localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join(".", self.extraOptions) else: localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg") localCfg.loadFromFile(localConfigFile) if not localCfg.isSection("/LocalSite"): localCfg.createNewSection("/LocalSite") localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft) localCfg.writeToFile(localConfigFile) else: return self.__finish("Filling Mode is Disabled") self.log.verbose("Job Agent execution loop") available = self.computingElement.available() if not available["OK"] or not available["Value"]: self.log.info("Resource is not available") self.log.info(available["Message"]) return self.__finish("CE Not Available") self.log.info(available["Message"]) result = self.computingElement.getDescription() if not result["OK"]: return result ceDict = result["Value"] # Add pilot information gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown") if gridCE != "Unknown": ceDict["GridCE"] = gridCE if not "PilotReference" in ceDict: ceDict["PilotReference"] = str(self.pilotReference) ceDict["PilotBenchmark"] = self.cpuFactor ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict("/AgentJobRequirements") if result["OK"]: requirementsDict = result["Value"] ceDict.update(requirementsDict) self.log.verbose(ceDict) start = time.time() jobRequest = self.__requestJob(ceDict) matchTime = time.time() - start self.log.info("MatcherTime = %.2f (s)" % (matchTime)) self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches) if not jobRequest["OK"]: if re.search("No match found", jobRequest["Message"]): self.log.notice("Job request OK: %s" % (jobRequest["Message"])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches) return S_OK(jobRequest["Message"]) elif jobRequest["Message"].find("seconds timeout") != -1: self.log.error("Timeout while requesting job", jobRequest["Message"]) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches) return S_OK(jobRequest["Message"]) elif jobRequest["Message"].find("Pilot version does not match") != -1: errorMsg = "Pilot version does not match the production version" self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, "")) return S_ERROR(jobRequest["Message"]) else: self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches) return S_OK(jobRequest["Message"]) # Reset the Counter self.matchFailedCount = 0 matcherInfo = jobRequest["Value"] if not self.pilotInfoReportedFlag: # Check the flag after the first access to the Matcher self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False) jobID = matcherInfo["JobID"] matcherParams = ["JDL", "DN", "Group"] for param in matcherParams: if param not in matcherInfo: self.__report(jobID, "Failed", "Matcher did not return %s" % (param)) return self.__finish("Matcher Failed") elif not matcherInfo[param]: self.__report(jobID, "Failed", "Matcher returned null %s" % (param)) return self.__finish("Matcher Failed") else: self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param])) jobJDL = matcherInfo["JDL"] jobGroup = matcherInfo["Group"] ownerDN = matcherInfo["DN"] optimizerParams = {} for key in matcherInfo: if key not in matcherParams: optimizerParams[key] = matcherInfo[key] parameters = self.__getJDLParameters(jobJDL) if not parameters["OK"]: self.__report(jobID, "Failed", "Could Not Extract JDL Parameters") self.log.warn(parameters["Message"]) return self.__finish("JDL Problem") params = parameters["Value"] if "JobID" not in params: msg = "Job has not JobID defined in JDL parameters" self.__report(jobID, "Failed", msg) self.log.warn(msg) return self.__finish("JDL Problem") else: jobID = params["JobID"] if "JobType" not in params: self.log.warn("Job has no JobType defined in JDL parameters") jobType = "Unknown" else: jobType = params["JobType"] if "CPUTime" not in params: self.log.warn("Job has no CPU requirement defined in JDL parameters") if self.extraOptions: params["Arguments"] += " " + self.extraOptions params["ExtraOptions"] = self.extraOptions self.log.verbose("Job request successful: \n", jobRequest["Value"]) self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType)) self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup)) self.jobCount += 1 try: jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName) jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False) if "BOINC_JOB_ID" in os.environ: # Report BOINC environment for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"): jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False) jobReport.setJobStatus("Matched", "Job Received by Agent") result = self.__setupProxy(ownerDN, jobGroup) if not result["OK"]: return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure) proxyChain = result.get("Value") # Save the job jdl for external monitoring self.__saveJobJDLRequest(jobID, jobJDL) software = self.__checkInstallSoftware(jobID, params, ceDict) if not software["OK"]: self.log.error("Failed to install software for job", "%s" % (jobID)) errorMsg = software["Message"] if not errorMsg: errorMsg = "Failed software installation" return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure) self.log.debug("Before %sCE submitJob()" % (self.ceName)) submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain) if not submission["OK"]: self.__report(jobID, "Failed", submission["Message"]) return self.__finish(submission["Message"]) elif "PayloadFailed" in submission: # Do not keep running and do not overwrite the Payload error message = "Payload execution failed with error code %s" % submission["PayloadFailed"] if self.stopOnApplicationFailure: return self.__finish(message, self.stopOnApplicationFailure) else: self.log.info(message) self.log.debug("After %sCE submitJob()" % (self.ceName)) except Exception: self.log.exception() return self.__rescheduleFailedJob( jobID, "Job processing failed with exception", self.stopOnApplicationFailure ) # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?) cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1]) result = self.timeLeftUtil.getTimeLeft(cpuTime) if result["OK"]: self.timeLeft = result["Value"] else: if result["Message"] != "Current batch system is not supported": self.timeLeftError = result["Message"] else: # if the batch system is not defined, use the process time and the CPU normalization defined locally self.timeLeft = self.__getCPUTimeLeft() scaledCPUTime = self.timeLeftUtil.getScaledCPU() self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime)) self.scaledCPUTime = scaledCPUTime return S_OK("Job Agent cycle complete") ############################################################################# def __saveJobJDLRequest(self, jobID, jobJDL): """Save job JDL local to JobAgent. """ classAdJob = ClassAd(jobJDL) classAdJob.insertAttributeString("LocalCE", self.ceName) jdlFileName = jobID + ".jdl" jdlFile = open(jdlFileName, "w") jdl = classAdJob.asJDL() jdlFile.write(jdl) jdlFile.close() ############################################################################# def __getCPUTimeLeft(self): """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config. """ cpuTime = sum(os.times()[:-1]) self.log.info("Current raw CPU time consumed is %s" % cpuTime) timeleft = self.timeLeft if self.cpuFactor: timeleft -= cpuTime * self.cpuFactor return timeleft ############################################################################# def __changeProxy(self, oldProxy, newProxy): """Can call glexec utility here to set uid or simply log the changeover of a proxy. """ self.log.verbose("Log proxy change (to be instrumented)") return S_OK() ############################################################################# def __setupProxy(self, ownerDN, ownerGroup): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue("/DIRAC/Security/UseServerCertificate", False): proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup) if not proxyResult["OK"]: self.log.error("Failed to setup proxy", proxyResult["Message"]) return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"]) return S_OK(proxyResult["Value"]) else: ret = getProxyInfo(disableVOMS=True) if not ret["OK"]: self.log.error("Invalid Proxy", ret["Message"]) return S_ERROR("Invalid Proxy") proxyChain = ret["Value"]["chain"] if not "groupProperties" in ret["Value"]: print ret["Value"] print proxyChain.dumpAllToString() self.log.error("Invalid Proxy", "Group has no properties defined") return S_ERROR("Proxy has no group properties defined") groupProps = ret["Value"]["groupProperties"] if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps: proxyResult = self.__requestProxyFromProxyManager(ownerDN, ownerGroup) if not proxyResult["OK"]: self.log.error("Invalid Proxy", proxyResult["Message"]) return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"]) proxyChain = proxyResult["Value"] return S_OK(proxyChain) ############################################################################# def __requestProxyFromProxyManager(self, ownerDN, ownerGroup): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info("Requesting proxy for %s@%s" % (ownerDN, ownerGroup)) token = gConfig.getValue("/Security/ProxyToken", "") if not token: self.log.info("No token defined. Trying to download proxy without token") token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup(ownerDN, ownerGroup, self.defaultProxyLength, token) if not retVal["OK"]: self.log.error("Could not retrieve payload proxy", retVal["Message"]) self.log.warn(retVal) os.system("dirac-proxy-info") sys.stdout.flush() return S_ERROR("Error retrieving proxy") chain = retVal["Value"] return S_OK(chain) ############################################################################# def __checkInstallSoftware(self, jobID, jobParams, resourceParams): """Checks software requirement of job and whether this is already present before installing software locally. """ if "SoftwareDistModule" not in jobParams: msg = "Job has no software installation requirement" self.log.verbose(msg) return S_OK(msg) self.__report(jobID, "Matched", "Installing Software") softwareDist = jobParams["SoftwareDistModule"] self.log.verbose("Found VO Software Distribution module: %s" % (softwareDist)) argumentsDict = {"Job": jobParams, "CE": resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict) if not moduleInstance["OK"]: return moduleInstance module = moduleInstance["Value"] return module.execute() ############################################################################# def __submitJob(self, jobID, jobParams, resourceParams, optimizerParams, proxyChain): """Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ logLevel = self.am_getOption("DefaultLogLevel", "INFO") defaultWrapperLocation = self.am_getOption( "JobWrapperTemplate", "DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py" ) result = createJobWrapper( jobID, jobParams, resourceParams, optimizerParams, extraOptions=self.extraOptions, defaultWrapperLocation=defaultWrapperLocation, log=self.log, logLevel=logLevel, ) if not result["OK"]: return result wrapperFile = result["Value"] self.__report(jobID, "Matched", "Submitted To CE") self.log.info("Submitting %s to %sCE" % (os.path.basename(wrapperFile), self.ceName)) # Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy["OK"]: self.log.error("Invalid proxy", proxy) return S_ERROR("Payload Proxy Not Found") payloadProxy = proxy["Value"] # FIXME: how can we set the batchID before we submit, this makes no sense batchID = "dc%s" % (jobID) submission = self.computingElement.submitJob(wrapperFile, payloadProxy) ret = S_OK("Job submitted") if submission["OK"]: batchID = submission["Value"] self.log.info("Job %s submitted as %s" % (jobID, batchID)) self.log.verbose("Set JobParameter: Local batch ID %s" % (batchID)) self.__setJobParam(jobID, "LocalBatchID", str(batchID)) if "PayloadFailed" in submission: ret["PayloadFailed"] = submission["PayloadFailed"] return ret time.sleep(self.jobSubmissionDelay) else: self.log.error("Job submission failed", jobID) self.__setJobParam(jobID, "ErrorMessage", "%s CE Submission Error" % (self.ceName)) if "ReschedulePayload" in submission: rescheduleFailedJob(jobID, submission["Message"]) return S_OK() # Without this job is marked as failed at line 265 above else: if "Value" in submission: self.log.error("Error in DIRAC JobWrapper:", "exit code = %s" % (str(submission["Value"]))) # make sure the Job is declared Failed self.__report(jobID, "Failed", submission["Message"]) return S_ERROR("%s CE Submission Error: %s" % (self.ceName, submission["Message"])) return ret ############################################################################# def __requestJob(self, ceDict): """Request a single job from the matcher service. """ try: matcher = RPCClient("WorkloadManagement/Matcher", timeout=600) result = matcher.requestJob(ceDict) return result except Exception as x: self.log.exception(lException=x) return S_ERROR("Job request to matcher service failed with exception") ############################################################################# def __getJDLParameters(self, jdl): """Returns a dictionary of JDL parameters. """ try: parameters = {} # print jdl if not re.search("\[", jdl): jdl = "[" + jdl + "]" classAdJob = ClassAd(jdl) paramsDict = classAdJob.contents for param, value in paramsDict.items(): if value.strip().startswith("{"): self.log.debug("Found list type parameter %s" % (param)) rawValues = value.replace("{", "").replace("}", "").replace('"', "").split() valueList = [] for val in rawValues: if re.search(",$", val): valueList.append(val[:-1]) else: valueList.append(val) parameters[param] = valueList else: parameters[param] = value.replace('"', "").replace("{", '"{').replace("}", '}"') self.log.debug("Found standard parameter %s: %s" % (param, parameters[param])) return S_OK(parameters) except Exception, x: self.log.exception(lException=x) return S_ERROR("Exception while extracting JDL parameters for job")