def test_getTimeLeft(mocker, batch, requiredVariables, returnValue, expected_1, expected_2): """Test getTimeLeft()""" mocker.patch( "DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft.runCommand", return_value=S_OK(returnValue)) tl = TimeLeft() batchSystemName = "%sResourceUsage" % batch batchSystemPath = "DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s" % batchSystemName batchPlugin = __import__(batchSystemPath, globals(), locals(), [batchSystemName]) # Need to be reloaded to update the mock within the module, else, it will reuse the one when loaded the first time reload(batchPlugin) batchStr = "batchPlugin.%s()" % (batchSystemName) tl.batchPlugin = eval(batchStr) tl.cpuPower = 10.0 # Update attributes of the batch systems to get scaled CPU tl.batchPlugin.__dict__.update(requiredVariables) res = tl.getTimeLeft() assert res["OK"] is expected_1 if res["OK"]: assert res["Value"] == expected_2
def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, jobArgs={}): """Constructor, takes system flag as argument.""" self.stopSigStartSeconds = int(jobArgs.get("StopSigStartSeconds", 1800)) # 30 minutes self.stopSigFinishSeconds = int(jobArgs.get("StopSigFinishSeconds", 1800)) # 30 minutes self.stopSigNumber = int(jobArgs.get("StopSigNumber", 2)) # SIGINT self.stopSigRegex = jobArgs.get("StopSigRegex", None) self.stopSigSent = False self.log = gLogger.getSubLogger("Watchdog") self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUTime = jobCPUTime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.profiler = Profiler(pid) self.checkError = "" self.currentStats = {} self.initialized = False self.count = 0 # defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.wallClockCheckSeconds = 5 * 60 # 5 minutes self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.wallClockCheckCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False self.cpuPower = 1.0 self.processors = processors
def initialize(self): """Sets default parameters and creates CE instance""" # Disable monitoring self.am_disableMonitoring() localCE = gConfig.getValue("/LocalSite/LocalCE", self.ceName) if localCE != self.ceName: self.log.info("Defining Inner CE from local configuration", "= %s" % localCE) # Create backend Computing Element result = self._initializeComputingElement(localCE) if not result["OK"]: return result result = self._getCEDict(self.computingElement) if not result["OK"]: return result ceDict = result["Value"][0] self.initTimeLeft = ceDict.get("CPUTime", self.initTimeLeft) self.initTimeLeft = gConfig.getValue( "/Resources/Computing/CEDefaults/MaxCPUTime", self.initTimeLeft) self.timeLeft = self.initTimeLeft self.initTimes = os.times() # Localsite options self.siteName = siteName() self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", self.pilotReference) self.defaultProxyLength = gConfig.getValue( "/Registry/DefaultProxyLifeTime", self.defaultProxyLength) # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", self.cpuFactor) self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", self.jobSubmissionDelay) self.fillingMode = self.am_getOption("FillingModeFlag", self.fillingMode) self.minimumTimeLeft = self.am_getOption("MinimumTimeLeft", self.minimumTimeLeft) self.stopOnApplicationFailure = self.am_getOption( "StopOnApplicationFailure", self.stopOnApplicationFailure) self.stopAfterFailedMatches = self.am_getOption( "StopAfterFailedMatches", self.stopAfterFailedMatches) self.extraOptions = gConfig.getValue( "/AgentJobRequirements/ExtraOptions", self.extraOptions) # Utilities self.timeLeftUtil = TimeLeft() return S_OK()
def test_getScaledCPU(mocker, batch, requiredVariables, returnValue, expected): """ Test getScaledCPU() """ mocker.patch( "DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft.runCommand", return_value=S_OK(returnValue)) tl = TimeLeft() res = tl.getScaledCPU() assert res == 0 tl.scaleFactor = 5.0 tl.normFactor = 5.0 batchSystemName = '%sResourceUsage' % batch batchSystemPath = 'DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s' % batchSystemName batchPlugin = __import__(batchSystemPath, globals(), locals(), [batchSystemName]) # pylint: disable=unused-variable # Need to be reloaded to update the mock within the module, else, it will reuse the one when loaded the first time reload_module(batchPlugin) batchStr = 'batchPlugin.%s()' % (batchSystemName) tl.batchPlugin = eval(batchStr) # Update attributes of the batch systems to get scaled CPU tl.batchPlugin.__dict__.update(requiredVariables) res = tl.getScaledCPU() assert res == expected
def test__computeCPUWorkLeft(mocker, initTimeLeft, timeLeft, cpuFactor, mockTimeLeftReply, expectedTimeLeft): """Test JobAgent()._computeCPUWorkLeft()""" mocker.patch( "DIRAC.WorkloadManagementSystem.Agent.JobAgent.AgentModule.__init__") mocker.patch( "DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft.TimeLeft.getTimeLeft", return_value=mockTimeLeftReply) jobAgent = JobAgent("Test", "Test1") jobAgent.log = gLogger jobAgent.log.setLevel("DEBUG") jobAgent.timeLeftUtil = TimeLeft() jobAgent.initTimeLeft = initTimeLeft jobAgent.timeLeft = timeLeft jobAgent.cpuFactor = cpuFactor result = jobAgent._computeCPUWorkLeft() assert abs(result - expectedTimeLeft) < 10
class Watchdog(object): ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, jobArgs={}): """Constructor, takes system flag as argument.""" self.stopSigStartSeconds = int(jobArgs.get("StopSigStartSeconds", 1800)) # 30 minutes self.stopSigFinishSeconds = int(jobArgs.get("StopSigFinishSeconds", 1800)) # 30 minutes self.stopSigNumber = int(jobArgs.get("StopSigNumber", 2)) # SIGINT self.stopSigRegex = jobArgs.get("StopSigRegex", None) self.stopSigSent = False self.log = gLogger.getSubLogger("Watchdog") self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUTime = jobCPUTime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.profiler = Profiler(pid) self.checkError = "" self.currentStats = {} self.initialized = False self.count = 0 # defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.wallClockCheckSeconds = 5 * 60 # 5 minutes self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.wallClockCheckCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False self.cpuPower = 1.0 self.processors = processors ############################################################################# def initialize(self): """Watchdog initialization.""" if self.initialized: self.log.info("Watchdog already initialized") return S_OK() else: self.initialized = True setup = gConfig.getValue("/DIRAC/Setup", "") if not setup: return S_ERROR("Can not get the DIRAC Setup value") wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR("Can not get the WorkloadManagement system instance") self.section = "/Systems/WorkloadManagement/%s/JobWrapper" % wms_instance self.log.verbose("Watchdog initialization") # Test control flags self.testWallClock = gConfig.getValue(self.section + "/CheckWallClockFlag", 1) self.testDiskSpace = gConfig.getValue(self.section + "/CheckDiskSpaceFlag", 1) self.testLoadAvg = gConfig.getValue(self.section + "/CheckLoadAvgFlag", 1) self.testCPUConsumed = gConfig.getValue(self.section + "/CheckCPUConsumedFlag", 1) self.testCPULimit = gConfig.getValue(self.section + "/CheckCPULimitFlag", 0) self.testMemoryLimit = gConfig.getValue(self.section + "/CheckMemoryLimitFlag", 0) self.testTimeLeft = gConfig.getValue(self.section + "/CheckTimeLeftFlag", 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + "/PollingTime", 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + "/CheckingTime", 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue(self.section + "/MinCheckingTime", 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue(self.section + "/MaxWallClockTime", 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + "/JobPeekFlag", 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + "/MinDiskSpace", 10) # MB self.loadAvgLimit = gConfig.getValue(self.section + "/LoadAverageLimit", 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + "/CPUSampleTime", 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue(self.section + "/JobCPULimitMargin", 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue(self.section + "/MinCPUWallClockRatio", 5) # ratio %age # After 5 sample times return null CPU consumption kill job self.nullCPULimit = gConfig.getValue(self.section + "/NullCPUCountLimit", 5) if self.checkingTime < self.minCheckingTime: self.log.info( "Requested CheckingTime of %s setting to %s seconds (minimum)" % (self.checkingTime, self.minCheckingTime) ) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue(self.section + "/TimeLeftLimit", 150 * self.pollingTime) self.cpuPower = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 1.0) return S_OK() def run(self): """The main watchdog execution method""" result = self.initialize() if not result["OK"]: self.log.always("Can not start watchdog for the following reason") self.log.always(result["Message"]) return result try: while True: self.log.debug("Starting watchdog loop # %d" % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result["OK"]: self.log.error("Watchdog error during execution", result["Message"]) break elif result["Value"] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: self.log.exception() return S_ERROR("Exception") ############################################################################# def execute(self): """The main agent execution method of the Watchdog.""" if not self.exeThread.is_alive(): self.__getUsageSummary() self.log.info("Process to monitor has completed, Watchdog will exit.") return S_OK("Ended") # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL if ( not self.stopSigSent and self.stopSigRegex is not None and (time.time() - self.initialValues["StartTime"]) > self.wallClockCheckSeconds * self.wallClockCheckCount ): self.wallClockCheckCount += 1 self._performWallClockChecks() if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = JobMinorStatus.JOB_EXCEEDED_CPU self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues["StartTime"]) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result["OK"]: self.log.warn("Problem during recent checks") self.log.warn(result["Message"]) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performWallClockChecks(self): """Watchdog performs the wall clock checks based on MJF. Signals are sent to processes if we need to stop, but function always returns S_OK() """ mjf = MJF.MJF() try: wallClockSecondsLeft = mjf.getWallClockSecondsLeft() except Exception as e: # Just stop if we can't get the wall clock seconds left return S_OK() jobstartSeconds = mjf.getIntJobFeature("jobstart_secs") if jobstartSeconds is None: # Just stop if we don't know when the job started return S_OK() if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and ( wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds ): # Need to send the signal! Assume it works to avoid sending the signal more than once self.log.info("Sending signal to JobWrapper children", "(%s)" % self.stopSigNumber) self.stopSigSent = True try: for childPid in getChildrenPIDs(self.wrapperPID): try: cmdline = open("/proc/%d/cmdline" % childPid, "r").read().replace("\0", " ").strip() except IOError: # Process gone away? Not running on Linux? Skip anyway continue if re.search(self.stopSigRegex, cmdline) is not None: self.log.info( 'Sending signal %d to process ID %d, cmdline = "%s"' % (self.stopSigNumber, childPid, cmdline) ) os.kill(childPid, self.stopSigNumber) except Exception as e: self.log.error("Failed to send signals to JobWrapper children!", repr(e)) return S_OK() ############################################################################# def _performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose("------------------------------------") self.log.verbose("Checking loop starts for Watchdog") heartBeatDict = {} msg = "" loadAvg = float(os.getloadavg()[0]) msg += "LoadAvg: %d " % loadAvg heartBeatDict["LoadAverage"] = loadAvg if "LoadAverage" not in self.parameters: self.parameters["LoadAverage"] = [] self.parameters["LoadAverage"].append(loadAvg) memoryUsed = self.getMemoryUsed() msg += "MemUsed: %.1f kb " % (memoryUsed) heartBeatDict["MemoryUsed"] = memoryUsed if "MemoryUsed" not in self.parameters: self.parameters["MemoryUsed"] = [] self.parameters["MemoryUsed"].append(memoryUsed) result = self.profiler.vSizeUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get vSize info from profiler", result["Message"]) else: vsize = result["Value"] * 1024.0 heartBeatDict["Vsize"] = vsize self.parameters.setdefault("Vsize", []) self.parameters["Vsize"].append(vsize) msg += "Job Vsize: %.1f kb " % vsize result = self.profiler.memoryUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get rss info from profiler", result["Message"]) else: rss = result["Value"] * 1024.0 heartBeatDict["RSS"] = rss self.parameters.setdefault("RSS", []) self.parameters["RSS"].append(rss) msg += "Job RSS: %.1f kb " % rss if "DiskSpace" not in self.parameters: self.parameters["DiskSpace"] = [] # We exclude fuse so that mountpoints can be cleaned up by automount after a period unused # (specific request from CERN batch service). result = self.getDiskSpace(exclude="fuse") if not result["OK"]: self.log.warn("Could not establish DiskSpace", result["Message"]) else: msg += "DiskSpace: %.1f MB " % (result["Value"]) self.parameters["DiskSpace"].append(result["Value"]) heartBeatDict["AvailableDiskSpace"] = result["Value"] cpu = self.__getCPU() if not cpu["OK"]: msg += "CPU: ERROR " hmsCPU = 0 else: cpu = cpu["Value"] msg += "CPU: %s (h:m:s) " % (cpu) if "CPUConsumed" not in self.parameters: self.parameters["CPUConsumed"] = [] self.parameters["CPUConsumed"].append(cpu) hmsCPU = cpu rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU["OK"]: heartBeatDict["CPUConsumed"] = rawCPU["Value"] result = self.__getWallClockTime() if not result["OK"]: self.log.warn("Failed determining wall clock time", result["Message"]) else: msg += "WallClock: %.2f s " % (result["Value"]) self.parameters.setdefault("WallClockTime", list()).append(result["Value"]) heartBeatDict["WallClockTime"] = result["Value"] * self.processors self.log.info(msg) result = self._checkProgress() if not result["OK"]: self.checkError = result["Message"] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result["OK"]: outputList = result["Value"] self.log.info("Last lines of available application output:") self.log.info("================START================") for line in outputList: self.log.info(line) self.log.info("=================END=================") self.__killRunningThread() return S_OK() recentStdOut = "None" if self.jobPeekFlag: result = self.__peek() if result["OK"]: outputList = result["Value"] size = len(outputList) recentStdOut = "Last %s lines of application output from Watchdog on %s [UTC]:" % ( size, Time.dateTime(), ) border = "=" * len(recentStdOut) cpuTotal = "Last reported CPU consumed for job is %s (h:m:s)" % (hmsCPU) if self.timeLeft: cpuTotal += ", Batch Queue Time Left %s (s @ HS06)" % self.timeLeft recentStdOut = "\n%s\n%s\n%s\n%s\n" % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + "\n" else: recentStdOut = ( "Watchdog is initializing and will attempt to obtain standard output from application thread" ) self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn("Turning off job peeking for remainder of execution") if "JOBID" not in os.environ: self.log.info("Running without JOBID so parameters will not be reported") return S_OK() jobID = os.environ["JOBID"] staticParamDict = {"StandardOutput": recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK("Watchdog checking cycle complete") ############################################################################# def __getCPU(self): """Uses the profiler to get CPU time for current process, its child, and the terminated child, and returns HH:MM:SS after conversion. """ result = self.profiler.cpuUsageUser(withChildren=True, withTerminatedChildren=True) if not result["OK"]: self.log.warn("Issue while checking consumed CPU for user", result["Message"]) if result["Errno"] == errno.ESRCH: self.log.warn("The main process does not exist (anymore). This might be correct.") return result cpuUsageUser = result["Value"] result = self.profiler.cpuUsageSystem(withChildren=True, withTerminatedChildren=True) if not result["OK"]: self.log.warn("Issue while checking consumed CPU for system", result["Message"]) if result["Errno"] == errno.ESRCH: self.log.warn("The main process does not exist (anymore). This might be correct.") return result cpuUsageSystem = result["Value"] cpuTimeTotal = cpuUsageUser + cpuUsageSystem if cpuTimeTotal: self.log.verbose("Raw CPU time consumed (s) =", cpuTimeTotal) return self.__getCPUHMS(cpuTimeTotal) self.log.error("CPU time consumed found to be 0") return S_ERROR() ############################################################################# def __getCPUHMS(self, cpuTime): mins, secs = divmod(cpuTime, 60) hours, mins = divmod(mins, 60) humanTime = "%02d:%02d:%02d" % (hours, mins, secs) self.log.verbose("Human readable CPU time is: %s" % humanTime) return S_OK(humanTime) ############################################################################# def __interpretControlSignal(self, signalDict): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info("Received control signal") if isinstance(signalDict, dict): if "Kill" in signalDict: self.log.info("Received Kill signal, stopping job via control signal") self.checkError = JobMinorStatus.RECEIVED_KILL_SIGNAL self.__killRunningThread() else: self.log.info("The following control signal was sent but not understood by the watchdog:") self.log.info(signalDict) else: self.log.info("Expected dictionary for control signal", "received:\n%s" % (signalDict)) return S_OK() ############################################################################# def _checkProgress(self): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = "" if self.testWallClock: result = self.__checkWallClockTime() if not result["OK"]: self.log.warn(result["Message"]) return result report += "WallClock: OK, " else: report += "WallClock: NA," if self.testDiskSpace: result = self.__checkDiskSpace() if not result["OK"]: self.log.warn(result["Message"]) return result report += "DiskSpace: OK, " else: report += "DiskSpace: NA," if self.testLoadAvg: result = self.__checkLoadAverage() if not result["OK"]: self.log.warn( "Check of load average failed, but won't fail because of that", ": %s" % result["Message"] ) report += "LoadAverage: ERROR, " return S_OK() report += "LoadAverage: OK, " else: report += "LoadAverage: NA," if self.testCPUConsumed: result = self.__checkCPUConsumed() if not result["OK"]: return result report += "CPUConsumed: OK, " else: report += "CPUConsumed: NA, " if self.testCPULimit: result = self.__checkCPULimit() if not result["OK"]: self.log.warn(result["Message"]) return result report += "CPULimit OK, " else: report += "CPULimit: NA, " if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += "TimeLeft: OK" else: report += "TimeLeft: NA" if self.testMemoryLimit: result = self.__checkMemoryLimit() if not result["OK"]: self.log.warn(result["Message"]) return result report += "MemoryLimit OK, " else: report += "MemoryLimit: NA, " self.log.info(report) return S_OK("All enabled checks passed") ############################################################################# def __checkCPUConsumed(self): """Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info("Checking CPU Consumed") if "WallClockTime" not in self.parameters: return S_ERROR("Missing WallClockTime info") if "CPUConsumed" not in self.parameters: return S_ERROR("Missing CPUConsumed info") wallClockTime = self.parameters["WallClockTime"][-1] if wallClockTime < self.sampleCPUTime: self.log.info( "Stopping check, wallclock time is still smaller than sample time", "(%s) < (%s)" % (wallClockTime, self.sampleCPUTime), ) return S_OK() intervals = max(1, int(self.sampleCPUTime / self.checkingTime)) if len(self.parameters["CPUConsumed"]) < intervals + 1: self.log.info( "Not enough snapshots to calculate", "there are %s and we need %s" % (len(self.parameters["CPUConsumed"]), intervals + 1), ) return S_OK() wallClockTime = self.parameters["WallClockTime"][-1] - self.parameters["WallClockTime"][-1 - intervals] try: cpuTime = self.__convertCPUTime(self.parameters["CPUConsumed"][-1])["Value"] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime(self.parameters["CPUConsumed"][-1 - intervals])["Value"] if cpuTime < 0: self.log.warn("Consumed CPU time negative, something wrong may have happened, ignore") return S_OK() if wallClockTime <= 0: self.log.warn("Wallclock time should not be negative or zero, Ignore") return S_OK() ratio = (cpuTime / wallClockTime) * 100 self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if ( os.path.exists("DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK") or "DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK" in os.environ ): self.log.warn( "N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload" ) return S_OK() self.log.info("Job is stalled!") return S_ERROR(JobMinorStatus.WATCHDOG_STALLED) except Exception as e: self.log.error("Cannot convert CPU consumed from string to int", str(e)) return S_OK() ############################################################################# def __convertCPUTime(self, cputime): """Method to convert the CPU time as returned from the Watchdog instances to the equivalent DIRAC normalized CPU time to be compared to the Job CPU requirement. """ cpuValue = 0 cpuHMS = cputime.split(":") # for i in range( len( cpuHMS ) ): # cpuHMS[i] = cpuHMS[i].replace( '00', '0' ) try: hours = float(cpuHMS[0]) * 60 * 60 mins = float(cpuHMS[1]) * 60 secs = float(cpuHMS[2]) cpuValue = float(hours + mins + secs) except Exception as x: self.log.warn(str(x)) return S_ERROR("Could not calculate CPU time") # Normalization to be implemented normalizedCPUValue = cpuValue result = S_OK() result["Value"] = normalizedCPUValue self.log.debug("CPU value %s converted to %s" % (cputime, normalizedCPUValue)) return result ############################################################################# def __checkCPULimit(self): """Checks that the job has consumed more than the job CPU requirement (plus a configurable margin) and kills them as necessary. """ consumedCPU = 0 if "CPUConsumed" in self.parameters: consumedCPU = self.parameters["CPUConsumed"][-1] consumedCPUDict = self.__convertCPUTime(consumedCPU) if consumedCPUDict["OK"]: currentCPU = consumedCPUDict["Value"] else: return S_OK("Not possible to determine current CPU consumed") if consumedCPU: limit = int(self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100)) cpuConsumed = float(currentCPU) if cpuConsumed > limit: self.log.info( "Job has consumed more than the specified CPU limit", "with an additional %s%% margin" % (self.jobCPUMargin), ) return S_ERROR("Job has exceeded maximum CPU time limit") return S_OK("Job within CPU limit") if not currentCPU: self.log.verbose("Both initial and current CPU consumed are null") return S_OK("CPU consumed is not measurable yet") return S_OK("Not possible to determine CPU consumed") def __checkMemoryLimit(self): """Checks that the job memory consumption is within a limit""" vsize = 0 if "Vsize" in self.parameters: vsize = self.parameters["Vsize"][-1] if vsize and self.memoryLimit: if vsize > self.memoryLimit: vsize = vsize # Just a warning for the moment self.log.warn( "Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit) ) return S_OK() ############################################################################# def __checkDiskSpace(self): """Checks whether the CS defined minimum disk space is available.""" if "DiskSpace" in self.parameters: availSpace = self.parameters["DiskSpace"][-1] if availSpace >= 0 and availSpace < self.minDiskSpace: self.log.info( "Not enough local disk space for job to continue, defined in CS as %s MB" % (self.minDiskSpace) ) return S_ERROR(JobMinorStatus.JOB_INSUFFICIENT_DISK) else: return S_OK("Job has enough disk space available") else: return S_ERROR("Available disk space could not be established") ############################################################################# def __checkWallClockTime(self): """Checks whether the job has been running for the CS defined maximum wall clock time. """ if "StartTime" in self.initialValues: startTime = self.initialValues["StartTime"] if time.time() - startTime > self.maxWallClockTime: self.log.info("Job has exceeded maximum wall clock time of %s seconds" % (self.maxWallClockTime)) return S_ERROR(JobMinorStatus.JOB_EXCEEDED_WALL_CLOCK) else: return S_OK("Job within maximum wall clock time") else: return S_ERROR("Job start time could not be established") ############################################################################# def __checkLoadAverage(self): """Checks whether the CS defined maximum load average is exceeded.""" if "LoadAverage" in self.parameters: loadAvg = self.parameters["LoadAverage"][-1] if loadAvg > float(self.loadAvgLimit): self.log.info("Maximum load average exceeded, defined in CS as %s " % (self.loadAvgLimit)) return S_ERROR("Job exceeded maximum load average") return S_OK("Job running with normal load average") return S_ERROR("Job load average not established") ############################################################################# def __peek(self): """Uses ExecutionThread.getOutput() method to obtain standard output from running thread via subprocess callback function. """ result = self.exeThread.getOutput() if not result["OK"]: self.log.warn("Could not obtain output from running application thread") self.log.warn(result["Message"]) return result ############################################################################# def calibrate(self): """The calibrate method obtains the initial values for system memory and load and calculates the margin for error for the rest of the Watchdog cycle. """ self.__getWallClockTime() self.parameters["WallClockTime"] = [] cpuConsumed = self.__getCPU() if not cpuConsumed["OK"]: self.log.warn("Could not establish CPU consumed, setting to 0.0") cpuConsumed = 0.0 else: cpuConsumed = cpuConsumed["Value"] self.initialValues["CPUConsumed"] = cpuConsumed self.parameters["CPUConsumed"] = [] self.initialValues["LoadAverage"] = float(os.getloadavg()[0]) self.parameters["LoadAverage"] = [] memUsed = self.getMemoryUsed() self.initialValues["MemoryUsed"] = memUsed self.parameters["MemoryUsed"] = [] result = self.profiler.vSizeUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get vSize info from profiler", result["Message"]) else: vsize = result["Value"] * 1024.0 self.initialValues["Vsize"] = vsize self.log.verbose("Vsize(kb)", "%.1f" % vsize) self.parameters["Vsize"] = [] result = self.profiler.memoryUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get rss info from profiler", result["Message"]) else: rss = result["Value"] * 1024.0 self.initialValues["RSS"] = rss self.log.verbose("RSS(kb)", "%.1f" % rss) self.parameters["RSS"] = [] # We exclude fuse so that mountpoints can be cleaned up by automount after a period unused # (specific request from CERN batch service). result = self.getDiskSpace(exclude="fuse") self.log.verbose("DiskSpace: %s" % (result)) if not result["OK"]: self.log.warn("Could not establish DiskSpace") else: self.initialValues["DiskSpace"] = result["Value"] self.parameters["DiskSpace"] = [] result = self.getNodeInformation() self.log.verbose("NodeInfo", result) if "LSB_JOBID" in os.environ: result["LocalJobID"] = os.environ["LSB_JOBID"] if "PBS_JOBID" in os.environ: result["LocalJobID"] = os.environ["PBS_JOBID"] if "QSUB_REQNAME" in os.environ: result["LocalJobID"] = os.environ["QSUB_REQNAME"] if "JOB_ID" in os.environ: result["LocalJobID"] = os.environ["JOB_ID"] self.__reportParameters(result, "NodeInformation", True) self.__reportParameters(self.initialValues, "InitialValues") return S_OK() def __timeLeft(self): """ return Normalized CPU time left in the batch system 0 if not available update self.timeLeft and self.littleTimeLeft """ # Get CPU time left in the batch system result = self.timeLeftUtil.getTimeLeft(0.0) if not result["OK"]: # Could not get CPU time left, we might need to wait for the first loop # or the Utility is not working properly for this batch system # or we are in a batch system timeLeft = 0 else: timeLeft = result["Value"] self.timeLeft = timeLeft if not self.littleTimeLeft: if timeLeft and timeLeft < self.grossTimeLeftLimit: self.log.info("TimeLeft bellow %s, now checking with higher frequency" % timeLeft) self.littleTimeLeft = True # TODO: better configurable way of doing this to be coded self.littleTimeLeftCount = 15 else: if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit: timeLeft = -1 return timeLeft ############################################################################# def __getUsageSummary(self): """Returns average load, memory etc. over execution of job thread""" summary = {} # CPUConsumed if "CPUConsumed" in self.parameters: cpuList = self.parameters["CPUConsumed"] if cpuList: hmsCPU = cpuList[-1] rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU["OK"]: summary["LastUpdateCPU(s)"] = rawCPU["Value"] else: summary["LastUpdateCPU(s)"] = "Could not be estimated" # DiskSpace if "DiskSpace" in self.parameters: space = self.parameters["DiskSpace"] if space: value = abs(float(space[-1]) - float(self.initialValues["DiskSpace"])) if value < 0: value = 0 summary["DiskSpace(MB)"] = value else: summary["DiskSpace(MB)"] = "Could not be estimated" # MemoryUsed if "MemoryUsed" in self.parameters: memory = self.parameters["MemoryUsed"] if memory: summary["MemoryUsed(kb)"] = abs(float(memory[-1]) - float(self.initialValues["MemoryUsed"])) else: summary["MemoryUsed(kb)"] = "Could not be estimated" # LoadAverage if "LoadAverage" in self.parameters: laList = self.parameters["LoadAverage"] if laList: summary["LoadAverage"] = sum(laList) / len(laList) else: summary["LoadAverage"] = "Could not be estimated" result = self.__getWallClockTime() if not result["OK"]: self.log.warn("Failed determining wall clock time", result["Message"]) summary["WallClockTime(s)"] = 0 summary["ScaledCPUTime(s)"] = 0 else: wallClock = result["Value"] summary["WallClockTime(s)"] = wallClock * self.processors summary["ScaledCPUTime(s)"] = wallClock * self.cpuPower * self.processors self.__reportParameters(summary, "UsageSummary", True) self.currentStats = summary ############################################################################# def __reportParameters(self, params, title=None, report=False): """Will report parameters for job.""" try: parameters = [] self.log.info("", "==========================================================") if title: self.log.info("Watchdog will report", title) else: self.log.info("Watchdog will report parameters") self.log.info("", "==========================================================") vals = params if "Value" in params: if vals["Value"]: vals = params["Value"] for k, v in vals.items(): if v: self.log.info(str(k) + " = " + str(v)) parameters.append([k, v]) if report: self.__setJobParamList(parameters) self.log.info("", "==========================================================") except Exception as x: self.log.warn("Problem while reporting parameters") self.log.warn(repr(x)) ############################################################################# def __getWallClockTime(self): """Establishes the Wall Clock time spent since the Watchdog initialization""" result = S_OK() if "StartTime" in self.initialValues: currentTime = time.time() wallClock = currentTime - self.initialValues["StartTime"] result["Value"] = wallClock else: self.initialValues["StartTime"] = time.time() result["Value"] = 0.0 return result ############################################################################# def __killRunningThread(self): """Will kill the running thread process and any child processes.""" self.log.info("Sending kill signal to application PID %s" % (self.spObject.getChildPID())) result = self.spObject.killChild() self.applicationKilled = True self.log.info("Subprocess.killChild() returned:%s " % (result)) return S_OK("Thread killed") ############################################################################# def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict): """Sends sign of life 'heartbeat' signal and triggers control signal interpretation. """ result = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict) if not result["OK"]: self.log.warn("Problem sending sign of life") self.log.warn(result) if result["OK"] and result["Value"]: self.__interpretControlSignal(result["Value"]) return result ############################################################################# def __setJobParamList(self, value): """Wraps around setJobParameters of state update client""" # job wrapper template sets the jobID variable if "JOBID" not in os.environ: self.log.info("Running without JOBID so parameters will not be reported") return S_OK() jobID = os.environ["JOBID"] jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value) self.log.verbose("setJobParameters(%s,%s)" % (jobID, value)) if not jobParam["OK"]: self.log.warn(jobParam["Message"]) return jobParam ############################################################################# def getNodeInformation(self): """Retrieves all static system information""" result = {} result["HostName"] = socket.gethostname() result["CPU(MHz)"] = psutil.cpu_freq()[0] result["Memory(kB)"] = int(psutil.virtual_memory()[1] / 1024) result["LocalAccount"] = getpass.getuser() if os.path.exists("/proc/cpuinfo"): with open("/proc/cpuinfo", "r") as cpuinfo: info = cpuinfo.readlines() try: result["ModelName"] = info[4].split(":")[1].replace(" ", "").replace("\n", "") except IndexError: pass try: result["CacheSize(kB)"] = [x.strip().split(":")[1] for x in info if "cache size" in x][0].strip() except IndexError: pass return result ############################################################################# def getMemoryUsed(self): """Obtains the memory used.""" mem = ( resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss ) return float(mem) ############################################################################# def getDiskSpace(self, exclude=None): """Obtains the available disk space.""" result = S_OK() diskSpace = getDiskSpace(exclude=exclude) if diskSpace == -1: result = S_ERROR("Could not obtain disk usage") self.log.warn(" Could not obtain disk usage") result["Value"] = float(-1) return result result["Value"] = float(diskSpace) return result
class JobAgent(AgentModule): """This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration.""" def __init__(self, agentName, loadName, baseAgentName=False, properties=None): """Just defines some default parameters""" if not properties: properties = {} super(JobAgent, self).__init__(agentName, loadName, baseAgentName, properties) # Inner CE # CE type the JobAgent submits to. It can be "InProcess" or "Pool" or "Singularity". self.ceName = "InProcess" # "Inner" CE submission type (e.g. for Pool CE). It can be "InProcess" or "Singularity". self.innerCESubmissionType = "InProcess" self.computingElement = None # The ComputingElement object, e.g. SingularityComputingElement() # Localsite options self.siteName = "Unknown" self.pilotReference = "Unknown" self.defaultProxyLength = 86400 * 5 # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = 0.0 self.jobSubmissionDelay = 10 self.fillingMode = True self.minimumTimeLeft = 5000 self.stopOnApplicationFailure = True self.stopAfterFailedMatches = 10 self.jobCount = 0 self.matchFailedCount = 0 self.extraOptions = "" # Timeleft self.initTimes = os.times() self.initTimeLeft = 0.0 self.timeLeft = self.initTimeLeft self.timeLeftUtil = None self.pilotInfoReportedFlag = False ############################################################################# def initialize(self): """Sets default parameters and creates CE instance""" localCE = gConfig.getValue("/LocalSite/LocalCE", self.ceName) if localCE != self.ceName: self.log.info("Defining Inner CE from local configuration", "= %s" % localCE) # Create backend Computing Element result = self._initializeComputingElement(localCE) if not result["OK"]: return result result = self._getCEDict(self.computingElement) if not result["OK"]: return result ceDict = result["Value"][0] self.initTimeLeft = ceDict.get("CPUTime", self.initTimeLeft) self.initTimeLeft = gConfig.getValue( "/Resources/Computing/CEDefaults/MaxCPUTime", self.initTimeLeft) self.timeLeft = self.initTimeLeft self.initTimes = os.times() # Localsite options self.siteName = siteName() self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", self.pilotReference) self.defaultProxyLength = gConfig.getValue( "/Registry/DefaultProxyLifeTime", self.defaultProxyLength) # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", self.cpuFactor) self.jobSubmissionDelay = self.am_getOption("SubmissionDelay", self.jobSubmissionDelay) self.fillingMode = self.am_getOption("FillingModeFlag", self.fillingMode) self.minimumTimeLeft = self.am_getOption("MinimumTimeLeft", self.minimumTimeLeft) self.stopOnApplicationFailure = self.am_getOption( "StopOnApplicationFailure", self.stopOnApplicationFailure) self.stopAfterFailedMatches = self.am_getOption( "StopAfterFailedMatches", self.stopAfterFailedMatches) self.extraOptions = gConfig.getValue( "/AgentJobRequirements/ExtraOptions", self.extraOptions) # Utilities self.timeLeftUtil = TimeLeft() return S_OK() def _initializeComputingElement(self, localCE): """Generate a ComputingElement and configure it""" ceFactory = ComputingElementFactory() self.ceName = localCE.split("/")[ 0] # It might be "Pool/Singularity", or simply "Pool" self.innerCESubmissionType = (localCE.split("/")[1] if len( localCE.split("/")) == 2 else self.innerCESubmissionType) ceInstance = ceFactory.getCE(self.ceName) if not ceInstance["OK"]: self.log.warn("Can't instantiate a CE", ceInstance["Message"]) return ceInstance self.computingElement = ceInstance["Value"] self.computingElement.setParameters( {"InnerCESubmissionType": self.innerCESubmissionType}) return S_OK() ############################################################################# def execute(self): """The JobAgent execution method.""" # Temporary mechanism to pass a shutdown message to the agent if os.path.exists("/var/lib/dirac_drain"): return self._finish("Node is being drained by an operator") self.log.verbose("Job Agent execution loop") # Check that there is enough slots to match a job result = self._checkCEAvailability(self.computingElement) if not result["OK"]: return self._finish(result["Message"]) if result["OK"] and result["Value"]: return result # Check that we are allowed to continue and that time left is sufficient if self.jobCount: cpuWorkLeft = self._computeCPUWorkLeft() result = self._checkCPUWorkLeft(cpuWorkLeft) if not result["OK"]: return result result = self._setCPUWorkLeft(cpuWorkLeft) if not result["OK"]: return result # Get environment details and enhance them result = self._getCEDict(self.computingElement) if not result["OK"]: return result ceDictList = result["Value"] for ceDict in ceDictList: self._setCEDict(ceDict) # Try to match a job jobRequest = self._matchAJob(ceDictList) self.stopAfterFailedMatches = self.am_getOption( "StopAfterFailedMatches", self.stopAfterFailedMatches) if not jobRequest["OK"]: res = self._checkMatchingIssues(jobRequest) if not res["OK"]: self._finish(res["Message"]) return res # if we don't match a job, independently from the reason, # we wait a bit longer before trying again time.sleep( int(self.am_getOption("PollingTime")) * (self.matchFailedCount + 1) * 2) return res # If we are, we matched a job # Reset the Counter self.matchFailedCount = 0 # Check matcher information returned matcherParams = ["JDL", "DN", "Group"] matcherInfo = jobRequest["Value"] jobID = matcherInfo["JobID"] jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName) result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport) if not result["OK"]: return self._finish(result["Message"]) # Get matcher information if not self.pilotInfoReportedFlag: # Check the flag after the first access to the Matcher self.pilotInfoReportedFlag = matcherInfo.get( "PilotInfoReportedFlag", False) jobJDL = matcherInfo["JDL"] jobGroup = matcherInfo["Group"] ownerDN = matcherInfo["DN"] ceDict = matcherInfo["CEDict"] matchTime = matcherInfo["matchTime"] optimizerParams = {} for key in matcherInfo: if key not in matcherParams: optimizerParams[key] = matcherInfo[key] # Get JDL paramters parameters = self._getJDLParameters(jobJDL) if not parameters["OK"]: jobReport.setJobStatus( status=JobStatus.FAILED, minorStatus="Could Not Extract JDL Parameters") self.log.warn("Could Not Extract JDL Parameters", parameters["Message"]) return self._finish("JDL Problem") params = parameters["Value"] result = self._extractValuesFromJobParams(params, jobReport) if not result["OK"]: return self._finish(result["Value"]) submissionParams = result["Value"] jobID = submissionParams["jobID"] jobType = submissionParams["jobType"] self.log.verbose("Job request successful: \n", jobRequest["Value"]) self.log.info( "Received", "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" % (jobID, jobType, ownerDN, jobGroup)) self.jobCount += 1 try: jobReport.setJobParameter(par_name="MatcherServiceTime", par_value=str(matchTime), sendFlag=False) if "BOINC_JOB_ID" in os.environ: # Report BOINC environment for thisp in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"): jobReport.setJobParameter(par_name=thisp, par_value=gConfig.getValue( "/LocalSite/%s" % thisp, "Unknown"), sendFlag=False) jobReport.setJobStatus(minorStatus="Job Received by Agent", sendFlag=False) result_setupProxy = self._setupProxy(ownerDN, jobGroup) if not result_setupProxy["OK"]: result = self._rescheduleFailedJob( jobID, result_setupProxy["Message"]) return self._finish(result["Message"], self.stopOnApplicationFailure) proxyChain = result_setupProxy.get("Value") # Save the job jdl for external monitoring self._saveJobJDLRequest(jobID, jobJDL) # Check software and install them if required software = self._checkInstallSoftware(jobID, params, ceDict, jobReport) if not software["OK"]: self.log.error("Failed to install software for job", "%s" % (jobID)) errorMsg = software["Message"] if not errorMsg: errorMsg = "Failed software installation" result = self._rescheduleFailedJob(jobID, errorMsg) return self._finish(result["Message"], self.stopOnApplicationFailure) gridCE = gConfig.getValue("/LocalSite/GridCE", "") if gridCE: jobReport.setJobParameter(par_name="GridCE", par_value=gridCE, sendFlag=False) queue = gConfig.getValue("/LocalSite/CEQueue", "") if queue: jobReport.setJobParameter(par_name="CEQueue", par_value=queue, sendFlag=False) self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName)) result_submitJob = self._submitJob( jobID=jobID, jobParams=params, resourceParams=ceDict, optimizerParams=optimizerParams, proxyChain=proxyChain, jobReport=jobReport, processors=submissionParams["processors"], wholeNode=submissionParams["wholeNode"], maxNumberOfProcessors=submissionParams[ "maxNumberOfProcessors"], mpTag=submissionParams["mpTag"], ) # Committing the JobReport before evaluating the result of job submission res = jobReport.commit() if not res["OK"]: resFD = jobReport.generateForwardDISET() if not resFD["OK"]: self.log.error("Error generating ForwardDISET operation", resFD["Message"]) elif resFD["Value"]: # Here we create the Request. op = resFD["Value"] request = Request() requestName = "jobAgent_%s" % jobID request.RequestName = requestName.replace('"', "") request.JobID = jobID request.SourceComponent = "JobAgent_%s" % jobID request.addOperation(op) # This might fail, but only a message would be printed. self._sendFailoverRequest(request) if not result_submitJob["OK"]: return self._finish(result_submitJob["Message"]) elif "PayloadFailed" in result_submitJob: # Do not keep running and do not overwrite the Payload error message = "Payload execution failed with error code %s" % result_submitJob[ "PayloadFailed"] if self.stopOnApplicationFailure: return self._finish(message, self.stopOnApplicationFailure) else: self.log.info(message) self.log.debug("After %sCE submitJob()" % (self.ceName)) except Exception as subExcept: # pylint: disable=broad-except self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True) result = self._rescheduleFailedJob( jobID, "Job processing failed with exception", direct=True) return self._finish(result["Message"], self.stopOnApplicationFailure) return S_OK("Job Agent cycle complete") ############################################################################# def _saveJobJDLRequest(self, jobID, jobJDL): """Save job JDL local to JobAgent.""" classAdJob = ClassAd(jobJDL) classAdJob.insertAttributeString("LocalCE", self.ceName) jdlFileName = jobID + ".jdl" jdlFile = open(jdlFileName, "w") jdl = classAdJob.asJDL() jdlFile.write(jdl) jdlFile.close() ############################################################################# def _getCEDict(self, computingElement): """Get CE description :param ComputingElement computingElement: ComputingElement instance :return: list of dict of attributes """ # if we are here we assume that a job can be matched result = computingElement.getDescription() if not result["OK"]: self.log.warn("Can not get the CE description") return result # We can have several prioritized job retrieval strategies if isinstance(result["Value"], dict): ceDictList = [result["Value"]] else: # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy' ceDictList = result["Value"] return S_OK(ceDictList) def _setCEDict(self, ceDict): """Set CEDict""" # Add pilot information gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown") if gridCE != "Unknown": ceDict["GridCE"] = gridCE if "PilotReference" not in ceDict: ceDict["PilotReference"] = str(self.pilotReference) ceDict["PilotBenchmark"] = self.cpuFactor ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict("/AgentJobRequirements") if result["OK"]: requirementsDict = result["Value"] ceDict.update(requirementsDict) self.log.info("Requirements:", requirementsDict) def _checkCEAvailability(self, computingElement): """Check availability of computingElement""" result = computingElement.available() if not result["OK"]: self.log.info("Resource is not available", result["Message"]) return S_ERROR("CE Not Available") ceInfoDict = result["CEInfoDict"] runningJobs = ceInfoDict.get("RunningJobs") availableSlots = result["Value"] if not availableSlots: if runningJobs: self.log.info("No available slots", ": %d running jobs" % runningJobs) return S_OK("Job Agent cycle complete with %d running jobs" % runningJobs) self.log.info( "CE is not available (and there are no running jobs)") return S_ERROR("CE Not Available") return S_OK() ############################################################################# def _computeCPUWorkLeft(self, processors=1): """ Compute CPU Work Left in hepspec06 seconds :param int processors: number of processors available :return: cpu work left (cpu time left * cpu power of the cpus) """ # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?) cpuTimeConsumed = sum(os.times()[:-1]) - sum(self.initTimes[:-1]) result = self.timeLeftUtil.getTimeLeft(cpuTimeConsumed, processors) if not result["OK"]: self.log.warn( "There were errors calculating time left using the Timeleft utility", result["Message"]) self.log.warn( "The time left will be calculated using os.times() and the info in our possession" ) self.log.info("Current raw CPU time consumed is %s" % cpuTimeConsumed) if self.cpuFactor: return self.initTimeLeft - cpuTimeConsumed * self.cpuFactor return self.timeLeft return result["Value"] def _checkCPUWorkLeft(self, cpuWorkLeft): """Check that fillingMode is enabled and time left is sufficient to continue the execution""" # Only call timeLeft utility after a job has been picked up self.log.info("Attempting to check CPU time left for filling mode") if not self.fillingMode: return self._finish("Filling Mode is Disabled") self.log.info("normalized CPU units remaining in slot", cpuWorkLeft) if cpuWorkLeft <= self.minimumTimeLeft: return self._finish("No more time left") return S_OK() def _setCPUWorkLeft(self, cpuWorkLeft): """Update the TimeLeft within the CE and the configuration for next matching request""" self.timeLeft = cpuWorkLeft result = self.computingElement.setCPUTimeLeft( cpuTimeLeft=self.timeLeft) if not result["OK"]: return self._finish(result["Message"]) self._updateConfiguration("CPUTimeLeft", self.timeLeft) return S_OK() ############################################################################# def _updateConfiguration(self, key, value, path="/LocalSite"): """Update local configuration to be used by submitted job wrappers""" localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join(".", self.extraOptions) else: localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg") localCfg.loadFromFile(localConfigFile) section = "/" for p in path.split("/")[1:]: section = os.path.join(section, p) if not localCfg.isSection(section): localCfg.createNewSection(section) localCfg.setOption("%s/%s" % (section, key), value) localCfg.writeToFile(localConfigFile) ############################################################################# def _setupProxy(self, ownerDN, ownerGroup): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue("/DIRAC/Security/UseServerCertificate", False): proxyResult = self._requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult["OK"]: self.log.error("Failed to setup proxy", proxyResult["Message"]) return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"]) return S_OK(proxyResult["Value"]) else: ret = getProxyInfo(disableVOMS=True) if not ret["OK"]: self.log.error("Invalid Proxy", ret["Message"]) return S_ERROR("Invalid Proxy") proxyChain = ret["Value"]["chain"] if "groupProperties" not in ret["Value"]: print(ret["Value"]) print(proxyChain.dumpAllToString()) self.log.error("Invalid Proxy", "Group has no properties defined") return S_ERROR("Proxy has no group properties defined") groupProps = ret["Value"]["groupProperties"] if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps: proxyResult = self._requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult["OK"]: self.log.error("Invalid Proxy", proxyResult["Message"]) return S_ERROR("Failed to setup proxy: %s" % proxyResult["Message"]) proxyChain = proxyResult["Value"] return S_OK(proxyChain) def _requestProxyFromProxyManager(self, ownerDN, ownerGroup): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info("Requesting proxy', 'for %s@%s" % (ownerDN, ownerGroup)) token = gConfig.getValue("/Security/ProxyToken", "") if not token: self.log.verbose( "No token defined. Trying to download proxy without token") token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup, self.defaultProxyLength, token) if not retVal["OK"]: self.log.error("Could not retrieve payload proxy", retVal["Message"]) os.system("dirac-proxy-info") sys.stdout.flush() return S_ERROR("Error retrieving proxy") chain = retVal["Value"] return S_OK(chain) ############################################################################# def _checkInstallSoftware(self, jobID, jobParams, resourceParams, jobReport): """Checks software requirement of job and whether this is already present before installing software locally. """ if "SoftwareDistModule" not in jobParams: msg = "Job has no software installation requirement" self.log.verbose(msg) return S_OK(msg) jobReport.setJobStatus(minorStatus="Installing Software", sendFlag=False) softwareDist = jobParams["SoftwareDistModule"] self.log.verbose("Found VO Software Distribution module", ": %s" % (softwareDist)) argumentsDict = {"Job": jobParams, "CE": resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict) if not moduleInstance["OK"]: return moduleInstance module = moduleInstance["Value"] return module.execute() ############################################################################# def _matchAJob(self, ceDictList): """Call the Matcher with each ceDict until we get a job""" jobRequest = S_ERROR("No CE Dictionary available") for ceDict in ceDictList: self.log.verbose("CE dict", ceDict) start = time.time() jobRequest = MatcherClient().requestJob(ceDict) matchTime = time.time() - start self.log.info("MatcherTime", "= %.2f (s)" % (matchTime)) if jobRequest["OK"]: jobRequest["Value"]["matchTime"] = matchTime jobRequest["Value"]["CEDict"] = ceDict break return jobRequest def _checkMatchingIssues(self, jobRequest): """Check the source of the matching issue :param dict jobRequest: S_ERROR returned by the matcher :return: S_OK/S_ERROR """ if DErrno.cmpError(jobRequest, DErrno.EWMSPLTVER): self.log.error("Pilot version mismatch", jobRequest["Message"]) return jobRequest if DErrno.cmpError(jobRequest, DErrno.EWMSNOMATCH): self.log.notice("Job request OK, but no match found", jobRequest["Message"]) elif jobRequest["Message"].find("seconds timeout") != -1: self.log.error("Timeout while requesting job", jobRequest["Message"]) else: self.log.notice("Failed to get jobs", jobRequest["Message"]) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self._finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches) return S_OK() def _checkMatcherInfo(self, matcherInfo, matcherParams, jobReport): """Check that all relevant information about the job are available""" for param in matcherParams: if param not in matcherInfo: jobReport.setJobStatus( status=JobStatus.FAILED, minorStatus="Matcher did not return %s" % (param)) return S_ERROR("Matcher Failed") if not matcherInfo[param]: jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Matcher returned null %s" % (param)) return S_ERROR("Matcher Failed") self.log.verbose("Matcher returned", "%s = %s " % (param, matcherInfo[param])) return S_OK() ############################################################################# def _submitJob( self, jobID, jobParams, resourceParams, optimizerParams, proxyChain, jobReport, processors=1, wholeNode=False, maxNumberOfProcessors=0, mpTag=False, ): """Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ logLevel = self.am_getOption("DefaultLogLevel", "INFO") defaultWrapperLocation = self.am_getOption( "JobWrapperTemplate", "DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py") # Add the number of requested processors to the job environment if "ExecutionEnvironment" in jobParams: if isinstance(jobParams["ExecutionEnvironment"], six.string_types): jobParams["ExecutionEnvironment"] = jobParams[ "ExecutionEnvironment"].split(";") jobParams.setdefault("ExecutionEnvironment", []).append("DIRAC_JOB_PROCESSORS=%d" % processors) jobDesc = { "jobID": jobID, "jobParams": jobParams, "resourceParams": resourceParams, "optimizerParams": optimizerParams, "extraOptions": self.extraOptions, "defaultWrapperLocation": defaultWrapperLocation, } result = createJobWrapper(log=self.log, logLevel=logLevel, **jobDesc) if not result["OK"]: return result wrapperFile = result["Value"][0] inputs = list(result["Value"][1:]) jobReport.setJobStatus(minorStatus="Submitting To CE") self.log.info( "Submitting JobWrapper", "%s to %sCE" % (os.path.basename(wrapperFile), self.ceName)) # Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy["OK"]: self.log.error("Invalid proxy", proxy) return S_ERROR("Payload Proxy Not Found") payloadProxy = proxy["Value"] submission = self.computingElement.submitJob( wrapperFile, payloadProxy, numberOfProcessors=processors, maxNumberOfProcessors=maxNumberOfProcessors, wholeNode=wholeNode, mpTag=mpTag, jobDesc=jobDesc, log=self.log, logLevel=logLevel, inputs=inputs, ) submissionResult = S_OK("Job submitted") if submission["OK"]: batchID = submission["Value"] self.log.info("Job submitted", "(DIRAC JobID: %s; Batch ID: %s" % (jobID, batchID)) if "PayloadFailed" in submission: submissionResult["PayloadFailed"] = submission["PayloadFailed"] time.sleep(self.jobSubmissionDelay) else: self.log.error("Job submission failed", jobID) jobReport.setJobParameter(par_name="ErrorMessage", par_value="%s CE Submission Error" % (self.ceName), sendFlag=False) if "ReschedulePayload" in submission: result = self._rescheduleFailedJob(jobID, submission["Message"]) self._finish(result["Message"], self.stopOnApplicationFailure) return S_OK() # Without this, the job is marked as failed else: if "Value" in submission: # yes, it's "correct", S_ERROR with 'Value' key self.log.error( "Error in DIRAC JobWrapper or inner CE execution:", "exit code = %s" % (str(submission["Value"])), ) self.log.error("CE Error", "%s : %s" % (self.ceName, submission["Message"])) submissionResult = submission return submissionResult ############################################################################# def _getJDLParameters(self, jdl): """Returns a dictionary of JDL parameters.""" try: parameters = {} # print jdl if not re.search(r"\[", jdl): jdl = "[" + jdl + "]" classAdJob = ClassAd(jdl) paramsDict = classAdJob.contents for param, value in paramsDict.items(): if value.strip().startswith("{"): self.log.debug("Found list type parameter %s" % (param)) rawValues = value.replace("{", "").replace("}", "").replace( '"', "").split() valueList = [] for val in rawValues: if re.search(",$", val): valueList.append(val[:-1]) else: valueList.append(val) parameters[param] = valueList else: parameters[param] = value.replace('"', "").replace( "{", '"{').replace("}", '}"') self.log.debug("Found standard parameter %s: %s" % (param, parameters[param])) return S_OK(parameters) except Exception as x: self.log.exception(lException=x) return S_ERROR("Exception while extracting JDL parameters for job") def _extractValuesFromJobParams(self, params, jobReport): """Extract values related to the job from the job parameter dictionary""" submissionDict = {} submissionDict["jobID"] = params.get("JobID") if not submissionDict["jobID"]: msg = "Job has not JobID defined in JDL parameters" jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus=msg) self.log.warn(msg) return S_ERROR("JDL Problem") submissionDict["jobType"] = params.get("JobType", "Unknown") if submissionDict["jobType"] == "Unknown": self.log.warn("Job has no JobType defined in JDL parameters") if "CPUTime" not in params: self.log.warn( "Job has no CPU requirement defined in JDL parameters") # Job requirements for determining the number of processors # the minimum number of processors requested submissionDict["processors"] = int( params.get("NumberOfProcessors", int(params.get("MinNumberOfProcessors", 1)))) # the maximum number of processors allowed to the payload submissionDict["maxNumberOfProcessors"] = int( params.get("MaxNumberOfProcessors", 0)) # need or not the whole node for the job submissionDict["wholeNode"] = "WholeNode" in params submissionDict["mpTag"] = "MultiProcessor" in params.get("Tags", []) if self.extraOptions and "dirac-jobexec" in params.get( "Executable", "").strip(): params["Arguments"] = (params.get("Arguments", "") + " " + self.extraOptions).strip() params["ExtraOptions"] = self.extraOptions return S_OK(submissionDict) ############################################################################# def _finish(self, message, stop=True): """Force the JobAgent to complete gracefully.""" if stop: self.log.info("JobAgent will stop", 'with message "%s", execution complete.' % message) self.am_stopExecution() return S_ERROR(message) return S_OK(message) ############################################################################# def _rescheduleFailedJob(self, jobID, message, direct=False): """ Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager """ self.log.warn("Failure ==> rescheduling", "(during %s)" % (message)) if direct: JobStateUpdateClient().setJobStatus(int(jobID), status=JobStatus.RESCHEDULED, applicationStatus=message, source="JobAgent@%s", force=True) else: jobReport = JobReport(int(jobID), "JobAgent@%s" % self.siteName) # Setting a job parameter does not help since the job will be rescheduled, # instead set the status with the cause and then another status showing the # reschedule operation. jobReport.setJobStatus(status=JobStatus.RESCHEDULED, applicationStatus=message, sendFlag=True) self.log.info("Job will be rescheduled") result = JobManagerClient().rescheduleJob(jobID) if not result["OK"]: self.log.error("Failed to reschedule job", result["Message"]) return S_ERROR("Problem Rescheduling Job") self.log.info("Job Rescheduled", jobID) return S_ERROR("Job Rescheduled") ############################################################################# def _sendFailoverRequest(self, request): """Send failover reques per Job. This request would basically be a DISET request for setting the job status. If this fails, it only prints a message. :param Request request: Request() object :return: None """ if len(request): self.log.info("Trying to send the failover request") # The request is ready, send it now isValid = RequestValidator().validate(request) if not isValid["OK"]: self.log.error("Failover request is not valid", isValid["Message"]) self.log.error("Printing out the content of the request") reqToJSON = request.toJSON() if reqToJSON["OK"]: print(str(reqToJSON["Value"])) else: self.log.error( "Something went wrong creating the JSON from request", reqToJSON["Message"]) else: # Now trying to send the request requestClient = ReqClient() result = requestClient.putRequest(request) if not result["OK"]: self.log.error("Failed to set failover request", result["Message"]) def finalize(self): """Job Agent finalization method""" # wait for all jobs to be completed res = self.computingElement.shutdown() if not res["OK"]: self.log.error("CE could not be properly shut down", res["Message"]) elif res["Value"]: self.log.info("Job submission(s) result", res["Value"]) gridCE = gConfig.getValue("/LocalSite/GridCE", "") queue = gConfig.getValue("/LocalSite/CEQueue", "") result = PilotManagerClient().setPilotStatus(str(self.pilotReference), PilotStatus.DONE, gridCE, "Report from JobAgent", self.siteName, queue) if not result["OK"]: self.log.warn("Issue setting the pilot status", result["Message"]) return S_OK()
def initialize(self, loops=0): """Sets default parameters and creates CE instance """ # Disable monitoring, logLevel INFO, limited cycles self.am_setOption('MonitoringEnabled', False) self.am_setOption('MaxCycles', loops) ceType = self.am_getOption('CEType', self.ceName) localCE = gConfig.getValue('/LocalSite/LocalCE', '') if localCE: self.log.info('Defining CE from local configuration', '= %s' % localCE) ceType = localCE # Create backend Computing Element ceFactory = ComputingElementFactory() self.ceName = ceType ceInstance = ceFactory.getCE(ceType) if not ceInstance['OK']: self.log.warn("Can't instantiate a CE", ceInstance['Message']) return ceInstance self.computingElement = ceInstance['Value'] result = self.computingElement.getDescription() if not result['OK']: self.log.warn("Can not get the CE description") return result if isinstance(result['Value'], list): ceDict = result['Value'][0] else: ceDict = result['Value'] self.timeLeft = ceDict.get('CPUTime', self.timeLeft) self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft) self.initTimes = os.times() # Localsite options self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName) self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', self.pilotReference) self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', self.defaultProxyLength) # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', self.cpuFactor) self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', self.jobSubmissionDelay) self.fillingMode = self.am_getOption('FillingModeFlag', self.fillingMode) self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', self.minimumTimeLeft) self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', self.stopOnApplicationFailure) self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches) self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', self.extraOptions) # Timeleft self.timeLeftUtil = TimeLeft() return S_OK()
class JobAgent(AgentModule): """ This agent is what runs in a worker node. The pilot runs it, after having prepared its configuration. """ def __init__(self, agentName, loadName, baseAgentName=False, properties=None): """ Just defines some default parameters """ if not properties: properties = {} super(JobAgent, self).__init__(agentName, loadName, baseAgentName, properties) self.ceName = 'InProcess' self.computingElement = None self.timeLeft = 0.0 self.initTimes = os.times() # Localsite options self.siteName = 'Unknown' self.pilotReference = 'Unknown' self.defaultProxyLength = 86400 * 5 # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = 0.0 self.jobSubmissionDelay = 10 self.fillingMode = False self.minimumTimeLeft = 1000 self.stopOnApplicationFailure = True self.stopAfterFailedMatches = 10 self.jobCount = 0 self.matchFailedCount = 0 self.extraOptions = '' # Timeleft self.timeLeftUtil = None self.timeLeftError = '' self.pilotInfoReportedFlag = False ############################################################################# def initialize(self, loops=0): """Sets default parameters and creates CE instance """ # Disable monitoring, logLevel INFO, limited cycles self.am_setOption('MonitoringEnabled', False) self.am_setOption('MaxCycles', loops) ceType = self.am_getOption('CEType', self.ceName) localCE = gConfig.getValue('/LocalSite/LocalCE', '') if localCE: self.log.info('Defining CE from local configuration', '= %s' % localCE) ceType = localCE # Create backend Computing Element ceFactory = ComputingElementFactory() self.ceName = ceType ceInstance = ceFactory.getCE(ceType) if not ceInstance['OK']: self.log.warn("Can't instantiate a CE", ceInstance['Message']) return ceInstance self.computingElement = ceInstance['Value'] result = self.computingElement.getDescription() if not result['OK']: self.log.warn("Can not get the CE description") return result if isinstance(result['Value'], list): ceDict = result['Value'][0] else: ceDict = result['Value'] self.timeLeft = ceDict.get('CPUTime', self.timeLeft) self.timeLeft = gConfig.getValue( '/Resources/Computing/CEDefaults/MaxCPUTime', self.timeLeft) self.initTimes = os.times() # Localsite options self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName) self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', self.pilotReference) self.defaultProxyLength = gConfig.getValue( '/Registry/DefaultProxyLifeTime', self.defaultProxyLength) # Agent options # This is the factor to convert raw CPU to Normalized units (based on the CPU Model) self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', self.cpuFactor) self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', self.jobSubmissionDelay) self.fillingMode = self.am_getOption('FillingModeFlag', self.fillingMode) self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', self.minimumTimeLeft) self.stopOnApplicationFailure = self.am_getOption( 'StopOnApplicationFailure', self.stopOnApplicationFailure) self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches) self.extraOptions = gConfig.getValue( '/AgentJobRequirements/ExtraOptions', self.extraOptions) # Timeleft self.timeLeftUtil = TimeLeft() return S_OK() ############################################################################# def execute(self): """The JobAgent execution method. """ if self.jobCount: # Temporary mechanism to pass a shutdown message to the agent if os.path.exists('/var/lib/dirac_drain'): return self.__finish('Node is being drained by an operator') # Only call timeLeft utility after a job has been picked up self.log.info('Attempting to check CPU time left for filling mode') if self.fillingMode: if self.timeLeftError: self.log.warn( "Disabling filling mode as errors calculating time left", self.timeLeftError) return self.__finish(self.timeLeftError) self.log.info('normalized CPU units remaining in slot', self.timeLeft) if self.timeLeft <= self.minimumTimeLeft: return self.__finish('No more time left') # Need to update the Configuration so that the new value is published in the next matching request result = self.computingElement.setCPUTimeLeft( cpuTimeLeft=self.timeLeft) if not result['OK']: return self.__finish(result['Message']) # Update local configuration to be used by submitted job wrappers localCfg = CFG() if self.extraOptions: localConfigFile = os.path.join('.', self.extraOptions) else: localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg") localCfg.loadFromFile(localConfigFile) if not localCfg.isSection('/LocalSite'): localCfg.createNewSection('/LocalSite') localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft) localCfg.writeToFile(localConfigFile) else: return self.__finish('Filling Mode is Disabled') self.log.verbose('Job Agent execution loop') result = self.computingElement.available() if not result['OK']: self.log.info('Resource is not available', result['Message']) return self.__finish('CE Not Available') ceInfoDict = result['CEInfoDict'] runningJobs = ceInfoDict.get("RunningJobs") availableSlots = result['Value'] if not availableSlots: if runningJobs: self.log.info('No available slots', '%d running jobs' % runningJobs) return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs) else: self.log.info('CE is not available') return self.__finish('CE Not Available') result = self.computingElement.getDescription() if not result['OK']: return result # We can have several prioritized job retrieval strategies if isinstance(result['Value'], dict): ceDictList = [result['Value']] elif isinstance(result['Value'], list): # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy' ceDictList = result['Value'] for ceDict in ceDictList: # Add pilot information gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown') if gridCE != 'Unknown': ceDict['GridCE'] = gridCE if 'PilotReference' not in ceDict: ceDict['PilotReference'] = str(self.pilotReference) ceDict['PilotBenchmark'] = self.cpuFactor ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag # Add possible job requirements result = gConfig.getOptionsDict('/AgentJobRequirements') if result['OK']: requirementsDict = result['Value'] ceDict.update(requirementsDict) self.log.info('Requirements:', requirementsDict) self.log.verbose('CE dict', ceDict) # here finally calling the matcher start = time.time() jobRequest = MatcherClient().requestJob(ceDict) matchTime = time.time() - start self.log.info('MatcherTime', '= %.2f (s)' % (matchTime)) if jobRequest['OK']: break self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches) if not jobRequest['OK']: if re.search('No match found', jobRequest['Message']): self.log.notice('Job request OK, but no match found', ': %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find("seconds timeout") != -1: self.log.error('Timeout while requesting job', jobRequest['Message']) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) elif jobRequest['Message'].find( "Pilot version does not match") != -1: errorMsg = 'Pilot version does not match the production version' self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, '')) return S_ERROR(jobRequest['Message']) else: self.log.notice('Failed to get jobs', ': %s' % (jobRequest['Message'])) self.matchFailedCount += 1 if self.matchFailedCount > self.stopAfterFailedMatches: return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches) return S_OK(jobRequest['Message']) # Reset the Counter self.matchFailedCount = 0 matcherInfo = jobRequest['Value'] if not self.pilotInfoReportedFlag: # Check the flag after the first access to the Matcher self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False) jobID = matcherInfo['JobID'] matcherParams = ['JDL', 'DN', 'Group'] for param in matcherParams: if param not in matcherInfo: self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param)) return self.__finish('Matcher Failed') elif not matcherInfo[param]: self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param)) return self.__finish('Matcher Failed') else: self.log.verbose('Matcher returned', '%s = %s ' % (param, matcherInfo[param])) jobJDL = matcherInfo['JDL'] jobGroup = matcherInfo['Group'] ownerDN = matcherInfo['DN'] optimizerParams = {} for key in matcherInfo: if key not in matcherParams: optimizerParams[key] = matcherInfo[key] parameters = self._getJDLParameters(jobJDL) if not parameters['OK']: self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters') self.log.warn('Could Not Extract JDL Parameters', parameters['Message']) return self.__finish('JDL Problem') params = parameters['Value'] if 'JobID' not in params: msg = 'Job has not JobID defined in JDL parameters' self.__report(jobID, 'Failed', msg) self.log.warn(msg) return self.__finish('JDL Problem') else: jobID = params['JobID'] if 'JobType' not in params: self.log.warn('Job has no JobType defined in JDL parameters') jobType = 'Unknown' else: jobType = params['JobType'] if 'CPUTime' not in params: self.log.warn( 'Job has no CPU requirement defined in JDL parameters') # Job requirements for determining the number of processors # the minimum number of processors requested processors = int( params.get('NumberOfProcessors', int(params.get('MinNumberOfProcessors', 1)))) # the maximum number of processors allowed to the payload maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0)) # need or not the whole node for the job wholeNode = 'WholeNode' in params mpTag = 'MultiProcessor' in params.get('Tags', []) if self.extraOptions: params['Arguments'] += ' ' + self.extraOptions params['ExtraOptions'] = self.extraOptions self.log.verbose('Job request successful: \n', jobRequest['Value']) self.log.info( 'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' % (jobID, jobType, ownerDN, jobGroup)) self.jobCount += 1 try: jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName) jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False) if 'BOINC_JOB_ID' in os.environ: # Report BOINC environment for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'): jobReport.setJobParameter(thisp, gConfig.getValue( '/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False) jobReport.setJobStatus('Matched', 'Job Received by Agent') result = self._setupProxy(ownerDN, jobGroup) if not result['OK']: return self._rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure) proxyChain = result.get('Value') # Save the job jdl for external monitoring self.__saveJobJDLRequest(jobID, jobJDL) software = self._checkInstallSoftware(jobID, params, ceDict) if not software['OK']: self.log.error('Failed to install software for job', '%s' % (jobID)) errorMsg = software['Message'] if not errorMsg: errorMsg = 'Failed software installation' return self._rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure) self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName)) result = self._submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode, maxNumberOfProcessors, mpTag) if not result['OK']: self.__report(jobID, 'Failed', result['Message']) return self.__finish(result['Message']) elif 'PayloadFailed' in result: # Do not keep running and do not overwrite the Payload error message = 'Payload execution failed with error code %s' % result[ 'PayloadFailed'] if self.stopOnApplicationFailure: return self.__finish(message, self.stopOnApplicationFailure) else: self.log.info(message) self.log.debug('After %sCE submitJob()' % (self.ceName)) except Exception as subExcept: # pylint: disable=broad-except self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True) return self._rescheduleFailedJob( jobID, 'Job processing failed with exception', self.stopOnApplicationFailure) # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?) cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1]) result = self.timeLeftUtil.getTimeLeft(cpuTime, processors) if result['OK']: self.timeLeft = result['Value'] else: if result['Message'] != 'Current batch system is not supported': self.timeLeftError = result['Message'] else: # if the batch system is not defined, use the process time and the CPU normalization defined locally self.timeLeft = self._getCPUTimeLeft() return S_OK('Job Agent cycle complete') ############################################################################# def __saveJobJDLRequest(self, jobID, jobJDL): """Save job JDL local to JobAgent. """ classAdJob = ClassAd(jobJDL) classAdJob.insertAttributeString('LocalCE', self.ceName) jdlFileName = jobID + '.jdl' jdlFile = open(jdlFileName, 'w') jdl = classAdJob.asJDL() jdlFile.write(jdl) jdlFile.close() ############################################################################# def _getCPUTimeLeft(self): """Return the TimeLeft as estimated by DIRAC using the Normalization Factor in the Local Config. """ cpuTime = sum(os.times()[:-1]) self.log.info('Current raw CPU time consumed is %s' % cpuTime) timeleft = self.timeLeft if self.cpuFactor: timeleft -= cpuTime * self.cpuFactor return timeleft ############################################################################# def _setupProxy(self, ownerDN, ownerGroup): """ Retrieve a proxy for the execution of the job """ if gConfig.getValue('/DIRAC/Security/UseServerCertificate', False): proxyResult = self._requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Failed to setup proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) return S_OK(proxyResult['Value']) else: ret = getProxyInfo(disableVOMS=True) if not ret['OK']: self.log.error('Invalid Proxy', ret['Message']) return S_ERROR('Invalid Proxy') proxyChain = ret['Value']['chain'] if 'groupProperties' not in ret['Value']: print(ret['Value']) print(proxyChain.dumpAllToString()) self.log.error('Invalid Proxy', 'Group has no properties defined') return S_ERROR('Proxy has no group properties defined') groupProps = ret['Value']['groupProperties'] if Properties.GENERIC_PILOT in groupProps or Properties.PILOT in groupProps: proxyResult = self._requestProxyFromProxyManager( ownerDN, ownerGroup) if not proxyResult['OK']: self.log.error('Invalid Proxy', proxyResult['Message']) return S_ERROR('Failed to setup proxy: %s' % proxyResult['Message']) proxyChain = proxyResult['Value'] return S_OK(proxyChain) ############################################################################# def _requestProxyFromProxyManager(self, ownerDN, ownerGroup): """Retrieves user proxy with correct role for job and sets up environment to run job locally. """ self.log.info("Requesting proxy', 'for %s@%s" % (ownerDN, ownerGroup)) token = gConfig.getValue("/Security/ProxyToken", "") if not token: self.log.info( "No token defined. Trying to download proxy without token") token = False retVal = gProxyManager.getPayloadProxyFromDIRACGroup( ownerDN, ownerGroup, self.defaultProxyLength, token) if not retVal['OK']: self.log.error('Could not retrieve payload proxy', retVal['Message']) os.system('dirac-proxy-info') sys.stdout.flush() return S_ERROR('Error retrieving proxy') chain = retVal['Value'] return S_OK(chain) ############################################################################# def _checkInstallSoftware(self, jobID, jobParams, resourceParams): """Checks software requirement of job and whether this is already present before installing software locally. """ if 'SoftwareDistModule' not in jobParams: msg = 'Job has no software installation requirement' self.log.verbose(msg) return S_OK(msg) self.__report(jobID, 'Matched', 'Installing Software') softwareDist = jobParams['SoftwareDistModule'] self.log.verbose('Found VO Software Distribution module', ': %s' % (softwareDist)) argumentsDict = {'Job': jobParams, 'CE': resourceParams} moduleFactory = ModuleFactory() moduleInstance = moduleFactory.getModule(softwareDist, argumentsDict) if not moduleInstance['OK']: return moduleInstance module = moduleInstance['Value'] return module.execute() ############################################################################# def _submitJob(self, jobID, jobParams, resourceParams, optimizerParams, proxyChain, processors=1, wholeNode=False, maxNumberOfProcessors=0, mpTag=False): """ Submit job to the Computing Element instance after creating a custom Job Wrapper with the available job parameters. """ logLevel = self.am_getOption('DefaultLogLevel', 'INFO') defaultWrapperLocation = self.am_getOption( 'JobWrapperTemplate', 'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py') jobDesc = { "jobID": jobID, "jobParams": jobParams, "resourceParams": resourceParams, "optimizerParams": optimizerParams, "extraOptions": self.extraOptions, "defaultWrapperLocation": defaultWrapperLocation } result = createJobWrapper(log=self.log, logLevel=logLevel, **jobDesc) if not result['OK']: return result wrapperFile = result['Value'] self.__report(jobID, 'Matched', 'Submitted To CE') self.log.info( 'Submitting JobWrapper', '%s to %sCE' % (os.path.basename(wrapperFile), self.ceName)) # Pass proxy to the CE proxy = proxyChain.dumpAllToString() if not proxy['OK']: self.log.error('Invalid proxy', proxy) return S_ERROR('Payload Proxy Not Found') payloadProxy = proxy['Value'] submission = self.computingElement.submitJob( wrapperFile, payloadProxy, numberOfProcessors=processors, maxNumberOfProcessors=maxNumberOfProcessors, wholeNode=wholeNode, mpTag=mpTag, jobDesc=jobDesc, log=self.log, logLevel=logLevel) ret = S_OK('Job submitted') if submission['OK']: batchID = submission['Value'] self.log.info('Job submitted', '%s as %s' % (jobID, batchID)) if 'PayloadFailed' in submission: ret['PayloadFailed'] = submission['PayloadFailed'] return ret time.sleep(self.jobSubmissionDelay) else: self.log.error('Job submission failed', jobID) self.__setJobParam(jobID, 'ErrorMessage', '%s CE Submission Error' % (self.ceName)) if 'ReschedulePayload' in submission: rescheduleFailedJob(jobID, submission['Message']) return S_OK() # Without this, the job is marked as failed else: if 'Value' in submission: self.log.error( 'Error in DIRAC JobWrapper:', 'exit code = %s' % (str(submission['Value']))) return S_ERROR('%s CE Error: %s' % (self.ceName, submission['Message'])) return ret ############################################################################# def _getJDLParameters(self, jdl): """Returns a dictionary of JDL parameters. """ try: parameters = {} # print jdl if not re.search(r'\[', jdl): jdl = '[' + jdl + ']' classAdJob = ClassAd(jdl) paramsDict = classAdJob.contents for param, value in paramsDict.items(): if value.strip().startswith('{'): self.log.debug('Found list type parameter %s' % (param)) rawValues = value.replace('{', '').replace('}', '').replace( '"', '').split() valueList = [] for val in rawValues: if re.search(',$', val): valueList.append(val[:-1]) else: valueList.append(val) parameters[param] = valueList else: parameters[param] = value.replace('"', '').replace( '{', '"{').replace('}', '}"') self.log.debug('Found standard parameter %s: %s' % (param, parameters[param])) return S_OK(parameters) except Exception as x: self.log.exception(lException=x) return S_ERROR('Exception while extracting JDL parameters for job') ############################################################################# def __report(self, jobID, status, minorStatus): """Wraps around setJobStatus of state update client """ jobStatus = JobStateUpdateClient().setJobStatus( int(jobID), status, minorStatus, 'JobAgent@%s' % self.siteName) self.log.verbose( 'Setting job status', 'setJobStatus(%s,%s,%s,%s)' % (jobID, status, minorStatus, 'JobAgent@%s' % self.siteName)) if not jobStatus['OK']: self.log.warn('Issue setting the job status', jobStatus['Message']) return jobStatus ############################################################################# def __setJobParam(self, jobID, name, value): """Wraps around setJobParameter of state update client """ jobParam = JobStateUpdateClient().setJobParameter( int(jobID), str(name), str(value)) self.log.verbose('Setting job parameter', 'setJobParameter(%s,%s,%s)' % (jobID, name, value)) if not jobParam['OK']: self.log.warn('Issue setting the job parameter', jobParam['Message']) return jobParam ############################################################################# def __finish(self, message, stop=True): """Force the JobAgent to complete gracefully. """ if stop: self.log.info('JobAgent will stop', 'with message "%s", execution complete.' % message) self.am_stopExecution() return S_ERROR(message) else: return S_OK(message) ############################################################################# def _rescheduleFailedJob(self, jobID, message, stop=True): """ Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager """ self.log.warn('Failure ==> rescheduling', '(during %s)' % (message)) jobReport = JobReport(int(jobID), 'JobAgent@%s' % self.siteName) # Setting a job parameter does not help since the job will be rescheduled, # instead set the status with the cause and then another status showing the # reschedule operation. jobReport.setJobStatus(status='Rescheduled', application=message, sendFlag=True) self.log.info('Job will be rescheduled') result = JobManagerClient().rescheduleJob(jobID) if not result['OK']: self.log.error('Failed to reschedule job', result['Message']) return self.__finish('Problem Rescheduling Job', stop) self.log.info('Job Rescheduled', jobID) return self.__finish('Job Rescheduled', stop) ############################################################################# def finalize(self): """ Job Agent finalization method """ gridCE = gConfig.getValue('/LocalSite/GridCE', '') queue = gConfig.getValue('/LocalSite/CEQueue', '') result = PilotManagerClient().setPilotStatus(str(self.pilotReference), 'Done', gridCE, 'Report from JobAgent', self.siteName, queue) if not result['OK']: self.log.warn('Issue setting the pilot status', result['Message']) return S_OK()
class Watchdog(object): ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, systemFlag='linux', jobArgs={}): """ Constructor, takes system flag as argument. """ self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800)) # 30 minutes self.stopSigFinishSeconds = int( jobArgs.get('StopSigFinishSeconds', 1800)) # 30 minutes self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2)) # SIGINT self.stopSigRegex = jobArgs.get('StopSigRegex', None) self.stopSigSent = False self.log = gLogger.getSubLogger("Watchdog") self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUTime = jobCPUTime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.profiler = Profiler(pid) self.checkError = '' self.currentStats = {} self.initialized = False self.count = 0 # defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.wallClockCheckSeconds = 5 * 60 # 5 minutes self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.wallClockCheckCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False self.scaleFactor = 1.0 self.processors = processors ############################################################################# def initialize(self, loops=0): """ Watchdog initialization. """ if self.initialized: self.log.info('Watchdog already initialized') return S_OK() else: self.initialized = True setup = gConfig.getValue('/DIRAC/Setup', '') if not setup: return S_ERROR('Can not get the DIRAC Setup value') wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR( 'Can not get the WorkloadManagement system instance') self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance self.maxcount = loops self.log.verbose('Watchdog initialization') self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag)) # Test control flags self.testWallClock = gConfig.getValue( self.section + '/CheckWallClockFlag', 1) self.testDiskSpace = gConfig.getValue( self.section + '/CheckDiskSpaceFlag', 1) self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag', 1) self.testCPUConsumed = gConfig.getValue( self.section + '/CheckCPUConsumedFlag', 1) self.testCPULimit = gConfig.getValue( self.section + '/CheckCPULimitFlag', 0) self.testMemoryLimit = gConfig.getValue( self.section + '/CheckMemoryLimitFlag', 0) self.testTimeLeft = gConfig.getValue( self.section + '/CheckTimeLeftFlag', 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + '/PollingTime', 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + '/CheckingTime', 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue( self.section + '/MinCheckingTime', 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue( self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag', 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace', 10) # MB self.loadAvgLimit = gConfig.getValue( self.section + '/LoadAverageLimit', 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime', 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue( self.section + '/JobCPULimitMargin', 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue( self.section + '/MinCPUWallClockRatio', 5) # ratio %age # After 5 sample times return null CPU consumption kill job self.nullCPULimit = gConfig.getValue( self.section + '/NullCPUCountLimit', 5) if self.checkingTime < self.minCheckingTime: self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % (self.checkingTime, self.minCheckingTime)) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue( self.section + '/TimeLeftLimit', 150 * self.pollingTime) self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 1.0) return S_OK() def run(self): """ The main watchdog execution method """ result = self.initialize() if not result['OK']: self.log.always('Can not start watchdog for the following reason') self.log.always(result['Message']) return result try: while True: self.log.debug('Starting watchdog loop # %d' % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result['OK']: self.log.error("Watchdog error during execution", result['Message']) break elif result['Value'] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: self.log.exception() return S_ERROR('Exception') ############################################################################# def execute(self): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): self.__getUsageSummary() self.log.info( 'Process to monitor has completed, Watchdog will exit.') return S_OK("Ended") # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL if not self.stopSigSent and self.stopSigRegex is not None and ( time.time() - self.initialValues['StartTime'] ) > self.wallClockCheckSeconds * self.wallClockCheckCount: self.wallClockCheckCount += 1 self._performWallClockChecks() if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = 'Job has reached the CPU limit of the queue' self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues['StartTime'] ) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result['OK']: self.log.warn('Problem during recent checks') self.log.warn(result['Message']) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performWallClockChecks(self): """Watchdog performs the wall clock checks based on MJF. Signals are sent to processes if we need to stop, but function always returns S_OK() """ mjf = MJF.MJF() try: wallClockSecondsLeft = mjf.getWallClockSecondsLeft() except Exception as e: # Just stop if we can't get the wall clock seconds left return S_OK() jobstartSeconds = mjf.getIntJobFeature('jobstart_secs') if jobstartSeconds is None: # Just stop if we don't know when the job started return S_OK() if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and \ (wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds): # Need to send the signal! Assume it works to avoid sending the signal more than once self.log.info('Sending signal %d to JobWrapper children' % self.stopSigNumber) self.stopSigSent = True try: for childPid in getChildrenPIDs(self.wrapperPID): try: cmdline = open('/proc/%d/cmdline' % childPid, 'r').read().replace('\0', ' ').strip() except IOError: # Process gone away? Not running on Linux? Skip anyway continue if re.search(self.stopSigRegex, cmdline) is not None: self.log.info( 'Sending signal %d to process ID %d, cmdline = "%s"' % (self.stopSigNumber, childPid, cmdline)) os.kill(childPid, self.stopSigNumber) except Exception as e: self.log.error( 'Failed to send signals to JobWrapper children! (%s)' % str(e)) return S_OK() ############################################################################# def _performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose('------------------------------------') self.log.verbose('Checking loop starts for Watchdog') heartBeatDict = {} msg = '' loadAvg = float(os.getloadavg()[0]) msg += 'LoadAvg: %d ' % loadAvg heartBeatDict['LoadAverage'] = loadAvg if 'LoadAverage' not in self.parameters: self.parameters['LoadAverage'] = [] self.parameters['LoadAverage'].append(loadAvg) memoryUsed = self.getMemoryUsed() msg += 'MemUsed: %.1f kb ' % (memoryUsed) heartBeatDict['MemoryUsed'] = memoryUsed if 'MemoryUsed' not in self.parameters: self.parameters['MemoryUsed'] = [] self.parameters['MemoryUsed'].append(memoryUsed) result = self.profiler.vSizeUsage(withChildren=True) if not result['OK']: self.log.warn("Could not get vSize info from profiler", result['Message']) else: vsize = result['Value'] * 1024. heartBeatDict['Vsize'] = vsize self.parameters.setdefault('Vsize', []) self.parameters['Vsize'].append(vsize) msg += "Job Vsize: %.1f kb " % vsize result = self.profiler.memoryUsage(withChildren=True) if not result['OK']: self.log.warn("Could not get rss info from profiler", result['Message']) else: rss = result['Value'] * 1024. heartBeatDict['RSS'] = rss self.parameters.setdefault('RSS', []) self.parameters['RSS'].append(rss) msg += "Job RSS: %.1f kb " % rss if 'DiskSpace' not in self.parameters: self.parameters['DiskSpace'] = [] result = self.getDiskSpace() if not result['OK']: self.log.warn("Could not establish DiskSpace", result['Message']) else: msg += 'DiskSpace: %.1f MB ' % (result['Value']) self.parameters['DiskSpace'].append(result['Value']) heartBeatDict['AvailableDiskSpace'] = result['Value'] cpu = self.__getCPU() if not cpu['OK']: msg += 'CPU: ERROR ' hmsCPU = 0 else: cpu = cpu['Value'] msg += 'CPU: %s (h:m:s) ' % (cpu) if 'CPUConsumed' not in self.parameters: self.parameters['CPUConsumed'] = [] self.parameters['CPUConsumed'].append(cpu) hmsCPU = cpu rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: heartBeatDict['CPUConsumed'] = rawCPU['Value'] result = self.__getWallClockTime() if not result['OK']: self.log.warn("Failed determining wall clock time", result['Message']) else: msg += 'WallClock: %.2f s ' % (result['Value']) self.parameters.setdefault('WallClockTime', list()).append(result['Value']) heartBeatDict['WallClockTime'] = result['Value'] self.log.info(msg) result = self._checkProgress() if not result['OK']: self.checkError = result['Message'] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) self.log.info( 'Last %s lines of available application output:' % (size)) self.log.info('================START================') for line in outputList: self.log.info(line) self.log.info('=================END=================') self.__killRunningThread() return S_OK() recentStdOut = 'None' if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % ( size, Time.dateTime()) border = '=' * len(recentStdOut) cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % ( hmsCPU) if self.timeLeft: cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft recentStdOut = '\n%s\n%s\n%s\n%s\n' % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + '\n' else: recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread' self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn( 'Turning off job peeking for remainder of execution') if 'JOBID' not in os.environ: self.log.info( 'Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] staticParamDict = {'StandardOutput': recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK('Watchdog checking cycle complete') ############################################################################# def __getCPU(self): """Uses the profiler to get CPU time for current process, its child, and the terminated child, and returns HH:MM:SS after conversion. """ result = self.profiler.cpuUsageUser(withChildren=True, withTerminatedChildren=True) if not result['OK']: self.log.warn("Issue while checking consumed CPU for user", result['Message']) if result['Errno'] == errno.ESRCH: self.log.warn( "The main process does not exist (anymore). This might be correct." ) return result cpuUsageUser = result['Value'] result = self.profiler.cpuUsageSystem(withChildren=True, withTerminatedChildren=True) if not result['OK']: self.log.warn("Issue while checking consumed CPU for system", result['Message']) if result['Errno'] == errno.ESRCH: self.log.warn( "The main process does not exist (anymore). This might be correct." ) return result cpuUsageSystem = result['Value'] cpuTimeTotal = cpuUsageUser + cpuUsageSystem if cpuTimeTotal: self.log.verbose("Raw CPU time consumed (s) =", cpuTimeTotal) return self.__getCPUHMS(cpuTimeTotal) self.log.error("CPU time consumed found to be 0") return S_ERROR() ############################################################################# def __getCPUHMS(self, cpuTime): mins, secs = divmod(cpuTime, 60) hours, mins = divmod(mins, 60) humanTime = '%02d:%02d:%02d' % (hours, mins, secs) self.log.verbose('Human readable CPU time is: %s' % humanTime) return S_OK(humanTime) ############################################################################# def __interpretControlSignal(self, signalDict): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info('Received control signal') if isinstance(signalDict, dict): if 'Kill' in signalDict: self.log.info( 'Received Kill signal, stopping job via control signal') self.checkError = 'Received Kill signal' self.__killRunningThread() else: self.log.info( 'The following control signal was sent but not understood by the watchdog:' ) self.log.info(signalDict) else: self.log.info( 'Expected dictionary for control signal, received:\n%s' % (signalDict)) return S_OK() ############################################################################# def _checkProgress(self): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = '' if self.testWallClock: result = self.__checkWallClockTime() if not result['OK']: self.log.warn(result['Message']) return result report += 'WallClock: OK, ' else: report += 'WallClock: NA,' if self.testDiskSpace: result = self.__checkDiskSpace() if not result['OK']: self.log.warn(result['Message']) return result report += 'DiskSpace: OK, ' else: report += 'DiskSpace: NA,' if self.testLoadAvg: result = self.__checkLoadAverage() if not result['OK']: self.log.warn( "Check of load average failed, but won't fail because of that: %s" % result['Message']) report += 'LoadAverage: ERROR, ' return S_OK() report += 'LoadAverage: OK, ' else: report += 'LoadAverage: NA,' if self.testCPUConsumed: result = self.__checkCPUConsumed() if not result['OK']: return result report += 'CPUConsumed: OK, ' else: report += 'CPUConsumed: NA, ' if self.testCPULimit: result = self.__checkCPULimit() if not result['OK']: self.log.warn(result['Message']) return result report += 'CPULimit OK, ' else: report += 'CPULimit: NA, ' if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += 'TimeLeft: OK' else: report += 'TimeLeft: NA' if self.testMemoryLimit: result = self.__checkMemoryLimit() if not result['OK']: self.log.warn(result['Message']) return result report += 'MemoryLimit OK, ' else: report += 'MemoryLimit: NA, ' self.log.info(report) return S_OK('All enabled checks passed') ############################################################################# def __checkCPUConsumed(self): """ Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info("Checking CPU Consumed") if 'WallClockTime' not in self.parameters: return S_ERROR('Missing WallClockTime info') if 'CPUConsumed' not in self.parameters: return S_ERROR('Missing CPUConsumed info') wallClockTime = self.parameters['WallClockTime'][-1] if wallClockTime < self.sampleCPUTime: self.log.info( "Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % (wallClockTime, self.sampleCPUTime)) return S_OK() intervals = max(1, int(self.sampleCPUTime / self.checkingTime)) if len(self.parameters['CPUConsumed']) < intervals + 1: self.log.info( "Not enough snapshots to calculate, there are %s and we need %s" % (len(self.parameters['CPUConsumed']), intervals + 1)) return S_OK() wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters[ 'WallClockTime'][-1 - intervals] try: cpuTime = self.__convertCPUTime( self.parameters['CPUConsumed'][-1])['Value'] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime( self.parameters['CPUConsumed'][-1 - intervals])['Value'] if cpuTime < 0: self.log.warn( 'Consumed CPU time negative, something wrong may have happened, ignore' ) return S_OK() if wallClockTime <= 0: self.log.warn( 'Wallclock time should not be negative or zero, Ignore') return S_OK() ratio = (cpuTime / wallClockTime) * 100. self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK') or \ 'DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK' in os.environ: self.log.warn( 'N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload' ) return S_OK() self.log.info("Job is stalled!") return S_ERROR('Watchdog identified this job as stalled') except Exception as e: self.log.error("Cannot convert CPU consumed from string to int", str(e)) return S_OK() ############################################################################# def __convertCPUTime(self, cputime): """ Method to convert the CPU time as returned from the Watchdog instances to the equivalent DIRAC normalized CPU time to be compared to the Job CPU requirement. """ cpuValue = 0 cpuHMS = cputime.split(':') # for i in xrange( len( cpuHMS ) ): # cpuHMS[i] = cpuHMS[i].replace( '00', '0' ) try: hours = float(cpuHMS[0]) * 60 * 60 mins = float(cpuHMS[1]) * 60 secs = float(cpuHMS[2]) cpuValue = float(hours + mins + secs) except Exception as x: self.log.warn(str(x)) return S_ERROR('Could not calculate CPU time') # Normalization to be implemented normalizedCPUValue = cpuValue result = S_OK() result['Value'] = normalizedCPUValue self.log.debug('CPU value %s converted to %s' % (cputime, normalizedCPUValue)) return result ############################################################################# def __checkCPULimit(self): """ Checks that the job has consumed more than the job CPU requirement (plus a configurable margin) and kills them as necessary. """ consumedCPU = 0 if 'CPUConsumed' in self.parameters: consumedCPU = self.parameters['CPUConsumed'][-1] consumedCPUDict = self.__convertCPUTime(consumedCPU) if consumedCPUDict['OK']: currentCPU = consumedCPUDict['Value'] else: return S_OK('Not possible to determine current CPU consumed') if consumedCPU: limit = self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100) cpuConsumed = float(currentCPU) if cpuConsumed > limit: self.log.info( 'Job has consumed more than the specified CPU limit with an additional %s%% margin' % (self.jobCPUMargin)) return S_ERROR('Job has exceeded maximum CPU time limit') else: return S_OK('Job within CPU limit') elif not currentCPU: self.log.verbose('Both initial and current CPU consumed are null') return S_OK('CPU consumed is not measurable yet') else: return S_OK('Not possible to determine CPU consumed') def __checkMemoryLimit(self): """ Checks that the job memory consumption is within a limit """ vsize = 0 if 'Vsize' in self.parameters: vsize = self.parameters['Vsize'][-1] if vsize and self.memoryLimit: if vsize > self.memoryLimit: vsize = vsize # Just a warning for the moment self.log.warn( "Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit)) return S_OK() ############################################################################# def __checkDiskSpace(self): """Checks whether the CS defined minimum disk space is available. """ if 'DiskSpace' in self.parameters: availSpace = self.parameters['DiskSpace'][-1] if availSpace >= 0 and availSpace < self.minDiskSpace: self.log.info( 'Not enough local disk space for job to continue, defined in CS as %s MB' % (self.minDiskSpace)) return S_ERROR('Job has insufficient disk space to continue') else: return S_OK('Job has enough disk space available') else: return S_ERROR('Available disk space could not be established') ############################################################################# def __checkWallClockTime(self): """Checks whether the job has been running for the CS defined maximum wall clock time. """ if 'StartTime' in self.initialValues: startTime = self.initialValues['StartTime'] if time.time() - startTime > self.maxWallClockTime: self.log.info( 'Job has exceeded maximum wall clock time of %s seconds' % (self.maxWallClockTime)) return S_ERROR('Job has exceeded maximum wall clock time') else: return S_OK('Job within maximum wall clock time') else: return S_ERROR('Job start time could not be established') ############################################################################# def __checkLoadAverage(self): """Checks whether the CS defined maximum load average is exceeded. """ if 'LoadAverage' in self.parameters: loadAvg = self.parameters['LoadAverage'][-1] if loadAvg > float(self.loadAvgLimit): self.log.info( 'Maximum load average exceeded, defined in CS as %s ' % (self.loadAvgLimit)) return S_ERROR('Job exceeded maximum load average') else: return S_OK('Job running with normal load average') else: return S_ERROR('Job load average not established') ############################################################################# def __peek(self): """ Uses ExecutionThread.getOutput() method to obtain standard output from running thread via subprocess callback function. """ result = self.exeThread.getOutput() if not result['OK']: self.log.warn( 'Could not obtain output from running application thread') self.log.warn(result['Message']) return result ############################################################################# def calibrate(self): """ The calibrate method obtains the initial values for system memory and load and calculates the margin for error for the rest of the Watchdog cycle. """ self.__getWallClockTime() self.parameters['WallClockTime'] = [] cpuConsumed = self.__getCPU() if not cpuConsumed['OK']: self.log.warn("Could not establish CPU consumed, setting to 0.0") cpuConsumed = 0.0 else: cpuConsumed = cpuConsumed['Value'] self.initialValues['CPUConsumed'] = cpuConsumed self.parameters['CPUConsumed'] = [] self.initialValues['LoadAverage'] = float(os.getloadavg()[0]) self.parameters['LoadAverage'] = [] memUsed = self.getMemoryUsed() self.initialValues['MemoryUsed'] = memUsed self.parameters['MemoryUsed'] = [] result = self.profiler.vSizeUsage(withChildren=True) if not result['OK']: self.log.warn("Could not get vSize info from profiler", result['Message']) else: vsize = result['Value'] * 1024. self.initialValues['Vsize'] = vsize self.log.verbose("Vsize(kb)", "%.1f" % vsize) self.parameters['Vsize'] = [] result = self.profiler.memoryUsage(withChildren=True) if not result['OK']: self.log.warn("Could not get rss info from profiler", result['Message']) else: rss = result['Value'] * 1024. self.initialValues['RSS'] = rss self.log.verbose("RSS(kb)", "%.1f" % rss) self.parameters['RSS'] = [] result = self.getDiskSpace() self.log.verbose('DiskSpace', result) if not result['OK']: self.log.warn("Could not establish DiskSpace") else: self.initialValues['DiskSpace'] = result['Value'] self.parameters['DiskSpace'] = [] result = self.getNodeInformation() self.log.verbose('NodeInfo', result) if not result['OK']: self.log.warn("Could not establish static system information") if 'LSB_JOBID' in os.environ: result['LocalJobID'] = os.environ['LSB_JOBID'] if 'PBS_JOBID' in os.environ: result['LocalJobID'] = os.environ['PBS_JOBID'] if 'QSUB_REQNAME' in os.environ: result['LocalJobID'] = os.environ['QSUB_REQNAME'] if 'JOB_ID' in os.environ: result['LocalJobID'] = os.environ['JOB_ID'] self.__reportParameters(result, 'NodeInformation', True) self.__reportParameters(self.initialValues, 'InitialValues') return S_OK() def __timeLeft(self): """ return Normalized CPU time left in the batch system 0 if not available update self.timeLeft and self.littleTimeLeft """ # Get CPU time left in the batch system result = self.timeLeftUtil.getTimeLeft(0.0) if not result['OK']: # Could not get CPU time left, we might need to wait for the first loop # or the Utility is not working properly for this batch system # or we are in a batch system timeLeft = 0 else: timeLeft = result['Value'] self.timeLeft = timeLeft if not self.littleTimeLeft: if timeLeft and timeLeft < self.grossTimeLeftLimit: self.log.info( 'TimeLeft bellow %s, now checking with higher frequency' % timeLeft) self.littleTimeLeft = True # TODO: better configurable way of doing this to be coded self.littleTimeLeftCount = 15 else: if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit: timeLeft = -1 return timeLeft ############################################################################# def __getUsageSummary(self): """ Returns average load, memory etc. over execution of job thread """ summary = {} # CPUConsumed if 'CPUConsumed' in self.parameters: cpuList = self.parameters['CPUConsumed'] if cpuList: hmsCPU = cpuList[-1] rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: summary['LastUpdateCPU(s)'] = rawCPU['Value'] else: summary['LastUpdateCPU(s)'] = 'Could not be estimated' # DiskSpace if 'DiskSpace' in self.parameters: space = self.parameters['DiskSpace'] if space: value = abs( float(space[-1]) - float(self.initialValues['DiskSpace'])) if value < 0: value = 0 summary['DiskSpace(MB)'] = value else: summary['DiskSpace(MB)'] = 'Could not be estimated' # MemoryUsed if 'MemoryUsed' in self.parameters: memory = self.parameters['MemoryUsed'] if memory: summary['MemoryUsed(kb)'] = abs( float(memory[-1]) - float(self.initialValues['MemoryUsed'])) else: summary['MemoryUsed(kb)'] = 'Could not be estimated' # LoadAverage if 'LoadAverage' in self.parameters: laList = self.parameters['LoadAverage'] if laList: summary['LoadAverage'] = float(sum(laList)) / float( len(laList)) else: summary['LoadAverage'] = 'Could not be estimated' result = self.__getWallClockTime() if not result['OK']: self.log.warn("Failed determining wall clock time", result['Message']) summary['WallClockTime(s)'] = 0 summary['ScaledCPUTime(s)'] = 0 else: wallClock = result['Value'] summary['WallClockTime(s)'] = wallClock summary[ 'ScaledCPUTime(s)'] = wallClock * self.scaleFactor * self.processors self.__reportParameters(summary, 'UsageSummary', True) self.currentStats = summary ############################################################################# def __reportParameters(self, params, title=None, report=False): """Will report parameters for job. """ try: parameters = [] self.log.info( '==========================================================') if title: self.log.info('Watchdog will report %s' % (title)) else: self.log.info('Watchdog will report parameters') self.log.info( '==========================================================') vals = params if 'Value' in params: if vals['Value']: vals = params['Value'] for k, v in vals.items(): if v: self.log.info(str(k) + ' = ' + str(v)) parameters.append([k, v]) if report: self.__setJobParamList(parameters) self.log.info( '==========================================================') except Exception as x: self.log.warn('Problem while reporting parameters') self.log.warn(str(x)) ############################################################################# def __getWallClockTime(self): """ Establishes the Wall Clock time spent since the Watchdog initialization""" result = S_OK() if 'StartTime' in self.initialValues: currentTime = time.time() wallClock = currentTime - self.initialValues['StartTime'] result['Value'] = wallClock else: self.initialValues['StartTime'] = time.time() result['Value'] = 0.0 return result ############################################################################# def __killRunningThread(self): """ Will kill the running thread process and any child processes.""" self.log.info('Sending kill signal to application PID %s' % (self.spObject.getChildPID())) result = self.spObject.killChild() self.applicationKilled = True self.log.info('Subprocess.killChild() returned:%s ' % (result)) return S_OK('Thread killed') ############################################################################# def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict): """ Sends sign of life 'heartbeat' signal and triggers control signal interpretation. """ result = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict) if not result['OK']: self.log.warn('Problem sending sign of life') self.log.warn(result) if result['OK'] and result['Value']: self.__interpretControlSignal(result['Value']) return result ############################################################################# def __setJobParamList(self, value): """Wraps around setJobParameters of state update client """ # job wrapper template sets the jobID variable if 'JOBID' not in os.environ: self.log.info( 'Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value) self.log.verbose('setJobParameters(%s,%s)' % (jobID, value)) if not jobParam['OK']: self.log.warn(jobParam['Message']) return jobParam ############################################################################# def getNodeInformation(self): """ Attempts to retrieve all static system information, should be overridden in a subclass""" methodName = 'getNodeInformation' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass') ############################################################################# def getMemoryUsed(self): """Obtains the memory used. """ mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss return float(mem) ############################################################################# def getDiskSpace(self): """ Attempts to get the available disk space, should be overridden in a subclass""" methodName = 'getDiskSpace' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass')
def getCPUTime(cpuNormalizationFactor): """Trying to get CPUTime left for execution (in seconds). It will first look to get the work left looking for batch system information useing the TimeLeft utility. If it succeeds, it will convert it in real second, and return it. If it fails, it tries to get it from the static info found in CS. If it fails, it returns the default, which is a large 9999999, that we may consider as "Infinite". This is a generic method, independent from the middleware of the resource if TimeLeft doesn't return a value args: cpuNormalizationFactor (float): the CPU power of the current Worker Node. If not passed in, it's get from the local configuration returns: cpuTimeLeft (int): the CPU time left, in seconds """ cpuTimeLeft = 0.0 cpuWorkLeft = gConfig.getValue("/LocalSite/CPUTimeLeft", 0) if not cpuWorkLeft: # Try and get the information from the CPU left utility result = TimeLeft().getTimeLeft() if result["OK"]: cpuWorkLeft = result["Value"] if cpuWorkLeft > 0: # This is in HS06sseconds # We need to convert in real seconds if not cpuNormalizationFactor: # if cpuNormalizationFactor passed in is 0, try get it from the local cfg cpuNormalizationFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0) if cpuNormalizationFactor: cpuTimeLeft = cpuWorkLeft / cpuNormalizationFactor if not cpuTimeLeft: # now we know that we have to find the CPUTimeLeft by looking in the CS # this is not granted to be correct as the CS units may not be real seconds gridCE = gConfig.getValue("/LocalSite/GridCE") ceQueue = gConfig.getValue("/LocalSite/CEQueue") if not ceQueue: # we have to look for a ceQueue in the CS # A bit hacky. We should better profit from something generic gLogger.warn("No CEQueue in local configuration, looking to find one in CS") siteName = DIRAC.siteName() queueSection = "/Resources/Sites/%s/%s/CEs/%s/Queues" % (siteName.split(".")[0], siteName, gridCE) res = gConfig.getSections(queueSection) if not res["OK"]: raise RuntimeError(res["Message"]) queues = res["Value"] cpuTimes = [gConfig.getValue(queueSection + "/" + queue + "/maxCPUTime", 9999999.0) for queue in queues] # These are (real, wall clock) minutes - damn BDII! cpuTimeLeft = min(cpuTimes) * 60 else: queueInfo = getQueueInfo("%s/%s" % (gridCE, ceQueue)) cpuTimeLeft = 9999999.0 if not queueInfo["OK"] or not queueInfo["Value"]: gLogger.warn("Can't find a CE/queue, defaulting CPUTime to %d" % cpuTimeLeft) else: queueCSSection = queueInfo["Value"]["QueueCSSection"] # These are (real, wall clock) minutes - damn BDII! cpuTimeInMinutes = gConfig.getValue("%s/maxCPUTime" % queueCSSection, 0.0) if cpuTimeInMinutes: cpuTimeLeft = cpuTimeInMinutes * 60.0 gLogger.info("CPUTime for %s: %f" % (queueCSSection, cpuTimeLeft)) else: gLogger.warn( "Can't find maxCPUTime for %s, defaulting CPUTime to %f" % (queueCSSection, cpuTimeLeft) ) return int(cpuTimeLeft)