def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, systemFlag='linux', jobArgs={}): """ Constructor, takes system flag as argument. """ self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800)) # 30 minutes self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800)) # 30 minutes self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2)) # SIGINT self.stopSigRegex = jobArgs.get('StopSigRegex', None) self.stopSigSent = False self.log = gLogger.getSubLogger("Watchdog") self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUTime = jobCPUTime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.profiler = Profiler(pid) self.checkError = '' self.currentStats = {} self.initialized = False self.count = 0 # defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.wallClockCheckSeconds = 5 * 60 # 5 minutes self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.wallClockCheckCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False self.scaleFactor = 1.0 self.processors = processors
class Watchdog(object): ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, jobArgs={}): """Constructor, takes system flag as argument.""" self.stopSigStartSeconds = int(jobArgs.get("StopSigStartSeconds", 1800)) # 30 minutes self.stopSigFinishSeconds = int(jobArgs.get("StopSigFinishSeconds", 1800)) # 30 minutes self.stopSigNumber = int(jobArgs.get("StopSigNumber", 2)) # SIGINT self.stopSigRegex = jobArgs.get("StopSigRegex", None) self.stopSigSent = False self.log = gLogger.getSubLogger("Watchdog") self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUTime = jobCPUTime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.profiler = Profiler(pid) self.checkError = "" self.currentStats = {} self.initialized = False self.count = 0 # defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.wallClockCheckSeconds = 5 * 60 # 5 minutes self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.wallClockCheckCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False self.cpuPower = 1.0 self.processors = processors ############################################################################# def initialize(self): """Watchdog initialization.""" if self.initialized: self.log.info("Watchdog already initialized") return S_OK() else: self.initialized = True setup = gConfig.getValue("/DIRAC/Setup", "") if not setup: return S_ERROR("Can not get the DIRAC Setup value") wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR("Can not get the WorkloadManagement system instance") self.section = "/Systems/WorkloadManagement/%s/JobWrapper" % wms_instance self.log.verbose("Watchdog initialization") # Test control flags self.testWallClock = gConfig.getValue(self.section + "/CheckWallClockFlag", 1) self.testDiskSpace = gConfig.getValue(self.section + "/CheckDiskSpaceFlag", 1) self.testLoadAvg = gConfig.getValue(self.section + "/CheckLoadAvgFlag", 1) self.testCPUConsumed = gConfig.getValue(self.section + "/CheckCPUConsumedFlag", 1) self.testCPULimit = gConfig.getValue(self.section + "/CheckCPULimitFlag", 0) self.testMemoryLimit = gConfig.getValue(self.section + "/CheckMemoryLimitFlag", 0) self.testTimeLeft = gConfig.getValue(self.section + "/CheckTimeLeftFlag", 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + "/PollingTime", 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + "/CheckingTime", 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue(self.section + "/MinCheckingTime", 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue(self.section + "/MaxWallClockTime", 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + "/JobPeekFlag", 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + "/MinDiskSpace", 10) # MB self.loadAvgLimit = gConfig.getValue(self.section + "/LoadAverageLimit", 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + "/CPUSampleTime", 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue(self.section + "/JobCPULimitMargin", 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue(self.section + "/MinCPUWallClockRatio", 5) # ratio %age # After 5 sample times return null CPU consumption kill job self.nullCPULimit = gConfig.getValue(self.section + "/NullCPUCountLimit", 5) if self.checkingTime < self.minCheckingTime: self.log.info( "Requested CheckingTime of %s setting to %s seconds (minimum)" % (self.checkingTime, self.minCheckingTime) ) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue(self.section + "/TimeLeftLimit", 150 * self.pollingTime) self.cpuPower = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 1.0) return S_OK() def run(self): """The main watchdog execution method""" result = self.initialize() if not result["OK"]: self.log.always("Can not start watchdog for the following reason") self.log.always(result["Message"]) return result try: while True: self.log.debug("Starting watchdog loop # %d" % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result["OK"]: self.log.error("Watchdog error during execution", result["Message"]) break elif result["Value"] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: self.log.exception() return S_ERROR("Exception") ############################################################################# def execute(self): """The main agent execution method of the Watchdog.""" if not self.exeThread.is_alive(): self.__getUsageSummary() self.log.info("Process to monitor has completed, Watchdog will exit.") return S_OK("Ended") # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL if ( not self.stopSigSent and self.stopSigRegex is not None and (time.time() - self.initialValues["StartTime"]) > self.wallClockCheckSeconds * self.wallClockCheckCount ): self.wallClockCheckCount += 1 self._performWallClockChecks() if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = JobMinorStatus.JOB_EXCEEDED_CPU self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues["StartTime"]) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result["OK"]: self.log.warn("Problem during recent checks") self.log.warn(result["Message"]) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performWallClockChecks(self): """Watchdog performs the wall clock checks based on MJF. Signals are sent to processes if we need to stop, but function always returns S_OK() """ mjf = MJF.MJF() try: wallClockSecondsLeft = mjf.getWallClockSecondsLeft() except Exception as e: # Just stop if we can't get the wall clock seconds left return S_OK() jobstartSeconds = mjf.getIntJobFeature("jobstart_secs") if jobstartSeconds is None: # Just stop if we don't know when the job started return S_OK() if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and ( wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds ): # Need to send the signal! Assume it works to avoid sending the signal more than once self.log.info("Sending signal to JobWrapper children", "(%s)" % self.stopSigNumber) self.stopSigSent = True try: for childPid in getChildrenPIDs(self.wrapperPID): try: cmdline = open("/proc/%d/cmdline" % childPid, "r").read().replace("\0", " ").strip() except IOError: # Process gone away? Not running on Linux? Skip anyway continue if re.search(self.stopSigRegex, cmdline) is not None: self.log.info( 'Sending signal %d to process ID %d, cmdline = "%s"' % (self.stopSigNumber, childPid, cmdline) ) os.kill(childPid, self.stopSigNumber) except Exception as e: self.log.error("Failed to send signals to JobWrapper children!", repr(e)) return S_OK() ############################################################################# def _performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose("------------------------------------") self.log.verbose("Checking loop starts for Watchdog") heartBeatDict = {} msg = "" loadAvg = float(os.getloadavg()[0]) msg += "LoadAvg: %d " % loadAvg heartBeatDict["LoadAverage"] = loadAvg if "LoadAverage" not in self.parameters: self.parameters["LoadAverage"] = [] self.parameters["LoadAverage"].append(loadAvg) memoryUsed = self.getMemoryUsed() msg += "MemUsed: %.1f kb " % (memoryUsed) heartBeatDict["MemoryUsed"] = memoryUsed if "MemoryUsed" not in self.parameters: self.parameters["MemoryUsed"] = [] self.parameters["MemoryUsed"].append(memoryUsed) result = self.profiler.vSizeUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get vSize info from profiler", result["Message"]) else: vsize = result["Value"] * 1024.0 heartBeatDict["Vsize"] = vsize self.parameters.setdefault("Vsize", []) self.parameters["Vsize"].append(vsize) msg += "Job Vsize: %.1f kb " % vsize result = self.profiler.memoryUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get rss info from profiler", result["Message"]) else: rss = result["Value"] * 1024.0 heartBeatDict["RSS"] = rss self.parameters.setdefault("RSS", []) self.parameters["RSS"].append(rss) msg += "Job RSS: %.1f kb " % rss if "DiskSpace" not in self.parameters: self.parameters["DiskSpace"] = [] # We exclude fuse so that mountpoints can be cleaned up by automount after a period unused # (specific request from CERN batch service). result = self.getDiskSpace(exclude="fuse") if not result["OK"]: self.log.warn("Could not establish DiskSpace", result["Message"]) else: msg += "DiskSpace: %.1f MB " % (result["Value"]) self.parameters["DiskSpace"].append(result["Value"]) heartBeatDict["AvailableDiskSpace"] = result["Value"] cpu = self.__getCPU() if not cpu["OK"]: msg += "CPU: ERROR " hmsCPU = 0 else: cpu = cpu["Value"] msg += "CPU: %s (h:m:s) " % (cpu) if "CPUConsumed" not in self.parameters: self.parameters["CPUConsumed"] = [] self.parameters["CPUConsumed"].append(cpu) hmsCPU = cpu rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU["OK"]: heartBeatDict["CPUConsumed"] = rawCPU["Value"] result = self.__getWallClockTime() if not result["OK"]: self.log.warn("Failed determining wall clock time", result["Message"]) else: msg += "WallClock: %.2f s " % (result["Value"]) self.parameters.setdefault("WallClockTime", list()).append(result["Value"]) heartBeatDict["WallClockTime"] = result["Value"] * self.processors self.log.info(msg) result = self._checkProgress() if not result["OK"]: self.checkError = result["Message"] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result["OK"]: outputList = result["Value"] self.log.info("Last lines of available application output:") self.log.info("================START================") for line in outputList: self.log.info(line) self.log.info("=================END=================") self.__killRunningThread() return S_OK() recentStdOut = "None" if self.jobPeekFlag: result = self.__peek() if result["OK"]: outputList = result["Value"] size = len(outputList) recentStdOut = "Last %s lines of application output from Watchdog on %s [UTC]:" % ( size, Time.dateTime(), ) border = "=" * len(recentStdOut) cpuTotal = "Last reported CPU consumed for job is %s (h:m:s)" % (hmsCPU) if self.timeLeft: cpuTotal += ", Batch Queue Time Left %s (s @ HS06)" % self.timeLeft recentStdOut = "\n%s\n%s\n%s\n%s\n" % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + "\n" else: recentStdOut = ( "Watchdog is initializing and will attempt to obtain standard output from application thread" ) self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn("Turning off job peeking for remainder of execution") if "JOBID" not in os.environ: self.log.info("Running without JOBID so parameters will not be reported") return S_OK() jobID = os.environ["JOBID"] staticParamDict = {"StandardOutput": recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK("Watchdog checking cycle complete") ############################################################################# def __getCPU(self): """Uses the profiler to get CPU time for current process, its child, and the terminated child, and returns HH:MM:SS after conversion. """ result = self.profiler.cpuUsageUser(withChildren=True, withTerminatedChildren=True) if not result["OK"]: self.log.warn("Issue while checking consumed CPU for user", result["Message"]) if result["Errno"] == errno.ESRCH: self.log.warn("The main process does not exist (anymore). This might be correct.") return result cpuUsageUser = result["Value"] result = self.profiler.cpuUsageSystem(withChildren=True, withTerminatedChildren=True) if not result["OK"]: self.log.warn("Issue while checking consumed CPU for system", result["Message"]) if result["Errno"] == errno.ESRCH: self.log.warn("The main process does not exist (anymore). This might be correct.") return result cpuUsageSystem = result["Value"] cpuTimeTotal = cpuUsageUser + cpuUsageSystem if cpuTimeTotal: self.log.verbose("Raw CPU time consumed (s) =", cpuTimeTotal) return self.__getCPUHMS(cpuTimeTotal) self.log.error("CPU time consumed found to be 0") return S_ERROR() ############################################################################# def __getCPUHMS(self, cpuTime): mins, secs = divmod(cpuTime, 60) hours, mins = divmod(mins, 60) humanTime = "%02d:%02d:%02d" % (hours, mins, secs) self.log.verbose("Human readable CPU time is: %s" % humanTime) return S_OK(humanTime) ############################################################################# def __interpretControlSignal(self, signalDict): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info("Received control signal") if isinstance(signalDict, dict): if "Kill" in signalDict: self.log.info("Received Kill signal, stopping job via control signal") self.checkError = JobMinorStatus.RECEIVED_KILL_SIGNAL self.__killRunningThread() else: self.log.info("The following control signal was sent but not understood by the watchdog:") self.log.info(signalDict) else: self.log.info("Expected dictionary for control signal", "received:\n%s" % (signalDict)) return S_OK() ############################################################################# def _checkProgress(self): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = "" if self.testWallClock: result = self.__checkWallClockTime() if not result["OK"]: self.log.warn(result["Message"]) return result report += "WallClock: OK, " else: report += "WallClock: NA," if self.testDiskSpace: result = self.__checkDiskSpace() if not result["OK"]: self.log.warn(result["Message"]) return result report += "DiskSpace: OK, " else: report += "DiskSpace: NA," if self.testLoadAvg: result = self.__checkLoadAverage() if not result["OK"]: self.log.warn( "Check of load average failed, but won't fail because of that", ": %s" % result["Message"] ) report += "LoadAverage: ERROR, " return S_OK() report += "LoadAverage: OK, " else: report += "LoadAverage: NA," if self.testCPUConsumed: result = self.__checkCPUConsumed() if not result["OK"]: return result report += "CPUConsumed: OK, " else: report += "CPUConsumed: NA, " if self.testCPULimit: result = self.__checkCPULimit() if not result["OK"]: self.log.warn(result["Message"]) return result report += "CPULimit OK, " else: report += "CPULimit: NA, " if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += "TimeLeft: OK" else: report += "TimeLeft: NA" if self.testMemoryLimit: result = self.__checkMemoryLimit() if not result["OK"]: self.log.warn(result["Message"]) return result report += "MemoryLimit OK, " else: report += "MemoryLimit: NA, " self.log.info(report) return S_OK("All enabled checks passed") ############################################################################# def __checkCPUConsumed(self): """Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info("Checking CPU Consumed") if "WallClockTime" not in self.parameters: return S_ERROR("Missing WallClockTime info") if "CPUConsumed" not in self.parameters: return S_ERROR("Missing CPUConsumed info") wallClockTime = self.parameters["WallClockTime"][-1] if wallClockTime < self.sampleCPUTime: self.log.info( "Stopping check, wallclock time is still smaller than sample time", "(%s) < (%s)" % (wallClockTime, self.sampleCPUTime), ) return S_OK() intervals = max(1, int(self.sampleCPUTime / self.checkingTime)) if len(self.parameters["CPUConsumed"]) < intervals + 1: self.log.info( "Not enough snapshots to calculate", "there are %s and we need %s" % (len(self.parameters["CPUConsumed"]), intervals + 1), ) return S_OK() wallClockTime = self.parameters["WallClockTime"][-1] - self.parameters["WallClockTime"][-1 - intervals] try: cpuTime = self.__convertCPUTime(self.parameters["CPUConsumed"][-1])["Value"] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime(self.parameters["CPUConsumed"][-1 - intervals])["Value"] if cpuTime < 0: self.log.warn("Consumed CPU time negative, something wrong may have happened, ignore") return S_OK() if wallClockTime <= 0: self.log.warn("Wallclock time should not be negative or zero, Ignore") return S_OK() ratio = (cpuTime / wallClockTime) * 100 self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if ( os.path.exists("DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK") or "DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK" in os.environ ): self.log.warn( "N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload" ) return S_OK() self.log.info("Job is stalled!") return S_ERROR(JobMinorStatus.WATCHDOG_STALLED) except Exception as e: self.log.error("Cannot convert CPU consumed from string to int", str(e)) return S_OK() ############################################################################# def __convertCPUTime(self, cputime): """Method to convert the CPU time as returned from the Watchdog instances to the equivalent DIRAC normalized CPU time to be compared to the Job CPU requirement. """ cpuValue = 0 cpuHMS = cputime.split(":") # for i in range( len( cpuHMS ) ): # cpuHMS[i] = cpuHMS[i].replace( '00', '0' ) try: hours = float(cpuHMS[0]) * 60 * 60 mins = float(cpuHMS[1]) * 60 secs = float(cpuHMS[2]) cpuValue = float(hours + mins + secs) except Exception as x: self.log.warn(str(x)) return S_ERROR("Could not calculate CPU time") # Normalization to be implemented normalizedCPUValue = cpuValue result = S_OK() result["Value"] = normalizedCPUValue self.log.debug("CPU value %s converted to %s" % (cputime, normalizedCPUValue)) return result ############################################################################# def __checkCPULimit(self): """Checks that the job has consumed more than the job CPU requirement (plus a configurable margin) and kills them as necessary. """ consumedCPU = 0 if "CPUConsumed" in self.parameters: consumedCPU = self.parameters["CPUConsumed"][-1] consumedCPUDict = self.__convertCPUTime(consumedCPU) if consumedCPUDict["OK"]: currentCPU = consumedCPUDict["Value"] else: return S_OK("Not possible to determine current CPU consumed") if consumedCPU: limit = int(self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100)) cpuConsumed = float(currentCPU) if cpuConsumed > limit: self.log.info( "Job has consumed more than the specified CPU limit", "with an additional %s%% margin" % (self.jobCPUMargin), ) return S_ERROR("Job has exceeded maximum CPU time limit") return S_OK("Job within CPU limit") if not currentCPU: self.log.verbose("Both initial and current CPU consumed are null") return S_OK("CPU consumed is not measurable yet") return S_OK("Not possible to determine CPU consumed") def __checkMemoryLimit(self): """Checks that the job memory consumption is within a limit""" vsize = 0 if "Vsize" in self.parameters: vsize = self.parameters["Vsize"][-1] if vsize and self.memoryLimit: if vsize > self.memoryLimit: vsize = vsize # Just a warning for the moment self.log.warn( "Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit) ) return S_OK() ############################################################################# def __checkDiskSpace(self): """Checks whether the CS defined minimum disk space is available.""" if "DiskSpace" in self.parameters: availSpace = self.parameters["DiskSpace"][-1] if availSpace >= 0 and availSpace < self.minDiskSpace: self.log.info( "Not enough local disk space for job to continue, defined in CS as %s MB" % (self.minDiskSpace) ) return S_ERROR(JobMinorStatus.JOB_INSUFFICIENT_DISK) else: return S_OK("Job has enough disk space available") else: return S_ERROR("Available disk space could not be established") ############################################################################# def __checkWallClockTime(self): """Checks whether the job has been running for the CS defined maximum wall clock time. """ if "StartTime" in self.initialValues: startTime = self.initialValues["StartTime"] if time.time() - startTime > self.maxWallClockTime: self.log.info("Job has exceeded maximum wall clock time of %s seconds" % (self.maxWallClockTime)) return S_ERROR(JobMinorStatus.JOB_EXCEEDED_WALL_CLOCK) else: return S_OK("Job within maximum wall clock time") else: return S_ERROR("Job start time could not be established") ############################################################################# def __checkLoadAverage(self): """Checks whether the CS defined maximum load average is exceeded.""" if "LoadAverage" in self.parameters: loadAvg = self.parameters["LoadAverage"][-1] if loadAvg > float(self.loadAvgLimit): self.log.info("Maximum load average exceeded, defined in CS as %s " % (self.loadAvgLimit)) return S_ERROR("Job exceeded maximum load average") return S_OK("Job running with normal load average") return S_ERROR("Job load average not established") ############################################################################# def __peek(self): """Uses ExecutionThread.getOutput() method to obtain standard output from running thread via subprocess callback function. """ result = self.exeThread.getOutput() if not result["OK"]: self.log.warn("Could not obtain output from running application thread") self.log.warn(result["Message"]) return result ############################################################################# def calibrate(self): """The calibrate method obtains the initial values for system memory and load and calculates the margin for error for the rest of the Watchdog cycle. """ self.__getWallClockTime() self.parameters["WallClockTime"] = [] cpuConsumed = self.__getCPU() if not cpuConsumed["OK"]: self.log.warn("Could not establish CPU consumed, setting to 0.0") cpuConsumed = 0.0 else: cpuConsumed = cpuConsumed["Value"] self.initialValues["CPUConsumed"] = cpuConsumed self.parameters["CPUConsumed"] = [] self.initialValues["LoadAverage"] = float(os.getloadavg()[0]) self.parameters["LoadAverage"] = [] memUsed = self.getMemoryUsed() self.initialValues["MemoryUsed"] = memUsed self.parameters["MemoryUsed"] = [] result = self.profiler.vSizeUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get vSize info from profiler", result["Message"]) else: vsize = result["Value"] * 1024.0 self.initialValues["Vsize"] = vsize self.log.verbose("Vsize(kb)", "%.1f" % vsize) self.parameters["Vsize"] = [] result = self.profiler.memoryUsage(withChildren=True) if not result["OK"]: self.log.warn("Could not get rss info from profiler", result["Message"]) else: rss = result["Value"] * 1024.0 self.initialValues["RSS"] = rss self.log.verbose("RSS(kb)", "%.1f" % rss) self.parameters["RSS"] = [] # We exclude fuse so that mountpoints can be cleaned up by automount after a period unused # (specific request from CERN batch service). result = self.getDiskSpace(exclude="fuse") self.log.verbose("DiskSpace: %s" % (result)) if not result["OK"]: self.log.warn("Could not establish DiskSpace") else: self.initialValues["DiskSpace"] = result["Value"] self.parameters["DiskSpace"] = [] result = self.getNodeInformation() self.log.verbose("NodeInfo", result) if "LSB_JOBID" in os.environ: result["LocalJobID"] = os.environ["LSB_JOBID"] if "PBS_JOBID" in os.environ: result["LocalJobID"] = os.environ["PBS_JOBID"] if "QSUB_REQNAME" in os.environ: result["LocalJobID"] = os.environ["QSUB_REQNAME"] if "JOB_ID" in os.environ: result["LocalJobID"] = os.environ["JOB_ID"] self.__reportParameters(result, "NodeInformation", True) self.__reportParameters(self.initialValues, "InitialValues") return S_OK() def __timeLeft(self): """ return Normalized CPU time left in the batch system 0 if not available update self.timeLeft and self.littleTimeLeft """ # Get CPU time left in the batch system result = self.timeLeftUtil.getTimeLeft(0.0) if not result["OK"]: # Could not get CPU time left, we might need to wait for the first loop # or the Utility is not working properly for this batch system # or we are in a batch system timeLeft = 0 else: timeLeft = result["Value"] self.timeLeft = timeLeft if not self.littleTimeLeft: if timeLeft and timeLeft < self.grossTimeLeftLimit: self.log.info("TimeLeft bellow %s, now checking with higher frequency" % timeLeft) self.littleTimeLeft = True # TODO: better configurable way of doing this to be coded self.littleTimeLeftCount = 15 else: if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit: timeLeft = -1 return timeLeft ############################################################################# def __getUsageSummary(self): """Returns average load, memory etc. over execution of job thread""" summary = {} # CPUConsumed if "CPUConsumed" in self.parameters: cpuList = self.parameters["CPUConsumed"] if cpuList: hmsCPU = cpuList[-1] rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU["OK"]: summary["LastUpdateCPU(s)"] = rawCPU["Value"] else: summary["LastUpdateCPU(s)"] = "Could not be estimated" # DiskSpace if "DiskSpace" in self.parameters: space = self.parameters["DiskSpace"] if space: value = abs(float(space[-1]) - float(self.initialValues["DiskSpace"])) if value < 0: value = 0 summary["DiskSpace(MB)"] = value else: summary["DiskSpace(MB)"] = "Could not be estimated" # MemoryUsed if "MemoryUsed" in self.parameters: memory = self.parameters["MemoryUsed"] if memory: summary["MemoryUsed(kb)"] = abs(float(memory[-1]) - float(self.initialValues["MemoryUsed"])) else: summary["MemoryUsed(kb)"] = "Could not be estimated" # LoadAverage if "LoadAverage" in self.parameters: laList = self.parameters["LoadAverage"] if laList: summary["LoadAverage"] = sum(laList) / len(laList) else: summary["LoadAverage"] = "Could not be estimated" result = self.__getWallClockTime() if not result["OK"]: self.log.warn("Failed determining wall clock time", result["Message"]) summary["WallClockTime(s)"] = 0 summary["ScaledCPUTime(s)"] = 0 else: wallClock = result["Value"] summary["WallClockTime(s)"] = wallClock * self.processors summary["ScaledCPUTime(s)"] = wallClock * self.cpuPower * self.processors self.__reportParameters(summary, "UsageSummary", True) self.currentStats = summary ############################################################################# def __reportParameters(self, params, title=None, report=False): """Will report parameters for job.""" try: parameters = [] self.log.info("", "==========================================================") if title: self.log.info("Watchdog will report", title) else: self.log.info("Watchdog will report parameters") self.log.info("", "==========================================================") vals = params if "Value" in params: if vals["Value"]: vals = params["Value"] for k, v in vals.items(): if v: self.log.info(str(k) + " = " + str(v)) parameters.append([k, v]) if report: self.__setJobParamList(parameters) self.log.info("", "==========================================================") except Exception as x: self.log.warn("Problem while reporting parameters") self.log.warn(repr(x)) ############################################################################# def __getWallClockTime(self): """Establishes the Wall Clock time spent since the Watchdog initialization""" result = S_OK() if "StartTime" in self.initialValues: currentTime = time.time() wallClock = currentTime - self.initialValues["StartTime"] result["Value"] = wallClock else: self.initialValues["StartTime"] = time.time() result["Value"] = 0.0 return result ############################################################################# def __killRunningThread(self): """Will kill the running thread process and any child processes.""" self.log.info("Sending kill signal to application PID %s" % (self.spObject.getChildPID())) result = self.spObject.killChild() self.applicationKilled = True self.log.info("Subprocess.killChild() returned:%s " % (result)) return S_OK("Thread killed") ############################################################################# def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict): """Sends sign of life 'heartbeat' signal and triggers control signal interpretation. """ result = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict) if not result["OK"]: self.log.warn("Problem sending sign of life") self.log.warn(result) if result["OK"] and result["Value"]: self.__interpretControlSignal(result["Value"]) return result ############################################################################# def __setJobParamList(self, value): """Wraps around setJobParameters of state update client""" # job wrapper template sets the jobID variable if "JOBID" not in os.environ: self.log.info("Running without JOBID so parameters will not be reported") return S_OK() jobID = os.environ["JOBID"] jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value) self.log.verbose("setJobParameters(%s,%s)" % (jobID, value)) if not jobParam["OK"]: self.log.warn(jobParam["Message"]) return jobParam ############################################################################# def getNodeInformation(self): """Retrieves all static system information""" result = {} result["HostName"] = socket.gethostname() result["CPU(MHz)"] = psutil.cpu_freq()[0] result["Memory(kB)"] = int(psutil.virtual_memory()[1] / 1024) result["LocalAccount"] = getpass.getuser() if os.path.exists("/proc/cpuinfo"): with open("/proc/cpuinfo", "r") as cpuinfo: info = cpuinfo.readlines() try: result["ModelName"] = info[4].split(":")[1].replace(" ", "").replace("\n", "") except IndexError: pass try: result["CacheSize(kB)"] = [x.strip().split(":")[1] for x in info if "cache size" in x][0].strip() except IndexError: pass return result ############################################################################# def getMemoryUsed(self): """Obtains the memory used.""" mem = ( resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss ) return float(mem) ############################################################################# def getDiskSpace(self, exclude=None): """Obtains the available disk space.""" result = S_OK() diskSpace = getDiskSpace(exclude=exclude) if diskSpace == -1: result = S_ERROR("Could not obtain disk usage") self.log.warn(" Could not obtain disk usage") result["Value"] = float(-1) return result result["Value"] = float(diskSpace) return result
class Watchdog(object): ############################################################################# def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, systemFlag='linux', jobArgs={}): """ Constructor, takes system flag as argument. """ self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800)) # 30 minutes self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800)) # 30 minutes self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2)) # SIGINT self.stopSigRegex = jobArgs.get('StopSigRegex', None) self.stopSigSent = False self.log = gLogger.getSubLogger("Watchdog") self.systemFlag = systemFlag self.exeThread = exeThread self.wrapperPID = pid self.appPID = self.exeThread.getCurrentPID() self.spObject = spObject self.jobCPUTime = jobCPUTime self.memoryLimit = memoryLimit self.calibration = 0 self.initialValues = {} self.parameters = {} self.peekFailCount = 0 self.peekRetry = 5 self.profiler = Profiler(pid) self.checkError = '' self.currentStats = {} self.initialized = False self.count = 0 # defaults self.testWallClock = 1 self.testDiskSpace = 1 self.testLoadAvg = 1 self.maxWallClockTime = 3 * 24 * 60 * 60 self.testCPUConsumed = 1 self.testCPULimit = 0 self.testMemoryLimit = 0 self.testTimeLeft = 1 self.pollingTime = 10 # 10 seconds self.checkingTime = 30 * 60 # 30 minute period self.minCheckingTime = 20 * 60 # 20 mins self.wallClockCheckSeconds = 5 * 60 # 5 minutes self.maxWallClockTime = 3 * 24 * 60 * 60 # e.g. 4 days self.jobPeekFlag = 1 # on / off self.minDiskSpace = 10 # MB self.loadAvgLimit = 1000 # > 1000 and jobs killed self.sampleCPUTime = 30 * 60 # e.g. up to 20mins sample self.jobCPUMargin = 20 # %age buffer before killing job self.minCPUWallClockRatio = 5 # ratio %age self.nullCPULimit = 5 # After 5 sample times return null CPU consumption kill job self.checkCount = 0 self.wallClockCheckCount = 0 self.nullCPUCount = 0 self.grossTimeLeftLimit = 10 * self.checkingTime self.timeLeftUtil = TimeLeft() self.timeLeft = 0 self.littleTimeLeft = False self.scaleFactor = 1.0 self.processors = processors ############################################################################# def initialize(self, loops=0): """ Watchdog initialization. """ if self.initialized: self.log.info('Watchdog already initialized') return S_OK() else: self.initialized = True setup = gConfig.getValue('/DIRAC/Setup', '') if not setup: return S_ERROR('Can not get the DIRAC Setup value') wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR('Can not get the WorkloadManagement system instance') self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance self.maxcount = loops self.log.verbose('Watchdog initialization') self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag)) # Test control flags self.testWallClock = gConfig.getValue(self.section + '/CheckWallClockFlag', 1) self.testDiskSpace = gConfig.getValue(self.section + '/CheckDiskSpaceFlag', 1) self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag', 1) self.testCPUConsumed = gConfig.getValue(self.section + '/CheckCPUConsumedFlag', 1) self.testCPULimit = gConfig.getValue(self.section + '/CheckCPULimitFlag', 0) self.testMemoryLimit = gConfig.getValue(self.section + '/CheckMemoryLimitFlag', 0) self.testTimeLeft = gConfig.getValue(self.section + '/CheckTimeLeftFlag', 1) # Other parameters self.pollingTime = gConfig.getValue(self.section + '/PollingTime', 10) # 10 seconds self.checkingTime = gConfig.getValue(self.section + '/CheckingTime', 30 * 60) # 30 minute period self.minCheckingTime = gConfig.getValue(self.section + '/MinCheckingTime', 20 * 60) # 20 mins self.maxWallClockTime = gConfig.getValue(self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60) # e.g. 4 days self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag', 1) # on / off self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace', 10) # MB self.loadAvgLimit = gConfig.getValue(self.section + '/LoadAverageLimit', 1000) # > 1000 and jobs killed self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime', 30 * 60) # e.g. up to 20mins sample self.jobCPUMargin = gConfig.getValue(self.section + '/JobCPULimitMargin', 20) # %age buffer before killing job self.minCPUWallClockRatio = gConfig.getValue(self.section + '/MinCPUWallClockRatio', 5) # ratio %age # After 5 sample times return null CPU consumption kill job self.nullCPULimit = gConfig.getValue(self.section + '/NullCPUCountLimit', 5) if self.checkingTime < self.minCheckingTime: self.log.info( 'Requested CheckingTime of %s setting to %s seconds (minimum)' % (self.checkingTime, self.minCheckingTime)) self.checkingTime = self.minCheckingTime # The time left is returned in seconds @ 250 SI00 = 1 HS06, # the self.checkingTime and self.pollingTime are in seconds, # thus they need to be multiplied by a large enough factor self.fineTimeLeftLimit = gConfig.getValue(self.section + '/TimeLeftLimit', 150 * self.pollingTime) self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 1.0) return S_OK() def run(self): """ The main watchdog execution method """ result = self.initialize() if not result['OK']: self.log.always('Can not start watchdog for the following reason') self.log.always(result['Message']) return result try: while True: self.log.debug('Starting watchdog loop # %d' % self.count) start_cycle_time = time.time() result = self.execute() exec_cycle_time = time.time() - start_cycle_time if not result['OK']: self.log.error("Watchdog error during execution", result['Message']) break elif result['Value'] == "Ended": break self.count += 1 if exec_cycle_time < self.pollingTime: time.sleep(self.pollingTime - exec_cycle_time) return S_OK() except Exception: self.log.exception() return S_ERROR('Exception') ############################################################################# def execute(self): """ The main agent execution method of the Watchdog. """ if not self.exeThread.isAlive(): self.__getUsageSummary() self.log.info('Process to monitor has completed, Watchdog will exit.') return S_OK("Ended") # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL if not self.stopSigSent and self.stopSigRegex is not None and ( time.time() - self.initialValues['StartTime']) > self.wallClockCheckSeconds * self.wallClockCheckCount: self.wallClockCheckCount += 1 self._performWallClockChecks() if self.littleTimeLeft: # if we have gone over enough iterations query again if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1: self.checkError = 'Job has reached the CPU limit of the queue' self.log.error(self.checkError, self.timeLeft) self.__killRunningThread() return S_OK() else: self.littleTimeLeftCount -= 1 # Note: need to poll regularly to see if the thread is alive # but only perform checks with a certain frequency if (time.time() - self.initialValues['StartTime']) > self.checkingTime * self.checkCount: self.checkCount += 1 result = self._performChecks() if not result['OK']: self.log.warn('Problem during recent checks') self.log.warn(result['Message']) return S_OK() else: # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount)) return S_OK() ############################################################################# def _performWallClockChecks(self): """Watchdog performs the wall clock checks based on MJF. Signals are sent to processes if we need to stop, but function always returns S_OK() """ mjf = MJF.MJF() try: wallClockSecondsLeft = mjf.getWallClockSecondsLeft() except Exception as e: # Just stop if we can't get the wall clock seconds left return S_OK() jobstartSeconds = mjf.getIntJobFeature('jobstart_secs') if jobstartSeconds is None: # Just stop if we don't know when the job started return S_OK() if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and \ (wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds): # Need to send the signal! Assume it works to avoid sending the signal more than once self.log.info('Sending signal %d to JobWrapper children' % self.stopSigNumber) self.stopSigSent = True try: for childPid in getChildrenPIDs(self.wrapperPID): try: cmdline = open('/proc/%d/cmdline' % childPid, 'r').read().replace('\0', ' ').strip() except IOError: # Process gone away? Not running on Linux? Skip anyway continue if re.search(self.stopSigRegex, cmdline) is not None: self.log.info( 'Sending signal %d to process ID %d, cmdline = "%s"' % (self.stopSigNumber, childPid, cmdline)) os.kill(childPid, self.stopSigNumber) except Exception as e: self.log.error('Failed to send signals to JobWrapper children! (%s)' % str(e)) return S_OK() ############################################################################# def _performChecks(self): """The Watchdog checks are performed at a different period to the checking of the application thread and correspond to the checkingTime. """ self.log.verbose('------------------------------------') self.log.verbose('Checking loop starts for Watchdog') heartBeatDict = {} msg = '' loadAvg = float(os.getloadavg()[0]) msg += 'LoadAvg: %d ' % loadAvg heartBeatDict['LoadAverage'] = loadAvg if 'LoadAverage' not in self.parameters: self.parameters['LoadAverage'] = [] self.parameters['LoadAverage'].append(loadAvg) memoryUsed = self.getMemoryUsed() msg += 'MemUsed: %.1f kb ' % (memoryUsed) heartBeatDict['MemoryUsed'] = memoryUsed if 'MemoryUsed' not in self.parameters: self.parameters['MemoryUsed'] = [] self.parameters['MemoryUsed'].append(memoryUsed) result = self.profiler.getAllProcessData(withChildren=True) if result['OK']: vsize = result['Value']['stats']['vSizeUsage'] * 1024. rss = result['Value']['stats']['memoryUsage'] * 1024. heartBeatDict['Vsize'] = vsize heartBeatDict['RSS'] = rss self.parameters.setdefault('Vsize', []) self.parameters['Vsize'].append(vsize) self.parameters.setdefault('RSS', []) self.parameters['RSS'].append(rss) msg += "Job Vsize: %.1f kb " % vsize msg += "Job RSS: %.1f kb " % rss result = self.getDiskSpace() if not result['OK']: self.log.warn("Could not establish DiskSpace", result['Message']) else: msg += 'DiskSpace: %.1f MB ' % (result['Value']) if 'DiskSpace' not in self.parameters: self.parameters['DiskSpace'] = [] if result['OK']: self.parameters['DiskSpace'].append(result['Value']) heartBeatDict['AvailableDiskSpace'] = result['Value'] cpu = self.__getCPU() if not cpu['OK']: msg += 'CPU: ERROR ' hmsCPU = 0 else: cpu = cpu['Value'] msg += 'CPU: %s (h:m:s) ' % (cpu) if 'CPUConsumed' not in self.parameters: self.parameters['CPUConsumed'] = [] self.parameters['CPUConsumed'].append(cpu) hmsCPU = cpu rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: heartBeatDict['CPUConsumed'] = rawCPU['Value'] result = self.__getWallClockTime() if not result['OK']: self.log.warn("Failed determining wall clock time", result['Message']) else: msg += 'WallClock: %.2f s ' % (result['Value']) self.parameters.setdefault('WallClockTime', list()).append(result['Value']) heartBeatDict['WallClockTime'] = result['Value'] self.log.info(msg) result = self._checkProgress() if not result['OK']: self.checkError = result['Message'] self.log.warn(self.checkError) if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) self.log.info('Last %s lines of available application output:' % (size)) self.log.info('================START================') for line in outputList: self.log.info(line) self.log.info('=================END=================') self.__killRunningThread() return S_OK() recentStdOut = 'None' if self.jobPeekFlag: result = self.__peek() if result['OK']: outputList = result['Value'] size = len(outputList) recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % (size, Time.dateTime()) border = '=' * len(recentStdOut) cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % (hmsCPU) if self.timeLeft: cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft recentStdOut = '\n%s\n%s\n%s\n%s\n' % (border, recentStdOut, cpuTotal, border) self.log.info(recentStdOut) for line in outputList: self.log.info(line) recentStdOut += line + '\n' else: recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread' self.log.info(recentStdOut) self.peekFailCount += 1 if self.peekFailCount > self.peekRetry: self.jobPeekFlag = 0 self.log.warn('Turning off job peeking for remainder of execution') if 'JOBID' not in os.environ: self.log.info('Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] staticParamDict = {'StandardOutput': recentStdOut} self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict) return S_OK('Watchdog checking cycle complete') ############################################################################# def __getCPU(self): """Uses os.times() to get CPU time and returns HH:MM:SS after conversion. """ try: result = self.profiler.getAllProcessData(withChildren=True) if not result['OK']: self.log.warn('Problem while checking consumed CPU') return result cpuTime = result['Value'] if cpuTime: cpuTimeTotal = cpuTime['stats']['cpuUsageSystem'] + cpuTime['stats']['cpuUsageUser'] self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTimeTotal)) return self.__getCPUHMS(cpuTimeTotal) else: self.log.error("CPU time consumed found to be 0") return S_ERROR() except Exception as e: self.log.warn('Could not determine CPU time consumed with exception') self.log.exception(e) return S_ERROR("Could not determine CPU time consumed with exception") ############################################################################# def __getCPUHMS(self, cpuTime): mins, secs = divmod(cpuTime, 60) hours, mins = divmod(mins, 60) humanTime = '%02d:%02d:%02d' % (hours, mins, secs) self.log.verbose('Human readable CPU time is: %s' % humanTime) return S_OK(humanTime) ############################################################################# def __interpretControlSignal(self, signalDict): """This method is called whenever a signal is sent via the result of sending a sign of life. """ self.log.info('Received control signal') if isinstance(signalDict, dict): if 'Kill' in signalDict: self.log.info('Received Kill signal, stopping job via control signal') self.checkError = 'Received Kill signal' self.__killRunningThread() else: self.log.info('The following control signal was sent but not understood by the watchdog:') self.log.info(signalDict) else: self.log.info('Expected dictionary for control signal, received:\n%s' % (signalDict)) return S_OK() ############################################################################# def _checkProgress(self): """This method calls specific tests to determine whether the job execution is proceeding normally. CS flags can easily be added to add or remove tests via central configuration. """ report = '' if self.testWallClock: result = self.__checkWallClockTime() if not result['OK']: self.log.warn(result['Message']) return result report += 'WallClock: OK, ' else: report += 'WallClock: NA,' if self.testDiskSpace: result = self.__checkDiskSpace() if not result['OK']: self.log.warn(result['Message']) return result report += 'DiskSpace: OK, ' else: report += 'DiskSpace: NA,' if self.testLoadAvg: result = self.__checkLoadAverage() if not result['OK']: self.log.warn("Check of load average failed, but won't fail because of that: %s" % result['Message']) report += 'LoadAverage: ERROR, ' return S_OK() report += 'LoadAverage: OK, ' else: report += 'LoadAverage: NA,' if self.testCPUConsumed: result = self.__checkCPUConsumed() if not result['OK']: return result report += 'CPUConsumed: OK, ' else: report += 'CPUConsumed: NA, ' if self.testCPULimit: result = self.__checkCPULimit() if not result['OK']: self.log.warn(result['Message']) return result report += 'CPULimit OK, ' else: report += 'CPULimit: NA, ' if self.testTimeLeft: self.__timeLeft() if self.timeLeft: report += 'TimeLeft: OK' else: report += 'TimeLeft: NA' if self.testMemoryLimit: result = self.__checkMemoryLimit() if not result['OK']: self.log.warn(result['Message']) return result report += 'MemoryLimit OK, ' else: report += 'MemoryLimit: NA, ' self.log.info(report) return S_OK('All enabled checks passed') ############################################################################# def __checkCPUConsumed(self): """ Checks whether the CPU consumed by application process is reasonable. This method will report stalled jobs to be killed. """ self.log.info("Checking CPU Consumed") if 'WallClockTime' not in self.parameters: return S_ERROR('Missing WallClockTime info') if 'CPUConsumed' not in self.parameters: return S_ERROR('Missing CPUConsumed info') wallClockTime = self.parameters['WallClockTime'][-1] if wallClockTime < self.sampleCPUTime: self.log.info("Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % ( wallClockTime, self.sampleCPUTime)) return S_OK() intervals = max(1, int(self.sampleCPUTime / self.checkingTime)) if len(self.parameters['CPUConsumed']) < intervals + 1: self.log.info("Not enough snapshots to calculate, there are %s and we need %s" % (len(self.parameters['CPUConsumed']), intervals + 1)) return S_OK() wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters['WallClockTime'][-1 - intervals] try: cpuTime = self.__convertCPUTime(self.parameters['CPUConsumed'][-1])['Value'] # For some reason, some times the CPU consumed estimation returns 0 # if cpuTime == 0: # return S_OK() cpuTime -= self.__convertCPUTime(self.parameters['CPUConsumed'][-1 - intervals])['Value'] if cpuTime < 0: self.log.warn('Consumed CPU time negative, something wrong may have happened, ignore') return S_OK() if wallClockTime <= 0: self.log.warn('Wallclock time should not be negative or zero, Ignore') return S_OK() ratio = (cpuTime / wallClockTime) * 100. self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio) # in case of error cpuTime might be 0, exclude this if ratio < self.minCPUWallClockRatio: if os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'): self.log.info('N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload') return S_OK() self.log.info("Job is stalled!") return S_ERROR('Watchdog identified this job as stalled') except Exception as e: self.log.error("Cannot convert CPU consumed from string to int", str(e)) return S_OK() ############################################################################# def __convertCPUTime(self, cputime): """ Method to convert the CPU time as returned from the Watchdog instances to the equivalent DIRAC normalized CPU time to be compared to the Job CPU requirement. """ cpuValue = 0 cpuHMS = cputime.split(':') # for i in xrange( len( cpuHMS ) ): # cpuHMS[i] = cpuHMS[i].replace( '00', '0' ) try: hours = float(cpuHMS[0]) * 60 * 60 mins = float(cpuHMS[1]) * 60 secs = float(cpuHMS[2]) cpuValue = float(hours + mins + secs) except Exception as x: self.log.warn(str(x)) return S_ERROR('Could not calculate CPU time') # Normalization to be implemented normalizedCPUValue = cpuValue result = S_OK() result['Value'] = normalizedCPUValue self.log.debug('CPU value %s converted to %s' % (cputime, normalizedCPUValue)) return result ############################################################################# def __checkCPULimit(self): """ Checks that the job has consumed more than the job CPU requirement (plus a configurable margin) and kills them as necessary. """ consumedCPU = 0 if 'CPUConsumed' in self.parameters: consumedCPU = self.parameters['CPUConsumed'][-1] consumedCPUDict = self.__convertCPUTime(consumedCPU) if consumedCPUDict['OK']: currentCPU = consumedCPUDict['Value'] else: return S_OK('Not possible to determine current CPU consumed') if consumedCPU: limit = self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100) cpuConsumed = float(currentCPU) if cpuConsumed > limit: self.log.info( 'Job has consumed more than the specified CPU limit with an additional %s%% margin' % (self.jobCPUMargin)) return S_ERROR('Job has exceeded maximum CPU time limit') else: return S_OK('Job within CPU limit') elif not currentCPU: self.log.verbose('Both initial and current CPU consumed are null') return S_OK('CPU consumed is not measurable yet') else: return S_OK('Not possible to determine CPU consumed') def __checkMemoryLimit(self): """ Checks that the job memory consumption is within a limit """ if 'Vsize' in self.parameters: vsize = self.parameters['Vsize'][-1] if vsize and self.memoryLimit: if vsize > self.memoryLimit: vsize = vsize # Just a warning for the moment self.log.warn("Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit)) return S_OK() ############################################################################# def __checkDiskSpace(self): """Checks whether the CS defined minimum disk space is available. """ if 'DiskSpace' in self.parameters: availSpace = self.parameters['DiskSpace'][-1] if availSpace >= 0 and availSpace < self.minDiskSpace: self.log.info('Not enough local disk space for job to continue, defined in CS as %s MB' % (self.minDiskSpace)) return S_ERROR('Job has insufficient disk space to continue') else: return S_OK('Job has enough disk space available') else: return S_ERROR('Available disk space could not be established') ############################################################################# def __checkWallClockTime(self): """Checks whether the job has been running for the CS defined maximum wall clock time. """ if 'StartTime' in self.initialValues: startTime = self.initialValues['StartTime'] if time.time() - startTime > self.maxWallClockTime: self.log.info('Job has exceeded maximum wall clock time of %s seconds' % (self.maxWallClockTime)) return S_ERROR('Job has exceeded maximum wall clock time') else: return S_OK('Job within maximum wall clock time') else: return S_ERROR('Job start time could not be established') ############################################################################# def __checkLoadAverage(self): """Checks whether the CS defined maximum load average is exceeded. """ if 'LoadAverage' in self.parameters: loadAvg = self.parameters['LoadAverage'][-1] if loadAvg > float(self.loadAvgLimit): self.log.info('Maximum load average exceeded, defined in CS as %s ' % (self.loadAvgLimit)) return S_ERROR('Job exceeded maximum load average') else: return S_OK('Job running with normal load average') else: return S_ERROR('Job load average not established') ############################################################################# def __peek(self): """ Uses ExecutionThread.getOutput() method to obtain standard output from running thread via subprocess callback function. """ result = self.exeThread.getOutput() if not result['OK']: self.log.warn('Could not obtain output from running application thread') self.log.warn(result['Message']) return result ############################################################################# def calibrate(self): """ The calibrate method obtains the initial values for system memory and load and calculates the margin for error for the rest of the Watchdog cycle. """ self.__getWallClockTime() self.parameters['WallClockTime'] = [] cpuConsumed = self.__getCPU() if not cpuConsumed['OK']: self.log.warn("Could not establish CPU consumed, setting to 0.0") cpuConsumed = 0.0 else: cpuConsumed = cpuConsumed['Value'] self.initialValues['CPUConsumed'] = cpuConsumed self.parameters['CPUConsumed'] = [] self.initialValues['LoadAverage'] = float(os.getloadavg()[0]) self.parameters['LoadAverage'] = [] memUsed = self.getMemoryUsed() self.initialValues['MemoryUsed'] = memUsed self.parameters['MemoryUsed'] = [] result = self.profiler.getAllProcessData(withChildren=True) self.log.verbose('Job Memory: %s' % (result['Value'])) if not result['OK']: self.log.warn('Could not get job memory usage') self.initialValues['Vsize'] = result['Value']['stats']['vSizeUsage'] * 1024. self.initialValues['RSS'] = result['Value']['stats']['memoryUsage'] * 1024. self.parameters['Vsize'] = [] self.parameters['RSS'] = [] result = self.getDiskSpace() self.log.verbose('DiskSpace: %s' % (result)) if not result['OK']: self.log.warn("Could not establish DiskSpace") self.initialValues['DiskSpace'] = result['Value'] self.parameters['DiskSpace'] = [] result = self.getNodeInformation() self.log.verbose('NodeInfo: %s' % (result)) if not result['OK']: self.log.warn("Could not establish static system information") if 'LSB_JOBID' in os.environ: result['LocalJobID'] = os.environ['LSB_JOBID'] if 'PBS_JOBID' in os.environ: result['LocalJobID'] = os.environ['PBS_JOBID'] if 'QSUB_REQNAME' in os.environ: result['LocalJobID'] = os.environ['QSUB_REQNAME'] if 'JOB_ID' in os.environ: result['LocalJobID'] = os.environ['JOB_ID'] self.__reportParameters(result, 'NodeInformation', True) self.__reportParameters(self.initialValues, 'InitialValues') return S_OK() def __timeLeft(self): """ return Normalized CPU time left in the batch system 0 if not available update self.timeLeft and self.littleTimeLeft """ # Get CPU time left in the batch system result = self.timeLeftUtil.getTimeLeft(0.0) if not result['OK']: # Could not get CPU time left, we might need to wait for the first loop # or the Utility is not working properly for this batch system # or we are in a batch system timeLeft = 0 else: timeLeft = result['Value'] self.timeLeft = timeLeft if not self.littleTimeLeft: if timeLeft and timeLeft < self.grossTimeLeftLimit: self.log.info('TimeLeft bellow %s, now checking with higher frequency' % timeLeft) self.littleTimeLeft = True # TODO: better configurable way of doing this to be coded self.littleTimeLeftCount = 15 else: if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit: timeLeft = -1 return timeLeft ############################################################################# def __getUsageSummary(self): """ Returns average load, memory etc. over execution of job thread """ summary = {} # CPUConsumed if 'CPUConsumed' in self.parameters: cpuList = self.parameters['CPUConsumed'] if cpuList: hmsCPU = cpuList[-1] rawCPU = self.__convertCPUTime(hmsCPU) if rawCPU['OK']: summary['LastUpdateCPU(s)'] = rawCPU['Value'] else: summary['LastUpdateCPU(s)'] = 'Could not be estimated' # DiskSpace if 'DiskSpace' in self.parameters: space = self.parameters['DiskSpace'] if space: value = abs(float(space[-1]) - float(self.initialValues['DiskSpace'])) if value < 0: value = 0 summary['DiskSpace(MB)'] = value else: summary['DiskSpace(MB)'] = 'Could not be estimated' # MemoryUsed if 'MemoryUsed' in self.parameters: memory = self.parameters['MemoryUsed'] if memory: summary['MemoryUsed(kb)'] = abs(float(memory[-1]) - float(self.initialValues['MemoryUsed'])) else: summary['MemoryUsed(kb)'] = 'Could not be estimated' # LoadAverage if 'LoadAverage' in self.parameters: laList = self.parameters['LoadAverage'] if laList: summary['LoadAverage'] = float(sum(laList)) / float(len(laList)) else: summary['LoadAverage'] = 'Could not be estimated' result = self.__getWallClockTime() if not result['OK']: self.log.warn("Failed determining wall clock time", result['Message']) summary['WallClockTime(s)'] = 0 summary['ScaledCPUTime(s)'] = 0 else: wallClock = result['Value'] summary['WallClockTime(s)'] = wallClock summary['ScaledCPUTime(s)'] = wallClock * self.scaleFactor * self.processors self.__reportParameters(summary, 'UsageSummary', True) self.currentStats = summary ############################################################################# def __reportParameters(self, params, title=None, report=False): """Will report parameters for job. """ try: parameters = [] self.log.info('==========================================================') if title: self.log.info('Watchdog will report %s' % (title)) else: self.log.info('Watchdog will report parameters') self.log.info('==========================================================') vals = params if 'Value' in params: if vals['Value']: vals = params['Value'] for k, v in vals.items(): if v: self.log.info(str(k) + ' = ' + str(v)) parameters.append((k, v)) if report: self.__setJobParamList(parameters) self.log.info('==========================================================') except Exception as x: self.log.warn('Problem while reporting parameters') self.log.warn(str(x)) ############################################################################# def __getWallClockTime(self): """ Establishes the Wall Clock time spent since the Watchdog initialization""" result = S_OK() if 'StartTime' in self.initialValues: currentTime = time.time() wallClock = currentTime - self.initialValues['StartTime'] result['Value'] = wallClock else: self.initialValues['StartTime'] = time.time() result['Value'] = 0.0 return result ############################################################################# def __killRunningThread(self): """ Will kill the running thread process and any child processes.""" self.log.info('Sending kill signal to application PID %s' % (self.spObject.getChildPID())) result = self.spObject.killChild() self.applicationKilled = True self.log.info('Subprocess.killChild() returned:%s ' % (result)) return S_OK('Thread killed') ############################################################################# def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict): """ Sends sign of life 'heartbeat' signal and triggers control signal interpretation. """ result = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict) if not result['OK']: self.log.warn('Problem sending sign of life') self.log.warn(result) if result['OK'] and result['Value']: self.__interpretControlSignal(result['Value']) return result ############################################################################# def __setJobParamList(self, value): """Wraps around setJobParameters of state update client """ # job wrapper template sets the jobID variable if 'JOBID' not in os.environ: self.log.info('Running without JOBID so parameters will not be reported') return S_OK() jobID = os.environ['JOBID'] jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value) self.log.verbose('setJobParameters(%s,%s)' % (jobID, value)) if not jobParam['OK']: self.log.warn(jobParam['Message']) return jobParam ############################################################################# def getNodeInformation(self): """ Attempts to retrieve all static system information, should be overridden in a subclass""" methodName = 'getNodeInformation' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass') ############################################################################# def getMemoryUsed(self): """Obtains the memory used. """ mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \ resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss return float(mem) ############################################################################# def getDiskSpace(self): """ Attempts to get the available disk space, should be overridden in a subclass""" methodName = 'getDiskSpace' self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass') return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass')
def test_cpuUsage(): mainProcess = Popen( ['python', 'tests/Utilities/ProcessesCreator_withChildren.py']) time.sleep(2) p = Profiler(mainProcess.pid) res = p.pid() assert res['OK'] is True res = p.status() assert res['OK'] is True # user res = p.cpuUsageUser() assert res['OK'] is True assert res['Value'] > 0 resC = p.cpuUsageUser(withChildren=True) assert resC['OK'] is True assert resC['Value'] > 0 assert resC['Value'] >= res['Value'] res = p.cpuUsageUser() assert res['OK'] is True assert res['Value'] > 0 resC = p.cpuUsageUser(withChildren=True) assert resC['OK'] is True assert resC['Value'] > 0 assert resC['Value'] >= res['Value'] resT = p.cpuUsageUser(withTerminatedChildren=True) assert resT['OK'] is True assert resT['Value'] > 0 assert resT['Value'] >= res['Value'] resTC = p.cpuUsageUser(withChildren=True, withTerminatedChildren=True) assert resTC['OK'] is True assert resTC['Value'] > 0 assert resTC['Value'] >= res['Value'] # system res = p.cpuUsageSystem() assert res['OK'] is True assert res['Value'] >= 0 resWC = p.cpuUsageSystem(withChildren=True) assert resWC['OK'] is True assert resWC['Value'] >= 0 assert resWC['Value'] >= res['Value'] res = p.cpuUsageSystem() assert res['OK'] is True assert res['Value'] > 0 resC = p.cpuUsageSystem(withChildren=True) assert resC['OK'] is True assert resC['Value'] > 0 assert resC['Value'] >= res['Value'] resT = p.cpuUsageSystem(withTerminatedChildren=True) assert resT['OK'] is True assert resT['Value'] > 0 assert resT['Value'] >= res['Value'] resTC = p.cpuUsageSystem(withChildren=True, withTerminatedChildren=True) assert resTC['OK'] is True assert resTC['Value'] > 0 assert resTC['Value'] >= res['Value'] # After this the main process will no-longer exist mainProcess.wait() res = p.cpuUsageUser() assert res['OK'] is False assert res['Errno'] == 3
def test_base(): p = Profiler() res = p.pid() assert res['OK'] is False res = p.status() assert res['OK'] is False mainProcess = Popen( ['python', 'tests/Utilities/ProcessesCreator_withChildren.py']) time.sleep(1) p = Profiler(mainProcess.pid) res = p.pid() assert res['OK'] is True res = p.status() assert res['OK'] is True res = p.runningTime() assert res['OK'] is True assert res['Value'] > 0 res = p.memoryUsage() assert res['OK'] is True assert res['Value'] > 0 resWC = p.memoryUsage(withChildren=True) assert resWC['OK'] is True assert resWC['Value'] > 0 assert resWC['Value'] >= res['Value'] res = p.vSizeUsage() assert res['OK'] is True assert res['Value'] > 0 resWC = p.vSizeUsage(withChildren=True) assert resWC['OK'] is True assert resWC['Value'] > 0 assert resWC['Value'] >= res['Value'] res = p.vSizeUsage() assert res['OK'] is True assert res['Value'] > 0 resWC = p.vSizeUsage(withChildren=True) assert resWC['OK'] is True assert resWC['Value'] > 0 assert resWC['Value'] >= res['Value'] res = p.numThreads() assert res['OK'] is True assert res['Value'] > 0 resWC = p.numThreads(withChildren=True) assert resWC['OK'] is True assert resWC['Value'] > 0 assert resWC['Value'] >= res['Value'] res = p.cpuPercentage() assert res['OK'] is True assert res['Value'] >= 0 resWC = p.cpuPercentage(withChildren=True) assert resWC['OK'] is True assert resWC['Value'] >= 0 assert resWC['Value'] >= res['Value']
def test_cpuUsage(): mainProcess = Popen([ "python", join(dirname(DIRAC.__file__), "tests/Utilities/ProcessesCreator_withChildren.py"), ]) time.sleep(2) p = Profiler(mainProcess.pid) res = p.pid() assert res["OK"] is True res = p.status() assert res["OK"] is True # user res = p.cpuUsageUser() assert res["OK"] is True assert res["Value"] > 0 resC = p.cpuUsageUser(withChildren=True) assert resC["OK"] is True assert resC["Value"] > 0 assert resC["Value"] >= res["Value"] res = p.cpuUsageUser() assert res["OK"] is True assert res["Value"] > 0 resC = p.cpuUsageUser(withChildren=True) assert resC["OK"] is True assert resC["Value"] > 0 assert resC["Value"] >= res["Value"] resT = p.cpuUsageUser(withTerminatedChildren=True) assert resT["OK"] is True assert resT["Value"] > 0 assert resT["Value"] >= res["Value"] resTC = p.cpuUsageUser(withChildren=True, withTerminatedChildren=True) assert resTC["OK"] is True assert resTC["Value"] > 0 assert resTC["Value"] >= res["Value"] # system res = p.cpuUsageSystem() assert res["OK"] is True assert res["Value"] >= 0 resWC = p.cpuUsageSystem(withChildren=True) assert resWC["OK"] is True assert resWC["Value"] >= 0 assert resWC["Value"] >= res["Value"] res = p.cpuUsageSystem() assert res["OK"] is True assert res["Value"] > 0 resC = p.cpuUsageSystem(withChildren=True) assert resC["OK"] is True assert resC["Value"] > 0 assert resC["Value"] >= res["Value"] resT = p.cpuUsageSystem(withTerminatedChildren=True) assert resT["OK"] is True assert resT["Value"] > 0 assert resT["Value"] >= res["Value"] resTC = p.cpuUsageSystem(withChildren=True, withTerminatedChildren=True) assert resTC["OK"] is True assert resTC["Value"] > 0 assert resTC["Value"] >= res["Value"] # After this the main process will no-longer exist mainProcess.wait() res = p.cpuUsageUser() assert res["OK"] is False assert res["Errno"] == 3
def test_base(): p = Profiler() res = p.pid() assert res["OK"] is False res = p.status() assert res["OK"] is False mainProcess = Popen([ "python", join(dirname(DIRAC.__file__), "tests/Utilities/ProcessesCreator_withChildren.py"), ]) time.sleep(1) p = Profiler(mainProcess.pid) res = p.pid() assert res["OK"] is True res = p.status() assert res["OK"] is True res = p.runningTime() assert res["OK"] is True assert res["Value"] > 0 res = p.memoryUsage() assert res["OK"] is True assert res["Value"] > 0 resWC = p.memoryUsage(withChildren=True) assert resWC["OK"] is True assert resWC["Value"] > 0 assert resWC["Value"] >= res["Value"] res = p.vSizeUsage() assert res["OK"] is True assert res["Value"] > 0 resWC = p.vSizeUsage(withChildren=True) assert resWC["OK"] is True assert resWC["Value"] > 0 assert resWC["Value"] >= res["Value"] res = p.vSizeUsage() assert res["OK"] is True assert res["Value"] > 0 resWC = p.vSizeUsage(withChildren=True) assert resWC["OK"] is True assert resWC["Value"] > 0 assert resWC["Value"] >= res["Value"] res = p.numThreads() assert res["OK"] is True assert res["Value"] > 0 resWC = p.numThreads(withChildren=True) assert resWC["OK"] is True assert resWC["Value"] > 0 assert resWC["Value"] >= res["Value"] res = p.cpuPercentage() assert res["OK"] is True assert res["Value"] >= 0 resWC = p.cpuPercentage(withChildren=True) assert resWC["OK"] is True assert resWC["Value"] >= 0 assert resWC["Value"] >= res["Value"]