Esempio n. 1
0
  def __init__(self, pid, exeThread, spObject, jobCPUTime,
               memoryLimit=0, processors=1, systemFlag='linux', jobArgs={}):
    """ Constructor, takes system flag as argument.
    """
    self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800))  # 30 minutes
    self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800))  # 30 minutes
    self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2))  # SIGINT
    self.stopSigRegex = jobArgs.get('StopSigRegex', None)
    self.stopSigSent = False

    self.log = gLogger.getSubLogger("Watchdog")
    self.systemFlag = systemFlag
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUTime = jobCPUTime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}
    self.parameters = {}
    self.peekFailCount = 0
    self.peekRetry = 5
    self.profiler = Profiler(pid)
    self.checkError = ''
    self.currentStats = {}
    self.initialized = False
    self.count = 0

    # defaults
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.maxWallClockTime = 3 * 24 * 60 * 60
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.wallClockCheckSeconds = 5 * 60  # 5 minutes
    self.maxWallClockTime = 3 * 24 * 60 * 60  # e.g. 4 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # e.g. up to 20mins sample
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.wallClockCheckCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    self.scaleFactor = 1.0
    self.processors = processors
Esempio n. 2
0
class Watchdog(object):

    #############################################################################
    def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, jobArgs={}):
        """Constructor, takes system flag as argument."""
        self.stopSigStartSeconds = int(jobArgs.get("StopSigStartSeconds", 1800))  # 30 minutes
        self.stopSigFinishSeconds = int(jobArgs.get("StopSigFinishSeconds", 1800))  # 30 minutes
        self.stopSigNumber = int(jobArgs.get("StopSigNumber", 2))  # SIGINT
        self.stopSigRegex = jobArgs.get("StopSigRegex", None)
        self.stopSigSent = False

        self.log = gLogger.getSubLogger("Watchdog")
        self.exeThread = exeThread
        self.wrapperPID = pid
        self.appPID = self.exeThread.getCurrentPID()
        self.spObject = spObject
        self.jobCPUTime = jobCPUTime
        self.memoryLimit = memoryLimit
        self.calibration = 0
        self.initialValues = {}
        self.parameters = {}
        self.peekFailCount = 0
        self.peekRetry = 5
        self.profiler = Profiler(pid)
        self.checkError = ""
        self.currentStats = {}
        self.initialized = False
        self.count = 0

        # defaults
        self.testWallClock = 1
        self.testDiskSpace = 1
        self.testLoadAvg = 1
        self.maxWallClockTime = 3 * 24 * 60 * 60
        self.testCPUConsumed = 1
        self.testCPULimit = 0
        self.testMemoryLimit = 0
        self.testTimeLeft = 1
        self.pollingTime = 10  # 10 seconds
        self.checkingTime = 30 * 60  # 30 minute period
        self.minCheckingTime = 20 * 60  # 20 mins
        self.wallClockCheckSeconds = 5 * 60  # 5 minutes
        self.maxWallClockTime = 3 * 24 * 60 * 60  # e.g. 4 days
        self.jobPeekFlag = 1  # on / off
        self.minDiskSpace = 10  # MB
        self.loadAvgLimit = 1000  # > 1000 and jobs killed
        self.sampleCPUTime = 30 * 60  # e.g. up to 20mins sample
        self.jobCPUMargin = 20  # %age buffer before killing job
        self.minCPUWallClockRatio = 5  # ratio %age
        self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
        self.checkCount = 0
        self.wallClockCheckCount = 0
        self.nullCPUCount = 0

        self.grossTimeLeftLimit = 10 * self.checkingTime
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = 0
        self.littleTimeLeft = False
        self.cpuPower = 1.0
        self.processors = processors

    #############################################################################
    def initialize(self):
        """Watchdog initialization."""
        if self.initialized:
            self.log.info("Watchdog already initialized")
            return S_OK()
        else:
            self.initialized = True

        setup = gConfig.getValue("/DIRAC/Setup", "")
        if not setup:
            return S_ERROR("Can not get the DIRAC Setup value")
        wms_instance = getSystemInstance("WorkloadManagement")
        if not wms_instance:
            return S_ERROR("Can not get the WorkloadManagement system instance")
        self.section = "/Systems/WorkloadManagement/%s/JobWrapper" % wms_instance

        self.log.verbose("Watchdog initialization")
        # Test control flags
        self.testWallClock = gConfig.getValue(self.section + "/CheckWallClockFlag", 1)
        self.testDiskSpace = gConfig.getValue(self.section + "/CheckDiskSpaceFlag", 1)
        self.testLoadAvg = gConfig.getValue(self.section + "/CheckLoadAvgFlag", 1)
        self.testCPUConsumed = gConfig.getValue(self.section + "/CheckCPUConsumedFlag", 1)
        self.testCPULimit = gConfig.getValue(self.section + "/CheckCPULimitFlag", 0)
        self.testMemoryLimit = gConfig.getValue(self.section + "/CheckMemoryLimitFlag", 0)
        self.testTimeLeft = gConfig.getValue(self.section + "/CheckTimeLeftFlag", 1)
        # Other parameters
        self.pollingTime = gConfig.getValue(self.section + "/PollingTime", 10)  # 10 seconds
        self.checkingTime = gConfig.getValue(self.section + "/CheckingTime", 30 * 60)  # 30 minute period
        self.minCheckingTime = gConfig.getValue(self.section + "/MinCheckingTime", 20 * 60)  # 20 mins
        self.maxWallClockTime = gConfig.getValue(self.section + "/MaxWallClockTime", 3 * 24 * 60 * 60)  # e.g. 4 days
        self.jobPeekFlag = gConfig.getValue(self.section + "/JobPeekFlag", 1)  # on / off
        self.minDiskSpace = gConfig.getValue(self.section + "/MinDiskSpace", 10)  # MB
        self.loadAvgLimit = gConfig.getValue(self.section + "/LoadAverageLimit", 1000)  # > 1000 and jobs killed
        self.sampleCPUTime = gConfig.getValue(self.section + "/CPUSampleTime", 30 * 60)  # e.g. up to 20mins sample
        self.jobCPUMargin = gConfig.getValue(self.section + "/JobCPULimitMargin", 20)  # %age buffer before killing job
        self.minCPUWallClockRatio = gConfig.getValue(self.section + "/MinCPUWallClockRatio", 5)  # ratio %age
        # After 5 sample times return null CPU consumption kill job
        self.nullCPULimit = gConfig.getValue(self.section + "/NullCPUCountLimit", 5)
        if self.checkingTime < self.minCheckingTime:
            self.log.info(
                "Requested CheckingTime of %s setting to %s seconds (minimum)"
                % (self.checkingTime, self.minCheckingTime)
            )
            self.checkingTime = self.minCheckingTime

        # The time left is returned in seconds @ 250 SI00 = 1 HS06,
        # the self.checkingTime and self.pollingTime are in seconds,
        # thus they need to be multiplied by a large enough factor
        self.fineTimeLeftLimit = gConfig.getValue(self.section + "/TimeLeftLimit", 150 * self.pollingTime)
        self.cpuPower = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 1.0)

        return S_OK()

    def run(self):
        """The main watchdog execution method"""

        result = self.initialize()
        if not result["OK"]:
            self.log.always("Can not start watchdog for the following reason")
            self.log.always(result["Message"])
            return result

        try:
            while True:
                self.log.debug("Starting watchdog loop # %d" % self.count)
                start_cycle_time = time.time()
                result = self.execute()
                exec_cycle_time = time.time() - start_cycle_time
                if not result["OK"]:
                    self.log.error("Watchdog error during execution", result["Message"])
                    break
                elif result["Value"] == "Ended":
                    break
                self.count += 1
                if exec_cycle_time < self.pollingTime:
                    time.sleep(self.pollingTime - exec_cycle_time)
            return S_OK()
        except Exception:
            self.log.exception()
            return S_ERROR("Exception")

    #############################################################################
    def execute(self):
        """The main agent execution method of the Watchdog."""

        if not self.exeThread.is_alive():
            self.__getUsageSummary()
            self.log.info("Process to monitor has completed, Watchdog will exit.")
            return S_OK("Ended")

        # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL
        if (
            not self.stopSigSent
            and self.stopSigRegex is not None
            and (time.time() - self.initialValues["StartTime"]) > self.wallClockCheckSeconds * self.wallClockCheckCount
        ):
            self.wallClockCheckCount += 1
            self._performWallClockChecks()

        if self.littleTimeLeft:
            # if we have gone over enough iterations query again
            if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1:
                self.checkError = JobMinorStatus.JOB_EXCEEDED_CPU
                self.log.error(self.checkError, self.timeLeft)
                self.__killRunningThread()
                return S_OK()
            else:
                self.littleTimeLeftCount -= 1

        # Note: need to poll regularly to see if the thread is alive
        #      but only perform checks with a certain frequency
        if (time.time() - self.initialValues["StartTime"]) > self.checkingTime * self.checkCount:
            self.checkCount += 1
            result = self._performChecks()
            if not result["OK"]:
                self.log.warn("Problem during recent checks")
                self.log.warn(result["Message"])
            return S_OK()
        else:
            # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount))
            return S_OK()

    #############################################################################
    def _performWallClockChecks(self):
        """Watchdog performs the wall clock checks based on MJF. Signals are sent
        to processes if we need to stop, but function always returns S_OK()
        """
        mjf = MJF.MJF()

        try:
            wallClockSecondsLeft = mjf.getWallClockSecondsLeft()
        except Exception as e:
            # Just stop if we can't get the wall clock seconds left
            return S_OK()

        jobstartSeconds = mjf.getIntJobFeature("jobstart_secs")
        if jobstartSeconds is None:
            # Just stop if we don't know when the job started
            return S_OK()

        if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and (
            wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds
        ):
            # Need to send the signal! Assume it works to avoid sending the signal more than once
            self.log.info("Sending signal to JobWrapper children", "(%s)" % self.stopSigNumber)
            self.stopSigSent = True

            try:
                for childPid in getChildrenPIDs(self.wrapperPID):
                    try:
                        cmdline = open("/proc/%d/cmdline" % childPid, "r").read().replace("\0", " ").strip()
                    except IOError:
                        # Process gone away? Not running on Linux? Skip anyway
                        continue

                    if re.search(self.stopSigRegex, cmdline) is not None:
                        self.log.info(
                            'Sending signal %d to process ID %d, cmdline = "%s"'
                            % (self.stopSigNumber, childPid, cmdline)
                        )
                        os.kill(childPid, self.stopSigNumber)

            except Exception as e:
                self.log.error("Failed to send signals to JobWrapper children!", repr(e))

        return S_OK()

    #############################################################################
    def _performChecks(self):
        """The Watchdog checks are performed at a different period to the checking of the
        application thread and correspond to the checkingTime.
        """
        self.log.verbose("------------------------------------")
        self.log.verbose("Checking loop starts for Watchdog")
        heartBeatDict = {}

        msg = ""

        loadAvg = float(os.getloadavg()[0])
        msg += "LoadAvg: %d " % loadAvg
        heartBeatDict["LoadAverage"] = loadAvg
        if "LoadAverage" not in self.parameters:
            self.parameters["LoadAverage"] = []
        self.parameters["LoadAverage"].append(loadAvg)

        memoryUsed = self.getMemoryUsed()
        msg += "MemUsed: %.1f kb " % (memoryUsed)
        heartBeatDict["MemoryUsed"] = memoryUsed
        if "MemoryUsed" not in self.parameters:
            self.parameters["MemoryUsed"] = []
        self.parameters["MemoryUsed"].append(memoryUsed)

        result = self.profiler.vSizeUsage(withChildren=True)
        if not result["OK"]:
            self.log.warn("Could not get vSize info from profiler", result["Message"])
        else:
            vsize = result["Value"] * 1024.0
            heartBeatDict["Vsize"] = vsize
            self.parameters.setdefault("Vsize", [])
            self.parameters["Vsize"].append(vsize)
            msg += "Job Vsize: %.1f kb " % vsize

        result = self.profiler.memoryUsage(withChildren=True)
        if not result["OK"]:
            self.log.warn("Could not get rss info from profiler", result["Message"])
        else:
            rss = result["Value"] * 1024.0
            heartBeatDict["RSS"] = rss
            self.parameters.setdefault("RSS", [])
            self.parameters["RSS"].append(rss)
            msg += "Job RSS: %.1f kb " % rss

        if "DiskSpace" not in self.parameters:
            self.parameters["DiskSpace"] = []

        # We exclude fuse so that mountpoints can be cleaned up by automount after a period unused
        # (specific request from CERN batch service).
        result = self.getDiskSpace(exclude="fuse")
        if not result["OK"]:
            self.log.warn("Could not establish DiskSpace", result["Message"])
        else:
            msg += "DiskSpace: %.1f MB " % (result["Value"])
            self.parameters["DiskSpace"].append(result["Value"])
            heartBeatDict["AvailableDiskSpace"] = result["Value"]

        cpu = self.__getCPU()
        if not cpu["OK"]:
            msg += "CPU: ERROR "
            hmsCPU = 0
        else:
            cpu = cpu["Value"]
            msg += "CPU: %s (h:m:s) " % (cpu)
            if "CPUConsumed" not in self.parameters:
                self.parameters["CPUConsumed"] = []
            self.parameters["CPUConsumed"].append(cpu)
            hmsCPU = cpu
            rawCPU = self.__convertCPUTime(hmsCPU)
            if rawCPU["OK"]:
                heartBeatDict["CPUConsumed"] = rawCPU["Value"]

        result = self.__getWallClockTime()
        if not result["OK"]:
            self.log.warn("Failed determining wall clock time", result["Message"])
        else:
            msg += "WallClock: %.2f s " % (result["Value"])
            self.parameters.setdefault("WallClockTime", list()).append(result["Value"])
            heartBeatDict["WallClockTime"] = result["Value"] * self.processors
        self.log.info(msg)

        result = self._checkProgress()
        if not result["OK"]:
            self.checkError = result["Message"]
            self.log.warn(self.checkError)

            if self.jobPeekFlag:
                result = self.__peek()
                if result["OK"]:
                    outputList = result["Value"]
                    self.log.info("Last lines of available application output:")
                    self.log.info("================START================")
                    for line in outputList:
                        self.log.info(line)

                    self.log.info("=================END=================")

            self.__killRunningThread()
            return S_OK()

        recentStdOut = "None"
        if self.jobPeekFlag:
            result = self.__peek()
            if result["OK"]:
                outputList = result["Value"]
                size = len(outputList)
                recentStdOut = "Last %s lines of application output from Watchdog on %s [UTC]:" % (
                    size,
                    Time.dateTime(),
                )
                border = "=" * len(recentStdOut)
                cpuTotal = "Last reported CPU consumed for job is %s (h:m:s)" % (hmsCPU)
                if self.timeLeft:
                    cpuTotal += ", Batch Queue Time Left %s (s @ HS06)" % self.timeLeft
                recentStdOut = "\n%s\n%s\n%s\n%s\n" % (border, recentStdOut, cpuTotal, border)
                self.log.info(recentStdOut)
                for line in outputList:
                    self.log.info(line)
                    recentStdOut += line + "\n"
            else:
                recentStdOut = (
                    "Watchdog is initializing and will attempt to obtain standard output from application thread"
                )
                self.log.info(recentStdOut)
                self.peekFailCount += 1
                if self.peekFailCount > self.peekRetry:
                    self.jobPeekFlag = 0
                    self.log.warn("Turning off job peeking for remainder of execution")

        if "JOBID" not in os.environ:
            self.log.info("Running without JOBID so parameters will not be reported")
            return S_OK()

        jobID = os.environ["JOBID"]
        staticParamDict = {"StandardOutput": recentStdOut}
        self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict)
        return S_OK("Watchdog checking cycle complete")

    #############################################################################
    def __getCPU(self):
        """Uses the profiler to get CPU time for current process, its child, and the terminated child,
        and returns HH:MM:SS after conversion.
        """
        result = self.profiler.cpuUsageUser(withChildren=True, withTerminatedChildren=True)
        if not result["OK"]:
            self.log.warn("Issue while checking consumed CPU for user", result["Message"])
            if result["Errno"] == errno.ESRCH:
                self.log.warn("The main process does not exist (anymore). This might be correct.")
            return result
        cpuUsageUser = result["Value"]

        result = self.profiler.cpuUsageSystem(withChildren=True, withTerminatedChildren=True)
        if not result["OK"]:
            self.log.warn("Issue while checking consumed CPU for system", result["Message"])
            if result["Errno"] == errno.ESRCH:
                self.log.warn("The main process does not exist (anymore). This might be correct.")
            return result
        cpuUsageSystem = result["Value"]

        cpuTimeTotal = cpuUsageUser + cpuUsageSystem
        if cpuTimeTotal:
            self.log.verbose("Raw CPU time consumed (s) =", cpuTimeTotal)
            return self.__getCPUHMS(cpuTimeTotal)
        self.log.error("CPU time consumed found to be 0")
        return S_ERROR()

    #############################################################################
    def __getCPUHMS(self, cpuTime):
        mins, secs = divmod(cpuTime, 60)
        hours, mins = divmod(mins, 60)
        humanTime = "%02d:%02d:%02d" % (hours, mins, secs)
        self.log.verbose("Human readable CPU time is: %s" % humanTime)
        return S_OK(humanTime)

    #############################################################################
    def __interpretControlSignal(self, signalDict):
        """This method is called whenever a signal is sent via the result of
        sending a sign of life.
        """
        self.log.info("Received control signal")
        if isinstance(signalDict, dict):
            if "Kill" in signalDict:
                self.log.info("Received Kill signal, stopping job via control signal")
                self.checkError = JobMinorStatus.RECEIVED_KILL_SIGNAL
                self.__killRunningThread()
            else:
                self.log.info("The following control signal was sent but not understood by the watchdog:")
                self.log.info(signalDict)
        else:
            self.log.info("Expected dictionary for control signal", "received:\n%s" % (signalDict))

        return S_OK()

    #############################################################################
    def _checkProgress(self):
        """This method calls specific tests to determine whether the job execution
        is proceeding normally.  CS flags can easily be added to add or remove
        tests via central configuration.
        """
        report = ""

        if self.testWallClock:
            result = self.__checkWallClockTime()
            if not result["OK"]:
                self.log.warn(result["Message"])
                return result
            report += "WallClock: OK, "
        else:
            report += "WallClock: NA,"

        if self.testDiskSpace:
            result = self.__checkDiskSpace()
            if not result["OK"]:
                self.log.warn(result["Message"])
                return result
            report += "DiskSpace: OK, "
        else:
            report += "DiskSpace: NA,"

        if self.testLoadAvg:
            result = self.__checkLoadAverage()
            if not result["OK"]:
                self.log.warn(
                    "Check of load average failed, but won't fail because of that", ": %s" % result["Message"]
                )
                report += "LoadAverage: ERROR, "
                return S_OK()
            report += "LoadAverage: OK, "
        else:
            report += "LoadAverage: NA,"

        if self.testCPUConsumed:
            result = self.__checkCPUConsumed()
            if not result["OK"]:
                return result
            report += "CPUConsumed: OK, "
        else:
            report += "CPUConsumed: NA, "

        if self.testCPULimit:
            result = self.__checkCPULimit()
            if not result["OK"]:
                self.log.warn(result["Message"])
                return result
            report += "CPULimit OK, "
        else:
            report += "CPULimit: NA, "

        if self.testTimeLeft:
            self.__timeLeft()
            if self.timeLeft:
                report += "TimeLeft: OK"
        else:
            report += "TimeLeft: NA"

        if self.testMemoryLimit:
            result = self.__checkMemoryLimit()
            if not result["OK"]:
                self.log.warn(result["Message"])
                return result
            report += "MemoryLimit OK, "
        else:
            report += "MemoryLimit: NA, "

        self.log.info(report)
        return S_OK("All enabled checks passed")

    #############################################################################
    def __checkCPUConsumed(self):
        """Checks whether the CPU consumed by application process is reasonable. This
        method will report stalled jobs to be killed.
        """
        self.log.info("Checking CPU Consumed")

        if "WallClockTime" not in self.parameters:
            return S_ERROR("Missing WallClockTime info")
        if "CPUConsumed" not in self.parameters:
            return S_ERROR("Missing CPUConsumed info")

        wallClockTime = self.parameters["WallClockTime"][-1]
        if wallClockTime < self.sampleCPUTime:
            self.log.info(
                "Stopping check, wallclock time is still smaller than sample time",
                "(%s) < (%s)" % (wallClockTime, self.sampleCPUTime),
            )
            return S_OK()

        intervals = max(1, int(self.sampleCPUTime / self.checkingTime))
        if len(self.parameters["CPUConsumed"]) < intervals + 1:
            self.log.info(
                "Not enough snapshots to calculate",
                "there are %s and we need %s" % (len(self.parameters["CPUConsumed"]), intervals + 1),
            )
            return S_OK()

        wallClockTime = self.parameters["WallClockTime"][-1] - self.parameters["WallClockTime"][-1 - intervals]
        try:
            cpuTime = self.__convertCPUTime(self.parameters["CPUConsumed"][-1])["Value"]
            # For some reason, some times the CPU consumed estimation returns 0
            # if cpuTime == 0:
            #   return S_OK()
            cpuTime -= self.__convertCPUTime(self.parameters["CPUConsumed"][-1 - intervals])["Value"]
            if cpuTime < 0:
                self.log.warn("Consumed CPU time negative, something wrong may have happened, ignore")
                return S_OK()
            if wallClockTime <= 0:
                self.log.warn("Wallclock time should not be negative or zero, Ignore")
                return S_OK()

            ratio = (cpuTime / wallClockTime) * 100

            self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio)
            # in case of error cpuTime might be 0, exclude this
            if ratio < self.minCPUWallClockRatio:
                if (
                    os.path.exists("DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK")
                    or "DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK" in os.environ
                ):
                    self.log.warn(
                        "N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload"
                    )
                    return S_OK()
                self.log.info("Job is stalled!")
                return S_ERROR(JobMinorStatus.WATCHDOG_STALLED)
        except Exception as e:
            self.log.error("Cannot convert CPU consumed from string to int", str(e))

        return S_OK()

    #############################################################################

    def __convertCPUTime(self, cputime):
        """Method to convert the CPU time as returned from the Watchdog
        instances to the equivalent DIRAC normalized CPU time to be compared
        to the Job CPU requirement.
        """
        cpuValue = 0
        cpuHMS = cputime.split(":")
        # for i in range( len( cpuHMS ) ):
        #   cpuHMS[i] = cpuHMS[i].replace( '00', '0' )

        try:
            hours = float(cpuHMS[0]) * 60 * 60
            mins = float(cpuHMS[1]) * 60
            secs = float(cpuHMS[2])
            cpuValue = float(hours + mins + secs)
        except Exception as x:
            self.log.warn(str(x))
            return S_ERROR("Could not calculate CPU time")

        # Normalization to be implemented
        normalizedCPUValue = cpuValue

        result = S_OK()
        result["Value"] = normalizedCPUValue
        self.log.debug("CPU value %s converted to %s" % (cputime, normalizedCPUValue))
        return result

    #############################################################################

    def __checkCPULimit(self):
        """Checks that the job has consumed more than the job CPU requirement
        (plus a configurable margin) and kills them as necessary.
        """
        consumedCPU = 0
        if "CPUConsumed" in self.parameters:
            consumedCPU = self.parameters["CPUConsumed"][-1]

        consumedCPUDict = self.__convertCPUTime(consumedCPU)
        if consumedCPUDict["OK"]:
            currentCPU = consumedCPUDict["Value"]
        else:
            return S_OK("Not possible to determine current CPU consumed")

        if consumedCPU:
            limit = int(self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100))
            cpuConsumed = float(currentCPU)
            if cpuConsumed > limit:
                self.log.info(
                    "Job has consumed more than the specified CPU limit",
                    "with an additional %s%% margin" % (self.jobCPUMargin),
                )
                return S_ERROR("Job has exceeded maximum CPU time limit")
            return S_OK("Job within CPU limit")

        if not currentCPU:
            self.log.verbose("Both initial and current CPU consumed are null")
            return S_OK("CPU consumed is not measurable yet")

        return S_OK("Not possible to determine CPU consumed")

    def __checkMemoryLimit(self):
        """Checks that the job memory consumption is within a limit"""
        vsize = 0
        if "Vsize" in self.parameters:
            vsize = self.parameters["Vsize"][-1]

        if vsize and self.memoryLimit:
            if vsize > self.memoryLimit:
                vsize = vsize
                # Just a warning for the moment
                self.log.warn(
                    "Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit)
                )

        return S_OK()

    #############################################################################
    def __checkDiskSpace(self):
        """Checks whether the CS defined minimum disk space is available."""
        if "DiskSpace" in self.parameters:
            availSpace = self.parameters["DiskSpace"][-1]
            if availSpace >= 0 and availSpace < self.minDiskSpace:
                self.log.info(
                    "Not enough local disk space for job to continue, defined in CS as %s MB" % (self.minDiskSpace)
                )
                return S_ERROR(JobMinorStatus.JOB_INSUFFICIENT_DISK)
            else:
                return S_OK("Job has enough disk space available")
        else:
            return S_ERROR("Available disk space could not be established")

    #############################################################################
    def __checkWallClockTime(self):
        """Checks whether the job has been running for the CS defined maximum
        wall clock time.
        """
        if "StartTime" in self.initialValues:
            startTime = self.initialValues["StartTime"]
            if time.time() - startTime > self.maxWallClockTime:
                self.log.info("Job has exceeded maximum wall clock time of %s seconds" % (self.maxWallClockTime))
                return S_ERROR(JobMinorStatus.JOB_EXCEEDED_WALL_CLOCK)
            else:
                return S_OK("Job within maximum wall clock time")
        else:
            return S_ERROR("Job start time could not be established")

    #############################################################################
    def __checkLoadAverage(self):
        """Checks whether the CS defined maximum load average is exceeded."""
        if "LoadAverage" in self.parameters:
            loadAvg = self.parameters["LoadAverage"][-1]
            if loadAvg > float(self.loadAvgLimit):
                self.log.info("Maximum load average exceeded, defined in CS as %s " % (self.loadAvgLimit))
                return S_ERROR("Job exceeded maximum load average")
            return S_OK("Job running with normal load average")
        return S_ERROR("Job load average not established")

    #############################################################################
    def __peek(self):
        """Uses ExecutionThread.getOutput() method to obtain standard output
        from running thread via subprocess callback function.
        """
        result = self.exeThread.getOutput()
        if not result["OK"]:
            self.log.warn("Could not obtain output from running application thread")
            self.log.warn(result["Message"])

        return result

    #############################################################################
    def calibrate(self):
        """The calibrate method obtains the initial values for system memory and load
        and calculates the margin for error for the rest of the Watchdog cycle.
        """
        self.__getWallClockTime()
        self.parameters["WallClockTime"] = []

        cpuConsumed = self.__getCPU()
        if not cpuConsumed["OK"]:
            self.log.warn("Could not establish CPU consumed, setting to 0.0")
            cpuConsumed = 0.0
        else:
            cpuConsumed = cpuConsumed["Value"]

        self.initialValues["CPUConsumed"] = cpuConsumed
        self.parameters["CPUConsumed"] = []

        self.initialValues["LoadAverage"] = float(os.getloadavg()[0])
        self.parameters["LoadAverage"] = []

        memUsed = self.getMemoryUsed()

        self.initialValues["MemoryUsed"] = memUsed
        self.parameters["MemoryUsed"] = []

        result = self.profiler.vSizeUsage(withChildren=True)
        if not result["OK"]:
            self.log.warn("Could not get vSize info from profiler", result["Message"])
        else:
            vsize = result["Value"] * 1024.0
            self.initialValues["Vsize"] = vsize
            self.log.verbose("Vsize(kb)", "%.1f" % vsize)
        self.parameters["Vsize"] = []

        result = self.profiler.memoryUsage(withChildren=True)
        if not result["OK"]:
            self.log.warn("Could not get rss info from profiler", result["Message"])
        else:
            rss = result["Value"] * 1024.0
            self.initialValues["RSS"] = rss
            self.log.verbose("RSS(kb)", "%.1f" % rss)
        self.parameters["RSS"] = []

        # We exclude fuse so that mountpoints can be cleaned up by automount after a period unused
        # (specific request from CERN batch service).
        result = self.getDiskSpace(exclude="fuse")
        self.log.verbose("DiskSpace: %s" % (result))
        if not result["OK"]:
            self.log.warn("Could not establish DiskSpace")
        else:
            self.initialValues["DiskSpace"] = result["Value"]
        self.parameters["DiskSpace"] = []

        result = self.getNodeInformation()
        self.log.verbose("NodeInfo", result)

        if "LSB_JOBID" in os.environ:
            result["LocalJobID"] = os.environ["LSB_JOBID"]
        if "PBS_JOBID" in os.environ:
            result["LocalJobID"] = os.environ["PBS_JOBID"]
        if "QSUB_REQNAME" in os.environ:
            result["LocalJobID"] = os.environ["QSUB_REQNAME"]
        if "JOB_ID" in os.environ:
            result["LocalJobID"] = os.environ["JOB_ID"]

        self.__reportParameters(result, "NodeInformation", True)
        self.__reportParameters(self.initialValues, "InitialValues")
        return S_OK()

    def __timeLeft(self):
        """
        return Normalized CPU time left in the batch system
        0 if not available
        update self.timeLeft and self.littleTimeLeft
        """
        # Get CPU time left in the batch system
        result = self.timeLeftUtil.getTimeLeft(0.0)
        if not result["OK"]:
            # Could not get CPU time left, we might need to wait for the first loop
            # or the Utility is not working properly for this batch system
            # or we are in a batch system
            timeLeft = 0
        else:
            timeLeft = result["Value"]

        self.timeLeft = timeLeft
        if not self.littleTimeLeft:
            if timeLeft and timeLeft < self.grossTimeLeftLimit:
                self.log.info("TimeLeft bellow %s, now checking with higher frequency" % timeLeft)
                self.littleTimeLeft = True
                # TODO: better configurable way of doing this to be coded
                self.littleTimeLeftCount = 15
        else:
            if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit:
                timeLeft = -1

        return timeLeft

    #############################################################################
    def __getUsageSummary(self):
        """Returns average load, memory etc. over execution of job thread"""
        summary = {}
        # CPUConsumed
        if "CPUConsumed" in self.parameters:
            cpuList = self.parameters["CPUConsumed"]
            if cpuList:
                hmsCPU = cpuList[-1]
                rawCPU = self.__convertCPUTime(hmsCPU)
                if rawCPU["OK"]:
                    summary["LastUpdateCPU(s)"] = rawCPU["Value"]
            else:
                summary["LastUpdateCPU(s)"] = "Could not be estimated"
        # DiskSpace
        if "DiskSpace" in self.parameters:
            space = self.parameters["DiskSpace"]
            if space:
                value = abs(float(space[-1]) - float(self.initialValues["DiskSpace"]))
                if value < 0:
                    value = 0
                summary["DiskSpace(MB)"] = value
            else:
                summary["DiskSpace(MB)"] = "Could not be estimated"
        # MemoryUsed
        if "MemoryUsed" in self.parameters:
            memory = self.parameters["MemoryUsed"]
            if memory:
                summary["MemoryUsed(kb)"] = abs(float(memory[-1]) - float(self.initialValues["MemoryUsed"]))
            else:
                summary["MemoryUsed(kb)"] = "Could not be estimated"
        # LoadAverage
        if "LoadAverage" in self.parameters:
            laList = self.parameters["LoadAverage"]
            if laList:
                summary["LoadAverage"] = sum(laList) / len(laList)
            else:
                summary["LoadAverage"] = "Could not be estimated"

        result = self.__getWallClockTime()
        if not result["OK"]:
            self.log.warn("Failed determining wall clock time", result["Message"])
            summary["WallClockTime(s)"] = 0
            summary["ScaledCPUTime(s)"] = 0
        else:
            wallClock = result["Value"]
            summary["WallClockTime(s)"] = wallClock * self.processors
            summary["ScaledCPUTime(s)"] = wallClock * self.cpuPower * self.processors

        self.__reportParameters(summary, "UsageSummary", True)
        self.currentStats = summary

    #############################################################################
    def __reportParameters(self, params, title=None, report=False):
        """Will report parameters for job."""
        try:
            parameters = []
            self.log.info("", "==========================================================")
            if title:
                self.log.info("Watchdog will report", title)
            else:
                self.log.info("Watchdog will report parameters")
            self.log.info("", "==========================================================")
            vals = params
            if "Value" in params:
                if vals["Value"]:
                    vals = params["Value"]
            for k, v in vals.items():
                if v:
                    self.log.info(str(k) + " = " + str(v))
                    parameters.append([k, v])
            if report:
                self.__setJobParamList(parameters)

            self.log.info("", "==========================================================")
        except Exception as x:
            self.log.warn("Problem while reporting parameters")
            self.log.warn(repr(x))

    #############################################################################
    def __getWallClockTime(self):
        """Establishes the Wall Clock time spent since the Watchdog initialization"""
        result = S_OK()
        if "StartTime" in self.initialValues:
            currentTime = time.time()
            wallClock = currentTime - self.initialValues["StartTime"]
            result["Value"] = wallClock
        else:
            self.initialValues["StartTime"] = time.time()
            result["Value"] = 0.0

        return result

    #############################################################################
    def __killRunningThread(self):
        """Will kill the running thread process and any child processes."""
        self.log.info("Sending kill signal to application PID %s" % (self.spObject.getChildPID()))
        result = self.spObject.killChild()
        self.applicationKilled = True
        self.log.info("Subprocess.killChild() returned:%s " % (result))
        return S_OK("Thread killed")

    #############################################################################
    def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict):
        """Sends sign of life 'heartbeat' signal and triggers control signal
        interpretation.
        """
        result = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict)
        if not result["OK"]:
            self.log.warn("Problem sending sign of life")
            self.log.warn(result)

        if result["OK"] and result["Value"]:
            self.__interpretControlSignal(result["Value"])

        return result

    #############################################################################
    def __setJobParamList(self, value):
        """Wraps around setJobParameters of state update client"""
        # job wrapper template sets the jobID variable
        if "JOBID" not in os.environ:
            self.log.info("Running without JOBID so parameters will not be reported")
            return S_OK()
        jobID = os.environ["JOBID"]
        jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value)
        self.log.verbose("setJobParameters(%s,%s)" % (jobID, value))
        if not jobParam["OK"]:
            self.log.warn(jobParam["Message"])

        return jobParam

    #############################################################################
    def getNodeInformation(self):
        """Retrieves all static system information"""
        result = {}
        result["HostName"] = socket.gethostname()
        result["CPU(MHz)"] = psutil.cpu_freq()[0]
        result["Memory(kB)"] = int(psutil.virtual_memory()[1] / 1024)
        result["LocalAccount"] = getpass.getuser()

        if os.path.exists("/proc/cpuinfo"):
            with open("/proc/cpuinfo", "r") as cpuinfo:
                info = cpuinfo.readlines()
            try:
                result["ModelName"] = info[4].split(":")[1].replace(" ", "").replace("\n", "")
            except IndexError:
                pass
            try:
                result["CacheSize(kB)"] = [x.strip().split(":")[1] for x in info if "cache size" in x][0].strip()
            except IndexError:
                pass

        return result

    #############################################################################
    def getMemoryUsed(self):
        """Obtains the memory used."""
        mem = (
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
        )
        return float(mem)

    #############################################################################
    def getDiskSpace(self, exclude=None):
        """Obtains the available disk space."""
        result = S_OK()
        diskSpace = getDiskSpace(exclude=exclude)

        if diskSpace == -1:
            result = S_ERROR("Could not obtain disk usage")
            self.log.warn(" Could not obtain disk usage")
            result["Value"] = float(-1)
            return result

        result["Value"] = float(diskSpace)
        return result
Esempio n. 3
0
class Watchdog(object):

  #############################################################################
  def __init__(self, pid, exeThread, spObject, jobCPUTime,
               memoryLimit=0, processors=1, systemFlag='linux', jobArgs={}):
    """ Constructor, takes system flag as argument.
    """
    self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800))  # 30 minutes
    self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800))  # 30 minutes
    self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2))  # SIGINT
    self.stopSigRegex = jobArgs.get('StopSigRegex', None)
    self.stopSigSent = False

    self.log = gLogger.getSubLogger("Watchdog")
    self.systemFlag = systemFlag
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUTime = jobCPUTime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}
    self.parameters = {}
    self.peekFailCount = 0
    self.peekRetry = 5
    self.profiler = Profiler(pid)
    self.checkError = ''
    self.currentStats = {}
    self.initialized = False
    self.count = 0

    # defaults
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.maxWallClockTime = 3 * 24 * 60 * 60
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.wallClockCheckSeconds = 5 * 60  # 5 minutes
    self.maxWallClockTime = 3 * 24 * 60 * 60  # e.g. 4 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # e.g. up to 20mins sample
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.wallClockCheckCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    self.scaleFactor = 1.0
    self.processors = processors

  #############################################################################
  def initialize(self, loops=0):
    """ Watchdog initialization.
    """
    if self.initialized:
      self.log.info('Watchdog already initialized')
      return S_OK()
    else:
      self.initialized = True

    setup = gConfig.getValue('/DIRAC/Setup', '')
    if not setup:
      return S_ERROR('Can not get the DIRAC Setup value')
    wms_instance = getSystemInstance("WorkloadManagement")
    if not wms_instance:
      return S_ERROR('Can not get the WorkloadManagement system instance')
    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wms_instance

    self.maxcount = loops
    self.log.verbose('Watchdog initialization')
    self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag))
    # Test control flags
    self.testWallClock = gConfig.getValue(self.section + '/CheckWallClockFlag', 1)
    self.testDiskSpace = gConfig.getValue(self.section + '/CheckDiskSpaceFlag', 1)
    self.testLoadAvg = gConfig.getValue(self.section + '/CheckLoadAvgFlag', 1)
    self.testCPUConsumed = gConfig.getValue(self.section + '/CheckCPUConsumedFlag', 1)
    self.testCPULimit = gConfig.getValue(self.section + '/CheckCPULimitFlag', 0)
    self.testMemoryLimit = gConfig.getValue(self.section + '/CheckMemoryLimitFlag', 0)
    self.testTimeLeft = gConfig.getValue(self.section + '/CheckTimeLeftFlag', 1)
    # Other parameters
    self.pollingTime = gConfig.getValue(self.section + '/PollingTime', 10)  # 10 seconds
    self.checkingTime = gConfig.getValue(self.section + '/CheckingTime', 30 * 60)  # 30 minute period
    self.minCheckingTime = gConfig.getValue(self.section + '/MinCheckingTime', 20 * 60)  # 20 mins
    self.maxWallClockTime = gConfig.getValue(self.section + '/MaxWallClockTime', 3 * 24 * 60 * 60)  # e.g. 4 days
    self.jobPeekFlag = gConfig.getValue(self.section + '/JobPeekFlag', 1)  # on / off
    self.minDiskSpace = gConfig.getValue(self.section + '/MinDiskSpace', 10)  # MB
    self.loadAvgLimit = gConfig.getValue(self.section + '/LoadAverageLimit', 1000)  # > 1000 and jobs killed
    self.sampleCPUTime = gConfig.getValue(self.section + '/CPUSampleTime', 30 * 60)  # e.g. up to 20mins sample
    self.jobCPUMargin = gConfig.getValue(self.section + '/JobCPULimitMargin', 20)  # %age buffer before killing job
    self.minCPUWallClockRatio = gConfig.getValue(self.section + '/MinCPUWallClockRatio', 5)  # ratio %age
    # After 5 sample times return null CPU consumption kill job
    self.nullCPULimit = gConfig.getValue(self.section + '/NullCPUCountLimit', 5)
    if self.checkingTime < self.minCheckingTime:
      self.log.info(
          'Requested CheckingTime of %s setting to %s seconds (minimum)' %
          (self.checkingTime, self.minCheckingTime))
      self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06,
    # the self.checkingTime and self.pollingTime are in seconds,
    # thus they need to be multiplied by a large enough factor
    self.fineTimeLeftLimit = gConfig.getValue(self.section + '/TimeLeftLimit', 150 * self.pollingTime)
    self.scaleFactor = gConfig.getValue('/LocalSite/CPUScalingFactor', 1.0)

    return S_OK()

  def run(self):
    """ The main watchdog execution method
    """

    result = self.initialize()
    if not result['OK']:
      self.log.always('Can not start watchdog for the following reason')
      self.log.always(result['Message'])
      return result

    try:
      while True:
        self.log.debug('Starting watchdog loop # %d' % self.count)
        start_cycle_time = time.time()
        result = self.execute()
        exec_cycle_time = time.time() - start_cycle_time
        if not result['OK']:
          self.log.error("Watchdog error during execution", result['Message'])
          break
        elif result['Value'] == "Ended":
          break
        self.count += 1
        if exec_cycle_time < self.pollingTime:
          time.sleep(self.pollingTime - exec_cycle_time)
      return S_OK()
    except Exception:
      self.log.exception()
      return S_ERROR('Exception')

  #############################################################################
  def execute(self):
    """ The main agent execution method of the Watchdog.
    """

    if not self.exeThread.isAlive():
      self.__getUsageSummary()
      self.log.info('Process to monitor has completed, Watchdog will exit.')
      return S_OK("Ended")

    # WallClock checks every self.wallClockCheckSeconds, but only if StopSigRegex is defined in JDL
    if not self.stopSigSent and self.stopSigRegex is not None and (
            time.time() - self.initialValues['StartTime']) > self.wallClockCheckSeconds * self.wallClockCheckCount:
      self.wallClockCheckCount += 1
      self._performWallClockChecks()

    if self.littleTimeLeft:
      # if we have gone over enough iterations query again
      if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1:
        self.checkError = 'Job has reached the CPU limit of the queue'
        self.log.error(self.checkError, self.timeLeft)
        self.__killRunningThread()
        return S_OK()
      else:
        self.littleTimeLeftCount -= 1

    # Note: need to poll regularly to see if the thread is alive
    #      but only perform checks with a certain frequency
    if (time.time() - self.initialValues['StartTime']) > self.checkingTime * self.checkCount:
      self.checkCount += 1
      result = self._performChecks()
      if not result['OK']:
        self.log.warn('Problem during recent checks')
        self.log.warn(result['Message'])
      return S_OK()
    else:
      # self.log.debug('Application thread is alive: checking count is %s' %(self.checkCount))
      return S_OK()

  #############################################################################
  def _performWallClockChecks(self):
    """Watchdog performs the wall clock checks based on MJF. Signals are sent
       to processes if we need to stop, but function always returns S_OK()
    """
    mjf = MJF.MJF()

    try:
      wallClockSecondsLeft = mjf.getWallClockSecondsLeft()
    except Exception as e:
      # Just stop if we can't get the wall clock seconds left
      return S_OK()

    jobstartSeconds = mjf.getIntJobFeature('jobstart_secs')
    if jobstartSeconds is None:
      # Just stop if we don't know when the job started
      return S_OK()

    if (int(time.time()) > jobstartSeconds + self.stopSigStartSeconds) and \
       (wallClockSecondsLeft < self.stopSigFinishSeconds + self.wallClockCheckSeconds):
      # Need to send the signal! Assume it works to avoid sending the signal more than once
      self.log.info('Sending signal %d to JobWrapper children' % self.stopSigNumber)
      self.stopSigSent = True

      try:
        for childPid in getChildrenPIDs(self.wrapperPID):
          try:
            cmdline = open('/proc/%d/cmdline' % childPid, 'r').read().replace('\0', ' ').strip()
          except IOError:
            # Process gone away? Not running on Linux? Skip anyway
            continue

          if re.search(self.stopSigRegex, cmdline) is not None:
            self.log.info(
                'Sending signal %d to process ID %d, cmdline = "%s"' %
                (self.stopSigNumber, childPid, cmdline))
            os.kill(childPid, self.stopSigNumber)

      except Exception as e:
        self.log.error('Failed to send signals to JobWrapper children! (%s)' % str(e))

    return S_OK()

  #############################################################################
  def _performChecks(self):
    """The Watchdog checks are performed at a different period to the checking of the
       application thread and correspond to the checkingTime.
    """
    self.log.verbose('------------------------------------')
    self.log.verbose('Checking loop starts for Watchdog')
    heartBeatDict = {}

    msg = ''

    loadAvg = float(os.getloadavg()[0])
    msg += 'LoadAvg: %d ' % loadAvg
    heartBeatDict['LoadAverage'] = loadAvg
    if 'LoadAverage' not in self.parameters:
      self.parameters['LoadAverage'] = []
    self.parameters['LoadAverage'].append(loadAvg)

    memoryUsed = self.getMemoryUsed()
    msg += 'MemUsed: %.1f kb ' % (memoryUsed)
    heartBeatDict['MemoryUsed'] = memoryUsed
    if 'MemoryUsed' not in self.parameters:
      self.parameters['MemoryUsed'] = []
    self.parameters['MemoryUsed'].append(memoryUsed)

    result = self.profiler.getAllProcessData(withChildren=True)
    if result['OK']:
      vsize = result['Value']['stats']['vSizeUsage'] * 1024.
      rss = result['Value']['stats']['memoryUsage'] * 1024.
      heartBeatDict['Vsize'] = vsize
      heartBeatDict['RSS'] = rss
      self.parameters.setdefault('Vsize', [])
      self.parameters['Vsize'].append(vsize)
      self.parameters.setdefault('RSS', [])
      self.parameters['RSS'].append(rss)
      msg += "Job Vsize: %.1f kb " % vsize
      msg += "Job RSS: %.1f kb " % rss
    result = self.getDiskSpace()
    if not result['OK']:
      self.log.warn("Could not establish DiskSpace", result['Message'])
    else:
      msg += 'DiskSpace: %.1f MB ' % (result['Value'])
    if 'DiskSpace' not in self.parameters:
      self.parameters['DiskSpace'] = []
    if result['OK']:
      self.parameters['DiskSpace'].append(result['Value'])
      heartBeatDict['AvailableDiskSpace'] = result['Value']

    cpu = self.__getCPU()
    if not cpu['OK']:
      msg += 'CPU: ERROR '
      hmsCPU = 0
    else:
      cpu = cpu['Value']
      msg += 'CPU: %s (h:m:s) ' % (cpu)
      if 'CPUConsumed' not in self.parameters:
        self.parameters['CPUConsumed'] = []
      self.parameters['CPUConsumed'].append(cpu)
      hmsCPU = cpu
      rawCPU = self.__convertCPUTime(hmsCPU)
      if rawCPU['OK']:
        heartBeatDict['CPUConsumed'] = rawCPU['Value']

    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn("Failed determining wall clock time", result['Message'])
    else:
      msg += 'WallClock: %.2f s ' % (result['Value'])
      self.parameters.setdefault('WallClockTime', list()).append(result['Value'])
      heartBeatDict['WallClockTime'] = result['Value']
    self.log.info(msg)

    result = self._checkProgress()
    if not result['OK']:
      self.checkError = result['Message']
      self.log.warn(self.checkError)

      if self.jobPeekFlag:
        result = self.__peek()
        if result['OK']:
          outputList = result['Value']
          size = len(outputList)
          self.log.info('Last %s lines of available application output:' % (size))
          self.log.info('================START================')
          for line in outputList:
            self.log.info(line)

          self.log.info('=================END=================')

      self.__killRunningThread()
      return S_OK()

    recentStdOut = 'None'
    if self.jobPeekFlag:
      result = self.__peek()
      if result['OK']:
        outputList = result['Value']
        size = len(outputList)
        recentStdOut = 'Last %s lines of application output from Watchdog on %s [UTC]:' % (size, Time.dateTime())
        border = '=' * len(recentStdOut)
        cpuTotal = 'Last reported CPU consumed for job is %s (h:m:s)' % (hmsCPU)
        if self.timeLeft:
          cpuTotal += ', Batch Queue Time Left %s (s @ HS06)' % self.timeLeft
        recentStdOut = '\n%s\n%s\n%s\n%s\n' % (border, recentStdOut, cpuTotal, border)
        self.log.info(recentStdOut)
        for line in outputList:
          self.log.info(line)
          recentStdOut += line + '\n'
      else:
        recentStdOut = 'Watchdog is initializing and will attempt to obtain standard output from application thread'
        self.log.info(recentStdOut)
        self.peekFailCount += 1
        if self.peekFailCount > self.peekRetry:
          self.jobPeekFlag = 0
          self.log.warn('Turning off job peeking for remainder of execution')

    if 'JOBID' not in os.environ:
      self.log.info('Running without JOBID so parameters will not be reported')
      return S_OK()

    jobID = os.environ['JOBID']
    staticParamDict = {'StandardOutput': recentStdOut}
    self.__sendSignOfLife(int(jobID), heartBeatDict, staticParamDict)
    return S_OK('Watchdog checking cycle complete')

  #############################################################################
  def __getCPU(self):
    """Uses os.times() to get CPU time and returns HH:MM:SS after conversion.
    """
    try:
      result = self.profiler.getAllProcessData(withChildren=True)
      if not result['OK']:
        self.log.warn('Problem while checking consumed CPU')
        return result
      cpuTime = result['Value']
      if cpuTime:
        cpuTimeTotal = cpuTime['stats']['cpuUsageSystem'] + cpuTime['stats']['cpuUsageUser']
        self.log.verbose("Raw CPU time consumed (s) = %s" % (cpuTimeTotal))
        return self.__getCPUHMS(cpuTimeTotal)
      else:
        self.log.error("CPU time consumed found to be 0")
        return S_ERROR()

    except Exception as e:
      self.log.warn('Could not determine CPU time consumed with exception')
      self.log.exception(e)
      return S_ERROR("Could not determine CPU time consumed with exception")

  #############################################################################
  def __getCPUHMS(self, cpuTime):
    mins, secs = divmod(cpuTime, 60)
    hours, mins = divmod(mins, 60)
    humanTime = '%02d:%02d:%02d' % (hours, mins, secs)
    self.log.verbose('Human readable CPU time is: %s' % humanTime)
    return S_OK(humanTime)

  #############################################################################
  def __interpretControlSignal(self, signalDict):
    """This method is called whenever a signal is sent via the result of
       sending a sign of life.
    """
    self.log.info('Received control signal')
    if isinstance(signalDict, dict):
      if 'Kill' in signalDict:
        self.log.info('Received Kill signal, stopping job via control signal')
        self.checkError = 'Received Kill signal'
        self.__killRunningThread()
      else:
        self.log.info('The following control signal was sent but not understood by the watchdog:')
        self.log.info(signalDict)
    else:
      self.log.info('Expected dictionary for control signal, received:\n%s' % (signalDict))

    return S_OK()

  #############################################################################
  def _checkProgress(self):
    """This method calls specific tests to determine whether the job execution
       is proceeding normally.  CS flags can easily be added to add or remove
       tests via central configuration.
    """
    report = ''

    if self.testWallClock:
      result = self.__checkWallClockTime()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'WallClock: OK, '
    else:
      report += 'WallClock: NA,'

    if self.testDiskSpace:
      result = self.__checkDiskSpace()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'DiskSpace: OK, '
    else:
      report += 'DiskSpace: NA,'

    if self.testLoadAvg:
      result = self.__checkLoadAverage()
      if not result['OK']:
        self.log.warn("Check of load average failed, but won't fail because of that: %s" % result['Message'])
        report += 'LoadAverage: ERROR, '
        return S_OK()
      report += 'LoadAverage: OK, '
    else:
      report += 'LoadAverage: NA,'

    if self.testCPUConsumed:
      result = self.__checkCPUConsumed()
      if not result['OK']:
        return result
      report += 'CPUConsumed: OK, '
    else:
      report += 'CPUConsumed: NA, '

    if self.testCPULimit:
      result = self.__checkCPULimit()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'CPULimit OK, '
    else:
      report += 'CPULimit: NA, '

    if self.testTimeLeft:
      self.__timeLeft()
      if self.timeLeft:
        report += 'TimeLeft: OK'
    else:
      report += 'TimeLeft: NA'

    if self.testMemoryLimit:
      result = self.__checkMemoryLimit()
      if not result['OK']:
        self.log.warn(result['Message'])
        return result
      report += 'MemoryLimit OK, '
    else:
      report += 'MemoryLimit: NA, '

    self.log.info(report)
    return S_OK('All enabled checks passed')

  #############################################################################
  def __checkCPUConsumed(self):
    """ Checks whether the CPU consumed by application process is reasonable. This
        method will report stalled jobs to be killed.
    """
    self.log.info("Checking CPU Consumed")

    if 'WallClockTime' not in self.parameters:
      return S_ERROR('Missing WallClockTime info')
    if 'CPUConsumed' not in self.parameters:
      return S_ERROR('Missing CPUConsumed info')

    wallClockTime = self.parameters['WallClockTime'][-1]
    if wallClockTime < self.sampleCPUTime:
      self.log.info("Stopping check, wallclock time (%s) is still smaller than sample time (%s)" % (
          wallClockTime,
          self.sampleCPUTime))
      return S_OK()

    intervals = max(1, int(self.sampleCPUTime / self.checkingTime))
    if len(self.parameters['CPUConsumed']) < intervals + 1:
      self.log.info("Not enough snapshots to calculate, there are %s and we need %s" %
                    (len(self.parameters['CPUConsumed']), intervals + 1))
      return S_OK()

    wallClockTime = self.parameters['WallClockTime'][-1] - self.parameters['WallClockTime'][-1 - intervals]
    try:
      cpuTime = self.__convertCPUTime(self.parameters['CPUConsumed'][-1])['Value']
      # For some reason, some times the CPU consumed estimation returns 0
      # if cpuTime == 0:
      #   return S_OK()
      cpuTime -= self.__convertCPUTime(self.parameters['CPUConsumed'][-1 - intervals])['Value']
      if cpuTime < 0:
        self.log.warn('Consumed CPU time negative, something wrong may have happened, ignore')
        return S_OK()
      if wallClockTime <= 0:
        self.log.warn('Wallclock time should not be negative or zero, Ignore')
        return S_OK()

      ratio = (cpuTime / wallClockTime) * 100.

      self.log.info("CPU/Wallclock ratio is %.2f%%" % ratio)
      # in case of error cpuTime might be 0, exclude this
      if ratio < self.minCPUWallClockRatio:
        if os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'):
          self.log.info('N.B. job would be declared as stalled but CPU / WallClock check is disabled by payload')
          return S_OK()
        self.log.info("Job is stalled!")
        return S_ERROR('Watchdog identified this job as stalled')
    except Exception as e:
      self.log.error("Cannot convert CPU consumed from string to int", str(e))

    return S_OK()

  #############################################################################

  def __convertCPUTime(self, cputime):
    """ Method to convert the CPU time as returned from the Watchdog
        instances to the equivalent DIRAC normalized CPU time to be compared
        to the Job CPU requirement.
    """
    cpuValue = 0
    cpuHMS = cputime.split(':')
    # for i in xrange( len( cpuHMS ) ):
    #   cpuHMS[i] = cpuHMS[i].replace( '00', '0' )

    try:
      hours = float(cpuHMS[0]) * 60 * 60
      mins = float(cpuHMS[1]) * 60
      secs = float(cpuHMS[2])
      cpuValue = float(hours + mins + secs)
    except Exception as x:
      self.log.warn(str(x))
      return S_ERROR('Could not calculate CPU time')

    # Normalization to be implemented
    normalizedCPUValue = cpuValue

    result = S_OK()
    result['Value'] = normalizedCPUValue
    self.log.debug('CPU value %s converted to %s' % (cputime, normalizedCPUValue))
    return result

  #############################################################################

  def __checkCPULimit(self):
    """ Checks that the job has consumed more than the job CPU requirement
        (plus a configurable margin) and kills them as necessary.
    """
    consumedCPU = 0
    if 'CPUConsumed' in self.parameters:
      consumedCPU = self.parameters['CPUConsumed'][-1]

    consumedCPUDict = self.__convertCPUTime(consumedCPU)
    if consumedCPUDict['OK']:
      currentCPU = consumedCPUDict['Value']
    else:
      return S_OK('Not possible to determine current CPU consumed')

    if consumedCPU:
      limit = self.jobCPUTime + self.jobCPUTime * (self.jobCPUMargin / 100)
      cpuConsumed = float(currentCPU)
      if cpuConsumed > limit:
        self.log.info(
            'Job has consumed more than the specified CPU limit with an additional %s%% margin' %
            (self.jobCPUMargin))
        return S_ERROR('Job has exceeded maximum CPU time limit')
      else:
        return S_OK('Job within CPU limit')
    elif not currentCPU:
      self.log.verbose('Both initial and current CPU consumed are null')
      return S_OK('CPU consumed is not measurable yet')
    else:
      return S_OK('Not possible to determine CPU consumed')

  def __checkMemoryLimit(self):
    """ Checks that the job memory consumption is within a limit
    """
    if 'Vsize' in self.parameters:
      vsize = self.parameters['Vsize'][-1]

    if vsize and self.memoryLimit:
      if vsize > self.memoryLimit:
        vsize = vsize
        # Just a warning for the moment
        self.log.warn("Job has consumed %f.2 KB of memory with the limit of %f.2 KB" % (vsize, self.memoryLimit))

    return S_OK()

  #############################################################################
  def __checkDiskSpace(self):
    """Checks whether the CS defined minimum disk space is available.
    """
    if 'DiskSpace' in self.parameters:
      availSpace = self.parameters['DiskSpace'][-1]
      if availSpace >= 0 and availSpace < self.minDiskSpace:
        self.log.info('Not enough local disk space for job to continue, defined in CS as %s MB' % (self.minDiskSpace))
        return S_ERROR('Job has insufficient disk space to continue')
      else:
        return S_OK('Job has enough disk space available')
    else:
      return S_ERROR('Available disk space could not be established')

  #############################################################################
  def __checkWallClockTime(self):
    """Checks whether the job has been running for the CS defined maximum
       wall clock time.
    """
    if 'StartTime' in self.initialValues:
      startTime = self.initialValues['StartTime']
      if time.time() - startTime > self.maxWallClockTime:
        self.log.info('Job has exceeded maximum wall clock time of %s seconds' % (self.maxWallClockTime))
        return S_ERROR('Job has exceeded maximum wall clock time')
      else:
        return S_OK('Job within maximum wall clock time')
    else:
      return S_ERROR('Job start time could not be established')

  #############################################################################
  def __checkLoadAverage(self):
    """Checks whether the CS defined maximum load average is exceeded.
    """
    if 'LoadAverage' in self.parameters:
      loadAvg = self.parameters['LoadAverage'][-1]
      if loadAvg > float(self.loadAvgLimit):
        self.log.info('Maximum load average exceeded, defined in CS as %s ' % (self.loadAvgLimit))
        return S_ERROR('Job exceeded maximum load average')
      else:
        return S_OK('Job running with normal load average')
    else:
      return S_ERROR('Job load average not established')

  #############################################################################
  def __peek(self):
    """ Uses ExecutionThread.getOutput() method to obtain standard output
        from running thread via subprocess callback function.
    """
    result = self.exeThread.getOutput()
    if not result['OK']:
      self.log.warn('Could not obtain output from running application thread')
      self.log.warn(result['Message'])

    return result

  #############################################################################
  def calibrate(self):
    """ The calibrate method obtains the initial values for system memory and load
        and calculates the margin for error for the rest of the Watchdog cycle.
    """
    self.__getWallClockTime()
    self.parameters['WallClockTime'] = []

    cpuConsumed = self.__getCPU()
    if not cpuConsumed['OK']:
      self.log.warn("Could not establish CPU consumed, setting to 0.0")
      cpuConsumed = 0.0
    else:
      cpuConsumed = cpuConsumed['Value']

    self.initialValues['CPUConsumed'] = cpuConsumed
    self.parameters['CPUConsumed'] = []

    self.initialValues['LoadAverage'] = float(os.getloadavg()[0])
    self.parameters['LoadAverage'] = []

    memUsed = self.getMemoryUsed()

    self.initialValues['MemoryUsed'] = memUsed
    self.parameters['MemoryUsed'] = []

    result = self.profiler.getAllProcessData(withChildren=True)
    self.log.verbose('Job Memory: %s' % (result['Value']))
    if not result['OK']:
      self.log.warn('Could not get job memory usage')

    self.initialValues['Vsize'] = result['Value']['stats']['vSizeUsage'] * 1024.
    self.initialValues['RSS'] = result['Value']['stats']['memoryUsage'] * 1024.
    self.parameters['Vsize'] = []
    self.parameters['RSS'] = []

    result = self.getDiskSpace()
    self.log.verbose('DiskSpace: %s' % (result))
    if not result['OK']:
      self.log.warn("Could not establish DiskSpace")

    self.initialValues['DiskSpace'] = result['Value']
    self.parameters['DiskSpace'] = []

    result = self.getNodeInformation()
    self.log.verbose('NodeInfo: %s' % (result))
    if not result['OK']:
      self.log.warn("Could not establish static system information")

    if 'LSB_JOBID' in os.environ:
      result['LocalJobID'] = os.environ['LSB_JOBID']
    if 'PBS_JOBID' in os.environ:
      result['LocalJobID'] = os.environ['PBS_JOBID']
    if 'QSUB_REQNAME' in os.environ:
      result['LocalJobID'] = os.environ['QSUB_REQNAME']
    if 'JOB_ID' in os.environ:
      result['LocalJobID'] = os.environ['JOB_ID']

    self.__reportParameters(result, 'NodeInformation', True)
    self.__reportParameters(self.initialValues, 'InitialValues')
    return S_OK()

  def __timeLeft(self):
    """
      return Normalized CPU time left in the batch system
      0 if not available
      update self.timeLeft and self.littleTimeLeft
    """
    # Get CPU time left in the batch system
    result = self.timeLeftUtil.getTimeLeft(0.0)
    if not result['OK']:
      # Could not get CPU time left, we might need to wait for the first loop
      # or the Utility is not working properly for this batch system
      # or we are in a batch system
      timeLeft = 0
    else:
      timeLeft = result['Value']

    self.timeLeft = timeLeft
    if not self.littleTimeLeft:
      if timeLeft and timeLeft < self.grossTimeLeftLimit:
        self.log.info('TimeLeft bellow %s, now checking with higher frequency' % timeLeft)
        self.littleTimeLeft = True
        # TODO: better configurable way of doing this to be coded
        self.littleTimeLeftCount = 15
    else:
      if self.timeLeft and self.timeLeft < self.fineTimeLeftLimit:
        timeLeft = -1

    return timeLeft

  #############################################################################
  def __getUsageSummary(self):
    """ Returns average load, memory etc. over execution of job thread
    """
    summary = {}
    # CPUConsumed
    if 'CPUConsumed' in self.parameters:
      cpuList = self.parameters['CPUConsumed']
      if cpuList:
        hmsCPU = cpuList[-1]
        rawCPU = self.__convertCPUTime(hmsCPU)
        if rawCPU['OK']:
          summary['LastUpdateCPU(s)'] = rawCPU['Value']
      else:
        summary['LastUpdateCPU(s)'] = 'Could not be estimated'
    # DiskSpace
    if 'DiskSpace' in self.parameters:
      space = self.parameters['DiskSpace']
      if space:
        value = abs(float(space[-1]) - float(self.initialValues['DiskSpace']))
        if value < 0:
          value = 0
        summary['DiskSpace(MB)'] = value
      else:
        summary['DiskSpace(MB)'] = 'Could not be estimated'
    # MemoryUsed
    if 'MemoryUsed' in self.parameters:
      memory = self.parameters['MemoryUsed']
      if memory:
        summary['MemoryUsed(kb)'] = abs(float(memory[-1]) - float(self.initialValues['MemoryUsed']))
      else:
        summary['MemoryUsed(kb)'] = 'Could not be estimated'
    # LoadAverage
    if 'LoadAverage' in self.parameters:
      laList = self.parameters['LoadAverage']
      if laList:
        summary['LoadAverage'] = float(sum(laList)) / float(len(laList))
      else:
        summary['LoadAverage'] = 'Could not be estimated'

    result = self.__getWallClockTime()
    if not result['OK']:
      self.log.warn("Failed determining wall clock time", result['Message'])
      summary['WallClockTime(s)'] = 0
      summary['ScaledCPUTime(s)'] = 0
    else:
      wallClock = result['Value']
      summary['WallClockTime(s)'] = wallClock
      summary['ScaledCPUTime(s)'] = wallClock * self.scaleFactor * self.processors

    self.__reportParameters(summary, 'UsageSummary', True)
    self.currentStats = summary

  #############################################################################
  def __reportParameters(self, params, title=None, report=False):
    """Will report parameters for job.
    """
    try:
      parameters = []
      self.log.info('==========================================================')
      if title:
        self.log.info('Watchdog will report %s' % (title))
      else:
        self.log.info('Watchdog will report parameters')
      self.log.info('==========================================================')
      vals = params
      if 'Value' in params:
        if vals['Value']:
          vals = params['Value']
      for k, v in vals.items():
        if v:
          self.log.info(str(k) + ' = ' + str(v))
          parameters.append((k, v))
      if report:
        self.__setJobParamList(parameters)

      self.log.info('==========================================================')
    except Exception as x:
      self.log.warn('Problem while reporting parameters')
      self.log.warn(str(x))

  #############################################################################
  def __getWallClockTime(self):
    """ Establishes the Wall Clock time spent since the Watchdog initialization"""
    result = S_OK()
    if 'StartTime' in self.initialValues:
      currentTime = time.time()
      wallClock = currentTime - self.initialValues['StartTime']
      result['Value'] = wallClock
    else:
      self.initialValues['StartTime'] = time.time()
      result['Value'] = 0.0

    return result

  #############################################################################
  def __killRunningThread(self):
    """ Will kill the running thread process and any child processes."""
    self.log.info('Sending kill signal to application PID %s' % (self.spObject.getChildPID()))
    result = self.spObject.killChild()
    self.applicationKilled = True
    self.log.info('Subprocess.killChild() returned:%s ' % (result))
    return S_OK('Thread killed')

  #############################################################################
  def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict):
    """ Sends sign of life 'heartbeat' signal and triggers control signal
        interpretation.
    """
    result = JobStateUpdateClient().sendHeartBeat(jobID, heartBeatDict, staticParamDict)
    if not result['OK']:
      self.log.warn('Problem sending sign of life')
      self.log.warn(result)

    if result['OK'] and result['Value']:
      self.__interpretControlSignal(result['Value'])

    return result

  #############################################################################
  def __setJobParamList(self, value):
    """Wraps around setJobParameters of state update client
    """
    # job wrapper template sets the jobID variable
    if 'JOBID' not in os.environ:
      self.log.info('Running without JOBID so parameters will not be reported')
      return S_OK()
    jobID = os.environ['JOBID']
    jobParam = JobStateUpdateClient().setJobParameters(int(jobID), value)
    self.log.verbose('setJobParameters(%s,%s)' % (jobID, value))
    if not jobParam['OK']:
      self.log.warn(jobParam['Message'])

    return jobParam

  #############################################################################
  def getNodeInformation(self):
    """ Attempts to retrieve all static system information, should be overridden in a subclass"""
    methodName = 'getNodeInformation'
    self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass')
    return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass')

  #############################################################################
  def getMemoryUsed(self):
    """Obtains the memory used.
    """
    mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + \
        resource.getrusage(resource.RUSAGE_CHILDREN).ru_maxrss
    return float(mem)

  #############################################################################
  def getDiskSpace(self):
    """ Attempts to get the available disk space, should be overridden in a subclass"""
    methodName = 'getDiskSpace'
    self.log.warn('Watchdog: ' + methodName + ' method should be implemented in a subclass')
    return S_ERROR('Watchdog: ' + methodName + ' method should be implemented in a subclass')
Esempio n. 4
0
def test_cpuUsage():
    mainProcess = Popen(
        ['python', 'tests/Utilities/ProcessesCreator_withChildren.py'])
    time.sleep(2)
    p = Profiler(mainProcess.pid)
    res = p.pid()
    assert res['OK'] is True
    res = p.status()
    assert res['OK'] is True

    # user
    res = p.cpuUsageUser()
    assert res['OK'] is True
    assert res['Value'] > 0
    resC = p.cpuUsageUser(withChildren=True)
    assert resC['OK'] is True
    assert resC['Value'] > 0
    assert resC['Value'] >= res['Value']

    res = p.cpuUsageUser()
    assert res['OK'] is True
    assert res['Value'] > 0
    resC = p.cpuUsageUser(withChildren=True)
    assert resC['OK'] is True
    assert resC['Value'] > 0
    assert resC['Value'] >= res['Value']

    resT = p.cpuUsageUser(withTerminatedChildren=True)
    assert resT['OK'] is True
    assert resT['Value'] > 0
    assert resT['Value'] >= res['Value']

    resTC = p.cpuUsageUser(withChildren=True, withTerminatedChildren=True)
    assert resTC['OK'] is True
    assert resTC['Value'] > 0
    assert resTC['Value'] >= res['Value']

    # system
    res = p.cpuUsageSystem()
    assert res['OK'] is True
    assert res['Value'] >= 0
    resWC = p.cpuUsageSystem(withChildren=True)
    assert resWC['OK'] is True
    assert resWC['Value'] >= 0
    assert resWC['Value'] >= res['Value']

    res = p.cpuUsageSystem()
    assert res['OK'] is True
    assert res['Value'] > 0
    resC = p.cpuUsageSystem(withChildren=True)
    assert resC['OK'] is True
    assert resC['Value'] > 0
    assert resC['Value'] >= res['Value']

    resT = p.cpuUsageSystem(withTerminatedChildren=True)
    assert resT['OK'] is True
    assert resT['Value'] > 0
    assert resT['Value'] >= res['Value']

    resTC = p.cpuUsageSystem(withChildren=True, withTerminatedChildren=True)
    assert resTC['OK'] is True
    assert resTC['Value'] > 0
    assert resTC['Value'] >= res['Value']

    # After this the main process will no-longer exist
    mainProcess.wait()

    res = p.cpuUsageUser()
    assert res['OK'] is False
    assert res['Errno'] == 3
Esempio n. 5
0
def test_base():
    p = Profiler()
    res = p.pid()
    assert res['OK'] is False
    res = p.status()
    assert res['OK'] is False

    mainProcess = Popen(
        ['python', 'tests/Utilities/ProcessesCreator_withChildren.py'])
    time.sleep(1)
    p = Profiler(mainProcess.pid)
    res = p.pid()
    assert res['OK'] is True
    res = p.status()
    assert res['OK'] is True
    res = p.runningTime()
    assert res['OK'] is True
    assert res['Value'] > 0

    res = p.memoryUsage()
    assert res['OK'] is True
    assert res['Value'] > 0
    resWC = p.memoryUsage(withChildren=True)
    assert resWC['OK'] is True
    assert resWC['Value'] > 0
    assert resWC['Value'] >= res['Value']

    res = p.vSizeUsage()
    assert res['OK'] is True
    assert res['Value'] > 0
    resWC = p.vSizeUsage(withChildren=True)
    assert resWC['OK'] is True
    assert resWC['Value'] > 0
    assert resWC['Value'] >= res['Value']

    res = p.vSizeUsage()
    assert res['OK'] is True
    assert res['Value'] > 0
    resWC = p.vSizeUsage(withChildren=True)
    assert resWC['OK'] is True
    assert resWC['Value'] > 0
    assert resWC['Value'] >= res['Value']

    res = p.numThreads()
    assert res['OK'] is True
    assert res['Value'] > 0
    resWC = p.numThreads(withChildren=True)
    assert resWC['OK'] is True
    assert resWC['Value'] > 0
    assert resWC['Value'] >= res['Value']

    res = p.cpuPercentage()
    assert res['OK'] is True
    assert res['Value'] >= 0
    resWC = p.cpuPercentage(withChildren=True)
    assert resWC['OK'] is True
    assert resWC['Value'] >= 0
    assert resWC['Value'] >= res['Value']
Esempio n. 6
0
def test_cpuUsage():
    mainProcess = Popen([
        "python",
        join(dirname(DIRAC.__file__),
             "tests/Utilities/ProcessesCreator_withChildren.py"),
    ])
    time.sleep(2)
    p = Profiler(mainProcess.pid)
    res = p.pid()
    assert res["OK"] is True
    res = p.status()
    assert res["OK"] is True

    # user
    res = p.cpuUsageUser()
    assert res["OK"] is True
    assert res["Value"] > 0
    resC = p.cpuUsageUser(withChildren=True)
    assert resC["OK"] is True
    assert resC["Value"] > 0
    assert resC["Value"] >= res["Value"]

    res = p.cpuUsageUser()
    assert res["OK"] is True
    assert res["Value"] > 0
    resC = p.cpuUsageUser(withChildren=True)
    assert resC["OK"] is True
    assert resC["Value"] > 0
    assert resC["Value"] >= res["Value"]

    resT = p.cpuUsageUser(withTerminatedChildren=True)
    assert resT["OK"] is True
    assert resT["Value"] > 0
    assert resT["Value"] >= res["Value"]

    resTC = p.cpuUsageUser(withChildren=True, withTerminatedChildren=True)
    assert resTC["OK"] is True
    assert resTC["Value"] > 0
    assert resTC["Value"] >= res["Value"]

    # system
    res = p.cpuUsageSystem()
    assert res["OK"] is True
    assert res["Value"] >= 0
    resWC = p.cpuUsageSystem(withChildren=True)
    assert resWC["OK"] is True
    assert resWC["Value"] >= 0
    assert resWC["Value"] >= res["Value"]

    res = p.cpuUsageSystem()
    assert res["OK"] is True
    assert res["Value"] > 0
    resC = p.cpuUsageSystem(withChildren=True)
    assert resC["OK"] is True
    assert resC["Value"] > 0
    assert resC["Value"] >= res["Value"]

    resT = p.cpuUsageSystem(withTerminatedChildren=True)
    assert resT["OK"] is True
    assert resT["Value"] > 0
    assert resT["Value"] >= res["Value"]

    resTC = p.cpuUsageSystem(withChildren=True, withTerminatedChildren=True)
    assert resTC["OK"] is True
    assert resTC["Value"] > 0
    assert resTC["Value"] >= res["Value"]

    # After this the main process will no-longer exist
    mainProcess.wait()

    res = p.cpuUsageUser()
    assert res["OK"] is False
    assert res["Errno"] == 3
Esempio n. 7
0
def test_base():
    p = Profiler()
    res = p.pid()
    assert res["OK"] is False
    res = p.status()
    assert res["OK"] is False

    mainProcess = Popen([
        "python",
        join(dirname(DIRAC.__file__),
             "tests/Utilities/ProcessesCreator_withChildren.py"),
    ])
    time.sleep(1)
    p = Profiler(mainProcess.pid)
    res = p.pid()
    assert res["OK"] is True
    res = p.status()
    assert res["OK"] is True
    res = p.runningTime()
    assert res["OK"] is True
    assert res["Value"] > 0

    res = p.memoryUsage()
    assert res["OK"] is True
    assert res["Value"] > 0
    resWC = p.memoryUsage(withChildren=True)
    assert resWC["OK"] is True
    assert resWC["Value"] > 0
    assert resWC["Value"] >= res["Value"]

    res = p.vSizeUsage()
    assert res["OK"] is True
    assert res["Value"] > 0
    resWC = p.vSizeUsage(withChildren=True)
    assert resWC["OK"] is True
    assert resWC["Value"] > 0
    assert resWC["Value"] >= res["Value"]

    res = p.vSizeUsage()
    assert res["OK"] is True
    assert res["Value"] > 0
    resWC = p.vSizeUsage(withChildren=True)
    assert resWC["OK"] is True
    assert resWC["Value"] > 0
    assert resWC["Value"] >= res["Value"]

    res = p.numThreads()
    assert res["OK"] is True
    assert res["Value"] > 0
    resWC = p.numThreads(withChildren=True)
    assert resWC["OK"] is True
    assert resWC["Value"] > 0
    assert resWC["Value"] >= res["Value"]

    res = p.cpuPercentage()
    assert res["OK"] is True
    assert res["Value"] >= 0
    resWC = p.cpuPercentage(withChildren=True)
    assert resWC["OK"] is True
    assert resWC["Value"] >= 0
    assert resWC["Value"] >= res["Value"]