예제 #1
0
def test_getScaledCPU(mocker, batch, requiredVariables, returnValue, expected):
    """Test TimeLeft.getScaledCPU().

    The test first checks that a freshly constructed ``TimeLeft`` reports a
    scaled CPU of 0, then attaches a batch-system plugin, injects the
    attributes it needs and checks the scaled CPU against ``expected``.

    :param mocker: object providing ``patch()`` (presumably the pytest-mock fixture)
    :param str batch: prefix of the ``<batch>ResourceUsage`` module/class to load
    :param dict requiredVariables: attributes merged into the plugin instance
    :param returnValue: payload wrapped in ``S_OK`` and returned by the mocked ``runCommand``
    :param expected: value ``getScaledCPU()`` must return once the plugin is set
    """
    mocker.patch(
        "DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft.runCommand",
        return_value=S_OK(returnValue))
    tl = TimeLeft()
    res = tl.getScaledCPU()
    assert res == 0

    tl.scaleFactor = 5.0
    tl.normFactor = 5.0

    batchSystemName = '%sResourceUsage' % batch
    batchSystemPath = 'DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s' % batchSystemName
    # A non-empty fromlist makes __import__ return the leaf module itself
    batchPlugin = __import__(batchSystemPath, globals(), locals(), [batchSystemName])
    # Need to be reloaded to update the mock within the module, else, it will reuse the one when loaded the first time
    reload_module(batchPlugin)

    # Look the class up by name on the (reloaded) module instead of eval()-ing
    # a source string: same result, no dynamic code execution.
    tl.batchPlugin = getattr(batchPlugin, batchSystemName)()

    # Update attributes of the batch systems to get scaled CPU
    tl.batchPlugin.__dict__.update(requiredVariables)

    res = tl.getScaledCPU()
    assert res == expected
예제 #2
0
def test_getTimeLeft(mocker, batch, requiredVariables, returnValue, expected_1,
                     expected_2):
    """Test TimeLeft.getTimeLeft().

    A batch-system plugin is loaded by name, attached to a ``TimeLeft``
    instance and populated with ``requiredVariables``; the result structure
    returned by ``getTimeLeft()`` is then checked.

    :param mocker: object providing ``patch()`` (presumably the pytest-mock fixture)
    :param str batch: prefix of the ``<batch>ResourceUsage`` module/class to load
    :param dict requiredVariables: attributes merged into the plugin instance
    :param returnValue: payload wrapped in ``S_OK`` and returned by the mocked ``runCommand``
    :param bool expected_1: expected value of ``res["OK"]``
    :param expected_2: expected ``res["Value"]``, only checked when the call succeeded
    """
    mocker.patch(
        "DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft.runCommand",
        return_value=S_OK(returnValue))
    tl = TimeLeft()

    batchSystemName = "%sResourceUsage" % batch
    batchSystemPath = "DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s" % batchSystemName
    # A non-empty fromlist makes __import__ return the leaf module itself
    batchPlugin = __import__(batchSystemPath, globals(), locals(),
                             [batchSystemName])
    # Need to be reloaded to update the mock within the module, else, it will reuse the one when loaded the first time
    reload(batchPlugin)

    # Look the class up by name on the (reloaded) module instead of eval()-ing
    # a source string: same result, no dynamic code execution.
    tl.batchPlugin = getattr(batchPlugin, batchSystemName)()
    tl.cpuPower = 10.0

    # Update attributes of the batch systems to get scaled CPU
    tl.batchPlugin.__dict__.update(requiredVariables)

    res = tl.getTimeLeft()
    assert res["OK"] is expected_1
    if res["OK"]:
        assert res["Value"] == expected_2
예제 #3
0
파일: Watchdog.py 프로젝트: DIRACGrid/DIRAC
    def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, jobArgs=None):
        """Set up the watchdog state and its default thresholds.

        :param pid: process id stored as ``wrapperPID`` and handed to ``Profiler``
        :param exeThread: object exposing ``getCurrentPID()``; its result is stored as ``appPID``
        :param spObject: stored as-is on the instance
        :param jobCPUTime: stored as-is on the instance
        :param memoryLimit: stored as-is on the instance (default 0)
        :param processors: stored as-is on the instance (default 1)
        :param dict jobArgs: optional mapping; the keys ``StopSigStartSeconds``,
            ``StopSigFinishSeconds``, ``StopSigNumber`` and ``StopSigRegex`` are read from it.
            ``None`` (the default) is treated as an empty mapping.
        """
        # Avoid a shared mutable default: build a fresh dict per call
        if jobArgs is None:
            jobArgs = {}

        self.stopSigStartSeconds = int(jobArgs.get("StopSigStartSeconds", 1800))  # 30 minutes
        self.stopSigFinishSeconds = int(jobArgs.get("StopSigFinishSeconds", 1800))  # 30 minutes
        self.stopSigNumber = int(jobArgs.get("StopSigNumber", 2))  # SIGINT
        self.stopSigRegex = jobArgs.get("StopSigRegex", None)
        self.stopSigSent = False

        self.log = gLogger.getSubLogger("Watchdog")
        self.exeThread = exeThread
        self.wrapperPID = pid
        self.appPID = self.exeThread.getCurrentPID()
        self.spObject = spObject
        self.jobCPUTime = jobCPUTime
        self.memoryLimit = memoryLimit
        self.calibration = 0
        self.initialValues = {}
        self.parameters = {}
        self.peekFailCount = 0
        self.peekRetry = 5
        self.profiler = Profiler(pid)
        self.checkError = ""
        self.currentStats = {}
        self.initialized = False
        self.count = 0

        # defaults: flags enabling (1) or disabling (0) the individual checks
        self.testWallClock = 1
        self.testDiskSpace = 1
        self.testLoadAvg = 1
        self.testCPUConsumed = 1
        self.testCPULimit = 0
        self.testMemoryLimit = 0
        self.testTimeLeft = 1
        self.pollingTime = 10  # 10 seconds
        self.checkingTime = 30 * 60  # 30 minute period
        self.minCheckingTime = 20 * 60  # 20 mins
        self.wallClockCheckSeconds = 5 * 60  # 5 minutes
        self.maxWallClockTime = 3 * 24 * 60 * 60  # 3 days
        self.jobPeekFlag = 1  # on / off
        self.minDiskSpace = 10  # MB
        self.loadAvgLimit = 1000  # > 1000 and jobs killed
        self.sampleCPUTime = 30 * 60  # 30 minutes
        self.jobCPUMargin = 20  # %age buffer before killing job
        self.minCPUWallClockRatio = 5  # ratio %age
        self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
        self.checkCount = 0
        self.wallClockCheckCount = 0
        self.nullCPUCount = 0

        self.grossTimeLeftLimit = 10 * self.checkingTime
        self.timeLeftUtil = TimeLeft()
        self.timeLeft = 0
        self.littleTimeLeft = False
        self.cpuPower = 1.0
        self.processors = processors
예제 #4
0
    def initialize(self):
        """Read the agent configuration and build the inner Computing Element.

        :return: ``S_OK()`` on success, otherwise the failing result structure
            returned by the CE initialisation or CE description lookup
        """
        # Monitoring is switched off for this agent
        self.am_disableMonitoring()

        innerCE = gConfig.getValue("/LocalSite/LocalCE", self.ceName)
        if innerCE != self.ceName:
            self.log.info("Defining Inner CE from local configuration", "= %s" % innerCE)

        # Instantiate the backend Computing Element
        ceResult = self._initializeComputingElement(innerCE)
        if not ceResult["OK"]:
            return ceResult

        descriptionResult = self._getCEDict(self.computingElement)
        if not descriptionResult["OK"]:
            return descriptionResult
        ceDescription = descriptionResult["Value"][0]

        # CPU time: the CE description first, then the CS default may override it
        self.initTimeLeft = ceDescription.get("CPUTime", self.initTimeLeft)
        self.initTimeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", self.initTimeLeft)
        self.timeLeft = self.initTimeLeft

        self.initTimes = os.times()

        # Options taken from the local site configuration
        self.siteName = siteName()
        self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", self.pilotReference)
        self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", self.defaultProxyLength)

        # Factor converting raw CPU to normalized units (based on the CPU model)
        self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", self.cpuFactor)

        # Agent options: (attribute, option name); the current attribute value is the default
        agentOptions = (
            ("jobSubmissionDelay", "SubmissionDelay"),
            ("fillingMode", "FillingModeFlag"),
            ("minimumTimeLeft", "MinimumTimeLeft"),
            ("stopOnApplicationFailure", "StopOnApplicationFailure"),
            ("stopAfterFailedMatches", "StopAfterFailedMatches"),
        )
        for attrName, optionName in agentOptions:
            setattr(self, attrName, self.am_getOption(optionName, getattr(self, attrName)))

        self.extraOptions = gConfig.getValue("/AgentJobRequirements/ExtraOptions", self.extraOptions)

        # Helper used to query the remaining time
        self.timeLeftUtil = TimeLeft()
        return S_OK()
예제 #5
0
def test__computeCPUWorkLeft(mocker, initTimeLeft, timeLeft, cpuFactor,
                             mockTimeLeftReply, expectedTimeLeft):
    """Check JobAgent()._computeCPUWorkLeft() against a mocked TimeLeft reply.

    ``AgentModule.__init__`` and ``TimeLeft.getTimeLeft`` are patched, the agent
    attributes are set from the parameters, and the computed value must lie
    within 10 units of ``expectedTimeLeft``.
    """
    agentInitPath = "DIRAC.WorkloadManagementSystem.Agent.JobAgent.AgentModule.__init__"
    getTimeLeftPath = "DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft.TimeLeft.getTimeLeft"
    mocker.patch(agentInitPath)
    mocker.patch(getTimeLeftPath, return_value=mockTimeLeftReply)

    agent = JobAgent("Test", "Test1")
    agent.log = gLogger
    agent.log.setLevel("DEBUG")
    agent.timeLeftUtil = TimeLeft()

    # Inputs of the computation under test
    agent.initTimeLeft = initTimeLeft
    agent.timeLeft = timeLeft
    agent.cpuFactor = cpuFactor

    workLeft = agent._computeCPUWorkLeft()
    deviation = abs(workLeft - expectedTimeLeft)
    assert deviation < 10
예제 #6
0
    def initialize(self, loops=0):
        """Read the agent configuration and instantiate the Computing Element.

        :param int loops: value written to the ``MaxCycles`` agent option
        :return: ``S_OK()`` on success, otherwise the failing result structure
            from the CE factory or from the CE description lookup
        """
        # Monitoring off, number of cycles limited to ``loops``
        self.am_setOption("MonitoringEnabled", False)
        self.am_setOption("MaxCycles", loops)

        ceType = self.am_getOption("CEType", self.ceName)
        configuredCE = gConfig.getValue("/LocalSite/LocalCE", "")
        if configuredCE:
            # A CE defined in the local configuration takes precedence
            self.log.info("Defining CE from local configuration", "= %s" % configuredCE)
            ceType = configuredCE

        # Instantiate the backend Computing Element
        factory = ComputingElementFactory()
        self.ceName = ceType
        ceResult = factory.getCE(ceType)
        if not ceResult["OK"]:
            self.log.warn("Can't instantiate a CE", ceResult["Message"])
            return ceResult
        self.computingElement = ceResult["Value"]

        descriptionResult = self.computingElement.getDescription()
        if not descriptionResult["OK"]:
            self.log.warn("Can not get the CE description")
            return descriptionResult
        description = descriptionResult["Value"]
        # The description may be a list of dicts; only its first entry is used
        ceDescription = description[0] if isinstance(description, list) else description

        # CPU time: the CE description first, then the CS default may override it
        self.timeLeft = ceDescription.get("CPUTime", self.timeLeft)
        self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", self.timeLeft)

        self.initTimes = os.times()

        # Options taken from the local site configuration
        self.siteName = gConfig.getValue("/LocalSite/Site", self.siteName)
        self.pilotReference = gConfig.getValue("/LocalSite/PilotReference", self.pilotReference)
        self.defaultProxyLength = gConfig.getValue("/Registry/DefaultProxyLifeTime", self.defaultProxyLength)

        # Factor converting raw CPU to normalized units (based on the CPU model)
        self.cpuFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", self.cpuFactor)

        # Agent options: (attribute, option name); the current attribute value is the default
        agentOptions = (
            ("jobSubmissionDelay", "SubmissionDelay"),
            ("fillingMode", "FillingModeFlag"),
            ("minimumTimeLeft", "MinimumTimeLeft"),
            ("stopOnApplicationFailure", "StopOnApplicationFailure"),
            ("stopAfterFailedMatches", "StopAfterFailedMatches"),
        )
        for attrName, optionName in agentOptions:
            setattr(self, attrName, self.am_getOption(optionName, getattr(self, attrName)))

        self.extraOptions = gConfig.getValue("/AgentJobRequirements/ExtraOptions", self.extraOptions)

        # Helper used to query the remaining time
        self.timeLeftUtil = TimeLeft()
        return S_OK()
예제 #7
0
def getCPUTime(cpuNormalizationFactor):
    """Try to get the CPU time left for execution (in seconds).

    It first looks for the CPU work left, either in ``/LocalSite/CPUTimeLeft``
    or from the batch system through the TimeLeft utility. If that succeeds,
    the work is converted to real seconds using the normalization factor and returned.

    If that fails, the value is derived from the static ``maxCPUTime`` found in the CS.
    If that fails too, a large default (9999999) is returned, which may be
    considered as "infinite".

    This is a generic method, independent from the middleware of the resource
    if TimeLeft doesn't return a value.

    :param float cpuNormalizationFactor: the CPU power of the current Worker Node.
        If it is 0 (or otherwise falsy), ``/LocalSite/CPUNormalizationFactor`` is used instead.
    :return: the CPU time left, in seconds
    :rtype: int
    :raises RuntimeError: if no CEQueue is configured locally and the queues
        section of the CE cannot be listed from the CS
    """
    defaultCPUTime = 9999999.0
    cpuTimeLeft = 0.0
    cpuWorkLeft = gConfig.getValue("/LocalSite/CPUTimeLeft", 0)

    if not cpuWorkLeft:
        # Try and get the information from the CPU left utility
        result = TimeLeft().getTimeLeft()
        if result["OK"]:
            cpuWorkLeft = result["Value"]

    if cpuWorkLeft > 0:
        # This is in HS06 seconds
        # We need to convert in real seconds
        if not cpuNormalizationFactor:  # if cpuNormalizationFactor passed in is 0, try get it from the local cfg
            cpuNormalizationFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0)
        if cpuNormalizationFactor:
            cpuTimeLeft = cpuWorkLeft / cpuNormalizationFactor

    if not cpuTimeLeft:
        # now we know that we have to find the CPUTimeLeft by looking in the CS
        # this is not granted to be correct as the CS units may not be real seconds
        gridCE = gConfig.getValue("/LocalSite/GridCE")
        ceQueue = gConfig.getValue("/LocalSite/CEQueue")
        if not ceQueue:
            # we have to look for a ceQueue in the CS
            # A bit hacky. We should better profit from something generic
            gLogger.warn("No CEQueue in local configuration, looking to find one in CS")
            siteName = DIRAC.siteName()
            queueSection = "/Resources/Sites/%s/%s/CEs/%s/Queues" % (siteName.split(".")[0], siteName, gridCE)
            res = gConfig.getSections(queueSection)
            if not res["OK"]:
                raise RuntimeError(res["Message"])
            queues = res["Value"]
            cpuTimes = [gConfig.getValue(queueSection + "/" + queue + "/maxCPUTime", 9999999.0) for queue in queues]
            if cpuTimes:
                # These are (real, wall clock) minutes - damn BDII!
                cpuTimeLeft = min(cpuTimes) * 60
            else:
                # The section exists but holds no queue: min() of an empty list
                # would raise ValueError, so fall back to the documented default
                cpuTimeLeft = defaultCPUTime
                gLogger.warn("No queues found in %s, defaulting CPUTime to %d" % (queueSection, cpuTimeLeft))
        else:
            queueInfo = getQueueInfo("%s/%s" % (gridCE, ceQueue))
            cpuTimeLeft = defaultCPUTime
            if not queueInfo["OK"] or not queueInfo["Value"]:
                gLogger.warn("Can't find a CE/queue, defaulting CPUTime to %d" % cpuTimeLeft)
            else:
                queueCSSection = queueInfo["Value"]["QueueCSSection"]
                # These are (real, wall clock) minutes - damn BDII!
                cpuTimeInMinutes = gConfig.getValue("%s/maxCPUTime" % queueCSSection, 0.0)
                if cpuTimeInMinutes:
                    cpuTimeLeft = cpuTimeInMinutes * 60.0
                    gLogger.info("CPUTime for %s: %f" % (queueCSSection, cpuTimeLeft))
                else:
                    gLogger.warn(
                        "Can't find maxCPUTime for %s, defaulting CPUTime to %f" % (queueCSSection, cpuTimeLeft)
                    )

    return int(cpuTimeLeft)