def test_getScaledCPU(mocker, batch, requiredVariables, returnValue, expected):
    """Test TimeLeft.getScaledCPU() with and without a batch plugin.

    The test first checks that a freshly built ``TimeLeft`` reports 0, then
    attaches a ``<batch>ResourceUsage`` plugin and checks the scaled value.

    :param mocker: pytest-mock fixture used to patch ``runCommand``
    :param str batch: batch system prefix; ``<batch>ResourceUsage`` is both the
        plugin module name and the plugin class name
    :param dict requiredVariables: attributes injected into the plugin instance
    :param returnValue: value wrapped in ``S_OK`` and returned by the patched ``runCommand``
    :param expected: value ``getScaledCPU()`` must return once the plugin is set
    """
    mocker.patch(
        "DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft.runCommand", return_value=S_OK(returnValue)
    )
    tl = TimeLeft()
    res = tl.getScaledCPU()
    assert res == 0

    tl.scaleFactor = 5.0
    tl.normFactor = 5.0

    batchSystemName = "%sResourceUsage" % batch
    batchSystemPath = "DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s" % batchSystemName
    batchPlugin = __import__(batchSystemPath, globals(), locals(), [batchSystemName])
    # Need to be reloaded to update the mock within the module, else, it will reuse the one when loaded the first time
    reload_module(batchPlugin)

    # Look the plugin class up by name on the module instead of eval()-ing a built string
    tl.batchPlugin = getattr(batchPlugin, batchSystemName)()

    # Update attributes of the batch systems to get scaled CPU
    tl.batchPlugin.__dict__.update(requiredVariables)

    res = tl.getScaledCPU()
    assert res == expected
def test_getTimeLeft(mocker, batch, requiredVariables, returnValue, expected_1, expected_2):
    """Test TimeLeft.getTimeLeft() against a ``<batch>ResourceUsage`` plugin.

    :param mocker: pytest-mock fixture used to patch ``runCommand``
    :param str batch: batch system prefix; ``<batch>ResourceUsage`` is both the
        plugin module name and the plugin class name
    :param dict requiredVariables: attributes injected into the plugin instance
    :param returnValue: value wrapped in ``S_OK`` and returned by the patched ``runCommand``
    :param bool expected_1: expected ``OK`` flag of the result
    :param expected_2: expected ``Value`` of the result, only checked when the call succeeds
    """
    mocker.patch(
        "DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft.runCommand", return_value=S_OK(returnValue)
    )
    tl = TimeLeft()

    batchSystemName = "%sResourceUsage" % batch
    batchSystemPath = "DIRAC.Resources.Computing.BatchSystems.TimeLeft.%s" % batchSystemName
    batchPlugin = __import__(batchSystemPath, globals(), locals(), [batchSystemName])
    # Need to be reloaded to update the mock within the module, else, it will reuse the one when loaded the first time
    reload(batchPlugin)

    # Look the plugin class up by name on the module instead of eval()-ing a built string
    tl.batchPlugin = getattr(batchPlugin, batchSystemName)()
    tl.cpuPower = 10.0

    # Update attributes of the batch systems to get scaled CPU
    tl.batchPlugin.__dict__.update(requiredVariables)

    res = tl.getTimeLeft()
    assert res["OK"] is expected_1
    if res["OK"]:
        assert res["Value"] == expected_2
def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, jobArgs=None):
    """Set up the watchdog state and its default thresholds.

    :param pid: PID stored as ``wrapperPID`` and handed to the ``Profiler``
    :param exeThread: object exposing ``getCurrentPID()``; its result is stored as ``appPID``
    :param spObject: stored as-is on the instance (presumably the subprocess wrapper -- confirm with caller)
    :param jobCPUTime: stored as-is on the instance
    :param memoryLimit: stored as-is on the instance (default 0)
    :param processors: stored as-is on the instance (default 1)
    :param dict jobArgs: optional job arguments; only the ``StopSig*`` keys are read here.
        ``None`` (the default) is treated as an empty dictionary.
    """
    # Avoid a shared mutable default: build a fresh dict per call
    if jobArgs is None:
        jobArgs = {}

    # Stop-signal settings, overridable through the job arguments
    self.stopSigStartSeconds = int(jobArgs.get("StopSigStartSeconds", 1800))  # 30 minutes
    self.stopSigFinishSeconds = int(jobArgs.get("StopSigFinishSeconds", 1800))  # 30 minutes
    self.stopSigNumber = int(jobArgs.get("StopSigNumber", 2))  # SIGINT
    self.stopSigRegex = jobArgs.get("StopSigRegex", None)
    self.stopSigSent = False

    self.log = gLogger.getSubLogger("Watchdog")
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUTime = jobCPUTime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}
    self.parameters = {}
    self.peekFailCount = 0
    self.peekRetry = 5
    self.profiler = Profiler(pid)
    self.checkError = ""
    self.currentStats = {}
    self.initialized = False
    self.count = 0

    # defaults
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.wallClockCheckSeconds = 5 * 60  # 5 minutes
    self.maxWallClockTime = 3 * 24 * 60 * 60  # 3 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # 30 minutes
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.wallClockCheckCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    self.cpuPower = 1.0
    self.processors = processors
def initialize(self):
    """Load the agent configuration and build the backend Computing Element.

    Monitoring is disabled, the CE named by ``/LocalSite/LocalCE`` (falling back
    to ``self.ceName``) is initialized, and the agent attributes are refreshed
    from the configuration, keeping their current values as defaults.

    :return: ``S_OK()`` on success, otherwise the failing result of the CE
        initialization or of the CE description lookup
    """
    # Disable monitoring
    self.am_disableMonitoring()

    localCE = gConfig.getValue("/LocalSite/LocalCE", self.ceName)
    if localCE != self.ceName:
        self.log.info("Defining Inner CE from local configuration", "= %s" % localCE)

    # Create backend Computing Element
    ceSetup = self._initializeComputingElement(localCE)
    if not ceSetup["OK"]:
        return ceSetup

    ceDescription = self._getCEDict(self.computingElement)
    if not ceDescription["OK"]:
        return ceDescription
    ceDict = ceDescription["Value"][0]

    # CE-level CPU time first, then the CS-wide default may override it
    self.initTimeLeft = ceDict.get("CPUTime", self.initTimeLeft)
    self.initTimeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", self.initTimeLeft)
    self.timeLeft = self.initTimeLeft

    self.initTimes = os.times()

    # Localsite options, plus the factor to convert raw CPU to Normalized units
    # (based on the CPU Model); each keeps its current value as default
    self.siteName = siteName()
    for attribute, csPath in (
        ("pilotReference", "/LocalSite/PilotReference"),
        ("defaultProxyLength", "/Registry/DefaultProxyLifeTime"),
        ("cpuFactor", "/LocalSite/CPUNormalizationFactor"),
    ):
        setattr(self, attribute, gConfig.getValue(csPath, getattr(self, attribute)))

    # Agent options
    for attribute, option in (
        ("jobSubmissionDelay", "SubmissionDelay"),
        ("fillingMode", "FillingModeFlag"),
        ("minimumTimeLeft", "MinimumTimeLeft"),
        ("stopOnApplicationFailure", "StopOnApplicationFailure"),
        ("stopAfterFailedMatches", "StopAfterFailedMatches"),
    ):
        setattr(self, attribute, self.am_getOption(option, getattr(self, attribute)))

    self.extraOptions = gConfig.getValue("/AgentJobRequirements/ExtraOptions", self.extraOptions)

    # Utilities
    self.timeLeftUtil = TimeLeft()
    return S_OK()
def test__computeCPUWorkLeft(mocker, initTimeLeft, timeLeft, cpuFactor, mockTimeLeftReply, expectedTimeLeft):
    """Check JobAgent()._computeCPUWorkLeft() against a mocked TimeLeft reply.

    The computed value must be within 10 units of ``expectedTimeLeft``.
    """
    agentInitTarget = "DIRAC.WorkloadManagementSystem.Agent.JobAgent.AgentModule.__init__"
    getTimeLeftTarget = "DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft.TimeLeft.getTimeLeft"
    mocker.patch(agentInitTarget)
    mocker.patch(getTimeLeftTarget, return_value=mockTimeLeftReply)

    agent = JobAgent("Test", "Test1")
    agent.log = gLogger
    agent.log.setLevel("DEBUG")
    agent.timeLeftUtil = TimeLeft()

    # Inject the parametrized state the computation relies on
    for attribute, value in (
        ("initTimeLeft", initTimeLeft),
        ("timeLeft", timeLeft),
        ("cpuFactor", cpuFactor),
    ):
        setattr(agent, attribute, value)

    cpuWorkLeft = agent._computeCPUWorkLeft()
    deviation = abs(cpuWorkLeft - expectedTimeLeft)
    assert deviation < 10
def initialize(self, loops=0):
    """Load the agent configuration and instantiate the Computing Element.

    :param int loops: value stored in the ``MaxCycles`` agent option
    :return: ``S_OK()`` on success, otherwise the failing result returned by
        the CE factory or by the CE description lookup
    """
    # Disable monitoring, logLevel INFO, limited cycles
    self.am_setOption("MonitoringEnabled", False)
    self.am_setOption("MaxCycles", loops)

    # A CE defined in the local configuration takes precedence over the agent option
    ceType = self.am_getOption("CEType", self.ceName)
    localCE = gConfig.getValue("/LocalSite/LocalCE", "")
    if localCE:
        self.log.info("Defining CE from local configuration", "= %s" % localCE)
        ceType = localCE

    # Create backend Computing Element
    ceFactory = ComputingElementFactory()
    self.ceName = ceType
    ceInstance = ceFactory.getCE(ceType)
    if not ceInstance["OK"]:
        self.log.warn("Can't instantiate a CE", ceInstance["Message"])
        return ceInstance
    self.computingElement = ceInstance["Value"]

    description = self.computingElement.getDescription()
    if not description["OK"]:
        self.log.warn("Can not get the CE description")
        return description
    # The description may come back either as a list of dicts or as a single dict
    ceDict = description["Value"][0] if isinstance(description["Value"], list) else description["Value"]

    self.timeLeft = ceDict.get("CPUTime", self.timeLeft)
    self.timeLeft = gConfig.getValue("/Resources/Computing/CEDefaults/MaxCPUTime", self.timeLeft)

    self.initTimes = os.times()

    # Localsite options, plus the factor to convert raw CPU to Normalized units
    # (based on the CPU Model); each keeps its current value as default
    for attribute, csPath in (
        ("siteName", "/LocalSite/Site"),
        ("pilotReference", "/LocalSite/PilotReference"),
        ("defaultProxyLength", "/Registry/DefaultProxyLifeTime"),
        ("cpuFactor", "/LocalSite/CPUNormalizationFactor"),
    ):
        setattr(self, attribute, gConfig.getValue(csPath, getattr(self, attribute)))

    # Agent options
    for attribute, option in (
        ("jobSubmissionDelay", "SubmissionDelay"),
        ("fillingMode", "FillingModeFlag"),
        ("minimumTimeLeft", "MinimumTimeLeft"),
        ("stopOnApplicationFailure", "StopOnApplicationFailure"),
        ("stopAfterFailedMatches", "StopAfterFailedMatches"),
    ):
        setattr(self, attribute, self.am_getOption(option, getattr(self, attribute)))

    self.extraOptions = gConfig.getValue("/AgentJobRequirements/ExtraOptions", self.extraOptions)

    # Timeleft
    self.timeLeftUtil = TimeLeft()
    return S_OK()
def getCPUTime(cpuNormalizationFactor):
    """Try to get the CPU time left for execution (in seconds).

    It first looks for the work left: either ``/LocalSite/CPUTimeLeft`` from the
    local configuration or, failing that, the batch system information obtained
    through the TimeLeft utility. If a positive value is found and a
    normalization factor is available, the work left is converted to real
    seconds and returned.

    Otherwise it falls back to the static ``maxCPUTime`` information found in
    the CS for the local CE/queue. If that is not available either, it uses the
    default, a large 9999999, that we may consider as "Infinite".

    This is a generic method, independent from the middleware of the resource
    if TimeLeft doesn't return a value.

    :param float cpuNormalizationFactor: the CPU power of the current Worker Node.
        If a false value (e.g. 0) is passed in, it is read from
        ``/LocalSite/CPUNormalizationFactor`` in the local configuration.
    :return: the CPU time left, in seconds, truncated to an int
    :raises RuntimeError: if no CEQueue is configured and the queues section
        of the CS cannot be listed
    """
    # "Infinite" placeholder used whenever no usable limit is found
    defaultCPUTime = 9999999.0

    cpuTimeLeft = 0.0
    cpuWorkLeft = gConfig.getValue("/LocalSite/CPUTimeLeft", 0)

    if not cpuWorkLeft:
        # Try and get the information from the CPU left utility
        result = TimeLeft().getTimeLeft()
        if result["OK"]:
            cpuWorkLeft = result["Value"]

    if cpuWorkLeft > 0:
        # This is in HS06 seconds
        # We need to convert in real seconds
        if not cpuNormalizationFactor:  # if cpuNormalizationFactor passed in is 0, try get it from the local cfg
            cpuNormalizationFactor = gConfig.getValue("/LocalSite/CPUNormalizationFactor", 0.0)
        if cpuNormalizationFactor:
            cpuTimeLeft = cpuWorkLeft / cpuNormalizationFactor

    if not cpuTimeLeft:
        # now we know that we have to find the CPUTimeLeft by looking in the CS
        # this is not granted to be correct as the CS units may not be real seconds
        gridCE = gConfig.getValue("/LocalSite/GridCE")
        ceQueue = gConfig.getValue("/LocalSite/CEQueue")
        if not ceQueue:
            # we have to look for a ceQueue in the CS
            # A bit hacky. We should better profit from something generic
            gLogger.warn("No CEQueue in local configuration, looking to find one in CS")
            siteName = DIRAC.siteName()
            queueSection = "/Resources/Sites/%s/%s/CEs/%s/Queues" % (siteName.split(".")[0], siteName, gridCE)
            res = gConfig.getSections(queueSection)
            if not res["OK"]:
                raise RuntimeError(res["Message"])
            queues = res["Value"]
            cpuTimes = [
                gConfig.getValue(queueSection + "/" + queue + "/maxCPUTime", defaultCPUTime) for queue in queues
            ]
            if not cpuTimes:
                # min() of an empty list raises ValueError: treat a section without
                # queues exactly like a queue that defines no maxCPUTime
                gLogger.warn("No queue found in %s, using the default maxCPUTime" % queueSection)
                cpuTimes = [defaultCPUTime]
            # These are (real, wall clock) minutes - damn BDII!
            cpuTimeLeft = min(cpuTimes) * 60
        else:
            queueInfo = getQueueInfo("%s/%s" % (gridCE, ceQueue))
            cpuTimeLeft = defaultCPUTime
            if not queueInfo["OK"] or not queueInfo["Value"]:
                gLogger.warn("Can't find a CE/queue, defaulting CPUTime to %d" % cpuTimeLeft)
            else:
                queueCSSection = queueInfo["Value"]["QueueCSSection"]
                # These are (real, wall clock) minutes - damn BDII!
                cpuTimeInMinutes = gConfig.getValue("%s/maxCPUTime" % queueCSSection, 0.0)
                if cpuTimeInMinutes:
                    cpuTimeLeft = cpuTimeInMinutes * 60.0
                    gLogger.info("CPUTime for %s: %f" % (queueCSSection, cpuTimeLeft))
                else:
                    gLogger.warn(
                        "Can't find maxCPUTime for %s, defaulting CPUTime to %f" % (queueCSSection, cpuTimeLeft)
                    )

    return int(cpuTimeLeft)