def test_getTimeLeft(self):
    """Check ``TimeLeft.getTimeLeft()`` with mocked LSF and SGE command output.

    For every batch system the module-level ``runCommand`` of the TimeLeft
    module is replaced by a mock returning canned output, the matching
    ``<batch>TimeLeft`` plugin is instantiated and attached by hand, and the
    call is required to succeed.  For SGE the returned value is pinned too.
    """
    # (batch system, canned runCommand output, expected 'Value' or None if not checked)
    cases = [
        ('LSF', LSF_ReturnValue, None),
        ('SGE', SGE_ReturnValue, 9400.0),
    ]

    for batch, retValue, expectedValue in cases:
        self.tl = importlib.import_module("DIRAC.Core.Utilities.TimeLeft.TimeLeft")
        rcMock = MagicMock()
        rcMock.return_value = S_OK(retValue)
        self.tl.runCommand = rcMock

        tl = TimeLeft()

        # Resolve the plugin class by name; no need to eval() a source string.
        batchSystemName = '%sTimeLeft' % batch
        batchModule = importlib.import_module(
            'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName)
        tl.batchPlugin = getattr(batchModule, batchSystemName)()

        tl.scaleFactor = 10.0
        tl.normFactor = 10.0
        tl.batchPlugin.bin = '/usr/bin'
        tl.batchPlugin.hostNorm = 10.0
        tl.batchPlugin.cpuLimit = 1000
        tl.batchPlugin.wallClockLimit = 1000

        res = tl.getTimeLeft()
        # assertTrue replaces the deprecated assert_ alias.
        self.assertTrue(res['OK'])
        if expectedValue is not None:
            self.assertEqual(res['Value'], expectedValue)
def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1,
             systemFlag='linux', jobArgs=None):
    """ Constructor, takes system flag as argument.

    :param pid: stored as ``wrapperPID`` and handed to ``Profiler``
    :param exeThread: object exposing ``getCurrentPID()``; the value it returns
                      is stored as ``appPID``
    :param spObject: stored unchanged on the instance
    :param jobCPUTime: stored unchanged on the instance
    :param memoryLimit: stored unchanged on the instance
    :param processors: stored unchanged on the instance
    :param systemFlag: stored unchanged on the instance
    :param jobArgs: optional mapping from which ``StopSigStartSeconds``,
                    ``StopSigFinishSeconds``, ``StopSigNumber`` and
                    ``StopSigRegex`` are read; ``None`` means "no overrides"
    """
    # None instead of {} as default: a dict literal in the signature would be
    # one object shared by every call.
    if jobArgs is None:
        jobArgs = {}

    self.stopSigStartSeconds = int(jobArgs.get('StopSigStartSeconds', 1800))  # 30 minutes
    self.stopSigFinishSeconds = int(jobArgs.get('StopSigFinishSeconds', 1800))  # 30 minutes
    self.stopSigNumber = int(jobArgs.get('StopSigNumber', 2))  # SIGINT
    self.stopSigRegex = jobArgs.get('StopSigRegex', None)
    self.stopSigSent = False

    self.log = gLogger.getSubLogger("Watchdog")
    self.systemFlag = systemFlag
    self.exeThread = exeThread
    self.wrapperPID = pid
    self.appPID = self.exeThread.getCurrentPID()
    self.spObject = spObject
    self.jobCPUTime = jobCPUTime
    self.memoryLimit = memoryLimit
    self.calibration = 0
    self.initialValues = {}
    self.parameters = {}
    self.peekFailCount = 0
    self.peekRetry = 5
    self.profiler = Profiler(pid)
    self.checkError = ''
    self.currentStats = {}
    self.initialized = False
    self.count = 0

    # defaults
    self.testWallClock = 1
    self.testDiskSpace = 1
    self.testLoadAvg = 1
    self.testCPUConsumed = 1
    self.testCPULimit = 0
    self.testMemoryLimit = 0
    self.testTimeLeft = 1
    self.pollingTime = 10  # 10 seconds
    self.checkingTime = 30 * 60  # 30 minute period
    self.minCheckingTime = 20 * 60  # 20 mins
    self.wallClockCheckSeconds = 5 * 60  # 5 minutes
    self.maxWallClockTime = 3 * 24 * 60 * 60  # 3 days
    self.jobPeekFlag = 1  # on / off
    self.minDiskSpace = 10  # MB
    self.loadAvgLimit = 1000  # > 1000 and jobs killed
    self.sampleCPUTime = 30 * 60  # 30 minutes
    self.jobCPUMargin = 20  # %age buffer before killing job
    self.minCPUWallClockRatio = 5  # ratio %age
    self.nullCPULimit = 5  # After 5 sample times return null CPU consumption kill job
    self.checkCount = 0
    self.wallClockCheckCount = 0
    self.nullCPUCount = 0

    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    self.scaleFactor = 1.0
    self.processors = processors
def initialize(self, loops=0):
    """Configure the agent and instantiate its Computing Element.

    Agent options and local-site configuration values are copied onto the
    instance, the CE is built through ``ComputingElementFactory`` and the
    job/match counters and time-left bookkeeping are reset.

    :param loops: value written to the agent option ``MaxCycles``
    :return: ``S_OK()`` on success; when the CE cannot be created the failed
             result of ``getCE`` is returned unchanged
    """
    # Monitoring is switched off for this agent
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    # A CE named in the local configuration wins over the agent option
    requestedCE = self.am_getOption('CEUniqueID', 'InProcess')
    siteCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if siteCE:
        self.log.info('Defining CE from local configuration = %s' % siteCE)
        requestedCE = siteCE

    factory = ComputingElementFactory()
    self.ceName = requestedCE
    ceResult = factory.getCE(requestedCE)
    if not ceResult['OK']:
        self.log.warn(ceResult['Message'])
        return ceResult
    self.computingElement = ceResult['Value']

    # Four directory levels above this file
    rootDir = __file__
    for _ in range(4):
        rootDir = os.path.dirname(rootDir)
    self.diracRoot = rootDir

    # Local site settings
    self.siteRoot = gConfig.getValue('/LocalSite/Root', os.getcwd())
    self.siteName = gConfig.getValue('/LocalSite/Site', 'Unknown')
    self.pilotReference = gConfig.getValue('/LocalSite/PilotReference', 'Unknown')
    self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime', 86400 * 5)

    # Agent settings
    # Factor converting raw CPU to normalized units (based on the CPU model)
    self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0)
    wrapperTemplate = self.am_getOption(
        'JobWrapperTemplate',
        'DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py')
    self.jobWrapperTemplate = os.path.join(self.diracRoot, wrapperTemplate)
    self.jobSubmissionDelay = self.am_getOption('SubmissionDelay', 10)
    self.defaultLogLevel = self.am_getOption('DefaultLogLevel', 'info')
    self.fillingMode = self.am_getOption('FillingModeFlag', False)
    self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure', True)
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', 10)
    self.jobCount = 0
    self.matchFailedCount = 0

    # Time-left bookkeeping
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
    self.gridCEQueue = gConfig.getValue('/Resources/Computing/CEDefaults/GridCEQueue', '')
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()
def initialize(self, loops=0):
    """Create the backend Computing Element and refresh the agent settings.

    Every setting read here falls back to the value already present on the
    instance, so the attributes must exist before this method is called.

    :param loops: value written to the agent option ``MaxCycles``
    :return: ``S_OK()`` on success; the failed result of ``getCE`` or of
             ``getDescription`` is returned unchanged on error
    """
    # Monitoring is not used by this agent
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    # The locally configured CE, if any, overrides the agent's CEType option
    selectedCE = self.am_getOption('CEType', 'InProcess')
    localCE = gConfig.getValue('/LocalSite/LocalCE', '')
    if localCE:
        self.log.info('Defining CE from local configuration = %s' % localCE)
        selectedCE = localCE

    # Backend Computing Element
    factory = ComputingElementFactory()
    self.ceName = selectedCE
    created = factory.getCE(selectedCE)
    if not created['OK']:
        self.log.warn(created['Message'])
        return created
    self.computingElement = created['Value']

    described = self.computingElement.getDescription()
    if not described['OK']:
        self.log.warn("Can not get the CE description")
        return described

    # CPU time: CE description first, then the CEDefaults configuration value
    ceCPUTime = described['Value'].get('CPUTime', self.timeLeft)
    self.timeLeft = ceCPUTime
    self.timeLeft = gConfig.getValue('/Resources/Computing/CEDefaults/MaxCPUTime',
                                     self.timeLeft)

    self.initTimes = os.times()

    # Local site settings
    self.siteName = gConfig.getValue('/LocalSite/Site', self.siteName)
    self.pilotReference = gConfig.getValue('/LocalSite/PilotReference',
                                           self.pilotReference)
    self.defaultProxyLength = gConfig.getValue('/Registry/DefaultProxyLifeTime',
                                               self.defaultProxyLength)

    # Agent settings
    # Factor converting raw CPU to normalized units (based on the CPU model)
    self.cpuFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor',
                                      self.cpuFactor)
    self.jobSubmissionDelay = self.am_getOption('SubmissionDelay',
                                                self.jobSubmissionDelay)
    self.fillingMode = self.am_getOption('FillingModeFlag', self.fillingMode)
    self.minimumTimeLeft = self.am_getOption('MinimumTimeLeft', self.minimumTimeLeft)
    self.stopOnApplicationFailure = self.am_getOption('StopOnApplicationFailure',
                                                      self.stopOnApplicationFailure)
    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches',
                                                    self.stopAfterFailedMatches)
    self.extraOptions = gConfig.getValue('/AgentJobRequirements/ExtraOptions',
                                         self.extraOptions)

    # Time-left utility
    self.timeLeftUtil = TimeLeft()
    return S_OK()
def initialize(self, loops=0):
    """Set the agent defaults and build the Computing Element instance.

    :param loops: value written to the agent option ``MaxCycles``
    :return: ``S_OK()`` on success; when the CE cannot be created the failed
             result of ``getCE`` is returned unchanged
    """
    readOption = self.am_getOption
    readConfig = gConfig.getValue

    # No monitoring for this agent
    self.am_setOption('MonitoringEnabled', False)
    self.am_setOption('MaxCycles', loops)

    # CE type: agent option, unless the local configuration names one
    ceType = readOption('CEType', 'InProcess')
    configuredCE = readConfig('/LocalSite/LocalCE', '')
    if configuredCE:
        self.log.info('Defining CE from local configuration = %s' % configuredCE)
        ceType = configuredCE

    ceFactory = ComputingElementFactory()
    self.ceName = ceType
    outcome = ceFactory.getCE(ceType)
    if not outcome['OK']:
        self.log.warn(outcome['Message'])
        return outcome

    self.initTimes = os.times()
    self.computingElement = outcome['Value']

    # Local site settings
    self.siteName = readConfig('/LocalSite/Site', 'Unknown')
    self.pilotReference = readConfig('/LocalSite/PilotReference', 'Unknown')
    self.defaultProxyLength = readConfig('/Registry/DefaultProxyLifeTime', 86400 * 5)

    # Agent settings
    # Factor converting raw CPU to normalized units (based on the CPU model)
    self.cpuFactor = readConfig('/LocalSite/CPUNormalizationFactor', 0.0)
    self.jobSubmissionDelay = readOption('SubmissionDelay', 10)
    self.fillingMode = readOption('FillingModeFlag', False)
    self.stopOnApplicationFailure = readOption('StopOnApplicationFailure', True)
    self.stopAfterFailedMatches = readOption('StopAfterFailedMatches', 10)
    self.jobCount = 0
    self.matchFailedCount = 0
    self.extraOptions = readConfig('/AgentJobRequirements/ExtraOptions', '')

    # Time-left bookkeeping
    self.timeLeftUtil = TimeLeft()
    self.timeLeft = readConfig('/Resources/Computing/CEDefaults/MaxCPUTime', 0.0)
    self.timeLeftError = ''
    self.scaledCPUTime = 0.0
    self.pilotInfoReportedFlag = False
    return S_OK()
def test_getScaledCPU(self):
    """Check ``TimeLeft.getScaledCPU()`` without a plugin and with LSF, SGE and MJF.

    A fresh ``TimeLeft`` must report 0.  The same instance is then reused:
    for each batch system the module-level ``runCommand`` of the TimeLeft
    module is replaced by a mock returning canned output, the matching
    ``<batch>TimeLeft`` plugin is attached and the scaled CPU is compared with
    the expected value.
    """
    tl = TimeLeft()
    res = tl.getScaledCPU()
    self.assertEqual(res, 0)

    tl.scaleFactor = 5.0
    tl.normFactor = 5.0

    # (batch system, canned runCommand output, expected scaled CPU)
    cases = [
        ('LSF', LSF_ReturnValue, 0.0),
        ('SGE', SGE_ReturnValue, 300.0),
        ('MJF', MJF_ReturnValue, 0.0),
    ]

    for batch, retValue, expectedCPU in cases:
        self.tl = importlib.import_module("DIRAC.Core.Utilities.TimeLeft.TimeLeft")
        rcMock = MagicMock()
        rcMock.return_value = S_OK(retValue)
        self.tl.runCommand = rcMock

        # Resolve the plugin class by name; no need to eval() a source string.
        batchSystemName = '%sTimeLeft' % batch
        batchModule = importlib.import_module(
            'DIRAC.Core.Utilities.TimeLeft.%s' % batchSystemName)
        tl.batchPlugin = getattr(batchModule, batchSystemName)()

        res = tl.getScaledCPU()
        self.assertEqual(res, expectedCPU)
def getCPUTime(cpuNormalizationFactor):
    """ Trying to get CPUTime (in seconds) from the CS or from TimeLeft.

    The work left is read from ``/LocalSite/CPUTimeLeft`` or, failing that,
    from the TimeLeft utility, and divided by the normalization factor.  When
    that yields nothing, the ``maxCPUTime`` of the CE queue(s) found in the CS
    is used.  The default is a (low) 3600s.

    This is a generic method, independent from the middleware of the resource
    if TimeLeft doesn't return a value.

    :param cpuNormalizationFactor: factor the work left is divided by; when
                                   falsy, ``/LocalSite/CPUNormalizationFactor``
                                   is read from the local configuration instead
    :return: CPU time left as an int
    :raises RuntimeError: if no CEQueue is configured locally and the site
                          name is not set, the queue sections cannot be
                          listed, or no queue is defined for the CE
    """
    cpuTimeLeft = 0.
    cpuWorkLeft = gConfig.getValue('/LocalSite/CPUTimeLeft', 0)

    if not cpuWorkLeft:
        # Try and get the information from the CPU left utility
        result = TimeLeft().getTimeLeft()
        if result['OK']:
            cpuWorkLeft = result['Value']

    # Only a strictly positive amount of work is usable: a negative value is
    # truthy and would otherwise be returned as a negative CPU time.
    if cpuWorkLeft and cpuWorkLeft > 0:
        # This is in HS06 seconds
        # We need to convert in real seconds
        if not cpuNormalizationFactor:
            # if cpuNormalizationFactor passed in is 0, try get it from the local cfg
            cpuNormalizationFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0)
        if cpuNormalizationFactor:
            cpuTimeLeft = cpuWorkLeft / cpuNormalizationFactor  # this is a float

    if not cpuTimeLeft:
        # now we know that we have to find the CPUTimeLeft by looking in the CS
        # this is not granted to be correct as the CS units may not be real seconds
        gridCE = gConfig.getValue('/LocalSite/GridCE')
        ceQueue = gConfig.getValue('/LocalSite/CEQueue')
        if not ceQueue:
            # we have to look for a CEQueue in the CS
            # A bit hacky. We should better profit from something generic
            gLogger.warn("No CEQueue in local configuration, looking to find one in CS")
            siteName = gConfig.getValue('/LocalSite/Site')
            if not siteName:
                # Fail in the function's own error style rather than on .split()
                raise RuntimeError("No site name found in /LocalSite/Site")
            queueSection = '/Resources/Sites/%s/%s/CEs/%s/Queues' % (
                siteName.split('.')[0], siteName, gridCE)
            res = gConfig.getSections(queueSection)
            if not res['OK']:
                raise RuntimeError(res['Message'])
            queues = res['Value']
            if not queues:
                # min() of an empty list would raise an opaque ValueError
                raise RuntimeError("No queues found in %s" % queueSection)
            cpuTimes = [gConfig.getValue(queueSection + '/' + queue + '/maxCPUTime', 10000.)
                        for queue in queues]
            # These are (real, wall clock) minutes - damn BDII!
            cpuTimeLeft = min(cpuTimes) * 60
        else:
            queueInfo = getQueueInfo('%s/%s' % (gridCE, ceQueue))
            cpuTimeLeft = 3600.
            if not queueInfo['OK'] or not queueInfo['Value']:
                gLogger.warn("Can't find a CE/queue, defaulting CPUTime to %d" % cpuTimeLeft)
            else:
                queueCSSection = queueInfo['Value']['QueueCSSection']
                # These are (real, wall clock) minutes - damn BDII!
                cpuTimeInMinutes = gConfig.getValue('%s/maxCPUTime' % queueCSSection, 0.)
                if cpuTimeInMinutes:
                    cpuTimeLeft = cpuTimeInMinutes * 60.
                    gLogger.info("CPUTime for %s: %f" % (queueCSSection, cpuTimeLeft))
                else:
                    gLogger.warn("Can't find maxCPUTime for %s, defaulting CPUTime to %f" %
                                 (queueCSSection, cpuTimeLeft))

    return int(cpuTimeLeft)
def initialize(self, loops=0):
    """ Watchdog initialization.

    Runs once: a second call only logs a message and returns ``S_OK()``.
    The check flags and tuning parameters are read from the JobWrapper
    section of the WorkloadManagement system instance, each with a
    built-in default.

    :param loops: stored as ``self.maxcount``
    :return: ``S_OK()``, or ``S_ERROR`` when the DIRAC setup or the
             WorkloadManagement system instance cannot be determined
    """
    if self.initialized:
        self.log.info('Watchdog already initialized')
        return S_OK()
    self.initialized = True

    if not gConfig.getValue('/DIRAC/Setup', ''):
        return S_ERROR('Can not get the DIRAC Setup value')
    wmsInstance = getSystemInstance("WorkloadManagement")
    if not wmsInstance:
        return S_ERROR('Can not get the WorkloadManagement system instance')
    self.section = '/Systems/WorkloadManagement/%s/JobWrapper' % wmsInstance

    self.maxcount = loops
    self.log.verbose('Watchdog initialization')
    self.log.info('Attempting to Initialize Watchdog for: %s' % (self.systemFlag))

    def wrapperOption(name, default):
        """Read one option from the JobWrapper section."""
        return gConfig.getValue(self.section + '/' + name, default)

    # Switches for the individual checks
    self.testWallClock = wrapperOption('CheckWallClockFlag', 1)
    self.testDiskSpace = wrapperOption('CheckDiskSpaceFlag', 1)
    self.testLoadAvg = wrapperOption('CheckLoadAvgFlag', 1)
    self.testCPUConsumed = wrapperOption('CheckCPUConsumedFlag', 1)
    self.testCPULimit = wrapperOption('CheckCPULimitFlag', 0)
    self.testMemoryLimit = wrapperOption('CheckMemoryLimitFlag', 0)
    self.testTimeLeft = wrapperOption('CheckTimeLeftFlag', 1)

    # Tuning parameters
    self.pollingTime = wrapperOption('PollingTime', 10)  # default: 10 seconds
    self.checkingTime = wrapperOption('CheckingTime', 30 * 60)  # default: 30 minutes
    self.minCheckingTime = wrapperOption('MinCheckingTime', 20 * 60)  # default: 20 minutes
    self.maxWallClockTime = wrapperOption('MaxWallClockTime', 3 * 24 * 60 * 60)  # default: 3 days
    self.jobPeekFlag = wrapperOption('JobPeekFlag', 1)  # on / off
    self.minDiskSpace = wrapperOption('MinDiskSpace', 10)  # MB
    self.loadAvgLimit = wrapperOption('LoadAverageLimit', 1000)  # > 1000 and jobs killed
    self.sampleCPUTime = wrapperOption('CPUSampleTime', 30 * 60)  # default: 30 minutes
    self.jobCPUMargin = wrapperOption('JobCPULimitMargin', 20)  # %age buffer before killing job
    self.minCPUWallClockRatio = wrapperOption('MinCPUWallClockRatio', 5)  # ratio %age
    # After this many samples with null CPU consumption the job is killed
    self.nullCPULimit = wrapperOption('NullCPUCountLimit', 5)

    self.checkCount = 0
    self.nullCPUCount = 0

    # Enforce the minimum checking period
    if self.minCheckingTime > self.checkingTime:
        self.log.info('Requested CheckingTime of %s setting to %s seconds (minimum)' %
                      (self.checkingTime, self.minCheckingTime))
        self.checkingTime = self.minCheckingTime

    # The time left is returned in seconds @ 250 SI00 = 1 HS06,
    # the self.checkingTime and self.pollingTime are in seconds,
    # thus they need to be multiplied by a large enough factor
    self.grossTimeLeftLimit = 10 * self.checkingTime
    self.fineTimeLeftLimit = wrapperOption('TimeLeftLimit', 150 * self.pollingTime)

    self.timeLeftUtil = TimeLeft()
    self.timeLeft = 0
    self.littleTimeLeft = False
    return S_OK()
def getCPUTime(cpuNormalizationFactor):
    """ Trying to get CPUTime left for execution (in seconds).

    It will first look to get the work left looking for batch system
    information using the TimeLeft utility.  If it succeeds, it will convert
    it in real seconds, and return it.

    If it fails, it tries to get it from the static info found in CS.
    If it fails, it returns the default, which is a large 9999999, that we may
    consider as "Infinite".

    This is a generic method, independent from the middleware of the resource
    if TimeLeft doesn't return a value.

    args:
      cpuNormalizationFactor (float): the CPU power of the current Worker Node.
        If not passed in, it is read from the local configuration

    returns:
      cpuTimeLeft (int): the CPU time left, in seconds

    raises:
      RuntimeError: if no CEQueue is configured locally and the site name is
        not set, the queue sections cannot be listed, or no queue is defined
        for the CE
    """
    cpuTimeLeft = 0.
    cpuWorkLeft = gConfig.getValue('/LocalSite/CPUTimeLeft', 0)

    if not cpuWorkLeft:
        # Try and get the information from the CPU left utility
        result = TimeLeft().getTimeLeft()
        if result['OK']:
            cpuWorkLeft = result['Value']

    # The truthiness test keeps a None value away from the ordering comparison
    if cpuWorkLeft and cpuWorkLeft > 0:
        # This is in HS06 seconds
        # We need to convert in real seconds
        if not cpuNormalizationFactor:
            # if cpuNormalizationFactor passed in is 0, try get it from the local cfg
            cpuNormalizationFactor = gConfig.getValue('/LocalSite/CPUNormalizationFactor', 0.0)
        if cpuNormalizationFactor:
            cpuTimeLeft = cpuWorkLeft / cpuNormalizationFactor  # this is a float

    if not cpuTimeLeft:
        # now we know that we have to find the CPUTimeLeft by looking in the CS
        # this is not granted to be correct as the CS units may not be real seconds
        gridCE = gConfig.getValue('/LocalSite/GridCE')
        ceQueue = gConfig.getValue('/LocalSite/CEQueue')
        if not ceQueue:
            # we have to look for a ceQueue in the CS
            # A bit hacky. We should better profit from something generic
            gLogger.warn("No CEQueue in local configuration, looking to find one in CS")
            siteName = gConfig.getValue('/LocalSite/Site')
            if not siteName:
                # Fail in the function's own error style rather than on .split()
                raise RuntimeError("No site name found in /LocalSite/Site")
            queueSection = '/Resources/Sites/%s/%s/CEs/%s/Queues' % (
                siteName.split('.')[0], siteName, gridCE)
            res = gConfig.getSections(queueSection)
            if not res['OK']:
                raise RuntimeError(res['Message'])
            queues = res['Value']
            if not queues:
                # min() of an empty list would raise an opaque ValueError
                raise RuntimeError("No queues found in %s" % queueSection)
            cpuTimes = [gConfig.getValue(queueSection + '/' + queue + '/maxCPUTime', 9999999.)
                        for queue in queues]
            # These are (real, wall clock) minutes - damn BDII!
            cpuTimeLeft = min(cpuTimes) * 60
        else:
            queueInfo = getQueueInfo('%s/%s' % (gridCE, ceQueue))
            cpuTimeLeft = 9999999.
            if not queueInfo['OK'] or not queueInfo['Value']:
                gLogger.warn("Can't find a CE/queue, defaulting CPUTime to %d" % cpuTimeLeft)
            else:
                queueCSSection = queueInfo['Value']['QueueCSSection']
                # These are (real, wall clock) minutes - damn BDII!
                cpuTimeInMinutes = gConfig.getValue('%s/maxCPUTime' % queueCSSection, 0.)
                if cpuTimeInMinutes:
                    cpuTimeLeft = cpuTimeInMinutes * 60.
                    gLogger.info("CPUTime for %s: %f" % (queueCSSection, cpuTimeLeft))
                else:
                    gLogger.warn("Can't find maxCPUTime for %s, defaulting CPUTime to %f" %
                                 (queueCSSection, cpuTimeLeft))

    return int(cpuTimeLeft)