class JobHistoryAgent(AgentModule):
    """Agent that publishes job counters, per status and site, as monitoring marks.

    ``initialize`` registers one monitoring activity for every combination of
    ``MONITOR_STATUS`` and ``MONITOR_SITES``.  ``execute`` refreshes the
    counters from the JobDB at most once every ``reportPeriod`` seconds and
    sends the marks on each agent cycle.
    """

    def initialize(self):
        """Create the JobDB handle, register the activities and reset the cache.

        :return: S_OK()
        """
        self.jobDB = JobDB()

        for status in MONITOR_STATUS:
            for site in MONITOR_SITES:
                description = "Jobs in %s state at %s" % (status, site)
                gLogger.verbose("Registering activity %s-%s" % (status, site))
                gLogger.verbose(description)
                gMonitor.registerActivity("%s-%s" % (status, site),
                                          description,
                                          "JobHistoryAgent",
                                          "Jobs/minute",
                                          gMonitor.OP_MEAN)

        # A zero timestamp forces a database query on the first cycle.
        self.last_update = 0
        self.resultDB = None
        self.reportPeriod = 60
        return S_OK()

    def execute(self):
        """Send one mark per monitored (status, site) pair plus per-status totals.

        :return: S_OK(), or S_ERROR if the counters cannot be read from the JobDB
        """
        elapsed = time.time() - self.last_update
        if elapsed > self.reportPeriod:
            result = self.jobDB.getCounters('Jobs', ['Status', 'Site'], {}, '')
            if not result['OK']:
                return S_ERROR('Failed to get data from the Job Database')
            self.resultDB = result['Value']
            self.last_update = time.time()

        # Per-status totals, summed over every site returned by the query.
        totalDict = dict.fromkeys(MONITOR_STATUS, 0)

        for row in self.resultDB:
            selection = row[0]
            site = selection['Site']
            status = selection['Status']
            count = row[1]

            if site in MONITOR_SITES and status in MONITOR_STATUS:
                gLogger.verbose("Adding mark %s-%s: " % (status, site) + str(count))
                gMonitor.addMark("%s-%s" % (status, site), count)

            if status in totalDict:
                totalDict[status] += count

        for status in MONITOR_STATUS:
            total = totalDict[status]
            gLogger.verbose("Adding mark %s-All sites: " % status + str(total))
            gMonitor.addMark("%s-All sites" % status, total)

        return S_OK()
class JobHistoryAgent(AgentModule):
    """Monitoring agent reporting the number of jobs per status and site.

    The activities are declared once in ``initialize``; every call to
    ``execute`` pushes the current counters, re-reading them from the JobDB
    only when the cached copy is older than ``reportPeriod`` seconds.
    """

    def initialize(self):
        """Set up the JobDB connection and declare the monitored activities.

        :return: S_OK()
        """
        self.jobDB = JobDB()

        for status in MONITOR_STATUS:
            for site in MONITOR_SITES:
                activityName = "%s-%s" % (status, site)
                activityDescription = "Jobs in %s state at %s" % (status, site)
                gLogger.verbose("Registering activity %s-%s" % (status, site))
                gLogger.verbose("Jobs in %s state at %s" % (status, site))
                gMonitor.registerActivity(activityName, activityDescription,
                                          "JobHistoryAgent", "Jobs/minute",
                                          gMonitor.OP_MEAN)

        self.reportPeriod = 60
        self.resultDB = None
        # Starting from 0 makes the first execute() cycle query the database.
        self.last_update = 0
        return S_OK()

    def execute(self):
        """Publish the job counters as monitoring marks.

        :return: S_OK(), or S_ERROR when the JobDB counters cannot be retrieved
        """
        if time.time() - self.last_update > self.reportPeriod:
            counters = self.jobDB.getCounters('Jobs', ['Status', 'Site'], {}, '')
            if not counters['OK']:
                return S_ERROR('Failed to get data from the Job Database')
            self.resultDB = counters['Value']
            self.last_update = time.time()

        totalDict = {}
        for status in MONITOR_STATUS:
            totalDict[status] = 0

        for row in self.resultDB:
            site = row[0]['Site']
            status = row[0]['Status']
            count = row[1]

            monitored = site in MONITOR_SITES and status in MONITOR_STATUS
            if monitored:
                gLogger.verbose("Adding mark %s-%s: " % (status, site) + str(count))
                gMonitor.addMark("%s-%s" % (status, site), count)

            # Totals cover every site, not only the monitored ones.
            if status in totalDict:
                totalDict[status] = totalDict[status] + count

        for status in MONITOR_STATUS:
            gLogger.verbose("Adding mark %s-All sites: " % status + str(totalDict[status]))
            gMonitor.addMark("%s-All sites" % status, totalDict[status])

        return S_OK()
class SPTCorrector(BaseCorrector):
    """Share corrector that gives the whole share to a single owner.

    The owner selected is the one with the fewest jobs in ``Waiting`` status
    for the corrector's group, as counted by the JobDB.
    """

    _GLOBAL_MAX_CORRECTION = "MaxGlobalCorrection"
    _SLICE_TIME_SPAN = "TimeSpan"
    _SLICE_WEIGHT = "Weight"
    _SLICE_MAX_CORRECTION = "MaxCorrection"

    def initialize(self):
        """Create the JobDB handle used by ``applyCorrection``.

        :return: S_OK()
        """
        self.__jobDB = JobDB()
        return S_OK()

    def applyCorrection(self, entitiesExpectedShare):
        """Return a share dict where one owner gets 1 and all the others get 0.

        :param dict entitiesExpectedShare: expected shares keyed by owner DN
        :return: dict keyed by the same owner DNs; the owner with the fewest
                 waiting jobs (first one wins on ties) maps to 1, every other
                 owner to 0.  The input is returned untouched when the JobDB
                 query fails, and an empty dict when there are no owners.
        """
        # NOTE(review): the "AT >>>" prints look like leftover debug output;
        # they are kept (in a form printing the same text on Python 2 and 3)
        # because no logger is visibly in scope here.
        print("AT >>> entitiesExpectedShare %s" % (entitiesExpectedShare,))

        ownerDNs = list(entitiesExpectedShare)
        group = self.getGroup()
        result = self.__jobDB.getCounters("Jobs", ["OwnerDN"],
                                          {"OwnerGroup": group, "Status": "Waiting"})
        if not result["OK"]:
            print("AT >>> result %s" % (result,))
            return entitiesExpectedShare

        ownerDict = {}
        for row in result["Value"]:
            ownerDict[row[0]["OwnerDN"]] = row[1]
        print("AT >>> ownerDict %s" % (ownerDict,))

        resultShare = dict.fromkeys(ownerDNs, 0)
        if ownerDNs:
            # An owner without a counter row is treated as having no waiting
            # jobs instead of raising KeyError.  min() keeps the first owner
            # on ties, and needs no magic "huge number" sentinel that could
            # leave a bogus "" entry in the result.
            minOwnerDN = min(ownerDNs, key=lambda ownerDN: ownerDict.get(ownerDN, 0))
            resultShare[minOwnerDN] = 1

        print("AT >>> resultShare %s" % (resultShare,))
        return resultShare

    def updateHistoryKnowledge(self):
        """No history is kept by this corrector.

        :return: S_OK()
        """
        return S_OK()
class Limiter(object):
    """Build "negative" matching conditions from per-site job limits and delays.

    Limits are read from the ``JobScheduling/RunningLimit`` and
    ``JobScheduling/MatchingDelay`` Operations sections.  A negative condition
    is a dict such as ``{'JobType': ['Merge']}`` listing the attribute values
    that must not be matched for a site.
    """

    # static variables shared between all instances of this class
    csDictCache = DictCache()
    condCache = DictCache()
    delayMem = {}

    def __init__(self, jobDB=None, opsHelper=None):
        """Constructor.

        :param jobDB: JobDB instance to use; a new ``JobDB()`` is created if falsy
        :param opsHelper: Operations helper to use; a new ``Operations()`` is
                          created if falsy
        """
        self.__runningLimitSection = "JobScheduling/RunningLimit"
        self.__matchingDelaySection = "JobScheduling/MatchingDelay"

        if jobDB:
            self.jobDB = jobDB
        else:
            self.jobDB = JobDB()

        self.log = gLogger.getSubLogger("Limiter")

        if opsHelper:
            self.__opsHelper = opsHelper
        else:
            self.__opsHelper = Operations()

    def getNegativeCond(self):
        """Get the negative conditions for ALL sites.

        :return: list of per-site condition dicts, each carrying its site name
                 under the ``'Site'`` key.  A non-empty result is cached under
                 the ``"GLOBAL"`` key (second argument of ``add`` is 10).
        """
        orCond = self.condCache.get("GLOBAL")
        if orCond:
            return orCond

        negCond = {}

        # Run Limit
        result = self.__opsHelper.getSections(self.__runningLimitSection)
        sites = []
        if result['OK']:
            sites = result['Value']
        for siteName in sites:
            result = self.__getRunningCondition(siteName)
            if not result['OK']:
                continue
            data = result['Value']
            if data:
                negCond[siteName] = data

        # Delay limit
        result = self.__opsHelper.getSections(self.__matchingDelaySection)
        sites = []
        if result['OK']:
            sites = result['Value']
        for siteName in sites:
            result = self.__getDelayCondition(siteName)
            if not result['OK']:
                continue
            data = result['Value']
            if not data:
                continue
            if siteName in negCond:
                negCond[siteName] = self.__mergeCond(negCond[siteName], data)
            else:
                negCond[siteName] = data

        orCond = []
        for siteName in negCond:
            negCond[siteName]['Site'] = siteName
            orCond.append(negCond[siteName])

        self.condCache.add("GLOBAL", 10, orCond)
        return orCond

    def getNegativeCondForSite(self, siteName):
        """Generate a negative query based on the limits set on the site.

        :param str siteName: name of the site
        :return: dict of attribute name -> list of values not to be matched
        """
        # Check if Limits are imposed onto the site
        negativeCond = {}
        if self.__opsHelper.getValue("JobScheduling/CheckJobLimits", True):
            result = self.__getRunningCondition(siteName)
            if result['OK']:
                negativeCond = result['Value']
            self.log.verbose('Negative conditions for site',
                             '%s after checking limits are: %s' % (siteName, str(negativeCond)))

        if self.__opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            result = self.__getDelayCondition(siteName)
            if result['OK']:
                delayCond = result['Value']
                self.log.verbose('Negative conditions for site',
                                 '%s after delay checking are: %s' % (siteName, str(delayCond)))
                negativeCond = self.__mergeCond(negativeCond, delayCond)

        if negativeCond:
            self.log.info('Negative conditions for site',
                          '%s are: %s' % (siteName, str(negativeCond)))

        return negativeCond

    def __mergeCond(self, negCond, addCond):
        """Merge two negative dicts.

        ``negCond`` is updated in place with the values of ``addCond`` that it
        does not contain yet, and is also returned.
        """
        for attr in addCond:
            if attr not in negCond:
                negCond[attr] = []
            for value in addCond[attr]:
                if value not in negCond[attr]:
                    negCond[attr].append(value)
        return negCond

    def __extractCSData(self, section):
        """Extract limiting information from the CS in the form:
        { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }

        :param str section: Operations section to read
        :return: S_OK(dict) or S_ERROR.  A non-empty result is cached per
                 section (second argument of ``add`` is 300).
        """
        stuffDict = self.csDictCache.get(section)
        if stuffDict:
            return S_OK(stuffDict)

        result = self.__opsHelper.getSections(section)
        if not result['OK']:
            return result
        attribs = result['Value']

        stuffDict = {}
        for attName in attribs:
            result = self.__opsHelper.getOptionsDict("%s/%s" % (section, attName))
            if not result['OK']:
                return result
            attLimits = result['Value']
            try:
                attLimits = dict((k, int(attLimits[k])) for k in attLimits)
            except (TypeError, ValueError) as excp:
                # Only the int() conversion can fail here; anything else is a
                # programming error that should not be reported as bad config.
                errMsg = "%s/%s has to contain numbers: %s" % (section, attName, str(excp))
                self.log.error(errMsg)
                return S_ERROR(errMsg)
            stuffDict[attName] = attLimits

        self.csDictCache.add(section, 300, stuffDict)
        return S_OK(stuffDict)

    def __getRunningCondition(self, siteName):
        """Get extra conditions allowing site throttling.

        :param str siteName: name of the site
        :return: S_OK(dict) with the attribute values whose configured limit
                 is reached at the site, or the failing result structure
        """
        siteSection = "%s/%s" % (self.__runningLimitSection, siteName)
        result = self.__extractCSData(siteSection)
        if not result['OK']:
            return result
        limitsDict = result['Value']
        # limitsDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
        if not limitsDict:
            return S_OK({})

        # Check if the site exceeding the given limits
        negCond = {}
        for attName in limitsDict:
            if attName not in self.jobDB.jobAttributeNames:
                self.log.error("Attribute does not exist",
                               "(%s). Check the job limits" % attName)
                continue

            cK = "Running:%s:%s" % (siteName, attName)
            data = self.condCache.get(cK)
            if not data:
                result = self.jobDB.getCounters('Jobs', [attName],
                                                {'Site': siteName,
                                                 'Status': ['Running', 'Matched', 'Stalled']})
                if not result['OK']:
                    return result
                data = result['Value']
                data = dict((k[0][attName], k[1]) for k in data)
                self.condCache.add(cK, 10, data)

            for attValue in limitsDict[attName]:
                limit = limitsDict[attName][attValue]
                running = data.get(attValue, 0)
                if running >= limit:
                    self.log.verbose('Job Limit imposed',
                                     'at %s on %s/%s=%d, %d jobs already deployed' % (siteName,
                                                                                      attName,
                                                                                      attValue,
                                                                                      limit,
                                                                                      running))
                    if attName not in negCond:
                        negCond[attName] = []
                    negCond[attName].append(attValue)

        # negCond is something like : {'JobType': ['Merge']}
        return S_OK(negCond)

    def updateDelayCounters(self, siteName, jid):
        """Record matching delays for the attribute values of a job.

        :param str siteName: name of the site
        :param jid: job identifier passed to ``JobDB.getJobAttributes``
        :return: S_OK() or the failing result structure
        """
        # Get the info from the CS
        siteSection = "%s/%s" % (self.__matchingDelaySection, siteName)
        result = self.__extractCSData(siteSection)
        if not result['OK']:
            return result
        delayDict = result['Value']
        # delayDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
        if not delayDict:
            return S_OK()

        attNames = []
        for attName in delayDict:
            if attName not in self.jobDB.jobAttributeNames:
                self.log.error("Attribute does not exist in the JobDB. Please fix it!",
                               "(%s)" % attName)
            else:
                attNames.append(attName)

        result = self.jobDB.getJobAttributes(jid, attNames)
        if not result['OK']:
            self.log.error("Error while retrieving attributes",
                           "coming from %s: %s" % (siteSection, result['Message']))
            return result
        atts = result['Value']

        # Create the DictCache if not there
        if siteName not in self.delayMem:
            self.delayMem[siteName] = DictCache()

        # Update the counters
        delayCounter = self.delayMem[siteName]
        for attName in atts:
            # Skip attributes with no configured delay instead of raising
            # KeyError.  NOTE(review): presumably getJobAttributes can return
            # attributes that were not requested (e.g. when attNames is
            # empty) - confirm against the JobDB.
            if attName not in delayDict:
                continue
            attValue = atts[attName]
            if attValue in delayDict[attName]:
                delayTime = delayDict[attName][attValue]
                self.log.notice("Adding delay for %s/%s=%s of %s secs" % (siteName, attName,
                                                                         attValue, delayTime))
                delayCounter.add((attName, attValue), delayTime)

        return S_OK()

    def __getDelayCondition(self, siteName):
        """Get extra conditions allowing matching delay.

        :param str siteName: name of the site
        :return: S_OK(dict) built from the (attribute, value) keys currently
                 held in the site's delay cache; empty if the site has none
        """
        if siteName not in self.delayMem:
            return S_OK({})

        lastRun = self.delayMem[siteName].getKeys()
        negCond = {}
        for attName, attValue in lastRun:
            if attName not in negCond:
                negCond[attName] = []
            negCond[attName].append(attValue)
        return S_OK(negCond)
class JobCommand(Command):
    """Job "master" Command.

    Counts the recently updated jobs per site and status using the JobDB,
    and stores / reads the result through the ResourceManagementClient
    job cache.
    """

    def __init__(self, args=None, clients=None):
        """Take the JobDB and ResourceManagementClient from ``self.apis`` when
        they are provided there, otherwise create new ones.
        """
        super(JobCommand, self).__init__(args, clients)

        if 'JobDB' in self.apis:
            self.jobDB = self.apis['JobDB']
        else:
            self.jobDB = JobDB()

        if 'ResourceManagementClient' in self.apis:
            self.rmClient = self.apis['ResourceManagementClient']
        else:
            self.rmClient = ResourceManagementClient()

    def _storeCommand(self, result):
        """Store the results of the doNew method on the database.

        Every element of ``result`` is expected to be a dict; its keys get
        their first letter lower-cased and are passed as keyword arguments
        to ``addOrModifyJobCache``.

        :return: S_OK(), or the first failing ``addOrModifyJobCache`` result
        """
        # NOTE(review): doNew passes a dict keyed by site name, so this loop
        # iterates over the site names (strings) and ``.items()`` fails on
        # them.  The schema expected by addOrModifyJobCache is not visible
        # here, so the call is left as it was - to be fixed with that schema.
        for jobDict in result:
            lowerCaseJobDict = {}
            for key, value in jobDict.items():
                lowerCaseJobDict[key[0].lower() + key[1:]] = value

            resQuery = self.rmClient.addOrModifyJobCache(**lowerCaseJobDict)
            if not resQuery['OK']:
                return resQuery

        return S_OK()

    def _prepareCommand(self):
        """Read the two arguments JobCommand requires from ``self.args``:

        - name : <str>
        - timespan : number of seconds (used as ``timedelta(seconds=...)``)

        :return: S_OK((name, timespan)) or S_ERROR if one of them is missing
        """
        if 'name' not in self.args:
            return S_ERROR('"name" not found in self.args')
        name = self.args['name']

        if 'timespan' not in self.args:
            return S_ERROR('"timespan" not found in self.args')
        timespan = self.args['timespan']

        return S_OK((name, timespan))

    def doNew(self, masterParams=None):
        """Count the jobs per site and status, store the result and return it.

        The parameters come from ``self.args``; when called from the master
        method (``masterParams is True``) the site name is blanked so that
        every site is selected.  Only jobs whose ``LastUpdateTime`` is newer
        than ``timespan`` seconds ago are counted by the JobDB query.

        :return: S_OK({siteName: {status: numberOfJobs}}) or the failing result
        """
        if masterParams is True:
            self.args['name'] = ''

        params = self._prepareCommand()
        if not params['OK']:
            return params
        name, timespan = params['Value']

        condDict = {}
        if name:
            condDict = {'Site': name}

        startTimeWindow = datetime.utcnow() - timedelta(seconds=timespan)

        results = self.jobDB.getCounters('Jobs', ['Site', 'Status'], condDict,
                                         newer=startTimeWindow,
                                         timeStamp='LastUpdateTime')
        if not results['OK']:
            return results
        # Results look like this
        # [ ({'Status': 'Checking', 'Site': 'ANY'}, 6L), ...

        uniformResult = {}
        jobStatuses = ('Checking', 'Completed', 'Done', 'Failed', 'Killed', 'Matched',
                       'Received', 'Rescheduled', 'Running', 'Staging', 'Stalled', 'Waiting')

        for resultTuple in results['Value']:
            selectionDict, numberOfJobs = resultTuple
            siteName = selectionDict['Site']

            if siteName in ('ANY', 'Multiple'):
                continue

            # Every site starts with all the known statuses set to zero.
            if siteName not in uniformResult:
                uniformResult[siteName] = dict.fromkeys(jobStatuses, 0)

            uniformResult[siteName][selectionDict['Status']] = numberOfJobs

        # Store results
        storeRes = self._storeCommand(uniformResult)
        if not storeRes['OK']:
            return storeRes

        return S_OK(uniformResult)

    def doCache(self):
        """Read the job cache table for the site given in ``self.args``.

        :return: S_OK(list of dicts, one per row, keyed by column name) or the
                 failing result structure
        """
        params = self._prepareCommand()
        if not params['OK']:
            return params
        # _prepareCommand returns (name, timespan); only the name selects the
        # cache rows.  Passing the whole tuple on was a bug.
        name, _timespan = params['Value']

        result = self.rmClient.selectJobCache(name)
        if result['OK']:
            result = S_OK([dict(zip(result['Columns'], res)) for res in result['Value']])

        return result

    def doMaster(self):
        """Master method: run doNew for all sites and record a failure, if any.

        :return: S_OK(self.metrics)
        """
        jobsResults = self.doNew(True)
        if not jobsResults['OK']:
            self.metrics['failed'].append(jobsResults['Message'])

        return S_OK(self.metrics)
class JobCommand(Command):
    """Job "master" Command.

    Counts the recently updated jobs per site and status using the JobDB,
    and stores / reads the result through the ResourceManagementClient
    job cache.
    """

    def __init__(self, args=None, clients=None):
        """Take the JobDB and ResourceManagementClient from ``self.apis`` when
        they are provided there, otherwise create new ones.
        """
        super(JobCommand, self).__init__(args, clients)

        if 'JobDB' in self.apis:
            self.jobDB = self.apis['JobDB']
        else:
            self.jobDB = JobDB()

        if 'ResourceManagementClient' in self.apis:
            self.rmClient = self.apis['ResourceManagementClient']
        else:
            self.rmClient = ResourceManagementClient()

    def _storeCommand(self, result):
        """Store the results of the doNew method on the database.

        Every element of ``result`` is expected to be a dict; its keys get
        their first letter lower-cased and are passed as keyword arguments
        to ``addOrModifyJobCache``.

        :return: S_OK(), or the first failing ``addOrModifyJobCache`` result
        """
        # NOTE(review): doNew passes a dict keyed by site name, so this loop
        # iterates over the site names (strings) and ``.items()`` fails on
        # them.  The schema expected by addOrModifyJobCache is not visible
        # here, so the call is left as it was - to be fixed with that schema.
        for jobDict in result:
            lowerCaseJobDict = {}
            for key, value in jobDict.items():
                lowerCaseJobDict[key[0].lower() + key[1:]] = value

            resQuery = self.rmClient.addOrModifyJobCache(**lowerCaseJobDict)
            if not resQuery['OK']:
                return resQuery

        return S_OK()

    def _prepareCommand(self):
        """Read the two arguments JobCommand requires from ``self.args``:

        - name : <str>
        - timespan : number of seconds (used as ``timedelta(seconds=...)``)

        :return: S_OK((name, timespan)) or S_ERROR if one of them is missing
        """
        if 'name' not in self.args:
            return S_ERROR('"name" not found in self.args')
        name = self.args['name']

        if 'timespan' not in self.args:
            return S_ERROR('"timespan" not found in self.args')
        timespan = self.args['timespan']

        return S_OK((name, timespan))

    def doNew(self, masterParams=None):
        """Count the jobs per site and status, store the result and return it.

        The parameters come from ``self.args``; when called from the master
        method (``masterParams is True``) the site name is blanked so that
        every site is selected.  Only jobs whose ``LastUpdateTime`` is newer
        than ``timespan`` seconds ago are counted by the JobDB query.

        :return: S_OK({siteName: {status: numberOfJobs}}) or the failing result
        """
        if masterParams is True:
            self.args['name'] = ''

        params = self._prepareCommand()
        if not params['OK']:
            return params
        name, timespan = params['Value']

        condDict = {}
        if name:
            condDict = {'Site': name}

        startTimeWindow = datetime.utcnow() - timedelta(seconds=timespan)

        results = self.jobDB.getCounters('Jobs', ['Site', 'Status'], condDict,
                                         newer=startTimeWindow,
                                         timeStamp='LastUpdateTime')
        if not results['OK']:
            return results
        # Results look like this
        # [ ({'Status': 'Checking', 'Site': 'ANY'}, 6L), ...

        uniformResult = {}
        jobStatuses = ('Checking', 'Completed', 'Done', 'Failed', 'Killed', 'Matched',
                       'Received', 'Rescheduled', 'Running', 'Staging', 'Stalled', 'Waiting')

        for resultTuple in results['Value']:
            selectionDict, numberOfJobs = resultTuple
            siteName = selectionDict['Site']

            if siteName in ('ANY', 'Multiple'):
                continue

            # Every site starts with all the known statuses set to zero.
            if siteName not in uniformResult:
                uniformResult[siteName] = dict.fromkeys(jobStatuses, 0)

            uniformResult[siteName][selectionDict['Status']] = numberOfJobs

        # Store results
        storeRes = self._storeCommand(uniformResult)
        if not storeRes['OK']:
            return storeRes

        return S_OK(uniformResult)

    def doCache(self):
        """Read the job cache table for the site given in ``self.args``.

        :return: S_OK(list of dicts, one per row, keyed by column name) or the
                 failing result structure
        """
        params = self._prepareCommand()
        if not params['OK']:
            return params
        # _prepareCommand returns (name, timespan); only the name selects the
        # cache rows.  Passing the whole tuple on was a bug.
        name, _timespan = params['Value']

        result = self.rmClient.selectJobCache(name)
        if result['OK']:
            result = S_OK([dict(zip(result['Columns'], res)) for res in result['Value']])

        return result

    def doMaster(self):
        """Master method: run doNew for all sites and record a failure, if any.

        :return: S_OK(self.metrics)
        """
        jobsResults = self.doNew(True)
        if not jobsResults['OK']:
            self.metrics['failed'].append(jobsResults['Message'])

        return S_OK(self.metrics)
class Limiter(object):
    """
    Compute "negative" matching conditions for sites.

    Two Operations sections are consulted:

    - ``JobScheduling/RunningLimit/<site>``: per-attribute caps on the number of
      jobs counted by the job DB in the Running, Matched or Stalled status.
    - ``JobScheduling/MatchingDelay/<site>``: per-attribute delays that are
      recorded by :meth:`updateDelayCounters` and reported while they are
      still held in the per-site cache.

    A negative condition is a dict like ``{'JobType': ['Merge']}`` listing the
    attribute values that should currently be excluded for a site.
    """

    def __init__(self, jobDB=None, opsHelper=None):
        """ Constructor

        :param jobDB: job database object; a new ``JobDB()`` is created when a
                      falsy value is passed
        :param opsHelper: Operations helper; a new ``Operations()`` is created
                          when a falsy value is passed
        """
        self.__runningLimitSection = "JobScheduling/RunningLimit"
        self.__matchingDelaySection = "JobScheduling/MatchingDelay"
        # Cache of the limits read from the configuration, keyed by section path
        self.csDictCache = DictCache()
        # Cache of computed conditions and of job counters
        self.condCache = DictCache()
        # site name -> DictCache whose keys are (attribute name, attribute value)
        self.delayMem = {}

        if jobDB:
            self.jobDB = jobDB
        else:
            self.jobDB = JobDB()

        self.log = gLogger.getSubLogger("Limiter")

        if opsHelper:
            self.__opsHelper = opsHelper
        else:
            self.__opsHelper = Operations()

    def getNegativeCond(self):
        """ Get negative condition for ALL sites

        :return: list of dicts, one per site with at least one restriction; each
                 dict maps attribute names to lists of excluded values and
                 carries the site name under the 'Site' key
        """
        orCond = self.condCache.get("GLOBAL")
        if orCond:
            return orCond

        negCond = {}

        # Run Limit
        result = self.__opsHelper.getSections(self.__runningLimitSection)
        sites = result['Value'] if result['OK'] else []
        for siteName in sites:
            result = self.__getRunningCondition(siteName)
            if not result['OK']:
                # Best effort: a failing site does not prevent the others
                continue
            data = result['Value']
            if data:
                negCond[siteName] = data

        # Delay limit
        result = self.__opsHelper.getSections(self.__matchingDelaySection)
        sites = result['Value'] if result['OK'] else []
        for siteName in sites:
            result = self.__getDelayCondition(siteName)
            if not result['OK']:
                continue
            data = result['Value']
            if not data:
                continue
            if siteName in negCond:
                negCond[siteName] = self.__mergeCond(negCond[siteName], data)
            else:
                negCond[siteName] = data

        orCond = []
        for siteName in negCond:
            negCond[siteName]['Site'] = siteName
            orCond.append(negCond[siteName])

        # Second argument is the cache validity (presumably seconds - confirm
        # against DictCache.add)
        self.condCache.add("GLOBAL", 10, orCond)
        return orCond

    def getNegativeCondForSite(self, siteName):
        """ Generate a negative query based on the limits set on the site

        :param siteName: name of the site
        :return: dict mapping attribute names to lists of excluded values
                 (empty when nothing is restricted)
        """
        # Check if Limits are imposed onto the site
        negativeCond = {}
        if self.__opsHelper.getValue("JobScheduling/CheckJobLimits", True):
            result = self.__getRunningCondition(siteName)
            if result['OK']:
                negativeCond = result['Value']
            self.log.verbose('Negative conditions for site %s after checking limits are: %s' %
                             (siteName, str(negativeCond)))

        if self.__opsHelper.getValue("JobScheduling/CheckMatchingDelay", True):
            result = self.__getDelayCondition(siteName)
            if result['OK']:
                delayCond = result['Value']
                self.log.verbose('Negative conditions for site %s after delay checking are: %s' %
                                 (siteName, str(delayCond)))
                negativeCond = self.__mergeCond(negativeCond, delayCond)

        if negativeCond:
            self.log.info('Negative conditions for site %s are: %s' % (siteName, str(negativeCond)))

        return negativeCond

    def __mergeCond(self, negCond, addCond):
        """ Merge two negative dicts

        The values of ``addCond`` are appended to ``negCond`` (which is modified
        in place) without introducing duplicates.

        :param negCond: dict attribute name -> list of values, updated in place
        :param addCond: dict attribute name -> iterable of values to add
        :return: ``negCond``
        """
        for attr in addCond:
            if attr not in negCond:
                negCond[attr] = []
            for value in addCond[attr]:
                if value not in negCond[attr]:
                    negCond[attr].append(value)
        return negCond

    def __extractCSData(self, section):
        """ Extract limiting information from the CS in the form:
            { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }

        :param section: Operations section path to read
        :return: S_OK(dict) with integer limits, or the failing result / S_ERROR
                 when an option value cannot be converted to an integer
        """
        stuffDict = self.csDictCache.get(section)
        if stuffDict:
            return S_OK(stuffDict)

        result = self.__opsHelper.getSections(section)
        if not result['OK']:
            return result
        attribs = result['Value']

        stuffDict = {}
        for attName in attribs:
            result = self.__opsHelper.getOptionsDict("%s/%s" % (section, attName))
            if not result['OK']:
                return result
            attLimits = result['Value']
            try:
                attLimits = {key: int(value) for key, value in attLimits.items()}
            except (ValueError, TypeError) as excp:
                errMsg = "%s/%s has to contain numbers: %s" % (section, attName, str(excp))
                self.log.error(errMsg)
                return S_ERROR(errMsg)
            stuffDict[attName] = attLimits

        self.csDictCache.add(section, 300, stuffDict)
        return S_OK(stuffDict)

    def __getRunningCondition(self, siteName):
        """ Get extra conditions allowing site throttling

        :param siteName: name of the site
        :return: S_OK(dict) mapping attribute names to the list of values whose
                 job count has reached the configured limit, or a failing result
        """
        siteSection = "%s/%s" % (self.__runningLimitSection, siteName)
        result = self.__extractCSData(siteSection)
        if not result['OK']:
            return result
        limitsDict = result['Value']
        # limitsDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
        if not limitsDict:
            return S_OK({})

        # Check if the site exceeding the given limits
        negCond = {}
        for attName in limitsDict:
            if attName not in self.jobDB.jobAttributeNames:
                self.log.error("Attribute %s does not exist. Check the job limits" % attName)
                continue

            cK = "Running:%s:%s" % (siteName, attName)
            data = self.condCache.get(cK)
            if not data:
                result = self.jobDB.getCounters('Jobs',
                                                [attName],
                                                {'Site': siteName,
                                                 'Status': ['Running', 'Matched', 'Stalled']})
                if not result['OK']:
                    return result
                # Each row is indexed as (attribute dict, count)
                data = {row[0][attName]: row[1] for row in result['Value']}
                self.condCache.add(cK, 10, data)

            for attValue in limitsDict[attName]:
                limit = limitsDict[attName][attValue]
                running = data.get(attValue, 0)
                if running >= limit:
                    self.log.verbose('Job Limit imposed at %s on %s/%s=%d, %d jobs already deployed' %
                                     (siteName, attName, attValue, limit, running))
                    if attName not in negCond:
                        negCond[attName] = []
                    negCond[attName].append(attValue)

        # negCond is something like : {'JobType': ['Merge']}
        return S_OK(negCond)

    def updateDelayCounters(self, siteName, jid):
        """ Record the matching delays triggered by a job for a site

        For every delay attribute configured for the site, the value the job has
        for that attribute is looked up; when a delay is configured for that
        value, the (attribute, value) pair is added to the site delay cache with
        the configured delay as validity.

        :param siteName: name of the site
        :param jid: job identifier passed to ``jobDB.getJobAttributes``
        :return: S_OK() on success, or the failing result of the configuration
                 or job DB lookup
        """
        # Get the info from the CS
        siteSection = "%s/%s" % (self.__matchingDelaySection, siteName)
        result = self.__extractCSData(siteSection)
        if not result['OK']:
            return result
        delayDict = result['Value']
        # delayDict is something like { 'JobType' : { 'Merge' : 20, 'MCGen' : 1000 } }
        if not delayDict:
            return S_OK()

        attNames = []
        for attName in delayDict:
            if attName not in self.jobDB.jobAttributeNames:
                self.log.error("Attribute %s does not exist in the JobDB. Please fix it!" % attName)
            else:
                attNames.append(attName)
        if not attNames:
            # Nothing valid to look up. Querying with an empty list is useless
            # at best. NOTE(review): the DB may interpret an empty list as
            # "all attributes" - confirm against JobDB.getJobAttributes.
            return S_OK()

        result = self.jobDB.getJobAttributes(jid, attNames)
        if not result['OK']:
            self.log.error("While retrieving attributes coming from %s: %s" % (siteSection, result['Message']))
            return result
        atts = result['Value']

        # Create the DictCache if not there
        if siteName not in self.delayMem:
            self.delayMem[siteName] = DictCache()

        # Update the counters
        delayCounter = self.delayMem[siteName]
        for attName in atts:
            if attName not in delayDict:
                # The DB returned an attribute with no configured delay:
                # ignore it instead of failing with a KeyError
                continue
            attValue = atts[attName]
            if attValue in delayDict[attName]:
                delayTime = delayDict[attName][attValue]
                self.log.notice("Adding delay for %s/%s=%s of %s secs" % (siteName, attName, attValue, delayTime))
                delayCounter.add((attName, attValue), delayTime)

        return S_OK()

    def __getDelayCondition(self, siteName):
        """ Get extra conditions allowing matching delay

        :param siteName: name of the site
        :return: S_OK(dict) mapping attribute names to the values currently
                 present in the site delay cache (empty dict when the site has
                 no delay cache)
        """
        if siteName not in self.delayMem:
            return S_OK({})
        lastRun = self.delayMem[siteName].getKeys()
        negCond = {}
        for attName, attValue in lastRun:
            if attName not in negCond:
                negCond[attName] = []
            negCond[attName].append(attValue)
        return S_OK(negCond)