def getCEStatus(self):
    """ Report how many pilots are waiting and running on this CE.

    Pilots are counted in the PilotAgentsDB with a selection on
    ``DestinationSite`` equal to ``self.ceName``. A failed count is only
    logged as a warning and the matching counter keeps its initial value 0.

    :return: S_OK structure with the additional keys ``SubmittedJobs``
             (always 0), ``RunningJobs`` and ``WaitingJobs``
    """
    ceStatus = S_OK()
    for counterName in ('SubmittedJobs', 'RunningJobs', 'WaitingJobs'):
        ceStatus[counterName] = 0

    # The waiting pilots are queried first, the running ones second
    selections = (('WaitingJobs', WAITING_PILOT_STATUS), ('RunningJobs', 'Running'))
    for counterName, pilotStatus in selections:
        selection = {'DestinationSite': self.ceName, 'Status': pilotStatus}
        counted = PilotAgentsDB().countPilots(selection)
        if not counted['OK']:
            self.log.warn("Failure getting pilot count for %s: %s " % (self.ceName, counted['Message']))
            continue
        ceStatus[counterName] = int(counted['Value'])

    return ceStatus
def export_getCurrentPilotCounters(cls, attrDict=None):
    """ Get pilot counters per Status with attrDict selection.

    Counters of final statuses (those listed in FINAL_STATES) only cover the
    pilots updated during the last day; a final status with no pilot updated
    in that period is reported as 0. Every other status reports the overall
    count.

    :param dict attrDict: selection conditions handed to ``getCounters``;
                          ``None`` (the default) stands for an empty selection
    :return: S_OK(dict mapping each status to its count), or the failed
             ``getCounters`` result
    """
    # Use a fresh dict per call rather than a shared mutable default argument
    if attrDict is None:
        attrDict = {}

    result = PilotAgentsDB().getCounters('PilotAgents', ['Status'], attrDict,
                                         timeStamp='LastUpdateTime')
    if not result['OK']:
        return result

    last_update = Time.dateTime() - Time.day
    resultDay = PilotAgentsDB().getCounters('PilotAgents', ['Status'], attrDict,
                                            newer=last_update, timeStamp='LastUpdateTime')
    if not resultDay['OK']:
        return resultDay

    # Index the last-day counters once instead of rescanning them for every
    # final status; setdefault keeps the first entry found for a status.
    lastDayCounts = {}
    for statusDayDict, dayCount in resultDay['Value']:
        lastDayCounts.setdefault(statusDayDict['Status'], dayCount)

    resultDict = {}
    for statusDict, count in result['Value']:
        status = statusDict['Status']
        if status in FINAL_STATES:
            resultDict[status] = lastDayCounts.get(status, 0)
        else:
            resultDict[status] = count

    return S_OK(resultDict)
def export_deletePilots(cls, pilotIDs):
    """ Delete pilots from the PilotAgentsDB and, when pilots logging is
    enabled, the matching records from the PilotsLoggingDB.

    :param pilotIDs: a string (handed as is to ``PilotAgentsDB.deletePilot``),
                     a single integer ID, or a collection of IDs
    :return: S_OK() on success, otherwise the failed result of the DB call
    """
    if isinstance(pilotIDs, basestring):
        return PilotAgentsDB().deletePilot(pilotIDs)

    if isinstance(pilotIDs, (int, long)):
        pilotIDs = [pilotIDs, ]

    result = PilotAgentsDB().deletePilots(pilotIDs)
    if not result['OK']:
        return result

    if enablePilotsLogging:
        pilotIDs = result['Value']
        pilots = PilotAgentsDB().getPilotInfo(pilotID=pilotIDs)
        if not pilots['OK']:
            return pilots

        # The pilot records are in pilots['Value']; iterating the result
        # structure itself only yields its 'OK' / 'Value' keys.
        # NOTE(review): getPilotInfo presumably returns a dict of per-pilot
        # dicts keyed by pilot reference; a plain sequence of per-pilot dicts
        # is accepted as well - confirm against PilotAgentsDB.
        pilotInfo = pilots['Value']
        pilotRecords = pilotInfo.values() if isinstance(pilotInfo, dict) else pilotInfo
        pilotRefs = [pilot['PilotJobReference'] for pilot in pilotRecords]

        result = PilotsLoggingDB().deletePilotsLogging(pilotRefs)
        if not result['OK']:
            return result

    return S_OK()
def export_getPilots(cls, jobID):
    """ Get pilot references and their states for :
        - those pilots submitted for the TQ where job is sitting
        - (or) the pilots executing/having executed the Job

    :param jobID: job identifier, converted with ``int()``
    :return: result of ``PilotAgentsDB.getPilotInfo`` for the pilots found,
             or S_ERROR when a lookup fails or no pilot is found
    """
    jobNumber = int(jobID)
    pilots = []

    jobLookup = PilotAgentsDB().getPilotsForJobID(jobNumber)
    if jobLookup['OK']:
        pilots += jobLookup['Value']
    elif jobLookup['Message'].find('not found') == -1:
        # Only a 'not found' failure is tolerated here
        return S_ERROR('Failed to get pilot: ' + jobLookup['Message'])

    if not pilots:
        # Pilots were not found try to look in the Task Queue
        tqLookup = TaskQueueDB().getTaskQueueForJob(jobNumber)
        taskQueueID = tqLookup['Value'] if tqLookup['OK'] and tqLookup['Value'] else 0
        if taskQueueID:
            tqPilots = PilotAgentsDB().getPilotsForTaskQueue(taskQueueID, limit=10)
            if not tqPilots['OK']:
                return S_ERROR('Failed to get pilot: ' + tqPilots['Message'])
            pilots += tqPilots['Value']

    if not pilots:
        return S_ERROR('Failed to get pilot for Job %d' % jobNumber)

    return PilotAgentsDB().getPilotInfo(pilotID=pilots)
def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ Constructor.

    Every collaborator may be injected; any argument that is missing (or
    evaluates to false) is replaced by a newly created default object.

    :param pilotAgentsDB: PilotAgentsDB handle
    :param jobDB: JobDB handle
    :param tqDB: TaskQueueDB handle
    :param jlDB: JobLoggingDB handle
    :param opsHelper: Operations helper
    """
    self.pilotAgentsDB = pilotAgentsDB or PilotAgentsDB()
    self.jobDB = jobDB or JobDB()
    self.tqDB = tqDB or TaskQueueDB()
    self.jlDB = jlDB or JobLoggingDB()
    self.opsHelper = opsHelper or Operations()

    self.log = gLogger.getSubLogger("Matcher")

    # The limiter shares the job DB and the operations helper chosen above
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)

    self.siteClient = SiteStatus()
def initialize(self):
    """ Set the default agent options and create the PilotAgentsDB handle.

    :return: S_OK()
    """
    defaultOptions = (('PollingTime', 120), ('GridEnv', ''), ('PilotStalledDays', 3))
    for optionName, optionValue in defaultOptions:
        self.am_setOption(optionName, optionValue)

    self.pilotDB = PilotAgentsDB()
    return S_OK()
def initialize(self):
    """ Set the polling time default, read the clearing delays and create the
    PilotAgentsDB handle.

    :return: S_OK()
    """
    self.am_setOption('PollingTime', 120)

    # attribute name, option name, value used when the option is not set
    delayOptions = (('clearPilotsDelay', 'ClearPilotsDelay', 30),
                    ('clearAbortedDelay', 'ClearAbortedPilotsDelay', 7))
    for attributeName, optionName, fallback in delayOptions:
        setattr(self, attributeName, self.am_getOption(optionName, fallback))

    self.pilotDB = PilotAgentsDB()
    return S_OK()
def initialize(self):
    """ Set defaults, create the DB / API handles and read the clearing delays.

    :return: S_OK()
    """
    self.am_setOption("GridEnv", "")

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # attribute name, option name, value used when the option is not set
    delayOptions = (("clearPilotsDelay", "ClearPilotsDelay", 30),
                    ("clearAbortedDelay", "ClearAbortedPilotsDelay", 7))
    for attributeName, optionName, fallback in delayOptions:
        setattr(self, attributeName, self.am_getOption(optionName, fallback))

    self.pilots = PilotManagerClient()
    return S_OK()
def initialize(self):
    """ Set the default agent options, create the DB / API handles, read the
    clearing delays and create the PilotManagerClient.

    :return: S_OK()
    """
    for optionName, optionValue in (('PollingTime', 120),
                                    ('GridEnv', ''),
                                    ('PilotStalledDays', 3)):
        self.am_setOption(optionName, optionValue)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    readOption = self.am_getOption
    self.clearPilotsDelay = readOption('ClearPilotsDelay', 30)
    self.clearAbortedDelay = readOption('ClearAbortedPilotsDelay', 7)

    self.pilots = PilotManagerClient()
    return S_OK()
def export_setJobForPilot(cls, jobID, pilotRef, destination=None):
    """ Report the DIRAC job ID which is executed by the given pilot job.

    The job is recorded for the pilot, then set as its current job and,
    when ``destination`` is given, the pilot destination site is updated.
    The sequence stops at the first failing step.

    :param jobID: job identifier, converted with ``int()``
    :param pilotRef: pilot reference
    :param destination: optional destination site
    :return: result structure of the last step that was executed
    """
    jobNumber = int(jobID)

    outcome = PilotAgentsDB().setJobForPilot(jobNumber, pilotRef)
    if outcome['OK']:
        outcome = PilotAgentsDB().setCurrentJobID(pilotRef, jobNumber)
    if outcome['OK'] and destination:
        outcome = PilotAgentsDB().setPilotDestinationSite(pilotRef, destination)
    return outcome
def export_getPilotStatistics(attribute, selectDict):
    """ Get pilot statistics distribution per attribute value with a given selection.

    The keys 'FromDate' (or, when it is absent, 'LastUpdate') and 'ToDate'
    are removed from ``selectDict`` when they hold a true value and are used
    as time limits of the query. Note that ``selectDict`` is modified in place.

    When an entry carries an 'OwnerDN', the DN is replaced by the user name
    if ``getUsernameForDN`` succeeds. A failed counters query yields an
    empty statistics dictionary.

    :param attribute: attribute of the PilotAgents table to group by
    :param dict selectDict: selection conditions
    :return: S_OK(dict mapping attribute value to count)
    """
    startDate = selectDict.get('FromDate', None)
    if startDate:
        del selectDict['FromDate']
    elif startDate is None:
        # No 'FromDate' at all: fall back to 'LastUpdate'
        startDate = selectDict.get('LastUpdate', None)
        if startDate:
            del selectDict['LastUpdate']

    endDate = selectDict.get('ToDate', None)
    if endDate:
        del selectDict['ToDate']

    counters = PilotAgentsDB().getCounters('PilotAgents', [attribute], selectDict,
                                           newer=startDate, older=endDate,
                                           timeStamp='LastUpdateTime')
    statistics = {}
    if not counters['OK']:
        return S_OK(statistics)

    for entry, count in counters['Value']:
        if "OwnerDN" not in entry:
            statistics[entry[attribute]] = count
            continue
        userName = getUsernameForDN(entry['OwnerDN'])
        if userName['OK']:
            entry['OwnerDN'] = userName['Value']
        statistics[entry['OwnerDN']] = count

    return S_OK(statistics)
class PilotMonitorAgent(AgentModule):
    """ Agent clearing old pilot records from the PilotAgentsDB.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
        for the agent restart
    """

    def initialize(self):
        """ Set the polling time default, read the clearing delays and create
        the PilotAgentsDB handle.

        :return: S_OK()
        """
        self.am_setOption('PollingTime', 120)

        readOption = self.am_getOption
        self.clearPilotsDelay = readOption('ClearPilotsDelay', 30)
        self.clearAbortedDelay = readOption('ClearAbortedPilotsDelay', 7)

        self.pilotDB = PilotAgentsDB()
        return S_OK()

    def execute(self):
        """ Remove from PilotDB pilots that:
              - are older than self.clearPilotsDelay
              - are Aborted and older than self.clearAbortedDelay

        A failure of the clearing is logged as a warning only.

        :return: S_OK with a completion message, also when the clearing failed
        """
        clearing = self.pilotDB.clearPilots(self.clearPilotsDelay, self.clearAbortedDelay)
        if not clearing['OK']:
            self.log.warn('Failed to clear old pilots in the PilotAgentsDB')

        return S_OK('Monitoring cycle complete.')
def initializeMatcherHandler(serviceInfo):
    """ Matcher Service initialization.

    Creates the module-level DB handles, registers the matching activities
    with gMonitor, recalculates the task queue shares and schedules the
    periodic tasks.

    :param serviceInfo: service information (not used here)
    :return: S_OK()
    """
    global gJobDB, gTaskQueueDB, jlDB, pilotAgentsDB

    gJobDB = JobDB()
    gTaskQueueDB = TaskQueueDB()
    jlDB = JobLoggingDB()
    pilotAgentsDB = PilotAgentsDB()

    # name, description, unit, operation
    activities = (
        ('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
        ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
        ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
        ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN),
    )
    for activityName, description, unit, operation in activities:
        gMonitor.registerActivity(activityName, description, 'Matching', unit, operation, 300)

    # Run both tasks once right away in addition to scheduling them
    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120, gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)
    sendNumTaskQueues()

    return S_OK()
def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ Constructor.

    Each argument that is missing (or evaluates to false) is replaced by a
    newly created default object of the corresponding type.

    :param pilotAgentsDB: PilotAgentsDB handle
    :param jobDB: JobDB handle
    :param tqDB: TaskQueueDB handle
    :param jlDB: JobLoggingDB handle
    :param opsHelper: Operations helper
    """
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger("Matcher")

    # The limiter works on the job DB and operations helper selected above
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)
def export_killPilot(cls, pilotRefList):
    """ Kill the specified pilots.

    The pilots are grouped by owner DN, owner group, grid site, destination
    site and queue before being handed to ``killPilotsInQueues``.

    :param pilotRefList: a single pilot reference (string) or an iterable of
                         pilot references
    :return: S_OK(), or S_ERROR when the info of a pilot cannot be obtained
             or ``killPilotsInQueues`` reports failures
    """
    # Make a list if it is not yet
    if isinstance(pilotRefList, basestring):
        pilotRefs = [pilotRefList]
    else:
        pilotRefs = list(pilotRefList)

    # Regroup pilots per site and per owner
    groupingFields = ('OwnerDN', 'OwnerGroup', 'GridSite', 'DestinationSite', 'Queue')
    pilotRefDict = {}
    for pilotReference in pilotRefs:
        info = PilotAgentsDB().getPilotInfo(pilotReference)
        if not (info['OK'] and info['Value']):
            return S_ERROR('Failed to get info for pilot ' + pilotReference)

        pilotDict = info['Value'][pilotReference]
        queue = '@@@'.join([pilotDict[field] for field in groupingFields])

        queueEntry = pilotRefDict.setdefault(queue, {})
        queueEntry.setdefault('PilotList', []).append(pilotReference)
        queueEntry['GridType'] = pilotDict['GridType']

    if killPilotsInQueues(pilotRefDict):
        return S_ERROR('Failed to kill at least some pilots')

    return S_OK()
def initializeMatcherHandler(serviceInfo):
    """ Matcher Service initialization.

    Creates the module-level DB handles and runs ``_checkTable`` on the
    JobDB, the JobLoggingDB and the PilotAgentsDB; the first failing check is
    returned. Then registers the matching activities with gMonitor,
    recalculates the task queue shares and schedules the periodic tasks.

    :param serviceInfo: service information (not used here)
    :return: S_OK(), or the failed ``_checkTable`` result
    """
    global gJobDB, gJobLoggingDB, gTaskQueueDB, gPilotAgentsDB

    # Create JobDB object and initialize its tables.
    gJobDB = JobDB()
    tableCheck = gJobDB._checkTable()
    if not tableCheck['OK']:
        return tableCheck

    # Create JobLoggingDB object and initialize its tables.
    gJobLoggingDB = JobLoggingDB()
    tableCheck = gJobLoggingDB._checkTable()
    if not tableCheck['OK']:
        return tableCheck

    gTaskQueueDB = TaskQueueDB()

    # Create PilotAgentsDB object and initialize its tables.
    gPilotAgentsDB = PilotAgentsDB()
    tableCheck = gPilotAgentsDB._checkTable()
    if not tableCheck['OK']:
        return tableCheck

    # name, description, unit, operation
    activities = (
        ('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
        ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
        ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
        ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN),
    )
    for activityName, description, unit, operation in activities:
        gMonitor.registerActivity(activityName, description, 'Matching', unit, operation, 300)

    # Run both tasks once right away in addition to scheduling them
    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120, gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)
    sendNumTaskQueues()

    return S_OK()
def export_countPilots(cls, condDict, older=None, newer=None, timeStamp='SubmissionTime'):
    """ Count the pilots matching the given conditions.

    All arguments are forwarded unchanged to ``PilotAgentsDB.countPilots``.

    :param condDict: selection conditions
    :param older: time limit passed on as is (presumably an upper bound on
                  ``timeStamp`` - confirm against PilotAgentsDB)
    :param newer: time limit passed on as is (presumably a lower bound on
                  ``timeStamp`` - confirm against PilotAgentsDB)
    :param timeStamp: name of the time field, 'SubmissionTime' by default
    :return: result structure of ``PilotAgentsDB.countPilots``
    """
    pilotDB = PilotAgentsDB()
    return pilotDB.countPilots(condDict, older, newer, timeStamp)
def initialize(self):
    """ Set the polling time default, read the clearing delays and create the
    PilotAgentsDB handle.

    :return: S_OK()
    """
    self.am_setOption('PollingTime', 120)

    readOption = self.am_getOption
    self.clearPilotsDelay = readOption('ClearPilotsDelay', 30)
    self.clearAbortedDelay = readOption('ClearAbortedPilotsDelay', 7)

    self.pilotDB = PilotAgentsDB()
    return S_OK()
def export_getPilotSummaryWeb(cls, selectDict, sortList, startItem, maxItems):
    """ Get the summary of the pilot information for a given page in the
    pilot monitor in a generic format.

    All arguments are forwarded unchanged to
    ``PilotAgentsDB.getPilotSummaryWeb``.

    :param selectDict: selection conditions
    :param sortList: sorting specification
    :param startItem: first item of the page
    :param maxItems: maximum number of items of the page
    :return: result structure of ``PilotAgentsDB.getPilotSummaryWeb``
    """
    pilotDB = PilotAgentsDB()
    return pilotDB.getPilotSummaryWeb(selectDict, sortList, startItem, maxItems)
def export_clearPilots(cls, interval=30, aborted_interval=7):
    """ Clear pilots from the PilotAgentsDB and, when pilots logging is
    enabled, the matching records from the PilotsLoggingDB.

    :param interval: forwarded to ``PilotAgentsDB.clearPilots`` (default 30)
    :param aborted_interval: forwarded to ``PilotAgentsDB.clearPilots``
                             (default 7)
    :return: S_OK() on success, otherwise the failed result of the DB call
    """
    result = PilotAgentsDB().clearPilots(interval, aborted_interval)
    if not result['OK']:
        return result

    if enablePilotsLogging:
        pilotIDs = result['Value']
        pilots = PilotAgentsDB().getPilotInfo(pilotID=pilotIDs)
        if not pilots['OK']:
            return pilots

        # The pilot records are in pilots['Value']; iterating the result
        # structure itself only yields its 'OK' / 'Value' keys.
        # NOTE(review): getPilotInfo presumably returns a dict of per-pilot
        # dicts keyed by pilot reference; a plain sequence of per-pilot dicts
        # is accepted as well - confirm against PilotAgentsDB.
        pilotInfo = pilots['Value']
        pilotRecords = pilotInfo.values() if isinstance(pilotInfo, dict) else pilotInfo
        pilotRefs = [pilot['PilotJobReference'] for pilot in pilotRecords]

        result = PilotsLoggingDB().deletePilotsLogging(pilotRefs)
        if not result['OK']:
            return result

    return S_OK()
def initialize(self):
    """ Set the default agent options and create the PilotAgentsDB handle.

    :return: S_OK()
    """
    optionDefaults = [('PollingTime', 120),
                      ('GridEnv', ''),
                      ('PilotStalledDays', 3)]
    for name, value in optionDefaults:
        self.am_setOption(name, value)

    self.pilotDB = PilotAgentsDB()
    return S_OK()
def initialize(self):
    """ Set the default agent options and create the PilotAgentsDB handle.

    :return: S_OK()
    """
    setOption = self.am_setOption
    setOption("PollingTime", 120)
    setOption("GridEnv", "")
    setOption("PilotStalledDays", 3)

    self.pilotDB = PilotAgentsDB()
    return S_OK()
def initializeWMSAdministratorHandler(serviceInfo):
    """ WMS AdministratorService initialization.

    Creates the module-level ``jobDB``, ``pilotDB`` and ``taskQueueDB``
    handles.

    :param serviceInfo: service information (not used here)
    :return: S_OK()
    """
    global jobDB, pilotDB, taskQueueDB

    jobDB = JobDB()
    pilotDB = PilotAgentsDB()
    taskQueueDB = TaskQueueDB()

    return S_OK()
def export_addPilotTQReference(cls, pilotRef, taskQueueID, ownerDN, ownerGroup, broker='Unknown',
                               gridType='DIRAC', pilotStampDict=None):
    """ Add a new pilot job reference.

    All arguments are forwarded to ``PilotAgentsDB.addPilotTQReference``.

    :param pilotRef: pilot reference(s) to register
    :param taskQueueID: ID of the task queue the pilot was submitted for
    :param ownerDN: DN of the pilot owner
    :param ownerGroup: group of the pilot owner
    :param broker: broker name, 'Unknown' by default
    :param gridType: grid type, 'DIRAC' by default
    :param dict pilotStampDict: pilot stamps; ``None`` (the default) stands
                                for an empty dictionary
    :return: result structure of ``PilotAgentsDB.addPilotTQReference``
    """
    # Use a fresh dict per call rather than a shared mutable default argument
    if pilotStampDict is None:
        pilotStampDict = {}

    return PilotAgentsDB().addPilotTQReference(pilotRef, taskQueueID, ownerDN, ownerGroup,
                                               broker, gridType, pilotStampDict)
def initialize(self):
    """ Configure the commands to run and the clients shared between them.

    Sets the 'shifterProxy' option to 'DataManager', creates the
    ResourceManagementClient handle, fills ``self.commands`` with the enabled
    command definitions and ``self.clients`` with the client objects reused
    by the commands. ``self.cCaller`` is bound to ``CommandCaller`` itself
    (it is not called here).

    :return: S_OK()
    """
    self.am_setOption('shifterProxy', 'DataManager')

    self.rmClient = ResourceManagementClient()

    # Enabled commands. 'Pilot' is only run with timespan 1800; the variants
    # with timespan 86400 and 604800 are disabled.
    enabledCommands = (
        ('Downtime', [{'Downtime': {}}]),
        ('SpaceTokenOccupancy', [{'SpaceTokenOccupancy': {}}]),
        ('Pilot', [{'Pilot': {'timespan': 1800}}, ]),
    )
    for commandName, commandDefinitions in enabledCommands:
        self.commands[commandName] = commandDefinitions

    # Disabled command sets, kept for reference:
    #   'Pilots'            : PilotsWMS with element 'Site' and with element
    #                         'Resource' (siteName None)
    #   'AccountingCache'   : SuccessfullJobsBySiteSplitted, FailedJobsBySiteSplitted,
    #                         RunningJobsBySiteSplitted (plotType 'Job', hours 24);
    #                         SuccessfullPilotsBySiteSplitted, FailedPilotsBySiteSplitted,
    #                         SuccessfullPilotsByCESplitted, FailedPilotsByCESplitted
    #                         (plotType 'Pilot', hours 24)
    #   'VOBOXAvailability' : VOBOXAvailability
    # FIXME: do not forget about hourly vs Always ...etc

    # Reuse clients for the commands: key, factory, positional arguments
    clientFactories = (
        ('GOCDBClient', GOCDBClient, ()),
        ('ReportGenerator', RPCClient, ('Accounting/ReportGenerator',)),
        ('ReportsClient', ReportsClient, ()),
        ('ResourceStatusClient', ResourceStatusClient, ()),
        ('ResourceManagementClient', ResourceManagementClient, ()),
        ('PilotsDB', PilotAgentsDB, ()),
        ('WMSAdministrator', RPCClient, ('WorkloadManagement/WMSAdministrator',)),
    )
    for clientName, factory, factoryArgs in clientFactories:
        self.clients[clientName] = factory(*factoryArgs)

    self.cCaller = CommandCaller

    return S_OK()
def initializeHandler(cls, serviceInfoDict):
    """ Initialization of DB objects.

    The PilotsLoggingDB handle is only created when the Operations value
    '/Services/JobMonitoring/usePilotsLoggingFlag' is true; otherwise
    ``cls.gPilotsLoggingDB`` stays None.

    :param serviceInfoDict: service information (not used here)
    :return: S_OK()
    """
    cls.pilotAgentsDB = PilotAgentsDB()
    cls.gPilotsLoggingDB = None

    usePilotsLogging = Operations().getValue('/Services/JobMonitoring/usePilotsLoggingFlag', False)
    if not usePilotsLogging:
        return S_OK()

    cls.gPilotsLoggingDB = PilotsLoggingDB()
    return S_OK()
def export_getCounters(cls, table, keys, condDict, newer=None, timeStamp='SubmissionTime'):
    """ Get counters from the PilotAgentsDB.

    All arguments are forwarded to ``PilotAgentsDB.getCounters``.

    :param table: table name
    :param keys: attributes to group the counters by
    :param condDict: selection conditions
    :param newer: time limit passed on as is
    :param timeStamp: name of the time field, 'SubmissionTime' by default
    :return: result structure of ``PilotAgentsDB.getCounters``
    """
    pilotDB = PilotAgentsDB()
    return pilotDB.getCounters(table, keys, condDict, newer=newer, timeStamp=timeStamp)
def initialize(self):
    """ Set the default agent options, create the DB / API handles, read the
    clearing delays and create the WMSAdministratorClient.

    :return: S_OK()
    """
    optionDefaults = (('PollingTime', 120), ('GridEnv', ''), ('PilotStalledDays', 3))
    for name, value in optionDefaults:
        self.am_setOption(name, value)

    self.pilotDB = PilotAgentsDB()
    self.diracadmin = DiracAdmin()
    self.jobDB = JobDB()

    # attribute name, option name, value used when the option is not set
    delayOptions = (('clearPilotsDelay', 'ClearPilotsDelay', 30),
                    ('clearAbortedDelay', 'ClearAbortedPilotsDelay', 7))
    for attributeName, optionName, fallback in delayOptions:
        setattr(self, attributeName, self.am_getOption(optionName, fallback))

    self.WMSAdministrator = WMSAdministratorClient()
    return S_OK()
def __init__(self, args=None, clients=None):
    """ Constructor.

    :Parameters:
      **args** - [, `dict` ]
        arguments to be passed to be used in the _prepareCommand method ( name and
        timespan are the expected ones )
      **clients** - [, `dict` ]
        clients from where information is fetched. Mainly used to avoid creating
        new connections on agents looping over clients. ResourceManagementClient
        and PilotsDB are most welcome.
    """
    super(PilotCommand, self).__init__(args, clients)

    # Prefer the handles found in self.apis; create new ones only when missing
    availableApis = self.apis
    self.pilotsDB = (availableApis['PilotsDB'] if 'PilotsDB' in availableApis
                     else PilotAgentsDB())
    self.rmClient = (availableApis['ResourceManagementClient']
                     if 'ResourceManagementClient' in availableApis
                     else ResourceManagementClient())
def export_setPilotStatus(self, pilotRef, status, destination=None, reason=None, gridSite=None, queue=None):
    """ Set the pilot agent status.

    The optional arguments are forwarded to ``PilotAgentsDB.setPilotStatus``
    as keywords; ``reason`` is passed under the name ``statusReason``.

    :param pilotRef: pilot reference
    :param status: new status
    :param destination: optional destination
    :param reason: optional status reason
    :param gridSite: optional grid site
    :param queue: optional queue
    :return: result structure of ``PilotAgentsDB.setPilotStatus``
    """
    optionalFields = {'destination': destination,
                      'statusReason': reason,
                      'gridSite': gridSite,
                      'queue': queue}
    return PilotAgentsDB().setPilotStatus(pilotRef, status, **optionalFields)
def initializeJobManagerHandler(serviceInfo):
    """ JobManager service initialization.

    Creates the module-level DB handles and evaluates the pilots logging
    'Enable' flag; the PilotsLoggingDB handle is only created when the flag
    is 'yes' or 'true' (case insensitive).

    :param dict serviceInfo: service information; 'serviceSectionPath' is read
    :return: S_OK()
    """
    global gJobDB, gJobLoggingDB, gtaskQueueDB, enablePilotsLogging, gPilotAgentsDB, gPilotsLoggingDB

    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    gPilotAgentsDB = PilotAgentsDB()

    # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
    loggingSection = serviceInfo['serviceSectionPath'].replace('JobManager', 'PilotsLogging')
    enableFlag = gConfig.getValue(loggingSection + '/Enable', 'False')
    enablePilotsLogging = enableFlag.lower() in ('yes', 'true')

    if enablePilotsLogging:
        gPilotsLoggingDB = PilotsLoggingDB()

    return S_OK()
def initializePilotManagerHandler(serviceInfo):
    """ PilotManagerHandler initialization.

    Evaluates the pilots logging 'Enable' flag, creates the module-level
    PilotAgentsDB handle and, when the flag is 'yes' or 'true' (case
    insensitive), the PilotsLoggingDB handle.

    :param dict serviceInfo: service information; 'serviceSectionPath' is read
    :return: S_OK()
    """
    global pilotDB, pilotsLoggingDB, enablePilotsLogging

    # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
    loggingSection = serviceInfo['serviceSectionPath'].replace('Pilots', 'PilotsLogging')
    enableFlag = gConfig.getValue(loggingSection + '/Enable', 'False')
    enablePilotsLogging = enableFlag.lower() in ('yes', 'true')

    pilotDB = PilotAgentsDB()
    if enablePilotsLogging:
        pilotsLoggingDB = PilotsLoggingDB()

    return S_OK()
def initializeHandler(cls, serviceInfoDict):
    """ Initialization of DB objects and OptimizationMind.

    The PilotsLoggingDB handle is only created when the Operations value
    '/Services/JobMonitoring/usePilotsLoggingFlag' is true. The connection to
    the OptimizationMind is attempted once here and then scheduled as a
    periodic task.

    :param serviceInfoDict: service information (not used here)
    :return: S_OK()
    """
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()

    cls.pilotsLoggingDB = None
    usePilotsLogging = Operations().getValue('/Services/JobMonitoring/usePilotsLoggingFlag', False)
    if usePilotsLogging:
        cls.pilotsLoggingDB = PilotsLoggingDB()

    cls.msgClient = MessageClient("WorkloadManagement/OptimizationMind")
    cls.__connectToOptMind()
    gThreadScheduler.addPeriodicTask(60, cls.__connectToOptMind)

    return S_OK()
def export_getPilotLoggingInfo(cls, pilotReference):
    """ Get the pilot logging info for the Grid job reference.

    The owner DN, owner group and grid type of the pilot are looked up in the
    PilotAgentsDB and handed to ``getPilotLoggingInfo``.

    :param str pilotReference: pilot reference
    :return: result of ``getPilotLoggingInfo``, or S_ERROR when the pilot
             info cannot be obtained
    """
    info = PilotAgentsDB().getPilotInfo(pilotReference)
    if not (info['OK'] and info['Value']):
        return S_ERROR('Failed to determine owner for pilot ' + pilotReference)

    pilotDict = info['Value'][pilotReference]
    ownerDN, ownerGroup, gridType = [pilotDict[field]
                                     for field in ('OwnerDN', 'OwnerGroup', 'GridType')]

    return getPilotLoggingInfo(gridType, pilotReference,  # pylint: disable=unexpected-keyword-arg
                               proxyUserDN=ownerDN, proxyUserGroup=ownerGroup)
def initializeHandler(cls, serviceInfoDict):
    """ Matcher handler initialization.

    Creates the class-level DB handles and the limiter, recalculates the task
    queue shares, registers the matching activities with gMonitor and
    schedules the periodic tasks.

    :param serviceInfoDict: service information (not used here)
    :return: S_OK()
    """
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()
    cls.limiter = Limiter(jobDB=cls.jobDB)

    cls.taskQueueDB.recalculateTQSharesForAll()

    # name, description, unit, operation
    activities = (
        ('matchTime', "Job matching time", "secs", gMonitor.OP_MEAN),
        ('matchesDone', "Job Match Request", "matches", gMonitor.OP_RATE),
        ('matchesOK', "Matched jobs", "matches", gMonitor.OP_RATE),
        ('numTQs', "Number of Task Queues", "tqsk queues", gMonitor.OP_MEAN),
    )
    for activityName, description, unit, operation in activities:
        gMonitor.registerActivity(activityName, description, 'Matching', unit, operation, 300)

    gThreadScheduler.addPeriodicTask(120, cls.taskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, cls.sendNumTaskQueues)
    cls.sendNumTaskQueues()

    return S_OK()
def initialize(self):
    ''' Standard initialize.

    Reads the agent options (each one defaults to the current value of the
    attribute of the same name), creates the queue of elements to be checked
    and the thread pool, and registers the clients shared with the commands.

    NOTE(review): the previous docstring stated that the ProductionManager
    shifterProxy is used to modify the ResourceStatus DB; nothing in this
    method sets it - confirm where it is configured.

    :return: S_OK()
    '''
    # The option name and the attribute name are identical for all of these
    for optionName in ('maxNumberOfThreads', 'elementType', 'checkingFreqs', 'limitQueueFeeder'):
        setattr(self, optionName, self.am_getOption(optionName, getattr(self, optionName)))

    self.elementsToBeChecked = Queue.Queue()
    poolSize = self.maxNumberOfThreads
    self.threadPool = ThreadPool(poolSize, poolSize)

    self.rsClient = ResourceStatusClient()

    self.clients['ResourceStatusClient'] = self.rsClient
    self.clients['ResourceManagementClient'] = ResourceManagementClient()
    self.clients['PilotsDB'] = PilotAgentsDB()

    return S_OK()
def __init__(self, args=None, clients=None):
    """ Constructor.

    :Parameters:
      **args** - [, `dict` ]
        arguments to be passed to be used in the _prepareCommand method ( name and
        timespan are the expected ones )
      **clients** - [, `dict` ]
        clients from where information is fetched. Mainly used to avoid creating
        new connections on agents looping over clients. ResourceManagementClient
        and PilotsDB are most welcome.
    """
    super(PilotCommand, self).__init__(args, clients)

    # Start from the handles found in self.apis, create the missing ones
    if 'PilotsDB' not in self.apis:
        self.pilotsDB = PilotAgentsDB()
    else:
        self.pilotsDB = self.apis['PilotsDB']

    if 'ResourceManagementClient' not in self.apis:
        self.rmClient = ResourceManagementClient()
    else:
        self.rmClient = self.apis['ResourceManagementClient']
class PilotCommand(Command):
    """ Pilot 'master' Command.

    Reads the pilot summary per computing element from the pilots DB, stores
    it in the pilot cache through the ResourceManagementClient and reads it
    back from that cache.
    """

    def __init__(self, args=None, clients=None):
        """ Constructor.

        :Parameters:
          **args** - [, `dict` ]
            arguments to be passed to be used in the _prepareCommand method ( name and
            timespan are the expected ones )
          **clients** - [, `dict` ]
            clients from where information is fetched. Mainly used to avoid creating
            new connections on agents looping over clients. ResourceManagementClient
            and PilotsDB are most welcome.
        """
        super(PilotCommand, self).__init__(args, clients)

        # Prefer the handles found in self.apis; create new ones only when missing
        availableApis = self.apis
        self.pilotsDB = (availableApis['PilotsDB'] if 'PilotsDB' in availableApis
                         else PilotAgentsDB())
        self.rmClient = (availableApis['ResourceManagementClient']
                         if 'ResourceManagementClient' in availableApis
                         else ResourceManagementClient())

    def _storeCommand(self, result):
        """ Stores the results of doNew method on the database.

        :Parameters:
          **result** - `list( dict )`
            list of dictionaries to be inserted on the DB. Unfortunately, there
            is no bulk insertion method on the database. The dictionaries are
            sanitized in doNew method so that they match the column names in
            the database.

        :return: S_OK / S_ERROR (the first failed insertion is returned)
        """
        for pilotDict in result:
            # The keyword names expected by the client start with a lower case letter
            keywordArgs = {}
            for key, value in pilotDict.iteritems():
                keywordArgs[key[0].lower() + key[1:]] = value

            stored = self.rmClient.addOrModifyPilotCache(**keywordArgs)
            if not stored['OK']:
                return stored

        return S_OK()

    def _prepareCommand(self):
        """ Method that parses command arguments to extract the ones needed:
              name : name of the computing element
              timespan ( seconds ) : time window

        :return: S_OK( ( name, timespan ) ) / S_ERROR
        """
        arguments = self.args

        if 'name' not in arguments:
            return S_ERROR('"name" not found in self.args')
        if 'timespan' not in arguments:
            return S_ERROR('"timespan" not found in self.args')

        return S_OK((arguments['name'], arguments['timespan']))

    def doNew(self, masterParams=None):
        """ doNew method. If is master execution, name is declared as '' so that
        all ce's are asked. Once values are obtained, they are stored on the
        Database. The entries with name Unknown, NotAssigned and Total are
        skipped.

        :Parameters:
          **masterParams** - [, bool ]
            if True, it queries for all elements in the database for the given
            timespan

        :return: S_OK( list( dict ) ) / S_ERROR
        """
        # Ask for all CEs
        if masterParams is True:
            self.args['name'] = ''

        params = self._prepareCommand()
        if not params['OK']:
            return params
        computingElement, timespan = params['Value']

        # Calculate time window from timespan and utcnow
        endTimeWindow = datetime.utcnow()
        startTimeWindow = endTimeWindow - timedelta(seconds=timespan)

        # Get pilots information from DB
        pilotsRes = self.pilotsDB.getPilotSummaryShort(startTimeWindow, endTimeWindow,
                                                       computingElement)
        if not pilotsRes['OK']:
            return pilotsRes

        # This list matches the database schema in ResourceManagementDB. It is
        # used to have a perfect match even if there are no pilots on a
        # particular state.
        pilotStatuses = ['Scheduled', 'Waiting', 'Submitted', 'Running', 'Done', 'Aborted',
                         'Cancelled', 'Deleted', 'Failed', 'Held', 'Killed', 'Stalled']
        skippedNames = ['Total', 'Unknown', 'NotAssigned']

        uniformResult = []
        for ceName, pilotDict in pilotsRes['Value'].items():
            if ceName in skippedNames:
                continue

            # Every status starts at 0 and is overridden by the values found
            record = dict.fromkeys(pilotStatuses, 0)
            record.update(pilotDict)
            record['Timespan'] = timespan
            record['CE'] = ceName
            uniformResult.append(record)

        # Store results
        storeRes = self._storeCommand(uniformResult)
        if not storeRes['OK']:
            return storeRes

        return S_OK(uniformResult)

    def doCache(self):
        """ doCache gets values from the database instead from the PilotsDB
        tables. If successful, returns a list of dictionaries with the
        database records.

        :return: S_OK( list( dict ) ) / S_ERROR
        """
        params = self._prepareCommand()
        if not params['OK']:
            return params
        computingElement, timespan = params['Value']

        # Make sure the records we obtain are NOT out of date
        lastValidRecord = datetime.utcnow() - timedelta(seconds=timespan)
        selection = {'older': ('LastCheckTime', lastValidRecord)}

        cached = self.rmClient.selectPilotCache(cE=computingElement, timespan=timespan,
                                                meta=selection)
        if not cached['OK']:
            return cached

        # One dictionary per record, keyed by the column names
        return S_OK([dict(zip(cached['Columns'], row)) for row in cached['Value']])

    def doMaster(self):
        """ Master method, asks for all information in the database for the
        given timespan ( see _prepareCommand ). A failure is recorded in
        self.metrics[ 'failed' ] instead of being returned.

        :return: S_OK( self.metrics )
        """
        pilotResults = self.doNew(masterParams=True)
        if not pilotResults['OK']:
            self.metrics['failed'].append(pilotResults['Message'])

        return S_OK(self.metrics)
class Matcher( object ):
  """ Logic for matching a resource (pilot) description against the waiting jobs.

      The entry point is :meth:`selectJob`. Failures are reported by raising
      ``RuntimeError`` (or ``ValueError`` for an illegal resource JDL); on success
      a plain dictionary describing the matched job is returned.
  """

  def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
    """ c'tor

        Every collaborator may be injected; any argument that is not given
        (or is falsy) is replaced by a freshly created default object.

        :param pilotAgentsDB: PilotAgentsDB-like object
        :param jobDB: JobDB-like object
        :param tqDB: TaskQueueDB-like object
        :param jlDB: JobLoggingDB-like object
        :param opsHelper: Operations-like helper exposing ``getValue``
    """
    self.pilotAgentsDB = pilotAgentsDB if pilotAgentsDB else PilotAgentsDB()
    self.jobDB = jobDB if jobDB else JobDB()
    self.tqDB = tqDB if tqDB else TaskQueueDB()
    self.jlDB = jlDB if jlDB else JobLoggingDB()
    self.opsHelper = opsHelper if opsHelper else Operations()

    self.log = gLogger.getSubLogger( "Matcher" )

    self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )

  def selectJob( self, resourceDescription, credDict ):
    """ Main job selection function to find the highest priority job matching the resource capacity

        :param resourceDescription: resource JDL string or dictionary (see _processResourceDescription)
        :param dict credDict: credentials of the requester ('properties', 'group', 'DN')
        :return: dict with 'JDL', 'JobID', the job optimizer parameters, 'DN', 'Group'
                 and 'PilotInfoReportedFlag'
        :raise RuntimeError: if the task queue query fails, no match is found, or the
                             matched job cannot be validated or retrieved
    """
    startTime = time.time()

    resourceDict = self._getResourceDict( resourceDescription, credDict )

    negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] )
    result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond )
    if not result['OK']:
      # Success returns a plain dictionary, so an S_ERROR structure must not be
      # returned here: it would be indistinguishable from a job description.
      raise RuntimeError( result['Message'] )
    result = result['Value']
    if not result['matchFound']:
      self.log.info( "No match found" )
      raise RuntimeError( "No match found" )

    jobID = result['jobId']
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( "No attributes returned for job" )
    if resAtt['Value']['Status'] != 'Waiting':
      self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) )
      # The job should not be in the task queue any more: remove it
      result = self.tqDB.deleteJob( jobID )
      if not result[ 'OK' ]:
        raise RuntimeError( "Could not delete job %s from the task queue: %s" % ( jobID, result['Message'] ) )
      raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) )

    self._reportStatus( resourceDict, jobID )

    result = self.jobDB.getJobJDL( jobID )
    if not result['OK']:
      raise RuntimeError( "Failed to get the job JDL" )

    resultDict = {}
    resultDict['JDL'] = result['Value']
    resultDict['JobID'] = jobID

    matchTime = time.time() - startTime
    self.log.info( "Match time: [%s]" % str( matchTime ) )
    gMonitor.addMark( "matchTime", matchTime )

    # Get some extra stuff into the response returned
    resOpt = self.jobDB.getJobOptParameters( jobID )
    if resOpt['OK']:
      resultDict.update( resOpt['Value'] )
    resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
    if not resAtt['OK']:
      raise RuntimeError( 'Could not retrieve job attributes' )
    if not resAtt['Value']:
      raise RuntimeError( 'No attributes returned for job' )

    if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ):
      self.limiter.updateDelayCounters( resourceDict['Site'], jobID )

    # Report the pilot information only if the pilot has not done it already
    pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False )
    if not pilotInfoReportedFlag:
      self._updatePilotInfo( resourceDict )
    self._updatePilotJobMapping( resourceDict, jobID )

    resultDict['DN'] = resAtt['Value']['OwnerDN']
    resultDict['Group'] = resAtt['Value']['OwnerGroup']
    resultDict['PilotInfoReportedFlag'] = True

    return resultDict

  def _getResourceDict( self, resourceDescription, credDict ):
    """ from resourceDescription to resourceDict (just various mods)

        Builds the dictionary, applies the credentials restrictions, checks the
        pilot version and forces 'JobType' to 'Test' for sites not in the mask.

        :return: the resource dictionary used for the matching
    """
    resourceDict = self._processResourceDescription( resourceDescription )
    resourceDict = self._checkCredentials( resourceDict, credDict )
    self._checkPilotVersion( resourceDict )
    if not self._checkMask( resourceDict ):
      # Banned destinations can only take Test jobs
      resourceDict['JobType'] = 'Test'

    self.log.verbose( "Resource description:" )
    for key in resourceDict:
      self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) )

    return resourceDict

  def _processResourceDescription( self, resourceDescription ):
    """ Check and form the resource description dictionary

        resourceDescription is a ceDict coming from a JobAgent, for example.
        It is either a JDL string, parsed with ClassAd, or a dictionary-like
        object from which the known fields are copied.

        :raise ValueError: if the JDL string cannot be parsed
        :return: dict with the fields found in the description
    """
    resourceDict = {}
    if isinstance( resourceDescription, basestring ):
      classAdAgent = ClassAd( resourceDescription )
      if not classAdAgent.isOK():
        raise ValueError( 'Illegal Resource JDL' )
      self.log.verbose( classAdAgent.asJDL() )

      for name in singleValueDefFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'CPUTime':
            resourceDict[name] = classAdAgent.getAttributeInt( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      for name in multiValueMatchFields:
        if classAdAgent.lookupAttribute( name ):
          if name == 'SubmitPool':
            resourceDict[name] = classAdAgent.getListFromExpression( name )
          else:
            resourceDict[name] = classAdAgent.getAttributeString( name )

      # Check if a JobID is requested
      if classAdAgent.lookupAttribute( 'JobID' ):
        resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' )

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ):
        if classAdAgent.lookupAttribute( k ):
          resourceDict[ k ] = classAdAgent.getAttributeString( k )

    else:
      for name in singleValueDefFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      for name in multiValueMatchFields:
        if name in resourceDescription:
          resourceDict[name] = resourceDescription[name]

      if 'JobID' in resourceDescription:
        resourceDict['JobID'] = resourceDescription['JobID']

      for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization',
                 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ):
        if k in resourceDescription:
          resourceDict[ k ] = resourceDescription[ k ]

    return resourceDict

  def _reportStatus( self, resourceDict, jobID ):
    """ Reports the status of the matched job in jobDB and jobLoggingDB

        Do not fail if errors happen here: problems are only logged.
    """
    attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site']
    attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']]
    result = self.jobDB.setJobAttributes( jobID, attNames, attValues )
    if not result['OK']:
      self.log.error( "Problem reporting job status",
                      "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Set job attributes for jobID %s" % jobID )

    result = self.jlDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' )
    if not result['OK']:
      self.log.error( "Problem reporting job status",
                      "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) )
    else:
      self.log.verbose( "Added logging record for jobID %s" % jobID )

  def _checkMask( self, resourceDict ):
    """ Check the mask: are we allowed to run normal jobs?

        FIXME: should we move to site OR SE?

        :return: True if the site of the resource is in the 'Active' site mask
        :raise RuntimeError: if 'Site' is missing or the mask cannot be retrieved
    """
    if 'Site' not in resourceDict:
      self.log.error( "Missing Site Name in Resource JDL" )
      raise RuntimeError( "Missing Site Name in Resource JDL" )

    # Get common site mask and check the agent site
    result = self.jobDB.getSiteMask( siteState = 'Active' )
    if not result['OK']:
      self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] )
      raise RuntimeError( "Internal error" )
    maskList = result['Value']

    return resourceDict['Site'] in maskList

  def _updatePilotInfo( self, resourceDict ):
    """ Update pilot information - do not fail if we don't manage to do it

        Nothing is done when the resource carries no 'PilotReference'.
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      gridCE = resourceDict.get( 'GridCE', 'Unknown' )
      site = resourceDict.get( 'Site', 'Unknown' )
      benchmark = resourceDict.get( 'PilotBenchmark', 0.0 )
      self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference,
                                                                                            gridCE,
                                                                                            site,
                                                                                            benchmark ) )

      result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site,
                                                  destination = gridCE, benchmark = benchmark )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _updatePilotJobMapping( self, resourceDict, jobID ):
    """ Update pilot to job mapping information

        Failures are only logged. Nothing is done when the resource carries
        no 'PilotReference'.
    """
    pilotReference = resourceDict.get( 'PilotReference', '' )
    if pilotReference:
      result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )
      result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False )
      if not result['OK']:
        self.log.error( "Problem updating pilot information",
                        "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) )

  def _checkCredentials( self, resourceDict, credDict ):
    """ Check if we can get a job given the passed credentials

        Restricts 'OwnerGroup' and/or 'OwnerDN' of the resource dictionary
        according to the properties found in the credentials.

        :return: the (modified in place) resourceDict
        :raise RuntimeError: if the Registry lookups fail
    """
    if Properties.GENERIC_PILOT in credDict[ 'properties' ]:
      # You can only match groups in the same VO
      if credDict[ 'group' ] == "hosts":
        # for the host case the VirtualOrganization parameter
        # is mandatory in resourceDict
        vo = resourceDict.get( 'VirtualOrganization', '' )
      else:
        vo = Registry.getVOForGroup( credDict[ 'group' ] )
      result = Registry.getGroupsForVO( vo )
      if result[ 'OK' ]:
        resourceDict[ 'OwnerGroup' ] = result[ 'Value' ]
      else:
        raise RuntimeError( result['Message'] )
    else:
      # If it's a private pilot, the DN has to be the same
      if Properties.PILOT in credDict[ 'properties' ]:
        self.log.notice( "Setting the resource DN to the credentials DN" )
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # If it's a job sharing. The group has to be the same and just check that the DN (if any)
      # belongs to the same group
      elif Properties.JOB_SHARING in credDict[ 'properties' ]:
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]
        self.log.notice( "Setting the resource group to the credentials group" )
        if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]:
          ownerDN = resourceDict[ 'OwnerDN' ]
          result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] )
          if not result[ 'OK' ]:
            raise RuntimeError( result['Message'] )
          if credDict[ 'group' ] not in result[ 'Value' ]:
            # DN is not in the same group! bad boy.
            self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN )
            resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
      # Nothing special, group and DN have to be the same
      else:
        resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ]
        resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ]

    return resourceDict

  def _checkPilotVersion( self, resourceDict ):
    """ Check the pilot DIRAC version

        The check is done only if the "Pilot/CheckVersion" option is true.
        'ReleaseVersion' takes precedence over 'DIRACVersion'. The project is
        verified only when "Pilot/Project" is set.

        :raise RuntimeError: if the version is missing or not among the valid ones,
                             or if the project is missing or different from the expected one
    """
    if self.opsHelper.getValue( "Pilot/CheckVersion", True ):
      if 'ReleaseVersion' in resourceDict:
        pilotVersion = resourceDict['ReleaseVersion']
      elif 'DIRACVersion' in resourceDict:
        pilotVersion = resourceDict['DIRACVersion']
      else:
        raise RuntimeError( 'Version check requested and not provided by Pilot' )

      validVersions = self.opsHelper.getValue( "Pilot/Version", [] )
      if validVersions and pilotVersion not in validVersions:
        raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' %
                            ( pilotVersion, ",".join( validVersions ) ) )

      # Check project if requested
      validProject = self.opsHelper.getValue( "Pilot/Project", "" )
      if validProject:
        if 'ReleaseProject' not in resourceDict:
          raise RuntimeError( "Version check requested but expected project %s not received" % validProject )
        if resourceDict[ 'ReleaseProject' ] != validProject:
          raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject,
                                                                                                  resourceDict[ 'ReleaseProject' ] ) )
class PilotStatusAgent(AgentModule):
    """Agent polling the grid status of the pilots and accounting for the finished ones.

    At each cycle (``execute``) the pilots in one of the ``queryStateList``
    states are grouped by the ``identityFieldsList`` fields, their status is
    queried with the grid command line tools and stored in the PilotAgentsDB.
    Pilots reaching a final status are accounted for. Pilots not updated for
    ``PilotStalledDays`` days are declared Deleted.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                 for the agent restart
    """

    # Statuses of the pilots whose grid status is polled
    queryStateList = ["Ready", "Submitted", "Running", "Waiting", "Scheduled"]
    # Statuses after which a pilot record is accounted for
    finalStateList = ["Done", "Aborted", "Cleared", "Deleted", "Failed"]
    # Fields used to group the pilots before querying their status
    identityFieldsList = ["OwnerDN", "OwnerGroup", "GridType", "Broker"]
    # Only the pilots of these grid types are polled
    eligibleGridTypes = ["gLite"]

    #############################################################################
    def initialize(self):
        """Sets defaults"""
        self.am_setOption("PollingTime", 120)
        self.am_setOption("GridEnv", "")
        self.am_setOption("PilotStalledDays", 3)
        self.pilotDB = PilotAgentsDB()
        return S_OK()

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.

        Reads the agent options, opens a PilotAgentsDB connection and runs the
        polling cycle. The connection is closed whatever the outcome is.

        :return: S_OK / S_ERROR
        """
        self.pilotStalledDays = self.am_getOption("PilotStalledDays", 3)
        self.gridEnv = self.am_getOption("GridEnv")
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue("/DIRAC/Setup", "")
            if setup:
                instance = gConfig.getValue("/DIRAC/Setups/%s/WorkloadManagement" % setup, "")
                if instance:
                    self.gridEnv = gConfig.getValue("/Systems/WorkloadManagement/%s/GridEnv" % instance, "")

        result = self.pilotDB._getConnection()
        if not result["OK"]:
            return result
        connection = result["Value"]

        try:
            return self.__updatePilotGroups(connection)
        finally:
            # Close the connection on early returns and on exceptions too
            connection.close()

    def __updatePilotGroups(self, connection):
        """Poll the status of the pilots, group by group, and account for the final ones.

        :param connection: PilotAgentsDB connection, left open for the caller to close
        :return: S_OK / S_ERROR
        """
        result = self.pilotDB.getPilotGroups(self.identityFieldsList, {"Status": self.queryStateList})
        if not result["OK"]:
            self.log.error("Fail to get identities Groups", result["Message"])
            return result
        if not result["Value"]:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result["Value"]:

            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose("Getting pilots for %s:%s @ %s %s" % (ownerDN, ownerGroup, gridType, broker))

            condDict1 = {
                "Status": "Done",
                "StatusReason": "Report from JobAgent",
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            condDict2 = {
                "Status": self.queryStateList,
                "OwnerDN": ownerDN,
                "OwnerGroup": ownerGroup,
                "GridType": gridType,
                "Broker": broker,
            }

            for condDict in [condDict1, condDict2]:
                result = self.clearWaitingPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to clear Waiting Pilot Jobs")

                result = self.pilotDB.selectPilots(condDict)
                if not result["OK"]:
                    self.log.warn("Failed to get the Pilot Agents")
                    return result
                if not result["Value"]:
                    continue
                refList = result["Value"]

                ret = gProxyManager.getPilotProxyFromDIRACGroup(ownerDN, ownerGroup)
                if not ret["OK"]:
                    self.log.error(ret["Message"])
                    self.log.error("Could not get proxy:", 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret["Value"]

                self.log.verbose(
                    "Getting status for %s pilots for owner %s and group %s" % (len(refList), ownerDN, ownerGroup)
                )

                # Query the pilots in chunks of at most MAX_JOBS_QUERY references
                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index : start_index + MAX_JOBS_QUERY]
                    self.log.verbose(
                        "Querying %d pilots of %s starting at %d" % (len(refsToQuery), len(refList), start_index)
                    )
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result["OK"]:
                        if result["Message"] == "Broker not Available":
                            self.log.error("Broker %s not Available" % broker)
                            break
                        self.log.warn("Failed to get pilot status:")
                        self.log.warn("%s:%s @ %s" % (ownerDN, ownerGroup, gridType))
                        continue

                    statusDict = result["Value"]
                    for pRef in statusDict:
                        pDict = statusDict[pRef]
                        if pDict:
                            if pDict["isParent"]:
                                self.log.verbose("Clear parametric parent %s" % pRef)
                                result = self.clearParentJob(pRef, pDict, connection)
                                if not result["OK"]:
                                    self.log.warn(result["Message"])
                                else:
                                    self.log.info("Parametric parent removed: %s" % pRef)
                            if pDict["FinalStatus"]:
                                # The DB is updated by accountPilots for the final statuses
                                self.log.verbose("Marking Status for %s to %s" % (pRef, pDict["Status"]))
                                pilotsToAccount[pRef] = pDict
                            else:
                                self.log.verbose("Setting Status for %s to %s" % (pRef, pDict["Status"]))
                                self.pilotDB.setPilotStatus(
                                    pRef,
                                    pDict["Status"],
                                    pDict["DestinationSite"],
                                    updateTime=pDict["StatusDate"],
                                    conn=connection,
                                )

                    # Flush the accounting in batches
                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        self.handleOldPilots(connection)

        return S_OK()

    def clearWaitingPilots(self, condDict):
        """Clear pilots in the faulty Waiting state

        The pilots matching the identity fields of ``condDict`` which are in
        Waiting status since more than MAX_WAITING_STATE_LENGTH hours are set
        to Stalled.

        :param dict condDict: must contain 'OwnerDN', 'OwnerGroup', 'GridType' and 'Broker'
        :return: S_OK / S_ERROR
        """
        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {
            "Status": "Waiting",
            "OwnerDN": condDict["OwnerDN"],
            "OwnerGroup": condDict["OwnerGroup"],
            "GridType": condDict["GridType"],
            "Broker": condDict["Broker"],
        }
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result["OK"]:
            self.log.warn("Failed to get the Pilot Agents for Waiting state")
            return result
        if not result["Value"]:
            return S_OK()
        refList = result["Value"]

        for pilotRef in refList:
            self.log.info("Setting Waiting pilot to Stalled: %s" % pilotRef)
            self.pilotDB.setPilotStatus(pilotRef, "Stalled", statusReason="Exceeded max waiting time")

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """Clear the parametric parent job from the PilotAgentsDB

        If at least one child is already in the database the parent is just
        deleted. Otherwise the children are added first, with the task queue and
        identity of the parent, and the parent is deleted if all of them got
        their status set.

        :param str pRef: parent pilot reference
        :param dict pDict: parent status dictionary with 'ChildRefs' and 'ChildDicts'
        :param connection: PilotAgentsDB connection
        :return: S_OK / S_ERROR
        """
        childList = pDict["ChildRefs"]

        # Check that at least one child is in the database
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result["OK"] and result["Value"]:
                children_ok = True
                break

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose("Adding children for parent %s" % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        if not result["OK"]:
            return result
        if pRef not in result["Value"]:
            return S_ERROR("No information found for parent pilot %s" % pRef)
        parentInfo = result["Value"][pRef]
        tqID = parentInfo["TaskQueueID"]
        ownerDN = parentInfo["OwnerDN"]
        ownerGroup = parentInfo["OwnerGroup"]
        broker = parentInfo["Broker"]
        gridType = parentInfo["GridType"]
        result = self.pilotDB.addPilotTQReference(
            childList, tqID, ownerDN, ownerGroup, broker=broker, gridType=gridType
        )
        if not result["OK"]:
            return result

        children_added = True
        for chRef, chDict in pDict["ChildDicts"].items():
            result = self.pilotDB.setPilotStatus(
                chRef, chDict["Status"], destination=chDict["DestinationSite"], conn=connection
            )
            if not result["OK"]:
                children_added = False
        if not children_added:
            return S_ERROR("Failed to add children")

        # Report a failed deletion: the caller logs the parent as removed on S_OK
        result = self.pilotDB.deletePilot(pRef, conn=connection)
        if not result["OK"]:
            return result
        return S_OK()

    def handleOldPilots(self, connection):
        """
        select all pilots that have not been updated in the last N days and declared them
        Deleted, accounting for them.

        :param connection: PilotAgentsDB connection
        :return: S_OK / S_ERROR
        """
        pilotsToAccount = {}
        timeLimitToConsider = Time.toString(Time.dateTime() - Time.day * self.pilotStalledDays)
        # A.T. The time limit must be passed as "older": a former version passed
        # older=None here, which looked to be a bug.
        result = self.pilotDB.selectPilots(
            {"Status": self.queryStateList}, older=timeLimitToConsider, timeStamp="LastUpdateTime"
        )
        if not result["OK"]:
            self.log.error("Failed to get the Pilot Agents")
            return result
        if not result["Value"]:
            return S_OK()

        refList = result["Value"]
        result = self.pilotDB.getPilotInfo(refList)
        if not result["OK"]:
            self.log.error("Failed to get Info for Pilot Agents")
            return result

        pilotsDict = result["Value"]

        for pRef in pilotsDict:
            deletedJobDict = pilotsDict[pRef]
            deletedJobDict["Status"] = "Deleted"
            deletedJobDict["StatusDate"] = Time.dateTime()
            pilotsToAccount[pRef] = deletedJobDict
            # Flush the accounting in batches
            if len(pilotsToAccount) > 100:
                self.accountPilots(pilotsToAccount, connection)
                pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """account for pilots

        If the "PilotAccountingEnabled" option is "yes" the accounting records
        are created and committed. The pilot statuses are then set in the DB,
        unless the accounting was enabled and could not be sent.

        :param dict pilotsToAccount: pilot reference -> dict with 'Status',
                                     'DestinationSite' and 'StatusDate'
        :param connection: PilotAgentsDB connection
        :return: S_OK / S_ERROR
        """
        accountingFlag = False
        pae = self.am_getOption("PilotAccountingEnabled", "yes")
        if pae.lower() == "yes":
            accountingFlag = True

        if not pilotsToAccount:
            self.log.info("No pilots to Account")
            return S_OK()

        accountingSent = False
        if accountingFlag:
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount), conn=connection)
            if not retVal["OK"]:
                self.log.error("Fail to retrieve Info for pilots", retVal["Message"])
                return retVal
            dbData = retVal["Value"]
            for pref in dbData:
                if pref in pilotsToAccount:
                    # Do not overwrite a final status already recorded in the DB
                    if dbData[pref]["Status"] not in self.finalStateList:
                        dbData[pref]["Status"] = pilotsToAccount[pref]["Status"]
                        dbData[pref]["DestinationSite"] = pilotsToAccount[pref]["DestinationSite"]
                        dbData[pref]["LastUpdateTime"] = pilotsToAccount[pref]["StatusDate"]

            retVal = self.__addPilotsAccountingReport(dbData)
            if not retVal["OK"]:
                self.log.error("Fail to add accounting records for pilots", retVal["Message"])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal["OK"]:
                self.log.error("Can't send accounting reports", retVal["Message"])
            else:
                self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef in pilotsToAccount:
                pDict = pilotsToAccount[pRef]
                self.log.verbose("Setting Status for %s to %s" % (pRef, pDict["Status"]))
                self.pilotDB.setPilotStatus(
                    pRef, pDict["Status"], pDict["DestinationSite"], pDict["StatusDate"], conn=connection
                )

        return S_OK()

    #############################################################################
    def getPilotStatus(self, proxy, gridType, pilotRefList):
        """Get GRID job status information using the job's owner proxy and
        GRID job IDs.

        :param proxy: proxy passed to executeGridCommand
        :param str gridType: "LCG" or "gLite"; S_ERROR is returned for anything else
        :param list pilotRefList: pilot references to query
        :return: S_OK( dict: pilot reference -> status dictionary ) / S_ERROR.
                 S_ERROR("Broker not Available") is returned if the command
                 output reports a refused connection.
        """
        if gridType == "LCG":
            cmd = ["edg-job-status"]
        elif gridType == "gLite":
            cmd = ["glite-wms-job-status"]
        else:
            return S_ERROR()
        cmd.extend(pilotRefList)

        start = time.time()
        ret = executeGridCommand(proxy, cmd, self.gridEnv)
        self.log.info("%s Job Status Execution Time for %d jobs:" % (gridType, len(pilotRefList)), time.time() - start)

        if not ret["OK"]:
            self.log.error("Failed to execute %s Job Status" % gridType, ret["Message"])
            return S_ERROR()

        if ret["Value"][0] != 0:
            # The command failed: look for the pilots reported as not retrievable
            stderr = ret["Value"][2]
            stdout = ret["Value"][1]
            deleted = 0
            resultDict = {}
            status = "Deleted"
            destination = "Unknown"
            deletedJobDict = {
                "Status": status,
                "DestinationSite": destination,
                "StatusDate": Time.dateTime(),
                "isChild": False,
                "isParent": False,
                "ParentRef": False,
                "FinalStatus": status in self.finalStateList,
                "ChildRefs": [],
            }
            # Glite returns this error for Deleted jobs to std.err
            for job in List.fromChar(stderr, "\nUnable to retrieve the status for:")[1:]:
                pRef = List.fromChar(job, "\n")[0].strip()
                # Give each pilot its own copy instead of sharing one mutable dict
                resultDict[pRef] = dict(deletedJobDict)
                self.pilotDB.setPilotStatus(pRef, "Deleted")
                deleted += 1
            # EDG returns a similar error for Deleted jobs to std.out
            for job in List.fromChar(stdout, "\nUnable to retrieve the status for:")[1:]:
                pRef = List.fromChar(job, "\n")[0].strip()
                if re.search("No such file or directory: no matching jobs found", job):
                    resultDict[pRef] = dict(deletedJobDict)
                    self.pilotDB.setPilotStatus(pRef, "Deleted")
                    deleted += 1
                # NOTE: the unescaped "()" is an empty regex group, so this pattern
                # matches the message with or without the literal parentheses.
                if re.search("edg_wll_JobStatus: Connection refused: edg_wll_ssl_connect()", job):
                    # the Broker is not accessible
                    return S_ERROR("Broker not Available")
            if not deleted:
                self.log.error(
                    "Error executing %s Job Status:" % gridType, str(ret["Value"][0]) + "\n".join(ret["Value"][1:3])
                )
                return S_ERROR()
            return S_OK(resultDict)

        stdout = ret["Value"][1]
        resultDict = {}
        for job in List.fromChar(stdout, "\nStatus info for the Job :")[1:]:
            pRef = List.fromChar(job, "\n")[0].strip()
            resultDict[pRef] = self.__parseJobStatus(job, gridType)

        return S_OK(resultDict)

    def __parseJobStatus(self, job, gridType):
        """Parse output of grid pilot status command

        :param str job: part of the command output describing one job
        :param str gridType: "LCG" or "gLite", selects which date is parsed
        :return: dict with 'Status', 'DestinationSite', 'StatusDate', 'isChild',
                 'isParent', 'ParentRef', 'FinalStatus', 'ChildRefs' and 'ChildDicts'
        """
        # Raw strings: same patterns, without invalid string escape sequences
        statusRE = r"Current Status:\s*(\w*)"
        destinationRE = r"Destination:\s*([\w\.-]*)"
        statusDateLCGRE = r"reached on:\s*....(.*)"
        submittedDateRE = r"Submitted:\s*....(.*)"
        statusFailedRE = r"Current Status:.*\(Failed\)"

        status = None
        destination = "Unknown"
        statusDate = None
        submittedDate = None

        try:
            status = re.search(statusRE, job).group(1)
            if status == "Done" and re.search(statusFailedRE, job):
                status = "Failed"
            if re.search(destinationRE, job):
                destination = re.search(destinationRE, job).group(1)
            if gridType == "LCG" and re.search(statusDateLCGRE, job):
                statusDate = re.search(statusDateLCGRE, job).group(1)
                statusDate = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(statusDate, "%b %d %H:%M:%S %Y"))
            if gridType == "gLite" and re.search(submittedDateRE, job):
                submittedDate = re.search(submittedDateRE, job).group(1)
                submittedDate = time.strftime("%Y-%m-%d %H:%M:%S", time.strptime(submittedDate, "%b %d %H:%M:%S %Y %Z"))
        except Exception:
            # Best effort: keep whatever could be parsed before the failure
            self.log.exception("Error parsing %s Job Status output:\n" % gridType, job)

        isParent = False
        if re.search("Nodes information", job):
            isParent = True
        isChild = False
        if re.search("Parent Job", job):
            isChild = True

        if status == "Running":
            # Pilots can be in Running state for too long, due to bugs in the WMS
            if statusDate:
                statusTime = Time.fromString(statusDate)
                delta = Time.dateTime() - statusTime
                if delta > 4 * Time.day:
                    self.log.info("Setting pilot status to Deleted after 4 days in Running")
                    status = "Deleted"
                    statusDate = statusTime + 4 * Time.day
            elif submittedDate:
                statusTime = Time.fromString(submittedDate)
                delta = Time.dateTime() - statusTime
                if delta > 7 * Time.day:
                    self.log.info("Setting pilot status to Deleted more than 7 days after submission still in Running")
                    status = "Deleted"
                    statusDate = statusTime + 7 * Time.day

        childRefs = []
        childDicts = {}
        if isParent:
            # Each sub-job section of a parent is parsed recursively
            for subjob in List.fromChar(job, " Status info for the Job :")[1:]:
                chRef = List.fromChar(subjob, "\n")[0].strip()
                childDict = self.__parseJobStatus(subjob, gridType)
                childRefs.append(chRef)
                childDicts[chRef] = childDict

        return {
            "Status": status,
            "DestinationSite": destination,
            "StatusDate": statusDate,
            "isChild": isChild,
            "isParent": isParent,
            "ParentRef": False,
            "FinalStatus": status in self.finalStateList,
            "ChildRefs": childRefs,
            "ChildDicts": childDicts,
        }

    def __addPilotsAccountingReport(self, pilotsData):
        """fill accounting data

        One PilotAccounting record per pilot is registered in gDataStoreClient;
        the records are not committed here.

        :param dict pilotsData: pilot reference -> pilot information dictionary
        :return: S_OK, or the first failing addRegister result
        """
        for pRef in pilotsData:
            pData = pilotsData[pRef]
            pA = PilotAccounting()
            pA.setEndTime(pData["LastUpdateTime"])
            pA.setStartTime(pData["SubmissionTime"])
            retVal = CS.getUsernameForDN(pData["OwnerDN"])
            if not retVal["OK"]:
                userName = "******"
                self.log.error("Can't determine username for dn:", pData["OwnerDN"])
            else:
                userName = retVal["Value"]
            pA.setValueByKey("User", userName)
            pA.setValueByKey("UserGroup", pData["OwnerGroup"])
            result = getSiteForCE(pData["DestinationSite"])
            if result["OK"] and result["Value"].strip():
                pA.setValueByKey("Site", result["Value"].strip())
            else:
                pA.setValueByKey("Site", "Unknown")
            pA.setValueByKey("GridCE", pData["DestinationSite"])
            pA.setValueByKey("GridMiddleware", pData["GridType"])
            pA.setValueByKey("GridResourceBroker", pData["Broker"])
            pA.setValueByKey("GridStatus", pData["Status"])
            if "Jobs" not in pData:
                pA.setValueByKey("Jobs", 0)
            else:
                pA.setValueByKey("Jobs", len(pData["Jobs"]))
            self.log.verbose("Added accounting record for pilot %s" % pData["PilotID"])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal["OK"]:
                return retVal
        return S_OK()
class PilotStatusAgent(AgentModule):
    """
    Agent that keeps the pilot records of the PilotAgentsDB in sync with the grid.

    On every cycle it groups the non-final pilots by owner/group/grid type/broker,
    queries the grid job-status command for them in batches, stores the new
    status, sends accounting records for pilots that reached a final state and
    finally declares as Deleted the pilots not updated for ``PilotStalledDays``.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                     for the agent restart
    """

    # Pilot states that are still polled on the grid.
    queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
    # Pilot states that are accounted for instead of being polled again.
    finalStateList = ['Done', 'Aborted', 'Cleared', 'Deleted', 'Failed']
    # Fields by which pilots are grouped before querying their status.
    identityFieldsList = ['OwnerDN', 'OwnerGroup', 'GridType', 'Broker']
    # Groups whose GridType is not listed here are skipped by execute().
    eligibleGridTypes = ['gLite']

    def initialize(self):
        """Sets defaults
        """
        self.am_setOption('PollingTime', 120)
        self.am_setOption('GridEnv', '')
        self.am_setOption('PilotStalledDays', 3)
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()
        return S_OK()

    def execute(self):
        """The PilotAgent execution method.

        Reads the agent options, opens one DB connection for the whole cycle
        and delegates the actual work to _updatePilotStatuses().

        :return: S_OK() on a completed cycle, or the failing S_ERROR structure
        """
        self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
        self.gridEnv = self.am_getOption('GridEnv')
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue('/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue('/Systems/WorkloadManagement/%s/GridEnv' % instance, '')

        result = self.pilotDB._getConnection()
        if not result['OK']:
            return result
        connection = result['Value']
        try:
            return self._updatePilotStatuses(connection)
        finally:
            # Close on every exit path (early returns and exceptions included).
            connection.close()

    def _updatePilotStatuses(self, connection):
        """ Query the grid for all eligible pilot groups and store the outcome.

        :param connection: open PilotAgentsDB connection shared by all DB calls
        :return: S_OK(), or the S_ERROR of the DB query that failed
        """
        result = self.pilotDB.getPilotGroups(self.identityFieldsList,
                                             {'Status': self.queryStateList})
        if not result['OK']:
            self.log.error('Fail to get identities Groups', result['Message'])
            return result
        if not result['Value']:
            return S_OK()

        pilotsToAccount = {}

        for ownerDN, ownerGroup, gridType, broker in result['Value']:
            if gridType not in self.eligibleGridTypes:
                continue

            self.log.verbose('Getting pilots for %s:%s @ %s %s' % (ownerDN, ownerGroup, gridType, broker))

            condDict1 = {'Status': 'Done',
                         'StatusReason': 'Report from JobAgent',
                         'OwnerDN': ownerDN,
                         'OwnerGroup': ownerGroup,
                         'GridType': gridType,
                         'Broker': broker}
            condDict2 = {'Status': self.queryStateList,
                         'OwnerDN': ownerDN,
                         'OwnerGroup': ownerGroup,
                         'GridType': gridType,
                         'Broker': broker}

            for condDict in (condDict1, condDict2):
                result = self.clearWaitingPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to clear Waiting Pilot Jobs')

                result = self.pilotDB.selectPilots(condDict)
                if not result['OK']:
                    self.log.warn('Failed to get the Pilot Agents')
                    return result
                if not result['Value']:
                    continue
                refList = result['Value']

                ret = gProxyManager.getPilotProxyFromDIRACGroup(ownerDN, ownerGroup)
                if not ret['OK']:
                    self.log.error(ret['Message'])
                    self.log.error('Could not get proxy:', 'User "%s", Group "%s"' % (ownerDN, ownerGroup))
                    continue
                proxy = ret['Value']

                self.log.verbose("Getting status for %s pilots for owner %s and group %s" % (len(refList),
                                                                                              ownerDN,
                                                                                              ownerGroup))

                # Query the references in slices of at most MAX_JOBS_QUERY.
                for start_index in range(0, len(refList), MAX_JOBS_QUERY):
                    refsToQuery = refList[start_index: start_index + MAX_JOBS_QUERY]
                    self.log.verbose('Querying %d pilots of %s starting at %d' %
                                     (len(refsToQuery), len(refList), start_index))
                    result = self.getPilotStatus(proxy, gridType, refsToQuery)
                    if not result['OK']:
                        if result['Message'] == 'Broker not Available':
                            self.log.error('Broker %s not Available' % broker)
                            break
                        self.log.warn('Failed to get pilot status:')
                        self.log.warn('%s:%s @ %s' % (ownerDN, ownerGroup, gridType))
                        continue

                    self._recordPilotStatuses(result['Value'], pilotsToAccount, connection)

                    # Flush accounting in batches to bound the size of a commit.
                    if len(pilotsToAccount) > 100:
                        self.accountPilots(pilotsToAccount, connection)
                        pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        # Now handle pilots not updated in the last N days (most likely the Broker is no
        # longer available) and declare them Deleted.
        self.handleOldPilots(connection)

        return S_OK()

    def _recordPilotStatuses(self, statusDict, pilotsToAccount, connection):
        """ Apply one batch of parsed statuses.

        Pilots in a final state are added to ``pilotsToAccount`` (modified in
        place); the others get their status written to the DB right away.

        :param dict statusDict: pilot reference -> status dictionary
        :param dict pilotsToAccount: accumulator of pilots awaiting accounting
        :param connection: open PilotAgentsDB connection
        """
        for pRef, pDict in statusDict.items():
            if not pDict:
                continue
            if pDict['isParent']:
                self.log.verbose('Clear parametric parent %s' % pRef)
                result = self.clearParentJob(pRef, pDict, connection)
                if not result['OK']:
                    self.log.warn(result['Message'])
                else:
                    self.log.info('Parametric parent removed: %s' % pRef)
            if pDict['FinalStatus']:
                self.log.verbose('Marking Status for %s to %s' % (pRef, pDict['Status']))
                pilotsToAccount[pRef] = pDict
            else:
                self.log.verbose('Setting Status for %s to %s' % (pRef, pDict['Status']))
                self.pilotDB.setPilotStatus(pRef,
                                            pDict['Status'],
                                            pDict['DestinationSite'],
                                            updateTime=pDict['StatusDate'],
                                            conn=connection)

    def clearWaitingPilots(self, condDict):
        """ Clear pilots in the faulty Waiting state

        Pilots matching the owner/group/grid type/broker of ``condDict`` that
        have been Waiting for more than MAX_WAITING_STATE_LENGTH hours are set
        to Stalled.

        :param dict condDict: must contain OwnerDN, OwnerGroup, GridType, Broker
        :return: S_OK(), or the S_ERROR of the failed selection
        """
        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {'Status': 'Waiting',
                     'OwnerDN': condDict['OwnerDN'],
                     'OwnerGroup': condDict['OwnerGroup'],
                     'GridType': condDict['GridType'],
                     'Broker': condDict['Broker']}
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result['OK']:
            self.log.warn('Failed to get the Pilot Agents for Waiting state')
            return result
        if not result['Value']:
            return S_OK()

        for pilotRef in result['Value']:
            # The status written below is 'Stalled'; the message now says so.
            self.log.info('Setting Waiting pilot to Stalled: %s' % pilotRef)
            result = self.pilotDB.setPilotStatus(pilotRef, 'Stalled',
                                                 statusReason='Exceeded max waiting time')
            if not result['OK']:
                self.log.warn('Failed to set pilot %s to Stalled: %s' % (pilotRef, result['Message']))

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """ Clear the parametric parent job from the PilotAgentsDB

        If none of the children is known to the DB yet, they are first added
        with the parent's task queue and identity, and given their own status.

        :param str pRef: reference of the parent pilot
        :param dict pDict: parsed status of the parent ('ChildRefs', 'ChildDicts')
        :param connection: open PilotAgentsDB connection
        :return: result of the parent deletion, or S_ERROR if children could not be added
        """
        childList = pDict['ChildRefs']

        # Check that at least one child is in the database
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result['OK'] and result['Value']:
                children_ok = True

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose('Adding children for parent %s' % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        if not result['OK']:
            return result
        if pRef not in result['Value']:
            return S_ERROR('No information found for parent pilot %s' % pRef)
        parentInfo = result['Value'][pRef]
        result = self.pilotDB.addPilotTQReference(childList,
                                                  parentInfo['TaskQueueID'],
                                                  parentInfo['OwnerDN'],
                                                  parentInfo['OwnerGroup'],
                                                  broker=parentInfo['Broker'],
                                                  gridType=parentInfo['GridType'])
        if not result['OK']:
            return result

        children_added = True
        for chRef, chDict in pDict['ChildDicts'].items():
            result = self.pilotDB.setPilotStatus(chRef, chDict['Status'],
                                                 destination=chDict['DestinationSite'],
                                                 conn=connection)
            if not result['OK']:
                children_added = False
        if not children_added:
            return S_ERROR('Failed to add children')
        # Report a failed deletion to the caller instead of hiding it behind S_OK().
        return self.pilotDB.deletePilot(pRef, conn=connection)

    def handleOldPilots(self, connection):
        """
        Select all pilots that have not been updated in the last N days and
        declare them Deleted, accounting for them.

        Pilots with a job whose LastUpdateTime is recent are left alone.

        :param connection: open PilotAgentsDB connection
        :return: S_OK(), or the S_ERROR of the failed DB query
        """
        pilotsToAccount = {}
        timeLimitToConsider = Time.toString(Time.dateTime() - Time.day * self.pilotStalledDays)
        result = self.pilotDB.selectPilots({'Status': self.queryStateList},
                                           older=timeLimitToConsider,
                                           timeStamp='LastUpdateTime')
        if not result['OK']:
            self.log.error('Failed to get the Pilot Agents')
            return result
        if not result['Value']:
            return S_OK()

        result = self.pilotDB.getPilotInfo(result['Value'])
        if not result['OK']:
            self.log.error('Failed to get Info for Pilot Agents')
            return result

        pilotsDict = result['Value']
        for pRef in pilotsDict:
            jobs = pilotsDict[pRef].get('Jobs')
            if jobs and self._checkJobLastUpdateTime(jobs, self.pilotStalledDays):
                self.log.debug('%s should not be deleted since one job of %s is running.' %
                               (str(pRef), str(jobs)))
                continue
            deletedJobDict = pilotsDict[pRef]
            deletedJobDict['Status'] = 'Deleted'
            deletedJobDict['StatusDate'] = Time.dateTime()
            pilotsToAccount[pRef] = deletedJobDict
            if len(pilotsToAccount) > 100:
                self.accountPilots(pilotsToAccount, connection)
                self._killPilots(pilotsToAccount)
                pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        self._killPilots(pilotsToAccount)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """ account for pilots

        When the 'PilotAccountingEnabled' option is "yes", accounting records
        are built and committed first; the new status is written to the DB only
        if accounting is disabled or the commit succeeded.

        :param dict pilotsToAccount: pilot reference -> dict with 'Status',
                                     'DestinationSite' and 'StatusDate'
        :param connection: open PilotAgentsDB connection
        :return: S_OK(), or the S_ERROR of the step that failed
        """
        accountingFlag = self.am_getOption('PilotAccountingEnabled', 'yes').lower() == "yes"

        if not pilotsToAccount:
            self.log.info('No pilots to Account')
            return S_OK()

        accountingSent = False
        if accountingFlag:
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount), conn=connection)
            if not retVal['OK']:
                self.log.error('Fail to retrieve Info for pilots', retVal['Message'])
                return retVal
            dbData = retVal['Value']
            for pref in dbData:
                if pref in pilotsToAccount:
                    # Keep what the DB says if the pilot is already final there.
                    if dbData[pref]['Status'] not in self.finalStateList:
                        dbData[pref]['Status'] = pilotsToAccount[pref]['Status']
                        dbData[pref]['DestinationSite'] = pilotsToAccount[pref]['DestinationSite']
                        dbData[pref]['LastUpdateTime'] = pilotsToAccount[pref]['StatusDate']

            retVal = self.__addPilotsAccountingReport(dbData)
            if not retVal['OK']:
                self.log.error('Fail to add accounting records for pilots', retVal['Message'])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal['OK']:
                self.log.error("Can't send accounting reports", retVal['Message'])
            else:
                self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef in pilotsToAccount:
                pDict = pilotsToAccount[pRef]
                self.log.verbose('Setting Status for %s to %s' % (pRef, pDict['Status']))
                self.pilotDB.setPilotStatus(pRef,
                                            pDict['Status'],
                                            pDict['DestinationSite'],
                                            pDict['StatusDate'],
                                            conn=connection)

        return S_OK()

    def getPilotStatus(self, proxy, gridType, pilotRefList):
        """ Get GRID job status information using the job's owner proxy and
            GRID job IDs.

        :param proxy: proxy handed to executeGridCommand
        :param str gridType: 'LCG' or 'gLite', selects the status command
        :param list pilotRefList: pilot references appended to the command line
        :return: S_OK(dict) mapping each pilot reference found in the command
                 output to its status dictionary, S_ERROR otherwise;
                 S_ERROR('Broker not Available') when the broker refuses connections
        """
        if gridType == 'LCG':
            cmd = ['edg-job-status']
        elif gridType == 'gLite':
            cmd = ['glite-wms-job-status']
        else:
            return S_ERROR('Unsupported grid type: %s' % gridType)
        cmd.extend(pilotRefList)

        start = time.time()
        ret = executeGridCommand(proxy, cmd, self.gridEnv)
        self.log.info('%s Job Status Execution Time for %d jobs:' %
                      (gridType, len(pilotRefList)), time.time() - start)

        if not ret['OK']:
            self.log.error('Failed to execute %s Job Status' % gridType, ret['Message'])
            return S_ERROR('Failed to execute %s Job Status' % gridType)

        exitCode = ret['Value'][0]
        stdout = ret['Value'][1]
        stderr = ret['Value'][2]
        resultDict = {}

        if exitCode != 0:
            deleted = 0
            status = 'Deleted'
            statusDate = Time.dateTime()

            def deletedJobDict():
                # A fresh dictionary per pilot, so entries do not alias each other.
                return {'Status': status,
                        'DestinationSite': 'Unknown',
                        'StatusDate': statusDate,
                        'isChild': False,
                        'isParent': False,
                        'ParentRef': False,
                        'FinalStatus': status in self.finalStateList,
                        'ChildRefs': [],
                        'ChildDicts': {}}

            # Glite returns this error for Deleted jobs to std.err
            for job in List.fromChar(stderr, '\nUnable to retrieve the status for:')[1:]:
                pRef = List.fromChar(job, '\n')[0].strip()
                resultDict[pRef] = deletedJobDict()
                self.pilotDB.setPilotStatus(pRef, "Deleted")
                deleted += 1
            # EDG returns a similar error for Deleted jobs to std.out
            for job in List.fromChar(stdout, '\nUnable to retrieve the status for:')[1:]:
                pRef = List.fromChar(job, '\n')[0].strip()
                if re.search("No such file or directory: no matching jobs found", job):
                    resultDict[pRef] = deletedJobDict()
                    self.pilotDB.setPilotStatus(pRef, "Deleted")
                    deleted += 1
                # NOTE: the trailing "()" is an empty regex group, not literal
                # parentheses; kept as-is so the same outputs keep matching.
                if re.search("edg_wll_JobStatus: Connection refused: edg_wll_ssl_connect()", job):
                    # the Broker is not accessible
                    return S_ERROR('Broker not Available')
            if not deleted:
                self.log.error('Error executing %s Job Status:' %
                               gridType, str(exitCode) + '\n'.join(ret['Value'][1:3]))
                return S_ERROR('Error executing %s Job Status' % gridType)
            return S_OK(resultDict)

        for job in List.fromChar(stdout, '\nStatus info for the Job :')[1:]:
            pRef = List.fromChar(job, '\n')[0].strip()
            resultDict[pRef] = self.__parseJobStatus(job, gridType)

        return S_OK(resultDict)

    def __parseJobStatus(self, job, gridType):
        """ Parse output of grid pilot status command

        :param str job: text describing one job in the status command output
        :param str gridType: 'LCG' or 'gLite'; selects which date field is read
        :return: dict with Status, DestinationSite, StatusDate, isChild,
                 isParent, ParentRef, FinalStatus, ChildRefs and ChildDicts
        """
        # Raw strings: '\s' and '\w' are regex classes, not string escapes.
        statusRE = r'Current Status:\s*(\w*)'
        destinationRE = r'Destination:\s*([\w\.-]*)'
        statusDateLCGRE = r'reached on:\s*....(.*)'
        submittedDateRE = r'Submitted:\s*....(.*)'
        statusFailedRE = r'Current Status:.*\(Failed\)'

        status = None
        destination = 'Unknown'
        statusDate = None
        submittedDate = None

        try:
            status = re.search(statusRE, job).group(1)
            if status == 'Done' and re.search(statusFailedRE, job):
                status = 'Failed'
            match = re.search(destinationRE, job)
            if match:
                destination = match.group(1)
            # Dates are assigned only once converted, so an unparsable string
            # never leaks into the returned StatusDate.
            if gridType == 'LCG':
                match = re.search(statusDateLCGRE, job)
                if match:
                    statusDate = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.strptime(match.group(1), '%b %d %H:%M:%S %Y'))
            if gridType == 'gLite':
                match = re.search(submittedDateRE, job)
                if match:
                    submittedDate = time.strftime('%Y-%m-%d %H:%M:%S',
                                                  time.strptime(match.group(1), '%b %d %H:%M:%S %Y %Z'))
        except Exception:
            # Best effort: unparsable output is logged and the defaults are kept.
            self.log.exception('Error parsing %s Job Status output:\n' % gridType, job)

        isParent = bool(re.search('Nodes information', job))
        isChild = bool(re.search('Parent Job', job))

        if status == "Running":
            # Pilots can be in Running state for too long, due to bugs in the WMS
            if statusDate:
                statusTime = Time.fromString(statusDate)
                delta = Time.dateTime() - statusTime
                if delta > 4 * Time.day:
                    self.log.info('Setting pilot status to Deleted after 4 days in Running')
                    status = "Deleted"
                    statusDate = statusTime + 4 * Time.day
            elif submittedDate:
                statusTime = Time.fromString(submittedDate)
                delta = Time.dateTime() - statusTime
                if delta > 7 * Time.day:
                    self.log.info('Setting pilot status to Deleted more than 7 days after submission still in Running')
                    status = "Deleted"
                    statusDate = statusTime + 7 * Time.day

        childRefs = []
        childDicts = {}
        if isParent:
            # Each child section is parsed with the same routine as its parent.
            for subjob in List.fromChar(job, ' Status info for the Job :')[1:]:
                chRef = List.fromChar(subjob, '\n')[0].strip()
                childRefs.append(chRef)
                childDicts[chRef] = self.__parseJobStatus(subjob, gridType)

        return {'Status': status,
                'DestinationSite': destination,
                'StatusDate': statusDate,
                'isChild': isChild,
                'isParent': isParent,
                'ParentRef': False,
                'FinalStatus': status in self.finalStateList,
                'ChildRefs': childRefs,
                'ChildDicts': childDicts}

    def __addPilotsAccountingReport(self, pilotsData):
        """ fill accounting data

        Builds one PilotAccounting record per pilot and registers it with
        gDataStoreClient (the commit is done by the caller).

        :param dict pilotsData: pilot reference -> pilot information dictionary
        :return: S_OK(), or the S_ERROR of the first record that failed to register
        """
        for pRef in pilotsData:
            pData = pilotsData[pRef]
            pA = PilotAccounting()
            pA.setEndTime(pData['LastUpdateTime'])
            pA.setStartTime(pData['SubmissionTime'])
            retVal = CS.getUsernameForDN(pData['OwnerDN'])
            if not retVal['OK']:
                userName = '******'
                self.log.error("Can't determine username for dn:", pData['OwnerDN'])
            else:
                userName = retVal['Value']
            pA.setValueByKey('User', userName)
            pA.setValueByKey('UserGroup', pData['OwnerGroup'])
            result = getSiteForCE(pData['DestinationSite'])
            if result['OK'] and result['Value'].strip():
                pA.setValueByKey('Site', result['Value'].strip())
            else:
                pA.setValueByKey('Site', 'Unknown')
            pA.setValueByKey('GridCE', pData['DestinationSite'])
            pA.setValueByKey('GridMiddleware', pData['GridType'])
            pA.setValueByKey('GridResourceBroker', pData['Broker'])
            pA.setValueByKey('GridStatus', pData['Status'])
            pA.setValueByKey('Jobs', len(pData['Jobs']) if 'Jobs' in pData else 0)
            self.log.verbose("Added accounting record for pilot %s" % pData['PilotID'])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal['OK']:
                return retVal
        return S_OK()

    def _killPilots(self, acc):
        """ Kill, through DiracAdmin, every pilot referenced in ``acc``.

        :param dict acc: dictionary keyed by pilot reference
        """
        for i in sorted(acc.keys()):
            result = self.diracadmin.getPilotInfo(i)
            if result['OK'] and i in result['Value'] and 'Status' in result['Value'][i]:
                ret = self.diracadmin.killPilot(str(i))
                if ret['OK']:
                    self.log.info("Successfully deleted: %s (Status : %s)" % (i, result['Value'][i]['Status']))
                else:
                    self.log.error("Failed to delete %s : %s" % (i, ret['Message']))
            else:
                self.log.error("Failed to get info. of %s : %s" % (i, str(result)))

    def _checkJobLastUpdateTime(self, joblist, StalledDays):
        """ Tell whether any job of ``joblist`` was updated within ``StalledDays`` days.

        :param joblist: iterable of job IDs (converted with int())
        :param StalledDays: number of days defining the time limit
        :return: True as soon as one recently updated job is found, False otherwise
        """
        timeLimitToConsider = Time.dateTime() - Time.day * StalledDays
        ret = False
        for JobID in joblist:
            result = self.jobDB.getJobAttributes(int(JobID))
            if result['OK']:
                if 'LastUpdateTime' in result['Value']:
                    LastUpdateTime = result['Value']['LastUpdateTime']
                    if Time.fromString(LastUpdateTime) > timeLimitToConsider:
                        ret = True
                        self.log.debug('Since ' + str(JobID) + ' updates LastUpdateTime on ' +
                                       str(LastUpdateTime) + ', this does not to need to be deleted.')
                        break
            else:
                self.log.error("Error taking job info. from DB:%s" % str(result['Message']))
        return ret
# NOTE(review): the next three statements use `logs` and `content`, which are
# not defined in the visible part of this script; they look like the tail of
# the printPilotsLogging helper called below. Confirm their placement.
labels = ['pilotUUID', 'timestamp', 'source', 'phase', 'status', 'messageContent']
for log in logs:
    content.append([log[label] for label in labels])
printTable(labels, content, numbering=False, columnSeparator=' | ')

if uuid:
    # Direct lookup of the logging records of one pilot by its UUID.
    pilotsLogging = PilotsLoggingClient()
    result = pilotsLogging.getPilotsLogging(uuid)
    if not result['OK']:
        print('ERROR: %s' % result['Message'])
        DIRAC.exit(1)
    printPilotsLogging(result['Value'])
    DIRAC.exit(0)
else:
    # Lookup by job ID: job -> pilots -> pilot references -> logging records.
    pilotDB = PilotAgentsDB()
    pilotsLogging = PilotsLoggingClient()
    pilots = pilotDB.getPilotsForJobID(jobid)
    # The key is 'OK'; the former 'OK ' (trailing blank) raised KeyError.
    if not pilots['OK']:
        print(pilots['Message'])
        DIRAC.exit(1)
    # presumably 'Value' holds the list of pilot IDs - confirm against PilotAgentsDB
    for pilotID in pilots['Value']:
        info = pilotDB.getPilotInfo(pilotID=pilotID)
        if not info['OK']:
            print(info['Message'])
            continue
        # getPilotInfo returns a dictionary keyed by pilot reference.
        for pilot in info['Value'].values():
            pilotLog = pilotsLogging.getPilotsLogging(pilot['PilotJobReference'])
            if not pilotLog['OK']:
                print(pilotLog['Message'])
                continue
            printPilotsLogging(pilotLog['Value'])
    DIRAC.exit(0)
class PilotStatusAgent(AgentModule):
    """
    Agent that declares as Deleted the pilots not updated for
    ``PilotStalledDays`` days, accounts for them, kills them, and then asks
    the WMSAdministrator service to clear old pilots from the PilotAgentsDB.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used
                     for the agent restart
    """

    # Pilot states considered as still active.
    queryStateList = ['Ready', 'Submitted', 'Running', 'Waiting', 'Scheduled']
    # Pilot states after which the status kept in the DB is not overwritten.
    finalStateList = ['Done', 'Aborted', 'Cleared', 'Deleted', 'Failed']

    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        AgentModule.__init__(self, *args, **kwargs)

        self.jobDB = None
        self.pilotDB = None
        self.diracadmin = None
        # Declared here so every attribute exists from construction; the real
        # values are set in initialize() and execute().
        self.WMSAdministrator = None
        self.clearPilotsDelay = 30
        self.clearAbortedDelay = 7
        self.pilotStalledDays = 3
        self.gridEnv = ''

    def initialize(self):
        """Sets defaults
        """
        self.am_setOption('PollingTime', 120)
        self.am_setOption('GridEnv', '')
        self.am_setOption('PilotStalledDays', 3)
        self.pilotDB = PilotAgentsDB()
        self.diracadmin = DiracAdmin()
        self.jobDB = JobDB()
        self.clearPilotsDelay = self.am_getOption('ClearPilotsDelay', 30)
        self.clearAbortedDelay = self.am_getOption('ClearAbortedPilotsDelay', 7)
        self.WMSAdministrator = WMSAdministratorClient()

        return S_OK()

    def execute(self):
        """The PilotAgent execution method.

        :return: S_OK(), or the S_ERROR of the failed DB connection request
        """
        self.pilotStalledDays = self.am_getOption('PilotStalledDays', 3)
        self.gridEnv = self.am_getOption('GridEnv')
        if not self.gridEnv:
            # No specific option found, try a general one
            setup = gConfig.getValue('/DIRAC/Setup', '')
            if setup:
                instance = gConfig.getValue('/DIRAC/Setups/%s/WorkloadManagement' % setup, '')
                if instance:
                    self.gridEnv = gConfig.getValue('/Systems/WorkloadManagement/%s/GridEnv' % instance, '')

        result = self.pilotDB._getConnection()
        if not result['OK']:
            return result
        connection = result['Value']

        try:
            # Now handle pilots not updated in the last N days (most likely the Broker is no
            # longer available) and declare them Deleted.
            result = self.handleOldPilots(connection)
            if not result['OK']:
                self.log.warn('Failed to handle old pilots: %s' % result['Message'])
        finally:
            # Close the connection even if handleOldPilots raises.
            connection.close()

        result = self.WMSAdministrator.clearPilots(self.clearPilotsDelay, self.clearAbortedDelay)
        if not result['OK']:
            self.log.warn('Failed to clear old pilots in the PilotAgentsDB')

        return S_OK()

    def clearWaitingPilots(self, condDict):
        """ Clear pilots in the faulty Waiting state

        Pilots matching the owner/group/grid type/broker of ``condDict`` that
        have been Waiting for more than MAX_WAITING_STATE_LENGTH hours are set
        to Stalled.

        :param dict condDict: must contain OwnerDN, OwnerGroup, GridType, Broker
        :return: S_OK(), or the S_ERROR of the failed selection
        """
        last_update = Time.dateTime() - MAX_WAITING_STATE_LENGTH * Time.hour
        clearDict = {'Status': 'Waiting',
                     'OwnerDN': condDict['OwnerDN'],
                     'OwnerGroup': condDict['OwnerGroup'],
                     'GridType': condDict['GridType'],
                     'Broker': condDict['Broker']}
        result = self.pilotDB.selectPilots(clearDict, older=last_update)
        if not result['OK']:
            self.log.warn('Failed to get the Pilot Agents for Waiting state')
            return result
        if not result['Value']:
            return S_OK()

        for pilotRef in result['Value']:
            self.log.info('Setting Waiting pilot to Stalled: %s' % pilotRef)
            result = self.pilotDB.setPilotStatus(pilotRef, 'Stalled',
                                                 statusReason='Exceeded max waiting time')
            if not result['OK']:
                self.log.warn('Failed to set pilot %s to Stalled: %s' % (pilotRef, result['Message']))

        return S_OK()

    def clearParentJob(self, pRef, pDict, connection):
        """ Clear the parametric parent job from the PilotAgentsDB

        If none of the children is known to the DB yet, they are first added
        with the parent's task queue and identity, and given their own status.

        :param str pRef: reference of the parent pilot
        :param dict pDict: parsed status of the parent ('ChildRefs', 'ChildDicts')
        :param connection: open PilotAgentsDB connection
        :return: result of the parent deletion, or S_ERROR if children could not be added
        """
        childList = pDict['ChildRefs']

        # Check that at least one child is in the database
        children_ok = False
        for child in childList:
            result = self.pilotDB.getPilotInfo(child, conn=connection)
            if result['OK'] and result['Value']:
                children_ok = True

        if children_ok:
            return self.pilotDB.deletePilot(pRef, conn=connection)

        self.log.verbose('Adding children for parent %s' % pRef)
        result = self.pilotDB.getPilotInfo(pRef)
        if not result['OK']:
            return result
        if pRef not in result['Value']:
            return S_ERROR('No information found for parent pilot %s' % pRef)
        parentInfo = result['Value'][pRef]
        result = self.pilotDB.addPilotTQReference(childList,
                                                  parentInfo['TaskQueueID'],
                                                  parentInfo['OwnerDN'],
                                                  parentInfo['OwnerGroup'],
                                                  broker=parentInfo['Broker'],
                                                  gridType=parentInfo['GridType'])
        if not result['OK']:
            return result

        children_added = True
        for chRef, chDict in pDict['ChildDicts'].items():
            result = self.pilotDB.setPilotStatus(chRef, chDict['Status'],
                                                 destination=chDict['DestinationSite'],
                                                 conn=connection)
            if not result['OK']:
                children_added = False
        if not children_added:
            return S_ERROR('Failed to add children')
        # Report a failed deletion to the caller instead of hiding it behind S_OK().
        return self.pilotDB.deletePilot(pRef, conn=connection)

    def handleOldPilots(self, connection):
        """
        Select all pilots that have not been updated in the last N days and
        declare them Deleted, accounting for them.

        Pilots with a job whose LastUpdateTime is recent are left alone.

        :param connection: open PilotAgentsDB connection
        :return: S_OK(), or the S_ERROR of the failed DB query
        """
        pilotsToAccount = {}
        timeLimitToConsider = Time.toString(Time.dateTime() - Time.day * self.pilotStalledDays)
        result = self.pilotDB.selectPilots({'Status': self.queryStateList},
                                           older=timeLimitToConsider,
                                           timeStamp='LastUpdateTime')
        if not result['OK']:
            self.log.error('Failed to get the Pilot Agents')
            return result
        if not result['Value']:
            return S_OK()

        result = self.pilotDB.getPilotInfo(result['Value'])
        if not result['OK']:
            self.log.error('Failed to get Info for Pilot Agents')
            return result

        pilotsDict = result['Value']
        for pRef in pilotsDict:
            jobs = pilotsDict[pRef].get('Jobs')
            if jobs and self._checkJobLastUpdateTime(jobs, self.pilotStalledDays):
                self.log.debug('%s should not be deleted since one job of %s is running.' %
                               (str(pRef), str(jobs)))
                continue
            deletedJobDict = pilotsDict[pRef]
            deletedJobDict['Status'] = 'Deleted'
            deletedJobDict['StatusDate'] = Time.dateTime()
            pilotsToAccount[pRef] = deletedJobDict
            # Flush in batches to bound the size of one accounting commit.
            if len(pilotsToAccount) > 100:
                self.accountPilots(pilotsToAccount, connection)
                self._killPilots(pilotsToAccount)
                pilotsToAccount = {}

        self.accountPilots(pilotsToAccount, connection)
        self._killPilots(pilotsToAccount)

        return S_OK()

    def accountPilots(self, pilotsToAccount, connection):
        """ account for pilots

        When the 'PilotAccountingEnabled' option is "yes", accounting records
        are built and committed first; the new status is written to the DB only
        if accounting is disabled or the commit succeeded.

        :param dict pilotsToAccount: pilot reference -> dict with 'Status',
                                     'DestinationSite' and 'StatusDate'
        :param connection: open PilotAgentsDB connection
        :return: S_OK(), or the S_ERROR of the step that failed
        """
        accountingFlag = self.am_getOption('PilotAccountingEnabled', 'yes').lower() == "yes"

        if not pilotsToAccount:
            self.log.info('No pilots to Account')
            return S_OK()

        accountingSent = False
        if accountingFlag:
            retVal = self.pilotDB.getPilotInfo(list(pilotsToAccount), conn=connection)
            if not retVal['OK']:
                self.log.error('Fail to retrieve Info for pilots', retVal['Message'])
                return retVal
            dbData = retVal['Value']
            for pref in dbData:
                if pref in pilotsToAccount:
                    # Keep what the DB says if the pilot is already final there.
                    if dbData[pref]['Status'] not in self.finalStateList:
                        dbData[pref]['Status'] = pilotsToAccount[pref]['Status']
                        dbData[pref]['DestinationSite'] = pilotsToAccount[pref]['DestinationSite']
                        dbData[pref]['LastUpdateTime'] = pilotsToAccount[pref]['StatusDate']

            retVal = self.__addPilotsAccountingReport(dbData)
            if not retVal['OK']:
                self.log.error('Fail to add accounting records for pilots', retVal['Message'])
                return retVal

            self.log.info("Sending accounting records...")
            retVal = gDataStoreClient.commit()
            if not retVal['OK']:
                self.log.error("Can't send accounting reports", retVal['Message'])
            else:
                self.log.info("Accounting sent for %s pilots" % len(pilotsToAccount))
                accountingSent = True

        if not accountingFlag or accountingSent:
            for pRef in pilotsToAccount:
                pDict = pilotsToAccount[pRef]
                self.log.verbose('Setting Status for %s to %s' % (pRef, pDict['Status']))
                self.pilotDB.setPilotStatus(pRef,
                                            pDict['Status'],
                                            pDict['DestinationSite'],
                                            pDict['StatusDate'],
                                            conn=connection)

        return S_OK()

    def __addPilotsAccountingReport(self, pilotsData):
        """ fill accounting data

        Builds one PilotAccounting record per pilot and registers it with
        gDataStoreClient (the commit is done by the caller).

        :param dict pilotsData: pilot reference -> pilot information dictionary
        :return: S_OK(), or the S_ERROR of the first record that failed to register
        """
        for pRef in pilotsData:
            pData = pilotsData[pRef]
            pA = PilotAccounting()
            pA.setEndTime(pData['LastUpdateTime'])
            pA.setStartTime(pData['SubmissionTime'])
            retVal = CS.getUsernameForDN(pData['OwnerDN'])
            if not retVal['OK']:
                userName = '******'
                self.log.error("Can't determine username for dn:", pData['OwnerDN'])
            else:
                userName = retVal['Value']
            pA.setValueByKey('User', userName)
            pA.setValueByKey('UserGroup', pData['OwnerGroup'])
            result = getSiteForCE(pData['DestinationSite'])
            if result['OK'] and result['Value'].strip():
                pA.setValueByKey('Site', result['Value'].strip())
            else:
                pA.setValueByKey('Site', 'Unknown')
            pA.setValueByKey('GridCE', pData['DestinationSite'])
            pA.setValueByKey('GridMiddleware', pData['GridType'])
            pA.setValueByKey('GridResourceBroker', pData['Broker'])
            pA.setValueByKey('GridStatus', pData['Status'])
            pA.setValueByKey('Jobs', len(pData['Jobs']) if 'Jobs' in pData else 0)
            self.log.verbose("Added accounting record for pilot %s" % pData['PilotID'])
            retVal = gDataStoreClient.addRegister(pA)
            if not retVal['OK']:
                return retVal
        return S_OK()

    def _killPilots(self, acc):
        """ Kill, through DiracAdmin, every pilot referenced in ``acc``.

        :param dict acc: dictionary keyed by pilot reference
        """
        for i in sorted(acc.keys()):
            result = self.diracadmin.getPilotInfo(i)
            if result['OK'] and i in result['Value'] and 'Status' in result['Value'][i]:
                ret = self.diracadmin.killPilot(str(i))
                if ret['OK']:
                    self.log.info("Successfully deleted: %s (Status : %s)" % (i, result['Value'][i]['Status']))
                else:
                    self.log.error("Failed to delete pilot: ", "%s : %s" % (i, ret['Message']))
            else:
                self.log.error("Failed to get pilot info", "%s : %s" % (i, str(result)))

    def _checkJobLastUpdateTime(self, joblist, StalledDays):
        """ Tell whether any job of ``joblist`` was updated within ``StalledDays`` days.

        :param joblist: iterable of job IDs (converted with int())
        :param StalledDays: number of days defining the time limit
        :return: True as soon as one recently updated job is found, False otherwise
        """
        timeLimitToConsider = Time.dateTime() - Time.day * StalledDays
        ret = False
        for jobID in joblist:
            result = self.jobDB.getJobAttributes(int(jobID))
            if result['OK']:
                if 'LastUpdateTime' in result['Value']:
                    lastUpdateTime = result['Value']['LastUpdateTime']
                    if Time.fromString(lastUpdateTime) > timeLimitToConsider:
                        ret = True
                        self.log.debug(
                            'Since %s updates LastUpdateTime on %s this does not to need to be deleted.' %
                            (str(jobID), str(lastUpdateTime)))
                        break
            else:
                self.log.error("Error taking job info from DB", result['Message'])
        return ret
def test_PilotsDB(self):
    """ Exercise the pilot calls of the WMSAdministrator service end to end.

    Two pilots are added in turn through the service; counters, output,
    info, summaries, selectors, flags and status are checked, and each
    pilot is deleted from the PilotAgentsDB afterwards.
    """
    wmsAdministrator = RPCClient('WorkloadManagement/WMSAdministrator')
    pilotAgentDB = PilotAgentsDB()

    # First pilot: only the counters are checked before it is removed.
    res = wmsAdministrator.addPilotTQReference(['aPilot'], 1, '/a/ownerDN', 'a/owner/Group')
    self.assertTrue(res['OK'])
    res = wmsAdministrator.getCurrentPilotCounters({})
    self.assertTrue(res['OK'])
    # Plain int: the 'L' long suffix is a syntax error on Python 3 and
    # compares equal to a long on Python 2.
    self.assertEqual(res['Value'], {'Submitted': 1})
    res = pilotAgentDB.deletePilot('aPilot')
    self.assertTrue(res['OK'])
    res = wmsAdministrator.getCurrentPilotCounters({})
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {})

    # Second pilot: used for all the remaining calls.
    res = wmsAdministrator.addPilotTQReference(['anotherPilot'], 1, '/a/ownerDN', 'a/owner/Group')
    self.assertTrue(res['OK'])
    res = wmsAdministrator.storePilotOutput('anotherPilot', 'This is an output', 'this is an error')
    self.assertTrue(res['OK'])
    res = wmsAdministrator.getPilotOutput('anotherPilot')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {'OwnerDN': '/a/ownerDN',
                                    'OwnerGroup': 'a/owner/Group',
                                    'StdErr': 'this is an error',
                                    'FileList': [],
                                    'StdOut': 'This is an output'})
    # need a job for the following
    # res = wmsAdministrator.getJobPilotOutput( 1 )
    # self.assertEqual( res['Value'], {'OwnerDN': '/a/ownerDN', 'OwnerGroup': 'a/owner/Group',
    #                                  'StdErr': 'this is an error', 'FileList': [], 'StdOut': 'This is an output'} )
    # self.assert_( res['OK'] )
    res = wmsAdministrator.getPilotInfo('anotherPilot')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['anotherPilot']['AccountingSent'], 'False')
    self.assertEqual(res['Value']['anotherPilot']['PilotJobReference'], 'anotherPilot')

    res = wmsAdministrator.selectPilots({})
    self.assertTrue(res['OK'])
    # res = wmsAdministrator.getPilotLoggingInfo( 'anotherPilot' )
    # self.assert_( res['OK'] )
    res = wmsAdministrator.getPilotSummary('', '')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['Total']['Submitted'], 1)
    res = wmsAdministrator.getPilotMonitorWeb({}, [], 0, 100)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['TotalRecords'], 1)
    res = wmsAdministrator.getPilotMonitorSelectors()
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {'GridType': ['DIRAC'],
                                    'OwnerGroup': ['a/owner/Group'],
                                    'DestinationSite': ['NotAssigned'],
                                    'Broker': ['Unknown'],
                                    'Status': ['Submitted'],
                                    'OwnerDN': ['/a/ownerDN'],
                                    'GridSite': ['Unknown'],
                                    'Owner': []})
    res = wmsAdministrator.getPilotSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['TotalRecords'], 1)

    # Mutating calls, followed by a check that the changes are visible.
    res = wmsAdministrator.setAccountingFlag('anotherPilot', 'True')
    self.assertTrue(res['OK'])
    res = wmsAdministrator.setPilotStatus('anotherPilot', 'Running')
    self.assertTrue(res['OK'])
    res = wmsAdministrator.getPilotInfo('anotherPilot')
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value']['anotherPilot']['AccountingSent'], 'True')
    self.assertEqual(res['Value']['anotherPilot']['Status'], 'Running')

    res = wmsAdministrator.setJobForPilot(123, 'anotherPilot')
    self.assertTrue(res['OK'])
    res = wmsAdministrator.setPilotBenchmark('anotherPilot', 12.3)
    self.assertTrue(res['OK'])
    res = wmsAdministrator.countPilots({})
    self.assertTrue(res['OK'])
    # res = wmsAdministrator.getCounters()
    # # getPilotStatistics

    # Clean up so the DB is left without pilots.
    res = pilotAgentDB.deletePilot('anotherPilot')
    self.assertTrue(res['OK'])
    res = wmsAdministrator.getCurrentPilotCounters({})
    self.assertTrue(res['OK'])
    self.assertEqual(res['Value'], {})