def initialize(self):
    """ Sets defaults
    """
    self.am_setOption("PollingTime", 120)
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
        self.prod_types = agentTSTypes
    else:
        self.prod_types = Operations().getValue('Transformations/DataProcessing', ['MCSimulation', 'Merge'])
    gLogger.info("Will exclude the following Production types from cleaning %s" % (', '.join(self.prod_types)))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500)
    self.jobByJob = self.am_getOption('JobByJob', False)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    self.removeStatusDelay['Done'] = self.am_getOption('RemoveStatusDelay/Done', 7)
    self.removeStatusDelay['Killed'] = self.am_getOption('RemoveStatusDelay/Killed', 7)
    self.removeStatusDelay['Failed'] = self.am_getOption('RemoveStatusDelay/Failed', 7)
    self.removeStatusDelay['Any'] = self.am_getOption('RemoveStatusDelay/Any', -1)
    return S_OK()
def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None):
    """ c'tor
    """
    if pilotAgentsDB:
        self.pilotAgentsDB = pilotAgentsDB
    else:
        self.pilotAgentsDB = PilotAgentsDB()
    if jobDB:
        self.jobDB = jobDB
    else:
        self.jobDB = JobDB()
    if tqDB:
        self.tqDB = tqDB
    else:
        self.tqDB = TaskQueueDB()
    if jlDB:
        self.jlDB = jlDB
    else:
        self.jlDB = JobLoggingDB()
    if opsHelper:
        self.opsHelper = opsHelper
    else:
        self.opsHelper = Operations()

    self.log = gLogger.getSubLogger("Matcher")
    self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper)
    self.siteClient = SiteStatus()
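# The keyword arguments of the constructor above exist so callers (and unit
# tests) can inject their own DB handles instead of opening real connections.
# A minimal sketch of that use, assuming unittest.mock is acceptable in the
# test suite; note that Limiter and SiteStatus are still built for real
# inside __init__, so a complete test would patch those as well:
from unittest.mock import MagicMock

def _makeTestMatcher():
    """Return a Matcher wired to stub DBs (illustrative test helper)."""
    stubJobDB = MagicMock()
    stubJobDB.getJobAttributes.return_value = {'OK': True, 'Value': {}}
    return Matcher(pilotAgentsDB=MagicMock(), jobDB=stubJobDB,
                   tqDB=MagicMock(), jlDB=MagicMock(), opsHelper=MagicMock())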
def cleanTaskQueues():
    tqDB = TaskQueueDB()
    jobDB = JobDB()
    logDB = JobLoggingDB()

    result = tqDB.enableAllTaskQueues()
    if not result['OK']:
        return result
    result = tqDB.findOrphanJobs()
    if not result['OK']:
        return result
    for jid in result['Value']:
        result = tqDB.deleteJob(jid)
        if not result['OK']:
            gLogger.error("Cannot delete from TQ job %s" % jid, result['Message'])
            continue
        result = jobDB.rescheduleJob(jid)
        if not result['OK']:
            gLogger.error("Cannot reschedule in JobDB job %s" % jid, result['Message'])
            continue
        result = logDB.addLoggingRecord(jid, JobStatus.RECEIVED, "", "", source="JobState")
        if not result['OK']:
            gLogger.error("Cannot add logging record in JobLoggingDB %s" % jid, result['Message'])
            continue
    return S_OK()
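# cleanTaskQueues() above is a self-contained housekeeping helper: it
# re-enables all task queues, then deletes and reschedules any "orphan" job
# still referenced by a task queue. A minimal sketch of invoking it from a
# DIRAC script, assuming the module already imports gLogger; the Script
# bootstrap shown is the standard DIRAC one:
if __name__ == "__main__":
    from DIRAC.Core.Base import Script
    Script.parseCommandLine(ignoreErrors=True)
    res = cleanTaskQueues()
    if not res['OK']:
        gLogger.error("Task queue cleaning failed", res['Message'])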
def initialize(self, jobDB=False, logDB=False):
    """ Initialization of the Optimizer Agent.
    """
    if not jobDB:
        self.jobDB = JobDB()
    else:
        self.jobDB = jobDB
    if not logDB:
        self.logDB = JobLoggingDB()
    else:
        self.logDB = logDB

    trailing = "Agent"
    optimizerName = self.am_getModuleParam('agentName')
    if optimizerName[-len(trailing):].find(trailing) == 0:
        optimizerName = optimizerName[:-len(trailing)]
    self.am_setModuleParam('optimizerName', optimizerName)

    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.startingMajorStatus = "Checking"
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.requiredJobInfo = 'jdl'
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
def initialize(self, jobDB=None, logDB=None):
    """Initialization of the Optimizer Agent."""
    self.jobDB = JobDB() if jobDB is None else jobDB
    if not self.jobDB.isValid():
        dExit(1)

    useESForJobParametersFlag = Operations().getValue("/Services/JobMonitoring/useESForJobParametersFlag", False)
    if useESForJobParametersFlag:
        try:
            result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.ElasticJobParametersDB",
                                               "ElasticJobParametersDB")
            if not result["OK"]:
                return result
            self.elasticJobParametersDB = result["Value"]()
        except RuntimeError as excp:
            return S_ERROR("Can't connect to DB: %s" % excp)

    self.logDB = JobLoggingDB() if logDB is None else logDB

    optimizerName = self.am_getModuleParam("agentName")
    if optimizerName.endswith("Agent"):
        optimizerName = optimizerName[:-len("Agent")]
    self.am_setModuleParam("optimizerName", optimizerName)

    self.startingMinorStatus = self.am_getModuleParam("optimizerName")
    self.failedStatus = self.am_getOption("FailedJobStatus", JobStatus.FAILED)
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
def initialize(self):
    """Sets default parameters"""
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()

    # getting parameters
    if not self.am_getOption("Enable", True):
        self.log.info("Stalled Job Agent running in disabled mode")

    wms_instance = getSystemInstance("WorkloadManagement")
    if not wms_instance:
        return S_ERROR("Can not get the WorkloadManagement system instance")

    self.stalledJobsTolerantSites = self.am_getOption("StalledJobsTolerantSites", [])
    self.stalledJobsToleranceTime = self.am_getOption("StalledJobsToleranceTime", 0)
    self.stalledJobsToRescheduleSites = self.am_getOption("StalledJobsToRescheduleSites", [])
    self.submittingTime = self.am_getOption("SubmittingTime", self.submittingTime)
    self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
    self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)

    wrapperSection = cfgPath("Systems", "WorkloadManagement", wms_instance, "JobWrapper")

    failedTime = self.am_getOption("FailedTimeHours", 6)
    watchdogCycle = gConfig.getValue(cfgPath(wrapperSection, "CheckingTime"), 30 * 60)
    watchdogCycle = max(watchdogCycle, gConfig.getValue(cfgPath(wrapperSection, "MinCheckingTime"), 20 * 60))
    stalledTime = self.am_getOption("StalledTimeHours", 2)
    self.log.verbose("", "StalledTime = %s cycles" % (stalledTime))
    self.stalledTime = int(watchdogCycle * (stalledTime + 0.5))
    self.log.verbose("", "FailedTime = %s cycles" % (failedTime))

    # Add half cycle to avoid race conditions
    self.failedTime = int(watchdogCycle * (failedTime + 0.5))

    self.minorStalledStatuses = (
        JobMinorStatus.STALLED_PILOT_NOT_RUNNING,
        "Stalling for more than %d sec" % self.failedTime,
    )

    # setting up the threading
    maxNumberOfThreads = self.am_getOption("MaxNumberOfThreads", 15)
    self.log.verbose("Multithreaded with %d threads" % maxNumberOfThreads)
    self.threadPoolExecutor = concurrent.futures.ThreadPoolExecutor(max_workers=maxNumberOfThreads)

    return S_OK()
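# A sketch of how the executor created in initialize() above is typically
# drained during an agent cycle. The per-job method name _checkJob is an
# assumption for illustration only; concurrent.futures is already imported
# by the module:
def execute(self):
    result = self.jobDB.selectJobs({'Status': 'Running'})
    if not result['OK']:
        return result
    # Fan the per-job checks out to the pool, then wait for this cycle's work
    futures = [self.threadPoolExecutor.submit(self._checkJob, jobID)
               for jobID in result['Value']]
    concurrent.futures.wait(futures)
    return S_OK()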
def initialize(self):
    """Sets default parameters
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption('PollingTime', 60 * 60)
    if not self.am_getOption('Enable', True):
        self.log.info('Stalled Job Agent running in disabled mode')
    return S_OK()
def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ):
  """ c'tor
  """
  if pilotAgentsDB:
    self.pilotAgentsDB = pilotAgentsDB
  else:
    self.pilotAgentsDB = PilotAgentsDB()
  if jobDB:
    self.jobDB = jobDB
  else:
    self.jobDB = JobDB()
  if tqDB:
    self.tqDB = tqDB
  else:
    self.tqDB = TaskQueueDB()
  if jlDB:
    self.jlDB = jlDB
  else:
    self.jlDB = JobLoggingDB()
  if opsHelper:
    self.opsHelper = opsHelper
  else:
    self.opsHelper = Operations()

  self.log = gLogger.getSubLogger( "Matcher" )
  self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper )
def initializeMatcherHandler(serviceInfo):
    """ Matcher Service initialization
    """
    global gJobDB
    global gTaskQueueDB
    global jlDB
    global pilotAgentsDB

    gJobDB = JobDB()
    gTaskQueueDB = TaskQueueDB()
    jlDB = JobLoggingDB()
    pilotAgentsDB = PilotAgentsDB()

    gMonitor.registerActivity('matchTime', "Job matching time", 'Matching', "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues", 'Matching', "task queues", gMonitor.OP_MEAN, 300)

    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120, gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)
    sendNumTaskQueues()
    return S_OK()
def initialize( self ):
  """ Sets defaults
  """
  self.am_setOption( "PollingTime", 120 )
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  agentTSTypes = self.am_getOption('ProductionTypes', [])
  if agentTSTypes:
    self.prod_types = agentTSTypes
  else:
    self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
  gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )
  self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
  self.jobByJob = self.am_getOption( 'JobByJob', False )
  self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
  self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
  self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
  self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )
  self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 )
  return S_OK()
def initializeJobMonitoringHandler(serviceInfo):
    global jobDB, jobLoggingDB, taskQueueDB
    jobDB = JobDB()
    jobLoggingDB = JobLoggingDB()
    taskQueueDB = TaskQueueDB()
    return S_OK()
def initializeJobMonitoringHandler(serviceInfo):
    global gJobDB, gJobLoggingDB, gTaskQueueDB
    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gTaskQueueDB = TaskQueueDB()
    return S_OK()
def initialize( self, jobDB = False, logDB = False ):
  """ Initialization of the Optimizer Agent.
  """
  if not jobDB:
    self.jobDB = JobDB()
  else:
    self.jobDB = jobDB
  if not logDB:
    self.logDB = JobLoggingDB()
  else:
    self.logDB = logDB

  trailing = "Agent"
  optimizerName = self.am_getModuleParam( 'agentName' )
  if optimizerName[ -len( trailing ): ].find( trailing ) == 0:
    optimizerName = optimizerName[ :-len( trailing ) ]
  self.am_setModuleParam( 'optimizerName', optimizerName )

  self.startingMinorStatus = self.am_getModuleParam( 'optimizerName' )
  self.startingMajorStatus = "Checking"
  self.failedStatus = self.am_getOption( "FailedJobStatus", 'Failed' )
  self.requiredJobInfo = 'jdl'
  self.am_setOption( "PollingTime", 30 )

  return self.initializeOptimizer()
def checkDBAccess(cls):
    # Init DB if there
    if not JobState.__db.checked:
        JobState.__db.jobDB = JobDB()
        JobState.__db.logDB = JobLoggingDB()
        JobState.__db.tqDB = TaskQueueDB()
        JobState.__db.checked = True
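# checkDBAccess() above assumes a class-level holder that caches the DB
# handles after the first call, so the three connections are opened lazily
# and shared by all JobState instances. A minimal sketch of that pattern;
# the holder class name is hypothetical, only the attribute names are taken
# from the code above:
class _DBHolder(object):
    def __init__(self):
        self.checked = False
        self.jobDB = None
        self.logDB = None
        self.tqDB = None

class JobState(object):
    __db = _DBHolder()  # shared, filled in on first checkDBAccess() call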
def initializeJobStateUpdateHandler(serviceInfo):
    global jobDB
    global logDB
    jobDB = JobDB()
    logDB = JobLoggingDB()
    return S_OK()
def initializeJobManagerHandler(serviceInfo):
    global gJobDB, gJobLoggingDB, gtaskQueueDB
    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    return S_OK()
def initialize(self):
    """ Standard constructor
    """
    self.jobDB = JobDB()
    self.jobLoggingDB = JobLoggingDB()
    self._optimizers = {}
    self.am_setOption("PollingTime", 30)
    return S_OK()
def initializeMatcherHandler(serviceInfo):
    """ Matcher Service initialization
    """
    global gJobDB
    global gJobLoggingDB
    global gTaskQueueDB
    global gPilotAgentsDB

    # Create JobDB object and initialize its tables.
    gJobDB = JobDB()
    res = gJobDB._checkTable()
    if not res['OK']:
        return res

    # Create JobLoggingDB object and initialize its tables.
    gJobLoggingDB = JobLoggingDB()
    res = gJobLoggingDB._checkTable()
    if not res['OK']:
        return res

    gTaskQueueDB = TaskQueueDB()

    # Create PilotAgentsDB object and initialize its tables.
    gPilotAgentsDB = PilotAgentsDB()
    res = gPilotAgentsDB._checkTable()
    if not res['OK']:
        return res

    gMonitor.registerActivity('matchTime', "Job matching time", 'Matching', "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues", 'Matching', "task queues", gMonitor.OP_MEAN, 300)

    gTaskQueueDB.recalculateTQSharesForAll()
    gThreadScheduler.addPeriodicTask(120, gTaskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, sendNumTaskQueues)
    sendNumTaskQueues()
    return S_OK()
def initialize(self):
    """Sets default parameters
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption('PollingTime', 60 * 60)
    if not self.am_getOption('Enable', True):
        self.log.info('Stalled Job Agent running in disabled mode')

    # setting up the threading
    maxNumberOfThreads = self.am_getOption('MaxNumberOfThreads', 15)
    threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads)
    self.log.verbose("Multithreaded with %d threads" % maxNumberOfThreads)
    for _ in range(maxNumberOfThreads):
        threadPool.generateJobAndQueueIt(self._execute)

    return S_OK()
def initialize( self ):
  """Sets default parameters
  """
  self.jobDB = JobDB()
  self.logDB = JobLoggingDB()
  self.am_setOption( 'PollingTime', 60 * 60 )
  if not self.am_getOption( 'Enable', True ):
    self.log.info( 'Stalled Job Agent running in disabled mode' )
  return S_OK()
def initialize( self ):
  """Sets defaults
  """
  self.am_setOption( "PollingTime", 60 )
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  agentTSTypes = self.am_getOption('ProductionTypes', [])
  if agentTSTypes:
    self.prod_types = agentTSTypes
  else:
    self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
  gLogger.info( 'Will exclude the following Production types from cleaning %s' % ', '.join( self.prod_types ) )
  self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 200 )
  self.jobByJob = self.am_getOption( 'JobByJob', True )
  self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
  return S_OK()
def initialize(self, jobDB=None, logDB=None):
    """ Initialization of the Optimizer Agent.
    """
    self.jobDB = JobDB() if jobDB is None else jobDB
    if not self.jobDB.isValid():
        dExit(1)

    self.logDB = JobLoggingDB() if logDB is None else logDB

    optimizerName = self.am_getModuleParam('agentName')
    if optimizerName.endswith('Agent'):
        optimizerName = optimizerName[:-len('Agent')]
    self.am_setModuleParam('optimizerName', optimizerName)

    self.startingMinorStatus = self.am_getModuleParam('optimizerName')
    self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed')
    self.am_setOption("PollingTime", 30)

    return self.initializeOptimizer()
def initializeHandler(cls, svcInfoDict):
    cls.gJobDB = JobDB()
    cls.gJobLoggingDB = JobLoggingDB()
    cls.gTaskQueueDB = TaskQueueDB()
    cls.gElasticJobParametersDB = None
    useESForJobParametersFlag = Operations().getValue('/Services/JobMonitoring/useESForJobParametersFlag', False)
    if useESForJobParametersFlag:
        cls.gElasticJobParametersDB = ElasticJobParametersDB()
    return S_OK()
def initializeHandler(cls, svcInfoDict):
    """ Determines the switching of ElasticSearch and MySQL backends
    """
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.elasticJobParametersDB = None
    useESForJobParametersFlag = Operations().getValue('/Services/JobMonitoring/useESForJobParametersFlag', False)
    if useESForJobParametersFlag:
        cls.elasticJobParametersDB = ElasticJobParametersDB()
    return S_OK()
def __updateJobStatus(self, jobID, status, minorstatus=None):
    """ This method updates the job status in the JobDB

        FIXME: Use the JobStateUpdate service instead of the JobDB
    """
    self.log.verbose("self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % (jobID, status))

    from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
    jobDB = JobDB()
    if self.enabled:
        result = jobDB.setJobAttribute(jobID, 'Status', status, update=True)
    else:
        return S_OK('DisabledMode')
    if not result['OK']:
        self.log.error("Failed to update job status", result['Message'])
        raise RuntimeError("Failed to update job status")

    if minorstatus is None:  # Retain last minor status for stalled jobs
        result = jobDB.getJobAttributes(jobID, ['MinorStatus'])
        if result['OK']:
            minorstatus = result['Value']['MinorStatus']
        else:
            self.log.error("Failed to get Minor Status", result['Message'])
            raise RuntimeError("Failed to get Minorstatus")
    else:
        self.log.verbose("self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % (jobID, minorstatus))
        result = jobDB.setJobAttribute(jobID, 'MinorStatus', minorstatus, update=True)

    logStatus = status
    from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
    result = JobLoggingDB().addLoggingRecord(jobID, status=logStatus, minor=minorstatus,
                                             source='DataRecoveryAgent')
    if not result['OK']:
        # just the logging entry, no big loss so no exception
        self.log.warn(result)

    return result
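# A sketch of what the FIXME above asks for: updating the status through the
# JobStateUpdate service instead of writing to JobDB directly. This assumes
# the standard WorkloadManagement/JobStateUpdate service; treat the exact
# setJobStatus signature as an assumption to verify against your DIRAC
# version:
from DIRAC.Core.DISET.RPCClient import RPCClient

def _updateJobStatusViaService(jobID, status, minorStatus=''):
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    # setJobStatus( jobID, status, minorStatus, source )
    return jobStateUpdate.setJobStatus(jobID, status, minorStatus, 'DataRecoveryAgent')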
def initialize( self ):
  """Sets defaults
  """
  self.am_setOption( "PollingTime", 60 )
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  self.prod_types = self.am_getOption( 'ProductionTypes', ['DataReconstruction', 'DataStripping',
                                                           'MCSimulation', 'Merge', 'production'] )
  gLogger.info( 'Will exclude the following Production types from cleaning %s' % ', '.join( self.prod_types ) )
  self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 200 )
  self.jobByJob = self.am_getOption( 'JobByJob', True )
  self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
  return S_OK()
def initializeJobManagerHandler(serviceInfo):
    global gJobDB, gJobLoggingDB, gtaskQueueDB, enablePilotsLogging, gPilotAgentsDB, gPilotsLoggingDB

    gJobDB = JobDB()
    gJobLoggingDB = JobLoggingDB()
    gtaskQueueDB = TaskQueueDB()
    gPilotAgentsDB = PilotAgentsDB()

    # there is a problem with accessing CS with shorter paths, so full path is extracted from serviceInfo dict
    enablePilotsLogging = gConfig.getValue(
        serviceInfo['serviceSectionPath'].replace('JobManager', 'PilotsLogging') + '/Enable',
        'False').lower() in ('yes', 'true')

    if enablePilotsLogging:
        gPilotsLoggingDB = PilotsLoggingDB()
    return S_OK()
def initializeHandler(cls, serviceInfoDict):
    """ Initialization of DB objects and OptimizationMind
    """
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()
    cls.pilotsLoggingDB = None
    enablePilotsLogging = Operations().getValue('/Services/JobMonitoring/usePilotsLoggingFlag', False)
    if enablePilotsLogging:
        cls.pilotsLoggingDB = PilotsLoggingDB()

    cls.msgClient = MessageClient("WorkloadManagement/OptimizationMind")
    cls.__connectToOptMind()
    gThreadScheduler.addPeriodicTask(60, cls.__connectToOptMind)
    return S_OK()
def initialize( self ):
  """Sets defaults
  """
  self.am_setOption( "PollingTime", 60 )
  self.jobDB = JobDB()
  self.taskQueueDB = TaskQueueDB()
  self.jobLoggingDB = JobLoggingDB()
  # self.sandboxDB = SandboxDB( 'SandboxDB' )
  agentTSTypes = self.am_getOption('ProductionTypes', [])
  if agentTSTypes:
    self.prod_types = agentTSTypes
  else:
    self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
  gLogger.info( 'Will exclude the following Production types from cleaning %s' % ', '.join( self.prod_types ) )
  self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 100 )
  self.jobByJob = self.am_getOption( 'JobByJob', True )
  self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
  return S_OK()
def initializeHandler(cls, serviceInfoDict):
    cls.jobDB = JobDB()
    cls.jobLoggingDB = JobLoggingDB()
    cls.taskQueueDB = TaskQueueDB()
    cls.pilotAgentsDB = PilotAgentsDB()
    cls.limiter = Limiter(jobDB=cls.jobDB)

    cls.taskQueueDB.recalculateTQSharesForAll()

    gMonitor.registerActivity('matchTime', "Job matching time", 'Matching', "secs", gMonitor.OP_MEAN, 300)
    gMonitor.registerActivity('matchesDone', "Job Match Request", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('matchesOK', "Matched jobs", 'Matching', "matches", gMonitor.OP_RATE, 300)
    gMonitor.registerActivity('numTQs', "Number of Task Queues", 'Matching', "task queues", gMonitor.OP_MEAN, 300)

    gThreadScheduler.addPeriodicTask(120, cls.taskQueueDB.recalculateTQSharesForAll)
    gThreadScheduler.addPeriodicTask(60, cls.sendNumTaskQueues)
    cls.sendNumTaskQueues()
    return S_OK()
def initializeMatcherHandler( serviceInfo ):
  """ Matcher Service initialization
  """
  global jobDB
  global jobLoggingDB
  global taskQueueDB

  jobDB = JobDB()
  jobLoggingDB = JobLoggingDB()
  taskQueueDB = TaskQueueDB()

  gMonitor.registerActivity( 'matchTime', "Job matching time", 'Matching', "secs", gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'matchTaskQueues', "Task queues checked per job", 'Matching', "task queues", gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'matchesDone', "Job Matches", 'Matching', "matches", gMonitor.OP_MEAN, 300 )
  gMonitor.registerActivity( 'numTQs', "Number of Task Queues", 'Matching', "task queues", gMonitor.OP_MEAN, 300 )

  taskQueueDB.recalculateTQSharesForAll()
  gThreadScheduler.addPeriodicTask( 120, taskQueueDB.recalculateTQSharesForAll )
  gThreadScheduler.addPeriodicTask( 120, sendNumTaskQueues )
  sendNumTaskQueues()
  return S_OK()
class JobCleaningAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )

    # clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.

    self.removeStatusDelay = {'Done': 7, 'Killed': 1, 'Failed': 7}

  #############################################################################
  def initialize( self ):
    """ Sets defaults
    """
    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )
    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in self.removeStatusDelay:
      delay = self.removeStatusDelay[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    jobList = result['Value']
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result

    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) )

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements
    """
    failed = {}
    successful = {}
    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful' : successful, 'Failed' : failed} )

    # Schedule removal of the LFNs now
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue
      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful' : successful, 'Failed' : failed}
    return S_OK( result )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn
    removeFile.addFile( removedFile )

    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
class JobCleaningAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """Sets defaults
    """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( 'Will exclude the following Production types from cleaning %s' % ', '.join( self.prod_types ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 100 )
    self.jobByJob = self.am_getOption( 'JobByJob', True )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
    return S_OK()

  def __getAllowedJobTypes( self ):
    # Get valid jobTypes
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """The PilotAgent execution method.
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    jobList = result['Value']
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result

    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) )

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements
    """
    failed = {}
    successful = {}
    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful' : successful, 'Failed' : failed} )

    # Schedule removal of the LFNs now
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue
      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful' : successful, 'Failed' : failed}
    return S_OK( result )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'

    removedFile = File()
    removedFile.LFN = lfn
    removeFile.addFile( removedFile )

    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
class OptimizerModule(AgentModule):
    """
        The specific agents must provide the following methods:
        * initialize() for initial settings
        * beginExecution()
        * execute() - the main method called in the agent cycle
        * endExecution()
        * finalize() - the graceful exit of the method, this one is usually used for the agent restart
    """

    #############################################################################
    def __init__(self, *args, **kwargs):
        """ c'tor
        """
        AgentModule.__init__(self, *args, **kwargs)

        self.jobDB = None
        self.logDB = None
        self.startingMinorStatus = None
        self.startingMajorStatus = JobStatus.CHECKING
        self.failedStatus = None
        self.requiredJobInfo = 'jdl'
        self._initResult = None

    def initialize(self, jobDB=None, logDB=None):
        """ Initialization of the Optimizer Agent.
        """
        self.jobDB = JobDB() if jobDB is None else jobDB
        if not self.jobDB.isValid():
            dExit(1)

        self.logDB = JobLoggingDB() if logDB is None else logDB

        optimizerName = self.am_getModuleParam('agentName')
        if optimizerName.endswith('Agent'):
            optimizerName = optimizerName[:-len('Agent')]
        self.am_setModuleParam('optimizerName', optimizerName)

        self.startingMinorStatus = self.am_getModuleParam('optimizerName')
        self.failedStatus = self.am_getOption("FailedJobStatus", JobStatus.FAILED)
        self.am_setOption("PollingTime", 30)

        return self.initializeOptimizer()

    def initializeOptimizer(self):
        """ To be overwritten by inheriting class
        """
        return S_OK()

    #############################################################################
    def execute(self):
        """ The main agent execution method
        """
        result = self.initializeOptimizer()
        if not result['OK']:
            return result
        self._initResult = result['Value']

        condition = {'Status': self.startingMajorStatus}
        if self.startingMinorStatus:
            condition['MinorStatus'] = self.startingMinorStatus

        result = self.jobDB.selectJobs(condition)
        if not result['OK']:
            self.log.warn('Failed to get a job list from the JobDB')
            return S_ERROR('Failed to get a job list from the JobDB')

        if not result['Value']:
            self.log.verbose('No pending jobs to process')
            return S_OK('No work to do')

        for job in result['Value']:
            result = self.getJobDefinition(job)
            if not result['OK']:
                self.setFailedJob(job, result['Message'], '')
                continue
            jobDef = result['Value']
            result = self.optimizeJob(job, jobDef['classad'])

        return S_OK()

    #############################################################################
    def optimizeJob(self, job, classAdJob):
        """ Call the corresponding Optimizer checkJob method
        """
        self.log.verbose('Job will be processed',
                         '%s by %sExecutor' % (job, self.am_getModuleParam('optimizerName')))
        result = self.checkJob(job, classAdJob)
        if not result['OK']:
            self.setFailedJob(job, result['Message'], classAdJob)
        return result

    #############################################################################
    def getJobDefinition(self, job, jobDef=False):
        """ Retrieve JDL of the Job and return jobDef dictionary
        """
        if not jobDef:
            jobDef = {}
        # If not jdl in jobinfo load it
        if 'jdl' not in jobDef:
            if self.requiredJobInfo == 'jdlOriginal':
                result = self.jobDB.getJobJDL(job, original=True)
                if not result['OK']:
                    self.log.error("No JDL for job", "%s" % job)
                    return S_ERROR("No JDL for job")
                jobDef['jdl'] = result['Value']
            if self.requiredJobInfo == 'jdl':
                result = self.jobDB.getJobJDL(job)
                if not result['OK']:
                    self.log.error("No JDL for job", "%s" % job)
                    return S_ERROR("No JDL for job")
                jobDef['jdl'] = result['Value']
        # Load the classad if needed
        if 'jdl' in jobDef and 'classad' not in jobDef:
            try:
                classad = ClassAd(jobDef['jdl'])
            except BaseException:
                self.log.debug("Cannot load JDL")
                return S_ERROR('Illegal Job JDL')
            if not classad.isOK():
                self.log.debug("Warning: illegal JDL for job %s, will be marked problematic" % (job))
                return S_ERROR('Illegal Job JDL')
            jobDef['classad'] = classad
        return S_OK(jobDef)

    #############################################################################
    def getOptimizerJobInfo(self, job, reportName):
        """This method gets job optimizer information that will be used for
        """
        self.log.debug("self.jobDB.getJobOptParameter(%s,'%s')" % (job, reportName))
        result = self.jobDB.getJobOptParameter(job, reportName)
        if result['OK']:
            value = result['Value']
            if not value:
                self.log.warn('JobDB returned null value', 'for %s %s' % (job, reportName))
                return S_ERROR('No optimizer info returned')
            else:
                try:
                    return S_OK(eval(value))
                except BaseException as x:
                    return S_ERROR('Could not evaluate optimizer parameters: %s' % repr(x))
        return result

    #############################################################################
    def setOptimizerJobInfo(self, job, reportName, value):
        """This method sets the job optimizer information that will subsequently
           be used for job scheduling and TURL queries on the WN.
        """
        self.log.debug("self.jobDB.setJobOptParameter(%s,'%s','%s')" % (job, reportName, value))
        if not self.am_Enabled():
            return S_OK()
        return self.jobDB.setJobOptParameter(job, reportName, str(value))

    #############################################################################
    def setOptimizerChain(self, job, value):
        """This method sets the job optimizer chain, in principle only needed by
           one of the optimizers.
        """
        self.log.debug("self.jobDB.setOptimizerChain(%s,%s)" % (job, value))
        if not self.am_Enabled():
            return S_OK()
        return self.jobDB.setOptimizerChain(job, value)

    #############################################################################
    def setNextOptimizer(self, job):
        """This method is executed when the optimizer instance has successfully
           processed the job. The next optimizer in the chain will subsequently
           start to work on the job.
        """
        result = self.logDB.addLoggingRecord(job, status=self.startingMajorStatus,
                                             minor=self.startingMinorStatus,
                                             source=self.am_getModuleParam("optimizerName"))
        if not result['OK']:
            self.log.warn("Can't add logging record", result['Message'])
        self.log.debug("self.jobDB.setNextOptimizer(%s,'%s')" % (job, self.am_getModuleParam("optimizerName")))
        return self.jobDB.setNextOptimizer(job, self.am_getModuleParam("optimizerName"))

    #############################################################################
    def updateJobStatus(self, job, status, minorStatus=None, appStatus=None):
        """This method updates the job status in the JobDB, this should only be
           used to fail jobs due to the optimizer chain.
        """
        self.log.debug("self.jobDB.setJobStatus(%s,'Status/Minor/Application','%s'/'%s'/'%s',update=True)" %
                       (job, status, str(minorStatus), str(appStatus)))
        if not self.am_Enabled():
            return S_OK()
        result = self.jobDB.setJobStatus(job, status, minorStatus, appStatus)
        if not result['OK']:
            return result

        result = self.logDB.addLoggingRecord(job, status=status, minor=minorStatus, application=appStatus,
                                             source=self.am_getModuleParam('optimizerName'))
        if not result['OK']:
            self.log.warn("Can't add logging record", result['Message'])

        return S_OK()

    #############################################################################
    def setJobParam(self, job, reportName, value):
        """This method updates a job parameter in the JobDB.
        """
        self.log.debug("self.jobDB.setJobParameter(%s,'%s','%s')" % (job, reportName, value))
        if not self.am_Enabled():
            return S_OK()
        return self.jobDB.setJobParameter(job, reportName, value)

    #############################################################################
    def setFailedJob(self, job, msg, classAdJob=None):
        """This method moves the job to the failed status
        """
        self.log.debug("self.updateJobStatus(%s,'%s','%s')" % (job, self.failedStatus, msg))
        if not self.am_Enabled():
            return S_OK()
        self.updateJobStatus(job, self.failedStatus, msg)
        if classAdJob:
            self.sendAccountingRecord(job, msg, classAdJob)

    #############################################################################
    def checkJob(self, job, classad):
        """This method controls the checking of the job, should be overridden in a subclass
        """
        self.log.warn('Optimizer: checkJob method should be implemented in a subclass')
        return S_ERROR('Optimizer: checkJob method should be implemented in a subclass')

    #############################################################################
    def sendAccountingRecord(self, job, msg, classAdJob):
        """ Send an accounting record for the failed job
        """
        accountingReport = AccountingJob()
        accountingReport.setStartTime()
        accountingReport.setEndTime()

        owner = classAdJob.getAttributeString('Owner')
        userGroup = classAdJob.getAttributeString('OwnerGroup')
        jobGroup = classAdJob.getAttributeString('JobGroup')
        jobType = classAdJob.getAttributeString('JobType')
        jobClass = 'unknown'
        if classAdJob.lookupAttribute('JobSplitType'):
            jobClass = classAdJob.getAttributeString('JobSplitType')
        inputData = []
        processingType = 'unknown'
        if classAdJob.lookupAttribute('ProcessingType'):
            processingType = classAdJob.getAttributeString('ProcessingType')
        if classAdJob.lookupAttribute('InputData'):
            inputData = classAdJob.getListFromExpression('InputData')
        inputDataFiles = len(inputData)
        outputData = []
        if classAdJob.lookupAttribute('OutputData'):
            outputData = classAdJob.getListFromExpression('OutputData')
        outputDataFiles = len(outputData)

        acData = {
            'User': owner,
            'UserGroup': userGroup,
            'JobGroup': jobGroup,
            'JobType': jobType,
            'JobClass': jobClass,
            'ProcessingType': processingType,
            'FinalMajorStatus': self.failedStatus,
            'FinalMinorStatus': msg,
            'CPUTime': 0.0,
            'NormCPUTime': 0.0,
            'ExecTime': 0.0,
            'InputDataSize': 0.0,
            'OutputDataSize': 0.0,
            'InputDataFiles': inputDataFiles,
            'OutputDataFiles': outputDataFiles,
            'DiskSpace': 0.0,
            'InputSandBoxSize': 0.0,
            'OutputSandBoxSize': 0.0,
            'ProcessedEvents': 0.0
        }
        self.log.debug('Accounting Report is:')
        self.log.debug(acData)
        accountingReport.setValuesFromDict(acData)
        return accountingReport.commit()
class StalledJobAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """
  jobDB = None
  logDB = None
  matchedTime = 7200
  rescheduledTime = 600
  completedTime = 86400

  #############################################################################
  def initialize( self ):
    """Sets default parameters
    """
    self.jobDB = JobDB()
    self.logDB = JobLoggingDB()
    self.am_setOption( 'PollingTime', 60 * 60 )
    if not self.am_getOption( 'Enable', True ):
      self.log.info( 'Stalled Job Agent running in disabled mode' )
    return S_OK()

  #############################################################################
  def execute( self ):
    """ The main agent execution method
    """
    self.log.verbose( 'Waking up Stalled Job Agent' )

    wms_instance = getSystemInstance( 'WorkloadManagement' )
    if not wms_instance:
      return S_ERROR( 'Can not get the WorkloadManagement system instance' )
    wrapperSection = cfgPath( 'Systems', 'WorkloadManagement', wms_instance, 'JobWrapper' )

    stalledTime = self.am_getOption( 'StalledTimeHours', 2 )
    failedTime = self.am_getOption( 'FailedTimeHours', 6 )

    self.matchedTime = self.am_getOption( 'MatchedTime', self.matchedTime )
    self.rescheduledTime = self.am_getOption( 'RescheduledTime', self.rescheduledTime )
    self.completedTime = self.am_getOption( 'CompletedTime', self.completedTime )

    self.log.verbose( 'StalledTime = %s cycles' % ( stalledTime ) )
    self.log.verbose( 'FailedTime = %s cycles' % ( failedTime ) )

    watchdogCycle = gConfig.getValue( cfgPath( wrapperSection, 'CheckingTime' ), 30 * 60 )
    watchdogCycle = max( watchdogCycle, gConfig.getValue( cfgPath( wrapperSection, 'MinCheckingTime' ), 20 * 60 ) )

    # Add half cycle to avoid race conditions
    stalledTime = watchdogCycle * ( stalledTime + 0.5 )
    failedTime = watchdogCycle * ( failedTime + 0.5 )

    result = self.__markStalledJobs( stalledTime )
    if not result['OK']:
      self.log.error( 'Failed to detect stalled jobs', result['Message'] )

    # Note, jobs will be revived automatically during the heartbeat signal phase and
    # subsequent status changes will result in jobs not being selected by the
    # stalled job agent.

    result = self.__failStalledJobs( failedTime )
    if not result['OK']:
      self.log.error( 'Failed to process stalled jobs', result['Message'] )

    result = self.__failCompletedJobs()
    if not result['OK']:
      self.log.error( 'Failed to process completed jobs', result['Message'] )

    result = self.__kickStuckJobs()
    if not result['OK']:
      self.log.error( 'Failed to kick stuck jobs', result['Message'] )

    return S_OK( 'Stalled Job Agent cycle complete' )

  #############################################################################
  def __markStalledJobs( self, stalledTime ):
    """ Identifies stalled jobs running without update longer than stalledTime.
    """
    stalledCounter = 0
    runningCounter = 0
    result = self.jobDB.selectJobs( {'Status' : 'Running'} )
    if not result['OK']:
      return result
    if not result['Value']:
      return S_OK()
    jobs = result['Value']
    self.log.info( '%s Running jobs will be checked for being stalled' % ( len( jobs ) ) )
    jobs.sort()
    # jobs = jobs[:10]  # for debugging
    for job in jobs:
      result = self.__getStalledJob( job, stalledTime )
      if result['OK']:
        self.log.verbose( 'Updating status to Stalled for job %s' % ( job ) )
        self.__updateJobStatus( job, 'Stalled' )
        stalledCounter += 1
      else:
        self.log.verbose( result['Message'] )
        runningCounter += 1
    self.log.info( 'Total jobs: %s, Stalled job count: %s, Running job count: %s' %
                   ( len( jobs ), stalledCounter, runningCounter ) )
    return S_OK()

  #############################################################################
  def __failStalledJobs( self, failedTime ):
    """ Changes the Stalled status to Failed for jobs long in the Stalled status
    """
    result = self.jobDB.selectJobs( {'Status' : 'Stalled'} )
    if not result['OK']:
      return result
    failedCounter = 0

    if result['Value']:
      jobs = result['Value']
      self.log.info( '%s Stalled jobs will be checked for failure' % ( len( jobs ) ) )

      for job in jobs:
        # Check if the job pilot is lost
        result = self.__getJobPilotStatus( job )
        if result['OK']:
          pilotStatus = result['Value']
          if pilotStatus != "Running":
            result = self.__updateJobStatus( job, 'Failed', "Job stalled: pilot not running" )
            failedCounter += 1
            result = self.__sendAccounting( job )
            if not result['OK']:
              self.log.error( 'Failed to send accounting', result['Message'] )
              break
            continue

        result = self.__getLatestUpdateTime( job )
        if not result['OK']:
          return result
        currentTime = toEpoch()
        lastUpdate = result['Value']
        elapsedTime = currentTime - lastUpdate
        if elapsedTime > failedTime:
          self.__updateJobStatus( job, 'Failed', 'Stalling for more than %d sec' % failedTime )
          failedCounter += 1
          result = self.__sendAccounting( job )
          if not result['OK']:
            self.log.error( 'Failed to send accounting', result['Message'] )
            break

    recoverCounter = 0

    for minor in ["Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime]:
      result = self.jobDB.selectJobs( {'Status' : 'Failed', 'MinorStatus' : minor, 'AccountedFlag' : 'False'} )
      if not result['OK']:
        return result
      if result['Value']:
        jobs = result['Value']
        self.log.info( '%s Stalled jobs will be Accounted' % ( len( jobs ) ) )
        for job in jobs:
          result = self.__sendAccounting( job )
          if not result['OK']:
            self.log.error( 'Failed to send accounting', result['Message'] )
            continue
          recoverCounter += 1
      if not result['OK']:
        break

    if failedCounter:
      self.log.info( '%d jobs set to Failed' % failedCounter )
    if recoverCounter:
      self.log.info( '%d jobs properly Accounted' % recoverCounter )
    return S_OK( failedCounter )

  #############################################################################
  def __getJobPilotStatus( self, jobID ):
    """ Get the job pilot status
    """
    result = self.jobDB.getJobParameter( jobID, 'Pilot_Reference' )
    if not result['OK']:
      return result
    if not result['Value']:
      return S_ERROR( 'Failed to get the pilot reference' )

    pilotReference = result['Value']
    wmsAdminClient = RPCClient( 'WorkloadManagement/WMSAdministrator' )
    result = wmsAdminClient.getPilotInfo( pilotReference )
    if not result['OK']:
      if "No pilots found" in result['Message']:
        self.log.warn( result['Message'] )
        return S_OK( 'NoPilot' )
      self.log.error( 'Failed to get pilot information', result['Message'] )
      return S_ERROR( 'Failed to get the pilot status' )
    pilotStatus = result['Value'][pilotReference]['Status']

    return S_OK( pilotStatus )

  #############################################################################
  def __getStalledJob( self, job, stalledTime ):
    """ Compares the most recent of LastUpdateTime and HeartBeatTime against the
        stalledTime limit.
    """
    result = self.__getLatestUpdateTime( job )
    if not result['OK']:
      return result

    currentTime = toEpoch()
    lastUpdate = result['Value']

    elapsedTime = currentTime - lastUpdate
    self.log.verbose( '(CurrentTime-LastUpdate) = %s secs' % ( elapsedTime ) )
    if elapsedTime > stalledTime:
      self.log.info( 'Job %s is identified as stalled with last update > %s secs ago' % ( job, elapsedTime ) )
      return S_OK( 'Stalled' )

    return S_ERROR( 'Job %s is running and will be ignored' % job )

  #############################################################################
  def __getLatestUpdateTime( self, job ):
    """ Returns the most recent of HeartBeatTime and LastUpdateTime
    """
    result = self.jobDB.getJobAttributes( job, ['HeartBeatTime', 'LastUpdateTime'] )
    if not result['OK']:
      self.log.error( 'Failed to get job attributes', result['Message'] )
    if not result['OK'] or not result['Value']:
      self.log.error( 'Could not get attributes for job', '%s' % job )
      return S_ERROR( 'Could not get attributes for job' )

    self.log.verbose( result )
    latestUpdate = 0
    lastUpdate = 0  # keep defined even if LastUpdateTime is null
    if not result['Value']['HeartBeatTime'] or result['Value']['HeartBeatTime'] == 'None':
      self.log.verbose( 'HeartBeatTime is null for job %s' % job )
    else:
      latestUpdate = toEpoch( fromString( result['Value']['HeartBeatTime'] ) )

    if not result['Value']['LastUpdateTime'] or result['Value']['LastUpdateTime'] == 'None':
      self.log.verbose( 'LastUpdateTime is null for job %s' % job )
    else:
      lastUpdate = toEpoch( fromString( result['Value']['LastUpdateTime'] ) )
    if latestUpdate < lastUpdate:
      latestUpdate = lastUpdate

    if not latestUpdate:
      return S_ERROR( 'LastUpdate and HeartBeat times are null for job %s' % job )
    else:
      self.log.verbose( 'Latest update time from epoch for job %s is %s' % ( job, latestUpdate ) )
      return S_OK( latestUpdate )

  #############################################################################
  def __updateJobStatus( self, job, status, minorstatus = None ):
    """ This method updates the job status in the JobDB, this should only be
        used to fail jobs due to the optimizer chain.
    """
    self.log.verbose( "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % ( job, status ) )
    if self.am_getOption( 'Enable', True ):
      result = self.jobDB.setJobAttribute( job, 'Status', status, update = True )
    else:
      result = S_OK( 'DisabledMode' )

    if result['OK']:
      if minorstatus:
        self.log.verbose( "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % ( job, minorstatus ) )
        result = self.jobDB.setJobAttribute( job, 'MinorStatus', minorstatus, update = True )

    if not minorstatus:  # Retain last minor status for stalled jobs
      result = self.jobDB.getJobAttributes( job, ['MinorStatus'] )
      if result['OK']:
        minorstatus = result['Value']['MinorStatus']

    logStatus = status
    result = self.logDB.addLoggingRecord( job, status = logStatus, minor = minorstatus, source = 'StalledJobAgent' )
    if not result['OK']:
      self.log.warn( result )

    return result

  def __getProcessingType( self, jobID ):
    """ Get the Processing Type from the JDL, until it is promoted to a real Attribute
    """
    processingType = 'unknown'
    result = self.jobDB.getJobJDL( jobID, original = True )
    if not result['OK']:
      return processingType
    classAdJob = ClassAd( result['Value'] )
    if classAdJob.lookupAttribute( 'ProcessingType' ):
      processingType = classAdJob.getAttributeString( 'ProcessingType' )
    return processingType

  #############################################################################
  def __sendAccounting( self, jobID ):
    """ Send WMS accounting data for the given job
    """
    try:
      accountingReport = Job()
      endTime = 'Unknown'
      lastHeartBeatTime = 'Unknown'
      result = self.jobDB.getJobAttributes( jobID )
      if not result['OK']:
        return result
      jobDict = result['Value']

      startTime, endTime = self.__checkLoggingInfo( jobID, jobDict )
      lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat( jobID, jobDict )
      lastHeartBeatTime = fromString( lastHeartBeatTime )
      if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
        endTime = lastHeartBeatTime

      cpuNormalization = self.jobDB.getJobParameter( jobID, 'CPUNormalizationFactor' )
      if not cpuNormalization['OK'] or not cpuNormalization['Value']:
        cpuNormalization = 0.0
      else:
        cpuNormalization = float( cpuNormalization['Value'] )
    except Exception:
      self.log.exception( "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s" %
                          ( str( jobID ), str( endTime ), str( lastHeartBeatTime ) ), '', False )
      return S_ERROR( "Exception" )

    processingType = self.__getProcessingType( jobID )

    accountingReport.setStartTime( startTime )
    accountingReport.setEndTime( endTime )
    # execTime = toEpoch( endTime ) - toEpoch( startTime )
    # Fill the accounting data
    acData = { 'Site' : jobDict['Site'],
               'User' : jobDict['Owner'],
               'UserGroup' : jobDict['OwnerGroup'],
               'JobGroup' : jobDict['JobGroup'],
               'JobType' : jobDict['JobType'],
               'JobClass' : jobDict['JobSplitType'],
               'ProcessingType' : processingType,
               'FinalMajorStatus' : 'Failed',
               'FinalMinorStatus' : 'Stalled',
               'CPUTime' : lastCPUTime,
               'NormCPUTime' : lastCPUTime * cpuNormalization,
               'ExecTime' : lastWallTime,
               'InputDataSize' : 0.0,
               'OutputDataSize' : 0.0,
               'InputDataFiles' : 0,
               'OutputDataFiles' : 0,
               'DiskSpace' : 0.0,
               'InputSandBoxSize' : 0.0,
               'OutputSandBoxSize' : 0.0,
               'ProcessedEvents' : 0 }

    # For accidentally stopped jobs ExecTime can be not set
    if not acData['ExecTime']:
      acData['ExecTime'] = acData['CPUTime']
    elif acData['ExecTime'] < acData['CPUTime']:
      acData['ExecTime'] = acData['CPUTime']

    self.log.verbose( 'Accounting Report is:' )
    self.log.verbose( acData )
    accountingReport.setValuesFromDict( acData )
    result = accountingReport.commit()
    if result['OK']:
      self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' )
    else:
      self.log.error( 'Failed to send accounting report', 'Job: %d, Error: %s' %
                      ( int( jobID ), result['Message'] ) )
    return result

  def __checkHeartBeat( self, jobID, jobDict ):
    """ Get info from HeartBeat
    """
    result = self.jobDB.getHeartBeatData( jobID )
    lastCPUTime = 0
    lastWallTime = 0
    lastHeartBeatTime = jobDict['StartExecTime']
    if lastHeartBeatTime == "None":
      lastHeartBeatTime = 0

    if result['OK']:
      for name, value, heartBeatTime in result['Value']:
        if 'CPUConsumed' == name:
          try:
            value = int( float( value ) )
            if value > lastCPUTime:
              lastCPUTime = value
          except ValueError:
            pass
        if 'WallClockTime' == name:
          try:
            value = int( float( value ) )
            if value > lastWallTime:
              lastWallTime = value
          except ValueError:
            pass
        if heartBeatTime > lastHeartBeatTime:
          lastHeartBeatTime = heartBeatTime

    return lastCPUTime, lastWallTime, lastHeartBeatTime

  def __checkLoggingInfo( self, jobID, jobDict ):
    """ Get info from JobLogging
    """
    logList = []
    result = self.logDB.getJobLoggingInfo( jobID )
    if result['OK']:
      logList = result['Value']

    startTime = jobDict['StartExecTime']
    if not startTime or startTime == 'None':
      # status, minor, app, stime, source
      for items in logList:
        if items[0] == 'Running':
          startTime = items[3]
          break
      if not startTime or startTime == 'None':
        startTime = jobDict['SubmissionTime']

    if type( startTime ) in types.StringTypes:
      startTime = fromString( startTime )
      if startTime == None:
        self.log.error( 'Wrong timestamp in DB', items[3] )
        startTime = dateTime()

    endTime = dateTime()
    # status, minor, app, stime, source
    for items in logList:
      if items[0] == 'Stalled':
        endTime = fromString( items[3] )
    if endTime == None:
      self.log.error( 'Wrong timestamp in DB', items[3] )
      endTime = dateTime()

    return startTime, endTime

  def __kickStuckJobs( self ):
    """ Reschedule jobs stuck in initialization status Rescheduled, Matched
    """
    message = ''

    checkTime = str( dateTime() - self.matchedTime * second )
    result = self.jobDB.selectJobs( {'Status' : 'Matched'}, older = checkTime )
    if not result['OK']:
      self.log.error( 'Failed to select jobs', result['Message'] )
      return result

    jobIDs = result['Value']
    if jobIDs:
      self.log.info( 'Rescheduling %d jobs stuck in Matched status' % len( jobIDs ) )
      result = self.jobDB.rescheduleJobs( jobIDs )
      if 'FailedJobs' in result:
        message = 'Failed to reschedule %d jobs stuck in Matched status' % len( result['FailedJobs'] )

    checkTime = str( dateTime() - self.rescheduledTime * second )
    result = self.jobDB.selectJobs( {'Status' : 'Rescheduled'}, older = checkTime )
    if not result['OK']:
      self.log.error( 'Failed to select jobs', result['Message'] )
      return result

    jobIDs = result['Value']
    if jobIDs:
      self.log.info( 'Rescheduling %d jobs stuck in Rescheduled status' % len( jobIDs ) )
      result = self.jobDB.rescheduleJobs( jobIDs )
      if 'FailedJobs' in result:
        if message:
          message += '\n'
        message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len( result['FailedJobs'] )

    if message:
      return S_ERROR( message )
    else:
      return S_OK()

  def __failCompletedJobs( self ):
    """ Failed Jobs stuck in Completed Status for a long time.
        They are due to pilots being killed during the finalization of the job execution.
    """
    # Get old Completed Jobs
    checkTime = str( dateTime() - self.completedTime * second )
    result = self.jobDB.selectJobs( {'Status' : 'Completed'}, older = checkTime )
    if not result['OK']:
      self.log.error( 'Failed to select jobs', result['Message'] )
      return result

    jobIDs = result['Value']
    if not jobIDs:
      return S_OK()

    # Remove those with Minor Status "Pending Requests"
    for jobID in jobIDs:
      result = self.jobDB.getJobAttributes( jobID, ['Status', 'MinorStatus'] )
      if not result['OK']:
        self.log.error( 'Failed to get job attributes', result['Message'] )
        continue
      if result['Value']['Status'] != "Completed":
        continue
      if result['Value']['MinorStatus'] == "Pending Requests":
        continue

      result = self.__updateJobStatus( jobID, 'Failed', "Job died during finalization" )
      result = self.__sendAccounting( jobID )
      if not result['OK']:
        self.log.error( 'Failed to send accounting', result['Message'] )
        continue

    return S_OK()
class StalledJobAgent(AgentModule): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ def __init__(self, *args, **kwargs): """ c'tor """ AgentModule.__init__(self, *args, **kwargs) self.jobDB = None self.logDB = None self.matchedTime = 7200 self.rescheduledTime = 600 self.submittingTime = 300 self.stalledJobsTolerantSites = [] ############################################################################# def initialize(self): """Sets default parameters """ self.jobDB = JobDB() self.logDB = JobLoggingDB() self.am_setOption('PollingTime', 60 * 60) if not self.am_getOption('Enable', True): self.log.info('Stalled Job Agent running in disabled mode') return S_OK() ############################################################################# def execute(self): """ The main agent execution method """ self.log.verbose('Waking up Stalled Job Agent') wms_instance = getSystemInstance('WorkloadManagement') if not wms_instance: return S_ERROR( 'Can not get the WorkloadManagement system instance') wrapperSection = cfgPath('Systems', 'WorkloadManagement', wms_instance, 'JobWrapper') stalledTime = self.am_getOption('StalledTimeHours', 2) failedTime = self.am_getOption('FailedTimeHours', 6) self.stalledJobsTolerantSites = self.am_getOption( 'StalledJobsTolerantSites', []) self.stalledJobsToleranceTime = self.am_getOption( 'StalledJobsToleranceTime', 0) self.submittingTime = self.am_getOption('SubmittingTime', self.submittingTime) self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime) self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime) self.log.verbose('StalledTime = %s cycles' % (stalledTime)) self.log.verbose('FailedTime = %s cycles' % (failedTime)) watchdogCycle = gConfig.getValue( cfgPath(wrapperSection, 'CheckingTime'), 30 * 60) watchdogCycle = max( watchdogCycle, gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'), 20 * 60)) # Add half cycle to avoid race conditions stalledTime = int(watchdogCycle * (stalledTime + 0.5)) failedTime = int(watchdogCycle * (failedTime + 0.5)) result = self._markStalledJobs(stalledTime) if not result['OK']: self.log.error('Failed to detect stalled jobs', result['Message']) # Note, jobs will be revived automatically during the heartbeat signal phase and # subsequent status changes will result in jobs not being selected by the # stalled job agent. result = self._failStalledJobs(failedTime) if not result['OK']: self.log.error('Failed to process stalled jobs', result['Message']) result = self._failSubmittingJobs() if not result['OK']: self.log.error('Failed to process jobs being submitted', result['Message']) result = self._kickStuckJobs() if not result['OK']: self.log.error('Failed to kick stuck jobs', result['Message']) return S_OK('Stalled Job Agent cycle complete') ############################################################################# def _markStalledJobs(self, stalledTime): """ Identifies stalled jobs running or completing without update longer than stalledTime. 
""" stalledCounter = 0 aliveCounter = 0 # This is the minimum time we wait for declaring a job Stalled, therefore it is safe checkTime = dateTime() - stalledTime * second checkedStatuses = [JobStatus.RUNNING, JobStatus.COMPLETING] # Only get jobs whose HeartBeat is older than the stalledTime result = self.jobDB.selectJobs({'Status': checkedStatuses}, older=checkTime, timeStamp='HeartBeatTime') if not result['OK']: return result if not result['Value']: return S_OK() jobs = sorted(result['Value']) self.log.info( '%d %s jobs will be checked for being stalled, heartbeat before %s' % (len(jobs), ' & '.join(checkedStatuses), str(checkTime))) for job in jobs: delayTime = stalledTime # Add a tolerance time for some sites if required site = self.jobDB.getJobAttribute(job, 'site')['Value'] if site in self.stalledJobsTolerantSites: delayTime += self.stalledJobsToleranceTime # Check if the job is really stalled result = self.__checkJobStalled(job, delayTime) if result['OK']: self.log.verbose('Updating status to Stalled for job %s' % (job)) self.__updateJobStatus(job, 'Stalled') stalledCounter += 1 else: self.log.verbose(result['Message']) aliveCounter += 1 self.log.info('Total jobs: %d, Stalled jobs: %d, %s jobs: %d' % (len(jobs), stalledCounter, '+'.join(checkedStatuses), aliveCounter)) return S_OK() ############################################################################# def _failStalledJobs(self, failedTime): """ Changes the Stalled status to Failed for jobs long in the Stalled status """ # Only get jobs that have been Stalled for long enough checkTime = dateTime() - failedTime * second result = self.jobDB.selectJobs({'Status': JobStatus.STALLED}, older=checkTime) if not result['OK']: return result jobs = result['Value'] failedCounter = 0 minorStalledStatuses = ("Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime) if jobs: self.log.info( '%d jobs Stalled before %s will be checked for failure' % (len(jobs), str(checkTime))) for job in jobs: setFailed = False # Check if the job pilot is lost result = self.__getJobPilotStatus(job) if not result['OK']: self.log.error('Failed to get pilot status', result['Message']) continue pilotStatus = result['Value'] if pilotStatus != "Running": setFailed = minorStalledStatuses[0] else: # Verify that there was no sign of life for long enough result = self.__getLatestUpdateTime(job) if not result['OK']: self.log.error('Failed to get job update time', result['Message']) continue elapsedTime = toEpoch() - result['Value'] if elapsedTime > failedTime: setFailed = minorStalledStatuses[1] # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info if setFailed: self.__sendKillCommand(job) self.__updateJobStatus(job, JobStatus.FAILED, setFailed) failedCounter += 1 result = self.__sendAccounting(job) if not result['OK']: self.log.error('Failed to send accounting', result['Message']) recoverCounter = 0 for minor in minorStalledStatuses: result = self.jobDB.selectJobs({ 'Status': JobStatus.FAILED, 'MinorStatus': minor, 'AccountedFlag': 'False' }) if not result['OK']: return result if result['Value']: jobs = result['Value'] self.log.info('%s Stalled jobs will be Accounted' % (len(jobs))) for job in jobs: result = self.__sendAccounting(job) if not result['OK']: self.log.error('Failed to send accounting', result['Message']) continue recoverCounter += 1 if not result['OK']: break if failedCounter: self.log.info('%d jobs set to Failed' % failedCounter) if recoverCounter: self.log.info('%d jobs properly 
Accounted' % recoverCounter) return S_OK(failedCounter) ############################################################################# def __getJobPilotStatus(self, jobID): """ Get the job pilot status """ result = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference') if not result['OK']: return result pilotReference = result['Value'].get('Pilot_Reference') if not pilotReference: # There is no pilot reference, hence its status is unknown return S_OK('NoPilot') result = PilotManagerClient().getPilotInfo(pilotReference) if not result['OK']: if "No pilots found" in result['Message']: self.log.warn(result['Message']) return S_OK('NoPilot') self.log.error('Failed to get pilot information', 'for job %d: ' % jobID + result['Message']) return S_ERROR('Failed to get the pilot status') pilotStatus = result['Value'][pilotReference]['Status'] return S_OK(pilotStatus) ############################################################################# def __checkJobStalled(self, job, stalledTime): """ Compares the most recent of LastUpdateTime and HeartBeatTime against the stalledTime limit. """ result = self.__getLatestUpdateTime(job) if not result['OK']: return result elapsedTime = toEpoch() - result['Value'] self.log.verbose('(CurrentTime-LastUpdate) = %s secs' % (elapsedTime)) if elapsedTime > stalledTime: self.log.info( 'Job %s is identified as stalled with last update > %s secs ago' % (job, elapsedTime)) return S_OK('Stalled') return S_ERROR('Job %s is running and will be ignored' % job) ############################################################################# def __getLatestUpdateTime(self, job): """ Returns the most recent of HeartBeatTime and LastUpdateTime """ result = self.jobDB.getJobAttributes( job, ['HeartBeatTime', 'LastUpdateTime']) if not result['OK']: self.log.error('Failed to get job attributes', result['Message']) if not result['OK'] or not result['Value']: self.log.error('Could not get attributes for job', '%s' % job) return S_ERROR('Could not get attributes for job') latestUpdate = 0 if not result['Value']['HeartBeatTime'] or result['Value'][ 'HeartBeatTime'] == 'None': self.log.verbose('HeartBeatTime is null for job %s' % job) else: latestUpdate = toEpoch(fromString( result['Value']['HeartBeatTime'])) if not result['Value']['LastUpdateTime'] or result['Value'][ 'LastUpdateTime'] == 'None': self.log.verbose('LastUpdateTime is null for job %s' % job) else: latestUpdate = max( latestUpdate, toEpoch(fromString(result['Value']['LastUpdateTime']))) if not latestUpdate: return S_ERROR( 'LastUpdate and HeartBeat times are null for job %s' % job) else: self.log.verbose('Latest update time from epoch for job %s is %s' % (job, latestUpdate)) return S_OK(latestUpdate) ############################################################################# def __updateJobStatus(self, job, status, minorstatus=None): """ This method updates the job status in the JobDB, this should only be used to fail jobs due to the optimizer chain. 
""" self.log.verbose( "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % (job, status)) if self.am_getOption('Enable', True): result = self.jobDB.setJobAttribute(job, 'Status', status, update=True) else: result = S_OK('DisabledMode') if result['OK']: if minorstatus: self.log.verbose( "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % (job, minorstatus)) result = self.jobDB.setJobAttribute(job, 'MinorStatus', minorstatus, update=True) if not minorstatus: # Retain last minor status for stalled jobs result = self.jobDB.getJobAttributes(job, ['MinorStatus']) if result['OK']: minorstatus = result['Value']['MinorStatus'] logStatus = status result = self.logDB.addLoggingRecord(job, status=logStatus, minor=minorstatus, source='StalledJobAgent') if not result['OK']: self.log.warn(result) return result def __getProcessingType(self, jobID): """ Get the Processing Type from the JDL, until it is promoted to a real Attribute """ processingType = 'unknown' result = self.jobDB.getJobJDL(jobID, original=True) if not result['OK']: return processingType classAdJob = ClassAd(result['Value']) if classAdJob.lookupAttribute('ProcessingType'): processingType = classAdJob.getAttributeString('ProcessingType') return processingType def __sendAccounting(self, jobID): """ Send WMS accounting data for the given job """ try: accountingReport = Job() endTime = 'Unknown' lastHeartBeatTime = 'Unknown' result = self.jobDB.getJobAttributes(jobID) if not result['OK']: return result jobDict = result['Value'] startTime, endTime = self.__checkLoggingInfo(jobID, jobDict) lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat( jobID, jobDict) lastHeartBeatTime = fromString(lastHeartBeatTime) if lastHeartBeatTime is not None and lastHeartBeatTime > endTime: endTime = lastHeartBeatTime result = JobMonitoringClient().getJobParameter( jobID, 'CPUNormalizationFactor') if not result['OK'] or not result['Value']: self.log.error( 'Error getting Job Parameter CPUNormalizationFactor, setting 0', result.get('Message', 'No such value')) cpuNormalization = 0.0 else: cpuNormalization = float( result['Value'].get('CPUNormalizationFactor')) except Exception as e: self.log.exception( "Exception in __sendAccounting", "for job=%s: endTime=%s, lastHBTime=%s" % (str(jobID), str(endTime), str(lastHeartBeatTime)), lException=e) return S_ERROR("Exception") processingType = self.__getProcessingType(jobID) accountingReport.setStartTime(startTime) accountingReport.setEndTime(endTime) # execTime = toEpoch( endTime ) - toEpoch( startTime ) # Fill the accounting data acData = { 'Site': jobDict['Site'], 'User': jobDict['Owner'], 'UserGroup': jobDict['OwnerGroup'], 'JobGroup': jobDict['JobGroup'], 'JobType': jobDict['JobType'], 'JobClass': jobDict['JobSplitType'], 'ProcessingType': processingType, 'FinalMajorStatus': 'Failed', 'FinalMinorStatus': 'Stalled', 'CPUTime': lastCPUTime, 'NormCPUTime': lastCPUTime * cpuNormalization, 'ExecTime': lastWallTime, 'InputDataSize': 0.0, 'OutputDataSize': 0.0, 'InputDataFiles': 0, 'OutputDataFiles': 0, 'DiskSpace': 0.0, 'InputSandBoxSize': 0.0, 'OutputSandBoxSize': 0.0, 'ProcessedEvents': 0 } # For accidentally stopped jobs ExecTime can be not set if not acData['ExecTime']: acData['ExecTime'] = acData['CPUTime'] elif acData['ExecTime'] < acData['CPUTime']: acData['ExecTime'] = acData['CPUTime'] self.log.verbose('Accounting Report is:') self.log.verbose(acData) accountingReport.setValuesFromDict(acData) result = accountingReport.commit() if result['OK']: self.jobDB.setJobAttribute(jobID, 
'AccountedFlag', 'True') else: self.log.error( 'Failed to send accounting report', 'Job: %d, Error: %s' % (int(jobID), result['Message'])) return result def __checkHeartBeat(self, jobID, jobDict): """ Get info from HeartBeat """ result = self.jobDB.getHeartBeatData(jobID) lastCPUTime = 0 lastWallTime = 0 lastHeartBeatTime = jobDict['StartExecTime'] if lastHeartBeatTime == "None": lastHeartBeatTime = 0 if result['OK']: for name, value, heartBeatTime in result['Value']: if name == 'CPUConsumed': try: value = int(float(value)) if value > lastCPUTime: lastCPUTime = value except ValueError: pass if name == 'WallClockTime': try: value = int(float(value)) if value > lastWallTime: lastWallTime = value except ValueError: pass if heartBeatTime > lastHeartBeatTime: lastHeartBeatTime = heartBeatTime return lastCPUTime, lastWallTime, lastHeartBeatTime def __checkLoggingInfo(self, jobID, jobDict): """ Get info from JobLogging """ logList = [] result = self.logDB.getJobLoggingInfo(jobID) if result['OK']: logList = result['Value'] startTime = jobDict['StartExecTime'] if not startTime or startTime == 'None': # status, minor, app, stime, source for items in logList: if items[0] == 'Running': startTime = items[3] break if not startTime or startTime == 'None': startTime = jobDict['SubmissionTime'] if isinstance(startTime, six.string_types): startTime = fromString(startTime) if startTime is None: self.log.error('Wrong timestamp in DB', items[3]) startTime = dateTime() endTime = dateTime() # status, minor, app, stime, source for items in logList: if items[0] == 'Stalled': endTime = fromString(items[3]) if endTime is None: self.log.error('Wrong timestamp in DB', items[3]) endTime = dateTime() return startTime, endTime def _kickStuckJobs(self): """ Reschedule jobs stuck in initialization status Rescheduled, Matched """ message = '' checkTime = dateTime() - self.matchedTime * second result = self.jobDB.selectJobs({'Status': JobStatus.MATCHED}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result jobIDs = result['Value'] if jobIDs: self.log.info('Rescheduling %d jobs stuck in Matched status' % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if 'FailedJobs' in result: message = 'Failed to reschedule %d jobs stuck in Matched status' % len( result['FailedJobs']) checkTime = dateTime() - self.rescheduledTime * second result = self.jobDB.selectJobs({'Status': 'Rescheduled'}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result jobIDs = result['Value'] if jobIDs: self.log.info('Rescheduling %d jobs stuck in Rescheduled status' % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if 'FailedJobs' in result: if message: message += '\n' message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len( result['FailedJobs']) if message: return S_ERROR(message) return S_OK() def _failSubmittingJobs(self): """ Failed Jobs stuck in Submitting Status for a long time. They are due to a failed bulk submission transaction. 
""" # Get old Submitting Jobs checkTime = dateTime() - self.submittingTime * second result = self.jobDB.selectJobs({'Status': JobStatus.SUBMITTING}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result for jobID in result['Value']: result = self.__updateJobStatus(jobID, JobStatus.FAILED) if not result['OK']: self.log.error('Failed to update job status', result['Message']) continue return S_OK() def __sendKillCommand(self, job): """Send a kill signal to the job such that it cannot continue running. :param int job: ID of job to send kill command """ ownerDN = self.jobDB.getJobAttribute(job, 'OwnerDN') ownerGroup = self.jobDB.getJobAttribute(job, 'OwnerGroup') if ownerDN['OK'] and ownerGroup['OK']: wmsClient = WMSClient(useCertificates=True, delegatedDN=ownerDN['Value'], delegatedGroup=ownerGroup['Value']) resKill = wmsClient.killJob(job) if not resKill['OK']: self.log.error("Failed to send kill command to job", "%s: %s" % (job, resKill['Message'])) else: self.log.error( "Failed to get ownerDN or Group for job:", "%s: %s, %s" % (job, ownerDN.get( 'Message', ''), ownerGroup.get('Message', '')))
class OptimizerModule(AgentModule): """ The specific agents must provide the following methods: * initialize() for initial settings * beginExecution() * execute() - the main method called in the agent cycle * endExecution() * finalize() - the graceful exit of the method, this one is usually used for the agent restart """ ############################################################################# def __init__(self, *args, **kwargs): """ c'tor """ AgentModule.__init__(self, *args, **kwargs) self.jobDB = None self.logDB = None self.startingMinorStatus = None self.startingMajorStatus = "Checking" self.failedStatus = None self.requiredJobInfo = 'jdl' self._initResult = None def initialize(self, jobDB=None, logDB=None): """ Initialization of the Optimizer Agent. """ self.jobDB = JobDB() if jobDB is None else jobDB if not self.jobDB.isValid(): dExit(1) self.logDB = JobLoggingDB() if logDB is None else logDB optimizerName = self.am_getModuleParam('agentName') if optimizerName.endswith('Agent'): optimizerName = optimizerName[:-len('Agent')] self.am_setModuleParam('optimizerName', optimizerName) self.startingMinorStatus = self.am_getModuleParam('optimizerName') self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed') self.am_setOption("PollingTime", 30) return self.initializeOptimizer() def initializeOptimizer(self): """ To be overwritten by inheriting class """ return S_OK() ############################################################################# def execute(self): """ The main agent execution method """ result = self.initializeOptimizer() if not result['OK']: return result self._initResult = result['Value'] condition = {'Status': self.startingMajorStatus} if self.startingMinorStatus: condition['MinorStatus'] = self.startingMinorStatus result = self.jobDB.selectJobs(condition) if not result['OK']: self.log.warn('Failed to get a job list from the JobDB') return S_ERROR('Failed to get a job list from the JobDB') if not result['Value']: self.log.verbose('No pending jobs to process') return S_OK('No work to do') for job in result['Value']: result = self.getJobDefinition(job) if not result['OK']: self.setFailedJob(job, result['Message'], '') continue jobDef = result['Value'] result = self.optimizeJob(job, jobDef['classad']) return S_OK() ############################################################################# def optimizeJob(self, job, classAdJob): """ Call the corresponding Optimizer checkJob method """ self.log.info('Job %s will be processed by %sAgent' % (job, self.am_getModuleParam('optimizerName'))) result = self.checkJob(job, classAdJob) if not result['OK']: self.setFailedJob(job, result['Message'], classAdJob) return result ############################################################################# def getJobDefinition(self, job, jobDef=False): """ Retrieve JDL of the Job and return jobDef dictionary """ if not jobDef: jobDef = {} # If not jdl in jobinfo load it if 'jdl' not in jobDef: if self.requiredJobInfo == 'jdlOriginal': result = self.jobDB.getJobJDL(job, original=True) if not result['OK']: self.log.error("No JDL for job", "%s" % job) return S_ERROR("No JDL for job") jobDef['jdl'] = result['Value'] if self.requiredJobInfo == 'jdl': result = self.jobDB.getJobJDL(job) if not result['OK']: self.log.error("No JDL for job", "%s" % job) return S_ERROR("No JDL for job") jobDef['jdl'] = result['Value'] # Load the classad if needed if 'jdl' in jobDef and 'classad' not in jobDef: try: classad = ClassAd(jobDef['jdl']) except BaseException: self.log.debug("Cannot load JDL") 
return S_ERROR('Illegal Job JDL') if not classad.isOK(): self.log.debug("Warning: illegal JDL for job %s, will be marked problematic" % (job)) return S_ERROR('Illegal Job JDL') jobDef['classad'] = classad return S_OK(jobDef) ############################################################################# def getOptimizerJobInfo(self, job, reportName): """This method gets job optimizer information that will be used for job scheduling. """ self.log.verbose("self.jobDB.getJobOptParameter(%s,'%s')" % (job, reportName)) result = self.jobDB.getJobOptParameter(job, reportName) if result['OK']: value = result['Value'] if not value: self.log.warn('JobDB returned null value for %s %s' % (job, reportName)) return S_ERROR('No optimizer info returned') else: try: return S_OK(eval(value)) except Exception as x: return S_ERROR('Could not evaluate optimizer parameters: %s' % repr(x)) return result ############################################################################# def setOptimizerJobInfo(self, job, reportName, value): """This method sets the job optimizer information that will subsequently be used for job scheduling and TURL queries on the WN. """ self.log.verbose("self.jobDB.setJobOptParameter(%s,'%s','%s')" % (job, reportName, value)) if not self.am_Enabled(): return S_OK() return self.jobDB.setJobOptParameter(job, reportName, str(value)) ############################################################################# def setOptimizerChain(self, job, value): """This method sets the job optimizer chain, in principle only needed by one of the optimizers. """ self.log.verbose("self.jobDB.setOptimizerChain(%s,%s)" % (job, value)) if not self.am_Enabled(): return S_OK() return self.jobDB.setOptimizerChain(job, value) ############################################################################# def setNextOptimizer(self, job): """This method is executed when the optimizer instance has successfully processed the job. The next optimizer in the chain will subsequently start to work on the job. """ result = self.logDB.addLoggingRecord(job, status=self.startingMajorStatus, minor=self.startingMinorStatus, source=self.am_getModuleParam("optimizerName")) if not result['OK']: self.log.warn(result['Message']) self.log.verbose("self.jobDB.setNextOptimizer(%s,'%s')" % (job, self.am_getModuleParam("optimizerName"))) return self.jobDB.setNextOptimizer(job, self.am_getModuleParam("optimizerName")) ############################################################################# def updateJobStatus(self, job, status, minorStatus=None, appStatus=None): """This method updates the job status in the JobDB, this should only be used to fail jobs due to the optimizer chain. """ self.log.verbose("self.jobDB.setJobStatus(%s,'Status/Minor/Application','%s'/'%s'/'%s',update=True)" % (job, status, str(minorStatus), str(appStatus))) if not self.am_Enabled(): return S_OK() result = self.jobDB.setJobStatus(job, status, minorStatus, appStatus) if not result['OK']: return result result = self.logDB.addLoggingRecord(job, status=status, minor=minorStatus, application=appStatus, source=self.am_getModuleParam('optimizerName')) if not result['OK']: self.log.warn(result['Message']) return S_OK() ############################################################################# def setJobParam(self, job, reportName, value): """This method updates a job parameter in the JobDB. 
""" self.log.verbose("self.jobDB.setJobParameter(%s,'%s','%s')" % (job, reportName, value)) if not self.am_Enabled(): return S_OK() return self.jobDB.setJobParameter(job, reportName, value) ############################################################################# def setFailedJob(self, job, msg, classAdJob=None): """This method moves the job to the failed status """ self.log.verbose("self.updateJobStatus(%s,'%s','%s')" % (job, self.failedStatus, msg)) if not self.am_Enabled(): return S_OK() self.updateJobStatus(job, self.failedStatus, msg) if classAdJob: self.sendAccountingRecord(job, msg, classAdJob) ############################################################################# def checkJob(self, job, classad): """This method controls the checking of the job, should be overridden in a subclass """ self.log.warn('Optimizer: checkJob method should be implemented in a subclass') return S_ERROR('Optimizer: checkJob method should be implemented in a subclass') ############################################################################# def sendAccountingRecord(self, job, msg, classAdJob): """ Send and accounting record for the failed job """ accountingReport = AccountingJob() accountingReport.setStartTime() accountingReport.setEndTime() owner = classAdJob.getAttributeString('Owner') userGroup = classAdJob.getAttributeString('OwnerGroup') jobGroup = classAdJob.getAttributeString('JobGroup') jobType = classAdJob.getAttributeString('JobType') jobClass = 'unknown' if classAdJob.lookupAttribute('JobSplitType'): jobClass = classAdJob.getAttributeString('JobSplitType') inputData = [] processingType = 'unknown' if classAdJob.lookupAttribute('ProcessingType'): processingType = classAdJob.getAttributeString('ProcessingType') if classAdJob.lookupAttribute('InputData'): inputData = classAdJob.getListFromExpression('InputData') inputDataFiles = len(inputData) outputData = [] if classAdJob.lookupAttribute('OutputData'): outputData = classAdJob.getListFromExpression('OutputData') outputDataFiles = len(outputData) acData = { 'User': owner, 'UserGroup': userGroup, 'JobGroup': jobGroup, 'JobType': jobType, 'JobClass': jobClass, 'ProcessingType': processingType, 'FinalMajorStatus': self.failedStatus, 'FinalMinorStatus': msg, 'CPUTime': 0.0, 'NormCPUTime': 0.0, 'ExecTime': 0.0, 'InputDataSize': 0.0, 'OutputDataSize': 0.0, 'InputDataFiles': inputDataFiles, 'OutputDataFiles': outputDataFiles, 'DiskSpace': 0.0, 'InputSandBoxSize': 0.0, 'OutputSandBoxSize': 0.0, 'ProcessedEvents': 0.0 } self.log.verbose('Accounting Report is:') self.log.verbose(acData) accountingReport.setValuesFromDict(acData) return accountingReport.commit()
def setUp(self):
    self.jlogDB = JobLoggingDB()
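# The setUp above is a test-fixture fragment. A sketch of how such a fixture is
# typically completed for JobLoggingDB, exercising the two calls used elsewhere
# in this code (addLoggingRecord / getJobLoggingInfo). It assumes a reachable
# database behind JobLoggingDB, so treat it as an integration test.

import unittest

class JobLoggingDBTestCase(unittest.TestCase):

    def setUp(self):
        self.jlogDB = JobLoggingDB()

    def test_addAndRetrieve(self):
        # Write one record for a dummy job, then read the history back
        result = self.jlogDB.addLoggingRecord(1, status='Received',
                                              minor='', source='test')
        self.assertTrue(result['OK'])
        result = self.jlogDB.getJobLoggingInfo(1)
        self.assertTrue(result['OK'])

if __name__ == '__main__':
    unittest.main()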
class Matcher( object ): """ Logic for matching """ def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ): """ c'tor """ if pilotAgentsDB: self.pilotAgentsDB = pilotAgentsDB else: self.pilotAgentsDB = PilotAgentsDB() if jobDB: self.jobDB = jobDB else: self.jobDB = JobDB() if tqDB: self.tqDB = tqDB else: self.tqDB = TaskQueueDB() if jlDB: self.jlDB = jlDB else: self.jlDB = JobLoggingDB() if opsHelper: self.opsHelper = opsHelper else: self.opsHelper = Operations() self.log = gLogger.getSubLogger( "Matcher" ) self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper ) def selectJob( self, resourceDescription, credDict ): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self._getResourceDict( resourceDescription, credDict ) negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] ) result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond ) if not result['OK']: return result result = result['Value'] if not result['matchFound']: self.log.info( "No match found" ) raise RuntimeError( "No match found" ) jobID = result['jobId'] resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( "No attributes returned for job" ) if not resAtt['Value']['Status'] == 'Waiting': self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) ) result = self.tqDB.deleteJob( jobID ) if not result[ 'OK' ]: return result raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) ) self._reportStatus( resourceDict, jobID ) result = self.jobDB.getJobJDL( jobID ) if not result['OK']: raise RuntimeError( "Failed to get the job JDL" ) resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime self.log.info( "Match time: [%s]" % str( matchTime ) ) gMonitor.addMark( "matchTime", matchTime ) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters( jobID ) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( 'No attributes returned for job' ) if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ): self.limiter.updateDelayCounters( resourceDict['Site'], jobID ) pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False ) if not pilotInfoReportedFlag: self._updatePilotInfo( resourceDict ) self._updatePilotJobMapping( resourceDict, jobID ) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = True return resultDict def _getResourceDict( self, resourceDescription, credDict ): """ from resourceDescription to resourceDict (just various mods) """ resourceDict = self._processResourceDescription( resourceDescription ) resourceDict = self._checkCredentials( resourceDict, credDict ) self._checkPilotVersion( resourceDict ) if not self._checkMask( resourceDict ): # Banned destinations can only take Test jobs resourceDict['JobType'] = 'Test' self.log.verbose( "Resource description:" ) for key in resourceDict: self.log.verbose( "%s : %s" % ( key.rjust( 20 ), 
resourceDict[ key ] ) ) return resourceDict def _processResourceDescription( self, resourceDescription ): """ Check and form the resource description dictionary resourceDescription is a ceDict coming from a JobAgent, for example. """ resourceDict = {} if isinstance( resourceDescription, basestring ): classAdAgent = ClassAd( resourceDescription ) if not classAdAgent.isOK(): raise ValueError( 'Illegal Resource JDL' ) self.log.verbose( classAdAgent.asJDL() ) for name in singleValueDefFields: if classAdAgent.lookupAttribute( name ): if name == 'CPUTime': resourceDict[name] = classAdAgent.getAttributeInt( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) for name in multiValueMatchFields: if classAdAgent.lookupAttribute( name ): if name == 'SubmitPool': resourceDict[name] = classAdAgent.getListFromExpression( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) # Check if a JobID is requested if classAdAgent.lookupAttribute( 'JobID' ): resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' ) for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ): if classAdAgent.lookupAttribute( k ): resourceDict[ k ] = classAdAgent.getAttributeString( k ) else: for name in singleValueDefFields: if name in resourceDescription: resourceDict[name] = resourceDescription[name] for name in multiValueMatchFields: if name in resourceDescription: resourceDict[name] = resourceDescription[name] if 'JobID' in resourceDescription: resourceDict['JobID'] = resourceDescription['JobID'] for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization', 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ): if k in resourceDescription: resourceDict[ k ] = resourceDescription[ k ] return resourceDict def _reportStatus( self, resourceDict, jobID ): """ Reports the status of the matched job in jobDB and jobLoggingDB Do not fail if errors happen here """ attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site'] attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']] result = self.jobDB.setJobAttributes( jobID, attNames, attValues ) if not result['OK']: self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Set job attributes for jobID %s" % jobID ) result = self.jlDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' ) if not result['OK']: self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Added logging record for jobID %s" % jobID ) def _checkMask( self, resourceDict ): """ Check the mask: are we allowed to run normal jobs? FIXME: should we move to site OR SE? 
""" if not 'Site' in resourceDict: self.log.error( "Missing Site Name in Resource JDL" ) raise RuntimeError( "Missing Site Name in Resource JDL" ) # Get common site mask and check the agent site result = self.jobDB.getSiteMask( siteState = 'Active' ) if not result['OK']: self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] ) raise RuntimeError( "Internal error" ) maskList = result['Value'] if resourceDict['Site'] not in maskList: return False return True def _updatePilotInfo( self, resourceDict ): """ Update pilot information - do not fail if we don't manage to do it """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: gridCE = resourceDict.get( 'GridCE', 'Unknown' ) site = resourceDict.get( 'Site', 'Unknown' ) benchmark = resourceDict.get( 'PilotBenchmark', 0.0 ) self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) ) result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site, destination = gridCE, benchmark = benchmark ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _updatePilotJobMapping( self, resourceDict, jobID ): """ Update pilot to job mapping information """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID ) if not result['OK']: self.log.error( "Problem updating pilot information", ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _checkCredentials( self, resourceDict, credDict ): """ Check if we can get a job given the passed credentials """ if Properties.GENERIC_PILOT in credDict[ 'properties' ]: # You can only match groups in the same VO if credDict[ 'group' ] == "hosts": # for the host case the VirtualOrganization parameter # is mandatory in resourceDict vo = resourceDict.get( 'VirtualOrganization', '' ) else: vo = Registry.getVOForGroup( credDict[ 'group' ] ) result = Registry.getGroupsForVO( vo ) if result[ 'OK' ]: resourceDict[ 'OwnerGroup' ] = result[ 'Value' ] else: raise RuntimeError( result['Message'] ) else: # If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict[ 'properties' ]: self.log.notice( "Setting the resource DN to the credentials DN" ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict[ 'properties' ]: resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] self.log.notice( "Setting the resource group to the credentials group" ) if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]: ownerDN = resourceDict[ 'OwnerDN' ] result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] ) if not result[ 'OK' ]: raise RuntimeError( result['Message'] ) if credDict[ 'group' ] not in result[ 'Value' ]: # DN is not in the same group! bad boy. self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" 
% ownerDN ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # Nothing special, group and DN have to be the same else: resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] return resourceDict def _checkPilotVersion( self, resourceDict ): """ Check the pilot DIRAC version """ if self.opsHelper.getValue( "Pilot/CheckVersion", True ): if 'ReleaseVersion' not in resourceDict: if not 'DIRACVersion' in resourceDict: raise RuntimeError( 'Version check requested and not provided by Pilot' ) else: pilotVersion = resourceDict['DIRACVersion'] else: pilotVersion = resourceDict['ReleaseVersion'] validVersions = self.opsHelper.getValue( "Pilot/Version", [] ) if validVersions and pilotVersion not in validVersions: raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \ ( pilotVersion, ",".join( validVersions ) ) ) # Check project if requested validProject = self.opsHelper.getValue( "Pilot/Project", "" ) if validProject: if 'ReleaseProject' not in resourceDict: raise RuntimeError( "Version check requested but expected project %s not received" % validProject ) if resourceDict[ 'ReleaseProject' ] != validProject: raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject, resourceDict[ 'ReleaseProject' ] ) )
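# The dictionary branch of _processResourceDescription above simply copies a
# fixed set of keys out of the ceDict. A standalone illustration of that
# filtering step; the field tuples here are abbreviated stand-ins for the
# module-level singleValueDefFields/multiValueMatchFields constants.

SINGLE_VALUE_FIELDS = ('Site', 'GridCE', 'CPUTime')   # abbreviated
MULTI_VALUE_FIELDS = ('JobType', 'OwnerGroup')        # abbreviated
EXTRA_FIELDS = ('DIRACVersion', 'ReleaseVersion', 'PilotReference')

def processResourceDescription(resourceDescription):
    # Keep only the fields the matcher understands; drop everything else
    resourceDict = {}
    for name in SINGLE_VALUE_FIELDS + MULTI_VALUE_FIELDS + EXTRA_FIELDS:
        if name in resourceDescription:
            resourceDict[name] = resourceDescription[name]
    if 'JobID' in resourceDescription:
        resourceDict['JobID'] = resourceDescription['JobID']
    return resourceDict

print(processResourceDescription({'Site': 'LCG.CERN.ch', 'Memory': 2048}))
# -> {'Site': 'LCG.CERN.ch'}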
class JobCleaningAgent( AgentModule ): """ The specific agents must provide the following methods: * initialize() for initial settings * beginExecution() * execute() - the main method called in the agent cycle * endExecution() * finalize() - the graceful exit of the method, this one is usually used for the agent restart """ def __init__( self, *args, **kwargs ): """ c'tor """ AgentModule.__init__( self, *args, **kwargs ) #clients # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service? self.jobDB = None self.taskQueueDB = None self.jobLoggingDB = None self.maxJobsAtOnce = 100 self.jobByJob = False self.throttlingPeriod = 0. self.prodTypes = [] self.removeStatusDelay = {} ############################################################################# def initialize( self ): """ Sets defaults """ self.am_setOption( "PollingTime", 120 ) self.jobDB = JobDB() self.taskQueueDB = TaskQueueDB() self.jobLoggingDB = JobLoggingDB() # self.sandboxDB = SandboxDB( 'SandboxDB' ) agentTSTypes = self.am_getOption('ProductionTypes', []) if agentTSTypes: self.prodTypes = agentTSTypes else: self.prodTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge']) gLogger.info("Will exclude the following Production types from cleaning %s" % ( ', '.join(self.prodTypes))) self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 ) self.jobByJob = self.am_getOption( 'JobByJob', False ) self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.) self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 ) self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 ) self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 ) self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 ) return S_OK() def __getAllowedJobTypes( self ): """ Get valid jobTypes """ result = self.jobDB.getDistinctJobAttributes( 'JobType' ) if not result[ 'OK' ]: return result cleanJobTypes = [] for jobType in result[ 'Value' ]: if jobType not in self.prodTypes: cleanJobTypes.append( jobType ) self.log.notice( "JobTypes to clean %s" % cleanJobTypes ) return S_OK( cleanJobTypes ) ############################################################################# def execute( self ): """ Remove jobs in various status """ #Delete jobs in "Deleted" state result = self.removeJobsByStatus( { 'Status' : 'Deleted' } ) if not result[ 'OK' ]: return result #Get all the Job types that can be cleaned result = self.__getAllowedJobTypes() if not result[ 'OK' ]: return result # No jobs in the system subject to removal if not result['Value']: return S_OK() baseCond = { 'JobType' : result[ 'Value' ] } # Remove jobs with final status for status in self.removeStatusDelay: delay = self.removeStatusDelay[ status ] if delay < 0: # Negative delay means don't delete anything... 
continue condDict = dict( baseCond ) if status != 'Any': condDict[ 'Status' ] = status delTime = str( Time.dateTime() - delay * Time.day ) result = self.removeJobsByStatus( condDict, delTime ) if not result['OK']: gLogger.warn( 'Failed to remove jobs in status %s' % status ) return S_OK() def removeJobsByStatus( self, condDict, delay = False ): """ Remove deleted jobs """ if delay: gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) ) result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce ) else: gLogger.verbose( "Removing jobs with %s " % condDict ) result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce ) if not result['OK']: return result jobList = result['Value'] if len(jobList) > self.maxJobsAtOnce: jobList = jobList[:self.maxJobsAtOnce] if not jobList: return S_OK() self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) ) count = 0 error_count = 0 result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList ) if not result[ 'OK' ]: gLogger.error("Cannot unassign jobs to sandboxes", result['Message']) return result result = self.deleteJobOversizedSandbox(jobList) if not result[ 'OK' ]: gLogger.error( "Cannot schedule removal of oversized sandboxes", result['Message']) return result failedJobs = result['Value']['Failed'] for job in failedJobs: jobList.pop(jobList.index(job)) # TODO: we should not remove a job if it still has requests in the RequestManager. # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB if self.jobByJob: for jobID in jobList: resultJobDB = self.jobDB.removeJobFromDB( jobID ) resultTQ = self.taskQueueDB.deleteJob( jobID ) resultLogDB = self.jobLoggingDB.deleteJob( jobID ) errorFlag = False if not resultJobDB['OK']: gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] ) errorFlag = True if not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] ) errorFlag = True if not resultLogDB['OK']: gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] ) errorFlag = True if errorFlag: error_count += 1 else: count += 1 if self.throttlingPeriod: time.sleep(self.throttlingPeriod) else: result = self.jobDB.removeJobFromDB( jobList ) if not result['OK']: gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) ) else: gLogger.info('Deleted %d jobs from JobDB' % len(jobList) ) for jobID in jobList: resultTQ = self.taskQueueDB.deleteJob( jobID ) if not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] ) error_count += 1 else: count += 1 result = self.jobLoggingDB.deleteJob( jobList ) if not result['OK']: gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) ) else: gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) ) if count > 0 or error_count > 0 : gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) ) return S_OK() def deleteJobOversizedSandbox( self, jobIDList ): """ Delete the job oversized sandbox files from storage elements """ failed = {} successful = {} lfnDict = {} for jobID in jobIDList: result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' ) if result['OK']: lfn = result['Value'] if lfn: lfnDict[lfn] = jobID else: successful[jobID] = 'No oversized sandbox found' else: gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] ) if not lfnDict: return 
S_OK({'Successful': successful, 'Failed': failed}) # Schedule removal of the LFNs now for lfn,jobID in lfnDict.items(): result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not result['OK']: failed[jobID] = lfn continue if not result['Value']: failed[jobID] = lfn continue ownerDN = result['Value']['OwnerDN'] ownerGroup = result['Value']['OwnerGroup'] result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup ) if not result['OK']: failed[jobID] = lfn else: successful[jobID] = lfn result = {'Successful':successful, 'Failed':failed} return S_OK(result) def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ): """ Set removal request with the given credentials """ oRequest = Request() oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml' oRequest.SourceComponent = 'JobCleaningAgent' removeFile = Operation() removeFile.Type = 'RemoveFile' removedFile = File() removedFile.LFN = lfn removeFile.addFile( removedFile ) oRequest.addOperation( removeFile ) return ReqClient().putRequest( oRequest )
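# How the RemoveStatusDelay options above become selection cut-offs: each final
# status has a delay in days, and a negative delay disables cleaning for that
# status. A self-contained sketch of the cut-off computation, using the standard
# library in place of DIRAC's Time module; the 'User' job type is illustrative.

from datetime import datetime, timedelta

removeStatusDelay = {'Done': 7, 'Killed': 7, 'Failed': 7, 'Any': -1}

for status, delayDays in removeStatusDelay.items():
    if delayDays < 0:
        continue  # negative delay: this status is never cleaned
    delTime = str(datetime.utcnow() - timedelta(days=delayDays))
    condDict = {'JobType': ['User']}
    if status != 'Any':
        condDict['Status'] = status
    print(status, condDict, 'older than', delTime)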
class StalledJobAgent(AgentModule): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ jobDB = None logDB = None matchedTime = 7200 rescheduledTime = 600 completedTime = 86400 ############################################################################# def initialize(self): """Sets default parameters """ self.jobDB = JobDB() self.logDB = JobLoggingDB() self.am_setOption('PollingTime', 60 * 60) if not self.am_getOption('Enable', True): self.log.info('Stalled Job Agent running in disabled mode') return S_OK() ############################################################################# def execute(self): """ The main agent execution method """ self.log.verbose('Waking up Stalled Job Agent') wms_instance = getSystemInstance('WorkloadManagement') if not wms_instance: return S_ERROR( 'Can not get the WorkloadManagement system instance') wrapperSection = cfgPath('Systems', 'WorkloadManagement', wms_instance, 'JobWrapper') stalledTime = self.am_getOption('StalledTimeHours', 2) failedTime = self.am_getOption('FailedTimeHours', 6) self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime) self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime) self.completedTime = self.am_getOption('CompletedTime', self.completedTime) self.log.verbose('StalledTime = %s cycles' % (stalledTime)) self.log.verbose('FailedTime = %s cycles' % (failedTime)) watchdogCycle = gConfig.getValue( cfgPath(wrapperSection, 'CheckingTime'), 30 * 60) watchdogCycle = max( watchdogCycle, gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'), 20 * 60)) # Add half cycle to avoid race conditions stalledTime = watchdogCycle * (stalledTime + 0.5) failedTime = watchdogCycle * (failedTime + 0.5) result = self.__markStalledJobs(stalledTime) if not result['OK']: self.log.error(result['Message']) #Note, jobs will be revived automatically during the heartbeat signal phase and #subsequent status changes will result in jobs not being selected by the #stalled job agent. result = self.__failStalledJobs(failedTime) if not result['OK']: self.log.error(result['Message']) result = self.__failCompletedJobs() if not result['OK']: self.log.error(result['Message']) result = self.__kickStuckJobs() if not result['OK']: self.log.error(result['Message']) return S_OK('Stalled Job Agent cycle complete') ############################################################################# def __markStalledJobs(self, stalledTime): """ Identifies stalled jobs running without update longer than stalledTime. 
""" stalledCounter = 0 runningCounter = 0 result = self.jobDB.selectJobs({'Status': 'Running'}) if not result['OK']: return result if not result['Value']: return S_OK() jobs = result['Value'] self.log.info('%s Running jobs will be checked for being stalled' % (len(jobs))) jobs.sort() # jobs = jobs[:10] #for debugging for job in jobs: result = self.__getStalledJob(job, stalledTime) if result['OK']: self.log.verbose('Updating status to Stalled for job %s' % (job)) self.__updateJobStatus(job, 'Stalled') stalledCounter += 1 else: self.log.verbose(result['Message']) runningCounter += 1 self.log.info( 'Total jobs: %s, Stalled job count: %s, Running job count: %s' % (len(jobs), stalledCounter, runningCounter)) return S_OK() ############################################################################# def __failStalledJobs(self, failedTime): """ Changes the Stalled status to Failed for jobs long in the Stalled status """ result = self.jobDB.selectJobs({'Status': 'Stalled'}) if not result['OK']: return result failedCounter = 0 if result['Value']: jobs = result['Value'] self.log.info('%s Stalled jobs will be checked for failure' % (len(jobs))) for job in jobs: # Check if the job pilot is lost result = self.__getJobPilotStatus(job) if result['OK']: pilotStatus = result['Value'] if pilotStatus != "Running": result = self.__updateJobStatus( job, 'Failed', "Job stalled: pilot not running") failedCounter += 1 result = self.__sendAccounting(job) if not result['OK']: self.log.error(result['Message']) break continue result = self.__getLatestUpdateTime(job) if not result['OK']: return result currentTime = toEpoch() lastUpdate = result['Value'] elapsedTime = currentTime - lastUpdate if elapsedTime > failedTime: self.__updateJobStatus( job, 'Failed', 'Stalling for more than %d sec' % failedTime) failedCounter += 1 result = self.__sendAccounting(job) if not result['OK']: self.log.error(result['Message']) break recoverCounter = 0 for minor in [ "Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime ]: result = self.jobDB.selectJobs({ 'Status': 'Failed', 'MinorStatus': minor, 'AccountedFlag': 'False' }) if not result['OK']: return result if result['Value']: jobs = result['Value'] self.log.info('%s Stalled jobs will be Accounted' % (len(jobs))) for job in jobs: result = self.__sendAccounting(job) if not result['OK']: self.log.error(result['Message']) continue recoverCounter += 1 if not result['OK']: break if failedCounter: self.log.info('%d jobs set to Failed' % failedCounter) if recoverCounter: self.log.info('%d jobs properly Accounted' % recoverCounter) return S_OK(failedCounter) ############################################################################# def __getJobPilotStatus(self, jobID): """ Get the job pilot status """ result = self.jobDB.getJobParameter(jobID, 'Pilot_Reference') if not result['OK']: return result if not result['Value']: return S_ERROR('Failed to get the pilot reference') pilotReference = result['Value'] wmsAdminClient = RPCClient('WorkloadManagement/WMSAdministrator') result = wmsAdminClient.getPilotInfo(pilotReference) if not result['OK']: self.log.error(result['Message']) return S_ERROR('Failed to get the pilot status') pilotStatus = result['Value'][pilotReference]['Status'] return S_OK(pilotStatus) ############################################################################# def __getStalledJob(self, job, stalledTime): """ Compares the most recent of LastUpdateTime and HeartBeatTime against the stalledTime limit. 
""" result = self.__getLatestUpdateTime(job) if not result['OK']: return result currentTime = toEpoch() lastUpdate = result['Value'] elapsedTime = currentTime - lastUpdate self.log.verbose('(CurrentTime-LastUpdate) = %s secs' % (elapsedTime)) if elapsedTime > stalledTime: self.log.info( 'Job %s is identified as stalled with last update > %s secs ago' % (job, elapsedTime)) return S_OK('Stalled') return S_ERROR('Job %s is running and will be ignored' % job) ############################################################################# def __getLatestUpdateTime(self, job): """ Returns the most recent of HeartBeatTime and LastUpdateTime """ result = self.jobDB.getJobAttributes( job, ['HeartBeatTime', 'LastUpdateTime']) if not result['OK']: self.log.error(result['Message']) if not result['OK'] or not result['Value']: return S_ERROR('Could not get attributes for job %s' % job) self.log.verbose(result) latestUpdate = 0 if not result['Value']['HeartBeatTime'] or result['Value'][ 'HeartBeatTime'] == 'None': self.log.verbose('HeartBeatTime is null for job %s' % job) else: latestUpdate = toEpoch(fromString( result['Value']['HeartBeatTime'])) if not result['Value']['LastUpdateTime'] or result['Value'][ 'LastUpdateTime'] == 'None': self.log.verbose('LastUpdateTime is null for job %s' % job) else: lastUpdate = toEpoch(fromString(result['Value']['LastUpdateTime'])) if latestUpdate < lastUpdate: latestUpdate = lastUpdate if not latestUpdate: return S_ERROR( 'LastUpdate and HeartBeat times are null for job %s' % job) else: self.log.verbose('Latest update time from epoch for job %s is %s' % (job, latestUpdate)) return S_OK(latestUpdate) ############################################################################# def __updateJobStatus(self, job, status, minorstatus=None): """ This method updates the job status in the JobDB, this should only be used to fail jobs due to the optimizer chain. 
""" self.log.verbose( "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % (job, status)) if self.am_getOption('Enable', True): result = self.jobDB.setJobAttribute(job, 'Status', status, update=True) else: result = S_OK('DisabledMode') if result['OK']: if minorstatus: self.log.verbose( "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % (job, minorstatus)) result = self.jobDB.setJobAttribute(job, 'MinorStatus', minorstatus, update=True) if not minorstatus: #Retain last minor status for stalled jobs result = self.jobDB.getJobAttributes(job, ['MinorStatus']) if result['OK']: minorstatus = result['Value']['MinorStatus'] logStatus = status result = self.logDB.addLoggingRecord(job, status=logStatus, minor=minorstatus, source='StalledJobAgent') if not result['OK']: self.log.warn(result) return result def __getProcessingType(self, jobID): """ Get the Processing Type from the JDL, until it is promoted to a real Attribute """ processingType = 'unknown' result = self.jobDB.getJobJDL(jobID, original=True) if not result['OK']: return processingType classAdJob = ClassAd(result['Value']) if classAdJob.lookupAttribute('ProcessingType'): processingType = classAdJob.getAttributeString('ProcessingType') return processingType ############################################################################# def __sendAccounting(self, jobID): """ Send WMS accounting data for the given job """ try: accountingReport = Job() endTime = 'Unknown' lastHeartBeatTime = 'Unknown' result = self.jobDB.getJobAttributes(jobID) if not result['OK']: return result jobDict = result['Value'] startTime, endTime = self.__checkLoggingInfo(jobID, jobDict) lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat( jobID, jobDict) lastHeartBeatTime = fromString(lastHeartBeatTime) if lastHeartBeatTime is not None and lastHeartBeatTime > endTime: endTime = lastHeartBeatTime cpuNormalization = self.jobDB.getJobParameter( jobID, 'CPUNormalizationFactor') if not cpuNormalization['OK'] or not cpuNormalization['Value']: cpuNormalization = 0.0 else: cpuNormalization = float(cpuNormalization['Value']) except Exception: self.log.exception( "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s" % (str(jobID), str(endTime), str(lastHeartBeatTime)), '', False) return S_ERROR("Exception") processingType = self.__getProcessingType(jobID) accountingReport.setStartTime(startTime) accountingReport.setEndTime(endTime) # execTime = toEpoch( endTime ) - toEpoch( startTime ) #Fill the accounting data acData = { 'Site': jobDict['Site'], 'User': jobDict['Owner'], 'UserGroup': jobDict['OwnerGroup'], 'JobGroup': jobDict['JobGroup'], 'JobType': jobDict['JobType'], 'JobClass': jobDict['JobSplitType'], 'ProcessingType': processingType, 'FinalMajorStatus': 'Failed', 'FinalMinorStatus': 'Stalled', 'CPUTime': lastCPUTime, 'NormCPUTime': lastCPUTime * cpuNormalization, 'ExecTime': lastWallTime, 'InputDataSize': 0.0, 'OutputDataSize': 0.0, 'InputDataFiles': 0, 'OutputDataFiles': 0, 'DiskSpace': 0.0, 'InputSandBoxSize': 0.0, 'OutputSandBoxSize': 0.0, 'ProcessedEvents': 0 } self.log.verbose('Accounting Report is:') self.log.verbose(acData) accountingReport.setValuesFromDict(acData) result = accountingReport.commit() if result['OK']: self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True') else: self.log.error( 'Failed to send accounting report', 'Job: %d, Error: %s' % (int(jobID), result['Message'])) return result def __checkHeartBeat(self, jobID, jobDict): """ Get info from HeartBeat """ result = 
self.jobDB.getHeartBeatData(jobID) lastCPUTime = 0 lastWallTime = 0 lastHeartBeatTime = jobDict['StartExecTime'] if lastHeartBeatTime == "None": lastHeartBeatTime = 0 if result['OK']: for name, value, heartBeatTime in result['Value']: if 'CPUConsumed' == name: try: value = int(float(value)) if value > lastCPUTime: lastCPUTime = value except ValueError: pass if 'WallClockTime' == name: try: value = int(float(value)) if value > lastWallTime: lastWallTime = value except ValueError: pass if heartBeatTime > lastHeartBeatTime: lastHeartBeatTime = heartBeatTime return lastCPUTime, lastWallTime, lastHeartBeatTime def __checkLoggingInfo(self, jobID, jobDict): """ Get info from JobLogging """ logList = [] result = self.logDB.getJobLoggingInfo(jobID) if result['OK']: logList = result['Value'] startTime = jobDict['StartExecTime'] if not startTime or startTime == 'None': # status, minor, app, stime, source for items in logList: if items[0] == 'Running': startTime = items[3] break if not startTime or startTime == 'None': startTime = jobDict['SubmissionTime'] if type(startTime) in types.StringTypes: startTime = fromString(startTime) if startTime == None: self.log.error('Wrong timestamp in DB', items[3]) startTime = dateTime() endTime = dateTime() # status, minor, app, stime, source for items in logList: if items[0] == 'Stalled': endTime = fromString(items[3]) if endTime == None: self.log.error('Wrong timestamp in DB', items[3]) endTime = dateTime() return startTime, endTime def __kickStuckJobs(self): """ Reschedule jobs stuck in initialization status Rescheduled, Matched """ message = '' checkTime = str(dateTime() - self.matchedTime * second) result = self.jobDB.selectJobs({'Status': 'Matched'}, older=checkTime) if not result['OK']: self.log.error(result['Message']) return result jobIDs = result['Value'] if jobIDs: self.log.info('Rescheduling %d jobs stuck in Matched status' % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if 'FailedJobs' in result: message = 'Failed to reschedule %d jobs stuck in Matched status' % len( result['FailedJobs']) checkTime = str(dateTime() - self.rescheduledTime * second) result = self.jobDB.selectJobs({'Status': 'Rescheduled'}, older=checkTime) if not result['OK']: self.log.error(result['Message']) return result jobIDs = result['Value'] if jobIDs: self.log.info('Rescheduling %d jobs stuck in Rescheduled status' % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if 'FailedJobs' in result: if message: message += '\n' message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len( result['FailedJobs']) if message: return S_ERROR(message) else: return S_OK() def __failCompletedJobs(self): """ Failed Jobs stuck in Completed Status for a long time. They are due to pilots being killed during the finalization of the job execution. """ # Get old Completed Jobs checkTime = str(dateTime() - self.completedTime * second) result = self.jobDB.selectJobs({'Status': 'Completed'}, older=checkTime) if not result['OK']: self.log.error(result['Message']) return result jobIDs = result['Value'] if not jobIDs: return S_OK() # Remove those with Minor Status "Pending Requests" for jobID in jobIDs: result = self.jobDB.getJobAttribute(jobID, 'MinorStatus') if not result['OK']: self.log.error(result['Message']) continue if result['Value'] == "Pending Requests": continue result = self.__updateJobStatus(jobID, 'Failed', "Job died during finalization") result = self.__sendAccounting(jobID) if not result['OK']: self.log.error(result['Message']) continue return S_OK()
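# The stalled check above boils down to epoch arithmetic: take the freshest of
# HeartBeatTime and LastUpdateTime and compare the elapsed seconds against the
# threshold. A minimal standalone sketch of that logic, assuming plain
# 'YYYY-MM-DD HH:MM:SS' strings as read from the JobDB; is_job_stalled and its
# arguments are hypothetical names for illustration, not DIRAC API.
from datetime import datetime


def is_job_stalled(heartbeat, last_update, stalled_seconds, now=None):
    """Return True if the most recent of the two timestamps is older than the limit."""
    now = now or datetime.utcnow()
    fmt = "%Y-%m-%d %H:%M:%S"
    # Both attributes are nullable and may come back as the literal string 'None'
    stamps = [datetime.strptime(s, fmt) for s in (heartbeat, last_update) if s and s != "None"]
    if not stamps:
        raise ValueError("both timestamps are null")  # cf. the S_ERROR branch above
    return (now - max(stamps)).total_seconds() > stalled_seconds


# A job last heard from three hours ago fails a two-hour threshold:
# is_job_stalled("2024-01-01 09:00:00", None, 2 * 3600,
#                now=datetime(2024, 1, 1, 12, 0, 0))  -> True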
class JobCleaningAgent( AgentModule ):
  """
      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """ Sets defaults
    """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    self.prod_types = self.am_getOption('ProductionTypes',
                                        ['DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge', 'production'])
    gLogger.info('Will exclude the following Production types from cleaning %s' % ', '.join(self.prod_types))
    self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 200)
    self.jobByJob = self.am_getOption('JobByJob', True)
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes, i.e. those not excluded as Production types
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ The JobCleaningAgent execution method.
    """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result

    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len(jobList) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len(jobList) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()
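# The removal schedule in execute() hinges on the 'older' cutoff passed to
# selectJobs(): the current time minus the configured delay in days, rendered
# as a string. A rough standard-library sketch of that computation; the
# per-status day counts below are illustrative stand-ins for the agent's
# REMOVE_STATUS_DELAY mapping, not its configured values.
from datetime import datetime, timedelta

remove_status_delay_days = {'Done': 7, 'Killed': 7, 'Failed': 7}  # illustrative


def removal_cutoffs(now=None):
    """Map each final status to the timestamp string jobs must be older than."""
    now = now or datetime.utcnow()
    return {status: str(now - timedelta(days=days))
            for status, days in remove_status_delay_days.items()}


# removal_cutoffs(datetime(2024, 1, 8, 12, 0, 0))['Done'] -> '2024-01-01 12:00:00'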
class StalledJobAgent(AgentModule):
    """
        The specific agents must provide the following methods:
          - initialize() for initial settings
          - beginExecution()
          - execute() - the main method called in the agent cycle
          - endExecution()
          - finalize() - the graceful exit of the method, this one is usually used for the agent restart
    """

    jobDB = None
    logDB = None
    matchedTime = 7200
    rescheduledTime = 600
    completedTime = 86400

    #############################################################################
    def initialize(self):
        """Sets default parameters
        """
        self.jobDB = JobDB()
        self.logDB = JobLoggingDB()
        self.am_setOption("PollingTime", 60 * 60)
        if not self.am_getOption("Enable", True):
            self.log.info("Stalled Job Agent running in disabled mode")
        return S_OK()

    #############################################################################
    def execute(self):
        """ The main agent execution method
        """
        self.log.verbose("Waking up Stalled Job Agent")

        wms_instance = getSystemInstance("WorkloadManagement")
        if not wms_instance:
            return S_ERROR("Can not get the WorkloadManagement system instance")
        wrapperSection = cfgPath("Systems", "WorkloadManagement", wms_instance, "JobWrapper")

        stalledTime = self.am_getOption("StalledTimeHours", 2)
        failedTime = self.am_getOption("FailedTimeHours", 6)
        self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime)
        self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime)
        self.completedTime = self.am_getOption("CompletedTime", self.completedTime)

        self.log.verbose("StalledTime = %s cycles" % (stalledTime))
        self.log.verbose("FailedTime = %s cycles" % (failedTime))

        watchdogCycle = gConfig.getValue(cfgPath(wrapperSection, "CheckingTime"), 30 * 60)
        watchdogCycle = max(watchdogCycle, gConfig.getValue(cfgPath(wrapperSection, "MinCheckingTime"), 20 * 60))

        # Add half cycle to avoid race conditions
        stalledTime = watchdogCycle * (stalledTime + 0.5)
        failedTime = watchdogCycle * (failedTime + 0.5)

        result = self.__markStalledJobs(stalledTime)
        if not result["OK"]:
            self.log.error("Failed to detect stalled jobs", result["Message"])

        # Note, jobs will be revived automatically during the heartbeat signal phase and
        # subsequent status changes will result in jobs not being selected by the
        # stalled job agent.

        result = self.__failStalledJobs(failedTime)
        if not result["OK"]:
            self.log.error("Failed to process stalled jobs", result["Message"])

        result = self.__failCompletedJobs()
        if not result["OK"]:
            self.log.error("Failed to process completed jobs", result["Message"])

        result = self.__kickStuckJobs()
        if not result["OK"]:
            self.log.error("Failed to kick stuck jobs", result["Message"])

        return S_OK("Stalled Job Agent cycle complete")

    #############################################################################
    def __markStalledJobs(self, stalledTime):
        """ Identifies stalled jobs running without update longer than stalledTime.
        """
        stalledCounter = 0
        runningCounter = 0
        result = self.jobDB.selectJobs({"Status": "Running"})
        if not result["OK"]:
            return result
        if not result["Value"]:
            return S_OK()
        jobs = result["Value"]
        self.log.info("%s Running jobs will be checked for being stalled" % (len(jobs)))
        jobs.sort()
        # jobs = jobs[:10]  # for debugging
        for job in jobs:
            result = self.__getStalledJob(job, stalledTime)
            if result["OK"]:
                self.log.verbose("Updating status to Stalled for job %s" % (job))
                self.__updateJobStatus(job, "Stalled")
                stalledCounter += 1
            else:
                self.log.verbose(result["Message"])
                runningCounter += 1
        self.log.info(
            "Total jobs: %s, Stalled job count: %s, Running job count: %s"
            % (len(jobs), stalledCounter, runningCounter)
        )
        return S_OK()

    #############################################################################
    def __failStalledJobs(self, failedTime):
        """ Changes the Stalled status to Failed for jobs long in the Stalled status
        """
        result = self.jobDB.selectJobs({"Status": "Stalled"})
        if not result["OK"]:
            return result
        jobs = result["Value"]

        failedCounter = 0
        minorStalledStatuses = ("Job stalled: pilot not running", "Stalling for more than %d sec" % failedTime)

        if jobs:
            self.log.info("%s Stalled jobs will be checked for failure" % (len(jobs)))

            for job in jobs:
                setFailed = False
                # Check if the job pilot is lost
                result = self.__getJobPilotStatus(job)
                if not result["OK"]:
                    self.log.error("Failed to get pilot status", result["Message"])
                    continue
                pilotStatus = result["Value"]
                if pilotStatus != "Running":
                    setFailed = minorStalledStatuses[0]
                else:
                    result = self.__getLatestUpdateTime(job)
                    if not result["OK"]:
                        self.log.error("Failed to get job update time", result["Message"])
                        continue
                    elapsedTime = toEpoch() - result["Value"]
                    if elapsedTime > failedTime:
                        setFailed = minorStalledStatuses[1]

                # Set the jobs Failed, send them a kill signal in case they are not really dead
                # and send accounting info
                if setFailed:
                    # Send a kill signal to the job such that it cannot continue running
                    WMSClient().killJob(job)
                    self.__updateJobStatus(job, "Failed", setFailed)
                    failedCounter += 1
                    result = self.__sendAccounting(job)
                    if not result["OK"]:
                        self.log.error("Failed to send accounting", result["Message"])

        recoverCounter = 0

        for minor in minorStalledStatuses:
            result = self.jobDB.selectJobs({"Status": "Failed", "MinorStatus": minor, "AccountedFlag": "False"})
            if not result["OK"]:
                return result
            if result["Value"]:
                jobs = result["Value"]
                self.log.info("%s Stalled jobs will be Accounted" % (len(jobs)))
                for job in jobs:
                    result = self.__sendAccounting(job)
                    if not result["OK"]:
                        self.log.error("Failed to send accounting", result["Message"])
                        continue
                    recoverCounter += 1
            if not result["OK"]:
                break

        if failedCounter:
            self.log.info("%d jobs set to Failed" % failedCounter)
        if recoverCounter:
            self.log.info("%d jobs properly Accounted" % recoverCounter)
        return S_OK(failedCounter)

    #############################################################################
    def __getJobPilotStatus(self, jobID):
        """ Get the job pilot status
        """
        result = self.jobDB.getJobParameter(jobID, "Pilot_Reference")
        if not result["OK"]:
            return result
        if not result["Value"]:
            return S_ERROR("Failed to get the pilot reference")

        pilotReference = result["Value"]
        wmsAdminClient = RPCClient("WorkloadManagement/WMSAdministrator")
        result = wmsAdminClient.getPilotInfo(pilotReference)
        if not result["OK"]:
            if "No pilots found" in result["Message"]:
                self.log.warn(result["Message"])
                return S_OK("NoPilot")
            self.log.error("Failed to get pilot information", result["Message"])
            return S_ERROR("Failed to get the pilot status")

        pilotStatus = result["Value"][pilotReference]["Status"]
        return S_OK(pilotStatus)

    #############################################################################
    def __getStalledJob(self, job, stalledTime):
        """ Compares the most recent of LastUpdateTime and HeartBeatTime against
            the stalledTime limit.
        """
        result = self.__getLatestUpdateTime(job)
        if not result["OK"]:
            return result

        currentTime = toEpoch()
        lastUpdate = result["Value"]

        elapsedTime = currentTime - lastUpdate
        self.log.verbose("(CurrentTime-LastUpdate) = %s secs" % (elapsedTime))
        if elapsedTime > stalledTime:
            self.log.info("Job %s is identified as stalled with last update > %s secs ago" % (job, elapsedTime))
            return S_OK("Stalled")

        return S_ERROR("Job %s is running and will be ignored" % job)

    #############################################################################
    def __getLatestUpdateTime(self, job):
        """ Returns the most recent of HeartBeatTime and LastUpdateTime
        """
        result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"])
        if not result["OK"]:
            self.log.error("Failed to get job attributes", result["Message"])
        if not result["OK"] or not result["Value"]:
            self.log.error("Could not get attributes for job", "%s" % job)
            return S_ERROR("Could not get attributes for job")

        self.log.verbose(result)
        latestUpdate = 0
        # Both attributes are nullable and may come back as the literal string 'None'
        if not result["Value"]["HeartBeatTime"] or result["Value"]["HeartBeatTime"] == "None":
            self.log.verbose("HeartBeatTime is null for job %s" % job)
        else:
            latestUpdate = toEpoch(fromString(result["Value"]["HeartBeatTime"]))

        if not result["Value"]["LastUpdateTime"] or result["Value"]["LastUpdateTime"] == "None":
            self.log.verbose("LastUpdateTime is null for job %s" % job)
        else:
            lastUpdate = toEpoch(fromString(result["Value"]["LastUpdateTime"]))
            if latestUpdate < lastUpdate:
                latestUpdate = lastUpdate

        if not latestUpdate:
            return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job)
        else:
            self.log.verbose("Latest update time from epoch for job %s is %s" % (job, latestUpdate))
            return S_OK(latestUpdate)

    #############################################################################
    def __updateJobStatus(self, job, status, minorstatus=None):
        """ This method updates the job status in the JobDB, this should only be
            used to fail jobs due to the optimizer chain.
        """
        self.log.verbose("self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % (job, status))
        if self.am_getOption("Enable", True):
            result = self.jobDB.setJobAttribute(job, "Status", status, update=True)
        else:
            result = S_OK("DisabledMode")

        if result["OK"]:
            if minorstatus:
                self.log.verbose("self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % (job, minorstatus))
                result = self.jobDB.setJobAttribute(job, "MinorStatus", minorstatus, update=True)

        if not minorstatus:  # Retain last minor status for stalled jobs
            result = self.jobDB.getJobAttributes(job, ["MinorStatus"])
            if result["OK"]:
                minorstatus = result["Value"]["MinorStatus"]

        logStatus = status
        result = self.logDB.addLoggingRecord(job, status=logStatus, minor=minorstatus, source="StalledJobAgent")
        if not result["OK"]:
            self.log.warn(result)

        return result

    def __getProcessingType(self, jobID):
        """ Get the Processing Type from the JDL, until it is promoted to a real Attribute
        """
        processingType = "unknown"
        result = self.jobDB.getJobJDL(jobID, original=True)
        if not result["OK"]:
            return processingType
        classAdJob = ClassAd(result["Value"])
        if classAdJob.lookupAttribute("ProcessingType"):
            processingType = classAdJob.getAttributeString("ProcessingType")
        return processingType

    #############################################################################
    def __sendAccounting(self, jobID):
        """ Send WMS accounting data for the given job
        """
        try:
            accountingReport = Job()
            endTime = "Unknown"
            lastHeartBeatTime = "Unknown"
            result = self.jobDB.getJobAttributes(jobID)
            if not result["OK"]:
                return result
            jobDict = result["Value"]

            startTime, endTime = self.__checkLoggingInfo(jobID, jobDict)
            lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict)
            lastHeartBeatTime = fromString(lastHeartBeatTime)
            if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
                endTime = lastHeartBeatTime

            cpuNormalization = self.jobDB.getJobParameter(jobID, "CPUNormalizationFactor")
            if not cpuNormalization["OK"] or not cpuNormalization["Value"]:
                cpuNormalization = 0.0
            else:
                cpuNormalization = float(cpuNormalization["Value"])
        except Exception:
            self.log.exception(
                "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s"
                % (str(jobID), str(endTime), str(lastHeartBeatTime)),
                "",
                False,
            )
            return S_ERROR("Exception")

        processingType = self.__getProcessingType(jobID)

        accountingReport.setStartTime(startTime)
        accountingReport.setEndTime(endTime)
        # execTime = toEpoch( endTime ) - toEpoch( startTime )
        # Fill the accounting data
        acData = {
            "Site": jobDict["Site"],
            "User": jobDict["Owner"],
            "UserGroup": jobDict["OwnerGroup"],
            "JobGroup": jobDict["JobGroup"],
            "JobType": jobDict["JobType"],
            "JobClass": jobDict["JobSplitType"],
            "ProcessingType": processingType,
            "FinalMajorStatus": "Failed",
            "FinalMinorStatus": "Stalled",
            "CPUTime": lastCPUTime,
            "NormCPUTime": lastCPUTime * cpuNormalization,
            "ExecTime": lastWallTime,
            "InputDataSize": 0.0,
            "OutputDataSize": 0.0,
            "InputDataFiles": 0,
            "OutputDataFiles": 0,
            "DiskSpace": 0.0,
            "InputSandBoxSize": 0.0,
            "OutputSandBoxSize": 0.0,
            "ProcessedEvents": 0,
        }

        # For accidentally stopped jobs ExecTime can be not set
        if not acData["ExecTime"]:
            acData["ExecTime"] = acData["CPUTime"]
        elif acData["ExecTime"] < acData["CPUTime"]:
            acData["ExecTime"] = acData["CPUTime"]

        self.log.verbose("Accounting Report is:")
        self.log.verbose(acData)
        accountingReport.setValuesFromDict(acData)
        result = accountingReport.commit()
        if result["OK"]:
            self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True")
        else:
            self.log.error("Failed to send accounting report", "Job: %d, Error: %s" % (int(jobID), result["Message"]))
        return result

    def __checkHeartBeat(self, jobID, jobDict):
        """ Get info from HeartBeat
        """
        result = self.jobDB.getHeartBeatData(jobID)
        lastCPUTime = 0
        lastWallTime = 0
        lastHeartBeatTime = jobDict["StartExecTime"]
        if lastHeartBeatTime == "None":
            lastHeartBeatTime = 0

        if result["OK"]:
            for name, value, heartBeatTime in result["Value"]:
                if "CPUConsumed" == name:
                    try:
                        value = int(float(value))
                        if value > lastCPUTime:
                            lastCPUTime = value
                    except ValueError:
                        pass
                if "WallClockTime" == name:
                    try:
                        value = int(float(value))
                        if value > lastWallTime:
                            lastWallTime = value
                    except ValueError:
                        pass
                if heartBeatTime > lastHeartBeatTime:
                    lastHeartBeatTime = heartBeatTime

        return lastCPUTime, lastWallTime, lastHeartBeatTime

    def __checkLoggingInfo(self, jobID, jobDict):
        """ Get info from JobLogging
        """
        logList = []
        result = self.logDB.getJobLoggingInfo(jobID)
        if result["OK"]:
            logList = result["Value"]

        startTime = jobDict["StartExecTime"]
        if not startTime or startTime == "None":
            # status, minor, app, stime, source
            for items in logList:
                if items[0] == "Running":
                    startTime = items[3]
                    break
            if not startTime or startTime == "None":
                startTime = jobDict["SubmissionTime"]

        if type(startTime) in types.StringTypes:
            startTime = fromString(startTime)
            if startTime is None:
                self.log.error("Wrong timestamp in DB", items[3])
                startTime = dateTime()

        endTime = dateTime()
        # status, minor, app, stime, source
        for items in logList:
            if items[0] == "Stalled":
                endTime = fromString(items[3])
        if endTime is None:
            self.log.error("Wrong timestamp in DB", items[3])
            endTime = dateTime()

        return startTime, endTime

    def __kickStuckJobs(self):
        """ Reschedule jobs stuck in initialization status Rescheduled, Matched
        """
        message = ""

        checkTime = str(dateTime() - self.matchedTime * second)
        result = self.jobDB.selectJobs({"Status": "Matched"}, older=checkTime)
        if not result["OK"]:
            self.log.error("Failed to select jobs", result["Message"])
            return result

        jobIDs = result["Value"]
        if jobIDs:
            self.log.info("Rescheduling %d jobs stuck in Matched status" % len(jobIDs))
            result = self.jobDB.rescheduleJobs(jobIDs)
            if "FailedJobs" in result:
                message = "Failed to reschedule %d jobs stuck in Matched status" % len(result["FailedJobs"])

        checkTime = str(dateTime() - self.rescheduledTime * second)
        result = self.jobDB.selectJobs({"Status": "Rescheduled"}, older=checkTime)
        if not result["OK"]:
            self.log.error("Failed to select jobs", result["Message"])
            return result

        jobIDs = result["Value"]
        if jobIDs:
            self.log.info("Rescheduling %d jobs stuck in Rescheduled status" % len(jobIDs))
            result = self.jobDB.rescheduleJobs(jobIDs)
            if "FailedJobs" in result:
                if message:
                    message += "\n"
                message += "Failed to reschedule %d jobs stuck in Rescheduled status" % len(result["FailedJobs"])

        if message:
            return S_ERROR(message)
        return S_OK()

    def __failCompletedJobs(self):
        """ Failed Jobs stuck in Completed Status for a long time.
            They are due to pilots being killed during the finalization of the job execution.
        """
        # Get old Completed Jobs
        checkTime = str(dateTime() - self.completedTime * second)
        result = self.jobDB.selectJobs({"Status": "Completed"}, older=checkTime)
        if not result["OK"]:
            self.log.error("Failed to select jobs", result["Message"])
            return result

        jobIDs = result["Value"]
        if not jobIDs:
            return S_OK()

        # Skip jobs with Minor Status "Pending Requests"
        for jobID in jobIDs:
            result = self.jobDB.getJobAttributes(jobID, ["Status", "MinorStatus"])
            if not result["OK"]:
                self.log.error("Failed to get job attributes", result["Message"])
                continue
            if result["Value"]["Status"] != "Completed":
                continue
            if result["Value"]["MinorStatus"] == "Pending Requests":
                continue

            result = self.__updateJobStatus(jobID, "Failed", "Job died during finalization")
            result = self.__sendAccounting(jobID)
            if not result["OK"]:
                self.log.error("Failed to send accounting", result["Message"])
                continue

        return S_OK()