class Matcher( object ): """ Logic for matching """ def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ): """ c'tor """ if pilotAgentsDB: self.pilotAgentsDB = pilotAgentsDB else: self.pilotAgentsDB = PilotAgentsDB() if jobDB: self.jobDB = jobDB else: self.jobDB = JobDB() if tqDB: self.tqDB = tqDB else: self.tqDB = TaskQueueDB() if jlDB: self.jlDB = jlDB else: self.jlDB = JobLoggingDB() if opsHelper: self.opsHelper = opsHelper else: self.opsHelper = Operations() self.log = gLogger.getSubLogger( "Matcher" ) self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper ) def selectJob( self, resourceDescription, credDict ): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self._getResourceDict( resourceDescription, credDict ) negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] ) result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond ) if not result['OK']: return result result = result['Value'] if not result['matchFound']: self.log.info( "No match found" ) raise RuntimeError( "No match found" ) jobID = result['jobId'] resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( "No attributes returned for job" ) if not resAtt['Value']['Status'] == 'Waiting': self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) ) result = self.tqDB.deleteJob( jobID ) if not result[ 'OK' ]: return result raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) ) self._reportStatus( resourceDict, jobID ) result = self.jobDB.getJobJDL( jobID ) if not result['OK']: raise RuntimeError( "Failed to get the job JDL" ) resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime self.log.info( "Match time: [%s]" % str( matchTime ) ) gMonitor.addMark( "matchTime", matchTime ) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters( jobID ) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( 'No attributes returned for job' ) if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ): self.limiter.updateDelayCounters( resourceDict['Site'], jobID ) pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False ) if not pilotInfoReportedFlag: self._updatePilotInfo( resourceDict ) self._updatePilotJobMapping( resourceDict, jobID ) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = True return resultDict def _getResourceDict( self, resourceDescription, credDict ): """ from resourceDescription to resourceDict (just various mods) """ resourceDict = self._processResourceDescription( resourceDescription ) resourceDict = self._checkCredentials( resourceDict, credDict ) self._checkPilotVersion( resourceDict ) if not self._checkMask( resourceDict ): # Banned destinations can only take Test jobs resourceDict['JobType'] = 'Test' self.log.verbose( "Resource description:" ) for key in resourceDict: self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) ) return resourceDict def _processResourceDescription( self, resourceDescription ): """ Check and form the resource description dictionary resourceDescription is a ceDict coming from a JobAgent, for example. """ resourceDict = {} if isinstance( resourceDescription, basestring ): classAdAgent = ClassAd( resourceDescription ) if not classAdAgent.isOK(): raise ValueError( 'Illegal Resource JDL' ) self.log.verbose( classAdAgent.asJDL() ) for name in singleValueDefFields: if classAdAgent.lookupAttribute( name ): if name == 'CPUTime': resourceDict[name] = classAdAgent.getAttributeInt( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) for name in multiValueMatchFields: if classAdAgent.lookupAttribute( name ): if name == 'SubmitPool': resourceDict[name] = classAdAgent.getListFromExpression( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) # Check if a JobID is requested if classAdAgent.lookupAttribute( 'JobID' ): resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' ) for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ): if classAdAgent.lookupAttribute( k ): resourceDict[ k ] = classAdAgent.getAttributeString( k ) else: for name in singleValueDefFields: if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] for name in multiValueMatchFields: if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] if resourceDescription.has_key( 'JobID' ): resourceDict['JobID'] = resourceDescription['JobID'] for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization', 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ): if k in resourceDescription: resourceDict[ k ] = resourceDescription[ k ] return resourceDict def _reportStatus( self, resourceDict, jobID ): """ Reports the status of the matched job in jobDB and jobLoggingDB Do not fail if errors happen here """ attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site'] attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']] result = self.jobDB.setJobAttributes( jobID, attNames, attValues ) if not result['OK']: self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Set job attributes for jobID %s" % jobID ) result = self.jlDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' ) if not result['OK']: self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Added logging record for jobID %s" % jobID ) def _checkMask( self, resourceDict ): """ Check the mask: are we allowed to run normal jobs? FIXME: should we move to site OR SE? """ if not 'Site' in resourceDict: self.log.error( "Missing Site Name in Resource JDL" ) raise RuntimeError( "Missing Site Name in Resource JDL" ) # Get common site mask and check the agent site result = self.jobDB.getSiteMask( siteState = 'Active' ) if not result['OK']: self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] ) raise RuntimeError( "Internal error" ) maskList = result['Value'] if resourceDict['Site'] not in maskList: return False return True def _updatePilotInfo( self, resourceDict ): """ Update pilot information - do not fail if we don't manage to do it """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: gridCE = resourceDict.get( 'GridCE', 'Unknown' ) site = resourceDict.get( 'Site', 'Unknown' ) benchmark = resourceDict.get( 'PilotBenchmark', 0.0 ) self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) ) result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site, destination = gridCE, benchmark = benchmark ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _updatePilotJobMapping( self, resourceDict, jobID ): """ Update pilot to job mapping information """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID ) if not result['OK']: self.log.error( "Problem updating pilot information", ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _checkCredentials( self, resourceDict, credDict ): """ Check if we can get a job given the passed credentials """ if Properties.GENERIC_PILOT in credDict[ 'properties' ]: # You can only match groups in the same VO if credDict[ 'group' ] == "hosts": # for the host case the VirtualOrganization parameter # is mandatory in resourceDict vo = resourceDict.get( 'VirtualOrganization', '' ) else: vo = Registry.getVOForGroup( credDict[ 'group' ] ) result = Registry.getGroupsForVO( vo ) if result[ 'OK' ]: resourceDict[ 'OwnerGroup' ] = result[ 'Value' ] else: raise RuntimeError( result['Message'] ) else: # If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict[ 'properties' ]: self.log.notice( "Setting the resource DN to the credentials DN" ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict[ 'properties' ]: resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] self.log.notice( "Setting the resource group to the credentials group" ) if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]: ownerDN = resourceDict[ 'OwnerDN' ] result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] ) if not result[ 'OK' ]: raise RuntimeError( result['Message'] ) if credDict[ 'group' ] not in result[ 'Value' ]: # DN is not in the same group! bad boy. self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # Nothing special, group and DN have to be the same else: resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] return resourceDict def _checkPilotVersion( self, resourceDict ): """ Check the pilot DIRAC version """ if self.opsHelper.getValue( "Pilot/CheckVersion", True ): if 'ReleaseVersion' not in resourceDict: if not 'DIRACVersion' in resourceDict: raise RuntimeError( 'Version check requested and not provided by Pilot' ) else: pilotVersion = resourceDict['DIRACVersion'] else: pilotVersion = resourceDict['ReleaseVersion'] validVersions = self.opsHelper.getValue( "Pilot/Version", [] ) if validVersions and pilotVersion not in validVersions: raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \ ( pilotVersion, ",".join( validVersions ) ) ) # Check project if requested validProject = self.opsHelper.getValue( "Pilot/Project", "" ) if validProject: if 'ReleaseProject' not in resourceDict: raise RuntimeError( "Version check requested but expected project %s not received" % validProject ) if resourceDict[ 'ReleaseProject' ] != validProject: raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject, resourceDict[ 'ReleaseProject' ] ) )
class StalledJobAgent( AgentModule ): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ jobDB = None logDB = None matchedTime = 7200 rescheduledTime = 600 completedTime = 86400 ############################################################################# def initialize( self ): """Sets default parameters """ self.jobDB = JobDB() self.logDB = JobLoggingDB() self.am_setOption( 'PollingTime', 60 * 60 ) if not self.am_getOption( 'Enable', True ): self.log.info( 'Stalled Job Agent running in disabled mode' ) return S_OK() ############################################################################# def execute( self ): """ The main agent execution method """ self.log.verbose( 'Waking up Stalled Job Agent' ) wms_instance = getSystemInstance( 'WorkloadManagement' ) if not wms_instance: return S_ERROR( 'Can not get the WorkloadManagement system instance' ) wrapperSection = cfgPath( 'Systems', 'WorkloadManagement', wms_instance, 'JobWrapper' ) stalledTime = self.am_getOption( 'StalledTimeHours', 2 ) failedTime = self.am_getOption( 'FailedTimeHours', 6 ) self.matchedTime = self.am_getOption( 'MatchedTime', self.matchedTime ) self.rescheduledTime = self.am_getOption( 'RescheduledTime', self.rescheduledTime ) self.completedTime = self.am_getOption( 'CompletedTime', self.completedTime ) self.log.verbose( 'StalledTime = %s cycles' % ( stalledTime ) ) self.log.verbose( 'FailedTime = %s cycles' % ( failedTime ) ) watchdogCycle = gConfig.getValue( cfgPath( wrapperSection , 'CheckingTime' ), 30 * 60 ) watchdogCycle = max( watchdogCycle, gConfig.getValue( cfgPath( wrapperSection , 'MinCheckingTime' ), 20 * 60 ) ) # Add half cycle to avoid race conditions stalledTime = watchdogCycle * ( stalledTime + 0.5 ) failedTime = watchdogCycle * ( failedTime + 0.5 ) result = self.__markStalledJobs( stalledTime ) if not result['OK']: self.log.error( 'Failed to detect stalled jobs', result['Message'] ) #Note, jobs will be revived automatically during the heartbeat signal phase and #subsequent status changes will result in jobs not being selected by the #stalled job agent. result = self.__failStalledJobs( failedTime ) if not result['OK']: self.log.error( 'Failed to process stalled jobs', result['Message'] ) result = self.__failCompletedJobs() if not result['OK']: self.log.error( 'Failed to process completed jobs', result['Message'] ) result = self.__kickStuckJobs() if not result['OK']: self.log.error( 'Failed to kick stuck jobs', result['Message'] ) return S_OK( 'Stalled Job Agent cycle complete' ) ############################################################################# def __markStalledJobs( self, stalledTime ): """ Identifies stalled jobs running without update longer than stalledTime. """ stalledCounter = 0 runningCounter = 0 result = self.jobDB.selectJobs( {'Status':'Running'} ) if not result['OK']: return result if not result['Value']: return S_OK() jobs = result['Value'] self.log.info( '%s Running jobs will be checked for being stalled' % ( len( jobs ) ) ) jobs.sort() # jobs = jobs[:10] #for debugging for job in jobs: result = self.__getStalledJob( job, stalledTime ) if result['OK']: self.log.verbose( 'Updating status to Stalled for job %s' % ( job ) ) self.__updateJobStatus( job, 'Stalled' ) stalledCounter += 1 else: self.log.verbose( result['Message'] ) runningCounter += 1 self.log.info( 'Total jobs: %s, Stalled job count: %s, Running job count: %s' % ( len( jobs ), stalledCounter, runningCounter ) ) return S_OK() ############################################################################# def __failStalledJobs( self, failedTime ): """ Changes the Stalled status to Failed for jobs long in the Stalled status """ result = self.jobDB.selectJobs( {'Status':'Stalled'} ) if not result['OK']: return result failedCounter = 0 if result['Value']: jobs = result['Value'] self.log.info( '%s Stalled jobs will be checked for failure' % ( len( jobs ) ) ) for job in jobs: # Check if the job pilot is lost result = self.__getJobPilotStatus( job ) if result['OK']: pilotStatus = result['Value'] if pilotStatus != "Running": result = self.__updateJobStatus( job, 'Failed', "Job stalled: pilot not running" ) failedCounter += 1 result = self.__sendAccounting( job ) if not result['OK']: self.log.error( 'Failed to send accounting', result['Message'] ) break continue result = self.__getLatestUpdateTime( job ) if not result['OK']: return result currentTime = toEpoch() lastUpdate = result['Value'] elapsedTime = currentTime - lastUpdate if elapsedTime > failedTime: self.__updateJobStatus( job, 'Failed', 'Stalling for more than %d sec' % failedTime ) failedCounter += 1 result = self.__sendAccounting( job ) if not result['OK']: self.log.error( 'Failed to send accounting', result['Message'] ) break recoverCounter = 0 for minor in ["Job stalled: pilot not running", 'Stalling for more than %d sec' % failedTime]: result = self.jobDB.selectJobs( {'Status':'Failed', 'MinorStatus': minor, 'AccountedFlag': 'False' } ) if not result['OK']: return result if result['Value']: jobs = result['Value'] self.log.info( '%s Stalled jobs will be Accounted' % ( len( jobs ) ) ) for job in jobs: result = self.__sendAccounting( job ) if not result['OK']: self.log.error( 'Failed to send accounting', result['Message'] ) continue recoverCounter += 1 if not result['OK']: break if failedCounter: self.log.info( '%d jobs set to Failed' % failedCounter ) if recoverCounter: self.log.info( '%d jobs properly Accounted' % recoverCounter ) return S_OK( failedCounter ) ############################################################################# def __getJobPilotStatus( self, jobID ): """ Get the job pilot status """ result = self.jobDB.getJobParameter( jobID, 'Pilot_Reference' ) if not result['OK']: return result if not result['Value']: return S_ERROR( 'Failed to get the pilot reference' ) pilotReference = result['Value'] wmsAdminClient = RPCClient( 'WorkloadManagement/WMSAdministrator' ) result = wmsAdminClient.getPilotInfo( pilotReference ) if not result['OK']: if "No pilots found" in result['Message']: self.log.warn( result['Message'] ) return S_OK( 'NoPilot' ) self.log.error( 'Failed to get pilot information', result['Message'] ) return S_ERROR( 'Failed to get the pilot status' ) pilotStatus = result['Value'][pilotReference]['Status'] return S_OK( pilotStatus ) ############################################################################# def __getStalledJob( self, job, stalledTime ): """ Compares the most recent of LastUpdateTime and HeartBeatTime against the stalledTime limit. """ result = self.__getLatestUpdateTime( job ) if not result['OK']: return result currentTime = toEpoch() lastUpdate = result['Value'] elapsedTime = currentTime - lastUpdate self.log.verbose( '(CurrentTime-LastUpdate) = %s secs' % ( elapsedTime ) ) if elapsedTime > stalledTime: self.log.info( 'Job %s is identified as stalled with last update > %s secs ago' % ( job, elapsedTime ) ) return S_OK( 'Stalled' ) return S_ERROR( 'Job %s is running and will be ignored' % job ) ############################################################################# def __getLatestUpdateTime( self, job ): """ Returns the most recent of HeartBeatTime and LastUpdateTime """ result = self.jobDB.getJobAttributes( job, ['HeartBeatTime', 'LastUpdateTime'] ) if not result['OK']: self.log.error( 'Failed to get job attributes', result['Message'] ) if not result['OK'] or not result['Value']: self.log.error( 'Could not get attributes for job', '%s' % job ) return S_ERROR( 'Could not get attributes for job' ) self.log.verbose( result ) latestUpdate = 0 if not result['Value']['HeartBeatTime'] or result['Value']['HeartBeatTime'] == 'None': self.log.verbose( 'HeartBeatTime is null for job %s' % job ) else: latestUpdate = toEpoch( fromString( result['Value']['HeartBeatTime'] ) ) if not result['Value']['LastUpdateTime'] or result['Value']['LastUpdateTime'] == 'None': self.log.verbose( 'LastUpdateTime is null for job %s' % job ) else: lastUpdate = toEpoch( fromString( result['Value']['LastUpdateTime'] ) ) if latestUpdate < lastUpdate: latestUpdate = lastUpdate if not latestUpdate: return S_ERROR( 'LastUpdate and HeartBeat times are null for job %s' % job ) else: self.log.verbose( 'Latest update time from epoch for job %s is %s' % ( job, latestUpdate ) ) return S_OK( latestUpdate ) ############################################################################# def __updateJobStatus( self, job, status, minorstatus = None ): """ This method updates the job status in the JobDB, this should only be used to fail jobs due to the optimizer chain. """ self.log.verbose( "self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % ( job, status ) ) if self.am_getOption( 'Enable', True ): result = self.jobDB.setJobAttribute( job, 'Status', status, update = True ) else: result = S_OK( 'DisabledMode' ) if result['OK']: if minorstatus: self.log.verbose( "self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % ( job, minorstatus ) ) result = self.jobDB.setJobAttribute( job, 'MinorStatus', minorstatus, update = True ) if not minorstatus: #Retain last minor status for stalled jobs result = self.jobDB.getJobAttributes( job, ['MinorStatus'] ) if result['OK']: minorstatus = result['Value']['MinorStatus'] logStatus = status result = self.logDB.addLoggingRecord( job, status = logStatus, minor = minorstatus, source = 'StalledJobAgent' ) if not result['OK']: self.log.warn( result ) return result def __getProcessingType( self, jobID ): """ Get the Processing Type from the JDL, until it is promoted to a real Attribute """ processingType = 'unknown' result = self.jobDB.getJobJDL( jobID, original = True ) if not result['OK']: return processingType classAdJob = ClassAd( result['Value'] ) if classAdJob.lookupAttribute( 'ProcessingType' ): processingType = classAdJob.getAttributeString( 'ProcessingType' ) return processingType ############################################################################# def __sendAccounting( self, jobID ): """ Send WMS accounting data for the given job """ try: accountingReport = Job() endTime = 'Unknown' lastHeartBeatTime = 'Unknown' result = self.jobDB.getJobAttributes( jobID ) if not result['OK']: return result jobDict = result['Value'] startTime, endTime = self.__checkLoggingInfo( jobID, jobDict ) lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat( jobID, jobDict ) lastHeartBeatTime = fromString( lastHeartBeatTime ) if lastHeartBeatTime is not None and lastHeartBeatTime > endTime: endTime = lastHeartBeatTime cpuNormalization = self.jobDB.getJobParameter( jobID, 'CPUNormalizationFactor' ) if not cpuNormalization['OK'] or not cpuNormalization['Value']: cpuNormalization = 0.0 else: cpuNormalization = float( cpuNormalization['Value'] ) except Exception: self.log.exception( "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s" % ( str( jobID ), str( endTime ), str( lastHeartBeatTime ) ), '' , False ) return S_ERROR( "Exception" ) processingType = self.__getProcessingType( jobID ) accountingReport.setStartTime( startTime ) accountingReport.setEndTime( endTime ) # execTime = toEpoch( endTime ) - toEpoch( startTime ) #Fill the accounting data acData = { 'Site' : jobDict['Site'], 'User' : jobDict['Owner'], 'UserGroup' : jobDict['OwnerGroup'], 'JobGroup' : jobDict['JobGroup'], 'JobType' : jobDict['JobType'], 'JobClass' : jobDict['JobSplitType'], 'ProcessingType' : processingType, 'FinalMajorStatus' : 'Failed', 'FinalMinorStatus' : 'Stalled', 'CPUTime' : lastCPUTime, 'NormCPUTime' : lastCPUTime * cpuNormalization, 'ExecTime' : lastWallTime, 'InputDataSize' : 0.0, 'OutputDataSize' : 0.0, 'InputDataFiles' : 0, 'OutputDataFiles' : 0, 'DiskSpace' : 0.0, 'InputSandBoxSize' : 0.0, 'OutputSandBoxSize' : 0.0, 'ProcessedEvents' : 0 } # For accidentally stopped jobs ExecTime can be not set if not acData['ExecTime']: acData['ExecTime'] = acData['CPUTime'] elif acData['ExecTime'] < acData['CPUTime']: acData['ExecTime'] = acData['CPUTime'] self.log.verbose( 'Accounting Report is:' ) self.log.verbose( acData ) accountingReport.setValuesFromDict( acData ) result = accountingReport.commit() if result['OK']: self.jobDB.setJobAttribute( jobID, 'AccountedFlag', 'True' ) else: self.log.error( 'Failed to send accounting report', 'Job: %d, Error: %s' % ( int( jobID ), result['Message'] ) ) return result def __checkHeartBeat( self, jobID, jobDict ): """ Get info from HeartBeat """ result = self.jobDB.getHeartBeatData( jobID ) lastCPUTime = 0 lastWallTime = 0 lastHeartBeatTime = jobDict['StartExecTime'] if lastHeartBeatTime == "None": lastHeartBeatTime = 0 if result['OK']: for name, value, heartBeatTime in result['Value']: if 'CPUConsumed' == name: try: value = int( float( value ) ) if value > lastCPUTime: lastCPUTime = value except ValueError: pass if 'WallClockTime' == name: try: value = int( float( value ) ) if value > lastWallTime: lastWallTime = value except ValueError: pass if heartBeatTime > lastHeartBeatTime: lastHeartBeatTime = heartBeatTime return lastCPUTime, lastWallTime, lastHeartBeatTime def __checkLoggingInfo( self, jobID, jobDict ): """ Get info from JobLogging """ logList = [] result = self.logDB.getJobLoggingInfo( jobID ) if result['OK']: logList = result['Value'] startTime = jobDict['StartExecTime'] if not startTime or startTime == 'None': # status, minor, app, stime, source for items in logList: if items[0] == 'Running': startTime = items[3] break if not startTime or startTime == 'None': startTime = jobDict['SubmissionTime'] if type( startTime ) in types.StringTypes: startTime = fromString( startTime ) if startTime == None: self.log.error( 'Wrong timestamp in DB', items[3] ) startTime = dateTime() endTime = dateTime() # status, minor, app, stime, source for items in logList: if items[0] == 'Stalled': endTime = fromString( items[3] ) if endTime == None: self.log.error( 'Wrong timestamp in DB', items[3] ) endTime = dateTime() return startTime, endTime def __kickStuckJobs( self ): """ Reschedule jobs stuck in initialization status Rescheduled, Matched """ message = '' checkTime = str( dateTime() - self.matchedTime * second ) result = self.jobDB.selectJobs( {'Status':'Matched'}, older = checkTime ) if not result['OK']: self.log.error( 'Failed to select jobs', result['Message'] ) return result jobIDs = result['Value'] if jobIDs: self.log.info( 'Rescheduling %d jobs stuck in Matched status' % len( jobIDs ) ) result = self.jobDB.rescheduleJobs( jobIDs ) if 'FailedJobs' in result: message = 'Failed to reschedule %d jobs stuck in Matched status' % len( result['FailedJobs'] ) checkTime = str( dateTime() - self.rescheduledTime * second ) result = self.jobDB.selectJobs( {'Status':'Rescheduled'}, older = checkTime ) if not result['OK']: self.log.error( 'Failed to select jobs', result['Message'] ) return result jobIDs = result['Value'] if jobIDs: self.log.info( 'Rescheduling %d jobs stuck in Rescheduled status' % len( jobIDs ) ) result = self.jobDB.rescheduleJobs( jobIDs ) if 'FailedJobs' in result: if message: message += '\n' message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len( result['FailedJobs'] ) if message: return S_ERROR( message ) else: return S_OK() def __failCompletedJobs( self ): """ Failed Jobs stuck in Completed Status for a long time. They are due to pilots being killed during the finalization of the job execution. """ # Get old Completed Jobs checkTime = str( dateTime() - self.completedTime * second ) result = self.jobDB.selectJobs( {'Status':'Completed'}, older = checkTime ) if not result['OK']: self.log.error( 'Failed to select jobs', result['Message'] ) return result jobIDs = result['Value'] if not jobIDs: return S_OK() # Remove those with Minor Status "Pending Requests" for jobID in jobIDs: result = self.jobDB.getJobAttributes( jobID, ['Status','MinorStatus'] ) if not result['OK']: self.log.error( 'Failed to get job attributes', result['Message'] ) continue if result['Value']['Status'] != "Completed": continue if result['Value']['MinorStatus'] == "Pending Requests": continue result = self.__updateJobStatus( jobID, 'Failed', "Job died during finalization" ) result = self.__sendAccounting( jobID ) if not result['OK']: self.log.error( 'Failed to send accounting', result['Message'] ) continue return S_OK()
class OptimizerModule(AgentModule): """ The specific agents must provide the following methods: * initialize() for initial settings * beginExecution() * execute() - the main method called in the agent cycle * endExecution() * finalize() - the graceful exit of the method, this one is usually used for the agent restart """ ############################################################################# def __init__(self, *args, **kwargs): """ c'tor """ AgentModule.__init__(self, *args, **kwargs) self.jobDB = None self.logDB = None self.startingMinorStatus = None self.startingMajorStatus = "Checking" self.failedStatus = None self.requiredJobInfo = 'jdl' self._initResult = None def initialize(self, jobDB=None, logDB=None): """ Initialization of the Optimizer Agent. """ self.jobDB = JobDB() if jobDB is None else jobDB if not self.jobDB.isValid(): dExit(1) self.logDB = JobLoggingDB() if logDB is None else logDB optimizerName = self.am_getModuleParam('agentName') if optimizerName.endswith('Agent'): optimizerName = optimizerName[:-len('Agent')] self.am_setModuleParam('optimizerName', optimizerName) self.startingMinorStatus = self.am_getModuleParam('optimizerName') self.failedStatus = self.am_getOption("FailedJobStatus", 'Failed') self.am_setOption("PollingTime", 30) return self.initializeOptimizer() def initializeOptimizer(self): """ To be overwritten by inheriting class """ return S_OK() ############################################################################# def execute(self): """ The main agent execution method """ result = self.initializeOptimizer() if not result['OK']: return result self._initResult = result['Value'] condition = {'Status': self.startingMajorStatus} if self.startingMinorStatus: condition['MinorStatus'] = self.startingMinorStatus result = self.jobDB.selectJobs(condition) if not result['OK']: self.log.warn('Failed to get a job list from the JobDB') return S_ERROR('Failed to get a job list from the JobDB') if not result['Value']: self.log.verbose('No pending jobs to process') return S_OK('No work to do') for job in result['Value']: result = self.getJobDefinition(job) if not result['OK']: self.setFailedJob(job, result['Message'], '') continue jobDef = result['Value'] result = self.optimizeJob(job, jobDef['classad']) return S_OK() ############################################################################# def optimizeJob(self, job, classAdJob): """ Call the corresponding Optimizer checkJob method """ self.log.info('Job %s will be processed by %sAgent' % (job, self.am_getModuleParam('optimizerName'))) result = self.checkJob(job, classAdJob) if not result['OK']: self.setFailedJob(job, result['Message'], classAdJob) return result ############################################################################# def getJobDefinition(self, job, jobDef=False): """ Retrieve JDL of the Job and return jobDef dictionary """ if not jobDef: jobDef = {} # If not jdl in jobinfo load it if 'jdl' not in jobDef: if self.requiredJobInfo == 'jdlOriginal': result = self.jobDB.getJobJDL(job, original=True) if not result['OK']: self.log.error("No JDL for job", "%s" % job) return S_ERROR("No JDL for job") jobDef['jdl'] = result['Value'] if self.requiredJobInfo == 'jdl': result = self.jobDB.getJobJDL(job) if not result['OK']: self.log.error("No JDL for job", "%s" % job) return S_ERROR("No JDL for job") jobDef['jdl'] = result['Value'] # Load the classad if needed if 'jdl' in jobDef and 'classad' not in jobDef: try: classad = ClassAd(jobDef['jdl']) except BaseException: self.log.debug("Cannot load JDL") return S_ERROR('Illegal Job JDL') if not classad.isOK(): self.log.debug("Warning: illegal JDL for job %s, will be marked problematic" % (job)) return S_ERROR('Illegal Job JDL') jobDef['classad'] = classad return S_OK(jobDef) ############################################################################# def getOptimizerJobInfo(self, job, reportName): """This method gets job optimizer information that will be used for """ self.log.verbose("self.jobDB.getJobOptParameter(%s,'%s')" % (job, reportName)) result = self.jobDB.getJobOptParameter(job, reportName) if result['OK']: value = result['Value'] if not value: self.log.warn('JobDB returned null value for %s %s' % (job, reportName)) return S_ERROR('No optimizer info returned') else: try: return S_OK(eval(value)) except BaseException as x: return S_ERROR('Could not evaluate optimizer parameters: %s' % repr(x)) return result ############################################################################# def setOptimizerJobInfo(self, job, reportName, value): """This method sets the job optimizer information that will subsequently be used for job scheduling and TURL queries on the WN. """ self.log.verbose("self.jobDB.setJobOptParameter(%s,'%s','%s')" % (job, reportName, value)) if not self.am_Enabled(): return S_OK() return self.jobDB.setJobOptParameter(job, reportName, str(value)) ############################################################################# def setOptimizerChain(self, job, value): """This method sets the job optimizer chain, in principle only needed by one of the optimizers. """ self.log.verbose("self.jobDB.setOptimizerChain(%s,%s)" % (job, value)) if not self.am_Enabled(): return S_OK() return self.jobDB.setOptimizerChain(job, value) ############################################################################# def setNextOptimizer(self, job): """This method is executed when the optimizer instance has successfully processed the job. The next optimizer in the chain will subsequently start to work on the job. """ result = self.logDB.addLoggingRecord(job, status=self.startingMajorStatus, minor=self.startingMinorStatus, source=self.am_getModuleParam("optimizerName")) if not result['OK']: self.log.warn(result['Message']) self.log.verbose("self.jobDB.setNextOptimizer(%s,'%s')" % (job, self.am_getModuleParam("optimizerName"))) return self.jobDB.setNextOptimizer(job, self.am_getModuleParam("optimizerName")) ############################################################################# def updateJobStatus(self, job, status, minorStatus=None, appStatus=None): """This method updates the job status in the JobDB, this should only be used to fail jobs due to the optimizer chain. """ self.log.verbose("self.jobDB.setJobStatus(%s,'Status/Minor/Application','%s'/'%s'/'%s',update=True)" % (job, status, str(minorStatus), str(appStatus))) if not self.am_Enabled(): return S_OK() result = self.jobDB.setJobStatus(job, status, minorStatus, appStatus) if not result['OK']: return result result = self.logDB.addLoggingRecord(job, status=status, minor=minorStatus, application=appStatus, source=self.am_getModuleParam('optimizerName')) if not result['OK']: self.log.warn(result['Message']) return S_OK() ############################################################################# def setJobParam(self, job, reportName, value): """This method updates a job parameter in the JobDB. """ self.log.verbose("self.jobDB.setJobParameter(%s,'%s','%s')" % (job, reportName, value)) if not self.am_Enabled(): return S_OK() return self.jobDB.setJobParameter(job, reportName, value) ############################################################################# def setFailedJob(self, job, msg, classAdJob=None): """This method moves the job to the failed status """ self.log.verbose("self.updateJobStatus(%s,'%s','%s')" % (job, self.failedStatus, msg)) if not self.am_Enabled(): return S_OK() self.updateJobStatus(job, self.failedStatus, msg) if classAdJob: self.sendAccountingRecord(job, msg, classAdJob) ############################################################################# def checkJob(self, job, classad): """This method controls the checking of the job, should be overridden in a subclass """ self.log.warn('Optimizer: checkJob method should be implemented in a subclass') return S_ERROR('Optimizer: checkJob method should be implemented in a subclass') ############################################################################# def sendAccountingRecord(self, job, msg, classAdJob): """ Send and accounting record for the failed job """ accountingReport = AccountingJob() accountingReport.setStartTime() accountingReport.setEndTime() owner = classAdJob.getAttributeString('Owner') userGroup = classAdJob.getAttributeString('OwnerGroup') jobGroup = classAdJob.getAttributeString('JobGroup') jobType = classAdJob.getAttributeString('JobType') jobClass = 'unknown' if classAdJob.lookupAttribute('JobSplitType'): jobClass = classAdJob.getAttributeString('JobSplitType') inputData = [] processingType = 'unknown' if classAdJob.lookupAttribute('ProcessingType'): processingType = classAdJob.getAttributeString('ProcessingType') if classAdJob.lookupAttribute('InputData'): inputData = classAdJob.getListFromExpression('InputData') inputDataFiles = len(inputData) outputData = [] if classAdJob.lookupAttribute('OutputData'): outputData = classAdJob.getListFromExpression('OutputData') outputDataFiles = len(outputData) acData = { 'User': owner, 'UserGroup': userGroup, 'JobGroup': jobGroup, 'JobType': jobType, 'JobClass': jobClass, 'ProcessingType': processingType, 'FinalMajorStatus': self.failedStatus, 'FinalMinorStatus': msg, 'CPUTime': 0.0, 'NormCPUTime': 0.0, 'ExecTime': 0.0, 'InputDataSize': 0.0, 'OutputDataSize': 0.0, 'InputDataFiles': inputDataFiles, 'OutputDataFiles': outputDataFiles, 'DiskSpace': 0.0, 'InputSandBoxSize': 0.0, 'OutputSandBoxSize': 0.0, 'ProcessedEvents': 0.0 } self.log.verbose('Accounting Report is:') self.log.verbose(acData) accountingReport.setValuesFromDict(acData) return accountingReport.commit()
class JobState(object): def __init__(self, jid, source="Unknown"): self.__jid = jid self.__source = str(source) self.jobDB = JobDB() self.logDB = JobLoggingDB() self.tqDB = TaskQueueDB() @property def jid(self): return self.__jid def setSource(self, source): self.__source = source def __getDB(self): return self.jobDB def getManifest(self, rawData=False): result = self.__getDB().getJobJDL(self.__jid) if not result['OK'] or rawData: return result if not result['Value']: return S_ERROR("No manifest for job %s" % self.__jid) manifest = JobManifest() result = manifest.loadJDL(result['Value']) if not result['OK']: return result return S_OK(manifest) def setManifest(self, manifest): if not isinstance(manifest, JobManifest): manifestStr = manifest manifest = JobManifest() result = manifest.load(manifestStr) if not result['OK']: return result manifestJDL = manifest.dumpAsJDL() return self.__retryFunction(5, self.__getDB().setJobJDL, (self.__jid, manifestJDL)) # Execute traces def __retryFunction(self, retries, functor, args=False, kwargs=False): retries = max(1, retries) if not args: args = tuple() if not kwargs: kwargs = {} while retries: retries -= 1 result = functor(*args, **kwargs) if result['OK']: return result if retries == 0: return result return S_ERROR("No more retries") right_commitCache = RIGHT_GET_INFO def commitCache(self, initialState, cache, jobLog): try: self.__checkType(initialState, dict) self.__checkType(cache, dict) self.__checkType(jobLog, (list, tuple)) except TypeError as excp: return S_ERROR(str(excp)) result = self.getAttributes(initialState.keys()) if not result['OK']: return result if not result['Value'] == initialState: return S_OK(False) gLogger.verbose("Job %s: About to execute trace. Current state %s" % (self.__jid, initialState)) data = {'att': [], 'jobp': [], 'optp': []} for key in cache: for dk in data: if key.find("%s." % dk) == 0: data[dk].append((key[len(dk) + 1:], cache[key])) if data['att']: attN = [t[0] for t in data['att']] attV = [t[1] for t in data['att']] result = self.__retryFunction(5, self.jobDB.setJobAttributes, (self.__jid, attN, attV), {'update': True}) if not result['OK']: return result if data['jobp']: result = self.__retryFunction(5, self.jobDB.setJobParameters, (self.__jid, data['jobp'])) if not result['OK']: return result for k, v in data['optp']: result = self.__retryFunction(5, self.jobDB.setJobOptParameter, (self.__jid, k, v)) if not result['OK']: return result if 'inputData' in cache: result = self.__retryFunction(5, self.jobDB.setInputData, (self.__jid, cache['inputData'])) if not result['OK']: return result gLogger.verbose("Adding logging records for %s" % self.__jid) for record, updateTime, source in jobLog: gLogger.verbose("Logging records for %s: %s %s %s" % (self.__jid, record, updateTime, source)) record['date'] = updateTime record['source'] = source result = self.__retryFunction(5, self.logDB.addLoggingRecord, (self.__jid, ), record) if not result['OK']: return result gLogger.info("Job %s: Ended trace execution" % self.__jid) # We return a new initial state return self.getAttributes(initialState.keys()) # # Status # def __checkType(self, value, tList, canBeNone=False): """ Raise TypeError if the value does not have one of the expected types :param value: the value to test :param tList: type or tuple of types :param canBeNone: boolean, since there is no type for None to be used with isinstance """ if canBeNone: if value is None: return if not isinstance(value, tList): raise TypeError("%s has wrong type. Has to be one of %s" % (value, tList)) right_setStatus = RIGHT_GET_INFO def setStatus(self, majorStatus, minorStatus=None, appStatus=None, source=None, updateTime=None): try: self.__checkType(majorStatus, basestring) self.__checkType(minorStatus, basestring, canBeNone=True) self.__checkType(appStatus, basestring, canBeNone=True) self.__checkType(source, basestring, canBeNone=True) self.__checkType(updateTime, datetime.datetime, canBeNone=True) except TypeError as excp: return S_ERROR(str(excp)) result = self.jobDB.setJobStatus(self.__jid, majorStatus, minorStatus, appStatus) if not result['OK']: return result # HACK: Cause joblogging is crappy if not minorStatus: minorStatus = 'idem' if not source: source = self.__source return self.logDB.addLoggingRecord(self.__jid, majorStatus, minorStatus, appStatus, date=updateTime, source=source) right_getMinorStatus = RIGHT_GET_INFO def setMinorStatus(self, minorStatus, source=None, updateTime=None): try: self.__checkType(minorStatus, basestring) self.__checkType(source, basestring, canBeNone=True) except TypeError as excp: return S_ERROR(str(excp)) result = self.jobDB.setJobStatus(self.__jid, minor=minorStatus) if not result['OK']: return result if not source: source = self.__source return self.logDB.addLoggingRecord(self.__jid, minor=minorStatus, date=updateTime, source=source) def getStatus(self): result = self.jobDB.getJobAttributes(self.__jid, ['Status', 'MinorStatus']) if not result['OK']: return result data = result['Value'] if data: return S_OK((data['Status'], data['MinorStatus'])) return S_ERROR('Job %d not found in the JobDB' % int(self.__jid)) right_setAppStatus = RIGHT_GET_INFO def setAppStatus(self, appStatus, source=None, updateTime=None): try: self.__checkType(appStatus, basestring) self.__checkType(source, basestring, canBeNone=True) except TypeError as excp: return S_ERROR(str(excp)) result = self.jobDB.setJobStatus(self.__jid, application=appStatus) if not result['OK']: return result if not source: source = self.__source return self.logDB.addLoggingRecord(self.__jid, application=appStatus, date=updateTime, source=source) right_getAppStatus = RIGHT_GET_INFO def getAppStatus(self): result = self.jobDB.getJobAttributes(self.__jid, ['ApplicationStatus']) if result['OK']: result['Value'] = result['Value']['ApplicationStatus'] return result # Attributes right_setAttribute = RIGHT_GET_INFO def setAttribute(self, name, value): try: self.__checkType(name, basestring) self.__checkType(value, basestring) except TypeError as excp: return S_ERROR(str(excp)) return self.jobDB.setJobAttribute(self.__jid, name, value) right_setAttributes = RIGHT_GET_INFO def setAttributes(self, attDict): try: self.__checkType(attDict, dict) except TypeError as excp: return S_ERROR(str(excp)) keys = [key for key in attDict] values = [attDict[key] for key in keys] return self.jobDB.setJobAttributes(self.__jid, keys, values) right_getAttribute = RIGHT_GET_INFO def getAttribute(self, name): try: self.__checkType(name, basestring) except TypeError as excp: return S_ERROR(str(excp)) return self.jobDB.getJobAttribute(self.__jid, name) right_getAttributes = RIGHT_GET_INFO def getAttributes(self, nameList=None): try: self.__checkType(nameList, (list, tuple), canBeNone=True) except TypeError as excp: return S_ERROR(str(excp)) return self.jobDB.getJobAttributes(self.__jid, nameList) # OptimizerParameters right_setOptParameter = RIGHT_GET_INFO def setOptParameter(self, name, value): try: self.__checkType(name, basestring) self.__checkType(value, basestring) except TypeError as excp: return S_ERROR(str(excp)) return self.jobDB.setJobOptParameter(self.__jid, name, value) right_setOptParameters = RIGHT_GET_INFO def setOptParameters(self, pDict): try: self.__checkType(pDict, dict) except TypeError as excp: return S_ERROR(str(excp)) for name in pDict: result = self.jobDB.setJobOptParameter(self.__jid, name, pDict[name]) if not result['OK']: return result return S_OK() right_removeOptParameters = RIGHT_GET_INFO def removeOptParameters(self, nameList): if isinstance(nameList, basestring): nameList = [nameList] try: self.__checkType(nameList, (list, tuple)) except TypeError as excp: return S_ERROR(str(excp)) for name in nameList: result = self.jobDB.removeJobOptParameter(self.__jid, name) if not result['OK']: return result return S_OK() right_getOptParameter = RIGHT_GET_INFO def getOptParameter(self, name): try: self.__checkType(name, basestring) except TypeError as excp: return S_ERROR(str(excp)) return self.jobDB.getJobOptParameter(self.__jid, name) right_getOptParameters = RIGHT_GET_INFO def getOptParameters(self, nameList=None): try: self.__checkType(nameList, (list, tuple), canBeNone=True) except TypeError as excp: return S_ERROR(str(excp)) return self.jobDB.getJobOptParameters(self.__jid, nameList) # Other right_resetJob = RIGHT_RESCHEDULE def rescheduleJob(self, source=""): result = self.tqDB.deleteJob(self.__jid) if not result['OK']: return S_ERROR("Cannot delete from TQ job %s: %s" % (self.__jid, result['Message'])) result = self.jobDB.rescheduleJob(self.__jid) if not result['OK']: return S_ERROR("Cannot reschedule in JobDB job %s: %s" % (self.__jid, result['Message'])) self.logDB.addLoggingRecord(self.__jid, "Received", "", "", source=source) return S_OK() right_resetJob = RIGHT_RESET def resetJob(self, source=""): result = self.jobDB.setJobAttribute(self.__jid, "RescheduleCounter", -1) if not result['OK']: return S_ERROR("Cannot set the RescheduleCounter for job %s: %s" % (self.__jid, result['Message'])) result = self.tqDB.deleteJob(self.__jid) if not result['OK']: return S_ERROR("Cannot delete from TQ job %s: %s" % (self.__jid, result['Message'])) result = self.jobDB.rescheduleJob(self.__jid) if not result['OK']: return S_ERROR("Cannot reschedule in JobDB job %s: %s" % (self.__jid, result['Message'])) self.logDB.addLoggingRecord(self.__jid, "Received", "", "", source=source) return S_OK() right_getInputData = RIGHT_GET_INFO def getInputData(self): return self.jobDB.getInputData(self.__jid) @classmethod def checkInputDataStructure(cls, pDict): if not isinstance(pDict, dict): return S_ERROR("Input data has to be a dictionary") for lfn in pDict: if 'Replicas' not in pDict[lfn]: return S_ERROR("Missing replicas for lfn %s" % lfn) replicas = pDict[lfn]['Replicas'] for seName in replicas: if 'SURL' not in replicas or 'Disk' not in replicas: return S_ERROR( "Missing SURL or Disk for %s:%s replica" % (seName, lfn)) return S_OK() right_setInputData = RIGHT_GET_INFO def set_InputData(self, lfnData): result = self.checkInputDataStructure(lfnData) if not result['OK']: return result return self.jobDB.setInputData(self.__jid, lfnData) right_insertIntoTQ = RIGHT_CHANGE_STATUS def insertIntoTQ(self, manifest=None): if not manifest: result = self.getManifest() if not result['OK']: return result manifest = result['Value'] reqSection = "JobRequirements" result = manifest.getSection(reqSection) if not result['OK']: return S_ERROR("No %s section in the job manifest" % reqSection) reqCfg = result['Value'] jobReqDict = {} for name in singleValueDefFields: if name in reqCfg: if name == 'CPUTime': jobReqDict[name] = int(reqCfg[name]) else: jobReqDict[name] = reqCfg[name] for name in multiValueDefFields: if name in reqCfg: jobReqDict[name] = reqCfg.getOption(name, []) jobPriority = reqCfg.getOption('UserPriority', 1) result = self.__retryFunction(2, self.tqDB.insertJob, (self.__jid, jobReqDict, jobPriority)) if not result['OK']: errMsg = result['Message'] # Force removing the job from the TQ if it was actually inserted result = self.tqDB.deleteJob(self.__jid) if result['OK']: if result['Value']: gLogger.info("Job %s removed from the TQ" % self.__jid) return S_ERROR("Cannot insert in task queue: %s" % errMsg) return S_OK()
class Matcher( object ): """ Logic for matching """ def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ): """ c'tor """ if pilotAgentsDB: self.pilotAgentsDB = pilotAgentsDB else: self.pilotAgentsDB = PilotAgentsDB() if jobDB: self.jobDB = jobDB else: self.jobDB = JobDB() if tqDB: self.tqDB = tqDB else: self.tqDB = TaskQueueDB() if jlDB: self.jlDB = jlDB else: self.jlDB = JobLoggingDB() if opsHelper: self.opsHelper = opsHelper else: self.opsHelper = Operations() self.log = gLogger.getSubLogger( "Matcher" ) self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper ) def selectJob( self, resourceDescription, credDict ): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self._getResourceDict( resourceDescription, credDict ) negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] ) result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond ) if not result['OK']: return result result = result['Value'] if not result['matchFound']: self.log.info( "No match found" ) raise RuntimeError( "No match found" ) jobID = result['jobId'] resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( "No attributes returned for job" ) if not resAtt['Value']['Status'] == 'Waiting': self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) ) result = self.tqDB.deleteJob( jobID ) if not result[ 'OK' ]: return result raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) ) self._reportStatus( resourceDict, jobID ) result = self.jobDB.getJobJDL( jobID ) if not result['OK']: raise RuntimeError( "Failed to get the job JDL" ) resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime self.log.info( "Match time: [%s]" % str( matchTime ) ) gMonitor.addMark( "matchTime", matchTime ) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters( jobID ) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( 'No attributes returned for job' ) if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ): self.limiter.updateDelayCounters( resourceDict['Site'], jobID ) pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False ) if not pilotInfoReportedFlag: self._updatePilotInfo( resourceDict ) self._updatePilotJobMapping( resourceDict, jobID ) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = True return resultDict def _getResourceDict( self, resourceDescription, credDict ): """ from resourceDescription to resourceDict (just various mods) """ resourceDict = self._processResourceDescription( resourceDescription ) resourceDict = self._checkCredentials( resourceDict, credDict ) self._checkPilotVersion( resourceDict ) if not self._checkMask( resourceDict ): # Banned destinations can only take Test jobs resourceDict['JobType'] = 'Test' self.log.verbose( "Resource description:" ) for key in resourceDict: self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) ) return resourceDict def _processResourceDescription( self, resourceDescription ): """ Check and form the resource description dictionary resourceDescription is a ceDict coming from a JobAgent, for example. """ resourceDict = {} if type( resourceDescription ) in StringTypes: classAdAgent = ClassAd( resourceDescription ) if not classAdAgent.isOK(): raise ValueError( 'Illegal Resource JDL' ) self.log.verbose( classAdAgent.asJDL() ) for name in singleValueDefFields: if classAdAgent.lookupAttribute( name ): if name == 'CPUTime': resourceDict[name] = classAdAgent.getAttributeInt( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) for name in multiValueMatchFields: if classAdAgent.lookupAttribute( name ): if name == 'SubmitPool': resourceDict[name] = classAdAgent.getListFromExpression( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) # Check if a JobID is requested if classAdAgent.lookupAttribute( 'JobID' ): resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' ) for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ): if classAdAgent.lookupAttribute( k ): resourceDict[ k ] = classAdAgent.getAttributeString( k ) else: for name in singleValueDefFields: if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] for name in multiValueMatchFields: if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] if resourceDescription.has_key( 'JobID' ): resourceDict['JobID'] = resourceDescription['JobID'] for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization', 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ): if k in resourceDescription: resourceDict[ k ] = resourceDescription[ k ] return resourceDict def _reportStatus( self, resourceDict, jobID ): """ Reports the status of the matched job in jobDB and jobLoggingDB Do not fail if errors happen here """ attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site'] attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']] result = self.jobDB.setJobAttributes( jobID, attNames, attValues ) if not result['OK']: self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Set job attributes for jobID %s" % jobID ) result = self.jlDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' ) if not result['OK']: self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Added logging record for jobID %s" % jobID ) def _checkMask( self, resourceDict ): """ Check the mask: are we allowed to run normal jobs? FIXME: should we move to site OR SE? """ if not 'Site' in resourceDict: self.log.error( "Missing Site Name in Resource JDL" ) raise RuntimeError( "Missing Site Name in Resource JDL" ) # Get common site mask and check the agent site result = self.jobDB.getSiteMask( siteState = 'Active' ) if not result['OK']: self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] ) raise RuntimeError( "Internal error" ) maskList = result['Value'] if resourceDict['Site'] not in maskList: return False return True def _updatePilotInfo( self, resourceDict ): """ Update pilot information - do not fail if we don't manage to do it """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: gridCE = resourceDict.get( 'GridCE', 'Unknown' ) site = resourceDict.get( 'Site', 'Unknown' ) benchmark = resourceDict.get( 'PilotBenchmark', 0.0 ) self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) ) result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site, destination = gridCE, benchmark = benchmark ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _updatePilotJobMapping( self, resourceDict, jobID ): """ Update pilot to job mapping information """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID ) if not result['OK']: self.log.error( "Problem updating pilot information", ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _checkCredentials( self, resourceDict, credDict ): """ Check if we can get a job given the passed credentials """ # Check credentials if not generic pilot if Properties.GENERIC_PILOT in credDict[ 'properties' ]: # You can only match groups in the same VO vo = Registry.getVOForGroup( credDict[ 'group' ] ) result = Registry.getGroupsForVO( vo ) if result[ 'OK' ]: resourceDict[ 'OwnerGroup' ] = result[ 'Value' ] else: raise RuntimeError( result['Message'] ) else: # If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict[ 'properties' ]: self.log.notice( "Setting the resource DN to the credentials DN" ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict[ 'properties' ]: resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] self.log.notice( "Setting the resource group to the credentials group" ) if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]: ownerDN = resourceDict[ 'OwnerDN' ] result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] ) if not result[ 'OK' ]: raise RuntimeError( result['Message'] ) if credDict[ 'group' ] not in result[ 'Value' ]: # DN is not in the same group! bad boy. self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # Nothing special, group and DN have to be the same else: resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] return resourceDict def _checkPilotVersion( self, resourceDict ): """ Check the pilot DIRAC version """ if self.opsHelper.getValue( "Pilot/CheckVersion", True ): if 'ReleaseVersion' not in resourceDict: if not 'DIRACVersion' in resourceDict: raise RuntimeError( 'Version check requested and not provided by Pilot' ) else: pilotVersion = resourceDict['DIRACVersion'] else: pilotVersion = resourceDict['ReleaseVersion'] validVersions = self.opsHelper.getValue( "Pilot/Version", [] ) if validVersions and pilotVersion not in validVersions: raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \ ( pilotVersion, ",".join( validVersions ) ) ) # Check project if requested validProject = self.opsHelper.getValue( "Pilot/Project", "" ) if validProject: if 'ReleaseProject' not in resourceDict: raise RuntimeError( "Version check requested but expected project %s not received" % validProject ) if resourceDict[ 'ReleaseProject' ] != validProject: raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject, resourceDict[ 'ReleaseProject' ] ) )
class StalledJobAgent(AgentModule): """ Agent for setting Running jobs Stalled, and Stalled jobs Failed. And a few more. """ def __init__(self, *args, **kwargs): """ c'tor """ super(StalledJobAgent, self).__init__(*args, **kwargs) self.jobDB = None self.logDB = None self.matchedTime = 7200 self.rescheduledTime = 600 self.submittingTime = 300 self.stalledJobsTolerantSites = [] self.jobsQueue = Queue() ############################################################################# def initialize(self): """Sets default parameters """ self.jobDB = JobDB() self.logDB = JobLoggingDB() self.am_setOption('PollingTime', 60 * 60) if not self.am_getOption('Enable', True): self.log.info('Stalled Job Agent running in disabled mode') # setting up the threading maxNumberOfThreads = self.am_getOption('MaxNumberOfThreads', 15) threadPool = ThreadPool(maxNumberOfThreads, maxNumberOfThreads) self.log.verbose("Multithreaded with %d threads" % maxNumberOfThreads) for _ in range(maxNumberOfThreads): threadPool.generateJobAndQueueIt(self._execute) return S_OK() ############################################################################# def execute(self): """ The main agent execution method """ self.log.debug('Waking up Stalled Job Agent') # getting parameters wms_instance = getSystemInstance('WorkloadManagement') if not wms_instance: return S_ERROR('Can not get the WorkloadManagement system instance') wrapperSection = cfgPath('Systems', 'WorkloadManagement', wms_instance, 'JobWrapper') stalledTime = self.am_getOption('StalledTimeHours', 2) failedTime = self.am_getOption('FailedTimeHours', 6) self.stalledJobsTolerantSites = self.am_getOption('StalledJobsTolerantSites', []) self.stalledJobsToleranceTime = self.am_getOption('StalledJobsToleranceTime', 0) self.submittingTime = self.am_getOption('SubmittingTime', self.submittingTime) self.matchedTime = self.am_getOption('MatchedTime', self.matchedTime) self.rescheduledTime = self.am_getOption('RescheduledTime', self.rescheduledTime) self.log.verbose('', 'StalledTime = %s cycles' % (stalledTime)) self.log.verbose('', 'FailedTime = %s cycles' % (failedTime)) watchdogCycle = gConfig.getValue(cfgPath(wrapperSection, 'CheckingTime'), 30 * 60) watchdogCycle = max(watchdogCycle, gConfig.getValue(cfgPath(wrapperSection, 'MinCheckingTime'), 20 * 60)) # Add half cycle to avoid race conditions self.stalledTime = int(watchdogCycle * (stalledTime + 0.5)) self.failedTime = int(watchdogCycle * (failedTime + 0.5)) self.minorStalledStatuses = ( JobMinorStatus.STALLED_PILOT_NOT_RUNNING, 'Stalling for more than %d sec' % self.failedTime) # Now we are getting what's going to be checked # 1) Queueing the jobs that might be marked Stalled # This is the minimum time we wait for declaring a job Stalled, therefore it is safe checkTime = dateTime() - self.stalledTime * second checkedStatuses = [JobStatus.RUNNING, JobStatus.COMPLETING] # Only get jobs whose HeartBeat is older than the stalledTime result = self.jobDB.selectJobs({'Status': checkedStatuses}, older=checkTime, timeStamp='HeartBeatTime') if not result['OK']: self.log.error("Issue selecting %s jobs" % ' & '.join(checkedStatuses), result['Message']) if result['Value']: jobs = sorted(result['Value']) self.log.info('%s jobs will be checked for being stalled' % ' & '.join(checkedStatuses), '(n=%d, heartbeat before %s)' % (len(jobs), str(checkTime))) for job in jobs: self.jobsQueue.put('%s:_markStalledJobs' % job) # 2) Queueing the Stalled jobs that might be marked Failed result = self.jobDB.selectJobs({'Status': JobStatus.STALLED}) if not result['OK']: self.log.error("Issue selecting Stalled jobs", result['Message']) if result['Value']: jobs = sorted(result['Value']) self.log.info('Jobs Stalled will be checked for failure', '(n=%d)' % len(jobs)) for job in jobs: self.jobsQueue.put('%s:_failStalledJobs' % job) # 3) Send accounting for minor in self.minorStalledStatuses: result = self.jobDB.selectJobs({'Status': JobStatus.FAILED, 'MinorStatus': minor, 'AccountedFlag': 'False'}) if not result['OK']: self.log.error("Issue selecting jobs for accounting", result['Message']) if result['Value']: jobs = result['Value'] self.log.info('Stalled jobs will be Accounted', '(n=%d)' % (len(jobs))) for job in jobs: self.jobsQueue.put('%s:__sendAccounting' % job) # From here on we don't use the threads # 4) Fail submitting jobs result = self._failSubmittingJobs() if not result['OK']: self.log.error('Failed to process jobs being submitted', result['Message']) # 5) Kick stuck jobs result = self._kickStuckJobs() if not result['OK']: self.log.error('Failed to kick stuck jobs', result['Message']) return S_OK() def _execute(self): """ Doing the actual job. This is run inside the threads """ while True: job_Op = self.jobsQueue.get() jobID, jobOp = job_Op.split(':') jobID = int(jobID) res = getattr(self, '%s' % jobOp)(jobID) if not res['OK']: self.log.error("Failure executing %s" % jobOp, "on %d: %s" % (jobID, res['Message'])) ############################################################################# def _markStalledJobs(self, jobID): """ Identifies if JobID is stalled: running or completing without update longer than stalledTime. Run inside thread. """ delayTime = self.stalledTime # Add a tolerance time for some sites if required result = self.jobDB.getJobAttribute(jobID, 'site') if not result['OK']: return result site = result['Value'] if site in self.stalledJobsTolerantSites: delayTime += self.stalledJobsToleranceTime # Check if the job is really stalled result = self.__checkJobStalled(jobID, delayTime) if not result['OK']: return result self.log.verbose('Updating status to Stalled', 'for job %s' % (jobID)) return self.__updateJobStatus(jobID, JobStatus.STALLED) ############################################################################# def _failStalledJobs(self, jobID): """ Changes the Stalled status to Failed for jobs long in the Stalled status. Run inside thread. """ setFailed = False # Check if the job pilot is lost result = self.__getJobPilotStatus(jobID) if not result['OK']: self.log.error('Failed to get pilot status', "for job %d: %s" % (jobID, result['Message'])) return result pilotStatus = result['Value'] if pilotStatus != "Running": setFailed = self.minorStalledStatuses[0] else: # Verify that there was no sign of life for long enough result = self.__getLatestUpdateTime(jobID) if not result['OK']: self.log.error('Failed to get job update time', "for job %d: %s" % (jobID, result['Message'])) return result elapsedTime = toEpoch() - result['Value'] if elapsedTime > self.failedTime: setFailed = self.minorStalledStatuses[1] # Set the jobs Failed, send them a kill signal in case they are not really dead # and send accounting info if setFailed: self.__sendKillCommand(jobID) # always returns None return self.__updateJobStatus(jobID, JobStatus.FAILED, minorStatus=setFailed) return S_OK() def __getJobPilotStatus(self, jobID): """ Get the job pilot status """ result = JobMonitoringClient().getJobParameter(jobID, 'Pilot_Reference') if not result['OK']: return result pilotReference = result['Value'].get('Pilot_Reference', 'Unknown') if pilotReference == 'Unknown': # There is no pilot reference, hence its status is unknown return S_OK('NoPilot') result = PilotManagerClient().getPilotInfo(pilotReference) if not result['OK']: if DErrno.cmpError(result, DErrno.EWMSNOPILOT): self.log.warn("No pilot found", "for job %d: %s" % (jobID, result['Message'])) return S_OK('NoPilot') self.log.error('Failed to get pilot information', 'for job %d: %s' % (jobID, result['Message'])) return result pilotStatus = result['Value'][pilotReference]['Status'] return S_OK(pilotStatus) ############################################################################# def __checkJobStalled(self, job, stalledTime): """ Compares the most recent of LastUpdateTime and HeartBeatTime against the stalledTime limit. """ result = self.__getLatestUpdateTime(job) if not result['OK']: return result elapsedTime = toEpoch() - result['Value'] self.log.debug('(CurrentTime-LastUpdate) = %s secs' % (elapsedTime)) if elapsedTime > stalledTime: self.log.info('Job is identified as stalled', ": jobID %d with last update > %s secs ago" % (job, elapsedTime)) return S_OK('Stalled') return S_ERROR('Job %s is running and will be ignored' % job) ############################################################################# def __getLatestUpdateTime(self, job): """ Returns the most recent of HeartBeatTime and LastUpdateTime """ result = self.jobDB.getJobAttributes(job, ['HeartBeatTime', 'LastUpdateTime']) if not result['OK'] or not result['Value']: self.log.error('Failed to get job attributes', 'for job %d: %s' % (job, result['Message'] if 'Message' in result else 'empty')) return S_ERROR('Could not get attributes for job') latestUpdate = 0 if not result['Value']['HeartBeatTime'] or result['Value']['HeartBeatTime'] == 'None': self.log.verbose('HeartBeatTime is null', 'for job %s' % job) else: latestUpdate = toEpoch(fromString(result['Value']['HeartBeatTime'])) if not result['Value']['LastUpdateTime'] or result['Value']['LastUpdateTime'] == 'None': self.log.verbose('LastUpdateTime is null', 'for job %s' % job) else: latestUpdate = max(latestUpdate, toEpoch(fromString(result['Value']['LastUpdateTime']))) if not latestUpdate: return S_ERROR('LastUpdate and HeartBeat times are null for job %s' % job) else: self.log.verbose('', 'Latest update time from epoch for job %s is %s' % (job, latestUpdate)) return S_OK(latestUpdate) ############################################################################# def __updateJobStatus(self, job, status, minorStatus=None): """ This method updates the job status in the JobDB """ if not self.am_getOption('Enable', True): return S_OK('Disabled') toRet = S_OK() self.log.debug("self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % (job, status)) result = self.jobDB.setJobAttribute(job, 'Status', status, update=True) if not result['OK']: self.log.error("Failed setting Status", "%s for job %d: %s" % (status, job, result['Message'])) toRet = result if minorStatus: self.log.debug("self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % (job, minorStatus)) result = self.jobDB.setJobAttribute(job, 'MinorStatus', minorStatus, update=True) if not result['OK']: self.log.error("Failed setting MinorStatus", "%s for job %d: %s" % (minorStatus, job, result['Message'])) toRet = result if not minorStatus: # Retain last minor status for stalled jobs result = self.jobDB.getJobAttributes(job, ['MinorStatus']) if result['OK']: minorStatus = result['Value']['MinorStatus'] else: self.log.error("Failed getting MinorStatus", "for job %d: %s" % (job, result['Message'])) minorStatus = 'idem' toRet = result result = self.logDB.addLoggingRecord(job, status=status, minorStatus=minorStatus, source='StalledJobAgent') if not result['OK']: self.log.warn("Failed adding logging record", result['Message']) toRet = result return toRet def __getProcessingType(self, jobID): """ Get the Processing Type from the JDL, until it is promoted to a real Attribute """ processingType = 'unknown' result = self.jobDB.getJobJDL(jobID, original=True) if not result['OK']: return processingType classAdJob = ClassAd(result['Value']) if classAdJob.lookupAttribute('ProcessingType'): processingType = classAdJob.getAttributeString('ProcessingType') return processingType def __sendAccounting(self, jobID): """ Send WMS accounting data for the given job. Run inside thread. """ try: accountingReport = Job() endTime = 'Unknown' lastHeartBeatTime = 'Unknown' result = self.jobDB.getJobAttributes(jobID) if not result['OK']: return result jobDict = result['Value'] startTime, endTime = self.__checkLoggingInfo(jobID, jobDict) lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict) lastHeartBeatTime = fromString(lastHeartBeatTime) if lastHeartBeatTime is not None and lastHeartBeatTime > endTime: endTime = lastHeartBeatTime result = JobMonitoringClient().getJobParameter(jobID, 'CPUNormalizationFactor') if not result['OK'] or not result['Value']: self.log.error( 'Error getting Job Parameter CPUNormalizationFactor, setting 0', result.get( 'Message', 'No such value')) cpuNormalization = 0.0 else: cpuNormalization = float(result['Value'].get('CPUNormalizationFactor')) except Exception as e: self.log.exception("Exception in __sendAccounting", "for job=%s: endTime=%s, lastHBTime=%s" % (str(jobID), str(endTime), str(lastHeartBeatTime)), lException=e) return S_ERROR("Exception") processingType = self.__getProcessingType(jobID) accountingReport.setStartTime(startTime) accountingReport.setEndTime(endTime) # execTime = toEpoch( endTime ) - toEpoch( startTime ) # Fill the accounting data acData = {'Site': jobDict['Site'], 'User': jobDict['Owner'], 'UserGroup': jobDict['OwnerGroup'], 'JobGroup': jobDict['JobGroup'], 'JobType': jobDict['JobType'], 'JobClass': jobDict['JobSplitType'], 'ProcessingType': processingType, 'FinalMajorStatus': 'Failed', 'FinalMinorStatus': 'Stalled', 'CPUTime': lastCPUTime, 'NormCPUTime': lastCPUTime * cpuNormalization, 'ExecTime': lastWallTime, 'InputDataSize': 0.0, 'OutputDataSize': 0.0, 'InputDataFiles': 0, 'OutputDataFiles': 0, 'DiskSpace': 0.0, 'InputSandBoxSize': 0.0, 'OutputSandBoxSize': 0.0, 'ProcessedEvents': 0 } # For accidentally stopped jobs ExecTime can be not set if not acData['ExecTime']: acData['ExecTime'] = acData['CPUTime'] elif acData['ExecTime'] < acData['CPUTime']: acData['ExecTime'] = acData['CPUTime'] self.log.verbose('Accounting Report is:') self.log.verbose(acData) accountingReport.setValuesFromDict(acData) result = accountingReport.commit() if result['OK']: self.jobDB.setJobAttribute(jobID, 'AccountedFlag', 'True') else: self.log.error('Failed to send accounting report', 'Job: %d, Error: %s' % (int(jobID), result['Message'])) return result def __checkHeartBeat(self, jobID, jobDict): """ Get info from HeartBeat """ result = self.jobDB.getHeartBeatData(jobID) lastCPUTime = 0 lastWallTime = 0 lastHeartBeatTime = jobDict['StartExecTime'] if lastHeartBeatTime == "None": lastHeartBeatTime = 0 if result['OK']: for name, value, heartBeatTime in result['Value']: if name == 'CPUConsumed': try: value = int(float(value)) if value > lastCPUTime: lastCPUTime = value except ValueError: pass if name == 'WallClockTime': try: value = int(float(value)) if value > lastWallTime: lastWallTime = value except ValueError: pass if heartBeatTime > lastHeartBeatTime: lastHeartBeatTime = heartBeatTime return lastCPUTime, lastWallTime, lastHeartBeatTime def __checkLoggingInfo(self, jobID, jobDict): """ Get info from JobLogging """ logList = [] result = self.logDB.getJobLoggingInfo(jobID) if result['OK']: logList = result['Value'] startTime = jobDict['StartExecTime'] if not startTime or startTime == 'None': # status, minor, app, stime, source for items in logList: if items[0] == 'Running': startTime = items[3] break if not startTime or startTime == 'None': startTime = jobDict['SubmissionTime'] if isinstance(startTime, six.string_types): startTime = fromString(startTime) if startTime is None: self.log.error('Wrong timestamp in DB', items[3]) startTime = dateTime() endTime = dateTime() # status, minor, app, stime, source for items in logList: if items[0] == 'Stalled': endTime = fromString(items[3]) if endTime is None: self.log.error('Wrong timestamp in DB', items[3]) endTime = dateTime() return startTime, endTime def _kickStuckJobs(self): """ Reschedule jobs stuck in initialization status Rescheduled, Matched """ message = '' checkTime = dateTime() - self.matchedTime * second result = self.jobDB.selectJobs({'Status': JobStatus.MATCHED}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result jobIDs = result['Value'] if jobIDs: self.log.info('Rescheduling %d jobs stuck in Matched status' % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if 'FailedJobs' in result: message = 'Failed to reschedule %d jobs stuck in Matched status' % len(result['FailedJobs']) checkTime = dateTime() - self.rescheduledTime * second result = self.jobDB.selectJobs({'Status': 'Rescheduled'}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result jobIDs = result['Value'] if jobIDs: self.log.info('Rescheduling %d jobs stuck in Rescheduled status' % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if 'FailedJobs' in result: if message: message += '\n' message += 'Failed to reschedule %d jobs stuck in Rescheduled status' % len(result['FailedJobs']) if message: return S_ERROR(message) return S_OK() def _failSubmittingJobs(self): """ Failed Jobs stuck in Submitting Status for a long time. They are due to a failed bulk submission transaction. """ # Get old Submitting Jobs checkTime = dateTime() - self.submittingTime * second result = self.jobDB.selectJobs({'Status': JobStatus.SUBMITTING}, older=checkTime) if not result['OK']: self.log.error('Failed to select jobs', result['Message']) return result for jobID in result['Value']: result = self.__updateJobStatus(jobID, JobStatus.FAILED) if not result['OK']: self.log.error('Failed to update job status', result['Message']) continue return S_OK() def __sendKillCommand(self, job): """Send a kill signal to the job such that it cannot continue running. :param int job: ID of job to send kill command """ ownerDN = self.jobDB.getJobAttribute(job, 'OwnerDN') ownerGroup = self.jobDB.getJobAttribute(job, 'OwnerGroup') if ownerDN['OK'] and ownerGroup['OK']: wmsClient = WMSClient(useCertificates=True, delegatedDN=ownerDN['Value'], delegatedGroup=ownerGroup['Value']) resKill = wmsClient.killJob(job) if not resKill['OK']: self.log.error("Failed to send kill command to job", "%s: %s" % (job, resKill['Message'])) else: self.log.error("Failed to get ownerDN or Group for job:", "%s: %s, %s" % (job, ownerDN.get('Message', ''), ownerGroup.get('Message', '')))
class Matcher(object): """Logic for matching""" def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None, pilotRef=None): """c'tor""" if pilotAgentsDB: self.pilotAgentsDB = pilotAgentsDB else: self.pilotAgentsDB = PilotAgentsDB() if jobDB: self.jobDB = jobDB else: self.jobDB = JobDB() if tqDB: self.tqDB = tqDB else: self.tqDB = TaskQueueDB() if jlDB: self.jlDB = jlDB else: self.jlDB = JobLoggingDB() if opsHelper: self.opsHelper = opsHelper else: self.opsHelper = Operations() if pilotRef: self.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) self.pilotAgentsDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) self.jobDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) self.tqDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) self.jlDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) else: self.log = gLogger.getSubLogger("Matcher") self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper, pilotRef=pilotRef) self.siteClient = SiteStatus() def selectJob(self, resourceDescription, credDict): """Main job selection function to find the highest priority job matching the resource capacity""" startTime = time.time() resourceDict = self._getResourceDict(resourceDescription, credDict) # Make a nice print of the resource matching parameters toPrintDict = dict(resourceDict) if "MaxRAM" in resourceDescription: toPrintDict["MaxRAM"] = resourceDescription["MaxRAM"] if "NumberOfProcessors" in resourceDescription: toPrintDict["NumberOfProcessors"] = resourceDescription[ "NumberOfProcessors"] toPrintDict["Tag"] = [] if "Tag" in resourceDict: for tag in resourceDict["Tag"]: if not tag.endswith("GB") and not tag.endswith("Processors"): toPrintDict["Tag"].append(tag) if not toPrintDict["Tag"]: toPrintDict.pop("Tag") self.log.info("Resource description for matching", printDict(toPrintDict)) negativeCond = self.limiter.getNegativeCondForSite( resourceDict["Site"], resourceDict.get("GridCE")) result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond) if not result["OK"]: raise RuntimeError(result["Message"]) result = result["Value"] if not result["matchFound"]: self.log.info("No match found") return {} jobID = result["jobId"] resAtt = self.jobDB.getJobAttributes( jobID, ["OwnerDN", "OwnerGroup", "Status"]) if not resAtt["OK"]: raise RuntimeError("Could not retrieve job attributes") if not resAtt["Value"]: raise RuntimeError("No attributes returned for job") if not resAtt["Value"]["Status"] == "Waiting": self.log.error("Job matched by the TQ is not in Waiting state", str(jobID)) result = self.tqDB.deleteJob(jobID) if not result["OK"]: raise RuntimeError(result["Message"]) raise RuntimeError("Job %s is not in Waiting state" % str(jobID)) self._reportStatus(resourceDict, jobID) result = self.jobDB.getJobJDL(jobID) if not result["OK"]: raise RuntimeError("Failed to get the job JDL") resultDict = {} resultDict["JDL"] = result["Value"] resultDict["JobID"] = jobID matchTime = time.time() - startTime self.log.verbose("Match time", "[%s]" % str(matchTime)) gMonitor.addMark("matchTime", matchTime) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters(jobID) if resOpt["OK"]: for key, value in resOpt["Value"].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup"]) if not resAtt["OK"]: raise RuntimeError("Could not retrieve job attributes") if not resAtt["Value"]: raise RuntimeError("No attributes returned for job") if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True): self.limiter.updateDelayCounters(resourceDict["Site"], jobID) pilotInfoReportedFlag = resourceDict.get("PilotInfoReportedFlag", False) if not pilotInfoReportedFlag: self._updatePilotInfo(resourceDict) self._updatePilotJobMapping(resourceDict, jobID) resultDict["DN"] = resAtt["Value"]["OwnerDN"] resultDict["Group"] = resAtt["Value"]["OwnerGroup"] resultDict["PilotInfoReportedFlag"] = True return resultDict def _getResourceDict(self, resourceDescription, credDict): """from resourceDescription to resourceDict (just various mods)""" resourceDict = self._processResourceDescription(resourceDescription) resourceDict = self._checkCredentials(resourceDict, credDict) self._checkPilotVersion(resourceDict) if not self._checkMask(resourceDict): # Banned destinations can only take Test jobs resourceDict["JobType"] = "Test" self.log.verbose("Resource description") for key in resourceDict: self.log.debug("%s : %s" % (key.rjust(20), resourceDict[key])) return resourceDict def _processResourceDescription(self, resourceDescription): """Check and form the resource description dictionary :param resourceDescription: a ceDict coming from a JobAgent, for example. :return: updated dictionary of resource description parameters """ resourceDict = {} for name in singleValueDefFields: if name in resourceDescription: resourceDict[name] = resourceDescription[name] for name in multiValueMatchFields: if name in resourceDescription: resourceDict[name] = resourceDescription[name] if resourceDescription.get("Tag"): tags = resourceDescription["Tag"] resourceDict["Tag"] = (tags if isinstance(tags, list) else list( {tag.strip("\"' ") for tag in tags.strip("[]").split(",")})) if "RequiredTag" in resourceDescription: requiredTagsList = (list({ tag.strip("\"' ") for tag in resourceDescription["RequiredTag"].strip( "[]").split(",") }) if isinstance(resourceDescription["RequiredTag"], str) else resourceDescription["RequiredTag"]) resourceDict["RequiredTag"] = requiredTagsList if "JobID" in resourceDescription: resourceDict["JobID"] = resourceDescription["JobID"] # Convert MaxRAM and NumberOfProcessors parameters into a list of tags maxRAM = resourceDescription.get("MaxRAM") if maxRAM: try: maxRAM = int(maxRAM / 1000) except ValueError: maxRAM = None nProcessors = resourceDescription.get("NumberOfProcessors") if nProcessors: try: nProcessors = int(nProcessors) except ValueError: nProcessors = None for param, key in [(maxRAM, "GB"), (nProcessors, "Processors")]: if param and param <= 1024: paramList = list(range(2, param + 1)) paramTags = ["%d%s" % (par, key) for par in paramList] if paramTags: resourceDict.setdefault("Tag", []).extend(paramTags) # Add 'MultiProcessor' to the list of tags if nProcessors and nProcessors > 1: resourceDict.setdefault("Tag", []).append("MultiProcessor") # Add 'WholeNode' to the list of tags if "WholeNode" in resourceDescription: resourceDict.setdefault("Tag", []).append("WholeNode") if "Tag" in resourceDict: resourceDict["Tag"] = list(set(resourceDict["Tag"])) if "RequiredTag" in resourceDict: resourceDict["RequiredTag"] = list(set( resourceDict["RequiredTag"])) for k in ( "DIRACVersion", "ReleaseVersion", "ReleaseProject", "VirtualOrganization", "PilotReference", "PilotBenchmark", "PilotInfoReportedFlag", ): if k in resourceDescription: resourceDict[k] = resourceDescription[k] return resourceDict def _reportStatus(self, resourceDict, jobID): """Reports the status of the matched job in jobDB and jobLoggingDB Do not fail if errors happen here """ attNames = ["Status", "MinorStatus", "ApplicationStatus", "Site"] attValues = ["Matched", "Assigned", "Unknown", resourceDict["Site"]] result = self.jobDB.setJobAttributes(jobID, attNames, attValues) if not result["OK"]: self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % (jobID, result["Message"])) else: self.log.verbose("Set job attributes for jobID", jobID) result = self.jlDB.addLoggingRecord(jobID, status=JobStatus.MATCHED, minorStatus="Assigned", source="Matcher") if not result["OK"]: self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % (jobID, result["Message"])) else: self.log.verbose("Added logging record for jobID", jobID) def _checkMask(self, resourceDict): """Check the mask: are we allowed to run normal jobs? FIXME: should we move to site OR SE? """ if "Site" not in resourceDict: self.log.error("Missing Site Name in Resource JDL") raise RuntimeError("Missing Site Name in Resource JDL") # Check if site is allowed result = self.siteClient.getUsableSites(resourceDict["Site"]) if not result["OK"]: self.log.error("Internal error", "siteClient.getUsableSites: %s" % result["Message"]) raise RuntimeError("Internal error") if resourceDict["Site"] not in result["Value"]: return False return True def _updatePilotInfo(self, resourceDict): """Update pilot information - do not fail if we don't manage to do it""" pilotReference = resourceDict.get("PilotReference", "") if pilotReference and pilotReference != "Unknown": gridCE = resourceDict.get("GridCE", "Unknown") site = resourceDict.get("Site", "Unknown") benchmark = resourceDict.get("PilotBenchmark", 0.0) self.log.verbose( "Reporting pilot info", "for %s: gridCE=%s, site=%s, benchmark=%f" % (pilotReference, gridCE, site, benchmark), ) result = self.pilotAgentsDB.setPilotStatus( pilotReference, status=PilotStatus.RUNNING, gridSite=site, destination=gridCE, benchmark=benchmark) if not result["OK"]: self.log.warn( "Problem updating pilot information", "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result["Message"]), ) def _updatePilotJobMapping(self, resourceDict, jobID): """Update pilot to job mapping information""" pilotReference = resourceDict.get("PilotReference", "") if pilotReference and pilotReference != "Unknown": result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID) if not result["OK"]: self.log.error( "Problem updating pilot information", ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result["Message"]), ) result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False) if not result["OK"]: self.log.error( "Problem updating pilot information", "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result["Message"]), ) def _checkCredentials(self, resourceDict, credDict): """Check if we can get a job given the passed credentials""" if Properties.GENERIC_PILOT in credDict["properties"]: # You can only match groups in the same VO if credDict["group"] == "hosts": # for the host case the VirtualOrganization parameter # is mandatory in resourceDict vo = resourceDict.get("VirtualOrganization", "") else: vo = Registry.getVOForGroup(credDict["group"]) if "OwnerGroup" not in resourceDict: result = Registry.getGroupsForVO(vo) if result["OK"]: resourceDict["OwnerGroup"] = result["Value"] else: raise RuntimeError(result["Message"]) else: # If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict["properties"]: self.log.notice( "Setting the resource DN to the credentials DN") resourceDict["OwnerDN"] = credDict["DN"] # If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict["properties"]: resourceDict["OwnerGroup"] = credDict["group"] self.log.notice( "Setting the resource group to the credentials group") if "OwnerDN" in resourceDict and resourceDict[ "OwnerDN"] != credDict["DN"]: ownerDN = resourceDict["OwnerDN"] result = Registry.getGroupsForDN(resourceDict["OwnerDN"]) if not result["OK"]: raise RuntimeError(result["Message"]) if credDict["group"] not in result["Value"]: # DN is not in the same group! bad boy. self.log.warn( "You cannot request jobs from this DN, as it does not belong to your group!", "(%s)" % ownerDN, ) resourceDict["OwnerDN"] = credDict["DN"] # Nothing special, group and DN have to be the same else: resourceDict["OwnerDN"] = credDict["DN"] resourceDict["OwnerGroup"] = credDict["group"] return resourceDict def _checkPilotVersion(self, resourceDict): """Check the pilot DIRAC version""" if self.opsHelper.getValue("Pilot/CheckVersion", True): if "ReleaseVersion" not in resourceDict: if "DIRACVersion" not in resourceDict: raise PilotVersionError( "Version check requested and not provided by Pilot") else: pilotVersion = resourceDict["DIRACVersion"] else: pilotVersion = resourceDict["ReleaseVersion"] validVersions = [ convertToPy3VersionNumber(newStyleVersion) for newStyleVersion in self.opsHelper.getValue( "Pilot/Version", []) ] if validVersions and convertToPy3VersionNumber( pilotVersion) not in validVersions: raise PilotVersionError( "Pilot version does not match the production version: %s not in ( %s )" % (pilotVersion, ",".join(validVersions))) # Check project if requested validProject = self.opsHelper.getValue("Pilot/Project", "") if validProject: if "ReleaseProject" not in resourceDict: raise PilotVersionError( "Version check requested but expected project %s not received" % validProject) if resourceDict["ReleaseProject"] != validProject: raise PilotVersionError( "Version check requested but expected project %s != received %s" % (validProject, resourceDict["ReleaseProject"]))
class StalledJobAgent(AgentModule): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ jobDB = None logDB = None matchedTime = 7200 rescheduledTime = 600 completedTime = 86400 ############################################################################# def initialize(self): """Sets default parameters """ self.jobDB = JobDB() self.logDB = JobLoggingDB() self.am_setOption("PollingTime", 60 * 60) if not self.am_getOption("Enable", True): self.log.info("Stalled Job Agent running in disabled mode") return S_OK() ############################################################################# def execute(self): """ The main agent execution method """ self.log.verbose("Waking up Stalled Job Agent") wms_instance = getSystemInstance("WorkloadManagement") if not wms_instance: return S_ERROR("Can not get the WorkloadManagement system instance") wrapperSection = cfgPath("Systems", "WorkloadManagement", wms_instance, "JobWrapper") stalledTime = self.am_getOption("StalledTimeHours", 2) failedTime = self.am_getOption("FailedTimeHours", 6) self.matchedTime = self.am_getOption("MatchedTime", self.matchedTime) self.rescheduledTime = self.am_getOption("RescheduledTime", self.rescheduledTime) self.completedTime = self.am_getOption("CompletedTime", self.completedTime) self.log.verbose("StalledTime = %s cycles" % (stalledTime)) self.log.verbose("FailedTime = %s cycles" % (failedTime)) watchdogCycle = gConfig.getValue(cfgPath(wrapperSection, "CheckingTime"), 30 * 60) watchdogCycle = max(watchdogCycle, gConfig.getValue(cfgPath(wrapperSection, "MinCheckingTime"), 20 * 60)) # Add half cycle to avoid race conditions stalledTime = watchdogCycle * (stalledTime + 0.5) failedTime = watchdogCycle * (failedTime + 0.5) result = self.__markStalledJobs(stalledTime) if not result["OK"]: self.log.error("Failed to detect stalled jobs", result["Message"]) # Note, jobs will be revived automatically during the heartbeat signal phase and # subsequent status changes will result in jobs not being selected by the # stalled job agent. result = self.__failStalledJobs(failedTime) if not result["OK"]: self.log.error("Failed to process stalled jobs", result["Message"]) result = self.__failCompletedJobs() if not result["OK"]: self.log.error("Failed to process completed jobs", result["Message"]) result = self.__kickStuckJobs() if not result["OK"]: self.log.error("Failed to kick stuck jobs", result["Message"]) return S_OK("Stalled Job Agent cycle complete") ############################################################################# def __markStalledJobs(self, stalledTime): """ Identifies stalled jobs running without update longer than stalledTime. """ stalledCounter = 0 runningCounter = 0 result = self.jobDB.selectJobs({"Status": "Running"}) if not result["OK"]: return result if not result["Value"]: return S_OK() jobs = result["Value"] self.log.info("%s Running jobs will be checked for being stalled" % (len(jobs))) jobs.sort() # jobs = jobs[:10] #for debugging for job in jobs: result = self.__getStalledJob(job, stalledTime) if result["OK"]: self.log.verbose("Updating status to Stalled for job %s" % (job)) self.__updateJobStatus(job, "Stalled") stalledCounter += 1 else: self.log.verbose(result["Message"]) runningCounter += 1 self.log.info( "Total jobs: %s, Stalled job count: %s, Running job count: %s" % (len(jobs), stalledCounter, runningCounter) ) return S_OK() ############################################################################# def __failStalledJobs(self, failedTime): """ Changes the Stalled status to Failed for jobs long in the Stalled status """ result = self.jobDB.selectJobs({"Status": "Stalled"}) if not result["OK"]: return result jobs = result["Value"] failedCounter = 0 minorStalledStatuses = ("Job stalled: pilot not running", "Stalling for more than %d sec" % failedTime) if jobs: self.log.info("%s Stalled jobs will be checked for failure" % (len(jobs))) for job in jobs: setFailed = False # Check if the job pilot is lost result = self.__getJobPilotStatus(job) if not result["OK"]: self.log.error("Failed to get pilot status", result["Message"]) continue pilotStatus = result["Value"] if pilotStatus != "Running": setFailed = minorStalledStatuses[0] else: result = self.__getLatestUpdateTime(job) if not result["OK"]: self.log.error("Failed to get job update time", result["Message"]) continue elapsedTime = toEpoch() - result["Value"] if elapsedTime > failedTime: setFailed = minorStalledStatuses[1] # Set the jobs Failed, send them a kill signal in case they are not really dead and send accounting info if setFailed: # Send a kill signal to the job such that it cannot continue running WMSClient().killJob(job) self.__updateJobStatus(job, "Failed", setFailed) failedCounter += 1 result = self.__sendAccounting(job) if not result["OK"]: self.log.error("Failed to send accounting", result["Message"]) recoverCounter = 0 for minor in minorStalledStatuses: result = self.jobDB.selectJobs({"Status": "Failed", "MinorStatus": minor, "AccountedFlag": "False"}) if not result["OK"]: return result if result["Value"]: jobs = result["Value"] self.log.info("%s Stalled jobs will be Accounted" % (len(jobs))) for job in jobs: result = self.__sendAccounting(job) if not result["OK"]: self.log.error("Failed to send accounting", result["Message"]) continue recoverCounter += 1 if not result["OK"]: break if failedCounter: self.log.info("%d jobs set to Failed" % failedCounter) if recoverCounter: self.log.info("%d jobs properly Accounted" % recoverCounter) return S_OK(failedCounter) ############################################################################# def __getJobPilotStatus(self, jobID): """ Get the job pilot status """ result = self.jobDB.getJobParameter(jobID, "Pilot_Reference") if not result["OK"]: return result if not result["Value"]: return S_ERROR("Failed to get the pilot reference") pilotReference = result["Value"] wmsAdminClient = RPCClient("WorkloadManagement/WMSAdministrator") result = wmsAdminClient.getPilotInfo(pilotReference) if not result["OK"]: if "No pilots found" in result["Message"]: self.log.warn(result["Message"]) return S_OK("NoPilot") self.log.error("Failed to get pilot information", result["Message"]) return S_ERROR("Failed to get the pilot status") pilotStatus = result["Value"][pilotReference]["Status"] return S_OK(pilotStatus) ############################################################################# def __getStalledJob(self, job, stalledTime): """ Compares the most recent of LastUpdateTime and HeartBeatTime against the stalledTime limit. """ result = self.__getLatestUpdateTime(job) if not result["OK"]: return result currentTime = toEpoch() lastUpdate = result["Value"] elapsedTime = currentTime - lastUpdate self.log.verbose("(CurrentTime-LastUpdate) = %s secs" % (elapsedTime)) if elapsedTime > stalledTime: self.log.info("Job %s is identified as stalled with last update > %s secs ago" % (job, elapsedTime)) return S_OK("Stalled") return S_ERROR("Job %s is running and will be ignored" % job) ############################################################################# def __getLatestUpdateTime(self, job): """ Returns the most recent of HeartBeatTime and LastUpdateTime """ result = self.jobDB.getJobAttributes(job, ["HeartBeatTime", "LastUpdateTime"]) if not result["OK"]: self.log.error("Failed to get job attributes", result["Message"]) if not result["OK"] or not result["Value"]: self.log.error("Could not get attributes for job", "%s" % job) return S_ERROR("Could not get attributes for job") self.log.verbose(result) latestUpdate = 0 if not result["Value"]["HeartBeatTime"] or result["Value"]["HeartBeatTime"] == "None": self.log.verbose("HeartBeatTime is null for job %s" % job) else: latestUpdate = toEpoch(fromString(result["Value"]["HeartBeatTime"])) if not result["Value"]["LastUpdateTime"] or result["Value"]["LastUpdateTime"] == "None": self.log.verbose("LastUpdateTime is null for job %s" % job) else: lastUpdate = toEpoch(fromString(result["Value"]["LastUpdateTime"])) if latestUpdate < lastUpdate: latestUpdate = lastUpdate if not latestUpdate: return S_ERROR("LastUpdate and HeartBeat times are null for job %s" % job) else: self.log.verbose("Latest update time from epoch for job %s is %s" % (job, latestUpdate)) return S_OK(latestUpdate) ############################################################################# def __updateJobStatus(self, job, status, minorstatus=None): """ This method updates the job status in the JobDB, this should only be used to fail jobs due to the optimizer chain. """ self.log.verbose("self.jobDB.setJobAttribute(%s,'Status','%s',update=True)" % (job, status)) if self.am_getOption("Enable", True): result = self.jobDB.setJobAttribute(job, "Status", status, update=True) else: result = S_OK("DisabledMode") if result["OK"]: if minorstatus: self.log.verbose("self.jobDB.setJobAttribute(%s,'MinorStatus','%s',update=True)" % (job, minorstatus)) result = self.jobDB.setJobAttribute(job, "MinorStatus", minorstatus, update=True) if not minorstatus: # Retain last minor status for stalled jobs result = self.jobDB.getJobAttributes(job, ["MinorStatus"]) if result["OK"]: minorstatus = result["Value"]["MinorStatus"] logStatus = status result = self.logDB.addLoggingRecord(job, status=logStatus, minor=minorstatus, source="StalledJobAgent") if not result["OK"]: self.log.warn(result) return result def __getProcessingType(self, jobID): """ Get the Processing Type from the JDL, until it is promoted to a real Attribute """ processingType = "unknown" result = self.jobDB.getJobJDL(jobID, original=True) if not result["OK"]: return processingType classAdJob = ClassAd(result["Value"]) if classAdJob.lookupAttribute("ProcessingType"): processingType = classAdJob.getAttributeString("ProcessingType") return processingType ############################################################################# def __sendAccounting(self, jobID): """ Send WMS accounting data for the given job """ try: accountingReport = Job() endTime = "Unknown" lastHeartBeatTime = "Unknown" result = self.jobDB.getJobAttributes(jobID) if not result["OK"]: return result jobDict = result["Value"] startTime, endTime = self.__checkLoggingInfo(jobID, jobDict) lastCPUTime, lastWallTime, lastHeartBeatTime = self.__checkHeartBeat(jobID, jobDict) lastHeartBeatTime = fromString(lastHeartBeatTime) if lastHeartBeatTime is not None and lastHeartBeatTime > endTime: endTime = lastHeartBeatTime cpuNormalization = self.jobDB.getJobParameter(jobID, "CPUNormalizationFactor") if not cpuNormalization["OK"] or not cpuNormalization["Value"]: cpuNormalization = 0.0 else: cpuNormalization = float(cpuNormalization["Value"]) except Exception: self.log.exception( "Exception in __sendAccounting for job %s: endTime=%s, lastHBTime %s" % (str(jobID), str(endTime), str(lastHeartBeatTime)), "", False, ) return S_ERROR("Exception") processingType = self.__getProcessingType(jobID) accountingReport.setStartTime(startTime) accountingReport.setEndTime(endTime) # execTime = toEpoch( endTime ) - toEpoch( startTime ) # Fill the accounting data acData = { "Site": jobDict["Site"], "User": jobDict["Owner"], "UserGroup": jobDict["OwnerGroup"], "JobGroup": jobDict["JobGroup"], "JobType": jobDict["JobType"], "JobClass": jobDict["JobSplitType"], "ProcessingType": processingType, "FinalMajorStatus": "Failed", "FinalMinorStatus": "Stalled", "CPUTime": lastCPUTime, "NormCPUTime": lastCPUTime * cpuNormalization, "ExecTime": lastWallTime, "InputDataSize": 0.0, "OutputDataSize": 0.0, "InputDataFiles": 0, "OutputDataFiles": 0, "DiskSpace": 0.0, "InputSandBoxSize": 0.0, "OutputSandBoxSize": 0.0, "ProcessedEvents": 0, } # For accidentally stopped jobs ExecTime can be not set if not acData["ExecTime"]: acData["ExecTime"] = acData["CPUTime"] elif acData["ExecTime"] < acData["CPUTime"]: acData["ExecTime"] = acData["CPUTime"] self.log.verbose("Accounting Report is:") self.log.verbose(acData) accountingReport.setValuesFromDict(acData) result = accountingReport.commit() if result["OK"]: self.jobDB.setJobAttribute(jobID, "AccountedFlag", "True") else: self.log.error("Failed to send accounting report", "Job: %d, Error: %s" % (int(jobID), result["Message"])) return result def __checkHeartBeat(self, jobID, jobDict): """ Get info from HeartBeat """ result = self.jobDB.getHeartBeatData(jobID) lastCPUTime = 0 lastWallTime = 0 lastHeartBeatTime = jobDict["StartExecTime"] if lastHeartBeatTime == "None": lastHeartBeatTime = 0 if result["OK"]: for name, value, heartBeatTime in result["Value"]: if "CPUConsumed" == name: try: value = int(float(value)) if value > lastCPUTime: lastCPUTime = value except ValueError: pass if "WallClockTime" == name: try: value = int(float(value)) if value > lastWallTime: lastWallTime = value except ValueError: pass if heartBeatTime > lastHeartBeatTime: lastHeartBeatTime = heartBeatTime return lastCPUTime, lastWallTime, lastHeartBeatTime def __checkLoggingInfo(self, jobID, jobDict): """ Get info from JobLogging """ logList = [] result = self.logDB.getJobLoggingInfo(jobID) if result["OK"]: logList = result["Value"] startTime = jobDict["StartExecTime"] if not startTime or startTime == "None": # status, minor, app, stime, source for items in logList: if items[0] == "Running": startTime = items[3] break if not startTime or startTime == "None": startTime = jobDict["SubmissionTime"] if type(startTime) in types.StringTypes: startTime = fromString(startTime) if startTime == None: self.log.error("Wrong timestamp in DB", items[3]) startTime = dateTime() endTime = dateTime() # status, minor, app, stime, source for items in logList: if items[0] == "Stalled": endTime = fromString(items[3]) if endTime == None: self.log.error("Wrong timestamp in DB", items[3]) endTime = dateTime() return startTime, endTime def __kickStuckJobs(self): """ Reschedule jobs stuck in initialization status Rescheduled, Matched """ message = "" checkTime = str(dateTime() - self.matchedTime * second) result = self.jobDB.selectJobs({"Status": "Matched"}, older=checkTime) if not result["OK"]: self.log.error("Failed to select jobs", result["Message"]) return result jobIDs = result["Value"] if jobIDs: self.log.info("Rescheduling %d jobs stuck in Matched status" % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if "FailedJobs" in result: message = "Failed to reschedule %d jobs stuck in Matched status" % len(result["FailedJobs"]) checkTime = str(dateTime() - self.rescheduledTime * second) result = self.jobDB.selectJobs({"Status": "Rescheduled"}, older=checkTime) if not result["OK"]: self.log.error("Failed to select jobs", result["Message"]) return result jobIDs = result["Value"] if jobIDs: self.log.info("Rescheduling %d jobs stuck in Rescheduled status" % len(jobIDs)) result = self.jobDB.rescheduleJobs(jobIDs) if "FailedJobs" in result: if message: message += "\n" message += "Failed to reschedule %d jobs stuck in Rescheduled status" % len(result["FailedJobs"]) if message: return S_ERROR(message) else: return S_OK() def __failCompletedJobs(self): """ Failed Jobs stuck in Completed Status for a long time. They are due to pilots being killed during the finalization of the job execution. """ # Get old Completed Jobs checkTime = str(dateTime() - self.completedTime * second) result = self.jobDB.selectJobs({"Status": "Completed"}, older=checkTime) if not result["OK"]: self.log.error("Failed to select jobs", result["Message"]) return result jobIDs = result["Value"] if not jobIDs: return S_OK() # Remove those with Minor Status "Pending Requests" for jobID in jobIDs: result = self.jobDB.getJobAttributes(jobID, ["Status", "MinorStatus"]) if not result["OK"]: self.log.error("Failed to get job attributes", result["Message"]) continue if result["Value"]["Status"] != "Completed": continue if result["Value"]["MinorStatus"] == "Pending Requests": continue result = self.__updateJobStatus(jobID, "Failed", "Job died during finalization") result = self.__sendAccounting(jobID) if not result["OK"]: self.log.error("Failed to send accounting", result["Message"]) continue return S_OK()