class Matcher( object ): """ Logic for matching """ def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ): """ c'tor """ if pilotAgentsDB: self.pilotAgentsDB = pilotAgentsDB else: self.pilotAgentsDB = PilotAgentsDB() if jobDB: self.jobDB = jobDB else: self.jobDB = JobDB() if tqDB: self.tqDB = tqDB else: self.tqDB = TaskQueueDB() if jlDB: self.jlDB = jlDB else: self.jlDB = JobLoggingDB() if opsHelper: self.opsHelper = opsHelper else: self.opsHelper = Operations() self.log = gLogger.getSubLogger( "Matcher" ) self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper ) def selectJob( self, resourceDescription, credDict ): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self._getResourceDict( resourceDescription, credDict ) negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] ) result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond ) if not result['OK']: return result result = result['Value'] if not result['matchFound']: self.log.info( "No match found" ) raise RuntimeError( "No match found" ) jobID = result['jobId'] resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( "No attributes returned for job" ) if not resAtt['Value']['Status'] == 'Waiting': self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) ) result = self.tqDB.deleteJob( jobID ) if not result[ 'OK' ]: return result raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) ) self._reportStatus( resourceDict, jobID ) result = self.jobDB.getJobJDL( jobID ) if not result['OK']: raise RuntimeError( "Failed to get the job JDL" ) resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime self.log.info( "Match time: [%s]" % str( matchTime ) ) gMonitor.addMark( "matchTime", matchTime ) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters( jobID ) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( 'No attributes returned for job' ) if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ): self.limiter.updateDelayCounters( resourceDict['Site'], jobID ) pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False ) if not pilotInfoReportedFlag: self._updatePilotInfo( resourceDict ) self._updatePilotJobMapping( resourceDict, jobID ) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = True return resultDict def _getResourceDict( self, resourceDescription, credDict ): """ from resourceDescription to resourceDict (just various mods) """ resourceDict = self._processResourceDescription( resourceDescription ) resourceDict = self._checkCredentials( resourceDict, credDict ) self._checkPilotVersion( resourceDict ) if not self._checkMask( resourceDict ): # Banned destinations can only take Test jobs resourceDict['JobType'] = 'Test' self.log.verbose( "Resource description:" ) for key in resourceDict: self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) ) return resourceDict def _processResourceDescription( self, resourceDescription ): """ Check and form the resource description dictionary resourceDescription is a ceDict coming from a JobAgent, for example. """ resourceDict = {} if isinstance( resourceDescription, basestring ): classAdAgent = ClassAd( resourceDescription ) if not classAdAgent.isOK(): raise ValueError( 'Illegal Resource JDL' ) self.log.verbose( classAdAgent.asJDL() ) for name in singleValueDefFields: if classAdAgent.lookupAttribute( name ): if name == 'CPUTime': resourceDict[name] = classAdAgent.getAttributeInt( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) for name in multiValueMatchFields: if classAdAgent.lookupAttribute( name ): if name == 'SubmitPool': resourceDict[name] = classAdAgent.getListFromExpression( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) # Check if a JobID is requested if classAdAgent.lookupAttribute( 'JobID' ): resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' ) for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ): if classAdAgent.lookupAttribute( k ): resourceDict[ k ] = classAdAgent.getAttributeString( k ) else: for name in singleValueDefFields: if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] for name in multiValueMatchFields: if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] if resourceDescription.has_key( 'JobID' ): resourceDict['JobID'] = resourceDescription['JobID'] for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization', 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ): if k in resourceDescription: resourceDict[ k ] = resourceDescription[ k ] return resourceDict def _reportStatus( self, resourceDict, jobID ): """ Reports the status of the matched job in jobDB and jobLoggingDB Do not fail if errors happen here """ attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site'] attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']] result = self.jobDB.setJobAttributes( jobID, attNames, attValues ) if not result['OK']: self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Set job attributes for jobID %s" % jobID ) result = self.jlDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' ) if not result['OK']: self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Added logging record for jobID %s" % jobID ) def _checkMask( self, resourceDict ): """ Check the mask: are we allowed to run normal jobs? FIXME: should we move to site OR SE? """ if not 'Site' in resourceDict: self.log.error( "Missing Site Name in Resource JDL" ) raise RuntimeError( "Missing Site Name in Resource JDL" ) # Get common site mask and check the agent site result = self.jobDB.getSiteMask( siteState = 'Active' ) if not result['OK']: self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] ) raise RuntimeError( "Internal error" ) maskList = result['Value'] if resourceDict['Site'] not in maskList: return False return True def _updatePilotInfo( self, resourceDict ): """ Update pilot information - do not fail if we don't manage to do it """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: gridCE = resourceDict.get( 'GridCE', 'Unknown' ) site = resourceDict.get( 'Site', 'Unknown' ) benchmark = resourceDict.get( 'PilotBenchmark', 0.0 ) self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) ) result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site, destination = gridCE, benchmark = benchmark ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _updatePilotJobMapping( self, resourceDict, jobID ): """ Update pilot to job mapping information """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID ) if not result['OK']: self.log.error( "Problem updating pilot information", ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _checkCredentials( self, resourceDict, credDict ): """ Check if we can get a job given the passed credentials """ if Properties.GENERIC_PILOT in credDict[ 'properties' ]: # You can only match groups in the same VO if credDict[ 'group' ] == "hosts": # for the host case the VirtualOrganization parameter # is mandatory in resourceDict vo = resourceDict.get( 'VirtualOrganization', '' ) else: vo = Registry.getVOForGroup( credDict[ 'group' ] ) result = Registry.getGroupsForVO( vo ) if result[ 'OK' ]: resourceDict[ 'OwnerGroup' ] = result[ 'Value' ] else: raise RuntimeError( result['Message'] ) else: # If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict[ 'properties' ]: self.log.notice( "Setting the resource DN to the credentials DN" ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict[ 'properties' ]: resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] self.log.notice( "Setting the resource group to the credentials group" ) if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]: ownerDN = resourceDict[ 'OwnerDN' ] result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] ) if not result[ 'OK' ]: raise RuntimeError( result['Message'] ) if credDict[ 'group' ] not in result[ 'Value' ]: # DN is not in the same group! bad boy. self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # Nothing special, group and DN have to be the same else: resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] return resourceDict def _checkPilotVersion( self, resourceDict ): """ Check the pilot DIRAC version """ if self.opsHelper.getValue( "Pilot/CheckVersion", True ): if 'ReleaseVersion' not in resourceDict: if not 'DIRACVersion' in resourceDict: raise RuntimeError( 'Version check requested and not provided by Pilot' ) else: pilotVersion = resourceDict['DIRACVersion'] else: pilotVersion = resourceDict['ReleaseVersion'] validVersions = self.opsHelper.getValue( "Pilot/Version", [] ) if validVersions and pilotVersion not in validVersions: raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \ ( pilotVersion, ",".join( validVersions ) ) ) # Check project if requested validProject = self.opsHelper.getValue( "Pilot/Project", "" ) if validProject: if 'ReleaseProject' not in resourceDict: raise RuntimeError( "Version check requested but expected project %s not received" % validProject ) if resourceDict[ 'ReleaseProject' ] != validProject: raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject, resourceDict[ 'ReleaseProject' ] ) )
class JobCleaningAgent(AgentModule): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ ############################################################################# def initialize(self): """Sets defaults """ self.am_setOption("PollingTime", 60) self.jobDB = JobDB() self.taskQueueDB = TaskQueueDB() # self.sandboxDB = SandboxDB( 'SandboxDB' ) self.prod_types = self.am_getOption( "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"] ) gLogger.info( "Will exclude the following Production types from cleaning %s" % (string.join(self.prod_types, ", ")) ) self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200) self.jobByJob = self.am_getOption("JobByJob", True) self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0) return S_OK() def __getAllowedJobTypes(self): # Get valid jobTypes result = self.jobDB.getDistinctJobAttributes("JobType") if not result["OK"]: return result cleanJobTypes = [] for jobType in result["Value"]: if jobType not in self.prod_types: cleanJobTypes.append(jobType) self.log.notice("JobTypes to clean %s" % cleanJobTypes) return S_OK(cleanJobTypes) ############################################################################# def execute(self): """The PilotAgent execution method. """ # Delete jobs in "Deleted" state result = self.removeJobsByStatus({"Status": "Deleted"}) if not result["OK"]: return result # Get all the Job types that can be cleaned result = self.__getAllowedJobTypes() if not result["OK"]: return result baseCond = {"JobType": result["Value"]} # Remove jobs with final status for status in REMOVE_STATUS_DELAY: delay = REMOVE_STATUS_DELAY[status] condDict = dict(baseCond) condDict["Status"] = status delTime = str(Time.dateTime() - delay * Time.day) result = self.removeJobsByStatus(condDict, delTime) if not result["OK"]: gLogger.warn("Failed to remove jobs in status %s" % status) return S_OK() def removeJobsByStatus(self, condDict, delay=False): """ Remove deleted jobs """ if delay: gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay)) result = self.jobDB.selectJobs(condDict, older=delay) else: gLogger.verbose("Removing jobs with %s " % condDict) result = self.jobDB.selectJobs(condDict) if not result["OK"]: return result jobList = result["Value"] if len(jobList) > self.maxJobsAtOnce: jobList = jobList[: self.maxJobsAtOnce] if not jobList: return S_OK() self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict)) count = 0 error_count = 0 result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList) if not result["OK"]: gLogger.warn("Cannot unassign jobs to sandboxes", result["Message"]) if self.jobByJob: for jobID in jobList: resultJobDB = self.jobDB.removeJobFromDB(jobID) resultTQ = self.taskQueueDB.deleteJob(jobID) if not resultJobDB["OK"]: gLogger.warn("Failed to remove job %d from JobDB" % jobID, result["Message"]) error_count += 1 elif not resultTQ["OK"]: gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, result["Message"]) error_count += 1 else: count += 1 if self.throttlingPeriod: time.sleep(self.throttlingPeriod) else: result = self.jobDB.removeJobFromDB(jobList) if not result["OK"]: gLogger.error("Failed to delete %d jobs from JobDB" % len(jobList)) else: gLogger.info("Deleted %d jobs from JobDB" % len(jobList)) for jobID in jobList: resultTQ = self.taskQueueDB.deleteJob(jobID) if not resultTQ["OK"]: gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, resultTQ["Message"]) error_count += 1 else: count += 1 if count > 0 or error_count > 0: gLogger.info("Deleted %d jobs from JobDB, %d errors" % (count, error_count)) return S_OK()
class JobCleaningAgent( AgentModule ): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ ############################################################################# def initialize( self ): """Sets defaults """ self.am_setOption( "PollingTime", 60 ) self.jobDB = JobDB() self.taskQueueDB = TaskQueueDB() self.jobLoggingDB = JobLoggingDB() # self.sandboxDB = SandboxDB( 'SandboxDB' ) agentTSTypes = self.am_getOption('ProductionTypes', []) if agentTSTypes: self.prod_types = agentTSTypes else: self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] ) gLogger.info('Will exclude the following Production types from cleaning %s'%(string.join(self.prod_types,', '))) self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',100) self.jobByJob = self.am_getOption('JobByJob',True) self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.) return S_OK() def __getAllowedJobTypes( self ): #Get valid jobTypes result = self.jobDB.getDistinctJobAttributes( 'JobType' ) if not result[ 'OK' ]: return result cleanJobTypes = [] for jobType in result[ 'Value' ]: if jobType not in self.prod_types: cleanJobTypes.append( jobType ) self.log.notice( "JobTypes to clean %s" % cleanJobTypes ) return S_OK( cleanJobTypes ) ############################################################################# def execute( self ): """The PilotAgent execution method. """ #Delete jobs in "Deleted" state result = self.removeJobsByStatus( { 'Status' : 'Deleted' } ) if not result[ 'OK' ]: return result #Get all the Job types that can be cleaned result = self.__getAllowedJobTypes() if not result[ 'OK' ]: return result baseCond = { 'JobType' : result[ 'Value' ] } # Remove jobs with final status for status in REMOVE_STATUS_DELAY: delay = REMOVE_STATUS_DELAY[ status ] condDict = dict( baseCond ) condDict[ 'Status' ] = status delTime = str( Time.dateTime() - delay * Time.day ) result = self.removeJobsByStatus( condDict, delTime ) if not result['OK']: gLogger.warn( 'Failed to remove jobs in status %s' % status ) return S_OK() def removeJobsByStatus( self, condDict, delay = False ): """ Remove deleted jobs """ if delay: gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) ) result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce ) else: gLogger.verbose( "Removing jobs with %s " % condDict ) result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce ) if not result['OK']: return result jobList = result['Value'] if len(jobList) > self.maxJobsAtOnce: jobList = jobList[:self.maxJobsAtOnce] if not jobList: return S_OK() self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) ) count = 0 error_count = 0 result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList ) if not result[ 'OK' ]: gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] ) result = self.deleteJobOversizedSandbox( jobList ) if not result[ 'OK' ]: gLogger.warn( "Cannot schedle removal of oversized sandboxes", result[ 'Message' ] ) return result failedJobs = result['Value']['Failed'] for job in failedJobs: jobList.pop( jobList.index( job ) ) if self.jobByJob: for jobID in jobList: resultJobDB = self.jobDB.removeJobFromDB( jobID ) resultTQ = self.taskQueueDB.deleteJob( jobID ) resultLogDB = self.jobLoggingDB.deleteJob( jobID ) errorFlag = False if not resultJobDB['OK']: gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] ) errorFlag = True if not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] ) errorFlag = True if not resultLogDB['OK']: gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] ) errorFlag = True if errorFlag: error_count += 1 else: count += 1 if self.throttlingPeriod: time.sleep(self.throttlingPeriod) else: result = self.jobDB.removeJobFromDB( jobList ) if not result['OK']: gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) ) else: gLogger.info('Deleted %d jobs from JobDB' % len(jobList) ) for jobID in jobList: resultTQ = self.taskQueueDB.deleteJob( jobID ) if not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] ) error_count += 1 else: count += 1 result = self.jobLoggingDB.deleteJob( jobList ) if not result['OK']: gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) ) else: gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) ) if count > 0 or error_count > 0 : gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) ) return S_OK() def deleteJobOversizedSandbox( self, jobIDList ): """ Delete the job oversized sandbox files from storage elements """ failed = {} successful = {} lfnDict = {} for jobID in jobIDList: result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' ) if result['OK']: lfn = result['Value'] if lfn: lfnDict[lfn] = jobID else: successful[jobID] = 'No oversized sandbox found' else: gLogger.warn( 'Error interrogting JobDB: %s' % result['Message'] ) if not lfnDict: return S_OK( {'Successful':successful, 'Failed':failed} ) # Schedule removal of the LFNs now for lfn,jobID in lfnDict.items(): result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not result['OK']: failed[jobID] = lfn continue if not result['Value']: failed[jobID] = lfn continue ownerDN = result['Value']['OwnerDN'] ownerGroup = result['Value']['OwnerGroup'] result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup ) if not result['OK']: failed[jobID] = lfn else: successful[jobID] = lfn result = {'Successful':successful, 'Failed':failed} return S_OK( result ) def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ): """ Set removal request with the given credentials """ oRequest = Request() oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml' oRequest.SourceComponent = 'JobCleaningAgent' removeFile = Operation() removeFile.Type = 'RemoveFile' removedFile = File() removedFile.LFN = lfn removeFile.addFile( removedFile ) oRequest.addOperation( removeFile ) return ReqClient().putRequest( oRequest )
class Matcher( object ): """ Logic for matching """ def __init__( self, pilotAgentsDB = None, jobDB = None, tqDB = None, jlDB = None, opsHelper = None ): """ c'tor """ if pilotAgentsDB: self.pilotAgentsDB = pilotAgentsDB else: self.pilotAgentsDB = PilotAgentsDB() if jobDB: self.jobDB = jobDB else: self.jobDB = JobDB() if tqDB: self.tqDB = tqDB else: self.tqDB = TaskQueueDB() if jlDB: self.jlDB = jlDB else: self.jlDB = JobLoggingDB() if opsHelper: self.opsHelper = opsHelper else: self.opsHelper = Operations() self.log = gLogger.getSubLogger( "Matcher" ) self.limiter = Limiter( jobDB = self.jobDB, opsHelper = self.opsHelper ) def selectJob( self, resourceDescription, credDict ): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self._getResourceDict( resourceDescription, credDict ) negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site'] ) result = self.tqDB.matchAndGetJob( resourceDict, negativeCond = negativeCond ) if not result['OK']: return result result = result['Value'] if not result['matchFound']: self.log.info( "No match found" ) raise RuntimeError( "No match found" ) jobID = result['jobId'] resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( "No attributes returned for job" ) if not resAtt['Value']['Status'] == 'Waiting': self.log.error( 'Job matched by the TQ is not in Waiting state', str( jobID ) ) result = self.tqDB.deleteJob( jobID ) if not result[ 'OK' ]: return result raise RuntimeError( "Job %s is not in Waiting state" % str( jobID ) ) self._reportStatus( resourceDict, jobID ) result = self.jobDB.getJobJDL( jobID ) if not result['OK']: raise RuntimeError( "Failed to get the job JDL" ) resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime self.log.info( "Match time: [%s]" % str( matchTime ) ) gMonitor.addMark( "matchTime", matchTime ) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters( jobID ) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not resAtt['OK']: raise RuntimeError( 'Could not retrieve job attributes' ) if not resAtt['Value']: raise RuntimeError( 'No attributes returned for job' ) if self.opsHelper.getValue( "JobScheduling/CheckMatchingDelay", True ): self.limiter.updateDelayCounters( resourceDict['Site'], jobID ) pilotInfoReportedFlag = resourceDict.get( 'PilotInfoReportedFlag', False ) if not pilotInfoReportedFlag: self._updatePilotInfo( resourceDict ) self._updatePilotJobMapping( resourceDict, jobID ) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = True return resultDict def _getResourceDict( self, resourceDescription, credDict ): """ from resourceDescription to resourceDict (just various mods) """ resourceDict = self._processResourceDescription( resourceDescription ) resourceDict = self._checkCredentials( resourceDict, credDict ) self._checkPilotVersion( resourceDict ) if not self._checkMask( resourceDict ): # Banned destinations can only take Test jobs resourceDict['JobType'] = 'Test' self.log.verbose( "Resource description:" ) for key in resourceDict: self.log.verbose( "%s : %s" % ( key.rjust( 20 ), resourceDict[ key ] ) ) return resourceDict def _processResourceDescription( self, resourceDescription ): """ Check and form the resource description dictionary resourceDescription is a ceDict coming from a JobAgent, for example. """ resourceDict = {} if type( resourceDescription ) in StringTypes: classAdAgent = ClassAd( resourceDescription ) if not classAdAgent.isOK(): raise ValueError( 'Illegal Resource JDL' ) self.log.verbose( classAdAgent.asJDL() ) for name in singleValueDefFields: if classAdAgent.lookupAttribute( name ): if name == 'CPUTime': resourceDict[name] = classAdAgent.getAttributeInt( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) for name in multiValueMatchFields: if classAdAgent.lookupAttribute( name ): if name == 'SubmitPool': resourceDict[name] = classAdAgent.getListFromExpression( name ) else: resourceDict[name] = classAdAgent.getAttributeString( name ) # Check if a JobID is requested if classAdAgent.lookupAttribute( 'JobID' ): resourceDict['JobID'] = classAdAgent.getAttributeInt( 'JobID' ) for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization' ): if classAdAgent.lookupAttribute( k ): resourceDict[ k ] = classAdAgent.getAttributeString( k ) else: for name in singleValueDefFields: if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] for name in multiValueMatchFields: if resourceDescription.has_key( name ): resourceDict[name] = resourceDescription[name] if resourceDescription.has_key( 'JobID' ): resourceDict['JobID'] = resourceDescription['JobID'] for k in ( 'DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization', 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag' ): if k in resourceDescription: resourceDict[ k ] = resourceDescription[ k ] return resourceDict def _reportStatus( self, resourceDict, jobID ): """ Reports the status of the matched job in jobDB and jobLoggingDB Do not fail if errors happen here """ attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site'] attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']] result = self.jobDB.setJobAttributes( jobID, attNames, attValues ) if not result['OK']: self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Set job attributes for jobID %s" % jobID ) result = self.jlDB.addLoggingRecord( jobID, status = 'Matched', minor = 'Assigned', source = 'Matcher' ) if not result['OK']: self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % ( jobID, result['Message'] ) ) else: self.log.verbose( "Added logging record for jobID %s" % jobID ) def _checkMask( self, resourceDict ): """ Check the mask: are we allowed to run normal jobs? FIXME: should we move to site OR SE? """ if not 'Site' in resourceDict: self.log.error( "Missing Site Name in Resource JDL" ) raise RuntimeError( "Missing Site Name in Resource JDL" ) # Get common site mask and check the agent site result = self.jobDB.getSiteMask( siteState = 'Active' ) if not result['OK']: self.log.error( "Internal error", "getSiteMask: %s" % result['Message'] ) raise RuntimeError( "Internal error" ) maskList = result['Value'] if resourceDict['Site'] not in maskList: return False return True def _updatePilotInfo( self, resourceDict ): """ Update pilot information - do not fail if we don't manage to do it """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: gridCE = resourceDict.get( 'GridCE', 'Unknown' ) site = resourceDict.get( 'Site', 'Unknown' ) benchmark = resourceDict.get( 'PilotBenchmark', 0.0 ) self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % ( pilotReference, gridCE, site, benchmark ) ) result = self.pilotAgentsDB.setPilotStatus( pilotReference, status = 'Running', gridSite = site, destination = gridCE, benchmark = benchmark ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setPilotStatus. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _updatePilotJobMapping( self, resourceDict, jobID ): """ Update pilot to job mapping information """ pilotReference = resourceDict.get( 'PilotReference', '' ) if pilotReference: result = self.pilotAgentsDB.setCurrentJobID( pilotReference, jobID ) if not result['OK']: self.log.error( "Problem updating pilot information", ";setCurrentJobID. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) result = self.pilotAgentsDB.setJobForPilot( jobID, pilotReference, updateStatus = False ) if not result['OK']: self.log.error( "Problem updating pilot information", "; setJobForPilot. pilotReference: %s; %s" % ( pilotReference, result['Message'] ) ) def _checkCredentials( self, resourceDict, credDict ): """ Check if we can get a job given the passed credentials """ # Check credentials if not generic pilot if Properties.GENERIC_PILOT in credDict[ 'properties' ]: # You can only match groups in the same VO vo = Registry.getVOForGroup( credDict[ 'group' ] ) result = Registry.getGroupsForVO( vo ) if result[ 'OK' ]: resourceDict[ 'OwnerGroup' ] = result[ 'Value' ] else: raise RuntimeError( result['Message'] ) else: # If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict[ 'properties' ]: self.log.notice( "Setting the resource DN to the credentials DN" ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict[ 'properties' ]: resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] self.log.notice( "Setting the resource group to the credentials group" ) if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN' ] != credDict[ 'DN' ]: ownerDN = resourceDict[ 'OwnerDN' ] result = Registry.getGroupsForDN( resourceDict[ 'OwnerDN' ] ) if not result[ 'OK' ]: raise RuntimeError( result['Message'] ) if credDict[ 'group' ] not in result[ 'Value' ]: # DN is not in the same group! bad boy. self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN ) resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] # Nothing special, group and DN have to be the same else: resourceDict[ 'OwnerDN' ] = credDict[ 'DN' ] resourceDict[ 'OwnerGroup' ] = credDict[ 'group' ] return resourceDict def _checkPilotVersion( self, resourceDict ): """ Check the pilot DIRAC version """ if self.opsHelper.getValue( "Pilot/CheckVersion", True ): if 'ReleaseVersion' not in resourceDict: if not 'DIRACVersion' in resourceDict: raise RuntimeError( 'Version check requested and not provided by Pilot' ) else: pilotVersion = resourceDict['DIRACVersion'] else: pilotVersion = resourceDict['ReleaseVersion'] validVersions = self.opsHelper.getValue( "Pilot/Version", [] ) if validVersions and pilotVersion not in validVersions: raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % \ ( pilotVersion, ",".join( validVersions ) ) ) # Check project if requested validProject = self.opsHelper.getValue( "Pilot/Project", "" ) if validProject: if 'ReleaseProject' not in resourceDict: raise RuntimeError( "Version check requested but expected project %s not received" % validProject ) if resourceDict[ 'ReleaseProject' ] != validProject: raise RuntimeError( "Version check requested but expected project %s != received %s" % ( validProject, resourceDict[ 'ReleaseProject' ] ) )
class Matcher(object): """ Logic for matching """ def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None): """ c'tor """ if pilotAgentsDB: self.pilotAgentsDB = pilotAgentsDB else: self.pilotAgentsDB = PilotAgentsDB() if jobDB: self.jobDB = jobDB else: self.jobDB = JobDB() if tqDB: self.tqDB = tqDB else: self.tqDB = TaskQueueDB() if jlDB: self.jlDB = jlDB else: self.jlDB = JobLoggingDB() if opsHelper: self.opsHelper = opsHelper else: self.opsHelper = Operations() self.log = gLogger.getSubLogger("Matcher") self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper) self.siteClient = SiteStatus() def selectJob(self, resourceDescription, credDict): """ Main job selection function to find the highest priority job matching the resource capacity """ startTime = time.time() resourceDict = self._getResourceDict(resourceDescription, credDict) # Make a nice print of the resource matching parameters toPrintDict = dict(resourceDict) if "MaxRAM" in resourceDescription: toPrintDict['MaxRAM'] = resourceDescription['MaxRAM'] if "NumberOfProcessors" in resourceDescription: toPrintDict['NumberOfProcessors'] = resourceDescription[ 'NumberOfProcessors'] toPrintDict['Tag'] = [] if "Tag" in resourceDict: for tag in resourceDict['Tag']: if not tag.endswith('GB') and not tag.endswith('Processors'): toPrintDict['Tag'].append(tag) if not toPrintDict['Tag']: toPrintDict.pop('Tag') gLogger.info('Resource description for matching', printDict(toPrintDict)) negativeCond = self.limiter.getNegativeCondForSite( resourceDict['Site']) result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond) if not result['OK']: raise RuntimeError(result['Message']) result = result['Value'] if not result['matchFound']: self.log.info("No match found") return {} jobID = result['jobId'] resAtt = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup', 'Status']) if not resAtt['OK']: raise RuntimeError('Could not retrieve job attributes') if not resAtt['Value']: raise RuntimeError("No attributes returned for job") if not resAtt['Value']['Status'] == 'Waiting': self.log.error('Job matched by the TQ is not in Waiting state', str(jobID)) result = self.tqDB.deleteJob(jobID) if not result['OK']: raise RuntimeError(result['Message']) raise RuntimeError("Job %s is not in Waiting state" % str(jobID)) self._reportStatus(resourceDict, jobID) result = self.jobDB.getJobJDL(jobID) if not result['OK']: raise RuntimeError("Failed to get the job JDL") resultDict = {} resultDict['JDL'] = result['Value'] resultDict['JobID'] = jobID matchTime = time.time() - startTime self.log.info("Match time: [%s]" % str(matchTime)) gMonitor.addMark("matchTime", matchTime) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters(jobID) if resOpt['OK']: for key, value in resOpt['Value'].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup']) if not resAtt['OK']: raise RuntimeError('Could not retrieve job attributes') if not resAtt['Value']: raise RuntimeError('No attributes returned for job') if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True): self.limiter.updateDelayCounters(resourceDict['Site'], jobID) pilotInfoReportedFlag = resourceDict.get('PilotInfoReportedFlag', False) if not pilotInfoReportedFlag: self._updatePilotInfo(resourceDict) self._updatePilotJobMapping(resourceDict, jobID) resultDict['DN'] = resAtt['Value']['OwnerDN'] resultDict['Group'] = resAtt['Value']['OwnerGroup'] resultDict['PilotInfoReportedFlag'] = True return resultDict def _getResourceDict(self, resourceDescription, credDict): """ from resourceDescription to resourceDict (just various mods) """ resourceDict = self._processResourceDescription(resourceDescription) resourceDict = self._checkCredentials(resourceDict, credDict) self._checkPilotVersion(resourceDict) if not self._checkMask(resourceDict): # Banned destinations can only take Test jobs resourceDict['JobType'] = 'Test' self.log.verbose("Resource description:") for key in resourceDict: self.log.verbose("%s : %s" % (key.rjust(20), resourceDict[key])) return resourceDict def _processResourceDescription(self, resourceDescription): """ Check and form the resource description dictionary :param resourceDescription: a ceDict coming from a JobAgent, for example. :return: updated dictionary of resource description parameters """ resourceDict = {} for name in singleValueDefFields: if name in resourceDescription: resourceDict[name] = resourceDescription[name] for name in multiValueMatchFields: if name in resourceDescription: resourceDict[name] = resourceDescription[name] if resourceDescription.get('Tag'): resourceDict['Tag'] = resourceDescription['Tag'] if 'RequiredTag' in resourceDescription: resourceDict['RequiredTag'] = resourceDescription[ 'RequiredTag'] if 'JobID' in resourceDescription: resourceDict['JobID'] = resourceDescription['JobID'] # Convert MaxRAM and NumberOfProcessors parameters into a list of tags maxRAM = resourceDescription.get('MaxRAM') if maxRAM: try: maxRAM = int(maxRAM) / 1000 except ValueError: maxRAM = None nProcessors = resourceDescription.get('NumberOfProcessors') if nProcessors: try: nProcessors = int(nProcessors) except ValueError: nProcessors = None for param, key in [(maxRAM, 'GB'), (nProcessors, 'Processors')]: if param and param <= 128: paramList = range(2, param + 1) paramTags = ['%d%s' % (par, key) for par in paramList] if paramTags: resourceDict.setdefault("Tag", []).extend(paramTags) # Add 'MultiProcessor' to the list of tags if nProcessors > 1: resourceDict.setdefault("Tag", []).append("MultiProcessor") # Add 'WholeNode' to the list of tags if "WholeNode" in resourceDescription: resourceDict.setdefault("Tag", []).append("WholeNode") if 'Tag' in resourceDict: resourceDict['Tag'] = list(set(resourceDict['Tag'])) for k in ('DIRACVersion', 'ReleaseVersion', 'ReleaseProject', 'VirtualOrganization', 'PilotReference', 'PilotBenchmark', 'PilotInfoReportedFlag'): if k in resourceDescription: resourceDict[k] = resourceDescription[k] return resourceDict def _reportStatus(self, resourceDict, jobID): """ Reports the status of the matched job in jobDB and jobLoggingDB Do not fail if errors happen here """ attNames = ['Status', 'MinorStatus', 'ApplicationStatus', 'Site'] attValues = ['Matched', 'Assigned', 'Unknown', resourceDict['Site']] result = self.jobDB.setJobAttributes(jobID, attNames, attValues) if not result['OK']: self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % (jobID, result['Message'])) else: self.log.verbose("Set job attributes for jobID %s" % jobID) result = self.jlDB.addLoggingRecord(jobID, status='Matched', minor='Assigned', source='Matcher') if not result['OK']: self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % (jobID, result['Message'])) else: self.log.verbose("Added logging record for jobID %s" % jobID) def _checkMask(self, resourceDict): """ Check the mask: are we allowed to run normal jobs? FIXME: should we move to site OR SE? """ if 'Site' not in resourceDict: self.log.error("Missing Site Name in Resource JDL") raise RuntimeError("Missing Site Name in Resource JDL") # Check if site is allowed result = self.siteClient.getUsableSites(resourceDict['Site']) if not result['OK']: self.log.error("Internal error", "siteClient.getUsableSites: %s" % result['Message']) raise RuntimeError("Internal error") if resourceDict['Site'] not in result['Value']: return False return True def _updatePilotInfo(self, resourceDict): """ Update pilot information - do not fail if we don't manage to do it """ pilotReference = resourceDict.get('PilotReference', '') if pilotReference: gridCE = resourceDict.get('GridCE', 'Unknown') site = resourceDict.get('Site', 'Unknown') benchmark = resourceDict.get('PilotBenchmark', 0.0) self.log.verbose( 'Reporting pilot info for %s: gridCE=%s, site=%s, benchmark=%f' % (pilotReference, gridCE, site, benchmark)) result = self.pilotAgentsDB.setPilotStatus(pilotReference, status='Running', gridSite=site, destination=gridCE, benchmark=benchmark) if not result['OK']: self.log.warn( "Problem updating pilot information", "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result['Message'])) def _updatePilotJobMapping(self, resourceDict, jobID): """ Update pilot to job mapping information """ pilotReference = resourceDict.get('PilotReference', '') if pilotReference: result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID) if not result['OK']: self.log.error( "Problem updating pilot information", ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result['Message'])) result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False) if not result['OK']: self.log.error( "Problem updating pilot information", "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result['Message'])) def _checkCredentials(self, resourceDict, credDict): """ Check if we can get a job given the passed credentials """ if Properties.GENERIC_PILOT in credDict['properties']: # You can only match groups in the same VO if credDict['group'] == "hosts": # for the host case the VirtualOrganization parameter # is mandatory in resourceDict vo = resourceDict.get('VirtualOrganization', '') else: vo = Registry.getVOForGroup(credDict['group']) if 'OwnerGroup' not in resourceDict: result = Registry.getGroupsForVO(vo) if result['OK']: resourceDict['OwnerGroup'] = result['Value'] else: raise RuntimeError(result['Message']) else: # If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict['properties']: self.log.notice( "Setting the resource DN to the credentials DN") resourceDict['OwnerDN'] = credDict['DN'] # If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict['properties']: resourceDict['OwnerGroup'] = credDict['group'] self.log.notice( "Setting the resource group to the credentials group") if 'OwnerDN' in resourceDict and resourceDict[ 'OwnerDN'] != credDict['DN']: ownerDN = resourceDict['OwnerDN'] result = Registry.getGroupsForDN(resourceDict['OwnerDN']) if not result['OK']: raise RuntimeError(result['Message']) if credDict['group'] not in result['Value']: # DN is not in the same group! bad boy. self.log.notice( "You cannot request jobs from DN %s. It does not belong to your group!" % ownerDN) resourceDict['OwnerDN'] = credDict['DN'] # Nothing special, group and DN have to be the same else: resourceDict['OwnerDN'] = credDict['DN'] resourceDict['OwnerGroup'] = credDict['group'] return resourceDict def _checkPilotVersion(self, resourceDict): """ Check the pilot DIRAC version """ if self.opsHelper.getValue("Pilot/CheckVersion", True): if 'ReleaseVersion' not in resourceDict: if 'DIRACVersion' not in resourceDict: raise RuntimeError( 'Version check requested and not provided by Pilot') else: pilotVersion = resourceDict['DIRACVersion'] else: pilotVersion = resourceDict['ReleaseVersion'] validVersions = self.opsHelper.getValue("Pilot/Version", []) if validVersions and pilotVersion not in validVersions: raise RuntimeError( 'Pilot version does not match the production version %s not in ( %s )' % (pilotVersion, ",".join(validVersions))) # Check project if requested validProject = self.opsHelper.getValue("Pilot/Project", "") if validProject: if 'ReleaseProject' not in resourceDict: raise RuntimeError( "Version check requested but expected project %s not received" % validProject) if resourceDict['ReleaseProject'] != validProject: raise RuntimeError( "Version check requested \ but expected project %s != received %s" % (validProject, resourceDict['ReleaseProject']))
class JobCleaningAgent(AgentModule): """ The specific agents must provide the following methods: * initialize() for initial settings * beginExecution() * execute() - the main method called in the agent cycle * endExecution() * finalize() - the graceful exit of the method, this one is usually used for the agent restart """ def __init__(self, *args, **kwargs): """ c'tor """ AgentModule.__init__(self, *args, **kwargs) # clients self.jobDB = None self.taskQueueDB = None self.jobLoggingDB = None self.maxJobsAtOnce = 100 self.jobByJob = False self.throttlingPeriod = 0. self.prodTypes = [] self.removeStatusDelay = {} self.removeStatusDelayHB = {} ############################################################################# def initialize(self): """ Sets defaults """ self.am_setOption("PollingTime", 120) self.jobDB = JobDB() self.taskQueueDB = TaskQueueDB() self.jobLoggingDB = JobLoggingDB() # self.sandboxDB = SandboxDB( 'SandboxDB' ) agentTSTypes = self.am_getOption('ProductionTypes', []) if agentTSTypes: self.prodTypes = agentTSTypes else: self.prodTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge']) self.log.info( "Will exclude the following Production types from cleaning %s" % (', '.join(self.prodTypes))) self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 500) self.jobByJob = self.am_getOption('JobByJob', False) self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.) self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7) self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7) self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7) self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1) self.removeStatusDelayHB['Done'] = self.am_getOption( 'RemoveStatusDelayHB/Done', -1) self.removeStatusDelayHB['Killed'] = self.am_getOption( 'RemoveStatusDelayHB/Killed', -1) self.removeStatusDelayHB['Failed'] = self.am_getOption( 'RemoveStatusDelayHB/Failed', -1) self.maxHBJobsAtOnce = self.am_getOption('MaxHBJobsAtOnce', 0) return S_OK() def _getAllowedJobTypes(self): """ Get valid jobTypes """ result = self.jobDB.getDistinctJobAttributes('JobType') if not result['OK']: return result cleanJobTypes = [] for jobType in result['Value']: if jobType not in self.prodTypes: cleanJobTypes.append(jobType) self.log.notice("JobTypes to clean %s" % cleanJobTypes) return S_OK(cleanJobTypes) ############################################################################# def execute(self): """ Remove jobs in various status """ # Delete jobs in "Deleted" state result = self.removeJobsByStatus({'Status': 'Deleted'}) if not result['OK']: return result # Get all the Job types that can be cleaned result = self._getAllowedJobTypes() if not result['OK']: return result # No jobs in the system subject to removal if not result['Value']: return S_OK() baseCond = {'JobType': result['Value']} # Remove jobs with final status for status in self.removeStatusDelay: delay = self.removeStatusDelay[status] if delay < 0: # Negative delay means don't delete anything... continue condDict = dict(baseCond) if status != 'Any': condDict['Status'] = status delTime = str(Time.dateTime() - delay * Time.day) result = self.removeJobsByStatus(condDict, delTime) if not result['OK']: self.log.warn('Failed to remove jobs in status %s' % status) if self.maxHBJobsAtOnce > 0: for status, delay in self.removeStatusDelayHB.items(): if delay > 0: self.removeHeartBeatLoggingInfo(status, delay) return S_OK() def removeJobsByStatus(self, condDict, delay=False): """ Remove deleted jobs """ if delay: self.log.verbose("Removing jobs with %s and older than %s day(s)" % (condDict, delay)) result = self.jobDB.selectJobs(condDict, older=delay, limit=self.maxJobsAtOnce) else: self.log.verbose("Removing jobs with %s " % condDict) result = self.jobDB.selectJobs(condDict, limit=self.maxJobsAtOnce) if not result['OK']: return result jobList = result['Value'] if len(jobList) > self.maxJobsAtOnce: jobList = jobList[:self.maxJobsAtOnce] if not jobList: return S_OK() self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict)) count = 0 error_count = 0 result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList) if not result['OK']: self.log.error("Cannot unassign jobs to sandboxes", result['Message']) return result result = self.deleteJobOversizedSandbox(jobList) if not result['OK']: self.log.error("Cannot schedule removal of oversized sandboxes", result['Message']) return result failedJobs = result['Value']['Failed'] for job in failedJobs: jobList.pop(jobList.index(job)) # TODO: we should not remove a job if it still has requests in the RequestManager. # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB if self.jobByJob: for jobID in jobList: resultJobDB = self.jobDB.removeJobFromDB(jobID) resultTQ = self.taskQueueDB.deleteJob(jobID) resultLogDB = self.jobLoggingDB.deleteJob(jobID) errorFlag = False if not resultJobDB['OK']: self.log.warn('Failed to remove job %d from JobDB' % jobID, result['Message']) errorFlag = True if not resultTQ['OK']: self.log.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message']) errorFlag = True if not resultLogDB['OK']: self.log.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message']) errorFlag = True if errorFlag: error_count += 1 else: count += 1 if self.throttlingPeriod: time.sleep(self.throttlingPeriod) else: result = self.jobDB.removeJobFromDB(jobList) if not result['OK']: self.log.error('Failed to delete %d jobs from JobDB' % len(jobList)) else: self.log.info('Deleted %d jobs from JobDB' % len(jobList)) for jobID in jobList: resultTQ = self.taskQueueDB.deleteJob(jobID) if not resultTQ['OK']: self.log.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message']) error_count += 1 else: count += 1 result = self.jobLoggingDB.deleteJob(jobList) if not result['OK']: self.log.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList)) else: self.log.info('Deleted %d jobs from JobLoggingDB' % len(jobList)) if count > 0 or error_count > 0: self.log.info('Deleted %d jobs from JobDB, %d errors' % (count, error_count)) return S_OK() def deleteJobOversizedSandbox(self, jobIDList): """ Delete the job oversized sandbox files from storage elements """ failed = {} successful = {} result = JobMonitoringClient().getJobParameters( jobIDList, 'OutputSandboxLFN') if not result['OK']: return result osLFNDict = result['Value'] if not osLFNDict: return S_OK({'Successful': successful, 'Failed': failed}) osLFNDict = dict(osLFN for osLFN in osLFNDict.items() if osLFN[1]) self.log.verbose("Deleting oversized sandboxes", osLFNDict) # Schedule removal of the LFNs now for jobID, outputSandboxLFNdict in osLFNDict.items( ): # can be an iterator lfn = outputSandboxLFNdict['OutputSandboxLFN'] result = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup']) if not result['OK']: failed[jobID] = lfn continue if not result['Value']: failed[jobID] = lfn continue ownerDN = result['Value']['OwnerDN'] ownerGroup = result['Value']['OwnerGroup'] result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup) if not result['OK']: failed[jobID] = lfn else: successful[jobID] = lfn result = {'Successful': successful, 'Failed': failed} return S_OK(result) def __setRemovalRequest(self, lfn, ownerDN, ownerGroup): """ Set removal request with the given credentials """ oRequest = Request() oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup oRequest.RequestName = os.path.basename( lfn).strip() + '_removal_request.xml' oRequest.SourceComponent = 'JobCleaningAgent' removeFile = Operation() removeFile.Type = 'RemoveFile' removedFile = File() removedFile.LFN = lfn removeFile.addFile(removedFile) oRequest.addOperation(removeFile) return ReqClient().putRequest(oRequest) def removeHeartBeatLoggingInfo(self, status, delayDays): """Remove HeartBeatLoggingInfo for jobs with given status after given number of days. :param str status: Job Status :param int delayDays: number of days after which information is removed :returns: None """ self.log.info( "Removing HeartBeatLoggingInfo for Jobs with %s and older than %s day(s)" % (status, delayDays)) delTime = str(Time.dateTime() - delayDays * Time.day) result = self.jobDB.removeInfoFromHeartBeatLogging( status, delTime, self.maxHBJobsAtOnce) if not result['OK']: self.log.error('Failed to delete from HeartBeatLoggingInfo', result['Message']) else: self.log.info('Deleted HeartBeatLogging info') return
class Matcher(object): """Logic for matching""" def __init__(self, pilotAgentsDB=None, jobDB=None, tqDB=None, jlDB=None, opsHelper=None, pilotRef=None): """c'tor""" if pilotAgentsDB: self.pilotAgentsDB = pilotAgentsDB else: self.pilotAgentsDB = PilotAgentsDB() if jobDB: self.jobDB = jobDB else: self.jobDB = JobDB() if tqDB: self.tqDB = tqDB else: self.tqDB = TaskQueueDB() if jlDB: self.jlDB = jlDB else: self.jlDB = JobLoggingDB() if opsHelper: self.opsHelper = opsHelper else: self.opsHelper = Operations() if pilotRef: self.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) self.pilotAgentsDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) self.jobDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) self.tqDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) self.jlDB.log = gLogger.getSubLogger("[%s]Matcher" % pilotRef) else: self.log = gLogger.getSubLogger("Matcher") self.limiter = Limiter(jobDB=self.jobDB, opsHelper=self.opsHelper, pilotRef=pilotRef) self.siteClient = SiteStatus() def selectJob(self, resourceDescription, credDict): """Main job selection function to find the highest priority job matching the resource capacity""" startTime = time.time() resourceDict = self._getResourceDict(resourceDescription, credDict) # Make a nice print of the resource matching parameters toPrintDict = dict(resourceDict) if "MaxRAM" in resourceDescription: toPrintDict["MaxRAM"] = resourceDescription["MaxRAM"] if "NumberOfProcessors" in resourceDescription: toPrintDict["NumberOfProcessors"] = resourceDescription[ "NumberOfProcessors"] toPrintDict["Tag"] = [] if "Tag" in resourceDict: for tag in resourceDict["Tag"]: if not tag.endswith("GB") and not tag.endswith("Processors"): toPrintDict["Tag"].append(tag) if not toPrintDict["Tag"]: toPrintDict.pop("Tag") self.log.info("Resource description for matching", printDict(toPrintDict)) negativeCond = self.limiter.getNegativeCondForSite( resourceDict["Site"], resourceDict.get("GridCE")) result = self.tqDB.matchAndGetJob(resourceDict, negativeCond=negativeCond) if not result["OK"]: raise RuntimeError(result["Message"]) result = result["Value"] if not result["matchFound"]: self.log.info("No match found") return {} jobID = result["jobId"] resAtt = self.jobDB.getJobAttributes( jobID, ["OwnerDN", "OwnerGroup", "Status"]) if not resAtt["OK"]: raise RuntimeError("Could not retrieve job attributes") if not resAtt["Value"]: raise RuntimeError("No attributes returned for job") if not resAtt["Value"]["Status"] == "Waiting": self.log.error("Job matched by the TQ is not in Waiting state", str(jobID)) result = self.tqDB.deleteJob(jobID) if not result["OK"]: raise RuntimeError(result["Message"]) raise RuntimeError("Job %s is not in Waiting state" % str(jobID)) self._reportStatus(resourceDict, jobID) result = self.jobDB.getJobJDL(jobID) if not result["OK"]: raise RuntimeError("Failed to get the job JDL") resultDict = {} resultDict["JDL"] = result["Value"] resultDict["JobID"] = jobID matchTime = time.time() - startTime self.log.verbose("Match time", "[%s]" % str(matchTime)) gMonitor.addMark("matchTime", matchTime) # Get some extra stuff into the response returned resOpt = self.jobDB.getJobOptParameters(jobID) if resOpt["OK"]: for key, value in resOpt["Value"].items(): resultDict[key] = value resAtt = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup"]) if not resAtt["OK"]: raise RuntimeError("Could not retrieve job attributes") if not resAtt["Value"]: raise RuntimeError("No attributes returned for job") if self.opsHelper.getValue("JobScheduling/CheckMatchingDelay", True): self.limiter.updateDelayCounters(resourceDict["Site"], jobID) pilotInfoReportedFlag = resourceDict.get("PilotInfoReportedFlag", False) if not pilotInfoReportedFlag: self._updatePilotInfo(resourceDict) self._updatePilotJobMapping(resourceDict, jobID) resultDict["DN"] = resAtt["Value"]["OwnerDN"] resultDict["Group"] = resAtt["Value"]["OwnerGroup"] resultDict["PilotInfoReportedFlag"] = True return resultDict def _getResourceDict(self, resourceDescription, credDict): """from resourceDescription to resourceDict (just various mods)""" resourceDict = self._processResourceDescription(resourceDescription) resourceDict = self._checkCredentials(resourceDict, credDict) self._checkPilotVersion(resourceDict) if not self._checkMask(resourceDict): # Banned destinations can only take Test jobs resourceDict["JobType"] = "Test" self.log.verbose("Resource description") for key in resourceDict: self.log.debug("%s : %s" % (key.rjust(20), resourceDict[key])) return resourceDict def _processResourceDescription(self, resourceDescription): """Check and form the resource description dictionary :param resourceDescription: a ceDict coming from a JobAgent, for example. :return: updated dictionary of resource description parameters """ resourceDict = {} for name in singleValueDefFields: if name in resourceDescription: resourceDict[name] = resourceDescription[name] for name in multiValueMatchFields: if name in resourceDescription: resourceDict[name] = resourceDescription[name] if resourceDescription.get("Tag"): tags = resourceDescription["Tag"] resourceDict["Tag"] = (tags if isinstance(tags, list) else list( {tag.strip("\"' ") for tag in tags.strip("[]").split(",")})) if "RequiredTag" in resourceDescription: requiredTagsList = (list({ tag.strip("\"' ") for tag in resourceDescription["RequiredTag"].strip( "[]").split(",") }) if isinstance(resourceDescription["RequiredTag"], str) else resourceDescription["RequiredTag"]) resourceDict["RequiredTag"] = requiredTagsList if "JobID" in resourceDescription: resourceDict["JobID"] = resourceDescription["JobID"] # Convert MaxRAM and NumberOfProcessors parameters into a list of tags maxRAM = resourceDescription.get("MaxRAM") if maxRAM: try: maxRAM = int(maxRAM / 1000) except ValueError: maxRAM = None nProcessors = resourceDescription.get("NumberOfProcessors") if nProcessors: try: nProcessors = int(nProcessors) except ValueError: nProcessors = None for param, key in [(maxRAM, "GB"), (nProcessors, "Processors")]: if param and param <= 1024: paramList = list(range(2, param + 1)) paramTags = ["%d%s" % (par, key) for par in paramList] if paramTags: resourceDict.setdefault("Tag", []).extend(paramTags) # Add 'MultiProcessor' to the list of tags if nProcessors and nProcessors > 1: resourceDict.setdefault("Tag", []).append("MultiProcessor") # Add 'WholeNode' to the list of tags if "WholeNode" in resourceDescription: resourceDict.setdefault("Tag", []).append("WholeNode") if "Tag" in resourceDict: resourceDict["Tag"] = list(set(resourceDict["Tag"])) if "RequiredTag" in resourceDict: resourceDict["RequiredTag"] = list(set( resourceDict["RequiredTag"])) for k in ( "DIRACVersion", "ReleaseVersion", "ReleaseProject", "VirtualOrganization", "PilotReference", "PilotBenchmark", "PilotInfoReportedFlag", ): if k in resourceDescription: resourceDict[k] = resourceDescription[k] return resourceDict def _reportStatus(self, resourceDict, jobID): """Reports the status of the matched job in jobDB and jobLoggingDB Do not fail if errors happen here """ attNames = ["Status", "MinorStatus", "ApplicationStatus", "Site"] attValues = ["Matched", "Assigned", "Unknown", resourceDict["Site"]] result = self.jobDB.setJobAttributes(jobID, attNames, attValues) if not result["OK"]: self.log.error( "Problem reporting job status", "setJobAttributes, jobID = %s: %s" % (jobID, result["Message"])) else: self.log.verbose("Set job attributes for jobID", jobID) result = self.jlDB.addLoggingRecord(jobID, status=JobStatus.MATCHED, minorStatus="Assigned", source="Matcher") if not result["OK"]: self.log.error( "Problem reporting job status", "addLoggingRecord, jobID = %s: %s" % (jobID, result["Message"])) else: self.log.verbose("Added logging record for jobID", jobID) def _checkMask(self, resourceDict): """Check the mask: are we allowed to run normal jobs? FIXME: should we move to site OR SE? """ if "Site" not in resourceDict: self.log.error("Missing Site Name in Resource JDL") raise RuntimeError("Missing Site Name in Resource JDL") # Check if site is allowed result = self.siteClient.getUsableSites(resourceDict["Site"]) if not result["OK"]: self.log.error("Internal error", "siteClient.getUsableSites: %s" % result["Message"]) raise RuntimeError("Internal error") if resourceDict["Site"] not in result["Value"]: return False return True def _updatePilotInfo(self, resourceDict): """Update pilot information - do not fail if we don't manage to do it""" pilotReference = resourceDict.get("PilotReference", "") if pilotReference and pilotReference != "Unknown": gridCE = resourceDict.get("GridCE", "Unknown") site = resourceDict.get("Site", "Unknown") benchmark = resourceDict.get("PilotBenchmark", 0.0) self.log.verbose( "Reporting pilot info", "for %s: gridCE=%s, site=%s, benchmark=%f" % (pilotReference, gridCE, site, benchmark), ) result = self.pilotAgentsDB.setPilotStatus( pilotReference, status=PilotStatus.RUNNING, gridSite=site, destination=gridCE, benchmark=benchmark) if not result["OK"]: self.log.warn( "Problem updating pilot information", "; setPilotStatus. pilotReference: %s; %s" % (pilotReference, result["Message"]), ) def _updatePilotJobMapping(self, resourceDict, jobID): """Update pilot to job mapping information""" pilotReference = resourceDict.get("PilotReference", "") if pilotReference and pilotReference != "Unknown": result = self.pilotAgentsDB.setCurrentJobID(pilotReference, jobID) if not result["OK"]: self.log.error( "Problem updating pilot information", ";setCurrentJobID. pilotReference: %s; %s" % (pilotReference, result["Message"]), ) result = self.pilotAgentsDB.setJobForPilot(jobID, pilotReference, updateStatus=False) if not result["OK"]: self.log.error( "Problem updating pilot information", "; setJobForPilot. pilotReference: %s; %s" % (pilotReference, result["Message"]), ) def _checkCredentials(self, resourceDict, credDict): """Check if we can get a job given the passed credentials""" if Properties.GENERIC_PILOT in credDict["properties"]: # You can only match groups in the same VO if credDict["group"] == "hosts": # for the host case the VirtualOrganization parameter # is mandatory in resourceDict vo = resourceDict.get("VirtualOrganization", "") else: vo = Registry.getVOForGroup(credDict["group"]) if "OwnerGroup" not in resourceDict: result = Registry.getGroupsForVO(vo) if result["OK"]: resourceDict["OwnerGroup"] = result["Value"] else: raise RuntimeError(result["Message"]) else: # If it's a private pilot, the DN has to be the same if Properties.PILOT in credDict["properties"]: self.log.notice( "Setting the resource DN to the credentials DN") resourceDict["OwnerDN"] = credDict["DN"] # If it's a job sharing. The group has to be the same and just check that the DN (if any) # belongs to the same group elif Properties.JOB_SHARING in credDict["properties"]: resourceDict["OwnerGroup"] = credDict["group"] self.log.notice( "Setting the resource group to the credentials group") if "OwnerDN" in resourceDict and resourceDict[ "OwnerDN"] != credDict["DN"]: ownerDN = resourceDict["OwnerDN"] result = Registry.getGroupsForDN(resourceDict["OwnerDN"]) if not result["OK"]: raise RuntimeError(result["Message"]) if credDict["group"] not in result["Value"]: # DN is not in the same group! bad boy. self.log.warn( "You cannot request jobs from this DN, as it does not belong to your group!", "(%s)" % ownerDN, ) resourceDict["OwnerDN"] = credDict["DN"] # Nothing special, group and DN have to be the same else: resourceDict["OwnerDN"] = credDict["DN"] resourceDict["OwnerGroup"] = credDict["group"] return resourceDict def _checkPilotVersion(self, resourceDict): """Check the pilot DIRAC version""" if self.opsHelper.getValue("Pilot/CheckVersion", True): if "ReleaseVersion" not in resourceDict: if "DIRACVersion" not in resourceDict: raise PilotVersionError( "Version check requested and not provided by Pilot") else: pilotVersion = resourceDict["DIRACVersion"] else: pilotVersion = resourceDict["ReleaseVersion"] validVersions = [ convertToPy3VersionNumber(newStyleVersion) for newStyleVersion in self.opsHelper.getValue( "Pilot/Version", []) ] if validVersions and convertToPy3VersionNumber( pilotVersion) not in validVersions: raise PilotVersionError( "Pilot version does not match the production version: %s not in ( %s )" % (pilotVersion, ",".join(validVersions))) # Check project if requested validProject = self.opsHelper.getValue("Pilot/Project", "") if validProject: if "ReleaseProject" not in resourceDict: raise PilotVersionError( "Version check requested but expected project %s not received" % validProject) if resourceDict["ReleaseProject"] != validProject: raise PilotVersionError( "Version check requested but expected project %s != received %s" % (validProject, resourceDict["ReleaseProject"]))
class JobCleaningAgent( AgentModule ): """ The specific agents must provide the following methods: * initialize() for initial settings * beginExecution() * execute() - the main method called in the agent cycle * endExecution() * finalize() - the graceful exit of the method, this one is usually used for the agent restart """ def __init__( self, *args, **kwargs ): """ c'tor """ AgentModule.__init__( self, *args, **kwargs ) #clients # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service? self.jobDB = None self.taskQueueDB = None self.jobLoggingDB = None self.maxJobsAtOnce = 100 self.jobByJob = False self.throttlingPeriod = 0. self.prodTypes = [] self.removeStatusDelay = {} ############################################################################# def initialize( self ): """ Sets defaults """ self.am_setOption( "PollingTime", 120 ) self.jobDB = JobDB() self.taskQueueDB = TaskQueueDB() self.jobLoggingDB = JobLoggingDB() # self.sandboxDB = SandboxDB( 'SandboxDB' ) agentTSTypes = self.am_getOption('ProductionTypes', []) if agentTSTypes: self.prodTypes = agentTSTypes else: self.prodTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge']) gLogger.info("Will exclude the following Production types from cleaning %s" % ( ', '.join(self.prodTypes))) self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 ) self.jobByJob = self.am_getOption( 'JobByJob', False ) self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.) self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 ) self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 ) self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 ) self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 ) return S_OK() def __getAllowedJobTypes( self ): """ Get valid jobTypes """ result = self.jobDB.getDistinctJobAttributes( 'JobType' ) if not result[ 'OK' ]: return result cleanJobTypes = [] for jobType in result[ 'Value' ]: if jobType not in self.prodTypes: cleanJobTypes.append( jobType ) self.log.notice( "JobTypes to clean %s" % cleanJobTypes ) return S_OK( cleanJobTypes ) ############################################################################# def execute( self ): """ Remove jobs in various status """ #Delete jobs in "Deleted" state result = self.removeJobsByStatus( { 'Status' : 'Deleted' } ) if not result[ 'OK' ]: return result #Get all the Job types that can be cleaned result = self.__getAllowedJobTypes() if not result[ 'OK' ]: return result # No jobs in the system subject to removal if not result['Value']: return S_OK() baseCond = { 'JobType' : result[ 'Value' ] } # Remove jobs with final status for status in self.removeStatusDelay: delay = self.removeStatusDelay[ status ] if delay < 0: # Negative delay means don't delete anything... continue condDict = dict( baseCond ) if status != 'Any': condDict[ 'Status' ] = status delTime = str( Time.dateTime() - delay * Time.day ) result = self.removeJobsByStatus( condDict, delTime ) if not result['OK']: gLogger.warn( 'Failed to remove jobs in status %s' % status ) return S_OK() def removeJobsByStatus( self, condDict, delay = False ): """ Remove deleted jobs """ if delay: gLogger.verbose( "Removing jobs with %s and older than %s day(s)" % ( condDict, delay ) ) result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce ) else: gLogger.verbose( "Removing jobs with %s " % condDict ) result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce ) if not result['OK']: return result jobList = result['Value'] if len(jobList) > self.maxJobsAtOnce: jobList = jobList[:self.maxJobsAtOnce] if not jobList: return S_OK() self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) ) count = 0 error_count = 0 result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList ) if not result[ 'OK' ]: gLogger.error("Cannot unassign jobs to sandboxes", result['Message']) return result result = self.deleteJobOversizedSandbox(jobList) if not result[ 'OK' ]: gLogger.error( "Cannot schedule removal of oversized sandboxes", result['Message']) return result failedJobs = result['Value']['Failed'] for job in failedJobs: jobList.pop(jobList.index(job)) # TODO: we should not remove a job if it still has requests in the RequestManager. # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB if self.jobByJob: for jobID in jobList: resultJobDB = self.jobDB.removeJobFromDB( jobID ) resultTQ = self.taskQueueDB.deleteJob( jobID ) resultLogDB = self.jobLoggingDB.deleteJob( jobID ) errorFlag = False if not resultJobDB['OK']: gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] ) errorFlag = True if not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] ) errorFlag = True if not resultLogDB['OK']: gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] ) errorFlag = True if errorFlag: error_count += 1 else: count += 1 if self.throttlingPeriod: time.sleep(self.throttlingPeriod) else: result = self.jobDB.removeJobFromDB( jobList ) if not result['OK']: gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) ) else: gLogger.info('Deleted %d jobs from JobDB' % len(jobList) ) for jobID in jobList: resultTQ = self.taskQueueDB.deleteJob( jobID ) if not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] ) error_count += 1 else: count += 1 result = self.jobLoggingDB.deleteJob( jobList ) if not result['OK']: gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) ) else: gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) ) if count > 0 or error_count > 0 : gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) ) return S_OK() def deleteJobOversizedSandbox( self, jobIDList ): """ Delete the job oversized sandbox files from storage elements """ failed = {} successful = {} lfnDict = {} for jobID in jobIDList: result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' ) if result['OK']: lfn = result['Value'] if lfn: lfnDict[lfn] = jobID else: successful[jobID] = 'No oversized sandbox found' else: gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] ) if not lfnDict: return S_OK({'Successful': successful, 'Failed': failed}) # Schedule removal of the LFNs now for lfn,jobID in lfnDict.items(): result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] ) if not result['OK']: failed[jobID] = lfn continue if not result['Value']: failed[jobID] = lfn continue ownerDN = result['Value']['OwnerDN'] ownerGroup = result['Value']['OwnerGroup'] result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup ) if not result['OK']: failed[jobID] = lfn else: successful[jobID] = lfn result = {'Successful':successful, 'Failed':failed} return S_OK(result) def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ): """ Set removal request with the given credentials """ oRequest = Request() oRequest.OwnerDN = ownerDN oRequest.OwnerGroup = ownerGroup oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml' oRequest.SourceComponent = 'JobCleaningAgent' removeFile = Operation() removeFile.Type = 'RemoveFile' removedFile = File() removedFile.LFN = lfn removeFile.addFile( removedFile ) oRequest.addOperation( removeFile ) return ReqClient().putRequest( oRequest )
class JobCleaningAgent( AgentModule ): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ ############################################################################# def initialize( self ): """Sets defaults """ self.am_setOption( "PollingTime", 60 ) self.jobDB = JobDB() self.taskQueueDB = TaskQueueDB() self.jobLoggingDB = JobLoggingDB() # self.sandboxDB = SandboxDB( 'SandboxDB' ) agentTSTypes = self.am_getOption('ProductionTypes', []) if agentTSTypes: self.prod_types = agentTSTypes else: self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] ) gLogger.info('Will exclude the following Production types from cleaning %s'%(string.join(self.prod_types,', '))) self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce',200) self.jobByJob = self.am_getOption('JobByJob',True) self.throttlingPeriod = self.am_getOption('ThrottlingPeriod',0.) return S_OK() def __getAllowedJobTypes( self ): #Get valid jobTypes result = self.jobDB.getDistinctJobAttributes( 'JobType' ) if not result[ 'OK' ]: return result cleanJobTypes = [] for jobType in result[ 'Value' ]: if jobType not in self.prod_types: cleanJobTypes.append( jobType ) self.log.notice( "JobTypes to clean %s" % cleanJobTypes ) return S_OK( cleanJobTypes ) ############################################################################# def execute( self ): """The PilotAgent execution method. """ #Delete jobs in "Deleted" state result = self.removeJobsByStatus( { 'Status' : 'Deleted' } ) if not result[ 'OK' ]: return result #Get all the Job types that can be cleaned result = self.__getAllowedJobTypes() if not result[ 'OK' ]: return result baseCond = { 'JobType' : result[ 'Value' ] } # Remove jobs with final status for status in REMOVE_STATUS_DELAY: delay = REMOVE_STATUS_DELAY[ status ] condDict = dict( baseCond ) condDict[ 'Status' ] = status delTime = str( Time.dateTime() - delay * Time.day ) result = self.removeJobsByStatus( condDict, delTime ) if not result['OK']: gLogger.warn( 'Failed to remove jobs in status %s' % status ) return S_OK() def removeJobsByStatus( self, condDict, delay = False ): """ Remove deleted jobs """ if delay: gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) ) result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce ) else: gLogger.verbose( "Removing jobs with %s " % condDict ) result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce ) if not result['OK']: return result jobList = result['Value'] if len(jobList) > self.maxJobsAtOnce: jobList = jobList[:self.maxJobsAtOnce] if not jobList: return S_OK() self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) ) count = 0 error_count = 0 result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList ) if not result[ 'OK' ]: gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] ) if self.jobByJob: for jobID in jobList: resultJobDB = self.jobDB.removeJobFromDB( jobID ) resultTQ = self.taskQueueDB.deleteJob( jobID ) resultLogDB = self.jobLoggingDB.deleteJob( jobID ) errorFlag = False if not resultJobDB['OK']: gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, result['Message'] ) errorFlag = True if not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message'] ) errorFlag = True if not resultLogDB['OK']: gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, result['Message'] ) errorFlag = True if errorFlag: error_count += 1 else: count += 1 if self.throttlingPeriod: time.sleep(self.throttlingPeriod) else: result = self.jobDB.removeJobFromDB( jobList ) if not result['OK']: gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) ) else: gLogger.info('Deleted %d jobs from JobDB' % len(jobList) ) for jobID in jobList: resultTQ = self.taskQueueDB.deleteJob( jobID ) if not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] ) error_count += 1 else: count += 1 result = self.jobLoggingDB.deleteJob( jobList ) if not result['OK']: gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) ) else: gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) ) if count > 0 or error_count > 0 : gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) ) return S_OK()
class JobCleaningAgent(AgentModule): """ The specific agents must provide the following methods: - initialize() for initial settings - beginExecution() - execute() - the main method called in the agent cycle - endExecution() - finalize() - the graceful exit of the method, this one is usually used for the agent restart """ ############################################################################# def initialize(self): """Sets defaults """ self.am_setOption("PollingTime", 60) self.jobDB = JobDB() self.taskQueueDB = TaskQueueDB() # self.sandboxDB = SandboxDB( 'SandboxDB' ) self.prod_types = self.am_getOption('ProductionTypes', [ 'DataReconstruction', 'DataStripping', 'MCSimulation', 'Merge', 'production' ]) gLogger.info( 'Will exclude the following Production types from cleaning %s' % (string.join(self.prod_types, ', '))) self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 200) self.jobByJob = self.am_getOption('JobByJob', True) self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.) return S_OK() def __getAllowedJobTypes(self): #Get valid jobTypes result = self.jobDB.getDistinctJobAttributes('JobType') if not result['OK']: return result cleanJobTypes = [] for jobType in result['Value']: if jobType not in self.prod_types: cleanJobTypes.append(jobType) self.log.notice("JobTypes to clean %s" % cleanJobTypes) return S_OK(cleanJobTypes) ############################################################################# def execute(self): """The PilotAgent execution method. """ #Delete jobs in "Deleted" state result = self.removeJobsByStatus({'Status': 'Deleted'}) if not result['OK']: return result #Get all the Job types that can be cleaned result = self.__getAllowedJobTypes() if not result['OK']: return result baseCond = {'JobType': result['Value']} # Remove jobs with final status for status in REMOVE_STATUS_DELAY: delay = REMOVE_STATUS_DELAY[status] condDict = dict(baseCond) condDict['Status'] = status delTime = str(Time.dateTime() - delay * Time.day) result = self.removeJobsByStatus(condDict, delTime) if not result['OK']: gLogger.warn('Failed to remove jobs in status %s' % status) return S_OK() def removeJobsByStatus(self, condDict, delay=False): """ Remove deleted jobs """ if delay: gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay)) result = self.jobDB.selectJobs(condDict, older=delay) else: gLogger.verbose("Removing jobs with %s " % condDict) result = self.jobDB.selectJobs(condDict) if not result['OK']: return result jobList = result['Value'] if len(jobList) > self.maxJobsAtOnce: jobList = jobList[:self.maxJobsAtOnce] if not jobList: return S_OK() self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict)) count = 0 error_count = 0 result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList) if not result['OK']: gLogger.warn("Cannot unassign jobs to sandboxes", result['Message']) if self.jobByJob: for jobID in jobList: resultJobDB = self.jobDB.removeJobFromDB(jobID) resultTQ = self.taskQueueDB.deleteJob(jobID) if not resultJobDB['OK']: gLogger.warn('Failed to remove job %d from JobDB' % jobID, result['Message']) error_count += 1 elif not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, result['Message']) error_count += 1 else: count += 1 if self.throttlingPeriod: time.sleep(self.throttlingPeriod) else: result = self.jobDB.removeJobFromDB(jobList) if not result['OK']: gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList)) else: gLogger.info('Deleted %d jobs from JobDB' % len(jobList)) for jobID in jobList: resultTQ = self.taskQueueDB.deleteJob(jobID) if not resultTQ['OK']: gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message']) error_count += 1 else: count += 1 if count > 0 or error_count > 0: gLogger.info('Deleted %d jobs from JobDB, %d errors' % (count, error_count)) return S_OK()