class JobCleaningAgent( AgentModule ):
  """ Agent that cleans the WMS databases: it fully removes jobs in "Deleted"
      status and removes jobs in other final statuses after a configurable
      delay.

      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually
          used for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """ Sets defaults """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    # str.join replaces the deprecated string.join (removed in Python 3)
    gLogger.info( 'Will exclude the following Production types from cleaning %s' % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 100 )
    self.jobByJob = self.am_getOption( 'JobByJob', True )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get the list of JobTypes subject to cleaning: every distinct JobType
        found in the JobDB that is not a production type.

        :return: S_OK( list of job types )/S_ERROR()
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various final statuses """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status older than the configured delay
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs

        :param dict condDict: selection conditions, e.g. {'Status': 'Deleted'}
        :param delay: if set, only jobs older than this time stamp are removed
        :return: S_OK()/S_ERROR()
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result
    jobList = result['Value']
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      # FIX: typo "schedle" -> "schedule" in the log message
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result
    # Jobs whose oversized sandbox could not be scheduled for removal are kept
    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) )

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          # FIX: report the failing call's own message (was result['Message'])
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        # Optionally throttle the deletion rate
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements by
        scheduling removal requests in the RMS.

        :param list jobIDList: list of job IDs
        :return: S_OK( {'Successful': {jobID: lfn}, 'Failed': {jobID: lfn}} )
    """
    failed = {}
    successful = {}
    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        # FIX: typo "interrogting" -> "interrogating" in the log message
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )

    # Schedule removal of the LFNs now, with the owner's credentials
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue
      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful':successful, 'Failed':failed}
    return S_OK( result )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set a RemoveFile removal request with the given credentials

        :param str lfn: LFN to remove
        :param str ownerDN: owner DN used as the request owner
        :param str ownerGroup: owner group used as the request owner
        :return: result of ReqClient().putRequest
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'
    removedFile = File()
    removedFile.LFN = lfn
    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
class JobCleaningAgent( AgentModule ):
  """ Agent that cleans the WMS databases: it fully removes jobs in "Deleted"
      status and removes jobs in other final statuses after a configurable
      delay.

      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually
          used for the agent restart
  """

  #############################################################################
  def initialize( self ):
    """ Sets defaults """
    self.am_setOption( "PollingTime", 60 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    # str.join replaces the deprecated string.join (removed in Python 3)
    gLogger.info( 'Will exclude the following Production types from cleaning %s' % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 200 )
    self.jobByJob = self.am_getOption( 'JobByJob', True )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )
    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get the list of JobTypes subject to cleaning: every distinct JobType
        found in the JobDB that is not a production type.

        :return: S_OK( list of job types )/S_ERROR()
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various final statuses """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status older than the configured delay
    for status in REMOVE_STATUS_DELAY:
      delay = REMOVE_STATUS_DELAY[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs

        :param dict condDict: selection conditions, e.g. {'Status': 'Deleted'}
        :param delay: if set, only jobs older than this time stamp are removed
        :return: S_OK()/S_ERROR()
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result
    jobList = result['Value']
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          # FIX: report the failing call's own message (was result['Message'])
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        # Optionally throttle the deletion rate
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()
class JobCleaningAgent( AgentModule ):
  """ Agent that cleans the WMS databases: it fully removes jobs in "Deleted"
      status and removes jobs in other final statuses after a configurable,
      per-status delay.

      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually
          used for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor """
    AgentModule.__init__( self, *args, **kwargs )

    # clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None

    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.
    # days after which jobs in a given final status are removed
    self.removeStatusDelay = {'Done':7,
                              'Killed':1,
                              'Failed':7 }

  #############################################################################
  def initialize( self ):
    """ Sets defaults """
    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption( 'ProductionTypes', [] )
    if agentTSTypes:
      self.prod_types = agentTSTypes
    else:
      self.prod_types = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    gLogger.info( "Will exclude the following Production types from cleaning %s" % ( ', '.join( self.prod_types ) ) )
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption( 'ThrottlingPeriod', 0. )

    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )

    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes: every distinct JobType found in the JobDB that is
        not a production type.

        :return: S_OK( list of job types )/S_ERROR()
    """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prod_types:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status """
    # Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    # Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status older than the configured delay
    for status in self.removeStatusDelay:
      delay = self.removeStatusDelay[ status ]
      condDict = dict( baseCond )
      condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs

        :param dict condDict: selection conditions, e.g. {'Status': 'Deleted'}
        :param delay: if set, only jobs older than this time stamp are removed
        :return: S_OK()/S_ERROR()
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result
    jobList = result['Value']
    if len( jobList ) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()

    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )

    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot unassign jobs to sandboxes", result[ 'Message' ] )

    result = self.deleteJobOversizedSandbox( jobList )
    if not result[ 'OK' ]:
      gLogger.warn( "Cannot schedule removal of oversized sandboxes", result[ 'Message' ] )
      return result
    # Jobs whose oversized sandbox could not be scheduled for removal are kept
    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop( jobList.index( job ) )

    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB

    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        if not resultJobDB['OK']:
          # FIX: report the failing call's own message (was result['Message'])
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        # Optionally throttle the deletion rate
        if self.throttlingPeriod:
          time.sleep( self.throttlingPeriod )
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobDB' % len( jobList ) )

      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1

      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error( 'Failed to delete %d jobs from JobLoggingDB' % len( jobList ) )
      else:
        gLogger.info( 'Deleted %d jobs from JobLoggingDB' % len( jobList ) )

    if count > 0 or error_count > 0:
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements by
        scheduling removal requests in the RMS.

        :param list jobIDList: list of job IDs
        :return: S_OK( {'Successful': {jobID: lfn}, 'Failed': {jobID: lfn}} )
    """
    failed = {}
    successful = {}
    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK( {'Successful':successful, 'Failed':failed} )

    # Schedule removal of the LFNs now, with the owner's credentials
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue
      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn

    result = {'Successful':successful, 'Failed':failed}
    return S_OK( result )

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set a RemoveFile removal request with the given credentials

        :param str lfn: LFN to remove
        :param str ownerDN: owner DN used as the request owner
        :param str ownerGroup: owner group used as the request owner
        :return: result of ReqClient().putRequest
    """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'

    removeFile = Operation()
    removeFile.Type = 'RemoveFile'
    removedFile = File()
    removedFile.LFN = lfn
    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )

    return ReqClient().putRequest( oRequest )
class JobCleaningAgent(AgentModule):
    """Agent that cleans the WMS databases: it removes jobs in "Deleted"
    status and jobs in other final statuses after a configurable delay.

    The specific agents must provide the following methods:
      - initialize() for initial settings
      - beginExecution()
      - execute() - the main method called in the agent cycle
      - endExecution()
      - finalize() - the graceful exit of the method, this one is usually
        used for the agent restart
    """

    #############################################################################
    def initialize(self):
        """Sets defaults"""
        self.am_setOption("PollingTime", 60)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )
        self.prod_types = self.am_getOption(
            "ProductionTypes", ["DataReconstruction", "DataStripping", "MCSimulation", "Merge", "production"]
        )
        # str.join replaces the deprecated string.join (removed in Python 3)
        gLogger.info(
            "Will exclude the following Production types from cleaning %s" % (", ".join(self.prod_types))
        )
        self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", 200)
        self.jobByJob = self.am_getOption("JobByJob", True)
        self.throttlingPeriod = self.am_getOption("ThrottlingPeriod", 0.0)
        return S_OK()

    def __getAllowedJobTypes(self):
        """Get valid jobTypes: every distinct JobType found in the JobDB that
        is not a production type.

        :returns: S_OK( list of job types )/S_ERROR()
        """
        result = self.jobDB.getDistinctJobAttributes("JobType")
        if not result["OK"]:
            return result
        cleanJobTypes = []
        for jobType in result["Value"]:
            if jobType not in self.prod_types:
                cleanJobTypes.append(jobType)
        self.log.notice("JobTypes to clean %s" % cleanJobTypes)
        return S_OK(cleanJobTypes)

    #############################################################################
    def execute(self):
        """Remove jobs in various final statuses"""
        # Delete jobs in "Deleted" state
        result = self.removeJobsByStatus({"Status": "Deleted"})
        if not result["OK"]:
            return result
        # Get all the Job types that can be cleaned
        result = self.__getAllowedJobTypes()
        if not result["OK"]:
            return result
        baseCond = {"JobType": result["Value"]}
        # Remove jobs with final status older than the configured delay
        for status in REMOVE_STATUS_DELAY:
            delay = REMOVE_STATUS_DELAY[status]
            condDict = dict(baseCond)
            condDict["Status"] = status
            delTime = str(Time.dateTime() - delay * Time.day)
            result = self.removeJobsByStatus(condDict, delTime)
            if not result["OK"]:
                gLogger.warn("Failed to remove jobs in status %s" % status)
        return S_OK()

    def removeJobsByStatus(self, condDict, delay=False):
        """Remove deleted jobs

        :param dict condDict: selection conditions, e.g. {'Status': 'Deleted'}
        :param delay: if set, only jobs older than this time stamp are removed
        :returns: S_OK()/S_ERROR()
        """
        # FIX: bound the selection DB-side instead of fetching everything and
        # slicing client-side (consistent with maxJobsAtOnce semantics)
        if delay:
            gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay))
            result = self.jobDB.selectJobs(condDict, older=delay, limit=self.maxJobsAtOnce)
        else:
            gLogger.verbose("Removing jobs with %s " % condDict)
            result = self.jobDB.selectJobs(condDict, limit=self.maxJobsAtOnce)
        if not result["OK"]:
            return result
        jobList = result["Value"]
        if len(jobList) > self.maxJobsAtOnce:
            jobList = jobList[: self.maxJobsAtOnce]
        if not jobList:
            return S_OK()

        self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))

        count = 0
        error_count = 0
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result["OK"]:
            gLogger.warn("Cannot unassign jobs to sandboxes", result["Message"])

        if self.jobByJob:
            for jobID in jobList:
                resultJobDB = self.jobDB.removeJobFromDB(jobID)
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultJobDB["OK"]:
                    # FIX: report the failing call's own message (was result["Message"])
                    gLogger.warn("Failed to remove job %d from JobDB" % jobID, resultJobDB["Message"])
                    error_count += 1
                elif not resultTQ["OK"]:
                    gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, resultTQ["Message"])
                    error_count += 1
                else:
                    count += 1
                # Optionally throttle the deletion rate
                if self.throttlingPeriod:
                    time.sleep(self.throttlingPeriod)
        else:
            result = self.jobDB.removeJobFromDB(jobList)
            if not result["OK"]:
                gLogger.error("Failed to delete %d jobs from JobDB" % len(jobList))
            else:
                gLogger.info("Deleted %d jobs from JobDB" % len(jobList))

            for jobID in jobList:
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultTQ["OK"]:
                    gLogger.warn("Failed to remove job %d from TaskQueueDB" % jobID, resultTQ["Message"])
                    error_count += 1
                else:
                    count += 1

        if count > 0 or error_count > 0:
            gLogger.info("Deleted %d jobs from JobDB, %d errors" % (count, error_count))
        return S_OK()
class JobCleaningAgent(AgentModule):
    """Agent for removing jobs in status "Deleted", and not only"""

    def __init__(self, *args, **kwargs):
        """c'tor"""
        AgentModule.__init__(self, *args, **kwargs)

        # clients
        self.jobDB = None

        self.maxJobsAtOnce = 500
        self.prodTypes = []
        # days after which jobs in a given final status are set to Deleted
        self.removeStatusDelay = {}
        # days after which HeartBeatLoggingInfo for a given status is removed
        self.removeStatusDelayHB = {}

    #############################################################################
    def initialize(self):
        """Sets defaults"""
        self.jobDB = JobDB()

        agentTSTypes = self.am_getOption("ProductionTypes", [])
        if agentTSTypes:
            self.prodTypes = agentTSTypes
        else:
            self.prodTypes = Operations().getValue("Transformations/DataProcessing", ["MCSimulation", "Merge"])
        self.log.info(
            "Will exclude the following Production types from cleaning %s" % (", ".join(self.prodTypes)))
        self.maxJobsAtOnce = self.am_getOption("MaxJobsAtOnce", self.maxJobsAtOnce)

        # A negative delay disables deletion for that status
        self.removeStatusDelay[JobStatus.DONE] = self.am_getOption("RemoveStatusDelay/Done", 7)
        self.removeStatusDelay[JobStatus.KILLED] = self.am_getOption("RemoveStatusDelay/Killed", 7)
        self.removeStatusDelay[JobStatus.FAILED] = self.am_getOption("RemoveStatusDelay/Failed", 7)
        self.removeStatusDelay["Any"] = self.am_getOption("RemoveStatusDelay/Any", -1)

        self.removeStatusDelayHB[JobStatus.DONE] = self.am_getOption("RemoveStatusDelayHB/Done", -1)
        self.removeStatusDelayHB[JobStatus.KILLED] = self.am_getOption("RemoveStatusDelayHB/Killed", -1)
        self.removeStatusDelayHB[JobStatus.FAILED] = self.am_getOption("RemoveStatusDelayHB/Failed", -1)
        self.maxHBJobsAtOnce = self.am_getOption("MaxHBJobsAtOnce", 0)

        return S_OK()

    def _getAllowedJobTypes(self):
        """Get valid jobTypes: every distinct JobType found in the JobDB that
        is not a production type.

        :returns: S_OK( list of job types )/S_ERROR()
        """
        result = self.jobDB.getDistinctJobAttributes("JobType")
        if not result["OK"]:
            return result
        cleanJobTypes = []
        for jobType in result["Value"]:
            if jobType not in self.prodTypes:
                cleanJobTypes.append(jobType)
        self.log.notice("JobTypes to clean %s" % cleanJobTypes)
        return S_OK(cleanJobTypes)

    def execute(self):
        """Remove or delete jobs in various status"""
        # First, fully remove jobs in JobStatus.DELETED state
        result = self.removeDeletedJobs()
        if not result["OK"]:
            self.log.error("Failed to remove jobs with status %s" % JobStatus.DELETED)

        # Second: set the status to JobStatus.DELETED for certain jobs

        # Get all the Job types for which we can set the status to JobStatus.DELETED
        result = self._getAllowedJobTypes()
        if not result["OK"]:
            return result

        # No jobs in the system subject to deletion
        if not result["Value"]:
            return S_OK()

        baseCond = {"JobType": result["Value"]}
        # Delete jobs with final status
        for status in self.removeStatusDelay:
            delay = self.removeStatusDelay[status]
            if delay < 0:
                # Negative delay means don't delete anything...
                continue
            condDict = dict(baseCond)
            if status != "Any":
                condDict["Status"] = status
            delTime = str(Time.dateTime() - delay * Time.day)
            result = self.deleteJobsByStatus(condDict, delTime)
            if not result["OK"]:
                self.log.error("Failed to delete jobs", "with condDict %s" % condDict)

        if self.maxHBJobsAtOnce > 0:
            for status, delay in self.removeStatusDelayHB.items():
                if delay > 0:
                    self.removeHeartBeatLoggingInfo(status, delay)

        return S_OK()

    def removeDeletedJobs(self):
        """Fully remove jobs that are already in status "DELETED", unless there are still requests.

        :returns: S_OK/S_ERROR
        """
        res = self._getJobsList({"Status": JobStatus.DELETED})
        if not res["OK"]:
            return res
        jobList = res["Value"]
        if not jobList:
            self.log.info("No jobs to remove")
            return S_OK()

        self.log.info("Unassigning sandboxes from soon to be deleted jobs", "(%d)" % len(jobList))
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result["OK"]:
            self.log.error("Cannot unassign jobs to sandboxes", result["Message"])
            return result

        self.log.info("Attempting to remove deleted jobs", "(%d)" % len(jobList))

        # remove from jobList those that have still Operations to do in RMS
        reqClient = ReqClient()
        res = reqClient.getRequestIDsForJobs(jobList)
        if not res["OK"]:
            return res
        if res["Value"]["Successful"]:
            notFinal = set()
            # Check whether these requests are in a final status
            for job, reqID in res["Value"]["Successful"].items():
                # If not, remove job from list to remove
                if reqClient.getRequestStatus(reqID).get("Value") not in Request.FINAL_STATES:
                    # Keep that job
                    notFinal.add(job)
                else:
                    # Remove the request, if failed, keep the job
                    res1 = reqClient.deleteRequest(reqID)
                    if not res1["OK"]:
                        notFinal.add(job)
            if notFinal:
                self.log.info(
                    "Some jobs won't be removed, as still having Requests not in final status",
                    "(n=%d)" % len(notFinal))
                jobList = list(set(jobList) - notFinal)
        if not jobList:
            return S_OK()

        ownerJobsDict = self._getOwnerJobsDict(jobList)

        fail = False
        for owner, jobsList in ownerJobsDict.items():
            ownerDN = owner.split(";")[0]
            ownerGroup = owner.split(";")[1]
            self.log.verbose(
                "Attempting to remove jobs", "(n=%d) for %s : %s" % (len(jobsList), ownerDN, ownerGroup))
            wmsClient = WMSClient(useCertificates=True, delegatedDN=ownerDN, delegatedGroup=ownerGroup)
            result = wmsClient.removeJob(jobsList)
            if not result["OK"]:
                self.log.error(
                    "Could not remove jobs",
                    "for %s : %s (n=%d) : %s" % (ownerDN, ownerGroup, len(jobsList), result["Message"]),
                )
                fail = True

        if fail:
            return S_ERROR()

        return S_OK()

    def deleteJobsByStatus(self, condDict, delay=False):
        """Sets the job status to "DELETED" for jobs in condDict.

        :param dict condDict: a dict like {'JobType': 'User', 'Status': 'Killed'}
        :param int delay: days of delay
        :returns: S_OK/S_ERROR
        """
        res = self._getJobsList(condDict, delay)
        if not res["OK"]:
            return res
        jobList = res["Value"]
        if not jobList:
            return S_OK()

        self.log.notice("Attempting to delete jobs", "(%d for %s)" % (len(jobList), condDict))

        result = self.deleteJobOversizedSandbox(jobList)  # This might set a request
        if not result["OK"]:
            self.log.error("Cannot schedule removal of oversized sandboxes", result["Message"])
            return result

        # FIX: the dict returned by deleteJobOversizedSandbox uses the literal
        # key "Failed"; indexing with JobStatus.FAILED only worked while the
        # two string values happened to coincide.
        failedJobs = result["Value"]["Failed"]
        for job in failedJobs:
            jobList.pop(jobList.index(job))
        if not jobList:
            return S_OK()

        ownerJobsDict = self._getOwnerJobsDict(jobList)

        fail = False
        for owner, jobsList in ownerJobsDict.items():
            ownerDN = owner.split(";")[0]
            ownerGroup = owner.split(";")[1]
            self.log.verbose(
                "Attempting to delete jobs", "(n=%d) for %s : %s" % (len(jobsList), ownerDN, ownerGroup))
            wmsClient = WMSClient(useCertificates=True, delegatedDN=ownerDN, delegatedGroup=ownerGroup)
            result = wmsClient.deleteJob(jobsList)
            if not result["OK"]:
                self.log.error(
                    "Could not delete jobs",
                    "for %s : %s (n=%d) : %s" % (ownerDN, ownerGroup, len(jobsList), result["Message"]),
                )
                fail = True

        if fail:
            return S_ERROR()

        return S_OK()

    def _getJobsList(self, condDict, delay=None):
        """Get jobs list according to conditions

        :param dict condDict: a dict like {'JobType': 'User', 'Status': 'Killed'}
        :param int delay: days of delay
        :returns: S_OK with jobsList
        """
        jobIDsS = set()
        delayStr = "and older than %s" % delay if delay else ""
        self.log.info("Get jobs with %s %s" % (str(condDict), delayStr))
        # Select from both ends of the JobID range so the agent makes progress
        # even when maxJobsAtOnce is smaller than the backlog
        for order in ["JobID:ASC", "JobID:DESC"]:
            result = self.jobDB.selectJobs(condDict, older=delay, orderAttribute=order, limit=self.maxJobsAtOnce)
            if not result["OK"]:
                return result
            jobIDsS = jobIDsS.union({int(jID) for jID in result["Value"]})
        return S_OK(list(jobIDsS))

    def _getOwnerJobsDict(self, jobList):
        """
        :param list jobList: list of int(JobID)

        :returns: a dict with a grouping of them by owner, e.g.
                  {'dn;group': [1, 3, 4], 'dn;group_1': [5], 'dn_1;group': [2]}
        """
        res = self.jobDB.getJobsAttributes(jobList, ["OwnerDN", "OwnerGroup"])
        if not res["OK"]:
            self.log.error("Could not get the jobs attributes", res["Message"])
            return res
        jobsDictAttribs = res["Value"]

        ownerJobsDict = {}
        for jobID, jobDict in jobsDictAttribs.items():
            # FIX: build the key explicitly from the two named attributes
            # instead of relying on the ordering of jobDict.values()
            ownerKey = "%s;%s" % (jobDict["OwnerDN"], jobDict["OwnerGroup"])
            ownerJobsDict.setdefault(ownerKey, []).append(jobID)
        return ownerJobsDict

    def deleteJobOversizedSandbox(self, jobIDList):
        """
        Deletes the job oversized sandbox files from storage elements.
        Creates a request in RMS if not immediately possible.

        :param list jobIDList: list of job IDs
        :returns: S_OK/S_ERROR
        """
        failed = {}
        successful = {}

        result = JobMonitoringClient().getJobParameters(jobIDList, ["OutputSandboxLFN"])
        if not result["OK"]:
            return result
        osLFNDict = result["Value"]
        if not osLFNDict:
            return S_OK({"Successful": successful, "Failed": failed})
        # Keep only jobs that actually have an oversized sandbox recorded
        osLFNDict = dict(osLFN for osLFN in osLFNDict.items() if osLFN[1])

        self.log.verbose("Deleting oversized sandboxes", osLFNDict)
        # Schedule removal of the LFNs now, with the owner's credentials
        for jobID, outputSandboxLFNdict in osLFNDict.items():  # can be an iterator
            lfn = outputSandboxLFNdict["OutputSandboxLFN"]
            result = self.jobDB.getJobAttributes(jobID, ["OwnerDN", "OwnerGroup"])
            if not result["OK"]:
                failed[jobID] = lfn
                continue
            if not result["Value"]:
                failed[jobID] = lfn
                continue
            ownerDN = result["Value"]["OwnerDN"]
            ownerGroup = result["Value"]["OwnerGroup"]
            result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
            if not result["OK"]:
                failed[jobID] = lfn
            else:
                successful[jobID] = lfn

        result = {"Successful": successful, "Failed": failed}
        return S_OK(result)

    def __setRemovalRequest(self, lfn, ownerDN, ownerGroup):
        """Set removal request with the given credentials

        :param str lfn: LFN to remove
        :param str ownerDN: owner DN used as the request owner
        :param str ownerGroup: owner group used as the request owner
        :returns: result of ReqClient.putRequest
        """
        oRequest = Request()
        oRequest.OwnerDN = ownerDN
        oRequest.OwnerGroup = ownerGroup
        oRequest.RequestName = os.path.basename(lfn).strip() + "_removal_request.xml"
        oRequest.SourceComponent = "JobCleaningAgent"

        removeFile = Operation()
        removeFile.Type = "RemoveFile"

        removedFile = File()
        removedFile.LFN = lfn

        removeFile.addFile(removedFile)
        oRequest.addOperation(removeFile)

        # put the request with the owner certificate to make sure it's still a valid DN
        return ReqClient(useCertificates=True, delegatedDN=ownerDN, delegatedGroup=ownerGroup).putRequest(oRequest)

    def removeHeartBeatLoggingInfo(self, status, delayDays):
        """Remove HeartBeatLoggingInfo for jobs with given status after given number of days.

        :param str status: Job Status
        :param int delayDays: number of days after which information is removed
        :returns: None
        """
        self.log.info(
            "Removing HeartBeatLoggingInfo for Jobs with %s and older than %s day(s)" % (status, delayDays))
        delTime = str(Time.dateTime() - delayDays * Time.day)
        result = self.jobDB.removeInfoFromHeartBeatLogging(status, delTime, self.maxHBJobsAtOnce)
        if not result["OK"]:
            self.log.error("Failed to delete from HeartBeatLoggingInfo", result["Message"])
        else:
            self.log.info("Deleted HeartBeatLogging info")
        return
class TaskAgent( AgentModule ):
  """ Agent keeping the status and string attributes of tasks in sync with their jobs.

      The specific agents must provide the following methods:
        - initialize() for initial settings
        - beginExecution()
        - execute() - the main method called in the agent cycle
        - endExecution()
        - finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  def initialize( self ):
    """ Create the task and job database helpers used in every cycle """
    self.__taskDB = TaskDB()
    self.__jobDB = JobDB()
    return S_OK()

  def execute( self ):
    """ Main execution method: refresh every task in a non-final status """
    condDict = { 'Status': ['Ready', 'Processing', 'Finished'] }
    result = self.__taskDB.getTasks( [ 'TaskID', 'Status' ], condDict )
    if not result['OK']:
      return result
    tasks = result['Value']
    self.log.info( '%d tasks will be refreshed' % len( tasks ) )
    for task in tasks:
      taskID = task[0]
      status = task[1]
      if status in ['Ready', 'Processing', 'Finished']:
        self.__refreshTask( taskID )
    return S_OK()

  def __refreshTask( self, taskID ):
    """ Refresh site, job group and status of a single task, logging failures """
    result = self.__refreshTaskStringAttribute( taskID, 'Site' )
    if result['OK']:
      self.log.debug( 'Task %d site is refreshed' % taskID )
    else:
      self.log.error( 'Task %d site refresh failed: %s' % ( taskID, result['Message'] ) )
    result = self.__refreshTaskStringAttribute( taskID, 'JobGroup' )
    if result['OK']:
      self.log.debug( 'Task %d job group is refreshed' % taskID )
    else:
      self.log.error( 'Task %d job group refresh failed: %s' % ( taskID, result['Message'] ) )
    result = self.__refreshTaskStatus( taskID )
    if result['OK']:
      self.log.debug( 'Task %d status is refreshed' % taskID )
    else:
      self.log.error( 'Task %d status refresh failed: %s' % ( taskID, result['Message'] ) )

  ################################################################################

  def __getTaskProgress( self, taskID ):
    """ Count the task's jobs per coarse category (Done/Failed/Running/Waiting/Deleted)

        :param int taskID: task ID
        :returns: S_OK with the progress counters dict / S_ERROR
    """
    result = self.__taskDB.getTaskJobs( taskID )
    if not result['OK']:
      return result
    jobIDs = result['Value']
    result = self.__jobDB.getAttributesForJobList( jobIDs, ['Status'] )
    if not result['OK']:
      return result
    statuses = result['Value']
    progress = { 'Total': 0, 'Done': 0, 'Failed': 0, 'Running': 0, 'Waiting': 0, 'Deleted': 0 }
    progress['Total'] = len( jobIDs )
    for jobID in jobIDs:
      if jobID in statuses:
        status = statuses[jobID]['Status']
        if status in ['Done']:
          progress['Done'] += 1
        elif status in ['Failed', 'Stalled', 'Killed']:
          progress['Failed'] += 1
        elif status in ['Running', 'Completed']:
          progress['Running'] += 1
        else:
          progress['Waiting'] += 1
      else:
        # job no longer known to the JobDB
        progress['Deleted'] += 1
    return S_OK( progress )

  def __analyseTaskStatus( self, progress ):
    """ Derive the task status from the job progress counters

        :param dict progress: counters as produced by __getTaskProgress
        :returns: str status ('Expired', 'Finished' or 'Processing')
    """
    totalJob = progress.get( 'Total', 0 )
    runningJob = progress.get( 'Running', 0 )
    waitingJob = progress.get( 'Waiting', 0 )
    deletedJob = progress.get( 'Deleted', 0 )
    status = 'Unknown'
    if deletedJob == totalJob:
      status = 'Expired'
    elif runningJob == 0 and waitingJob == 0:
      status = 'Finished'
    else:
      status = 'Processing'
    return status

  def __refreshTaskStatus( self, taskID ):
    """ Refresh the task status """
    # get task progress from the job list
    result = self.__getTaskProgress( taskID )
    if not result['OK']:
      return result
    progress = result['Value']
    self.log.debug( 'Task %d Progress: %s' % ( taskID, progress ) )
    result = self.__taskDB.updateTaskProgress( taskID, progress )
    if not result['OK']:
      return result
    # get previous task status
    result = self.__taskDB.getTaskStatus( taskID )
    if not result['OK']:
      return result
    status = result['Value']
    # get current task status from the progress
    newStatus = self.__analyseTaskStatus( progress )
    self.log.debug( 'Task %d new status: %s' % ( taskID, newStatus ) )
    if newStatus != status:
      # capture the update result: previously the return value of updateTaskStatus
      # was discarded and the stale getTaskStatus result was checked instead
      result = self.__taskDB.updateTaskStatus( taskID, newStatus, 'Status refreshed' )
      if not result['OK']:
        return result
    return S_OK( newStatus )

  ################################################################################

  def __getTaskAttribute( self, taskID, attributeType ):
    """ Get all attributes of the jobs in the task """
    result = self.__taskDB.getTaskJobs( taskID )
    if not result['OK']:
      return result
    jobIDs = result['Value']
    condDict = { 'JobID': jobIDs }
    result = self.__jobDB.getDistinctJobAttributes( attributeType, condDict )
    if not result['OK']:
      return result
    attributes = result['Value']
    return S_OK( attributes )

  def __refreshTaskStringAttribute( self, taskID, attributeType ):
    """ Refresh the task attribute. The attribute type must be string and separated by comma """
    # get task attributes from the job list
    result = self.__getTaskAttribute( taskID, attributeType )
    if not result['OK']:
      return result
    newAttributes = result['Value']
    # get previous task attributes
    result = self.__taskDB.getTask( taskID, [attributeType] )
    if not result['OK']:
      return result
    oldAttributes = result['Value'][0].split( ',' )
    # check whether there are differences
    if set( newAttributes ) == set( oldAttributes ):
      self.log.debug( 'Task %s attribute is the same: %s' % ( attributeType, oldAttributes ) )
      return S_OK( oldAttributes )
    # make a combination of old and new attributes
    attributes = list( set( oldAttributes ) | set( newAttributes ) )
    for emptyAttr in [ '', 'ANY', 'Multiple' ]:
      if emptyAttr in attributes:
        attributes.remove( emptyAttr )
    # generate a new attribute
    allAttributes = ','.join( attributes )
    result = self.__taskDB.updateTask( taskID, [attributeType], [allAttributes] )
    if not result['OK']:
      return result
    return S_OK( allAttributes )
class TaskAgent(AgentModule):
    """ Agent keeping the status and string attributes of tasks in sync with their jobs.

        The specific agents must provide the following methods:
          - initialize() for initial settings
          - beginExecution()
          - execute() - the main method called in the agent cycle
          - endExecution()
          - finalize() - the graceful exit of the method, this one is usually used for the agent restart
    """

    def initialize(self):
        """ Create the task and job database helpers used in every cycle """
        self.__taskDB = TaskDB()
        self.__jobDB = JobDB()
        return S_OK()

    def execute(self):
        """ Main execution method: refresh every task in a non-final status """
        condDict = {'Status': ['Ready', 'Processing', 'Finished']}
        result = self.__taskDB.getTasks(['TaskID', 'Status'], condDict)
        if not result['OK']:
            return result
        tasks = result['Value']
        self.log.info('%d tasks will be refreshed' % len(tasks))
        for task in tasks:
            taskID = task[0]
            status = task[1]
            if status in ['Ready', 'Processing', 'Finished']:
                self.__refreshTask(taskID)
        return S_OK()

    def __refreshTask(self, taskID):
        """ Refresh site, job group and status of a single task, logging failures """
        result = self.__refreshTaskStringAttribute(taskID, 'Site')
        if result['OK']:
            self.log.debug('Task %d site is refreshed' % taskID)
        else:
            self.log.error('Task %d site refresh failed: %s' % (taskID, result['Message']))
        result = self.__refreshTaskStringAttribute(taskID, 'JobGroup')
        if result['OK']:
            self.log.debug('Task %d job group is refreshed' % taskID)
        else:
            self.log.error('Task %d job group refresh failed: %s' % (taskID, result['Message']))
        result = self.__refreshTaskStatus(taskID)
        if result['OK']:
            self.log.debug('Task %d status is refreshed' % taskID)
        else:
            self.log.error('Task %d status refresh failed: %s' % (taskID, result['Message']))

    ################################################################################

    def __getTaskProgress(self, taskID):
        """ Count the task's jobs per coarse category (Done/Failed/Running/Waiting/Deleted)

            :param int taskID: task ID
            :returns: S_OK with the progress counters dict / S_ERROR
        """
        result = self.__taskDB.getTaskJobs(taskID)
        if not result['OK']:
            return result
        jobIDs = result['Value']
        result = self.__jobDB.getAttributesForJobList(jobIDs, ['Status'])
        if not result['OK']:
            return result
        statuses = result['Value']
        progress = {
            'Total': 0, 'Done': 0, 'Failed': 0, 'Running': 0, 'Waiting': 0, 'Deleted': 0
        }
        progress['Total'] = len(jobIDs)
        for jobID in jobIDs:
            if jobID in statuses:
                status = statuses[jobID]['Status']
                if status in ['Done']:
                    progress['Done'] += 1
                elif status in ['Failed', 'Stalled', 'Killed']:
                    progress['Failed'] += 1
                elif status in ['Running', 'Completed']:
                    progress['Running'] += 1
                else:
                    progress['Waiting'] += 1
            else:
                # job no longer known to the JobDB
                progress['Deleted'] += 1
        return S_OK(progress)

    def __analyseTaskStatus(self, progress):
        """ Derive the task status from the job progress counters

            :param dict progress: counters as produced by __getTaskProgress
            :returns: str status ('Expired', 'Finished' or 'Processing')
        """
        totalJob = progress.get('Total', 0)
        runningJob = progress.get('Running', 0)
        waitingJob = progress.get('Waiting', 0)
        deletedJob = progress.get('Deleted', 0)
        status = 'Unknown'
        if deletedJob == totalJob:
            status = 'Expired'
        elif runningJob == 0 and waitingJob == 0:
            status = 'Finished'
        else:
            status = 'Processing'
        return status

    def __refreshTaskStatus(self, taskID):
        """ Refresh the task status """
        # get task progress from the job list
        result = self.__getTaskProgress(taskID)
        if not result['OK']:
            return result
        progress = result['Value']
        self.log.debug('Task %d Progress: %s' % (taskID, progress))
        result = self.__taskDB.updateTaskProgress(taskID, progress)
        if not result['OK']:
            return result
        # get previous task status
        result = self.__taskDB.getTaskStatus(taskID)
        if not result['OK']:
            return result
        status = result['Value']
        # get current task status from the progress
        newStatus = self.__analyseTaskStatus(progress)
        self.log.debug('Task %d new status: %s' % (taskID, newStatus))
        if newStatus != status:
            # capture the update result: previously the return value of updateTaskStatus
            # was discarded and the stale getTaskStatus result was checked instead
            result = self.__taskDB.updateTaskStatus(taskID, newStatus, 'Status refreshed')
            if not result['OK']:
                return result
        return S_OK(newStatus)

    ################################################################################

    def __getTaskAttribute(self, taskID, attributeType):
        """ Get all attributes of the jobs in the task """
        result = self.__taskDB.getTaskJobs(taskID)
        if not result['OK']:
            return result
        jobIDs = result['Value']
        condDict = {'JobID': jobIDs}
        result = self.__jobDB.getDistinctJobAttributes(attributeType, condDict)
        if not result['OK']:
            return result
        attributes = result['Value']
        return S_OK(attributes)

    def __refreshTaskStringAttribute(self, taskID, attributeType):
        """ Refresh the task attribute. The attribute type must be string and separated by comma """
        # get task attributes from the job list
        result = self.__getTaskAttribute(taskID, attributeType)
        if not result['OK']:
            return result
        newAttributes = result['Value']
        # get previous task attributes
        result = self.__taskDB.getTask(taskID, [attributeType])
        if not result['OK']:
            return result
        oldAttributes = result['Value'][0].split(',')
        # check whether there are differences
        if set(newAttributes) == set(oldAttributes):
            self.log.debug('Task %s attribute is the same: %s' % (attributeType, oldAttributes))
            return S_OK(oldAttributes)
        # make a combination of old and new attributes
        attributes = list(set(oldAttributes) | set(newAttributes))
        for emptyAttr in ['', 'ANY', 'Multiple']:
            if emptyAttr in attributes:
                attributes.remove(emptyAttr)
        # generate a new attribute
        allAttributes = ','.join(attributes)
        result = self.__taskDB.updateTask(taskID, [attributeType], [allAttributes])
        if not result['OK']:
            return result
        return S_OK(allAttributes)
class JobCleaningAgent(AgentModule):
    """ Agent removing jobs in terminal states from the DIRAC databases.

        The specific agents must provide the following methods:
          - initialize() for initial settings
          - beginExecution()
          - execute() - the main method called in the agent cycle
          - endExecution()
          - finalize() - the graceful exit of the method, this one is usually used
            for the agent restart
    """

    #############################################################################
    def initialize(self):
        """Sets defaults """
        self.am_setOption("PollingTime", 60)
        self.jobDB = JobDB()
        self.taskQueueDB = TaskQueueDB()
        self.jobLoggingDB = JobLoggingDB()
        # self.sandboxDB = SandboxDB( 'SandboxDB' )
        agentTSTypes = self.am_getOption('ProductionTypes', [])
        if agentTSTypes:
            self.prod_types = agentTSTypes
        else:
            self.prod_types = Operations().getValue(
                'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
        # ', '.join() works on Python 2 and 3; string.join() was Python-2 only
        gLogger.info('Will exclude the following Production types from cleaning %s'
                     % (', '.join(self.prod_types)))
        self.maxJobsAtOnce = self.am_getOption('MaxJobsAtOnce', 100)
        self.jobByJob = self.am_getOption('JobByJob', True)
        self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
        return S_OK()

    def __getAllowedJobTypes(self):
        """ Get the list of JobTypes that are allowed to be cleaned
            (everything except the configured production types)
        """
        result = self.jobDB.getDistinctJobAttributes('JobType')
        if not result['OK']:
            return result
        cleanJobTypes = []
        for jobType in result['Value']:
            if jobType not in self.prod_types:
                cleanJobTypes.append(jobType)
        self.log.notice("JobTypes to clean %s" % cleanJobTypes)
        return S_OK(cleanJobTypes)

    #############################################################################
    def execute(self):
        """The PilotAgent execution method.
        """
        #Delete jobs in "Deleted" state
        result = self.removeJobsByStatus({'Status': 'Deleted'})
        if not result['OK']:
            return result
        #Get all the Job types that can be cleaned
        result = self.__getAllowedJobTypes()
        if not result['OK']:
            return result
        # No jobs in the system subject to removal: without this guard an empty
        # JobType list would not restrict the selection at all
        if not result['Value']:
            return S_OK()
        baseCond = {'JobType': result['Value']}
        # Remove jobs with final status
        for status in REMOVE_STATUS_DELAY:
            delay = REMOVE_STATUS_DELAY[status]
            condDict = dict(baseCond)
            condDict['Status'] = status
            delTime = str(Time.dateTime() - delay * Time.day)
            result = self.removeJobsByStatus(condDict, delTime)
            if not result['OK']:
                gLogger.warn('Failed to remove jobs in status %s' % status)
        return S_OK()

    def removeJobsByStatus(self, condDict, delay=False):
        """ Remove deleted jobs

            :param dict condDict: selection conditions (Status, JobType, ...)
            :param delay: optional cut-off time; only older jobs are removed
            :returns: S_OK/S_ERROR
        """
        if delay:
            gLogger.verbose("Removing jobs with %s and older than %s" % (condDict, delay))
            result = self.jobDB.selectJobs(condDict, older=delay, limit=self.maxJobsAtOnce)
        else:
            gLogger.verbose("Removing jobs with %s " % condDict)
            result = self.jobDB.selectJobs(condDict, limit=self.maxJobsAtOnce)
        if not result['OK']:
            return result
        jobList = result['Value']
        if len(jobList) > self.maxJobsAtOnce:
            jobList = jobList[:self.maxJobsAtOnce]
        if not jobList:
            return S_OK()
        self.log.notice("Deleting %s jobs for %s" % (len(jobList), condDict))
        count = 0
        error_count = 0
        result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
        if not result['OK']:
            gLogger.warn("Cannot unassign jobs to sandboxes", result['Message'])
        result = self.deleteJobOversizedSandbox(jobList)
        if not result['OK']:
            # fixed typo in the log message: "schedle" -> "schedule"
            gLogger.warn("Cannot schedule removal of oversized sandboxes", result['Message'])
            return result
        failedJobs = result['Value']['Failed']
        for job in failedJobs:
            jobList.pop(jobList.index(job))
        if self.jobByJob:
            for jobID in jobList:
                resultJobDB = self.jobDB.removeJobFromDB(jobID)
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                resultLogDB = self.jobLoggingDB.deleteJob(jobID)
                errorFlag = False
                # report the message of the operation that actually failed
                # (previously the unrelated sandbox-deletion result was printed)
                if not resultJobDB['OK']:
                    gLogger.warn('Failed to remove job %d from JobDB' % jobID,
                                 resultJobDB['Message'])
                    errorFlag = True
                if not resultTQ['OK']:
                    gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID,
                                 resultTQ['Message'])
                    errorFlag = True
                if not resultLogDB['OK']:
                    gLogger.warn('Failed to remove job %d from JobLoggingDB' % jobID,
                                 resultLogDB['Message'])
                    errorFlag = True
                if errorFlag:
                    error_count += 1
                else:
                    count += 1
                if self.throttlingPeriod:
                    time.sleep(self.throttlingPeriod)
        else:
            result = self.jobDB.removeJobFromDB(jobList)
            if not result['OK']:
                gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList))
            else:
                gLogger.info('Deleted %d jobs from JobDB' % len(jobList))
            for jobID in jobList:
                resultTQ = self.taskQueueDB.deleteJob(jobID)
                if not resultTQ['OK']:
                    gLogger.warn('Failed to remove job %d from TaskQueueDB' % jobID,
                                 resultTQ['Message'])
                    error_count += 1
                else:
                    count += 1
            result = self.jobLoggingDB.deleteJob(jobList)
            if not result['OK']:
                gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList))
            else:
                gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList))
        if count > 0 or error_count > 0:
            gLogger.info('Deleted %d jobs from JobDB, %d errors' % (count, error_count))
        return S_OK()

    def deleteJobOversizedSandbox(self, jobIDList):
        """ Delete the job oversized sandbox files from storage elements

            :param list jobIDList: list of job IDs
            :returns: S_OK with {'Successful': {jobID: lfn}, 'Failed': {jobID: lfn}}
        """
        failed = {}
        successful = {}
        lfnDict = {}
        for jobID in jobIDList:
            result = self.jobDB.getJobParameter(jobID, 'OutputSandboxLFN')
            if result['OK']:
                lfn = result['Value']
                if lfn:
                    lfnDict[lfn] = jobID
                else:
                    successful[jobID] = 'No oversized sandbox found'
            else:
                # fixed typo in the log message: "interrogting" -> "interrogating"
                gLogger.warn('Error interrogating JobDB: %s' % result['Message'])
        if not lfnDict:
            return S_OK({'Successful': successful, 'Failed': failed})
        # Schedule removal of the LFNs now
        for lfn, jobID in lfnDict.items():
            result = self.jobDB.getJobAttributes(jobID, ['OwnerDN', 'OwnerGroup'])
            if not result['OK']:
                failed[jobID] = lfn
                continue
            if not result['Value']:
                failed[jobID] = lfn
                continue
            ownerDN = result['Value']['OwnerDN']
            ownerGroup = result['Value']['OwnerGroup']
            result = self.__setRemovalRequest(lfn, ownerDN, ownerGroup)
            if not result['OK']:
                failed[jobID] = lfn
            else:
                successful[jobID] = lfn
        result = {'Successful': successful, 'Failed': failed}
        return S_OK(result)

    def __setRemovalRequest(self, lfn, ownerDN, ownerGroup):
        """ Set removal request with the given credentials """
        request = RequestContainer()
        request.setRequestAttributes({
            'OwnerDN': ownerDN,
            'OwnerGroup': ownerGroup
        })
        requestName = os.path.basename(lfn).strip() + '_removal_request.xml'
        request.setRequestName(requestName)
        request.setSourceComponent('JobCleaningAgent')
        removalDict = {
            'Attributes': {
                'Operation': 'removeFile',
                'TargetSE': '',
                'ExecutionOrder': 0
            }
        }
        result = request.addSubRequest(removalDict, 'removal')
        if not result['OK']:
            return result
        index = result['Value']
        fileDict = {'LFN': lfn, 'PFN': '', 'Status': 'Waiting'}
        request.setSubRequestFiles(index, 'removal', [fileDict])
        client = RequestClient()
        result = request.toXML()
        if not result['OK']:
            return result
        xmlRequest = result['Value']
        result = client.setRequest(requestName, xmlRequest)
        return result
class JobCleaningAgent( AgentModule ):
  """ Agent removing jobs in terminal states from the DIRAC databases.

      The specific agents must provide the following methods:

         *  initialize() for initial settings
         *  beginExecution()
         *  execute() - the main method called in the agent cycle
         *  endExecution()
         *  finalize() - the graceful exit of the method, this one is usually used for the agent restart
  """

  def __init__( self, *args, **kwargs ):
    """ c'tor """
    AgentModule.__init__( self, *args, **kwargs )
    #clients
    # FIXME: shouldn't we avoid using the DBs directly, and instead go through the service?
    self.jobDB = None
    self.taskQueueDB = None
    self.jobLoggingDB = None
    self.maxJobsAtOnce = 100
    self.jobByJob = False
    self.throttlingPeriod = 0.
    self.prodTypes = []
    # Status -> delay in days before removal; -1 disables removal for that status
    self.removeStatusDelay = {}

  #############################################################################
  def initialize( self ):
    """ Sets defaults """
    self.am_setOption( "PollingTime", 120 )
    self.jobDB = JobDB()
    self.taskQueueDB = TaskQueueDB()
    self.jobLoggingDB = JobLoggingDB()
    # self.sandboxDB = SandboxDB( 'SandboxDB' )
    agentTSTypes = self.am_getOption('ProductionTypes', [])
    if agentTSTypes:
      self.prodTypes = agentTSTypes
    else:
      self.prodTypes = Operations().getValue( 'Transformations/DataProcessing',
                                              ['MCSimulation', 'Merge'])
    gLogger.info("Will exclude the following Production types from cleaning %s" % (
        ', '.join(self.prodTypes)))
    self.maxJobsAtOnce = self.am_getOption( 'MaxJobsAtOnce', 500 )
    self.jobByJob = self.am_getOption( 'JobByJob', False )
    self.throttlingPeriod = self.am_getOption('ThrottlingPeriod', 0.)
    self.removeStatusDelay['Done'] = self.am_getOption( 'RemoveStatusDelay/Done', 7 )
    self.removeStatusDelay['Killed'] = self.am_getOption( 'RemoveStatusDelay/Killed', 7 )
    self.removeStatusDelay['Failed'] = self.am_getOption( 'RemoveStatusDelay/Failed', 7 )
    self.removeStatusDelay['Any'] = self.am_getOption( 'RemoveStatusDelay/Any', -1 )
    return S_OK()

  def __getAllowedJobTypes( self ):
    """ Get valid jobTypes (everything except the configured production types) """
    result = self.jobDB.getDistinctJobAttributes( 'JobType' )
    if not result[ 'OK' ]:
      return result
    cleanJobTypes = []
    for jobType in result[ 'Value' ]:
      if jobType not in self.prodTypes:
        cleanJobTypes.append( jobType )
    self.log.notice( "JobTypes to clean %s" % cleanJobTypes )
    return S_OK( cleanJobTypes )

  #############################################################################
  def execute( self ):
    """ Remove jobs in various status """
    #Delete jobs in "Deleted" state
    result = self.removeJobsByStatus( { 'Status' : 'Deleted' } )
    if not result[ 'OK' ]:
      return result
    #Get all the Job types that can be cleaned
    result = self.__getAllowedJobTypes()
    if not result[ 'OK' ]:
      return result
    # No jobs in the system subject to removal
    if not result['Value']:
      return S_OK()
    baseCond = { 'JobType' : result[ 'Value' ] }
    # Remove jobs with final status
    for status in self.removeStatusDelay:
      delay = self.removeStatusDelay[ status ]
      if delay < 0:
        # Negative delay means don't delete anything...
        continue
      condDict = dict( baseCond )
      if status != 'Any':
        condDict[ 'Status' ] = status
      delTime = str( Time.dateTime() - delay * Time.day )
      result = self.removeJobsByStatus( condDict, delTime )
      if not result['OK']:
        gLogger.warn( 'Failed to remove jobs in status %s' % status )
    return S_OK()

  def removeJobsByStatus( self, condDict, delay = False ):
    """ Remove deleted jobs

        :param dict condDict: selection conditions (Status, JobType, ...)
        :param delay: optional cut-off time; only jobs older than this are removed
        :returns: S_OK/S_ERROR
    """
    if delay:
      gLogger.verbose( "Removing jobs with %s and older than %s day(s)" % ( condDict, delay ) )
      result = self.jobDB.selectJobs( condDict, older = delay, limit = self.maxJobsAtOnce )
    else:
      gLogger.verbose( "Removing jobs with %s " % condDict )
      result = self.jobDB.selectJobs( condDict, limit = self.maxJobsAtOnce )
    if not result['OK']:
      return result
    jobList = result['Value']
    if len(jobList) > self.maxJobsAtOnce:
      jobList = jobList[:self.maxJobsAtOnce]
    if not jobList:
      return S_OK()
    self.log.notice( "Deleting %s jobs for %s" % ( len( jobList ), condDict ) )
    count = 0
    error_count = 0
    result = SandboxStoreClient( useCertificates = True ).unassignJobs( jobList )
    if not result[ 'OK' ]:
      gLogger.error("Cannot unassign jobs to sandboxes", result['Message'])
      return result
    result = self.deleteJobOversizedSandbox(jobList)
    if not result[ 'OK' ]:
      gLogger.error( "Cannot schedule removal of oversized sandboxes", result['Message'])
      return result
    failedJobs = result['Value']['Failed']
    for job in failedJobs:
      jobList.pop(jobList.index(job))
    # TODO: we should not remove a job if it still has requests in the RequestManager.
    # But this logic should go in the client or in the service, and right now no service expose jobDB.removeJobFromDB
    if self.jobByJob:
      for jobID in jobList:
        resultJobDB = self.jobDB.removeJobFromDB( jobID )
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        resultLogDB = self.jobLoggingDB.deleteJob( jobID )
        errorFlag = False
        # report the message of the operation that actually failed
        # (previously the unrelated sandbox-deletion result was printed)
        if not resultJobDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobDB' % jobID, resultJobDB['Message'] )
          errorFlag = True
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          errorFlag = True
        if not resultLogDB['OK']:
          gLogger.warn( 'Failed to remove job %d from JobLoggingDB' % jobID, resultLogDB['Message'] )
          errorFlag = True
        if errorFlag:
          error_count += 1
        else:
          count += 1
        if self.throttlingPeriod:
          time.sleep(self.throttlingPeriod)
    else:
      result = self.jobDB.removeJobFromDB( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobDB' % len(jobList) )
      for jobID in jobList:
        resultTQ = self.taskQueueDB.deleteJob( jobID )
        if not resultTQ['OK']:
          gLogger.warn( 'Failed to remove job %d from TaskQueueDB' % jobID, resultTQ['Message'] )
          error_count += 1
        else:
          count += 1
      result = self.jobLoggingDB.deleteJob( jobList )
      if not result['OK']:
        gLogger.error('Failed to delete %d jobs from JobLoggingDB' % len(jobList) )
      else:
        gLogger.info('Deleted %d jobs from JobLoggingDB' % len(jobList) )
    if count > 0 or error_count > 0 :
      gLogger.info( 'Deleted %d jobs from JobDB, %d errors' % ( count, error_count ) )
    return S_OK()

  def deleteJobOversizedSandbox( self, jobIDList ):
    """ Delete the job oversized sandbox files from storage elements

        :param list jobIDList: list of job IDs
        :returns: S_OK with {'Successful': {jobID: lfn}, 'Failed': {jobID: lfn}}
    """
    failed = {}
    successful = {}
    lfnDict = {}
    for jobID in jobIDList:
      result = self.jobDB.getJobParameter( jobID, 'OutputSandboxLFN' )
      if result['OK']:
        lfn = result['Value']
        if lfn:
          lfnDict[lfn] = jobID
        else:
          successful[jobID] = 'No oversized sandbox found'
      else:
        gLogger.warn( 'Error interrogating JobDB: %s' % result['Message'] )
    if not lfnDict:
      return S_OK({'Successful': successful, 'Failed': failed})
    # Schedule removal of the LFNs now
    for lfn, jobID in lfnDict.items():
      result = self.jobDB.getJobAttributes( jobID, ['OwnerDN', 'OwnerGroup'] )
      if not result['OK']:
        failed[jobID] = lfn
        continue
      if not result['Value']:
        failed[jobID] = lfn
        continue
      ownerDN = result['Value']['OwnerDN']
      ownerGroup = result['Value']['OwnerGroup']
      result = self.__setRemovalRequest( lfn, ownerDN, ownerGroup )
      if not result['OK']:
        failed[jobID] = lfn
      else:
        successful[jobID] = lfn
    result = {'Successful':successful, 'Failed':failed}
    return S_OK(result)

  def __setRemovalRequest( self, lfn, ownerDN, ownerGroup ):
    """ Set removal request with the given credentials """
    oRequest = Request()
    oRequest.OwnerDN = ownerDN
    oRequest.OwnerGroup = ownerGroup
    oRequest.RequestName = os.path.basename( lfn ).strip() + '_removal_request.xml'
    oRequest.SourceComponent = 'JobCleaningAgent'
    removeFile = Operation()
    removeFile.Type = 'RemoveFile'
    removedFile = File()
    removedFile.LFN = lfn
    removeFile.addFile( removedFile )
    oRequest.addOperation( removeFile )
    return ReqClient().putRequest( oRequest )