Example #1
    def __init__(self, config):
        # queue url used in WorkQueueManager
        self.thisAgentUrl = "http://" + config.Agent.hostName + ":5984"
        self.globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
        self.localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)
        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()
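The constructor above only reads a few attributes from a WMCore configuration object. A minimal sketch of assembling such a config, assuming the standard WMCore.Configuration API; the hostname and CouchDB URLs are placeholders, not real endpoints:

from WMCore.Configuration import Configuration

config = Configuration()
config.section_("Agent")
config.Agent.hostName = "agent.example.com"                # placeholder host
config.section_("WorkloadSummary")
config.WorkloadSummary.couchurl = "http://localhost:5984"  # placeholder CouchDB URL
config.section_("WorkQueueManager")
config.WorkQueueManager.couchurl = "http://localhost:5984"  # placeholder CouchDB URL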
Example #2
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.condorAPI = PyCondorAPI()
Example #3
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI()
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = [
            'CondorPriority', 'NoJobRetries', 'EnableAllSites'
        ]

        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
Example #4
    def __init__(self, config):
        # queue url used in WorkQueueManager
        self.thisAgentUrl = "http://" + config.Agent.hostName + ":5984"
        self.globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
        self.localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)
        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()
Example #5
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        self.queue = queue
        self.config = config
        self.condorAPI = PyCondorAPI()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)
        self.listSubsWithoutJobs = self.daoFactory(
            classname="Subscriptions.GetSubsWithoutJobGroup")
Example #6
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.previousConfig = {}
        self.validSpeedDrainConfigKeys = [
            'CondorPriority', 'NoJobRetries', 'EnableAllSites'
        ]
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        self.emailAlert = EmailAlert(config.EmailAlert.dictionary_())
        self.condorStates = ("Running", "Idle")
Example #7
class DrainStatusAPI(object):
    """
    Provides methods for querying dbs and condor for drain statistics
    """
    def __init__(self):

        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()

    def collectDrainInfo(self):
        """
        Call methods to check the drain status
        """
        results = {}
        results['workflows_completed'] = self.checkWorkflows()

        # if workflows are completed, collect additional drain statistics
        if results['workflows_completed']:
            results['upload_status'] = self.checkFileUploadStatus()
            results['condor_status'] = self.checkCondorStates()

        return results

    def checkWorkflows(self):
        """
        Check to see if all workflows have a 'completed' status
        """
        results = self.dbsUtil.isAllWorkflowCompleted()
        return results

    def checkCondorStates(self):
        """
        Check idle and running jobs in Condor
        """
        results = {}
        queries = [["1", "idle"], ["2", "running"]]

        for query in queries:
            jobs = self.condorAPI.getCondorJobs("JobStatus=="+query[0], [])
            # if there is an error, report it instead of the length of an empty list
            if jobs is None:
                results[query[1]] = "unknown (schedd query error)"
            else:
                results[query[1]] = len(jobs)

        return results

    def checkFileUploadStatus(self):
        """
        Check file upload status:
            Blocks open in DBS
            Files not uploaded in DBS
            Files not uploaded to Phedex
        """
        results = {}
        results['dbs_open_blocks'] = self.dbsUtil.countOpenBlocks()
        results['dbs_notuploaded'] = self.dbsUtil.countFilesByStatus(status="NOTUPLOADED")
        results['phedex_notuploaded'] = self.dbsUtil.countPhedexNotUploaded()
        return results
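A minimal usage sketch for the no-argument DrainStatusAPI variant above; the result keys follow directly from collectDrainInfo(), and the nested statistics are only present once all workflows are completed:

api = DrainStatusAPI()
info = api.collectDrainInfo()
if info['workflows_completed']:
    print(info['upload_status'])   # counts from the DBS/PhEDEx upload checks
    print(info['condor_status'])   # idle/running job counts (or an error string)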
Example #8
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.condorAPI = PyCondorAPI()
Example #9
    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']

        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
Example #10
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        self.queue = queue
        self.config = config
        self.condorAPI = PyCondorAPI()

        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)
        self.listSubsWithoutJobs = self.daoFactory(classname="Subscriptions.GetSubsWithoutJobGroup")
Example #11
    def __init__(self):

        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()
Example #12
class WorkQueueManagerWorkPoller(BaseWorkerThread):
    """
    Polls for Work
    """
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        self.queue = queue
        self.config = config
        self.condorAPI = PyCondorAPI()

        self.daoFactory = DAOFactory(package="WMCore.WMBS",
                                     logger=logging,
                                     dbinterface=myThread.dbi)
        self.listSubsWithoutJobs = self.daoFactory(
            classname="Subscriptions.GetSubsWithoutJobGroup")

    def setup(self, parameters):
        """
        Called at startup - introduce random delay
             to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop', t)
        time.sleep(t)

    @timeFunction
    def algorithm(self, parameters):
        """
        Pull in work
        """
        self.logger.info("Starting WorkQueueManagerWorkPoller thread ...")
        try:
            self.pullWork()
        except Exception as ex:
            self.queue.logger.error("Error in work pull loop: %s", str(ex))
        try:
            # process if we get work or not - we may have to split old work
            # i.e. if transient errors were seen during splitting
            self.processWork()
        except Exception as ex:
            self.queue.logger.error("Error in new work split loop: %s",
                                    str(ex))
        return

    def passRetrieveCondition(self):
        """
        _passRetrieveCondition_
        Return "OK" if the component can proceed with fetching work,
        otherwise a reason string explaining why pulling is skipped this cycle.

        For now, it only checks whether the agent is in drain mode,
        MAX_JOBS_PER_OWNER is reached, or the condor schedd is overloaded.
        """
        passCond = "OK"
        myThread = threading.currentThread()
        if isDrainMode(self.config):
            passCond = "agent is in drain mode"
        elif availableScheddSlots(myThread.dbi) <= 0:
            passCond = "schedd slot is maxed: MAX_JOBS_PER_OWNER"
        elif self.condorAPI.isScheddOverloaded():
            passCond = "schedd is overloaded"
        else:
            subscriptions = self.listSubsWithoutJobs.execute()
            if subscriptions:
                passCond = "JobCreator hasn't created jobs for subscriptions %s" % subscriptions

        return passCond

    def pullWork(self):
        """Get work from parent"""
        self.queue.logger.info("Pulling work from %s",
                               self.queue.parent_queue.queueUrl)

        myThread = threading.currentThread()

        try:
            cond = self.passRetrieveCondition()
            if cond == "OK":
                work = self.queue.pullWork()
                self.queue.logger.info("Obtained %s unit(s) of work", work)
                myThread.logdbClient.delete("LocalWorkQueue_pullWork",
                                            "warning",
                                            this_thread=True)
            else:
                self.queue.logger.warning("No work will be pulled, reason: %s",
                                          cond)
                myThread.logdbClient.post("LocalWorkQueue_pullWork", cond,
                                          "warning")
        except IOError as ex:
            self.queue.logger.exception(
                "Error opening connection to work queue: %s", str(ex))
        except Exception as ex:
            self.queue.logger.exception(
                "Unable to pull work from parent Error: %s", str(ex))

    def processWork(self):
        """Process new work"""
        self.queue.logger.info("Splitting new work")
        try:
            self.queue.processInboundWork()
        except Exception as ex:
            self.queue.logger.exception('Error during split: %s', str(ex))
        self.logger.info('Splitting finished')
        return
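Note the return contract of passRetrieveCondition() in this variant: it yields the literal string "OK" or a human-readable reason, so callers must compare against "OK" (as pullWork() does) rather than truth-testing the result, because any non-empty reason string is also truthy. A sketch, assuming a constructed WorkQueueManagerWorkPoller instance named poller (hypothetical):

cond = poller.passRetrieveCondition()
if cond == "OK":   # `if cond:` would be wrong: every reason string is truthy too
    work = poller.queue.pullWork()
else:
    poller.queue.logger.warning("No work will be pulled, reason: %s", cond)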
Example #13
class DrainStatusAPI(object):
    """
    Provides methods for querying dbs and condor for drain statistics
    """
    def __init__(self, config):
        # queue url used in WorkQueueManager
        self.thisAgentUrl = "http://" + config.Agent.hostName + ":5984"
        self.globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
        self.localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)
        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()

    def collectDrainInfo(self):
        """
        Call methods to check the drain status
        """
        results = {}
        results['workflows_completed'] = self.checkWorkflows()

        # if workflows are completed, collect additional drain statistics
        if results['workflows_completed']:
            results['upload_status'] = self.checkFileUploadStatus()
            results['condor_status'] = self.checkCondorStates()
            results['local_wq_status'] = self.checkLocalWQStatus(dbname="workqueue")
            results['local_wqinbox_status'] = self.checkLocalWQStatus(dbname="workqueue_inbox")
            results['global_wq_status'] = self.checkGlobalWQStatus()

        return results

    def checkWorkflows(self):
        """
        Check to see if all workflows have a 'completed' status
        """
        results = self.dbsUtil.isAllWorkflowCompleted()
        return results

    def checkCondorStates(self):
        """
        Check idle and running jobs in Condor
        """
        results = {}
        queries = [["1", "idle"], ["2", "running"]]

        for query in queries:
            jobs = self.condorAPI.getCondorJobs("JobStatus=="+query[0], [])
            # if there is an error, report it instead of the length of an empty list
            if jobs is None:
                results[query[1]] = "unknown (schedd query error)"
            else:
                results[query[1]] = len(jobs)

        return results

    def checkFileUploadStatus(self):
        """
        Check file upload status:
            Blocks open in DBS
            Files not uploaded in DBS
            Files not uploaded to Phedex
        """
        results = {}
        results['dbs_open_blocks'] = self.dbsUtil.countOpenBlocks()
        results['dbs_notuploaded'] = self.dbsUtil.countFilesByStatus(status="NOTUPLOADED")
        results['phedex_notuploaded'] = self.dbsUtil.countPhedexNotUploaded()
        return results

    def checkLocalWQStatus(self, dbname):
        """
        Query local WorkQueue workqueue/workqueue_inbox database to see whether
        there are any active elements in this agent.
        """
        results = {}

        for st in ('Available', 'Negotiating', 'Acquired', 'Running'):
            if dbname == "workqueue":
                elements = self.localBackend.getElements(status=st, returnIdOnly=True)
            else:
                elements = self.localBackend.getInboxElements(status=st, returnIdOnly=True)
            results[st] = len(elements)
        return results

    def checkGlobalWQStatus(self):
        """
        Query Global WorkQueue workqueue database to see whether there are
        any active elements set to this agent.
        """
        results = {}

        for st in ("Acquired", "Running"):
            elements = self.globalBackend.getElements(status=st, returnIdOnly=True,
                                                      ChildQueueUrl=self.thisAgentUrl)
            results[st] = len(elements)
        return results
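For reference, a sketch of the nested dictionary collectDrainInfo() returns in this WorkQueue-aware variant once all workflows are completed; all counts are illustrative:

{'workflows_completed': True,
 'upload_status': {'dbs_open_blocks': 0, 'dbs_notuploaded': 0, 'phedex_notuploaded': 0},
 'condor_status': {'idle': 0, 'running': 0},
 'local_wq_status': {'Available': 0, 'Negotiating': 0, 'Acquired': 0, 'Running': 0},
 'local_wqinbox_status': {'Available': 0, 'Negotiating': 0, 'Acquired': 0, 'Running': 0},
 'global_wq_status': {'Acquired': 0, 'Running': 0}}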
Example #14
class WorkQueueManagerWorkPoller(BaseWorkerThread):
    """
    Polls for Work
    """
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        self.queue = queue
        self.config = config
        self.condorAPI = PyCondorAPI()

    def setup(self, parameters):
        """
        Called at startup - introduce random delay
             to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop' % t)
        time.sleep(t)

    def algorithm(self, parameters):
        """
        Pull in work
            """
        try:
            self.pullWork()
        except Exception as ex:
            self.queue.logger.error("Error in work pull loop: %s" % str(ex))
        try:
            # process if we get work or not - we may have to split old work
            # i.e. if transient errors were seen during splitting
            self.processWork()
        except Exception as ex:
            self.queue.logger.error("Error in new work split loop: %s" %
                                    str(ex))
        return

    def passRetrieveCondition(self):
        """
        _passRetrieveCondition_
        
        Return true if the component can proceed with fetching work.
        False if the component should skip pulling work this cycle.
        
        For now, it only checks whether the agent is in drain mode or
        if the condor schedd is overloaded.
        """
        passCond = True
        if isDrainMode(self.config):
            passCond = False
        elif self.condorAPI.isScheddOverloaded():
            passCond = False

        return passCond

    def pullWork(self):
        """Get work from parent"""
        self.queue.logger.info("Pulling work from %s" %
                               self.queue.parent_queue.queueUrl)
        work = 0

        myThread = threading.currentThread()

        try:
            if self.passRetrieveCondition():
                work = self.queue.pullWork()
                myThread.logdbClient.delete("LocalWorkQueue_pullWork",
                                            "warning",
                                            this_thread=True)
            else:
                msg = "Workqueue didn't pass the retrieve condition: NOT pulling work"
                self.queue.logger.warning(msg)
                myThread.logdbClient.post("LocalWorkQueue_pullWork", msg,
                                          "warning")
        except IOError as ex:
            self.queue.logger.error(
                "Error opening connection to work queue: %s \n%s" %
                (str(ex), traceback.format_exc()))
        except Exception as ex:
            self.queue.logger.error(
                "Unable to pull work from parent Error: %s\n%s" %
                (str(ex), traceback.format_exc()))
        self.queue.logger.info("Obtained %s unit(s) of work" % work)
        return work

    def processWork(self):
        """Process new work"""
        self.queue.logger.info("Splitting new work")
        try:
            self.queue.processInboundWork()
        except Exception as ex:
            self.queue.logger.exception('Error during split: %s' % str(ex))
        self.logger.info('Splitting finished')
        return
Example #15
class WorkQueueManagerWorkPoller(BaseWorkerThread):
    """
    Polls for Work
    """
    def __init__(self, queue, config):
        """
        Initialise class members
        """
        BaseWorkerThread.__init__(self)
        myThread = threading.currentThread()

        self.queue = queue
        self.config = config
        self.condorAPI = PyCondorAPI()

        self.daoFactory = DAOFactory(package="WMCore.WMBS", logger=logging, dbinterface=myThread.dbi)
        self.listSubsWithoutJobs = self.daoFactory(classname="Subscriptions.GetSubsWithoutJobGroup")


    def setup(self, parameters):
        """
        Called at startup - introduce random delay
             to avoid workers all starting at once
        """
        t = random.randrange(self.idleTime)
        self.logger.info('Sleeping for %d seconds before 1st loop' % t)
        time.sleep(t)

    @timeFunction
    def algorithm(self, parameters):
        """
        Pull in work
            """
        try:
            self.pullWork()
        except Exception as ex:
            self.queue.logger.error("Error in work pull loop: %s" % str(ex))
        try:
            # process if we get work or not - we may have to split old work
            # i.e. if transient errors were seen during splitting
            self.processWork()
        except Exception as ex:
            self.queue.logger.error("Error in new work split loop: %s" % str(ex))
        return

    def passRetrieveCondition(self):
        """
        _passRetrieveCondition_
        Return "OK" if the component can proceed with fetching work,
        otherwise a reason string explaining why pulling is skipped this cycle.

        For now, it only checks whether the agent is in drain mode,
        MAX_JOBS_PER_OWNER is reached, or the condor schedd is overloaded.
        """

        passCond = "OK"
        myThread = threading.currentThread()
        if isDrainMode(self.config):
            passCond = "No work will be pulled: Agent is in drain"
        elif availableScheddSlots(myThread.dbi) <= 0:
            passCond = "No work will be pulled: schedd slot is maxed: MAX_JOBS_PER_OWNER"
        elif self.condorAPI.isScheddOverloaded():
            passCond = "No work will be pulled: schedd is overloaded"
        else:
            subscriptions = self.listSubsWithoutJobs.execute()
            if subscriptions:
                passCond = "No work will be pulled: "
                passCond += "JobCreator hasn't created jobs for subscriptions %s" % subscriptions

        return passCond

    def pullWork(self):
        """Get work from parent"""
        self.queue.logger.info("Pulling work from %s" % self.queue.parent_queue.queueUrl)
        work = 0

        myThread = threading.currentThread()

        try:
            cond = self.passRetrieveCondition()
            if cond == "OK":
                work = self.queue.pullWork()
                myThread.logdbClient.delete("LocalWorkQueue_pullWork", "warning", this_thread=True)
            else:
                self.queue.logger.warning(cond)
                myThread.logdbClient.post("LocalWorkQueue_pullWork", cond, "warning")
        except IOError as ex:
            self.queue.logger.error("Error opening connection to work queue: %s \n%s" %
                                    (str(ex), traceback.format_exc()))
        except Exception as ex:
            self.queue.logger.error("Unable to pull work from parent Error: %s\n%s"
                                    % (str(ex), traceback.format_exc()))
        self.queue.logger.info("Obtained %s unit(s) of work" % work)
        return work

    def processWork(self):
        """Process new work"""
        self.queue.logger.info("Splitting new work")
        try:
            self.queue.processInboundWork()
        except Exception as ex:
            self.queue.logger.exception('Error during split: %s' % str(ex))
        self.logger.info('Splitting finished')
        return
Example #16
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI()
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = [
            'CondorPriority', 'NoJobRetries', 'EnableAllSites'
        ]

        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode
        """
        logging.info("Running agent drain algorithm...")
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(
            self.config.Agent.hostName)

        if isDrainMode(self.config):
            # check to see if the agent hit any speed drain thresholds
            thresholdsHit = self.checkSpeedDrainThresholds()
            if thresholdsHit:
                logging.info("Updating agent configuration for speed drain...")
                self.updateAgentSpeedDrainConfig(thresholdsHit)
            try:
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: " +
                             str(DrainStatusPoller.drainStats))

            except Exception as ex:
                msg = "Error occurred, will retry later:\n"
                msg += str(ex)
                logging.exception(msg)
        else:
            logging.info(
                "Agent not in drain mode. Resetting flags and skipping drain check..."
            )
            self.resetAgentSpeedDrainConfig()

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration
        """
        updateConfig = False
        condorPriorityFlag = False
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        if 'CondorPriority' in thresholdsHit:
            logging.info(
                "Bumping condor job priority to 999999 for Production/Processing pending jobs."
            )
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates whether
            # priority is increased or not. It is not checked by other components
            logging.info("Enabling CondorPriority flag.")
            speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info(
                "Enabling NoJobRetries flag: Error Handler won't retry the jobs"
            )
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info(
                "Enabling EnableAllSites flag: Updating agent to submit to all sites."
            )
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName,
                                            "SpeedDrainMode", True)
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName,
                                            "SpeedDrainConfig",
                                            speedDrainConfig)

        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False
        """

        if self.agentConfig.get("SpeedDrainMode"):
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName,
                                            "SpeedDrainMode", False)
            speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
            for key, v in speedDrainConfig.items():
                if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                    speedDrainConfig[key]['Enabled'] = False

            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName,
                                            "SpeedDrainConfig",
                                            speedDrainConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent configuration parameters
        that need to be updated for speed draining
        """
        enableKeys = []

        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        # get condor jobs
        jobs = self.condorAPI.getCondorJobs("", [])
        if jobs is None:
            logging.warning(
                "There was an error querying the schedd.  Not checking speed drain thresholds."
            )
            return []

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in speedDrainConfig.items():
            # make sure keys in the speed drain config are valid
            if k in self.validSpeedDrainConfigKeys and isinstance(
                    v['Threshold'], int) and isinstance(v['Enabled'], bool):
                # we always want to apply the condor priority change if the threshold is hit
                if not v['Enabled'] or k == 'CondorPriority':
                    logging.info("Checking speed drain threshold for %s. ", k)
                    if len(jobs) < v['Threshold']:
                        logging.info(
                            "Agent will update speed drain configuration for %s. ",
                            k)
                        enableKeys.append(k)
            else:
                logging.warning(
                    "Speed drain configuration error for %s.  Please check aux db contents.",
                    k)

        return enableKeys
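checkSpeedDrainThresholds() above only accepts entries whose 'Threshold' is an int and whose 'Enabled' is a bool, keyed by validSpeedDrainConfigKeys. A sketch of the SpeedDrainConfig document it expects from the aux DB; the threshold values are illustrative:

{'CondorPriority': {'Threshold': 500, 'Enabled': False},
 'NoJobRetries': {'Threshold': 200, 'Enabled': False},
 'EnableAllSites': {'Threshold': 200, 'Enabled': False}}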
Example #17
class DrainStatusAPI(object):
    """
    Provides methods for querying dbs and condor for drain statistics
    """
    def __init__(self, config):
        # queue url used in WorkQueueManager
        self.thisAgentUrl = "http://" + config.Agent.hostName + ":5984"
        self.globalBackend = WorkQueueBackend(config.WorkloadSummary.couchurl)
        self.localBackend = WorkQueueBackend(config.WorkQueueManager.couchurl)
        self.dbsUtil = DBSBufferUtil()
        self.condorAPI = PyCondorAPI()
        self.condorStates = ("Running", "Idle")

    def collectDrainInfo(self):
        """
        Call methods to check the drain status
        """
        results = {}
        results['workflows_completed'] = self.checkWorkflows()

        # if workflows are completed, collect additional drain statistics
        if results['workflows_completed']:
            results['upload_status'] = self.checkFileUploadStatus()
            results['condor_status'] = self.checkCondorStates()
            results['local_wq_status'] = self.checkLocalWQStatus(
                dbname="workqueue")
            results['local_wqinbox_status'] = self.checkLocalWQStatus(
                dbname="workqueue_inbox")
            results['global_wq_status'] = self.checkGlobalWQStatus()

        return results

    def checkWorkflows(self):
        """
        Check to see if all workflows have a 'completed' status
        """
        results = self.dbsUtil.isAllWorkflowCompleted()
        return results

    def checkCondorStates(self):
        """
        Check idle and running jobs in Condor
        """
        results = {}
        jobs = self.condorAPI.getCondorJobsSummary()
        for state in self.condorStates:
            # if the query failed, report None instead of a misleading job count
            if not jobs:
                results[state.lower()] = None
            else:
                results[state.lower()] = int(jobs[0].get(state))

        return results

    def checkFileUploadStatus(self):
        """
        Check file upload status:
            Blocks open in DBS
            Files not uploaded in DBS
            Files not uploaded to Phedex
        """
        results = {}
        results['dbs_open_blocks'] = self.dbsUtil.countOpenBlocks()
        results['dbs_notuploaded'] = self.dbsUtil.countFilesByStatus(
            status="NOTUPLOADED")
        results['phedex_notuploaded'] = self.dbsUtil.countPhedexNotUploaded()
        return results

    def checkLocalWQStatus(self, dbname):
        """
        Query local WorkQueue workqueue/workqueue_inbox database to see whether
        there are any active elements in this agent.
        """
        results = {}

        for st in ('Available', 'Negotiating', 'Acquired', 'Running'):
            if dbname == "workqueue":
                elements = self.localBackend.getElements(status=st,
                                                         returnIdOnly=True)
            else:
                elements = self.localBackend.getInboxElements(
                    status=st, returnIdOnly=True)
            results[st] = len(elements)
        return results

    def checkGlobalWQStatus(self):
        """
        Query Global WorkQueue workqueue database to see whether there are
        any active elements set to this agent.
        """
        results = {}

        for st in ("Acquired", "Running"):
            elements = self.globalBackend.getElements(
                status=st, returnIdOnly=True, ChildQueueUrl=self.thisAgentUrl)
            results[st] = len(elements)
        return results
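checkCondorStates() in this variant assumes getCondorJobsSummary() returns a one-record list mapping condor states to counts. A sketch of exercising it with a hypothetical stub (FakeCondorAPI is not part of WMCore):

class FakeCondorAPI(object):
    def getCondorJobsSummary(self):
        # shape inferred from checkCondorStates(): one record with per-state counts
        return [{"Running": 12, "Idle": 3}]

api = DrainStatusAPI.__new__(DrainStatusAPI)  # bypass __init__ for the sketch
api.condorAPI = FakeCondorAPI()
api.condorStates = ("Running", "Idle")
print(api.checkCondorStates())  # {'running': 12, 'idle': 3}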
Example #18
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.validSpeedDrainConfigKeys = ['CondorPriority', 'NoJobRetries', 'EnableAllSites']

        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode
        """
        logging.info("Running agent drain algorithm...")
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
        if not self.agentConfig:
            logging.error("Failed to fetch agent configuration from the auxiliary DB")
            return

        if isDrainMode(self.config):
            # check to see if the agent hit any speed drain thresholds
            thresholdsHit = self.checkSpeedDrainThresholds()
            if thresholdsHit:
                logging.info("Updating agent configuration for speed drain...")
                self.updateAgentSpeedDrainConfig(thresholdsHit)
            # now collect drain statistics
            try:
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: " + str(DrainStatusPoller.drainStats))

            except Exception as ex:
                msg = "Error occurred, will retry later:\n"
                msg += str(ex)
                logging.exception(msg)
        else:
            logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
            self.resetAgentSpeedDrainConfig()

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration
        """
        updateConfig = False
        condorPriorityFlag = False
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        if 'CondorPriority' in thresholdsHit:
            logging.info("Bumping condor job priority to 999999 for Production/Processing pending jobs.")
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates whether
            # priority is increased or not. It is not checked by other components
            logging.info("Enabling CondorPriority flag.")
            speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info("Enabling NoJobRetries flag: Error Handler won't retry the jobs")
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info("Enabling EnableAllSites flag: Updating agent to submit to all sites.")
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", True)
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)

        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False
        """

        if self.agentConfig.get("SpeedDrainMode"):
            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainMode", False)
            speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
            for key, v in speedDrainConfig.items():
                if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                    speedDrainConfig[key]['Enabled'] = False

            self.reqAuxDB.updateAgentConfig(self.config.Agent.hostName, "SpeedDrainConfig", speedDrainConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent configuration parameters
        that need to be updated for speed draining
        """
        enableKeys = []

        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        # get condor jobs
        jobs = self.condorAPI.getCondorJobs("", [])
        if jobs is None:
            logging.warning("There was an error querying the schedd.  Not checking speed drain thresholds.")
            return []

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in speedDrainConfig.items():
            # make sure keys in the speed drain config are valid
            if k in self.validSpeedDrainConfigKeys and isinstance(v['Threshold'], int) and isinstance(v['Enabled'], bool):
                # we always want to apply the condor priority change if the threshold is hit
                if not v['Enabled'] or k == 'CondorPriority':
                    logging.info("Checking speed drain threshold for %s. ", k)
                    if len(jobs) < v['Threshold']:
                        logging.info("Agent will update speed drain configuration for %s. ", k)
                        enableKeys.append(k)
            else:
                logging.warning("Speed drain configuration error for %s.  Please check aux db contents.", k)

        return enableKeys
Example #19
class DrainStatusPoller(BaseWorkerThread):
    """
    Collects information related to the agent drain status
    """
    # class variable that contains drain statistics
    drainStats = {}

    def __init__(self, config):
        """
        initialize properties specified from config
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.drainAPI = DrainStatusAPI(config)
        self.condorAPI = PyCondorAPI()
        self.agentConfig = {}
        self.previousConfig = {}
        self.validSpeedDrainConfigKeys = [
            'CondorPriority', 'NoJobRetries', 'EnableAllSites'
        ]
        self.reqAuxDB = ReqMgrAux(self.config.General.ReqMgr2ServiceURL)
        self.emailAlert = EmailAlert(config.EmailAlert.dictionary_())
        self.condorStates = ("Running", "Idle")

    @timeFunction
    def algorithm(self, parameters):
        """
        Update drainStats if agent is in drain mode
        """
        logging.info("Running agent drain algorithm...")
        if self.agentConfig:
            # make a copy of the previous agent aux db configuration to compare against later
            self.previousConfig = copy.deepcopy(self.agentConfig)
        # grab a new copy of the agent aux db configuration
        self.agentConfig = self.reqAuxDB.getWMAgentConfig(
            self.config.Agent.hostName)
        if not self.agentConfig:
            logging.error(
                "Failed to fetch agent configuration from the auxiliary DB")
            return

        try:
            # see if the agent is in drain mode
            if self.agentConfig["UserDrainMode"] or self.agentConfig[
                    "AgentDrainMode"]:
                # check to see if the agent hit any speed drain thresholds
                thresholdsHit = self.checkSpeedDrainThresholds()
                if thresholdsHit:
                    logging.info(
                        "Updating agent configuration for speed drain...")
                    self.updateAgentSpeedDrainConfig(thresholdsHit)
                # now collect drain statistics
                DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
                logging.info("Finished collecting agent drain status.")
                logging.info("Drain stats: %s",
                             str(DrainStatusPoller.drainStats))
            else:
                logging.info(
                    "Agent not in drain mode. Resetting flags and skipping drain check..."
                )
                self.resetAgentSpeedDrainConfig()

            # finally, check for any changes in drain status
            self.checkDrainStatusChanges()

        except Exception as ex:
            msg = "Error occurred, will retry later:\n"
            msg += str(ex)
            logging.exception(msg)

    @classmethod
    def getDrainInfo(cls):
        """
        Return drainStats class variable
        """
        return cls.drainStats

    def checkDrainStatusChanges(self):
        """
        Check to see if any drain statuses have changed in the auxiliary db
        If yes, send email notification and update local drain thread variables

        """
        message = ""
        drainStatusKeys = ['UserDrainMode', 'AgentDrainMode', 'SpeedDrainMode']

        if not self.previousConfig:
            return

        for key in drainStatusKeys:
            if self.previousConfig[key] != self.agentConfig[key]:
                message += "Agent had a drain status transition to %s = %s\n" % (
                    str(key), str(self.agentConfig[key]))

        if message:
            self.emailAlert.send(
                "DrainMode status change on " +
                getattr(self.config.Agent, "hostName"), message)
            logging.info("Drain mode status change: %s", message)

        return

    def updateAgentSpeedDrainConfig(self, thresholdsHit):
        """
        Takes a list of speed drain configuration keys and updates the agent configuration
        """
        updateConfig = False
        condorPriorityFlag = False
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        if 'CondorPriority' in thresholdsHit:
            logging.info(
                "Bumping condor job priority to 999999 for Production/Processing pending jobs."
            )
            self.condorAPI.editCondorJobs(
                "JobStatus=?=1 && (CMS_JobType =?= \"Production\" || CMS_JobType =?= \"Processing\")",
                "JobPrio", "999999")
            condorPriorityFlag = True

        if condorPriorityFlag != speedDrainConfig['CondorPriority']['Enabled']:
            # CondorPriority setting is irreversible so the flag only indicates whether
            # priority is increased or not. It is not checked by other components
            logging.info("Enabling CondorPriority flag.")
            speedDrainConfig['CondorPriority']['Enabled'] = condorPriorityFlag
            updateConfig = True

        if 'NoJobRetries' in thresholdsHit:
            logging.info(
                "Enabling NoJobRetries flag: Error Handler won't retry the jobs"
            )
            # ErrorHandler will pick this up and set max retries to 0
            speedDrainConfig['NoJobRetries']['Enabled'] = True
            updateConfig = True

        if 'EnableAllSites' in thresholdsHit:
            logging.info(
                "Enabling EnableAllSites flag: Updating agent to submit to all sites."
            )
            # setting this value to True makes JobSubmitterPoller ignore site status
            speedDrainConfig['EnableAllSites']['Enabled'] = True
            updateConfig = True

        # update the aux db speed drain config with any changes
        if updateConfig:
            self.agentConfig['SpeedDrainMode'] = True
            self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName,
                                              self.agentConfig)

        return

    def resetAgentSpeedDrainConfig(self):
        """
        resetting SpeedDrainMode to False and SpeedDrainConfig Enabled to False
        """

        if self.agentConfig.get("SpeedDrainMode"):
            self.agentConfig['SpeedDrainMode'] = False
            speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")
            for key, v in viewitems(speedDrainConfig):
                if key in self.validSpeedDrainConfigKeys and v['Enabled']:
                    speedDrainConfig[key]['Enabled'] = False

            self.reqAuxDB.updateWMAgentConfig(self.config.Agent.hostName,
                                              self.agentConfig)
        return

    def checkSpeedDrainThresholds(self):
        """
        Check the current number of jobs in Condor and create a list of agent configuration parameters
        that need to be updated for speed draining
        """
        enableKeys = []
        # first, update our summary of condor jobs
        totalJobs = self.getTotalCondorJobs()
        if totalJobs is None:
            msg = "Cannot check speed drain because there was an error fetching job summary from HTCondor."
            msg += " Will retry again in the next cycle."
            logging.warning(msg)
            return []

        # get the current speed drain status
        speedDrainConfig = self.agentConfig.get("SpeedDrainConfig")

        # loop through the speed drain configuration and make a list of what thresholds have been hit
        for k, v in viewitems(speedDrainConfig):
            # make sure keys in the speed drain config are valid
            if k in self.validSpeedDrainConfigKeys and isinstance(
                    v['Threshold'], int) and isinstance(v['Enabled'], bool):
                # we always want to apply the condor priority change if the threshold is hit
                if not v['Enabled'] or k == 'CondorPriority':
                    logging.info("Checking speed drain threshold for %s. ", k)
                    if totalJobs < v['Threshold']:
                        logging.info(
                            "Agent will update speed drain configuration for %s. ",
                            k)
                        enableKeys.append(k)
            else:
                logging.warning(
                    "Speed drain configuration error for %s.  Please check aux db contents.",
                    k)

        return enableKeys

    def getTotalCondorJobs(self):
        """
        Retrieve a summary of the jobs in condor and return an absolute number
        of the jobs in Idle and Running states.
        :return: returns an integer with the total number of jobs, or None if it failed.
        """
        jobs = self.condorAPI.getCondorJobsSummary()
        if not jobs:
            return None

        results = 0
        for state in self.condorStates:
            results += int(jobs[0].get(state))
        return results
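getTotalCondorJobs() folds the same one-record condor summary used in Example #17 into a single integer; the arithmetic reduces to the following (the summary record is illustrative):

jobs = [{"Running": 12, "Idle": 3}]
total = sum(int(jobs[0].get(state)) for state in ("Running", "Idle"))
assert total == 15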