class HeartbeatMonitorBase(CherryPyPeriodicTask):
    """
    Periodic task which uploads the status of a list of threads to WMStats.
    Applications can attach extra monitoring data to that upload by
    overriding addAdditionalMonitorReport.
    """

    def __init__(self, rest, config):
        """
        :param rest: not used by this class
        :param config: configuration object; it is handed over to the parent class
            and its wmstats_url and thread_list attributes are read here
        """
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list

    def setConcurrentTasks(self, config):
        """
        Registers reportToWMStats as the only concurrent task, using
        config.heartbeatCheckDuration as its 'duration'
        """
        heartbeatTask = dict(func=self.reportToWMStats, duration=config.heartbeatCheckDuration)
        self.concurrentTasks = [heartbeatTask]

    def reportToWMStats(self, config):
        """
        Uploads the thread status report to WMStats, merged with whatever
        addAdditionalMonitorReport returns.
        self.logger and self.logDB are not set in this class (presumably
        provided by the parent class - to be confirmed).
        :param config: configuration object; log_reporter is read from it
        """
        self.logger.info("Checking Thread status...")
        report = self.logDB.wmstats_down_components_report(self.threadList)
        # keys of the additional report overwrite the ones from the thread report
        report.update(self.addAdditionalMonitorReport(config))
        self.centralWMStats.updateAgentInfo(convertToServiceCouchDoc(report, config.log_reporter))

        self.logger.info("Uploaded to WMStats...")

    def addAdditionalMonitorReport(self, config):
        """
        Hook for the application specific monitoring information.
        Override it in the subclass (need to follow the format displayed in wmstats).
        :return: empty dictionary in this base implementation
        """
        return dict()
class HeartbeatMonitorBase(CherryPyPeriodicTask):
    """
    Periodic task reporting the status of the configured threads to WMStats.
    Subclasses add their own metrics through addAdditionalMonitorReport.
    """

    def __init__(self, rest, config):
        """
        :param rest: not used by this class
        :param config: configuration object, passed to the parent class;
            wmstats_url and thread_list are read from it
        """
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list

    def setConcurrentTasks(self, config):
        """
        Sets the list of concurrent tasks: a single entry pointing to
        reportToWMStats with config.heartbeatCheckDuration as 'duration'
        """
        self.concurrentTasks = [dict(func=self.reportToWMStats,
                                     duration=config.heartbeatCheckDuration)]

    def reportToWMStats(self, config):
        """
        Reports the thread status to WMStats. The dictionary returned by
        addAdditionalMonitorReport is merged into the report before the upload.
        :param config: configuration object; log_reporter is read from it
        """
        self.logger.info("Checking Thread status...")
        heartbeat = self.logDB.wmstats_down_components_report(self.threadList)
        extraInfo = self.addAdditionalMonitorReport(config)
        # additional keys take precedence over the ones of the thread report
        heartbeat.update(extraInfo)
        couchDoc = convertToServiceCouchDoc(heartbeat, config.log_reporter)
        self.centralWMStats.updateAgentInfo(couchDoc)

        self.logger.info("Uploaded to WMStats...")

    def addAdditionalMonitorReport(self, config):
        """
        Additional report to be sent along with the heartbeat report.
        Overwrite this method with the monitoring info of each application
        (need to follow the format displayed in wmstats).
        :return: an empty dictionary by default
        """
        return dict()
class AgentStatusPoller(BaseWorkerThread):
    """
    Gether the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        Store the configuration and create the helper objects used later on:
        agent info, proxy handler and local workqueue data service.
        :param config: agent configuration object
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        self.proxy = Proxy({'logger': logging.getLogger()})
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        # workqueue service for REST calls against the local queue
        self.workqueueDS = WorkQueueDS(config.AnalyticsDataCollector.localQueueURL)

    def setUpCouchDBReplication(self):
        """
        Build the list of replications required by this agent (kept in
        self.replicatorDocs), delete the old replicator documents and
        start every replication in continuous mode.
        """
        agentConfig = self.config
        self.replicatorDocs = []
        addReplication = self.replicatorDocs.append

        # replication common to every agent type: job summary -> central WMStats
        addReplication({'source': agentConfig.JobStateMachine.jobSummaryDBName,
                        'target': agentConfig.AnalyticsDataCollector.centralWMStatsURL,
                        'filter': "WMStatsAgent/repfilter"})

        # TODO: tier0 specific code - need to make it generic
        if hasattr(agentConfig, "Tier0Feeder"):
            addReplication({'source': agentConfig.Tier0Feeder.requestDBName,
                            'target': agentConfig.AnalyticsDataCollector.centralRequestDBURL,
                            'filter': "T0Request/repfilter"})
        else:
            # workqueue replication: parent queue -> local inbox and local inbox -> parent queue
            queueParams = agentConfig.WorkQueueManager.queueParams
            parentQueueUrl = queueParams["ParentQueueCouchUrl"]
            # the very same query parameters object is shared by both replications
            filterParams = {'childUrl': queueParams["QueueURL"],
                            'parentUrl': sanitizeURL(parentQueueUrl)['url']}
            inboxUrl = "%s_inbox" % agentConfig.AnalyticsDataCollector.localQueueURL
            addReplication({'source': sanitizeURL(parentQueueUrl)['url'],
                            'target': inboxUrl,
                            'filter': 'WorkQueue/queueFilter',
                            'query_params': filterParams})
            addReplication({'source': sanitizeURL(inboxUrl)['url'],
                            'target': parentQueueUrl,
                            'filter': 'WorkQueue/queueFilter',
                            'query_params': filterParams})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for replication in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(replication['source'],
                                                         replication['target'],
                                                         filter=replication['filter'],
                                                         query_params=replication.get('query_params', False),
                                                         continuous=True)

        # the first cycle needs to be skipped since the document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set the db connections (couchdb, wmbs) needed to gather information
        and set up the couchdb replications.
        :param parameters: not used by this method
        """
        # interface to WMBS/BossAir db. current_thread() is used instead of
        # currentThread(), a camelCase alias deprecated since Python 3.10
        myThread = threading.current_thread()
        # set wmagent db data, using the dbi and logger attached to the current thread
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)

        # the couch monitor must exist before the replication setup, which uses it
        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        Gather the agent, WMBS and (except for Tier0 agents) local workqueue
        information, upload it to the central WMStats and dump it to the
        local json file as well. Any exception is logged and swallowed.
        :param parameters: not used by this method
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            # the timed collectors are unpacked as: time spent, actual result, (ignored)
            wmbsSecs, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(wmbsSecs)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", wmbsSecs)

            # Tier0 Agent doesn't have LQ.
            isTier0 = hasattr(self.config, "Tier0Feeder")
            if not isTier0:
                wqSecs, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(wqSecs)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", wqSecs)

            self.uploadAgentInfoToCentralWMStats(agentInfo, int(time.time()))

            # save locally json file as well
            with open(self.jsonFile, 'w') as jsonHandle:
                json.dump(agentInfo, jsonHandle, indent=2)

        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from the local workqueue database
        :return: dict with the keys workByStatus, workByStatusAndPriority,
            uniqueJobsPerSite and possibleJobsPerSite
        """
        localQueue = self.workqueueDS
        results = {'workByStatus': localQueue.getJobsByStatus(),
                   'workByStatusAndPriority': localQueue.getJobsByStatusAndPriority()}

        # per site summary of the elements which are either Available or Acquired
        elements = localQueue.getElementsByStatus(['Available', 'Acquired'])
        siteSummary = getGlobalSiteStatusSummary(elements, dataLocality=True)
        results['uniqueJobsPerSite'], results['possibleJobsPerSite'] = siteSummary

        return results

    def collectCouchDBInfo(self):
        """
        Check the status of every replication listed in self.replicatorDocs.
        The check is skipped (and 'ok' reported) when skipReplicationCheck
        is set; the flag is then cleared for the next call.
        :return: dict with the keys name, status and error_message. When more
            than one replication fails, the last error message is the one kept.
        """
        summary = dict(name='CouchServer', status='ok', error_message="")

        if self.skipReplicationCheck:
            # skip the check this round and set it to False so it can be checked next round
            self.skipReplicationCheck = False
            return summary

        for replication in self.replicatorDocs:
            checkResult = self.localCouchMonitor.checkCouchServerStatus(replication['source'],
                                                                        replication['target'],
                                                                        checkUpdateSeq=False)
            if checkResult['status'] == 'ok':
                continue
            summary['status'] = 'error'
            summary['error_message'] = checkResult['error_message']

        return summary

    def collectAgentInfo(self):
        """
        Collects the general health information of the agent:
          1. component status, as reported by the wmagent database
          2. disk usage over the threshold
          3. drain mode and its statistics
          4. couchdb replication status
          5. number of couch processes
          6. time and error of the last data upload

        :return: a dict with all the info collected; its 'status' ends up
            as ok, warning or error according to the checks above
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        agentInfo['drain_mode'] = bool(isDrainMode(self.config))
        if agentInfo['drain_mode']:
            logging.info("Agent is in DrainMode")
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()

        # a broken replication is reported as a component down
        couchInfo = self.collectCouchDBInfo()
        couchStatus = couchInfo['status']
        if couchStatus != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchStatus
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning: number of processes if over the threshold, otherwise 0
        nCouchProcs = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", nCouchProcs)
        maxCouchProcs = self.config.AnalyticsDataCollector.couchProcessThreshold
        agentInfo['couch_process_warning'] = nCouchProcs if nCouchProcs >= maxCouchProcs else 0

        # This adds the last time and message when data was updated to agentInfo
        lastUpload = DataUploadTime.getInfo()
        for uploadKey in ('data_last_update', 'data_error'):
            if lastUpload[uploadKey]:
                agentInfo[uploadKey] = lastUpload[uploadKey]

        # drain mode and disk problems downgrade an 'ok' agent to 'warning'
        needsAttention = agentInfo['drain_mode'] or agentInfo['disk_warning']
        if agentInfo['status'] == 'ok' and needsAttention:
            agentInfo['status'] = "warning"

        # data upload errors and too many couch processes turn the status into 'error'
        if agentInfo['status'] in ('ok', 'warning') and (
                agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0)):
            agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Convert the agent information into couch documents and upload them
        directly to the remote database, to prevent data conflict when the
        agent is cleaned up and redeployed.
        :param agentInfo: dict with the agent information collected
        :param uploadTime: upload time passed on to the document conversion
        """
        self.centralWMStatsCouchDB.updateAgentInfo(
            convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime))

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches the job slot information and the agent monitoring data
        (WMBS and BossAir job counts) from the wmagent database.
        :return: dict with the site thresholds plus everything returned
            by getAgentMonitoring
        """
        logging.info("Getting wmbs job info ...")

        # first retrieve the site thresholds
        results = {'thresholds': self.wmagentDB.getJobSlotInfo()}
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        # every key below is expected to be present in the monitoring data
        debugReport = (
            ("Total number of jobs in WMBS sorted by status: %s", 'wmbsCountByState'),
            ("Total number of 'created' jobs in WMBS sorted by type: %s", 'wmbsCreatedTypeCount'),
            ("Total number of 'executing' jobs in WMBS sorted by type: %s", 'wmbsExecutingTypeCount'),
            ("Total number of active jobs in BossAir sorted by status: %s", 'activeRunJobByStatus'),
            ("Total number of complete jobs in BossAir sorted by status: %s", 'completeRunJobByStatus'),
            ("Available slots thresholds to pull work from GQ to LQ: %s", 'thresholdsGQ2LQ'),
            ("List of jobs pending for each site, sorted by priority: %s", 'sitePendCountByPrio'),
        )
        for message, metric in debugReport:
            logging.debug(message, results[metric])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the lifetime of the proxy file (self.proxyFile) and flag the
        agent when the proxy is about to expire:
          * 3 days or less left: status is set to error
          * 5 days or less left: status is set to warning, if it was ok
        In both cases a 'proxy_warning' message is added to the dictionary.
        :param agInfo: dictionary with agent monitoring information, updated in place
        :return: nothing
        """
        oneDay = 86400
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile, secsLeft)

        if secsLeft <= 3 * oneDay:
            agInfo['status'] = "error"
        elif secsLeft <= 5 * oneDay:
            # never downgrade a status which is already worse than ok
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            # proxy is valid for long enough, nothing to report
            return

        renewMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
        agInfo['proxy_warning'] = renewMsg + "Its time left is: %.2f hours." % (secsLeft / 3600.)

        return
# Example #4
# 0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gether the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """
    def __init__(self, config):
        """
        Store the configuration and create the helper objects used later on:
        agent info, credential handler and thresholds, AMQ settings and, for
        agents other than Tier0, the local workqueue data service.
        :param config: agent configuration object
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

        self.proxy = Proxy({'logger': logging.getLogger(), 'cleanEnvironment': True})
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
        self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
        # credential lifetime warning/error thresholds, in days
        self.credThresholds = {'proxy': {'error': 3, 'warning': 5},
                               'certificate': {'error': 10, 'warning': 20}}

        # Monitoring setup: every AMQ option is optional in the configuration
        watcherConfig = config.AgentStatusWatcher
        self.userAMQ = getattr(watcherConfig, "userAMQ", None)
        self.passAMQ = getattr(watcherConfig, "passAMQ", None)
        self.postToAMQ = getattr(watcherConfig, "enableAMQ", False)
        self.topicAMQ = getattr(watcherConfig, "topicAMQ", None)
        self.hostPortAMQ = getattr(watcherConfig, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

        # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
        self.isT0agent = hasattr(config, "Tier0Feeder")
        if self.isT0agent:
            self.producer = "tier0wmagent"
        else:
            self.producer = "wmagent"
            self.workqueueDS = WorkQueueDS(config.AnalyticsDataCollector.localQueueURL)

    def setUpCouchDBReplication(self):
        """
        Build the list of replications required by this agent (kept in
        self.replicatorDocs), delete the old replicator documents and
        start every replication in continuous mode.
        """
        agentConfig = self.config
        self.replicatorDocs = []
        addReplication = self.replicatorDocs.append

        # replication common to every agent type: job summary -> central WMStats
        addReplication({'source': agentConfig.JobStateMachine.jobSummaryDBName,
                        'target': agentConfig.General.centralWMStatsURL,
                        'filter': "WMStatsAgent/repfilter"})

        if self.isT0agent:
            addReplication({'source': agentConfig.Tier0Feeder.requestDBName,
                            'target': agentConfig.AnalyticsDataCollector.centralRequestDBURL,
                            'filter': "T0Request/repfilter"})
        else:
            # workqueue replication: parent queue -> local inbox and local inbox -> parent queue
            queueParams = agentConfig.WorkQueueManager.queueParams
            parentQueueUrl = queueParams["ParentQueueCouchUrl"]
            # the very same query parameters object is shared by both replications
            filterParams = {'childUrl': queueParams["QueueURL"],
                            'parentUrl': sanitizeURL(parentQueueUrl)['url']}
            inboxUrl = "%s_inbox" % agentConfig.AnalyticsDataCollector.localQueueURL
            addReplication({'source': sanitizeURL(parentQueueUrl)['url'],
                            'target': inboxUrl,
                            'filter': 'WorkQueue/queueFilter',
                            'query_params': filterParams})
            addReplication({'source': sanitizeURL(inboxUrl)['url'],
                            'target': parentQueueUrl,
                            'filter': 'WorkQueue/queueFilter',
                            'query_params': filterParams})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for replication in self.replicatorDocs:
            queryParams = replication.get('query_params', False)
            self.localCouchMonitor.couchServer.replicate(replication['source'], replication['target'],
                                                         filter=replication['filter'],
                                                         query_params=queryParams,
                                                         continuous=True)

        # the first cycle needs to be skipped since the document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set the db connections (couchdb, wmbs) needed to gather information
        and set up the couchdb replications.
        :param parameters: not used by this method
        """
        # interface to WMBS/BossAir db. current_thread() is used instead of
        # currentThread(), a camelCase alias deprecated since Python 3.10
        myThread = threading.current_thread()
        # set wmagent db data, using the dbi and logger attached to the current thread
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(
            self.config.General.centralWMStatsURL)

        # the couch monitor must exist before the replication setup, which uses it
        self.localCouchMonitor = CouchMonitor(
            self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    @timeFunction
    def algorithm(self, parameters):
        """
        Gather the agent, WMBS and (except for Tier0 agents) local workqueue
        information, upload it to the central WMStats and build the MonIT
        documents out of it. Any exception is logged and swallowed.
        :param parameters: not used by this method
        """
        try:
            agentInfo = self.collectAgentInfo()
            for credType in ("proxy", "certificate"):
                self.checkCredLifetime(agentInfo, credType)

            # the timed collectors are unpacked as: time spent, actual result, (ignored)
            wmbsSecs, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(wmbsSecs)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", wmbsSecs)

            if not self.isT0agent:
                wqSecs, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(wqSecs)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", wqSecs)

            # the upload adds the timestamp which is read when building the MonIT documents
            self.uploadAgentInfoToCentralWMStats(agentInfo)
            self.buildMonITDocs(agentInfo)

        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s", str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from the local workqueue database
        :return: dict with the keys workByStatus, workByStatusAndPriority,
            uniqueJobsPerSite and possibleJobsPerSite
        """
        localQueue = self.workqueueDS
        # states used both to fetch the elements and to summarize them per site
        wqStates = ['Available', 'Acquired']

        results = {'workByStatus': localQueue.getJobsByStatus(),
                   'workByStatusAndPriority': localQueue.getJobsByStatusAndPriority()}

        elements = localQueue.getElementsByStatus(wqStates)
        siteSummary = getGlobalSiteStatusSummary(elements, status=wqStates, dataLocality=True)
        results['uniqueJobsPerSite'], results['possibleJobsPerSite'] = siteSummary

        return results

    def collectCouchDBInfo(self):
        """
        Check the status of every replication listed in self.replicatorDocs.
        The check is skipped (and 'ok' reported) when skipReplicationCheck
        is set; the flag is then cleared for the next call.
        :return: dict with the keys name, status and error_message. When more
            than one replication fails, the last error message is the one kept.
        """
        summary = dict(name='CouchServer', status='ok', error_message="")

        if self.skipReplicationCheck:
            # skip the check this round and set it to False so it can be checked next round
            self.skipReplicationCheck = False
            return summary

        for replication in self.replicatorDocs:
            checkResult = self.localCouchMonitor.checkCouchServerStatus(replication['source'],
                                                                        replication['target'],
                                                                        checkUpdateSeq=False)
            if checkResult['status'] == 'ok':
                continue
            summary['status'] = 'error'
            summary['error_message'] = checkResult['error_message']

        return summary

    def collectAgentInfo(self):
        """
        Collects the general health information of the agent:
          1. component status, as reported by the wmagent database
          2. disk usage over the threshold
          3. drain mode and its statistics
          4. couchdb replication status
          5. number of couch processes

        :return: a dict with all the info collected; its 'status' ends up
            as ok, warning or error according to the checks above
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        agentInfo['drain_mode'] = bool(isDrainMode(self.config))
        if agentInfo['drain_mode']:
            logging.info("Agent is in DrainMode")
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()

        # a broken replication is reported as a component down
        couchInfo = self.collectCouchDBInfo()
        couchStatus = couchInfo['status']
        if couchStatus != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchStatus
            agentInfo['down_component_detail'].append(couchInfo)

        # Couch process warning: number of processes if over the threshold, otherwise 0
        nCouchProcs = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", nCouchProcs)
        maxCouchProcs = self.config.AnalyticsDataCollector.couchProcessThreshold
        agentInfo['couch_process_warning'] = nCouchProcs if nCouchProcs >= maxCouchProcs else 0

        # drain mode and disk problems downgrade an 'ok' agent to 'warning'
        needsAttention = agentInfo['drain_mode'] or agentInfo['disk_warning']
        if agentInfo['status'] == 'ok' and needsAttention:
            agentInfo['status'] = "warning"

        # a data_error (if any is present) and too many couch processes turn the status into 'error'
        if agentInfo['status'] in ('ok', 'warning') and (
                agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0)):
            agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo):
        """
        Complete the document with the fields required by WMStats (_id,
        timestamp and type) and upload it. An upload failure is only
        logged, it is never propagated to the caller.
        :param agentInfo: dict with agent stats to be posted to couchdb, updated in place
        """
        agentInfo.update({'_id': agentInfo["agent_url"],
                          'timestamp': int(time.time()),
                          'type': "agent_info"})
        # directly upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        keepProperties = ["data_last_update", "data_error"]
        try:
            self.centralWMStatsCouchDB.updateAgentInfo(agentInfo, propertiesToKeep=keepProperties)
        except Exception as ex:
            logging.error("Failed to upload agent statistics to WMStats. Error: %s", str(ex))

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetches the job slot information and the agent monitoring data
        (WMBS and BossAir job counts) from the wmagent database.
        :return: dict with the site thresholds plus everything returned
            by getAgentMonitoring
        """
        logging.info("Getting wmbs job info ...")

        # first retrieve the site thresholds
        results = {'thresholds': self.wmagentDB.getJobSlotInfo()}
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        # every key below is expected to be present in the monitoring data
        debugReport = (
            ("Total number of jobs in WMBS sorted by status: %s", 'wmbsCountByState'),
            ("Total number of 'created' jobs in WMBS sorted by type: %s", 'wmbsCreatedTypeCount'),
            ("Total number of 'executing' jobs in WMBS sorted by type: %s", 'wmbsExecutingTypeCount'),
            ("Total number of active jobs in BossAir sorted by status: %s", 'activeRunJobByStatus'),
            ("Total number of complete jobs in BossAir sorted by status: %s", 'completeRunJobByStatus'),
            ("Available slots thresholds to pull work from GQ to LQ: %s", 'thresholdsGQ2LQ'),
            ("List of jobs pending for each site, sorted by priority: %s", 'sitePendCountByPrio'),
        )
        for message, metric in debugReport:
            logging.debug(message, results[metric])

        return results

    def checkCredLifetime(self, agInfo, credType):
        """
        Check the lifetime of a credential (proxy file or user certificate)
        against the thresholds, in days, defined in self.credThresholds:
          * at or below the error threshold: status is set to error
          * at or below the warning threshold: status is set to warning, if it was ok
        In both cases the message is logged and appended to the
        'proxy_warning' key, which is shared by both credential types.
        :param agInfo: dictionary with agent monitoring information, updated in place
        :param credType: credential type, can be: "proxy" or "certificate"
        :return: nothing
        """
        if credType not in ("proxy", "certificate"):
            logging.error("Unknown credential type. Available options are: [proxy, certificate]")
            return

        if credType == "proxy":
            credFile = self.proxyFile
            secsLeft = self.proxy.getTimeLeft(proxy=credFile)
        else:
            credFile = self.userCertFile
            secsLeft = self.proxy.getUserCertTimeLeft(openSSL=True)

        logging.debug("%s '%s' lifetime is %d seconds", credType, credFile, secsLeft)

        # NOTE(review): under Python 2 without "from __future__ import division"
        # this is an integer division - confirm which interpreter runs this code
        daysLeft = secsLeft / (60 * 60 * 24)
        limits = self.credThresholds[credType]

        if daysLeft <= limits['error']:
            agInfo['status'] = "error"
        elif daysLeft <= limits['warning']:
            # never downgrade a status which is already worse than ok
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            # credential is valid for long enough, nothing to report
            return

        warnMsg = "Agent %s '%s' must be renewed ASAP. " % (credType, credFile)
        warnMsg = warnMsg + "Its time left is: %.2f hours;" % (secsLeft / 3600.)
        agInfo['proxy_warning'] = agInfo.get('proxy_warning', "") + warnMsg
        logging.warning(warnMsg)

        return

    def buildMonITDocs(self, dataStats):
        """
        Convert agent statistics into MonIT-friendly documents to be posted
        to AMQ/ES. Nothing is done unless posting to AMQ is enabled.
        The documents are created, in this order, by the internal builders for:
         * priority information
         * site information
         * work information
         * WMBS information
         * agent information
         * agent health information
         * summary information
        Note that the internal methods are popping some metrics out of dataStats
        :param dataStats: dictionary with the agent statistics; its agent_url
            and timestamp keys are passed on with the documents
        """
        if not self.postToAMQ:
            return

        logging.info("Preparing documents to be posted to AMQ/MonIT..")
        # the order matters, given that the builders consume metrics from dataStats
        docBuilders = (self._buildMonITPrioDocs,
                       self._buildMonITSitesDocs,
                       self._buildMonITWorkDocs,
                       self._buildMonITWMBSDocs,
                       self._buildMonITAgentDocs,
                       self._buildMonITHealthDocs,
                       self._buildMonITSummaryDocs)
        allDocs = []
        for buildDocs in docBuilders:
            allDocs.extend(buildDocs(dataStats))

        # and finally post them all to AMQ
        logging.info("Found %d documents to post to AMQ", len(allDocs))
        self.uploadToAMQ(allDocs, dataStats['agent_url'], dataStats['timestamp'])

    def _buildMonITPrioDocs(self, dataStats):
        """
        Uses the `sitePendCountByPrio` metric in order to build documents
        reporting the site name, job priority and amount of jobs within that
        priority.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_prio_info MonIT docs
        """
        docType = "wma_prio_info"
        prioDocs = []
        sitePendCountByPrio = dataStats['WMBS_INFO'].pop(
            'sitePendCountByPrio', [])

        for site, item in viewitems(sitePendCountByPrio):
            # it seems sites with no jobs are also always here as "Sitename": {0: 0}
            if list(item) == [0]:
                continue
            for prio, jobs in viewitems(item):
                prioDoc = {}
                prioDoc['site_name'] = site
                prioDoc['type'] = docType
                prioDoc['priority'] = prio
                prioDoc['job_count'] = jobs
                prioDocs.append(prioDoc)
        return prioDocs

    def _buildMonITSitesDocs(self, dataStats):
        """
        Uses the site thresholds and job information for each site in order
        to build a `site_info` document type for MonIT.

        The `thresholds` and `thresholdsGQ2LQ` entries are popped from
        `dataStats['WMBS_INFO']`; for non-T0 agents, `possibleJobsPerSite`
        and `uniqueJobsPerSite` are popped from `dataStats['LocalWQ_INFO']`.
        The `state` key is also moved out of each site's thresholds dict.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_site_info MonIT docs
        """
        docType = "wma_site_info"
        siteDocs = []
        thresholds = dataStats['WMBS_INFO'].pop('thresholds', {})
        thresholdsGQ2LQ = dataStats['WMBS_INFO'].pop('thresholdsGQ2LQ', {})
        if self.isT0agent:
            # the local workqueue section is not read for T0 agents
            possibleJobsPerSite = {}
            uniqueJobsPerSite = {}
        else:
            possibleJobsPerSite = dataStats['LocalWQ_INFO'].pop(
                'possibleJobsPerSite', {})
            uniqueJobsPerSite = dataStats['LocalWQ_INFO'].pop(
                'uniqueJobsPerSite', {})

        for site in sorted(thresholds):
            siteDoc = {}
            siteDoc['site_name'] = site
            siteDoc['type'] = docType
            siteDoc['thresholds'] = thresholds[site]
            # the site state is reported as a top level field of the document
            siteDoc['state'] = siteDoc['thresholds'].pop('state', 'Unknown')
            siteDoc['thresholdsGQ2LQ'] = thresholdsGQ2LQ.get(site, 0)

            for status in possibleJobsPerSite:
                # make sure these keys are always present in the documents
                jobKey = "possible_%s_jobs" % status.lower()
                elemKey = "num_%s_elem" % status.lower()
                uniJobKey = "unique_%s_jobs" % status.lower()
                siteDoc[jobKey], siteDoc[elemKey], siteDoc[uniJobKey] = 0, 0, 0

                possibleBySite = possibleJobsPerSite[status]
                if site in possibleBySite:
                    siteDoc[jobKey] = possibleBySite[site]['sum_jobs']
                    siteDoc[elemKey] = possibleBySite[site]['num_elem']

                # a status may be missing from the unique jobs metric (e.g.
                # when the whole metric is absent); keep the 0 default then
                uniqueBySite = uniqueJobsPerSite.get(status, {})
                if site in uniqueBySite:
                    siteDoc[uniJobKey] = uniqueBySite[site]['sum_jobs']

            siteDocs.append(siteDoc)

        return siteDocs

    def _buildMonITWorkDocs(self, dataStats):
        """
        Builds one `wma_work_info` document per local workqueue status,
        carrying the `num_elem` and `sum_jobs` values stored for that status
        (0 when a value is missing).

        T0 agents get an empty list and `dataStats` is left untouched;
        otherwise the `workByStatus` entry is popped from
        `dataStats['LocalWQ_INFO']`.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_work_info MonIT docs
        """
        if self.isT0agent:
            return []

        statusSummary = dataStats['LocalWQ_INFO'].pop('workByStatus', {})
        return [{'type': "wma_work_info",
                 'status': wqStatus,
                 'num_elem': summary.get('num_elem', 0),
                 'sum_jobs': summary.get('sum_jobs', 0)}
                for wqStatus, summary in viewitems(statusSummary)]

    def _buildMonITWMBSDocs(self, dataStats):
        """
        Using the WMBS data, builds documents to show the amount of work in
        'created' and 'executing' WMBS status.
        It also builds a document for every single wmbs_status in the database.

        The `wmbsCreatedTypeCount`, `wmbsExecutingTypeCount` and
        `wmbsCountByState` entries are popped from `dataStats['WMBS_INFO']`.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_wmbs_info and wma_wmbs_state_info docs
        """
        wmbsDocs = []
        wmbsInfo = dataStats['WMBS_INFO']

        docType = "wma_wmbs_info"
        wmbsCreatedTypeCount = wmbsInfo.pop('wmbsCreatedTypeCount', {})
        wmbsExecutingTypeCount = wmbsInfo.pop('wmbsExecutingTypeCount', {})
        for jobType in wmbsCreatedTypeCount:
            wmbsDocs.append({
                'type': docType,
                'job_type': jobType,
                'created_jobs': wmbsCreatedTypeCount[jobType],
                # a job type without an 'executing' entry counts as 0 jobs
                'executing_jobs': wmbsExecutingTypeCount.get(jobType, 0)
            })

        docType = "wma_wmbs_state_info"
        wmbsCountByState = wmbsInfo.pop('wmbsCountByState', {})
        for wmbsStatus in wmbsCountByState:
            wmbsDocs.append({
                'type': docType,
                'wmbs_status': wmbsStatus,
                'num_jobs': wmbsCountByState[wmbsStatus]
            })

        return wmbsDocs

    def _buildMonITAgentDocs(self, dataStats):
        """
        Uses the BossAir and WMBS table information in order to build a
        view of amount of jobs in different statuses.

        One document is built per status found in `activeRunJobByStatus`.
        Both `activeRunJobByStatus` and `completeRunJobByStatus` are popped
        from `dataStats['WMBS_INFO']`.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_agent_info MonIT docs
        """
        docType = "wma_agent_info"
        agentDocs = []
        activeRunJobByStatus = dataStats['WMBS_INFO'].pop(
            'activeRunJobByStatus', {})
        completeRunJobByStatus = dataStats['WMBS_INFO'].pop(
            'completeRunJobByStatus', {})
        for schedStatus in activeRunJobByStatus:
            agentDocs.append({
                'type': docType,
                'schedd_status': schedStatus,
                'active_jobs': activeRunJobByStatus[schedStatus],
                # a status without a 'complete' entry counts as 0 jobs
                'completed_jobs': completeRunJobByStatus.get(schedStatus, 0)
            })

        return agentDocs

    def _buildMonITHealthDocs(self, dataStats):
        """
        Builds one `wma_health_info` document per entry of the `workers`
        metric, copying the worker name, state, poll interval, last update
        time and cycle time into the document.

        The `workers` entry is popped from `dataStats`.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        workerRecords = dataStats.pop('workers', {})
        return [{'type': "wma_health_info",
                 'worker_name': record['name'],
                 'worker_state': record['state'],
                 'worker_poll': record['poll_interval'],
                 'worker_last_hb': record['last_updated'],
                 'worker_cycle_time': record['cycle_time']}
                for record in workerRecords]

    def _buildMonITSummaryDocs(self, dataStats):
        """
        Creates a single document with the very basic agent info used
        in the wmstats monitoring tab.

        The local workqueue query time is only added for non-T0 agents.
        Nothing is removed from `dataStats`.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: one-element list with the wma_summary_info MonIT doc
        """
        overview = {
            'type': "wma_summary_info",
            'agent_team': dataStats['agent_team'],
            'agent_version': dataStats['agent_version'],
            'agent_status': dataStats['status'],
        }
        if not self.isT0agent:
            localWQ = dataStats['LocalWQ_INFO']
            overview['wq_query_time'] = localWQ['total_query_time']
        wmbsInfo = dataStats['WMBS_INFO']
        overview['wmbs_query_time'] = wmbsInfo['total_query_time']
        overview['drain_mode'] = dataStats['drain_mode']
        overview['down_components'] = dataStats['down_components']
        return [overview]

    def uploadToAMQ(self, docs, agentUrl, timeS):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in the MonIT infrastructure.
        Every document gets an `agent_url` field (in place) before being
        wrapped in a notification. Any error raised while talking to
        StompAMQ is logged and not propagated.
        :param docs: list of documents/dicts to be posted
        :param agentUrl: value stored under `agent_url` in each document
        :param timeS: timestamp handed to StompAMQ as `ts` of each notification
        """
        if not docs:
            logging.info("There are no documents to send to AMQ")
            return

        # add mandatory information for every single document
        for entry in docs:
            entry['agent_url'] = agentUrl

        amqDocType = "cms_%s_info" % self.producer
        pending = []

        logging.debug("Sending the following data to AMQ %s", pformat(docs))
        try:
            amqClient = StompAMQ(username=self.userAMQ,
                                 password=self.passAMQ,
                                 producer=self.producer,
                                 topic=self.topicAMQ,
                                 validation_schema=None,
                                 host_and_ports=self.hostPortAMQ,
                                 logger=logging)

            for entry in docs:
                # only the first item of the returned triple is kept
                notif, _, _ = amqClient.make_notification(payload=entry,
                                                          docType=amqDocType,
                                                          ts=timeS,
                                                          dataSubfield="payload")
                pending.append(notif)

            notSent = amqClient.send(pending)
            total = len(pending)
            summary = "%i out of %i documents successfully sent to AMQ" % (
                total - len(notSent), total)
            logging.info(summary)
        except Exception as ex:
            logging.exception("Failed to send data to StompAMQ. Error %s",
                              str(ex))
# Example #5
# 0
class HeartbeatMonitorBase(CherryPyPeriodicTask):
    """
    Periodic task which reports thread status and heartbeat information
    to WMStats and which is able to post documents to AMQ.
    """

    def __init__(self, rest, config):
        """
        :param rest: not used by this class
        :param config: configuration object; `wmstats_url` and `thread_list`
            are mandatory, the AMQ related attributes are optional
        """
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list
        # optional AMQ settings, with their fallback when not configured
        self.userAMQ = getattr(config, "user_amq", None)
        self.passAMQ = getattr(config, "pass_amq", None)
        self.postToAMQ = getattr(config, "post_to_amq", False)
        self.topicAMQ = getattr(config, "topic_amq", None)
        self.hostPortAMQ = getattr(config, "host_port_amq", None)

    def setConcurrentTasks(self, config):
        """
        Registers reportToWMStats as the only concurrent task, to be run
        every `config.heartbeatCheckDuration`.
        """
        heartbeatTask = {'func': self.reportToWMStats,
                         'duration': config.heartbeatCheckDuration}
        self.concurrentTasks = [heartbeatTask]

    def reportToWMStats(self, config):
        """
        Reports thread status and heartbeat to WMStats.
        Additional monitoring information can be reported as well by
        overriding the addAdditionalMonitorReport method.
        """
        self.logger.info("Checking Thread status...")
        report = self.logDB.wmstats_down_components_report(self.threadList)
        extraInfo = self.addAdditionalMonitorReport(config)
        report.update(extraInfo)
        serviceDoc = convertToServiceCouchDoc(report, config.log_reporter)
        self.centralWMStats.updateAgentInfo(serviceDoc)

        self.logger.info("Uploaded to WMStats...")

    def addAdditionalMonitorReport(self, config):
        """
        Hook to add an additional report to the heartbeat report.
        Override it with each application's monitoring info (it needs to
        follow the format displayed in wmstats). Returns an empty dict here.
        """
        return {}

    def uploadToAMQ(self, docs, producer=None):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in elastic search.
        Any error raised while talking to StompAMQ is logged and not
        propagated.
        :param docs: list of documents/dicts to be posted
        :param producer: service name that's providing this info; falls back
            to self.producer, which is not set in this class
        """
        if not docs:
            self.logger.info("There are no documents to send to AMQ")
            return

        if not producer:
            producer = self.producer
        self.logger.debug("Sending the following data to AMQ %s", pformat(docs))
        # all the notifications share the same timestamp
        ts = int(time.time())

        try:
            amqClient = StompAMQ(username=self.userAMQ,
                                 password=self.passAMQ,
                                 producer=producer,
                                 topic=self.topicAMQ,
                                 host_and_ports=self.hostPortAMQ,
                                 logger=self.logger)

            pending = []
            for entry in docs:
                # self.docTypeAMQ is not set in this class either
                pending.append(amqClient.make_notification(payload=entry,
                                                           docType=self.docTypeAMQ,
                                                           ts=ts,
                                                           dataSubfield="payload"))

            notSent = amqClient.send(pending)
            self.logger.info("%i docs successfully sent to Stomp AMQ", len(pending) - len(notSent))
        except Exception as ex:
            self.logger.exception("Failed to send data to StompAMQ. Error %s", str(ex))
# Example #6
# 0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        initialize properties specified from config
        :param config: agent configuration; the AnalyticsDataCollector and
            AgentStatusWatcher sections are read here
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile
        # counter for deep agent monitoring. Every 15min (3 cycles of the component)
        self.monitorCounter = 0
        self.monitorInterval = getattr(config.AgentStatusWatcher,
                                       'monitorPollInterval', 3)

    def setUpCouchDBReplication(self):
        """
        Builds the list of replications needed by this agent (kept in
        self.replicatorDocs), deletes the old replicator documents and
        starts a continuous replication for each entry of the list.
        Uses self.localCouchMonitor, which is created in setup().
        """
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL

        self.replicatorDocs.append({
            'source': wmstatsSource,
            'target': wmstatsTarget,
            'filter': "WMStatsAgent/repfilter"
        })
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({
                'source': t0Source,
                'target': t0Target,
                'filter': "T0Request/repfilter"
            })
        else:  # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams[
                "ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {
                'childUrl': childURL,
                'parentUrl': sanitizeURL(parentQURL)['url']
            }
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            # parent queue -> local inbox
            self.replicatorDocs.append({
                'source': sanitizeURL(parentQURL)['url'],
                'target': localQInboxURL,
                'filter': wqfilter,
                'query_params': query_params
            })
            # local inbox -> parent queue
            self.replicatorDocs.append({
                'source': sanitizeURL(localQInboxURL)['url'],
                'target': parentQURL,
                'filter': wqfilter,
                'query_params': query_params
            })

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(
                rp['source'],
                rp['target'],
                filter=rp['filter'],
                query_params=rp.get('query_params', False),
                continuous=True)
        # First cycle needs to be skipped since document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        :param parameters: not used by this method
        """

        # interface to WMBS/BossAir db
        # current_thread() replaces the deprecated currentThread() alias
        myThread = threading.current_thread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(
            self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(
            self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch

        The agent info is uploaded on every call; the WMBS monitoring
        information is collected and dumped to self.jsonFile only when
        self.monitorCounter is a multiple of self.monitorInterval.
        Any exception is logged and swallowed.
        :param parameters: not used by this method
        """
        try:
            agentInfo = self.collectAgentInfo()
            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            if self.monitorCounter % self.monitorInterval == 0:
                monitoring = self.collectWMBSInfo()
                monitoring['components'] = agentInfo['down_components']
                monitoring['timestamp'] = int(time.time())
                with open(self.jsonFile, 'w') as outFile:
                    json.dump(monitoring, outFile, indent=2)
            # only reached when the cycle completed without errors
            self.monitorCounter += 1
        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())

    def collectCouchDBInfo(self):
        """
        Checks the status of every replication listed in
        self.replicatorDocs.
        The first call after setUpCouchDBReplication() skips the check and
        reports 'ok'.
        :return: dict with `name`, `status` ('ok' or 'error') and
            `error_message`; when several replications fail, the message
            of the last failing one is kept
        """
        couchInfo = {
            'name': 'CouchServer',
            'status': 'ok',
            'error_message': ""
        }

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(
                rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Monitors the general health of the agent, as:
          1. status of the agent processes
          2. status of the agent threads based on the database info
          3. couchdb active tasks and its replications
          4. check the disk usage
          5. check the number of couch processes

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append(couchInfo['name'])
            agentInfo['status'] = couchInfo['status']
            agentInfo['down_component_detail'].append(couchInfo)

        # Disk space warning
        diskUseList = diskUse()
        diskUseThreshold = float(
            self.config.AnalyticsDataCollector.diskUseThreshold)
        ignoredDisks = self.config.AnalyticsDataCollector.ignoreDisk
        agentInfo['disk_warning'] = []
        for disk in diskUseList:
            # the usage is read from a percentage string, '%' sign stripped
            usedPercent = float(disk['percent'].strip('%'))
            if usedPercent >= diskUseThreshold and disk['mounted'] not in ignoredDisks:
                agentInfo['disk_warning'].append(disk)

        # Couch process warning
        couchProc = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", couchProc)
        couchProcessThreshold = self.config.AnalyticsDataCollector.couchProcessThreshold
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo()
        if lastDataUpload['data_last_update']:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error']:
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode']
                                            or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get(
                    'couch_process_warning', 0):
                agentInfo['status'] = "error"

        if agentInfo['down_components']:
            logging.info("List of agent components down: %s",
                         agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Converts the agent information to couch documents and uploads them
        through self.centralWMStatsCouchDB.
        :param agentInfo: dict as returned by collectAgentInfo
        :param uploadTime: timestamp passed on to convertToAgentCouchDoc
        """
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC,
                                           uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)

    def collectWMBSInfo(self):
        """
        Fetches WMBS job information.
        In addition to WMBS, also collects RunJob info from BossAir
        :return: dict with the number of jobs in each status
        """
        results = {}
        logging.info("Getting wmbs job info ...")
        # first retrieve the site thresholds
        results['thresholds'] = self.wmagentDB.getJobSlotInfo()
        logging.info("Running and pending site thresholds: %s",
                     results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())
        logging.info("Total number of jobs in WMBS sorted by status: %s",
                     results['wmbsCountByState'])
        logging.info(
            "Total number of 'created' jobs in WMBS sorted by type: %s",
            results['wmbsCreatedTypeCount'])
        logging.info(
            "Total number of 'executing' jobs in WMBS sorted by type: %s",
            results['wmbsExecutingTypeCount'])

        logging.info(
            "Total number of active jobs in BossAir sorted by status: %s",
            results['activeRunJobByStatus'])
        logging.info(
            "Total number of complete jobs in BossAir sorted by status: %s",
            results['completeRunJobByStatus'])

        logging.info(
            "Available slots thresholds to pull work from GQ to LQ: %s",
            results['thresholdsGQ2LQ'])
        logging.info(
            "List of jobs pending for each site, sorted by priority: %s",
            results['sitePendCountByPrio'])

        return results
class AgentStatusPoller(BaseWorkerThread):
    """
    Gather the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """
    def __init__(self, config):
        """
        initialize properties specified from config
        :param config: agent configuration; the AnalyticsDataCollector
            section is read here
        """
        BaseWorkerThread.__init__(self)
        # set the workqueue service for REST call
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = (config.AnalyticsDataCollector.summaryLevel).lower()

    def setUpCouchDBReplication(self):
        """
        Builds the list of replications needed by this agent (kept in
        self.replicatorDocs), deletes the old replicator documents and
        starts a continuous replication for each entry of the list.
        Uses self.localCouchMonitor, which is created in setup().
        """
        self.replicatorDocs = []
        # set up common replication code
        wmstatsSource = self.config.JobStateMachine.jobSummaryDBName
        wmstatsTarget = self.config.AnalyticsDataCollector.centralWMStatsURL

        self.replicatorDocs.append({'source': wmstatsSource, 'target': wmstatsTarget,
                                    'filter': "WMStatsAgent/repfilter"})
        # TODO: tier0 specific code - need to make it generic
        if hasattr(self.config, "Tier0Feeder"):
            t0Source = self.config.Tier0Feeder.requestDBName
            t0Target = self.config.AnalyticsDataCollector.centralRequestDBURL
            self.replicatorDocs.append({'source': t0Source, 'target': t0Target,
                                        'filter': "T0Request/repfilter"})
        else:  # set up workqueue replication
            wqfilter = 'WorkQueue/queueFilter'
            parentQURL = self.config.WorkQueueManager.queueParams["ParentQueueCouchUrl"]
            childURL = self.config.WorkQueueManager.queueParams["QueueURL"]
            query_params = {'childUrl': childURL, 'parentUrl': sanitizeURL(parentQURL)['url']}
            localQInboxURL = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            # parent queue -> local inbox
            self.replicatorDocs.append({'source': sanitizeURL(parentQURL)['url'], 'target': localQInboxURL,
                                        'filter': wqfilter, 'query_params': query_params})
            # local inbox -> parent queue
            self.replicatorDocs.append({'source': sanitizeURL(localQInboxURL)['url'], 'target': parentQURL,
                                        'filter': wqfilter, 'query_params': query_params})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for rp in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(
                rp['source'], rp['target'], filter=rp['filter'],
                query_params=rp.get('query_params', False),
                continuous=True, useReplicator=True)
        # First cycle needs to be skipped since document is not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        set db connection(couchdb, wmbs) to prepare to gather information
        :param parameters: not used by this method
        """

        # interface to WMBS/BossAir db
        # current_thread() replaces the deprecated currentThread() alias
        myThread = threading.current_thread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        # when the Tier0Feeder section is configured, the local WMStats is
        # written instead of the central one
        if hasattr(self.config, "Tier0Feeder"):
            self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL,
                                                       appName="WMStatsAgent")
        else:
            self.centralWMStatsCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch

        Collects the agent info and uploads it; any exception is logged
        and swallowed.
        :param parameters: not used by this method
        """
        try:
            logging.info("Getting Agent info ...")
            agentInfo = self.collectAgentInfo()

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())

            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            logging.info("Agent components down:\n %s", agentInfo['down_components'])
            logging.info("Agent in drain mode:\n %s \nsleep for next WMStats alarm updating cycle",
                         agentInfo['drain_mode'])

        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s", traceback.format_exc())

    def collectCouchDBInfo(self):
        """
        Checks the status of every replication listed in
        self.replicatorDocs.
        The first call after setUpCouchDBReplication() skips the check and
        reports 'ok'.
        :return: dict with `status` ('ok' or 'error') and `error_message`;
            when several replications fail, the message of the last
            failing one is kept
        """
        couchInfo = {'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'],
                                                                  rp['target'],
                                                                  checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                # propagate the failure reason instead of always reporting ""
                couchInfo['error_message'] = cInfo.get('error_message', "")

        return couchInfo

    def collectAgentInfo(self):
        """
        Collects the component status, drain mode, couch replication
        status, disk usage, number of couch processes and last data upload
        information, and derives the overall agent status from them.
        :return: a dict with all the info collected
        """
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['status'] = "warning"
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()

        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append("CouchServer")
            agentInfo['status'] = couchInfo['status']
            couchInfo['name'] = "CouchServer"
            agentInfo['down_component_detail'].append(couchInfo)

        # Disk space warning
        diskUseList = diskUse()
        diskUseThreshold = float(self.config.AnalyticsDataCollector.diskUseThreshold)
        ignoredDisks = self.config.AnalyticsDataCollector.ignoreDisk
        agentInfo['disk_warning'] = []
        for disk in diskUseList:
            # the usage is read from a percentage string, '%' sign stripped
            usedPercent = float(disk['percent'].strip('%'))
            if usedPercent >= diskUseThreshold and disk['mounted'] not in ignoredDisks:
                agentInfo['disk_warning'].append(disk)

        # Couch process warning
        couchProc = numberCouchProcess()
        couchProcessThreshold = float(self.config.AnalyticsDataCollector.couchProcessThreshold)
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        # NOTE(review): getInfo is called on the class with this poller
        # passed explicitly - confirm against DataUploadTime's signature
        lastDataUpload = DataUploadTime.getInfo(self)
        if lastDataUpload['data_last_update'] != 0:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error'] != "":
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok':
            if agentInfo['disk_warning'] != []:
                agentInfo['status'] = "warning"

        if agentInfo['status'] == 'ok' or agentInfo['status'] == 'warning':
            if ('data_error' in agentInfo and agentInfo['data_error'] != 'ok') or \
               ('couch_process_warning' in agentInfo and agentInfo['couch_process_warning'] != 0):
                agentInfo['status'] = "error"

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Converts the agent information to couch documents and uploads them
        through self.centralWMStatsCouchDB.
        :param agentInfo: dict as returned by collectAgentInfo
        :param uploadTime: timestamp passed on to convertToAgentCouchDoc
        """
        # direct data upload to the remote to prevent data conflict when agent is cleaned up and redeployed
        agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        self.centralWMStatsCouchDB.updateAgentInfo(agentDocs)
# Example #8
# 0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gether the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """
    def __init__(self, config):
        """
        Store the configuration and the properties derived from it.

        :param config: agent configuration; AnalyticsDataCollector.summaryLevel
            is read from it and stored lower-cased
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        # base agent info (campaign, user, owner, ...) built by initAgentInfo
        self.agentInfo = initAgentInfo(self.config)
        rawLevel = config.AnalyticsDataCollector.summaryLevel
        self.summaryLevel = rawLevel.lower()

    def setUpCouchDBReplication(self):
        """
        Build the list of replication documents and start the replications.

        The wmstats replication is always defined. When the configuration has a
        Tier0Feeder section the Tier0 request database is replicated as well;
        otherwise the workqueue is replicated in both directions (parent queue
        to local inbox and back). Existing replicator documents are deleted
        before the continuous replications are started.
        """
        self.replicatorDocs = []
        register = self.replicatorDocs.append
        cfg = self.config

        # replication common to every agent: job summary db -> central wmstats
        register(dict(source=cfg.JobStateMachine.jobSummaryDBName,
                      target=cfg.AnalyticsDataCollector.centralWMStatsURL,
                      filter="WMStatsAgent/repfilter"))

        # TODO: tier0 specific code - need to make it generic
        if hasattr(cfg, "Tier0Feeder"):
            register(dict(source=cfg.Tier0Feeder.requestDBName,
                          target=cfg.AnalyticsDataCollector.centralRequestDBURL,
                          filter="T0Request/repfilter"))
        else:
            # workqueue replication, in both directions
            queueParams = cfg.WorkQueueManager.queueParams
            parentQURL = queueParams["ParentQueueCouchUrl"]
            filterArgs = {
                'childUrl': queueParams["QueueURL"],
                'parentUrl': sanitizeURL(parentQURL)['url'],
            }
            inboxURL = "%s_inbox" % cfg.AnalyticsDataCollector.localQueueURL
            # parent queue -> local inbox
            register(dict(source=sanitizeURL(parentQURL)['url'],
                          target=inboxURL,
                          filter='WorkQueue/queueFilter',
                          query_params=filterArgs))
            # local inbox -> parent queue
            register(dict(source=sanitizeURL(inboxURL)['url'],
                          target=parentQURL,
                          filter='WorkQueue/queueFilter',
                          query_params=filterArgs))

        # delete old replicator docs before setting up the new ones
        couchMonitor = self.localCouchMonitor
        couchMonitor.deleteReplicatorDocs()

        for doc in self.replicatorDocs:
            couchMonitor.couchServer.replicate(doc['source'], doc['target'],
                                               filter=doc['filter'],
                                               query_params=doc.get('query_params', False),
                                               continuous=True)

        # the first cycle needs to be skipped since the documents are not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set up the database connections (couchdb, wmbs) used to gather information.

        When the configuration has a Tier0Feeder section, the WMStats writer is
        created on localWMStatsURL with the "WMStatsAgent" app name; otherwise it
        is created on centralWMStatsURL. The couch replications are set up at
        the end.

        :param parameters: not used by this method
        """
        # interface to WMBS/BossAir db; current_thread() is used instead of the
        # deprecated camelCase alias currentThread()
        myThread = threading.current_thread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        analyticsCfg = self.config.AnalyticsDataCollector
        if hasattr(self.config, "Tier0Feeder"):
            self.centralWMStatsCouchDB = WMStatsWriter(analyticsCfg.localWMStatsURL,
                                                       appName="WMStatsAgent")
        else:
            self.centralWMStatsCouchDB = WMStatsWriter(analyticsCfg.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            logging.info("Getting Agent info ...")
            agentInfo = self.collectAgentInfo()

            #set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())

            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            logging.info("Agent components down:\n %s" %
                         agentInfo['down_components'])
            logging.info(
                "Agent in drain mode:\n %s \nsleep for next WMStats alarm updating cycle"
                % agentInfo['drain_mode'])

        except Exception as ex:
            logging.error("Error occurred, will retry later:")
            logging.error(str(ex))
            logging.error("Trace back: \n%s" % traceback.format_exc())

    def collectCouchDBInfo(self):

        couchInfo = {'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        msg = ""
        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(
                rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'

        couchInfo['error_message'] = msg
        return couchInfo

    def collectAgentInfo(self):
        """
        Collect the general health information of the agent.

        Gathers the component status, the drain mode, the couch replication
        status, the disk usage warnings, the number of couch processes and the
        last data upload info, and derives the overall agent status from them.

        :return: dict with all the information collected
        """
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            # only raise 'ok' to 'warning': any other status reported by the
            # component check must not be masked by the drain mode warning
            if agentInfo['status'] == 'ok':
                agentInfo['status'] = "warning"
        else:
            agentInfo['drain_mode'] = False

        couchInfo = self.collectCouchDBInfo()
        if couchInfo['status'] != 'ok':
            agentInfo['down_components'].append("CouchServer")
            agentInfo['status'] = couchInfo['status']
            couchInfo['name'] = "CouchServer"
            agentInfo['down_component_detail'].append(couchInfo)

        analyticsCfg = self.config.AnalyticsDataCollector

        # Disk space warning: disks at or above the threshold, unless ignored
        diskUseList = diskUse()
        diskUseThreshold = float(analyticsCfg.diskUseThreshold)
        agentInfo['disk_warning'] = [
            disk for disk in diskUseList
            if float(disk['percent'].strip('%')) >= diskUseThreshold
            and disk['mounted'] not in analyticsCfg.ignoreDisk
        ]

        # Couch process warning: 0 means the threshold has not been reached
        couchProc = numberCouchProcess()
        couchProcessThreshold = float(analyticsCfg.couchProcessThreshold)
        if couchProc >= couchProcessThreshold:
            agentInfo['couch_process_warning'] = couchProc
        else:
            agentInfo['couch_process_warning'] = 0

        # This adds the last time and message when data was updated to agentInfo
        lastDataUpload = DataUploadTime.getInfo(self)
        if lastDataUpload['data_last_update'] != 0:
            agentInfo['data_last_update'] = lastDataUpload['data_last_update']
        if lastDataUpload['data_error'] != "":
            agentInfo['data_error'] = lastDataUpload['data_error']

        # Change status if there is data_error, couch process maxed out or disk full problems.
        if agentInfo['status'] == 'ok' and agentInfo['disk_warning']:
            agentInfo['status'] = "warning"

        if agentInfo['status'] in ('ok', 'warning'):
            hasDataError = agentInfo.get('data_error', 'ok') != 'ok'
            if hasDataError or agentInfo['couch_process_warning'] != 0:
                agentInfo['status'] = "error"

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Convert the agent info into couch document(s) and upload them.

        The data is uploaded directly to the remote database to prevent data
        conflicts when the agent is cleaned up and redeployed.

        :param agentInfo: agent monitoring information to be converted
        :param uploadTime: upload timestamp handed to convertToAgentCouchDoc
        """
        couchDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        statsWriter = self.centralWMStatsCouchDB
        statsWriter.updateAgentInfo(couchDocs)
Beispiel #9
0
class HeartbeatMonitorBase(CherryPyPeriodicTask):
    def __init__(self, rest, config):
        """
        Create the central WMStats writer and read the AMQ settings from config.

        :param rest: not used by this constructor
        :param config: configuration object providing wmstats_url and
            thread_list; the AMQ related attributes are optional
        """
        super(HeartbeatMonitorBase, self).__init__(config)
        self.centralWMStats = WMStatsWriter(config.wmstats_url)
        self.threadList = config.thread_list
        # (attribute set on self, attribute read from config, default if missing)
        amqSettings = (
            ("userAMQ", "user_amq", None),
            ("passAMQ", "pass_amq", None),
            ("postToAMQ", "post_to_amq", False),
            ("topicAMQ", "topic_amq", None),
            ("hostPortAMQ", "host_port_amq", None),
        )
        for attrName, cfgName, default in amqSettings:
            setattr(self, attrName, getattr(config, cfgName, default))

    def setConcurrentTasks(self, config):
        """
        sets the list of function reference for concurrent tasks
        """
        self.concurrentTasks = [{
            'func': self.reportToWMStats,
            'duration': config.heartbeatCheckDuration
        }]

    def reportToWMStats(self, config):
        """
        Report the thread status and heartbeat to central WMStats.

        Additional monitoring information can be reported by overriding the
        addAdditionalMonitorReport method.

        :param config: configuration object providing log_reporter
        """
        self.logger.info("Checking Thread status...")
        report = self.logDB.wmstats_down_components_report(self.threadList)
        extraInfo = self.addAdditionalMonitorReport(config)
        report.update(extraInfo)
        serviceDoc = convertToServiceCouchDoc(report, config.log_reporter)
        self.centralWMStats.updateAgentInfo(serviceDoc)

        self.logger.info("Uploaded to WMStats...")

    def addAdditionalMonitorReport(self, config):
        """
        Hook for adding extra monitoring information to the heartbeat report.

        Meant to be overridden with each application's monitoring info, which
        needs to follow the format displayed in wmstats. This base
        implementation adds nothing.

        :param config: not used by the base implementation
        :return: an empty dict
        """
        extraReport = dict()
        return extraReport

    def uploadToAMQ(self, docs, producer=None):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in elastic search.
        :param docs: list of documents/dicts to be posted
        :param producer: service name that's providing this info
        """
        if not docs:
            self.logger.info("There are no documents to send to AMQ")
            return

        producer = producer or self.producer
        self.logger.debug("Sending the following data to AMQ %s",
                          pformat(docs))
        ts = int(time.time())

        try:
            stompSvc = StompAMQ(username=self.userAMQ,
                                password=self.passAMQ,
                                producer=producer,
                                topic=self.topicAMQ,
                                host_and_ports=self.hostPortAMQ,
                                logger=self.logger)

            notifications = stompSvc.make_notification(payload=docs,
                                                       docType=self.docTypeAMQ,
                                                       docId=producer,
                                                       ts=ts)

            failures = stompSvc.send(notifications)
            self.logger.info("%i docs successfully sent to Stomp AMQ",
                             len(notifications) - len(failures))
        except Exception as ex:
            self.logger.exception("Failed to send data to StompAMQ. Error %s",
                                  str(ex))

        return
Beispiel #10
0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gether the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """
    def __init__(self, config):
        """
        Store the configuration and create the helper objects used for polling.

        :param config: agent configuration; the AnalyticsDataCollector and
            AgentStatusWatcher sections are read from it
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        # base agent info (campaign, user, owner, ...) built by initAgentInfo
        self.agentInfo = initAgentInfo(self.config)

        analyticsCfg = config.AnalyticsDataCollector
        self.summaryLevel = analyticsCfg.summaryLevel
        self.jsonFile = config.AgentStatusWatcher.jsonFile

        self.proxy = Proxy(dict(logger=logging.getLogger()))
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY

        # data service for the local workqueue
        self.workqueueDS = WorkQueueDS(analyticsCfg.localQueueURL)

    def setUpCouchDBReplication(self):
        """
        Build the list of replication documents and start the replications.

        The wmstats replication is always defined. When the configuration has a
        Tier0Feeder section the Tier0 request database is replicated as well;
        otherwise the workqueue is replicated in both directions (parent queue
        to local inbox and back). Existing replicator documents are deleted
        before the continuous replications are started.
        """
        self.replicatorDocs = []
        register = self.replicatorDocs.append
        cfg = self.config

        # replication common to every agent: job summary db -> central wmstats
        register(dict(source=cfg.JobStateMachine.jobSummaryDBName,
                      target=cfg.AnalyticsDataCollector.centralWMStatsURL,
                      filter="WMStatsAgent/repfilter"))

        # TODO: tier0 specific code - need to make it generic
        if hasattr(cfg, "Tier0Feeder"):
            register(dict(source=cfg.Tier0Feeder.requestDBName,
                          target=cfg.AnalyticsDataCollector.centralRequestDBURL,
                          filter="T0Request/repfilter"))
        else:
            # workqueue replication, in both directions
            queueParams = cfg.WorkQueueManager.queueParams
            parentQURL = queueParams["ParentQueueCouchUrl"]
            filterArgs = {
                'childUrl': queueParams["QueueURL"],
                'parentUrl': sanitizeURL(parentQURL)['url'],
            }
            inboxURL = "%s_inbox" % cfg.AnalyticsDataCollector.localQueueURL
            # parent queue -> local inbox
            register(dict(source=sanitizeURL(parentQURL)['url'],
                          target=inboxURL,
                          filter='WorkQueue/queueFilter',
                          query_params=filterArgs))
            # local inbox -> parent queue
            register(dict(source=sanitizeURL(inboxURL)['url'],
                          target=parentQURL,
                          filter='WorkQueue/queueFilter',
                          query_params=filterArgs))

        # delete old replicator docs before setting up the new ones
        couchMonitor = self.localCouchMonitor
        couchMonitor.deleteReplicatorDocs()

        for doc in self.replicatorDocs:
            couchMonitor.couchServer.replicate(doc['source'], doc['target'],
                                               filter=doc['filter'],
                                               query_params=doc.get('query_params', False),
                                               continuous=True)

        # the first cycle needs to be skipped since the documents are not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set up the database connections (couchdb, wmbs) used to gather information.

        Creates the WMBS/BossAir data interface, the central WMStats writer and
        the local couch monitor, then sets up the couch replications.

        :param parameters: not used by this method
        """
        # interface to WMBS/BossAir db; current_thread() is used instead of the
        # deprecated camelCase alias currentThread()
        myThread = threading.current_thread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi,
                                       myThread.logger)

        centralURL = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.centralWMStatsCouchDB = WMStatsWriter(centralURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        get information from wmbs, workqueue and local couch
        """
        try:
            agentInfo = self.collectAgentInfo()
            self.checkProxyLifetime(agentInfo)

            timeSpent, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(timeSpent)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", timeSpent)

            if not hasattr(self.config, "Tier0Feeder"):
                # Tier0 Agent doesn't have LQ.
                timeSpent, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(timeSpent)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs",
                             timeSpent)

            uploadTime = int(time.time())
            self.uploadAgentInfoToCentralWMStats(agentInfo, uploadTime)

            # save locally json file as well
            with open(self.jsonFile, 'w') as outFile:
                json.dump(agentInfo, outFile, indent=2)

        except Exception as ex:
            logging.exception("Error occurred, will retry later.\nDetails: %s",
                              str(ex))

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from the local workqueue database.

        :return: (before being wrapped by the timeFunction decorator) a dict
            with the work by status, the work by status and priority, and the
            unique/possible jobs per site computed from the elements in
            Available and Acquired status
        """
        wqService = self.workqueueDS
        results = {}
        results['workByStatus'] = wqService.getJobsByStatus()
        results['workByStatusAndPriority'] = wqService.getJobsByStatusAndPriority()

        activeElements = wqService.getElementsByStatus(['Available', 'Acquired'])
        siteSummary = getGlobalSiteStatusSummary(activeElements, dataLocality=True)
        results['uniqueJobsPerSite'], results['possibleJobsPerSite'] = siteSummary

        return results

    def collectCouchDBInfo(self):

        couchInfo = {
            'name': 'CouchServer',
            'status': 'ok',
            'error_message': ""
        }

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(
                rp['source'], rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Collect the general health information of the agent:
          1. status of the agent components, as reported by the database
          2. disk usage over the threshold
          3. drain mode and its statistics
          4. status of the couchdb replications
          5. number of couch processes
          6. last data upload time and error

        :return: a dict with all the info collected, including the overall
            'status' derived from it
        """
        logging.info("Getting agent info ...")
        info = self.wmagentDB.getComponentStatus(self.config)
        info.update(self.agentInfo)

        info['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        info['drain_mode'] = bool(isDrainMode(self.config))
        if info['drain_mode']:
            logging.info("Agent is in DrainMode")
            info['drain_stats'] = DrainStatusPoller.getDrainInfo()

        replicationInfo = self.collectCouchDBInfo()
        if replicationInfo['status'] != 'ok':
            info['down_components'].append(replicationInfo['name'])
            info['status'] = replicationInfo['status']
            info['down_component_detail'].append(replicationInfo)

        # Couch process warning: 0 means the threshold has not been reached
        processCount = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", processCount)
        processLimit = self.config.AnalyticsDataCollector.couchProcessThreshold
        info['couch_process_warning'] = processCount if processCount >= processLimit else 0

        # Add the last time and message of the data upload, when available
        lastUpload = DataUploadTime.getInfo()
        for key in ('data_last_update', 'data_error'):
            if lastUpload[key]:
                info[key] = lastUpload[key]

        # Drain mode or disk problems raise an 'ok' status to 'warning'
        needsWarning = info['drain_mode'] or info['disk_warning']
        if info['status'] == 'ok' and needsWarning:
            info['status'] = "warning"

        # A data error or too many couch processes raise the status to 'error'
        if info['status'] in ('ok', 'warning'):
            if info.get('data_error', 'ok') != 'ok' or info.get('couch_process_warning', 0):
                info['status'] = "error"

        logging.info("List of agent components down: %s", info['down_components'])

        return info

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Convert the agent info into couch document(s) and upload them.

        The data is uploaded directly to the remote database to prevent data
        conflicts when the agent is cleaned up and redeployed.

        :param agentInfo: agent monitoring information to be converted
        :param uploadTime: upload timestamp handed to convertToAgentCouchDoc
        """
        couchDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
        statsWriter = self.centralWMStatsCouchDB
        statsWriter.updateAgentInfo(couchDocs)

    @timeFunction
    def collectWMBSInfo(self):
        """
        Fetch the WMBS job information.

        Combines the job slot info (stored under 'thresholds') with the result
        of getAgentMonitoring, and logs the collected entries at debug level.

        :return: (before being wrapped by the timeFunction decorator) a dict
            with the collected information
        """
        logging.info("Getting wmbs job info ...")

        # first retrieve the site thresholds
        results = {'thresholds': self.wmagentDB.getJobSlotInfo()}
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # now fetch the amount of jobs in each state and the amount of created
        # jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        # (debug message, key of the value being reported), in reporting order
        debugReport = (
            ("Total number of jobs in WMBS sorted by status: %s", 'wmbsCountByState'),
            ("Total number of 'created' jobs in WMBS sorted by type: %s", 'wmbsCreatedTypeCount'),
            ("Total number of 'executing' jobs in WMBS sorted by type: %s", 'wmbsExecutingTypeCount'),
            ("Total number of active jobs in BossAir sorted by status: %s", 'activeRunJobByStatus'),
            ("Total number of complete jobs in BossAir sorted by status: %s", 'completeRunJobByStatus'),
            ("Available slots thresholds to pull work from GQ to LQ: %s", 'thresholdsGQ2LQ'),
            ("List of jobs pending for each site, sorted by priority: %s", 'sitePendCountByPrio'),
        )
        for message, key in debugReport:
            logging.debug(message, results[key])

        return results

    def checkProxyLifetime(self, agInfo):
        """
        Check the proxy lifetime (usually X509_USER_CERT) and raise either
        a warning or an error if the proxy validity is about to expire.
        :param agInfo: dictionary with plenty of agent monitoring information in place.
        :return: same dictionary object plus additional keys/values if needed.
        """
        secsLeft = self.proxy.getTimeLeft(proxy=self.proxyFile)
        logging.debug("Proxy '%s' lifetime is %d secs", self.proxyFile,
                      secsLeft)

        if secsLeft <= 86400 * 3:  # 3 days
            proxyWarning = True
            agInfo['status'] = "error"
        elif secsLeft <= 86400 * 5:  # 5 days
            proxyWarning = True
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            proxyWarning = False

        if proxyWarning:
            warnMsg = "Agent proxy '%s' must be renewed ASAP. " % self.proxyFile
            warnMsg += "Its time left is: %.2f hours." % (secsLeft / 3600.)
            agInfo['proxy_warning'] = warnMsg

        return
Beispiel #11
0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gether the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """

    def __init__(self, config):
        """
        Store the configuration and create the helper objects used for polling.

        :param config: agent configuration; the AnalyticsDataCollector and
            AgentStatusWatcher sections are read from it, and the presence of
            a Tier0Feeder section marks the agent as a T0 agent
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        # base agent info (campaign, user, owner, ...) built by initAgentInfo
        self.agentInfo = initAgentInfo(self.config)
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel

        self.proxy = Proxy(dict(logger=logging.getLogger()))
        self.proxyFile = self.proxy.getProxyFilename()  # X509_USER_PROXY
        self.userCertFile = self.proxy.getUserCertFilename()  # X509_USER_CERT
        # credential lifetime warning/error thresholds, in days
        self.credThresholds = dict(proxy=dict(error=3, warning=5),
                                   certificate=dict(error=10, warning=20))

        # Monitoring setup
        watcherCfg = config.AgentStatusWatcher
        self.userAMQ = getattr(watcherCfg, "userAMQ", None)
        self.passAMQ = getattr(watcherCfg, "passAMQ", None)
        self.postToAMQ = getattr(watcherCfg, "enableAMQ", False)
        self.topicAMQ = getattr(watcherCfg, "topicAMQ", None)
        self.hostPortAMQ = getattr(watcherCfg, "hostPortAMQ", [('cms-mb.cern.ch', 61313)])

        # T0 doesn't have WorkQueue, so some monitoring/replication code has to be skipped here
        self.isT0agent = hasattr(self.config, "Tier0Feeder")
        if self.isT0agent:
            self.producer = "tier0wmagent"
        else:
            self.producer = "wmagent"
            self.workqueueDS = WorkQueueDS(config.AnalyticsDataCollector.localQueueURL)

    def setUpCouchDBReplication(self):
        """
        Build the list of replication documents and start the replications.

        The wmstats replication is always defined. A T0 agent (isT0agent)
        additionally replicates its request database; any other agent
        replicates the workqueue in both directions (parent queue to local
        inbox and back). Existing replicator documents are deleted before the
        continuous replications are started.
        """
        self.replicatorDocs = []
        register = self.replicatorDocs.append
        cfg = self.config

        # replication common to every agent: job summary db -> central wmstats
        register(dict(source=cfg.JobStateMachine.jobSummaryDBName,
                      target=cfg.General.centralWMStatsURL,
                      filter="WMStatsAgent/repfilter"))

        if self.isT0agent:
            register(dict(source=cfg.Tier0Feeder.requestDBName,
                          target=cfg.AnalyticsDataCollector.centralRequestDBURL,
                          filter="T0Request/repfilter"))
        else:
            # workqueue replication, in both directions
            queueParams = cfg.WorkQueueManager.queueParams
            parentQURL = queueParams["ParentQueueCouchUrl"]
            filterArgs = {
                'childUrl': queueParams["QueueURL"],
                'parentUrl': sanitizeURL(parentQURL)['url'],
            }
            inboxURL = "%s_inbox" % cfg.AnalyticsDataCollector.localQueueURL
            # parent queue -> local inbox
            register(dict(source=sanitizeURL(parentQURL)['url'],
                          target=inboxURL,
                          filter='WorkQueue/queueFilter',
                          query_params=filterArgs))
            # local inbox -> parent queue
            register(dict(source=sanitizeURL(inboxURL)['url'],
                          target=parentQURL,
                          filter='WorkQueue/queueFilter',
                          query_params=filterArgs))

        # delete old replicator docs before setting up the new ones
        couchMonitor = self.localCouchMonitor
        couchMonitor.deleteReplicatorDocs()

        for doc in self.replicatorDocs:
            couchMonitor.couchServer.replicate(doc['source'], doc['target'],
                                               filter=doc['filter'],
                                               query_params=doc.get('query_params', False),
                                               continuous=True)

        # the first cycle needs to be skipped since the documents are not updated that fast
        self.skipReplicationCheck = True

    def setup(self, parameters):
        """
        Set up the database connections (couchdb, wmbs) used to gather information.

        Creates the WMBS/BossAir data interface, the central WMStats writer and
        the local couch monitor, then sets up the couch replications.

        :param parameters: not used by this method
        """
        # interface to WMBS/BossAir db; current_thread() is used instead of the
        # deprecated camelCase alias currentThread()
        myThread = threading.current_thread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)

        self.centralWMStatsCouchDB = WMStatsWriter(self.config.General.centralWMStatsURL)

        self.localCouchMonitor = CouchMonitor(self.config.JobStateMachine.couchurl)
        self.setUpCouchDBReplication()

    @timeFunction
    def algorithm(self, parameters):
        """
        Collect the agent, WMBS and local workqueue information and publish it.

        The credentials lifetime (proxy and certificate) is checked as well.
        The collected information is uploaded to central WMStats and handed to
        buildMonITDocs. Any exception is logged and not propagated.

        :param parameters: not used by this method
        """
        try:
            agentInfo = self.collectAgentInfo()
            for credType in ("proxy", "certificate"):
                self.checkCredLifetime(agentInfo, credType)

            wmbsTime, wmbsInfo, _ = self.collectWMBSInfo()
            wmbsInfo['total_query_time'] = int(wmbsTime)
            agentInfo["WMBS_INFO"] = wmbsInfo
            logging.info("WMBS data collected in: %d secs", wmbsTime)

            # a T0 agent has no local workqueue to report on
            if not self.isT0agent:
                wqTime, localWQInfo, _ = self.collectWorkQueueInfo()
                localWQInfo['total_query_time'] = int(wqTime)
                agentInfo["LocalWQ_INFO"] = localWQInfo
                logging.info("Local WorkQueue data collected in: %d secs", wqTime)

            self.uploadAgentInfoToCentralWMStats(agentInfo)

            self.buildMonITDocs(agentInfo)

        except Exception as ex:
            errorDetails = str(ex)
            logging.exception("Error occurred, will retry later.\nDetails: %s", errorDetails)

    @timeFunction
    def collectWorkQueueInfo(self):
        """
        Collect information from the local workqueue database.

        :return: (before being wrapped by the timeFunction decorator) a dict
            with the work by status, the work by status and priority, and the
            unique/possible jobs per site computed from the elements in
            Available and Acquired status
        """
        wqStates = ['Available', 'Acquired']
        wqService = self.workqueueDS

        results = {}
        results['workByStatus'] = wqService.getJobsByStatus()
        results['workByStatusAndPriority'] = wqService.getJobsByStatusAndPriority()

        activeElements = wqService.getElementsByStatus(wqStates)
        siteSummary = getGlobalSiteStatusSummary(activeElements, status=wqStates, dataLocality=True)
        results['uniqueJobsPerSite'], results['possibleJobsPerSite'] = siteSummary

        return results

    def collectCouchDBInfo(self):

        couchInfo = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if self.skipReplicationCheck:
            # skipping the check this round set if False so it can be checked next round.
            self.skipReplicationCheck = False
            return couchInfo

        for rp in self.replicatorDocs:
            cInfo = self.localCouchMonitor.checkCouchServerStatus(rp['source'],
                                                                  rp['target'], checkUpdateSeq=False)
            if cInfo['status'] != 'ok':
                couchInfo['status'] = 'error'
                couchInfo['error_message'] = cInfo['error_message']

        return couchInfo

    def collectAgentInfo(self):
        """
        Assemble the agent health summary from:
          1. the component status returned by the agent database
          2. the static agent information kept in `self.agentInfo`
          3. the disks whose usage is over the threshold
          4. the drain mode flag (plus drain statistics when draining)
          5. the couchdb replication status
          6. the number of couch processes

        The overall `status` is then downgraded to "warning" or "error"
        depending on what was found.

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

        agentInfo['drain_mode'] = False
        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True
            agentInfo['drain_stats'] = DrainStatusPoller.getDrainInfo()

        # a broken replication is reported like any other component that is down
        couchStatus = self.collectCouchDBInfo()
        if couchStatus['status'] != 'ok':
            agentInfo['down_components'].append(couchStatus['name'])
            agentInfo['status'] = couchStatus['status']
            agentInfo['down_component_detail'].append(couchStatus)

        # couch process warning: report the count only when it reaches the threshold
        nCouchProcs = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", nCouchProcs)
        overLimit = nCouchProcs >= self.config.AnalyticsDataCollector.couchProcessThreshold
        agentInfo['couch_process_warning'] = nCouchProcs if overLimit else 0

        # drain mode or disk problems turn an otherwise healthy agent into "warning"
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        # a data error or too many couch processes escalate to "error"
        if agentInfo['status'] in ('ok', 'warning'):
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        logging.info("List of agent components down: %s", agentInfo['down_components'])

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo):
        """
        Stamp the agent document with the fields required by WMStats and push
        it to central WMStats. Upload failures are logged, not raised.
        :param agentInfo: dict with agent stats to be posted to couchdb; it is
            updated in place with the `_id`, `timestamp` and `type` keys
        """
        agentInfo.update({'_id': agentInfo["agent_url"],
                          'timestamp': int(time.time()),
                          'type': "agent_info"})

        # upload straight to the remote database to prevent data conflicts
        # when the agent is cleaned up and redeployed
        preservedProps = ["data_last_update", "data_error"]
        try:
            self.centralWMStatsCouchDB.updateAgentInfo(agentInfo, propertiesToKeep=preservedProps)
        except Exception as exc:
            logging.error("Failed to upload agent statistics to WMStats. Error: %s", str(exc))

    @timeFunction
    def collectWMBSInfo(self):
        """
        Retrieve the WMBS job information, together with the RunJob
        information from BossAir and the site thresholds.
        :return: dict with the site thresholds under `thresholds` plus every
            metric returned by the agent monitoring query
        """
        logging.info("Getting wmbs job info ...")

        # site thresholds come first; the job counters are merged on top
        results = {'thresholds': self.wmagentDB.getJobSlotInfo()}
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # amount of jobs in each state and amount of created jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())

        # (log template, metric name) pairs, reported in this order
        debugReport = (
            ("Total number of jobs in WMBS sorted by status: %s", 'wmbsCountByState'),
            ("Total number of 'created' jobs in WMBS sorted by type: %s", 'wmbsCreatedTypeCount'),
            ("Total number of 'executing' jobs in WMBS sorted by type: %s", 'wmbsExecutingTypeCount'),
            ("Total number of active jobs in BossAir sorted by status: %s", 'activeRunJobByStatus'),
            ("Total number of complete jobs in BossAir sorted by status: %s", 'completeRunJobByStatus'),
            ("Available slots thresholds to pull work from GQ to LQ: %s", 'thresholdsGQ2LQ'),
            ("List of jobs pending for each site, sorted by priority: %s", 'sitePendCountByPrio'),
        )
        for template, metric in debugReport:
            logging.debug(template, results[metric])

        return results

    def checkCredLifetime(self, agInfo, credType):
        """
        Inspect how long a credential is still valid and flag the agent
        status when it is close to expiring.

        The remaining lifetime, in days, is compared against
        `self.credThresholds[credType]`: at or below the 'error' threshold
        the agent status becomes "error"; at or below the 'warning' threshold
        an "ok" status is downgraded to "warning". In both cases a message is
        appended to `agInfo['proxy_warning']` and logged as a warning.

        :param agInfo: dictionary with agent monitoring information; updated in place
        :param credType: credential type, can be: "proxy" or "certificate"
        :return: None (an unknown credType is logged as an error and ignored)
        """
        if credType not in ("proxy", "certificate"):
            logging.error("Unknown credential type. Available options are: [proxy, certificate]")
            return

        if credType == "certificate":
            credFile = self.userCertFile
            secsLeft = self.proxy.getUserCertTimeLeft(openSSL=True)
        else:
            credFile = self.proxyFile
            secsLeft = self.proxy.getTimeLeft(proxy=credFile)

        logging.debug("%s '%s' lifetime is %d seconds", credType, credFile, secsLeft)

        remainingDays = secsLeft / (60. * 60 * 24)
        limits = self.credThresholds[credType]

        expiring = True
        if remainingDays <= limits['error']:
            agInfo['status'] = "error"
        elif remainingDays <= limits['warning']:
            # never override a status that is already worse than "ok"
            if agInfo['status'] == "ok":
                agInfo['status'] = "warning"
        else:
            expiring = False

        if not expiring:
            return

        # the same key collects the warnings for both proxy and certificate
        warnMsg = ("Agent %s '%s' must be renewed ASAP. " % (credType, credFile) +
                   "Its time left is: %.2f hours;" % (secsLeft / 3600.))
        agInfo['proxy_warning'] = agInfo.get('proxy_warning', "") + warnMsg
        logging.warning(warnMsg)

        return

    def buildMonITDocs(self, dataStats):
        """
        Turn the agent statistics into MonIT-friendly documents and post them
        to AMQ/ES. Nothing is done unless `self.postToAMQ` is set.
        Documents are created for:
         * priority information
         * site information
         * work information
         * WMBS information
         * agent information
         * agent health information
         * agent summary information
        Note that the internal methods are popping some metrics out of dataStats
        :param dataStats: dictionary with the agent metrics; it must also carry
            the `agent_url` and `timestamp` keys
        """
        if not self.postToAMQ:
            return

        logging.info("Preparing documents to be posted to AMQ/MonIT..")
        # builders run in this exact order and share the same dataStats object
        docBuilders = (self._buildMonITPrioDocs,
                       self._buildMonITSitesDocs,
                       self._buildMonITWorkDocs,
                       self._buildMonITWMBSDocs,
                       self._buildMonITAgentDocs,
                       self._buildMonITHealthDocs,
                       self._buildMonITSummaryDocs)
        allDocs = []
        for builder in docBuilders:
            allDocs.extend(builder(dataStats))

        # and finally post them all to AMQ
        logging.info("Found %d documents to post to AMQ", len(allDocs))
        self.uploadToAMQ(allDocs, dataStats['agent_url'], dataStats['timestamp'])


    def _buildMonITPrioDocs(self, dataStats):
        """
        Uses the `sitePendCountByPrio` metric in order to build documents
        reporting the site name, job priority and amount of jobs within that
        priority. The metric is popped out of `dataStats['WMBS_INFO']`.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_prio_info MonIT docs
        """
        docType = "wma_prio_info"
        prioDocs = []
        # default to an empty dict: the loop below needs a mapping, and a
        # list default would fail on the items() call
        sitePendCountByPrio = dataStats['WMBS_INFO'].pop('sitePendCountByPrio', {})

        # items() works on both python2 and python3, unlike iteritems()
        for site, item in sitePendCountByPrio.items():
            # it seems sites with no jobs are also always here as "Sitename": {0: 0}
            # (compare a real list of keys; a python3 keys view never equals a list)
            if list(item) == [0]:
                continue
            for prio, jobs in item.items():
                prioDocs.append({'site_name': site,
                                 'type': docType,
                                 'priority': prio,
                                 'job_count': jobs})
        return prioDocs

    def _buildMonITSitesDocs(self, dataStats):
        """
        Combines the site thresholds with the per-site job information from
        the local workqueue to build one `site_info` document per site.
        The metrics used are popped out of `dataStats`; for a T0 agent no
        local workqueue information is read.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_site_info MonIT docs
        """
        docType = "wma_site_info"
        wmbsInfo = dataStats['WMBS_INFO']
        thresholds = wmbsInfo.pop('thresholds', {})
        thresholdsGQ2LQ = wmbsInfo.pop('thresholdsGQ2LQ', {})

        possibleJobsPerSite, uniqueJobsPerSite = {}, {}
        if not self.isT0agent:
            possibleJobsPerSite = dataStats['LocalWQ_INFO'].pop('possibleJobsPerSite', {})
            uniqueJobsPerSite = dataStats['LocalWQ_INFO'].pop('uniqueJobsPerSite', {})

        siteDocs = []
        for siteName in sorted(thresholds):
            siteThresholds = thresholds[siteName]
            # 'state' is moved out of the thresholds dict to the top level
            siteDoc = {'site_name': siteName,
                       'type': docType,
                       'thresholds': siteThresholds,
                       'state': siteThresholds.pop('state', 'Unknown'),
                       'thresholdsGQ2LQ': thresholdsGQ2LQ.get(siteName, 0)}

            for wqStatus, possibleBySite in possibleJobsPerSite.items():
                tag = wqStatus.lower()
                jobKey = "possible_%s_jobs" % tag
                elemKey = "num_%s_elem" % tag
                uniJobKey = "unique_%s_jobs" % tag
                # make sure these keys are always present in the documents
                siteDoc.update([(jobKey, 0), (elemKey, 0), (uniJobKey, 0)])

                if siteName in possibleBySite:
                    sitePossible = possibleBySite[siteName]
                    siteDoc[jobKey] = sitePossible['sum_jobs']
                    siteDoc[elemKey] = sitePossible['num_elem']

                uniqueBySite = uniqueJobsPerSite[wqStatus]
                if siteName in uniqueBySite:
                    siteDoc[uniJobKey] = uniqueBySite[siteName]['sum_jobs']

            siteDocs.append(siteDoc)

        return siteDocs

    def _buildMonITWorkDocs(self, dataStats):
        """
        Uses the local workqueue information grouped by WQE status to build
        statistics for the workload in terms of workqueue elements and top
        level jobs. The `workByStatus` metric is popped out of
        `dataStats['LocalWQ_INFO']`; a T0 agent produces no documents.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_work_info MonIT docs
        """
        if self.isT0agent:
            return []

        docType = "wma_work_info"
        statusSummary = dataStats['LocalWQ_INFO'].pop('workByStatus', {})
        # missing counters are reported as zero
        return [{'type': docType,
                 'status': wqStatus,
                 'num_elem': counters.get('num_elem', 0),
                 'sum_jobs': counters.get('sum_jobs', 0)}
                for wqStatus, counters in statusSummary.items()]

    def _buildMonITWMBSDocs(self, dataStats):
        """
        Builds, out of the WMBS data, one document per job type with the
        amount of work in 'created' and 'executing' WMBS status, followed by
        one document for every single wmbs_status in the database.
        The metrics used are popped out of `dataStats['WMBS_INFO']`.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_wmbs_info and wma_wmbs_state_info docs
        """
        wmbsInfo = dataStats['WMBS_INFO']

        createdByType = wmbsInfo.pop('wmbsCreatedTypeCount', {})
        executingByType = wmbsInfo.pop('wmbsExecutingTypeCount', {})
        # every job type listed as 'created' must also be present as 'executing'
        wmbsDocs = [{'type': "wma_wmbs_info",
                     'job_type': jobType,
                     'created_jobs': numCreated,
                     'executing_jobs': executingByType[jobType]}
                    for jobType, numCreated in createdByType.items()]

        countByState = wmbsInfo.pop('wmbsCountByState', {})
        wmbsDocs.extend({'type': "wma_wmbs_state_info",
                         'wmbs_status': wmbsStatus,
                         'num_jobs': numJobs}
                        for wmbsStatus, numJobs in countByState.items())

        return wmbsDocs

    def _buildMonITAgentDocs(self, dataStats):
        """
        Builds one document per scheduler status with the amount of active
        and completed jobs, using the `activeRunJobByStatus` and
        `completeRunJobByStatus` metrics popped out of `dataStats['WMBS_INFO']`.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_agent_info MonIT docs
        """
        wmbsInfo = dataStats['WMBS_INFO']
        activeByStatus = wmbsInfo.pop('activeRunJobByStatus', {})
        completeByStatus = wmbsInfo.pop('completeRunJobByStatus', {})

        # every status listed as active must also be present in the complete metric
        return [{'type': "wma_agent_info",
                 'schedd_status': scheddStatus,
                 'active_jobs': numActive,
                 'completed_jobs': completeByStatus[scheddStatus]}
                for scheddStatus, numActive in activeByStatus.items()]

    def _buildMonITHealthDocs(self, dataStats):
        """
        Builds one document per worker thread with its name, state, polling
        interval, last heartbeat and cycle time. The `workers` metric is
        popped out of `dataStats`.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list of dictionaries with the wma_health_info MonIT docs
        """
        workers = dataStats.pop('workers', {})
        return [{'type': "wma_health_info",
                 'worker_name': workerInfo['name'],
                 'worker_state': workerInfo['state'],
                 'worker_poll': workerInfo['poll_interval'],
                 'worker_last_hb': workerInfo['last_updated'],
                 'worker_cycle_time': workerInfo['cycle_time']}
                for workerInfo in workers]

    def _buildMonITSummaryDocs(self, dataStats):
        """
        Builds a single document with the very basic agent information: team,
        version, status, query times, drain mode and components down.
        The workqueue query time is left out for a T0 agent.
        :param dataStats: dictionary with metrics previously posted to WMStats
        :return: list with a single wma_summary_info MonIT doc
        """
        summaryDoc = {'type': "wma_summary_info",
                      'agent_team': dataStats['agent_team'],
                      'agent_version': dataStats['agent_version'],
                      'agent_status': dataStats['status']}
        if not self.isT0agent:
            summaryDoc['wq_query_time'] = dataStats['LocalWQ_INFO']['total_query_time']
        summaryDoc.update({'wmbs_query_time': dataStats['WMBS_INFO']['total_query_time'],
                           'drain_mode': dataStats['drain_mode'],
                           'down_components': dataStats['down_components']})
        return [summaryDoc]

    def uploadToAMQ(self, docs, agentUrl, timeS):
        """
        _uploadToAMQ_

        Sends data to AMQ, which ends up in the MonIT infrastructure.
        Failures while creating the client or sending are logged, not raised.
        :param docs: list of documents/dicts to be posted; each one is
            updated in place with the `agent_url` key
        :param agentUrl: value stored under `agent_url` in every document
        :param timeS: timestamp passed on to every notification
        """
        if not docs:
            logging.info("There are no documents to send to AMQ")
            return

        # add mandatory information for every single document
        for payload in docs:
            payload['agent_url'] = agentUrl

        amqDocType = "cms_%s_info" % self.producer
        logging.debug("Sending the following data to AMQ %s", pformat(docs))
        try:
            amqClient = StompAMQ(username=self.userAMQ,
                                 password=self.passAMQ,
                                 producer=self.producer,
                                 topic=self.topicAMQ,
                                 host_and_ports=self.hostPortAMQ,
                                 logger=logging)

            messages = []
            for payload in docs:
                messages.append(amqClient.make_notification(payload=payload, docType=amqDocType,
                                                            ts=timeS, dataSubfield="payload"))

            unsent = amqClient.send(messages)
            logging.info("%i docs successfully sent to AMQ", len(messages) - len(unsent))
        except Exception as exc:
            logging.exception("Failed to send data to StompAMQ. Error %s", str(exc))

        return
Beispiel #12
0
class AnalyticsPoller(BaseWorkerThread):
    """
    Gather the summary data for requests (workflows) from the local queue,
    local job couchdb and wmbs/boss air, and populate the summary db used
    for monitoring.
    """

    def __init__(self, config):
        """
        Initialize the properties specified in the config.
        :param config: agent configuration object
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        self.agentInfo = {}
        self.agentInfo['agent_team'] = config.Agent.teamName
        self.agentInfo['agent'] = config.Agent.agentName
        # temporarily add port for the split test
        self.agentInfo['agent_url'] = "%s:%s" % (config.Agent.hostName, config.WMBSService.Webtools.port)
        # need to get campaign, user, owner info
        self.agentDocID = "agent+hostname"
        self.summaryLevel = config.AnalyticsDataCollector.summaryLevel.lower()

    def setup(self, parameters):
        """
        Set the db connections (couchdb, wmbs) needed to gather information
        and start the replication to the central monitor.
        :param parameters: not used by this method
        """
        # workqueue service for REST calls
        self.localQueue = WorkQueueService(self.config.AnalyticsDataCollector.localQueueURL)

        # connection for local couchDB calls
        self.localCouchDB = LocalCouchDBData(self.config.AnalyticsDataCollector.localCouchURL, self.summaryLevel)

        # interface to WMBS/BossAir db
        myThread = threading.current_thread()
        # set wmagent db data
        self.wmagentDB = WMAgentDBData(self.summaryLevel, myThread.dbi, myThread.logger)
        # connection for the local summary (wmstats) couchDB
        self.localSummaryCouchDB = WMStatsWriter(self.config.AnalyticsDataCollector.localWMStatsURL)
        logging.info("Setting the replication to central monitor ...")
        self.localSummaryCouchDB.replicate(self.config.AnalyticsDataCollector.centralWMStatsURL)

    def algorithm(self, parameters):
        """
        Get information from wmbs, workqueue and local couch, combine it and
        upload the request and agent documents to the local summary db.
        Any error is logged and re-raised.
        :param parameters: not used by this method
        """
        try:
            # jobs per request info
            logging.info("Getting Job Couch Data ...")
            jobInfoFromCouch = self.localCouchDB.getJobSummaryByWorkflowAndSite()

            # fwjr per request info
            logging.info("Getting FWJRJob Couch Data ...")
            fwjrInfoFromCouch = self.localCouchDB.getEventSummaryByWorkflow()

            logging.info("Getting Batch Job Data ...")
            batchJobInfo = self.wmagentDB.getBatchJobInfo()

            # get the data from local workqueue:
            # request name, input dataset, inWMBS, inQueue
            logging.info("Getting Local Queue Data ...")
            localQInfo = self.localQueue.getAnalyticsData()

            # combine all the data from 3 sources
            logging.info("""Combining data from 
                                   Job Couch(%s),
                                   FWJR(%s), 
                                   Batch Job(%s), 
                                   Local Queue(%s)  ...""",
                         len(jobInfoFromCouch), len(fwjrInfoFromCouch), len(batchJobInfo), len(localQInfo))

            tempCombinedData = combineAnalyticsData(jobInfoFromCouch, batchJobInfo)
            combinedRequests = combineAnalyticsData(tempCombinedData, localQInfo)

            # set the uploadTime - should be the same for all docs
            uploadTime = int(time.time())
            logging.info("%s requests Data combined,\n uploading request data...", len(combinedRequests))
            requestDocs = convertToRequestCouchDoc(combinedRequests, fwjrInfoFromCouch,
                                                   self.agentInfo, uploadTime, self.summaryLevel)

            self.localSummaryCouchDB.uploadData(requestDocs)
            logging.info("Request data upload success\n %s request \n uploading agent data", len(requestDocs))

            # TODO: agent info (need to include job Slots for the sites)
            agentInfo = self.wmagentDB.getHeartBeatWarning()
            agentInfo.update(self.agentInfo)

            agentDocs = convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime)
            self.localSummaryCouchDB.updateAgentInfo(agentDocs)
            logging.info("Agent data upload success\n %s request", len(agentDocs))

        # "except X as name" is valid on python 2.6+ and python 3;
        # the old "except X, name" form is a syntax error on python 3
        except Exception as ex:
            logging.error(str(ex))
            raise
Beispiel #13
0
class AgentStatusPoller(BaseWorkerThread):
    """
    Gether the summary data for request (workflow) from local queue,
    local job couchdb, wmbs/boss air and populate summary db for monitoring
    """
    def __init__(self, config):
        """
        Keep the agent configuration and the settings read from it.
        :param config: agent configuration object
        """
        BaseWorkerThread.__init__(self)
        self.config = config
        # need to get campaign, user, owner info
        self.agentInfo = initAgentInfo(config)

        collectorCfg = config.AnalyticsDataCollector
        self.summaryLevel = collectorCfg.summaryLevel
        # file used to save the collected agent information locally
        self.jsonFile = config.AgentStatusWatcher.jsonFile
    
    def setUpCouchDBReplication(self):
        """
        Define the couchdb replications for this agent and start them as
        continuous replications, after deleting the old replicator docs.

        The job summary replication to central WMStats is always set up.
        When the config has a `Tier0Feeder` section the T0 request db is
        replicated as well; otherwise the workqueue replications between the
        parent queue and the local queue inbox are set up in both directions.
        """
        self.replicatorDocs = []
        register = self.replicatorDocs.append

        # replication common to every agent
        register({'source': self.config.JobStateMachine.jobSummaryDBName,
                  'target': self.config.AnalyticsDataCollector.centralWMStatsURL,
                  'filter': "WMStatsAgent/repfilter"})

        if not hasattr(self.config, "Tier0Feeder"):
            # set up workqueue replication
            wqFilter = 'WorkQueue/queueFilter'
            queueParams = self.config.WorkQueueManager.queueParams
            parentQueueUrl = queueParams["ParentQueueCouchUrl"]
            filterParams = {'childUrl': queueParams["QueueURL"],
                            'parentUrl': sanitizeURL(parentQueueUrl)['url']}
            inboxUrl = "%s_inbox" % self.config.AnalyticsDataCollector.localQueueURL
            register({'source': sanitizeURL(parentQueueUrl)['url'], 'target': inboxUrl,
                      'filter': wqFilter, 'query_params': filterParams})
            register({'source': sanitizeURL(inboxUrl)['url'], 'target': parentQueueUrl,
                      'filter': wqFilter, 'query_params': filterParams})
        else:
            # TODO: tier0 specific code - need to make it generic
            register({'source': self.config.Tier0Feeder.requestDBName,
                      'target': self.config.AnalyticsDataCollector.centralRequestDBURL,
                      'filter': "T0Request/repfilter"})

        # delete old replicator docs before setting up
        self.localCouchMonitor.deleteReplicatorDocs()

        for replication in self.replicatorDocs:
            self.localCouchMonitor.couchServer.replicate(replication['source'],
                                                         replication['target'],
                                                         filter=replication['filter'],
                                                         query_params=replication.get('query_params', False),
                                                         continuous=True)

        # First cycle needs to be skipped since the document is not updated that fast
        self.skipReplicationCheck = True
                     
    def setup(self, parameters):
        """
        Create the db interfaces (wmbs, couchdb) used to gather information
        and set up the couchdb replication.
        :param parameters: not used by this method
        """
        # the WMBS/BossAir interface uses the db handle and logger of the current thread
        currentThread = threading.currentThread()
        self.wmagentDB = WMAgentDBData(self.summaryLevel, currentThread.dbi, currentThread.logger)

        centralUrl = self.config.AnalyticsDataCollector.centralWMStatsURL
        self.centralWMStatsCouchDB = WMStatsWriter(centralUrl)

        localCouchUrl = self.config.JobStateMachine.couchurl
        self.localCouchMonitor = CouchMonitor(localCouchUrl)
        self.setUpCouchDBReplication()

    def algorithm(self, parameters):
        """
        Collect the agent and wmbs information, upload it to central WMStats
        and save it locally as a json file. Any error is logged, together
        with its traceback, and not raised.
        :param parameters: not used by this method
        """
        try:
            agentInfo = self.collectAgentInfo()
            agentInfo["WMBS_INFO"] = self.collectWMBSInfo()
            logging.info("finished collecting agent/wmbs info")

            # set the uploadTime - should be the same for all docs
            self.uploadAgentInfoToCentralWMStats(agentInfo, int(time.time()))

            # save locally json file as well
            with open(self.jsonFile, 'w') as jsonHandle:
                json.dump(agentInfo, jsonHandle, indent=2)

        except Exception as exc:
            errorLines = ("Error occurred, will retry later:",
                          str(exc),
                          "Trace back: \n%s" % traceback.format_exc())
            for line in errorLines:
                logging.error(line)
    
     
    def collectCouchDBInfo(self):
        """
        Check every configured replication and summarise the couch status.

        When `self.skipReplicationCheck` is set no check is performed: an
        'ok' summary is returned and the flag is cleared, so the following
        call performs the real check.

        :return: dict with the keys `name`, `status` and `error_message`
        """
        summary = {'name': 'CouchServer', 'status': 'ok', 'error_message': ""}

        if not self.skipReplicationCheck:
            for replication in self.replicatorDocs:
                outcome = self.localCouchMonitor.checkCouchServerStatus(replication['source'],
                                                                        replication['target'],
                                                                        checkUpdateSeq=False)
                if outcome['status'] == 'ok':
                    continue
                # a later failing replication overwrites the message of an earlier one
                summary['status'] = 'error'
                summary['error_message'] = outcome['error_message']
            return summary

        # skip this round only; the check is re-enabled for the next cycle
        self.skipReplicationCheck = False
        return summary
        
    def collectAgentInfo(self):
        """
        Assemble the agent health summary from:
          1. the component status returned by the agent database
          2. the static agent information kept in `self.agentInfo`
          3. the drain mode flag
          4. the couchdb replication status
          5. the disks whose usage is over the configured threshold
          6. the number of couch processes
          7. the time and error message of the last data upload

        The overall `status` is then downgraded to "warning" or "error"
        depending on what was found.

        :return: a dict with all the info collected
        """
        logging.info("Getting agent info ...")
        agentInfo = self.wmagentDB.getComponentStatus(self.config)
        agentInfo.update(self.agentInfo)

        agentInfo['drain_mode'] = False
        if isDrainMode(self.config):
            logging.info("Agent is in DrainMode")
            agentInfo['drain_mode'] = True

        # a broken replication is reported like any other component that is down
        couchStatus = self.collectCouchDBInfo()
        if couchStatus['status'] != 'ok':
            agentInfo['down_components'].append(couchStatus['name'])
            agentInfo['status'] = couchStatus['status']
            agentInfo['down_component_detail'].append(couchStatus)

        # Disk space warning: disks at or over the threshold, unless explicitly ignored
        mountedDisks = diskUse()
        usageLimit = float(self.config.AnalyticsDataCollector.diskUseThreshold)
        agentInfo['disk_warning'] = [
            disk for disk in mountedDisks
            if float(disk['percent'].strip('%')) >= usageLimit
            and disk['mounted'] not in self.config.AnalyticsDataCollector.ignoreDisk]

        # Couch process warning: report the count only when it reaches the threshold
        nCouchProcs = numberCouchProcess()
        logging.info("CouchDB is running with %d processes", nCouchProcs)
        overLimit = nCouchProcs >= self.config.AnalyticsDataCollector.couchProcessThreshold
        agentInfo['couch_process_warning'] = nCouchProcs if overLimit else 0

        # add the last time and message when data was updated, when available
        lastUpload = DataUploadTime.getInfo()
        for field in ('data_last_update', 'data_error'):
            if lastUpload[field]:
                agentInfo[field] = lastUpload[field]

        # drain mode or disk problems turn an otherwise healthy agent into "warning"
        if agentInfo['status'] == 'ok' and (agentInfo['drain_mode'] or agentInfo['disk_warning']):
            agentInfo['status'] = "warning"

        # a data error or too many couch processes escalate to "error"
        if agentInfo['status'] in ('ok', 'warning'):
            if agentInfo.get('data_error', 'ok') != 'ok' or agentInfo.get('couch_process_warning', 0):
                agentInfo['status'] = "error"

        downComponents = agentInfo['down_components']
        if downComponents:
            logging.info("List of agent components down: %s" % downComponents)

        return agentInfo

    def uploadAgentInfoToCentralWMStats(self, agentInfo, uploadTime):
        """
        Convert the agent information into an agent couch document and
        upload it to central WMStats.
        :param agentInfo: dict with the agent information collected
        :param uploadTime: upload time passed on to the document conversion
        """
        # direct data upload to the remote to prevent data conflict when
        # agent is cleaned up and redeployed
        self.centralWMStatsCouchDB.updateAgentInfo(
            convertToAgentCouchDoc(agentInfo, self.config.ACDC, uploadTime))

    def collectWMBSInfo(self):
        """
        Retrieve the WMBS job information, together with the RunJob
        information from BossAir and the site thresholds.
        :return: dict with the site thresholds under `thresholds`, every
            metric returned by the agent monitoring query and the time spent
            on the queries, in whole seconds, under `total_query_time`
        """
        logging.info("Getting wmbs job info ...")

        queryStart = int(time.time())
        # site thresholds come first; the job counters are merged on top
        results = {'thresholds': self.wmagentDB.getJobSlotInfo()}
        logging.debug("Running and pending site thresholds: %s", results['thresholds'])

        # amount of jobs in each state and amount of created jobs grouped by task
        results.update(self.wmagentDB.getAgentMonitoring())
        # adding total query time
        results["total_query_time"] = int(time.time()) - queryStart

        # (log template, metric name) pairs, reported in this order
        debugReport = (
            ("Total number of jobs in WMBS sorted by status: %s", 'wmbsCountByState'),
            ("Total number of 'created' jobs in WMBS sorted by type: %s", 'wmbsCreatedTypeCount'),
            ("Total number of 'executing' jobs in WMBS sorted by type: %s", 'wmbsExecutingTypeCount'),
            ("Total number of active jobs in BossAir sorted by status: %s", 'activeRunJobByStatus'),
            ("Total number of complete jobs in BossAir sorted by status: %s", 'completeRunJobByStatus'),
            ("Available slots thresholds to pull work from GQ to LQ: %s", 'thresholdsGQ2LQ'),
            ("List of jobs pending for each site, sorted by priority: %s", 'sitePendCountByPrio'),
        )
        for template, metric in debugReport:
            logging.debug(template, results[metric])

        return results