def passRetrieveCondition(self):
    """
    _passRetrieveCondition_

    Decide whether the component may pull work during this cycle.

    The checks run in this order and the first one that blocks wins:
      1. the agent is in drain mode;
      2. availableScheddSlots() reports no free slot (MAX_JOBS_PER_OWNER);
      3. the condor schedd reports itself as overloaded;
      4. listSubsWithoutJobs returns a non-empty result.

    :return: the string "OK" when nothing blocks the retrieval, otherwise
             a human-readable message explaining why no work is pulled.
             Note that the result is always a string, never a boolean.
    """
    myThread = threading.currentThread()

    if isDrainMode(self.config):
        return "No work will be pulled: Agent is in drain"
    if availableScheddSlots(myThread.dbi) <= 0:
        return "No work will be pulled: schedd slot is maxed: MAX_JOBS_PER_OWNER"
    if self.condorAPI.isScheddOverloaded():
        return "No work will be pulled: schedd is overloaded"

    # Only query the subscriptions once every cheaper check has passed
    pendingSubs = self.listSubsWithoutJobs.execute()
    if pendingSubs:
        detail = "JobCreator hasn't created jobs for subscriptions %s" % pendingSubs
        return "No work will be pulled: " + detail
    return "OK"
def markInjected(self):
    """
    _markInjected_

    Mark as injected the workflows that the work queue reports as fully
    injected.

    Nothing is done when self.tier0Mode is set. Otherwise every entry
    returned by the Workflow.GetInjectedWorkflows DAO is checked against
    the work queue; entries the work queue does not know about are marked
    as well. A failure while checking one entry is logged and that entry
    is skipped. The marking itself runs inside a transaction on the
    current thread.

    :return: None
    """
    if self.tier0Mode:
        logging.debug("Component will not check workflows for injection status")
        return

    myThread = threading.currentThread()
    fetchDAO = self.daoFactory(classname="Workflow.GetInjectedWorkflows")
    markDAO = self.daoFactory(classname="Workflow.MarkInjectedWorkflows")

    toMark = []
    for name in fetchDAO.execute():
        try:
            fullyInjected = bool(self.workQueue.getWMBSInjectionStatus(name, isDrainMode(self.config)))
        except WorkQueueNoMatchingElements:
            # workflow not known - free to cleanup
            fullyInjected = True
        except Exception as ex:
            logging.exception("Injection status checking failed, investigate: %s", str(ex))
            continue
        if fullyInjected:
            toMark.append(name)

    logging.info("Found %d workflows to mark as injected", len(toMark))
    if not toMark:
        return

    # Persist the injected flag for everything collected above
    myThread.transaction.begin()
    markDAO.execute(names=toMark, injected=True)
    myThread.transaction.commit()
def algorithm(self, parameters):
    """
    Refresh the agent configuration and, when the agent is draining,
    update the speed drain settings and the cached drain statistics.

    The agent configuration is fetched from the auxiliary DB first; if
    nothing comes back the cycle is aborted. When the agent is not in
    drain mode the speed drain configuration is reset instead. Errors
    raised while collecting the drain statistics are logged and left for
    the next cycle.

    :param parameters: not used by this method
    :return: None
    """
    logging.info("Running agent drain algorithm...")
    self.agentConfig = self.reqAuxDB.getWMAgentConfig(self.config.Agent.hostName)
    if not self.agentConfig:
        logging.error("Failed to fetch agent configuration from the auxiliary DB")
        return

    if not isDrainMode(self.config):
        logging.info("Agent not in drain mode. Resetting flags and skipping drain check...")
        self.resetAgentSpeedDrainConfig()
        return

    # Apply the speed drain settings for whichever thresholds were hit
    hitThresholds = self.checkSpeedDrainThresholds()
    if hitThresholds:
        logging.info("Updating agent configuration for speed drain...")
        self.updateAgentSpeedDrainConfig(hitThresholds)

    # Collect the drain statistics and publish them on the class attribute
    try:
        DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
        logging.info("Finished collecting agent drain status.")
        logging.info("Drain stats: " + str(DrainStatusPoller.drainStats))
    except Exception as ex:
        logging.exception("Error occurred, will retry later:\n" + str(ex))
def collectAgentInfo(self):
    """
    Build a snapshot of the general health of the agent.

    Starting from the component status reported by the agent database,
    the snapshot is enriched with:
      1. the static self.agentInfo values
      2. disk usage over threshold ('disk_warning')
      3. drain mode flag and, when draining, the drain statistics
      4. CouchDB status (recorded as a down component when not 'ok')
      5. number of couch processes ('couch_process_warning', 0 when the
         count is below the configured threshold)
      6. time and error message of the last data upload, when set

    The final 'status' is raised to "warning" for drain mode or disk
    warnings, and to "error" for a data upload error or a couch process
    warning.

    :return: a dict with all the info collected
    """
    logging.info("Getting agent info ...")
    info = self.wmagentDB.getComponentStatus(self.config)
    info.update(self.agentInfo)
    info['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

    if isDrainMode(self.config):
        logging.info("Agent is in DrainMode")
        info['drain_mode'] = True
        info['drain_stats'] = DrainStatusPoller.getDrainInfo()
    else:
        info['drain_mode'] = False

    couchInfo = self.collectCouchDBInfo()
    if couchInfo['status'] != 'ok':
        info['down_components'].append(couchInfo['name'])
        info['status'] = couchInfo['status']
        info['down_component_detail'].append(couchInfo)

    # Couch process warning: report the count only once it reaches the threshold
    couchProc = numberCouchProcess()
    logging.info("CouchDB is running with %d processes", couchProc)
    procLimit = self.config.AnalyticsDataCollector.couchProcessThreshold
    info['couch_process_warning'] = couchProc if couchProc >= procLimit else 0

    # Copy the last upload time and error message, skipping empty values
    lastDataUpload = DataUploadTime.getInfo()
    for key in ('data_last_update', 'data_error'):
        if lastDataUpload[key]:
            info[key] = lastDataUpload[key]

    # Escalate the status: drain/disk issues -> warning, data/couch issues -> error
    if info['status'] == 'ok' and (info['drain_mode'] or info['disk_warning']):
        info['status'] = "warning"

    if info['status'] in ('ok', 'warning'):
        if info.get('data_error', 'ok') != 'ok' or info.get('couch_process_warning', 0):
            info['status'] = "error"

    logging.info("List of agent components down: %s", info['down_components'])
    return info
def algorithm(self, parameters):
    """
    Refresh the cached drain statistics while the agent is in drain mode.

    Outside drain mode the method only logs that the check is skipped.
    Errors raised while collecting the statistics are logged and left for
    the next cycle.

    :param parameters: not used by this method
    :return: None
    """
    if not isDrainMode(self.config):
        logging.info("Agent not in drain mode. Skipping drain check...")
        return

    logging.info("Checking agent drain status...")
    try:
        # Publish the fresh statistics on the class attribute
        DrainStatusPoller.drainStats = self.drainAPI.collectDrainInfo()
        logging.info("Finished collecting agent drain status.")
        logging.info("Drain stats: " + str(DrainStatusPoller.drainStats))
    except Exception as ex:
        logging.exception("Error occurred, will retry later:\n" + str(ex))
def passRetrieveCondition(self):
    """
    _passRetrieveCondition_

    Decide whether the component may pull work during this cycle.

    The checks run in this order and the first one that blocks wins:
      1. the agent is in drain mode;
      2. availableScheddSlots() reports no free slot (MAX_JOBS_PER_OWNER);
      3. the condor schedd reports itself as overloaded.

    :return: the string "OK" when nothing blocks the retrieval, otherwise
             a human-readable message explaining why no work is pulled.
             The result is always a string, never a boolean, so callers
             must compare against "OK" rather than test truthiness.
    """
    passCond = "OK"
    myThread = threading.currentThread()
    if isDrainMode(self.config):
        passCond = "No work will be pulled: Agent is in drain"
    elif availableScheddSlots(myThread.dbi) <= 0:
        # The previous dead "passCond = False" store was dropped here: it was
        # overwritten immediately and wrongly hinted at a boolean contract.
        passCond = "No work will be pulled: schedd slot is maxed: MAX_JOBS_PER_OWNER"
    elif self.condorAPI.isScheddOverloaded():
        passCond = "No work will be pulled: schedd is overloaded"
    return passCond
def markInjected(self):
    """
    _markInjected_

    Mark as injected the workflows that the work queue reports as fully
    injected.

    Nothing is done unless self.handleWorkflowInjection is set. Otherwise
    every entry returned by the Workflow.GetInjectedWorkflows DAO is
    checked against the work queue; entries the work queue does not know
    about are marked as well. A failure while checking one entry is
    logged and that entry is skipped. The marking itself runs inside a
    transaction on the current thread.

    :return: None
    """
    if not self.handleWorkflowInjection:
        logging.debug("Component will not check workflows for injection status")
        return

    myThread = threading.currentThread()
    listDAO = self.daoFactory(classname="Workflow.GetInjectedWorkflows")
    flagDAO = self.daoFactory(classname="Workflow.MarkInjectedWorkflows")

    readyNames = []
    for name in listDAO.execute():
        try:
            isReady = bool(self.workQueue.getWMBSInjectionStatus(name, isDrainMode(self.config)))
        except WorkQueueNoMatchingElements:
            # workflow not known - free to cleanup
            isReady = True
        except Exception as ex:
            logging.exception("Injection status checking failed, investigate: %s", str(ex))
            continue
        if isReady:
            readyNames.append(name)

    logging.info("Found %d workflows to mark as injected", len(readyNames))
    if not readyNames:
        return

    # Persist the injected flag for everything collected above
    myThread.transaction.begin()
    flagDAO.execute(names=readyNames, injected=True)
    myThread.transaction.commit()
def passRetrieveCondition(self):
    """
    _passRetrieveCondition_

    Decide whether the component may pull work during this cycle.

    The checks run in this order and the first one that blocks wins:
      1. the agent is in drain mode;
      2. availableScheddSlots() reports no free slot (MAX_JOBS_PER_OWNER);
      3. the condor schedd reports itself as overloaded;
      4. listSubsWithoutJobs returns a non-empty result.

    :return: the string "OK" when nothing blocks the retrieval, otherwise
             a short reason string. The result is always a string, never
             a boolean.
    """
    myThread = threading.currentThread()

    if isDrainMode(self.config):
        return "agent is in drain mode"
    if availableScheddSlots(myThread.dbi) <= 0:
        return "schedd slot is maxed: MAX_JOBS_PER_OWNER"
    if self.condorAPI.isScheddOverloaded():
        return "schedd is overloaded"

    # Only query the subscriptions once every cheaper check has passed
    subsWithoutJobs = self.listSubsWithoutJobs.execute()
    if not subsWithoutJobs:
        return "OK"
    return "JobCreator hasn't created jobs for subscriptions %s" % subsWithoutJobs
def getAgentsByTeam(self):
    """
    _getAgentsByTeam_

    Refresh self.agentsNumByTeam from the WMStats view about agents and
    teams.

    In drain mode the value is forced to 1 and the view is not queried.
    When the view cannot be read or comes back empty, the current value
    is kept. Otherwise the entry for self.teamName is used, falling back
    to the current value when the team is missing from the view.

    :return: None
    """
    if isDrainMode(self.config):
        # maximize pending thresholds to get this agent drained ASAP
        self.agentsNumByTeam = 1
        return

    try:
        teamCounts = self.centralCouchDBReader.agentsByTeam(filterDrain=True)
    except Exception:
        logging.error("WMStats is not available or is unresponsive.")
        teamCounts = {}

    if teamCounts:
        self.agentsNumByTeam = teamCounts.get(self.teamName, self.agentsNumByTeam)
        logging.debug("Agents connected to the same team (not in DrainMode): %d", self.agentsNumByTeam)
    else:
        logging.warning("agentInfo couch view is not available, use default value %s", self.agentsNumByTeam)
def collectAgentInfo(self):
    """
    Build a snapshot of the general health of the agent.

    Starting from the component status reported by the agent database,
    the snapshot is enriched with:
      1. the static self.agentInfo values
      2. disk usage over threshold ('disk_warning')
      3. drain mode flag and, when draining, the drain statistics
      4. CouchDB status (recorded as a down component when not 'ok')
      5. number of couch processes ('couch_process_warning', 0 when the
         count is below the configured threshold)
      6. time and error message of the last data upload, when set

    The final 'status' is raised to "warning" for drain mode or disk
    warnings, and to "error" for a data upload error or a couch process
    warning.

    :return: a dict with all the info collected
    """
    logging.info("Getting agent info ...")
    summary = self.wmagentDB.getComponentStatus(self.config)
    summary.update(self.agentInfo)
    summary['disk_warning'] = listDiskUsageOverThreshold(self.config, updateDB=True)

    if isDrainMode(self.config):
        logging.info("Agent is in DrainMode")
        summary['drain_mode'] = True
        summary['drain_stats'] = DrainStatusPoller.getDrainInfo()
    else:
        summary['drain_mode'] = False

    couchStatus = self.collectCouchDBInfo()
    if couchStatus['status'] != 'ok':
        summary['down_components'].append(couchStatus['name'])
        summary['status'] = couchStatus['status']
        summary['down_component_detail'].append(couchStatus)

    # Couch process warning: report the count only once it reaches the threshold
    nCouchProc = numberCouchProcess()
    logging.info("CouchDB is running with %d processes", nCouchProc)
    maxCouchProc = self.config.AnalyticsDataCollector.couchProcessThreshold
    summary['couch_process_warning'] = nCouchProc if nCouchProc >= maxCouchProc else 0

    # Copy the last upload time and error message, skipping empty values
    uploadInfo = DataUploadTime.getInfo()
    for field in ('data_last_update', 'data_error'):
        if uploadInfo[field]:
            summary[field] = uploadInfo[field]

    # Escalate the status: drain/disk issues -> warning, data/couch issues -> error
    if summary['status'] == 'ok' and (summary['drain_mode'] or summary['disk_warning']):
        summary['status'] = "warning"

    if summary['status'] in ('ok', 'warning'):
        if summary.get('data_error', 'ok') != 'ok' or summary.get('couch_process_warning', 0):
            summary['status'] = "error"

    logging.info("List of agent components down: %s", summary['down_components'])
    return summary