def submit(self):
    """
    Return the flattened list of components reported for the requested host.

    The host name is read from the 'Hostname' entry of the request. The
    overall status obtained from that host's SystemAdministrator service is
    flattened with self.flatten() and every resulting record is tagged with
    a "Host" entry.

    Returns a dictionary with "success" set to "true" and the records under
    "result", or "success" set to "false" and an "error" message when the
    host name is missing or the status call fails.
    """
    checkUserCredentials()
    userDN = getUserDN()
    userGroup = getSelectedGroup()

    params = self.request()
    if 'Hostname' not in params:
        return {"success": "false", "error": "Name of the host is absent"}
    host = params['Hostname']

    # The call is made on behalf of the logged-in user (delegated credentials).
    adminClient = SystemAdministratorClient(host, None,
                                            delegatedDN=userDN,
                                            delegatedGroup=userGroup)
    status = adminClient.getOverallStatus()
    gLogger.debug("Result of getOverallStatus(): %s" % status)
    if not status["OK"]:
        return {"success": "false", "error": status["Message"]}

    components = []
    for entry in self.flatten(status["Value"]):
        entry["Host"] = host
        components.append(entry)
    return {"success": "true", "result": components}
def do_show(self, args):
    """ Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show host          - show host related parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # The docstring above is user-facing help: it is printed verbatim when no
    # option is given, so implementation notes are kept in comments.
    #
    # args is the raw text typed after "show"; its first word selects the
    # report and the remaining words are forwarded to getLog()/getErrors().
    # Nothing is returned; results and errors are written to the console.
    #
    # NOTE(review): this block was reconstructed from whitespace-collapsed
    # source; spots where the nesting had to be inferred are marked below.
    argss = args.split()
    if not argss:
        print(self.do_show.__doc__)
        return

    option = argss[0]
    del argss[0]

    # Options that simply dump the value returned by one client call.
    componentGetters = {
        'software': 'getSoftwareComponents',
        'installed': 'getInstalledComponents',
        'setup': 'getSetupComponents',
    }

    if option in componentGetters:
        client = SystemAdministratorClient(self.host, self.port)
        result = getattr(client, componentGetters[option])()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            # pprint.pprint() writes to stdout itself and returns None, so it
            # must not be wrapped in a print (that printed a stray "None").
            pprint.pprint(result['Value'])

    elif option == 'project':
        result = SystemAdministratorClient(self.host, self.port).getProject()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            print("Current project is %s" % result['Value'])

    elif option == 'status':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getOverallStatus()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            fields = ["System", 'Name', 'Module', 'Type',
                      'Setup', 'Installed', 'Runit', 'Uptime', 'PID']
            records = []
            rDict = result['Value']
            for compType in rDict:
                for system in rDict[compType]:
                    for component in sorted(rDict[compType][system]):
                        info = rDict[compType][system][component]
                        # NOTE(review): assumed the whole row is built only for
                        # installed components (a partial row would not match
                        # the nine table columns) - confirm.
                        if not info['Installed']:
                            continue
                        records.append([
                            system,
                            component,
                            str(info['Module']),
                            # drop the trailing character of the lower-cased type key
                            compType.lower()[:-1],
                            'Setup' if info['Setup'] else 'NotSetup',
                            'Installed',
                            str(info['RunitStatus']),
                            str(info['Timeup']),
                            str(info['PID']),
                        ])
            printTable(fields, records)

    elif option in ('database', 'databases'):
        client = SystemAdministratorClient(self.host, self.port)
        if not InstallTools.mysqlPassword:
            InstallTools.mysqlPassword = "******"
        # NOTE(review): assumed this call is unconditional (not nested under
        # the 'if' above) - confirm.
        InstallTools.getMySQLPasswords()
        result = client.getDatabases(InstallTools.mysqlRootPwd)
        if not result['OK']:
            self.__errMsg(result['Message'])
            return
        resultSW = client.getAvailableDatabases()
        if not resultSW['OK']:
            self.__errMsg(resultSW['Message'])
            return

        sw = resultSW['Value']
        installed = result['Value']
        print("")
        for db in sw:
            state = 'Installed' if db in installed else 'Not installed'
            print("%s : %s" % (db.rjust(25), state))
        if not sw:
            print("No database found")

    elif option == 'mysql':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getMySQLStatus()
        if not result['OK']:
            self.__errMsg(result['Message'])
        elif result['Value']:
            print("")
            for par, value in result['Value'].items():
                print("%s : %s" % (par.rjust(28), value))
        else:
            print("No MySQL database found")

    elif option == "log":
        self.getLog(argss)

    elif option == "info":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getInfo()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            info = result['Value']
            print("")
            print("Setup: %s" % info['Setup'])
            print("DIRAC version: %s" % info['DIRAC'])
            if info['Extensions']:
                for e, v in info['Extensions'].items():
                    print("%s version %s" % (e, v))
            # NOTE(review): assumed the closing blank line is printed whether
            # or not extensions are present - confirm.
            print("")

    elif option == "host":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getHostInfo()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            print("")
            print("Host info:")
            print("")
            fields = ['Parameter', 'Value']
            records = [[key, str(value)] for key, value in result['Value'].items()]
            printTable(fields, records)

    elif option == "errors":
        self.getErrors(argss)

    else:
        print("Unknown option: %s" % option)
def do_show(self, args):
    """ Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show host          - show host related parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # The docstring is printed as help text when no option is supplied, so
    # maintainer notes are written as comments instead.
    #
    # The first word of args picks the report; any further words are passed
    # on to getLog()/getErrors(). Output goes to the console; no return value.
    #
    # NOTE(review): rebuilt from source whose indentation was lost; inferred
    # nesting is flagged where it matters.
    argss = args.split()
    if not argss:
        print(self.do_show.__doc__)
        return

    option = argss[0]
    del argss[0]

    # option -> name of the client method whose value is pretty-printed as is
    plainDumps = (
        ('software', 'getSoftwareComponents'),
        ('installed', 'getInstalledComponents'),
        ('setup', 'getSetupComponents'),
    )
    dumpMethod = dict(plainDumps).get(option)

    if dumpMethod is not None:
        client = SystemAdministratorClient(self.host, self.port)
        result = getattr(client, dumpMethod)()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            # pprint.pprint() prints by itself and returns None; printing its
            # return value added a spurious "None" line to the output.
            pprint.pprint(result['Value'])

    elif option == 'project':
        result = SystemAdministratorClient(self.host, self.port).getProject()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            print("Current project is %s" % result['Value'])

    elif option == 'status':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getOverallStatus()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            fields = ["System", 'Name', 'Module', 'Type', 'Setup',
                      'Installed', 'Runit', 'Uptime', 'PID']
            records = []
            rDict = result['Value']
            for compType in rDict:
                # type label is the lower-cased key minus its last character
                typeLabel = compType.lower()[:-1]
                for system in rDict[compType]:
                    systemDict = rDict[compType][system]
                    for component in sorted(systemDict):
                        compInfo = systemDict[component]
                        # NOTE(review): assumed only installed components get
                        # a row (everything was nested under this test) - confirm.
                        if not compInfo['Installed']:
                            continue
                        setupLabel = 'Setup' if compInfo['Setup'] else 'NotSetup'
                        records.append([system,
                                        component,
                                        str(compInfo['Module']),
                                        typeLabel,
                                        setupLabel,
                                        'Installed',
                                        str(compInfo['RunitStatus']),
                                        str(compInfo['Timeup']),
                                        str(compInfo['PID'])])
            printTable(fields, records)

    elif option in ('database', 'databases'):
        client = SystemAdministratorClient(self.host, self.port)
        if not InstallTools.mysqlPassword:
            InstallTools.mysqlPassword = "******"
        # NOTE(review): assumed to run unconditionally, after the default
        # above has been put in place - confirm.
        InstallTools.getMySQLPasswords()
        result = client.getDatabases(InstallTools.mysqlRootPwd)
        if not result['OK']:
            self.__errMsg(result['Message'])
            return
        resultSW = client.getAvailableDatabases()
        if not resultSW['OK']:
            self.__errMsg(resultSW['Message'])
            return

        sw = resultSW['Value']
        installed = result['Value']
        print("")
        for db in sw:
            if db in installed:
                print("%s : Installed" % db.rjust(25))
            else:
                print("%s : Not installed" % db.rjust(25))
        if not sw:
            print("No database found")

    elif option == 'mysql':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getMySQLStatus()
        if not result['OK']:
            self.__errMsg(result['Message'])
        elif result['Value']:
            print("")
            for par, value in result['Value'].items():
                print("%s : %s" % (par.rjust(28), value))
        else:
            print("No MySQL database found")

    elif option == "log":
        self.getLog(argss)

    elif option == "info":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getInfo()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            print("")
            print("Setup: %s" % result['Value']['Setup'])
            print("DIRAC version: %s" % result['Value']['DIRAC'])
            extensions = result['Value']['Extensions']
            if extensions:
                for e, v in extensions.items():
                    print("%s version %s" % (e, v))
            # NOTE(review): assumed the trailing blank line is unconditional - confirm.
            print("")

    elif option == "host":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getHostInfo()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            print("")
            print("Host info:")
            print("")
            fields = ['Parameter', 'Value']
            records = []
            for key, value in result['Value'].items():
                records.append([key, str(value)])
            printTable(fields, records)

    elif option == "errors":
        self.getErrors(argss)

    else:
        print("Unknown option: %s" % option)
class LemonAgent(AgentModule):
    """Agent that writes one "LEMON ..." log line per set-up service and agent.

    The component states come from getOverallStatus() of a
    SystemAdministratorClient bound to 'localhost'. Each reported line carries
    the component's criticality (looked up in the configuration) and an
    OK/FAILURE status derived from its RunitStatus.
    """

    def initialize(self):
        """Define the reporting constants and create the admin client.

        :returns: S_OK()
        """
        self.NON_CRITICAL = "NonCritical"
        self.CRITICAL = "Critical"
        self.FAILURE = "FAILURE"
        self.OK = "OK"
        self.setup = gConfig.getValue('/DIRAC/Setup', 'LHCb-Development')
        # When False, components whose criticality is NON_CRITICAL are skipped.
        self.outputNonCritical = True
        self.admClient = SystemAdministratorClient('localhost')
        return S_OK()

    def execute(self):
        """ Main execution method

        Does nothing beyond logging a notice when the current setup is not in
        the monitored setups or when this host is listed as in maintenance.
        Always returns S_OK(), also when the status cannot be obtained (that
        case is reported as a CRITICAL FAILURE log line instead).
        """
        monitoredSetups = gConfig.getValue('/Operations/lhcb/Lemon/MonitoredSetups',
                                           ['LHCb-Production'])
        self.monitoringEnabled = self.setup in monitoredSetups
        if not self.monitoringEnabled:
            self._log("Framework/LemonAgent", self.NON_CRITICAL, self.OK,
                      "Monitoring not enabled for this setup: " + self.setup + ". Exiting.")
            return S_OK()

        hostsInMaintenance = gConfig.getValue('/Operations/lhcb/Lemon/HostsInMaintenance', [])
        if gethostname() in hostsInMaintenance:
            self._log("Framework/LemonAgent", self.NON_CRITICAL, self.OK,
                      "I am in maintenance mode, exiting.")
            return S_OK()

        result = self.admClient.getOverallStatus()
        if not result or not result['OK']:
            self._log("Framework/LemonAgent", self.CRITICAL, self.FAILURE,
                      "Can not obtain result!!")
            return S_OK()

        self._processResults(result['Value']['Services'])
        self._processResults(result['Value']['Agents'])
        return S_OK()

    def _processResults(self, results):
        """Log one line for every set-up component found in results.

        :param results: mapping system -> component name -> component info;
                        the info entries 'Setup' and 'RunitStatus' are read
        """
        for system in results:
            for part in results[system]:
                component = results[system][part]
                # we want to monitor only set up services and agents
                if not component['Setup']:
                    continue

                componentName = system + "/" + part
                # Looked up once per component: each lookup costs two
                # configuration queries and the value is reused below.
                critLevel = self._getCriticality(componentName)
                if critLevel == self.NON_CRITICAL and not self.outputNonCritical:
                    continue

                if component['RunitStatus'] == 'Run':
                    self._log(componentName, critLevel, self.OK,
                              "Service/Agent running fine")
                else:
                    self._log(componentName, critLevel, self.FAILURE,
                              "Service/Agent failure!")

    def _getCriticality(self, component):
        """Return the criticality configured for component.

        The common value under /Operations/lhcb/Lemon/Criticalities is used
        unless the <setup> subtree redefines it; components configured in
        neither place are treated as NON_CRITICAL.

        :param str component: component name in the form "<system>/<name>"
        """
        # lets try to retrieve common criticality first
        criticality = gConfig.getValue('/Operations/lhcb/Lemon/Criticalities/' + component,
                                       self.NON_CRITICAL)
        # maybe it got redefined in <setup> subtree:
        return gConfig.getValue('/Operations/lhcb/' + self.setup + '/Lemon/Criticalities/' + component,
                                criticality)

    def _log(self, component, criticality, status, string):
        """Write a single "LEMON <criticality> <status> <component>: <string>" info line."""
        gLogger.info("LEMON %s %s %s: %s\n" % (criticality, status, component, string))
def do_show(self, args):
    """ Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show doc <type> <system> <name>
                             - show documentation for a given service or agent
          show host          - show host related parameters
          show hosts         - show all available hosts
          show installations [ list | current | -n <Name> | -h <Host> | -s <System> | -m <Module> | -t <Type> | -itb <InstallationTime before>
                              | -ita <InstallationTime after> | -utb <UnInstallationTime before> | -uta <UnInstallationTime after> ]*
                             - show all the installations of components that match the given parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # The docstring is the user-facing help (emitted through gLogger.notice
    # when the command is incomplete), so implementation notes are comments.
    #
    # The first word of args selects the report; the remaining words are the
    # arguments of that report. All output goes through gLogger.notice,
    # pprint or printTable; nothing is returned.
    #
    # NOTE(review): rebuilt from whitespace-collapsed source; inferred nesting
    # is flagged where it could matter.
    argss = args.split()
    if not argss:
        gLogger.notice(self.do_show.__doc__)
        return

    option = argss[0]
    del argss[0]

    # Options that simply pretty-print the value of one client call.
    componentGetters = {
        'software': 'getSoftwareComponents',
        'installed': 'getInstalledComponents',
        'setup': 'getSetupComponents',
    }

    if option in componentGetters:
        client = SystemAdministratorClient(self.host, self.port)
        result = getattr(client, componentGetters[option])()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            gLogger.notice('')
            pprint.pprint(result['Value'])

    elif option == 'project':
        result = SystemAdministratorClient(self.host, self.port).getProject()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            gLogger.notice("Current project is %s" % result['Value'])

    elif option == 'status':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getOverallStatus()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            fields = ["System", 'Name', 'Module', 'Type', 'Setup',
                      'Installed', 'Runit', 'Uptime', 'PID']
            records = []
            rDict = result['Value']
            for compType in rDict:
                for system in rDict[compType]:
                    for component in sorted(rDict[compType][system]):
                        info = rDict[compType][system][component]
                        # NOTE(review): assumed rows are produced for installed
                        # components only (whole row nested under this test) - confirm.
                        if not info['Installed']:
                            continue
                        records.append([
                            system,
                            component,
                            str(info['Module']),
                            # lower-cased type key without its last character
                            compType.lower()[:-1],
                            'Setup' if info['Setup'] else 'NotSetup',
                            'Installed',
                            str(info['RunitStatus']),
                            str(info['Timeup']),
                            str(info['PID']),
                        ])
            printTable(fields, records)

    elif option in ('database', 'databases'):
        client = SystemAdministratorClient(self.host, self.port)
        if not InstallTools.mysqlPassword:
            InstallTools.mysqlPassword = "******"
        # NOTE(review): assumed this call is unconditional - confirm.
        InstallTools.getMySQLPasswords()
        result = client.getDatabases(InstallTools.mysqlRootPwd)
        if not result['OK']:
            self.__errMsg(result['Message'])
            return
        resultSW = client.getAvailableDatabases()
        if not resultSW['OK']:
            self.__errMsg(resultSW['Message'])
            return

        sw = resultSW['Value']
        installed = result['Value']
        gLogger.notice('')
        for db in sw:
            if db in installed:
                gLogger.notice(db.rjust(25), ': Installed')
            else:
                gLogger.notice(db.rjust(25), ': Not installed')
        if not sw:
            gLogger.notice("No database found")

    elif option == 'mysql':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getMySQLStatus()
        if not result['OK']:
            self.__errMsg(result['Message'])
        elif result['Value']:
            gLogger.notice('')
            for par, value in result['Value'].items():
                # Same (message, variable message) form as the 'database'
                # branch; a tuple used to be passed as the message here.
                gLogger.notice(par.rjust(28), ': %s' % value)
        else:
            gLogger.notice("No MySQL database found")

    elif option == "log":
        self.getLog(argss)

    elif option == "info":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getInfo()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            info = result['Value']
            gLogger.notice('')
            gLogger.notice("Setup:", info['Setup'])
            gLogger.notice("DIRAC version:", info['DIRAC'])
            if info['Extensions']:
                for e, v in info['Extensions'].items():
                    gLogger.notice("%s version" % e, v)
            # NOTE(review): assumed the closing blank line is unconditional - confirm.
            gLogger.notice('')

    elif option == "host":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getHostInfo()
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            gLogger.notice('')
            gLogger.notice("Host info:")
            gLogger.notice('')
            fields = ['Parameter', 'Value']
            records = [[key, str(value)] for key, value in result['Value'].items()]
            printTable(fields, records)

    elif option == "hosts":
        client = ComponentMonitoringClient()
        result = client.getHosts({}, False, False)
        if not result['OK']:
            self.__errMsg('Error retrieving the list of hosts: %s' % (result['Message']))
        else:
            hostList = result['Value']
            separator = '-' * 69
            gLogger.notice('')
            gLogger.notice(' ' + 'Host'.center(32) + ' ' + 'CPU'.center(34) + ' ')
            gLogger.notice(separator)
            for element in hostList:
                gLogger.notice('|' + element['HostName'].center(32) +
                               '|' + element['CPU'].center(34) + '|')
                # NOTE(review): assumed a separator follows every host row - confirm.
                gLogger.notice(separator)
            gLogger.notice('')

    elif option == "errors":
        self.getErrors(argss)

    elif option == "installations":
        self.getInstallations(argss)

    elif option == "doc":
        # Needs <type> <system> <name> with type 'service' or 'agent';
        # anything else falls back to the help text.
        if len(argss) > 2 and argss[0] in ['service', 'agent']:
            compType, compSystem, compModule = argss[0], argss[1], argss[2]
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getComponentDocumentation(compType, compSystem, compModule)
            if result['OK']:
                gLogger.notice(result['Value'])
            else:
                self.__errMsg(result['Message'])
        else:
            gLogger.notice(self.do_show.__doc__)

    else:
        gLogger.notice("Unknown option:", option)
def do_show(self, args):
    """ Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # The docstring doubles as the help text printed when no option is given,
    # so maintainer notes are kept in comments.
    #
    # The first word of args selects the report, the rest is handed to
    # getLog()/getErrors(). Everything is printed; nothing is returned.
    #
    # NOTE(review): rebuilt from whitespace-collapsed source; inferred nesting
    # is flagged where it could matter. String literals are kept exactly as
    # they appear in that source.
    argss = args.split()
    if not argss:
        print(self.do_show.__doc__)
        return

    option = argss[0]
    del argss[0]

    # Options that simply pretty-print the value of one client call.
    componentGetters = {
        'software': 'getSoftwareComponents',
        'installed': 'getInstalledComponents',
        'setup': 'getSetupComponents',
    }

    if option in componentGetters:
        client = SystemAdministratorClient(self.host, self.port)
        result = getattr(client, componentGetters[option])()
        if not result['OK']:
            print(" ERROR: %s" % result['Message'])
        else:
            # pprint.pprint() prints by itself and returns None; wrapping it
            # in a print added a stray "None" line.
            pprint.pprint(result['Value'])

    elif option == 'status':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getOverallStatus()
        if not result['OK']:
            print("ERROR: %s" % result['Message'])
        else:
            rDict = result['Value']
            print("")
            # Cells are joined with single spaces, as the comma-separated
            # print statement did.
            print(" ".join([" System", ' ' * 20, 'Name', ' ' * 15, 'Type', ' ' * 13,
                            'Setup Installed Runit Uptime PID']))
            print('-' * 116)
            for compType in rDict:
                for system in rDict[compType]:
                    for component in rDict[compType][system]:
                        info = rDict[compType][system][component]
                        # Only installed components get a line.
                        if not info['Installed']:
                            continue
                        setupLabel = 'SetUp' if info['Setup'] else 'NotSetup'
                        print(" ".join([
                            system.ljust(28),
                            component.ljust(28),
                            # lower-cased type key without its last character
                            compType.lower()[:-1].ljust(7),
                            setupLabel.rjust(12),
                            'Installed'.rjust(12),
                            str(info['RunitStatus']).ljust(7),
                            str(info['Timeup']).rjust(7),
                            str(info['PID']).rjust(8),
                        ]))

    elif option in ('database', 'databases'):
        client = SystemAdministratorClient(self.host, self.port)
        if not InstallTools.mysqlPassword:
            InstallTools.mysqlPassword = "******"
        # NOTE(review): assumed this call is unconditional - confirm.
        InstallTools.getMySQLPasswords()
        result = client.getDatabases(InstallTools.mysqlRootPwd)
        if not result['OK']:
            print("ERROR: %s" % result['Message'])
            return
        resultSW = client.getAvailableDatabases()
        if not resultSW['OK']:
            print("ERROR: %s" % resultSW['Message'])
            return

        sw = resultSW['Value']
        installed = result['Value']
        print("")
        for db in sw:
            state = 'Installed' if db in installed else 'Not installed'
            print("%s : %s" % (db.rjust(25), state))
        if not sw:
            print("No database found")

    elif option == 'mysql':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getMySQLStatus()
        if not result['OK']:
            print("ERROR: %s" % result['Message'])
        elif result['Value']:
            print("")
            for par, value in result['Value'].items():
                print("%s : %s" % (par.rjust(28), value))
        else:
            print("No MySQL database found")

    elif option == "log":
        self.getLog(argss)

    elif option == "info":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getInfo()
        if not result['OK']:
            print("ERROR: %s" % result['Message'])
        else:
            info = result['Value']
            print("")
            print("Setup: %s" % info['Setup'])
            print("DIRAC version: %s" % info['DIRAC'])
            if info['Extensions']:
                for e, v in info['Extensions'].items():
                    print("%s version %s" % (e, v))
            # NOTE(review): assumed the closing blank line is unconditional - confirm.
            print("")

    elif option == "errors":
        self.getErrors(argss)

    else:
        print("Unknown option: %s" % option)
class MonitorAgents(AgentModule): """MonitorAgents class.""" def __init__(self, *args, **kwargs): """Initialize the agent, clients, default values.""" AgentModule.__init__(self, *args, **kwargs) self.name = 'MonitorAgents' self.setup = "Production" self.enabled = False self.restartAgents = False self.restartExecutors = False self.restartServices = False self.controlComponents = False self.commitURLs = False self.diracLocation = "/opt/dirac/pro" self.sysAdminClient = SystemAdministratorClient(socket.gethostname()) self.jobMonClient = JobMonitoringClient() self.nClient = NotificationClient() self.csAPI = None self.agents = dict() self.executors = dict() self.services = dict() self.errors = list() self.accounting = defaultdict(dict) self.addressTo = ["*****@*****.**"] self.addressFrom = "*****@*****.**" self.emailSubject = "MonitorAgents on %s" % socket.gethostname() def logError(self, errStr, varMsg=''): """Append errors to a list, which is sent in email notification.""" self.log.error(errStr, varMsg) self.errors.append(errStr + " " + varMsg) def beginExecution(self): """Reload the configurations before every cycle.""" self.setup = self.am_getOption("Setup", self.setup) self.enabled = self.am_getOption("EnableFlag", self.enabled) self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents) self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors) self.restartServices = self.am_getOption("RestartServices", self.restartServices) self.diracLocation = os.environ.get("DIRAC", self.diracLocation) self.addressTo = self.am_getOption('MailTo', self.addressTo) self.addressFrom = self.am_getOption('MailFrom', self.addressFrom) self.controlComponents = self.am_getOption('ControlComponents', self.controlComponents) self.commitURLs = self.am_getOption('CommitURLs', self.commitURLs) self.csAPI = CSAPI() res = self.getRunningInstances(instanceType='Agents') if not res["OK"]: return S_ERROR("Failure to get running agents") self.agents = 
res["Value"] res = self.getRunningInstances(instanceType='Executors') if not res["OK"]: return S_ERROR("Failure to get running executors") self.executors = res["Value"] res = self.getRunningInstances(instanceType='Services') if not res["OK"]: return S_ERROR("Failure to get running services") self.services = res["Value"] self.accounting.clear() return S_OK() def sendNotification(self): """Send email notification about changes done in the last cycle.""" if not(self.errors or self.accounting): return S_OK() emailBody = "" rows = [] for instanceName, val in self.accounting.iteritems(): rows.append([[instanceName], [val.get('Treatment', 'No Treatment')], [str(val.get('LogAge', 'Not Relevant'))]]) if rows: columns = ["Instance", "Treatment", "Log File Age (Minutes)"] emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=' | ') if self.errors: emailBody += "\n\nErrors:" emailBody += "\n".join(self.errors) self.log.notice("Sending Email:\n" + emailBody) for address in self.addressTo: res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False) if not res['OK']: self.log.error("Failure to send Email notification to ", address) continue self.errors = [] self.accounting.clear() return S_OK() def getRunningInstances(self, instanceType='Agents', runitStatus='Run'): """Return a dict of running agents, executors or services. 
Key is agent's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus, LogFileLocation :param str instanceType: 'Agents', 'Executors', 'Services' :param str runitStatus: Return only those instances with given RunitStatus or 'All' :returns: Dictionary of running instances """ res = self.sysAdminClient.getOverallStatus() if not res["OK"]: self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"]) return res val = res['Value'][instanceType] runningAgents = defaultdict(dict) for system, agents in val.iteritems(): for agentName, agentInfo in agents.iteritems(): if agentInfo['Setup'] and agentInfo['Installed']: if runitStatus != 'All' and agentInfo['RunitStatus'] != runitStatus: continue confPath = cfgPath('/Systems/' + system + '/' + self.setup + '/%s/' % instanceType + agentName) for option, default in (('PollingTime', HOUR), ('Port', None)): optPath = os.path.join(confPath, option) runningAgents[agentName][option] = gConfig.getValue(optPath, default) runningAgents[agentName]["LogFileLocation"] = \ os.path.join(self.diracLocation, 'runit', system, agentName, 'log', 'current') runningAgents[agentName]["PID"] = agentInfo["PID"] runningAgents[agentName]['Module'] = agentInfo['Module'] runningAgents[agentName]['RunitStatus'] = agentInfo['RunitStatus'] runningAgents[agentName]['System'] = system return S_OK(runningAgents) def on_terminate(self, agentName, process): """Execute callback when a process terminates gracefully.""" self.log.info("%s's process with ID: %s has been terminated successfully" % (agentName, process.pid)) def execute(self): """Execute checks for agents, executors, services.""" for instanceType in ('executor', 'agent', 'service'): for name, options in getattr(self, instanceType + 's').iteritems(): # call checkAgent, checkExecutor, checkService res = getattr(self, 'check' + instanceType.capitalize())(name, options) if not res['OK']: self.logError("Failure when checking %s" % instanceType, "%s, %s" 
% (name, res['Message'])) res = self.componentControl() if not res['OK']: if "Stopped does not exist" not in res['Message'] and \ "Running does not exist" not in res['Message']: self.logError("Failure to control components", res['Message']) if not self.errors: res = self.checkURLs() if not res['OK']: self.logError("Failure to check URLs", res['Message']) else: self.logError('Something was wrong before, not checking URLs this time') self.sendNotification() if self.errors: return S_ERROR("Error during this cycle, check log") return S_OK() @staticmethod def getLastAccessTime(logFileLocation): """Return the age of log file.""" lastAccessTime = 0 try: lastAccessTime = os.path.getmtime(logFileLocation) lastAccessTime = datetime.fromtimestamp(lastAccessTime) except OSError as e: return S_ERROR('Failed to access logfile %s: %r' % (logFileLocation, e)) now = datetime.now() age = now - lastAccessTime return S_OK(age) def restartInstance(self, pid, instanceName, enabled): """Kill a process which is then restarted automatically.""" if not (self.enabled and enabled): self.log.info("Restarting is disabled, please restart %s manually" % instanceName) self.accounting[instanceName]["Treatment"] = "Please restart it manually" return S_OK(NO_RESTART) try: agentProc = psutil.Process(int(pid)) processesToTerminate = agentProc.children(recursive=True) processesToTerminate.append(agentProc) for proc in processesToTerminate: proc.terminate() _gone, alive = psutil.wait_procs(processesToTerminate, timeout=5, callback=partial(self.on_terminate, instanceName)) for proc in alive: self.log.info("Forcefully killing process %s" % proc.pid) proc.kill() return S_OK() except psutil.Error as err: self.logError("Exception occurred in terminating processes", "%s" % err) return S_ERROR() def checkService(self, serviceName, options): """Ping the service, restart if the ping does not respond.""" url = self._getURL(serviceName, options) self.log.info("Pinging service", url) pingRes = Client().ping(url=url) 
if not pingRes['OK']: self.log.info('Failure pinging service: %s: %s' % (url, pingRes['Message'])) res = self.restartInstance(int(options['PID']), serviceName, self.restartServices) if not res["OK"]: return res elif res['OK'] and res['Value'] != NO_RESTART: self.accounting[serviceName]["Treatment"] = "Successfully Restarted" self.log.info("Agent %s has been successfully restarted" % serviceName) self.log.info("Service responded OK") return S_OK() def checkAgent(self, agentName, options): """Check the age of agent's log file, if it is too old then restart the agent.""" pollingTime, currentLogLocation, pid = options['PollingTime'], options['LogFileLocation'], options['PID'] self.log.info("Checking Agent: %s" % agentName) self.log.info("Polling Time: %s" % pollingTime) self.log.info("Current Log File location: %s" % currentLogLocation) res = self.getLastAccessTime(currentLogLocation) if not res["OK"]: return res age = res["Value"] self.log.info("Current log file for %s is %d minutes old" % (agentName, (age.seconds / MINUTES))) maxLogAge = max(pollingTime + HOUR, 2 * HOUR) if age.seconds < maxLogAge: return S_OK() self.log.info("Current log file is too old for Agent %s" % agentName) self.accounting[agentName]["LogAge"] = age.seconds / MINUTES res = self.restartInstance(int(pid), agentName, self.restartAgents) if not res["OK"]: return res elif res['OK'] and res['Value'] != NO_RESTART: self.accounting[agentName]["Treatment"] = "Successfully Restarted" self.log.info("Agent %s has been successfully restarted" % agentName) return S_OK() def checkExecutor(self, executor, options): """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors.""" currentLogLocation = options['LogFileLocation'] pid = options['PID'] self.log.info("Checking executor: %s" % executor) self.log.info("Current Log File location: %s" % currentLogLocation) res = self.getLastAccessTime(currentLogLocation) if not res["OK"]: return res age = res["Value"] 
self.log.info("Current log file for %s is %d minutes old" % (executor, (age.seconds / MINUTES))) if age.seconds < 2 * HOUR: return S_OK() self.log.info("Current log file is too old for Executor %s" % executor) self.accounting[executor]["LogAge"] = age.seconds / MINUTES res = self.checkForCheckingJobs(executor) if not res['OK']: return res if res['OK'] and res['Value'] == NO_CHECKING_JOBS: self.accounting.pop(executor, None) return S_OK(NO_RESTART) res = self.restartInstance(int(pid), executor, self.restartExecutors) if not res["OK"]: return res elif res['OK'] and res['Value'] != NO_RESTART: self.accounting[executor]["Treatment"] = "Successfully Restarted" self.log.info("Executor %s has been successfully restarted" % executor) return S_OK() def checkForCheckingJobs(self, executorName): """Check if there are checking jobs with the **executorName** as current MinorStatus.""" attrDict = {'Status': 'Checking', 'MinorStatus': executorName} # returns list of jobs IDs resJobs = self.jobMonClient.getJobs(attrDict) if not resJobs['OK']: self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs['Message'])) return resJobs if resJobs['Value']: self.log.info("Found %d jobs in 'Checking' status for %s" % (len(resJobs['Value']), executorName)) return S_OK(CHECKING_JOBS) self.log.info("Found no jobs in 'Checking' status for %s" % executorName) return S_OK(NO_CHECKING_JOBS) def componentControl(self): """Monitor and control component status as defined in the CS. 
Check for running and stopped components and ensure they have the proper status as defined in the CS Registry/Hosts/_HOST_/[Running|Stopped] sections :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`, :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR` """ # get the current status of the components resCurrent = self._getCurrentComponentStatus() if not resCurrent['OK']: return resCurrent currentStatus = resCurrent['Value'] resDefault = self._getDefaultComponentStatus() if not resDefault['OK']: return resDefault defaultStatus = resDefault['Value'] # ensure instances are in the right state shouldBe = {} shouldBe['Run'] = defaultStatus['Run'].intersection(currentStatus['Down']) shouldBe['Down'] = defaultStatus['Down'].intersection(currentStatus['Run']) shouldBe['Unknown'] = defaultStatus['All'].symmetric_difference(currentStatus['All']) self._ensureComponentRunning(shouldBe['Run']) self._ensureComponentDown(shouldBe['Down']) for instance in shouldBe['Unknown']: self.logError("Unknown instance", "%r, either uninstall or add to config" % instance) return S_OK() def _getCurrentComponentStatus(self): """Get current status for components.""" resOverall = self.sysAdminClient.getOverallStatus() if not resOverall['OK']: return resOverall currentStatus = {'Down': set(), 'Run': set(), 'All': set()} informationDict = resOverall['Value'] for systemsDict in informationDict.values(): for system, instancesDict in systemsDict.items(): for instanceName, instanceInfoDict in instancesDict.items(): identifier = '%s__%s' % (system, instanceName) runitStatus = instanceInfoDict.get('RunitStatus') if runitStatus in ('Run', 'Down'): currentStatus[runitStatus].add(identifier) currentStatus['All'] = currentStatus['Run'] | currentStatus['Down'] return S_OK(currentStatus) def _getDefaultComponentStatus(self): """Get the configured status of the components.""" host = socket.gethostname() defaultStatus = {'Down': set(), 'Run': set(), 'All': set()} resRunning = 
gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Running')) resStopped = gConfig.getOptionsDict(os.path.join('/Registry/Hosts/', host, 'Stopped')) if not resRunning['OK']: return resRunning if not resStopped['OK']: return resStopped defaultStatus['Run'] = set(resRunning['Value'].keys()) defaultStatus['Down'] = set(resStopped['Value'].keys()) defaultStatus['All'] = defaultStatus['Run'] | defaultStatus['Down'] if defaultStatus['Run'].intersection(defaultStatus['Down']): self.logError("Overlap in configuration", str(defaultStatus['Run'].intersection(defaultStatus['Down']))) return S_ERROR("Bad host configuration") return S_OK(defaultStatus) def _ensureComponentRunning(self, shouldBeRunning): """Ensure the correct components are running.""" for instance in shouldBeRunning: self.log.info("Starting instance %s" % instance) system, name = instance.split('__') if self.controlComponents: res = self.sysAdminClient.startComponent(system, name) if not res['OK']: self.logError("Failed to start component:", "%s: %s" % (instance, res['Message'])) else: self.accounting[instance]["Treatment"] = "Instance was down, started instance" else: self.accounting[instance]["Treatment"] = "Instance is down, should be started" def _ensureComponentDown(self, shouldBeDown): """Ensure the correct components are not running.""" for instance in shouldBeDown: self.log.info("Stopping instance %s" % instance) system, name = instance.split('__') if self.controlComponents: res = self.sysAdminClient.stopComponent(system, name) if not res['OK']: self.logError("Failed to stop component:", "%s: %s" % (instance, res['Message'])) else: self.accounting[instance]["Treatment"] = "Instance was running, stopped instance" else: self.accounting[instance]["Treatment"] = "Instance is running, should be stopped" def checkURLs(self): """Ensure that the running services have their URL in the Config.""" self.log.info("Checking URLs") # get services again, in case they were started/stop in controlComponents 
gConfig.forceRefresh(fromMaster=True) res = self.getRunningInstances(instanceType='Services', runitStatus='All') if not res["OK"]: return S_ERROR("Failure to get running services") self.services = res["Value"] for service, options in self.services.iteritems(): self.log.debug("Checking URL for %s with options %s" % (service, options)) # ignore SystemAdministrator, does not have URLs if 'SystemAdministrator' in service: continue self._checkServiceURL(service, options) if self.csAPI.csModified and self.commitURLs: self.log.info("Commiting changes to the CS") result = self.csAPI.commit() if not result['OK']: self.logError('Commit to CS failed', result['Message']) return S_ERROR("Failed to commit to CS") return S_OK() def _checkServiceURL(self, serviceName, options): """Ensure service URL is properly configured in the CS.""" url = self._getURL(serviceName, options) system = options['System'] module = options['Module'] self.log.info("Checking URLs for %s/%s" % (system, module)) urlsConfigPath = os.path.join('/Systems', system, self.setup, 'URLs', module) urls = gConfig.getValue(urlsConfigPath, []) self.log.debug("Found configured URLs for %s: %s" % (module, urls)) self.log.debug("This URL is %s" % url) runitStatus = options['RunitStatus'] wouldHave = 'Would have ' if not self.commitURLs else '' if runitStatus == 'Run' and url not in urls: urls.append(url) message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module) self.log.info(message) self.accounting[serviceName + "/URL"]["Treatment"] = message self.csAPI.modifyValue(urlsConfigPath, ",".join(urls)) if runitStatus == 'Down' and url in urls: urls.remove(url) message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module) self.log.info(message) self.accounting[serviceName + "/URL"]["Treatment"] = message self.csAPI.modifyValue(urlsConfigPath, ",".join(urls)) @staticmethod def _getURL(serviceName, options): """Return URL for the service.""" system = options['System'] port = 
options['Port'] host = socket.gethostname() url = 'dips://%s:%s/%s/%s' % (host, port, system, serviceName) return url
class ComponentSupervisionAgent(AgentModule):
    """Supervise the agents, executors and services set up on this host.

    Each cycle (see :meth:`beginExecution`, :meth:`execute`) the agent

    * checks every running agent/executor via the age of its log file and
      every running service via a ping, restarting the ones that look stuck,
    * compares the runit status of all components with the status configured
      under ``/Registry/Hosts/<host>/[Running|Stopped]`` and starts or stops
      components accordingly,
    * makes sure the URLs of the services are present in the configuration,
    * sends an email summarising the treatments and the errors of the cycle.

    All actions are guarded by option flags read in :meth:`beginExecution`;
    with the defaults set in ``__init__`` nothing is restarted, controlled or
    committed.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the agent, clients, default values."""
        AgentModule.__init__(self, *args, **kwargs)
        self.name = "ComponentSupervisionAgent"
        self.setup = "DIRAC-Production"
        # master switch and per-component-type switches for restarting
        self.enabled = False
        self.restartAgents = False
        self.restartExecutors = False
        self.restartServices = False
        self.controlComponents = False
        self.commitURLs = False
        # instances whose name contains one of these patterns are never restarted
        self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
        self.diracLocation = rootPath

        self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
        self.jobMonClient = JobMonitoringClient()
        self.nClient = NotificationClient()
        self.csAPI = None
        self.agents = dict()
        self.executors = dict()
        self.services = dict()
        # port used in _getURL for services without a 'Port' option
        self._tornadoPort = "8443"
        # errors and per-instance treatments collected during one cycle
        self.errors = list()
        self.accounting = defaultdict(dict)

        self.addressTo = []
        self.addressFrom = ""
        self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn()

    def logError(self, errStr, varMsg=""):
        """Append errors to a list, which is sent in email notification.

        :param str errStr: error message
        :param str varMsg: variable part of the message
        """
        self.log.error(errStr, varMsg)
        self.errors.append(errStr + " " + varMsg)

    def beginExecution(self):
        """Reload the configurations before every cycle.

        Re-reads all agent options, creates a fresh CSAPI and refreshes the
        dictionaries of running agents, executors and services.

        :returns: S_OK, or S_ERROR if the running instances cannot be obtained
        """
        self.setup = self.am_getOption("Setup", self.setup)
        self.enabled = self.am_getOption("EnableFlag", self.enabled)
        self.restartAgents = self.am_getOption("RestartAgents", self.restartAgents)
        self.restartExecutors = self.am_getOption("RestartExecutors", self.restartExecutors)
        self.restartServices = self.am_getOption("RestartServices", self.restartServices)
        self.addressTo = self.am_getOption("MailTo", self.addressTo)
        self.addressFrom = self.am_getOption("MailFrom", self.addressFrom)
        self.controlComponents = self.am_getOption("ControlComponents", self.controlComponents)
        self.commitURLs = self.am_getOption("CommitURLs", self.commitURLs)
        self.doNotRestartInstancePattern = self.am_getOption(
            "DoNotRestartInstancePattern", self.doNotRestartInstancePattern
        )

        self.csAPI = CSAPI()

        res = self.getRunningInstances(instanceType="Agents")
        if not res["OK"]:
            return S_ERROR("Failure to get running agents")
        self.agents = res["Value"]

        res = self.getRunningInstances(instanceType="Executors")
        if not res["OK"]:
            return S_ERROR("Failure to get running executors")
        self.executors = res["Value"]

        res = self.getRunningInstances(instanceType="Services")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]

        self.accounting.clear()
        return S_OK()

    def sendNotification(self):
        """Send email notification about changes done in the last cycle.

        Nothing is sent when there are neither errors nor accounting entries.
        The collected errors and accounting are reset afterwards.

        :returns: S_OK
        """
        if not (self.errors or self.accounting):
            return S_OK()

        emailBody = ""
        rows = []
        for instanceName, val in self.accounting.items():
            rows.append(
                [
                    [instanceName],
                    [val.get("Treatment", "No Treatment")],
                    [str(val.get("LogAge", "Not Relevant"))],
                ]
            )

        if rows:
            columns = ["Instance", "Treatment", "Log File Age (Minutes)"]
            emailBody += printTable(columns, rows, printOut=False, numbering=False, columnSeparator=" | ")

        if self.errors:
            # header on its own line, otherwise the first error is glued to "Errors:"
            emailBody += "\n\nErrors:\n"
            emailBody += "\n".join(self.errors)

        self.log.notice("Sending Email:\n" + emailBody)
        for address in self.addressTo:
            res = self.nClient.sendMail(address, self.emailSubject, emailBody, self.addressFrom, localAttempt=False)
            if not res["OK"]:
                self.log.error("Failure to send Email notification to ", address)
                continue

        self.errors = []
        self.accounting.clear()

        return S_OK()

    def getRunningInstances(self, instanceType="Agents", runitStatus="Run"):
        """Return a dict of running agents, executors or services.

        Key is component's name, value contains dict with PollingTime, PID, Port, Module, RunitStatus,
        LogFileLocation. PollingTime, Port and Protocol are left out when their configured value is empty.

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str runitStatus: Return only those instances with given RunitStatus or 'All'
        :returns: Dictionary of running instances
        """
        res = self.sysAdminClient.getOverallStatus()
        if not res["OK"]:
            self.logError("Failure to get %s from system administrator client" % instanceType, res["Message"])
            return res

        val = res["Value"][instanceType]
        runningComponents = defaultdict(dict)
        for system, components in val.items():
            for componentName, componentInfo in components.items():
                # only components which are both set up and installed are considered
                if not (componentInfo["Setup"] and componentInfo["Installed"]):
                    continue
                if runitStatus != "All" and componentInfo["RunitStatus"] != runitStatus:
                    continue
                componentDict = runningComponents[componentName]
                for option, default in (("PollingTime", HOUR), ("Port", None), ("Protocol", None)):
                    componentDict[option] = self._getComponentOption(
                        instanceType, system, componentName, option, default
                    )
                    # remove empty values so we can use defaults in _getURL
                    if not componentDict[option]:
                        componentDict.pop(option)
                componentDict["LogFileLocation"] = os.path.join(
                    self.diracLocation, "runit", system, componentName, "log", "current"
                )
                componentDict["PID"] = componentInfo["PID"]
                componentDict["Module"] = componentInfo["Module"]
                componentDict["RunitStatus"] = componentInfo["RunitStatus"]
                componentDict["System"] = system

        return S_OK(runningComponents)

    def _getComponentOption(self, instanceType, system, componentName, option, default):
        """Get component option from DIRAC CS, using components' base classes methods.

        :param str instanceType: 'Agents', 'Executors', 'Services'
        :param str system: name of the system the component belongs to
        :param str componentName: name of the component
        :param str option: name of the option to look up
        :param default: value returned if the option is not set
        :returns: the value of the option
        """
        componentPath = PathFinder.getComponentSection(
            system=system,
            component=componentName,
            setup=self.setup,
            componentCategory=instanceType,
        )
        if instanceType != "Agents":
            return gConfig.getValue(Path.cfgPath(componentPath, option), default)

        # deal with agent configuration
        componentLoadModule = gConfig.getValue(Path.cfgPath(componentPath, "Module"), componentName)
        fullComponentName = Path.cfgPath(system, componentName)
        fullComponentLoadName = Path.cfgPath(system, componentLoadModule)
        return AgentModule(fullComponentName, fullComponentLoadName).am_getOption(option, default)

    def on_terminate(self, componentName, process):
        """Execute callback when a process terminates gracefully.

        :param str componentName: name of the component the process belongs to
        :param process: the terminated process, its ``pid`` is logged
        """
        self.log.info("%s's process with ID: %s has been terminated successfully" % (componentName, process.pid))

    def execute(self):
        """Execute checks for agents, executors, services.

        Afterwards control the component status, check the service URLs (only if
        no error occurred so far in this cycle) and send the notification.

        :returns: S_OK, or S_ERROR if any error was logged during the cycle
        """
        for instanceType in ("executor", "agent", "service"):
            for name, options in getattr(self, instanceType + "s").items():
                # call checkAgent, checkExecutor, checkService
                res = getattr(self, "check" + instanceType.capitalize())(name, options)
                if not res["OK"]:
                    self.logError("Failure when checking %s" % instanceType, "%s, %s" % (name, res["Message"]))

        res = self.componentControl()
        if not res["OK"]:
            # a missing Running/Stopped section is not reported as an error
            if "Stopped does not exist" not in res["Message"] and "Running does not exist" not in res["Message"]:
                self.logError("Failure to control components", res["Message"])

        if not self.errors:
            res = self.checkURLs()
            if not res["OK"]:
                self.logError("Failure to check URLs", res["Message"])
        else:
            self.logError("Something was wrong before, not checking URLs this time")

        self.sendNotification()

        if self.errors:
            return S_ERROR("Error during this cycle, check log")

        return S_OK()

    @staticmethod
    def getLastAccessTime(logFileLocation):
        """Return the age of log file.

        The age is based on the modification time of the file.

        :param str logFileLocation: path to the log file
        :returns: S_OK with a :class:`datetime.timedelta`, S_ERROR if the file cannot be accessed
        """
        lastAccessTime = 0
        try:
            lastAccessTime = os.path.getmtime(logFileLocation)
            lastAccessTime = datetime.fromtimestamp(lastAccessTime)
        except OSError as e:
            return S_ERROR("Failed to access logfile %s: %r" % (logFileLocation, e))

        now = datetime.now()
        age = now - lastAccessTime
        return S_OK(age)

    def restartInstance(self, pid, instanceName, enabled):
        """Kill a process which is then restarted automatically.

        The process and all its children are terminated; processes still alive
        after 5 seconds are killed.

        :param int pid: process ID of the instance
        :param str instanceName: name of the instance
        :param bool enabled: restart flag for this type of instance
        :returns: S_OK(NO_RESTART) if restarting is disabled for the instance, S_OK() after the
                  processes were terminated, S_ERROR if psutil raised an error
        """
        if not (self.enabled and enabled):
            self.log.info("Restarting is disabled, please restart %s manually" % instanceName)
            self.accounting[instanceName]["Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        if any(pattern in instanceName for pattern in self.doNotRestartInstancePattern):
            self.log.info("Restarting for %s is disabled, please restart it manually" % instanceName)
            self.accounting[instanceName]["Treatment"] = "Please restart it manually"
            return S_OK(NO_RESTART)

        try:
            componentProc = psutil.Process(int(pid))
            processesToTerminate = componentProc.children(recursive=True)
            processesToTerminate.append(componentProc)

            for proc in processesToTerminate:
                proc.terminate()

            _gone, alive = psutil.wait_procs(
                processesToTerminate, timeout=5, callback=partial(self.on_terminate, instanceName)
            )
            for proc in alive:
                self.log.info("Forcefully killing process %s" % proc.pid)
                proc.kill()

            return S_OK()

        except psutil.Error as err:
            self.logError("Exception occurred in terminating processes", "%s" % err)
            # carry the reason in the result, callers report res["Message"]
            return S_ERROR("Exception occurred in terminating processes of %s: %s" % (instanceName, err))

    def checkService(self, serviceName, options):
        """Ping the service, restart if the ping does not respond.

        :param str serviceName: name of the service
        :param dict options: options of the service, 'System' and 'PID' are used here
        :returns: S_OK, or the S_ERROR from the failed restart
        """
        url = self._getURL(serviceName, options)
        self.log.info("Pinging service", url)
        pingRes = Client().ping(url=url)
        if pingRes["OK"]:
            self.log.info("Service responded OK")
            return S_OK()

        self.log.info("Failure pinging service: %s: %s" % (url, pingRes["Message"]))
        res = self.restartInstance(int(options["PID"]), serviceName, self.restartServices)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[serviceName]["Treatment"] = "Successfully Restarted"
            self.log.info("Service %s has been successfully restarted" % serviceName)
        return S_OK()

    def checkAgent(self, agentName, options):
        """Check the age of agent's log file, if it is too old then restart the agent.

        The log is too old if it was not modified for max(PollingTime + HOUR, 2 * HOUR).

        :param str agentName: name of the agent
        :param dict options: options of the agent, 'PollingTime', 'LogFileLocation' and 'PID' are used
        :returns: S_OK, or S_ERROR if the log file is not accessible or the restart failed
        """
        pollingTime, currentLogLocation, pid = (options["PollingTime"], options["LogFileLocation"], options["PID"])
        self.log.info("Checking Agent: %s" % agentName)
        self.log.info("Polling Time: %s" % pollingTime)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        # timedelta.seconds is only the remainder below one day, total_seconds() is the full age
        ageSeconds = res["Value"].total_seconds()
        self.log.info("Current log file for %s is %d minutes old" % (agentName, (ageSeconds / MINUTES)))

        maxLogAge = max(pollingTime + HOUR, 2 * HOUR)
        if ageSeconds < maxLogAge:
            return S_OK()

        self.log.info("Current log file is too old for Agent %s" % agentName)
        self.accounting[agentName]["LogAge"] = ageSeconds / MINUTES

        res = self.restartInstance(int(pid), agentName, self.restartAgents)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[agentName]["Treatment"] = "Successfully Restarted"
            self.log.info("Agent %s has been successfully restarted" % agentName)

        return S_OK()

    def checkExecutor(self, executor, options):
        """Check the age of executor log file, if too old check for jobs in checking status, then restart the executors.

        :param str executor: name of the executor
        :param dict options: options of the executor, 'LogFileLocation' and 'PID' are used
        :returns: S_OK() if nothing had to be done or after the restart was treated, S_OK(NO_RESTART)
                  if the log is old but no jobs are waiting for the executor, S_ERROR otherwise
        """
        currentLogLocation = options["LogFileLocation"]
        pid = options["PID"]
        self.log.info("Checking executor: %s" % executor)
        self.log.info("Current Log File location: %s" % currentLogLocation)

        res = self.getLastAccessTime(currentLogLocation)
        if not res["OK"]:
            return res

        # timedelta.seconds is only the remainder below one day, total_seconds() is the full age
        ageSeconds = res["Value"].total_seconds()
        self.log.info("Current log file for %s is %d minutes old" % (executor, (ageSeconds / MINUTES)))

        if ageSeconds < 2 * HOUR:
            return S_OK()

        self.log.info("Current log file is too old for Executor %s" % executor)
        self.accounting[executor]["LogAge"] = ageSeconds / MINUTES

        res = self.checkForCheckingJobs(executor)
        if not res["OK"]:
            return res
        if res["Value"] == NO_CHECKING_JOBS:
            # an idle executor without jobs is not reported
            self.accounting.pop(executor, None)
            return S_OK(NO_RESTART)

        res = self.restartInstance(int(pid), executor, self.restartExecutors)
        if not res["OK"]:
            return res
        if res["Value"] != NO_RESTART:
            self.accounting[executor]["Treatment"] = "Successfully Restarted"
            self.log.info("Executor %s has been successfully restarted" % executor)

        return S_OK()

    def checkForCheckingJobs(self, executorName):
        """Check if there are checking jobs with the **executorName** as current MinorStatus.

        :param str executorName: name of the executor
        :returns: S_OK(CHECKING_JOBS) or S_OK(NO_CHECKING_JOBS), S_ERROR if the jobs cannot be obtained
        """
        attrDict = {"Status": "Checking", "MinorStatus": executorName}

        # returns list of jobs IDs
        resJobs = self.jobMonClient.getJobs(attrDict)
        if not resJobs["OK"]:
            self.logError("Could not get jobs for this executor", "%s: %s" % (executorName, resJobs["Message"]))
            return resJobs
        if resJobs["Value"]:
            self.log.info('Found %d jobs in "Checking" status for %s' % (len(resJobs["Value"]), executorName))
            return S_OK(CHECKING_JOBS)
        self.log.info('Found no jobs in "Checking" status for %s' % executorName)
        return S_OK(NO_CHECKING_JOBS)

    def componentControl(self):
        """Monitor and control component status as defined in the CS.

        Check for running and stopped components and ensure they have the proper status as defined in the CS
        Registry/Hosts/_HOST_/[Running|Stopped] sections

        :returns: :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_OK`,
           :func:`~DIRAC:DIRAC.Core.Utilities.ReturnValues.S_ERROR`
        """
        # get the current status of the components
        resCurrent = self._getCurrentComponentStatus()
        if not resCurrent["OK"]:
            return resCurrent
        currentStatus = resCurrent["Value"]

        resDefault = self._getDefaultComponentStatus()
        if not resDefault["OK"]:
            return resDefault
        defaultStatus = resDefault["Value"]

        # ensure instances are in the right state
        shouldBe = {}
        shouldBe["Run"] = defaultStatus["Run"].intersection(currentStatus["Down"])
        shouldBe["Down"] = defaultStatus["Down"].intersection(currentStatus["Run"])
        # instances which are either only configured or only present on the host
        shouldBe["Unknown"] = defaultStatus["All"].symmetric_difference(currentStatus["All"])

        self._ensureComponentRunning(shouldBe["Run"])
        self._ensureComponentDown(shouldBe["Down"])

        for instance in shouldBe["Unknown"]:
            self.logError("Unknown instance", "%r, either uninstall or add to config" % instance)

        return S_OK()

    def _getCurrentComponentStatus(self):
        """Get current status for components.

        :returns: S_OK with a dict of sets 'Run', 'Down' and 'All' holding '<system>__<instance>'
                  identifiers, or the S_ERROR from the system administrator client
        """
        resOverall = self.sysAdminClient.getOverallStatus()
        if not resOverall["OK"]:
            return resOverall
        currentStatus = {"Down": set(), "Run": set(), "All": set()}
        informationDict = resOverall["Value"]
        for systemsDict in informationDict.values():
            for system, instancesDict in systemsDict.items():
                for instanceName, instanceInfoDict in instancesDict.items():
                    identifier = "%s__%s" % (system, instanceName)
                    runitStatus = instanceInfoDict.get("RunitStatus")
                    # any other runit status is ignored
                    if runitStatus in ("Run", "Down"):
                        currentStatus[runitStatus].add(identifier)

        currentStatus["All"] = currentStatus["Run"] | currentStatus["Down"]
        return S_OK(currentStatus)

    def _getDefaultComponentStatus(self):
        """Get the configured status of the components.

        :returns: S_OK with a dict of sets 'Run', 'Down' and 'All', S_ERROR if one of the sections
                  cannot be read or an instance is configured as both running and stopped
        """
        host = socket.getfqdn()
        defaultStatus = {"Down": set(), "Run": set(), "All": set()}
        resRunning = gConfig.getOptionsDict(Path.cfgPath("/Registry/Hosts/", host, "Running"))
        resStopped = gConfig.getOptionsDict(Path.cfgPath("/Registry/Hosts/", host, "Stopped"))
        if not resRunning["OK"]:
            return resRunning
        if not resStopped["OK"]:
            return resStopped
        defaultStatus["Run"] = set(resRunning["Value"])
        defaultStatus["Down"] = set(resStopped["Value"])
        defaultStatus["All"] = defaultStatus["Run"] | defaultStatus["Down"]

        if defaultStatus["Run"].intersection(defaultStatus["Down"]):
            self.logError("Overlap in configuration", str(defaultStatus["Run"].intersection(defaultStatus["Down"])))
            return S_ERROR("Bad host configuration")

        return S_OK(defaultStatus)

    def _ensureComponentRunning(self, shouldBeRunning):
        """Ensure the correct components are running.

        Components are only started if ``controlComponents`` is set, otherwise
        the necessary action is just added to the accounting.

        :param shouldBeRunning: iterable of '<system>__<instance>' identifiers
        """
        for instance in shouldBeRunning:
            self.log.info("Starting instance %s" % instance)
            system, name = instance.split("__")
            if self.controlComponents:
                res = self.sysAdminClient.startComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to start component:", "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance]["Treatment"] = "Instance was down, started instance"
            else:
                self.accounting[instance]["Treatment"] = "Instance is down, should be started"

    def _ensureComponentDown(self, shouldBeDown):
        """Ensure the correct components are not running.

        Components are only stopped if ``controlComponents`` is set, otherwise
        the necessary action is just added to the accounting.

        :param shouldBeDown: iterable of '<system>__<instance>' identifiers
        """
        for instance in shouldBeDown:
            self.log.info("Stopping instance %s" % instance)
            system, name = instance.split("__")
            if self.controlComponents:
                res = self.sysAdminClient.stopComponent(system, name)
                if not res["OK"]:
                    self.logError("Failed to stop component:", "%s: %s" % (instance, res["Message"]))
                else:
                    self.accounting[instance]["Treatment"] = "Instance was running, stopped instance"
            else:
                self.accounting[instance]["Treatment"] = "Instance is running, should be stopped"

    def checkURLs(self):
        """Ensure that the running services have their URL in the Config.

        Modifications are only committed to the CS if ``commitURLs`` is set.

        :returns: S_OK, S_ERROR if the services cannot be obtained or the commit failed
        """
        self.log.info("Checking URLs")
        # get services again, in case they were started/stop in controlComponents
        gConfig.forceRefresh(fromMaster=True)

        # get port used for https based services
        try:
            tornadoSystemInstance = PathFinder.getSystemInstance(
                system="Tornado",
                setup=self.setup,
            )
            # NOTE(review): the prefix used here is "/System/", not "/Systems/" - confirm this
            # matches the CS layout, otherwise the default port is always used
            self._tornadoPort = gConfig.getValue(
                Path.cfgPath("/System/Tornado/", tornadoSystemInstance, "Port"),
                self._tornadoPort,
            )
        except RuntimeError:
            # keep the current port if the system instance cannot be resolved
            pass

        self.log.debug("Using Tornado Port:", self._tornadoPort)

        res = self.getRunningInstances(instanceType="Services", runitStatus="All")
        if not res["OK"]:
            return S_ERROR("Failure to get running services")
        self.services = res["Value"]
        for service, options in sorted(self.services.items()):
            self.log.debug("Checking URL for %s with options %s" % (service, options))
            # ignore SystemAdministrator, does not have URLs
            if "SystemAdministrator" in service:
                continue
            self._checkServiceURL(service, options)

        if self.csAPI.csModified and self.commitURLs:
            self.log.info("Commiting changes to the CS")
            result = self.csAPI.commit()
            if not result["OK"]:
                self.logError("Commit to CS failed", result["Message"])
                return S_ERROR("Failed to commit to CS")
        return S_OK()

    def _checkServiceURL(self, serviceName, options):
        """Ensure service URL is properly configured in the CS.

        The URL of a service with RunitStatus 'Run' is added to the configured URLs
        if it is missing, the URL of a service with RunitStatus 'Down' is removed.

        :param str serviceName: name of the service
        :param dict options: options of the service, 'System', 'Module' and 'RunitStatus' are used here
        """
        url = self._getURL(serviceName, options)
        system = options["System"]
        module = options["Module"]
        self.log.info("Checking URLs for %s/%s" % (system, module))
        urlsConfigPath = Path.cfgPath(PathFinder.getSystemURLSection(system=system, setup=self.setup), module)
        urls = gConfig.getValue(urlsConfigPath, [])
        self.log.debug("Found configured URLs for %s: %s" % (module, urls))
        self.log.debug("This URL is %s" % url)
        runitStatus = options["RunitStatus"]
        # the message tells whether the change is going to be committed
        wouldHave = "Would have " if not self.commitURLs else ""
        if runitStatus == "Run" and url not in urls:
            urls.append(url)
            message = "%sAdded URL %s to URLs for %s/%s" % (wouldHave, url, system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))
        if runitStatus == "Down" and url in urls:
            urls.remove(url)
            message = "%sRemoved URL %s from URLs for %s/%s" % (wouldHave, url, system, module)
            self.log.info(message)
            self.accounting[serviceName + "/URL"]["Treatment"] = message
            self.csAPI.modifyValue(urlsConfigPath, ",".join(urls))

    def _getURL(self, serviceName, options):
        """Return URL for the service.

        :param str serviceName: name of the service
        :param dict options: options of the service; 'System' is required, 'Port' defaults to the
                             tornado port and 'Protocol' to 'dips'
        :returns: URL of the service on this host
        """
        system = options["System"]
        port = options.get("Port", self._tornadoPort)
        host = socket.getfqdn()
        protocol = options.get("Protocol", "dips")
        url = "%s://%s:%s/%s/%s" % (protocol, host, port, system, serviceName)
        return url
def do_show( self, args ):
    """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show host          - show host related parameters
          show hosts         - show all available hosts
          show installations [ list | current | -n <Name> | -h <Host> | -s <System> | -m <Module> | -t <Type> | -itb <InstallationTime before> | -ita <InstallationTime after> | -utb <UnInstallationTime before> | -uta <UnInstallationTime after> ]*
                             - show all the installations of components that match the given parameters
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # NOTE: the docstring above is the help text shown to the user when the
    # command is given without arguments, keep it in sync with the options below.
    argss = args.split()
    if not argss:
        gLogger.notice( self.do_show.__doc__ )
        return

    option = argss[0]
    del argss[0]

    # options which only pretty-print the value returned by one client method
    componentQueries = { 'software': 'getSoftwareComponents',
                         'installed': 'getInstalledComponents',
                         'setup': 'getSetupComponents' }

    if option in componentQueries:
        client = SystemAdministratorClient( self.host, self.port )
        result = getattr( client, componentQueries[option] )()
        if not result['OK']:
            self.__errMsg( result['Message'] )
        else:
            gLogger.notice( '' )
            pprint.pprint( result['Value'] )
    elif option == 'project':
        result = SystemAdministratorClient( self.host, self.port ).getProject()
        if not result['OK']:
            self.__errMsg( result['Message'] )
        else:
            gLogger.notice( "Current project is %s" % result[ 'Value' ] )
    elif option == 'status':
        client = SystemAdministratorClient( self.host, self.port )
        result = client.getOverallStatus()
        if not result['OK']:
            self.__errMsg( result['Message'] )
        else:
            fields = ["System", 'Name', 'Module', 'Type', 'Setup', 'Installed', 'Runit', 'Uptime', 'PID']
            records = []
            rDict = result['Value']
            for compType in rDict:
                for system in rDict[compType]:
                    systemComponents = rDict[compType][system]
                    # sorted() works for lists and for dictionary views alike
                    for component in sorted( systemComponents ):
                        info = systemComponents[component]
                        # only installed components are listed
                        if not info['Installed']:
                            continue
                        records.append( [ system,
                                          component,
                                          str( info['Module'] ),
                                          # component type without the plural 's'
                                          compType.lower()[:-1],
                                          'Setup' if info['Setup'] else 'NotSetup',
                                          'Installed',
                                          str( info['RunitStatus'] ),
                                          str( info['Timeup'] ),
                                          str( info['PID'] ) ] )
            printTable( fields, records )
    elif option == 'database' or option == 'databases':
        client = SystemAdministratorClient( self.host, self.port )
        if not InstallTools.mysqlPassword:
            InstallTools.mysqlPassword = "******"
        InstallTools.getMySQLPasswords()
        result = client.getDatabases( InstallTools.mysqlRootPwd )
        if not result['OK']:
            self.__errMsg( result['Message'] )
            return
        resultSW = client.getAvailableDatabases()
        if not resultSW['OK']:
            self.__errMsg( resultSW['Message'] )
            return

        sw = resultSW['Value']
        installed = result['Value']
        gLogger.notice( '' )
        for db in sw:
            if db in installed:
                gLogger.notice( db.rjust( 25 ), ': Installed' )
            else:
                gLogger.notice( db.rjust( 25 ), ': Not installed' )
        if not sw:
            gLogger.notice( "No database found" )
    elif option == 'mysql':
        client = SystemAdministratorClient( self.host, self.port )
        result = client.getMySQLStatus()
        if not result['OK']:
            self.__errMsg( result['Message'] )
        elif result['Value']:
            gLogger.notice( '' )
            for par, value in result['Value'].items():
                # format the line, passing a tuple would log its repr
                gLogger.notice( "%s : %s" % ( par.rjust( 28 ), value ) )
        else:
            gLogger.notice( "No MySQL database found" )
    elif option == "log":
        self.getLog( argss )
    elif option == "info":
        client = SystemAdministratorClient( self.host, self.port )
        result = client.getInfo()
        if not result['OK']:
            self.__errMsg( result['Message'] )
        else:
            info = result['Value']
            gLogger.notice( '' )
            gLogger.notice( "Setup:", info['Setup'] )
            gLogger.notice( "DIRAC version:", info['DIRAC'] )
            if info['Extensions']:
                for e, v in info['Extensions'].items():
                    gLogger.notice( "%s version" % e, v )
            gLogger.notice( '' )
    elif option == "host":
        client = SystemAdministratorClient( self.host, self.port )
        result = client.getHostInfo()
        if not result['OK']:
            self.__errMsg( result['Message'] )
        else:
            gLogger.notice( '' )
            gLogger.notice( "Host info:" )
            gLogger.notice( '' )

            fields = ['Parameter', 'Value']
            records = [ [ key, str( value ) ] for key, value in result['Value'].items() ]
            printTable( fields, records )
    elif option == "hosts":
        client = ComponentMonitoringClient()
        result = client.getHosts( {}, False, False )
        if not result[ 'OK' ]:
            self.__errMsg( 'Error retrieving the list of hosts: %s' % ( result[ 'Message' ] ) )
        else:
            hostList = result[ 'Value' ]
            gLogger.notice( '' )
            gLogger.notice( ' ' + 'Host'.center( 32 ) + ' ' + 'CPU'.center( 34 ) + ' ' )
            gLogger.notice( ( '-' * 69 ) )
            for element in hostList:
                gLogger.notice( '|' + element[ 'HostName' ].center( 32 ) + '|' + element[ 'CPU' ].center( 34 ) + '|' )
                gLogger.notice( ( '-' * 69 ) )
            gLogger.notice( '' )
    elif option == "errors":
        self.getErrors( argss )
    elif option == "installations":
        self.getInstallations( argss )
    else:
        gLogger.notice( "Unknown option:", option )
def do_show(self, args):
    """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show project       - show project to install or upgrade
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show doc <type> <system> <name>
                             - show documentation for a given service or agent
          show host          - show host related parameters
          show hosts         - show all available hosts
          show ports [host]  - show all ports used by a host. If no host is given, the host currently connected to is used
          show installations [ list | current | -n <Name> | -h <Host> | -s <System> | -m <Module> | -t <Type> | -itb <InstallationTime before>
                              | -ita <InstallationTime after> | -utb <UnInstallationTime before> | -uta <UnInstallationTime after> ]*
                             - show all the installations of components that match the given parameters
          show profile <system> <component> [ -s <size> | -h <host> | -id <initial date DD/MM/YYYY> | -it <initial time hh:mm>
                              | -ed <end date DD/MM/YYYY> | -et <end time hh:mm> ]*
                             - show <size> log lines of profiling information for a component in the machine <host>
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # NOTE: the docstring above is user-facing: it is emitted verbatim as the
    # usage message, so engineering notes live in comments instead.
    #
    # args -- the raw text typed after "show"; its first word selects the
    #         sub-command, the remaining words are that sub-command's arguments.
    # Returns None in every case. Failures reported by the remote clients are
    # passed to self._errMsg(); nothing is raised for them.
    argss = args.split()
    if not argss:
        gLogger.notice(self.do_show.__doc__)
        return

    option = argss[0]
    argss = argss[1:]

    # Sub-commands that only differ in the client method whose result is dumped.
    componentQueries = {
        'software': 'getSoftwareComponents',
        'installed': 'getInstalledComponents',
        'setup': 'getSetupComponents',
    }

    if option in componentQueries:
        client = SystemAdministratorClient(self.host, self.port)
        result = getattr(client, componentQueries[option])()
        if not result['OK']:
            self._errMsg(result['Message'])
        else:
            gLogger.notice('')
            pprint.pprint(result['Value'])

    elif option == 'project':
        result = SystemAdministratorClient(self.host, self.port).getProject()
        if not result['OK']:
            self._errMsg(result['Message'])
        else:
            gLogger.notice("Current project is %s" % result['Value'])

    elif option == 'status':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getOverallStatus()
        if not result['OK']:
            self._errMsg(result['Message'])
        else:
            fields = ["System", 'Name', 'Module', 'Type', 'Setup', 'Installed', 'Runit', 'Uptime', 'PID']
            records = []
            rDict = result['Value']
            for compType in rDict:
                for system in rDict[compType]:
                    for component in sorted(rDict[compType][system]):
                        info = rDict[compType][system][component]
                        # Only installed components get a row, so the
                        # 'Installed' column is constant.
                        # NOTE(review): assumes the original nested the whole
                        # row construction under the Installed test -- confirm.
                        if not info['Installed']:
                            continue
                        records.append([
                            system,
                            component,
                            str(info['Module']),
                            compType.lower()[:-1],  # drop the trailing character of the type key
                            'Setup' if info['Setup'] else 'NotSetup',
                            'Installed',
                            str(info['RunitStatus']),
                            str(info['Timeup']),
                            str(info['PID']),
                        ])
            printTable(fields, records)

    elif option in ('database', 'databases'):
        client = SystemAdministratorClient(self.host, self.port)
        if not gComponentInstaller.mysqlPassword:
            gComponentInstaller.mysqlPassword = "******"
        gComponentInstaller.getMySQLPasswords()
        result = client.getDatabases(gComponentInstaller.mysqlRootPwd)
        if not result['OK']:
            self._errMsg(result['Message'])
            return
        resultSW = client.getAvailableDatabases()
        if not resultSW['OK']:
            self._errMsg(resultSW['Message'])
            return

        sw = resultSW['Value']
        installed = result['Value']
        gLogger.notice('')
        for db in sw:
            if db in installed:
                gLogger.notice(db.rjust(25), ': Installed')
            else:
                gLogger.notice(db.rjust(25), ': Not installed')
        if not sw:
            gLogger.notice("No database found")

    elif option == 'mysql':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getMySQLStatus()
        if not result['OK']:
            self._errMsg(result['Message'])
        elif result['Value']:
            gLogger.notice('')
            for par, value in result['Value'].items():
                # Build one message string; the original handed a tuple to the
                # logger, which is not a formatted "name : value" line.
                gLogger.notice('%s : %s' % (par.rjust(28), value))
        else:
            gLogger.notice("No MySQL database found")

    elif option == "log":
        self.getLog(argss)

    elif option == "info":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getInfo()
        if not result['OK']:
            self._errMsg(result['Message'])
        else:
            gLogger.notice('')
            gLogger.notice("Setup:", result['Value']['Setup'])
            gLogger.notice("DIRAC version:", result['Value']['DIRAC'])
            if result['Value']['Extensions']:
                for e, v in result['Value']['Extensions'].items():
                    gLogger.notice("%s version" % e, v)
            gLogger.notice('')

    elif option == "host":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getHostInfo()
        if not result['OK']:
            self._errMsg(result['Message'])
        else:
            gLogger.notice('')
            gLogger.notice("Host info:")
            gLogger.notice('')

            fields = ['Parameter', 'Value']
            records = []
            for name, value in result['Value'].items():
                if name == 'Extension':
                    # The value is split on ',' and each item on ':' to get one
                    # "<name>Version" row per extension.
                    for extension in value.split(','):
                        if not extension:
                            continue  # empty value or stray comma: nothing to show
                        extensionName, _sep, extensionVersion = extension.partition(':')
                        records.append(['%sVersion' % extensionName, str(extensionVersion)])
                else:
                    records.append([name, str(value)])
            printTable(fields, records)

    elif option == "hosts":
        client = ComponentMonitoringClient()
        result = client.getHosts({}, False, False)
        if not result['OK']:
            self._errMsg('Error retrieving the list of hosts: %s' % (result['Message']))
        else:
            hostList = result['Value']
            separator = '-' * 69
            gLogger.notice('')
            gLogger.notice(' ' + 'Host'.center(32) + ' ' + 'CPU'.center(34) + ' ')
            gLogger.notice(separator)
            for element in hostList:
                gLogger.notice('|' + element['HostName'].center(32) + '|' + element['CPU'].center(34) + '|')
            gLogger.notice(separator)
            gLogger.notice('')

    elif option == "ports":
        if not argss:
            client = SystemAdministratorClient(self.host)
        else:
            hostname = argss[0]
            # Check that the requested host is known before contacting it.
            result = ComponentMonitoringClient().hostExists({'HostName': hostname})
            if not result['OK']:
                self._errMsg(result['Message'])
                return
            if not result['Value']:
                self._errMsg('Given host does not exist')
                return
            client = SystemAdministratorClient(hostname)

        result = client.getUsedPorts()
        if not result['OK']:
            self._errMsg(result['Message'])
            return
        pprint.pprint(result['Value'])

    elif option == "errors":
        self.getErrors(argss)

    elif option == "installations":
        self.getInstallations(argss)

    elif option == "doc":
        if len(argss) > 2 and argss[0] in ('service', 'agent'):
            compType, compSystem, compModule = argss[0], argss[1], argss[2]
            client = SystemAdministratorClient(self.host, self.port)
            result = client.getComponentDocumentation(compType, compSystem, compModule)
            if result['OK']:
                gLogger.notice(result['Value'])
            else:
                self._errMsg(result['Message'])
        else:
            gLogger.notice(self.do_show.__doc__)

    elif option == "profile":
        if len(argss) > 1:
            component = '%s_%s' % (argss[0], argss[1])
            flags = argss[2:]

            argDict = {'-s': None, '-h': self.host, '-id': None, '-it': '00:00', '-ed': None, '-et': '00:00'}
            # Flags arrive as "<key> <value>" pairs; a trailing key without a
            # value is ignored.
            argDict.update(zip(flags[0::2], flags[1::2]))

            size = None
            if argDict['-s']:
                try:
                    size = int(argDict['-s'])
                except ValueError:
                    self._errMsg("Argument 'size' must be an integer")
                    return

            host = argDict['-h']
            initialDate = '%s %s' % (argDict['-id'], argDict['-it']) if argDict['-id'] else ''
            endingDate = '%s %s' % (argDict['-ed'], argDict['-et']) if argDict['-ed'] else ''

            client = MonitoringClient()
            if size:
                result = client.getLimitedData(host, component, size)
            else:
                result = client.getDataForAGivenPeriod(host, component, initialDate, endingDate)

            if not result['OK']:
                self._errMsg(result['Message'])
                return

            # NOTE(review): assumes the value is a sequence of dicts sharing
            # the same keys -- confirm against MonitoringClient.
            rows = result['Value']
            if not rows:
                # Indexing the first row of an empty result would raise.
                gLogger.notice('No profiling data found')
                return

            # One column per key of the first row; every row is rendered in
            # that same key order on its own fresh line.
            headers = list(rows[0].keys())
            gLogger.notice(''.join(str(header).ljust(15) for header in headers))
            for row in rows:
                gLogger.notice(''.join(str(row.get(header, '')).ljust(15) for header in headers))
        else:
            gLogger.notice(self.do_show.__doc__)

    else:
        gLogger.notice("Unknown option:", option)
def do_show(self, args):
    """
        Show list of components with various related information

        usage:

          show software      - show components for which software is available
          show installed     - show components installed in the host with runit system
          show setup         - show components set up for automatic running in the host
          show status        - show status of the installed components
          show database      - show status of the databases
          show mysql         - show status of the MySQL server
          show log  <system> <service|agent> [nlines]
                             - show last <nlines> lines in the component log file
          show info          - show version of software and setup
          show errors [*|<system> <service|agent>]
                             - show error count for the given component or all the components
                               in the last hour and day
    """
    # NOTE: the docstring above is user-facing: it is printed verbatim as the
    # usage message, so engineering notes live in comments instead.
    #
    # args -- the raw text typed after "show"; its first word selects the
    #         sub-command, the remaining words are that sub-command's arguments.
    # Returns None in every case; client failures are printed, not raised.
    #
    # Every print below is a single-argument call, which produces the same
    # output whether print is the Python 2 statement or the Python 3 function.
    argss = args.split()
    if not argss:
        print(self.do_show.__doc__)
        return

    option = argss[0]
    argss = argss[1:]

    # Sub-commands that only differ in the client method whose result is dumped.
    componentQueries = {
        'software': 'getSoftwareComponents',
        'installed': 'getInstalledComponents',
        'setup': 'getSetupComponents',
    }

    if option in componentQueries:
        client = SystemAdministratorClient(self.host, self.port)
        result = getattr(client, componentQueries[option])()
        if not result['OK']:
            print(" ERROR: %s" % (result['Message'],))
        else:
            print("")
            pprint.pprint(result['Value'])

    elif option == 'status':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getOverallStatus()
        if not result['OK']:
            print("ERROR: %s" % (result['Message'],))
        else:
            rDict = result['Value']
            print("")
            print(" ".join([" System", ' ' * 20, 'Name', ' ' * 15, 'Type', ' ' * 13,
                            'Setup Installed Runit Uptime PID']))
            print('-' * 116)
            for compType in rDict:
                for system in rDict[compType]:
                    for component in sorted(rDict[compType][system]):
                        info = rDict[compType][system][component]
                        # Only installed components get a line, so the
                        # 'Installed' column is constant.
                        # NOTE(review): assumes the original nested the whole
                        # line under the Installed test -- confirm.
                        if not info['Installed']:
                            continue
                        print(" ".join([
                            system.ljust(28),
                            component.ljust(28),
                            compType.lower()[:-1].ljust(7),  # drop the trailing character of the type key
                            ('SetUp' if info['Setup'] else 'NotSetup').rjust(12),
                            'Installed'.rjust(12),
                            str(info['RunitStatus']).ljust(7),
                            str(info['Timeup']).rjust(7),
                            str(info['PID']).rjust(8),
                        ]))

    elif option in ('database', 'databases'):
        client = SystemAdministratorClient(self.host, self.port)
        if not InstallTools.mysqlPassword:
            InstallTools.mysqlPassword = "******"
        InstallTools.getMySQLPasswords()
        result = client.getDatabases(InstallTools.mysqlRootPwd)
        if not result['OK']:
            print("ERROR: %s" % (result['Message'],))
            return
        resultSW = client.getAvailableDatabases()
        if not resultSW['OK']:
            print("ERROR: %s" % (resultSW['Message'],))
            return

        sw = resultSW['Value']
        installed = result['Value']
        print("")
        for db in sw:
            if db in installed:
                print(db.rjust(25) + ' : Installed')
            else:
                print(db.rjust(25) + ' : Not installed')
        if not sw:
            print("No database found")

    elif option == 'mysql':
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getMySQLStatus()
        if not result['OK']:
            print("ERROR: %s" % (result['Message'],))
        elif result['Value']:
            print("")
            for par, value in result['Value'].items():
                print("%s : %s" % (par.rjust(28), value))
        else:
            print("No MySQL database found")

    elif option == "log":
        self.getLog(argss)

    elif option == "info":
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getInfo()
        if not result['OK']:
            print("ERROR: %s" % (result['Message'],))
        else:
            print("")
            print("Setup: %s" % (result['Value']['Setup'],))
            print("DIRAC version: %s" % (result['Value']['DIRAC'],))
            if result['Value']['Extensions']:
                for e, v in result['Value']['Extensions'].items():
                    print("%s version %s" % (e, v))
            print("")

    elif option == "errors":
        self.getErrors(argss)

    else:
        print("Unknown option: %s" % (option,))
class LemonAgent(AgentModule):
    """Agent that logs one "LEMON ..." status line per set-up service and agent.

    Each cycle asks the SystemAdministrator on localhost for the overall
    component status and writes, through gLogger.info, a line carrying the
    component's criticality, an OK/FAILURE status and the component name.
    """

    def initialize(self):
        """Set the constants used in log lines and create the admin client.

        Returns S_OK().
        """
        self.NON_CRITICAL = "NonCritical"
        self.CRITICAL = "Critical"
        self.FAILURE = "FAILURE"
        self.OK = "OK"

        self.setup = gConfig.getValue('/DIRAC/Setup', 'LHCb-Development')

        # When False, components whose criticality is NON_CRITICAL are not logged.
        self.outputNonCritical = True
        # all components not present here will be treated as non critical

        self.admClient = SystemAdministratorClient('localhost')

        return S_OK()

    def execute(self):
        """Main execution method.

        Does nothing (beyond logging why) when the current setup is not listed
        in Lemon/MonitoredSetups or when this host is listed in
        Lemon/HostsInMaintenance. Always returns S_OK(), including when the
        status query fails.
        """
        monitoredSetups = Operations().getValue('Lemon/MonitoredSetups', ['LHCb-Production'])
        self.monitoringEnabled = self.setup in monitoredSetups

        if not self.monitoringEnabled:
            self._log("Framework/LemonAgent", self.NON_CRITICAL, self.OK,
                      "Monitoring not enabled for this setup: " + self.setup + ". Exiting.")
            return S_OK()

        hostsInMaintenance = Operations().getValue('Lemon/HostsInMaintenance', [])
        if gethostname() in hostsInMaintenance:
            self._log("Framework/LemonAgent", self.NON_CRITICAL, self.OK,
                      "I am in maintenance mode, exiting.")
            return S_OK()

        result = self.admClient.getOverallStatus()

        if not result or not result['OK']:
            self._log("Framework/LemonAgent", self.CRITICAL, self.FAILURE, "Can not obtain result!!")
            return S_OK()

        services = result['Value']['Services']
        agents = result['Value']['Agents']
        self._processResults(services)
        self._processResults(agents)

        return S_OK()

    def _processResults(self, results):
        """Log the state of every set-up component found in *results*.

        results -- mapping system -> component name -> status dict; the keys
                   'Setup' and 'RunitStatus' of each status dict are read.
        """
        for system in results:
            for part in results[system]:
                component = results[system][part]

                # we want to monitor only set up services and agents
                if not component['Setup']:
                    continue

                componentName = system + "/" + part
                # Look the criticality up once and reuse it for the log line.
                critLevel = self._getCriticality(componentName)
                if critLevel == self.NON_CRITICAL and not self.outputNonCritical:
                    continue

                if component['RunitStatus'] == 'Run':
                    self._log(componentName, critLevel, self.OK, "Service/Agent running fine")
                else:
                    self._log(componentName, critLevel, self.FAILURE, "Service/Agent failure!")

    def _getCriticality(self, component):
        """Return the criticality configured under Lemon/Criticalities/<component>.

        Falls back to self.NON_CRITICAL when the option is not set.
        """
        # lets try to retrieve common criticality first
        return Operations().getValue('Lemon/Criticalities/' + component, self.NON_CRITICAL)

    def _log(self, component, criticality, status, string):
        """Write one "LEMON <criticality> <status> <component>: <string>" info line."""
        gLogger.info("LEMON " + criticality + " " + status + " " + component + ": " + string + "\n")