def showHostErrors(self):
    """
    Collect the log error summary of every component of one host.

    The host name is taken from the ``host`` request parameter.

    :return: dict with ``success`` set to "true" plus ``result`` (one record
             per component) and ``total``; or ``success`` set to "false"
             plus an ``error`` text
    """
    userDN = getUserDN()
    userGroup = getSelectedGroup()
    if "host" not in request.params:
        return {"success": "false",
                "error": "Name of the host is missing or not defined"}

    host = str(request.params["host"])
    client = SystemAdministratorClient(host, None,
                                       delegatedDN=userDN,
                                       delegatedGroup=userGroup)
    answer = client.checkComponentLog("*")
    gLogger.debug(answer)
    if not answer["OK"]:
        return {"success": "false", "error": answer["Message"]}

    records = []
    # Keys of the answer have the form "<system>/<component>"
    for componentPath, record in answer["Value"].items():
        system, component = componentPath.split("/")
        record["System"] = system
        record["Name"] = component
        record["Host"] = host
        records.append(record)

    return {"success": "true", "result": records, "total": len(records)}
def __init__(self, *args, **kwargs):
    """Initialize the agent, its clients and the default values."""
    AgentModule.__init__(self, *args, **kwargs)

    # Identity of the agent
    self.name = "ComponentSupervisionAgent"
    self.setup = "DIRAC-Production"

    # All switches start out disabled
    self.enabled = False
    self.restartAgents = False
    self.restartExecutors = False
    self.restartServices = False
    self.controlComponents = False
    self.commitURLs = False

    self.doNotRestartInstancePattern = ["RequestExecutingAgent"]
    self.diracLocation = rootPath

    # Clients; the administrator client targets the local host name
    self.sysAdminClient = SystemAdministratorClient(socket.getfqdn())
    self.jobMonClient = JobMonitoringClient()
    self.nClient = NotificationClient()
    self.csAPI = None

    # Bookkeeping containers, all empty at start
    self.agents = {}
    self.executors = {}
    self.services = {}
    self._tornadoPort = "8443"
    self.errors = []
    self.accounting = defaultdict(dict)

    # E-mail settings
    self.addressTo = []
    self.addressFrom = ""
    self.emailSubject = "ComponentSupervisionAgent on %s" % socket.getfqdn()
def submit(self):
    """
    Return the flattened list of components (services, agents) installed on
    the host named by the ``Hostname`` entry of the request.

    :return: dict with ``success`` "true" and ``result`` (list of records,
             each tagged with its ``Host``), or ``success`` "false" and an
             ``error`` text
    """
    checkUserCredentials()
    userDN = getUserDN()
    userGroup = getSelectedGroup()

    params = self.request()
    if 'Hostname' not in params:
        return {"success": "false", "error": "Name of the host is absent"}

    host = params['Hostname']
    client = SystemAdministratorClient(host, None,
                                       delegatedDN=userDN,
                                       delegatedGroup=userGroup)
    status = client.getOverallStatus()
    gLogger.debug("Result of getOverallStatus(): %s" % status)
    if not status["OK"]:
        return {"success": "false", "error": status["Message"]}

    records = []
    for record in self.flatten(status["Value"]):
        record["Host"] = host
        records.append(record)
    return {"success": "true", "result": records}
def restartHost(hostName):
    """
    Restart all systems and components of a host

    The host is pinged first; if it does not answer, the failed ping result
    is returned unchanged. The restart request is then sent for every system
    and component ('*', '*').

    :param str hostName: name of the host you want to restart
    :return: S_OK carrying the "Peer closed connection" message when the
             connection was dropped by the restart, the successful result if
             the call unexpectedly succeeds, otherwise the error result
    """
    host, port = parseHostname(hostName)

    gLogger.notice("Pinging %s ..." % host)
    client = SystemAdministratorClient(host, port)
    result = client.ping()
    if not result['OK']:
        gLogger.error("Could not connect to %s: %s" % (host, result['Message']))
        return result
    gLogger.notice("Host %s is active" % host)

    gLogger.notice("Initiating restart of all systems and components")
    # This restart call is expected to return S_ERROR because the
    # SystemAdministrator itself is restarted and the connection is lost.
    result = client.restartComponent('*', '*')
    if result['OK']:
        # Not the expected outcome, but a successful answer must not be
        # inspected for an error message (it may not carry one at all).
        gLogger.notice("Restarted all systems on %s" % host)
        return result

    message = result.get('Message', '')
    if message == "Peer closed connection":
        gLogger.notice(
            "Restarted all systems on %s : connection to SystemAdministrator lost" % host)
        return S_OK(message)

    gLogger.error("Received unxpected message: %s" % message)
    return result
def __executeClient(self, host, method, *parms, **kwargs):
    """
    Execute RPC method on a given host

    The address actually contacted is the "Host" option of the host in the
    Registry, with the given host name as the fallback value. The answer is
    tagged with the given ``host`` under the "Host" key before being returned.
    """
    address = Registry.getHostOption(host, "Host", host)
    remoteCall = getattr(SystemAdministratorClient(address, **self.__kwargs), method)
    answer = remoteCall(*parms, **kwargs)
    answer["Host"] = host
    return answer
def manageService(self, service, action):
    """ Manage services running on this machine

        usage:

          service <action> <serviceName>

        Supported actions are start, stop, restart and status. Any other
        action is reported as an error instead of silently printing the
        installation lookup result. Errors go through ``self._errMsg``; the
        outcome of the action is logged with ``gLogger.notice``.
    """
    # Map each supported action onto the remote method that implements it
    remoteMethods = {
        'start': 'startService',
        'stop': 'stopService',
        'restart': 'restartService',
        'status': 'statusService',
    }
    if action not in remoteMethods:
        self._errMsg('Unknown action %s, expected one of: %s'
                     % (action, ', '.join(sorted(remoteMethods))))
        return

    # The service must be registered as an installed External component
    # on this host
    client = ComponentMonitoringClient()
    result = client.getInstallations({'UninstallationTime': None},
                                     {'System': 'External',
                                      'Module': service,
                                      'Type': 'External'},
                                     {'HostName': self.host},
                                     False)
    if not result['OK']:
        self._errMsg(result['Message'])
        return
    if len(result['Value']) < 1:
        self._errMsg('%s is not installed' % (service))
        return

    client = SystemAdministratorClient(self.host, self.port)
    result = getattr(client, remoteMethods[action])(service)
    if not result['OK']:
        self._errMsg(result['Message'])
        return

    gLogger.notice(result['Value'])
def __executeClient(self, host, method, *parms, **kwargs):
    """
    Execute RPC method on a given host

    The answer of the call is tagged with the host name under the 'Host'
    key before being handed back.
    """
    rpcClient = SystemAdministratorClient(host, **self.__kwargs)
    rpcMethod = getattr(rpcClient, method)
    outcome = rpcMethod(*parms, **kwargs)
    outcome['Host'] = host
    return outcome
def do_add(self, args):
    """ Add new entity to the Configuration Service

        usage:

          add system <system> <instance>
    """
    # Both "system" and "instance" are accepted as the entity keyword.
    # Missing arguments print the usage text instead of raising IndexError.
    argss = args.split()
    if not argss:
        print(self.do_add.__doc__)
        return

    option = argss[0]
    del argss[0]
    if option == "instance" or option == "system":
        if len(argss) < 2:
            print(self.do_add.__doc__)
            return
        system = argss[0]
        instance = argss[1]

        client = SystemAdministratorClient(self.host, self.port)
        result = client.getInfo()
        if not result['OK']:
            self.__errMsg(result['Message'])
            # Without the host info there is no setup to work with
            return
        hostSetup = result['Value']['Setup']

        instanceName = gConfig.getValue('/DIRAC/Setups/%s/%s' % (hostSetup, system), '')
        if instanceName:
            if instanceName == instance:
                print("System %s already has instance %s defined in %s Setup"
                      % (system, instance, hostSetup))
            else:
                # Report the instance that is actually defined, not the
                # one that was requested
                self.__errMsg("System %s already has instance %s defined in %s Setup"
                              % (system, instanceName, hostSetup))
            return

        result = InstallTools.addSystemInstance(system, instance, hostSetup)
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            print("%s system instance %s added successfully" % (system, instance))
    else:
        print("Unknown option: %s" % (option,))
def do_stop(self, args):
    """ Stop services or agents or database server

        usage:

          stop <system|*> <service|agent|*>
          stop mysql
    """
    # Missing arguments print the usage text instead of raising IndexError.
    argss = args.split()
    if not argss:
        print(self.do_stop.__doc__)
        return

    if argss[0] == 'mysql':
        print("Not yet implemented")
        return

    system = argss[0]
    if system != '*':
        if len(argss) < 2:
            print(self.do_stop.__doc__)
            return
        component = argss[1]
    else:
        # A wildcard system implies every component
        component = '*'

    client = SystemAdministratorClient(self.host, self.port)
    result = client.stopComponent(system, component)
    if not result['OK']:
        self.__errMsg(result['Message'])
        return

    if system != '*' and component != '*':
        print("\n%s_%s stopped successfully, runit status:\n" % (system, component))
    else:
        print("\nComponents stopped successfully, runit status:\n")
    for comp in result['Value']:
        print("%s : %s" % (comp.rjust(32), result['Value'][comp]['RunitStatus']))
def __init__(self, **kwargs):
    """ Constructor

        The host list comes from the ``hosts`` keyword when given, otherwise
        from ``Registry.getHosts()`` (empty list if that lookup fails). Hosts
        listed under the ``exclude`` keyword are removed, and only hosts
        answering a SystemAdministrator ping are kept. The remaining keyword
        arguments are stored for later use.
    """
    if 'hosts' in kwargs:
        self.__hosts = kwargs.pop('hosts')
    else:
        lookup = Registry.getHosts()
        self.__hosts = lookup['Value'] if lookup['OK'] else []

    # Excluded hosts
    if 'exclude' in kwargs:
        self.__hosts = list(set(self.__hosts) - set(kwargs['exclude']))

    # Ping the hosts to remove those that don't have a SystemAdministrator service
    self.__hosts = [host for host in self.__hosts
                    if SystemAdministratorClient(host).ping()['OK']]

    self.__kwargs = dict(kwargs)
    self.__pool = ThreadPool(len(self.__hosts))
    self.__resultDict = {}
def showLog(self):
    """
    Return the log tail of one component as HTML text.

    Reads the ``host``, ``system`` and ``component`` request parameters.
    On success the log is returned with newlines turned into ``<br>``; on
    any problem a plain error text is returned instead.
    """
    userDN = getUserDN()
    userGroup = getSelectedGroup()

    if "host" not in request.params:
        return "Name of the host is missing or not defined"
    host = str(request.params["host"])

    if "system" not in request.params:
        return "Name of the system is missing or not defined"
    system = str(request.params["system"])

    if "component" not in request.params:
        return "Name of component is missing or not defined"
    name = str(request.params["component"])

    client = SystemAdministratorClient(host, None,
                                       delegatedDN=userDN,
                                       delegatedGroup=userGroup)
    answer = client.getLogTail(system, name)
    gLogger.debug(answer)
    if not answer["OK"]:
        return answer["Message"]

    # The log is stored under "<system>_<component>"
    logs = answer["Value"]
    key = system + "_" + name
    if key not in logs:
        return "%s key is absent in service response" % key
    return logs[key].replace("\n", "<br>")
def web_getHostLog(self):
    """
    Send back the log tail of one component, newlines replaced by ``<br>``.

    Reads the ``host``, ``system`` and ``component`` request arguments and
    answers through ``self.finish`` with a ``success``/``result`` or
    ``success``/``error`` dictionary. The remote call is run through
    ``self.threadTask`` and awaited with ``yield``.
    """
    userData = self.getSessionData()
    userDN = str(userData["user"]["DN"])
    userGroup = str(userData["user"]["group"])

    # Required arguments, checked in this order, with their error texts
    required = (
        ("host", "Name of the host is missing or not defined"),
        ("system", "Name of the system is missing or not defined"),
        ("component", "Name of component is missing or not defined"),
    )
    values = []
    for argName, errorText in required:
        if argName not in self.request.arguments:
            self.finish({"success": "false", "error": errorText})
            return
        values.append(str(self.request.arguments[argName][0]))
    host, system, name = values

    client = SystemAdministratorClient(host, None,
                                       delegatedDN=userDN,
                                       delegatedGroup=userGroup)
    answer = yield self.threadTask(client.getLogTail, system, name)
    gLogger.debug(answer)
    if not answer["OK"]:
        self.finish({"success": "false", "error": answer["Message"]})
        return

    # The log is stored under "<system>_<component>"
    logs = answer["Value"]
    key = system + "_" + name
    if key not in logs:
        self.finish({"success": "false",
                     "error": "%s key is absent in service response" % key})
        return

    self.finish({"success": "true", "result": logs[key].replace("\n", "<br>")})
def do_revert(self, args):
    """ Revert the last installed version of software to the previous one

        usage:

          revert
    """
    # ``args`` is not used by this command
    outcome = SystemAdministratorClient(self.host, self.port).revertSoftware()
    if outcome['OK']:
        print("Software reverted to %s" % (outcome['Value'],))
        return
    print("Error: %s" % (outcome['Message'],))
def initialize(self):
    """
    Set the status labels and default values, and create the administrator
    client bound to 'localhost'.

    :return: S_OK()
    """
    # Labels
    self.NON_CRITICAL, self.CRITICAL = "NonCritical", "Critical"
    self.FAILURE, self.OK = "FAILURE", "OK"

    self.setup = gConfig.getValue('/DIRAC/Setup', 'LHCb-Development')

    # all components not present here will be treated as non critical
    self.outputNonCritical = True

    self.admClient = SystemAdministratorClient('localhost')
    return S_OK()
def __actionHost(self):
    """
    Apply ``self.action`` ("restart" or "revert") to every host listed in the
    comma separated ``hostname`` request parameter.

    "restart" restarts all DIRAC components on the host, "revert" reverts
    the software. Outcomes are collected in ``self.actionSuccess`` and
    ``self.actionFailed``; the final answer is built by ``self.__aftermath()``.

    :return: error dictionary when no hostname is given, otherwise the value
             of ``self.__aftermath()``
    """
    if "hostname" not in request.params:
        return {"success": "false", "error": "No hostname given"}
    hosts = request.params["hostname"].split(",")

    DN = getUserDN()
    group = getSelectedGroup()
    self.actionSuccess = []
    self.actionFailed = []

    for host in hosts:
        client = SystemAdministratorClient(str(host), None,
                                           delegatedDN=DN,
                                           delegatedGroup=group)
        # Strings are compared by value: identity ("is") only works by
        # accident of interning.
        if self.action == "restart":
            result = client.restartComponent("*", "*")
        elif self.action == "revert":
            result = client.revertSoftware()
        else:
            self.actionFailed.append(host + ": Action %s is not defined" % self.action)
            continue

        gLogger.always(result)
        if result["OK"]:
            gLogger.info(result["Value"])
            self.actionSuccess.append(host)
            continue

        # The connection is lost while the components are being restarted;
        # the marker may appear anywhere in the message, including at its
        # very start.
        if "Unexpected EOF" in result["Message"]:
            msg = "Signal 'Unexpected EOF' received. Most likely DIRAC components"
            msg = host + ": " + msg + " were successfully restarted."
            self.actionSuccess.append(msg)
            continue

        error = host + ": " + result["Message"]
        self.actionFailed.append(error)
        gLogger.error(error)

    self.prefix = "Host"
    return self.__aftermath()
def do_exec(self, args):
    """ Execute a shell command on the remote host and get back the output

        usage:

          exec <cmd> [<arguments>]
    """
    client = SystemAdministratorClient(self.host, self.port)
    result = client.executeCommand(args)
    if not result['OK']:
        self.__errMsg(result['Message'])
        # A failed call has no value to unpack
        return

    status, output, error = result['Value']
    print("")
    for line in output.split('\n'):
        print(line)
    if error:
        # The status of the command is reported before its error output
        self.__errMsg(status)
        for line in error.split('\n'):
            print(line)
def do_update(self, args):
    """ Update the software on the target host to a given version

        usage:

          update <version>
    """
    # A missing version prints the usage text instead of raising IndexError.
    argss = args.split()
    if not argss:
        print(self.do_update.__doc__)
        return
    version = argss[0]

    client = SystemAdministratorClient(self.host, self.port)
    print("Software update can take a while, please wait ...")
    result = client.updateSoftware(version)
    if not result['OK']:
        self.__errMsg("Failed to update the software")
        print(result['Message'])
    else:
        print("Software successfully updated.")
        print("You should restart the services to use the new software version.")
        print("Think of updating /Operation/<vo>/<setup>/Versions section in the CS")
def do_cd(self, args):
    """ Change the current working directory on the target host

        Usage:

          cd <dirpath>
    """
    words = args.split()

    if not words:
        # No argument: go to the home directory, asking the remote host
        # for $HOME the first time it is needed
        if not self.homeDir:
            probe = SystemAdministratorClient(self.host, self.port).executeCommand('echo $HOME')
            if not probe['OK']:
                self.__errMsg(probe['Message'])
                return
            status, output, _error = probe['Value']
            if not status and output:
                self.homeDir = output.strip()
                self.previous_cwd = self.cwd
                self.cwd = self.homeDir
        else:
            self.previous_cwd = self.cwd
            self.cwd = self.homeDir
        self.prompt = '[%s:%s]> ' % (self.host, self.cwd)
        return

    target = words[0]
    if target == '-':
        # Swap current and previous directories, if there is a previous one
        if self.previous_cwd:
            self.cwd, self.previous_cwd = self.previous_cwd, self.cwd
    elif target.startswith('/'):
        self.previous_cwd = self.cwd
        self.cwd = target
    else:
        # Relative path: resolve against the current directory
        self.previous_cwd = self.cwd
        self.cwd = os.path.normpath(self.cwd + '/' + target)
    self.prompt = '[%s:%s]> ' % (self.host, self.cwd)
def getErrors(self, argss):
    """ Get and print out errors from the logs of specified components

        :param list argss: empty, or ``[system]`` / ``[system, component]``;
                           no arguments or a ``*`` system select every
                           component, a system without a component prints
                           the usage of the ``show`` command
    """
    if len(argss) < 1 or argss[0] == "*":
        component = '*'
    else:
        if len(argss) < 2:
            print("")
            print(self.do_show.__doc__)
            return
        component = '/'.join([argss[0], argss[1]])

    client = SystemAdministratorClient(self.host, self.port)
    result = client.checkComponentLog(component)
    if not result['OK']:
        self.__errMsg(result['Message'])
        return

    fields = ['System', 'Component', 'Last hour', 'Last day', 'Last error']
    records = []
    # Keys of the answer have the form "<system>/<component>"
    for cname, summary in result['Value'].items():
        system, compName = cname.split('/')
        # str.strip returns a new string, so the result has to be kept;
        # stripping first also makes the 80 character cut apply to the
        # text that is displayed
        lastError = summary['LastError'].strip()
        if len(lastError) > 80:
            lastError = lastError[:80] + '...'
        records.append([system, compName,
                        str(summary['ErrorsHour']),
                        str(summary['ErrorsDay']),
                        lastError])
    records.sort()
    printTable(fields, records)
def do_uninstall(self, args):
    """ Uninstall DIRAC component

        usage:

          uninstall <system> <component>
    """
    words = args.split()
    # Exactly two words are expected: the system and the component
    if len(words) != 2:
        print(self.do_uninstall.__doc__)
        return

    system, component = words
    outcome = SystemAdministratorClient(self.host, self.port).uninstallComponent(system, component)
    if outcome['OK']:
        print("Successfully uninstalled %s/%s" % (system, component))
    else:
        print("Error: %s" % (outcome['Message'],))
def do_restart(self, args):
    """ Restart services or agents or database server

        usage:

          restart <system|*> <service|agent|*>
          restart mysql
    """
    # The emptiness check is done on the split words, so whitespace-only
    # input shows the usage text instead of raising IndexError.
    argss = args.split()
    if not argss:
        gLogger.notice(self.do_restart.__doc__)
        return

    option = argss[0]
    del argss[0]
    if option == 'mysql':
        gLogger.notice("Not yet implemented")
        return

    system = option
    if system != '*':
        if len(argss) < 1:
            gLogger.notice(self.do_restart.__doc__)
            return
        component = argss[0]
    else:
        # A wildcard system implies every component
        component = '*'

    client = SystemAdministratorClient(self.host, self.port)
    result = client.restartComponent(system, component)
    if not result['OK']:
        if system == '*':
            # Restarting everything also restarts the SystemAdministrator,
            # so a failed answer is what this code expects here
            gLogger.notice("All systems are restarted, connection to SystemAdministrator is lost")
        else:
            self.__errMsg(result['Message'])
        return

    if system != '*' and component != '*':
        gLogger.notice("\n%s_%s started successfully, runit status:\n" % (system, component))
    else:
        gLogger.notice("\nComponents started successfully, runit status:\n")
    for comp in result['Value']:
        # One formatted line per component rather than the repr of a tuple
        gLogger.notice("%s : %s" % (comp.rjust(32), result['Value'][comp]['RunitStatus']))
def web_getHostData(self):
    """
    Returns flatten list of components (services, agents) installed on the
    host given by the ``hostname`` request argument.

    The answer is sent through ``self.finish``: ``success`` "true" with the
    ``result`` list (each record tagged with its ``Host``), or ``success``
    "false" with an ``error`` text. The remote call is run through
    ``self.threadTask`` and awaited with ``yield``.
    """
    # checkUserCredentials()
    userData = self.getSessionData()
    DN = str(userData["user"]["DN"])
    group = str(userData["user"]["group"])

    # Membership test with "in" rather than dict.has_key, which no longer
    # exists in Python 3
    arguments = self.request.arguments
    if not ("hostname" in arguments and arguments["hostname"][0]):
        self.finish({"success": "false", "error": "Name of the host is absent"})
        return
    host = arguments["hostname"][0]

    client = SystemAdministratorClient(host, None, delegatedDN=DN, delegatedGroup=group)
    result = yield self.threadTask(client.getOverallStatus)
    gLogger.debug("Result of getOverallStatus(): %s" % result)
    if not result["OK"]:
        self.finish({"success": "false", "error": result["Message"]})
        return

    callback = []
    for record in self.flatten(result["Value"]):
        record["Host"] = host
        callback.append(record)
    self.finish({"success": "true", "result": callback})
def do_update(self, args):
    """ Update the software on the target host to a given version

        usage:

          update <version> [ -r <rootPath> ] [ -g <lcgVersion> ]

              where rootPath - path to the DIRAC installation
                    lcgVersion - version of the LCG bindings to install
    """
    rootPath = ''
    lcgVersion = ''
    try:
        argss = args.split()
        version = argss[0]
        del argss[0]
        while argss:
            if argss[0] == '-r':
                rootPath = argss[1]
            elif argss[0] == '-g':
                lcgVersion = argss[1]
            else:
                # Without this branch an unrecognised argument is never
                # consumed and the loop never ends
                raise ValueError("unexpected argument %s" % argss[0])
            # Drop the flag and its value
            del argss[:2]
    except (AttributeError, IndexError, ValueError) as x:
        gLogger.notice("ERROR: wrong input:", str(x))
        gLogger.notice(self.do_update.__doc__)
        return

    client = SystemAdministratorClient(self.host, self.port)
    gLogger.notice("Software update can take a while, please wait ...")
    result = client.updateSoftware(version, rootPath, lcgVersion, timeout=300)
    if not result['OK']:
        self._errMsg("Failed to update the software")
        gLogger.notice(result['Message'])
    else:
        gLogger.notice("Software successfully updated.")
        gLogger.notice("You should restart the services to use the new software version.")
        gLogger.notice("Think of updating /Operations/<vo>/<setup>/Pilot/Versions section in the CS")
def do_exec(self, args):
    """ Execute a shell command on the remote host and get back the output

        usage:

          exec <cmd> [<arguments>]
    """
    # The command is run from the current working directory of the session
    shellCommand = ('cd %s;' % self.cwd) + args
    answer = SystemAdministratorClient(self.host, self.port).executeCommand(shellCommand)
    if not answer['OK']:
        self.__errMsg(answer['Message'])
        return

    status, output, error = answer['Value']
    gLogger.notice('')
    for outputLine in output.split('\n'):
        gLogger.notice(outputLine)

    if not error:
        return
    # The status of the command is reported before its error output
    self.__errMsg(status)
    for errorLine in error.split('\n'):
        gLogger.notice(errorLine)
def do_start(self, args):
    """ Start services or agents or database server

        usage:

          start <system|*> <service|agent|*>
          start mysql
    """
    # One word is enough for "start mysql" and "start *"; a component is
    # only required when a specific system is named.
    argss = args.split()
    if not argss:
        print(self.do_start.__doc__)
        return

    option = argss[0]
    del argss[0]
    if option == 'mysql':
        print("Not yet implemented")
        return

    system = option
    if system != '*':
        if len(argss) < 1:
            print(self.do_start.__doc__)
            return
        component = argss[0]
    else:
        # A wildcard system implies every component
        component = '*'

    client = SystemAdministratorClient(self.host, self.port)
    result = client.startComponent(system, component)
    if not result['OK']:
        self.__errMsg(result['Message'])
        return

    if system != '*' and component != '*':
        print("\n%s_%s started successfully, runit status:\n" % (system, component))
    else:
        print("\nComponents started successfully, runit status:\n")
    for comp in result['Value']:
        print("%s : %s" % (comp.rjust(32), result['Value'][comp]['RunitStatus']))
def updateHost(hostName, version):
    """
    Apply update to specific host

    The host is pinged first and the update is only attempted when the ping
    succeeds.

    :param str hostName: name of the host you want to update
    :param str version: version vArBpC you want to update to
    :return: the failed ping or update result, S_OK() on success
    """
    host, port = parseHostname(hostName)
    client = SystemAdministratorClient(host, port)

    pingResult = client.ping()
    if not pingResult['OK']:
        gLogger.error("Cannot connect to %s" % host)
        return pingResult

    gLogger.notice(
        "Initiating software update of %s, this can take a while, please be patient ..." % host)
    updateResult = client.updateSoftware(version, '', '', timeout=600)
    return S_OK() if updateResult['OK'] else updateResult
def web_getHostErrors(self):
    """
    Send back the log error summary of every component of one host.

    The host is read from the ``host`` request argument. The answer is sent
    through ``self.finish``: ``success`` "true" with ``result`` (one record
    per component) and ``total``, or ``success`` "false" with an ``error``
    text. The remote call is run through ``self.threadTask`` and awaited
    with ``yield``.
    """
    session = self.getSessionData()
    userDN = str(session["user"]["DN"])
    userGroup = str(session["user"]["group"])

    if "host" not in self.request.arguments:
        self.finish({"success": "false",
                     "error": "Name of the host is missing or not defined"})
        return
    host = str(self.request.arguments["host"][0])

    client = SystemAdministratorClient(host, None,
                                       delegatedDN=userDN,
                                       delegatedGroup=userGroup)
    answer = yield self.threadTask(client.checkComponentLog, "*")
    gLogger.debug(answer)
    if not answer["OK"]:
        self.finish({"success": "false", "error": answer["Message"]})
        return

    records = []
    # Keys of the answer have the form "<system>/<component>"
    for componentPath, record in answer["Value"].items():
        system, component = componentPath.split("/")
        record["System"] = system
        record["Name"] = component
        record["Host"] = host
        records.append(record)

    self.finish({"success": "true", "result": records, "total": len(records)})
def getLog(self, argss):
    """ Get the tail of the log file of the given component

        :param list argss: ``[system, component]`` optionally followed by the
                           number of lines to fetch (40 when omitted); too
                           few arguments or a non-numeric line count print
                           the usage of the ``show`` command
    """
    if len(argss) < 2:
        print("")
        print(self.do_show.__doc__)
        return

    system = argss[0]
    component = argss[1]
    nLines = 40
    if len(argss) > 2:
        try:
            nLines = int(argss[2])
        except ValueError:
            # Not a number: show the usage rather than a traceback
            print("")
            print(self.do_show.__doc__)
            return

    client = SystemAdministratorClient(self.host, self.port)
    result = client.getLogTail(system, component, nLines)
    if not result['OK']:
        self.__errMsg(result['Message'])
        return

    # The log is stored under "<system>_<component>"; an answer without
    # that key is treated like an empty answer
    logs = result['Value']
    key = '_'.join([system, component])
    if not logs or key not in logs:
        print("No logs found")
        return
    for line in logs[key].split('\n'):
        print("  %s" % line)
def __getClient(self):
    """Build a SystemAdministratorClient for this object's host and port."""
    adminClient = SystemAdministratorClient(self.host, self.port)
    return adminClient
def do_install(self, args):
    """ Install various DIRAC components

        usage:

          install mysql
          install db <database>
          install service <system> <service> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
          install agent <system> <agent> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
          install executor <system> <executor> [-m <ModuleName>] [-p <Option>=<Value>] [-p <Option>=<Value>] ...
    """
    # Errors are reported through self.__errMsg; missing or malformed
    # arguments print the usage text above.
    argss = args.split()
    if not argss:
        print(self.do_install.__doc__)
        return

    option = argss[0]
    del argss[0]

    if option == "mysql":
        print("Installing MySQL database, this can take a while ...")
        client = SystemAdministratorClient(self.host, self.port)
        if InstallTools.mysqlPassword == 'LocalConfig':
            InstallTools.mysqlPassword = ''
        InstallTools.getMySQLPasswords()
        result = client.installMySQL(InstallTools.mysqlRootPwd, InstallTools.mysqlPassword)
        if not result['OK']:
            self.__errMsg(result['Message'])
        else:
            print("MySQL: %s" % (result['Value'],))
            print("You might need to restart SystemAdministrator service to take new settings into account")

    elif option == "db":
        if not argss:
            print(self.do_install.__doc__)
            return
        database = argss[0]
        client = SystemAdministratorClient(self.host, self.port)
        result = client.getAvailableDatabases()
        if not result['OK']:
            self.__errMsg("Can not get database list: %s" % result['Message'])
            return
        if database not in result['Value']:
            self.__errMsg("Unknown database %s: " % database)
            return
        system = result['Value'][database]['System']

        # The system must have an instance defined in the current setup
        setup = gConfig.getValue('/DIRAC/Setup', '')
        if not setup:
            self.__errMsg("Unknown current setup")
            return
        instance = gConfig.getValue('/DIRAC/Setups/%s/%s' % (setup, system), '')
        if not instance:
            self.__errMsg("No instance defined for system %s" % system)
            self.__errMsg("\tAdd new instance with 'add instance %s <instance_name>'" % system)
            return

        if not InstallTools.mysqlPassword:
            InstallTools.mysqlPassword = '******'
        InstallTools.getMySQLPasswords()
        result = client.installDatabase(database, InstallTools.mysqlRootPwd)
        if not result['OK']:
            self.__errMsg(result['Message'])
            return
        extension, system = result['Value']

        # result = client.addDatabaseOptionsToCS( system, database )
        InstallTools.mysqlHost = self.host
        result = client.getInfo()
        if not result['OK']:
            self.__errMsg(result['Message'])
            # Without the host info there is no setup to register under
            return
        hostSetup = result['Value']['Setup']
        result = InstallTools.addDatabaseOptionsToCS(gConfig, system, database, hostSetup)
        if not result['OK']:
            self.__errMsg(result['Message'])
            return
        print("Database %s from %s/%s installed successfully" % (database, extension, system))

    elif option in ["service", "agent", "executor"]:
        if len(argss) < 2:
            print(self.do_install.__doc__)
            return
        system = argss[0]
        del argss[0]
        component = argss[0]
        del argss[0]

        # Collect the "-m <ModuleName>" and "-p <Option>=<Value>" switches.
        # A switch without a value, or a -p value without "=", shows the
        # usage text instead of raising.
        specialOptions = {}
        module = ''
        for i, word in enumerate(argss):
            if word not in ("-m", "-p"):
                continue
            if i + 1 >= len(argss):
                print(self.do_install.__doc__)
                return
            if word == "-m":
                specialOptions['Module'] = argss[i + 1]
                module = argss[i + 1]
            else:
                opt, separator, value = argss[i + 1].partition('=')
                if not separator:
                    print(self.do_install.__doc__)
                    return
                specialOptions[opt] = value
        if module == component:
            module = ''

        client = SystemAdministratorClient(self.host, self.port)
        # First need to update the CS
        # result = client.addDefaultOptionsToCS( option, system, component )
        InstallTools.host = self.host
        result = client.getInfo()
        if not result['OK']:
            self.__errMsg(result['Message'])
            return
        hostSetup = result['Value']['Setup']

        if module:
            # Install Module section if not yet there
            # NOTE(review): the outcome of this first call is overwritten
            # by the next one without being checked - confirm that this is
            # intended.
            result = InstallTools.addDefaultOptionsToCS(gConfig, option, system, module,
                                                        getCSExtensions(), hostSetup)
            # Add component section with specific parameters only
            result = InstallTools.addDefaultOptionsToCS(gConfig, option, system, component,
                                                        getCSExtensions(), hostSetup,
                                                        specialOptions,
                                                        addDefaultOptions=False)
        else:
            # Install component section
            result = InstallTools.addDefaultOptionsToCS(gConfig, option, system, component,
                                                        getCSExtensions(), hostSetup,
                                                        specialOptions)
        if not result['OK']:
            self.__errMsg(result['Message'])
            return

        # Then we can install and start the component
        result = client.setupComponent(option, system, component, module)
        if not result['OK']:
            self.__errMsg(result['Message'])
            return
        compType = result['Value']['ComponentType']
        runit = result['Value']['RunitStatus']
        print("%s %s_%s is installed, runit status: %s" % (compType, system, component, runit))

    else:
        print("Unknown option: %s" % (option,))