def checkProcesses( liveInstances ):
    '''check that the expected process is running on each instance'''
    logger.info( 'checking %d instance(s)', len(liveInstances) )
    #maybe should weed out some instances
    goodInstances = liveInstances
    cmd = "ps -ef | grep -v grep | grep 'agent.jar' > /dev/null"
    #cmd = "free --mega 1>&2 ; ps -ef | grep -v grep | grep 'LoadGeneratorAgent start' > /dev/null"

    # check for a running agent process on each instance
    stepStatuses = tellInstances.tellInstances( goodInstances, cmd,
        timeLimit=30*60, knownHostsOnly=True
        )
    #logger.info( 'cmd statuses: %s', stepStatuses )
    errorsByIid = {}
    for statusRec in stepStatuses:
        status = statusRec['status']
        if status:
            logger.info( 'statusRec: %s', statusRec )
            # only certain non-null outcomes are true errors
            if isinstance( status, Exception ):
                errorsByIid[ statusRec['instanceId'] ] = statusRec
            elif status not in [0, 1]:
                errorsByIid[ statusRec['instanceId'] ] = statusRec
    logger.info( 'errorsByIid: %s', errorsByIid )
    return errorsByIid
def retrieveLogs( liveInstances, neoloadVersion ):
    '''
    instancesFilePath = os.path.join( dataDirPath, 'startedAgents.json' )
    startedInstances = []
    # get details of launched instances from the json file
    #TODO should get list of instances with good install, rather than all started instances
    with open( instancesFilePath, 'r') as jsonInFile:
        try:
            startedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    '''
    #maybe should weed out some instances
    goodInstances = liveInstances
    logger.info( 'retrieving logs for %d instance(s)', len(goodInstances) )
    truncVersion = truncateVersion( neoloadVersion )
    agentLogFilePath = '/root/.neotys/neoload/v%s/logs/*.log' % truncVersion

    # download the agent.log file from each instance
    stepStatuses = tellInstances.tellInstances( goodInstances,
        download=agentLogFilePath, downloadDestDir=dataDirPath +'/agentLogs',
        timeLimit=30*60, knownHostsOnly=True
        )
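# truncateVersion() is referenced above but defined elsewhere in the script.
# A minimal sketch of one plausible implementation, assuming the NeoLoad log
# directory name uses only the major.minor part of the version string
# (e.g. '7.10.2' -> '7.10'); this is an assumption, not the actual helper.
def truncateVersion( neoloadVersion ):
    '''return the major.minor portion of a dotted version string (assumed behavior)'''
    return '.'.join( neoloadVersion.split( '.' )[:2] )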
def retrieveLogs( goodInstances ):
    '''download proxy log files from each of the given instances'''
    logger.info( 'retrieving logs for %d instance(s)', len(goodInstances) )
    proxyLogFilePath = '/var/log/squid/*.log'
    stepStatuses = tellInstances.tellInstances( goodInstances,
        download=proxyLogFilePath, downloadDestDir=dataDirPath +'/proxyLogs',
        timeLimit=30*60, knownHostsOnly=True
        )
def retrieveLogs( goodInstances ):
    '''download agent log files from each of the given instances'''
    logger.info( 'retrieving logs for %d instance(s)', len(goodInstances) )
    #agentLogFilePath = 'lzAgent/Logs/*'
    agentLogFilePath = 'lzAgent/agent.*s'
    # download the agent.log file from each instance
    stepStatuses = tellInstances.tellInstances( goodInstances,
        download=agentLogFilePath, downloadDestDir=dataDirPath +'/agentLogs',
        timeLimit=30*60, knownHostsOnly=True
        )
def retrieveLogs( liveInstances, farmDirPath ):
    '''retrieve logs from nodes, storing them in dataDirPath/nodeLogs subdirectories'''
    goodInstances = liveInstances
    logger.info( 'retrieving logs for %d instance(s)', len(goodInstances) )
    nodeLogFilePath = farmDirPath + '/geth.log'

    # download the log file from each instance
    stepStatuses = tellInstances.tellInstances( goodInstances,
        download=nodeLogFilePath, downloadDestDir=dataDirPath +'/nodeLogs',
        timeLimit=15*60, knownHostsOnly=True
        )
    #logger.info( 'download statuses: %s', stepStatuses )
    errorsByIid = {status['instanceId']: status['status'] for status in stepStatuses if status['status'] }
    return errorsByIid
def checkProcesses( liveInstances ):
    '''check that the expected geth process is running on each instance'''
    logger.info( 'checking %d instance(s)', len(liveInstances) )
    #maybe should weed out some instances
    goodInstances = liveInstances
    cmd = "ps -ef | grep -v grep | grep 'geth' > /dev/null"
    # check for a running geth process on each instance
    stepStatuses = tellInstances.tellInstances( goodInstances, cmd,
        timeLimit=15*60,
        resultsLogFilePath = dataDirPath + '/checkProcesses.jlog',
        knownHostsOnly=True
        )
    #logger.info( 'proc statuses: %s', stepStatuses )
    errorsByIid = {status['instanceId']: status['status'] for status in stepStatuses if status['status'] }
    #logger.info( 'errorsByIid: %s', errorsByIid )
    return errorsByIid
def checkProcesses( goodInstances ):
    '''check that the expected process is running on each instance'''
    logger.info( 'checking %d instance(s)', len(goodInstances) )
    cmd = "ps -ef | grep -v grep | grep 'squid' > /dev/null"
    stepStatuses = tellInstances.tellInstances( goodInstances, cmd,
        timeLimit=30*60, knownHostsOnly=True
        )
    #logger.info( 'cmd statuses: %s', stepStatuses )
    errorsByIid = {}
    for status in stepStatuses:
        if status['status']:
            # might like to be more selective here
            #logger.info( 'status: %s', status )
            errorsByIid[ status['instanceId'] ] = status
    logger.info( 'errorsByIid: %s', errorsByIid )
    return errorsByIid
def checkInstanceClocks( liveInstances, dataDirPath ):
    '''check each instance's system clock against the local clock, returning errors by instance id'''
    jlogFilePath = dataDirPath + '/checkInstanceClocks.jlog'
    allIids = [inst['instanceId'] for inst in liveInstances ]
    unfoundIids = set( allIids )
    cmd = "date --iso-8601=seconds"
    # get the current date-time from each instance
    stepStatuses = tellInstances.tellInstances( liveInstances, cmd,
        timeLimit=2*60,
        resultsLogFilePath = jlogFilePath,
        knownHostsOnly=True, sshAgent=not True
        )
    #logger.info( 'proc statuses: %s', stepStatuses )
    errorsByIid = {status['instanceId']: status['status'] for status in stepStatuses if status['status'] }
    logger.info( 'preliminary errorsByIid: %s', errorsByIid )
    for iid, status in errorsByIid.items():
        logger.warning( 'instance %s gave error "%s"', iid, status )

    # compare each instance's reported time against the local time recorded for that step
    with open( jlogFilePath, 'rb' ) as inFile:
        for line in inFile:
            decoded = json.loads( line )
            iid = decoded['instanceId']
            if decoded.get( 'stdout' ):
                #logger.info( decoded )
                masterDateTime = dateutil.parser.parse( decoded['dateTime'] )
                try:
                    nodeDateTime = dateutil.parser.parse( decoded['stdout'] )
                except Exception as exc:
                    logger.warning( 'exception parsing %s', decoded['stdout'] )
                    errorsByIid[ iid ] = {'exception': exc }
                else:
                    unfoundIids.discard( iid )
                    delta = masterDateTime - nodeDateTime
                    discrep = delta.total_seconds()
                    logger.info( 'discrep: %.1f seconds on inst %s', discrep, iid )
                    if discrep > 4 or discrep < -1:
                        logger.warning( 'bad time discrep: %.1f', discrep )
                        errorsByIid[ iid ] = {'discrep': discrep }
    if unfoundIids:
        logger.warning( 'unfoundIids: %s', unfoundIids )
        for iid in list( unfoundIids ):
            if iid not in errorsByIid:
                errorsByIid[ iid ] = {'found': False }
    logger.info( '%d errorsByIid: %s', len(errorsByIid), errorsByIid )
    return errorsByIid
def checkProcesses( liveInstances ):
    '''check that the expected process is running on each instance'''
    logger.info( 'checking %d instance(s)', len(liveInstances) )
    #maybe should weed out some instances
    goodInstances = liveInstances
    cmd = "ps -ef | grep -v grep | grep 'LoadGeneratorAgent start' > /dev/null"
    #cmd = "free --mega 1>&2 ; ps -ef | grep -v grep | grep 'LoadGeneratorAgent start' > /dev/null"

    # check for a running agent process on each instance
    stepStatuses = tellInstances.tellInstances( goodInstances, cmd,
        timeLimit=30*60, knownHostsOnly=True
        )
    #logger.info( 'cmd statuses: %s', stepStatuses )
    errorsByIid = {}
    for status in stepStatuses:
        if status['status']:
            # might like to be more selective here
            #logger.info( 'status: %s', status )
            errorsByIid[ status['instanceId'] ] = status
    logger.debug( 'errorsByIid: %s', errorsByIid )
    return errorsByIid
with open(launchedJsonFilePath, 'r') as jsonInFile:
    try:
        launchedInstances = json.load(jsonInFile)  # an array
    except Exception as exc:
        logger.warning('could not load json (%s) %s', type(exc), exc)
startedInstances = [ inst for inst in launchedInstances if inst['state'] == 'started' ]
logger.info('%d instances were launched', len(startedInstances))

starterCmd = 'geth --config netconfig/%s.config.toml --password pw.txt --unlock $(cat accountAddr.txt) >> ether/%s/stdout.txt 2>>ether/%s/geth.log </dev/null &' % \
    (configName, configName, configName)

# start the client on each instance
stepStatuses = tellInstances.tellInstances(
    startedInstances, command=starterCmd,
    resultsLogFilePath=outDataDir + '/startClients.jlog',
    timeLimit=30 * 60, knownHostsOnly=True)
logger.debug('starter statuses: %s', stepStatuses)

# make a list of instances where the client was started
goodIids = []
for status in stepStatuses:
    if isinstance(status['status'], int) and status['status'] == 0:
        goodIids.append(status['instanceId'])
    else:
        logger.warning('could not start client on %s', status['instanceId'][0:8])

if launchedInstances:
    print( 'when you want to terminate these instances, use %s terminateGethNodes.py "%s"'
        % (sys.executable, outDataDir))
startedInstances = [inst for inst in launchedInstances if inst['state'] == 'started' ]
logger.info( '%d instances were launched', len(startedInstances) )

installedInstances = []
recruiterLog = readJLog( outDataDir +'/recruitInstances.jlog' )
for logEntry in recruiterLog:
    if 'returncode' in logEntry and logEntry['returncode'] == 0:
        installedInstances.append( instancesByIid[ logEntry['instanceId'] ] )
logger.info( '%d instances were installed', len(installedInstances) )
installedIids = [inst['instanceId'] for inst in installedInstances ]

# get the ip addr of each instance
cmd = 'curl -s -S https://api.ipify.org > ipAddr.txt'
stepStatuses = tellInstances.tellInstances( installedInstances, command=cmd,
    resultsLogFilePath=outDataDir +'/getIpAddr.jlog',
    download='ipAddr.txt', downloadDestDir=outDataDir +'/agentLogs',
    timeLimit=6*60, knownHostsOnly=True
    )
logger.debug( 'download statuses: %s', stepStatuses )

# merge the retrieved ip addrs into instance records in a new array
goodInstances = []
for iid in installedIids:
    ipFilePath = os.path.join( outDataDir, 'agentLogs', iid, 'ipAddr.txt' )
    if os.path.isfile( ipFilePath ):
        with open( ipFilePath, 'r' ) as ipFile:
            ipAddr = ipFile.read().strip()
            if ipAddr:
                inst = instancesByIid.get( iid )
                if inst:
                    inst['ipAddr'] = ipAddr
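# readJLog() is used above but defined elsewhere. A minimal sketch, assuming a
# .jlog file holds one JSON record per line (the same format checkInstanceClocks
# parses with json.loads); not necessarily the repo's actual implementation.
import json

def readJLog( filePath ):
    '''read a newline-delimited JSON log file, returning a list of record dicts (assumed format)'''
    recs = []
    with open( filePath, 'r', encoding='utf8' ) as inFile:
        for line in inFile:
            line = line.strip()
            if line:
                recs.append( json.loads( line ) )
    return recs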
for index, inst in enumerate( startedInstances ):
    iid = inst['instanceId']
    portMap[iid] = index + portRangeStart

logger.info( 'configuring agents' )
returnCodes = configureAgents( startedInstances, ports, timeLimit=600 )
for index, code in enumerate( returnCodes ):
    if code == 0:
        configuredInstances.append( startedInstances[index] )
    else:
        iid = startedInstances[index].get('instanceId')
        logger.info( 'inst %s was not configured properly', iid[0:8] )

# start the agent on each instance
stepStatuses = tellInstances.tellInstances( configuredInstances, command=starterCmd,
    resultsLogFilePath=outDataDir +'/startAgents.jlog',
    timeLimit=30*60, knownHostsOnly=True
    )
logger.debug( 'starter statuses: %s', stepStatuses )

# make a list of instances where the agent was started
goodIids = []
for status in stepStatuses:
    if isinstance( status['status'], int) and status['status'] == 0:
        goodIids.append( status['instanceId'] )
    else:
        logger.warning( 'could not start agent on %s', status['instanceId'][0:8] )
#COULD check bound ports again here
#COULD download logs from all installed instances rather than just good-started instances
goodInstances = [inst for inst in startedInstances if inst['instanceId'] in goodIids ]
if goodInstances:
    time.sleep( 60 )