def allocateInstances( nWorkersWanted, launchedJsonFilePath ):
    '''obtain instances, reusing released ones when enough are queued, otherwise launching new ones

    Reuse is all-or-nothing: released instances are taken only when the queue
    holds at least nWorkersWanted of them.

    Args:
        nWorkersWanted: number of instances requested
        launchedJsonFilePath: json file that receives the instance list
            (written here when reusing; passed to launchInstances otherwise)

    Returns:
        the reused instances, or the list loaded from launchedJsonFilePath after
        launching (not filtered by state; may be shorter than requested, and is
        empty if the json could not be parsed)

    Raises:
        ValueError: if fewer devices are available than requested
        SystemExit: if a temporary ssh client key could not be uploaded
    '''
    # see if there are enough released ones to reuse
    toReuse = []
    with g_releasedInstancesLock:
        if len( g_releasedInstances ) >= nWorkersWanted:
            toReuse = [g_releasedInstances.popleft() for _ in range( nWorkersWanted )]
    if toReuse:
        logger.info( 'REUSING instances')
        with open( launchedJsonFilePath,'w' ) as outFile:
            json.dump( toReuse, outFile )
        return toReuse

    nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
    if nWorkersWanted > nAvail:
        logger.error( 'not enough devices available (%d requested)', nWorkersWanted )
        raise ValueError( 'not enough devices available')

    # use the caller-provided key name, or upload a uniquely-named temporary key
    if args.sshClientKeyName:
        sshClientKeyName = args.sshClientKeyName
    else:
        keyContents = runDistributedBlender.loadSshPubKey()
        randomPart = str( uuid.uuid4() )[0:13]
        keyContents += ' #' + randomPart
        sshClientKeyName = 'bfr_%s' % (randomPart)
        respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
        if respCode < 200 or respCode >= 300:
            logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
            sys.exit( 'could not upload SSH client key')

    try:
        logResult( 'launchInstances', nWorkersWanted, 'allocateInstances' )
        rc = runDistributedBlender.launchInstances( args.authToken, nWorkersWanted,
            sshClientKeyName, launchedJsonFilePath, filtersJson=args.filter )
        if rc:
            logger.debug( 'launchInstances returned %d', rc )
    finally:
        # delete sshClientKey only if we just uploaded it; done in a finally
        # clause so the temporary key is not left behind if the launch raises
        if sshClientKeyName != args.sshClientKeyName:
            logger.info( 'deleting sshClientKey %s', sshClientKeyName)
            ncs.deleteSshClientKey( args.authToken, sshClientKeyName )

    # get instances from the launched json file
    launchedInstances = []
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )

    # add the launched instances to the user's known_hosts file
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( launchedInstances, khFile )
    return launchedInstances
# Launch phase: when launching is wanted, check device availability, make sure
# an ssh client key is registered, launch the instances, then clean up the key.
# NOTE(review): the enclosing scope of this fragment is not visible here.
if launchWanted:
    nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
    # a request may exceed the reported count by up to 5 before it is rejected
    if nWorkersWanted > (nAvail + 5):
        logger.error( 'not enough devices available (%d requested)', nWorkersWanted )
        sys.exit(1)
    # a request of 0 is replaced by the number of available devices
    if nWorkersWanted == 0:
        logger.info( '%d devices available to launch', nAvail )
        nWorkersWanted = nAvail
    # use the provided key name, or upload a uniquely-named temporary key
    if args.sshClientKeyName:
        sshClientKeyName = args.sshClientKeyName
    else:
        keyContents = loadSshPubKey()
        randomPart = str( uuid.uuid4() )[0:13]
        keyContents += ' #' + randomPart
        sshClientKeyName = 'pingtest_%s' % (randomPart)
        respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
        # anything outside the 2xx range counts as an upload failure
        if respCode < 200 or respCode >= 300:
            logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
            sys.exit( 'could not upload SSH client key')
    rc = launchInstances( args.authToken, launchedJsonFilePath, nWorkersWanted, sshClientKeyName, filtersJson=args.filter )
    # delete sshClientKey only if we just uploaded it
    if sshClientKeyName != args.sshClientKeyName:
        logger.info( 'deleting sshClientKey %s', sshClientKeyName)
        ncs.deleteSshClientKey( args.authToken, sshClientKeyName )
    # a non-zero launch result is fatal (checked after the key cleanup above)
    if rc:
        logger.warning( 'launchInstances returned %d', rc )
        sys.exit( 'could not launch instances')
# find out if any instances were started
startedInstances = getStartedInstances( launchedJsonFilePath )  # a list of instance dicts
# do the actual work, unless we shouldn't
def recruitInstances(nWorkersWanted, launchedJsonFilePath, launchWanted,
                     resultsLogFilePath, installerFileName):
    '''launch instances and install fahclient on them; terminate those that could not install

    Args:
        nWorkersWanted: number of instances requested
        launchedJsonFilePath: json file listing launched instances (written by
            launchInstances when launchWanted, otherwise read as-is)
        launchWanted: when true, launch new instances before installing
        resultsLogFilePath: passed through to tellInstances
        installerFileName: installer file, run on each instance as './<name>'

    Returns:
        a tuple (goodInstances, badOnes): the instance dicts whose install
        returned status 0, and the tellInstances status records of the others.
        ([], []) is returned when nothing started, when the installer returned
        no statuses, or when the install step was skipped because SIGTERM was
        signaled (in that last case started instances are not terminated here).

    Raises:
        ValueError: if fewer devices are available than requested
        SystemExit: if a temporary ssh client key could not be uploaded
    '''
    logger.info('recruiting up to %d instances', nWorkersWanted)
    goodInstances = []
    # bound up front so the final return is well-defined even when the install
    # step is skipped because SIGTERM was signaled
    badOnes = []
    if launchWanted:
        nAvail = ncs.getAvailableDeviceCount(args.authToken, filtersJson=args.filter)
        if nWorkersWanted > nAvail:
            logger.error(
                'not enough devices available (%d requested, %d avail)',
                nWorkersWanted, nAvail)
            raise ValueError('not enough devices available')
        # upload an sshClientKey for launch (unless one was provided)
        if args.sshClientKeyName:
            sshClientKeyName = args.sshClientKeyName
        else:
            keyContents = loadSshPubKey().strip()
            randomPart = str(uuid.uuid4())[0:13]
            sshClientKeyName = 'fah_%s' % (randomPart)
            respCode = ncs.uploadSshClientKey(args.authToken, sshClientKeyName, keyContents)
            if respCode < 200 or respCode >= 300:
                logger.warning('ncs.uploadSshClientKey returned %s', respCode)
                sys.exit('could not upload SSH client key')
        try:
            # launch
            rc = launchInstances(args.authToken, nWorkersWanted, sshClientKeyName,
                                 launchedJsonFilePath, filtersJson=args.filter,
                                 encryptFiles=args.encryptFiles)
            if rc:
                logger.info('launchInstances returned %d', rc)
        finally:
            # delete sshClientKey only if we just uploaded it; done in a finally
            # clause so the temporary key is not left behind if the launch raises
            if sshClientKeyName != args.sshClientKeyName:
                logger.info('deleting sshClientKey %s', sshClientKeyName)
                ncs.deleteSshClientKey(args.authToken, sshClientKeyName)

    # get instances from the launched json file
    launchedInstances = []
    with open(launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best effort: an unreadable file is treated as "nothing launched"
            logger.warning('could not load json (%s) %s', type(exc), exc)
    if len(launchedInstances) < nWorkersWanted:
        logger.warning(
            'could not launch as many instances as wanted (%d vs %d)',
            len(launchedInstances), nWorkersWanted)

    nonstartedIids = [
        inst['instanceId'] for inst in launchedInstances
        if inst['state'] != 'started'
    ]
    if nonstartedIids:
        logger.warning('terminating non-started instances %s', nonstartedIids)
        terminateNcsScInstances(args.authToken, nonstartedIids)
        logger.info('done terminating non-started instances')

    # proceed with instances that were actually started
    startedInstances = [
        inst for inst in launchedInstances if inst['state'] == 'started'
    ]
    if not startedInstances:
        return ([], [])

    if not sigtermSignaled():
        installerCmd = './' + installerFileName
        logger.info('calling tellInstances to install on %d instances',
                    len(startedInstances))
        stepStatuses = tellInstances.tellInstances(
            startedInstances, installerCmd,
            resultsLogFilePath=resultsLogFilePath,
            download=None, downloadDestDir=None, jsonOut=None,
            sshAgent=args.sshAgent,
            timeLimit=args.timeLimit, upload=args.uploads,
            stopOnSigterm=False,
            knownHostsOnly=False)
        # SHOULD restore our handler because tellInstances may have overridden it
        #signal.signal( signal.SIGTERM, sigtermHandler )
        if not stepStatuses:
            # nothing can be verified, so give up on every started instance
            logger.warning('no statuses returned from installer')
            startedIids = [inst['instanceId'] for inst in startedInstances]
            terminateNcsScInstances(args.authToken, startedIids)
            return ([], [])

        # separate good tellInstances statuses from bad ones;
        # only an integer status of 0 counts as a successful install
        goodOnes = []
        for status in stepStatuses:
            if isinstance(status['status'], int) and status['status'] == 0:
                goodOnes.append(status['instanceId'])
            else:
                badOnes.append(status)
                if isinstance(status['status'], asyncio.TimeoutError):
                    logger.info('installer status asyncio.TimeoutError')
        logger.info('%d good installs, %d bad installs', len(goodOnes),
                    len(badOnes))
        goodInstances = [
            inst for inst in startedInstances
            if inst['instanceId'] in goodOnes
        ]
        badIids = [status['instanceId'] for status in badOnes]
        if badIids:
            terminateNcsScInstances(args.authToken, badIids)
    return goodInstances, badOnes
def recruitInstances( nWorkersWanted, launchedJsonFilePath, launchWanted, resultsLogFilePath='' ):
    '''launch instances and install blender on them; terminate those that could not install

    Args:
        nWorkersWanted: number of instances requested
        launchedJsonFilePath: json file listing launched instances (written by
            launchInstances when launchWanted, otherwise read as-is)
        launchWanted: when true, launch new instances before installing
        resultsLogFilePath: passed through to tellInstances; when empty,
            dataDirPath+'/recruitInstances.jlog' is used

    Returns:
        list of instance dicts on which the install succeeded; empty if the
        installer returned no statuses or if the install step was skipped
        because SIGTERM was signaled

    Raises:
        ValueError: if fewer devices are available than requested
        SystemExit: if a temporary ssh client key could not be uploaded
    '''
    logger.info( 'recruiting up to %d instances', nWorkersWanted )
    if not resultsLogFilePath:
        resultsLogFilePath = dataDirPath+'/recruitInstances.jlog'
    goodInstances = []
    if launchWanted:
        logger.info( 'recruiting %d instances', nWorkersWanted )
        nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
        if nWorkersWanted > nAvail:
            logger.error( 'not enough devices available (%d requested, %d avail)', nWorkersWanted, nAvail )
            raise ValueError( 'not enough devices available')
        # prepare sshClientKey for launch
        if args.sshClientKeyName:
            sshClientKeyName = args.sshClientKeyName
        else:
            keyContents = loadSshPubKey().strip()
            randomPart = str( uuid.uuid4() )[0:13]
            sshClientKeyName = 'bfr_%s' % (randomPart)
            respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
            if respCode < 200 or respCode >= 300:
                logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
                sys.exit( 'could not upload SSH client key')
        try:
            # launch
            logOperation( 'launchInstances', nWorkersWanted, '<recruitInstances>' )
            rc = launchInstances( args.authToken, nWorkersWanted,
                sshClientKeyName, launchedJsonFilePath,
                filtersJson=args.filter, encryptFiles = args.encryptFiles )
            if rc:
                logger.debug( 'launchInstances returned %d', rc )
        finally:
            # delete sshClientKey only if we just uploaded it; done in a finally
            # clause so the temporary key is not left behind if the launch raises
            if sshClientKeyName != args.sshClientKeyName:
                logger.info( 'deleting sshClientKey %s', sshClientKeyName)
                ncs.deleteSshClientKey( args.authToken, sshClientKeyName )

    # get instances from the launched json file
    launchedInstances = []
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )

    nonstartedIids = [inst['instanceId'] for inst in launchedInstances if inst['state'] != 'started' ]
    if nonstartedIids:
        logger.warning( 'terminating non-started instances %s', nonstartedIids )
        ncs.terminateInstances( args.authToken, nonstartedIids )
    # proceed with instances that were actually started
    startedInstances = [inst for inst in launchedInstances if inst['state'] == 'started' ]
    # add instances to knownHosts
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( startedInstances, khFile )

    # install blender on startedInstances
    if not sigtermSignaled():
        installerCmd = 'sudo apt-get -qq update && sudo apt-get -qq -y install blender > /dev/null'
        logger.info( 'calling tellInstances to install on %d instances', len(startedInstances))
        stepStatuses = tellInstances.tellInstances( startedInstances, installerCmd,
            resultsLogFilePath=resultsLogFilePath,
            download=None, downloadDestDir=None, jsonOut=None, sshAgent=args.sshAgent,
            timeLimit=min(args.instTimeLimit, args.timeLimit), upload=None,
            stopOnSigterm=False,
            knownHostsOnly=True
            )
        # SHOULD restore our handler because tellInstances may have overridden it
        #signal.signal( signal.SIGTERM, sigtermHandler )
        if not stepStatuses:
            # nothing can be verified, so give up on every started instance
            logger.warning( 'no statuses returned from installer')
            startedIids = [inst['instanceId'] for inst in startedInstances]
            logOperation( 'terminateBad', startedIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, startedIids )
            return []
        (goodOnes, badOnes) = triage( stepStatuses )
        logger.info( '%d good installs, %d bad installs', len(goodOnes), len(badOnes) )
        logger.info( 'stepStatuses %s', stepStatuses )
        goodInstances = [inst for inst in startedInstances if inst['instanceId'] in goodOnes ]
        # badOnes entries are indexed by 'instanceId' here, i.e. status records
        badIids = [status['instanceId'] for status in badOnes]
        if badIids:
            logOperation( 'terminateBad', badIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, badIids )
    return goodInstances
def recruitInstance( launchedJsonFilePath, resultsLogFilePathIgnored ):
    '''launch a single instance and install blender on it via ssh

    Args:
        launchedJsonFilePath: json file that launchInstances writes and that is
            read back here to find the launched instance
        resultsLogFilePathIgnored: unused

    Returns:
        the instance dict if the installer exited with code 0; otherwise None
        (launch failure, not exactly one started instance, SIGTERM signaled,
        installer failure, or timeout). An instance that failed to install
        is terminated before returning.

    Raises:
        Exception: if a temporary ssh client key could not be uploaded
    '''
    logger.info( 'recruiting 1 instance' )
    nWorkersWanted = 1
    # prepare sshClientKey for launch
    if args.sshClientKeyName:
        sshClientKeyName = args.sshClientKeyName
    else:
        keyContents = loadSshPubKey().strip()
        randomPart = str( uuid.uuid4() )[0:13]
        sshClientKeyName = 'bfr_%s' % (randomPart)
        respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
        if respCode < 200 or respCode >= 300:
            logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
            raise Exception( 'could not upload SSH client key')
    try:
        # launch
        logOperation( 'launchInstances', 1, '<recruitInstances>' )
        rc = launchInstances( args.authToken, 1,
            sshClientKeyName, launchedJsonFilePath,
            filtersJson=args.filter, encryptFiles = args.encryptFiles )
        if rc:
            logger.debug( 'launchInstances returned %d', rc )
    finally:
        # delete sshClientKey only if we just uploaded it; done in a finally
        # clause so the temporary key is not left behind if the launch raises
        if sshClientKeyName != args.sshClientKeyName:
            logger.info( 'deleting sshClientKey %s', sshClientKeyName)
            ncs.deleteSshClientKey( args.authToken, sshClientKeyName )
    if rc:
        return None

    # get instances from the launched json file
    launchedInstances = []
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )

    nonstartedIids = [inst['instanceId'] for inst in launchedInstances if inst['state'] != 'started' ]
    if nonstartedIids:
        logger.warning( 'terminating non-started instances %s', nonstartedIids )
        ncs.terminateInstances( args.authToken, nonstartedIids )
    # proceed with instances that were actually started
    startedInstances = [inst for inst in launchedInstances if inst['state'] == 'started' ]
    if len(startedInstances) != 1:
        logger.warning( 'launched %d instances', len(startedInstances) )
        return None
    inst = startedInstances[0]
    iid = inst['instanceId']
    abbrevIid = iid[0:16]  # shortened id for log readability

    def trackStderr( proc ):
        '''echo and log each stderr line of proc until its stderr stream ends'''
        for line in proc.stderr:
            print( '<stderr>', abbrevIid, line.strip(), file=sys.stderr )
            logInstallerEvent( 'stderr', line.strip(), iid )

    if sigtermSignaled():
        logger.warning( 'terminating instance because sigtermSignaled %s', iid )
        logOperation( 'terminateFinal', [iid], '<master>' )
        ncs.terminateInstances( args.authToken, [iid] )
        return None
    else:
        # add instance to knownHosts
        with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
            jsonToKnownHosts.jsonToKnownHosts( startedInstances, khFile )
        # install blender on startedInstance
        installerCmd = 'sudo apt-get -qq update && sudo apt-get -qq -y install blender > /dev/null'
        logger.info( 'installerCmd %s', installerCmd )
        sshSpecs = inst['ssh']
        # never wait past the global deadline, nor longer than the install limit
        deadline = min( g_deadline, time.time() + args.instTimeLimit )
        logInstallerOperation( iid, ['connect', sshSpecs['host'], sshSpecs['port']] )
        with subprocess.Popen(['ssh',
                    '-p', str(sshSpecs['port']),
                    '-o', 'ServerAliveInterval=360',
                    '-o', 'ServerAliveCountMax=3',
                    sshSpecs['user'] + '@' + sshSpecs['host'], installerCmd],
                encoding='utf8',
                stderr=subprocess.PIPE) as proc:
            logInstallerOperation( iid, ['command', installerCmd] )
            # drain stderr on a separate thread while this one polls for exit
            stderrThr = threading.Thread(target=trackStderr, args=(proc,))
            stderrThr.start()
            while time.time() < deadline:
                proc.poll()  # sets proc.returncode
                if proc.returncode is None:
                    logger.info( 'waiting for install')
                else:
                    if proc.returncode == 0:
                        logger.info( 'installer succeeded on instance %s', abbrevIid )
                    else:
                        logger.warning( 'instance %s gave returnCode %d', abbrevIid, proc.returncode )
                    break
                if sigtermSignaled():
                    break
                time.sleep(30)
            proc.poll()
            # declare timeout (124) if the process has not produced a return code
            returnCode = proc.returncode if proc.returncode is not None else 124
            if returnCode:
                logger.warning( 'installer returnCode %s', returnCode )
            # NOTE(review): reconstructed as a sibling of the check above, so a
            # 'returncode' event is logged for successful installs too - confirm
            if returnCode == 124:
                logInstallerEvent( 'timeout', args.instTimeLimit, iid )
            else:
                logInstallerEvent('returncode', returnCode, iid )
            # make sure the ssh process is gone before joining the stderr thread
            proc.terminate()
            try:
                proc.wait(timeout=5)
                if proc.returncode:
                    logger.warning( 'ssh return code %d', proc.returncode )
            except subprocess.TimeoutExpired:
                logger.warning( 'ssh did not terminate in time' )
            stderrThr.join()
            if returnCode:
                logger.warning( 'terminating instance because installerFailed %s', iid )
                ncs.terminateInstances( args.authToken, [iid] )
                logOperation( 'terminateBad', [iid], '<recruitInstances>' )
                return None
            else:
                return inst
    return None