def allocateInstances( nWorkersWanted, launchedJsonFilePath ):
    '''get nWorkersWanted instances, reusing released ones when enough are on hand, otherwise launching new ones

    nWorkersWanted: how many instances the caller wants
    launchedJsonFilePath: path of the json file holding the list of instances
        (written here when reusing; otherwise passed to launchInstances and read back here)

    returns the list of instances; after a launch it may be shorter than nWorkersWanted
        (a warning is logged), or empty if the launched json could not be parsed
    raises ValueError when fewer than nWorkersWanted devices are available;
        calls sys.exit when a temporary ssh client key could not be uploaded
    '''
    # see if there are enough released ones to reuse
    toReuse = []
    with g_releasedInstancesLock:
        if len( g_releasedInstances ) >= nWorkersWanted:
            toReuse = [g_releasedInstances.popleft() for _ in range( nWorkersWanted )]
    if toReuse:
        logger.info( 'REUSING instances')
        with open( launchedJsonFilePath,'w' ) as outFile:
            json.dump( toReuse, outFile )
        return toReuse

    nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
    if nWorkersWanted > nAvail:
        logger.error( 'not enough devices available (%d requested)', nWorkersWanted )
        raise ValueError( 'not enough devices available')

    # use the caller-provided sshClientKey, or upload a temporary one
    if args.sshClientKeyName:
        sshClientKeyName = args.sshClientKeyName
    else:
        keyContents = runDistributedBlender.loadSshPubKey()
        randomPart = str( uuid.uuid4() )[0:13]
        keyContents += ' #' + randomPart
        sshClientKeyName = 'bfr_%s' % (randomPart)
        respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
        if respCode < 200 or respCode >= 300:
            logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
            sys.exit( 'could not upload SSH client key')

    logResult( 'launchInstances', nWorkersWanted, 'allocateInstances' )
    try:
        rc = runDistributedBlender.launchInstances( args.authToken, nWorkersWanted,
            sshClientKeyName, launchedJsonFilePath, filtersJson=args.filter )
        if rc:
            logger.debug( 'launchInstances returned %d', rc )
    finally:
        # delete sshClientKey only if we just uploaded it;
        # done in a finally so a failed launch does not leave the temporary key behind
        if sshClientKeyName != args.sshClientKeyName:
            logger.info( 'deleting sshClientKey %s', sshClientKeyName)
            ncs.deleteSshClientKey( args.authToken, sshClientKeyName )

    # get instances from the launched json file
    launchedInstances = []
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best-effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )

    # add the launched instances to known_hosts
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( launchedInstances, khFile )
    return launchedInstances
def getInstancesAvailable(authToken, args):
    '''reports the number of available instances as a (jsonified body, http status) pair

    authToken: the ncs auth token; when it is missing the result is a 401 with an error message
    args: mapping of request args; its optional 'filter' entry is passed on as the device filter
    '''
    if authToken:
        deviceFilter = args.get('filter', None)
        startedAt = time.time()
        nAvail = ncs.getAvailableDeviceCount(authToken, deviceFilter)
        elapsed = time.time() - startedAt
        logger.info('ncs.getAvailableDeviceCount took %.1f seconds', elapsed)
        return jsonify(nAvail), 200
    return jsonify('no authToken provided'), 401
def getInstancesAvailable(authToken, args):
    '''reports the number of available instances as a (jsonified body, http status) pair

    authToken: the ncs auth token; when it is missing the result is a 401 with an error message
    args: mapping of request args; its optional 'filter' entry is used as the device filter,
        after being passed through applyDprIfNone (with g_minDpr) and
        applyMinRamIfNone (with g_minRamMB)
    '''
    if authToken:
        # presumably these fill in default dpr / ram constraints when the filter lacks them
        deviceFilter = applyDprIfNone(args.get('filter', None), g_minDpr)
        deviceFilter = applyMinRamIfNone(deviceFilter, g_minRamMB)
        startedAt = time.time()
        nAvail = ncs.getAvailableDeviceCount(authToken, deviceFilter)
        elapsed = time.time() - startedAt
        logger.info('ncs.getAvailableDeviceCount took %.1f seconds', elapsed)
        return jsonify(nAvail), 200
    return jsonify('no authToken provided'), 401
def checkForInstances():
    '''a threadproc that, once a minute, checks whether enough instances are working
        and starts a recruitAndRender thread when there are too few

    keeps going until all wanted frames are finished, sigterm is signaled, or g_deadline passes
    '''
    overageFactor = 2  # want this many workers per unfinished frame

    def stillNeeded():
        # same three conditions, evaluated in the same order, as a single predicate
        return ( len(g_framesFinished) < g_nFramesWanted
            and sigtermNotSignaled()
            and time.time() < g_deadline )

    while stillNeeded():
        nUnfinished = g_nFramesWanted - len(g_framesFinished)
        nWorkers = len( g_workingInstances )
        tooFewWorkers = nWorkers < (nUnfinished * overageFactor)
        # only ask ncs for the available count when more workers are actually wanted
        if tooFewWorkers and ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter ) >= 2:
            logger.warning( 'starting thread because not enough workers (%d unfinished, %d workers)',
                nUnfinished, nWorkers )
            threading.Thread( target=recruitAndRender, name='recruitAndRender' ).start()
        time.sleep( 60 )
    logger.info( 'finished')
logger.error( 'could not ping target host %s', args.targetHost ) sys.exit(1) except Exception as exc: logger.warning( 'could not access target host %s',args.targetHost ) logger.error( 'got exception %s', exc ) sys.exit(1) nWorkersWanted = args.nWorkers if launchWanted: # overwrite the launchedJson file as empty list, so we won't have problems with stale contents with open( launchedJsonFilePath, 'w' ) as outFile: json.dump( [], outFile ) try: if launchWanted: nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter ) if nWorkersWanted > (nAvail + 5): logger.error( 'not enough devices available (%d requested)', nWorkersWanted ) sys.exit(1) if nWorkersWanted == 0: logger.info( '%d devices available to launch', nAvail ) nWorkersWanted = nAvail if args.sshClientKeyName: sshClientKeyName = args.sshClientKeyName else: keyContents = loadSshPubKey() randomPart = str( uuid.uuid4() )[0:13] keyContents += ' #' + randomPart sshClientKeyName = 'pingtest_%s' % (randomPart) respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents ) if respCode < 200 or respCode >= 300:
def recruitInstances(nWorkersWanted, launchedJsonFilePath, launchWanted,
                     resultsLogFilePath, installerFileName):
    '''launch instances and install fahclient on them;
        terminate those that could not install; return list of good instances

    nWorkersWanted: how many instances to launch
    launchedJsonFilePath: path of the json file listing the launched instances
        (passed to launchInstances when launching, and always read back here)
    launchWanted: when false, nothing is launched and the existing json file is used
    resultsLogFilePath: passed to tellInstances as its results log path
    installerFileName: name of the installer, run on each instance as './<installerFileName>'

    returns (goodInstances, badOnes): the instances whose installer status was 0, and
        the installer statuses of the others; both are empty when nothing started,
        when the installer returned no statuses, or when sigterm was signaled
    raises ValueError when fewer than nWorkersWanted devices are available;
        calls sys.exit when a temporary ssh client key could not be uploaded
    '''
    logger.info('recruiting up to %d instances', nWorkersWanted)
    goodInstances = []
    # initialized up front so the final return works even when the install step
    # is skipped because sigterm was signaled
    badOnes = []
    if launchWanted:
        nAvail = ncs.getAvailableDeviceCount(args.authToken, filtersJson=args.filter)
        if nWorkersWanted > nAvail:
            logger.error(
                'not enough devices available (%d requested, %d avail)',
                nWorkersWanted, nAvail)
            raise ValueError('not enough devices available')
        # upload an sshClientKey for launch (unless one was provided)
        if args.sshClientKeyName:
            sshClientKeyName = args.sshClientKeyName
        else:
            keyContents = loadSshPubKey().strip()
            randomPart = str(uuid.uuid4())[0:13]
            sshClientKeyName = 'fah_%s' % (randomPart)
            respCode = ncs.uploadSshClientKey(args.authToken, sshClientKeyName, keyContents)
            if respCode < 200 or respCode >= 300:
                logger.warning('ncs.uploadSshClientKey returned %s', respCode)
                sys.exit('could not upload SSH client key')
        # launch
        try:
            rc = launchInstances(
                args.authToken, nWorkersWanted, sshClientKeyName, launchedJsonFilePath,
                filtersJson=args.filter, encryptFiles=args.encryptFiles)
            if rc:
                logger.info('launchInstances returned %d', rc)
        finally:
            # delete sshClientKey only if we just uploaded it;
            # done in a finally so a failed launch does not leave the temporary key behind
            if sshClientKeyName != args.sshClientKeyName:
                logger.info('deleting sshClientKey %s', sshClientKeyName)
                ncs.deleteSshClientKey(args.authToken, sshClientKeyName)

    launchedInstances = []
    # get instances from the launched json file
    with open(launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best-effort: an unreadable file is treated as "nothing launched"
            logger.warning('could not load json (%s) %s', type(exc), exc)
    if len(launchedInstances) < nWorkersWanted:
        logger.warning(
            'could not launch as many instances as wanted (%d vs %d)',
            len(launchedInstances), nWorkersWanted)

    nonstartedIids = [
        inst['instanceId'] for inst in launchedInstances if inst['state'] != 'started'
    ]
    if nonstartedIids:
        logger.warning('terminating non-started instances %s', nonstartedIids)
        terminateNcsScInstances(args.authToken, nonstartedIids)
        logger.info('done terminating non-started instances')

    # proceed with instances that were actually started
    startedInstances = [
        inst for inst in launchedInstances if inst['state'] == 'started'
    ]
    if not startedInstances:
        return ([], [])

    # NOTE(review): when sigterm was signaled, the started instances are neither
    # installed on nor terminated here -- presumably the caller cleans up; confirm
    if not sigtermSignaled():
        installerCmd = './' + installerFileName
        logger.info('calling tellInstances to install on %d instances', len(startedInstances))
        stepStatuses = tellInstances.tellInstances(
            startedInstances, installerCmd,
            resultsLogFilePath=resultsLogFilePath,
            download=None, downloadDestDir=None, jsonOut=None, sshAgent=args.sshAgent,
            timeLimit=args.timeLimit, upload=args.uploads, stopOnSigterm=False,
            knownHostsOnly=False)
        # SHOULD restore our handler because tellInstances may have overridden it
        if not stepStatuses:
            logger.warning('no statuses returned from installer')
            startedIids = [inst['instanceId'] for inst in startedInstances]
            terminateNcsScInstances(args.authToken, startedIids)
            return ([], [])

        # separate good tellInstances statuses from bad ones
        goodIids = set()
        for status in stepStatuses:
            if isinstance(status['status'], int) and status['status'] == 0:
                goodIids.add(status['instanceId'])
            else:
                badOnes.append(status)
                if isinstance(status['status'], asyncio.TimeoutError):
                    logger.info('installer status asyncio.TimeoutError')
        logger.info('%d good installs, %d bad installs', len(goodIids), len(badOnes))

        goodInstances = [
            inst for inst in startedInstances if inst['instanceId'] in goodIids
        ]
        badIids = [status['instanceId'] for status in badOnes]
        if badIids:
            terminateNcsScInstances(args.authToken, badIids)
    return goodInstances, badOnes
def recruitInstances( nWorkersWanted, launchedJsonFilePath, launchWanted, resultsLogFilePath='' ):
    '''launch instances and install blender on them;
        terminate those that could not install; return list of good instances

    nWorkersWanted: how many instances to launch
    launchedJsonFilePath: path of the json file listing the launched instances
        (passed to launchInstances when launching, and always read back here)
    launchWanted: when false, nothing is launched and the existing json file is used
    resultsLogFilePath: passed to tellInstances as its results log path;
        defaults to dataDirPath+'/recruitInstances.jlog' when empty

    returns the list of instances that triage() reports as good installs; empty when the
        installer returned no statuses or when sigterm was signaled before installing
    raises ValueError when fewer than nWorkersWanted devices are available;
        calls sys.exit when a temporary ssh client key could not be uploaded
    '''
    logger.info( 'recruiting up to %d instances', nWorkersWanted )
    if not resultsLogFilePath:
        resultsLogFilePath = dataDirPath+'/recruitInstances.jlog'
    goodInstances = []
    if launchWanted:
        logger.info( 'recruiting %d instances', nWorkersWanted )
        nAvail = ncs.getAvailableDeviceCount( args.authToken, filtersJson=args.filter )
        if nWorkersWanted > nAvail:
            logger.error( 'not enough devices available (%d requested, %d avail)', nWorkersWanted, nAvail )
            raise ValueError( 'not enough devices available')
        # prepare sshClientKey for launch
        if args.sshClientKeyName:
            sshClientKeyName = args.sshClientKeyName
        else:
            keyContents = loadSshPubKey().strip()
            randomPart = str( uuid.uuid4() )[0:13]
            sshClientKeyName = 'bfr_%s' % (randomPart)
            respCode = ncs.uploadSshClientKey( args.authToken, sshClientKeyName, keyContents )
            if respCode < 200 or respCode >= 300:
                logger.warning( 'ncs.uploadSshClientKey returned %s', respCode )
                sys.exit( 'could not upload SSH client key')
        # launch
        logOperation( 'launchInstances', nWorkersWanted, '<recruitInstances>' )
        try:
            rc = launchInstances( args.authToken, nWorkersWanted,
                sshClientKeyName, launchedJsonFilePath, filtersJson=args.filter,
                encryptFiles = args.encryptFiles
                )
            if rc:
                logger.debug( 'launchInstances returned %d', rc )
        finally:
            # delete sshClientKey only if we just uploaded it;
            # done in a finally so a failed launch does not leave the temporary key behind
            if sshClientKeyName != args.sshClientKeyName:
                logger.info( 'deleting sshClientKey %s', sshClientKeyName)
                ncs.deleteSshClientKey( args.authToken, sshClientKeyName )

    launchedInstances = []
    # get instances from the launched json file
    with open( launchedJsonFilePath, 'r') as jsonInFile:
        try:
            launchedInstances = json.load(jsonInFile)  # an array
        except Exception as exc:
            # best-effort: an unreadable file is treated as "nothing launched"
            logger.warning( 'could not load json (%s) %s', type(exc), exc )
    if len( launchedInstances ) < nWorkersWanted:
        logger.warning( 'could not launch as many instances as wanted (%d vs %d)',
            len( launchedInstances ), nWorkersWanted )

    nonstartedIids = [inst['instanceId'] for inst in launchedInstances if inst['state'] != 'started' ]
    if nonstartedIids:
        logger.warning( 'terminating non-started instances %s', nonstartedIids )
        ncs.terminateInstances( args.authToken, nonstartedIids )
    # proceed with instances that were actually started
    startedInstances = [inst for inst in launchedInstances if inst['state'] == 'started' ]
    # add instances to knownHosts
    with open( os.path.expanduser('~/.ssh/known_hosts'), 'a' ) as khFile:
        jsonToKnownHosts.jsonToKnownHosts( startedInstances, khFile )

    # install blender on startedInstances
    if not sigtermSignaled():
        installerCmd = 'sudo apt-get -qq update && sudo apt-get -qq -y install blender > /dev/null'
        logger.info( 'calling tellInstances to install on %d instances', len(startedInstances))
        stepStatuses = tellInstances.tellInstances( startedInstances, installerCmd,
            resultsLogFilePath=resultsLogFilePath,
            download=None, downloadDestDir=None, jsonOut=None, sshAgent=args.sshAgent,
            timeLimit=min(args.instTimeLimit, args.timeLimit), upload=None, stopOnSigterm=False,
            knownHostsOnly=True
            )
        # SHOULD restore our handler because tellInstances may have overridden it
        if not stepStatuses:
            logger.warning( 'no statuses returned from installer')
            startedIids = [inst['instanceId'] for inst in startedInstances]
            logOperation( 'terminateBad', startedIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, startedIids )
            return []
        # goodOnes is used as a collection of instanceIds, badOnes as a list of statuses
        (goodOnes, badOnes) = triage( stepStatuses )
        logger.info( '%d good installs, %d bad installs', len(goodOnes), len(badOnes) )
        logger.info( 'stepStatuses %s', stepStatuses )
        goodInstances = [inst for inst in startedInstances if inst['instanceId'] in goodOnes ]
        badIids = [status['instanceId'] for status in badOnes]
        if badIids:
            logOperation( 'terminateBad', badIids, '<recruitInstances>' )
            ncs.terminateInstances( args.authToken, badIids )
    return goodInstances