def getHostname():
    """Return this machine's hostname as reported by the `hostname` command.

    The command is run through runCommand() with echo logging disabled.
    The first item of the second value it returns (presumably the list of
    output lines) is used; when that list is empty the literal string
    'unknown' is returned instead.
    """
    _proc, lines = runCommand(['hostname', ], logEcho=False)
    if len(lines) > 0:
        return lines[0]
    return 'unknown'
def monitorEvents(options, events):
    """This is the main state machine for the Tegra monitor.

    Respond to the events sent via queue and also monitor the state of
    the buildslave if it's been started.

    options -- settings object; bbpath, tegra, tegraIP and hangtime are
               read from it
    events  -- queue of event tuples whose first item is the state name
               ('reboot', 'stop', 'offline', 'active', 'dialback', 'start'
               or 'terminate'); events are both consumed from and posted
               to this queue

    The loop runs until a 'terminate' event is received; on exit the
    buildslave is stopped if it is still marked active.  Returns None.
    """
    pidFile = os.path.join(options.bbpath, 'twistd.pid')
    flagFile = os.path.join(options.bbpath, 'proxy.flg')
    errorFile = os.path.join(options.bbpath, 'error.flg')
    # environment handed to twistd when the buildslave is started
    bbEnv = {
        'PATH': os.getenv('PATH'),
        'SUT_NAME': options.tegra,
        'SUT_IP': options.tegraIP,
    }

    event = None
    bbActive = False
    tegraActive = False
    connected = False
    nChatty = 0
    maxChatty = 10
    hbFails = 0
    maxFails = 50
    sleepFails = 5
    softCount = 0       # how many times tegraActive is True
                        # but errorFlag is set
    softCountMax = 5    # how many active events to wait before
                        # triggering a soft reset
    softResets = 0
    softResetMax = 5    # how many soft resets do we try before
                        # waiting for a hard reset
    hardResets = 0
    hardResetsMax = 3
    lastHangCheck = time.time()

    log.info('monitoring started (process pid %s)' % current_process().pid)

    while True:
        # (Re)establish the heartbeat connection to the device's data port.
        if not connected:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((options.tegraIP, sutDataPort))
                connected = True
            except:
                # Deliberately broad: the monitor must keep looping until it
                # is told to terminate.
                connected = False
                hbFails += 1
                log.info('Error connecting to data port - sleeping for %d seconds' % sleepFails)
                time.sleep(sleepFails)

        # Non-blocking fetch of the next queued event, if any.
        try:
            event = events.get(False)
        except Empty:
            event = None

        if event is None:
            # No event pending: use the time to listen for a heartbeat.
            if connected:
                try:
                    hbData = hbSocket.recv(4096)
                except:
                    hbData = ''
                    connected = False
                    dumpException('hbSocket.recv()')

                if len(hbData) > 1:
                    log.debug('socket data [%s]' % hbData[:-1])

                    if 'ebooting ...' in hbData:
                        log.warning('device is rebooting')
                        events.put(('reboot',))
                        if os.path.isfile(flagFile):
                            time.sleep(5)
                        hbSocket.close()
                        connected = False
                    else:
                        log.info('heartbeat detected')
                        events.put(('active',))
                        hbFails = 0
                else:
                    hbFails += 1
        else:
            state = event[0]
            s = 'event %s hbFails %d / %d' % (state, hbFails, maxFails)
            # Only every (maxChatty + 1)th event line is logged at info level.
            nChatty += 1
            if nChatty > maxChatty:
                log.info(s)
            else:
                log.debug(s)
            if nChatty > maxChatty:
                nChatty = 0

            if state == 'reboot':
                tegraActive = False
                if not os.path.isfile(flagFile):
                    log.warning('Tegra rebooting, stopping buildslave')
                    events.put(('stop',))
            elif state == 'stop' or state == 'offline':
                stopSlave(pidFile)
                bbActive = False
                if connected and state == 'offline':
                    hbSocket.close()
                    connected = False
            elif state == 'active' or state == 'dialback':
                if state == 'dialback':
                    # These resets used to live in a separate
                    # `elif state == 'dialback'` branch further down, which
                    # could never run because this branch already matches
                    # 'dialback'.
                    softCount = 0
                    softResets = 0
                    hardResets = 0
                tegraActive = True
                if not bbActive:
                    if os.path.isfile(errorFile):
                        log.warning('Tegra active but error flag set [%d/%d]' % (softCount, softResets))
                        softCount += 1
                        if softCount > softCountMax:
                            softCount = 0
                            if softResets < softResetMax:
                                softResets += 1
                                log.warning('removing error flag to see if tegra comes back')
                                os.remove(errorFile)
                            else:
                                hardResets += 1
                                log.warning('hard reset reboot check [%d/%d]' % (hardResets, hardResetsMax))
                                if hardResets < hardResetsMax:
                                    sendReboot(options.tegraIP, sutDataPort)
                                else:
                                    events.put(('offline',))
                    else:
                        events.put(('start',))
            elif state == 'start':
                if tegraActive and not bbActive:
                    log.debug('starting buildslave in %s' % options.bbpath)
                    bbProc, _ = runCommand(['twistd',
                                            '--no_save',
                                            '--rundir=%s' % options.bbpath,
                                            '--pidfile=%s' % pidFile,
                                            '--python=%s' % os.path.join(options.bbpath, 'buildbot.tac')],
                                           env=bbEnv)
                    log.info('buildslave start returned %s' % bbProc.returncode)
                    if bbProc.returncode == 0 or bbProc.returncode == 1:
                        # pause to give twistd a chance to generate the pidfile
                        # before the code that follows goes off killing it because
                        # it thinks that it didn't start properly
                        # OMGRACECONDITIONWTF
                        nTries = 0
                        while nTries < 20:
                            nTries += 1
                            if os.path.isfile(pidFile):
                                log.debug('pidfile found, setting bbActive to True')
                                bbActive = True
                                break
                            else:
                                time.sleep(5)
            elif state == 'terminate':
                break

        # Too many missed heartbeats: back off (capped at 300 seconds) and,
        # unless the proxy flag file is present, mark the device offline.
        if hbFails > maxFails:
            hbFails = 0
            sleepFails += 5
            if sleepFails > 300:
                sleepFails = 300
            if os.path.isfile(flagFile):
                log.debug('install flag found, resetting error count')
            else:
                events.put(('offline',))
                if connected:
                    hbSocket.close()
                    connected = False

        log.debug('bbActive %s tegraActive %s' % (bbActive, tegraActive))

        if os.path.isfile(errorFile):
            if bbActive:
                log.error('errorFile detected - sending stop request')
                events.put(('stop',))

        # Reconcile what we believe about the buildslave with the pidfile.
        if bbActive:
            if os.path.isfile(pidFile):
                n = time.time()
                if not checkSlaveAlive(options.bbpath):
                    log.warning('buildslave should be active but pid is not alive')
                    # NOTE(review): the hang check only runs on the
                    # "pid is not alive" path - confirm that is intended.
                    if int(n - lastHangCheck) > 300:
                        lastHangCheck = n
                        logTD = checkSlaveActive(options.bbpath)
                        # A None result is treated as "no activity
                        # information" instead of raising AttributeError
                        # and killing the monitor.
                        if logTD is not None and (
                                logTD.days > 0 or
                                (logTD.days == 0 and logTD.seconds > options.hangtime)):
                            log.error('last activity was %d days %d seconds ago - marking as hung slave' % (logTD.days, logTD.seconds))
                            events.put(('offline',))
            else:
                log.warning('buildslave should be active but pidfile not found, marking as offline')
                events.put(('offline',))
        else:
            if os.path.isfile(pidFile):
                if checkSlaveAlive(options.bbpath):
                    log.error('buildslave should NOT be active but pidfile found, killing buildbot')
                    events.put(('stop',))
                else:
                    log.warning('buildslave not active but pidfile found, removing pidfile')
                    os.remove(pidFile)

    if bbActive:
        stopSlave(pidFile)

    log.info('monitor stopped')
def checkTegra(master, tegra):
    """Check one tegra, record its status and optionally try to recover it.

    master -- None, or a mapping providing 'environment', 'hostname' and
              'http_port' for the buildbot master of this tegra
    tegra  -- name of the tegra; also the name of its directory under
              options.bbpath

    The status line is written to the log, appended to
    <tegra>_status.log in the tegra's directory and passed to summary().
    Depending on the module-level `options` (reset, reboot) the buildslave
    may be stopped, the device asked to reboot, error.flg removed and the
    device power cycled.  Returns None.
    """
    tegraIP = getIPAddress(tegra)
    tegraPath = os.path.join(options.bbpath, tegra)
    exportFile = os.path.join(tegraPath, '%s_status.log' % tegra)
    errorFile = os.path.join(tegraPath, 'error.flg')
    errorFlag = os.path.isfile(errorFile)
    sTegra = 'OFFLINE'
    sutFound = False
    logTD = None

    status = {
        'tegra': tegra,
        'active': False,
        'cp': 'OFFLINE',
        'bs': 'OFFLINE',
        'msg': '',
    }

    log.debug('%s: %s' % (tegra, tegraIP))

    if master is None:
        status['environment'] = 's'
        status['master'] = 'localhost'
    else:
        status['environment'] = master['environment'][0]
        status['master'] = 'http://%s:%s' % (master['hostname'], master['http_port'])

    fPing, lPing = pingDevice(tegra)
    if fPing:
        hbSocket = None
        try:
            hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            hbSocket.settimeout(float(120))
            hbSocket.connect((tegraIP, 20700))

            # reaching this point means something is listening on the port
            sutFound = True

            time.sleep(2)

            hbSocket.send('info\n')

            d = hbSocket.recv(4096)

            log.debug('socket data length %d' % len(d))
            log.debug(d)

            status['active'] = True
        except Exception:
            status['active'] = False
            dumpException('socket')
        finally:
            # close the socket on the failure path too, so a failed
            # connect/send/recv does not leak the descriptor
            if hbSocket is not None:
                hbSocket.close()

        if status['active']:
            sTegra = 'online'
        else:
            sTegra = 'INACTIVE'

        if not sutFound:
            status['msg'] += 'SUTAgent not present;'
    else:
        status['msg'] += '%s %s;' % (lPing[0], lPing[1])

    # Cheat until we have a better check solution for new watch_devices.sh
    status['cp'] = 'active'  # pretend all is well

    if checkSlaveAlive(tegraPath):
        logTD = checkSlaveActive(tegraPath)
        if logTD is not None:
            if (logTD.days > 0) or (logTD.days == 0 and logTD.seconds > 3600):
                status['bs'] = 'INACTIVE'
                status['msg'] += 'BS %dd %ds;' % (logTD.days, logTD.seconds)
            else:
                status['bs'] = 'active'
        else:
            status['bs'] = 'INACTIVE'
    else:
        # scan thru tegra-### dir and see if any buildbot.tac.bug#### files
        # exist but ignore buildbot.tac file itself (except to note that it is
        # missing)
        files = os.listdir(tegraPath)
        found = False
        for f in files:
            if f.startswith('buildbot.tac'):
                found = True
                # 12 == len('buildbot.tac'): only report suffixed variants
                if len(f) > 12:
                    status['msg'] += '%s;' % f
        if not found:
            status['msg'] += 'buildbot.tac NOT found;'

    if errorFlag:
        status['msg'] += 'error.flg [%s] ' % getLastLine(errorFile)

    s = '%s %s %9s %8s %8s :: %s' % (status['tegra'], status['environment'],
                                     sTegra, status['cp'], status['bs'],
                                     status['msg'])
    ts = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info(s)
    # use a context manager so the status log is flushed and closed promptly
    with open(exportFile, 'a+') as exportLog:
        exportLog.write('%s %s\n' % (ts, s))
    summary(status['tegra'], status['environment'], sTegra, status['cp'],
            status['bs'], status['msg'], ts, status['master'])

    if errorFlag and options.reset:
        stopProcess(os.path.join(tegraPath, 'twistd.pid'), 'buildslave')

        if not options.reboot:
            hbSocket = None
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((tegraIP, 20700))
                hbSocket.send('rebt\n')
                log.info('rebooting tegra')
            except Exception:
                dumpException('socket')
            finally:
                if hbSocket is not None:
                    hbSocket.close()

        # errorFlag is already known to be true in this branch
        log.info('clearing error.flg')
        os.remove(errorFile)

    # Here we try to catch the state where sutagent and cp are inactive that
    # is determined by:
    #    sTegra == 'INACTIVE' and
    #    status['cp'] == 'INACTIVE'
    # status['cp'] will be set to INACTIVE only if logTD.seconds (last time
    # clientproxy updated its log file) is > 3600
    # NOTE(review): status['cp'] is currently forced to 'active' above, so
    # the "stopping hung clientproxy" branch below cannot trigger.
    if options.reboot:
        if not sutFound and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_device(tegra)
    else:
        if sTegra == 'OFFLINE' and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_device(tegra)

    if options.reset and sTegra == 'INACTIVE' and status['cp'] == 'INACTIVE':
        log.info('stopping hung clientproxy')
        stopDevice(tegra)
        time.sleep(5)
        log.info('starting clientproxy for %s' % tegra)
        os.chdir(tegraPath)
        runCommand(['python', 'clientproxy.py', '-b', '--device=%s' % tegra])
def getHostname():
    """Return the hostname reported by running the `hostname` command.

    runCommand() is called with echo logging turned off.  The first item of
    the second value it returns (presumably the command's output lines) is
    the result; if nothing came back, 'unknown' is returned.
    """
    _proc, output = runCommand(['hostname', ], logEcho=False)
    return output[0] if len(output) > 0 else 'unknown'
def one_time_setup(ip_addr, major_source):
    """Prepare the tegra at ip_addr for an application install.

    ip_addr      - address of the tegra we want to install the app on
    major_source - path of the source we were asked to install; the
                   directory holding it is where fennec/application.ini
                   is read from and talos/remoteapp.ini is written to

    Returns the tuple (dm, devRoot): the connected DeviceManagerSUT and
    the device root reported by checkDeviceRoot().  On an unusable device
    root or a failed resolution change, an error is written to errorFile
    and the process exits with status 1.

    Side Effects:
        Sets two module globals needed for error reporting:
        errorFile, proxyFile
    """
    global proxyFile, errorFile

    # set up the flag files, used throughout
    flagDir = os.path.join(os.getcwd(), "..")
    proxyFile = os.path.join(flagDir, "proxy.flg")
    errorFile = os.path.join(flagDir, "error.flg")

    proxyIP = getOurIP()
    proxyPort = calculatePort()

    sourceDir = os.path.dirname(major_source)
    appIni = os.path.join(sourceDir, "fennec", "application.ini")
    talosIni = os.path.join(sourceDir, "talos", "remoteapp.ini")
    print("copying %s to %s" % (appIni, talosIni))
    runCommand(["cp", appIni, talosIni])

    print("connecting to: %s" % ip_addr)
    dm = devicemanager.DeviceManagerSUT(ip_addr)
    # Moar data!
    dm.debug = 3

    devRoot = checkDeviceRoot(dm)
    if devRoot is None or devRoot == "/tests":
        setFlag(errorFile,
                "Remote Device Error: devRoot from devicemanager [%s] is not correct - exiting" % devRoot)
        sys.exit(1)

    try:
        # proxy.flg stays set for the duration of the setup; the finally
        # clause below clears it on every exit path, sys.exit() included.
        setFlag(proxyFile)
        print("%s %s" % (proxyIP, proxyPort))

        getDeviceTimestamp(dm)
        setDeviceTimestamp(dm)
        getDeviceTimestamp(dm)
        for infoKind in ("process", "memory", "uptime"):
            dm.getInfo(infoKind)

        width, height = getResolution(dm)
        # adjust resolution down to allow fennec to install without memory
        # issues
        if not (width < 1050 and height < 1050):
            dm.adjustResolution(1024, 768, "crt")
            print("calling reboot")
            dm.reboot(proxyIP, proxyPort)
            waitForDevice(dm)

            width, height = getResolution(dm)
            # NOTE(review): this only reports a failure when BOTH
            # dimensions differ from 1024x768 - confirm whether `or` was
            # intended before tightening it.
            if width != 1024 and height != 768:
                clearFlag(proxyFile)
                setFlag(errorFile,
                        "Remote Device Error: Resolution change failed. Should be %d/%d but is %d/%d"
                        % (1024, 768, width, height))
                sys.exit(1)
    finally:
        clearFlag(proxyFile)

    return dm, devRoot
def one_time_setup(ip_addr, major_source):
    '''One time setup of state.

    ip_addr      - of the device we want to install app at
    major_source - we've hacked this script to install
                   may-also-be-needed tools, but the source we're asked to
                   install has the meta data we need

    Returns (dm, devRoot) on success: the connected DeviceManagerSUT and
    the device root reported by checkDeviceRoot().  Returns (None, None)
    when the device root is unusable, the reboot cannot be verified, the
    resolution change fails, or the agent raises AgentError.

    Side Effects:
        global, needed for error reporting:
            errorFile
    '''
    # set up the flag files, used throughout
    cwd = os.getcwd()
    global errorFile
    errorFile = os.path.join(cwd, '..', 'error.flg')
    # the working directory's own name is used as the device name
    deviceName = os.path.basename(cwd)

    proxyIP = getOurIP()
    proxyPort = calculatePort()

    workdir = os.path.dirname(major_source)
    inifile = os.path.join(workdir, 'fennec', 'application.ini')
    remoteappini = os.path.join(workdir, 'talos', 'remoteapp.ini')
    log.info('copying %s to %s' % (inifile, remoteappini))
    runCommand(['cp', inifile, remoteappini])

    log.info("connecting to: %s" % ip_addr)
    dm = devicemanager.DeviceManagerSUT(ip_addr)
    # Moar data!
    dm.debug = 3

    devRoot = checkDeviceRoot(dm)
    if devRoot is None or devRoot == '/tests':
        setFlag(errorFile,
                "Remote Device Error: devRoot from devicemanager [%s] is not correct - exiting" % devRoot)
        return None, None

    try:
        log.info("%s, %s" % (proxyIP, proxyPort))
        getDeviceTimestamp(dm)
        setDeviceTimestamp(dm)
        getDeviceTimestamp(dm)
        dm.getInfo('process')
        dm.getInfo('memory')
        dm.getInfo('uptime')

        width, height = getResolution(dm)
        # adjust resolution down to allow fennec to install without memory
        # issues
        if (width == 1600 or height == 1200):
            dm.adjustResolution(1024, 768, 'crt')
            log.info('forcing device reboot')
            if not powermanagement.soft_reboot_and_verify(device=deviceName,
                                                          dm=dm,
                                                          ipAddr=proxyIP,
                                                          port=proxyPort):
                return None, None

            width, height = getResolution(dm)
            # NOTE(review): this only reports a failure when BOTH
            # dimensions differ from 1024x768 - confirm whether `or` was
            # intended before tightening it.
            if width != 1024 and height != 768:
                setFlag(errorFile,
                        "Remote Device Error: Resolution change failed. Should be %d/%d but is %d/%d"
                        % (1024, 768, width, height))
                return None, None
    except devicemanager.AgentError as err:
        log.error("remoteDeviceError: while doing one time setup for installation: %s" % err)
        return None, None

    # Success: previously the function fell off the end here and returned
    # a bare None, which does not match the (None, None) failure shape.
    return dm, devRoot
def one_time_setup(ip_addr, major_source):
    '''One time setup of state.

    ip_addr      - of the device we want to install app at
    major_source - we've hacked this script to install
                   may-also-be-needed tools, but the source we're asked to
                   install has the meta data we need

    Returns (dm, devRoot) on success: the connected DeviceManagerSUT and
    the device root reported by checkDeviceRoot().  Returns (None, None)
    when the device root is unusable, the reboot cannot be verified, the
    resolution change fails, or the agent raises AgentError.

    Side Effects:
        global, needed for error reporting:
            errorFile
    '''
    # set up the flag files, used throughout
    cwd = os.getcwd()
    global errorFile
    errorFile = os.path.join(cwd, '..', 'error.flg')
    # the working directory's own name is used as the device name
    deviceName = os.path.basename(cwd)

    proxyIP = getOurIP()
    proxyPort = calculatePort()

    workdir = os.path.dirname(major_source)
    inifile = os.path.join(workdir, 'fennec', 'application.ini')
    remoteappini = os.path.join(workdir, 'talos', 'remoteapp.ini')
    log.info('copying %s to %s' % (inifile, remoteappini))
    runCommand(['cp', inifile, remoteappini])

    log.info("connecting to: %s" % ip_addr)
    dm = devicemanager.DeviceManagerSUT(ip_addr)
    # Moar data!
    dm.debug = 3

    devRoot = checkDeviceRoot(dm)
    if devRoot is None or devRoot == '/tests':
        setFlag(errorFile,
                "Remote Device Error: devRoot from devicemanager [%s] is not correct - exiting" % devRoot)
        return None, None

    try:
        log.info("%s, %s" % (proxyIP, proxyPort))
        getDeviceTimestamp(dm)
        setDeviceTimestamp(dm)
        getDeviceTimestamp(dm)
        dm.getInfo('process')
        dm.getInfo('memory')
        dm.getInfo('uptime')

        width, height = getResolution(dm)
        # adjust resolution down to allow fennec to install without memory
        # issues
        if (width == 1600 or height == 1200):
            dm.adjustResolution(1024, 768, 'crt')
            log.info('forcing device reboot')
            if not powermanagement.soft_reboot_and_verify(device=deviceName,
                                                          dm=dm,
                                                          ipAddr=proxyIP,
                                                          port=proxyPort):
                return None, None

            width, height = getResolution(dm)
            # NOTE(review): this only reports a failure when BOTH
            # dimensions differ from 1024x768 - confirm whether `or` was
            # intended before tightening it.
            if width != 1024 and height != 768:
                setFlag(errorFile,
                        "Remote Device Error: Resolution change failed. Should be %d/%d but is %d/%d"
                        % (1024, 768, width, height))
                return None, None
    except devicemanager.AgentError as err:
        log.error("remoteDeviceError: while doing one time setup for installation: %s" % err)
        return None, None

    # Success: previously the function fell off the end here and returned
    # a bare None, which does not match the (None, None) failure shape.
    return dm, devRoot
def one_time_setup(ip_addr, major_source):
    '''Prepare the tegra at ip_addr for an application install.

    ip_addr      - address of the tegra we want to install the app on
    major_source - path of the source we were asked to install; the
                   directory holding it is where fennec/application.ini
                   is read from and talos/remoteapp.ini is written to

    Returns the tuple (dm, devRoot): the connected DeviceManagerSUT and
    the device root reported by checkDeviceRoot().  On an unusable device
    root or a failed resolution change, an error is written to errorFile
    and the process exits with status 1.

    Side Effects:
        Sets two module globals needed for error reporting:
        errorFile, proxyFile
    '''
    global proxyFile, errorFile

    # set up the flag files, used throughout
    flagDir = os.path.join(os.getcwd(), '..')
    proxyFile = os.path.join(flagDir, 'proxy.flg')
    errorFile = os.path.join(flagDir, 'error.flg')

    proxyIP = getOurIP()
    proxyPort = calculatePort()

    sourceDir = os.path.dirname(major_source)
    appIni = os.path.join(sourceDir, 'fennec', 'application.ini')
    talosIni = os.path.join(sourceDir, 'talos', 'remoteapp.ini')
    print('copying %s to %s' % (appIni, talosIni))
    runCommand(['cp', appIni, talosIni])

    print('connecting to: %s' % ip_addr)
    dm = devicemanager.DeviceManagerSUT(ip_addr)
    # Moar data!
    dm.debug = 3

    devRoot = checkDeviceRoot(dm)
    if devRoot is None or devRoot == '/tests':
        setFlag(errorFile,
                'Remote Device Error: devRoot from devicemanager [%s] is not correct - exiting' % devRoot)
        sys.exit(1)

    try:
        # proxy.flg stays set for the duration of the setup; the finally
        # clause below clears it on every exit path, sys.exit() included.
        setFlag(proxyFile)
        print('%s %s' % (proxyIP, proxyPort))

        getDeviceTimestamp(dm)
        setDeviceTimestamp(dm)
        getDeviceTimestamp(dm)
        for infoKind in ('process', 'memory', 'uptime'):
            dm.getInfo(infoKind)

        width, height = getResolution(dm)
        # adjust resolution down to allow fennec to install without memory
        # issues
        if not (width < 1050 and height < 1050):
            dm.adjustResolution(1024, 768, 'crt')
            print('calling reboot')
            dm.reboot(proxyIP, proxyPort)
            waitForDevice(dm)

            width, height = getResolution(dm)
            # NOTE(review): this only reports a failure when BOTH
            # dimensions differ from 1024x768 - confirm whether `or` was
            # intended before tightening it.
            if width != 1024 and height != 768:
                clearFlag(proxyFile)
                setFlag(errorFile,
                        'Remote Device Error: Resolution change failed. Should be %d/%d but is %d/%d'
                        % (1024, 768, width, height))
                sys.exit(1)
    finally:
        clearFlag(proxyFile)

    return dm, devRoot
width, height = getResolution(dm) #adjust resolution down to allow fennec to install without memory issues if (width >= 1050 or height >= 1050): dm.adjustResolution(1024, 768, 'crt') print 'calling reboot' dm.reboot(proxyIP, proxyPort) waitForDevice(dm) width, height = getResolution(dm) if width != 1024 and height != 768: clearFlag(proxyFile) setFlag(errorFile, "Remote Device Error: Resolution change failed. Should be %d/%d but is %d/%d" % (1024,768,width,height)) sys.exit(1) print 'copying %s to %s' % (inifile, remoteappini) runCommand(['cp', inifile, remoteappini]) status = dm.installApp(target) if status is None: print '-'*42 print 'installApp() done - gathering debug info' dm.getInfo('process') dm.getInfo('memory') dm.getInfo('uptime') try: print dm.sendCMD(['exec su -c "logcat -d -v time *:W"']) except devicemanager.DMError, e: print "Exception hit while trying to run logcat: %s" % str(e) setFlag(errorFile, "Remote Device Error: can't run logcat") sys.exit(1) else:
def monitorEvents(options, events):
    """This is the main state machine for the Tegra monitor.

    Respond to the events sent via queue and also monitor the state of
    the buildslave if it's been started.

    options -- settings object; bbpath, tegra, tegraIP and hangtime are
               read from it
    events  -- queue of event tuples whose first item is the state name
               ('reboot', 'stop', 'offline', 'active', 'dialback', 'start'
               or 'terminate'); events are both consumed from and posted
               to this queue

    The loop runs until a 'terminate' event is received; on exit the
    buildslave is stopped if it is still marked active.  Returns None.
    """
    pidFile = os.path.join(options.bbpath, 'twistd.pid')
    flagFile = os.path.join(options.bbpath, 'proxy.flg')
    errorFile = os.path.join(options.bbpath, 'error.flg')
    # environment handed to twistd when the buildslave is started
    bbEnv = {
        'PATH': os.getenv('PATH'),
        'SUT_NAME': options.tegra,
        'SUT_IP': options.tegraIP,
    }

    event = None
    bbActive = False
    tegraActive = False
    connected = False
    nChatty = 0
    maxChatty = 10
    hbFails = 0
    maxFails = 50
    sleepFails = 5
    softCount = 0       # how many times tegraActive is True
                        # but errorFlag is set
    softCountMax = 5    # how many active events to wait before
                        # triggering a soft reset
    softResets = 0
    softResetMax = 5    # how many soft resets do we try before
                        # waiting for a hard reset
    hardResets = 0
    hardResetsMax = 3
    lastHangCheck = time.time()

    log.info('monitoring started (process pid %s)' % current_process().pid)

    while True:
        # (Re)establish the heartbeat connection to the device's data port.
        if not connected:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((options.tegraIP, sutDataPort))
                connected = True
            except:
                # Deliberately broad: the monitor must keep looping until it
                # is told to terminate.
                connected = False
                hbFails += 1
                log.info(
                    'Error connecting to data port - sleeping for %d seconds'
                    % sleepFails)
                time.sleep(sleepFails)

        # Non-blocking fetch of the next queued event, if any.
        try:
            event = events.get(False)
        except Empty:
            event = None

        if event is None:
            # No event pending: use the time to listen for a heartbeat.
            if connected:
                try:
                    hbData = hbSocket.recv(4096)
                except:
                    hbData = ''
                    connected = False
                    dumpException('hbSocket.recv()')

                if len(hbData) > 1:
                    log.debug('socket data [%s]' % hbData[:-1])

                    if 'ebooting ...' in hbData:
                        log.warning('device is rebooting')
                        events.put(('reboot', ))
                        if os.path.isfile(flagFile):
                            time.sleep(5)
                        hbSocket.close()
                        connected = False
                    else:
                        log.info('heartbeat detected')
                        events.put(('active', ))
                        hbFails = 0
                else:
                    hbFails += 1
        else:
            state = event[0]
            s = 'event %s hbFails %d / %d' % (state, hbFails, maxFails)
            # Only every (maxChatty + 1)th event line is logged at info level.
            nChatty += 1
            if nChatty > maxChatty:
                log.info(s)
            else:
                log.debug(s)
            if nChatty > maxChatty:
                nChatty = 0

            if state == 'reboot':
                tegraActive = False
                if not os.path.isfile(flagFile):
                    log.warning('Tegra rebooting, stopping buildslave')
                    events.put(('stop', ))
            elif state == 'stop' or state == 'offline':
                stopSlave(pidFile)
                bbActive = False
                if connected and state == 'offline':
                    hbSocket.close()
                    connected = False
            elif state == 'active' or state == 'dialback':
                if state == 'dialback':
                    # These resets used to live in a separate
                    # `elif state == 'dialback'` branch further down, which
                    # could never run because this branch already matches
                    # 'dialback'.
                    softCount = 0
                    softResets = 0
                    hardResets = 0
                tegraActive = True
                if not bbActive:
                    if os.path.isfile(errorFile):
                        log.warning('Tegra active but error flag set [%d/%d]' %
                                    (softCount, softResets))
                        softCount += 1
                        if softCount > softCountMax:
                            softCount = 0
                            if softResets < softResetMax:
                                softResets += 1
                                log.warning(
                                    'removing error flag to see if tegra comes back'
                                )
                                os.remove(errorFile)
                            else:
                                hardResets += 1
                                log.warning('hard reset reboot check [%d/%d]' %
                                            (hardResets, hardResetsMax))
                                if hardResets < hardResetsMax:
                                    sendReboot(options.tegraIP, sutDataPort)
                                else:
                                    events.put(('offline', ))
                    else:
                        events.put(('start', ))
            elif state == 'start':
                if tegraActive and not bbActive:
                    log.debug('starting buildslave in %s' % options.bbpath)
                    bbProc, _ = runCommand([
                        'twistd',
                        '--no_save',
                        '--rundir=%s' % options.bbpath,
                        '--pidfile=%s' % pidFile,
                        '--python=%s' % os.path.join(options.bbpath,
                                                     'buildbot.tac'),
                    ], env=bbEnv)
                    log.info('buildslave start returned %s' % bbProc.returncode)
                    if bbProc.returncode == 0 or bbProc.returncode == 1:
                        # pause to give twistd a chance to generate the pidfile
                        # before the code that follows goes off killing it because
                        # it thinks that it didn't start properly
                        # OMGRACECONDITIONWTF
                        nTries = 0
                        while nTries < 20:
                            nTries += 1
                            if os.path.isfile(pidFile):
                                log.debug(
                                    'pidfile found, setting bbActive to True')
                                bbActive = True
                                break
                            else:
                                time.sleep(5)
            elif state == 'terminate':
                break

        # Too many missed heartbeats: back off (capped at 300 seconds) and,
        # unless the proxy flag file is present, mark the device offline.
        if hbFails > maxFails:
            hbFails = 0
            sleepFails += 5
            if sleepFails > 300:
                sleepFails = 300
            if os.path.isfile(flagFile):
                log.debug('install flag found, resetting error count')
            else:
                events.put(('offline', ))
                if connected:
                    hbSocket.close()
                    connected = False

        log.debug('bbActive %s tegraActive %s' % (bbActive, tegraActive))

        if os.path.isfile(errorFile):
            if bbActive:
                log.error('errorFile detected - sending stop request')
                events.put(('stop', ))

        # Reconcile what we believe about the buildslave with the pidfile.
        if bbActive:
            if os.path.isfile(pidFile):
                n = time.time()
                if not checkSlaveAlive(options.bbpath):
                    log.warning(
                        'buildslave should be active but pid is not alive')
                    # NOTE(review): the hang check only runs on the
                    # "pid is not alive" path - confirm that is intended.
                    if int(n - lastHangCheck) > 300:
                        lastHangCheck = n
                        logTD = checkSlaveActive(options.bbpath)
                        # A None result is treated as "no activity
                        # information" instead of raising AttributeError
                        # and killing the monitor.
                        if logTD is not None and (
                                logTD.days > 0 or
                                (logTD.days == 0 and
                                 logTD.seconds > options.hangtime)):
                            log.error(
                                'last activity was %d days %d seconds ago - marking as hung slave'
                                % (logTD.days, logTD.seconds))
                            events.put(('offline', ))
            else:
                log.warning(
                    'buildslave should be active but pidfile not found, marking as offline'
                )
                events.put(('offline', ))
        else:
            if os.path.isfile(pidFile):
                if checkSlaveAlive(options.bbpath):
                    log.error(
                        'buildslave should NOT be active but pidfile found, killing buildbot'
                    )
                    events.put(('stop', ))
                else:
                    log.warning(
                        'buildslave not active but pidfile found, removing pidfile'
                    )
                    os.remove(pidFile)

    if bbActive:
        stopSlave(pidFile)

    log.info('monitor stopped')
def checkTegra(master, tegra):
    """Check one tegra, record its status and optionally try to recover it.

    master -- None, or a mapping providing 'environment', 'hostname' and
              'http_port' for the buildbot master of this tegra
    tegra  -- name of the tegra; also the name of its directory under
              options.bbpath

    The status line is written to the log, appended to
    <tegra>_status.log in the tegra's directory and passed to summary().
    Depending on the module-level `options` (reset, reboot) the buildslave
    may be stopped, the device asked to reboot, error.flg removed and the
    device power cycled.  Returns None.
    """
    tegraIP = getIPAddress(tegra)
    tegraPath = os.path.join(options.bbpath, tegra)
    exportFile = os.path.join(tegraPath, '%s_status.log' % tegra)
    errorFile = os.path.join(tegraPath, 'error.flg')
    errorFlag = os.path.isfile(errorFile)
    sTegra = 'OFFLINE'
    sutFound = False
    logTD = None

    status = {
        'tegra': tegra,
        'active': False,
        'cp': 'OFFLINE',
        'bs': 'OFFLINE',
        'msg': '',
    }

    log.debug('%s: %s' % (tegra, tegraIP))

    if master is None:
        status['environment'] = 's'
        status['master'] = 'localhost'
    else:
        status['environment'] = master['environment'][0]
        status['master'] = 'http://%s:%s' % (master['hostname'],
                                             master['http_port'])

    fPing, lPing = pingDevice(tegra)
    if fPing:
        hbSocket = None
        try:
            hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            hbSocket.settimeout(float(120))
            hbSocket.connect((tegraIP, 20700))

            # reaching this point means something is listening on the port
            sutFound = True

            time.sleep(2)

            hbSocket.send('info\n')

            d = hbSocket.recv(4096)

            log.debug('socket data length %d' % len(d))
            log.debug(d)

            status['active'] = True
        except Exception:
            status['active'] = False
            dumpException('socket')
        finally:
            # close the socket on the failure path too, so a failed
            # connect/send/recv does not leak the descriptor
            if hbSocket is not None:
                hbSocket.close()

        if status['active']:
            sTegra = 'online'
        else:
            sTegra = 'INACTIVE'

        if not sutFound:
            status['msg'] += 'SUTAgent not present;'
    else:
        status['msg'] += '%s %s;' % (lPing[0], lPing[1])

    # Cheat until we have a better check solution for new watch_devices.sh
    status['cp'] = 'active'  # pretend all is well

    if checkSlaveAlive(tegraPath):
        logTD = checkSlaveActive(tegraPath)
        if logTD is not None:
            if (logTD.days > 0) or (logTD.days == 0 and logTD.seconds > 3600):
                status['bs'] = 'INACTIVE'
                status['msg'] += 'BS %dd %ds;' % (logTD.days, logTD.seconds)
            else:
                status['bs'] = 'active'
        else:
            status['bs'] = 'INACTIVE'
    else:
        # scan thru tegra-### dir and see if any buildbot.tac.bug#### files
        # exist but ignore buildbot.tac file itself (except to note that it is
        # missing)
        files = os.listdir(tegraPath)
        found = False
        for f in files:
            if f.startswith('buildbot.tac'):
                found = True
                # 12 == len('buildbot.tac'): only report suffixed variants
                if len(f) > 12:
                    status['msg'] += '%s;' % f
        if not found:
            status['msg'] += 'buildbot.tac NOT found;'

    if errorFlag:
        status['msg'] += 'error.flg [%s] ' % getLastLine(errorFile)

    s = '%s %s %9s %8s %8s :: %s' % (status['tegra'], status['environment'],
                                     sTegra, status['cp'], status['bs'],
                                     status['msg'])
    ts = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info(s)
    # use a context manager so the status log is flushed and closed promptly
    with open(exportFile, 'a+') as exportLog:
        exportLog.write('%s %s\n' % (ts, s))
    summary(status['tegra'], status['environment'], sTegra, status['cp'],
            status['bs'], status['msg'], ts, status['master'])

    if errorFlag and options.reset:
        stopProcess(os.path.join(tegraPath, 'twistd.pid'), 'buildslave')

        if not options.reboot:
            hbSocket = None
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((tegraIP, 20700))
                hbSocket.send('rebt\n')
                log.info('rebooting tegra')
            except Exception:
                dumpException('socket')
            finally:
                if hbSocket is not None:
                    hbSocket.close()

        # errorFlag is already known to be true in this branch
        log.info('clearing error.flg')
        os.remove(errorFile)

    # Here we try to catch the state where sutagent and cp are inactive that
    # is determined by:
    #    sTegra == 'INACTIVE' and
    #    status['cp'] == 'INACTIVE'
    # status['cp'] will be set to INACTIVE only if logTD.seconds (last time
    # clientproxy updated its log file) is > 3600
    # NOTE(review): status['cp'] is currently forced to 'active' above, so
    # the "stopping hung clientproxy" branch below cannot trigger.
    if options.reboot:
        if not sutFound and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_device(tegra)
    else:
        if sTegra == 'OFFLINE' and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_device(tegra)

    if options.reset and sTegra == 'INACTIVE' and status['cp'] == 'INACTIVE':
        log.info('stopping hung clientproxy')
        stopDevice(tegra)
        time.sleep(5)
        log.info('starting clientproxy for %s' % tegra)
        os.chdir(tegraPath)
        runCommand(['python', 'clientproxy.py', '-b', '--device=%s' % tegra])