Exemple #1
0
def monitorEvents(options, events):
    """This is the main state machine for the Tegra monitor.

    Respond to the events sent via queue and also monitor the
    state of the buildslave if it's been started.
    """
    pidFile   = os.path.join(options.bbpath, 'twistd.pid')
    flagFile  = os.path.join(options.bbpath, 'proxy.flg')
    errorFile = os.path.join(options.bbpath, 'error.flg')
    bbEnv     = { 'PATH':     os.getenv('PATH'),
                  'SUT_NAME': options.tegra,
                  'SUT_IP':   options.tegraIP,
                }

    event         = None
    bbActive      = False
    tegraActive   = False
    connected     = False
    nChatty       = 0
    maxChatty     = 10
    hbFails       = 0
    maxFails      = 50
    sleepFails    = 5
    softCount     = 0    # how many times tegraActive is True
                         # but errorFlag is set
    softCountMax  = 5    # how many active events to wait bdfore
                         # triggering a soft reset
    softResets    = 0
    softResetMax  = 5    # how many soft resets do we try before
                         # waiting for a hard reset
    hardResets    = 0
    hardResetsMax = 3
    lastHangCheck = time.time()

    log.info('monitoring started (process pid %s)' % current_process().pid)

    while True:
        if not connected:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((options.tegraIP, sutDataPort))
                connected = True
            except:
                connected = False
                hbFails  += 1
                log.info('Error connecting to data port - sleeping for %d seconds' % sleepFails)
                time.sleep(sleepFails)

        try:
            event = events.get(False)
        except Empty:
            event = None

        if event is None:
            if connected:
                try:
                    hbData = hbSocket.recv(4096)
                except:
                    hbData    = ''
                    connected = False
                    dumpException('hbSocket.recv()')

                if len(hbData) > 1:
                    log.debug('socket data [%s]' % hbData[:-1])

                    if 'ebooting ...' in hbData:
                        log.warning('device is rebooting')
                        events.put(('reboot',))
                        if os.path.isfile(flagFile):
                            time.sleep(5)
                        hbSocket.close()
                        connected = False
                    else:
                        log.info('heartbeat detected')
                        events.put(('active',))
                        hbFails = 0
                else:
                    hbFails += 1
        else:
            state = event[0]
            s     = 'event %s hbFails %d / %d' % (state, hbFails, maxFails)
            nChatty += 1
            if nChatty > maxChatty:
                log.info(s)
            else:
                log.debug(s)
            if nChatty > maxChatty:
                nChatty = 0

            if state == 'reboot':
                tegraActive = False
                if not os.path.isfile(flagFile):
                    log.warning('Tegra rebooting, stopping buildslave')
                    events.put(('stop',))
            elif state == 'stop' or state == 'offline':
                stopSlave(pidFile)
                bbActive = False
                if connected and state == 'offline':
                    hbSocket.close()
                    connected = False
            elif state == 'active' or state == 'dialback':
                tegraActive = True
                if not bbActive:
                    if os.path.isfile(errorFile):
                        log.warning('Tegra active but error flag set [%d/%d]' % (softCount, softResets))
                        softCount += 1
                        if softCount > softCountMax:
                            softCount = 0
                            if softResets < softResetMax:
                                softResets += 1
                                log.warning('removing error flag to see if tegra comes back')
                                os.remove(errorFile)
                            else:
                                hardResets += 1
                                log.warning('hard reset reboot check [%d/%d]' % (hardResets, hardResetsMax))
                                if hardResets < hardResetsMax:
                                    sendReboot(options.tegraIP, sutDataPort)
                                else:
                                    events.put(('offline',))
                    else:
                        events.put(('start',))
            elif state == 'start':
                if tegraActive and not bbActive:
                    log.debug('starting buildslave in %s' % options.bbpath)
                    bbProc, _ = runCommand(['twistd', '--no_save',
                                                      '--rundir=%s' % options.bbpath,
                                                      '--pidfile=%s' % pidFile,
                                                      '--python=%s' % os.path.join(options.bbpath, 'buildbot.tac')], 
                                           env=bbEnv)
                    log.info('buildslave start returned %s' % bbProc.returncode)
                    if bbProc.returncode == 0 or bbProc.returncode == 1:
                        # pause to give twistd a chance to generate the pidfile
                        # before the code that follows goes off killing it because
                        # it thinks that it didn't start properly
                        # OMGRACECONDITIONWTF
                        nTries = 0
                        while nTries < 20:
                            nTries += 1
                            if os.path.isfile(pidFile):
                                log.debug('pidfile found, setting bbActive to True')
                                bbActive = True
                                break
                            else:
                                time.sleep(5)
            elif state == 'dialback':
                softCount  = 0
                softResets = 0
                hardResets = 0
            elif state == 'terminate':
                break

        if hbFails > maxFails:
            hbFails     = 0
            sleepFails += 5
            if sleepFails > 300:
                sleepFails = 300
            if os.path.isfile(flagFile):
                log.debug('install flag found, resetting error count')
            else:
                events.put(('offline',))
            if connected:
                hbSocket.close()
                connected = False

        log.debug('bbActive %s tegraActive %s' % (bbActive, tegraActive))

        if os.path.isfile(errorFile):
            if bbActive:
                log.error('errorFile detected - sending stop request')
                events.put(('stop',))

        if bbActive:
            if os.path.isfile(pidFile):
                n = time.time()
                if not checkSlaveAlive(options.bbpath):
                    log.warning('buildslave should be active but pid is not alive')
                    if int(n - lastHangCheck) > 300:
                        lastHangCheck = n
                        logTD = checkSlaveActive(options.bbpath)
                        if logTD.days > 0 or (logTD.days == 0 and logTD.seconds > options.hangtime):
                            log.error('last activity was %d days %d seconds ago - marking as hung slave' % 
                                      (logTD.days, logTD.seconds))
                            events.put(('offline',))
            else:
                log.warning('buildslave should be active but pidfile not found, marking as offline')
                events.put(('offline',))
        else:
            if os.path.isfile(pidFile):
                if checkSlaveAlive(options.bbpath):
                    log.error('buildslave should NOT be active but pidfile found, killing buildbot')
                    events.put(('stop',))
                else:
                    log.warning('buildslave not active but pidfile found, removing pidfile')
                    os.remove(pidFile)

    if bbActive:
        stopSlave(pidFile)

    log.info('monitor stopped')
Exemple #2
0
def checkTegra(master, tegra):
    tegraIP = getIPAddress(tegra)
    tegraPath = os.path.join(options.bbpath, tegra)
    exportFile = os.path.join(tegraPath, '%s_status.log' % tegra)
    errorFile = os.path.join(tegraPath, 'error.flg')
    errorFlag = os.path.isfile(errorFile)
    sTegra = 'OFFLINE'
    sutFound = False
    logTD = None

    status = {'tegra': tegra,
              'active': False,
              'cp': 'OFFLINE',
              'bs': 'OFFLINE',
              'msg': '',
              }

    log.debug('%s: %s' % (tegra, tegraIP))

    if master is None:
        status['environment'] = 's'
        status['master'] = 'localhost'
    else:
        status['environment'] = master['environment'][0]
        status['master'] = 'http://%s:%s' % (
            master['hostname'], master['http_port'])

    fPing, lPing = pingDevice(tegra)
    if fPing:
        try:
            hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            hbSocket.settimeout(float(120))
            hbSocket.connect((tegraIP, 20700))

            sutFound = True

            time.sleep(2)

            hbSocket.send('info\n')

            d = hbSocket.recv(4096)

            log.debug('socket data length %d' % len(d))
            log.debug(d)

            status['active'] = True

            hbSocket.close()
        except:
            status['active'] = False
            dumpException('socket')

        if status['active']:
            sTegra = 'online'
        else:
            sTegra = 'INACTIVE'

        if not sutFound:
            status['msg'] += 'SUTAgent not present;'
    else:
        status['msg'] += '%s %s;' % (lPing[0], lPing[1])

    # Cheat until we have a better check solution for new watch_devices.sh
    status['cp'] = 'active'  # pretend all is well

    if checkSlaveAlive(tegraPath):
        logTD = checkSlaveActive(tegraPath)
        if logTD is not None:
            if (logTD.days > 0) or (logTD.days == 0 and logTD.seconds > 3600):
                status['bs'] = 'INACTIVE'
                status['msg'] += 'BS %dd %ds;' % (logTD.days, logTD.seconds)
            else:
                status['bs'] = 'active'
        else:
            status['bs'] = 'INACTIVE'
    else:
        # scan thru tegra-### dir and see if any buildbot.tac.bug#### files
        # exist but ignore buildbot.tac file itself (except to note that it is
        # missing)
        files = os.listdir(tegraPath)
        found = False
        for f in files:
            if f.startswith('buildbot.tac'):
                found = True
                if len(f) > 12:
                    status['msg'] += '%s;' % f
        if not found:
            status['msg'] += 'buildbot.tac NOT found;'

    if errorFlag:
        status['msg'] += 'error.flg [%s] ' % getLastLine(errorFile)

    s = '%s %s %9s %8s %8s :: %s' % (status['tegra'], status['environment'],
                                     sTegra, status['cp'], status['bs'],
                                     status['msg'])
    ts = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info(s)
    open(exportFile, 'a+').write('%s %s\n' % (ts, s))
    summary(status['tegra'], status['environment'], sTegra, status[
            'cp'], status['bs'], status['msg'], ts, status['master'])

    if errorFlag and options.reset:
        stopProcess(os.path.join(tegraPath, 'twistd.pid'), 'buildslave')

        if not options.reboot:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((tegraIP, 20700))
                hbSocket.send('rebt\n')
                hbSocket.close()
                log.info('rebooting tegra')
            except:
                dumpException('socket')

        if errorFlag:
            log.info('clearing error.flg')
            os.remove(errorFile)

    # Here we try to catch the state where sutagent and cp are inactive that
    # is determined by:
    #     sTegra == 'INACTIVE' and
    #     status['cp'] == 'INACTIVE'
    # status['cp'] will be set to INACTIVE only if logTD.seconds (last time
    # clientproxy updated it's log file) is > 3600

    if options.reboot:
        if not sutFound and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_device(tegra)
        else:
            if sTegra == 'OFFLINE' and status['bs'] != 'active':
                log.info('power cycling tegra')
                reboot_device(tegra)

    if options.reset and sTegra == 'INACTIVE' and status['cp'] == 'INACTIVE':
        log.info('stopping hung clientproxy')
        stopDevice(tegra)
        time.sleep(5)
        log.info('starting clientproxy for %s' % tegra)
        os.chdir(tegraPath)
        runCommand(['python', 'clientproxy.py', '-b', '--device=%s' % tegra])
Exemple #3
0
def checkTegra(master, tegra):
    tegraIP    = getIPAddress(tegra)
    tegraPath  = os.path.join(options.bbpath, tegra)
    exportFile = os.path.join(tegraPath, '%s_status.log' % tegra)
    errorFile  = os.path.join(tegraPath, 'error.flg')
    proxyFile  = os.path.join(tegraPath, 'proxy.flg')
    errorFlag  = os.path.isfile(errorFile)
    proxyFlag  = os.path.isfile(proxyFile)
    sTegra     = 'OFFLINE'
    sutFound   = False

    status = { 'tegra':  tegra,
               'active': False,
               'cp':     'OFFLINE',
               'bs':     'OFFLINE',
               'msg':    '',
             }

    log.debug('%s: %s' % (tegra, tegraIP))

    if master is None:
        status['environment'] = 's'
        status['master']      = 'localhost'
    else:
        status['environment'] = master['environment'][0]
        status['master']      = 'http://%s:%s' % (master['hostname'], master['http_port'])

    fPing, lPing = pingTegra(tegra)
    if fPing:
        try:
            hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            hbSocket.settimeout(float(120))
            hbSocket.connect((tegraIP, 20700))

            sutFound = True

            time.sleep(2)

            hbSocket.send('info\n')

            d = hbSocket.recv(4096)

            log.debug('socket data length %d' % len(d))
            log.debug(d)

            status['active'] = True

            hbSocket.close()
        except:
            status['active'] = False
            dumpException('socket')

        if status['active']:
            sTegra = 'online'
        else:
            sTegra = 'INACTIVE'

        if not sutFound:
            status['msg'] += 'SUTAgent not present;'
    else:
        status['msg'] += '%s %s;' % (lPing[0], lPing[1])

    if checkCPAlive(tegraPath):
        logTD = checkCPActive(tegraPath)
        if logTD is not None and logTD.days > 0 or (logTD.days == 0 and logTD.seconds > 300):
            status['cp']   = 'INACTIVE'
            status['msg'] += 'CP %dd %ds;' % (logTD.days, logTD.seconds)
        else:
            status['cp'] = 'active'
    else:
        if os.path.isfile(os.path.join(tegraPath, 'clientproxy.pid')):
            status['msg'] += 'clientproxy.pid found;'

    if checkSlaveAlive(tegraPath):
        logTD = checkSlaveActive(tegraPath)
        if logTD is not None and logTD.days > 0 or (logTD.days == 0 and logTD.seconds > 3600):
            status['bs']   = 'INACTIVE'
            status['msg'] += 'BS %dd %ds;' % (logTD.days, logTD.seconds)
        else:
            status['bs'] = 'active'
    else:
        # scan thru tegra-### dir and see if any buildbot.tac.bug#### files exist
        # but ignore buildbot.tac file itself (except to note that it is missing)
        files = os.listdir(tegraPath)
        found = False
        for f in files:
            if f.startswith('buildbot.tac'):
                found = True
                if len(f) > 12:
                    status['msg'] += '%s;' % f
        if not found:
            status['msg'] += 'buildbot.tac NOT found;'

    if errorFlag:
        status['msg'] += 'error.flg [%s] ' % getLastLine(errorFile)
    if proxyFlag:
        status['msg'] += 'REBOOTING '

    s  = '%s %s %9s %8s %8s :: %s' % (status['tegra'], status['environment'], sTegra, status['cp'], status['bs'], status['msg'])
    ts = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info(s)
    open(exportFile, 'a+').write('%s %s\n' % (ts, s))
    summary(status['tegra'], status['environment'], sTegra, status['cp'], status['bs'], status['msg'], ts, status['master'])

    if errorFlag and options.reset:
        stopProcess(os.path.join(tegraPath, 'twistd.pid'), 'buildslave')

        if not options.reboot:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((tegraIP, 20700))
                hbSocket.send('rebt\n')
                hbSocket.close()
                log.info('rebooting tegra')
            except:
                dumpException('socket')

        if errorFlag:
            log.info('clearing error.flg')
            os.remove(errorFile)
        if proxyFlag:
            log.info('clearing proxy.flg')
            os.remove(proxyFile)

    if options.reboot:
        if not sutFound and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_tegra(tegra)
        else:
            if sTegra == 'OFFLINE' and status['bs'] != 'active':
                log.info('power cycling tegra')
                reboot_tegra(tegra)
Exemple #4
0
def checkTegra(master, tegra):
    tegraIP = getIPAddress(tegra)
    tegraPath = os.path.join(options.bbpath, tegra)
    exportFile = os.path.join(tegraPath, '%s_status.log' % tegra)
    errorFile = os.path.join(tegraPath, 'error.flg')
    proxyFile = os.path.join(tegraPath, 'proxy.flg')
    errorFlag = os.path.isfile(errorFile)
    proxyFlag = os.path.isfile(proxyFile)
    sTegra = 'OFFLINE'
    sutFound = False

    status = {
        'tegra': tegra,
        'active': False,
        'cp': 'OFFLINE',
        'bs': 'OFFLINE',
        'msg': '',
    }

    log.debug('%s: %s' % (tegra, tegraIP))

    if master is None:
        status['environment'] = 's'
        status['master'] = 'localhost'
    else:
        status['environment'] = master['environment'][0]
        status['master'] = 'http://%s:%s' % (master['hostname'],
                                             master['http_port'])

    fPing, lPing = pingTegra(tegra)
    if fPing:
        try:
            hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            hbSocket.settimeout(float(120))
            hbSocket.connect((tegraIP, 20700))

            sutFound = True

            time.sleep(2)

            hbSocket.send('info\n')

            d = hbSocket.recv(4096)

            log.debug('socket data length %d' % len(d))
            log.debug(d)

            status['active'] = True

            hbSocket.close()
        except:
            status['active'] = False
            dumpException('socket')

        if status['active']:
            sTegra = 'online'
        else:
            sTegra = 'INACTIVE'

        if not sutFound:
            status['msg'] += 'SUTAgent not present;'
    else:
        status['msg'] += '%s %s;' % (lPing[0], lPing[1])

    if checkCPAlive(tegraPath):
        logTD = checkCPActive(tegraPath)
        if logTD is not None and logTD.days > 0 or (logTD.days == 0
                                                    and logTD.seconds > 300):
            status['cp'] = 'INACTIVE'
            status['msg'] += 'CP %dd %ds;' % (logTD.days, logTD.seconds)
        else:
            status['cp'] = 'active'
    else:
        if os.path.isfile(os.path.join(tegraPath, 'clientproxy.pid')):
            status['msg'] += 'clientproxy.pid found;'

    if checkSlaveAlive(tegraPath):
        logTD = checkSlaveActive(tegraPath)
        if logTD is not None and logTD.days > 0 or (logTD.days == 0
                                                    and logTD.seconds > 3600):
            status['bs'] = 'INACTIVE'
            status['msg'] += 'BS %dd %ds;' % (logTD.days, logTD.seconds)
        else:
            status['bs'] = 'active'
    else:
        # scan thru tegra-### dir and see if any buildbot.tac.bug#### files exist
        # but ignore buildbot.tac file itself (except to note that it is missing)
        files = os.listdir(tegraPath)
        found = False
        for f in files:
            if f.startswith('buildbot.tac'):
                found = True
                if len(f) > 12:
                    status['msg'] += '%s;' % f
        if not found:
            status['msg'] += 'buildbot.tac NOT found;'

    if errorFlag:
        status['msg'] += 'error.flg [%s] ' % getLastLine(errorFile)
    if proxyFlag:
        status['msg'] += 'REBOOTING '

    s = '%s %s %9s %8s %8s :: %s' % (status['tegra'], status['environment'],
                                     sTegra, status['cp'], status['bs'],
                                     status['msg'])
    ts = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info(s)
    open(exportFile, 'a+').write('%s %s\n' % (ts, s))
    summary(status['tegra'], status['environment'], sTegra, status['cp'],
            status['bs'], status['msg'], ts, status['master'])

    if errorFlag and options.reset:
        stopProcess(os.path.join(tegraPath, 'twistd.pid'), 'buildslave')

        if not options.reboot:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((tegraIP, 20700))
                hbSocket.send('rebt\n')
                hbSocket.close()
                log.info('rebooting tegra')
            except:
                dumpException('socket')

        if errorFlag:
            log.info('clearing error.flg')
            os.remove(errorFile)
        if proxyFlag:
            log.info('clearing proxy.flg')
            os.remove(proxyFile)

    if options.reboot:
        if not sutFound and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_tegra(tegra)
        else:
            if sTegra == 'OFFLINE' and status['bs'] != 'active':
                log.info('power cycling tegra')
                reboot_tegra(tegra)
Exemple #5
0
def monitorEvents(options, events):
    """This is the main state machine for the Tegra monitor.

    Respond to the events sent via queue and also monitor the
    state of the buildslave if it's been started.
    """
    pidFile = os.path.join(options.bbpath, 'twistd.pid')
    flagFile = os.path.join(options.bbpath, 'proxy.flg')
    errorFile = os.path.join(options.bbpath, 'error.flg')
    bbEnv = {
        'PATH': os.getenv('PATH'),
        'SUT_NAME': options.tegra,
        'SUT_IP': options.tegraIP,
    }

    event = None
    bbActive = False
    tegraActive = False
    connected = False
    nChatty = 0
    maxChatty = 10
    hbFails = 0
    maxFails = 50
    sleepFails = 5
    softCount = 0  # how many times tegraActive is True
    # but errorFlag is set
    softCountMax = 5  # how many active events to wait bdfore
    # triggering a soft reset
    softResets = 0
    softResetMax = 5  # how many soft resets do we try before
    # waiting for a hard reset
    hardResets = 0
    hardResetsMax = 3
    lastHangCheck = time.time()

    log.info('monitoring started (process pid %s)' % current_process().pid)

    while True:
        if not connected:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((options.tegraIP, sutDataPort))
                connected = True
            except:
                connected = False
                hbFails += 1
                log.info(
                    'Error connecting to data port - sleeping for %d seconds' %
                    sleepFails)
                time.sleep(sleepFails)

        try:
            event = events.get(False)
        except Empty:
            event = None

        if event is None:
            if connected:
                try:
                    hbData = hbSocket.recv(4096)
                except:
                    hbData = ''
                    connected = False
                    dumpException('hbSocket.recv()')

                if len(hbData) > 1:
                    log.debug('socket data [%s]' % hbData[:-1])

                    if 'ebooting ...' in hbData:
                        log.warning('device is rebooting')
                        events.put(('reboot', ))
                        if os.path.isfile(flagFile):
                            time.sleep(5)
                        hbSocket.close()
                        connected = False
                    else:
                        log.info('heartbeat detected')
                        events.put(('active', ))
                        hbFails = 0
                else:
                    hbFails += 1
        else:
            state = event[0]
            s = 'event %s hbFails %d / %d' % (state, hbFails, maxFails)
            nChatty += 1
            if nChatty > maxChatty:
                log.info(s)
            else:
                log.debug(s)
            if nChatty > maxChatty:
                nChatty = 0

            if state == 'reboot':
                tegraActive = False
                if not os.path.isfile(flagFile):
                    log.warning('Tegra rebooting, stopping buildslave')
                    events.put(('stop', ))
            elif state == 'stop' or state == 'offline':
                stopSlave(pidFile)
                bbActive = False
                if connected and state == 'offline':
                    hbSocket.close()
                    connected = False
            elif state == 'active' or state == 'dialback':
                tegraActive = True
                if not bbActive:
                    if os.path.isfile(errorFile):
                        log.warning('Tegra active but error flag set [%d/%d]' %
                                    (softCount, softResets))
                        softCount += 1
                        if softCount > softCountMax:
                            softCount = 0
                            if softResets < softResetMax:
                                softResets += 1
                                log.warning(
                                    'removing error flag to see if tegra comes back'
                                )
                                os.remove(errorFile)
                            else:
                                hardResets += 1
                                log.warning('hard reset reboot check [%d/%d]' %
                                            (hardResets, hardResetsMax))
                                if hardResets < hardResetsMax:
                                    sendReboot(options.tegraIP, sutDataPort)
                                else:
                                    events.put(('offline', ))
                    else:
                        events.put(('start', ))
            elif state == 'start':
                if tegraActive and not bbActive:
                    log.debug('starting buildslave in %s' % options.bbpath)
                    bbProc, _ = runCommand([
                        'twistd', '--no_save',
                        '--rundir=%s' % options.bbpath,
                        '--pidfile=%s' % pidFile,
                        '--python=%s' %
                        os.path.join(options.bbpath, 'buildbot.tac')
                    ],
                                           env=bbEnv)
                    log.info('buildslave start returned %s' %
                             bbProc.returncode)
                    if bbProc.returncode == 0 or bbProc.returncode == 1:
                        # pause to give twistd a chance to generate the pidfile
                        # before the code that follows goes off killing it because
                        # it thinks that it didn't start properly
                        # OMGRACECONDITIONWTF
                        nTries = 0
                        while nTries < 20:
                            nTries += 1
                            if os.path.isfile(pidFile):
                                log.debug(
                                    'pidfile found, setting bbActive to True')
                                bbActive = True
                                break
                            else:
                                time.sleep(5)
            elif state == 'dialback':
                softCount = 0
                softResets = 0
                hardResets = 0
            elif state == 'terminate':
                break

        if hbFails > maxFails:
            hbFails = 0
            sleepFails += 5
            if sleepFails > 300:
                sleepFails = 300
            if os.path.isfile(flagFile):
                log.debug('install flag found, resetting error count')
            else:
                events.put(('offline', ))
            if connected:
                hbSocket.close()
                connected = False

        log.debug('bbActive %s tegraActive %s' % (bbActive, tegraActive))

        if os.path.isfile(errorFile):
            if bbActive:
                log.error('errorFile detected - sending stop request')
                events.put(('stop', ))

        if bbActive:
            if os.path.isfile(pidFile):
                n = time.time()
                if not checkSlaveAlive(options.bbpath):
                    log.warning(
                        'buildslave should be active but pid is not alive')
                    if int(n - lastHangCheck) > 300:
                        lastHangCheck = n
                        logTD = checkSlaveActive(options.bbpath)
                        if logTD.days > 0 or (logTD.days == 0 and logTD.seconds
                                              > options.hangtime):
                            log.error(
                                'last activity was %d days %d seconds ago - marking as hung slave'
                                % (logTD.days, logTD.seconds))
                            events.put(('offline', ))
            else:
                log.warning(
                    'buildslave should be active but pidfile not found, marking as offline'
                )
                events.put(('offline', ))
        else:
            if os.path.isfile(pidFile):
                if checkSlaveAlive(options.bbpath):
                    log.error(
                        'buildslave should NOT be active but pidfile found, killing buildbot'
                    )
                    events.put(('stop', ))
                else:
                    log.warning(
                        'buildslave not active but pidfile found, removing pidfile'
                    )
                    os.remove(pidFile)

    if bbActive:
        stopSlave(pidFile)

    log.info('monitor stopped')
Exemple #6
0
def checkTegra(master, tegra):
    tegraIP = getIPAddress(tegra)
    tegraPath = os.path.join(options.bbpath, tegra)
    exportFile = os.path.join(tegraPath, '%s_status.log' % tegra)
    errorFile = os.path.join(tegraPath, 'error.flg')
    errorFlag = os.path.isfile(errorFile)
    sTegra = 'OFFLINE'
    sutFound = False
    logTD = None

    status = {
        'tegra': tegra,
        'active': False,
        'cp': 'OFFLINE',
        'bs': 'OFFLINE',
        'msg': '',
    }

    log.debug('%s: %s' % (tegra, tegraIP))

    if master is None:
        status['environment'] = 's'
        status['master'] = 'localhost'
    else:
        status['environment'] = master['environment'][0]
        status['master'] = 'http://%s:%s' % (master['hostname'],
                                             master['http_port'])

    fPing, lPing = pingDevice(tegra)
    if fPing:
        try:
            hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            hbSocket.settimeout(float(120))
            hbSocket.connect((tegraIP, 20700))

            sutFound = True

            time.sleep(2)

            hbSocket.send('info\n')

            d = hbSocket.recv(4096)

            log.debug('socket data length %d' % len(d))
            log.debug(d)

            status['active'] = True

            hbSocket.close()
        except:
            status['active'] = False
            dumpException('socket')

        if status['active']:
            sTegra = 'online'
        else:
            sTegra = 'INACTIVE'

        if not sutFound:
            status['msg'] += 'SUTAgent not present;'
    else:
        status['msg'] += '%s %s;' % (lPing[0], lPing[1])

    # Cheat until we have a better check solution for new watch_devices.sh
    status['cp'] = 'active'  # pretend all is well

    if checkSlaveAlive(tegraPath):
        logTD = checkSlaveActive(tegraPath)
        if logTD is not None:
            if (logTD.days > 0) or (logTD.days == 0 and logTD.seconds > 3600):
                status['bs'] = 'INACTIVE'
                status['msg'] += 'BS %dd %ds;' % (logTD.days, logTD.seconds)
            else:
                status['bs'] = 'active'
        else:
            status['bs'] = 'INACTIVE'
    else:
        # scan thru tegra-### dir and see if any buildbot.tac.bug#### files
        # exist but ignore buildbot.tac file itself (except to note that it is
        # missing)
        files = os.listdir(tegraPath)
        found = False
        for f in files:
            if f.startswith('buildbot.tac'):
                found = True
                if len(f) > 12:
                    status['msg'] += '%s;' % f
        if not found:
            status['msg'] += 'buildbot.tac NOT found;'

    if errorFlag:
        status['msg'] += 'error.flg [%s] ' % getLastLine(errorFile)

    s = '%s %s %9s %8s %8s :: %s' % (status['tegra'], status['environment'],
                                     sTegra, status['cp'], status['bs'],
                                     status['msg'])
    ts = time.strftime('%Y-%m-%d %H:%M:%S')
    log.info(s)
    open(exportFile, 'a+').write('%s %s\n' % (ts, s))
    summary(status['tegra'], status['environment'], sTegra, status['cp'],
            status['bs'], status['msg'], ts, status['master'])

    if errorFlag and options.reset:
        stopProcess(os.path.join(tegraPath, 'twistd.pid'), 'buildslave')

        if not options.reboot:
            try:
                hbSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                hbSocket.settimeout(float(120))
                hbSocket.connect((tegraIP, 20700))
                hbSocket.send('rebt\n')
                hbSocket.close()
                log.info('rebooting tegra')
            except:
                dumpException('socket')

        if errorFlag:
            log.info('clearing error.flg')
            os.remove(errorFile)

    # Here we try to catch the state where sutagent and cp are inactive that
    # is determined by:
    #     sTegra == 'INACTIVE' and
    #     status['cp'] == 'INACTIVE'
    # status['cp'] will be set to INACTIVE only if logTD.seconds (last time
    # clientproxy updated it's log file) is > 3600

    if options.reboot:
        if not sutFound and status['bs'] != 'active':
            log.info('power cycling tegra')
            reboot_device(tegra)
        else:
            if sTegra == 'OFFLINE' and status['bs'] != 'active':
                log.info('power cycling tegra')
                reboot_device(tegra)

    if options.reset and sTegra == 'INACTIVE' and status['cp'] == 'INACTIVE':
        log.info('stopping hung clientproxy')
        stopDevice(tegra)
        time.sleep(5)
        log.info('starting clientproxy for %s' % tegra)
        os.chdir(tegraPath)
        runCommand(['python', 'clientproxy.py', '-b', '--device=%s' % tegra])