Ejemplo n.º 1
0
def parseIbcheckerrors(allErrs, ibCheckFile=None):
    """Parse an ibcheckerrors output file into per-(lid, port) error records.

    Arguments:
      allErrs     -- when true, keep every counter reported on a '#warn:'
                     line; otherwise keep only counters whose name is in the
                     module-level 'normalErrors'.
      ibCheckFile -- name of the file to read, relative to the module-level
                     'ibDir'.  When None, the file is chosen by
                     findMostRecentFile( ibDir, 'ibcheckerrors' ).

    Returns a dict keyed by ( lid, port ) tuples of ints.  Each value is a
    dict holding:
      'errs' -- list of ( counterName, count ) tuples, in file order
      'name' -- text between the first '(' and the following ')' of the
                matching 'Error check on lid ...' line; only present once
                such a line has been seen for that key.

    Entries for port 255 / 'all' are dropped.  'ibwarn:' lines are echoed to
    stdout (except the PortXmitWait dump_perfcounters notices) and are not
    part of the return value.
    """
    f = ibCheckFile
    if f is None:
        # the timestamp returned alongside the file name is not needed here
        f, _fTime = findMostRecentFile( ibDir, 'ibcheckerrors' )
    # single-argument print() emits the same text under Python 2 and 3
    print( 'using %s for errors' % ( f, ) )
    # 'with' closes the file even if reading fails
    with open( ibDir + '/' + f, 'r' ) as fh:
        lines = fh.readlines()

    # big problems with the fabric ->
    #
    # ibwarn: [22811] _do_madrpc: recv failed: Connection timed out
    # ibwarn: [22811] mad_rpc: _do_madrpc failed; dport (DR path slid 0; dlid 0; 0,1,31,18,3,31,17)
    # ibwarn: [22811] handle_port: NodeInfo on DR path slid 0; dlid 0; 0,1,31,18,3,31,17 failed, skipping port
    # ibwarn: [22811] _do_madrpc: recv failed: Invalid argument
    # ibwarn: [22811] mad_rpc: _do_madrpc failed; dport (DR path slid 0; dlid 0; 0,1,31,18,3,31)
    # ibwarn: [22811] discover: can't reach node DR path slid 0; dlid 0; 0,1,31,18,3,31 port 18
    #
    # normal stuff ->
    #
    # #warn: counter SymbolErrors = 65534 (threshold 10) lid 2 port 255
    # #warn: counter LinkDowned = 254 (threshold 10) lid 2 port 255
    # Error check on lid 2 (0x0021283a89190040 qnem-13-3a) port all:  FAILED
    # #warn: counter SymbolErrors = 722 (threshold 10) lid 259 port 255
    # Error check on lid 259 (0x0021283a87820050 qnem-13-1b) port all:  FAILED
    # #warn: counter SymbolErrors = 722 (threshold 10) lid 259 port 21
    # Error check on lid 259 (0x0021283a87820050 qnem-13-1b) port 21:  FAILED
    # #warn: counter SymbolErrors = 11251     (threshold 10) lid 1447 port 12
    # #warn: counter LinkRecovers = 255       (threshold 10) lid 1447 port 12
    # #warn: counter RcvErrors = 2342         (threshold 10) lid 1447 port 12
    # Error check on lid 1447 (0x0021283a87760040 qnem-05-4a) port 12: FAILED
    # #warn: counter SymbolErrors = 13        (threshold 10) lid 989 port 1
    # #warn: counter RcvErrors = 13   (threshold 10) lid 989 port 1
    # Error check on lid 989 (v679 HCA-1) port 1: FAILED
    #
    # (later) also can get many lines like this which are ignored ->
    # ibwarn: [6329] dump_perfcounters: PortXmitWait not indicated so ignore this counter

    errs = {}
    for ll in lines:
        ll = ll.strip()
        l = ll.split()

        # skip blank lines and the final '##' summary lines
        if not l or l[0].startswith( '##' ):
            continue

        # important errors - TODO - return these...
        if l[0] == 'ibwarn:':
            if 'dump_perfcounters:' in l and 'PortXmitWait' in l:
                continue
            print( ll )
            continue

        if l[0] == '#warn:':
            # the length test keeps a bare '#warn:' line from raising
            # IndexError; it is reported like any other malformed warning
            if len( l ) < 2 or l[-2] != 'port':
                print( 'expected a port in a #warn, not " %s "' % ( ll, ) )
                continue
            lid = int( l[-3] )
            port = int( l[-1] )
            err = l[2]
            errCnt = int( l[4] )
            if port == 255:
                continue
            if not allErrs and err not in normalErrors:
                continue
            key = ( lid, port )
            # direct dict membership; '.keys()' builds a list in Python 2
            if key not in errs:
                errs[key] = { 'errs': [] }
            errs[key]['errs'].append( ( err, errCnt ) )
        elif l[0] == 'Error':
            lid = int( l[4] )
            port = l[-2].split( ':' )[0]
            # ignore 255,all
            if port == 'all':
                continue
            port = int( port )
            if port == 255:
                continue
            key = ( lid, port )
            if key not in errs:  # all errs for this (lid,port) were skipped
                continue
            # only parse the name once we know the entry is kept
            errs[key]['name'] = ll.split( '(' )[1].split( ')' )[0]

    return errs
Ejemplo n.º 2
0
#!/usr/bin/env python

# find and print nodes that have been rebooted since the last ibclearerrors
# used by IB error checking scripts to eliminate nodes rebooted during that interval from error sweeps

from ibTracePorts import findMostRecentFile
from ibFlagErrors import uptimes, filterHosts
from hms import hms
import time, sys

# Directory searched for IB check output, and the file-name suffix passed to
# findMostRecentFile to select the ibclearerrors file.
ibDir = '/root/ib'
suffix = 'ibclearerrors'

# fTime is subtracted from time.time() below, so it is presumably an epoch
# timestamp for that file -- confirm against findMostRecentFile.
f, fTime = findMostRecentFile( ibDir, suffix )
#print 'using', f, 'for errors, time', fTime, 'now', time.time(), 'diff', time.time() - fTime, 'hrs', (time.time() - fTime)/3600.0

# uptimes() hands back None as its first value on failure; exit non-zero then.
uptime, down = uptimes()
if uptime is None:  # failed
    sys.exit(1)

#print 'len(uptime)', len(uptime) #, 'uptime', uptime   # uptimes by hostname
#print 'len(down)', len(down), 'down', down

# Hosts selected by filterHosts from the uptimes and the ibclearerrors time;
# per the header comment these are the nodes rebooted since that time.
ignore = filterHosts( uptime, fTime )
ignore.sort()
#print 'recently rebooted - ignore hosts', ignore, 'len', len(ignore)

# single-argument print() emits the same text under Python 2 and 3
print( '# nodes rebooted in last %s' % ( hms(time.time() - fTime), ) )
for i in ignore:
    if i in down:
        continue