def parseIbcheckerrors(allErrs, ibCheckFile=None):
    """Parse an ibcheckerrors output file into per-(lid, port) error records.

    Args:
        allErrs: when true every counter is kept; when false only counters
            that appear in the module-level ``normalErrors`` are kept.
        ibCheckFile: file name to read, relative to the module-level
            ``ibDir``.  When None, the file returned by
            ``findMostRecentFile(ibDir, 'ibcheckerrors')`` is used.

    Returns:
        A dict keyed by ``(lid, port)``.  Each value is a dict holding
        ``'errs'``, a list of ``(counterName, count)`` tuples, and - once
        the matching "Error check on lid ..." line has been seen -
        ``'name'``, the text between the parentheses on that line.
        Records for port 255 / "all" are never returned.

    Malformed ``#warn:`` and ``Error`` lines are reported on stdout and
    skipped.  ``ibwarn:`` lines are echoed to stdout, except for the
    PortXmitWait noise from dump_perfcounters.
    """
    f = ibCheckFile
    if f is None:
        suffix = 'ibcheckerrors'
        f, fTime = findMostRecentFile( ibDir, suffix )
    # single-argument print() gives identical output on python 2 and 3
    print('using %s for errors' % (f,))

    # big problems with the fabric ->
    #
    # ibwarn: [22811] _do_madrpc: recv failed: Connection timed out
    # ibwarn: [22811] mad_rpc: _do_madrpc failed; dport (DR path slid 0; dlid 0; 0,1,31,18,3,31,17)
    # ibwarn: [22811] handle_port: NodeInfo on DR path slid 0; dlid 0; 0,1,31,18,3,31,17 failed, skipping port
    # ibwarn: [22811] _do_madrpc: recv failed: Invalid argument
    # ibwarn: [22811] mad_rpc: _do_madrpc failed; dport (DR path slid 0; dlid 0; 0,1,31,18,3,31)
    # ibwarn: [22811] discover: can't reach node DR path slid 0; dlid 0; 0,1,31,18,3,31 port 18
    #
    # normal stuff ->
    #
    # #warn: counter SymbolErrors = 65534 (threshold 10) lid 2 port 255
    # #warn: counter LinkDowned = 254 (threshold 10) lid 2 port 255
    # Error check on lid 2 (0x0021283a89190040 qnem-13-3a) port all: FAILED
    # #warn: counter SymbolErrors = 722 (threshold 10) lid 259 port 21
    # Error check on lid 259 (0x0021283a87820050 qnem-13-1b) port 21: FAILED
    # #warn: counter SymbolErrors = 11251 (threshold 10) lid 1447 port 12
    # #warn: counter LinkRecovers = 255 (threshold 10) lid 1447 port 12
    # #warn: counter RcvErrors = 2342 (threshold 10) lid 1447 port 12
    # Error check on lid 1447 (0x0021283a87760040 qnem-05-4a) port 12: FAILED
    # #warn: counter SymbolErrors = 13 (threshold 10) lid 989 port 1
    # #warn: counter RcvErrors = 13 (threshold 10) lid 989 port 1
    # Error check on lid 989 (v679 HCA-1) port 1: FAILED
    #
    # (later) many lines like this can also show up; they are ignored ->
    #
    # ibwarn: [6329] dump_perfcounters: PortXmitWait not indicated so ignore this counter

    errs = {}
    # 'with' guarantees the file is closed even if a line fails to parse
    with open( ibDir + '/' + f, 'r' ) as fh:
        for raw in fh:
            ll = raw.strip()
            l = ll.split()

            # ignore blank lines and the final '##' summary lines
            if not l or l[0].startswith('##'):
                continue

            # important errors - TODO - return these...
            if l[0] == 'ibwarn:':
                if 'dump_perfcounters:' in l and 'PortXmitWait' in l:
                    continue
                print(ll)
                continue

            if l[0] == '#warn:':
                # a well-formed line is
                #   #warn: counter <name> = <count> (threshold N) lid <lid> port <port>
                # so it needs at least 5 tokens and 'port' second from last
                if len(l) < 5 or l[-2] != 'port':
                    print('expected a port in a #warn, not " %s "' % (ll,))
                    continue
                lid = int(l[-3])
                port = int(l[-1])
                err = l[2]
                errCnt = int(l[4])
                if port == 255:
                    # 255 is the all-ports aggregate; only real ports are kept
                    continue
                if not allErrs and err not in normalErrors:
                    continue
                errs.setdefault( ( lid, port ), { 'errs': [] } )['errs'].append( ( err, errCnt ) )

            elif l[0] == 'Error':
                # a well-formed line is
                #   Error check on lid <lid> (<name>) port <port|all>: FAILED
                if len(l) < 5 or l[3] != 'lid' or '(' not in ll:
                    print('expected "Error check on lid ...", not " %s "' % (ll,))
                    continue
                lid = int(l[4])
                port = l[-2].split(':')[0]
                # ignore 255,all
                if port == 'all':
                    continue
                port = int(port)
                if port == 255:
                    continue
                key = ( lid, port )
                if key not in errs:
                    # all errs for this (lid,port) were skipped
                    continue
                errs[key]['name'] = ll.split('(')[1].split(')')[0]

    return errs
#!/usr/bin/env python # find and print nodes that have been rebooted since the last ibclearerrors # used by IB error checking scripts to eliminate nodes rebooted during that interval from error sweeps from ibTracePorts import findMostRecentFile from ibFlagErrors import uptimes, filterHosts from hms import hms import time, sys ibDir = '/root/ib' suffix = 'ibclearerrors' f, fTime = findMostRecentFile( ibDir, suffix ) #print 'using', f, 'for errors, time', fTime, 'now', time.time(), 'diff', time.time() - fTime, 'hrs', (time.time() - fTime)/3600.0 uptime, down = uptimes() if uptime == None: # failed sys.exit(1) #print 'len(uptime)', len(uptime) #, 'uptime', uptime # uptimes by hostname #print 'len(down)', len(down), 'down', down ignore = filterHosts( uptime, fTime ) ignore.sort() #print 'recently rebooted - ignore hosts', ignore, 'len', len(ignore) print '# nodes rebooted in last', hms(time.time() - fTime) for i in ignore: if i in down: continue