def check_saphana():
    T_table = TaskTable()
    sap = T_table.getByComm("sapstart")
    hana = T_table.getByComm("hdbindexserver")
    if (sap and hana):
        return 2
    elif (sap):
        return 1
    else:
        return 0
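# Illustrative sketch (not part of the original script): one way a caller might
# translate check_saphana()'s 0/1/2 return code into a log message. The helper
# name and message wording below are hypothetical.
def _report_saphana_status():
    rc = check_saphana()
    if (rc == 2):
        pylog.info("SAP HANA workload detected: sapstart and hdbindexserver are running")
    elif (rc == 1):
        pylog.info("SAP workload detected: sapstart is running, hdbindexserver is not")
    # rc == 0: neither process found, nothing to report
    return rc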
def check_hanging_nfsd(_funcpids):
    subpids = _funcpids("nfsd")
    d = defaultdict(int)
    total = 0
    T_table = TaskTable()
    for pid in subpids:
        t = T_table.getByTid(pid)
        d[t.state] += 1
        total += 1
    if ("TASK_UNINTERRUPTIBLE" in d):
        pylog.warning("Hanging nfsd threads")
        return True
    else:
        return False
def find_and_print(pid):
    tt = TaskTable()
    # if pid > INT_MAX, let us treat it as addr of 'struct task_struct'
    if (pid > INT_MAX):
        # Do we have this pid in tt?
        t = readSU("struct task_struct", pid)
        if (not tt.getByTid(t.pid)):
            print("Bogus addr")
            return
        t = Task(t, tt)
    else:
        t = tt.getByTid(pid)
    if (t):
        printTaskDetails(t)
    else:
        print("There is no task with pid=", pid)
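# Hypothetical usage sketch, for illustration only: find_and_print() accepts
# either a numeric PID/TID or, when the value exceeds INT_MAX, a kernel address
# of a 'struct task_struct'. The PID and address values below are made up.
def _find_and_print_examples():
    find_and_print(1)                    # numeric PID, looked up via TaskTable
    find_and_print(0xffff9c0a12345678)   # value > INT_MAX, treated as a
                                         # 'struct task_struct' address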
def print_pstree(options):
    global pid_cnt
    pid_cnt = 0

    init_task = readSymbol("init_task")
    if (options.task_id > 0):
        tt = TaskTable()
        init_task = tt.getByPid(options.task_id)
        if (init_task == None):
            init_task = findTaskByPid(options.task_id)
        if (init_task == None):
            return

    print_task(init_task, 0, True, options)
    print_children(init_task, 0, options)

    print("\n\nTotal %s tasks printed" % (pid_cnt))
def check_possible_hang():
    T_table = TaskTable()
    pids_UN = {t.pid for t in T_table.allThreads()
               if t.ts.state & TASK_STATE.TASK_UNINTERRUPTIBLE}
    tot_UN = len(pids_UN)
    # Now check how many pids are older than 120s
    mlist = []
    for pid in pids_UN:
        t = T_table.getByTid(pid)
        mlist.append((t.Ran_ago, pid))
    mlist = sorted(mlist)
    n_old = 0
    for ran_ago, pid in mlist[-10:]:
        if (ran_ago > __OLD_AGO):
            n_old += 1
    if (n_old > 1):
        pylog.warning("Possible hang")
    return n_old
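# Note (assumption): __OLD_AGO is defined elsewhere in this script. Judging by
# the comment above ("older than 120s") and by Ran_ago being expressed in
# milliseconds, it is presumably a threshold on the order of 120000 ms.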
def check_memory_pressure(_funcpids):
    subpids = _funcpids(__mp_names)
    if (not subpids):
        return False
    if (len(subpids) < 100):
        verifyFastSet(subpids, __mp_names)
    d = defaultdict(int)
    total = 0
    T_table = TaskTable()
    for pid in subpids:
        t = T_table.getByTid(pid)
        d[t.state] += 1
        total += 1
    if ("TASK_UNINTERRUPTIBLE" in d or total > 20):
        pylog.warning("Memory pressure detected")
        print(" *** {} ***".format(__mp_names))
        for k, v in d.items():
            print(" {:4d} in {} state".format(v, k))
        return True
def do_check():
    tt = TaskTable()
    has_do_coredump = False
    has_UN = False
    for task in tt.getThreadsByComm('adclient'):
        pid = task.pid
        stack = exec_bt("bt {}".format(task.pid))[0]
        if (stack.hasfunc('do_coredump')):
            has_do_coredump = True
        if (task.ts.state & TASK_STATE.TASK_UNINTERRUPTIBLE):
            has_UN = True
        if (has_do_coredump and has_UN):
            break
    else:
        return

    # After some commands issued, GDB returns incorrect type for this -
    # 'char core_pattern[];' instead of 'char core_pattern[CORENAME_MAX_SIZE]'
    addr = sym2addr("core_pattern")
    core_pattern = SmartString(readmem(addr, 1), addr, None)
    #core_pattern = readSymbol("core_pattern")
    if (not core_pattern.startswith("|")):
        return
    abrt_hook = tt.getByComm('abrt-hook-ccpp')
    if (not abrt_hook):
        return
    __daemon = "/var/centrifydc/daemon"
    for sock in abrt_hook[0].get_task_socks():
        family, sktype, protoname, inet = decodeSock(sock)
        if (protoname == 'UNIX'):
            sock = sock.castTo("struct unix_sock")
            state, ino, s_path = unix_sock(sock)
            p_state, p_ino, p_path = unix_sock(sock.Peer)
            for path in (s_path, p_path):
                if (path.startswith(__daemon)):
                    pylog.info(__txt)
def run_check_on_multipath():
    tt = TaskTable()
    bts = []
    errors = 0
    task_cnt = 0
    multipathd_daemon = 0   # To verify if multipathd daemon is running
    multipath_blocked = 0   # To verify if multipathd daemon or command is blocked
    mpath_present = 0       # To verify if multipath device exists with or without
                            # multipathd daemon running
    wq_blocked = 0          # To verify if scsi_wq or fc_wq is blocked
    kworker_md_blocked = 0  # Counter for hung worker threads which are waiting for
                            # IO requests on mdraid devices

    print("\nChecking for device-mapper issues...\n")
    for t in tt.allThreads():
        print("Getting a list of processes in UN state..."
              "(Count: {:d})".format(task_cnt), end="\r")
        if ('multipathd' in t.comm):
            multipathd_daemon = 1
        if (t.ts.state & TASK_STATE.TASK_UNINTERRUPTIBLE):
            task_cnt += 1
            # crash can miss some threads when there are pages missing
            # and it will not do 'bt' in that case.
            try:
                bts.append(exec_bt("bt %d" % t.pid)[0])
            except:
                pass
    print("Getting a list of processes in UN state...\t\t\t[Done]")

    if (task_cnt):
        print("\nProcessing the back trace of hung tasks...\t\t\t", end='')
        for bt in bts:
            if ('kworker' in bt.cmd):
                if (bt.hasfunc('md_flush_request') and
                        bt.hasfunc('dio_aio_complete_work')):
                    kworker_md_blocked += 1
            if ('multipath' in bt.cmd):
                multipath_blocked = 1
            if (('scsi_wq' in bt.cmd) or ('fc_wq' in bt.cmd)):
                wq_blocked = 1
        print("[Done]")

    # Checks for dm devices
    for dev in devlist:
        md, name = dev
        dm_table_map = StructResult("struct dm_table", md.map)
        # Check if there is any multipath device present in device-mapper table
        if (dm_table_map.targets.type.name == "multipath"):
            mpath_present += 1

    # Check if kworker threads are stuck waiting to flush IO on mdraid devices
    if (kworker_md_blocked >= 5):
        print("\n ** {} kworker threads are stuck in UN state waiting to flush the IO"
              "\n requests on mdraid devices. This could be a result of thundering"
              "\n herd problem. See reference: "
              "\n https://marc.info/?l=linux-raid&m=155364683109115&w=2".
              format(kworker_md_blocked))
        print("\n Run 'hanginfo' for more information on processes in UN state.")
        errors += 1

    # multipath devices are present but multipathd is not running
    if (mpath_present != 0 and multipathd_daemon == 0):
        print("\n ** multipath device(s) are present, but multipathd service is"
              "\n not running. IO failover/failback may not work.")
        errors += 1

    # scsi or fc work queue and multipathd are blocked
    if (multipath_blocked == 1 and wq_blocked == 1):
        print("\n ** multipathd and scsi/fc work_queue processes are stuck in UN state,"
              "\n this could block IO failover on multipath devices")
        print("\n Run 'hanginfo' for more information on processes in UN state.")
        errors += 1
    # only multipathd process is stuck in UN state
    elif (multipath_blocked == 1):
        print("\n ** multipathd processes stuck in UN state,"
              "\n this could block IO failover on multipath devices")
        print("\n Run 'hanginfo' for more information on processes in UN state.")
        errors += 1

    if (errors == 0 and task_cnt != 0):
        print("\n No device-mapper, multipath issues detected by utility,"
              "\n but found {} processes in UN state.".format(task_cnt))
        print("\n Run 'hanginfo' for more information on processes in UN state.")
    elif (errors == 0 and task_cnt == 0):
        print("No issues detected by utility.")
def print_wait_for_AF_UNIX(v=0):
    tt = TaskTable()
    # Part I - find all Unix sockets with peers
    peers_dict = defaultdict(list)      # peer-> (task, sock) list
    socks_dict = defaultdict(list)      # sock-> owners
    for t in tt.allTasks():
        once = TrueOnce(1)
        try:
            task_fds = t.taskFds()
        except crash.error:
            # page excluded
            continue
        last_ran = float(t.Ran_ago) / 1000
        for fd, filep, dentry, inode in task_fds:
            socketaddr = proto.inode2socketaddr(inode)
            if (not socketaddr):
                continue

            socket = readSU("struct socket", socketaddr)
            sock = Deref(socket.sk)
            family, sktype, protoname, inet = proto.decodeSock(sock)
            if (family != proto.P_FAMILIES.PF_FILE):
                continue

            # AF_UNIX. on 2.4 we have just 'struct sock',
            # on 2.6 'struct unix_sock'
            if (not proto.sock_V1):
                sock = sock.castTo("struct unix_sock")
            #u_sock = readSU("struct unix_sock", 0xffff81073a7c3180)
            state, ino, path = proto.unix_sock(sock)
            socks_dict[sock].append((last_ran, t))
            # Check whether we have a peer
            peer = sock.Peer
            if (peer):
                peers_dict[peer].append((t, sock))

    # Part II - look at all peers
    nonempty_tasklist = []
    for peer, lst in peers_dict.items():
        state, ino, path = proto.unix_sock(peer)
        #if (path != "/dev/log"):
        #    continue
        #sleep = peer.sk.sk_sleep
        try:
            waitq = peer.peer_wait
        except:
            waitq = peer.peer_wq.wait
        tasklist = decode_waitq(waitq)
        if (tasklist):
            owners = sorted(socks_dict[peer])
            if (not owners):
                pylog.warning("Cannot find a socket for peer {}".format(peer))
                continue
            last_ran, t = owners[0]
            # Sanitize tasklist in case it has corrupted pointers
            ntasklist = []
            pids = []
            for tt in tasklist:
                try:
                    pids.append(tt.pid)
                    ntasklist.append(tt)
                except:
                    pylog.warning("Corrupted waitq of", peer)
            tasklist = ntasklist
            state, ino, path = proto.unix_sock(peer)
            # if last_ran is greater than this, issue a warning
            __max_time = 5
            stack = t.get_task_stack()
            if (stack.hasfunc("console_lock")):
                pylog.warning("Syslog daemon is waiting for serial console")
                print(stack)
            if (v < 1 and last_ran < __max_time):
                continue
            if (v < 1 and path == "/dev/log"):
                # Just issue warnings
                msg = ("A problem with syslog daemon <{}> pid={} state={}\n"
                       " It ran {:5.2f}s ago and {} processes"
                       " are waiting for it"
                       "".format(t.comm, t.pid, t.state[5:7],
                                 last_ran, len(tasklist)))
                if (v < 0):
                    msg += ("\n Run 'hanginfo --syslogger -v' to get"
                            " more details")
                if (t.pid in pids):
                    msg += ("\n Task pid={} CMD={} is waiting for"
                            " its own socket".format(t.pid, t.comm))
                pylog.warning(msg)
                if (v < 0):
                    return
            print(" -- Socket we wait for: {} {}".format(peer, path))
            print(" Youngest process with this socket <{}> pid={}({}) ran "
                  "{:5.2f}s ago".format(t.comm, t.pid, t.state[5:7], last_ran))
            # Tasklist has been already sanitized getting rid of those
            # elements where we have been unable to deref task.pid
            if (tasklist):
                print(" ... {} tasks waiting for this socket".\
                      format(len(tasklist)))
                if (v > 0):
                    for task in sorted(tasklist,
                                       key=operator.attrgetter('pid')):
                        print(" pid=%7d CMD=%s" % (task.pid, task.comm))
            else:
                print(" cannot print tasks as socket wait queue is corrupted")
def print_pidlist(pids, title='', verbose=False, maxpids=10,
                  sortbytime=True, statefilter=None):
    npids = len(pids)
    if (npids < 1):
        return
    # Prepare a list of (ran_ms_ago, pid) list to sort them
    mlist = []
    T_table = TaskTable()
    for pid in pids:
        t = T_table.getByTid(pid)
        if (statefilter):
            state = t.state[5:7]
            if (state in statefilter):
                mlist.append((int(t.Ran_ago), pid))
        else:
            mlist.append((int(t.Ran_ago), pid))
    mlist = sorted(mlist)
    # Youngest and oldest
    if (not mlist):
        print(" no Pids")
        return
    ago_y, pid_y = mlist[0]
    ago_o, pid_o = mlist[-1]
    print(" ... {} pids. Youngest,oldest: {}, {} Ran ms ago:"
          " {}, {}".format(npids, pid_y, pid_o, ago_y, ago_o))
    if (maxpids < 1):
        return
    if (npids > maxpids):
        print(" printing {} out of {}".format(maxpids, npids))
    if (sortbytime):
        # ............. sorted by time ................................
        if (npids > maxpids):
            n1 = maxpids // 2
            n2 = maxpids - n1
            ml1 = mlist[:n1]
            ml2 = mlist[-n2:]
            skipped = npids - maxpids
            mlp = ml1 + [(None, "<{} skipped>".format(skipped))] + ml2
        else:
            mlp = mlist
        if (verbose):
            print(" PID CMD CPU Ran ms ago STATE")
            print(" -------- ------------ -- ------------- -----")
            for tago, pid in mlp:
                if (tago is not None):
                    t = T_table.getByTid(pid)
                    comm = str(t.comm)[:11]
                    state = t.state[5:7]
                    print(" {:8d} {:12s} {:3d} {:10d} {}".\
                          format(pid, comm, t.cpu, tago, state))
                else:
                    print(" {}".format(pid))
        else:
            print(" sorted by ran_ago, youngest first".format(npids))
            mlp_s = [pid for (ts, pid) in mlp]
            __print_pids(mlp_s, title)
    else:
        # ............. sorted by pid .................................
        pids = sorted(pids)
        if (npids > maxpids):
            n1 = maxpids // 2
            n2 = maxpids - n1
            ml1 = pids[:n1]
            ml2 = pids[-n2:]
            mlp = ml1 + ["..."] + ml2
        else:
            mlp = pids
        print(" sorted by pid")
        __print_pids(mlp, title)
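# Hypothetical usage sketch, for illustration only: print up to five tasks from
# a previously collected pid set that are in the 'UN' (uninterruptible) state,
# sorted by how long ago they last ran.
def _example_print_un_pids(pids):
    print_pidlist(pids, title='UN tasks', verbose=True, maxpids=5,
                  sortbytime=True, statefilter=('UN',))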
def print_dev_pack():
    ptype_all = readSymbol("ptype_all")
    #print ("ptype_all=", ptype_all, "\n")
    # For 2.4 packet_type has next pointer, for 2.6 list_head is embedded
    newstyle = (whatis("ptype_base").ctype == "struct list_head")
    if (newstyle):
        offset = member_offset("struct packet_type", "list")
    else:
        offset = member_offset("struct packet_type", "next")
    print("--------ptype_all-------------------------------------------")
    tt = TaskTable()
    if (newstyle):
        for pt in readSUListFromHead(Addr(ptype_all), "list",
                                     "struct packet_type"):
            print(pt)
            ptype = ntohs(pt.type)
            pdev = pt.dev
            pfunc = addr2sym(pt.func)
            print("\ttype=0x%04x dev=0x%x func=%s" % (ptype, pdev, pfunc))
            # for SOCK_PACKET and AF_PACKET we can find PID
            if (pt.af_packet_priv == 0):
                continue
            if (pfunc == 'packet_rcv' or pfunc == 'packet_rcv_spkt'):
                sock = readSU("struct sock", pt.af_packet_priv)
                socket = Deref(sock.sk_socket)
                filep = socket.file
                for t in tt.getByFile(filep):
                    print("\t pid=%d, command=%s" % (t.pid, t.comm))
    else:
        # 2.4
        for pa in readList(ptype_all, offset):
            pt = readSU("struct packet_type", pa)
            print(pt)
            ptype = ntohs(pt.type)
            pdev = pt.dev
            pfunc = addr2sym(pt.func)
            print("\ttype=0x%04x dev=0x%x func=%s" % (ptype, pdev, pfunc))

    print("\n--------ptype_base-------------------------------------------")
    bucket = 0
    for a in readSymbol("ptype_base"):
        if (newstyle):
            for pt in readSUListFromHead(Addr(a), "list",
                                         "struct packet_type"):
                print(pt, " (bucket=%d)" % bucket)
                ptype = ntohs(pt.type)
                pdev = pt.dev
                pfunc = addr2sym(pt.func)
                print("\ttype=0x%04x dev=0x%x func=%s" % (ptype, pdev, pfunc))
        else:
            # 2.4
            if (a == 0):
                continue
            for pa in readList(a, offset):
                pt = readSU("struct packet_type", pa)
                print(pt, " (bucket=%d)" % bucket)
                ptype = ntohs(pt.type)
                pdev = pt.dev
                pfunc = addr2sym(pt.func)
                print("\ttype=0x%04x dev=0x%x func=%s" % (ptype, pdev, pfunc))
        bucket += 1
o = args = parser.parse_args()

verbose = details = o.Verbose

# Reset net_ns to default
set_ns_net()
if (o.Netns != -1):
    if (o.Netns == 'all'):
        set_ns_net('all')
    elif (not set_ns_net(o.Netns)):
        print("Invalid net ns {:#x}".format(o.Netns))
        sys.exit(0)

#__experimental = O.experimental

if (o.Pid > -1):
    tt = TaskTable()
    task = tt.getByTid(o.Pid)
    if (task):
        printTaskSockets(task)
    sys.exit(0)

tcpstate_filter = None
_tasksocktable = None
if (o.Pid == -2):
    tt = TaskTable()
    tasks = tt.allTasks()
    _tasksocktable = getAllSocks(tasks)

# IF we are interested in retransmissions, add '--tcp' automatically
tcp_retrans_only = o.retransonly
def pstree(pid=1):
    tt = TaskTable()
    init = tt.getByPid(pid)
    for s in walk_children(init, top=True):
        print(s)
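# Usage note (assumes a vmcore is loaded in the crash/pykdump environment):
# pstree() prints the process tree starting from pid 1; pstree(1234) would
# start from the hypothetical pid 1234 instead.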
def printTasks(reverse=False, maxtoprint=-1):
    tt = TaskTable()
    if (debug):
        print("Uptime:", ms2uptime(tt.basems))

    out = []
    if (not reverse):
        # Natural order (task followed by its threads)
        for mt in tt.allTasks():
            out.append((mt.Ran_ago, mt.pid, mt))
            for t in mt.threads:
                #print ("  struct thread_info 0x%x" % long(t))
                out.append((t.Ran_ago, t.pid, t))
        hdr = 'Tasks in PID order, grouped by Thread Group leader'
    else:
        # Most recent first
        for t in tt.allThreads():
            out.append((t.Ran_ago, t.pid, t))
        out.sort()
        hdr = 'Tasks in reverse order, scheduled recently first'

    # Apply the filter
    if (taskstates_filter):
        out1 = []
        for *group, t in out:
            sstate = t.state[5:7]
            if (sstate in taskstates_filter):
                out1.append((*group, t))
        out = out1

    nthreads = len(out)
    if (maxtoprint != -1 and maxtoprint < nthreads):
        # Split them 1:1
        nbeg = maxtoprint // 2
        nend = maxtoprint - nbeg
        out = out[:nbeg] + [(None, None, None)] + out[-nend:]
        extra = " ({} tasks skipped)".format(nthreads - maxtoprint)
    else:
        extra = ''

    # Print the header
    print("=== {}{} ===".format(hdr, extra))
    _header = " PID CMD CPU Ran ms ago STATE\n" +\
              "-------- ------------ -- ------------- -----"
    if (not runcmd):
        print(_header)

    for ran_ms_ago, pid, t in out:
        if (pid is None):
            print("  <snip>")
            continue
        sstate = t.state[5:7]
        tgid = t.tgid
        pid_template = " {:6d}"
        if (pid != tgid):
            if (not reverse):
                pid_template = "  {:6d}"
            extra = " (tgid=%d)" % tgid
        else:
            extra = ""
        uid = t.Uid
        pid_s = pid_template.format(pid)
        extra = "%13s UID=%d" % (extra, uid)
        if (is_task_active(long(t.ts))):
            pid_s = ">" + pid_s[1:]
        uid = t.Uid
        # Thread pointers might be corrupted
        try:
            if (runcmd):
                print(_header)
            print("%s %14s %3d %14d %s %s" \
                  % (pid_s, t.comm, t.cpu, int(ran_ms_ago), sstate, extra))
            if (runcmd):
                _cmdline = "{} {}".format(runcmd, pid)
                print("\ncrash> {}".format(_cmdline))
                out = exec_crash_command(_cmdline)
                if (": command not found: " in out):
                    sys.exit(1)
                print(out)
            # In verbose mode, print stack as well
            if (verbose):
                bt = exec_bt("bt %d" % pid)
                print(bt[0])
            if (verbose or runcmd):
                print("\n", "-" * 78, "\n", sep='')
        except crash.error:
            pylog.error("corrupted", t)
    sys.exit(0)

if (o.Taskfilter):
    taskstates_filter = re.split("\s*,\s*", o.Taskfilter)

if (o.Cmd):
    runcmd = o.Cmd

if (o.Memory):
    print_memory_stats(maxpids)
elif (o.Reverse):
    printTasks(reverse=True)
elif (o.Hang):
    taskstates_filter = 'UN'
    printTasks(reverse=True, maxtoprint=maxpids * 2)
elif (o.Summary):
    tasksSummary()
elif (o.Pstree):
    if (o.Pidinfo):
        pstree(o.Pidinfo)
    else:
        pstree()
elif (o.Pidinfo):
    find_and_print(o.Pidinfo)
elif (o.Ns):
    tt = TaskTable()
    print_namespaces_info(tt, verbose)
else:
    printTasks()
action="store_true", help="Print tree of resources owners (experimental!)") op.add_option("--saphana", dest="Saphana", default=0, action="store_true", help="Print recommendations for SAP HANA specific hangs") (o, args) = op.parse_args() v = _VERBOSE = o.Verbose _PRINT_TREE = o.Tree _MAXPIDS = o.Maxpids _SORTBYTIME = not o.Sortbypid T_table = TaskTable() _SAPHANA = check_saphana() if (o.Version): print("HANGINFO version %s" % (__version__)) sys.exit(0) if (o.Syslogger): print_wait_for_AF_UNIX(_VERBOSE) sys.exit(0) if (o.Saphana): try: from LinuxDump.SapHana import doSapHana except ImportError: