Ejemplo n.º 1
0
def check_saphana():
    """Classify the system by which SAP-related commands are in the task table.

    Both command names are looked up through ``TaskTable.getByComm``.

    Returns:
        2 if both "sapstart" and "hdbindexserver" lookups are truthy,
        1 if only the "sapstart" lookup is truthy,
        0 otherwise (including when only "hdbindexserver" is found).
    """
    tasks = TaskTable()
    sapstart = tasks.getByComm("sapstart")
    indexserver = tasks.getByComm("hdbindexserver")
    if not sapstart:
        return 0
    return 2 if indexserver else 1
Ejemplo n.º 2
0
def check_hanging_nfsd(_funcpids):
    """Warn if any nfsd thread is reported in the TASK_UNINTERRUPTIBLE state.

    Args:
        _funcpids: callable invoked as ``_funcpids("nfsd")``; its result is
            iterated as a collection of pids.

    Returns:
        True (after logging a warning through ``pylog``) when at least one of
        the looked-up tasks has ``state == "TASK_UNINTERRUPTIBLE"``,
        False otherwise.
    """
    nfsd_pids = _funcpids("nfsd")
    tasks = TaskTable()
    # Every pid is looked up; only the set of distinct states matters here.
    seen_states = {tasks.getByTid(pid).state for pid in nfsd_pids}
    if "TASK_UNINTERRUPTIBLE" not in seen_states:
        return False
    pylog.warning("Hanging nfsd threads")
    return True
Ejemplo n.º 3
0
def find_and_print(pid):
    """Print details of the task identified by *pid*.

    Args:
        pid: a thread id, or - when the value exceeds ``INT_MAX`` - the
            address of a ``struct task_struct``.

    Prints "Bogus addr" when the address does not lead to a pid known to the
    task table, and a "no task" message when the pid lookup fails.
    """
    table = TaskTable()
    if pid > INT_MAX:
        # Too large for a pid: interpret it as a 'struct task_struct' address.
        raw = readSU("struct task_struct", pid)
        # Accept the address only if the pid stored there is in the table.
        if not table.getByTid(raw.pid):
            print("Bogus addr")
            return
        task = Task(raw, table)
    else:
        task = table.getByTid(pid)

    if not task:
        print("There is no task with pid=", pid)
        return
    printTaskDetails(task)
Ejemplo n.º 4
0
def print_pstree(options):
    """Print a process tree and the number of tasks printed.

    Args:
        options: object with a ``task_id`` attribute; it is also forwarded
            unchanged to ``print_task`` and ``print_children``.

    The tree starts at the ``init_task`` symbol unless ``options.task_id`` is
    positive, in which case it starts at that task. Nothing is printed when
    the requested task cannot be found. The module-level ``pid_cnt`` is reset
    to 0 before printing and reported at the end.
    """
    global pid_cnt
    pid_cnt = 0

    root = readSymbol("init_task")
    if options.task_id > 0:
        root = TaskTable().getByPid(options.task_id)
        if root == None:
            # Not found by pid in the task table: try the fallback lookup.
            root = findTaskByPid(options.task_id)
            if root == None:
                return

    print_task(root, 0, True, options)
    print_children(root, 0, options)

    print("\n\nTotal %s tasks printed" % (pid_cnt))
Ejemplo n.º 5
0
def check_possible_hang():
    """Count long-idle uninterruptible threads and warn about a possible hang.

    Collects the pids of all threads whose ``ts.state`` has the
    ``TASK_STATE.TASK_UNINTERRUPTIBLE`` bit set, then inspects the (at most)
    ten entries with the largest ``Ran_ago`` values and counts those above
    the module-level threshold ``__OLD_AGO``.

    Returns:
        The number of inspected threads above the threshold (0..10). A
        warning is logged through ``pylog`` when that number exceeds 1.
    """
    tasks = TaskTable()
    blocked_pids = {
        th.pid for th in tasks.allThreads()
        if th.ts.state & TASK_STATE.TASK_UNINTERRUPTIBLE
    }
    # Ascending sort: the tail holds the threads that ran longest ago.
    by_age = sorted(
        (tasks.getByTid(pid).Ran_ago, pid) for pid in blocked_pids)
    n_old = sum(1 for ran_ago, _pid in by_age[-10:] if ran_ago > __OLD_AGO)
    if n_old > 1:
        pylog.warning("Possible hang")
    return n_old
Ejemplo n.º 6
0
def check_memory_pressure(_funcpids):
    """Report memory pressure based on the threads selected by ``__mp_names``.

    Args:
        _funcpids: callable invoked as ``_funcpids(__mp_names)``; its result
            must support ``len()`` and iteration over pids.

    Pressure is reported when any selected thread has
    ``state == "TASK_UNINTERRUPTIBLE"`` or when more than 20 threads were
    selected. In that case a warning is logged and a per-state count is
    printed.

    Returns:
        True when pressure is reported, False otherwise. (Previously the
        "no pressure" path fell off the end and returned None; it now returns
        False like the "no pids" path.)
    """
    subpids = _funcpids(__mp_names)
    if not subpids:
        return False
    if len(subpids) < 100:
        # Only done for small sets; verifyFastSet is defined elsewhere.
        verifyFastSet(subpids, __mp_names)

    T_table = TaskTable()
    state_counts = defaultdict(int)
    for pid in subpids:
        state_counts[T_table.getByTid(pid).state] += 1
    total = sum(state_counts.values())

    if "TASK_UNINTERRUPTIBLE" not in state_counts and total <= 20:
        return False

    pylog.warning("Memory pressure detected")
    print("  *** {} ***".format(__mp_names))
    for state, count in state_counts.items():
        print("   {:4d} in {} state".format(count, state))
    return True
Ejemplo n.º 7
0
def do_check():
    """Log ``__txt`` when an 'adclient' coredump appears tied to the abrt hook.

    The message is logged (once per matching socket path) only when all of
    the following hold:
      * among the 'adclient' threads, one has 'do_coredump' in its stack and
        one is in TASK_UNINTERRUPTIBLE state (possibly the same thread);
      * the kernel's ``core_pattern`` string starts with "|";
      * a task named 'abrt-hook-ccpp' exists and one of its UNIX sockets (or
        that socket's peer) has a path starting with
        "/var/centrifydc/daemon".
    """
    tt = TaskTable()
    saw_coredump = False
    saw_uninterruptible = False
    for task in tt.getThreadsByComm('adclient'):
        stack = exec_bt("bt {}".format(task.pid))[0]
        if stack.hasfunc('do_coredump'):
            saw_coredump = True
        if task.ts.state & TASK_STATE.TASK_UNINTERRUPTIBLE:
            saw_uninterruptible = True
        if saw_coredump and saw_uninterruptible:
            break
    else:
        # Loop finished without both conditions being met: nothing to report.
        return

    # After some commands issued, GDB returns incorrect type for this -
    # 'char core_pattern[];' instead of ' char core_pattern[CORENAME_MAX_SIZE]'
    # so the string is built from the symbol address instead.
    addr = sym2addr("core_pattern")
    core_pattern = SmartString(readmem(addr, 1), addr, None)
    if not core_pattern.startswith("|"):
        return

    abrt_hook = tt.getByComm('abrt-hook-ccpp')
    if not abrt_hook:
        return

    daemon_prefix = "/var/centrifydc/daemon"
    for sock in abrt_hook[0].get_task_socks():
        _family, _sktype, protoname, _inet = decodeSock(sock)
        if protoname != 'UNIX':
            continue
        usock = sock.castTo("struct unix_sock")
        _state, _ino, s_path = unix_sock(usock)
        _p_state, _p_ino, p_path = unix_sock(usock.Peer)
        for path in (s_path, p_path):
            if path.startswith(daemon_prefix):
                pylog.info(__txt)
Ejemplo n.º 8
0
def run_check_on_multipath():
    """Scan threads and dm devices for known device-mapper/multipath hangs.

    Steps, in order:
      1. Walk all threads; note whether a 'multipathd' command exists and
         collect a backtrace for every thread in TASK_UNINTERRUPTIBLE state.
      2. Classify the backtraces: kworkers blocked in md flush, blocked
         multipath commands, blocked scsi_wq/fc_wq work queues.
      3. Count dm devices (from the module-level ``devlist``) whose table
         target type is "multipath".
      4. Print a diagnosis for each recognised pattern, or a summary when
         nothing was recognised.

    Everything is reported via ``print``; nothing is returned.
    """
    tt = TaskTable()
    bts = []
    errors = 0
    task_cnt = 0
    multipathd_daemon = 0  # To verify if multipathd daemon is running
    multipath_blocked = 0  # To verify if multipathd daemon or command is blocked
    mpath_present = 0  # To verify if multipath device exists with or without
    # multipathd daemon running
    wq_blocked = 0  # To verify if scsi_wq or fc_wq is blocked
    kworker_md_blocked = 0  # Counter for hung worker threads which are waiting for
    # IO requests on mdraid devices

    hanginfo_hint = (
        "\n    Run 'hanginfo' for more information on processes in UN state."
    )

    print("\nChecking for device-mapper issues...\n")

    for t in tt.allThreads():
        # Progress line, rewritten in place thanks to end="\r".
        print("Getting a list of processes in UN state..."
              "(Count: {:d})".format(task_cnt),
              end="\r")
        if ('multipathd' in t.comm):
            multipathd_daemon = 1
        if (t.ts.state & TASK_STATE.TASK_UNINTERRUPTIBLE):
            task_cnt += 1
            # crash can miss some threads when there are pages missing
            # and it will not do 'bt' in that case. Stay best-effort, but do
            # not swallow KeyboardInterrupt/SystemExit as a bare except would.
            try:
                bts.append(exec_bt("bt %d" % t.pid)[0])
            except Exception:
                pass
    print("Getting a list of processes in UN state...\t\t\t[Done]")

    if (task_cnt):
        print("\nProcessing the back trace of hung tasks...\t\t\t", end='')
        for bt in bts:
            if ('kworker' in bt.cmd):
                if (bt.hasfunc('md_flush_request')
                        and bt.hasfunc('dio_aio_complete_work')):
                    kworker_md_blocked += 1

            if ('multipath' in bt.cmd):
                multipath_blocked = 1

            if (('scsi_wq' in bt.cmd) or ('fc_wq' in bt.cmd)):
                wq_blocked = 1
        print("[Done]")

    # Checks for dm devices
    for dev in devlist:
        md, name = dev
        dm_table_map = StructResult("struct dm_table", md.map)
        # Check if there is any multipath device present in device-mapper table
        if (dm_table_map.targets.type.name == "multipath"):
            mpath_present += 1

    # Check if kworker threads are stuck waiting to flush IO on mdraid devices
    if (kworker_md_blocked >= 5):
        print(
            "\n ** {} kworker threads are stuck in UN state waiting to flush the IO"
            "\n    requests on mdraid devices. This could be a result of thundering"
            "\n    herd problem. See reference: "
            "\n    https://marc.info/?l=linux-raid&m=155364683109115&w=2".
            format(kworker_md_blocked))
        print(hanginfo_hint)
        errors += 1

    # multipath devices are present but multipathd is not running
    if (mpath_present != 0 and multipathd_daemon == 0):
        print(
            "\n ** multipath device(s) are present, but multipathd service is"
            "\n    not running. IO failover/failback may not work.")
        errors += 1

    # scsi or fc work queue and multipathd are blocked
    if (multipath_blocked == 1 and wq_blocked == 1):
        print(
            "\n ** multipathd and scsi/fc work_queue processes are stuck in UN state,"
            "\n    this could block IO failover on multipath devices")
        print(hanginfo_hint)
        errors += 1
    # only multipathd process is stuck in UN state
    elif (multipath_blocked == 1):
        print("\n ** multipathd processes stuck in UN state,"
              "\n    this could block IO failover on multipath devices")
        print(hanginfo_hint)
        errors += 1

    if (errors == 0 and task_cnt != 0):
        print("\n    No device-mapper, multipath issues detected by utility,"
              "\n    but found {} processes in UN state.".format(task_cnt))
        print(hanginfo_hint)
    elif (errors == 0 and task_cnt == 0):
        print("No issues detected by utility.")
Ejemplo n.º 9
0
def print_wait_for_AF_UNIX(v=0):
    """Report AF_UNIX sockets that have tasks waiting on their peer wait queue.

    Args:
        v: verbosity. ``v < 1`` skips sockets whose youngest owner ran less
            than 5 seconds ago and, for "/dev/log", logs a syslog warning;
            ``v < 0`` additionally stops after the first such warning;
            ``v > 0`` lists every waiting task.

    Part I walks the file descriptors of every task and records, for each
    AF_UNIX socket, its owners and (if present) its peer. Part II decodes the
    wait queue of every peer and prints/logs what is waiting for it.
    """
    tt = TaskTable()
    # If the youngest owner ran longer ago than this (seconds), report it.
    max_time = 5

    # Part I - find all Unix sockets with peers
    peers_dict = defaultdict(list)  # peer-> (task, sock) list
    socks_dict = defaultdict(list)  # sock-> owners
    for t in tt.allTasks():
        try:
            task_fds = t.taskFds()
        except crash.error:
            # page excluded
            continue
        last_ran = float(t.Ran_ago) / 1000
        for fd, filep, dentry, inode in task_fds:
            socketaddr = proto.inode2socketaddr(inode)
            if (not socketaddr):
                continue

            socket_su = readSU("struct socket", socketaddr)
            sock = Deref(socket_su.sk)
            family, sktype, protoname, inet = proto.decodeSock(sock)
            if (family != proto.P_FAMILIES.PF_FILE):
                continue

            # AF_UNIX. on 2.4 we have just 'struct sock',
            # on 2.6 'struct unix_sock'
            if (not proto.sock_V1):
                sock = sock.castTo("struct unix_sock")

            # Result is not used here; the call is kept from the original.
            state, ino, path = proto.unix_sock(sock)
            socks_dict[sock].append((last_ran, t))
            # Check whether we have a peer
            peer = sock.Peer
            if (peer):
                peers_dict[peer].append((t, sock))

    # Part II - look at all peers
    for peer in peers_dict:
        state, ino, path = proto.unix_sock(peer)
        # The wait queue lives in different members depending on the kernel.
        try:
            waitq = peer.peer_wait
        except Exception:
            waitq = peer.peer_wq.wait
        tasklist = decode_waitq(waitq)
        if (not tasklist):
            continue

        owners = sorted(socks_dict[peer])
        if (not owners):
            pylog.warning("Cannot find a socket for peer {}".format(peer))
            continue
        # Smallest last_ran first, i.e. the owner that ran most recently.
        last_ran, t = owners[0]

        # Sanitize tasklist in case it has corrupted pointers. The loop
        # variable must not be called 'tt' - that is the TaskTable above.
        sane_tasks = []
        pids = []
        for waiter in tasklist:
            try:
                waiter_pid = waiter.pid
            except Exception:
                pylog.warning("Corrupted waitq of", peer)
                continue
            pids.append(waiter_pid)
            sane_tasks.append(waiter)
        tasklist = sane_tasks

        stack = t.get_task_stack()
        if (stack.hasfunc("console_lock")):
            pylog.warning("Syslog daemon is waiting for serial console")
            print(stack)
        if (v < 1 and last_ran < max_time):
            continue
        if (v < 1 and path == "/dev/log"):
            # Just issue warnings
            msg = ("A problem with syslog daemon <{}> pid={} state={}\n"
                   "       It ran {:5.2f}s ago and {} processes"
                   " are waiting for it"
                   "".format(t.comm, t.pid, t.state[5:7], last_ran,
                             len(tasklist)))
            if (v < 0):
                msg += ("\n       Run 'hanginfo --syslogger -v' to get"
                        " more details")
            if (t.pid in pids):
                msg += ("\n       Task pid={} CMD={} is waiting for"
                        " its own socket".format(t.pid, t.comm))

            pylog.warning(msg)
            if (v < 0):
                return

        print(" -- Socket we wait for: {} {}".format(peer, path))
        print("   Youngest process with this socket <{}> pid={}({}) ran "
              "{:5.2f}s ago".format(t.comm, t.pid, t.state[5:7], last_ran))
        # Tasklist has been already sanitized getting rid of those
        # elements where we have been unable to deref task.pid
        if (tasklist):
            print("   ...  {} tasks waiting for this socket".format(
                len(tasklist)))

            if (v > 0):
                for task in sorted(tasklist,
                                   key=operator.attrgetter('pid')):
                    print("     pid=%7d   CMD=%s" % (task.pid, task.comm))
        else:
            print(
                "     cannot print tasks as socket wait queue is corrupted"
            )
Ejemplo n.º 10
0
def print_pidlist(pids,
                  title='',
                  verbose=False,
                  maxpids=10,
                  sortbytime=True,
                  statefilter=None):
    """Print a summary, and optionally a listing, of the given pids.

    Args:
        pids: sized collection of thread ids to look up in the task table.
        title: passed through to ``__print_pids`` (defined elsewhere).
        verbose: when sorting by time, print a table row per pid instead of
            delegating to ``__print_pids``.
        maxpids: maximum number of pids to list; values below 1 print only
            the summary line. When more pids are selected, the first and
            last halves are shown with a marker in between.
        sortbytime: list by "ran ago" (youngest first) if true, otherwise by
            pid.
        statefilter: optional container of two-character state codes
            (``t.state[5:7]``); when given, only matching pids are selected.

    All counts, the truncation and the pid-sorted listing operate on the
    pids that passed ``statefilter``. Previously they used the unfiltered
    input, which could list the same pid twice and report a wrong "skipped"
    number. Without a filter the output is unchanged.
    """
    if (len(pids) < 1):
        return

    # Build a (ran_ago, pid) list for the selected pids and remember the task
    # objects so the verbose listing does not need a second lookup.
    T_table = TaskTable()
    tasks = {}
    mlist = []
    for pid in pids:
        t = T_table.getByTid(pid)
        if (statefilter and t.state[5:7] not in statefilter):
            continue
        tasks[pid] = t
        mlist.append((int(t.Ran_ago), pid))
    mlist.sort()
    if (not mlist):
        print("  no Pids")
        return

    npids = len(mlist)
    # Youngest and oldest
    ago_y, pid_y = mlist[0]
    ago_o, pid_o = mlist[-1]

    print("    ... {} pids. Youngest,oldest: {}, {}  Ran ms ago:"
          " {}, {}".format(npids, pid_y, pid_o, ago_y, ago_o))

    if (maxpids < 1):
        return
    truncated = npids > maxpids
    # Head/tail sizes used when the listing has to be shortened.
    n1 = maxpids // 2
    n2 = maxpids - n1
    if (truncated):
        print("        printing {} out of {}".format(maxpids, npids))

    if (sortbytime):
        # ............. sorted by time ................................
        if (truncated):
            skipped = npids - maxpids
            mlp = (mlist[:n1] + [(None, "<{} skipped>".format(skipped))] +
                   mlist[-n2:])
        else:
            mlp = mlist
        if (verbose):
            print("     PID          CMD       CPU   Ran ms ago   STATE")
            print("    --------   ------------  --  ------------- -----")
            for tago, pid in mlp:
                if (tago is None):
                    # Marker row: 'pid' holds the "<N skipped>" text.
                    print("             {}".format(pid))
                    continue
                t = tasks[pid]
                comm = str(t.comm)[:11]
                state = t.state[5:7]
                print("    {:8d}  {:12s}  {:3d} {:10d}      {}".format(
                    pid, comm, t.cpu, tago, state))
        else:
            print("        sorted by ran_ago, youngest first")
            __print_pids([pid for (_ago, pid) in mlp], title)

    else:
        # ............. sorted by pid .................................
        selected = sorted(pid for (_ago, pid) in mlist)
        if (truncated):
            mlp = selected[:n1] + ["..."] + selected[-n2:]
        else:
            mlp = selected
        print("        sorted by pid")
        __print_pids(mlp, title)
Ejemplo n.º 11
0
def print_dev_pack():
    """Print the packet_type handlers registered in ptype_all and ptype_base.

    Two kernel layouts are handled: "new style", where ``ptype_base`` has
    ctype ``struct list_head`` and entries are linked through the embedded
    ``list`` member, and the older (2.4) layout, where entries are linked
    through a ``next`` pointer.

    For new-style ``ptype_all`` entries whose handler is ``packet_rcv`` or
    ``packet_rcv_spkt`` and whose ``af_packet_priv`` is non-zero, the tasks
    holding the socket's file are printed as well.

    Bucket numbers come from ``enumerate`` so that empty buckets on the old
    layout still advance the index (the previous manual counter was skipped
    by ``continue``).
    """
    ptype_all = readSymbol("ptype_all")
    # For 2.4 packet_type has next pointer, for 2.6 list_head is embedded
    newstyle = (whatis("ptype_base").ctype == "struct list_head")
    offset = member_offset("struct packet_type",
                           "list" if newstyle else "next")

    def show(pt, suffix=None):
        # Print one packet_type entry and return its handler's symbol name.
        if (suffix is None):
            print(pt)
        else:
            print(pt, suffix)
        ptype = ntohs(pt.type)
        pdev = pt.dev
        pfunc = addr2sym(pt.func)
        print("\ttype=0x%04x dev=0x%x func=%s" % (ptype, pdev, pfunc))
        return pfunc

    print("--------ptype_all-------------------------------------------")
    tt = TaskTable()
    if (newstyle):
        for pt in readSUListFromHead(Addr(ptype_all), "list",
                                     "struct packet_type"):
            pfunc = show(pt)

            # for SOCK_PACKET and AF_PACKET we can find PID
            if (pt.af_packet_priv == 0):
                continue

            if (pfunc in ('packet_rcv', 'packet_rcv_spkt')):
                sock = readSU("struct sock", pt.af_packet_priv)
                filep = Deref(sock.sk_socket).file
                for t in tt.getByFile(filep):
                    print("\t    pid=%d, command=%s" % (t.pid, t.comm))
    else:
        # 2.4
        for pa in readList(ptype_all, offset):
            show(readSU("struct packet_type", pa))

    print("\n--------ptype_base-------------------------------------------")
    for bucket, a in enumerate(readSymbol("ptype_base")):
        suffix = " (bucket=%d)" % bucket
        if (newstyle):
            for pt in readSUListFromHead(Addr(a), "list",
                                         "struct packet_type"):
                show(pt, suffix)
        else:
            # 2.4
            if (a == 0):
                continue
            for pa in readList(a, offset):
                show(readSU("struct packet_type", pa), suffix)
Ejemplo n.º 12
0
    o = args = parser.parse_args()

    verbose = details = o.Verbose

    # Reset net_ns to default
    set_ns_net()
    if (o.Netns != -1):
        if (o.Netns == 'all'):
            set_ns_net('all')
        elif (not set_ns_net(o.Netns)):
            print("Invalid net ns {:#x}".format(o.Netns))
            sys.exit(0)

    #__experimental = O.experimental
    if (o.Pid > -1):
        tt = TaskTable()
        task = tt.getByTid(o.Pid)

        if (task):
            printTaskSockets(task)
        sys.exit(0)

    tcpstate_filter = None
    _tasksocktable = None
    if (o.Pid == -2):
        tt = TaskTable()
        tasks = tt.allTasks()
        _tasksocktable = getAllSocks(tasks)

    # IF we are interested in retransmissions, add '--tcp' automatically
    tcp_retrans_only = o.retransonly
Ejemplo n.º 13
0
def pstree(pid=1):
    """Print the process tree rooted at *pid* (default 1), one line at a time.

    The root task is looked up with ``TaskTable.getByPid`` and every item
    produced by ``walk_children(root, top=True)`` is printed as it arrives.
    """
    root = TaskTable().getByPid(pid)
    for line in walk_children(root, top=True):
        print(line)
Ejemplo n.º 14
0
def printTasks(reverse=False, maxtoprint=-1):
    """Print a table of tasks/threads, optionally filtered and truncated.

    Args:
        reverse: if false, list each task followed by its threads in the
            order the task table yields them; if true, list all threads
            sorted by ``Ran_ago`` ascending (most recently run first).
        maxtoprint: maximum number of rows; any negative value (default -1)
            means no limit. When the list is longer, the first and last
            halves are kept with a "<snip>" row in between. 0 now prints only
            the "<snip>" row (it used to print everything), and negative
            values other than -1 no longer produce arbitrary slices.

    Reads the module-level settings ``debug``, ``taskstates_filter``,
    ``runcmd`` and ``verbose``. With ``runcmd`` set, the crash command
    "<runcmd> <pid>" is executed for every row and its output printed; the
    process exits with status 1 if crash reports the command as not found.
    """
    tt = TaskTable()
    if (debug):
        print("Uptime:", ms2uptime(tt.basems))

    rows = []
    if (not reverse):
        # Natural order (task followed by its threads)
        for mt in tt.allTasks():
            rows.append((mt.Ran_ago, mt.pid, mt))
            for t in mt.threads:
                rows.append((t.Ran_ago, t.pid, t))
        hdr = 'Tasks in PID order, grouped by Thread Group leader'
    else:
        # Most recent first
        for t in tt.allThreads():
            rows.append((t.Ran_ago, t.pid, t))
        rows.sort()
        hdr = 'Tasks in reverse order, scheduled recently first'

    # Apply the filter
    if (taskstates_filter):
        rows = [row for row in rows
                if row[2].state[5:7] in taskstates_filter]

    nthreads = len(rows)
    if (0 <= maxtoprint < nthreads):
        # Split them 1:1. The tail is sliced from an explicit start index
        # because rows[-0:] would be the whole list.
        nbeg = maxtoprint // 2
        nend = maxtoprint - nbeg
        rows = (rows[:nbeg] + [(None, None, None)] +
                rows[nthreads - nend:])
        skipped_note = " ({} tasks skipped)".format(nthreads - maxtoprint)
    else:
        skipped_note = ''

    # Print the header
    print("=== {}{} ===".format(hdr, skipped_note))
    _header = " PID          CMD       CPU   Ran ms ago   STATE\n" +\
    "--------   ------------  --  ------------- -----"
    if (not runcmd):
        print(_header)

    for ran_ms_ago, pid, t in rows:
        if (pid is None):
            print("           <snip>")
            continue
        sstate = t.state[5:7]
        tgid = t.tgid
        pid_template = " {:6d}"
        if (pid != tgid):
            # A thread that is not the group leader: indent it further in
            # the grouped listing and show which group it belongs to.
            if (not reverse):
                pid_template = "  {:6d}"
            extra = " (tgid=%d)" % tgid
        else:
            extra = ""
        pid_s = pid_template.format(pid)
        extra = "%13s UID=%d" % (extra, t.Uid)
        if (is_task_active(long(t.ts))):
            # Replace the leading blank with '>' for active tasks.
            pid_s = ">" + pid_s[1:]

        # Thread pointers might be corrupted
        try:
            if (runcmd):
                # With per-task command output the header is repeated per row.
                print(_header)
            print("%s %14s %3d %14d  %s %s"
                  % (pid_s, t.comm, t.cpu,
                     int(ran_ms_ago), sstate, extra))
            if (runcmd):
                _cmdline = "{} {}".format(runcmd, pid)
                print("\ncrash> {}".format(_cmdline))
                cmd_output = exec_crash_command(_cmdline)
                if (": command not found: " in cmd_output):
                    sys.exit(1)
                print(cmd_output)
            # In verbose mode, print stack as well
            if (verbose):
                bt = exec_bt("bt %d" % pid)
                print(bt[0])
            if (verbose or runcmd):
                print("\n", "-" * 78, "\n", sep='')

        except crash.error:
            pylog.error("corrupted", t)
Ejemplo n.º 15
0
        sys.exit(0)

    if (o.Taskfilter):
        taskstates_filter = re.split("\s*,\s*", o.Taskfilter)

    if (o.Cmd):
        runcmd = o.Cmd

    if (o.Memory):
        print_memory_stats(maxpids)

    elif (o.Reverse):
        printTasks(reverse=True)
    elif (o.Hang):
        taskstates_filter = 'UN'
        printTasks(reverse=True, maxtoprint=maxpids * 2)
    elif (o.Summary):
        tasksSummary()
    elif (o.Pstree):
        if (o.Pidinfo):
            pstree(o.Pidinfo)
        else:
            pstree()
    elif (o.Pidinfo):
        find_and_print(o.Pidinfo)
    elif (o.Ns):
        tt = TaskTable()
        print_namespaces_info(tt, verbose)
    else:
        printTasks()
Ejemplo n.º 16
0
                  action="store_true",
                  help="Print tree of resources owners  (experimental!)")

    op.add_option("--saphana",
                  dest="Saphana",
                  default=0,
                  action="store_true",
                  help="Print recommendations for SAP HANA specific hangs")

    (o, args) = op.parse_args()

    v = _VERBOSE = o.Verbose
    _PRINT_TREE = o.Tree
    _MAXPIDS = o.Maxpids
    _SORTBYTIME = not o.Sortbypid
    T_table = TaskTable()
    _SAPHANA = check_saphana()

    if (o.Version):
        print("HANGINFO version %s" % (__version__))
        sys.exit(0)

    if (o.Syslogger):
        print_wait_for_AF_UNIX(_VERBOSE)
        sys.exit(0)

    if (o.Saphana):
        try:
            from LinuxDump.SapHana import doSapHana
        except ImportError: