def status(nodes):
    util.output("%-10s %-10s %-10s %-13s %-6s %-6s %-20s " % ("Name", "Type", "Host", "Status", "Pid", "Peers", "Started"))

    all = isRunning(nodes)
    running = []

    cmds1 = []
    cmds2 = []
    for (node, isrunning) in all:
        if isrunning:
            running += [node]
            cmds1 += [(node, "cat-file", ["%s/.startup" % node.cwd()])]
            cmds2 += [(node, "cat-file", ["%s/.status" % node.cwd()])]

    startups = execute.runHelperParallel(cmds1)
    statuses = execute.runHelperParallel(cmds2)

    startups = dict([(n.tag, success and util.fmttime(output[0]) or "???") for (n, success, output) in startups])
    statuses = dict([(n.tag, success and output[0].split()[0].lower() or "???") for (n, success, output) in statuses])

    peers = {}
    nodes = [n for n in running if statuses[n.tag] == "running"]
    for (node, success, args) in _queryPeerStatus(nodes):
        if success:
            peers[node.tag] = []
            for f in args[0].split():
                (key, val) = f.split("=")
                if key == "peer" and val != "":
                    peers[node.tag] += [val]
        else:
            peers[node.tag] = None

    for (node, isrunning) in all:

        util.output("%-10s " % node.tag, nl=False)
        util.output("%-10s %-10s " % (node.type, node.host), nl=False)

        if isrunning:
            util.output("%-13s " % statuses[node.tag], nl=False)
        elif node.hasCrashed():
            util.output("%-13s " % "crashed", nl=False)
        else:
            util.output("%-13s " % "stopped", nl=False)

        if isrunning:
            util.output("%-6s " % node.getPID(), nl=False)

            if node.tag in peers and peers[node.tag] != None:
                util.output("%-6d " % len(peers[node.tag]), nl=False)
            else:
                util.output("%-6s " % "???", nl=False)

            util.output("%-8s " % startups[node.tag], nl=False)

        util.output()
def status(nodes):
    util.output("%-10s %-10s %-10s %-13s %-6s %-6s %-20s " % ("Name", "Type", "Host", "Status", "Pid", "Peers", "Started"))

    all = isRunning(nodes)
    running = []

    cmds1 = []
    cmds2 = []
    for (node, isrunning) in all:
        if isrunning:
            running += [node]
            cmds1 += [(node, "cat-file", ["%s/.startup" % node.cwd()])]
            cmds2 += [(node, "cat-file", ["%s/.status" % node.cwd()])]

    startups = execute.runHelperParallel(cmds1)
    statuses = execute.runHelperParallel(cmds2)

    startups = dict([(n.name, success and util.fmttime(output[0]) or "???") for (n, success, output) in startups])
    statuses = dict([(n.name, success and output[0].split()[0].lower() or "???") for (n, success, output) in statuses])

    peers = {}
    nodes = [n for n in running if statuses[n.name] == "running"]
    for (node, success, args) in _queryPeerStatus(nodes):
        if success:
            peers[node.name] = []
            for f in args[0].split():
                keyval = f.split("=")
                if len(keyval) > 1:
                    (key, val) = keyval
                    if key == "peer" and val != "":
                        peers[node.name] += [val]
        else:
            peers[node.name] = None

    for (node, isrunning) in all:

        util.output("%-10s " % node.name, nl=False)
        util.output("%-10s %-10s " % (node.type, node.host), nl=False)

        if isrunning:
            util.output("%-13s " % statuses[node.name], nl=False)
        elif node.hasCrashed():
            util.output("%-13s " % "crashed", nl=False)
        else:
            util.output("%-13s " % "stopped", nl=False)

        if isrunning:
            util.output("%-6s " % node.getPID(), nl=False)

            if node.name in peers and peers[node.name] != None:
                util.output("%-6d " % len(peers[node.name]), nl=False)
            else:
                util.output("%-6s " % "???", nl=False)

            util.output("%-8s " % startups[node.name], nl=False)

        util.output()
def isRunning(nodes, setcrashed=True):
    results = []
    cmds = []

    for node in nodes:
        pid = node.getPID()
        if not pid:
            results += [(node, False)]
            continue

        cmds += [(node, "check-pid", [str(pid)])]

    for (node, success, output) in execute.runHelperParallel(cmds):

        # If we cannot connect to the host at all, we filter it out because
        # the process might actually still be running but we can't tell.
        if output == None:
            if config.Config.cron == "0":
                util.warn("cannot connect to %s" % node.name)
            continue

        results += [(node, success)]

        if not success:
            if setcrashed:
                # Grmpf. It crashed.
                node.clearPID()
                node.setCrashed()

    return results
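# Example (sketch, not part of the original module): report which nodes are
# down, using isRunning() above. `nodes` is assumed to be the same iterable
# of node objects used throughout this module; the helper name is made up
# for illustration only.
def _exampleReportDown(nodes):
    for (node, isrunning) in isRunning(nodes, setcrashed=False):
        if not isrunning:
            util.output("%s is not running" % node.name)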
def getDf(nodes):
    dirs = ("logdir", "bindir", "helperdir", "cfgdir", "spooldir", "policydir", "libdir", "tmpdir", "staticdir", "scriptsdir")

    df = {}

    for node in nodes:
        df[node.tag] = {}

    for dir in dirs:
        path = config.Config.config[dir]

        cmds = []
        for node in nodes:
            cmds += [(node, "df", [path])]

        results = execute.runHelperParallel(cmds)

        for (node, success, output) in results:
            if success:
                fields = output[0].split()
                # Ignore NFS mounted volumes.
                if fields[0].find(":") < 0:
                    df[node.tag][fields[0]] = fields

    result = {}
    for node in df:
        result[node] = df[node].values()

    return result
def getCapstatsOutput(nodes, interval):

    if not config.Config.capstats:
        if config.Config.cron == "0":
            util.warn("do not have capstats binary available")
        return []

    results = []
    cmds = []

    hosts = {}
    for node in nodes:
        try:
            hosts[(node.addr, node.interface)] = node
        except AttributeError:
            continue

    for (addr, interface) in hosts.keys():
        node = hosts[addr, interface]

        capstats = [config.Config.capstats, "-i", interface, "-I", str(interval), "-n", "1"]

        # Unfinished feature: only consider a particular MAC. Works here for capstats
        # but Bro config is not adapted currently so we disable it for now.
        # try:
        #     capstats += ["-f", "\\'", "ether dst %s" % node.ether, "\\'"]
        # except AttributeError:
        #     pass

        cmds += [(node, "run-cmd", capstats)]

    outputs = execute.runHelperParallel(cmds)

    for (node, success, output) in outputs:
        if not success:
            results += [(node, "%s: cannot execute capstats" % node.tag, {})]
            continue

        fields = output[0].split()

        vals = {}

        try:
            for field in fields[1:]:
                (key, val) = field.split("=")
                vals[key] = float(val)

            results += [(node, None, vals)]

        except ValueError:
            results += [(node, "%s: unexpected capstats output: %s" % (node.tag, output[0]), {})]

    return results
def getDf(nodes):
    hadError = False
    dirs = ("logdir", "bindir", "helperdir", "cfgdir", "spooldir", "policydir", "libdir", "tmpdir", "staticdir", "scriptsdir")

    df = {}

    for node in nodes:
        df[node.name] = {}

    for dir in dirs:
        path = config.Config.config[dir]

        cmds = []
        for node in nodes:
            if dir == "logdir" and node.type != "manager":
                # Don't need this on the workers/proxies.
                continue

            cmds += [(node, "df", [path])]

        results = execute.runHelperParallel(cmds)

        for (node, success, output) in results:
            if success:
                if output:
                    fields = output[0].split()
                    # Ignore NFS mounted volumes.
                    if fields[0].find(":") < 0:
                        df[node.name][fields[0]] = fields
                else:
                    util.output("error checking disk space on node '%s': no df output" % node)
                    hadError = True
            else:
                if output:
                    msg = output[0]
                else:
                    msg = "unknown failure"
                util.output("error checking disk space on node '%s': %s" % (node, msg))
                hadError = True

    result = {}
    for node in df:
        result[node] = df[node].values()

    return (hadError, result)
def executeCmd(nodes, cmd):
    for special in "|'\"":
        cmd = cmd.replace(special, "\\" + special)

    cmds = [(n, "run-cmd", [cmd]) for n in nodes]

    for (node, success, output) in execute.runHelperParallel(cmds):
        util.output("[%s] %s\n> %s" % (node.host, (success and " " or "error"), "\n> ".join(output)))
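# Example (sketch): run a shell command on every node via executeCmd() above.
# The command is escaped by executeCmd() itself before being handed to the
# run-cmd helper; "df -h" is only an illustrative command.
def _exampleDiskUsage(nodes):
    executeCmd(nodes, "df -h")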
def attachGdb(nodes):
    running = isRunning(nodes)

    cmds = []
    for (node, isrunning) in running:
        if isrunning:
            cmds += [(node, "gdb-attach", ["gdb-%s" % node.name, config.Config.bro, str(node.getPID())])]

    results = execute.runHelperParallel(cmds)
    for (node, success, output) in results:
        if success:
            util.output("gdb attached on %s" % node.name)
        else:
            util.output("cannot attach gdb on %s: %s" % (node.name, output))
def getDf(nodes):
    dirs = ("logdir", "bindir", "helperdir", "cfgdir", "spooldir", "policydir", "libdir", "tmpdir", "staticdir", "scriptsdir")

    df = {}
    for node in nodes:
        df["%s/%s" % (node.name, node.host)] = {}

    for dir in dirs:
        path = config.Config.config[dir]

        cmds = []
        for node in nodes:
            if dir == "logdir" and node.type != "manager":
                # Don't need this on the workers/proxies.
                continue

            cmds += [(node, "df", [path])]

        results = execute.runHelperParallel(cmds)

        for (node, success, output) in results:
            nodehost = "%s/%s" % (node.name, node.host)
            if success:
                if output:
                    fields = output[0].split()
                    # Ignore NFS mounted volumes.
                    if fields[0].find(":") < 0:
                        total = float(fields[1])
                        used = float(fields[2])
                        avail = float(fields[3])
                        perc = used * 100.0 / (used + avail)
                        df[nodehost][fields[0]] = [fields[0], total, used, avail, perc]
                else:
                    df[nodehost]["FAIL"] = ["FAIL", "no output from df helper"]
            else:
                if output:
                    msg = output[0]
                else:
                    msg = "unknown failure"
                df[nodehost]["FAIL"] = ["FAIL", msg]

    result = []
    for node in nodes:
        nodehost = "%s/%s" % (node.name, node.host)
        result.append((nodehost, df[nodehost].values()))

    return result
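# Example (sketch): print the usage percentage computed by the getDf()
# variant directly above, which returns a list of (node/host, rows) pairs
# where each non-FAIL row is [filesystem, total, used, avail, percent].
# The helper name is made up for illustration only.
def _examplePrintDf(nodes):
    for (nodehost, rows) in getDf(nodes):
        for fields in rows:
            if fields[0] == "FAIL":
                util.output("%s: %s" % (nodehost, fields[1]))
            else:
                util.output("%s: %s is %.1f%% full" % (nodehost, fields[0], fields[4]))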
def _makeCrashReports(nodes):
    cmds = []
    for node in nodes:
        cmds += [(node, "run-cmd", [os.path.join(config.Config.scriptsdir, "post-terminate"), node.cwd(), "crash"])]

    for (node, success, output) in execute.runHelperParallel(cmds):
        if not success:
            util.output("cannot run post-terminate for %s" % node.tag)
        else:
            util.sendMail("Crash report from %s" % node.tag, "\n".join(output))

        node.clearCrashed()
def _makeCrashReports(nodes):
    for n in nodes:
        plugin.Registry.broProcessDied(n)

    cmds = []
    for node in nodes:
        cmds += [(node, "run-cmd", [os.path.join(config.Config.scriptsdir, "post-terminate"), node.cwd(), "crash"])]

    for (node, success, output) in execute.runHelperParallel(cmds):
        if not success:
            util.output("cannot run post-terminate for %s" % node.name)
        else:
            util.sendMail("Crash report from %s" % node.name, "\n".join(output))

        node.clearCrashed()
def attachGdb(nodes):
    running = isRunning(nodes)

    cmds = []
    for (node, isrunning) in running:
        if isrunning:
            cmds += [(node, "gdb-attach", ["gdb-%s" % node.tag, config.Config.bro, node.getPID()])]

    results = execute.runHelperParallel(cmds)
    for (node, success, output) in results:
        if success:
            util.output("gdb attached on %s" % node.tag)
        else:
            util.output("cannot attach gdb on %s: %s" % (node.tag, output))
def _makeCrashReports(nodes):
    for n in nodes:
        plugin.Registry.broProcessDied(n)

    msg = "If you want to help us debug this problem, then please forward\nthis mail to [email protected]\n"

    cmds = []
    for node in nodes:
        cmds += [(node, "run-cmd", [os.path.join(config.Config.scriptsdir, "post-terminate"), node.cwd(), "crash"])]

    for (node, success, output) in execute.runHelperParallel(cmds):
        if not success:
            util.output("cannot run post-terminate for %s" % node.name)
        else:
            util.sendMail("Crash report from %s" % node.name, msg + "\n".join(output))

        node.clearCrashed()
def getDf(nodes):
    dirs = ("logdir", "bindir", "helperdir", "cfgdir", "spooldir", "policydir", "libdir", "tmpdir", "staticdir", "scriptsdir")

    df = {}

    for node in nodes:
        df[node.name] = {}

    for dir in dirs:
        path = config.Config.config[dir]

        cmds = []
        for node in nodes:
            if dir == "logdir" and node.type != "manager":
                # Don't need this on the workers/proxies.
                continue

            cmds += [(node, "df", [path])]

        results = execute.runHelperParallel(cmds)

        for (node, success, output) in results:
            if success:
                if len(output) > 0:
                    fields = output[0].split()
                    # Ignore NFS mounted volumes.
                    if fields[0].find(":") < 0:
                        df[node.name][fields[0]] = fields
                else:
                    util.warn("Invalid df output for node '%s'." % node)

    result = {}
    for node in df:
        result[node] = df[node].values()

    return result
def getCapstatsOutput(nodes, interval):

    if not config.Config.capstatspath:
        if config.Config.cron == "0":
            util.warn("do not have capstats binary available")
        return []

    results = []
    cmds = []

    hosts = {}
    for node in nodes:
        try:
            hosts[(node.addr, node.interface)] = node
        except AttributeError:
            continue

    for (addr, interface) in hosts.keys():
        node = hosts[addr, interface]

        capstats = [config.Config.capstatspath, "-i", interface, "-I", str(interval), "-n", "1"]

        # Unfinished feature: only consider a particular MAC. Works here for capstats
        # but Bro config is not adapted currently so we disable it for now.
        # try:
        #     capstats += ["-f", "\\'", "ether dst %s" % node.ether, "\\'"]
        # except AttributeError:
        #     pass

        cmds += [(node, "run-cmd", capstats)]

    outputs = execute.runHelperParallel(cmds)

    totals = {}

    for (node, success, output) in outputs:
        if not success:
            results += [(node, "%s: cannot execute capstats" % node.name, {})]
            continue

        fields = output[0].split()

        vals = {}

        try:
            for field in fields[1:]:
                (key, val) = field.split("=")
                val = float(val)
                vals[key] = val

                try:
                    totals[key] += val
                except KeyError:
                    totals[key] = val

            results += [(node, None, vals)]

        except ValueError:
            results += [(node, "%s: unexpected capstats output: %s" % (node.name, output[0]), {})]

    # Add pseudo-node for totals
    if len(nodes) > 1:
        results += [(node_mod.Node("$total"), None, totals)]

    return results
def waitForBros(nodes, status, timeout, ensurerunning):

    # If ensurerunning is true, process must still be running.
    if ensurerunning:
        running = isRunning(nodes)
    else:
        running = [(node, True) for node in nodes]

    results = []

    # Determine set of nodes still to check.
    todo = {}
    for (node, isrunning) in running:
        if isrunning:
            todo[node.name] = node
        else:
            results += [(node, False)]

    more_than_one = (len(todo) > 1)
    points = False

    while True:
        # Determine whether process is still running. We need to do this
        # before we get the state to avoid a race condition.
        running = isRunning(todo.values(), setcrashed=False)

        # Check nodes' .status file
        cmds = []
        for node in todo.values():
            cmds += [(node, "cat-file", ["%s/.status" % node.cwd()])]

        for (node, success, output) in execute.runHelperParallel(cmds):
            if success:
                try:
                    (stat, loc) = output[0].split()
                    if status in stat:
                        # Status reached. Cool.
                        del todo[node.name]
                        results += [(node, True)]
                except IndexError:
                    # Something's wrong. We give up on that node.
                    del todo[node.name]
                    results += [(node, False)]

        for (node, isrunning) in running:
            if node.name in todo and not isrunning:
                # Alright, a dead node's status will not change anymore.
                del todo[node.name]
                results += [(node, False)]

        if len(todo) == 0:
            # All done.
            break

        # Wait a bit before we start over.
        time.sleep(1)

        # Timeout reached?
        timeout -= 1
        if timeout <= 0:
            break

        if more_than_one:
            util.output("%d " % len(todo), nl=False)
        else:
            util.output(".", nl=False)

        points = True

    for node in todo.values():
        # These did time-out.
        results += [(node, False)]

    if points:
        if more_than_one:
            util.output("%d " % len(todo))
        else:
            util.output("")

    return results
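# Example (sketch): wait up to 30 seconds for the given nodes to reach the
# RUNNING state, mirroring how _startNodes() below uses waitForBros(). The
# helper name and the 30-second value are illustrative assumptions.
def _exampleWaitRunning(nodes):
    ok = [node for (node, success) in waitForBros(nodes, "RUNNING", 30, True) if success]
    util.output("%d of %d nodes are running" % (len(ok), len(nodes)))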
def getTopOutput(nodes):

    results = []
    cmds = []

    running = isRunning(nodes)

    # Get all the PIDs first.

    pids = {}
    parents = {}

    for (node, isrunning) in running:
        if isrunning:
            pid = node.getPID()
            pids[node.name] = [pid]
            parents[node.name] = str(pid)

            cmds += [(node, "get-childs", [str(pid)])]
        else:
            results += [(node, "not running", [{}])]
            continue

    if not cmds:
        return results

    for (node, success, output) in execute.runHelperParallel(cmds):

        if not success:
            results += [(node, "cannot get child pids", [{}])]
            continue

        pids[node.name] += [int(line) for line in output]

    cmds = []
    hosts = {}

    # Now run top once per host.
    for node in nodes:  # Do the loop again to keep the order.
        if node.name not in pids:
            continue

        if node.host in hosts:
            continue

        hosts[node.host] = 1

        cmds += [(node, "top", [])]

    if not cmds:
        return results

    res = {}
    for (node, success, output) in execute.runHelperParallel(cmds):
        res[node.host] = (success, output)

    # Gather results for all the nodes that are running
    for node in nodes:
        if node.name not in pids:
            continue

        success, output = res[node.host]

        if not success or not output:
            results += [(node, "cannot get top output", [{}])]
            continue

        procs = [line.split() for line in output if int(line.split()[0]) in pids[node.name]]

        if not procs:
            # It's possible that the process is no longer there.
            results += [(node, "not running", [{}])]
            continue

        vals = []

        try:
            for p in procs:
                d = {}
                d["pid"] = int(p[0])
                d["proc"] = (p[0] == parents[node.name] and "parent" or "child")
                d["vsize"] = long(float(p[1]))  # May be something like 2.17684e+9
                d["rss"] = long(float(p[2]))
                d["cpu"] = p[3]
                d["cmd"] = " ".join(p[4:])
                vals += [d]
        except ValueError, err:
            results += [(node, "unexpected top output: %s" % err, [{}])]
            continue

        results += [(node, None, vals)]

    return results
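# Example (sketch): summarize resident memory per node from getTopOutput()
# above. Each entry in `vals` is a dict with "pid", "proc", "vsize", "rss",
# "cpu" and "cmd" keys, as built in the loop above; the helper name is an
# illustrative assumption.
def _examplePrintRss(nodes):
    for (node, error, vals) in getTopOutput(nodes):
        if error:
            util.output("%s: %s" % (node.name, error))
        else:
            rss = sum([d["rss"] for d in vals])
            util.output("%s: %d processes, rss=%d" % (node.name, len(vals), rss))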
def getCapstatsOutput(nodes, interval):

    results = []

    hosts = {}
    for node in nodes:
        if not node.interface:
            continue

        try:
            hosts[(node.addr, node.interface)] = node
        except AttributeError:
            continue

    cmds = []

    for (addr, interface) in hosts.keys():
        node = hosts[addr, interface]

        # If interface name contains semicolons (to aggregate traffic from
        # multiple devices with PF_RING, the interface name can be in a
        # semicolon-delimited format, such as "p2p1;p2p2"), then we must
        # quote it to prevent shell from interpreting semicolon as command
        # separator (another layer of quotes is needed because the eval
        # command is used).
        capstats = [config.Config.capstatspath, "-I", str(interval), "-n", "1", "-i", "'\"%s\"'" % interface]

        cmds += [(node, "run-cmd", capstats)]

    outputs = execute.runHelperParallel(cmds)

    totals = {}

    for (node, success, output) in outputs:
        if not success:
            if output:
                results += [(node, "%s: capstats failed (%s)" % (node.name, output[0]), {})]
            else:
                results += [(node, "%s: cannot execute capstats" % node.name, {})]
            continue

        if not output:
            results += [(node, "%s: no capstats output" % node.name, {})]
            continue

        fields = output[0].split()[1:]

        if not fields:
            results += [(node, "%s: unexpected capstats output: %s" % (node.name, output[0]), {})]
            continue

        vals = {}

        try:
            for field in fields:
                (key, val) = field.split("=")
                val = float(val)
                vals[key] = val

                if key in totals:
                    totals[key] += val
                else:
                    totals[key] = val

            results += [(node, None, vals)]
        except ValueError:
            results += [(node, "%s: unexpected capstats output: %s" % (node.name, output[0]), {})]

    # Add pseudo-node for totals
    if len(nodes) > 1:
        results += [(node_mod.Node("$total"), None, totals)]

    return results
def getTopOutput(nodes):

    results = []
    cmds = []

    running = isRunning(nodes)

    # Get all the PIDs first.

    pids = {}
    parents = {}

    for (node, isrunning) in running:
        if isrunning:
            pid = node.getPID()
            pids[node.name] = [pid]
            parents[node.name] = str(pid)

            cmds += [(node, "get-childs", [str(pid)])]
        else:
            results += [(node, "not running", [{}])]
            continue

    if not cmds:
        return results

    for (node, success, output) in execute.runHelperParallel(cmds):

        if not success:
            results += [(node, "cannot get child pids", [{}])]
            continue

        pids[node.name] += [int(line) for line in output]

    cmds = []

    # Now run top.
    for node in nodes:  # Do the loop again to keep the order.
        if not node.name in pids:
            continue

        cmds += [(node, "top", [])]

    if not cmds:
        return results

    for (node, success, output) in execute.runHelperParallel(cmds):

        if not success:
            results += [(node, "cannot get top output", [{}])]
            continue

        procs = [line.split() for line in output if int(line.split()[0]) in pids[node.name]]

        if not procs:
            # It can happen that in the meantime the process is not there anymore.
            results += [(node, "not running", [{}])]
            continue

        vals = []

        for p in procs:
            d = {}
            d["pid"] = int(p[0])
            d["proc"] = (p[0] == parents[node.name] and "parent" or "child")
            d["vsize"] = long(float(p[1]))  # May be something like 2.17684e+09
            d["rss"] = long(float(p[2]))
            d["cpu"] = p[3]
            d["cmd"] = " ".join(p[4:])
            vals += [d]

        results += [(node, None, vals)]

    return results
def _stopNodes(nodes):

    results = []
    running = []

    # Check for crashed nodes.
    for (node, isrunning) in isRunning(nodes):
        if isrunning:
            running += [node]
            util.output("stopping %s ..." % node.name)
        else:
            results += [(node, True)]

            if node.hasCrashed():
                _makeCrashReports([node])
                util.output("%s not running (was crashed)" % node.name)
            else:
                util.output("%s not running" % node.name)

    # Helper function to stop nodes with given signal.
    def stop(nodes, signal):
        cmds = []
        for node in nodes:
            cmds += [(node, "stop", [str(node.getPID()), str(signal)])]

        return execute.runHelperParallel(cmds)
        #events = []
        #for node in nodes:
        #    events += [(node, "Control::shutdown_request", [], "Control::shutdown_response")]
        #return execute.sendEventsParallel(events)

    # Stop nodes.
    for (node, success, output) in stop(running, 15):
        if not success:
            util.output("failed to send stop signal to %s" % node.name)

    if running:
        time.sleep(1)

    # Check whether they terminated.
    terminated = []
    kill = []
    for (node, success) in waitForBros(running, "TERMINATED", int(config.Config.stoptimeout), False):
        if not success:
            # Check whether it crashed during shutdown ...
            result = isRunning([node])
            for (node, isrunning) in result:
                if isrunning:
                    util.output("%s did not terminate ... killing ..." % node.name)
                    kill += [node]
                else:
                    # crashed flag is set by isRunning().
                    util.output("%s crashed during shutdown" % node.name)

    if len(kill):
        # Kill those which did not terminate gracefully.
        stop(kill, 9)
        # Give them a bit to disappear.
        time.sleep(5)

    # Check which are still running. We check all nodes to be on the safe side
    # and give them a bit more time to finally disappear.
    timeout = 10

    todo = {}
    for node in running:
        todo[node.name] = node

    while True:

        running = isRunning(todo.values(), setcrashed=False)

        for (node, isrunning) in running:
            if node.name in todo and not isrunning:
                # Alright, it's gone.
                del todo[node.name]
                terminated += [node]
                results += [(node, True)]

        if len(todo) == 0:
            # All done.
            break

        # Wait a bit before we start over.

        if timeout <= 0:
            break

        time.sleep(1)
        timeout -= 1

    results += [(node, False) for node in todo]

    # Do post-terminate cleanup for those which terminated gracefully.
    cleanup = [node for node in terminated if not node.hasCrashed()]

    cmds = []
    for node in cleanup:
        cmds += [(node, "run-cmd", [os.path.join(config.Config.scriptsdir, "post-terminate"), node.cwd()])]

    for (node, success, output) in execute.runHelperParallel(cmds):
        if not success:
            util.output("cannot run post-terminate for %s" % node.name)
            cron.logAction(node, "stopped (failed)")
        else:
            cron.logAction(node, "stopped")

        node.clearPID()
        node.clearCrashed()

    return results
def _startNodes(nodes):
    results = []

    filtered = []
    # Ignore nodes which are still running.
    for (node, isrunning) in isRunning(nodes):
        if not isrunning:
            filtered += [node]
            util.output("starting %s ..." % node.name)
        else:
            util.output("%s still running" % node.name)

    nodes = filtered

    # Generate crash report for any crashed nodes.
    crashed = [node for node in nodes if node.hasCrashed()]
    _makeCrashReports(crashed)

    # Make working directories.
    dirs = [(node, node.cwd()) for node in nodes]
    nodes = []
    for (node, success) in execute.mkdirs(dirs):
        if success:
            nodes += [node]
        else:
            util.output("cannot create working directory for %s" % node.name)
            results += [(node, False)]

    # Start Bro process.
    cmds = []
    envs = []
    for node in nodes:
        cmds += [(node, "start", [node.cwd()] + _makeBroParams(node, True))]
        envs += [_makeEnvParam(node)]

    nodes = []
    for (node, success, output) in execute.runHelperParallel(cmds, envs=envs):
        if success:
            nodes += [node]
            node.setPID(int(output[0]))
        else:
            util.output("cannot start %s" % node.name)
            results += [(node, False)]

    # Check whether processes did indeed start up.
    hanging = []
    running = []

    for (node, success) in waitForBros(nodes, "RUNNING", 3, True):
        if success:
            running += [node]
        else:
            hanging += [node]

    # It can happen that Bro hangs in DNS lookups at startup
    # which can take a while. At this point we already know
    # that the process has been started (waitForBro ensures that).
    # If by now there is not a TERMINATED status, we assume that it
    # is doing fine and will move on to RUNNING once DNS is done.
    for (node, success) in waitForBros(hanging, "TERMINATED", 0, False):
        if success:
            util.output("%s terminated immediately after starting; check output with \"diag\"" % node.name)
            node.clearPID()
            results += [(node, False)]
        else:
            util.output("(%s still initializing)" % node.name)
            running += [node]

    for node in running:
        cron.logAction(node, "started")
        results += [(node, True)]

    return results
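# Example (sketch): a minimal restart built from _stopNodes() and
# _startNodes() above. BroControl's real restart command does more work
# (e.g. rechecking configuration), so treat this purely as an illustration
# of how the (node, success) result lists compose.
def _exampleRestart(nodes):
    stopped = [node for (node, success) in _stopNodes(nodes) if success]
    return _startNodes(stopped)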
def getCapstatsOutput(nodes, interval):

    if not config.Config.capstatspath:
        if config.Config.cron == "0":
            util.warn("do not have capstats binary available")
        return []

    results = []
    cmds = []

    hosts = {}
    for node in nodes:
        if not node.interface:
            continue

        try:
            hosts[(node.addr, node.interface)] = node
        except AttributeError:
            continue

    for (addr, interface) in hosts.keys():
        node = hosts[addr, interface]

        # If interface name contains semicolons (to aggregate traffic from
        # multiple devices with PF_RING, the interface name can be in a
        # semicolon-delimited format, such as "p2p1;p2p2"), then we must
        # quote it to prevent shell from interpreting semicolon as command
        # separator (another layer of quotes is needed because the eval
        # command is used).
        capstats = [config.Config.capstatspath, "-I", str(interval), "-n", "1", "-i", "'\"%s\"'" % interface]

        # Unfinished feature: only consider a particular MAC. Works here for capstats
        # but Bro config is not adapted currently so we disable it for now.
        # try:
        #     capstats += ["-f", "\\'", "ether dst %s" % node.ether, "\\'"]
        # except AttributeError:
        #     pass

        cmds += [(node, "run-cmd", capstats)]

    outputs = execute.runHelperParallel(cmds)

    totals = {}

    for (node, success, output) in outputs:
        if not success:
            if output:
                results += [(node, "%s: capstats failed (%s)" % (node.name, output[0]), {})]
            else:
                results += [(node, "%s: cannot execute capstats" % node.name, {})]
            continue

        if not output:
            results += [(node, "%s: no capstats output" % node.name, {})]
            continue

        fields = output[0].split()[1:]

        if not fields:
            results += [(node, "%s: unexpected capstats output: %s" % (node.name, output[0]), {})]
            continue

        vals = {}

        try:
            for field in fields:
                (key, val) = field.split("=")
                val = float(val)
                vals[key] = val

                try:
                    totals[key] += val
                except KeyError:
                    totals[key] = val

            results += [(node, None, vals)]

        except ValueError:
            results += [(node, "%s: unexpected capstats output: %s" % (node.name, output[0]), {})]

    # Add pseudo-node for totals
    if len(nodes) > 1:
        results += [(node_mod.Node("$total"), None, totals)]

    return results
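# Example (sketch): print the per-node fields gathered by getCapstatsOutput()
# above. The parsing loop stores whatever "key=value" pairs capstats emits,
# so no particular field name is assumed here; the helper name and the
# 5-second interval are illustrative assumptions.
def _examplePrintCapstats(nodes, interval=5):
    for (node, error, vals) in getCapstatsOutput(nodes, interval):
        if error:
            util.output(error)
        else:
            util.output("%s: %s" % (node.name, " ".join(["%s=%s" % (k, v) for (k, v) in vals.items()])))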
def status(nodes):
    typewidth = 7
    hostwidth = 16
    if config.Config.standalone == "1":
        # In standalone mode, the "type" column needs more width
        typewidth = 10
        hostwidth = 13

    util.output("%-12s %-*s %-*s %-9s %-6s %-6s %s" % ("Name", typewidth, "Type", hostwidth, "Host", "Status", "Pid", "Peers", "Started"))

    all = isRunning(nodes)
    running = []

    cmds1 = []
    cmds2 = []
    for (node, isrunning) in all:
        if isrunning:
            running += [node]
            cmds1 += [(node, "cat-file", ["%s/.startup" % node.cwd()])]
            cmds2 += [(node, "cat-file", ["%s/.status" % node.cwd()])]

    startups = execute.runHelperParallel(cmds1)
    statuses = execute.runHelperParallel(cmds2)

    startups = dict([(n.name, success and util.fmttime(output[0]) or "???") for (n, success, output) in startups])
    statuses = dict([(n.name, success and output[0].split()[0].lower() or "???") for (n, success, output) in statuses])

    peers = {}
    nodes = [n for n in running if statuses[n.name] == "running"]
    for (node, success, args) in _queryPeerStatus(nodes):
        if success:
            peers[node.name] = []
            for f in args[0].split():
                keyval = f.split("=")
                if len(keyval) > 1:
                    (key, val) = keyval
                    if key == "peer" and val != "":
                        peers[node.name] += [val]
        else:
            peers[node.name] = None

    for (node, isrunning) in all:

        util.output("%-12s " % node.name, nl=False)
        util.output("%-*s %-*s " % (typewidth, node.type, hostwidth, node.host), nl=False)

        if isrunning:
            util.output("%-9s " % statuses[node.name], nl=False)
        elif node.hasCrashed():
            util.output("%-9s " % "crashed", nl=False)
        else:
            util.output("%-9s " % "stopped", nl=False)

        if isrunning:
            util.output("%-6s " % node.getPID(), nl=False)

            if node.name in peers and peers[node.name] != None:
                util.output("%-6d " % len(peers[node.name]), nl=False)
            else:
                util.output("%-6s " % "???", nl=False)

            util.output("%s" % startups[node.name], nl=False)

        util.output()

    # Return True if all nodes are running
    return len(nodes) == len(all)
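# Example (sketch): print the status table and use the boolean result of the
# status() variant above. `allNodes` is assumed to be an iterable of node
# objects supplied by the caller; the helper name is made up for illustration.
def _exampleStatusCheck(allNodes):
    if not status(allNodes):
        util.output("at least one node is not running")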