class Node:
    """represents a cassandra/dse node"""

    def __init__(self):
        self.start = None
        self.end = None
        self.tables = OrderedDefaultDict(Table)
        self.stages = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.pauses = []
        self.version = None
        self.lines = 0
        self.skipped_lines = 0
        self.cassandra_version = None
        self.dumps_analyzed = 0

    def get_busiest_tables(self, by_prop):
        """return busiest tables by_prop (data or ops)"""
        return sorted(
            self.tables.items(),
            key=lambda table: getattr(table[1], by_prop),
            reverse=True,
        )

    def longest_tp_name_length(self):
        """find the length of the thread pool with the longest name"""
        longest = 0
        for stage in self.stages.values():
            slen = len(max(stage, key=len))
            if slen > longest:
                longest = slen
        return longest

    def longest_tp_value_length(self):
        """find the length of the longest value in any threadpool"""
        longest = 0
        for stage in self.stages.values():
            for vals in stage.values():
                vlen = len(max(map(str, vals), key=len))
                if vlen > longest:
                    longest = vlen
        return longest

    def duration(self):
        """duration this node was analyzed"""
        return self.end - self.start
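
# A minimal usage sketch of how Node.get_busiest_tables() ranks tables by an
# attribute name. FakeTable is a hypothetical stand-in for the real Table
# class, assumed here only to expose numeric `ops` and `data` attributes:
#
#     class FakeTable:
#         def __init__(self):
#             self.ops = 0
#             self.data = 0
#
#     node = Node()
#     node.tables = OrderedDefaultDict(FakeTable)
#     node.tables["ks.small"].ops = 10   # entries are created on first access
#     node.tables["ks.busy"].ops = 99
#     assert node.get_busiest_tables("ops")[0][0] == "ks.busy"
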
class StatusLogger:
    """status logger"""

    def __init__(
        self,
        diag_dir,
        files=None,
        start=None,
        end=None,
        wanted_stages=None,
        command_name="sperf core statuslogger",
        syslog_prefix="system.log",
        dbglog_prefix="debug.log",
    ):
        self.diag_dir = diag_dir
        self.files = files
        self.wanted_stages = wanted_stages
        if env.DEBUG:
            print("wanted stages:", self.wanted_stages)
        self.nodes = OrderedDefaultDict(Node)
        self.analyzed = False
        self.dumps_analyzed = 0
        self.rule_types = OrderedDefaultDict(int)
        self.command_name = command_name
        self.syslog_prefix = syslog_prefix
        self.dbglog_prefix = dbglog_prefix
        self.start = None
        self.end = None
        if start:
            self.start = date_parse(start)
        if end:
            self.end = date_parse(end)

    def analyze(self):
        """analyze log files"""
        if self.analyzed:
            return
        event_filter = UniqEventPerNodeFilter()
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            target_system = find_logs(self.diag_dir, file_to_find=self.syslog_prefix)
            target_debug = find_logs(self.diag_dir, file_to_find=self.dbglog_prefix)
            target = target_system + target_debug
        else:
            raise Exception("no diag dir and no files specified")
        for f in target:
            nodename = extract_node_name(f, ignore_missing_nodes=True)
            event_filter.set_node(nodename)
            node = self.nodes[nodename]
            if env.DEBUG:
                print("parsing", f)
            with FileWithProgress(f) as log:
                statuslogger_fixer = UnknownStatusLoggerWriter()
                for event in parser.read_system_log(log):
                    statuslogger_fixer.check(event)
                    if self.start and event["date"] < self.start:
                        continue
                    if self.end and event["date"] > self.end:
                        continue
                    self.__setdates(node, statuslogger_fixer.last_event_date)
                    node.lines += 1
                    if event_filter.is_duplicate(event):
                        node.skipped_lines += 1
                        continue
                    if env.DEBUG:
                        if "rule_type" in event:
                            self.rule_types[event["rule_type"]] += 1
                        elif event["event_type"] == "unknown":
                            self.rule_types["unknown"] += 1
                        else:
                            self.rule_types["no type"] += 1
                    if event["event_type"] == "server_version":
                        if event.get("version"):
                            node.version = event["version"]
                            if node.version.startswith("6"):
                                node.cassandra_version = "DSE Private Fork"
                        elif event.get("cassandra_version"):
                            node.cassandra_version = event["cassandra_version"]
                        # skipping solr, spark etc as they may be too much noise for statuslogger
                    elif event["event_type"] == "memtable_status":
                        tname = ".".join([event["keyspace"], event["table"]])
                        if event["ops"] > node.tables[tname].ops:
                            node.tables[tname].ops = event["ops"]
                        try:
                            if event["data"] > node.tables[tname].data:
                                node.tables[tname].data = event["data"]
                        except Exception as e:
                            print(event)
                            raise e
                    elif event["event_type"] == "pause":
                        node.pauses.append(event["duration"])
                    elif event["event_type"] == "threadpool_header":
                        node.dumps_analyzed += 1
                        self.dumps_analyzed += 1
                    elif event["event_type"] == "threadpool_status":
                        if re.match(r"TPC/\d+$", event["pool_name"]):
                            if not node.version:
                                node.version = "6.x"
                        if "delayed" in event and event["delayed"]:
                            val = event["delayed"]
                            node.stages["local backpressure"][
                                event["pool_name"]
                            ].append(val)
                        else:
                            for pool in [
                                "active",
                                "pending",
                                "blocked",
                                "all_time_blocked",
                            ]:
                                if pool in event and event[pool]:
                                    if not self.wanted_stages or event[
                                        "pool_name"
                                    ].startswith(self.wanted_stages):
                                        node.stages[pool][event["pool_name"]].append(
                                            event[pool]
                                        )
        self.analyzed = True
        if env.DEBUG:
            print(self.rule_types.items())

    def __setdates(self, node, date):
        """track start/end dates for a node"""
        if not node.start:
            node.start = date
            node.end = date
            return
        if date > node.end:
            node.end = date
        if date < node.start:
            node.start = date

    def print_histogram(self):
        """print histogram report, analyzing if necessary"""
        self.analyze()
        print("%s version: %s" % (self.command_name, VERSION))
        print("")
        print("Histogram")
        print("")
        if not self.nodes:
            print("Nothing found!")
            return
        cassandra_versions = set()
        dse_versions = set()
        for name, node in self.nodes.items():
            cassandra_versions.add(node.cassandra_version)
            dse_versions.add(node.version)
            print(name)
            print("-" * 60)
            print("%s lines" % format_num(node.lines))
            print("%s skipped lines" % format_num(node.skipped_lines))
            print("dse version: %s" % (node.version or "unknown"))
            print("cassandra version: %s" % (node.cassandra_version or "unknown"))
            print("log start time: %s" % node.start)
            print("log end time: %s" % node.end)
            if not node.dumps_analyzed:
                print("Nothing found!")
                continue
            print(
                "duration: %s"
                % format_seconds(int(node.duration().total_seconds()))
            )
            print("stages analyzed: %s" % node.dumps_analyzed)
            if node.pauses:
                percentiles = [[]]
                percentiles.append(get_percentile_headers("GC pauses"))
                header = [""]
                header.extend(["---" for i in range(6)])
                percentiles.append(header)
                percentiles.append(get_percentiles("ms", node.pauses, strformat="%i"))
                pad_table(percentiles, min_width=11, extra_pad=2)
                for line in percentiles:
                    print("".join(line))
                print("total GC events: %s" % len(node.pauses))
            print("")
            ops = node.get_busiest_tables("ops")[:5]
            if ops:
                print("busiest tables (ops)")
                print("-" * 30)
                nlen = max(len(o[0]) for o in ops)
                for n, t in ops:
                    print(n.ljust(nlen), t)
                data = node.get_busiest_tables("data")[:5]
                print("busiest tables (data)")
                print("-" * 30)
                nlen = max(len(d[0]) for d in data)
                for n, t in data:
                    print(n.ljust(nlen), t)
            print("")
            if node.stages:
                percentiles = []
                percentiles.append(get_percentile_headers("stages"))
                percentiles.append([])
                for status, stage in node.stages.items():
                    header = [status.upper()]
                    header.extend("" for i in range(6))
                    percentiles.append(header)
                    for tpname, vals in sorted(
                        stage.items(), key=lambda tpool: max(tpool[1]), reverse=True
                    ):
                        percentiles.append(
                            get_percentiles(tpname, vals, strformat="%i")
                        )
                    percentiles.append([])
                pad_table(percentiles, extra_pad=2)
                for line in percentiles:
                    print("".join(line))
                print("")
        self.__print_recs(dse_versions, cassandra_versions)

    def __print_recs(self, versions, cassandra_versions):
        """print recommendations derived from per-pool stage maxima"""
        version = None
        cassandra_version = None
        if len(cassandra_versions) > 1:
            print(
                "WARNING more than one version present; assuming no version for recommendations"
            )
        elif len(cassandra_versions) == 1:
            cassandra_version = cassandra_versions.pop()
        elif len(versions) == 1:
            version = versions.pop()
        engine = Engine(version, cassandra_version)
        recs = set()
        for node in self.nodes.values():
            rstage = OrderedDefaultDict(dict)
            for status, stage in node.stages.items():
                for tpname, vals in stage.items():
                    rstage[tpname][status] = max(vals)
            for tpname in rstage:
                # backfill any stage counters the logs never reported
                for sname in (
                    "active",
                    "pending",
                    "local backpressure",
                    "completed",
                    "blocked",
                    "all_time_blocked",
                ):
                    if sname not in rstage[tpname]:
                        rstage[tpname][sname] = 0
                s = Stage(
                    tpname,
                    rstage[tpname]["active"],
                    rstage[tpname]["pending"],
                    rstage[tpname]["local backpressure"],
                    rstage[tpname]["completed"],
                    rstage[tpname]["blocked"],
                    rstage[tpname]["all_time_blocked"],
                )
                reason, rec = engine.analyze_stage(s)
                if reason:
                    recs.add((rec, reason))
        if recs:
            print("recommendations")
            print("---------------")
            for rec, reason in recs:
                print("* %s (%s)" % (rec, reason))

    def print_summary(self):
        """prints a summary report"""
        self.analyze()
        summary = Summary(self.nodes)
        print("%s version: %s" % (self.command_name, VERSION))
        print("")
        print("Summary (%s lines)" % format_num(summary.lines))
print("Summary (%s skipped lines)" % format_num(summary.skipped_lines)) print("") print("dse versions: %s" % (set(summary.versions) or "unknown")) print("cassandra versions: %s" % (set(summary.cassandra_versions) or "unknown")) print("first log time: %s" % summary.start) print("last log time: %s" % summary.end) if not self.dumps_analyzed: print("Nothing found!") return print("duration: %s" % format_seconds(int(summary.duration.total_seconds()))) print("total stages analyzed: %s" % self.dumps_analyzed) print("total nodes analyzed: %s" % len(summary.nodes)) pauses = summary.get_pauses() if pauses: print("") percentiles = [] percentiles.append(get_percentile_headers("GC pauses")) percentiles.append(["", "---", "---", "---", "---", "---", "---"]) percentiles.append(get_percentiles("ms", pauses, strformat="%i")) pad_table(percentiles, min_width=11, extra_pad=2) for line in percentiles: print("".join(line)) print("total GC events: %s" % len(pauses)) print("") ops = summary.get_busiest_tables("ops")[:5] if ops: print("busiest tables by ops across all nodes") print("-" * 30) for onode, (oname, num) in ops: print("* %s: %s: %s" % (onode, oname, num)) print("") print("busiest table by data across all nodes") print("-" * 30) for dnode, (dname, data) in summary.get_busiest_tables("data")[:5]: print("* %s: %s: %s" % (dnode, dname, data)) print("") print("busiest stages across all nodes") print("-" * 30) data = [] for (name, status, stage, value) in summary.get_busiest_stages(): data.append( ["* %s %s:" % (stage, status), str(value), "(%s)" % name]) pad_table(data, extra_pad=2) for line in data: print("".join(line)) pending = summary.get_stages_in("pending") data = [] if pending: data.append([]) self.__print_stages("pending", pending, data) delayed = summary.get_stages_in("local backpressure") if delayed: data.append([]) self.__print_backpressure(delayed, data) data.append([]) pad_table(data, extra_pad=2) for line in data: print("".join(line)) self.__print_recs(summary.versions, summary.cassandra_versions) def __print_backpressure(self, stages, data): nodes = OrderedDict() # get total backpressure and max backpressure per node for name, stage in stages.items(): if name not in nodes: # if not present initialize so aggregate code can safely assume 0 value is already populated agg = OrderedDict() agg["total"] = 0 agg["count"] = 0 agg["max"] = 0 nodes[name] = agg node_agg = nodes[name] for vals in stage.values(): node_agg["count"] += len(vals) node_agg["total"] += sum(vals) node_agg["max"] = max(max(vals), node_agg["max"]) data.append(["busiest LOCAL BACKPRESSURE"]) data.append(["-" * 30]) # sort to make the node with the worst backpressure total first for name, val in sorted( nodes.items(), key=lambda kvp: kvp[1]["total"] / float(kvp[1]["count"]), reverse=True, ): data.append(["%s:" % name]) avg_backpressure = val["total"] / float(val["count"]) data.append( [" " * 5, "avg backpressure", "%.2f" % avg_backpressure]) data.append([" " * 5, "max backpressure", str(nodes[name]["max"])]) def __print_stages(self, status, stages, data): data.append(["busiest stages in %s" % status.upper()]) data.append(["-" * 30]) # sort to make the worst node first for name, stage in sorted(stages.items(), key=lambda tpool: max(tpool[1].values()), reverse=True): data.append(["%s:" % name]) # sort again to make the worst stage on the node first for tp, vals in sorted(stage.items(), key=lambda stg: max(stg[1]), reverse=True): data.append([" " * 5, "%s:" % tp, str(max(vals))])
class GCInspector:
    """GCInspector class"""

    def __init__(self, diag_dir=None, files=None, start=None, end=None):
        self.diag_dir = diag_dir
        self.files = files
        self.pauses = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.gc_types = OrderedDefaultDict(int)
        self.start = None
        self.end = None
        self.starts = OrderedDefaultDict(datetime.datetime)
        self.ends = OrderedDefaultDict(datetime.datetime)
        self.analyzed = False
        self.start_time = None
        self.end_time = None
        if start:
            self.start_time = date_parse(start)
        if end:
            self.end_time = date_parse(end)

    def analyze(self):
        """analyze files"""
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            target = diag.find_logs(self.diag_dir)
        else:
            raise Exception("no diag dir and no files specified")
        for file in target:
            node = extract_node_name(file)
            with diag.FileWithProgress(file) as log:
                for event in parser.read_log(log, gc.capture_line):
                    if event["event_type"] == "pause":
                        if self.start_time and event["date"] < self.start_time:
                            continue
                        if self.end_time and event["date"] > self.end_time:
                            continue
                        self.__setdates(event["date"], node)
                        self.pauses[node][event["date"]].append(event["duration"])
                        self.gc_types[event["gc_type"]] += 1
        self.analyzed = True

    def __setdates(self, date, node):
        """track start/end times"""
        # global
        if not self.start:
            self.start = date
            self.end = date
        if date > self.end:
            self.end = date
        if date < self.start:
            self.start = date
        # node specific
        if node not in self.starts:
            self.starts[node] = date
            self.ends[node] = date
            return
        if date > self.ends[node]:
            self.ends[node] = date
        if date < self.starts[node]:
            self.starts[node] = date

    def all_pauses(self):
        """get pauses for all nodes"""
        pauses = OrderedDefaultDict(list)
        for pausedata in self.pauses.values():
            for time, pause in pausedata.items():
                pauses[time].extend(pause)
        return pauses

    def print_report(self, interval=3600, by_node=False, top=3):
        """print gc report"""
        print("gcinspector version %s" % VERSION)
        print("")
        if not self.analyzed:
            self.analyze()
        if not self.pauses:
            print("No pauses found")
            return
        if not by_node:
            pauses = self.all_pauses()
            self.__print_gc(
                sorted(
                    bucketize(
                        pauses, start=self.start, end=self.end, seconds=interval
                    ).items(),
                    key=lambda t: t[0],
                )
            )
            plist = []
            for time in pauses:
                plist.extend(pauses[time])
            worst_k = heapq.nlargest(top, plist)
            print("Worst pauses in ms:")
            print(worst_k)
        else:
            for node in self.pauses:
                print(node)
                self.__print_gc(
                    sorted(
                        bucketize(
                            self.pauses[node],
                            start=self.starts[node],
                            end=self.ends[node],
                            seconds=interval,
                        ).items(),
                        key=lambda t: t[0],
                    )
                )
                plist = []
                for time, pauses in self.pauses[node].items():
                    plist.extend(pauses)
                worst_k = heapq.nlargest(top, plist)
                print("Worst pauses in ms:")
                print(worst_k)
                print("")
        print("")
        print("Collections by type")
        print("-" * 20)
        for collection, count in self.gc_types.items():
            print("* %s: %s" % (collection, count))
        print("")

    def __print_gc(self, data):
        """print data to the user, expecting datetime keys and list(int) values"""
        print(". <300ms + 301-500ms ! >500ms")
        print("-" * 30)
        busiest = None
        for time, pauses in data:
            total = sum(pauses)
            if not busiest:
                busiest = (time, total)
            elif total > busiest[1]:
                busiest = (time, total)
            print(time.strftime("%Y-%m-%d %H:%M:%S"), end=" ")
            print(len(pauses), end=" ")
            for pause in pauses:
                c = "."
                if pause > 300:
                    c = "+"
                if pause > 500:
                    c = "!"
                print(c, end="")
            print("")
        print("")
        print(
            "busiest period: %s (%sms)"
            % (busiest[0].strftime("%Y-%m-%d %H:%M:%S"), busiest[1])
        )
        print("")
        percentiles = [[]]
        percentiles.append(get_percentile_headers("GC pauses"))
        header = [""]
        header.extend(["---" for i in range(6)])
        percentiles.append(header)
        percentiles.append(
            get_percentiles(
                "ms",
                list(itertools.chain.from_iterable(pauses for time, pauses in data)),
                strformat="%i",
            )
        )
        pad_table(percentiles, min_width=11, extra_pad=2)
        for line in percentiles:
            print("".join(line))
        print()
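
# Usage sketch for GCInspector, with illustrative values; interval is the
# bucket width in seconds for the pause timeline, top the number of worst
# pauses reported per node. print_report() analyzes lazily, so calling
# analyze() first is optional:
#
#     gci = GCInspector(diag_dir="/path/to/diag")
#     gci.print_report(interval=3600, by_node=True, top=5)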