def __init__(self):
    self.start = None
    self.end = None
    self.tables = OrderedDefaultDict(Table)
    self.stages = OrderedDefaultDict(lambda: defaultdict(list))
    self.pauses = []
    self.version = None
    self.lines = 0
    self.skipped_lines = 0
    self.cassandra_version = None
    self.dumps_analyzed = 0
def collate_threads(self, threads):
    """combines similar threads"""
    ret = OrderedDefaultDict(lambda: OrderedDefaultDict(float))
    exprs = [r':.*', r'-\d+.*', r'-\/.*']
    for thread in threads:
        name = thread
        for e in exprs:
            name = re.sub(e, '', name)
        for t, v in threads[thread].items():
            ret[name][t] = round(ret[name][t] + v, 2)
        ret[name]['thread_count'] += 1
    return ret
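# A minimal, self-contained sketch of the name-collapsing idea above (stdlib
# only; the sample thread names are hypothetical): stripping ":...", "-<digits>..."
# and "-/..." suffixes folds per-instance threads into one logical pool whose
# metrics are summed.
import re
from collections import defaultdict

def collapse(threads):
    exprs = [r":.*", r"-\d+.*", r"-\/.*"]
    ret = defaultdict(lambda: defaultdict(float))
    for name, metrics in threads.items():
        base = name
        for e in exprs:
            base = re.sub(e, "", base)
        for metric, value in metrics.items():
            ret[base][metric] = round(ret[base][metric] + value, 2)
        ret[base]["thread_count"] += 1
    return ret

sample = {"CoreThread-1": {"user_cpu": 1.5}, "CoreThread-2": {"user_cpu": 2.5}}
print({k: dict(v) for k, v in collapse(sample).items()})
# {'CoreThread': {'user_cpu': 4.0, 'thread_count': 2.0}}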
def analyze(self):
    """analyze log files"""
    error_if_file_not_found = False
    if self.files:
        error_if_file_not_found = True
        target = self.files
    elif self.diag_dir:
        target = diag.find_logs(self.diag_dir, "output.log")
    else:
        self.analyzed = True
        return
    for file in target:
        with diag.FileWithProgress(file) as log:
            if not log.file_desc and error_if_file_not_found:
                raise FileNotFoundError(log.error)
            for event in parser.read_output_log(log):
                if event["event_type"] == "classpath":
                    thisjars = OrderedDefaultDict(int)
                    for jar in event["classpath"].split(":"):
                        j = jar.split("/")[-1]
                        if j.endswith("jar"):
                            # to eliminate dupes within the same file, because java is crazy town
                            if j not in thisjars:
                                thisjars[j] += 1
                                self.jars[j] += 1
        self.files_analyzed += 1
    self.analyzed = True
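# Stdlib sketch of the per-file dedupe above: within one classpath each jar is
# counted once, while the outer counter accumulates across files. The classpath
# strings here are made up for illustration.
from collections import Counter

jars = Counter()
for classpath in ["a.jar:lib/b.jar:lib/b.jar", "a.jar:c.jar"]:
    seen = set()
    for jar in classpath.split(":"):
        j = jar.split("/")[-1]
        if j.endswith("jar") and j not in seen:
            seen.add(j)
            jars[j] += 1
print(jars)  # Counter({'a.jar': 2, 'b.jar': 1, 'c.jar': 1})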
def generate(self, parsed):
    """generates a time series report for a tarball"""
    table = []
    table.append("")
    table.append("filter cache evictions by hour")
    table.append("------------------------------")
    events_by_datetime = OrderedDefaultDict(list)
    start = dates.max_utc_time()
    end = dates.min_utc_time()
    for node, events in parsed["nodes"].items():
        # default to [] so nodes without evictions don't break the report
        for info in events.get("evictions", []):
            # put into structure we can use for bucketize
            for value in info.values():
                if value.time_stamp > end:
                    end = value.time_stamp
                if value.time_stamp < start:
                    start = value.time_stamp
                events_by_datetime[value.time_stamp].append(value)
    buckets = sorted(
        util.bucketize(events_by_datetime, start, end, 3600).items(),
        key=lambda t: t[0],
    )
    maxval = len(max(buckets, key=lambda t: len(t[1]))[1])
    for time, matches in buckets:
        pad = " " * (len(str(maxval)) - len(str(len(matches))))
        table.append(
            "%s %s %s"
            % (
                time.strftime("%Y-%m-%d %H:%M:%S") + pad,
                len(matches),
                util.textbar(maxval, len(matches)),
            )
        )
    return "\n".join(table)
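# util.bucketize isn't defined in this section; a plausible stdlib sketch of the
# contract the report depends on (an assumption, not the real implementation):
# map datetime-keyed event lists into fixed windows of `seconds`, anchored at
# `start`, so every window between start and end exists even when empty.
import datetime

def bucketize_sketch(events, start, end, seconds):
    buckets = {}
    cursor = start
    while cursor <= end:
        buckets[cursor] = []
        cursor += datetime.timedelta(seconds=seconds)
    for ts, vals in events.items():
        offset = int((ts - start).total_seconds() // seconds)
        key = start + datetime.timedelta(seconds=offset * seconds)
        buckets.setdefault(key, []).extend(vals)
    return buckets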
def __init__(self, regex, diag_dir=None, files=None, start=None, end=None, ignorecase=True):
    self.diag_dir = diag_dir
    self.files = files
    self.start = None
    self.end = None
    self.start_time = None
    self.end_time = None
    self.last_time = None
    if start:
        self.start_time = date_parse(start)
    if end:
        self.end_time = date_parse(end)
    if ignorecase:
        self.strayregex = re.compile(self.strayre + regex + ".*", re.IGNORECASE)
        self.timeregex = re.compile(self.basere + regex + ".*", re.IGNORECASE)
        self.supplied_regex = regex.lower()
    else:
        self.strayregex = re.compile(self.strayre + regex + ".*")
        self.timeregex = re.compile(self.basere + regex + ".*")
        self.supplied_regex = regex
    self.valid_log_regex = re.compile(self.basere)
    self.matches = OrderedDefaultDict(list)
    self.count = 0
    self.unknown = 0
    self.analyzed = False
class Node:
    """represents a cassandra/dse node"""

    def __init__(self):
        self.start = None
        self.end = None
        self.tables = OrderedDefaultDict(Table)
        self.stages = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.pauses = []
        self.version = None
        self.lines = 0
        self.skipped_lines = 0
        self.cassandra_version = None
        self.dumps_analyzed = 0

    def get_busiest_tables(self, by_prop):
        """return busiest tables by_prop (data or ops)"""
        return sorted(
            self.tables.items(),
            key=lambda table: getattr(table[1], by_prop),
            reverse=True,
        )

    def longest_tp_name_length(self):
        """find the length of the thread pool with the longest name"""
        longest = 0
        for stage in self.stages.values():
            slen = len(max(stage, key=len))
            if slen > longest:
                longest = slen
        return longest

    def longest_tp_value_length(self):
        """find the length of the longest value in any threadpool"""
        longest = 0
        for stage in self.stages.values():
            for vals in stage.values():
                vlen = len(max(map(str, vals), key=len))
                if vlen > longest:
                    longest = vlen
        return longest

    def duration(self):
        """duration this node was analyzed"""
        return self.end - self.start
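# Quick usage sketch against the Node class above. The table names and values
# are hypothetical, and this assumes the Table objects created by
# OrderedDefaultDict(Table) expose settable .ops/.data attributes, as the
# comparisons elsewhere in this code imply.
node = Node()
node.tables["ks.users"].ops = 1200
node.tables["ks.events"].ops = 300
node.stages["pending"]["MutationStage"] = [3, 11, 7]
print(node.get_busiest_tables("ops")[0][0])  # ks.users
print(node.longest_tp_name_length())         # 13 (len("MutationStage"))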
def analyze(self): """parses logs for results""" print("bucketgrep version %s" % VERSION) print("search: '%s'" % self.supplied_regex) target = None if self.files: target = self.files elif self.diag_dir: if self.diag_dir == ".": directory_path = os.getcwd() print("from directory '%s':" % directory_path) else: print("from directory '%s':" % self.diag_dir) target = diag.find_logs(self.diag_dir) else: raise Exception("no diag dir and no files specified") for file in target: with diag.FileWithProgress(file) as log: node_name = extract_node_name(file, ignore_missing_nodes=True) self.node_matches[node_name] = OrderedDefaultDict(list) for line in log: # as long as it's a valid log line we want the date, # even if we don't care about the rest of the line so we can set # the last date for any straregex lines that match current_dt = self.valid_log_regex.match(line) if current_dt: dt = date()(current_dt.group("date")) # if the log line is valite we want to set the last_time self.last_time = dt # we now can validate if our search term matches the log line d = self.timeregex.match(line) if d: # normal case, well-formatted log line self.__setdates(dt) if self.start_time and dt < self.start_time: continue if self.end_time and dt > self.end_time: continue self.matches[dt].append(line) self.node_matches[node_name][dt].append(line) self.count += 1 else: m = self.strayregex.match(line) # check for a match in an unformatted line, like a traceback if m: if self.last_time is None: # match, but no previous timestamp to associate with self.unknown += 1 continue self.matches[self.last_time].append(line) self.node_matches[node_name][ self.last_time].append(line) self.count += 1 self.analyzed = True
def __init__(self, children):
    """
    Constructor expects to be passed one or more case and rule objects.
    The case objects are used to group the rules.
    """
    self.rules = OrderedDefaultDict(list)
    keys = None
    for child in children:
        if isinstance(child, case):
            keys = child.keys
        else:
            for key in keys:
                self.rules[key].append(child)
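# Self-contained sketch of the grouping contract above, with stand-in stubs
# (the real case/rule classes live elsewhere in this codebase): every rule is
# attached to the keys of the most recent case that preceded it.
from collections import defaultdict

class _case:  # hypothetical stub mirroring case.keys
    def __init__(self, *keys):
        self.keys = keys

def _group(children):
    rules = defaultdict(list)
    keys = None
    for child in children:
        if isinstance(child, _case):
            keys = child.keys
        else:
            # raises TypeError if a rule appears before any case, as above
            for key in keys:
                rules[key].append(child)
    return rules

print(dict(_group([_case("6.0", "6.7"), "rule_a", _case("5.1"), "rule_b"])))
# {'6.0': ['rule_a'], '6.7': ['rule_a'], '5.1': ['rule_b']}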
def __init__(self, diag_dir, files=None, start=None, end=None):
    self.diag_dir = diag_dir
    self.files = files
    self.parser = SlowQueryParser()
    self.querytimes = OrderedDefaultDict(list)
    self.queries = []
    self.analyzed = False
    self.start = None
    self.end = None
    self.cross = 0
    self.start_time = None
    self.end_time = None
    if start:
        self.start_time = date_parse(start)
    if end:
        self.end_time = date_parse(end)
def __print_recs(self):
    engine = Engine()
    recs = set()
    for node in self.nodes.values():
        rstage = OrderedDefaultDict(dict)
        for status, stage in node.stages.items():
            for tpname, vals in stage.items():
                rstage[tpname][status] = max(vals)
        for tpname in rstage:
            for sname in (
                "active",
                "pending",
                "local backpressure",
                "completed",
                "blocked",
                "all_time_blocked",
            ):
                if sname not in rstage[tpname]:
                    rstage[tpname][sname] = 0
            s = Stage(
                tpname,
                rstage[tpname]["active"],
                rstage[tpname]["pending"],
                rstage[tpname]["local backpressure"],
                rstage[tpname]["completed"],
                rstage[tpname]["blocked"],
                rstage[tpname]["all_time_blocked"],
            )
            reason, rec = engine.analyze_stage(s)
            if reason:
                recs.add((rec, reason))
    if recs:
        print("recommendations")
        print("----------------")
        for rec, reason in recs:
            print("* %s (%s)" % (rec, reason))
def parse(self, log):
    """parse ttop output"""
    total = OrderedDict()
    threads = OrderedDefaultDict(dict)
    for line in log:
        if self.state is None:
            m = self.begin_match.match(line)
            if m:
                dt = date('%Y-%m-%dT%H:%M:%S.%f%z')(m.group('date'))
                if self.start and dt < self.start:
                    continue
                if self.end and dt > self.end:
                    continue
                total['date'] = dt
                self.state = self.BEGIN
            continue
        if self.state == self.BEGIN:
            m = self.process_match.match(line)
            if not m:
                raise ValueError("process line not found in " + line)
            self.state = self.PROCESS
            total['cpu_total'] = float(m.group('cpu_total'))
            continue
        if self.state == self.PROCESS:
            m = self.application_match.match(line)
            if not m:
                raise ValueError("application line not found in " + line)
            self.state = self.APPLICATION
            total['app_cpu'] = float(m.group('app_cpu'))
            total['user_cpu'] = float(m.group('user_cpu'))
            total['sys_cpu'] = float(m.group('sys_cpu'))
            continue
        if self.state == self.APPLICATION:
            m = self.other_match.match(line)
            if not m:
                raise ValueError("other line not found in '" + line + "'")
            self.state = self.OTHER
            total['other_cpu'] = float(m.group('other_cpu'))
            continue
        if self.state == self.OTHER:
            m = self.thread_match.match(line)
            if not m:
                raise ValueError("thread line not found in '" + line + "'")
            self.state = self.THREAD
            total['thread_count'] = int(m.group('thread_count'))
            continue
        if self.state == self.THREAD:
            m = self.heap_match.match(line)
            if not m:
                raise ValueError("heap line not found in '" + line + "'")
            self.state = self.TINFO
            total['heap_rate'] = self.convert(m.group('heap_rate'), m.group('heap_unit'))
            continue
        if self.state == self.TINFO:
            if line == '\n':
                # blank line ends a sample: emit it and reset for the next one
                self.state = None
                yield total, threads
                total = OrderedDict()
                threads = OrderedDefaultDict(dict)
            else:
                m = self.tinfo_match.match(line)
                if not m:
                    raise ValueError("thread info not found in '" + line + "'")
                name = m.group('thread_name')
                threads[name]['user_cpu'] = float(m.group('user_cpu'))
                threads[name]['sys_cpu'] = float(m.group('sys_cpu'))
                threads[name]['total_cpu'] = threads[name]['user_cpu'] + threads[name]['sys_cpu']
                threads[name]['heap_rate'] = self.convert(m.group('heap_rate'), m.group('heap_unit'))
            continue
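# `convert` isn't shown in this section; a plausible sketch of the heap-rate
# normalization it performs (an assumption, not the actual implementation):
# scale a ttop allocation figure such as ("113", "mb") onto a common MB unit.
def convert_heap_rate(value, unit):
    scale = {"b": 1.0 / (1024 * 1024), "kb": 1.0 / 1024.0, "mb": 1.0, "gb": 1024.0}
    return float(value) * scale[unit.lower()]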
class GCInspector:
    """GCInspector class"""

    def __init__(self, diag_dir=None, files=None, start=None, end=None):
        self.diag_dir = diag_dir
        self.files = files
        self.pauses = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.gc_types = OrderedDefaultDict(int)
        self.start = None
        self.end = None
        self.starts = OrderedDefaultDict(datetime.datetime)
        self.ends = OrderedDefaultDict(datetime.datetime)
        self.analyzed = False
        self.start_time = None
        self.end_time = None
        if start:
            self.start_time = date_parse(start)
        if end:
            self.end_time = date_parse(end)

    def analyze(self):
        """analyze files"""
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            target = diag.find_logs(self.diag_dir)
        else:
            raise Exception("no diag dir and no files specified")
        for file in target:
            node = extract_node_name(file)
            with diag.FileWithProgress(file) as log:
                for event in parser.read_log(log, gc.capture_line):
                    if event["event_type"] == "pause":
                        if self.start_time and event["date"] < self.start_time:
                            continue
                        if self.end_time and event["date"] > self.end_time:
                            continue
                        self.__setdates(event["date"], node)
                        self.pauses[node][event["date"]].append(event["duration"])
                        self.gc_types[event["gc_type"]] += 1
        self.analyzed = True

    def __setdates(self, date, node):
        """track start/end times"""
        # global
        if not self.start:
            self.start = date
            self.end = date
        if date > self.end:
            self.end = date
        if date < self.start:
            self.start = date
        # node specific
        if node not in self.starts:
            self.starts[node] = date
            self.ends[node] = date
            return
        if date > self.ends[node]:
            self.ends[node] = date
        if date < self.starts[node]:
            self.starts[node] = date

    def all_pauses(self):
        """get pauses for all nodes"""
        pauses = OrderedDefaultDict(list)
        for pausedata in self.pauses.values():
            for time, pause in pausedata.items():
                pauses[time].extend(pause)
        return pauses

    def print_report(self, interval=3600, by_node=False, top=3):
        """print gc report"""
        print("gcinspector version %s" % VERSION)
        print("")
        if not self.analyzed:
            self.analyze()
        if not self.pauses:
            print("No pauses found")
            return
        if not by_node:
            pauses = self.all_pauses()
            self.__print_gc(
                sorted(
                    bucketize(pauses, start=self.start, end=self.end, seconds=interval).items(),
                    key=lambda t: t[0],
                )
            )
            plist = []
            for time in pauses:
                plist.extend(pauses[time])
            worst_k = heapq.nlargest(top, plist)
            print("Worst pauses in ms:")
            print(worst_k)
        else:
            for node in self.pauses:
                print(node)
                self.__print_gc(
                    sorted(
                        bucketize(
                            self.pauses[node],
                            start=self.starts[node],
                            end=self.ends[node],
                            seconds=interval,
                        ).items(),
                        key=lambda t: t[0],
                    )
                )
                plist = []
                for time, pauses in self.pauses[node].items():
                    plist.extend(pauses)
                worst_k = heapq.nlargest(top, plist)
                print("Worst pauses in ms:")
                print(worst_k)
                print("")
        print("")
        print("Collections by type")
        print("-" * 20)
        for collection, count in self.gc_types.items():
            print("* %s: %s" % (collection, count))
        print("")

    def __print_gc(self, data):
        """print data to the user, expecting datetime keys and list(int) values"""
        print(". <300ms + 301-500ms ! >500ms")
        print("-" * 30)
        busiest = None
        for time, pauses in data:
            total = sum(pauses)
            if not busiest:
                busiest = (time, total)
            elif total > busiest[1]:
                busiest = (time, total)
            print(time.strftime("%Y-%m-%d %H:%M:%S"), end=" ")
            print(len(pauses), end=" ")
            for pause in pauses:
                c = "."
                if pause > 300:
                    c = "+"
                if pause > 500:
                    c = "!"
                print(c, end="")
            print("")
        print("")
        print("busiest period: %s (%sms)" % (busiest[0].strftime("%Y-%m-%d %H:%M:%S"), busiest[1]))
        print("")
        percentiles = [[]]
        percentiles.append(get_percentile_headers("GC pauses"))
        header = [""]
        header.extend(["---" for i in range(6)])
        percentiles.append(header)
        percentiles.append(
            get_percentiles(
                "ms",
                list(itertools.chain.from_iterable(pauses for time, pauses in data)),
                strformat="%i",
            )
        )
        pad_table(percentiles, min_width=11, extra_pad=2)
        for line in percentiles:
            print("".join(line))
        print()
class StatusLogger:
    """status logger"""

    def __init__(
        self,
        diag_dir,
        files=None,
        start=None,
        end=None,
        wanted_stages=None,
        command_name="sperf core statuslogger",
        syslog_prefix="system.log",
        dbglog_prefix="debug.log",
    ):
        self.diag_dir = diag_dir
        self.files = files
        self.wanted_stages = wanted_stages
        if env.DEBUG:
            print("wanted stages:", self.wanted_stages)
        self.nodes = OrderedDefaultDict(Node)
        self.analyzed = False
        self.dumps_analyzed = 0
        self.rule_types = OrderedDefaultDict(int)
        self.command_name = command_name
        self.syslog_prefix = syslog_prefix
        self.dbglog_prefix = dbglog_prefix
        self.start = None
        self.end = None
        if start:
            self.start = date_parse(start)
        if end:
            self.end = date_parse(end)

    def analyze(self):
        """analyze log files"""
        if self.analyzed:
            return
        event_filter = UniqEventPerNodeFilter()
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            target_system = find_logs(self.diag_dir, file_to_find=self.syslog_prefix)
            target_debug = find_logs(self.diag_dir, file_to_find=self.dbglog_prefix)
            target = target_system + target_debug
        else:
            raise Exception("no diag dir and no files specified")
        for f in target:
            nodename = extract_node_name(f, ignore_missing_nodes=True)
            event_filter.set_node(nodename)
            node = self.nodes[nodename]
            if env.DEBUG:
                print("parsing", f)
            with FileWithProgress(f) as log:
                statuslogger_fixer = UnknownStatusLoggerWriter()
                for event in parser.read_system_log(log):
                    statuslogger_fixer.check(event)
                    if self.start and event["date"] < self.start:
                        continue
                    if self.end and event["date"] > self.end:
                        continue
                    self.__setdates(node, statuslogger_fixer.last_event_date)
                    node.lines += 1
                    if event_filter.is_duplicate(event):
                        node.skipped_lines += 1
                        continue
                    if env.DEBUG:
                        if "rule_type" in event:
                            self.rule_types[event["rule_type"]] += 1
                        elif event["event_type"] == "unknown":
                            self.rule_types["unknown"] += 1
                        else:
                            self.rule_types["no type"] += 1
                    if event["event_type"] == "server_version":
                        if event.get("version"):
                            node.version = event["version"]
                            if node.version.startswith("6"):
                                node.cassandra_version = "DSE Private Fork"
                        elif event.get("cassandra_version"):
                            node.cassandra_version = event["cassandra_version"]
                        # skipping solr, spark etc. as they may be too much noise for statuslogger
                    elif event["event_type"] == "memtable_status":
                        tname = ".".join([event["keyspace"], event["table"]])
                        if event["ops"] > node.tables[tname].ops:
                            node.tables[tname].ops = event["ops"]
                        try:
                            if event["data"] > node.tables[tname].data:
                                node.tables[tname].data = event["data"]
                        except Exception as e:
                            print(event)
                            raise e
                    elif event["event_type"] == "pause":
                        node.pauses.append(event["duration"])
                    elif event["event_type"] == "threadpool_header":
                        node.dumps_analyzed += 1
                        self.dumps_analyzed += 1
                    elif event["event_type"] == "threadpool_status":
                        if re.match(r"TPC/\d+$", event["pool_name"]):
                            if not node.version:
                                node.version = "6.x"
                        if "delayed" in event and event["delayed"]:
                            val = event["delayed"]
                            node.stages["local backpressure"][event["pool_name"]].append(val)
                        else:
                            for pool in ["active", "pending", "blocked", "all_time_blocked"]:
                                if pool in event and event[pool]:
                                    if not self.wanted_stages or event["pool_name"].startswith(self.wanted_stages):
                                        node.stages[pool][event["pool_name"]].append(event[pool])
        self.analyzed = True
        if env.DEBUG:
            print(self.rule_types.items())

    def __setdates(self, node, date):
        if not node.start:
            node.start = date
            node.end = date
            return
        if date > node.end:
            node.end = date
        if date < node.start:
            node.start = date

    def print_histogram(self):
        """print histogram report, analyzing if necessary"""
        self.analyze()
        print("%s version: %s" % (self.command_name, VERSION))
        print("")
        print("Histogram")
        print("")
        if not self.nodes:
            print("Nothing found!")
            return
        cassandra_versions = set()
        dse_versions = set()
        for name, node in self.nodes.items():
            cassandra_versions.add(node.cassandra_version)
            # node.version holds the DSE version parsed from the logs
            dse_versions.add(node.version)
            print(name)
            print("-" * 60)
            print("%s lines" % format_num(node.lines))
            print("%s skipped lines" % format_num(node.skipped_lines))
            print("dse version: %s" % (node.version or "unknown"))
            print("cassandra version: %s" % (node.cassandra_version or "unknown"))
            print("log start time: %s" % node.start)
            print("log end time: %s" % node.end)
            if not node.dumps_analyzed:
                print("Nothing found!")
                continue
            print("duration: %s" % format_seconds(int(node.duration().total_seconds())))
            print("stages analyzed: %s" % node.dumps_analyzed)
            if node.pauses:
                percentiles = [[]]
                percentiles.append(get_percentile_headers("GC pauses"))
                header = [""]
                header.extend(["---" for i in range(6)])
                percentiles.append(header)
                percentiles.append(get_percentiles("ms", node.pauses, strformat="%i"))
                pad_table(percentiles, min_width=11, extra_pad=2)
                for line in percentiles:
                    print("".join(line))
                print("total GC events: %s" % len(node.pauses))
            print("")
            ops = node.get_busiest_tables("ops")[:5]
            if ops:
                print("busiest tables (ops)")
                print("-" * 30)
                nlen = max(len(o[0]) for o in ops)
                for n, t in ops:
                    print(n.ljust(nlen), t)
                data = node.get_busiest_tables("data")[:5]
                print("busiest tables (data)")
                print("-" * 30)
                nlen = max(len(d[0]) for d in data)
                for n, t in data:
                    print(n.ljust(nlen), t)
                print("")
            if node.stages:
                percentiles = []
                percentiles.append(get_percentile_headers("stages"))
                percentiles.append([])
                for status, stage in node.stages.items():
                    header = [status.upper()]
                    header.extend("" for i in range(6))
                    percentiles.append(header)
                    for tpname, vals in sorted(stage.items(), key=lambda tpool: max(tpool[1]), reverse=True):
                        percentiles.append(get_percentiles(tpname, vals, strformat="%i"))
                    percentiles.append([])
                pad_table(percentiles, extra_pad=2)
                for line in percentiles:
                    print("".join(line))
                print("")
        self.__print_recs(dse_versions, cassandra_versions)

    def __print_recs(self, versions, cassandra_versions):
        version = None
        cassandra_version = None
        if len(cassandra_versions) > 1:
            print("WARNING more than one version present; assuming no version for recommendations")
        elif len(cassandra_versions) == 1:
            # next(iter(...)) works whether a list or a set was passed in
            cassandra_version = next(iter(cassandra_versions))
        elif len(versions) == 1:
            version = next(iter(versions))
        engine = Engine(version, cassandra_version)
        recs = set()
        for node in self.nodes.values():
            rstage = OrderedDefaultDict(dict)
            for status, stage in node.stages.items():
                for tpname, vals in stage.items():
                    rstage[tpname][status] = max(vals)
            for tpname in rstage:
                for sname in (
                    "active",
                    "pending",
                    "local backpressure",
                    "completed",
                    "blocked",
                    "all_time_blocked",
                ):
                    if sname not in rstage[tpname]:
                        rstage[tpname][sname] = 0
                s = Stage(
                    tpname,
                    rstage[tpname]["active"],
                    rstage[tpname]["pending"],
                    rstage[tpname]["local backpressure"],
                    rstage[tpname]["completed"],
                    rstage[tpname]["blocked"],
                    rstage[tpname]["all_time_blocked"],
                )
                reason, rec = engine.analyze_stage(s)
                if reason:
                    recs.add((rec, reason))
        if recs:
            print("recommendations")
            print("----------------")
            for rec, reason in recs:
                print("* %s (%s)" % (rec, reason))

    def print_summary(self):
        """prints a summary report"""
        self.analyze()
        summary = Summary(self.nodes)
        print("%s version: %s" % (self.command_name, VERSION))
        print("")
        print("Summary (%s lines)" % format_num(summary.lines))
        print("Summary (%s skipped lines)" % format_num(summary.skipped_lines))
        print("")
        print("dse versions: %s" % (set(summary.versions) or "unknown"))
        print("cassandra versions: %s" % (set(summary.cassandra_versions) or "unknown"))
        print("first log time: %s" % summary.start)
        print("last log time: %s" % summary.end)
        if not self.dumps_analyzed:
            print("Nothing found!")
            return
        print("duration: %s" % format_seconds(int(summary.duration.total_seconds())))
        print("total stages analyzed: %s" % self.dumps_analyzed)
        print("total nodes analyzed: %s" % len(summary.nodes))
        pauses = summary.get_pauses()
        if pauses:
            print("")
            percentiles = []
            percentiles.append(get_percentile_headers("GC pauses"))
            percentiles.append(["", "---", "---", "---", "---", "---", "---"])
            percentiles.append(get_percentiles("ms", pauses, strformat="%i"))
            pad_table(percentiles, min_width=11, extra_pad=2)
            for line in percentiles:
                print("".join(line))
            print("total GC events: %s" % len(pauses))
        print("")
        ops = summary.get_busiest_tables("ops")[:5]
        if ops:
            print("busiest tables by ops across all nodes")
            print("-" * 30)
            for onode, (oname, num) in ops:
                print("* %s: %s: %s" % (onode, oname, num))
            print("")
            print("busiest table by data across all nodes")
            print("-" * 30)
            for dnode, (dname, data) in summary.get_busiest_tables("data")[:5]:
                print("* %s: %s: %s" % (dnode, dname, data))
        print("")
        print("busiest stages across all nodes")
        print("-" * 30)
        data = []
        for name, status, stage, value in summary.get_busiest_stages():
            data.append(["* %s %s:" % (stage, status), str(value), "(%s)" % name])
        pad_table(data, extra_pad=2)
        for line in data:
            print("".join(line))
        pending = summary.get_stages_in("pending")
        data = []
        if pending:
            data.append([])
            self.__print_stages("pending", pending, data)
        delayed = summary.get_stages_in("local backpressure")
        if delayed:
            data.append([])
            self.__print_backpressure(delayed, data)
        data.append([])
        pad_table(data, extra_pad=2)
        for line in data:
            print("".join(line))
        self.__print_recs(summary.versions, summary.cassandra_versions)

    def __print_backpressure(self, stages, data):
        nodes = OrderedDict()
        # get total backpressure and max backpressure per node
        for name, stage in stages.items():
            if name not in nodes:
                # initialize so the aggregate code can safely assume a 0 value is present
                agg = OrderedDict()
                agg["total"] = 0
                agg["count"] = 0
                agg["max"] = 0
                nodes[name] = agg
            node_agg = nodes[name]
            for vals in stage.values():
                node_agg["count"] += len(vals)
                node_agg["total"] += sum(vals)
                node_agg["max"] = max(max(vals), node_agg["max"])
        data.append(["busiest LOCAL BACKPRESSURE"])
        data.append(["-" * 30])
        # sort so the node with the worst average backpressure comes first
        for name, val in sorted(
            nodes.items(),
            key=lambda kvp: kvp[1]["total"] / float(kvp[1]["count"]),
            reverse=True,
        ):
            data.append(["%s:" % name])
            avg_backpressure = val["total"] / float(val["count"])
            data.append([" " * 5, "avg backpressure", "%.2f" % avg_backpressure])
            data.append([" " * 5, "max backpressure", str(nodes[name]["max"])])

    def __print_stages(self, status, stages, data):
        data.append(["busiest stages in %s" % status.upper()])
        data.append(["-" * 30])
        # sort so the worst node comes first
        for name, stage in sorted(stages.items(), key=lambda tpool: max(tpool[1].values()), reverse=True):
            data.append(["%s:" % name])
            # sort again so the worst stage on the node comes first
            for tp, vals in sorted(stage.items(), key=lambda stg: max(stg[1]), reverse=True):
                data.append([" " * 5, "%s:" % tp, str(max(vals))])
def __init__(self, diag_dir=None, files=None):
    self.diag_dir = diag_dir
    self.files = files
    self.analyzed = False
    self.jars = OrderedDefaultDict(int)
    self.files_analyzed = 0
class BucketGrep:
    """greps for custom regex and bucketizes results"""

    strayre = r".*"
    basere = r" *(?P<level>[A-Z]*) *\[(?P<thread_name>[^\]]*?)[:_-]?(?P<thread_id>[0-9]*)\] (?P<date>.{10} .{12}) *.*"

    def __init__(
        self,
        regex,
        diag_dir=None,
        files=None,
        start=None,
        end=None,
        ignorecase=True,
        report="summary",
    ):
        self.diag_dir = diag_dir
        self.files = files
        self.start = None
        self.end = None
        self.start_time = None
        self.end_time = None
        self.last_time = None
        self.report = report
        if start:
            self.start_time = date_parse(start)
        if end:
            self.end_time = date_parse(end)
        if ignorecase:
            self.strayregex = re.compile(self.strayre + regex + ".*", re.IGNORECASE)
            self.timeregex = re.compile(self.basere + regex + ".*", re.IGNORECASE)
            self.supplied_regex = regex.lower()
        else:
            self.strayregex = re.compile(self.strayre + regex + ".*")
            self.timeregex = re.compile(self.basere + regex + ".*")
            self.supplied_regex = regex
        self.valid_log_regex = re.compile(self.basere)
        self.node_matches = OrderedDefaultDict()
        self.matches = OrderedDefaultDict(list)
        self.count = 0
        self.unknown = 0
        self.analyzed = False

    def analyze(self):
        """parses logs for results"""
        print("bucketgrep version %s" % VERSION)
        print("search: '%s'" % self.supplied_regex)
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            if self.diag_dir == ".":
                directory_path = os.getcwd()
                print("from directory '%s':" % directory_path)
            else:
                print("from directory '%s':" % self.diag_dir)
            target = diag.find_logs(self.diag_dir)
        else:
            raise Exception("no diag dir and no files specified")
        for file in target:
            with diag.FileWithProgress(file) as log:
                node_name = extract_node_name(file, ignore_missing_nodes=True)
                self.node_matches[node_name] = OrderedDefaultDict(list)
                for line in log:
                    # as long as it's a valid log line we want the date, even if
                    # we don't care about the rest of the line, so we can set the
                    # last date for any strayregex lines that match
                    current_dt = self.valid_log_regex.match(line)
                    if current_dt:
                        dt = date()(current_dt.group("date"))
                        # if the log line is valid we want to set the last_time
                        self.last_time = dt
                        # now check whether our search term matches the log line
                        d = self.timeregex.match(line)
                        if d:
                            # normal case, well-formatted log line
                            self.__setdates(dt)
                            if self.start_time and dt < self.start_time:
                                continue
                            if self.end_time and dt > self.end_time:
                                continue
                            self.matches[dt].append(line)
                            self.node_matches[node_name][dt].append(line)
                            self.count += 1
                    else:
                        # check for a match in an unformatted line, like a traceback
                        m = self.strayregex.match(line)
                        if m:
                            if self.last_time is None:
                                # match, but no previous timestamp to associate with
                                self.unknown += 1
                                continue
                            self.matches[self.last_time].append(line)
                            self.node_matches[node_name][self.last_time].append(line)
                            self.count += 1
        self.analyzed = True

    def __setdates(self, dt):
        if not self.start:
            self.start = dt
            self.end = dt
            return
        if dt > self.end:
            self.end = dt
        if dt < self.start:
            self.start = dt

    def print_report(self, interval=3600):
        """print bucketized result counts"""
        print()
        if not self.analyzed:
            self.analyze()
        if not self.matches:
            print("No matches found")
            if self.unknown:
                print(self.unknown, "matches without timestamp")
            return
        if self.report == "summary":
            print()
            print("cluster wide")
            print("------------")
            buckets = sorted(
                bucketize(self.matches, start=self.start, end=self.end, seconds=interval).items(),
                key=lambda t: t[0],
            )
            maxval = len(max(buckets, key=lambda t: len(t[1]))[1])
            for time, matches in buckets:
                pad = " " * (len(str(maxval)) - len(str(len(matches))))
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S") + pad,
                    len(matches),
                    textbar(maxval, len(matches)),
                )
        else:
            print()
            print()
            print("per node numbers")
            print("----------------")
            for node in sorted(self.node_matches.keys()):
                print()
                print("node: %s" % node)
                print("--------")
                if not len(self.node_matches[node]):
                    print("No matches for %s found" % node)
                    continue
                buckets = sorted(
                    bucketize(
                        self.node_matches[node],
                        start=self.start,
                        end=self.end,
                        seconds=interval,
                    ).items(),
                    key=lambda t: t[0],
                )
                maxval = len(max(buckets, key=lambda t: len(t[1]))[1])
                for time, matches in buckets:
                    pad = " " * (len(str(maxval)) - len(str(len(matches))))
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S") + pad,
                        len(matches),
                        textbar(maxval, len(matches)),
                    )
        if self.unknown:
            print(self.unknown, "matches without timestamp")
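# Usage sketch for BucketGrep above (the regex, directory, and report value are
# hypothetical): any report value other than "summary" selects the per-node
# breakdown.
bg = BucketGrep(r"TombstoneOverwhelming", diag_dir="./diag", report="pernode")
bg.print_report(interval=3600)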
class SysbottleReport:
    "Produces a report from iostat output"

    def __init__(self, infile, conf=None):
        self.infile = infile
        self.parser = IOStatParser()
        self.count = 0
        self.cpu_exceeded = 0
        self.iowait_exceeded = 0
        self.devices = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.cpu_stats = OrderedDefaultDict(list)
        self.queuedepth = OrderedDefaultDict(int)
        self.start = None
        self.end = None
        self.device_index = OrderedDict()
        self.cpu_index = OrderedDict()
        self.conf = conf or self.__mk_conf()
        self.recs = set()
        self.analyzed = False

    def __mk_conf(self):
        conf = OrderedDict()
        conf["iowait_threshold"] = 5
        conf["cpu_threshold"] = 50
        conf["disks"] = []
        conf["queue_threshold"] = 1
        conf["busy_threshold"] = 5
        return conf

    def analyze(self):
        "analyzes the file this class was initialized with"
        for io in self.parser.parse(self.infile):
            self.count += 1
            if not self.device_index:
                self.__mk_col_idx(io)
            self.__analyze_disk(io)
            self.__analyze_cpu(io)
            if not self.start:
                self.start = io["date"]
            if not self.end or io["date"] > self.end:
                self.end = io["date"]
        self.analyzed = True

    def __mk_col_idx(self, stat):
        for i, col in enumerate(stat["device"]["cols"]):
            self.device_index[col] = i
        for i, col in enumerate(stat["cpu"]["cols"]):
            self.cpu_index[col] = i

    def __want_disk(self, name):
        if not self.conf["disks"]:
            return True
        return name in self.conf["disks"]

    def __analyze_disk(self, stat):
        for disk, values in stat["device"]["stat"].items():
            if self.__want_disk(disk):
                for col in self.device_index:
                    val = values[self.device_index[col]]
                    self.devices[disk][col].append(val)
                    if "qu" in col and val >= self.conf["queue_threshold"]:
                        self.queuedepth[disk] += 1
                        self.recs.add("* decrease activity on %s" % disk)

    def __analyze_cpu(self, stat):
        total = 0
        for cpu in ["system", "user", "nice", "steal"]:
            total += stat["cpu"]["stat"][self.cpu_index["%" + cpu]]
        self.cpu_stats["total"].append(total)
        if total > self.conf["cpu_threshold"]:
            self.cpu_exceeded += 1
            self.recs.add("* tune for less CPU usage")
        for col in self.cpu_index:
            val = stat["cpu"]["stat"][self.cpu_index[col]]
            self.cpu_stats[col].append(val)
        if stat["cpu"]["stat"][self.cpu_index["%iowait"]] > self.conf["iowait_threshold"]:
            self.iowait_exceeded += 1
            self.recs.add("* tune for less IO")

    def print_report(self):
        "prints a report for the file this class was initialized with, analyzing if necessary"
        if not self.analyzed:
            self.analyze()
        print("sysbottle version %s" % VERSION)
        print()
        print()
        print("* total records: %s" % self.count)
        if self.count:

            def report_percentage(a):
                return (float(a) / float(self.count)) * 100.0

            print(
                "* total bottleneck time: %.2f%% (cpu bound, io bound, or both)"
                % report_percentage(self.iowait_exceeded + self.cpu_exceeded)
            )
            print(
                "* cpu+system+nice+steal time > %.2f%%: %.2f%%"
                % (self.conf["cpu_threshold"], report_percentage(self.cpu_exceeded))
            )
            print(
                "* iowait time > %.2f%%: %.2f%%"
                % (self.conf["iowait_threshold"], report_percentage(self.iowait_exceeded))
            )
            print("* start %s" % self.start)
            print("* end %s" % self.end)
            log_time_seconds = (self.end - self.start).total_seconds() + 1
            print("* log time: %ss" % log_time_seconds)
            # average sampling interval in seconds
            print("* interval: %ss" % (log_time_seconds / float(self.count)))
            for device in self.devices.keys():
                print(
                    "* %s time at queue depth >= %.2f: %.2f%%"
                    % (
                        device,
                        self.conf["queue_threshold"],
                        report_percentage(self.queuedepth[device]),
                    )
                )
            print()
        lines = []
        lines.append(get_percentile_headers())
        lines.append(["", "---", "---", "---", "---", "---", "---"])
        lines.append(get_percentiles("cpu", self.cpu_stats["total"]))
        lines.append(get_percentiles("iowait", self.cpu_stats["%iowait"]))
        lines.append([])
        lines.append(get_percentile_headers())
        lines.append(["", "---", "---", "---", "---", "---", "---"])
        for device in self.devices:
            lines.append([device, "", "", "", "", "", ""])
            for iotype in self.devices[device].keys():
                if "qu" in iotype or "wait" in iotype:
                    lines.append(get_percentiles("- " + iotype + ":", self.devices[device][iotype]))
        lines.append([])
        humanize.pad_table(lines, 8, 2)
        for line in lines:
            print("".join(line))
        self.print_recommendations()

    def print_recommendations(self):
        """print recommendations"""
        if not self.recs:
            return
        print("recommendations")
        print("-" * 15)
        for rec in self.recs:
            print(rec)
def parse(self, log):
    """parse ttop output"""
    total = OrderedDict()
    threads = OrderedDefaultDict(dict)
    for line in log:
        if self.state is None:
            m = self.begin_match.match(line)
            if m:
                dt = datetime.strptime(m.group("date"), "%Y-%m-%dT%H:%M:%S.%f%z")
                if self.start and dt < self.start:
                    continue
                if self.end and dt > self.end:
                    continue
                total["date"] = dt
                self.state = self.BEGIN
            continue
        if self.state == self.BEGIN:
            m = self.process_match.match(line)
            if not m:
                raise ValueError("process line not found in " + line)
            self.state = self.PROCESS
            total["cpu_total"] = float(m.group("cpu_total"))
            continue
        if self.state == self.PROCESS:
            m = self.application_match.match(line)
            if not m:
                raise ValueError("application line not found in " + line)
            self.state = self.APPLICATION
            total["app_cpu"] = float(m.group("app_cpu"))
            total["user_cpu"] = float(m.group("user_cpu"))
            total["sys_cpu"] = float(m.group("sys_cpu"))
            continue
        if self.state == self.APPLICATION:
            m = self.other_match.match(line)
            if not m:
                raise ValueError("other line not found in '" + line + "'")
            self.state = self.OTHER
            total["other_cpu"] = float(m.group("other_cpu"))
            continue
        if self.state == self.OTHER:
            m = self.thread_match.match(line)
            if not m:
                raise ValueError("thread line not found in '" + line + "'")
            self.state = self.THREAD
            total["thread_count"] = int(m.group("thread_count"))
            continue
        if self.state == self.THREAD:
            m = self.heap_match.match(line)
            if not m:
                raise ValueError("heap line not found in '" + line + "'")
            self.state = self.TINFO
            total["heap_rate"] = self.convert(m.group("heap_rate"), m.group("heap_unit"))
            continue
        if self.state == self.TINFO:
            if line == "\n":
                # blank line ends a sample: emit it and reset for the next one
                self.state = None
                yield total, threads
                total = OrderedDict()
                threads = OrderedDefaultDict(dict)
            else:
                m = self.tinfo_match.match(line)
                if not m:
                    raise ValueError("thread info not found in '" + line + "'")
                name = m.group("thread_name")
                threads[name]["user_cpu"] = float(m.group("user_cpu"))
                threads[name]["sys_cpu"] = float(m.group("sys_cpu"))
                threads[name]["total_cpu"] = threads[name]["user_cpu"] + threads[name]["sys_cpu"]
                threads[name]["heap_rate"] = self.convert(m.group("heap_rate"), m.group("heap_unit"))
            continue
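# Driving the state machine above: parse() is a generator that yields one
# (total, threads) pair per completed ttop sample. The parser instance and
# file name are hypothetical stand-ins for the enclosing class and its input.
with open("ttop.out") as log:
    for total, threads in ttop_parser.parse(log):
        print(total["date"], total["cpu_total"], total["thread_count"])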