Example 1
class Node:
    """represents a cassandra/dse node"""
    def __init__(self):
        self.start = None
        self.end = None
        self.tables = OrderedDefaultDict(Table)
        self.stages = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.pauses = []
        self.version = None
        self.lines = 0
        self.skipped_lines = 0
        self.cassandra_version = None
        self.dumps_analyzed = 0

    def get_busiest_tables(self, by_prop):
        """return busiest tables by_prop (data or ops)"""
        return sorted(
            self.tables.items(),
            key=lambda table: getattr(table[1], by_prop),
            reverse=True,
        )

    def longest_tp_name_length(self):
        """find the length of the thread pool with the longest name"""
        longest = 0
        for stage in self.stages.values():
            slen = len(max(stage, key=len))
            if slen > longest:
                longest = slen
        return longest

    def longest_tp_value_length(self):
        """find the length of the longest value in any threadpool"""
        longest = 0
        for stage in self.stages.values():
            for vals in stage.values():
                vlen = len(max(map(str, vals), key=len))
                if vlen > longest:
                    longest = vlen
        return longest

    def duration(self):
        """duration this node was analyzed"""
        return self.end - self.start
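
A minimal usage sketch of Node.get_busiest_tables, using a hypothetical stand-in for the Table class (OrderedDefaultDict and Table come from the surrounding pysper modules and are not shown here):

from collections import namedtuple

# hypothetical stand-in for pysper's Table; only the sorted attribute matters
FakeTable = namedtuple("FakeTable", ["ops", "data"])

tables = {
    "ks.events": FakeTable(ops=9800, data=12000),
    "ks.users": FakeTable(ops=1200, data=50000),
}

# same sort key as Node.get_busiest_tables: descending by the named attribute
busiest = sorted(tables.items(), key=lambda t: getattr(t[1], "ops"), reverse=True)
print(busiest[0][0])  # -> ks.events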
Example 2
class StatusLogger:
    """status logger"""
    def __init__(
        self,
        diag_dir,
        files=None,
        start=None,
        end=None,
        wanted_stages=None,
        command_name="sperf core statuslogger",
        syslog_prefix="system.log",
        dbglog_prefix="debug.log",
    ):
        self.diag_dir = diag_dir
        self.files = files
        self.wanted_stages = wanted_stages
        if env.DEBUG:
            print("wanted stages:", self.wanted_stages)
        self.nodes = OrderedDefaultDict(Node)
        self.analyzed = False
        self.dumps_analyzed = 0
        self.rule_types = OrderedDefaultDict(int)
        self.command_name = command_name
        self.syslog_prefix = syslog_prefix
        self.dbglog_prefix = dbglog_prefix
        self.start = None
        self.end = None
        if start:
            self.start = date_parse(start)
        if end:
            self.end = date_parse(end)

    def analyze(self):
        """analyze log files"""
        if self.analyzed:
            return
        event_filter = UniqEventPerNodeFilter()
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            target_system = find_logs(self.diag_dir,
                                      file_to_find=self.syslog_prefix)
            target_debug = find_logs(self.diag_dir,
                                     file_to_find=self.dbglog_prefix)
            target = target_system + target_debug
        else:
            raise Exception("no diag dir and no files specified")
        for f in target:
            nodename = extract_node_name(f, ignore_missing_nodes=True)
            event_filter.set_node(nodename)
            node = self.nodes[nodename]
            if env.DEBUG:
                print("parsing", f)
            with FileWithProgress(f) as log:
                statuslogger_fixer = UnknownStatusLoggerWriter()
                for event in parser.read_system_log(log):
                    statuslogger_fixer.check(event)
                    if self.start and event["date"] < self.start:
                        continue
                    if self.end and event["date"] > self.end:
                        continue
                    self.__setdates(node, statuslogger_fixer.last_event_date)
                    node.lines += 1
                    if event_filter.is_duplicate(event):
                        node.skipped_lines += 1
                        continue
                    if env.DEBUG:
                        if "rule_type" in event:
                            self.rule_types[event["rule_type"]] += 1
                        elif event["event_type"] == "unknown":
                            self.rule_types["unknown"] += 1
                        else:
                            self.rule_types["no type"] += 1
                    if event["event_type"] == "server_version":
                        if event.get("version"):
                            node.version = event["version"]
                            if node.version.startswith("6"):
                                node.cassandra_version = "DSE Private Fork"
                        elif event.get("cassandra_version"):
                            node.cassandra_version = event["cassandra_version"]
                        # skipping solr, spark, etc. as they may be too much noise for statuslogger
                    elif event["event_type"] == "memtable_status":
                        tname = ".".join([event["keyspace"], event["table"]])
                        if event["ops"] > node.tables[tname].ops:
                            node.tables[tname].ops = event["ops"]
                        try:
                            if event["data"] > node.tables[tname].data:
                                node.tables[tname].data = event["data"]
                        except Exception:
                            # dump the offending event before re-raising
                            print(event)
                            raise
                    elif event["event_type"] == "pause":
                        node.pauses.append(event["duration"])
                    elif event["event_type"] == "threadpool_header":
                        node.dumps_analyzed += 1
                        self.dumps_analyzed += 1
                    elif event["event_type"] == "threadpool_status":
                        if re.match(r"TPC/\d+$", event["pool_name"]):
                            if not node.version:
                                node.version = "6.x"
                            if "delayed" in event and event["delayed"]:
                                val = event["delayed"]
                                node.stages["local backpressure"][
                                    event["pool_name"]].append(val)
                        else:
                            for pool in [
                                    "active",
                                    "pending",
                                    "blocked",
                                    "all_time_blocked",
                            ]:
                                if pool in event and event[pool]:
                                    if not self.wanted_stages or event[
                                            "pool_name"].startswith(
                                                self.wanted_stages):
                                        node.stages[pool][
                                            event["pool_name"]].append(
                                                event[pool])
        self.analyzed = True
        if env.DEBUG:
            print(self.rule_types.items())

    def __setdates(self, node, date):
        if not node.start:
            node.start = date
            node.end = date
            return
        if date > node.end:
            node.end = date
        if date < node.start:
            node.start = date

    def print_histogram(self):
        """print histogram report, analyzing if necessary"""
        self.analyze()
        print("%s version: %s" % (self.command_name, VERSION))
        print("")
        print("Histogram")
        print("")
        if not self.nodes:
            print("Nothing found!")
            return
        cassandra_versions = set()
        dse_versions = set()
        for name, node in self.nodes.items():
            cassandra_versions.add(node.cassandra_version)
            dse_versions.add(node.version)
            print(name)
            print("-" * 60)
            print("%s lines" % format_num(node.lines))
            print("%s skipped lines" % format_num(node.skipped_lines))
            print("dse version: %s" % (node.version or "unknown"))
            print("cassandra version: %s" %
                  (node.cassandra_version or "unknown"))
            print("log start time: %s" % node.start)
            print("log end time: %s" % node.end)
            if not node.dumps_analyzed:
                print("Nothing found!")
                continue
            print("duration: %s" %
                  format_seconds(int(node.duration().total_seconds())))
            print("stages analyzed: %s" % node.dumps_analyzed)
            if node.pauses:
                percentiles = [[]]
                percentiles.append(get_percentile_headers("GC pauses"))
                header = [""]
                header.extend(["---" for i in range(6)])
                percentiles.append(header)
                percentiles.append(
                    get_percentiles("ms", node.pauses, strformat="%i"))
                pad_table(percentiles, min_width=11, extra_pad=2)
                for line in percentiles:
                    print("".join(line))
                print("total GC events: %s" % len(node.pauses))
            print("")
            ops = node.get_busiest_tables("ops")[:5]
            if ops:
                print("busiest tables (ops)")
                print("-" * 30)
                nlen = max(len(o[0]) for o in ops)
                for n, t in ops:
                    print(n.ljust(nlen), t)
                data = node.get_busiest_tables("data")[:5]
                print("busiest tables (data)")
                print("-" * 30)
                nlen = max(len(d[0]) for d in data)
                for n, t in data:
                    print(n.ljust(nlen), t)
            print("")
            if node.stages:
                percentiles = []
                percentiles.append(get_percentile_headers("stages"))
                percentiles.append([])
                for status, stage in node.stages.items():
                    header = [status.upper()]
                    header.extend("" for i in range(6))
                    percentiles.append(header)
                    for tpname, vals in sorted(stage.items(),
                                               key=lambda tpool: max(tpool[1]),
                                               reverse=True):
                        percentiles.append(
                            get_percentiles(tpname, vals, strformat="%i"))
                    percentiles.append([])
                pad_table(percentiles, extra_pad=2)
                for line in percentiles:
                    print("".join(line))
        print("")
        self.__print_recs(dse_versions, cassandra_versions)

    def __print_recs(self, versions, cassandra_versions):
        version = None
        cassandra_version = None
        if len(cassandra_versions) > 1:
            print(
                "WARNING: more than one version present, assuming no version for recommendations"
            )
        elif len(cassandra_versions) == 1:
            cassandra_version = next(iter(cassandra_versions))
        elif len(versions) == 1:
            version = next(iter(versions))
        engine = Engine(version, cassandra_version)
        recs = set()
        for node in self.nodes.values():
            rstage = OrderedDefaultDict(dict)
            for status, stage in node.stages.items():
                for tpname, vals in stage.items():
                    rstage[tpname][status] = max(vals)
            for tpname in rstage:
                for sname in (
                        "active",
                        "pending",
                        "local backpressure",
                        "completed",
                        "blocked",
                        "all_time_blocked",
                ):
                    if sname not in rstage[tpname]:
                        rstage[tpname][sname] = 0
                s = Stage(
                    tpname,
                    rstage[tpname]["active"],
                    rstage[tpname]["pending"],
                    rstage[tpname]["local backpressure"],
                    rstage[tpname]["completed"],
                    rstage[tpname]["blocked"],
                    rstage[tpname]["all_time_blocked"],
                )
                reason, rec = engine.analyze_stage(s)
                if reason:
                    recs.add((rec, reason))
        if recs:
            print("recommmendations")
            print("----------------")
        for rec, reason in recs:
            print("* %s (%s)" % (rec, reason))

    def print_summary(self):
        """prints a summary report"""
        self.analyze()
        summary = Summary(self.nodes)
        print("%s version: %s" % (self.command_name, VERSION))
        print("")
        print("Summary (%s lines)" % format_num(summary.lines))
        print("Summary (%s skipped lines)" % format_num(summary.skipped_lines))
        print("")
        print("dse versions: %s" % (set(summary.versions) or "unknown"))
        print("cassandra versions: %s" %
              (set(summary.cassandra_versions) or "unknown"))
        print("first log time: %s" % summary.start)
        print("last log time: %s" % summary.end)
        if not self.dumps_analyzed:
            print("Nothing found!")
            return
        print("duration: %s" %
              format_seconds(int(summary.duration.total_seconds())))
        print("total stages analyzed: %s" % self.dumps_analyzed)
        print("total nodes analyzed: %s" % len(summary.nodes))
        pauses = summary.get_pauses()
        if pauses:
            print("")
            percentiles = []
            percentiles.append(get_percentile_headers("GC pauses"))
            percentiles.append(["", "---", "---", "---", "---", "---", "---"])
            percentiles.append(get_percentiles("ms", pauses, strformat="%i"))
            pad_table(percentiles, min_width=11, extra_pad=2)
            for line in percentiles:
                print("".join(line))
            print("total GC events: %s" % len(pauses))
        print("")
        ops = summary.get_busiest_tables("ops")[:5]
        if ops:
            print("busiest tables by ops across all nodes")
            print("-" * 30)
            for onode, (oname, num) in ops:
                print("* %s: %s: %s" % (onode, oname, num))
            print("")
            print("busiest table by data across all nodes")
            print("-" * 30)
            for dnode, (dname, data) in summary.get_busiest_tables("data")[:5]:
                print("* %s: %s: %s" % (dnode, dname, data))
            print("")
        print("busiest stages across all nodes")
        print("-" * 30)
        data = []
        for (name, status, stage, value) in summary.get_busiest_stages():
            data.append(
                ["* %s %s:" % (stage, status),
                 str(value),
                 "(%s)" % name])
        pad_table(data, extra_pad=2)
        for line in data:
            print("".join(line))
        pending = summary.get_stages_in("pending")
        data = []
        if pending:
            data.append([])
            self.__print_stages("pending", pending, data)
        delayed = summary.get_stages_in("local backpressure")
        if delayed:
            data.append([])
            self.__print_backpressure(delayed, data)
        data.append([])
        pad_table(data, extra_pad=2)
        for line in data:
            print("".join(line))
        self.__print_recs(summary.versions, summary.cassandra_versions)

    def __print_backpressure(self, stages, data):
        nodes = OrderedDict()
        # aggregate count, total, and max backpressure per node
        for name, stage in stages.items():
            if name not in nodes:
                # initialize on first sight so the aggregation below can assume zeroed values
                agg = OrderedDict()
                agg["total"] = 0
                agg["count"] = 0
                agg["max"] = 0
                nodes[name] = agg
            node_agg = nodes[name]
            for vals in stage.values():
                node_agg["count"] += len(vals)
                node_agg["total"] += sum(vals)
                node_agg["max"] = max(max(vals), node_agg["max"])
        data.append(["busiest LOCAL BACKPRESSURE"])
        data.append(["-" * 30])
        # sort so the node with the worst average backpressure comes first
        for name, val in sorted(
                nodes.items(),
                key=lambda kvp: kvp[1]["total"] / float(kvp[1]["count"]),
                reverse=True,
        ):
            data.append(["%s:" % name])
            avg_backpressure = val["total"] / float(val["count"])
            data.append(
                [" " * 5, "avg backpressure",
                 "%.2f" % avg_backpressure])
            data.append([" " * 5, "max backpressure", str(nodes[name]["max"])])

    def __print_stages(self, status, stages, data):
        data.append(["busiest stages in %s" % status.upper()])
        data.append(["-" * 30])
        # sort to make the worst node first
        for name, stage in sorted(stages.items(),
                                  key=lambda tpool: max(tpool[1].values()),
                                  reverse=True):
            data.append(["%s:" % name])
            # sort again to make the worst stage on the node first
            for tp, vals in sorted(stage.items(),
                                   key=lambda stg: max(stg[1]),
                                   reverse=True):
                data.append([" " * 5, "%s:" % tp, str(max(vals))])
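
A hedged driver sketch for StatusLogger; the import path assumes the usual pysper package layout, and the diagnostic directory is a placeholder:

from pysper.core.statuslogger import StatusLogger  # assumed module path

sl = StatusLogger(
    "/path/to/diag",              # hypothetical extracted diag tarball
    start="2020-01-01 00:00:00",  # optional bounds, parsed via date_parse
    end="2020-01-02 00:00:00",
)
sl.print_summary()    # triggers analyze() lazily on first use
sl.print_histogram()  # analyze() is a no-op once self.analyzed is set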
Example 3
class GCInspector:
    """GCInspector class"""
    def __init__(self, diag_dir=None, files=None, start=None, end=None):
        self.diag_dir = diag_dir
        self.files = files
        self.pauses = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.gc_types = OrderedDefaultDict(int)
        self.start = None
        self.end = None
        self.starts = OrderedDefaultDict(datetime.datetime)
        self.ends = OrderedDefaultDict(datetime.datetime)
        self.analyzed = False
        self.start_time = None
        self.end_time = None
        if start:
            self.start_time = date_parse(start)
        if end:
            self.end_time = date_parse(end)

    def analyze(self):
        """analyze files"""
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            target = diag.find_logs(self.diag_dir)
        else:
            raise Exception("no diag dir and no files specified")
        for file in target:
            node = extract_node_name(file)
            with diag.FileWithProgress(file) as log:
                for event in parser.read_log(log, gc.capture_line):
                    if event["event_type"] == "pause":
                        if self.start_time and event["date"] < self.start_time:
                            continue
                        if self.end_time and event["date"] > self.end_time:
                            continue
                        self.__setdates(event["date"], node)
                        self.pauses[node][event["date"]].append(
                            event["duration"])
                        self.gc_types[event["gc_type"]] += 1
        self.analyzed = True

    def __setdates(self, date, node):
        """track start/end times"""
        # global
        if not self.start:
            self.start = date
            self.end = date
        if date > self.end:
            self.end = date
        if date < self.start:
            self.start = date
        # node specific
        if node not in self.starts:
            self.starts[node] = date
            self.ends[node] = date
            return
        if date > self.ends[node]:
            self.ends[node] = date
        if date < self.starts[node]:
            self.starts[node] = date

    def all_pauses(self):
        """get pauses for all nodes"""
        pauses = OrderedDefaultDict(list)
        for pausedata in self.pauses.values():
            for time, pause in pausedata.items():
                pauses[time].extend(pause)
        return pauses

    def print_report(self, interval=3600, by_node=False, top=3):
        """print gc report"""
        print("gcinspector version %s" % VERSION)
        print("")
        if not self.analyzed:
            self.analyze()
        if not self.pauses:
            print("No pauses found")
            return
        if not by_node:
            pauses = self.all_pauses()
            self.__print_gc(
                sorted(
                    bucketize(pauses,
                              start=self.start,
                              end=self.end,
                              seconds=interval).items(),
                    key=lambda t: t[0],
                ))
            plist = []
            for time in pauses:
                plist.extend(pauses[time])
            worst_k = heapq.nlargest(top, plist)
            print("Worst pauses in ms:")
            print(worst_k)

        else:
            for node in self.pauses:
                print(node)
                self.__print_gc(
                    sorted(
                        bucketize(
                            self.pauses[node],
                            start=self.starts[node],
                            end=self.ends[node],
                            seconds=interval,
                        ).items(),
                        key=lambda t: t[0],
                    ))
                plist = []
                for time, pauses in self.pauses[node].items():
                    plist.extend(pauses)
                worst_k = heapq.nlargest(top, plist)
                print("Worst pauses in ms:")
                print(worst_k)
                print("")
        print("")
        print("Collections by type")
        print("-" * 20)
        for collection, count in self.gc_types.items():
            print("* %s: %s" % (collection, count))
        print("")

    def __print_gc(self, data):
        """print data to the user, expecting datetime keys and list(int) values"""
        print(". <300ms + 301-500ms ! >500ms")
        print("-" * 30)
        busiest = None
        for time, pauses in data:
            total = sum(pauses)
            if not busiest:
                busiest = (time, total)
            elif total > busiest[1]:
                busiest = (time, total)
            print(time.strftime("%Y-%m-%d %H:%M:%S"), end=" ")
            print(len(pauses), end=" ")
            for pause in pauses:
                c = "."
                if pause > 300:
                    c = "+"
                if pause > 500:
                    c = "!"
                print(c, end="")
            print("")
        print("")
        print("busiest period: %s (%sms)" %
              (busiest[0].strftime("%Y-%m-%d %H:%M:%S"), busiest[1]))
        print("")
        percentiles = [[]]
        percentiles.append(get_percentile_headers("GC pauses"))
        header = [""]
        header.extend(["---" for i in range(6)])
        percentiles.append(header)
        percentiles.append(
            get_percentiles(
                "ms",
                list(
                    itertools.chain.from_iterable(pauses
                                                  for time, pauses in data)),
                strformat="%i",
            ))
        pad_table(percentiles, min_width=11, extra_pad=2)
        for line in percentiles:
            print("".join(line))
        print()
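
A similar hedged sketch for GCInspector; again the import path and diag directory are assumptions:

from pysper.core.gcinspector import GCInspector  # assumed module path

gci = GCInspector(diag_dir="/path/to/diag")  # hypothetical diag dir
# hourly buckets, reported per node, showing the 5 worst pauses each
gci.print_report(interval=3600, by_node=True, top=5)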