Example #1
 def __init__(
     self,
     diag_dir,
     files=None,
     start=None,
     end=None,
     wanted_stages=None,
     command_name="sperf core statuslogger",
     syslog_prefix="system.log",
     dbglog_prefix="debug.log",
 ):
     self.diag_dir = diag_dir
     self.files = files
     self.wanted_stages = wanted_stages
     if env.DEBUG:
         print("wanted stages:", self.wanted_stages)
     self.nodes = OrderedDefaultDict(Node)
     self.analyzed = False
     self.dumps_analyzed = 0
     self.rule_types = OrderedDefaultDict(int)
     self.command_name = command_name
     self.syslog_prefix = syslog_prefix
     self.dbglog_prefix = dbglog_prefix
     self.start = None
     self.end = None
     if start:
         self.start = date_parse(start)
     if end:
         self.end = date_parse(end)
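All of these snippets lean on OrderedDefaultDict, which is not in the standard library; a minimal sketch, assuming it simply combines OrderedDict ordering with defaultdict-style factories:

from collections import OrderedDict

class OrderedDefaultDict(OrderedDict):
    """OrderedDict that builds missing values from a factory, like defaultdict."""

    def __init__(self, default_factory=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.default_factory = default_factory

    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        # create, store, and return the default so insertion order is kept
        self[key] = value = self.default_factory()
        return value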
Example #2
 def __init__(self):
     self.start = None
     self.end = None
     self.tables = OrderedDefaultDict(Table)
     self.stages = OrderedDefaultDict(lambda: defaultdict(list))
     self.pauses = []
     self.version = None
     self.lines = 0
     self.skipped_lines = 0
     self.cassandra_version = None
     self.dumps_analyzed = 0
Example #3
 def collate_threads(self, threads):
     """combines similar threads by stripping id suffixes from their names"""
     ret = OrderedDefaultDict(lambda: OrderedDefaultDict(float))
     # strip ":<id>", "-<digits>...", and "-/..." suffixes so per-id
     # threads collapse under one shared name
     exprs = [r':.*', r'-\d+.*', r'-\/.*']
     for thread in threads:
         name = thread
         for e in exprs:
             name = re.sub(e, '', name)
         for t, v in threads[thread].items():
             ret[name][t] = round(ret[name][t] + v, 2)
             # incremented once per merged metric, not once per thread
             ret[name]['thread_count'] += 1
     return ret
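A quick usage sketch (thread names, metrics, and the TTopAnalyzer instance are hypothetical) showing how per-id threads collapse into one entry:

threads = {
    "CompactionExecutor:1": {"total_cpu": 1.25},
    "CompactionExecutor:2": {"total_cpu": 0.75},
}
collated = ttop.collate_threads(threads)  # ttop: assumed analyzer instance
# collated["CompactionExecutor"]["total_cpu"] == 2.0
# collated["CompactionExecutor"]["thread_count"] == 2 (one per merged metric)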
Example #4
 def analyze(self):
     """analyze log files"""
     error_if_file_not_found = False
     if self.files:
         error_if_file_not_found = True
         target = self.files
     elif self.diag_dir:
         target = diag.find_logs(self.diag_dir, "output.log")
     else:
         self.analyzed = True
         return
     for file in target:
         with diag.FileWithProgress(file) as log:
             if not log.file_desc and error_if_file_not_found:
                 raise FileNotFoundError(log.error)
             for event in parser.read_output_log(log):
                 if event["event_type"] == "classpath":
                     thisjars = OrderedDefaultDict(int)
                     for jar in event["classpath"].split(":"):
                         j = jar.split("/")[-1]
                         if j.endswith("jar"):
                             # skip duplicate jar entries within the same file;
                             # the same jar can appear on the classpath repeatedly
                             if j not in thisjars:
                                 thisjars[j] += 1
                                 self.jars[j] += 1
             self.files_analyzed += 1
     self.analyzed = True
Example #5
 def all_pauses(self):
     """get pauses for all nodes"""
     pauses = OrderedDefaultDict(list)
     for pausedata in self.pauses.values():
         for time, pause in pausedata.items():
             pauses[time].extend(pause)
     return pauses
Example #6
 def generate(self, parsed):
     """generates a time series report for a tarball"""
     table = []
     table.append("")
     table.append("filter cache evictions by hour")
     table.append("------------------------------")
     events_by_datetime = OrderedDefaultDict(list)
     start = dates.max_utc_time()
     end = dates.min_utc_time()
     for node, events in parsed["nodes"].items():
          for info in events.get("evictions", []):
             # put into structure we can use for bucketize
             for value in info.values():
                 if value.time_stamp > end:
                     end = value.time_stamp
                 if value.time_stamp < start:
                     start = value.time_stamp
                 events_by_datetime[value.time_stamp].append(value)
     buckets = sorted(
         util.bucketize(events_by_datetime, start, end, 3600).items(),
         key=lambda t: t[0],
     )
     maxval = len(max(buckets, key=lambda t: len(t[1]))[1])
     for time, matches in buckets:
         pad = ""
         for x in range(len(str(maxval)) - len(str(len(matches)))):
             pad += " "
         table.append("%s %s %s" % (
             time.strftime("%Y-%m-%d %H:%M:%S") + pad,
             len(matches),
             util.textbar(maxval, len(matches)),
         ))
     return "\n".join(table)
Example #7
 def __init__(self,
              regex,
              diag_dir=None,
              files=None,
              start=None,
              end=None,
              ignorecase=True):
     self.diag_dir = diag_dir
     self.files = files
     self.start = None
     self.end = None
     self.start_time = None
     self.end_time = None
     self.last_time = None
     if start:
         self.start_time = date_parse(start)
     if end:
         self.end_time = date_parse(end)
     if ignorecase:
         self.strayregex = re.compile(self.strayre + regex + ".*",
                                      re.IGNORECASE)
         self.timeregex = re.compile(self.basere + regex + ".*",
                                     re.IGNORECASE)
         self.supplied_regex = regex.lower()
     else:
         self.strayregex = re.compile(self.strayre + regex + ".*")
         self.timeregex = re.compile(self.basere + regex + ".*")
         self.supplied_regex = regex
     self.valid_log_regex = re.compile(self.basere)
     self.matches = OrderedDefaultDict(list)
     self.count = 0
     self.unknown = 0
     self.analyzed = False
Example #8
    def __init__(self, infile, conf=None):
        self.infile = infile
        self.parser = IOStatParser()

        self.count = 0
        self.cpu_exceeded = 0
        self.iowait_exceeded = 0
        self.devices = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.cpu_stats = OrderedDefaultDict(list)
        self.queuedepth = OrderedDefaultDict(int)
        self.start = None
        self.end = None

        self.device_index = OrderedDict()
        self.cpu_index = OrderedDict()
        self.conf = conf or self.__mk_conf()
        self.recs = set()
        self.analyzed = False
Example #9
class Node:
    """represents a cassandra/dse node"""
    def __init__(self):
        self.start = None
        self.end = None
        self.tables = OrderedDefaultDict(Table)
        self.stages = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.pauses = []
        self.version = None
        self.lines = 0
        self.skipped_lines = 0
        self.cassandra_version = None
        self.dumps_analyzed = 0

    def get_busiest_tables(self, by_prop):
        """return busiest tables by_prop (data or ops)"""
        return sorted(
            self.tables.items(),
            key=lambda table: getattr(table[1], by_prop),
            reverse=True,
        )

    def longest_tp_name_length(self):
        """find the length of the thread pool with the longest name"""
        longest = 0
        for stage in self.stages.values():
            slen = len(max(stage, key=len))
            if slen > longest:
                longest = slen
        return longest

    def longest_tp_value_length(self):
        """find the length of the longest value in any threadpool"""
        longest = 0
        for stage in self.stages.values():
            for vals in stage.values():
                vlen = len(max(map(str, vals), key=len))
                if vlen > longest:
                    longest = vlen
        return longest

    def duration(self):
        """duration this node was analyzed"""
        return self.end - self.start
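A usage sketch of the busiest-table ranking, assuming Table exposes mutable ops/data counters (names and values hypothetical):

node = Node()
node.tables["ks.tbl_a"].ops = 100   # Table() is created on first access
node.tables["ks.tbl_b"].ops = 250
ranked = node.get_busiest_tables("ops")
# [("ks.tbl_b", <Table ops=250>), ("ks.tbl_a", <Table ops=100>)]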
Example #10
 def analyze(self):
     """parses logs for results"""
     print("bucketgrep version %s" % VERSION)
     print("search: '%s'" % self.supplied_regex)
     target = None
     if self.files:
         target = self.files
     elif self.diag_dir:
         if self.diag_dir == ".":
             directory_path = os.getcwd()
             print("from directory '%s':" % directory_path)
         else:
             print("from directory '%s':" % self.diag_dir)
         target = diag.find_logs(self.diag_dir)
     else:
         raise Exception("no diag dir and no files specified")
     for file in target:
         with diag.FileWithProgress(file) as log:
             node_name = extract_node_name(file, ignore_missing_nodes=True)
             self.node_matches[node_name] = OrderedDefaultDict(list)
             for line in log:
                  # as long as it's a valid log line we want the date, even if
                  # we don't care about the rest of the line, so we can set
                  # the last date for any strayregex lines that match later
                 current_dt = self.valid_log_regex.match(line)
                 if current_dt:
                     dt = date()(current_dt.group("date"))
                      # if the log line is valid we want to set the last_time
                     self.last_time = dt
                 # we now can validate if our search term matches the log line
                 d = self.timeregex.match(line)
                 if d:
                     # normal case, well-formatted log line
                     self.__setdates(dt)
                     if self.start_time and dt < self.start_time:
                         continue
                     if self.end_time and dt > self.end_time:
                         continue
                     self.matches[dt].append(line)
                     self.node_matches[node_name][dt].append(line)
                     self.count += 1
                 else:
                     m = self.strayregex.match(line)
                     # check for a match in an unformatted line, like a traceback
                     if m:
                         if self.last_time is None:
                             # match, but no previous timestamp to associate with
                             self.unknown += 1
                             continue
                         self.matches[self.last_time].append(line)
                         self.node_matches[node_name][
                             self.last_time].append(line)
                         self.count += 1
     self.analyzed = True
Example #11
    def __init__(self, children):
        """
        Constructor expects to be passed one or more case and rule objects. The case objects are
        used to group the rules.
        """

        self.rules = OrderedDefaultDict(list)
        keys = None
        for child in children:
            if isinstance(child, case):
                keys = child.keys
            else:
                for key in keys:
                    self.rules[key].append(child)
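The loop pairs each rule with the keys of the most recent case object; a self-contained sketch with stand-in names (Grouping and this case signature are assumptions, not the real classes):

from collections import defaultdict

class case:
    """stand-in: carries the keys that group the rules following it"""
    def __init__(self, *keys):
        self.keys = keys

class Grouping:
    """stand-in owner of the constructor above"""
    def __init__(self, children):
        self.rules = defaultdict(list)
        keys = None
        for child in children:
            if isinstance(child, case):
                keys = child.keys
            else:
                for key in keys:
                    self.rules[key].append(child)

g = Grouping([case("6.8", "6.7"), "heap_rule", case("6.0"), "gc_rule"])
# g.rules == {"6.8": ["heap_rule"], "6.7": ["heap_rule"], "6.0": ["gc_rule"]}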
Example #12
 def __init__(self, diag_dir, files=None, start=None, end=None):
     self.diag_dir = diag_dir
     self.files = files
     self.parser = SlowQueryParser()
     self.querytimes = OrderedDefaultDict(list)
     self.queries = []
     self.analyzed = False
     self.start = None
     self.end = None
     self.cross = 0
     self.start_time = None
     self.end_time = None
     if start:
         self.start_time = date_parse(start)
     if end:
         self.end_time = date_parse(end)
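date_parse appears throughout these constructors to turn user-supplied bounds into datetimes; a minimal sketch, assuming a single fixed format (the real parser likely accepts more):

from datetime import datetime

def date_parse(s):
    """sketch: parse a bound like '2020-01-01 00:00:00'"""
    return datetime.strptime(s, "%Y-%m-%d %H:%M:%S")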
Example #13
 def __print_recs(self, versions, cassandra_versions):
     version = None
     cassandra_version = None
     if len(cassandra_versions) > 1:
         print(
             "WARNING more than one version present assuming no version with recommendations"
         )
     elif len(cassandra_versions) == 1:
         cassandra_version = cassandra_versions[0]
     elif len(versions) == 1:
         version = versions[0]
     engine = Engine(version, cassandra_version)
     recs = set()
     for node in self.nodes.values():
         rstage = OrderedDefaultDict(dict)
         for status, stage in node.stages.items():
             for tpname, vals in stage.items():
                 rstage[tpname][status] = max(vals)
         for tpname in rstage:
             for sname in (
                     "active",
                     "pending",
                     "local backpressure",
                     "completed",
                     "blocked",
                     "all_time_blocked",
             ):
                 if sname not in rstage[tpname]:
                     rstage[tpname][sname] = 0
             s = Stage(
                 tpname,
                 rstage[tpname]["active"],
                 rstage[tpname]["pending"],
                 rstage[tpname]["local backpressure"],
                 rstage[tpname]["completed"],
                 rstage[tpname]["blocked"],
                 rstage[tpname]["all_time_blocked"],
             )
             reason, rec = engine.analyze_stage(s)
             if reason:
                 recs.add((rec, reason))
     if recs:
         print("recommmendations")
         print("----------------")
     for rec, reason in recs:
         print("* %s (%s)" % (rec, reason))
Example #14
 def __init__(self, diag_dir=None, files=None, start=None, end=None):
     self.diag_dir = diag_dir
     self.files = files
     self.pauses = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
     self.gc_types = OrderedDefaultDict(int)
     self.start = None
     self.end = None
     self.starts = OrderedDefaultDict(datetime.datetime)
     self.ends = OrderedDefaultDict(datetime.datetime)
     self.analyzed = False
     self.start_time = None
     self.end_time = None
     if start:
         self.start_time = date_parse(start)
     if end:
         self.end_time = date_parse(end)
Example #15
 def __print_recs(self):
     engine = Engine()
     recs = set()
     for node in self.nodes.values():
         rstage = OrderedDefaultDict(dict)
         for status, stage in node.stages.items():
             for tpname, vals in stage.items():
                 rstage[tpname][status] = max(vals)
         for tpname in rstage:
             for sname in (
                     "active",
                     "pending",
                     "local backpressure",
                     "completed",
                     "blocked",
                     "all_time_blocked",
             ):
                 if sname not in rstage[tpname]:
                     rstage[tpname][sname] = 0
             s = Stage(
                 tpname,
                 rstage[tpname]["active"],
                 rstage[tpname]["pending"],
                 rstage[tpname]["local backpressure"],
                 rstage[tpname]["completed"],
                 rstage[tpname]["blocked"],
                 rstage[tpname]["all_time_blocked"],
             )
             reason, rec = engine.analyze_stage(s)
             if reason:
                 recs.add((rec, reason))
     if recs:
         print("recommmendations")
         print("----------------")
     for rec, reason in recs:
         print("* %s (%s)" % (rec, reason))
Example #16
 def parse(self, log):
     """ parse ttop output """
     total = OrderedDict()
     threads = OrderedDefaultDict(dict)
     for line in log:
         if self.state is None:
             m = self.begin_match.match(line)
             if m:
                 dt = date('%Y-%m-%dT%H:%M:%S.%f%z')(m.group('date'))
                 if self.start and dt < self.start:
                     continue
                 if self.end and dt > self.end:
                     continue
                 total['date'] = dt
                 self.state = self.BEGIN
                 continue
         if self.state == self.BEGIN:
             m = self.process_match.match(line)
             if not m:
                 raise ValueError("process line not found in " + line)
             self.state = self.PROCESS
             total['cpu_total'] = float(m.group('cpu_total'))
             continue
         if self.state == self.PROCESS:
             m = self.application_match.match(line)
             if not m:
                 raise ValueError("application line not found in " + line)
             self.state = self.APPLICATION
             total['app_cpu'] = float(m.group('app_cpu'))
             total['user_cpu'] = float(m.group('user_cpu'))
             total['sys_cpu'] = float(m.group('sys_cpu'))
             continue
         if self.state == self.APPLICATION:
             m = self.other_match.match(line)
             if not m:
                 raise ValueError("other line not found in '" + line + "'")
             self.state = self.OTHER
             total['other_cpu'] = float(m.group('other_cpu'))
             continue
         if self.state == self.OTHER:
             m = self.thread_match.match(line)
             if not m:
                 raise ValueError("thread line not found in '" + line + "'")
             self.state = self.THREAD
             total['thread_count'] = int(m.group('thread_count'))
             continue
         if self.state == self.THREAD:
             m = self.heap_match.match(line)
             if not m:
                 raise ValueError("heap line not found in '" + line + "'")
             self.state = self.TINFO
             total['heap_rate'] = self.convert(m.group('heap_rate'),
                                               m.group('heap_unit'))
             continue
         if self.state == self.TINFO:
             if line == '\n':
                 self.state = None
                 yield total, threads
                 total = OrderedDict()
                 threads = OrderedDefaultDict(dict)
             else:
                 m = self.tinfo_match.match(line)
                 if not m:
                     raise ValueError("thread info not found in '" + line +
                                      "'")
                 threads[m.group('thread_name')]['user_cpu'] = float(
                     m.group('user_cpu'))
                 threads[m.group('thread_name')]['sys_cpu'] = float(
                     m.group('sys_cpu'))
                 threads[m.group('thread_name')]['total_cpu'] = float(
                     m.group('sys_cpu')) + float(m.group('user_cpu'))
                 threads[m.group(
                     'thread_name')]['heap_rate'] = self.convert(
                         m.group('heap_rate'), m.group('heap_unit'))
                 continue
Example #17
class GCInspector:
    """GCInspector class"""
    def __init__(self, diag_dir=None, files=None, start=None, end=None):
        self.diag_dir = diag_dir
        self.files = files
        self.pauses = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.gc_types = OrderedDefaultDict(int)
        self.start = None
        self.end = None
        self.starts = OrderedDefaultDict(datetime.datetime)
        self.ends = OrderedDefaultDict(datetime.datetime)
        self.analyzed = False
        self.start_time = None
        self.end_time = None
        if start:
            self.start_time = date_parse(start)
        if end:
            self.end_time = date_parse(end)

    def analyze(self):
        """analyze files"""
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            target = diag.find_logs(self.diag_dir)
        else:
            raise Exception("no diag dir and no files specified")
        for file in target:
            node = extract_node_name(file)
            with diag.FileWithProgress(file) as log:
                for event in parser.read_log(log, gc.capture_line):
                    if event["event_type"] == "pause":
                        if self.start_time and event["date"] < self.start_time:
                            continue
                        if self.end_time and event["date"] > self.end_time:
                            continue
                        self.__setdates(event["date"], node)
                        self.pauses[node][event["date"]].append(
                            event["duration"])
                        self.gc_types[event["gc_type"]] += 1
        self.analyzed = True

    def __setdates(self, date, node):
        """track start/end times"""
        # global
        if not self.start:
            self.start = date
            self.end = date
        if date > self.end:
            self.end = date
        if date < self.start:
            self.start = date
        # node specific
        if node not in self.starts:
            self.starts[node] = date
            self.ends[node] = date
            return
        if date > self.ends[node]:
            self.ends[node] = date
        if date < self.starts[node]:
            self.starts[node] = date

    def all_pauses(self):
        """get pauses for all nodes"""
        pauses = OrderedDefaultDict(list)
        for pausedata in self.pauses.values():
            for time, pause in pausedata.items():
                pauses[time].extend(pause)
        return pauses

    def print_report(self, interval=3600, by_node=False, top=3):
        """print gc report"""
        print("gcinspector version %s" % VERSION)
        print("")
        if not self.analyzed:
            self.analyze()
        if not self.pauses:
            print("No pauses found")
            return
        if not by_node:
            pauses = self.all_pauses()
            self.__print_gc(
                sorted(
                    bucketize(pauses,
                              start=self.start,
                              end=self.end,
                              seconds=interval).items(),
                    key=lambda t: t[0],
                ))
            plist = []
            for time in pauses:
                plist.extend(pauses[time])
            worst_k = heapq.nlargest(top, plist)
            print("Worst pauses in ms:")
            print(worst_k)

        else:
            for node in self.pauses:
                print(node)
                self.__print_gc(
                    sorted(
                        bucketize(
                            self.pauses[node],
                            start=self.starts[node],
                            end=self.ends[node],
                            seconds=interval,
                        ).items(),
                        key=lambda t: t[0],
                    ))
                plist = []
                for time, pauses in self.pauses[node].items():
                    plist.extend(pauses)
                worst_k = heapq.nlargest(top, plist)
                print("Worst pauses in ms:")
                print(worst_k)
                print("")
        print("")
        print("Collections by type")
        print("-" * 20)
        for collection, count in self.gc_types.items():
            print("* %s: %s" % (collection, count))
        print("")

    def __print_gc(self, data):
        """print data to the user, expecting datetime keys and list(int) values"""
        print(". <300ms + 301-500ms ! >500ms")
        print("-" * 30)
        busiest = None
        for time, pauses in data:
            total = sum(pauses)
            if not busiest:
                busiest = (time, total)
            elif total > busiest[1]:
                busiest = (time, total)
            print(time.strftime("%Y-%m-%d %H:%M:%S"), end=" ")
            print(len(pauses), end=" ")
            for pause in pauses:
                c = "."
                if pause > 300:
                    c = "+"
                if pause > 500:
                    c = "!"
                print(c, end="")
            print("")
        print("")
        print("busiest period: %s (%sms)" %
              (busiest[0].strftime("%Y-%m-%d %H:%M:%S"), busiest[1]))
        print("")
        percentiles = [[]]
        percentiles.append(get_percentile_headers("GC pauses"))
        header = [""]
        header.extend(["---" for i in range(6)])
        percentiles.append(header)
        percentiles.append(
            get_percentiles(
                "ms",
                list(
                    itertools.chain.from_iterable(pauses
                                                  for time, pauses in data)),
                strformat="%i",
            ))
        pad_table(percentiles, min_width=11, extra_pad=2)
        for line in percentiles:
            print("".join(line))
        print()
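A usage sketch, assuming an unpacked diagnostic tarball (path hypothetical); print_report analyzes on demand:

inspector = GCInspector(diag_dir="/path/to/diag")
inspector.print_report(interval=3600, by_node=True, top=3)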
Example #18
class StatusLogger:
    """status logger"""
    def __init__(
        self,
        diag_dir,
        files=None,
        start=None,
        end=None,
        wanted_stages=None,
        command_name="sperf core statuslogger",
        syslog_prefix="system.log",
        dbglog_prefix="debug.log",
    ):
        self.diag_dir = diag_dir
        self.files = files
        self.wanted_stages = wanted_stages
        if env.DEBUG:
            print("wanted stages:", self.wanted_stages)
        self.nodes = OrderedDefaultDict(Node)
        self.analyzed = False
        self.dumps_analyzed = 0
        self.rule_types = OrderedDefaultDict(int)
        self.command_name = command_name
        self.syslog_prefix = syslog_prefix
        self.dbglog_prefix = dbglog_prefix
        self.start = None
        self.end = None
        if start:
            self.start = date_parse(start)
        if end:
            self.end = date_parse(end)

    def analyze(self):
        """analyze log files"""
        if self.analyzed:
            return
        event_filter = UniqEventPerNodeFilter()
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            target_system = find_logs(self.diag_dir,
                                      file_to_find=self.syslog_prefix)
            target_debug = find_logs(self.diag_dir,
                                     file_to_find=self.dbglog_prefix)
            target = target_system + target_debug
        else:
            raise Exception("no diag dir and no files specified")
        for f in target:
            nodename = extract_node_name(f, ignore_missing_nodes=True)
            event_filter.set_node(nodename)
            node = self.nodes[nodename]
            if env.DEBUG:
                print("parsing", f)
            with FileWithProgress(f) as log:
                statuslogger_fixer = UnknownStatusLoggerWriter()
                for event in parser.read_system_log(log):
                    statuslogger_fixer.check(event)
                    if self.start and event["date"] < self.start:
                        continue
                    if self.end and event["date"] > self.end:
                        continue
                    self.__setdates(node, statuslogger_fixer.last_event_date)
                    node.lines += 1
                    if event_filter.is_duplicate(event):
                        node.skipped_lines += 1
                        continue
                    if env.DEBUG:
                        if "rule_type" in event:
                            self.rule_types[event["rule_type"]] += 1
                        elif event["event_type"] == "unknown":
                            self.rule_types["unknown"] += 1
                        else:
                            self.rule_types["no type"] += 1
                    if event["event_type"] == "server_version":
                        if event.get("version"):
                            node.version = event["version"]
                            if node.version.startswith("6"):
                                node.cassandra_version = "DSE Private Fork"
                        elif event.get("cassandra_version"):
                            node.cassandra_version = event["cassandra_version"]
                        # skipping solr, spark, etc. as they may be too much noise for statuslogger
                    elif event["event_type"] == "memtable_status":
                        tname = ".".join([event["keyspace"], event["table"]])
                        if event["ops"] > node.tables[tname].ops:
                            node.tables[tname].ops = event["ops"]
                        try:
                            if event["data"] > node.tables[tname].data:
                                node.tables[tname].data = event["data"]
                        except Exception as e:
                            print(event)
                            raise e
                    elif event["event_type"] == "pause":
                        node.pauses.append(event["duration"])
                    elif event["event_type"] == "threadpool_header":
                        node.dumps_analyzed += 1
                        self.dumps_analyzed += 1
                    elif event["event_type"] == "threadpool_status":
                        if re.match(r"TPC/\d+$", event["pool_name"]):
                            if not node.version:
                                node.version = "6.x"
                            if "delayed" in event and event["delayed"]:
                                val = event["delayed"]
                                node.stages["local backpressure"][
                                    event["pool_name"]].append(val)
                        else:
                            for pool in [
                                    "active",
                                    "pending",
                                    "blocked",
                                    "all_time_blocked",
                            ]:
                                if pool in event and event[pool]:
                                    if not self.wanted_stages or event[
                                            "pool_name"].startswith(
                                                self.wanted_stages):
                                        node.stages[pool][
                                            event["pool_name"]].append(
                                                event[pool])
        self.analyzed = True
        if env.DEBUG:
            print(self.rule_types.items())

    def __setdates(self, node, date):
        if not node.start:
            node.start = date
            node.end = date
            return
        if date > node.end:
            node.end = date
        if date < node.start:
            node.start = date

    def print_histogram(self):
        """print histogram report, analyzing if necessary"""
        self.analyze()
        print("%s version: %s" % (self.command_name, VERSION))
        print("")
        print("Histogram")
        print("")
        if not self.nodes:
            print("Nothing found!")
            return
        cassandra_versions = set()
        dse_versions = set()
        for name, node in self.nodes.items():
            cassandra_versions.add(node.cassandra_version)
            dse_versions.add(node.version)
            print(name)
            print("-" * 60)
            print("%s lines" % format_num(node.lines))
            print("%s skipped lines" % format_num(node.skipped_lines))
            print("dse version: %s" % (node.version or "unknown"))
            print("cassandra version: %s" %
                  (node.cassandra_version or "unknown"))
            print("log start time: %s" % node.start)
            print("log end time: %s" % node.end)
            if not node.dumps_analyzed:
                print("Nothing found!")
                continue
            print("duration: %s" %
                  format_seconds(int(node.duration().total_seconds())))
            print("stages analyzed: %s" % node.dumps_analyzed)
            if node.pauses:
                percentiles = [[]]
                percentiles.append(get_percentile_headers("GC pauses"))
                header = [""]
                header.extend(["---" for i in range(6)])
                percentiles.append(header)
                percentiles.append(
                    get_percentiles("ms", node.pauses, strformat="%i"))
                pad_table(percentiles, min_width=11, extra_pad=2)
                for line in percentiles:
                    print("".join(line))
                print("total GC events: %s" % len(node.pauses))
            print("")
            ops = node.get_busiest_tables("ops")[:5]
            if ops:
                print("busiest tables (ops)")
                print("-" * 30)
                nlen = max(len(o[0]) for o in ops)
                for n, t in ops:
                    print(n.ljust(nlen), t)
                data = node.get_busiest_tables("data")[:5]
                print("busiest tables (data)")
                print("-" * 30)
                nlen = max(len(d[0]) for d in data)
                for n, t in data:
                    print(n.ljust(nlen), t)
            print("")
            if node.stages:
                percentiles = []
                percentiles.append(get_percentile_headers("stages"))
                percentiles.append([])
                for status, stage in node.stages.items():
                    header = [status.upper()]
                    header.extend("" for i in range(6))
                    percentiles.append(header)
                    for tpname, vals in sorted(stage.items(),
                                               key=lambda tpool: max(tpool[1]),
                                               reverse=True):
                        percentiles.append(
                            get_percentiles(tpname, vals, strformat="%i"))
                    percentiles.append([])
                pad_table(percentiles, extra_pad=2)
                for line in percentiles:
                    print("".join(line))
        print("")
        self.__print_recs(dse_versions, cassandra_versions)

    def __print_recs(self, versions, cassandra_versions):
        version = None
        cassandra_version = None
        if len(cassandra_versions) > 1:
            print(
                "WARNING more than one version present assuming no version with recommendations"
            )
        elif len(cassandra_versions) == 1:
            cassandra_version = cassandra_versions[0]
        elif len(versions) == 1:
            version = versions[0]
        engine = Engine(version, cassandra_version)
        recs = set()
        for node in self.nodes.values():
            rstage = OrderedDefaultDict(dict)
            for status, stage in node.stages.items():
                for tpname, vals in stage.items():
                    rstage[tpname][status] = max(vals)
            for tpname in rstage:
                for sname in (
                        "active",
                        "pending",
                        "local backpressure",
                        "completed",
                        "blocked",
                        "all_time_blocked",
                ):
                    if sname not in rstage[tpname]:
                        rstage[tpname][sname] = 0
                s = Stage(
                    tpname,
                    rstage[tpname]["active"],
                    rstage[tpname]["pending"],
                    rstage[tpname]["local backpressure"],
                    rstage[tpname]["completed"],
                    rstage[tpname]["blocked"],
                    rstage[tpname]["all_time_blocked"],
                )
                reason, rec = engine.analyze_stage(s)
                if reason:
                    recs.add((rec, reason))
        if recs:
            print("recommmendations")
            print("----------------")
        for rec, reason in recs:
            print("* %s (%s)" % (rec, reason))

    def print_summary(self):
        """prints a summary report"""
        self.analyze()
        summary = Summary(self.nodes)
        print("%s version: %s" % (self.command_name, VERSION))
        print("")
        print("Summary (%s lines)" % format_num(summary.lines))
        print("Summary (%s skipped lines)" % format_num(summary.skipped_lines))
        print("")
        print("dse versions: %s" % (set(summary.versions) or "unknown"))
        print("cassandra versions: %s" %
              (set(summary.cassandra_versions) or "unknown"))
        print("first log time: %s" % summary.start)
        print("last log time: %s" % summary.end)
        if not self.dumps_analyzed:
            print("Nothing found!")
            return
        print("duration: %s" %
              format_seconds(int(summary.duration.total_seconds())))
        print("total stages analyzed: %s" % self.dumps_analyzed)
        print("total nodes analyzed: %s" % len(summary.nodes))
        pauses = summary.get_pauses()
        if pauses:
            print("")
            percentiles = []
            percentiles.append(get_percentile_headers("GC pauses"))
            percentiles.append(["", "---", "---", "---", "---", "---", "---"])
            percentiles.append(get_percentiles("ms", pauses, strformat="%i"))
            pad_table(percentiles, min_width=11, extra_pad=2)
            for line in percentiles:
                print("".join(line))
            print("total GC events: %s" % len(pauses))
        print("")
        ops = summary.get_busiest_tables("ops")[:5]
        if ops:
            print("busiest tables by ops across all nodes")
            print("-" * 30)
            for onode, (oname, num) in ops:
                print("* %s: %s: %s" % (onode, oname, num))
            print("")
            print("busiest table by data across all nodes")
            print("-" * 30)
            for dnode, (dname, data) in summary.get_busiest_tables("data")[:5]:
                print("* %s: %s: %s" % (dnode, dname, data))
            print("")
        print("busiest stages across all nodes")
        print("-" * 30)
        data = []
        for (name, status, stage, value) in summary.get_busiest_stages():
            data.append(
                ["* %s %s:" % (stage, status),
                 str(value),
                 "(%s)" % name])
        pad_table(data, extra_pad=2)
        for line in data:
            print("".join(line))
        pending = summary.get_stages_in("pending")
        data = []
        if pending:
            data.append([])
            self.__print_stages("pending", pending, data)
        delayed = summary.get_stages_in("local backpressure")
        if delayed:
            data.append([])
            self.__print_backpressure(delayed, data)
        data.append([])
        pad_table(data, extra_pad=2)
        for line in data:
            print("".join(line))
        self.__print_recs(summary.versions, summary.cassandra_versions)

    def __print_backpressure(self, stages, data):
        nodes = OrderedDict()
        # get total backpressure and max backpressure per node
        for name, stage in stages.items():
            if name not in nodes:
                # initialize on first sight so the aggregate code can assume zero values exist
                agg = OrderedDict()
                agg["total"] = 0
                agg["count"] = 0
                agg["max"] = 0
                nodes[name] = agg
            node_agg = nodes[name]
            for vals in stage.values():
                node_agg["count"] += len(vals)
                node_agg["total"] += sum(vals)
                node_agg["max"] = max(max(vals), node_agg["max"])
        data.append(["busiest LOCAL BACKPRESSURE"])
        data.append(["-" * 30])
        # sort to make the node with the worst backpressure total first
        for name, val in sorted(
                nodes.items(),
                key=lambda kvp: kvp[1]["total"] / float(kvp[1]["count"]),
                reverse=True,
        ):
            data.append(["%s:" % name])
            avg_backpressure = val["total"] / float(val["count"])
            data.append(
                [" " * 5, "avg backpressure",
                 "%.2f" % avg_backpressure])
            data.append([" " * 5, "max backpressure", str(nodes[name]["max"])])

    def __print_stages(self, status, stages, data):
        data.append(["busiest stages in %s" % status.upper()])
        data.append(["-" * 30])
        # sort to make the worst node first
        for name, stage in sorted(stages.items(),
                                  key=lambda tpool: max(tpool[1].values()),
                                  reverse=True):
            data.append(["%s:" % name])
            # sort again to make the worst stage on the node first
            for tp, vals in sorted(stage.items(),
                                   key=lambda stg: max(stg[1]),
                                   reverse=True):
                data.append([" " * 5, "%s:" % tp, str(max(vals))])
Example #19
 def __init__(self, diag_dir=None, files=None):
     self.diag_dir = diag_dir
     self.files = files
     self.analyzed = False
     self.jars = OrderedDefaultDict(int)
     self.files_analyzed = 0
Example #20
class BucketGrep:
    """greps for custom regex and bucketizes results"""

    strayre = r".*"
    basere = r" *(?P<level>[A-Z]*) *\[(?P<thread_name>[^\]]*?)[:_-]?(?P<thread_id>[0-9]*)\] (?P<date>.{10} .{12}) *.*"

    def __init__(
        self,
        regex,
        diag_dir=None,
        files=None,
        start=None,
        end=None,
        ignorecase=True,
        report="summary",
    ):
        self.diag_dir = diag_dir
        self.files = files
        self.start = None
        self.end = None
        self.start_time = None
        self.end_time = None
        self.last_time = None
        self.report = report
        if start:
            self.start_time = date_parse(start)
        if end:
            self.end_time = date_parse(end)
        if ignorecase:
            self.strayregex = re.compile(self.strayre + regex + ".*",
                                         re.IGNORECASE)
            self.timeregex = re.compile(self.basere + regex + ".*",
                                        re.IGNORECASE)
            self.supplied_regex = regex.lower()
        else:
            self.strayregex = re.compile(self.strayre + regex + ".*")
            self.timeregex = re.compile(self.basere + regex + ".*")
            self.supplied_regex = regex
        self.valid_log_regex = re.compile(self.basere)
        self.node_matches = OrderedDefaultDict()
        self.matches = OrderedDefaultDict(list)
        self.count = 0
        self.unknown = 0
        self.analyzed = False

    def analyze(self):
        """parses logs for results"""
        print("bucketgrep version %s" % VERSION)
        print("search: '%s'" % self.supplied_regex)
        target = None
        if self.files:
            target = self.files
        elif self.diag_dir:
            if self.diag_dir == ".":
                directory_path = os.getcwd()
                print("from directory '%s':" % directory_path)
            else:
                print("from directory '%s':" % self.diag_dir)
            target = diag.find_logs(self.diag_dir)
        else:
            raise Exception("no diag dir and no files specified")
        for file in target:
            with diag.FileWithProgress(file) as log:
                node_name = extract_node_name(file, ignore_missing_nodes=True)
                self.node_matches[node_name] = OrderedDefaultDict(list)
                for line in log:
                    # as long as it's a valid log line we want the date, even
                    # if we don't care about the rest of the line, so we can
                    # set the last date for any strayregex lines that match later
                    current_dt = self.valid_log_regex.match(line)
                    if current_dt:
                        dt = date()(current_dt.group("date"))
                        # if the log line is valid we want to set the last_time
                        self.last_time = dt
                    # we now can validate if our search term matches the log line
                    d = self.timeregex.match(line)
                    if d:
                        # normal case, well-formatted log line
                        self.__setdates(dt)
                        if self.start_time and dt < self.start_time:
                            continue
                        if self.end_time and dt > self.end_time:
                            continue
                        self.matches[dt].append(line)
                        self.node_matches[node_name][dt].append(line)
                        self.count += 1
                    else:
                        m = self.strayregex.match(line)
                        # check for a match in an unformatted line, like a traceback
                        if m:
                            if self.last_time is None:
                                # match, but no previous timestamp to associate with
                                self.unknown += 1
                                continue
                            self.matches[self.last_time].append(line)
                            self.node_matches[node_name][
                                self.last_time].append(line)
                            self.count += 1
        self.analyzed = True

    def __setdates(self, dt):
        if not self.start:
            self.start = dt
            self.end = dt
            return
        if dt > self.end:
            self.end = dt
        if dt < self.start:
            self.start = dt

    def print_report(self, interval=3600):
        """print bucketized result counts"""

        print()
        if not self.analyzed:
            self.analyze()
        if not self.matches:
            print("No matches found")
            if self.unknown:
                print(self.unknown, "matches without timestamp")
            return
        if self.report == "summary":
            print()
            print("cluster wide")
            print("------------")
            buckets = sorted(
                bucketize(self.matches,
                          start=self.start,
                          end=self.end,
                          seconds=interval).items(),
                key=lambda t: t[0],
            )
            maxval = len(max(buckets, key=lambda t: len(t[1]))[1])
            for time, matches in buckets:
                pad = ""
                for x in range(len(str(maxval)) - len(str(len(matches)))):
                    pad += " "
                print(
                    time.strftime("%Y-%m-%d %H:%M:%S") + pad,
                    len(matches),
                    textbar(maxval, len(matches)),
                )
        else:
            print()
            print()
            print("per node numbers")
            print("----------------")
            for node in sorted(self.node_matches.keys()):
                print()
                print("node: %s" % node)
                print("--------")
                if not len(self.node_matches[node]):
                    print("No matches for %s found" % node)
                    continue
                buckets = sorted(
                    bucketize(
                        self.node_matches[node],
                        start=self.start,
                        end=self.end,
                        seconds=interval,
                    ).items(),
                    key=lambda t: t[0],
                )
                maxval = len(max(buckets, key=lambda t: len(t[1]))[1])
                for time, matches in buckets:
                    pad = ""
                    for x in range(len(str(maxval)) - len(str(len(matches)))):
                        pad += " "
                    print(
                        time.strftime("%Y-%m-%d %H:%M:%S") + pad,
                        len(matches),
                        textbar(maxval, len(matches)),
                    )
        if self.unknown:
            print(self.unknown, "matches without timestamp")
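A usage sketch (regex and path hypothetical); matches are bucketed hourly with the default interval:

grep = BucketGrep(r"Dropping mutation", diag_dir="/path/to/diag")
grep.analyze()
grep.print_report(interval=3600)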
Example #21
class SysbottleReport:
    "Produces a report from iostat output"

    def __init__(self, infile, conf=None):
        self.infile = infile
        self.parser = IOStatParser()

        self.count = 0
        self.cpu_exceeded = 0
        self.iowait_exceeded = 0
        self.devices = OrderedDefaultDict(lambda: OrderedDefaultDict(list))
        self.cpu_stats = OrderedDefaultDict(list)
        self.queuedepth = OrderedDefaultDict(int)
        self.start = None
        self.end = None

        self.device_index = OrderedDict()
        self.cpu_index = OrderedDict()
        self.conf = conf or self.__mk_conf()
        self.recs = set()
        self.analyzed = False

    def __mk_conf(self):
        conf = OrderedDict()
        conf["iowait_threshold"] = 5
        conf["cpu_threshold"] = 50
        conf["disks"] = []
        conf["queue_threshold"] = 1
        conf["busy_threshold"] = 5
        return conf

    def analyze(self):
        "analyzes the file this class was initialized with"
        for io in self.parser.parse(self.infile):
            self.count += 1
            if not self.device_index:
                self.__mk_col_idx(io)
            self.__analyze_disk(io)
            self.__analyze_cpu(io)
            if not self.start:
                self.start = io["date"]
            if not self.end or io["date"] > self.end:
                self.end = io["date"]
        self.analyzed = True

    def __mk_col_idx(self, stat):
        for i, col in enumerate(stat["device"]["cols"]):
            self.device_index[col] = i
        for i, col in enumerate(stat["cpu"]["cols"]):
            self.cpu_index[col] = i

    def __want_disk(self, name):
        if not self.conf["disks"]:
            return True
        return name in self.conf["disks"]

    def __analyze_disk(self, stat):
        for disk, values in stat["device"]["stat"].items():
            if self.__want_disk(disk):
                for col in self.device_index:
                    val = values[self.device_index[col]]
                    self.devices[disk][col].append(val)
                    if "qu" in col and val >= self.conf["queue_threshold"]:
                        self.queuedepth[disk] += 1
                        self.recs.add("* decrease activity on %s" % disk)

    def __analyze_cpu(self, stat):
        total = 0
        for cpu in ["system", "user", "nice", "steal"]:
            total += stat["cpu"]["stat"][self.cpu_index["%" + cpu]]
        self.cpu_stats["total"].append(total)
        if total > self.conf["cpu_threshold"]:
            self.cpu_exceeded += 1
            self.recs.add("* tune for less CPU usage")
        for col in self.cpu_index:
            val = stat["cpu"]["stat"][self.cpu_index[col]]
            self.cpu_stats[col].append(val)
        if (stat["cpu"]["stat"][self.cpu_index["%iowait"]] >
                self.conf["iowait_threshold"]):
            self.iowait_exceeded += 1
            self.recs.add("* tune for less IO")

    def print_report(self):
        "prints a report for the file this class was initialized with, analyzing if necessary"
        if not self.analyzed:
            self.analyze()
        print("sysbottle version %s" % VERSION)
        print()
        print()
        print("* total records: %s" % self.count)
        if self.count:

            def report_percentage(a):
                return (float(a) / float(self.count)) * 100.0

            print(
                "* total bottleneck time: %.2f%% (cpu bound, io bound, or both)"
                % report_percentage(self.iowait_exceeded + self.cpu_exceeded))
            print("* cpu+system+nice+steal time > %.2f%%: %.2f%%" %
                  (self.conf["cpu_threshold"],
                   report_percentage(self.cpu_exceeded)))
            print("* iowait time > %.2f%%: %.2f%%" % (
                self.conf["iowait_threshold"],
                report_percentage(self.iowait_exceeded),
            ))
            print("* start %s" % self.start)
            print("* end %s" % self.end)
            log_time_seconds = (self.end - self.start).total_seconds() + 1
            print("* log time: %ss" % log_time_seconds)
            print("* interval: %ss" % report_percentage(log_time_seconds))
            for device in self.devices.keys():
                print("* %s time at queue depth >= %.2f: %.2f%%" % (
                    device,
                    self.conf["queue_threshold"],
                    report_percentage(self.queuedepth[device]),
                ))
            print()
            lines = []
            lines.append(get_percentile_headers())
            lines.append(["", "---", "---", "---", "---", "---", "---"])
            lines.append(get_percentiles("cpu", self.cpu_stats["total"]))
            lines.append(get_percentiles("iowait", self.cpu_stats["%iowait"]))
            lines.append([])
            lines.append(get_percentile_headers())
            lines.append(["", "---", "---", "---", "---", "---", "---"])
            for device in self.devices:
                lines.append([device, "", "", "", "", "", ""])
                for iotype in self.devices[device].keys():
                    if "qu" in iotype or "wait" in iotype:
                        lines.append(
                            get_percentiles("- " + iotype + ":",
                                            self.devices[device][iotype]))
            lines.append([])
            humanize.pad_table(lines, 8, 2)
            for line in lines:
                print("".join(line))
            self.print_recommendations()

    def print_recommendations(self):
        """print recommendations"""
        if not self.recs:
            return
        print("recommendations")
        print("-" * 15)
        for rec in self.recs:
            print(rec)
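A usage sketch (file name hypothetical); with conf omitted, __mk_conf() supplies the default thresholds:

report = SysbottleReport("iostat.out")
report.print_report()   # analyzes on demand, then prints percentiles and recs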
Example #22
 def parse(self, log):
     """ parse ttop output """
     total = OrderedDict()
     threads = OrderedDefaultDict(dict)
     for line in log:
         if self.state is None:
             m = self.begin_match.match(line)
             if m:
                 dt = datetime.strptime(m.group("date"),
                                        "%Y-%m-%dT%H:%M:%S.%f%z")
                 if self.start and dt < self.start:
                     continue
                 if self.end and dt > self.end:
                     continue
                 total["date"] = dt
                 self.state = self.BEGIN
                 continue
         if self.state == self.BEGIN:
             m = self.process_match.match(line)
             if not m:
                 raise ValueError("process line not found in " + line)
             self.state = self.PROCESS
             total["cpu_total"] = float(m.group("cpu_total"))
             continue
         if self.state == self.PROCESS:
             m = self.application_match.match(line)
             if not m:
                 raise ValueError("application line not found in " + line)
             self.state = self.APPLICATION
             total["app_cpu"] = float(m.group("app_cpu"))
             total["user_cpu"] = float(m.group("user_cpu"))
             total["sys_cpu"] = float(m.group("sys_cpu"))
             continue
         if self.state == self.APPLICATION:
             m = self.other_match.match(line)
             if not m:
                 raise ValueError("other line not found in '" + line + "'")
             self.state = self.OTHER
             total["other_cpu"] = float(m.group("other_cpu"))
             continue
         if self.state == self.OTHER:
             m = self.thread_match.match(line)
             if not m:
                 raise ValueError("thread line not found in '" + line + "'")
             self.state = self.THREAD
             total["thread_count"] = int(m.group("thread_count"))
             continue
         if self.state == self.THREAD:
             m = self.heap_match.match(line)
             if not m:
                 raise ValueError("heap line not found in '" + line + "'")
             self.state = self.TINFO
             total["heap_rate"] = self.convert(m.group("heap_rate"),
                                               m.group("heap_unit"))
             continue
         if self.state == self.TINFO:
             if line == "\n":
                 self.state = None
                 yield total, threads
                 total = OrderedDict()
                 threads = OrderedDefaultDict(dict)
             else:
                 m = self.tinfo_match.match(line)
                 if not m:
                     raise ValueError("thread info not found in '" + line +
                                      "'")
                 threads[m.group("thread_name")]["user_cpu"] = float(
                     m.group("user_cpu"))
                 threads[m.group("thread_name")]["sys_cpu"] = float(
                     m.group("sys_cpu"))
                 threads[m.group("thread_name")]["total_cpu"] = float(
                     m.group("sys_cpu")) + float(m.group("user_cpu"))
                 threads[m.group(
                     "thread_name")]["heap_rate"] = self.convert(
                         m.group("heap_rate"), m.group("heap_unit"))
                 continue
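Both parse variants rely on a convert helper for heap-allocation rates; a minimal sketch, assuming binary units normalized to bytes per second (the real unit table may differ):

UNITS = {"b": 1, "kb": 1024, "mb": 1024 ** 2, "gb": 1024 ** 3}

def convert(value, unit):
    """sketch: normalize a ttop heap rate like ('12', 'mb') to bytes/s"""
    return float(value) * UNITS[unit.lower()]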