Example 1
def find_config_in_logs(node_configs, output_logs, system_logs):
    """read the output logs and extract the configuration from each file"""
    warnings = []
    node_logs = {}
    for system_log in system_logs:
        node = util.extract_node_name(system_log)
        if node not in node_logs:
            node_logs[node] = {"system": [], "output": ""}
        node_logs[node]["system"].append(system_log)
    for output_log in output_logs:
        node = util.extract_node_name(output_log)
        if node not in node_logs:
            node_logs[node] = {"system": [], "output": ""}
        node_logs[node]["output"] = output_log
    for node, logs in node_logs.items():
        output_log = logs.get("output")
        system_logs = logs.get("system")
        events = []
        with diag.FileWithProgress(output_log) as output_log_file:
            if output_log_file.file_desc:
                events = parser.read_output_log(output_log_file)
                node_configs[node] = _find_configuration_events(events)
                continue
            # fall back to the system logs to find the most recent configuration
            for system_log in system_logs:
                with diag.FileWithProgress(system_log) as system_log_file:
                    if system_log_file.error:
                        # skip files we can't read
                        continue
                    # deliberate: the output log parser is what we want here, even for system logs
                    events.extend(parser.read_output_log(system_log_file))
            # only want the most recent entries from the system logs to be used
            events = sorted(events, key=lambda e: e["date"], reverse=False)
            node_configs[node] = _find_configuration_events(events)
    return warnings
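
Every snippet in these examples treats diag.FileWithProgress as a context manager that exposes a file_desc attribute (falsy when the file could not be opened), an error attribute, a filepath attribute, and iteration over lines. The stand-in below is only an assumption reconstructed from that usage, handy for reading the snippets in isolation; the real class also tracks read progress.

class FileWithProgress:
    """hypothetical stand-in for diag.FileWithProgress: open a file,
    expose .file_desc/.error/.filepath, and iterate over its lines"""

    def __init__(self, filepath):
        self.filepath = filepath
        self.file_desc = None
        self.error = None

    def __enter__(self):
        try:
            self.file_desc = open(self.filepath, "r")
        except IOError as e:
            # mirror the examples: a failed open leaves file_desc falsy
            # and records the error instead of raising
            self.error = str(e)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.file_desc:
            self.file_desc.close()
        return False

    def __iter__(self):
        # allow "for line in log" as in the bucketgrep example below
        return iter(self.file_desc) if self.file_desc else iter([])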
Example 2
def test_extract_node_name():
    """simple test of split handling in a path"""
    my_node = util.extract_node_name(
        os.path.join("exy", "abc", "nodes", "my_node"))
    assert my_node == "my_node"

    # should match the last "nodes" token
    my_node = util.extract_node_name(
        os.path.join("exy", "nodes", "abc", "nodes", "my_node"))
    assert my_node == "my_node"
Example 3
    def test_extract_node_name(self):
        """simple test of split handling in a path"""
        my_node = util.extract_node_name(
            os.path.join("exy", "abc", "nodes", "my_node"))
        self.assertEqual(my_node, "my_node")

        # should match the last "nodes" token
        my_node = util.extract_node_name(
            os.path.join("exy", "nodes", "abc", "nodes", "my_node"))
        self.assertEqual(my_node, "my_node")
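
The two tests above pin down the observable behaviour of util.extract_node_name: it returns the path component that follows the last "nodes" directory. Below is a minimal sketch consistent with those assertions; the ignore_missing_nodes parameter mirrors calls such as util.extract_node_name(log, True) in the later examples, and its fallback behaviour here is an assumption rather than the library's actual implementation.

import os


def extract_node_name(path, ignore_missing_nodes=False):
    """sketch: return the path component after the last 'nodes' directory"""
    tokens = path.split(os.sep)
    if "nodes" not in tokens:
        if ignore_missing_nodes:
            # assumed fallback: use the whole path as the node label
            return path
        raise ValueError("no 'nodes' directory in path: %s" % path)
    # index of the last 'nodes' token, matching the second assertion above
    last_nodes = len(tokens) - 1 - tokens[::-1].index("nodes")
    return tokens[last_nodes + 1]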
Example 4
def parse(args):
    """parse entry point, generates a report object
    from a tarball or series of files"""
    logs = diag.find_files(args, args.system_log_prefix)
    if args.diag_dir == ".":
        directory_path = os.getcwd()
        print("from directory '%s':" % directory_path)
    else:
        print("from directory '%s':" % args.diag_dir)
    node_stats = OrderedDict()
    after_time = dates.date_parse(args.after)
    before_time = dates.date_parse(args.before)
    for log in logs:
        start_log_time, last_log_time = diag.log_range(log)
        with diag.FileWithProgress(log) as log_file:
            raw_events = parser.read_system_log(log_file)
            item_ev_stats, bytes_ev_stats = calculate_eviction_stats(
                raw_events, after_time, before_time)
            node = util.extract_node_name(log, True)
            node_stats[node] = OrderedDict([
                ("evictions", (bytes_ev_stats, item_ev_stats)),
                ("start", start_log_time),
                ("end", last_log_time),
            ])
    return OrderedDict([
        ("nodes", node_stats),
        ("after_time", after_time),
        ("before_time", before_time),
    ])
Example 5
def get_cass_drive_read_ahead(node_info_json, block_dev_reports):
    """searches node_info_json for drive mappings and reports on cass drive read ahead settings"""
    node_info = json.load(node_info_json)
    nodes_cass_drives = OrderedDict()
    if not node_info:
        return OrderedDict()
    for node in node_info:
        drives = OrderedDict()
        drive_data = (node_info.get(node, OrderedDict()).get(
            "partitions", OrderedDict()).get("data", OrderedDict()))
        if drive_data is None:
            continue
        for drive in drive_data:
            # reverse the OpsCenter rename (underscores back to dashes) so the names match the block dev reports
            drive = drive.replace("_", "-")
            drives[drive] = None
            nodes_cass_drives[node] = drives
    for block_dev_report in block_dev_reports:
        node_name = util.extract_node_name(block_dev_report)
        if node_name in nodes_cass_drives:
            with diag.FileWithProgress(block_dev_report) as block_dev_file:
                row_gen = parser.read_block_dev(block_dev_file)
                drives = extract_block_dev(row_gen)
                for drive, ra_bytes in drives.items():
                    if drive in nodes_cass_drives[node_name]:
                        nodes_cass_drives[node_name][drive] = ra_bytes
    return nodes_cass_drives
Example 6
def parse(args):
    """parse entry point, generates a report object
    from a tarball or series of files"""
    logs = diag.find_files(args, args.system_log_prefix)
    print("from directory '%s':" % args.diag_dir)
    node_stats = OrderedDict()
    after_time = dates.date_parse(args.after)
    before_time = dates.date_parse(args.before)
    for log in logs:
        start_log_time, last_log_time = diag.log_range(log)
        with diag.FileWithProgress(log) as log_file:
            raw_events = parser.read_system_log(log_file)
            filter_cache_events_all = [
                event for event in raw_events
                if event.get("event_category", "") == "filter_cache"
            ]
            filter_cache_events = [
                event for event in filter_cache_events_all if "date" in event
                and event["date"] > after_time and event["date"] < before_time
            ]
            item_eviction_stats = _get_stats(filter_cache_events, ItemFCStats,
                                             "eviction_items")
            bytes_eviction_stats = _get_stats(filter_cache_events,
                                              BytesFCStats, "eviction_bytes")
            node = util.extract_node_name(log, True)
            node_stats[node] = OrderedDict([
                ("evictions", (bytes_eviction_stats, item_eviction_stats)),
                ("start", start_log_time),
                ("end", last_log_time),
            ])
    return OrderedDict([
        ("nodes", node_stats),
        ("after_time", after_time),
        ("before_time", before_time),
    ])
Example 7
def generate_recommendations(parsed):
    """generate recommendations off the parsed data"""
    gc_over_500 = 0
    delayed_counter = 0
    pending_counter = 0
    blocked_counter = 0
    event_filter = diag.UniqEventPerNodeFilter()
    for rec_log in parsed["rec_logs"]:
        node = util.extract_node_name(rec_log)
        event_filter.set_node(node)
        with diag.FileWithProgress(rec_log) as rec_log_file:
            if rec_log_file.error:
                parsed["warnings"].append("%s failed with error %s" %
                                          (rec_log, rec_log_file.error))
            else:
                statuslogger_fixer = diag.UnknownStatusLoggerWriter()
                events = parser.read_system_log(rec_log_file)
                for event in [
                        e for e in events if not event_filter.is_duplicate(e)
                ]:
                    statuslogger_fixer.check(event)
                    if event.get("event_type") == "unknown":
                        continue
                    if (event.get("event_type") == "pause"
                            and event.get("event_category")
                            == "garbage_collection"):
                        if event.get("duration") > 500:
                            gc_over_500 += 1
                    _status_logger_counter(event, delayed_counter,
                                           pending_counter, blocked_counter)
    recommendations = []
    _recs_on_stages(recommendations, gc_over_500, delayed_counter,
                    pending_counter, blocked_counter)
    _recs_on_configs(recommendations, parsed["configs"])
    return recommendations
Example 8
def add_gc_to_configs(configs, system_logs):
    """read the system logs and extract the configuration from each file"""
    results = {}
    warnings = []
    for system_log in system_logs:
        with diag.FileWithProgress(system_log) as system_log_file:
            if system_log_file.error:
                warnings.append(system_log_file.error)
            events = parser.read_system_log(system_log_file)
            worst_gc = 0
            for event in events:
                if event.get('event_type') == 'pause' and event.get(
                        'event_category') == 'garbage_collection':
                    worst_gc = max(event.get('duration'), worst_gc)
            node = util.extract_node_name(system_log)
            results[node] = worst_gc
    for config in configs:
        worst_gc_per_config = 0
        worst_node = ""
        for node in config.get('nodes_list', []):
            node_worst_gc = results.get(node, 0)
            if node_worst_gc > worst_gc_per_config:
                worst_gc_per_config = node_worst_gc
                worst_node = node
        config['worst_gc'] = (worst_gc_per_config, worst_node)
    return warnings
Example 9
def add_gc_to_configs(configs, system_logs):
    """read the system logs and extract the configuration from each file"""
    results = OrderedDict()
    warnings = []
    for system_log in system_logs:
        with diag.FileWithProgress(system_log) as system_log_file:
            if system_log_file.error:
                warnings.append(system_log_file.error)
            events = parser.read_system_log(system_log_file)
            worst_gc = 0
            for event in events:
                if (
                    event.get("event_type") == "pause"
                    and event.get("event_category") == "garbage_collection"
                ):
                    worst_gc = max(event.get("duration"), worst_gc)
            node = util.extract_node_name(system_log)
            results[node] = worst_gc
    for config in configs:
        worst_gc_per_config = 0
        worst_node = ""
        for node in config.get("nodes_list", []):
            node_worst_gc = results.get(node, 0)
            if node_worst_gc > worst_gc_per_config:
                worst_gc_per_config = node_worst_gc
                worst_node = node
        config["worst_gc"] = (worst_gc_per_config, worst_node)
    return warnings
Example 10
def parse(args):
    """parse entry point, generates a report object
    from a tarball or series of files"""
    logs = diag.find_files(args, args.system_log_prefix)
    print("from directory '%s':" % args.diag_dir)
    node_stats = {}
    after_time = dates.date_parse(args.after)
    before_time = dates.date_parse(args.before)
    for log in logs:
        start_log_time, last_log_time = diag.log_range(log)
        with diag.FileWithProgress(log) as log_file:
            raw_events = parser.read_system_log(log_file)
            events = [
                event for event in raw_events
                if event['date'] > after_time and event['date'] < before_time
            ]
            filter_cache_events = [
                event for event in events
                if event['event_category'] == 'filter_cache'
            ]
            item_eviction_stats = _get_stats(filter_cache_events, ItemFCStats,
                                             'eviction_items')
            bytes_eviction_stats = _get_stats(filter_cache_events,
                                              BytesFCStats, 'eviction_bytes')
            node = util.extract_node_name(log, True)
            node_stats[node] = {
                "evictions": (bytes_eviction_stats, item_eviction_stats),
                "start": start_log_time,
                "end": last_log_time,
            }
    return {
        "nodes": node_stats,
        "after_time": after_time,
        "before_time": before_time,
    }
Example 11
def warn_missing(node_configs, file_list, warnings, text):
    """log to warnings file"""
    if not file_list:
        warnings.append("%s: all nodes" % text)
        return
    nodes = node_configs.keys()
    nodes_files = [util.extract_node_name(f) for f in file_list]
    nodes_missing = [n for n in nodes if n not in nodes_files]
    if nodes_missing:
        warnings.append("%s: %s" % (text, ", ".join(nodes_missing)))
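
A hypothetical call, purely to illustrate the shape of warn_missing's output; the paths and node names are invented, and the node-name extraction is assumed to behave as in the tests above (the component after the last "nodes" directory).

node_configs = {"node1": {}, "node2": {}}
warnings = []
warn_missing(node_configs, ["diag/nodes/node1/system.log"], warnings,
             "missing system logs")
# node2 has no matching file, so warnings is now
# ["missing system logs: node2"]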
Example 12
 def analyze(self):
     """parses logs for results"""
     print("bucketgrep version %s" % VERSION)
     print("search: '%s'" % self.supplied_regex)
     target = None
     if self.files:
         target = self.files
     elif self.diag_dir:
         if self.diag_dir == ".":
             directory_path = os.getcwd()
             print("from directory '%s':" % directory_path)
         else:
             print("from directory '%s':" % self.diag_dir)
         target = diag.find_logs(self.diag_dir)
     else:
         raise Exception("no diag dir and no files specified")
     for file in target:
         with diag.FileWithProgress(file) as log:
             node_name = extract_node_name(file, ignore_missing_nodes=True)
             self.node_matches[node_name] = OrderedDefaultDict(list)
             for line in log:
                  # as long as it's a valid log line we want the date, even if
                  # we don't care about the rest of the line, so we can set the
                  # last date for any strayregex lines that match
                 current_dt = self.valid_log_regex.match(line)
                 if current_dt:
                     dt = date()(current_dt.group("date"))
                      # if the log line is valid we want to set the last_time
                     self.last_time = dt
                 # we now can validate if our search term matches the log line
                 d = self.timeregex.match(line)
                 if d:
                     # normal case, well-formatted log line
                     self.__setdates(dt)
                     if self.start_time and dt < self.start_time:
                         continue
                     if self.end_time and dt > self.end_time:
                         continue
                     self.matches[dt].append(line)
                     self.node_matches[node_name][dt].append(line)
                     self.count += 1
                 else:
                     m = self.strayregex.match(line)
                     # check for a match in an unformatted line, like a traceback
                     if m:
                         if self.last_time is None:
                             # match, but no previous timestamp to associate with
                             self.unknown += 1
                             continue
                         self.matches[self.last_time].append(line)
                         self.node_matches[node_name][
                             self.last_time].append(line)
                         self.count += 1
     self.analyzed = True
Example 13
def _parse_cfstats(cfstats_files):
    node_parsed_map = OrderedDict()
    warnings = []
    for cfstat_file in cfstats_files:
        node = util.extract_node_name(cfstat_file)
        try:
            parsed = cfstats.read_file(cfstat_file)
            node_parsed_map[node] = parsed
        except IOError as e:
            warnings.append(e)
            continue
    return node_parsed_map, warnings
Example 14
 def analyze(self):
     """analyze files"""
     target = None
     if self.files:
         target = self.files
     elif self.diag_dir:
         target = diag.find_logs(self.diag_dir)
     else:
         raise Exception("no diag dir and no files specified")
     for file in target:
         node = extract_node_name(file)
         with diag.FileWithProgress(file) as log:
             for event in parser.read_log(log, gc.capture_line):
                 if event["event_type"] == "pause":
                     if self.start_time and event["date"] < self.start_time:
                         continue
                     if self.end_time and event["date"] > self.end_time:
                         continue
                     self.__setdates(event["date"], node)
                     self.pauses[node][event["date"]].append(
                         event["duration"])
                     self.gc_types[event["gc_type"]] += 1
     self.analyzed = True
Example 15
 def analyze(self):
     """analyze log files"""
     if self.analyzed:
         return
     event_filter = UniqEventPerNodeFilter()
     target = None
     if self.files:
         target = self.files
     elif self.diag_dir:
         target_system = find_logs(self.diag_dir,
                                   file_to_find=self.syslog_prefix)
         target_debug = find_logs(self.diag_dir,
                                  file_to_find=self.dbglog_prefix)
         target = target_system + target_debug
     else:
         raise Exception("no diag dir and no files specified")
     for f in target:
         nodename = extract_node_name(f, ignore_missing_nodes=True)
         event_filter.set_node(nodename)
         node = self.nodes[nodename]
         if env.DEBUG:
             print("parsing", f)
         with FileWithProgress(f) as log:
             statuslogger_fixer = UnknownStatusLoggerWriter()
             for event in parser.read_system_log(log):
                 statuslogger_fixer.check(event)
                 if self.start and event["date"] < self.start:
                     continue
                 if self.end and event["date"] > self.end:
                     continue
                 self.__setdates(node, statuslogger_fixer.last_event_date)
                 node.lines += 1
                 if event_filter.is_duplicate(event):
                     node.skipped_lines += 1
                     continue
                 if env.DEBUG:
                     if "rule_type" in event:
                         self.rule_types[event["rule_type"]] += 1
                     elif event["event_type"] == "unknown":
                         self.rule_types["unknown"] += 1
                     else:
                         self.rule_types["no type"] += 1
                 if event["event_type"] == "server_version":
                     if event.get("version"):
                         node.version = event["version"]
                         if node.version.startswith("6"):
                             node.cassandra_version = "DSE Private Fork"
                     elif event.get("cassandra_version"):
                         node.cassandra_version = event["cassandra_version"]
                      # skipping solr, spark etc. as it may be too much noise for statuslogger
                 elif event["event_type"] == "memtable_status":
                     tname = ".".join([event["keyspace"], event["table"]])
                     if event["ops"] > node.tables[tname].ops:
                         node.tables[tname].ops = event["ops"]
                     try:
                         if event["data"] > node.tables[tname].data:
                             node.tables[tname].data = event["data"]
                     except Exception as e:
                         print(event)
                         raise e
                 elif event["event_type"] == "pause":
                     node.pauses.append(event["duration"])
                 elif event["event_type"] == "threadpool_header":
                     node.dumps_analyzed += 1
                     self.dumps_analyzed += 1
                 elif event["event_type"] == "threadpool_status":
                     if re.match(r"TPC/\d+$", event["pool_name"]):
                         if not node.version:
                             node.version = "6.x"
                         if "delayed" in event and event["delayed"]:
                             val = event["delayed"]
                             node.stages["local backpressure"][
                                 event["pool_name"]].append(val)
                     else:
                         for pool in [
                                 "active",
                                 "pending",
                                 "blocked",
                                 "all_time_blocked",
                         ]:
                             if pool in event and event[pool]:
                                 if not self.wanted_stages or event[
                                         "pool_name"].startswith(
                                             self.wanted_stages):
                                     node.stages[pool][
                                         event["pool_name"]].append(
                                             event[pool])
     self.analyzed = True
     if env.DEBUG:
         print(self.rule_types.items())
Example 16
def generate_recommendations(parsed):
    """generate recommendations off the parsed data"""
    gc_target = 0
    pause_target = None
    if "jvm_args" in parsed:
        pause_target = parsed["jvm_args"].get("-XX:MaxGCPauseMillis")
    if pause_target is not None:
        parse_target_array = []
        if len(pause_target.keys()) > 0:
            for value, nodes in pause_target.items():
                try:
                    parsed_target = int(value)
                except (TypeError, ValueError) as e:
                    raise ValueError(
                        "unexpected gc target pause of %s on nodes %s with error %s"
                        % (value, nodes, e))
                if gc_target < parsed_target:
                    gc_target = parsed_target
                parse_target_array.append("(%s: %s)" %
                                          (parsed_target, ",".join(nodes)))
            if len(pause_target.keys()) > 1:
                print(
                    "WARN there are several pause targets so using the highest for recommendations but they may not be accurate: Configuration is as follows %s"
                    % "; ".join(parse_target_array))
    else:
        print(
            "WARN cannot find -XX:MaxGCPauseMillis in the logs setting common default of 500ms"
        )
        gc_target = 500
    tombstone_errors = 0
    tombstone_warns = 0
    gc_over_target = 0
    counter = StatusLoggerCounter()
    solr_index_backoff = OrderedDict()
    solr_index_restore = OrderedDict()
    zero_copy_errors = 0
    drops_remote_only = 0
    rejected = 0
    drop_sums = 0
    drop_types = set()
    event_filter = diag.UniqEventPerNodeFilter()
    bp = BackpressureStats(local_backpressure_active={}, per_core_bp={})
    core_balance = {}
    tpc_event_types = ["6.8", "new"]
    pool_name_pattern = re.compile(r"TPC\/(?P<core>[0-9]+)$")
    for rec_log in parsed["rec_logs"]:
        node = util.extract_node_name(rec_log)
        event_filter.set_node(node)
        with diag.FileWithProgress(rec_log) as rec_log_file:
            if rec_log_file.error:
                parsed["warnings"].append("%s failed with error %s" %
                                          (rec_log, rec_log_file.error))
            else:
                statuslogger_fixer = diag.UnknownStatusLoggerWriter()
                if env.DEBUG:
                    print("parsing", rec_log_file.filepath)
                events = parser.read_system_log(rec_log_file)
                for event in [
                        e for e in events if not event_filter.is_duplicate(e)
                ]:
                    # statuslogger_fixer.check(event) HAS to run first before any code below or statuslogger events will get tossed.
                    statuslogger_fixer.check(event)
                    event_type = event.get("event_type")
                    event_category = event.get("event_category")
                    event_product = event.get("event_product")
                    rule_type = event.get("rule_type")
                    if event_type == "unknown":
                        continue
                    if event_type == "pause" and event_category == "garbage_collection":
                        if event.get("duration") > gc_target:
                            gc_over_target += 1
                    elif event_product == "tombstone":
                        if event_type == "scan_error":
                            tombstone_errors += event.get("tombstones")
                        elif (event_type == "tpc_scan_warn"
                              or event_type == "seda_scan_warn"):
                            tombstone_warns += event.get("tombstones")
                    elif (event_type == "threadpool_status"
                          and rule_type in tpc_event_types):
                        pool_name = event.get("pool_name")
                        if env.DEBUG:
                            print("detected pool name is %s" % pool_name)
                        match = pool_name_pattern.match(pool_name)
                        if match:
                            core = int(match.group(1))
                            if env.DEBUG:
                                print("detected core is %i" % core)
                            pending = event.get("pending")
                            if node in core_balance:
                                core_balance[node].append(
                                    PendingCoreMeasurement(core, pending))
                            else:
                                core_balance[node] = [
                                    PendingCoreMeasurement(core, pending)
                                ]
                    elif (event_category == "streaming"
                          and event_product == "zcs"
                          and event_type == "bloom_filter"):
                        zero_copy_errors += 1
                    elif event_type == "core_backpressure":
                        if node in bp.per_core_bp.keys():
                            bp.per_core_bp[node].cores.append(
                                event.get("core_num"))
                            bp.per_core_bp[node].total_bp_events += 1
                        else:
                            bp.per_core_bp[node] = CoreBackpressureStats(
                                cores=[event.get("core_num")],
                                total_bp_events=1)
                    elif event_type == "core_backpressure_local":
                        if node in bp.local_backpressure_active:
                            bp.local_backpressure_active[node] += 1
                        else:
                            bp.local_backpressure_active[node] = 1
                    elif event_category == "indexing":
                        core_name = event.get("core_name")
                        d = event.get("date")
                        if event.get("event_type") == "increase_soft_commit":
                            if core_name in solr_index_backoff:
                                solr_index_backoff[core_name]["count"] += 1
                                solr_index_backoff[core_name]["dates"].append(
                                    d)
                            else:
                                solr_index_backoff[core_name] = {
                                    "count": 1,
                                    "dates": [event.get("date")],
                                }
                        elif event_type == "restore_soft_commit":
                            if core_name in solr_index_restore:
                                solr_index_restore[core_name]["count"] += 1
                                solr_index_restore[core_name]["dates"].append(
                                    d)
                            else:
                                solr_index_restore[core_name] = {
                                    "count": 1,
                                    "dates": [event.get("date")],
                                }
                    elif event_type == "network_backpressure":
                        rejected += event.get("total_dropped")
                    elif event_type == "drops":
                        local = event.get("localCount")
                        remote = event.get("remoteCount")
                        drop_type = event.get("messageType")
                        drop_types.add(drop_type)
                        drop_sums += local + remote
                        if remote > 0 and local == 0:
                            drops_remote_only += 1
                    _status_logger_counter(event, counter)
    recommendations = []
    _recs_on_stages(recommendations, gc_over_target, gc_target, counter)
    _recs_on_configs(recommendations, parsed["configs"])
    _recs_on_solr(recommendations, solr_index_backoff, solr_index_restore)
    _recs_on_drops(
        recommendations,
        drops_remote_only,
        sorted(list(drop_types), reverse=True),
        drop_sums,
    )
    _recs_on_zero_copy(recommendations, zero_copy_errors)
    _recs_on_rejects(recommendations, rejected)
    _recs_on_bp(recommendations, bp, gc_over_target)
    _recs_on_core_balance(recommendations, core_balance)
    _recs_on_tombstones(recommendations, tombstone_errors, tombstone_warns)
    return recommendations