def find_config_in_logs(node_configs, output_logs, system_logs):
    """read the output logs and extract the configuration from each file

    node_configs: dict populated in place, keyed by node name.
    output_logs: paths to output logs (preferred configuration source).
    system_logs: paths to system logs (fallback source per node).
    returns a list of warnings (currently always empty).
    """
    warnings = []
    node_logs = {}
    # group the available log files by node name
    for system_log in system_logs:
        node = util.extract_node_name(system_log)
        if node not in node_logs:
            node_logs[node] = {"system": [], "output": ""}
        node_logs[node]["system"].append(system_log)
    for output_log in output_logs:
        node = util.extract_node_name(output_log)
        if node not in node_logs:
            node_logs[node] = {"system": [], "output": ""}
        node_logs[node]["output"] = output_log
    for node, logs in node_logs.items():
        output_log = logs.get("output")
        # fix: do not reuse the `system_logs` parameter name here -- the
        # original shadowed (and clobbered) the argument inside this loop
        node_system_logs = logs.get("system")
        events = []
        with diag.FileWithProgress(output_log) as output_log_file:
            if output_log_file.file_desc:
                events = parser.read_output_log(output_log_file)
                node_configs[node] = _find_configuration_events(events)
                continue
        # no readable output log: try the system logs to find the last
        # configuration found
        for system_log in node_system_logs:
            with diag.FileWithProgress(system_log) as system_log_file:
                if system_log_file.error:
                    # skip files we can't read
                    continue
                # yes I know this looks like I'm crazy but the output log
                # parser is what I'm interested in
                events.extend(parser.read_output_log(system_log_file))
        # sort ascending by date so the most recent configuration events come
        # last and win
        events = sorted(events, key=lambda e: e["date"])
        node_configs[node] = _find_configuration_events(events)
    return warnings
def test_extract_node_name():
    """extract_node_name should return the path token after the last 'nodes' dir"""
    simple = os.path.join("exy", "abc", "nodes", "my_node")
    nested = os.path.join("exy", "nodes", "abc", "nodes", "my_node")
    # a repeated "nodes" segment must not confuse the split: the last one wins
    for path in (simple, nested):
        assert util.extract_node_name(path) == "my_node"
def test_extract_node_name(self):
    """extract_node_name should return the path token after the last 'nodes' dir"""
    simple_path = os.path.join("exy", "abc", "nodes", "my_node")
    self.assertEqual(util.extract_node_name(simple_path), "my_node")
    # a repeated "nodes" segment must not confuse the split: the last one wins
    repeated_path = os.path.join("exy", "nodes", "abc", "nodes", "my_node")
    self.assertEqual(util.extract_node_name(repeated_path), "my_node")
def parse(args):
    """parse entry point, generates a report object from a tarball or series of files"""
    logs = diag.find_files(args, args.system_log_prefix)
    # resolve "." so the banner shows a real path instead of a dot
    shown_dir = os.getcwd() if args.diag_dir == "." else args.diag_dir
    print("from directory '%s':" % shown_dir)
    after_time = dates.date_parse(args.after)
    before_time = dates.date_parse(args.before)
    node_stats = OrderedDict()
    for log in logs:
        start_log_time, last_log_time = diag.log_range(log)
        with diag.FileWithProgress(log) as log_file:
            raw_events = parser.read_system_log(log_file)
            item_ev_stats, bytes_ev_stats = calculate_eviction_stats(
                raw_events, after_time, before_time
            )
            node = util.extract_node_name(log, True)
            per_node = OrderedDict()
            per_node["evictions"] = (bytes_ev_stats, item_ev_stats)
            per_node["start"] = start_log_time
            per_node["end"] = last_log_time
            node_stats[node] = per_node
    report = OrderedDict()
    report["nodes"] = node_stats
    report["after_time"] = after_time
    report["before_time"] = before_time
    return report
def get_cass_drive_read_ahead(node_info_json, block_dev_reports):
    """searches node_info_json for drive mappings and reports on cass drive read ahead settings"""
    node_info = json.load(node_info_json)
    if not node_info:
        return OrderedDict()
    nodes_cass_drives = OrderedDict()
    for node in node_info:
        partitions = node_info.get(node, OrderedDict()).get(
            "partitions", OrderedDict())
        drive_data = partitions.get("data", OrderedDict())
        if drive_data is None:
            continue
        drives = OrderedDict()
        for raw_name in drive_data:
            # rename that reverses opscenter behavior if there is a match present
            drives[raw_name.replace("_", "-")] = None
        nodes_cass_drives[node] = drives
    for report_path in block_dev_reports:
        node_name = util.extract_node_name(report_path)
        if node_name not in nodes_cass_drives:
            continue
        with diag.FileWithProgress(report_path) as block_dev_file:
            rows = parser.read_block_dev(block_dev_file)
            for drive, ra_bytes in extract_block_dev(rows).items():
                # only record read-ahead for drives we mapped for this node
                if drive in nodes_cass_drives[node_name]:
                    nodes_cass_drives[node_name][drive] = ra_bytes
    return nodes_cass_drives
def parse(args):
    """parse entry point, generates a report object from a tarball or series of files"""
    logs = diag.find_files(args, args.system_log_prefix)
    print("from directory '%s':" % args.diag_dir)
    node_stats = OrderedDict()
    after_time = dates.date_parse(args.after)
    before_time = dates.date_parse(args.before)
    for log in logs:
        start_log_time, last_log_time = diag.log_range(log)
        with diag.FileWithProgress(log) as log_file:
            raw_events = parser.read_system_log(log_file)
            # keep only filter-cache events that fall strictly inside the window
            fc_events = []
            for ev in raw_events:
                if ev.get("event_category", "") != "filter_cache":
                    continue
                if "date" in ev and after_time < ev["date"] < before_time:
                    fc_events.append(ev)
            item_eviction_stats = _get_stats(fc_events, ItemFCStats,
                                             "eviction_items")
            bytes_eviction_stats = _get_stats(fc_events, BytesFCStats,
                                              "eviction_bytes")
            node = util.extract_node_name(log, True)
            node_stats[node] = OrderedDict([
                ("evictions", (bytes_eviction_stats, item_eviction_stats)),
                ("start", start_log_time),
                ("end", last_log_time),
            ])
    return OrderedDict([
        ("nodes", node_stats),
        ("after_time", after_time),
        ("before_time", before_time),
    ])
def generate_recommendations(parsed):
    """generate recommendations off the parsed data

    parsed: dict with "rec_logs" (log file paths), "warnings" (list that
    unreadable logs are appended to) and "configs" (parsed configurations).
    returns a list of recommendation objects.
    """
    # tallies feeding the recommendation rules at the bottom
    gc_over_500 = 0
    delayed_counter = 0
    pending_counter = 0
    blocked_counter = 0
    # suppresses events already seen for the same node (e.g. the same log
    # shipped twice inside a diag tarball)
    event_filter = diag.UniqEventPerNodeFilter()
    for rec_log in parsed["rec_logs"]:
        node = util.extract_node_name(rec_log)
        event_filter.set_node(node)
        with diag.FileWithProgress(rec_log) as rec_log_file:
            if rec_log_file.error:
                # surface unreadable logs as warnings instead of failing
                parsed["warnings"].append("%s failed with error %s" %
                                          (rec_log, rec_log_file.error))
            else:
                statuslogger_fixer = diag.UnknownStatusLoggerWriter()
                events = parser.read_system_log(rec_log_file)
                for event in [
                        e for e in events if not event_filter.is_duplicate(e)
                ]:
                    # must run before the event_type checks so statuslogger
                    # events get classified instead of skipped as "unknown"
                    statuslogger_fixer.check(event)
                    if event.get("event_type") == "unknown":
                        continue
                    if (event.get("event_type") == "pause" and
                            event.get("event_category") ==
                            "garbage_collection"):
                        # NOTE(review): assumes pause events always carry a
                        # numeric "duration" -- a None would raise TypeError
                        if event.get("duration") > 500:
                            gc_over_500 += 1
                    # NOTE(review): ints are passed by value, so this call
                    # cannot update delayed/pending/blocked in this scope --
                    # verify against _status_logger_counter's definition
                    _status_logger_counter(event, delayed_counter,
                                           pending_counter, blocked_counter)
    recommendations = []
    _recs_on_stages(recommendations, gc_over_500, delayed_counter,
                    pending_counter, blocked_counter)
    _recs_on_configs(recommendations, parsed["configs"])
    return recommendations
def add_gc_to_configs(configs, system_logs):
    """read the system logs and extract the configuration from each file

    records the worst garbage-collection pause seen per node, then stamps
    each config with config['worst_gc'] = (duration, node_name) for the
    worst node in its nodes_list. returns a list of warnings for
    unreadable files.
    """
    results = {}
    warnings = []
    for system_log in system_logs:
        with diag.FileWithProgress(system_log) as system_log_file:
            if system_log_file.error:
                warnings.append(system_log_file.error)
                # fix: skip files we can't read instead of parsing a dead
                # handle (the original fell through to read_system_log)
                continue
            events = parser.read_system_log(system_log_file)
            worst_gc = 0
            for event in events:
                if event.get('event_type') == 'pause' and event.get(
                        'event_category') == 'garbage_collection':
                    duration = event.get('duration')
                    # guard against pause events with no parsed duration
                    if duration is not None:
                        worst_gc = max(duration, worst_gc)
            node = util.extract_node_name(system_log)
            results[node] = worst_gc
    for config in configs:
        worst_gc_per_config = 0
        worst_node = ""
        for node in config.get('nodes_list', []):
            node_worst_gc = results.get(node, 0)
            if node_worst_gc > worst_gc_per_config:
                worst_gc_per_config = node_worst_gc
                worst_node = node
        config['worst_gc'] = (worst_gc_per_config, worst_node)
    return warnings
def add_gc_to_configs(configs, system_logs):
    """read the system logs and extract the configuration from each file

    records the worst garbage-collection pause seen per node, then stamps
    each config with config["worst_gc"] = (duration, node_name) for the
    worst node in its nodes_list. returns a list of warnings for
    unreadable files.
    """
    results = OrderedDict()
    warnings = []
    for system_log in system_logs:
        with diag.FileWithProgress(system_log) as system_log_file:
            if system_log_file.error:
                warnings.append(system_log_file.error)
                # fix: skip files we can't read instead of parsing a dead
                # handle (the original fell through to read_system_log)
                continue
            events = parser.read_system_log(system_log_file)
            worst_gc = 0
            for event in events:
                if (
                    event.get("event_type") == "pause"
                    and event.get("event_category") == "garbage_collection"
                ):
                    duration = event.get("duration")
                    # guard against pause events with no parsed duration
                    if duration is not None:
                        worst_gc = max(duration, worst_gc)
            node = util.extract_node_name(system_log)
            results[node] = worst_gc
    for config in configs:
        worst_gc_per_config = 0
        worst_node = ""
        for node in config.get("nodes_list", []):
            node_worst_gc = results.get(node, 0)
            if node_worst_gc > worst_gc_per_config:
                worst_gc_per_config = node_worst_gc
                worst_node = node
        config["worst_gc"] = (worst_gc_per_config, worst_node)
    return warnings
def parse(args):
    """parse entry point, generates a report object from a tarball or series of files

    returns a dict with per-node filter-cache eviction stats plus the
    time window used.
    """
    logs = diag.find_files(args, args.system_log_prefix)
    print("from directory '%s':" % args.diag_dir)
    node_stats = {}
    after_time = dates.date_parse(args.after)
    before_time = dates.date_parse(args.before)
    for log in logs:
        start_log_time, last_log_time = diag.log_range(log)
        with diag.FileWithProgress(log) as log_file:
            raw_events = parser.read_system_log(log_file)
            # fix: guard key access -- not every parsed event carries a
            # 'date' or 'event_category' key, and direct subscripting
            # raised KeyError on those events
            events = [
                event for event in raw_events
                if "date" in event
                and after_time < event['date'] < before_time
            ]
            filter_cache_events = [
                event for event in events
                if event.get('event_category', '') == 'filter_cache'
            ]
            item_eviction_stats = _get_stats(filter_cache_events, ItemFCStats,
                                             'eviction_items')
            bytes_eviction_stats = _get_stats(filter_cache_events, BytesFCStats,
                                              'eviction_bytes')
            node = util.extract_node_name(log, True)
            node_stats[node] = {
                "evictions": (bytes_eviction_stats, item_eviction_stats),
                "start": start_log_time,
                "end": last_log_time,
            }
    return {
        "nodes": node_stats,
        "after_time": after_time,
        "before_time": before_time,
    }
def warn_missing(node_configs, file_list, warnings, text):
    """log to warnings file"""
    if not file_list:
        # no files at all means every node is missing this artifact
        warnings.append("%s: all nodes" % text)
        return
    covered_nodes = {util.extract_node_name(f) for f in file_list}
    missing = [
        node for node in node_configs.keys() if node not in covered_nodes
    ]
    if missing:
        warnings.append("%s: %s" % (text, ", ".join(missing)))
def analyze(self):
    """parses logs for results

    scans every target log line-by-line for the supplied regex, bucketing
    matches by timestamp globally (self.matches) and per node
    (self.node_matches); sets self.analyzed when complete.
    """
    print("bucketgrep version %s" % VERSION)
    print("search: '%s'" % self.supplied_regex)
    target = None
    if self.files:
        target = self.files
    elif self.diag_dir:
        # resolve "." so the banner shows a real path instead of a dot
        if self.diag_dir == ".":
            directory_path = os.getcwd()
            print("from directory '%s':" % directory_path)
        else:
            print("from directory '%s':" % self.diag_dir)
        target = diag.find_logs(self.diag_dir)
    else:
        raise Exception("no diag dir and no files specified")
    for file in target:
        with diag.FileWithProgress(file) as log:
            node_name = extract_node_name(file, ignore_missing_nodes=True)
            self.node_matches[node_name] = OrderedDefaultDict(list)
            for line in log:
                # as long as it's a valid log line we want the date,
                # even if we don't care about the rest of the line so we can set
                # the last date for any strayregex lines that match
                current_dt = self.valid_log_regex.match(line)
                if current_dt:
                    dt = date()(current_dt.group("date"))
                    # if the log line is valid we want to set the last_time
                    self.last_time = dt
                # we now can validate if our search term matches the log line
                d = self.timeregex.match(line)
                if d:
                    # normal case, well-formatted log line
                    # NOTE(review): assumes timeregex only matches lines that
                    # also matched valid_log_regex above, otherwise dt here is
                    # stale (or unbound on the very first line) -- verify
                    self.__setdates(dt)
                    if self.start_time and dt < self.start_time:
                        continue
                    if self.end_time and dt > self.end_time:
                        continue
                    self.matches[dt].append(line)
                    self.node_matches[node_name][dt].append(line)
                    self.count += 1
                else:
                    m = self.strayregex.match(line)
                    # check for a match in an unformatted line, like a traceback
                    if m:
                        if self.last_time is None:
                            # match, but no previous timestamp to associate with
                            self.unknown += 1
                            continue
                        self.matches[self.last_time].append(line)
                        self.node_matches[node_name][
                            self.last_time].append(line)
                        self.count += 1
    self.analyzed = True
def _parse_cfstats(cfstats_files):
    """parse each cfstats file into a per-node map; unreadable files become warnings"""
    parsed_by_node = OrderedDict()
    warnings = []
    for path in cfstats_files:
        node_name = util.extract_node_name(path)
        try:
            parsed_by_node[node_name] = cfstats.read_file(path)
        except IOError as e:
            # keep going: one bad file should not abort the whole report
            warnings.append(e)
    return parsed_by_node, warnings
def analyze(self):
    """analyze files"""
    if self.files:
        targets = self.files
    elif self.diag_dir:
        targets = diag.find_logs(self.diag_dir)
    else:
        raise Exception("no diag dir and no files specified")
    for log_path in targets:
        node = extract_node_name(log_path)
        with diag.FileWithProgress(log_path) as log_file:
            for event in parser.read_log(log_file, gc.capture_line):
                # only GC pause events are of interest here
                if event["event_type"] != "pause":
                    continue
                when = event["date"]
                if self.start_time and when < self.start_time:
                    continue
                if self.end_time and when > self.end_time:
                    continue
                self.__setdates(when, node)
                self.pauses[node][when].append(event["duration"])
                self.gc_types[event["gc_type"]] += 1
    self.analyzed = True
def analyze(self):
    """analyze log files

    walks every system/debug log, feeds each event into the per-node summary
    objects in self.nodes, and sets self.analyzed when done. later calls are
    no-ops.
    """
    if self.analyzed:
        return
    # drops events already seen for the same node (duplicate logs in a tarball)
    event_filter = UniqEventPerNodeFilter()
    target = None
    if self.files:
        target = self.files
    elif self.diag_dir:
        target_system = find_logs(self.diag_dir,
                                  file_to_find=self.syslog_prefix)
        target_debug = find_logs(self.diag_dir,
                                 file_to_find=self.dbglog_prefix)
        target = target_system + target_debug
    else:
        raise Exception("no diag dir and no files specified")
    for f in target:
        nodename = extract_node_name(f, ignore_missing_nodes=True)
        event_filter.set_node(nodename)
        node = self.nodes[nodename]
        if env.DEBUG:
            print("parsing", f)
        with FileWithProgress(f) as log:
            # fills in types/dates for statuslogger lines the parser could
            # not classify on its own
            statuslogger_fixer = UnknownStatusLoggerWriter()
            for event in parser.read_system_log(log):
                # has to run before the date filtering below so the fixer's
                # last_event_date stays current
                statuslogger_fixer.check(event)
                if self.start and event["date"] < self.start:
                    continue
                if self.end and event["date"] > self.end:
                    continue
                self.__setdates(node, statuslogger_fixer.last_event_date)
                node.lines += 1
                if event_filter.is_duplicate(event):
                    node.skipped_lines += 1
                    continue
                if env.DEBUG:
                    # tally rule coverage, debug reporting only
                    if "rule_type" in event:
                        self.rule_types[event["rule_type"]] += 1
                    elif event["event_type"] == "unknown":
                        self.rule_types["unknown"] += 1
                    else:
                        self.rule_types["no type"] += 1
                if event["event_type"] == "server_version":
                    if event.get("version"):
                        node.version = event["version"]
                        # a 6.x DSE version implies the forked Cassandra
                        if node.version.startswith("6"):
                            node.cassandra_version = "DSE Private Fork"
                    elif event.get("cassandra_version"):
                        node.cassandra_version = event["cassandra_version"]
                    # skipping solr, spark etc as it maybe too much noise for statuslogger
                elif event["event_type"] == "memtable_status":
                    tname = ".".join([event["keyspace"], event["table"]])
                    if event["ops"] > node.tables[tname].ops:
                        node.tables[tname].ops = event["ops"]
                    try:
                        if event["data"] > node.tables[tname].data:
                            node.tables[tname].data = event["data"]
                    except Exception as e:
                        # dump the offending event before re-raising to aid debugging
                        print(event)
                        raise e
                elif event["event_type"] == "pause":
                    node.pauses.append(event["duration"])
                elif event["event_type"] == "threadpool_header":
                    # a new statuslogger dump starts here; count it on the
                    # node and on the whole run
                    node.dumps_analyzed += 1
                    self.dumps_analyzed += 1
                elif event["event_type"] == "threadpool_status":
                    if re.match(r"TPC/\d+$", event["pool_name"]):
                        # per-core TPC pools only exist in 6.x
                        if not node.version:
                            node.version = "6.x"
                        if "delayed" in event and event["delayed"]:
                            val = event["delayed"]
                            node.stages["local backpressure"][
                                event["pool_name"]].append(val)
                    else:
                        for pool in [
                                "active",
                                "pending",
                                "blocked",
                                "all_time_blocked",
                        ]:
                            if pool in event and event[pool]:
                                # honor the optional stage-name filter
                                if not self.wanted_stages or event[
                                        "pool_name"].startswith(
                                            self.wanted_stages):
                                    node.stages[pool][
                                        event["pool_name"]].append(
                                            event[pool])
    self.analyzed = True
    if env.DEBUG:
        print(self.rule_types.items())
def generate_recommendations(parsed):
    """generate recommendations off the parsed data

    reads every log in parsed["rec_logs"], tallies problem indicators (GC
    pauses over target, tombstone scans, backpressure, drops, solr
    soft-commit changes, ...) and converts them into a list of
    recommendation objects. unreadable logs are appended to
    parsed["warnings"].
    """
    # determine the GC pause target: use -XX:MaxGCPauseMillis when present,
    # otherwise fall back to the common 500ms default
    gc_target = 0
    pause_target = None
    if "jvm_args" in parsed:
        pause_target = parsed["jvm_args"].get("-XX:MaxGCPauseMillis")
    if pause_target is not None:
        parse_target_array = []
        if len(pause_target.keys()) > 0:
            for value, nodes in pause_target.items():
                try:
                    parsed_target = int(value)
                # NOTE(review): int() raises ValueError (not TypeError) for
                # non-numeric strings; only None-like values reach this
                # handler -- confirm intent
                except TypeError as e:
                    raise ValueError(
                        "unexpected gc target pause of %s on nodes %s with error %s"
                        % (value, nodes, e))
                # nodes may disagree; keep the highest configured target
                if gc_target < parsed_target:
                    gc_target = parsed_target
                parse_target_array.append("(%s: %s)" %
                                          (parsed_target, ",".join(nodes)))
            if len(pause_target.keys()) > 1:
                print(
                    "WARN there are several pause targets so using the highest for recommendations but they may not be accurate: Configuration is as follows %s"
                    % "; ".join(parse_target_array))
    else:
        print(
            "WARN cannot find -XX:MaxGCPauseMillis in the logs setting common default of 500ms"
        )
        gc_target = 500
    # tallies accumulated across every log
    tombstone_errors = 0
    tombstone_warns = 0
    gc_over_target = 0
    counter = StatusLoggerCounter()
    solr_index_backoff = OrderedDict()
    solr_index_restore = OrderedDict()
    zero_copy_errors = 0
    drops_remote_only = 0
    rejected = 0
    drop_sums = 0
    drop_types = set()
    # suppresses events already seen for the same node (duplicate logs)
    event_filter = diag.UniqEventPerNodeFilter()
    bp = BackpressureStats(local_backpressure_active={}, per_core_bp={})
    core_balance = {}
    # threadpool_status rule flavors that carry per-core TPC data
    tpc_event_types = ["6.8", "new"]
    pool_name_pattern = re.compile(r"TPC\/(?P<core>[0-9]+)$")
    for rec_log in parsed["rec_logs"]:
        node = util.extract_node_name(rec_log)
        event_filter.set_node(node)
        with diag.FileWithProgress(rec_log) as rec_log_file:
            if rec_log_file.error:
                parsed["warnings"].append("%s failed with error %s" %
                                          (rec_log, rec_log_file.error))
            else:
                statuslogger_fixer = diag.UnknownStatusLoggerWriter()
                if env.DEBUG:
                    print("parsing", rec_log_file.filepath)
                events = parser.read_system_log(rec_log_file)
                for event in [
                        e for e in events if not event_filter.is_duplicate(e)
                ]:
                    # statuslogger_fixer.check(event) HAS to run first before
                    # any code below or statuslogger events will get tossed.
                    statuslogger_fixer.check(event)
                    event_type = event.get("event_type")
                    event_category = event.get("event_category")
                    event_product = event.get("event_product")
                    rule_type = event.get("rule_type")
                    if event_type == "unknown":
                        continue
                    if event_type == "pause" and event_category == "garbage_collection":
                        if event.get("duration") > gc_target:
                            gc_over_target += 1
                    elif event_product == "tombstone":
                        if event_type == "scan_error":
                            tombstone_errors += event.get("tombstones")
                        elif (event_type == "tpc_scan_warn"
                              or event_type == "seda_scan_warn"):
                            tombstone_warns += event.get("tombstones")
                    elif (event_type == "threadpool_status"
                          and rule_type in tpc_event_types):
                        # track pending work per TPC core to spot imbalance
                        pool_name = event.get("pool_name")
                        if env.DEBUG:
                            print("detected pool name is %s" % pool_name)
                        match = pool_name_pattern.match(pool_name)
                        if match:
                            core = int(match.group(1))
                            if env.DEBUG:
                                print("detected core is %i" % core)
                            pending = event.get("pending")
                            if node in core_balance:
                                core_balance[node].append(
                                    PendingCoreMeasurement(core, pending))
                            else:
                                core_balance[node] = [
                                    PendingCoreMeasurement(core, pending)
                                ]
                    elif (event_category == "streaming"
                          and event_product == "zcs"
                          and event_type == "bloom_filter"):
                        zero_copy_errors += 1
                    elif event_type == "core_backpressure":
                        if node in bp.per_core_bp.keys():
                            bp.per_core_bp[node].cores.append(
                                event.get("core_num"))
                            bp.per_core_bp[node].total_bp_events += 1
                        else:
                            bp.per_core_bp[node] = CoreBackpressureStats(
                                cores=[event.get("core_num")],
                                total_bp_events=1)
                    elif event_type == "core_backpressure_local":
                        if node in bp.local_backpressure_active:
                            bp.local_backpressure_active[node] += 1
                        else:
                            bp.local_backpressure_active[node] = 1
                    elif event_category == "indexing":
                        # solr soft-commit backoff/restore tracking per core
                        core_name = event.get("core_name")
                        d = event.get("date")
                        if event.get("event_type") == "increase_soft_commit":
                            if core_name in solr_index_backoff:
                                solr_index_backoff[core_name]["count"] += 1
                                solr_index_backoff[core_name]["dates"].append(
                                    d)
                            else:
                                solr_index_backoff[core_name] = {
                                    "count": 1,
                                    "dates": [event.get("date")],
                                }
                        elif event_type == "restore_soft_commit":
                            if core_name in solr_index_restore:
                                solr_index_restore[core_name]["count"] += 1
                                solr_index_restore[core_name]["dates"].append(
                                    d)
                            else:
                                solr_index_restore[core_name] = {
                                    "count": 1,
                                    "dates": [event.get("date")],
                                }
                    elif event_type == "network_backpressure":
                        rejected += event.get("total_dropped")
                    elif event_type == "drops":
                        local = event.get("localCount")
                        remote = event.get("remoteCount")
                        drop_type = event.get("messageType")
                        drop_types.add(drop_type)
                        drop_sums += local + remote
                        # remote-only drops hint at network issues rather
                        # than local overload
                        if remote > 0 and local == 0:
                            drops_remote_only += 1
                    _status_logger_counter(event, counter)
    recommendations = []
    _recs_on_stages(recommendations, gc_over_target, gc_target, counter)
    _recs_on_configs(recommendations, parsed["configs"])
    _recs_on_solr(recommendations, solr_index_backoff, solr_index_restore)
    _recs_on_drops(
        recommendations,
        drops_remote_only,
        sorted(list(drop_types), reverse=True),
        drop_sums,
    )
    _recs_on_zero_copy(recommendations, zero_copy_errors)
    _recs_on_rejects(recommendations, rejected)
    _recs_on_bp(recommendations, bp, gc_over_target)
    _recs_on_core_balance(recommendations, core_balance)
    _recs_on_tombstones(recommendations, tombstone_errors, tombstone_warns)
    return recommendations