def parse(args): """parse entry point, generates a report object from a tarball or series of files""" logs = diag.find_files(args, args.system_log_prefix) if args.diag_dir == ".": directory_path = os.getcwd() print("from directory '%s':" % directory_path) else: print("from directory '%s':" % args.diag_dir) node_stats = OrderedDict() after_time = dates.date_parse(args.after) before_time = dates.date_parse(args.before) for log in logs: start_log_time, last_log_time = diag.log_range(log) with diag.FileWithProgress(log) as log_file: raw_events = parser.read_system_log(log_file) item_ev_stats, bytes_ev_stats = calculate_eviction_stats( raw_events, after_time, before_time) node = util.extract_node_name(log, True) node_stats[node] = OrderedDict([ ("evictions", (bytes_ev_stats, item_ev_stats)), ("start", start_log_time), ("end", last_log_time), ]) return OrderedDict([ ("nodes", node_stats), ("after_time", after_time), ("before_time", before_time), ])
def generate_recommendations(parsed):
    """generate recommendations off the parsed data"""
    gc_over_500 = 0
    delayed_counter = 0
    pending_counter = 0
    blocked_counter = 0
    event_filter = diag.UniqEventPerNodeFilter()
    for rec_log in parsed["rec_logs"]:
        node = util.extract_node_name(rec_log)
        event_filter.set_node(node)
        with diag.FileWithProgress(rec_log) as rec_log_file:
            if rec_log_file.error:
                parsed["warnings"].append(
                    "%s failed with error %s" % (rec_log, rec_log_file.error)
                )
            else:
                statuslogger_fixer = diag.UnknownStatusLoggerWriter()
                events = parser.read_system_log(rec_log_file)
                for event in [e for e in events if not event_filter.is_duplicate(e)]:
                    statuslogger_fixer.check(event)
                    if event.get("event_type") == "unknown":
                        continue
                    if (
                        event.get("event_type") == "pause"
                        and event.get("event_category") == "garbage_collection"
                    ):
                        if event.get("duration") > 500:
                            gc_over_500 += 1
                    _status_logger_counter(
                        event, delayed_counter, pending_counter, blocked_counter
                    )
    recommendations = []
    _recs_on_stages(
        recommendations, gc_over_500, delayed_counter, pending_counter, blocked_counter
    )
    _recs_on_configs(recommendations, parsed["configs"])
    return recommendations
def parse(args): """parse entry point, generates a report object from a tarball or series of files""" logs = diag.find_files(args, args.system_log_prefix) print("from directory '%s':" % args.diag_dir) node_stats = OrderedDict() after_time = dates.date_parse(args.after) before_time = dates.date_parse(args.before) for log in logs: start_log_time, last_log_time = diag.log_range(log) with diag.FileWithProgress(log) as log_file: raw_events = parser.read_system_log(log_file) filter_cache_events_all = [ event for event in raw_events if event.get("event_category", "") == "filter_cache" ] filter_cache_events = [ event for event in filter_cache_events_all if "date" in event and event["date"] > after_time and event["date"] < before_time ] item_eviction_stats = _get_stats(filter_cache_events, ItemFCStats, "eviction_items") bytes_eviction_stats = _get_stats(filter_cache_events, BytesFCStats, "eviction_bytes") node = util.extract_node_name(log, True) node_stats[node] = OrderedDict([ ("evictions", (bytes_eviction_stats, item_eviction_stats)), ("start", start_log_time), ("end", last_log_time), ]) return OrderedDict([ ("nodes", node_stats), ("after_time", after_time), ("before_time", before_time), ])
def test_calculate_eviction_stats(self):
    lines = [
        "INFO [RemoteMessageServer query worker - 81] 2020-01-21 11:34:33,033 SolrFilterCache.java:340 - Filter cache org.apache.solr.search.SolrFilterCache$1@7c723229 has reached 8000000 entries of a maximum of 8000000. Evicting oldest entries...",
        "ERROR [RemoteMessageServer query worker - 18] 2020-01-21 11:34:34,475 MessageServer.java:277 - Failed to process request:",
        "INFO [RemoteMessageServer query worker - 81] 2020-01-21 11:34:35,448 SolrFilterCache.java:356 - ...eviction completed in 1304 milliseconds. Filter cache org.apache.solr.search.SolrFilterCache$1@7c723229 usage is now 32441266 bytes across 4000000 entries.",
        "INFO [LocalMessageServer query worker - 77] 2020-01-21 12:24:23,912 SolrFilterCache.java:340 - Filter cache org.apache.solr.search.SolrFilterCache$1@324b2c16 has reached 3999974 entries of a maximum of 8000000. Evicting oldest entries...",
        "INFO [LocalMessageServer query worker - 77] 2020-01-21 12:24:23,912 SolrFilterCache.java:356 - ...eviction completed in 1 milliseconds. Filter cache org.apache.solr.search.SolrFilterCache$1@324b2c16 usage is now 32005744 bytes across 3999962 entries.",
        "INFO [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,942 SolrFilterCache.java:311 - Filter cache org.apache.solr.search.SolrFilterCache$6@5af917a4 has reached 16 GB bytes of off-heap memory usage, the maximum is 16 GB. Evicting oldest entries...",
        "INFO [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,950 SolrFilterCache.java:328 - ...eviction completed in 9 milliseconds. Filter cache org.apache.solr.search.SolrFilterCache$6@5af917a4 usage is now 114781220 across 159 entries.",
        # new version of logs, after DSP-18693
        "INFO [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,942 SolrFilterCache.java:311 - Filter cache org.apache.solr.search.SolrFilterCache$6@5af917b6 has reached 16 GB bytes of off-heap memory usage, the maximum is 16 GB. Evicting oldest entries...",
        "INFO [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,950 SolrFilterCache.java:328 - ...eviction completed in 8 milliseconds. Filter cache org.apache.solr.search.SolrFilterCache$6@5af917b6 usage is now 114781220 bytes across 159 entries.",
        # eviction event without duration log line
        "INFO [RemoteMessageServer query worker - 41] 2020-01-21 12:47:26,970 SolrFilterCache.java:311 - Filter cache org.apache.solr.search.SolrFilterCache$6@5af917c7 has reached 16 GB bytes of off-heap memory usage, the maximum is 16 GB. Evicting oldest entries...",
    ]
    raw_events = parser.read_system_log(lines)
    after_time = dates.date_parse("2020-01-21 00:00:00,000")
    before_time = dates.date_parse("2020-02-21 00:00:00,000")
    item_ev_stats, bytes_ev_stats = calculate_eviction_stats(
        raw_events, after_time, before_time
    )
    assert len(item_ev_stats.values()) == 2
    assert sum([s.duration for s in item_ev_stats.values()]) == 1304 + 1
    assert len(bytes_ev_stats.values()) == 3
    assert sum([s.duration for s in bytes_ev_stats.values()]) == 9 + 8 + 0
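# The helper calculate_eviction_stats exercised by the test above is not shown in this
# section. The sketch below is a minimal illustration of what it plausibly wraps, based
# on the inline filtering in the parse() variant above; ItemFCStats, BytesFCStats, and
# _get_stats are assumed to be provided by the surrounding module, so treat this as a
# hedged reconstruction rather than the verified implementation.
def calculate_eviction_stats(raw_events, after_time, before_time):
    """filter filter_cache events to the requested time window and build item/bytes stats"""
    filter_cache_events = [
        event
        for event in raw_events
        if event.get("event_category", "") == "filter_cache"
        and "date" in event
        and after_time < event["date"] < before_time
    ]
    item_eviction_stats = _get_stats(filter_cache_events, ItemFCStats, "eviction_items")
    bytes_eviction_stats = _get_stats(filter_cache_events, BytesFCStats, "eviction_bytes")
    return item_eviction_stats, bytes_eviction_stats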
def add_gc_to_configs(configs, system_logs):
    """read the system logs and extract the configuration from each file"""
    results = {}
    warnings = []
    for system_log in system_logs:
        with diag.FileWithProgress(system_log) as system_log_file:
            if system_log_file.error:
                warnings.append(system_log_file.error)
            events = parser.read_system_log(system_log_file)
            worst_gc = 0
            for event in events:
                if event.get('event_type') == 'pause' and event.get(
                        'event_category') == 'garbage_collection':
                    worst_gc = max(event.get('duration'), worst_gc)
            node = util.extract_node_name(system_log)
            results[node] = worst_gc
    for config in configs:
        worst_gc_per_config = 0
        worst_node = ""
        for node in config.get('nodes_list', []):
            node_worst_gc = results.get(node, 0)
            if node_worst_gc > worst_gc_per_config:
                worst_gc_per_config = node_worst_gc
                worst_node = node
        config['worst_gc'] = (worst_gc_per_config, worst_node)
    return warnings
def add_gc_to_configs(configs, system_logs):
    """read the system logs and extract the configuration from each file"""
    results = OrderedDict()
    warnings = []
    for system_log in system_logs:
        with diag.FileWithProgress(system_log) as system_log_file:
            if system_log_file.error:
                warnings.append(system_log_file.error)
            events = parser.read_system_log(system_log_file)
            worst_gc = 0
            for event in events:
                if (
                    event.get("event_type") == "pause"
                    and event.get("event_category") == "garbage_collection"
                ):
                    worst_gc = max(event.get("duration"), worst_gc)
            node = util.extract_node_name(system_log)
            results[node] = worst_gc
    for config in configs:
        worst_gc_per_config = 0
        worst_node = ""
        for node in config.get("nodes_list", []):
            node_worst_gc = results.get(node, 0)
            if node_worst_gc > worst_gc_per_config:
                worst_gc_per_config = node_worst_gc
                worst_node = node
        config["worst_gc"] = (worst_gc_per_config, worst_node)
    return warnings
def parse(args): """parse entry point, generates a report object from a tarball or series of files""" logs = diag.find_files(args, args.system_log_prefix) print("from directory '%s':" % args.diag_dir) node_stats = {} after_time = dates.date_parse(args.after) before_time = dates.date_parse(args.before) for log in logs: start_log_time, last_log_time = diag.log_range(log) with diag.FileWithProgress(log) as log_file: raw_events = parser.read_system_log(log_file) events = [ event for event in raw_events if event['date'] > after_time and event['date'] < before_time ] filter_cache_events = [event for event in events \ if event['event_category'] == 'filter_cache'] item_eviction_stats = _get_stats(filter_cache_events, ItemFCStats, 'eviction_items') bytes_eviction_stats = _get_stats(filter_cache_events, BytesFCStats, 'eviction_bytes') node = util.extract_node_name(log, True) node_stats[node] = {"evictions" : (bytes_eviction_stats, item_eviction_stats), \ "start": start_log_time, "end": last_log_time, \ } return { \ "nodes": node_stats, \ "after_time": after_time, \ "before_time": before_time, \ }
def parse(args): """reads the args used in the command to determine what to parse and how to parse it. The returned object should be suitable for a report""" files = diag.find_files(args, args.log_prefix) queries = [] for filename in files: with diag.FileWithProgress(filename) as log_file: events = parser.read_system_log(log_file) for event in events: if event.get('event_type', '') == 'query_logs' and \ event.get('event_product', '') == 'solr' and \ event.get('event_category', '') == 'query_component': queries.append(parse_event(event)) return Parsed( queries=queries, top_n_worst=args.top, unique_reasons=args.uniquereasons, score_threshold=args.scorethreshold, )
def analyze(self): """ analyze log files """ if self.analyzed: return # pylint: disable=too-many-nested-blocks event_filter = UniqEventPerNodeFilter() target = None if self.files: target = self.files elif self.diag_dir: target_system = find_logs(self.diag_dir, file_to_find=self.syslog_prefix) target_debug = find_logs(self.diag_dir, file_to_find=self.dbglog_prefix) target = target_system + target_debug else: raise Exception("no diag dir and no files specified") for file in target: nodename = node_name(file) event_filter.set_node(nodename) node = self.nodes[nodename] if env.DEBUG: print("parsing", file) log = open(file, 'r') statuslogger_fixer = UnknownStatusLoggerWriter() for event in parser.read_system_log(log): statuslogger_fixer.check(event) if self.start and event['date'] < self.start: continue if self.end and event['date'] > self.end: continue self.__setdates(node, statuslogger_fixer.last_event_date) node.lines += 1 if event_filter.is_duplicate(event): node.skipped_lines += 1 continue if env.DEBUG: if 'rule_type' in event: self.rule_types[event['rule_type']] += 1 elif event['event_type'] == 'unknown': self.rule_types['unknown'] += 1 else: self.rule_types['no type'] += 1 if event['event_type'] == 'server_version': if event.get('version'): node.version = event['version'] elif event.get('cassandra_version'): node.cassandra_version = event['cassandra_version'] #skipping solr, spark etc as it maybe too much noise for statuslogger elif event['event_type'] == 'memtable_status': tname = '.'.join([event['keyspace'], event['table']]) if event['ops'] > node.tables[tname].ops: node.tables[tname].ops = event['ops'] try: if event['data'] > node.tables[tname].data: node.tables[tname].data = event['data'] except Exception as e: print(event) raise e elif event['event_type'] == 'pause': node.pauses.append(event['duration']) elif event['event_type'] == 'threadpool_header': node.dumps_analyzed += 1 self.dumps_analyzed += 1 elif event['event_type'] == 'threadpool_status': if re.match(r"TPC/\d+$", event['pool_name']): if not node.version: node.version = "6.x" if 'delayed' in event and event['delayed']: print(event) val = event['delayed'] node.stages['local backpressure'][ event['pool_name']].append(val) else: for pool in [ 'active', 'pending', 'blocked', 'all_time_blocked' ]: if pool in event and event[pool]: if not self.wanted_stages or event[ 'pool_name'].startswith( self.wanted_stages): node.stages[pool][ event['pool_name']].append(event[pool]) self.analyzed = True if env.DEBUG: print(self.rule_types.items())
def analyze(self): """analyze log files""" if self.analyzed: return event_filter = UniqEventPerNodeFilter() target = None if self.files: target = self.files elif self.diag_dir: target_system = find_logs(self.diag_dir, file_to_find=self.syslog_prefix) target_debug = find_logs(self.diag_dir, file_to_find=self.dbglog_prefix) target = target_system + target_debug else: raise Exception("no diag dir and no files specified") for f in target: nodename = extract_node_name(f, ignore_missing_nodes=True) event_filter.set_node(nodename) node = self.nodes[nodename] if env.DEBUG: print("parsing", f) with FileWithProgress(f) as log: statuslogger_fixer = UnknownStatusLoggerWriter() for event in parser.read_system_log(log): statuslogger_fixer.check(event) if self.start and event["date"] < self.start: continue if self.end and event["date"] > self.end: continue self.__setdates(node, statuslogger_fixer.last_event_date) node.lines += 1 if event_filter.is_duplicate(event): node.skipped_lines += 1 continue if env.DEBUG: if "rule_type" in event: self.rule_types[event["rule_type"]] += 1 elif event["event_type"] == "unknown": self.rule_types["unknown"] += 1 else: self.rule_types["no type"] += 1 if event["event_type"] == "server_version": if event.get("version"): node.version = event["version"] if node.version.startswith("6"): node.cassandra_version = "DSE Private Fork" elif event.get("cassandra_version"): node.cassandra_version = event["cassandra_version"] # skipping solr, spark etc as it maybe too much noise for statuslogger elif event["event_type"] == "memtable_status": tname = ".".join([event["keyspace"], event["table"]]) if event["ops"] > node.tables[tname].ops: node.tables[tname].ops = event["ops"] try: if event["data"] > node.tables[tname].data: node.tables[tname].data = event["data"] except Exception as e: print(event) raise e elif event["event_type"] == "pause": node.pauses.append(event["duration"]) elif event["event_type"] == "threadpool_header": node.dumps_analyzed += 1 self.dumps_analyzed += 1 elif event["event_type"] == "threadpool_status": if re.match(r"TPC/\d+$", event["pool_name"]): if not node.version: node.version = "6.x" if "delayed" in event and event["delayed"]: val = event["delayed"] node.stages["local backpressure"][ event["pool_name"]].append(val) else: for pool in [ "active", "pending", "blocked", "all_time_blocked", ]: if pool in event and event[pool]: if not self.wanted_stages or event[ "pool_name"].startswith( self.wanted_stages): node.stages[pool][ event["pool_name"]].append( event[pool]) self.analyzed = True if env.DEBUG: print(self.rule_types.items())
def generate_recommendations(parsed):
    """generate recommendations off the parsed data"""
    gc_target = 0
    pause_target = None
    if "jvm_args" in parsed:
        pause_target = parsed["jvm_args"].get("-XX:MaxGCPauseMillis")
    if pause_target is not None:
        parse_target_array = []
        if len(pause_target.keys()) > 0:
            for value, nodes in pause_target.items():
                try:
                    parsed_target = int(value)
                except TypeError as e:
                    raise ValueError(
                        "unexpected gc target pause of %s on nodes %s with error %s"
                        % (value, nodes, e)
                    )
                if gc_target < parsed_target:
                    gc_target = parsed_target
                parse_target_array.append("(%s: %s)" % (parsed_target, ",".join(nodes)))
            if len(pause_target.keys()) > 1:
                print(
                    "WARN there are several pause targets so using the highest for recommendations but they may not be accurate: Configuration is as follows %s"
                    % "; ".join(parse_target_array)
                )
    else:
        print(
            "WARN cannot find -XX:MaxGCPauseMillis in the logs setting common default of 500ms"
        )
        gc_target = 500
    tombstone_errors = 0
    tombstone_warns = 0
    gc_over_target = 0
    counter = StatusLoggerCounter()
    solr_index_backoff = OrderedDict()
    solr_index_restore = OrderedDict()
    zero_copy_errors = 0
    drops_remote_only = 0
    rejected = 0
    drop_sums = 0
    drop_types = set()
    event_filter = diag.UniqEventPerNodeFilter()
    bp = BackpressureStats(local_backpressure_active={}, per_core_bp={})
    core_balance = {}
    tpc_event_types = ["6.8", "new"]
    pool_name_pattern = re.compile(r"TPC\/(?P<core>[0-9]+)$")
    for rec_log in parsed["rec_logs"]:
        node = util.extract_node_name(rec_log)
        event_filter.set_node(node)
        with diag.FileWithProgress(rec_log) as rec_log_file:
            if rec_log_file.error:
                parsed["warnings"].append(
                    "%s failed with error %s" % (rec_log, rec_log_file.error)
                )
            else:
                statuslogger_fixer = diag.UnknownStatusLoggerWriter()
                if env.DEBUG:
                    print("parsing", rec_log_file.filepath)
                events = parser.read_system_log(rec_log_file)
                for event in [e for e in events if not event_filter.is_duplicate(e)]:
                    # statuslogger_fixer.check(event) HAS to run first before any
                    # code below or statuslogger events will get tossed.
                    statuslogger_fixer.check(event)
                    event_type = event.get("event_type")
                    event_category = event.get("event_category")
                    event_product = event.get("event_product")
                    rule_type = event.get("rule_type")
                    if event_type == "unknown":
                        continue
                    if (
                        event_type == "pause"
                        and event_category == "garbage_collection"
                    ):
                        if event.get("duration") > gc_target:
                            gc_over_target += 1
                    elif event_product == "tombstone":
                        if event_type == "scan_error":
                            tombstone_errors += event.get("tombstones")
                        elif (
                            event_type == "tpc_scan_warn"
                            or event_type == "seda_scan_warn"
                        ):
                            tombstone_warns += event.get("tombstones")
                    elif (
                        event_type == "threadpool_status"
                        and rule_type in tpc_event_types
                    ):
                        pool_name = event.get("pool_name")
                        if env.DEBUG:
                            print("detected pool name is %s" % pool_name)
                        match = pool_name_pattern.match(pool_name)
                        if match:
                            core = int(match.group(1))
                            if env.DEBUG:
                                print("detected core is %i" % core)
                            pending = event.get("pending")
                            if node in core_balance:
                                core_balance[node].append(
                                    PendingCoreMeasurement(core, pending)
                                )
                            else:
                                core_balance[node] = [
                                    PendingCoreMeasurement(core, pending)
                                ]
                    elif (
                        event_category == "streaming"
                        and event_product == "zcs"
                        and event_type == "bloom_filter"
                    ):
                        zero_copy_errors += 1
                    elif event_type == "core_backpressure":
                        if node in bp.per_core_bp.keys():
                            bp.per_core_bp[node].cores.append(event.get("core_num"))
                            bp.per_core_bp[node].total_bp_events += 1
                        else:
                            bp.per_core_bp[node] = CoreBackpressureStats(
                                cores=[event.get("core_num")], total_bp_events=1
                            )
                    elif event_type == "core_backpressure_local":
                        if node in bp.local_backpressure_active:
                            bp.local_backpressure_active[node] += 1
                        else:
                            bp.local_backpressure_active[node] = 1
                    elif event_category == "indexing":
                        core_name = event.get("core_name")
                        d = event.get("date")
                        if event.get("event_type") == "increase_soft_commit":
                            if core_name in solr_index_backoff:
                                solr_index_backoff[core_name]["count"] += 1
                                solr_index_backoff[core_name]["dates"].append(d)
                            else:
                                solr_index_backoff[core_name] = {
                                    "count": 1,
                                    "dates": [event.get("date")],
                                }
                        elif event_type == "restore_soft_commit":
                            if core_name in solr_index_restore:
                                solr_index_restore[core_name]["count"] += 1
                                solr_index_restore[core_name]["dates"].append(d)
                            else:
                                solr_index_restore[core_name] = {
                                    "count": 1,
                                    "dates": [event.get("date")],
                                }
                    elif event_type == "network_backpressure":
                        rejected += event.get("total_dropped")
                    elif event_type == "drops":
                        local = event.get("localCount")
                        remote = event.get("remoteCount")
                        drop_type = event.get("messageType")
                        drop_types.add(drop_type)
                        drop_sums += local + remote
                        if remote > 0 and local == 0:
                            drops_remote_only += 1
                    _status_logger_counter(event, counter)
    recommendations = []
    _recs_on_stages(recommendations, gc_over_target, gc_target, counter)
    _recs_on_configs(recommendations, parsed["configs"])
    _recs_on_solr(recommendations, solr_index_backoff, solr_index_restore)
    _recs_on_drops(
        recommendations,
        drops_remote_only,
        sorted(list(drop_types), reverse=True),
        drop_sums,
    )
    _recs_on_zero_copy(recommendations, zero_copy_errors)
    _recs_on_rejects(recommendations, rejected)
    _recs_on_bp(recommendations, bp, gc_over_target)
    _recs_on_core_balance(recommendations, core_balance)
    _recs_on_tombstones(recommendations, tombstone_errors, tombstone_warns)
    return recommendations
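# The small state holders used by generate_recommendations above (StatusLoggerCounter,
# BackpressureStats, CoreBackpressureStats, PendingCoreMeasurement) are defined
# elsewhere. These are hypothetical minimal definitions that match how they are
# constructed and mutated above; any field not visible in the call sites (such as the
# StatusLoggerCounter fields) is a guess informed by the older variant's
# delayed/pending/blocked counters.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class StatusLoggerCounter:
    delayed: int = 0
    pending: int = 0
    blocked: int = 0


@dataclass
class CoreBackpressureStats:
    cores: List[int] = field(default_factory=list)
    total_bp_events: int = 0


@dataclass
class BackpressureStats:
    local_backpressure_active: Dict[str, int] = field(default_factory=dict)
    per_core_bp: Dict[str, CoreBackpressureStats] = field(default_factory=dict)


@dataclass
class PendingCoreMeasurement:
    core: int
    pending: int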