def check_mtu_dropped_packets(self):
    ifaces = {}
    for r in self.results.find_by_tag("over-mtu"):
        if r.get(1) in ifaces:
            ifaces[r.get(1)] += 1
        else:
            ifaces[r.get(1)] = 1

    if ifaces:
        helper = HostNetworkingHelper()
        # only report the issue if the interfaces actually exist
        raise_issue = False
        host_interfaces = helper.get_host_interfaces(
            include_namespaces=True)
        ifaces_extant = []
        for iface in ifaces:
            if iface in host_interfaces:
                raise_issue = True
                ifaces_extant.append(iface)

        if raise_issue:
            msg = ("kernel has reported over-mtu dropped packets for ({}) "
                   "interfaces".format(len(ifaces_extant)))
            issue = issue_types.NetworkWarning(msg)
            issues_utils.add_issue(issue)

        # sort by number of occurrences
        sorted_dict = {}
        for k, v in sorted(ifaces.items(), key=lambda e: e[1],
                           reverse=True):
            sorted_dict[k] = v

        KERNEL_INFO["over-mtu-dropped-packets"] = sorted_dict
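
# A minimal standalone sketch (not part of the plugin) of the
# count-and-sort pattern used above, with made-up interface names. Since
# Python 3.7 dicts preserve insertion order, rebuilding the dict from the
# sorted items yields a stable most-frequent-first ordering.
def sort_by_occurrence(counts):
    return {k: v for k, v in sorted(counts.items(), key=lambda e: e[1],
                                    reverse=True)}

# sort_by_occurrence({"eth0": 3, "br-ex": 1, "eth1": 2})
# -> {'eth0': 3, 'eth1': 2, 'br-ex': 1}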
def get_kernel_info():
    uname = helpers.get_uname()
    if uname:
        ret = re.compile(r"^Linux\s+\S+\s+(\S+)\s+.+").match(uname)
        if ret:
            KERNEL_INFO["version"] = ret[1]

    get_cmdline_info()

    check_nodes_memory("Normal")
    if KERNEL_INFO.get("memory-checks") is None:
        # only check other types if no issue detected on Normal
        check_nodes_memory("DMA32")

    # We only report on compaction errors if there is a shortage of
    # high-order zones.
    if KERNEL_INFO.get("memory-checks"):
        fail_count = get_vmstat_value("compact_fail")
        success_count = get_vmstat_value("compact_success")
        # we use an arbitrary threshold of 10k to suggest that a lot of
        # compaction has occurred but noting that this is a rolling counter
        # and is not necessarily representative of current state.
        if success_count > 10000:
            pcent = int(fail_count / (success_count / 100))
            if pcent > 10:
                msg = ("failures are at {}% of successes (see {})".format(
                       pcent, VMSTAT))
                KERNEL_INFO["memory-checks"]["compaction"] = msg
                issue = issue_types.MemoryWarning("compaction " + msg)
                issues_utils.add_issue(issue)

        get_slab_major_consumers()
    else:
        KERNEL_INFO["memory-checks"] = "no issues found"
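
# Worked example (illustrative only) of the compaction ratio above:
# int(fail_count / (success_count / 100)) is the fail count expressed as
# a percentage of successes, e.g. 1500 failures against 12000 successes
# gives int(1500 / 120) == 12, which exceeds the 10% threshold and would
# raise a MemoryWarning.
def compaction_fail_pcent(fail_count, success_count):
    if success_count <= 10000:  # same arbitrary activity threshold
        return None

    return int(fail_count / (success_count / 100))

# compaction_fail_pcent(1500, 12000) -> 12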
def get_crushmap_mixed_buckets(self):
    """
    Report buckets that contain a mix of item types, since they prevent
    the crush map from computing the expected up set.
    """
    osd_crush_dump = cli_helpers.get_osd_crush_dump_json_decoded()
    if not osd_crush_dump:
        return

    bad_buckets = []
    buckets = self.build_buckets_from_crushdump(osd_crush_dump)
    # check all buckets
    for bid in buckets:
        items = buckets[bid]["items"]
        type_ids = []
        for item in items:
            if item >= 0:
                type_ids.append(0)
            else:
                type_ids.append(buckets[item]["type_id"])

        # verify whether the type_id list contains mixed type ids
        if type_ids.count(type_ids[0]) != len(type_ids):
            bad_buckets.append(buckets[bid]["name"])

    if bad_buckets:
        issue = CephCrushWarning("mixed crush buckets identified (see "
                                 "--storage for more info)")
        issues_utils.add_issue(issue)
        CEPH_INFO["mixed_crush_buckets"] = bad_buckets
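
# Minimal sketch (with fabricated bucket data) of the uniformity check
# used above: a bucket is "mixed" if its items resolve to more than one
# type_id. Non-negative item ids are devices (osds), treated as type_id 0;
# negative ids reference other buckets whose type_id is looked up.
def is_mixed(type_ids):
    # all entries equal the first one iff the bucket is homogeneous
    return type_ids.count(type_ids[0]) != len(type_ids)

# is_mixed([0, 0, 0]) -> False (all osds)
# is_mixed([0, 1, 0]) -> True (osds mixed with e.g. host buckets)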
def get_memory_info(self):
    self.check_nodes_memory("Normal")
    if KERNEL_INFO.get("memory-checks") is None:
        # only check other types if no issue detected on Normal
        self.check_nodes_memory("DMA32")

    # We only report on compaction errors if there is a shortage of
    # high-order zones.
    if KERNEL_INFO.get("memory-checks"):
        fail_count = self.get_vmstat_value("compact_fail")
        success_count = self.get_vmstat_value("compact_success")
        # we use an arbitrary threshold of 10k to suggest that a lot of
        # compaction has occurred but noting that this is a rolling counter
        # and is not necessarily representative of current state.
        if success_count > 10000:
            pcent = int(fail_count / (success_count / 100))
            if pcent > 10:
                msg = ("failures are at {}% of successes (see {})".format(
                       pcent, VMSTAT))
                issue = issue_types.MemoryWarning("compaction " + msg)
                issues_utils.add_issue(issue)

        self.get_slab_major_consumers()
    else:
        KERNEL_INFO["memory-checks"] = "no issues found"
def test_add_issue(self):
    with mock.patch.object(issues_utils, 'PLUGIN_TMP_DIR', self.tmpdir):
        issues_utils.add_issue(issue_types.MemoryWarning("test"))
        ret = issues_utils._get_issues()
        self.assertEqual(
            ret,
            {issues_utils.MASTER_YAML_ISSUES_FOUND_KEY:
             [{'type': 'MemoryWarning',
               'desc': 'test',
               'origin': 'testplugin.01part'}]})
def get_partition_handling(self):
    """Get the partition handling settings."""
    results = self.results.find_by_tag("cluster_partition_handling")
    if not results:
        return

    setting = results[0].get(1)
    if setting == "ignore":
        msg = ("cluster partition handling is currently set to ignore. "
               "This is potentially dangerous and a setting of "
               "pause_minority is recommended.")
        issues_utils.add_issue(issue_types.RabbitMQWarning(msg))

    self.resources["cluster-partition-handling"] = setting
def check_log_errors(self):
    path = os.path.join(constants.DATA_ROOT,
                        'var/log/rabbitmq/rabbit@*.log')
    if constants.USE_ALL_LOGS:
        path = f"{path}*"

    self.searcher.add_search_term(SearchDef(r".+ \S+_partitioned_network",
                                            tag="partitions"),
                                  path=path)
    results = self.searcher.search()
    if results.find_by_tag("partitions"):
        msg = ("cluster either has or has had partitions - check "
               "cluster_status")
        issues_utils.add_issue(issue_types.RabbitMQWarning(msg))
def get_machine_info(self):
    ps_machines = set()
    log_machines = set()
    machines_running = set()
    machines_stopped = set()

    if not os.path.exists(JUJU_LOG_PATH):
        return

    for line in cli_helpers.get_ps():
        if "machine-" in line:
            ret = re.compile(r".+machine-([0-9]+).*").match(line)
            if ret:
                ps_machines.add(ret[1])

    for f in os.listdir(JUJU_LOG_PATH):
        ret = re.compile(r"machine-([0-9]+)\.log.*").match(f)
        if ret:
            log_machines.add(ret[1])

    combined_machines = ps_machines.union(log_machines)
    for machine in combined_machines:
        conf_path = ("var/lib/juju/agents/machine-{}/agent.conf"
                     .format(machine))
        agent_conf = os.path.join(constants.DATA_ROOT, conf_path)
        version = "unknown"
        if os.path.exists(agent_conf):
            expr = re.compile(r"upgradedToVersion:\s+(.+)")
            with open(agent_conf) as fd:
                for line in fd:
                    ret = expr.match(line)
                    if ret:
                        version = ret[1]

        if machine in ps_machines:
            machines_running.add("{} (version={})".format(machine,
                                                          version))
        else:
            machines_stopped.add(machine)

    if machines_running:
        JUJU_MACHINE_INFO["machines"]["running"] = list(machines_running)

    if machines_stopped:
        JUJU_MACHINE_INFO["machines"]["stopped"] = list(machines_stopped)

    if not machines_running and (machines_stopped or
                                 self.get_local_running_units()):
        msg = ("there is no Juju machine agent running on this host but "
               "it seems there should be")
        add_issue(JujuWarning(msg))
def check_stats(self):
    if not self.get_sysfs_cachesets():
        return

    for path in self.get_sysfs_cachesets():
        path = os.path.join(path, "cache_available_percent")
        with open(path) as fd:
            value = fd.read().strip()

        limit = CACHE_AVAILABLE_PERCENT_LIMIT_LP1900438
        if int(value) <= limit:
            msg = ("bcache cache_available_percent ({}) is <= {} - "
                   "this node could be suffering from bug 1900438".format(
                       value, limit))
            add_issue(BcacheWarning(msg))
            add_known_bug(1900438, "see BcacheWarning for info")
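
# Standalone sketch of the sysfs read above, assuming a cacheset path of
# the form /sys/fs/bcache/<uuid>. The limit constant relates to LP bug
# 1900438, where cache_available_percent drops due to a leak.
import os


def cacheset_available_percent(cset_path):
    path = os.path.join(cset_path, "cache_available_percent")
    with open(path) as fd:
        return int(fd.read().strip())

# cacheset_available_percent("/sys/fs/bcache/<uuid>") -> e.g. 67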
def get_queues(self):
    """Get distribution of queues across cluster."""
    sd = self._sequences["queues"]["searchdef"]
    vhost_queues = {}
    raise_issues = []
    for results in self.results.find_sequence_sections(sd).values():
        vhost = None
        queues = {}
        for result in results:
            if result.tag == sd.start_tag:
                vhost = result.get(1)
            elif result.tag == sd.body_tag:
                info = {"pid_name": result.get(1),
                        "queue": result.get(2)}
                if info["pid_name"] not in queues:
                    queues[info["pid_name"]] = 1
                else:
                    queues[info["pid_name"]] += 1

        vhost_queues[vhost] = {}
        if not queues:
            continue

        total = sum(queues.values())
        for pid in queues:
            if total > 0:
                fraction = queues[pid] / total
                fraction_string = "{:.2f}%".format(fraction * 100)
                if fraction > 2 / 3:
                    raise_issues.append(
                        "{} holds more than 2/3 of queues".format(pid))
            else:
                fraction_string = "N/A"

            vhost_queues[vhost][pid] = "{:d} ({})".format(queues[pid],
                                                          fraction_string)

    for issue in raise_issues:
        issues_utils.add_issue(issue_types.RabbitMQWarning(issue))

    if vhost_queues:
        # list all vhosts but only show their queues if non-empty
        self.resources["vhosts"] = sorted(list(vhost_queues.keys()))
        self.resources["vhost-queue-distributions"] = \
            {k: v for k, v in vhost_queues.items() if v}
def check_vrrp_transitions(self):
    if "transitions" not in L3HA_CHECKS.get("keepalived", {}):
        return

    max_transitions = 0
    warn_count = 0
    threshold = VRRP_TRANSITION_WARN_THRESHOLD
    for router in L3HA_CHECKS["keepalived"]["transitions"]:
        transitions = L3HA_CHECKS["keepalived"]["transitions"][router]
        if transitions > threshold:
            max_transitions = max(transitions, max_transitions)
            warn_count += 1

    if warn_count:
        msg = ("{} router(s) have had more than {} vrrp transitions "
               "(max={}) in the last 24 hours".format(warn_count,
                                                      threshold,
                                                      max_transitions))
        issues_utils.add_issue(issue_types.NeutronL3HAWarning(msg))
def check_ovs_cleanup(self):
    """
    Allow one run on node boot/reboot but not after.
    """
    raise_issue = False
    start_count = 0
    expr = re.compile(r"Started OpenStack Neutron OVS cleanup.")
    for line in cli_helpers.get_journalctl(unit="neutron-ovs-cleanup"):
        if re.compile("-- Reboot --").match(line):
            # reset after reboot
            start_count = 0
        elif expr.search(line):
            if start_count:
                raise_issue = True
                break

            start_count += 1

    if raise_issue:
        msg = ("neutron-ovs-cleanup has been manually run on this "
               "host. This is not recommended and can have unintended "
               "side-effects.")
        issues_utils.add_issue(issue_types.OpenstackWarning(msg))
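
# Sketch of the per-boot counting logic above against fabricated journal
# lines: one "Started ..." entry per boot is tolerated, and a second one
# without an intervening "-- Reboot --" marker implies the unit was run
# by hand.
import re


def cleanup_run_manually(journal_lines):
    start_count = 0
    for line in journal_lines:
        if re.match("-- Reboot --", line):
            start_count = 0
        elif re.search(r"Started OpenStack Neutron OVS cleanup.", line):
            if start_count:
                return True

            start_count += 1

    return False

# cleanup_run_manually(["Started OpenStack Neutron OVS cleanup.",
#                       "-- Reboot --",
#                       "Started OpenStack Neutron OVS cleanup."]) -> False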
def process_results(self):
    """
    Report on interfaces that are showing packet drops or errors.

    Sometimes it is normal for an interface to have packet drops and if
    we think that is the case we ignore it, otherwise we raise an issue
    to alert.

    Interfaces we currently ignore: OVS bridges.

    In Openstack, for example when using Neutron HA routers, vrrp peers
    that are in BACKUP state may still receive packets on their external
    interface but these will be dropped since they have nowhere to go. In
    this case it is possible to have 100% packet drops on the interface
    if that VR has never been a vrrp MASTER. For this scenario we filter
    interfaces whose name matches e.g. qg-3ca935f4-07.
    """
    stats = {}
    all_dropped = []  # interfaces where all packets are dropped
    all_errors = []  # interfaces where all packets are errors
    for sd in self.sequence_defs:
        for results in self.results.find_sequence_sections(sd).values():
            port = None
            _stats = {}
            for result in results:
                if result.tag == sd.start_tag:
                    port = result.get(1)
                elif result.tag == sd.body_tag:
                    key = result.get(1)
                    packets = int(result.get(2))
                    errors = int(result.get(3))
                    dropped = int(result.get(4))

                    log_stats = False
                    if packets:
                        dropped_pcent = int((100 / packets) * dropped)
                        errors_pcent = int((100 / packets) * errors)
                        if dropped_pcent > 1 or errors_pcent > 1:
                            log_stats = True
                    elif errors or dropped:
                        log_stats = True

                    if log_stats:
                        _stats[key] = {"packets": packets}
                        if errors:
                            _stats[key]["errors"] = errors
                        if dropped:
                            _stats[key]["dropped"] = dropped

            if port and _stats:
                # Ports to ignore - see docstring for info
                if (port in self.ovs_bridges or
                        re.compile(r"^(q|s)g-\S{11}$").match(port)):
                    continue

                for key in _stats:
                    s = _stats[key]
                    if s.get('dropped') and not s['packets']:
                        all_dropped.append(port)

                    if s.get('errors') and not s['packets']:
                        all_errors.append(port)

                stats[port] = _stats

    if stats:
        if all_dropped:
            msg = ("found {} ovs interfaces with 100% dropped packets"
                   .format(len(all_dropped)))
            issues_utils.add_issue(issue_types.OpenvSwitchWarning(msg))

        if all_errors:
            msg = ("found {} ovs interfaces with 100% packet errors"
                   .format(len(all_errors)))
            issues_utils.add_issue(issue_types.OpenvSwitchWarning(msg))

        stats_sorted = {}
        for k in sorted(stats):
            stats_sorted[k] = stats[k]

        OVS_INFO["port-stats"] = stats_sorted
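
# Worked example (illustrative) of the thresholding above: percentages
# are computed as int((100 / packets) * count), so 150 drops on 10000
# packets gives int(1.5) == 1 which is *not* reported, while 250 drops
# gives 2 and is. Zero packets with non-zero drops/errors is always
# logged and feeds the "100% dropped" check.
def worth_logging(packets, errors, dropped):
    if packets:
        return (int((100 / packets) * dropped) > 1 or
                int((100 / packets) * errors) > 1)

    return bool(errors or dropped)

# worth_logging(10000, 0, 150) -> False
# worth_logging(10000, 0, 250) -> True
# worth_logging(0, 0, 3) -> True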
def get_queue_info(self):
    """Get distribution of queues across cluster."""
    sd = self._sequences["queues"]["searchdef"]
    vhost_queues = {}
    issues_raised = {}
    skewed_queue_nodes = {}
    for results in self.results.find_sequence_sections(sd).values():
        vhost = None
        queues = {}
        for result in results:
            if result.tag == sd.start_tag:
                # check both report formats
                vhost = result.get(1)
            elif result.tag == sd.body_tag:
                node_name = result.get(1) or result.get(4)
                # if we matched the section header, skip
                if node_name == "pid":
                    continue

                queue = result.get(2) or result.get(3)
                # if we matched the section header, skip
                if queue == "name":
                    continue

                if node_name not in queues:
                    queues[node_name] = 0

                queues[node_name] += 1

        vhost_queues[vhost] = {}
        if not queues:
            continue

        total = sum(queues.values())
        for node_name in queues:
            if total > 0:
                fraction = queues[node_name] / total
                fraction_string = "{:.2f}%".format(fraction * 100)
                if fraction > 2 / 3:
                    if node_name not in skewed_queue_nodes:
                        skewed_queue_nodes[node_name] = 0

                    skewed_queue_nodes[node_name] += 1
            else:
                fraction_string = "N/A"

            vhost_queues[vhost][node_name] = "{:d} ({})".format(
                queues[node_name], fraction_string)

    # Report the node with the greatest skew of queues/vhost
    if skewed_queue_nodes:
        max_node = None
        for node_name in skewed_queue_nodes:
            if max_node is None:
                max_node = node_name
            elif (skewed_queue_nodes[node_name] >=
                    skewed_queue_nodes[max_node]):
                max_node = node_name

        if (skewed_queue_nodes[max_node] >
                issues_raised.get(max_node, 0)):
            issues_raised[max_node] = skewed_queue_nodes[max_node]

    # this should only actually ever report one node
    for node_name in issues_raised:
        msg = ("{} holds more than 2/3 of queues for {}/{} vhost(s)"
               .format(node_name, issues_raised[node_name],
                       len(vhost_queues)))
        issues_utils.add_issue(issue_types.RabbitMQWarning(msg))

    if vhost_queues:
        # list all vhosts but only show their queues if non-empty
        self.resources["vhosts"] = sorted(list(vhost_queues.keys()))
        self.resources["vhost-queue-distributions"] = \
            {k: v for k, v in vhost_queues.items() if v}
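
# Illustrative example of the skew threshold used above: with 90 queues
# spread over three (made-up) nodes as below, rabbit@host0 holds 70/90
# (~0.78) > 2/3 of the queues on that vhost, so it would be counted as
# skewed.
def skewed_nodes(queues_per_node):
    total = sum(queues_per_node.values())
    return [node for node, count in queues_per_node.items()
            if total and count / total > 2 / 3]

# skewed_nodes({"rabbit@host0": 70, "rabbit@host1": 12,
#               "rabbit@host2": 8}) -> ['rabbit@host0']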