def add_known_bugs_to_master_plugin():
    """Fetch the current plugin known_bugs.yaml and add it to the master
    yaml.

    Note that this can only be called once per plugin and is typically
    performed as a final part after all others have executed.
    """
    known = _get_known_bugs()
    if not known:
        return

    if known.get(MASTER_YAML_KNOWN_BUGS_KEY):
        plugin_yaml.save_part(known, priority=99)
def add_issues_to_master_plugin():
    """Fetch the current plugin issues.yaml and add it to the master yaml.

    Note that this can only be called once per plugin and is typically
    performed as a final part after all others have executed.
    """
    found = _get_issues()
    if not found:
        return

    if found.get(MASTER_YAML_ISSUES_FOUND_KEY):
        plugin_yaml.save_part(found, priority=99)
    def run_report_searches(self):
        """Register all report search terms, run the search and process the
        results via the registered callbacks.

        Any resources found are published under the 'resources' key of
        RABBITMQ_INFO.
        """
        self.register_report_searches()
        self.results = self.searcher.search()
        self.run_report_callbacks()
        if not self.resources:
            return

        RABBITMQ_INFO["resources"] = self.resources

    def __call__(self):
        super().__call__()
        self.get_running_services_info()
        self.run_report_searches()


def get_rabbitmq_service_checker():
    # Do this way to make it easier to write unit tests.
    return RabbitMQServiceChecks(RMQ_SERVICES_EXPRS, hint_range=(0, 3))


def get_rabbitmq_package_checker():
    # Do this way to make it easier to write unit tests.
    return RabbitMQPackageChecks(RMQ_PACKAGES)


if __name__ == "__main__":
    get_rabbitmq_service_checker()()
    get_rabbitmq_package_checker()()
    if RABBITMQ_INFO:
        plugin_yaml.save_part(RABBITMQ_INFO, priority=0)
        # NOTE(review): this appears to be the tail of check_vrrp_transitions()
        # - 'threshold', 'max_transitions' and 'warn_count' are assumed to be
        # initialised in the part of the method not visible here; confirm.
        for router in L3HA_CHECKS["keepalived"]["transitions"]:
            transitions = L3HA_CHECKS["keepalived"]["transitions"][router]
            if transitions > threshold:
                max_transitions = max(transitions, max_transitions)
                warn_count += 1

        if warn_count:
            msg = ("{} router(s) have had more than {} vrrp transitions "
                   "(max={}) in the last 24 hours".format(
                       warn_count, threshold, max_transitions))
            issues_utils.add_issue(issue_types.NeutronL3HAWarning(msg))

    def __call__(self):
        self.get_neutron_ha_info()
        self.get_vrrp_transitions()

        # there will likely be a large number of transitions if we look across
        # all time so dont run this check.
        if not constants.USE_ALL_LOGS:
            self.check_vrrp_transitions()


def run_checks():
    return NeutronL3HAChecks()


if __name__ == "__main__":
    run_checks()()
    if L3HA_CHECKS:
        plugin_yaml.save_part({"neutron-l3ha": L3HA_CHECKS}, priority=8)
            # NOTE(review): tail of a loop over ip addr/link output - 'line',
            # 'i', 'iface', 'ip_addr_output' and 'cni_type' are defined in the
            # part of the method not visible here; confirm reconstruction.
            continue

        if iface:
            # Extract the CIDR address for the current interface.
            ret = re.compile(r".+\s+([0-9\.]+/[0-9]+).+\s+{}$".format(
                iface)).match(line)
            if iface in ip_addr_output[i - 3] and ret:
                NETWORK_INFO[cni_type][iface]["addr"] = ret[1]
                iface = None

        ret = re.compile(
            r"^\s+vxlan id .+\s+(\S+)\s+dev\s+([0-9a-z]+).+").match(line)
        if cni_type in NETWORK_INFO and ret:
            iface_info = "{}@{}".format(ret[1], ret[2])
            # NOTE(review): 'iface' may have been reset to None by the addr
            # branch above before this index - confirm this is intentional.
            NETWORK_INFO[cni_type][iface]["vxlan"] = iface_info

    def __call__(self):
        super().__call__()
        self.get_network_info()


def get_kubernetes_network_checks():
    # do this way to facilitate unit tests
    return KubernetesNetworkChecks()


if __name__ == "__main__":
    get_kubernetes_network_checks()()
    if NETWORK_INFO:
        plugin_yaml.save_part({"network": NETWORK_INFO}, priority=1)
        # NOTE(review): tail of check_mtu_dropped_packets() - 'ifaces' and
        # 'sorted_dict' are assumed to be initialised earlier (not visible).
        # Sort interfaces by drop count, highest first.
        for k, v in sorted(ifaces.items(), key=lambda e: e[1], reverse=True):
            sorted_dict[k] = v

        KERNEL_INFO["over-mtu-dropped-packets"] = sorted_dict

    def register_mtu_dropped_packets_search(self):
        """Register a search for over-mtu dropped packet messages in
        kern.log.
        """
        path = os.path.join(constants.DATA_ROOT, 'var/log/kern.log')
        if constants.USE_ALL_LOGS:
            path = path + "*"

        sdef = SearchDef(r".+\] (\S+): dropped over-mtu packet",
                         hint="dropped", tag="over-mtu")
        self.search_obj.add_search_term(sdef, path)

    def __call__(self):
        self.search_obj = FileSearcher()
        self.register_mtu_dropped_packets_search()
        self.results = self.search_obj.search()
        self.check_mtu_dropped_packets()


def get_kernal_network_checks():
    # NOTE(review): "kernal" looks like a typo for "kernel" but renaming
    # would break any external callers/tests so it is left as-is.
    return KernelNetworkChecks()


if __name__ == "__main__":
    get_kernal_network_checks()()
    if KERNEL_INFO:
        plugin_yaml.save_part(KERNEL_INFO, priority=2)
port_health_info = {} for port in self.neutron_phy_ports: stats = self._get_port_stats(name=port) if stats: port_health_info[port] = stats if port_health_info: health = {"phy-ports": port_health_info} if "port-health" in NETWORK_INFO: NETWORK_INFO["port-health"].updated(health) else: NETWORK_INFO["port-health"] = health def __call__(self): super().__call__() self.get_ns_info() self.get_config_network_info() self.get_neutron_phy_port_health() self.get_instances_port_health() def get_network_checker(): return OpenstackNetworkChecks() if __name__ == "__main__": get_network_checker()() if NETWORK_INFO: NETWORK_INFO = {"network": NETWORK_INFO} plugin_yaml.save_part(NETWORK_INFO, priority=4)
#!/usr/bin/python3
"""Openstack package checks plugin - saves dpkg package info to the master
yaml.
"""
from common import plugin_yaml
from common.checks import PackageChecksBase
from openstack_common import (
    OST_PROJECTS,
    OST_DEP_PKGS,
    OST_PKG_ALIASES,
)


class OpenstackPackageChecks(PackageChecksBase):
    # All behaviour is provided by the base class.
    pass


def get_checks():
    # Gate on the combined set of openstack project, alias and dependency
    # package expressions.
    return OpenstackPackageChecks(OST_PROJECTS + OST_PKG_ALIASES +
                                  OST_DEP_PKGS)


if __name__ == "__main__":
    info = get_checks()()
    if info:
        plugin_yaml.save_part({"dpkg": info}, priority=3)
            # NOTE(review): tail of get_events() - this branch classifies an
            # event as "failed"; 'event_name', 'event_id', 'instance_id' and
            # 'ext_event_info' come from the part of the function not visible
            # here, so nesting depth is a best-effort reconstruction.
            result = "failed"

        if event_name not in ext_event_info:
            ext_event_info[event_name] = {}

        if result not in ext_event_info[event_name]:
            ext_event_info[event_name][result] = []

        ext_event_info[event_name][result].append({"port": event_id,
                                                   "instance": instance_id})

    # Merge per-call results into the module-level store, copying each list.
    if ext_event_info:
        for event in ext_event_info:
            if event not in EXT_EVENT_INFO:
                EXT_EVENT_INFO[event] = {}
            for result in ext_event_info[event]:
                s = ext_event_info[event][result]
                EXT_EVENT_INFO[event][result] = list(s)


if __name__ == "__main__":
    # Supported events - https://docs.openstack.org/api-ref/compute/?expanded=run-events-detail#create-external-events-os-server-external-events  # noqa E501
    data_source = os.path.join(constants.DATA_ROOT,
                               "var/log/nova/nova-compute.log")
    get_events("network-changed", data_source)
    get_events("network-vif-plugged", data_source)
    if EXT_EVENT_INFO:
        EXT_EVENT_INFO = {"os-server-external-events": EXT_EVENT_INFO}
        plugin_yaml.save_part(EXT_EVENT_INFO, priority=2)
            # NOTE(review): tail of _process_agent_results() - 'service',
            # 'agent' and exception results 'e' come from the part of the
            # method not visible here.
            self._agent_log_issues[service] = {}

        self._agent_log_issues[service][agent] = e

    def process_results(self, results):
        """Process search results to see if we got any hits."""
        for service in SERVICE_RESOURCES:
            for agent in SERVICE_RESOURCES[service]["daemons"]:
                self._process_agent_results(results, service, agent)

        return self._agent_log_issues


def run_agent_exception_checks():
    # Run all registered agent checks against a single shared file search
    # so files are only searched once.
    s = FileSearcher()
    checks = [CommonAgentChecks(s)]
    for check in checks:
        check.register_search_terms()

    results = s.search()
    for check in checks:
        check_results = check.process_results(results)
        if check_results:
            AGENT_CHECKS_RESULTS["agent-exceptions"] = check_results


if __name__ == "__main__":
    run_agent_exception_checks()
    # NOTE(review): assumes AGENT_CHECKS_RESULTS is initialised elsewhere
    # with an "agent-exceptions" key, otherwise this raises KeyError -
    # confirm against the module's definition.
    if AGENT_CHECKS_RESULTS["agent-exceptions"]:
        plugin_yaml.save_part(AGENT_CHECKS_RESULTS, priority=7)
if path is None: path = os.path.join(constants.DATA_ROOT, "etc", proj, "{}.conf".format(proj)) if os.path.exists(path): for line in cli_helpers.safe_readlines(path): ret = re.compile(r"^debug\s*=\s*([A-Za-z]+).*").match(line) if ret: debug_enabled[proj] = cli_helpers.bool_str(ret[1]) if debug_enabled: OPENSTACK_INFO["debug-logging-enabled"] = debug_enabled def __call__(self): super().__call__() self.get_release_info() self.get_running_services_info() self.get_debug_log_info() def get_openstack_service_checker(): # Do this way to make it easier to write unit tests. OPENSTACK_SERVICES_EXPRS = OST_SERVICES_EXPRS + OST_SERVICES_DEPS return OpenstackServiceChecks(OPENSTACK_SERVICES_EXPRS, hint_range=(0, 3)) if __name__ == "__main__": get_openstack_service_checker()() if OPENSTACK_INFO: plugin_yaml.save_part(OPENSTACK_INFO, priority=0)
if not os.path.exists(cfg): continue for key in FEATURES[service][module]: for line in open(cfg).readlines(): ret = re.compile( r"^{}\s*=\s*(.+)\s*".format(key)).match(line) if ret: module_features[key] = cli_helpers.bool_str(ret[1]) break if key not in module_features: if key in DEFAULTS.get(service, {}).get(module, {}): default = DEFAULTS[service][module][key] module_features[key] = default # TODO: only include modules for which there is an actual agent # installed since otherwise their config is irrelevant. if module_features: if service not in SERVICE_FEATURES: SERVICE_FEATURES[service] = {} SERVICE_FEATURES[service][module] = module_features if __name__ == "__main__": get_service_features() if SERVICE_FEATURES: SERVICE_FEATURES = {"features": SERVICE_FEATURES} plugin_yaml.save_part(SERVICE_FEATURES, priority=5)
failovers[fo_type][ts_date][lb_id] = 1 for fo_type in failovers: # sort each failover by occurences for ts_date in failovers[fo_type]: d = utils.sorted_dict(failovers[fo_type][ts_date], key=lambda e: e[1], reverse=True) failovers[fo_type][ts_date] = d # now sort the dates d = utils.sorted_dict(failovers[fo_type]) if failovers: LB_CHECKS["lb-failovers"] = failovers def __call__(self): if self.core: self.get_lb_failovers() self.get_hm_amphora_missed_heartbeats() def run_checks(): # gate on whether octavia is installed return OctaviaLBChecks(["octavia-common"]) if __name__ == "__main__": run_checks()() if LB_CHECKS: plugin_yaml.save_part({"octavia": LB_CHECKS}, priority=9)
        # NOTE(review): tail of a bcache stats check - flag LP#1900438 when
        # any cacheset's cache_available_percent is at or below the limit.
        for path in self.get_sysfs_cachesets():
            path = os.path.join(path, "cache_available_percent")
            with open(path) as fd:
                value = fd.read().strip()

            limit = CACHE_AVAILABLE_PERCENT_LIMIT_LP1900438
            if int(value) <= limit:
                msg = (
                    "bcache cache_available_percent ({}) is <= {} - "
                    "this node could be suffering from bug 1900438".format(
                        value, limit))
                add_issue(BcacheWarning(msg))
                add_known_bug(1900438, "see BcacheWarning for info")

    def __call__(self):
        self.check_stats()


def get_bcache_dev_checks():
    return BcacheDeviceChecks()


def get_bcache_stats_checks():
    return BcacheStatsChecks()


if __name__ == "__main__":
    get_bcache_dev_checks()()
    get_bcache_stats_checks()()
    if BCACHE_INFO:
        plugin_yaml.save_part(BCACHE_INFO, priority=1)
            # NOTE(review): tail of get_machine_info() - classify each
            # configured machine as running or stopped based on ps output
            # ('machine', 'version', 'ps_machines' etc. defined earlier, not
            # visible here).
            if machine in ps_machines:
                machines_running.add("{} (version={})".format(
                    machine, version))
            else:
                machines_stopped.add(machine)

        if machines_running:
            JUJU_MACHINE_INFO["machines"]["running"] = list(machines_running)

        if machines_stopped:
            JUJU_MACHINE_INFO["machines"]["stopped"] = list(machines_stopped)

        # NOTE(review): if get_local_running_units is a method (not a
        # property) this bare reference is always truthy and probably needs
        # to be called - confirm against the base class definition.
        if not machines_running and (machines_stopped or
                                     self.get_local_running_units):
            msg = ("there is no Juju machined running on this host but it "
                   "seems there should be")
            add_issue(JujuWarning(msg))

    def __call__(self):
        self.get_machine_info()


def get_machine_checks():
    return JujuMachineChecks()


if __name__ == "__main__":
    get_machine_checks()()
    if JUJU_MACHINE_INFO["machines"]:
        plugin_yaml.save_part(JUJU_MACHINE_INFO, priority=0)
def __call__(self): super().__call__() data_source = os.path.join(constants.DATA_ROOT, CEPH_LOGS, 'ceph*.log') if constants.USE_ALL_LOGS: data_source = "{}*".format(data_source) s = FileSearcher() for search in SEARCHES: s.add_search_term(search, data_source) self.results = s.search() self.process_osd_failure_reports() self.process_mon_elections() self.process_slow_requests() self.process_crc_bluestore() self.process_crc_rocksdb() self.process_long_heartbeat() self.process_heartbeat_no_reply() def get_ceph_daemon_log_checker(): # Do this way to make it easier to write unit tests. return CephDaemonLogChecks(CEPH_SERVICES_EXPRS) if __name__ == "__main__": get_ceph_daemon_log_checker()() if DAEMON_INFO: DAEMON_INFO = {"daemon-events": DAEMON_INFO} plugin_yaml.save_part(DAEMON_INFO, priority=2)
            # NOTE(review): tail of get_container_info() - 'container' and
            # 'container_info' are built in the part of the method not
            # visible here.
            container_info.append(container)

        if container_info:
            KUBERNETES_INFO["containers"] = container_info

    def __call__(self):
        self.get_pod_info()
        self.get_container_info()


def get_kubernetes_package_checker():
    # Do this way to make it easier to write unit tests.
    return KubernetesPackageChecks(None)


def get_kubernetes_service_checker():
    # Do this way to make it easier to write unit tests.
    return KubernetesServiceChecks()


def get_kubernetes_resource_checker():
    return KubernetesResourceChecks()


if __name__ == "__main__":
    get_kubernetes_service_checker()()
    get_kubernetes_package_checker()()
    get_kubernetes_resource_checker()()
    if KUBERNETES_INFO:
        plugin_yaml.save_part(KUBERNETES_INFO, priority=0)
def check_log_errors(self): path = os.path.join(constants.DATA_ROOT, 'var/log/rabbitmq/rabbit@*.log') if constants.USE_ALL_LOGS: path = f"{path}*" self.searcher.add_search_term(SearchDef(r".+ \S+_partitioned_network", tag="partitions"), path=path) results = self.searcher.search() if results.find_by_tag("partitions"): msg = ("cluster either has or has had partitions - check " "cluster_status") issues_utils.add_issue(issue_types.RabbitMQWarning(msg)) def __call__(self): super().__call__() self.check_log_errors() def get_rabbitmq_cluster_checker(): # Do this way to make it easier to write unit tests. return RabbitMQClusterChecks() if __name__ == "__main__": get_rabbitmq_cluster_checker()() if CLUSTER_INFO: plugin_yaml.save_part(CLUSTER_INFO, priority=1)
CEPH_INFO["mixed_crush_buckets"] = bad_buckets def __call__(self): super().__call__() self.get_osd_info() self.get_ceph_pg_imbalance() self.get_ceph_versions_mismatch() self.get_crushmap_mixed_buckets() def get_service_checker(): # Do this way to make it easier to write unit tests. return CephServiceChecks(CEPH_SERVICES_EXPRS) def get_pkg_checker(): return CephPackageChecks(CEPH_PKGS_CORE) def get_osd_checker(): # Do this way to make it easier to write unit tests. return CephOSDChecks(CEPH_SERVICES_EXPRS) if __name__ == "__main__": get_service_checker()() get_pkg_checker()() get_osd_checker()() if CEPH_INFO: plugin_yaml.save_part({"ceph": CEPH_INFO}, priority=0)
            # NOTE(review): tail of a port-stats check method - 'all_dropped',
            # 'all_errors' and 'stats' are built in the part of the method not
            # visible here.
            msg = (
                "found {} ovs interfaces with 100% dropped packets".format(
                    len(all_dropped)))
            issues_utils.add_issue(issue_types.OpenvSwitchWarning(msg))

        if all_errors:
            msg = (
                "found {} ovs interfaces with 100% packet errors".format(
                    len(all_errors)))
            issues_utils.add_issue(issue_types.OpenvSwitchWarning(msg))

        # Publish stats sorted by interface name for stable output.
        stats_sorted = {}
        for k in sorted(stats):
            stats_sorted[k] = stats[k]

        OVS_INFO["port-stats"] = stats_sorted


def get_checks():
    return [
        OpenvSwitchvSwitchdChecks(),
        OpenvSwitchDaemonChecksCommon(),
        OpenvSwitchDPChecks()
    ]


if __name__ == "__main__":
    [c() for c in get_checks()]
    if OVS_INFO:
        plugin_yaml.save_part(OVS_INFO, priority=1)
            # NOTE(review): tail of get_system_info() - 'line' comes from a
            # loop over command output not visible here; extract the load
            # average from e.g. uptime output.
            ret = re.compile(r".+load average:\s+(.+)").match(line)
            if ret:
                SYSTEM_INFO["load"] = ret[1]
                break

        df_output = cli_helpers.get_df()
        if df_output:
            for line in df_output:
                # match the root filesystem ("/") line of df output
                ret = re.compile(r"(.+\/$)").match(line)
                if ret:
                    SYSTEM_INFO["rootfs"] = ret[1]
                    break

        if self.unattended_upgrades_enabled:
            SYSTEM_INFO['unattended-upgrades'] = "ENABLED"
        else:
            SYSTEM_INFO['unattended-upgrades'] = "disabled"

    def __call__(self):
        self.get_system_info()


def get_system_checks():
    # do this way to facilitate unit tests
    return SystemChecks()


if __name__ == "__main__":
    get_system_checks()()
    if SYSTEM_INFO:
        plugin_yaml.save_part(SYSTEM_INFO, priority=0)
                # NOTE(review): tail of run_cpu_pinning_checks() - 'node' and
                # 'extra' come from code not visible here; nesting depth is a
                # best-effort reconstruction.
                extra += "node{}: {}".format(
                    node, list_to_str(self.numa.cores(node)))

            extra += "\n{}: {}".format(self.cpu_dedicated_set_name,
                                       list_to_str(self.cpu_dedicated_set))
            self.results.add_info(
                "{} has cores from > 1 numa node".format(
                    self.cpu_dedicated_set_name), extra)

        if self.isolcpus or self.cpuaffinity:
            total_isolated = self.isolcpus.union(self.cpuaffinity)
            # NOTE(review): intersection() with no arguments just returns a
            # copy of the set, so 'nonisolated' always equals
            # 'total_isolated' - it looks like this was meant to intersect
            # with another set (e.g. all online cpus). Confirm intent.
            nonisolated = set(total_isolated).intersection()
            if len(nonisolated) <= 4:
                self.results.add_warn("Host has only {} cores unpinned. This "
                                      "might cause unintended performance "
                                      "problems".format(len(nonisolated)))

    def get_results(self):
        # Emit/render accumulated results.
        self.results.get()


if __name__ == "__main__":
    checker = CPUPinningChecker()
    checker.run_cpu_pinning_checks()
    checker.get_results()
    if CPU_PINNING_INFO:
        CPU_PINNING_INFO = {"cpu-pinning-checks": CPU_PINNING_INFO}
        plugin_yaml.save_part(CPU_PINNING_INFO, priority=6)
#!/usr/bin/python3
"""Collect openstack package version info and save it under the "dpkg" key
of the master plugin yaml.
"""
from common import plugin_yaml
from common.checks import PackageChecksBase
from openstack_common import (
    OST_PROJECTS,
    OST_DEP_PKGS,
    OST_PKG_ALIASES,
)

OST_PKG_INFO = {}


class OpenstackPackageChecks(PackageChecksBase):

    def __call__(self):
        # Only publish the "dpkg" key if any packages matched.
        pkg_info = self.packages
        if pkg_info:
            OST_PKG_INFO["dpkg"] = pkg_info


def get_checks():
    # Gate on the combined set of openstack project, alias and dependency
    # package expressions.
    package_exprs = OST_PROJECTS + OST_PKG_ALIASES + OST_DEP_PKGS
    return OpenstackPackageChecks(package_exprs)


if __name__ == "__main__":
    get_checks()()
    if OST_PKG_INFO:
        plugin_yaml.save_part(OST_PKG_INFO, priority=3)
class JujuCharmChecks(JujuChecksBase):
    """Collect charm version information from the juju charm manifests."""

    def get_charm_versions(self):
        """Scan charm manifests under JUJU_LIB_PATH and record a sorted list
        of "<charm>-<revision>" strings in CHARM_VERSIONS.
        """
        if not os.path.exists(JUJU_LIB_PATH):
            return

        versions = []
        # Compiled once - loop-invariant (the original recompiled it for
        # every manifest entry).
        expr = re.compile(r".+_(\S+)-([0-9]+)$")
        for entry in glob.glob(os.path.join(JUJU_LIB_PATH,
                                            CHARM_MANIFEST_GLOB)):
            for manifest in os.listdir(entry):
                base = os.path.basename(manifest)
                ret = expr.match(base)
                if ret:
                    versions.append("{}-{}".format(ret[1], ret[2]))

        if versions:
            CHARM_VERSIONS["charm-versions"] = sorted(versions)

    def __call__(self):
        self.get_charm_versions()


def get_charm_checks():
    return JujuCharmChecks()


if __name__ == "__main__":
    get_charm_checks()()
    # BUG FIX: "charm-versions" is only set when at least one version was
    # found above, so direct indexing could raise KeyError; use get() which
    # is safe regardless of how CHARM_VERSIONS is initialised.
    if CHARM_VERSIONS.get("charm-versions"):
        plugin_yaml.save_part(CHARM_VERSIONS, priority=1)