Example #1
    def check_mtu_dropped_packets(self):
        ifaces = {}
        for r in self.results.find_by_tag("over-mtu"):
            if r.get(1) in ifaces:
                ifaces[r.get(1)] += 1
            else:
                ifaces[r.get(1)] = 1

        if ifaces:
            helper = HostNetworkingHelper()
            # only report the issue if the interfaces actually exist
            raise_issue = False
            host_interfaces = helper.get_host_interfaces(
                                                       include_namespaces=True)

            ifaces_extant = []
            for iface in ifaces:
                if iface in host_interfaces:
                    raise_issue = True
                    ifaces_extant.append(iface)

            if raise_issue:
                msg = ("kernel has reported over-mtu dropped packets for ({}) "
                       "interfaces".format(len(ifaces_extant)))
                issue = issue_types.NetworkWarning(msg)
                issues_utils.add_issue(issue)

            # sort by number of occurrences
            sorted_dict = {}
            for k, v in sorted(ifaces.items(), key=lambda e: e[1],
                               reverse=True):
                sorted_dict[k] = v

            KERNEL_INFO["over-mtu-dropped-packets"] = sorted_dict
Example #2
def get_kernel_info():
    uname = helpers.get_uname()
    if uname:
        ret = re.compile(r"^Linux\s+\S+\s+(\S+)\s+.+").match(uname)
        if ret:
            KERNEL_INFO["version"] = ret[1]

    get_cmdline_info()

    check_nodes_memory("Normal")
    if KERNEL_INFO.get("memory-checks") is None:
        # only check other types if no issue was detected on Normal
        check_nodes_memory("DMA32")

    # We only report on compaction errors if there is a shortage of high-order
    # zones.
    if KERNEL_INFO.get("memory-checks"):
        fail_count = get_vmstat_value("compact_fail")
        success_count = get_vmstat_value("compact_success")
        # we use an arbitrary threshold of 10k to suggest that a lot of
        # compaction has occurred but noting that this is a rolling counter
        # and is not necessarily representative of current state.
        if success_count > 10000:
            pcent = int(fail_count / (success_count / 100))
            if pcent > 10:
                msg = ("failures are at {}% of successes (see {})".format(
                    pcent, VMSTAT))
                KERNEL_INFO["memory-checks"]["compaction"] = msg
                issue = issue_types.MemoryWarning("compaction " + msg)
                issues_utils.add_issue(issue)

        get_slab_major_consumers()
    else:
        KERNEL_INFO["memory-checks"] = "no issues found"
Example #3
    def get_crushmap_mixed_buckets(self):
        """
        Report buckets that have a mixed type of items, since they prevent
        the crush map from computing the expected up set.
        """
        osd_crush_dump = cli_helpers.get_osd_crush_dump_json_decoded()
        if not osd_crush_dump:
            return

        bad_buckets = []
        buckets = self.build_buckets_from_crushdump(osd_crush_dump)
        # check all buckets
        for bid in buckets:
            items = buckets[bid]["items"]
            type_ids = []
            for item in items:
                if item >= 0:
                    type_ids.append(0)
                else:
                    type_ids.append(buckets[item]["type_id"])

            # check whether the type_id list contains mixed type ids
            if type_ids.count(type_ids[0]) != len(type_ids):
                bad_buckets.append(buckets[bid]["name"])

        if bad_buckets:
            issue = CephCrushWarning("mixed crush buckets identified (see "
                                     "--storage for more info)")
            issues_utils.add_issue(issue)
            CEPH_INFO["mixed_crush_buckets"] = bad_buckets
Example #4
    def get_memory_info(self):
        self.check_nodes_memory("Normal")
        if KERNEL_INFO.get("memory-checks") is None:
            # only check other types if no issue was detected on Normal
            self.check_nodes_memory("DMA32")

        # We only report on compaction errors if there is a shortage of
        # high-order zones.
        if KERNEL_INFO.get("memory-checks"):
            fail_count = self.get_vmstat_value("compact_fail")
            success_count = self.get_vmstat_value("compact_success")
            # we use an arbitrary threshold of 10k to suggest that a lot of
            # compaction has occurred but noting that this is a rolling counter
            # and is not necessarily representative of current state.
            if success_count > 10000:
                pcent = int(fail_count / (success_count / 100))
                if pcent > 10:
                    msg = ("failures are at {}% of successes (see {})".format(
                        pcent, VMSTAT))
                    issue = issue_types.MemoryWarning("compaction " + msg)
                    issues_utils.add_issue(issue)

            self.get_slab_major_consumers()
        else:
            KERNEL_INFO["memory-checks"] = "no issues found"
Example #5
    def test_add_issue(self):
        with mock.patch.object(issues_utils, 'PLUGIN_TMP_DIR',
                               self.tmpdir):
            issues_utils.add_issue(issue_types.MemoryWarning("test"))
            ret = issues_utils._get_issues()
            self.assertEqual(ret,
                             {issues_utils.MASTER_YAML_ISSUES_FOUND_KEY:
                              [{'type': 'MemoryWarning',
                                'desc': 'test',
                                'origin': 'testplugin.01part'}]})
Example #6
    def get_partition_handling(self):
        """Get the partition handling settings."""
        results = self.results.find_by_tag("cluster_partition_handling")
        if not results:
            return

        setting = results[0].get(1)
        if setting == "ignore":
            msg = "Cluster partition handling is currently set to ignore. " \
                "This is potentially dangerous and a setting of " \
                "pause_minority is recommended."
            issues_utils.add_issue(issue_types.RabbitMQWarning(msg))
            self.resources["cluster-partition-handling"] = setting
Example #7
    def check_log_errors(self):
        path = os.path.join(constants.DATA_ROOT,
                            'var/log/rabbitmq/rabbit@*.log')
        if constants.USE_ALL_LOGS:
            path = f"{path}*"

        self.searcher.add_search_term(SearchDef(r".+ \S+_partitioned_network",
                                                tag="partitions"),
                                      path=path)
        results = self.searcher.search()
        if results.find_by_tag("partitions"):
            msg = ("cluster either has or has had partitions - check "
                   "cluster_status")
            issues_utils.add_issue(issue_types.RabbitMQWarning(msg))
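
A quick check of the partition search expression above against a hypothetical rabbit log line (real lines carry timestamps and node details, but contain the same *_partitioned_network marker):

import re

line = ("=ERROR REPORT==== mnesia_event got {inconsistent_database, "
        "running_partitioned_network, 'rabbit@node2'}")
print(bool(re.search(r".+ \S+_partitioned_network", line)))  # True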
Example #8
    def get_machine_info(self):
        ps_machines = set()
        log_machines = set()
        machines_running = set()
        machines_stopped = set()

        if not os.path.exists(JUJU_LOG_PATH):
            return

        for line in cli_helpers.get_ps():
            if "machine-" in line:
                ret = re.compile(r".+machine-([0-9]+).*").match(line)
                if ret:
                    ps_machines.add(ret[1])

        for f in os.listdir(JUJU_LOG_PATH):
            ret = re.compile(r"machine-([0-9]+)\.log.*").match(f)
            if ret:
                log_machines.add(ret[1])

        combined_machines = ps_machines.union(log_machines)
        for machine in combined_machines:
            conf_path = (
                "var/lib/juju/agents/machine-{}/agent.conf".format(machine))
            agent_conf = os.path.join(constants.DATA_ROOT, conf_path)
            version = "unknown"
            if os.path.exists(agent_conf):
                expr = re.compile(r"upgradedToVersion:\s+(.+)")
                with open(agent_conf) as fd:
                    for line in fd:
                        ret = expr.match(line)
                        if ret:
                            version = ret[1]

            if machine in ps_machines:
                machines_running.add("{} (version={})".format(
                    machine, version))
            else:
                machines_stopped.add(machine)

        if machines_running:
            JUJU_MACHINE_INFO["machines"]["running"] = list(machines_running)

        if machines_stopped:
            JUJU_MACHINE_INFO["machines"]["stopped"] = list(machines_stopped)

        if not machines_running and (machines_stopped
                                     or self.get_local_running_units()):
            msg = ("there is no Juju machined running on this host but it "
                   "seems there should be")
            add_issue(JujuWarning(msg))
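
The two regexes above can be sanity-checked against hypothetical inputs; the ps line and log file name below are made up but follow the patterns the code expects:

import re

ps_line = "/var/lib/juju/tools/machine-2/jujud machine --machine-id 2"
ret = re.compile(r".+machine-([0-9]+).*").match(ps_line)
print(ret[1] if ret else None)  # '2'

log_name = "machine-2.log.1.gz"
ret = re.compile(r"machine-([0-9]+)\.log.*").match(log_name)
print(ret[1] if ret else None)  # '2'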
Example #9
    def check_stats(self):
        cachesets = self.get_sysfs_cachesets()
        if not cachesets:
            return

        for path in cachesets:
            path = os.path.join(path, "cache_available_percent")
            with open(path) as fd:
                value = fd.read().strip()
                limit = CACHE_AVAILABLE_PERCENT_LIMIT_LP1900438
                if int(value) <= limit:
                    msg = (
                        "bcache cache_available_percent ({}) is <= {} - "
                        "this node could be suffering from bug 1900438".format(
                            value, limit))
                    add_issue(BcacheWarning(msg))
                    add_known_bug(1900438, "see BcacheWarning for info")
Example #10
    def get_queues(self):
        """Get distribution of queues across cluster."""
        sd = self._sequences["queues"]["searchdef"]
        vhost_queues = {}
        raise_issues = []
        for results in self.results.find_sequence_sections(sd).values():
            vhost = None
            queues = {}
            for result in results:
                if result.tag == sd.start_tag:
                    vhost = result.get(1)
                elif result.tag == sd.body_tag:
                    info = {"pid_name": result.get(1), "queue": result.get(2)}
                    if info["pid_name"] not in queues:
                        queues[info["pid_name"]] = 1
                    else:
                        queues[info["pid_name"]] += 1

            vhost_queues[vhost] = {}
            if not queues:
                continue

            total = sum(queues.values())
            for pid in queues:
                if total > 0:
                    fraction = queues[pid] / total
                    fraction_string = "{:.2f}%".format(fraction * 100)
                    if fraction > 2 / 3:
                        raise_issues.append(
                            "{} holds more than 2/3 of queues".format(pid))
                else:
                    fraction_string = "N/A"

                vhost_queues[vhost][pid] = "{:d} ({})".format(
                    queues[pid], fraction_string)

        for issue in raise_issues:
            issues_utils.add_issue(issue_types.RabbitMQWarning(issue))

        if vhost_queues:
            # list all vhosts but only show their queues if they have any
            self.resources["vhosts"] = sorted(list(vhost_queues.keys()))
            self.resources["vhost-queue-distributions"] = \
                {k: v for k, v in vhost_queues.items() if v}
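
A worked run of the distribution arithmetic above with hypothetical per-pid queue counts (the pid names are invented and the search plumbing is omitted):

queues = {"rabbit@node1": 8, "rabbit@node2": 1, "rabbit@node3": 1}
total = sum(queues.values())  # 10

for pid, count in queues.items():
    fraction = count / total
    print("{}: {:d} ({:.2f}%)".format(pid, count, fraction * 100))
    if fraction > 2 / 3:
        print("{} holds more than 2/3 of queues".format(pid))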
Example #11
    def check_vrrp_transitions(self):
        if "transitions" not in L3HA_CHECKS.get("keepalived", {}):
            return

        max_transitions = 0
        warn_count = 0
        threshold = VRRP_TRANSITION_WARN_THRESHOLD
        for router in L3HA_CHECKS["keepalived"]["transitions"]:
            transitions = L3HA_CHECKS["keepalived"]["transitions"][router]
            if transitions > threshold:
                max_transitions = max(transitions, max_transitions)
                warn_count += 1

        if warn_count:
            msg = ("{} router(s) have had more than {} vrrp transitions "
                   "(max={}) in the last 24 hours".format(
                       warn_count, threshold, max_transitions))
            issues_utils.add_issue(issue_types.NeutronL3HAWarning(msg))
Example #12
    def check_ovs_cleanup(self):
        """
        neutron-ovs-cleanup is allowed one run at node boot/reboot; any
        further run since boot suggests it was invoked manually and is
        flagged.
        """
        raise_issue = False
        start_count = 0
        for line in cli_helpers.get_journalctl(unit="neutron-ovs-cleanup"):
            expr = r"Started OpenStack Neutron OVS cleanup."
            if re.compile("-- Reboot --").match(line):
                # reset after reboot
                start_count = 0
            elif re.compile(expr).search(line):
                if start_count:
                    raise_issue = True
                    break

                start_count += 1

        if raise_issue:
            msg = ("neutron-ovs-cleanup has been manually run on this "
                   "host. This is not recommended and can have unintended "
                   "side-effects.")
            issues_utils.add_issue(issue_types.OpenstackWarning(msg))
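
A self-contained trace of the counting logic above, using hypothetical journal lines built from the same two markers the code matches on:

import re

journal_lines = [
    "-- Reboot --",
    "systemd[1]: Started OpenStack Neutron OVS cleanup.",
    "systemd[1]: Started OpenStack Neutron OVS cleanup.",
]

start_count = 0
raise_issue = False
for line in journal_lines:
    if re.compile("-- Reboot --").match(line):
        start_count = 0  # reset after reboot
    elif re.search(r"Started OpenStack Neutron OVS cleanup.", line):
        if start_count:
            raise_issue = True  # second start since boot
            break
        start_count += 1

print(raise_issue)  # True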
Example #13
    def process_results(self):
        """
        Report on interfaces that are showing packet drops or errors.

        Sometimes it is normal for an interface to have packet drops and if
        we think that is the case we ignore them, otherwise we raise an issue
        to alert.

        Interfaces we currently ignore:

        OVS bridges.

        In OpenStack, for example, when using Neutron HA routers, vrrp peers
        that are in BACKUP state may still receive packets on their external
        interface but these will be dropped since they have nowhere to go. In
        this case it is possible to have 100% packet drops on the interface
        if that VR has never been a vrrp MASTER. For this scenario we filter
        interfaces whose name matches e.g. qg-3ca935f4-07.
        """
        stats = {}
        all_dropped = []  # interfaces where all packets are dropped
        all_errors = []  # interfaces where all packets are errors
        for sd in self.sequence_defs:
            for results in self.results.find_sequence_sections(sd).values():
                port = None
                _stats = {}
                for result in results:
                    if result.tag == sd.start_tag:
                        port = result.get(1)
                    elif result.tag == sd.body_tag:
                        key = result.get(1)
                        packets = int(result.get(2))
                        errors = int(result.get(3))
                        dropped = int(result.get(4))

                        log_stats = False
                        if packets:
                            dropped_pcent = int((100 / packets) * dropped)
                            errors_pcent = int((100 / packets) * errors)
                            if dropped_pcent > 1 or errors_pcent > 1:
                                log_stats = True
                        elif errors or dropped:
                            log_stats = True

                        if log_stats:
                            _stats[key] = {"packets": packets}
                            if errors:
                                _stats[key]["errors"] = errors
                            if dropped:
                                _stats[key]["dropped"] = dropped

                if port and _stats:
                    # Ports to ignore - see docstring for info
                    if (port in self.ovs_bridges
                            or re.compile(r"^(q|s)g-\S{11}$").match(port)):
                        continue

                    for key in _stats:
                        s = _stats[key]
                        if s.get('dropped') and not s['packets']:
                            all_dropped.append(port)

                        if s.get('errors') and not s['packets']:
                            all_errors.append(port)

                    stats[port] = _stats

        if stats:
            if all_dropped:
                msg = (
                    "found {} ovs interfaces with 100% dropped packets".format(
                        len(all_dropped)))
                issues_utils.add_issue(issue_types.OpenvSwitchWarning(msg))

            if all_errors:
                msg = (
                    "found {} ovs interfaces with 100% packet errors".format(
                        len(all_errors)))
                issues_utils.add_issue(issue_types.OpenvSwitchWarning(msg))

            stats_sorted = {}
            for k in sorted(stats):
                stats_sorted[k] = stats[k]

            OVS_INFO["port-stats"] = stats_sorted
Example #14
    def get_queue_info(self):
        """Get distribution of queues across cluster."""
        sd = self._sequences["queues"]["searchdef"]
        vhost_queues = {}
        issues_raised = {}
        skewed_queue_nodes = {}
        for results in self.results.find_sequence_sections(sd).values():
            vhost = None
            queues = {}
            for result in results:
                if result.tag == sd.start_tag:
                    # check both report formats
                    vhost = result.get(1)
                elif result.tag == sd.body_tag:
                    node_name = result.get(1) or result.get(4)
                    # if we matched the section header, skip
                    if node_name == "pid":
                        continue

                    queue = result.get(2) or result.get(3)
                    # if we matched the section header, skip
                    if queue == "name":
                        continue

                    if node_name not in queues:
                        queues[node_name] = 0

                    queues[node_name] += 1

            vhost_queues[vhost] = {}
            if not queues:
                continue

            total = sum(queues.values())
            for node_name in queues:
                if total > 0:
                    fraction = queues[node_name] / total
                    fraction_string = "{:.2f}%".format(fraction * 100)
                    if fraction > 2 / 3:
                        if node_name not in skewed_queue_nodes:
                            skewed_queue_nodes[node_name] = 0

                        skewed_queue_nodes[node_name] += 1
                else:
                    fraction_string = "N/A"

                vhost_queues[vhost][node_name] = "{:d} ({})".format(
                    queues[node_name], fraction_string)

            # Report the node with the greatest skew of queues/vhost
            if skewed_queue_nodes:
                max_node = None
                for node_name in skewed_queue_nodes:
                    if max_node is None:
                        max_node = node_name
                    elif (skewed_queue_nodes[node_name] >=
                            skewed_queue_nodes[max_node]):
                        max_node = node_name

                if (skewed_queue_nodes[max_node] >
                        issues_raised.get(max_node, 0)):
                    issues_raised[max_node] = skewed_queue_nodes[max_node]

        # this should only actually ever report one node
        for node_name in issues_raised:
            msg = ("{} holds more than 2/3 of queues for {}/{} vhost(s)".
                   format(node_name, issues_raised[node_name],
                          len(vhost_queues)))
            issues_utils.add_issue(issue_types.RabbitMQWarning(msg))

        if vhost_queues:
            # list all vhosts but only show their queues if they have any
            self.resources["vhosts"] = sorted(list(vhost_queues.keys()))
            self.resources["vhost-queue-distributions"] = \
                {k: v for k, v in vhost_queues.items() if v}
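
The max-skew selection loop above can also be written with max(); a minimal sketch with hypothetical counts of skewed vhosts per node (note that on a tie max() keeps the first node seen, whereas the loop above keeps the last):

skewed_queue_nodes = {"rabbit@node1": 3, "rabbit@node2": 1}

max_node = max(skewed_queue_nodes, key=lambda n: skewed_queue_nodes[n])
print(max_node, skewed_queue_nodes[max_node])  # rabbit@node1 3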