Ejemplo n.º 1
0
    def service_filtered_ps(self):
        """
        Return the ps output lines that belong to our services.

        Each service is expanded to its matching units and the pids of each
        unit are read from its cgroup.procs file under the unified cgroup
        hierarchy in DATA_ROOT. Any ps line that contains one of those pids
        (surrounded by single spaces) is added to the result.

        @return: list of matching ps lines (empty if none found).
        """
        ps_filtered = []
        # ps output does not depend on the unit so fetch it once, and only
        # if we actually find pids to look for.
        ps_lines = None
        path = os.path.join(HotSOSConfig.DATA_ROOT,
                            'sys/fs/cgroup/unified/system.slice')
        for svc in self.services:
            # NOTE: use a distinct name so that the outer loop variable is not
            # clobbered.
            for unit in self.get_services_expanded(svc):
                unit_dir = "{}.service".format(unit)
                _path = os.path.join(path, unit_dir, 'cgroup.procs')
                if not os.path.exists(_path):
                    # fall back to looking inside nested system-*.slice dirs
                    candidates = glob.glob(
                        os.path.join(path, 'system-*.slice', unit_dir,
                                     'cgroup.procs'))
                    if not candidates or not os.path.exists(candidates[0]):
                        continue

                    _path = candidates[0]

                with open(_path) as fd:
                    # skip blank lines rather than failing in int()
                    pids = [int(line) for line in fd if line.strip()]

                if not pids:
                    continue

                if ps_lines is None:
                    ps_lines = list(CLIHelper().ps())

                for line in ps_lines:
                    for pid in pids:
                        if " {} ".format(pid) in line:
                            ps_filtered.append(line)

        return ps_filtered
Ejemplo n.º 2
0
    def __init__(self):
        """ Capture numactl output, using an empty string if unavailable. """
        try:
            output = CLIHelper().numactl()
        except OSError:
            # handled the same way as empty output
            output = None

        self.numactl = output if output else ""
        self._nodes = {}
Ejemplo n.º 3
0
    def path(self):
        """
        Return the path to be searched for this input.

        If a filesystem path is set it is returned joined onto DATA_ROOT, with
        a trailing wildcard appended when USE_ALL_LOGS is enabled and the
        'disable-all-logs' option is not set. Otherwise, if a command is set,
        its output is dumped to a temporary file and the path to that file is
        returned (the file is only created once).

        @return: path string or None if neither a path nor command is set.
        """
        if self.fs_path:  # pylint: disable=W0125
            path = os.path.join(HotSOSConfig.DATA_ROOT, self.fs_path)
            if (HotSOSConfig.USE_ALL_LOGS and not
                    self.options['disable-all-logs']):
                path = "{}*".format(path)

            return path

        if self.command:  # pylint: disable=W0125
            if self.cmd_tmp_path:
                # command output has already been saved
                return self.cmd_tmp_path

            args_callback = self.options['args-callback']
            if args_callback:
                args, kwargs = self.get_method(args_callback)
            else:
                args = self.options['args']
                kwargs = self.options['kwargs']

            # get command output
            out = getattr(CLIHelper(), self.command)(*args, **kwargs)
            # store in temp file to make it searchable
            # NOTE: we don't need to delete this at the end since it is
            # created in the plugin tmp dir which is wiped at the end of the
            # plugin run.
            if isinstance(out, list):
                out = ''.join(out)
            elif isinstance(out, dict):
                out = str(out)

            self.cmd_tmp_path = mktemp_dump(out)
            return self.cmd_tmp_path

        log.debug("no input provided")
        return None
Ejemplo n.º 4
0
 def __summary_rootfs(self):
     """
     Return the first line of df output that ends with a '/' or None if
     there is no such line.
     """
     rootfs_expr = re.compile(r"(.+\/$)")
     df_lines = CLIHelper().df() or []
     for entry in df_lines:
         found = rootfs_expr.match(entry)
         if found:
             return found[1]

     return None
Ejemplo n.º 5
0
    def services(self):
        """
        Return a dict of identified systemd services and their state.

        Services are represented as either direct or indirect units and
        typically use one or the other. We homogenise these to present state
        based on the one we think is being used. Enabled units are aggregated
        but masked units are not so that they can be identified and reported.

        The result is cached once it is non-empty.

        @return: dict of SystemdService objects keyed by unit name.
        """
        if self._service_info:
            return self._service_info

        # Expressions do not depend on the line being matched so compile them
        # once up front.
        unit_files_exprs = []
        for expr in self.service_exprs:
            # Add snap prefix/suffixes
            base_expr = r"(?:snap\.)?{}(?:\.daemon)?".format(expr)
            # NOTE: we include indirect services (ending with @) so that
            #       we can search for related units later.
            unit_expr = r'^\s*({}(?:@\S*)?)\.service'.format(base_expr)
            # match entries in systemctl list-unit-files
            unit_files_exprs.append(
                re.compile(r'{}\s+(\S+)'.format(unit_expr)))

        svc_info = {}
        indirect_svc_info = {}
        for line in CLIHelper().systemctl_list_unit_files():
            for cexpr in unit_files_exprs:
                ret = cexpr.match(line)
                if not ret:
                    continue

                unit = ret.group(1)
                state = ret.group(2)
                if not unit.endswith('@'):
                    svc_info[unit] = SystemdService(unit, state)
                    continue

                # indirect or "template" units can have "instantiated"
                # units where only the latter represents whether the
                # unit is in use. If an indirect unit has instantiated
                # units we use them to represent the state of the
                # service.
                unit_svc_expr = r"\s+({}\d*)".format(unit)
                unit = unit.partition('@')[0]
                if self._get_systemd_units(unit_svc_expr):
                    state = 'enabled'

                indirect_svc_info[unit] = state

        # Allow indirect unit info to override given certain conditions
        for unit, state in indirect_svc_info.items():
            if unit not in svc_info:
                svc_info[unit] = SystemdService(unit, state)
                continue

            # NOTE: compare the state of the service, not the service object
            # itself.
            if state == 'disabled' or svc_info[unit].state == 'enabled':
                continue

            svc_info[unit].state = state

        self._service_info = svc_info
        return self._service_info
Ejemplo n.º 6
0
    def num_cpus(self):
        """ Return number of cpus or 0 if none found. """
        cpus_expr = re.compile(r"^CPU\(s\):\s+([0-9]+)\s*.*")
        lscpu_lines = CLIHelper().lscpu() or []
        for entry in lscpu_lines:
            found = cpus_expr.match(entry)
            if found:
                # first matching line wins
                return int(found[1])

        return 0
Ejemplo n.º 7
0
    def virtualisation_type(self):
        """
        @return: virt type e.g. kvm or lxc if host is virtualised otherwise
                 None.
        """
        for entry in CLIHelper().hostnamectl():
            # split into the text before and after the first ': '
            key, _, value = entry.partition(': ')
            if 'Virtualization' in key:
                return value.strip()

        return None
Ejemplo n.º 8
0
    def get_services_expanded(self, name):
        """
        Return the names of all units listed by systemctl list-units that
        match the given service name, including any instantiated (name@...)
        units.

        @param name: service name (used as part of a regular expression).
        @return: list of matching unit names or [name] if none matched.
        """
        unit_expr = r'^\s*({}(@\S*)?)\.service'.format(name)
        matches = (re.match(unit_expr, line)
                   for line in CLIHelper().systemctl_list_units())
        expanded = [found.group(1) for found in matches if found]
        return expanded or [name]
Ejemplo n.º 9
0
    def _get_systemd_units(self, expr):
        """
        Search systemd unit instances.

        @param expr: expression used to match one or more units in --list-units
        @return: list containing the first group of each matching line.
        """
        matches = (re.match(expr, line)
                   for line in CLIHelper().systemctl_list_units())
        return [found.group(1) for found in matches if found]
Ejemplo n.º 10
0
    def sysctl_all(self):
        """
        Return a dict of all sysctl keys and their values.

        Each line is split on the first '=' and any run of whitespace in the
        value is normalised into a single space. The result is cached.
        """
        if self._sysctl_all is None:
            settings = {}
            for entry in CLIHelper().sysctl_all():
                key, _, value = entry.partition("=")
                # split() also discards leading/trailing whitespace
                settings[key.strip()] = ' '.join(value.split())

            self._sysctl_all = settings

        return self._sysctl_all
Ejemplo n.º 11
0
    def __summary_devices(self):
        """
        Return info about bcache and nvme block devices.

        Devices are identified from the sys/block listing and for each one
        the udev info is searched for a disk/by-dname entry.

        @return: dict keyed by device type then device name or None if no
                 devices found.
        """
        devs = {}
        dname_expr = re.compile(r'.+\s+disk/by-dname/(.+)')
        # The listing is the same for every device type so only fetch it once.
        sys_block = list(CLIHelper().ls_lanR_sys_block())
        for dev_type in ['bcache', 'nvme']:
            dev_expr = re.compile(
                r".+[0-9:]+\s+({}[0-9a-z]+)\s+.+".format(dev_type))
            for line in sys_block:
                ret = dev_expr.match(line)
                if not ret:
                    continue

                devname = ret[1]
                info = {}
                devs.setdefault(dev_type, {})[devname] = info
                # NOTE: use names distinct from those of the outer loop so
                # that we don't clobber them.
                for udev_line in CLIHelper().udevadm_info_dev(device=devname):
                    dname = dname_expr.match(udev_line)
                    if dname:
                        info['dname'] = dname[1]
                    elif 'dname' not in info:
                        info['dname'] = '<notfound>'

        if devs:
            return devs

        return None
Ejemplo n.º 12
0
    def unattended_upgrades_enabled(self):
        """
        Check the APT::Periodic::Unattended-Upgrade setting in the apt config
        dump.

        @return: None if there is no apt config dump, True if the first
                 matching setting has a non-zero value otherwise False.
        """
        apt_config_dump = CLIHelper().apt_config_dump()
        if not apt_config_dump:
            return None

        setting_expr = re.compile(r"^APT::Periodic::Unattended-Upgrade\s+"
                                  "\"([0-9]+)\";")
        for entry in apt_config_dump:
            found = setting_expr.match(entry)
            if found:
                return int(found[1]) != 0

        return False
Ejemplo n.º 13
0
class KernelLogEventChecks(KernelEventChecksBase):
    """ Event checks using the 'kernlog' yaml definitions group. """

    def __init__(self):
        base_kwargs = {'yaml_defs_group': 'kernlog',
                       'searchobj': FileSearcher(),
                       'callback_helper': EVENTCALLBACKS}
        super().__init__(**base_kwargs)
        self.cli_helper = CLIHelper()
        self.hostnet_helper = HostNetworkingHelper()

    @EVENTCALLBACKS.callback()
    def over_mtu_dropped_packets(self, event):
        """
        Count the results per interface (first result group) and raise a
        NetworkWarning if any of those interfaces exist on the host and are
        not ovs bridges.

        @return: dict of interface names and their counts sorted by highest
                 count first or None if there is nothing to report.
        """
        drop_counts = {}
        for result in event.results:
            name = result.get(1)
            drop_counts[name] = drop_counts.get(name, 0) + 1

        if not drop_counts:
            return None

        # only report on interfaces that currently exist
        host_interfaces = [
            iface.name for iface in self.hostnet_helper.host_interfaces_all
        ]
        # filter out interfaces that are actually ovs bridge aliases (names
        # are stripped of trailing newline chars)
        ovs_bridges = [
            br.strip() for br in self.cli_helper.ovs_vsctl_list_br()
        ]
        interfaces_extant = {
            name: count for name, count in drop_counts.items()
            if name in host_interfaces and name not in ovs_bridges
        }
        if not interfaces_extant:
            return None

        msg = ("kernel has reported over-mtu dropped packets for ({}) "
               "interfaces.".format(len(interfaces_extant)))
        IssuesManager().add(NetworkWarning(msg))

        # sort by number of occurrences
        by_count = sorted(interfaces_extant.items(), key=lambda e: e[1],
                          reverse=True)
        return dict(by_count)
Ejemplo n.º 14
0
    def start_time(self):
        """ Get most recent start time of this service unit.

        The journal for this unit is searched for "Started" or "Starting"
        entries and the timestamp of the last one found is used. The result
        is cached once found.

        @returns: datetime.datetime object or None if time not found.
        """
        if self._start_time:
            return self._start_time

        started_expr = re.compile(r"^(([0-9-]+)T[\d:]+\+[\d]+)\s+.+: "
                                  "(Started|Starting) .+")
        latest = None
        for entry in CLIHelper().journalctl(unit=self.name):
            found = started_expr.search(entry)
            if found:
                # keep going since we want the last match
                latest = found.group(1)

        if latest:
            # NOTE(review): the digits following '+' are parsed using %f -
            # confirm this is the intended handling of that field.
            self._start_time = datetime.strptime(latest,
                                                 "%Y-%m-%dT%H:%M:%S+%f")

        return self._start_time
Ejemplo n.º 15
0
    def filter_by_age(cls, results, result_age_hours):
        """
        Filter results so that only those within the last result_age_hours
        hours (relative to the date reported by the CLI helper) are kept.
        Results for which no timestamp can be obtained are dropped.

        @param results: iterable of search results.
        @param result_age_hours: maximum age in hours. If not set, no
                                 filtering is done.
        @return: list of filtered results or the original results if the
                 filter could not be applied.
        """
        if not result_age_hours:
            log.debug("result age filter not specified - skipping")
            return results

        current = CLIHelper().date(format='+%Y-%m-%d %H:%M:%S')
        if not current:
            log.warning("date() returned unexpected value '%s' - skipping "
                        "filter by age", current)
            return results

        current = datetime.strptime(current, "%Y-%m-%d %H:%M:%S")
        log.debug("applying search filter (result_age_hours=%s, "
                  "current='%s')", result_age_hours, current)

        # the cutoff is the same for every result so only calculate it once
        oldest = current - timedelta(hours=result_age_hours)
        _results = []
        for r in results:
            ts = cls.get_datetime_from_result(r)
            if ts and ts >= oldest:
                _results.append(r)

        return _results
Ejemplo n.º 16
0
 def __init__(self):
     """ Set up 'kernlog' event checks along with cli and network helpers. """
     base_kwargs = {'yaml_defs_group': 'kernlog',
                    'searchobj': FileSearcher(),
                    'callback_helper': EVENTCALLBACKS}
     super().__init__(**base_kwargs)
     self.cli_helper = CLIHelper()
     self.hostnet_helper = HostNetworkingHelper()
Ejemplo n.º 17
0
 def loadavg(self):
     """
     Return the text following "load average:" in the uptime output or None
     if it is not found.
     """
     uptime = CLIHelper().uptime()
     if not uptime:
         return None

     found = re.match(r".+load average:\s+(.+)", uptime)
     return found[1] if found else None
Ejemplo n.º 18
0
 def __init__(self, *args, **kwargs):
     """ Initialise the parent then set up the cli helper and devs list. """
     super().__init__(*args, **kwargs)
     self.cli = CLIHelper()
     # populated later on
     self._bcache_devs = []
Ejemplo n.º 19
0
class BcacheBase(StorageBase):
    """ Helpers for inspecting bcache cachesets and devices. """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # cache for udev_bcache_devs
        self._bcache_devs = []
        self.cli = CLIHelper()

    @property
    def bcache_enabled(self):
        """
        Return True if there are any backing devices configured otherwise
        False.
        """
        return any(self.get_cacheset_bdevs(cset)
                   for cset in self.get_cachesets())

    def get_cachesets(self):
        """ Return list of paths found under sys/fs/bcache in DATA_ROOT. """
        return glob.glob(
            os.path.join(HotSOSConfig.DATA_ROOT, 'sys/fs/bcache/*'))

    def get_cacheset_bdevs(self, cset):
        """
        Return list of bdev* paths found under the given cacheset path.

        @param cset: path to cacheset directory.
        """
        return glob.glob(os.path.join(cset, 'bdev*'))

    def get_sysfs_cachesets(self):
        """
        Return a list of dicts, one per cacheset that has a
        cache_available_percent file, each containing the cacheset uuid
        (directory name) and cache_available_percent as an integer.
        """
        cachesets = []
        for entry in self.get_cachesets():
            path = os.path.join(entry, "cache_available_percent")
            if not os.path.exists(path):
                continue

            with open(path) as fd:
                value = fd.read().strip()

            cachesets.append({
                "uuid": os.path.basename(entry),
                "cache_available_percent": int(value)
            })

        return cachesets

    @property
    def udev_bcache_devs(self):
        """ If bcache devices exist fetch information and return as a list. """
        if self._bcache_devs:
            return self._bcache_devs

        udevadm_info = self.cli.udevadm_info_exportdb()
        if not udevadm_info:
            return self._bcache_devs

        s = FileSearcher()
        sdef = SequenceSearchDef(start=SearchDef(r"^P: .+/(bcache\S+)"),
                                 body=SearchDef(r"^S: disk/by-uuid/(\S+)"),
                                 tag="bcacheinfo")
        # dump to a temporary file so that it can be searched
        s.add_search_term(sdef, utils.mktemp_dump('\n'.join(udevadm_info)))
        results = s.search()
        devs = []
        for section in results.find_sequence_sections(sdef).values():
            dev = {}
            for r in section:
                if r.tag == sdef.start_tag:
                    dev["name"] = r.get(1)
                else:
                    dev["by-uuid"] = r.get(1)

            devs.append(dev)

        self._bcache_devs = devs
        return self._bcache_devs

    def is_bcache_device(self, dev):
        """
        Returns True if the device either is or is based on a bcache device
        e.g. dmcrypt device using bcache dev.

        @param dev: device name or path.
        """
        if dev.startswith(("bcache", "/dev/bcache")):
            return True

        ret = re.compile(r"/dev/mapper/crypt-(\S+)").search(dev)
        if ret:
            uuid = ret.group(1)
            # NOTE: use a distinct name so that we don't clobber the dev
            # parameter.
            for bdev in self.udev_bcache_devs:
                if bdev.get("by-uuid") == uuid:
                    return True

        return False
Ejemplo n.º 20
0
 def date(self):
     """ Return the result of the cli helper date() with no_format=True. """
     helper = CLIHelper()
     return helper.date(no_format=True)
Ejemplo n.º 21
0
 def hostname(self):
     """ Return the result of the cli helper hostname(). """
     helper = CLIHelper()
     return helper.hostname()
Ejemplo n.º 22
0
class OpenstackNetworkChecks(OpenstackChecksBase):
    """ Network related checks for Openstack services. """

    def __init__(self):
        super().__init__()
        self.cli = CLIHelper()

    @property
    def summary_subkey(self):
        """ Returns the string 'network'. """
        return 'network'

    def _get_port_stat_outliers(self, counters):
        """ For a given port's packet counters, identify outliers i.e. > 1%
        and create a new dict with count and percent values.

        @param counters: dict of counter dicts keyed by direction.
        @return: dict keyed by direction containing only the outliers.
        """
        outliers = {}
        for rxtx in counters:
            direction = counters[rxtx]
            total = sum(direction.values())
            for key, value in direction.items():
                # the packets counter is not itself reported and zero
                # counters are of no interest.
                if key == "packets" or not value:
                    continue

                pcent = int(100 / float(total) * float(value))
                if pcent > 1:
                    outliers.setdefault(rxtx, {})[key] = (
                        "{} ({}%)".format(int(value), pcent))

        return outliers

    def get_config_info(self):
        """
        Return a dict, keyed by project name, of the bind interfaces of each
        project (as dicts) keyed by name.
        """
        config_info = {}
        for project in ['nova', 'neutron', 'octavia']:
            _project = getattr(self, project)
            if not (_project and _project.bind_interfaces):
                continue

            for name, port in _project.bind_interfaces.items():
                config_info.setdefault(project, {})[name] = port.to_dict()

        return config_info

    def get_phy_port_health_info(self):
        """ Identify ports used by Openstack services, include them in output
        for informational purposes along with their health (dropped packets
        etc) for any outliers detected.
        """
        port_health_info = {}
        for project in ['nova', 'neutron', 'octavia']:
            _project = getattr(self, project)
            if not (_project and _project.bind_interfaces):
                continue

            for port in _project.bind_interfaces.values():
                if not port.stats:
                    continue

                outliers = self._get_port_stat_outliers(port.stats)
                if outliers:
                    port_health_info[port.name] = outliers

        return port_health_info

    def __summary_config(self):
        """ Return config info or None if there is none. """
        return self.get_config_info() or None

    def __summary_phy_port_health(self):
        """ Return physical port health info or None if there is none. """
        return self.get_phy_port_health_info() or None

    def __summary_namespaces(self):
        """Populate namespace information dict."""
        ns_expr = re.compile(r"^([a-z0-9]+)-([0-9a-z\-]+)\s+.+")
        ns_info = {}
        for line in self.cli.ip_netns():
            found = ns_expr.match(line)
            if found:
                # count namespaces by their prefix
                prefix = found[1]
                ns_info[prefix] = ns_info.get(prefix, 0) + 1

        return ns_info or None

    def __summary_vm_port_health(self):
        """ For each instance get its ports and check port health, reporting on
        any outliers. """
        if not self.nova.instances:
            return None

        port_health_info = {}
        for guest in self.nova.instances.values():
            for port in guest.ports:
                stats = port.stats
                if not stats:
                    continue

                outliers = self._get_port_stat_outliers(stats)
                if outliers:
                    guest_info = port_health_info.setdefault(guest.uuid, {})
                    guest_info[port.hwaddr] = outliers

        if not port_health_info:
            return None

        return {'num-vms-checked': len(self.nova.instances),
                'stats': port_health_info}
Ejemplo n.º 23
0
 def __init__(self):
     """ Initialise the parent then create a cli helper. """
     super().__init__()
     helper = CLIHelper()
     self.cli = helper
Ejemplo n.º 24
0
    def cli(data_root, version, defs_path, all_logs, quiet, debug, save,
            format, html_escape, user_summary, short, very_short, full,
            agent_error_key_by_time, max_logrotate_depth, max_parallel_tasks,
            list_plugins, machine_readable, **kwargs):
        """
        Run this tool on a host or against an unpacked sosreport to perform
        analysis of specific applications and the host itself. A summary of
        information is generated along with any issues or known bugs detected.
        Applications are defined as plugins and support currently includes
        Openstack, Kubernetes, Ceph and more (see --list-plugins). The
        standard output format is yaml to allow easy visual inspection and
        post-processing by other tools. Other formats are also supported.

        There a three main components to this tool; the core python library,
        plugin extensions and a library of checks written in a high level
        yaml-based language.

        \b
        DATA_ROOT
            Path to an unpacked sosreport. If none provided, will run against
            local host.
        """  # noqa

        # NOTE: 'short' takes precedence over 'very_short' if both are set.
        full_mode_explicit = full
        minimal_mode = None
        if short:
            minimal_mode = 'short'
        elif very_short:
            minimal_mode = 'very-short'

        repo_info = get_repo_info()
        if repo_info:
            setup_config(REPO_INFO=repo_info)

        _version = get_version()
        setup_config(HOTSOS_VERSION=_version)

        # If the version was requested we just print it and exit.
        if version:
            print(_version)
            return

        # When user_summary is set, data_root is later opened as a file (see
        # below) so it is only normalised as a directory path otherwise.
        if not user_summary:
            if not data_root or data_root == '/':
                data_root = '/'
            elif data_root[-1] != '/':
                # Ensure trailing slash
                data_root += '/'

        setup_config(USE_ALL_LOGS=all_logs,
                     PLUGIN_YAML_DEFS=defs_path,
                     DATA_ROOT=data_root,
                     AGENT_ERROR_KEY_BY_TIME=agent_error_key_by_time,
                     MAX_LOGROTATE_DEPTH=max_logrotate_depth,
                     MAX_PARALLEL_TASKS=max_parallel_tasks,
                     MACHINE_READABLE=machine_readable)

        # NOTE: this check happens after the config has been set up.
        if debug and quiet:
            sys.stderr.write('ERROR: cannot use both --debug and --quiet\n')
            return

        if debug:
            setup_logging(debug)

        # Print the plugin names, one per line, then exit.
        if list_plugins:
            sys.stdout.write('\n'.join(PLUGIN_CATALOG.keys()))
            sys.stdout.write('\n')
            return

        if data_root == '/':
            analysis_target = 'localhost'
        else:
            analysis_target = 'sosreport {}'.format(data_root)

        # The spinner is not shown if either quiet or debug is set.
        if quiet:
            show_spinner = False
            spinner_msg = ''
        else:
            show_spinner = not debug
            spinner_msg = 'INFO: analysing {} '.format(analysis_target)

        with progress_spinner(show_spinner, spinner_msg):
            if user_summary:
                # Load the summary from the yaml file at data_root rather
                # than running any plugins.
                log.debug("User summary provided in %s", data_root)
                with open(data_root) as fd:
                    summary = yaml.safe_load(fd)
            else:
                # Any remaining keyword argument whose value is True is used
                # as the name of a plugin to run.
                plugins = []
                for k, v in kwargs.items():
                    if v is True:
                        plugins.append(k)

                # NOTE: if no plugins were selected the list is passed to the
                # client empty.
                if plugins:
                    # always run these
                    plugins.append('hotsos')
                    if 'system' not in plugins:
                        plugins.append('system')

                summary = HotSOSClient().run(plugins)

        formatted = output_filter.apply_output_formatting(
            summary, format, html_escape, minimal_mode)
        if save:
            # Work out the base name of the output file(s).
            if user_summary:
                # name of the summary file minus its extension
                output_name = os.path.basename(data_root)
                output_name = output_name.rpartition('.')[0]
            else:
                if data_root != '/':
                    # strip trailing slash so that basename gives us the
                    # name of the last directory in the path.
                    if data_root.endswith('/'):
                        data_root = data_root.rpartition('/')[0]

                    output_name = os.path.basename(data_root)
                else:
                    output_name = "hotsos-{}".format(CLIHelper().hostname())

            if minimal_mode:
                if formatted:
                    out = "{}.short.summary".format(output_name)
                    with open(out, 'w', encoding='utf-8') as fd:
                        fd.write(formatted)
                        fd.write('\n')

                    sys.stdout.write(
                        "INFO: short summary written to {}\n".format(out))
                # If a full summary was requested as well, re-apply the
                # formatting without the minimal mode so that it is saved
                # below.
                if full_mode_explicit:
                    formatted = output_filter.apply_output_formatting(
                        summary, format, html_escape)

            if not minimal_mode or full_mode_explicit:
                if formatted:
                    out = "{}.summary".format(output_name)
                    with open(out, 'w', encoding='utf-8') as fd:
                        fd.write(formatted)
                        fd.write('\n')

                    sys.stdout.write(
                        "INFO: full summary written to {}\n".format(out))

        else:
            # Not saving so write the output to stdout.
            if debug:
                sys.stderr.write('Results:\n')

            if formatted:
                sys.stdout.write("{}\n".format(formatted))
Ejemplo n.º 25
0
 def __init__(self, *args, **kwargs):
     """
     Set up 'neutron-router-checks' event checks along with the cli helper
     and HA info.
     """
     super().__init__(*args,
                      yaml_defs_group='neutron-router-checks',
                      callback_helper=EVENTCALLBACKS,
                      **kwargs)
     self.cli = CLIHelper()
     self.ha_info = NeutronHAInfo()
Ejemplo n.º 26
0
class NeutronL3HAEventChecks(OpenstackEventChecksBase):
    """ Event checks using the 'neutron-router-checks' definitions group. """

    def __init__(self, *args, **kwargs):
        super().__init__(*args,
                         yaml_defs_group='neutron-router-checks',
                         callback_helper=EVENTCALLBACKS,
                         **kwargs)
        self.cli = CLIHelper()
        self.ha_info = NeutronHAInfo()

    def check_vrrp_transitions(self, transitions):
        """
        Raise a NeutronL3HAWarning if any router has had more than
        VRRP_TRANSITION_WARN_THRESHOLD transitions in total.

        @param transitions: dict of per-date transition counts keyed by
                            router.
        """
        # there will likely be a large number of transitions if we look across
        # all time so dont run this check.
        if HotSOSConfig.USE_ALL_LOGS:
            return

        threshold = VRRP_TRANSITION_WARN_THRESHOLD
        totals = (sum(per_date.values()) for per_date in transitions.values())
        exceeded = [total for total in totals if total > threshold]
        if not exceeded:
            return

        msg = ("{} router(s) have had more than {} vrrp transitions "
               "(max={}) in the last 24 hours.".format(len(exceeded),
                                                       threshold,
                                                       max(exceeded)))
        IssuesManager().add(NeutronL3HAWarning(msg))

    def journalctl_args(self):
        """ Args callback for event cli command """
        kwargs = {'unit': 'neutron-l3-agent'}
        if not HotSOSConfig.USE_ALL_LOGS:
            # restrict to the date returned by the cli helper
            kwargs['date'] = self.cli.date(format="--iso-8601")

        return [], kwargs

    @EVENTCALLBACKS.callback()
    def vrrp_transitions(self, event):
        """
        Count the results per router and date. Results whose vr_id cannot be
        resolved to a router are skipped.

        @return: tuple of ({'transitions': <counts>}, 'keepalived') or None if
                 there are no transitions.
        """
        transitions = {}
        for result in event.results:
            ts_date = result.get(1)
            vr_id = result.get(2)
            router = self.ha_info.find_router_with_vr_id(vr_id)
            if not router:
                log.debug("no router found with vr_id %s", vr_id)
                continue

            per_date = transitions.setdefault(router.uuid, {})
            per_date[ts_date] = per_date.get(ts_date, 0) + 1

        if not transitions:
            return None

        # run checks
        self.check_vrrp_transitions(transitions)
        # add info to summary
        return {'transitions': transitions}, 'keepalived'

    def __summary_neutron_l3ha(self):
        """ Returns the final event results. """
        return self.final_event_results