Example #1
    def _check_solaris(self, instance):
        # Can't get bytes sent and received via netstat
        # Default to kstat -p link:0:
        custom_tags = instance.get('tags', [])
        try:
            netstat, _, _ = get_subprocess_output(["kstat", "-p", "link:0:"], self.log)
            metrics_by_interface = self._parse_solaris_netstat(netstat)
            for interface, metrics in metrics_by_interface.items():
                self._submit_devicemetrics(interface, metrics, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting kstat stats.")

        try:
            netstat, _, _ = get_subprocess_output(["netstat", "-s", "-P" "tcp"], self.log)
            # TCP: tcpRtoAlgorithm=     4 tcpRtoMin           =   200
            # tcpRtoMax           = 60000 tcpMaxConn          =    -1
            # tcpActiveOpens      =    57 tcpPassiveOpens     =    50
            # tcpAttemptFails     =     1 tcpEstabResets      =     0
            # tcpCurrEstab        =     0 tcpOutSegs          =   254
            # tcpOutDataSegs      =   995 tcpOutDataBytes     =1216733
            # tcpRetransSegs      =     0 tcpRetransBytes     =     0
            # tcpOutAck           =   185 tcpOutAckDelayed    =     4
            # ...
            self._submit_regexed_values(netstat, SOLARIS_TCP_METRICS, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting TCP stats.")
Example #2
    def check(self, instance):
        cluster_uri = instance['cluster_uri']
        public_key = instance['cluster_public_key']
        security_file = instance['user_security_file']
        node_id = instance['node_id']

        if public_key and security_file:
            metrics, err, retcode = get_subprocess_output(
                [
                    "python3", "-m", "qdb_datadog", "--cluster", cluster_uri,
                    "--node-id", node_id, "--prefix", "qdb",
                    "--cluster-public-key", public_key, "--user-security-file",
                    security_file
                ],
                self.log,
                raise_on_empty_output=True)
        else:
            metrics, err, retcode = get_subprocess_output(
                [
                    "python3", "-m", "qdb_datadog", "--cluster", cluster_uri,
                    "--node-id", node_id, "--prefix", "qdb"
                ],
                self.log,
                raise_on_empty_output=True)

        for m in metrics.splitlines():
            k, t, v = m.split(',')

            if t == 'GAUGE':
                self.gauge(k, int(v), tags=['node_id:' + node_id])
            elif t == 'COUNTER':
                self.monotonic_count(k, int(v), tags=['node_id:' + node_id])
            else:
                raise RuntimeError("Unrecognized counter type: '" + t + "'")
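The split above assumes qdb_datadog prints one metric per line as name,type,value; a hypothetical sample (not taken from the qdb_datadog documentation):

qdb.cluster.node_count,GAUGE,3
qdb.requests.total,COUNTER,18042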
Example #3
    def _get_postqueue_stats(self, postfix_config_dir, tags):

        # get some interesting configuration values from postconf
        pc_output, _, _ = get_subprocess_output(['postconf', 'mail_version'],
                                                self.log, False)
        postfix_version = pc_output.strip('\n').split('=')[1].strip()
        pc_output, _, _ = get_subprocess_output(
            ['postconf', 'authorized_mailq_users'], self.log, False)
        authorized_mailq_users = pc_output.strip('\n').split('=')[1].strip()

        self.log.debug(
            'authorized_mailq_users : {}'.format(authorized_mailq_users))

        output, _, _ = get_subprocess_output(
            ['postqueue', '-c', postfix_config_dir, '-p'], self.log, False)

        active_count = 0
        hold_count = 0
        deferred_count = 0

        # postqueue -p sample output
        '''
        root@postfix:/opt/datadog-agent/agent/checks.d# postqueue -p
        ----Queue ID----- --Size-- ---Arrival Time---- --Sender/Recipient------
        3xWyLP6Nmfz23fk        367 Tue Aug 15 16:17:33 [email protected]
                                                            (deferred transport)
                                                            [email protected]

        3xWyD86NwZz23ff!       358 Tue Aug 15 16:12:08 [email protected]
                                                            (deferred transport)
                                                            [email protected]

        -- 1 Kbytes in 2 Requests.
        '''

        for line in output.splitlines():
            if '*' in line:
                # a '*' after the queue ID marks a message in the active queue
                active_count += 1
                continue
            if '!' in line:
                # a '!' after the queue ID marks a message on hold
                hold_count += 1
                continue
            if line[0:1].isdigit():
                # any other line starting with a queue ID is a deferred message
                deferred_count += 1

        self.log.debug('Postfix Version: %s' % postfix_version)

        self.gauge('postfix.queue.size',
                   active_count,
                   tags=tags +
                   ['queue:active', 'instance:{}'.format(postfix_config_dir)])
        self.gauge('postfix.queue.size',
                   hold_count,
                   tags=tags +
                   ['queue:hold', 'instance:{}'.format(postfix_config_dir)])
        self.gauge(
            'postfix.queue.size',
            deferred_count,
            tags=tags +
            ['queue:deferred', 'instance:{}'.format(postfix_config_dir)])
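The three gauge calls differ only in the queue tag; an equivalent, more compact submission loop (a drop-in sketch for the same method):

        for queue, count in (('active', active_count),
                             ('hold', hold_count),
                             ('deferred', deferred_count)):
            self.gauge('postfix.queue.size', count,
                       tags=tags + ['queue:{}'.format(queue),
                                    'instance:{}'.format(postfix_config_dir)])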
Example #4
    def call_unbound_control(self, command, tags):
        try:
            # Pass raise_on_empty_output as False so we get a chance to log stderr
            ub_out, ub_err, returncode = get_subprocess_output(
                command, self.log, raise_on_empty_output=False)
        except Exception as e:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               message="exception collecting stats",
                               tags=tags)
            raise Exception("Unable to get unbound stats: {}".format(str(e)))

        for line in ub_err.splitlines():
            self.log.debug('stderr from %s: %s', command, line)

        # Check the return value
        if returncode != 0:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               message="non-zero return code collecting stats",
                               tags=tags)
            raise Exception('"{}" failed, return code: {}'.format(
                command, returncode))

        # And because we pass raise_on_empty_output as False, check that too
        if not ub_out:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               message="no stats",
                               tags=tags)
            raise Exception('no output from "{}"'.format(command))

        return ub_out
Example #5
def which(program, use_sudo, log):
    def is_exe(fpath):
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    if use_sudo:
        # Pass raise_on_empty_output as False to leave it to the caller to
        # handle the not-found case.
        stdout, stderr, returncode = get_subprocess_output(
            ['sudo', 'which', program], log, raise_on_empty_output=False)
        if returncode == 0:
            return stdout

        for line in stderr.splitlines():
            log.debug('stderr from sudo which %s: %s', program, line)

        return None

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None
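A hypothetical call site for which(), resolving a binary before building a command:

import logging

log = logging.getLogger(__name__)
postqueue_path = which('postqueue', use_sudo=False, log=log)
if postqueue_path is None:
    log.error('postqueue not found on PATH')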
Example #6
    def _get_queue_count(self, directory, queues, tags):
        for queue in queues:
            queue_path = os.path.join(directory, queue)
            if not os.path.exists(queue_path):
                raise Exception('{} does not exist'.format(queue_path))

            count = 0
            if os.geteuid() == 0:
                # dd-agent is running as root (not recommended)
                count = sum(len(files) for root, dirs, files in os.walk(queue_path))
            else:
                # can dd-agent user run sudo?
                test_sudo = os.system('setsid sudo -l < /dev/null')
                if test_sudo == 0:
                    # default to `root` for backward compatibility
                    postfix_user = self.init_config.get('postfix_user', 'root')
                    cmd = ['sudo', '-u', postfix_user, 'find', queue_path, '-type', 'f']
                    output, _, _ = get_subprocess_output(cmd, self.log, False)
                    count = len(output.splitlines())
                else:
                    raise Exception('The dd-agent user does not have sudo access')

            # emit an individually tagged metric
            self.gauge('postfix.queue.size', count,
                       tags=tags + ['queue:{}'.format(queue), 'instance:{}'.format(os.path.basename(directory))])
Example #7
    def _add_conntrack_stats_metrics(self, conntrack_path, tags):
        """
        Parse the output of conntrack -S
        Add the parsed metrics
        """
        try:
            output, _, _ = get_subprocess_output(
                ["sudo", conntrack_path, "-S"], self.log)
            # conntrack -S sample:
            # cpu=0 found=27644 invalid=19060 ignore=485633411 insert=0 insert_failed=1 \
            #       drop=1 early_drop=0 error=0 search_restart=39936711
            # cpu=1 found=21960 invalid=17288 ignore=475938848 insert=0 insert_failed=1 \
            #       drop=1 early_drop=0 error=0 search_restart=36983181

            lines = output.splitlines()

            for line in lines:
                cols = line.split()
                cpu_num = cols[0].split('=')[-1]
                cpu_tag = ['cpu:{}'.format(cpu_num)]
                cols = cols[1:]

                for cell in cols:
                    metric, value = cell.split('=')
                    self.monotonic_count(
                        'system.net.conntrack.{}'.format(metric),
                        int(value),
                        tags=tags + cpu_tag)
        except SubprocessOutputEmptyError:
            self.log.debug("Couldn't use {} to get conntrack stats".format(
                conntrack_path))
Example #8
    def _exec_ping(self, timeout, target_host):
        if platform.system() == "Windows":  # pragma: nocover
            countOption = "-n"
            timeoutOption = "-w"
            # The timeout option is in ms on Windows
            # https://docs.microsoft.com/en-us/windows-server/administration/windows-commands/ping
            timeout = timeout * 1000
        elif platform.system() == "Darwin":
            countOption = "-c"
            timeoutOption = "-W"  # Also in ms on Mac
            timeout = timeout * 1000
        else:
            # The timeout option is in seconds on Linux, leaving timeout as is
            # https://linux.die.net/man/8/ping
            countOption = "-c"
            timeoutOption = "-W"

        self.log.debug("Running: ping {} {} {} {} {}".format(
            countOption, "1", timeoutOption, str(timeout), target_host))

        lines, err, retcode = get_subprocess_output([
            "ping", countOption, "1", timeoutOption,
            str(timeout), target_host
        ],
                                                    self.log,
                                                    raise_on_empty_output=True)
        self.log.debug("ping returned {} - {} - {}".format(
            retcode, lines, err))
        if retcode != 0:
            raise CheckException("ping returned {}: {}".format(retcode, err))

        return lines
Example #9
    def _collect_raw(self, ceph_cmd, ceph_cluster, instance):
        use_sudo = _is_affirmative(instance.get('use_sudo', False))
        if use_sudo:
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo != 0:
                raise Exception('The dd-agent user does not have sudo access')
            ceph_args = 'sudo {}'.format(ceph_cmd)
        else:
            ceph_args = ceph_cmd

        ceph_args = '{} --cluster {}'.format(ceph_args, ceph_cluster)

        raw = {}
        for cmd in ('mon_status', 'status', 'df detail', 'osd pool stats',
                    'osd perf', 'health detail'):
            try:
                args = '{} {} -fjson'.format(ceph_args, cmd)
                output, _, _ = get_subprocess_output(args.split(), self.log)
                res = json.loads(output)
            except Exception as e:
                self.log.warning('Unable to parse data from cmd=%s: %s' %
                                 (cmd, str(e)))
                continue

            name = cmd.replace(' ', '_')
            raw[name] = res

        return raw
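Since ceph_args is assembled as a single string and later split(), a ceph_cmd path containing spaces would be broken apart; building the argument list directly avoids that (a sketch; it assumes ceph_cmd is a single executable path):

        ceph_args = ['sudo', ceph_cmd] if use_sudo else [ceph_cmd]
        ceph_args += ['--cluster', ceph_cluster]
        for cmd in ('mon_status', 'status', 'df detail'):
            args = ceph_args + cmd.split() + ['-fjson']
            output, _, _ = get_subprocess_output(args, self.log)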
Example #10
    def check(self, instance):
        stat_out, err, _ = get_subprocess_output(self.nfs_cmd, self.log)
        all_devices = []
        this_device = []
        custom_tags = instance.get("tags", [])

        for l in stat_out.splitlines():
            if not l:
                continue
            elif l.find('mounted on') >= 0 and len(this_device) > 0:
                # if it's a new device, create the device and add it to the array
                device = Device(this_device, self.log)
                all_devices.append(device)
                this_device = []
            this_device.append(l.strip().split())

        # Add the last device into the array
        device = Device(this_device, self.log)
        all_devices.append(device)

        # Disregard the first half of device stats (report 1 of 2)
        # as that is the moving average
        all_devices = all_devices[len(all_devices) // 2:]

        for device in all_devices:
            device.send_metrics(self.gauge, custom_tags)
Example #11
    def check(self, instance):
        check_command = instance.get('check_command')
        metric_namespace = instance.get('metric_namespace')
        tags = instance.get('tags', [])
        create_service_check = instance.get('create_service_check', False)

        if not check_command:
            raise CheckException(
                "Configuration error. Missing check_command definition, please fix nagios_plugin_wrapper.yaml"
            )

        if not metric_namespace:
            raise CheckException(
                "Configuration error. Missing metric_namespace definition, please fix nagios_plugin_wrapper.yaml"
            )

        raw_output = None
        err = None
        ret = None
        try:
            raw_output, err, ret = get_subprocess_output(
                check_command, self.log)
        except Exception as e:
            error = "Failed to execute check_command {check_command} - {error}".format(
                check_command=check_command, error=e)
            self.log.warning(error)
            raise CheckException(
                "check_command '{check_command}' failed to execute, see agent.log for more information."
                .format(check_command=check_command))

        output, metrics = self._parse_output(raw_output)
        if metrics:
            metrics = self._parse_perfdata(metrics)
            for label, value in metrics:
                label = self._sanitize(label)
                self.log.debug(
                    "metric_namespace: {namespace} | tags: {tags} | value: {value} | ret_code: {ret}"
                    .format(namespace=metric_namespace,
                            tags=tags,
                            value=value,
                            ret=ret))
                self.gauge('{metric_namespace}.{label}'.format(
                    metric_namespace=metric_namespace, label=label),
                           value,
                           tags=tags)

        if output and create_service_check:
            if ret == 0:
                status = AgentCheck.OK
            elif ret == 1:
                status = AgentCheck.WARNING
            elif ret == 2:
                status = AgentCheck.CRITICAL
            else:
                status = AgentCheck.UNKNOWN
            self.service_check(metric_namespace,
                               status,
                               tags=tags,
                               message=output.rstrip())
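Nagios plugins emit perfdata after a pipe, e.g. `PING OK - rta 0.80ms|rta=0.80ms;200;500;0 pl=0%;40;80;0`; a minimal parser in the spirit of _parse_perfdata (a sketch; the check's own parser may differ):

import re

def parse_perfdata(perfdata):
    # "rta=0.80ms;200;500;0 pl=0%;40;80;0" -> [('rta', 0.8), ('pl', 0.0)]
    metrics = []
    for item in perfdata.split():
        label, _, data = item.partition('=')
        match = re.match(r'-?[\d.]+', data)
        if match:
            metrics.append((label, float(match.group(0))))
    return metrics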
Example #12
    def _collect_metadata(self):
        pc_output, _, _ = get_subprocess_output(['postconf', 'mail_version'],
                                                self.log, False)
        self.log.debug('postconf mail_version output: %s', pc_output)

        postfix_version = pc_output.strip('\n').split('=')[1].strip()
        self.log.debug('Postfix Version: %s', postfix_version)

        self.set_metadata('version', postfix_version)
Example #13
    def _collect_metadata(self):
        try:
            pc_output, _, _ = get_subprocess_output(
                ['postconf', 'mail_version'], self.log, False)
        except Exception as e:
            self.log.warning('unable to call `postconf mail_version`: %s', e)
            return

        self.log.debug('postconf mail_version output: %s', pc_output)

        postfix_version = pc_output.strip('\n').split('=')[1].strip()
        self.log.debug('Postfix Version: %s', postfix_version)

        self.set_metadata('version', postfix_version)
Example #14
    def check(self, instance):
        config = self.get_instance_config(instance)
        command = config.get("command")
        metric_name = config.get("metric_name")
        metric_type = config.get("metric_type")
        tags = config.get("tags")

        output, err, retcode = get_subprocess_output(
            command, self.log, raise_on_empty_output=True)

        if "MySQL Replication Health is OK." in output:
            self.gauge(metric_name, 1, tags=tags)
        else:
            self.gauge(metric_name, 0, tags=tags)
Example #15
 def check(self, instance):
     output, err, retcode = get_subprocess_output(
         [
             "powershell.exe",
             "C:\ProgramData\Datadog\checks.d\custom_WinServMonitor.ps1"
         ],
         self.log,
         raise_on_empty_output=True)
     if output.startswith('OK'):
         self.gauge('custom_WinServMonitor.autoNotStarted', 0)
     else:
         for service in output.split(';'):
             self.gauge('custom_WinServMonitor.autoNotStarted', 1,
                        ['service:' + service.strip()])
Example #16
 def collect_metrics_manually(self):
     df_out, _, _ = get_subprocess_output(self.DF_COMMAND + ['-k'], self.log)
     self.log.debug(df_out)
     for device in self._list_devices(df_out):
         self.log.debug("Passed: {0}".format(device))
         tags = [device[1], 'filesystem:{}'.format(device[1])] if self._tag_by_filesystem else []
         tags.extend(self._custom_tags)
         device_name = device[-1] if self._use_mount else device[0]
         # apply device/mountpoint specific tags
         for regex, device_tags in self._device_tag_re:
             if regex.match(device_name):
                 tags += device_tags
         for metric_name, value in self._collect_metrics_manually(device).items():
             self.gauge(metric_name, value, tags=tags,
                        device_name=device_name)
Example #17
    def get_license_usage(self):
        sacli_path = self.init_config.get(
            "sacli_path", "/usr/local/openvpn_as/scripts/sacli")
        sacli_licusage = ["sudo", sacli_path, "LicUsage"]

        out, err, retcode = get_subprocess_output(sacli_licusage,
                                                  self.log,
                                                  raise_on_empty_output=True)
        out_split = out[3:-2].splitlines()
        lic_usage = [int(n.strip(", ")) for n in out_split]
        return {
            "used": lic_usage[0],
            "total": lic_usage[1],
            "available": lic_usage[1] - lic_usage[0],
        }
Example #18
    def _get_version_info(self, varnishstat_path):
        # Get the varnish version from varnishstat
        output, error, _ = get_subprocess_output(varnishstat_path + ["-V"],
                                                 self.log,
                                                 raise_on_empty_output=False)

        # Assumptions regarding varnish's version
        varnishstat_format = "json"
        raw_version = None

        # NB: re flags such as re.MULTILINE belong in re.compile(); a second
        # positional argument to search() is interpreted as a start position.
        m1 = self.version_pattern.search(output)
        # v2 prints the version on stderr, v3 on stdout
        m2 = self.version_pattern.search(error)

        if m1 is None and m2 is None:
            self.log.warning(
                "Cannot determine the version of varnishstat, assuming 3 or greater"
            )
            self.warning(
                "Cannot determine the version of varnishstat, assuming 3 or greater"
            )
        else:
            if m1 is not None:
                raw_version = m1.group()
            elif m2 is not None:
                raw_version = m2.group()

        self.log.debug("Varnish version: %s", raw_version)

        if raw_version:
            self.set_metadata('version', raw_version)

        if raw_version is None:
            raw_version = '3.0.0'

        version = LooseVersion(raw_version)

        # Location of varnishstat
        if version < LooseVersion('3.0.0'):
            varnishstat_format = "text"
        elif version < LooseVersion(
                '5.0.0'):  # we default to json starting version 5.0.0
            varnishstat_format = "xml"

        return version, varnishstat_format
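version_pattern is defined elsewhere on the check class; a plausible stand-in, with its flags supplied at compile time (an assumption; the real pattern may differ):

import re

# Matches a dotted version such as "4.1.1" in the `varnishstat -V` banner.
version_pattern = re.compile(r'(\d+\.\d+\.\d+)', re.MULTILINE)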
Example #19
    def _get_sendmail_stats(self, sendmail_command, use_sudo):

        if not os.path.exists(sendmail_command):
            raise Exception('{} does not exist'.format(sendmail_command))

        self.log.debug(sendmail_command)

        # mailq sample output. sendmail output is similar.
        ##
        # MSP Queue status...
        # /var/spool/mqueue-client is empty
        #    Total requests: 0
        # MTA Queue status...
        # /var/spool/mqueue is empty
        #     Total requests: 0

        # if we want to use sendmail, we need to append -bp to it
        # https://www.electrictoolbox.com/show-sendmail-mail-queue/
        if "sendmail" in sendmail_command:
            command = [sendmail_command, '-bp']
        else:
            command = [sendmail_command]

        # Listing the directory might require sudo privileges
        if use_sudo:
            # os.system returns the exit status rather than raising, so check it
            # to confirm the dd-agent user can actually run sudo
            test_sudo = os.system('setsid sudo -l < /dev/null')
            if test_sudo != 0:
                raise Exception('The dd-agent user does not have sudo access')
            command.insert(0, 'sudo')

        self.log.debug(command)

        mail_queue, err, retcode = get_subprocess_output(
            command, self.log, False)
        self.log.debug("Error: %s", err)
        count = mail_queue.splitlines()
        # Retrieve the total number of requests from the last line
        queue_count = int(count[-1].split()[-1])
        self.log.info("Number of mails in the queue: %s", queue_count)

        return queue_count
Example #20
    def get_process_states(self):
        state_counts = defaultdict(int)
        prio_counts = defaultdict(int)
        ps = get_subprocess_output(['ps', '--no-header', '-eo', 'stat'], self.log)
        for line in ps[0].splitlines():
            # Each process state is a flag in a list of characters. See ps(1) for details.
            for flag in list(line):
                if flag in PROCESS_STATES:
                    state_counts[PROCESS_STATES[flag]] += 1
                elif flag in PROCESS_PRIOS:
                    prio_counts[PROCESS_PRIOS[flag]] += 1

        for state in state_counts:
            state_tags = list(self.tags)
            state_tags.append("state:" + state)
            self.gauge('system.processes.states', float(state_counts[state]), state_tags)

        for prio in prio_counts:
            prio_tags = list(self.tags)
            prio_tags.append("priority:" + prio)
            self.gauge('system.processes.priorities', float(prio_counts[prio]), prio_tags)
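PROCESS_STATES and PROCESS_PRIOS are defined elsewhere in the check; hypothetical tables mapping ps(1) stat flags to readable names could look like:

PROCESS_STATES = {
    'D': 'uninterruptible',  # usually waiting on I/O
    'R': 'runnable',
    'S': 'sleeping',
    'T': 'stopped',
    'Z': 'zombie',
}

PROCESS_PRIOS = {
    '<': 'high',    # high-priority (not nice)
    'N': 'low',     # low-priority (niced)
    'L': 'locked',  # has pages locked into memory
}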
Example #21
 def check(self, instance):
     metric = "ssl.expire_in_days"
     site = instance['site']
     tag = "site:" + site  # generate the tags
     command = [
         "timeout", "10", "bash", "-c",
         "openssl s_client -showcerts -servername " + site + " -connect " +
         site +
         ":443 2>/dev/null | openssl x509 -noout -dates | grep notAfter | cut -f 2 -d\= | xargs -0 -I arg date -d arg '+%s'"
     ]
     (output, err,
      returncode) = get_subprocess_output(command, self.log, False)
     if output:
         output = output.rstrip("\n")
         d0 = int(time.time())
         d1 = int(output)
         delta = d1 - d0
         days = delta / 24 / 60 / 60  # convert the timestamp to days
         self.gauge(metric, int(days), tags=[tag])
     else:
         self.gauge(metric, -1, tags=[tag])
Example #22
  def check(self, instance):
    cmd = ["/usr/local/bin/lighthouse", instance["url"], "--output", "json", "--quiet", "--chrome-flags='--headless'"]

    json_string, error_message, exit_code = get_subprocess_output(cmd, self.log, raise_on_empty_output=False)
    
    # check for error since we have raise_on_empty_output set to False
    if exit_code > 0:
      raise Exception(json_string, error_message, exit_code)

    try:
      data = json.loads(json_string)
      score_accessibility = data["categories"]["accessibility"]["score"] * 100
      score_best_practices = data["categories"]["best-practices"]["score"] * 100
      score_performance = data["categories"]["performance"]["score"] * 100
      score_pwa = data["categories"]["pwa"]["score"] * 100
      score_seo = data["categories"]["seo"]["score"] * 100
    except Exception:
      self.log.warn("lighthouse response JSON structure different than expected")
      raise Exception(json_string, error_message, exit_code)

    # add tags
    try:
      tags = instance['tags']
      if not isinstance(tags, list):
        self.log.warning('The tags list in the lighthouse check is not configured properly')
        tags = []
    except KeyError:
      tags = []

    tags.append("lighthouse_url:%s" % instance['url'])
    tags.append("lighthouse_name:%s" % instance['name'])

    self.gauge("custom_lighthouse.accessibility", score_accessibility, tags=tags)
    self.gauge("custom_lighthouse.best_practices", score_best_practices, tags=tags)
    self.gauge("custom_lighthouse.performance", score_performance, tags=tags)
    self.gauge("custom_lighthouse.pwa", score_pwa, tags=tags)
    self.gauge("custom_lighthouse.seo", score_seo, tags=tags)
Example #23
    def _check_bsd(self, instance):
        netstat_flags = ['-i', '-b']

        custom_tags = instance.get('tags', [])

        # FreeBSD's netstat truncates device names unless you pass '-W'
        if Platform.is_freebsd():
            netstat_flags.append('-W')

        try:
            output, _, _ = get_subprocess_output(["netstat"] + netstat_flags, self.log)
            lines = output.splitlines()
            # Name  Mtu   Network       Address            Ipkts Ierrs     Ibytes    Opkts Oerrs     Obytes  Coll
            # lo0   16384 <Link#1>                        318258     0  428252203   318258     0  428252203     0
            # lo0   16384 localhost   fe80:1::1           318258     -  428252203   318258     -  428252203     -
            # lo0   16384 127           localhost         318258     -  428252203   318258     -  428252203     -
            # lo0   16384 localhost   ::1                 318258     -  428252203   318258     -  428252203     -
            # gif0* 1280  <Link#2>                             0     0          0        0     0          0     0
            # stf0* 1280  <Link#3>                             0     0          0        0     0          0     0
            # en0   1500  <Link#4>    04:0c:ce:db:4e:fa 20801309     0 13835457425 15149389     0 11508790198     0
            # en0   1500  seneca.loca fe80:4::60c:ceff: 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  192.168.1     192.168.1.63    20801309     - 13835457425 15149389     - 11508790198     -
            # en0   1500  2001:470:1f 2001:470:1f07:11d 20801309     - 13835457425 15149389     - 11508790198     -
            # p2p0  2304  <Link#5>    06:0c:ce:db:4e:fa        0     0          0        0     0          0     0
            # ham0  1404  <Link#6>    7a:79:05:4d:bf:f5    30100     0    6815204    18742     0    8494811     0
            # ham0  1404  5             5.77.191.245       30100     -    6815204    18742     -    8494811     -
            # ham0  1404  seneca.loca fe80:6::7879:5ff:    30100     -    6815204    18742     -    8494811     -
            # ham0  1404  2620:9b::54 2620:9b::54d:bff5    30100     -    6815204    18742     -    8494811     -

            headers = lines[0].split()

            # Given the irregular structure of the table above, better to parse from the end of each line
            # Verify headers first
            #          -7       -6       -5        -4       -3       -2        -1
            for h in ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes", "Coll"):
                if h not in headers:
                    self.log.error("%s not found in %s; cannot parse" % (h, headers))
                    return False

            current = None
            for l in lines[1:]:
                # Another header row, abort now, this is IPv6 land
                if "Name" in l:
                    break

                x = l.split()
                if len(x) == 0:
                    break

                iface = x[0]
                if iface.endswith("*"):
                    iface = iface[:-1]
                if iface == current:
                    # skip multiple lines of same interface
                    continue
                else:
                    current = iface

                # Filter inactive interfaces
                if self._parse_value(x[-5]) or self._parse_value(x[-2]):
                    iface = current
                    metrics = {
                        'bytes_rcvd': self._parse_value(x[-5]),
                        'bytes_sent': self._parse_value(x[-2]),
                        'packets_in.count': self._parse_value(x[-7]),
                        'packets_in.error': self._parse_value(x[-6]),
                        'packets_out.count': self._parse_value(x[-4]),
                        'packets_out.error': self._parse_value(x[-3]),
                    }
                    self._submit_devicemetrics(iface, metrics, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting connection stats.")

        try:
            netstat, _, _ = get_subprocess_output(["netstat", "-s", "-p" "tcp"], self.log)
            # 3651535 packets sent
            #         972097 data packets (615753248 bytes)
            #         5009 data packets (2832232 bytes) retransmitted
            #         0 resends initiated by MTU discovery
            #         2086952 ack-only packets (471 delayed)
            #         0 URG only packets
            #         0 window probe packets
            #         310851 window update packets
            #         336829 control packets
            #         0 data packets sent after flow control
            #         3058232 checksummed in software
            #         3058232 segments (571218834 bytes) over IPv4
            #         0 segments (0 bytes) over IPv6
            # 4807551 packets received
            #         1143534 acks (for 616095538 bytes)
            #         165400 duplicate acks
            #         ...

            self._submit_regexed_values(netstat, BSD_TCP_METRICS, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting TCP stats.")
Example #24
    def _check_linux(self, instance):
        """
        _check_linux can be run inside a container and still collects the network metrics from the host
        For that procfs_path can be set to something like "/host/proc"
        When a custom procfs_path is set, the collect_connection_state option is ignored
        """
        proc_location = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
        custom_tags = instance.get('tags', [])

        if Platform.is_containerized() and proc_location != "/proc":
            proc_location = "%s/1" % proc_location

        if self._is_collect_cx_state_runnable(proc_location):
            try:
                self.log.debug("Using `ss` to collect connection state")
                # Try using `ss` for increased performance over `netstat`
                for ip_version in ['4', '6']:
                    for protocol in ['tcp', 'udp']:
                        # Call `ss` for each IP version because there's no built-in way of distinguishing
                        # between the IP versions in the output
                        # Also calls `ss` for each protocol, because on some systems (e.g. Ubuntu 14.04), there is a
                        # bug that print `tcp` even if it's `udp`
                        output, _, _ = get_subprocess_output(["ss", "-n", "-{0}".format(protocol[0]),
                                                              "-a", "-{0}".format(ip_version)], self.log)
                        lines = output.splitlines()

                        # State      Recv-Q Send-Q     Local Address:Port       Peer Address:Port
                        # UNCONN     0      0              127.0.0.1:8125                  *:*
                        # ESTAB      0      0              127.0.0.1:37036         127.0.0.1:8125
                        # UNCONN     0      0        fe80::a00:27ff:fe1c:3c4:123          :::*
                        # TIME-WAIT  0      0          90.56.111.177:56867        46.105.75.4:143
                        # LISTEN     0      0       ::ffff:127.0.0.1:33217  ::ffff:127.0.0.1:7199
                        # ESTAB      0      0       ::ffff:127.0.0.1:58975  ::ffff:127.0.0.1:2181

                        metrics = self._parse_linux_cx_state(lines[1:], self.tcp_states['ss'], 0, protocol=protocol,
                                                             ip_version=ip_version)
                        # Only send the metrics which match the loop iteration's ip version
                        for stat, metric in self.cx_state_gauge.items():
                            if stat[0].endswith(ip_version) and stat[0].startswith(protocol):
                                self.gauge(metric, metrics.get(metric), tags=custom_tags)

            except OSError:
                self.log.info("`ss` not found: using `netstat` as a fallback")
                output, _, _ = get_subprocess_output(["netstat", "-n", "-u", "-t", "-a"], self.log)
                lines = output.splitlines()
                # Active Internet connections (w/o servers)
                # Proto Recv-Q Send-Q Local Address           Foreign Address         State
                # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
                # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
                # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
                # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
                # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
                # udp        0      0 0.0.0.0:123             0.0.0.0:*
                # udp6       0      0 :::41458                :::*

                metrics = self._parse_linux_cx_state(lines[2:], self.tcp_states['netstat'], 5)
                for metric, value in metrics.items():
                    self.gauge(metric, value, tags=custom_tags)
            except SubprocessOutputEmptyError:
                self.log.exception("Error collecting connection stats.")

        proc_dev_path = "{}/net/dev".format(proc_location)
        with open(proc_dev_path, 'r') as proc:
            lines = proc.readlines()
        # Inter-|   Receive                                                 |  Transmit
        #  face |bytes     packets errs drop fifo frame compressed multicast|bytes       packets errs drop fifo colls carrier compressed # noqa: E501
        #     lo:45890956   112797   0    0    0     0          0         0    45890956   112797    0    0    0     0       0          0 # noqa: E501
        #   eth0:631947052 1042233   0   19    0   184          0      1206  1208625538  1320529    0    0    0     0       0          0 # noqa: E501
        #   eth1:       0        0   0    0    0     0          0         0           0        0    0    0    0     0       0          0 # noqa: E501
        for l in lines[2:]:
            cols = l.split(':', 1)
            x = cols[1].split()
            # Filter inactive interfaces
            if self._parse_value(x[0]) or self._parse_value(x[8]):
                iface = cols[0].strip()
                metrics = {
                    'bytes_rcvd': self._parse_value(x[0]),
                    'bytes_sent': self._parse_value(x[8]),
                    'packets_in.count': self._parse_value(x[1]),
                    'packets_in.error': self._parse_value(x[2]) + self._parse_value(x[3]),
                    'packets_out.count': self._parse_value(x[9]),
                    'packets_out.error': self._parse_value(x[10]) + self._parse_value(x[11]),
                }
                self._submit_devicemetrics(iface, metrics, custom_tags)

        netstat_data = {}
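        # Both files pair each header line with a matching data line, e.g. (abridged):
        #   Tcp: RtoAlgorithm RtoMin RtoMax MaxConn ActiveOpens ... OutSegs RetransSegs ...
        #   Tcp: 1 200 120000 -1 57 ... 4754 0 ...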
        for f in ['netstat', 'snmp']:
            proc_data_path = "{}/net/{}".format(proc_location, f)
            try:
                with open(proc_data_path, 'r') as netstat:
                    while True:
                        n_header = netstat.readline()
                        if not n_header:
                            break  # No more? Abort!
                        n_data = netstat.readline()

                        h_parts = n_header.strip().split(' ')
                        h_values = n_data.strip().split(' ')
                        ns_category = h_parts[0][:-1]
                        netstat_data[ns_category] = {}
                        # Turn the data into a dictionary
                        for idx, hpart in enumerate(h_parts[1:]):
                            netstat_data[ns_category][hpart] = h_values[idx + 1]
            except IOError:
                # On Openshift, /proc/net/snmp is only readable by root
                self.log.debug("Unable to read %s.", proc_data_path)

        nstat_metrics_names = {
            'Tcp': {
                'RetransSegs': 'system.net.tcp.retrans_segs',
                'InSegs': 'system.net.tcp.in_segs',
                'OutSegs': 'system.net.tcp.out_segs',
            },
            'TcpExt': {
                'ListenOverflows': 'system.net.tcp.listen_overflows',
                'ListenDrops': 'system.net.tcp.listen_drops',
                'TCPBacklogDrop': 'system.net.tcp.backlog_drops',
                'TCPRetransFail': 'system.net.tcp.failed_retransmits',
            },
            'Udp': {
                'InDatagrams': 'system.net.udp.in_datagrams',
                'NoPorts': 'system.net.udp.no_ports',
                'InErrors': 'system.net.udp.in_errors',
                'OutDatagrams': 'system.net.udp.out_datagrams',
                'RcvbufErrors': 'system.net.udp.rcv_buf_errors',
                'SndbufErrors': 'system.net.udp.snd_buf_errors',
                'InCsumErrors': 'system.net.udp.in_csum_errors'
            }
        }

        # Submit the known metrics for each category that was parsed
        for k in nstat_metrics_names:
            for met in nstat_metrics_names[k]:
                if met in netstat_data.get(k, {}):
                    self._submit_netmetric(nstat_metrics_names[k][met], self._parse_value(netstat_data[k][met]),
                                           tags=custom_tags)
Example #25
    def check(self, instance):
        # Allow to specify a complete command for nodetool such as `docker exec container nodetool`
        nodetool_cmd = shlex.split(instance.get("nodetool", self.nodetool_cmd))
        host = instance.get("host", DEFAULT_HOST)
        port = instance.get("port", DEFAULT_PORT)
        keyspaces = instance.get("keyspaces", [])
        username = instance.get("username", "")
        password = instance.get("password", "")
        ssl = instance.get("ssl", False)
        tags = instance.get("tags", [])

        # Flag to send service checks only once and not for every keyspace
        send_service_checks = True

        if not keyspaces:
            self.log.info(
                "No keyspaces set in the configuration: no metrics will be sent"
            )

        for keyspace in keyspaces:
            # Build the nodetool command
            cmd = nodetool_cmd + ['-h', host, '-p', str(port)]
            if username and password:
                cmd += ['-u', username, '-pw', password]
            # add ssl if requested
            if ssl:
                cmd += ['--ssl']
            cmd += ['status', '--', keyspace]

            # Execute the command
            out, err, _ = get_subprocess_output(cmd,
                                                self.log,
                                                False,
                                                log_debug=False)
            if err or 'Error:' in out:
                self.log.error('Error executing nodetool status: %s', err
                               or out)
                continue
            nodes = self._process_nodetool_output(out)

            percent_up_by_dc = defaultdict(float)
            percent_total_by_dc = defaultdict(float)
            # Send the stats per node and compute the stats per datacenter
            for node in nodes:

                node_tags = [
                    'node_address:%s' % node['address'],
                    'node_id:%s' % node['id'],
                    'datacenter:%s' % node['datacenter'],
                    'rack:%s' % node['rack']
                ]

                # nodetool prints `?` when it can't compute the value of `owns` for certain keyspaces (e.g. system)
                # don't send metric in this case
                if node['owns'] != '?':
                    owns = float(node['owns'])
                    if node['status'] == 'U':
                        percent_up_by_dc[node['datacenter']] += owns
                    percent_total_by_dc[node['datacenter']] += owns
                    self.gauge('cassandra.nodetool.status.owns',
                               owns,
                               tags=tags + node_tags +
                               ['keyspace:%s' % keyspace])

                # Send service check only once for each node
                if send_service_checks:
                    status = AgentCheck.OK if node[
                        'status'] == 'U' else AgentCheck.CRITICAL
                    self.service_check('cassandra.nodetool.node_up', status,
                                       tags + node_tags)

                self.gauge('cassandra.nodetool.status.status',
                           1 if node['status'] == 'U' else 0,
                           tags=tags + node_tags)
                self.gauge('cassandra.nodetool.status.load',
                           float(node['load']) * TO_BYTES[node['load_unit']],
                           tags=tags + node_tags)

            # All service checks have been sent, don't resend
            send_service_checks = False

            # Send the stats per datacenter
            for datacenter, percent_up in percent_up_by_dc.items():
                self.gauge(
                    'cassandra.nodetool.status.replication_availability',
                    percent_up,
                    tags=tags +
                    ['keyspace:%s' % keyspace,
                     'datacenter:%s' % datacenter])
            for datacenter, percent_total in percent_total_by_dc.items():
                self.gauge(
                    'cassandra.nodetool.status.replication_factor',
                    int(round(percent_total / 100)),
                    tags=tags +
                    ['keyspace:%s' % keyspace,
                     'datacenter:%s' % datacenter])
Example #26
 def _get_lighthouse_report(command, logger, raise_on_empty=False):
     json, err_msg, exit_code = get_subprocess_output(
         command, logger, raise_on_empty_output=raise_on_empty)
     return json, err_msg, exit_code
Example #27
    def check(self, instance):
        # varnishstat is required; fail fast when it isn't configured
        if instance.get("varnishstat", None) is None:
            raise Exception("varnishstat is not configured")
        custom_tags = instance.get('tags', [])
        if custom_tags is None:
            custom_tags = []
        else:
            custom_tags = list(set(custom_tags))
        # Split the varnishstat command so that additional arguments can be passed in
        # In order to support monitoring a Varnish instance which is running as a Docker
        # container we need to wrap commands (varnishstat, varnishadm) with scripts which
        # perform a docker exec on the running container. This works fine when running a
        # single container on the host but breaks down when attempting to use the auto
        # discovery feature. This change allows for passing in additional parameters to
        # the script (i.e. %%host%%) so that the command is properly formatted and the
        # desired container is queried.
        varnishstat_path = shlex.split(instance.get("varnishstat"))
        name = instance.get('name')
        metrics_filter = instance.get("metrics_filter", [])
        if not isinstance(metrics_filter, list):
            raise Exception("The parameter 'metrics_filter' must be a list")

        # Get version and version-specific args from varnishstat -V.
        version, varnishstat_format = self._get_version_info(varnishstat_path)

        cmd = varnishstat_path + [
            self.VARNISHSTAT_FORMAT_OPTION[varnishstat_format]
        ]
        for metric in metrics_filter:
            cmd.extend(["-f", metric])

        if name is not None:
            cmd.extend(['-n', name])
            tags = custom_tags + [u'varnish_name:%s' % name]
        else:
            tags = custom_tags + [u'varnish_name:default']

        output, _, _ = get_subprocess_output(cmd, self.log)

        self._parse_varnishstat(output, varnishstat_format, tags)

        # Parse service checks from varnishadm.
        if instance.get("varnishadm", None):
            # Split the varnishadm command so that additional arguments can be passed in
            # In order to support monitoring a Varnish instance which is running as a Docker
            # container we need to wrap commands (varnishstat, varnishadm) with scripts which
            # perform a docker exec on the running container. This works fine when running a
            # single container on the host but breaks down when attempting to use the auto
            # discovery feature. This change allows for passing in additional parameters to
            # the script (i.e. %%host%%) so that the command is properly formatted and the
            # desired container is queried.
            varnishadm_path = shlex.split(instance.get('varnishadm'))
            secretfile_path = instance.get('secretfile', '/etc/varnish/secret')

            daemon_host = instance.get('daemon_host', 'localhost')
            daemon_port = instance.get('daemon_port', '6082')

            cmd = []
            if geteuid() != 0:
                cmd.append('sudo')

            if version < LooseVersion('4.1.0'):
                cmd.extend(varnishadm_path +
                           ['-S', secretfile_path, 'debug.health'])
            else:
                cmd.extend(varnishadm_path + [
                    '-T', '{}:{}'.format(daemon_host, daemon_port), '-S',
                    secretfile_path, 'backend.list', '-p'
                ])

            try:
                output, err, _ = get_subprocess_output(cmd, self.log)
            except OSError as e:
                self.log.error(
                    "There was an error running varnishadm. Make sure 'sudo' is available. %s",
                    e)
                output = None
            if err:
                self.log.error(
                    'Error getting service check from varnishadm: %s', err)

            if output:
                self._parse_varnishadm(output, custom_tags)