Ejemplo n.º 1
0
    def check(self, instance):
        """
        Read the instance options, then run the collection routine that
        matches the current platform.

        A `None` instance is treated as an empty configuration. When the
        platform matches none of the branches below, nothing is collected.
        """
        config = {} if instance is None else instance

        self._excluded_ifaces = config.get('excluded_interfaces', [])
        self._collect_cx_state = config.get('collect_connection_state', False)
        self._collect_rate_metrics = config.get('collect_rate_metrics', True)
        self._collect_count_metrics = config.get('collect_count_metrics', False)

        # Decides whether connection states get split or combined,
        # along with a few other things
        self._setup_metrics(config)

        # Optional regular expression used to exclude interfaces by name
        self._exclude_iface_re = None
        pattern = config.get('excluded_interface_re', None)
        if pattern:
            self.log.debug("Excluding network devices matching: %s" % pattern)
            self._exclude_iface_re = re.compile(pattern)

        # The first matching platform wins
        if Platform.is_linux():
            self._check_linux(config)
        elif Platform.is_bsd():
            self._check_bsd(config)
        elif Platform.is_solaris():
            self._check_solaris(config)
        elif Platform.is_windows():
            self._check_psutil(config)
Ejemplo n.º 2
0
    def psutil_wrapper(self, process, method, accessors, try_sudo, *args,
                       **kwargs):
        """
        Call `process.<method>(*args, **kwargs)` and shape the outcome.

        * When `accessors` is None, the value returned by the call is handed
          back as is (None if the call could not be completed).
        * Otherwise a dictionary is handed back, mapping each accessor name
          found on the returned value to the matching attribute.

        When access to `num_fds` is denied on a unix platform and `try_sudo`
        is set, the file descriptors are counted by listing `/proc/<pid>/fd/`
        through `sudo`; in that case the count is returned.
        """
        wants_raw_value = accessors is None
        result = None if wants_raw_value else {}

        # Methods known to fail on the current platform are not attempted
        known_to_fail = (
            (method == 'num_fds' and not Platform.is_unix())
            or (method == 'num_handles' and not Platform.is_win32()))
        if known_to_fail:
            return result

        try:
            outcome = getattr(process, method)(*args, **kwargs)
            if wants_raw_value:
                result = outcome
            else:
                for accessor in accessors:
                    try:
                        result[accessor] = getattr(outcome, accessor)
                    except AttributeError:
                        self.log.debug(
                            "psutil.%s().%s attribute does not exist", method,
                            accessor)
        except (NotImplementedError, AttributeError):
            self.log.debug("psutil method %s not implemented", method)
        except psutil.AccessDenied:
            self.log.debug("psutil was denied access for method %s", method)
            if method == 'num_fds' and Platform.is_unix() and try_sudo:
                try:
                    # Granting the matching sudo policy on unix platforms
                    # is up to the agent's packager
                    fd_dir = '/proc/{}/fd/'.format(process.pid)
                    listing = subprocess.check_output(['sudo', 'ls', fd_dir])
                    # one line of `ls` output per open file descriptor
                    result = len(listing.splitlines())
                except subprocess.CalledProcessError as e:
                    self.log.exception(
                        "trying to retrieve %s with sudo failed with return code %s",
                        method, e.returncode)
                except Exception:
                    self.log.exception(
                        "trying to retrieve %s with sudo also failed", method)
        except psutil.NoSuchProcess:
            self.warning("Process %s disappeared while scanning", process.pid)

        return result
Ejemplo n.º 3
0
    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Set up the caches used by the check and apply the optional
        `procfs_path` override.

        `init_config` may carry `access_denied_cache_duration` and
        `pid_cache_duration` (both coerced to int) and, on Linux only,
        `procfs_path`.
        """
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        # "ad" stands for access denied.
        # PIDs raising this error are cached and not iterated on more often
        # than `access_denied_cache_duration`. The cache covers all PIDs, so
        # it is global, but it should be refreshed per instance.
        self.last_ad_cache_ts = {}
        self.ad_cache = set()
        ad_duration = init_config.get('access_denied_cache_duration',
                                      DEFAULT_AD_CACHE_DURATION)
        self.access_denied_cache_duration = int(ad_duration)

        # The PID list is cached for a while by default. This is sometimes
        # unwanted because it can interfere with no-data monitoring.
        # This cache is indexed per instance.
        self.last_pid_cache_ts = {}
        self.pid_cache = {}
        pid_duration = init_config.get('pid_cache_duration',
                                       DEFAULT_PID_CACHE_DURATION)
        self.pid_cache_duration = int(pid_duration)

        self._conflicting_procfs = False
        self._deprecated_init_procfs = False
        procfs_path = init_config.get('procfs_path') if Platform.is_linux() else None
        if procfs_path:
            agent_sets_path = 'procfs_path' in agentConfig
            if agent_sets_path and procfs_path != agentConfig.get('procfs_path').rstrip('/'):
                # init_config and agentConfig disagree: only record the
                # conflict, psutil is left untouched
                self._conflicting_procfs = True
            else:
                self._deprecated_init_procfs = True
                psutil.PROCFS_PATH = procfs_path

        # Process cache, indexed by instance
        self.process_cache = defaultdict(dict)
Ejemplo n.º 4
0
 def _exclude_disk_psutil(self, part):
     """
     Tell whether the partition `part` must be left out of the collection.
     """
     # On Windows, skip cd-rom drives with no disk in them: they may raise
     # ENOENT, pop up a GUI error for a non-ready partition, or just hang.
     if Platform.is_win32() and ('cdrom' in part.opts or part.fstype == ''):
         return True
     # Every other exclusion rule is handled by the generic helper
     return self._exclude_disk(part.device, part.fstype, part.mountpoint)
Ejemplo n.º 5
0
    def _collect_part_metrics(self, part, usage):
        """
        Build the metric name -> value mapping for one partition.

        `usage` supplies `total`, `used`, `free` and `percent`; on unix
        platforms the inode metrics of `part.mountpoint` are merged in.
        """
        # For legacy reasons, the standard unit is kB
        metrics = {
            self.METRIC_DISK.format(field): getattr(usage, field) / 1024.0
            for field in ('total', 'used', 'free')
        }
        # FIXME: 6.x, use percent, a lot more logical than in_use
        metrics[self.METRIC_DISK.format('in_use')] = usage.percent / 100.0

        if Platform.is_unix():
            inode_metrics = self._collect_inodes_metrics(part.mountpoint)
            metrics.update(inode_metrics)

        return metrics
Ejemplo n.º 6
0
def spin_up_haproxy():
    """
    Bring the HAProxy docker-compose environment up, yield while it is
    running, then tear it down.

    The compose file is configured through HAPROXY_* variables written to
    `os.environ`. On Linux the unix socket directory is created beforehand,
    ownership of the socket is given to the current user once HAProxy is
    up, and the directory is removed on teardown.
    """
    compose_dir = os.path.join(common.HERE, 'compose')

    env = os.environ
    env['HAPROXY_CONFIG_DIR'] = compose_dir
    env['HAPROXY_CONFIG'] = os.path.join(compose_dir, 'haproxy.cfg')
    env['HAPROXY_CONFIG_OPEN'] = os.path.join(compose_dir, 'haproxy-open.cfg')
    env['HAPROXY_SOCKET_DIR'] = common.UNIXSOCKET_DIR

    if Platform.is_linux() and not os.path.exists(common.UNIXSOCKET_DIR):
        # the socket directory has to exist on linux
        os.makedirs(common.UNIXSOCKET_DIR)

    compose_cmd = [
        "docker-compose", "-f",
        os.path.join(compose_dir, 'haproxy.yaml')
    ]
    # start from a clean state, then launch in the background
    subprocess.check_call(compose_cmd + ["down"], env=env)
    subprocess.check_call(compose_cmd + ["up", "-d"], env=env)
    wait_for_haproxy()

    if Platform.is_linux():
        # on linux the socket must be accessible to the current user,
        # it won't work without access
        user = getpass.getuser()
        sudo_prefix = [] if user == 'root' else ['sudo']
        chown_cmd = sudo_prefix + ["chown", user, common.UNIXSOCKET_PATH]
        try:
            subprocess.check_call(chown_cmd, env=env)
        except subprocess.CalledProcessError:
            # it's not always bad if this fails
            pass

    time.sleep(20)
    yield

    subprocess.check_call(compose_cmd + ["down"], env=env)
    if Platform.is_linux():
        # remove the temp socket directory; failures are ignored
        try:
            os.removedirs(common.UNIXSOCKET_DIR)
        except OSError:
            pass
Ejemplo n.º 7
0
def test_unixsocket_config(aggregator, spin_up_haproxy):
    """
    Run the HAProxy check against the unix socket configuration and verify
    the submitted metrics and service checks. Does nothing outside Linux.
    """
    if not Platform.is_linux():
        return

    check = HAProxy(common.CHECK_NAME, {}, {})
    check.check(common.CONFIG_UNIXSOCKET)

    url_tags = ["instance_url:{0}".format(common.UNIXSOCKET_URL)]

    # frontend first, then backend, both with the instance url tag
    for verify_metrics in (_test_frontend_metrics, _test_backend_metrics):
        verify_metrics(aggregator, url_tags)
    _test_service_checks(aggregator)

    aggregator.assert_all_metrics_covered()
Ejemplo n.º 8
0
    def get_pagefault_stats(self, pid):
        """
        Return the page fault counters of process `pid`, read from procfs.

        The counters are fields 10 to 13 of `<procfs>/<pid>/stat`, i.e.
        minflt, cminflt, majflt and cmajflt (see proc(5)).

        :param pid: id of the process to inspect
        :return: a generator yielding the four counters as ints, or None when
                 not running on Linux or when the stat file cannot be read
        """
        if not Platform.is_linux():
            return None

        # http://man7.org/linux/man-pages/man5/proc.5.html
        try:
            stat_path = '/{}/{}/stat'.format(psutil.PROCFS_PATH, pid)
            with open(stat_path, 'r') as f:
                data = f.read()
        except Exception:
            self.log.debug('error getting proc stats: file_to_string failed for /%s/%s/stat', psutil.PROCFS_PATH, pid)
            return None

        # The second field is the command name wrapped in parentheses; it may
        # contain spaces (and parentheses), so a plain split() over the whole
        # line would shift every following field. Parse what comes after the
        # LAST closing parenthesis instead.
        _, closing, after_comm = data.rpartition(')')
        if closing:
            # after_comm starts at field 3 (state): fields 10-13 are at 7..10
            counters = after_comm.split()[7:11]
        else:
            # no parenthesised command name: keep the positional parsing
            counters = data.split()[9:13]
        return (int(i) for i in counters)
Ejemplo n.º 9
0
    def collect_metrics_psutil(self):
        """
        Submit the usage metrics of every partition reported by psutil that
        passes the exclusion rules, then collect the latency metrics.

        Partitions whose usage cannot be read within 5 seconds, whose usage
        lookup fails, or whose total size is 0 are skipped.
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # apply every exclusion rule first
            if self._exclude_disk_psutil(part):
                continue

            # The usage is fetched here so that the partition can also be
            # excluded based on its total size
            try:
                bounded_disk_usage = timeout(5)(psutil.disk_usage)
                disk_usage = bounded_disk_usage(part.mountpoint)
            except TimeoutException:
                self.log.warn(
                    u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                    part.mountpoint)
                continue
            except Exception as e:
                self.log.warn("Unable to get disk metrics for %s: %s",
                              part.mountpoint, e)
                continue

            # Disks with a total size of 0 are left out
            if disk_usage.total == 0:
                continue

            # Remembered for the latency metrics collected at the end
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            if self._tag_by_filesystem:
                tags = [part.fstype, 'filesystem:{}'.format(part.fstype)]
            else:
                tags = []

            if self._use_mount:
                device_name = part.mountpoint
            else:
                device_name = part.device

            # tags specific to the device/mountpoint
            for pattern, extra_tags in self._device_tag_re:
                if pattern.match(device_name):
                    tags.extend(extra_tags)

            tags.extend(self._custom_tags)

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()

            part_metrics = self._collect_part_metrics(part, disk_usage)
            for metric_name, metric_value in part_metrics.iteritems():
                self.gauge(metric_name,
                           metric_value,
                           tags=tags,
                           device_name=device_name)

        self.collect_latency_metrics()
Ejemplo n.º 10
0
def test_check_real_process_regex(aggregator):
    "Check to specifically find this python pytest running process using regex."
    from datadog_checks.utils.platform import Platform

    thresholds = {'warning': [1, 10], 'critical': [1, 100]}
    instance = {
        'name': 'py',
        'search_string': ['.*python.*pytest'],
        'exact_match': False,
        'ignored_denied_access': True,
        'thresholds': thresholds,
    }
    process = ProcessCheck(common.CHECK_NAME, {}, {})
    expected_tags = generate_expected_tags(instance)
    process.check(instance)

    for mname in common.PROCESS_METRIC:
        # Metrics that are legitimately absent after a single run:
        #  - io metrics when io_counters() is not available
        if not _PSUTIL_IO_COUNTERS and '.io' in mname:
            continue
        #  - real memory when memory_info_ex() is not available
        if not _PSUTIL_MEM_SHARED and 'mem.real' in mname:
            continue
        #  - `cpu.pct`, never reported on the first run
        if mname == 'system.processes.cpu.pct':
            continue

        # Windows reports some metrics under a different name
        metric = (common.UNIX_TO_WINDOWS_MAP.get(mname, mname)
                  if Platform.is_windows() else mname)
        aggregator.assert_metric(metric, at_least=1, tags=expected_tags)

    aggregator.assert_service_check('process.up',
                                    count=1,
                                    tags=expected_tags + ['process:py'])

    # the cpu percentages require another run
    process.check(instance)
    for cpu_metric in ('system.processes.cpu.pct',
                       'system.processes.cpu.normalized_pct'):
        aggregator.assert_metric(cpu_metric, count=1, tags=expected_tags)
Ejemplo n.º 11
0
def test_complex_config_replica(aggregator, spin_up_mysql):
    """
    Run the MySQL check with the complex configuration against the replica
    port and verify service checks, metrics and custom query metrics.
    """
    mysql_check = MySql(common.CHECK_NAME, {}, {})
    config = copy.deepcopy(common_config.MYSQL_COMPLEX_CONFIG)
    config['port'] = common.SLAVE_PORT
    mysql_check.check(config)

    # Not asserted here: the 'channel:default' tag on
    # 'mysql.replication.seconds_behind_master'

    # Service checks
    aggregator.assert_service_check('mysql.can_connect', status=MySql.OK,
                                    tags=tags.SC_TAGS_REPLICA, count=1)

    # Travis MySQL not running replication - FIX in flavored test.
    aggregator.assert_service_check('mysql.replication.slave_running', status=MySql.OK,
                                    tags=tags.SC_TAGS_REPLICA, at_least=1)

    # The value is not used below; computing it still requires the version
    # of this host to be recorded with integer-like parts
    raw_version = mysql_check.mysql_version[mysql_check._get_host_key()]
    ver = tuple(int(x) for x in raw_version)

    testable_metrics = (variables.STATUS_VARS + variables.VARIABLES_VARS +
                        variables.INNODB_VARS + variables.BINLOG_VARS +
                        variables.SYSTEM_METRICS + variables.SCHEMA_VARS +
                        variables.SYNTHETIC_VARS)

    # Metrics
    for mname in testable_metrics:
        # User and kernel time are currently not guaranteed outside of a
        # Linux environment.
        if ((mname == 'mysql.performance.user_time'
             or mname == 'mysql.performance.kernel_time')
                and not Platform.is_linux()):
            continue
        if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
            continue

        if mname == 'mysql.performance.query_run_time.avg':
            schema_tags = ('schema:testdb',)
        elif mname == 'mysql.info.schema.size':
            schema_tags = ('schema:testdb',
                           'schema:information_schema',
                           'schema:performance_schema')
        else:
            schema_tags = None

        if schema_tags is None:
            aggregator.assert_metric(mname,
                                     tags=tags.METRIC_TAGS, at_least=0)
        else:
            # exactly one point per schema
            for schema_tag in schema_tags:
                aggregator.assert_metric(mname,
                                         tags=tags.METRIC_TAGS+[schema_tag], count=1)

    # custom query metrics
    aggregator.assert_metric('alice.age', value=25)
    aggregator.assert_metric('bob.age', value=20)

    # optional metrics
    optional_metrics = (variables.OPTIONAL_REPLICATION_METRICS +
                        variables.OPTIONAL_INNODB_VARS +
                        variables.OPTIONAL_STATUS_VARS +
                        variables.OPTIONAL_STATUS_VARS_5_6_6)
    _test_optional_metrics(aggregator, optional_metrics, 1)

    # Raises when coverage < 100%
    aggregator.assert_all_metrics_covered()
Ejemplo n.º 12
0
def test_relocated_procfs(aggregator):
    """
    Run the process check against a fake procfs built in a temporary
    directory and verify that the fake process is reported as up.

    The fake procfs holds a single process (pid 1) named after a random
    UUID. `sys.platform`, `socket.AF_PACKET` and `__import__` are patched
    while the check runs, and psutil is reloaded when the test does not
    start out on a Linux platform.

    NOTE(review): relies on Python 2 only constructs (`iteritems`, the
    `reload` builtin, `__builtin__`, `contextlib.nested`).
    """
    from datadog_checks.utils.platform import Platform
    import tempfile
    import shutil
    import uuid

    # Remembered before any patching: decides whether psutil gets reloaded
    already_linux = Platform.is_linux()
    # A random name, used below as the search string of the instance
    unique_process_name = str(uuid.uuid4())
    my_procfs = tempfile.mkdtemp()

    def _fake_procfs(arg, root=my_procfs):
        # Materialize a nested dict under `root`: dict values become
        # directories (filled recursively), other values become files
        # holding str(value)
        for key, val in arg.iteritems():
            path = os.path.join(root, key)
            if isinstance(val, dict):
                os.mkdir(path)
                _fake_procfs(val, path)
            else:
                with open(path, "w") as f:
                    f.write(str(val))

    # One process directory ('1') plus a top-level 'stat' file
    _fake_procfs({
        '1': {
            'status': ("Name:\t{}\nThreads:\t1\n").format(unique_process_name),
            'stat':
            ('1 ({}) S 0 1 1 ' + ' 0' * 46).format(unique_process_name),
            'cmdline': unique_process_name,
        },
        'stat': ("cpu  13034 0 18596 380856797 2013 2 2962 0 0 0\n"
                 "btime 1448632481\n"),
    })

    config = {
        'init_config': {
            'procfs_path': my_procfs
        },
        'instances': [{
            'name': 'moved_procfs',
            'search_string': [unique_process_name],
            'exact_match': False,
            'ignored_denied_access': True,
            'thresholds': {
                'warning': [1, 10],
                'critical': [1, 100]
            },
        }]
    }
    # psutil version with the dots removed, as an int; handed to the mocked
    # C bindings below
    version = int(psutil.__version__.replace(".", ""))
    process = ProcessCheck(common.CHECK_NAME, config['init_config'], {},
                           config['instances'])

    try:

        # Replacement for `__import__`; the mutable defaults are only passed
        # through to `orig_import`, never modified here
        def import_mock(name,
                        i_globals={},
                        i_locals={},
                        fromlist=[],
                        level=-1,
                        orig_import=__import__):
            # _psutil_linux and _psutil_posix are the
            #  C bindings; use a mock for those
            if name in ('_psutil_linux', '_psutil_posix') or level >= 1 and\
               ('_psutil_linux' in fromlist or '_psutil_posix' in fromlist):
                m = MagicMock()
                # the import system will ask us for our own name
                m._psutil_linux = m
                m._psutil_posix = m
                # there's a version safety check in psutil/__init__.py;
                # this skips it
                m.version = version
                return m
            # anything else goes through the real import machinery
            return orig_import(name, i_globals, i_locals, fromlist, level)

        # contextlib.nested is deprecated in favor of with MGR1, MGR2, ... etc
        # but we have too many mocks to fit on one line and apparently \ line
        # continuation is not flake8 compliant, even when semantically
        # required (as here). Patch is unlikely to throw errors that are
        # suppressed, so the main downside of contextlib is avoided.
        with contextlib.nested(
                patch('sys.platform', 'linux'),
                patch('socket.AF_PACKET', create=True),
                patch('__builtin__.__import__', side_effect=import_mock)):
            if not already_linux:
                # Reloading psutil fails on linux, but we only
                # need to do so if we didn't start out on a linux platform
                reload(psutil)
            assert Platform.is_linux()
            process.check(config["instances"][0])
    finally:
        # Always drop the fake procfs and undo the changes made to psutil
        shutil.rmtree(my_procfs)
        if not already_linux:
            # restore the original psutil that doesn't have our mocks
            reload(psutil)
        else:
            psutil.PROCFS_PATH = '/proc'

    # The fake process must have been reported exactly once as up
    expected_tags = generate_expected_tags(config['instances'][0])
    expected_tags += ['process:moved_procfs']
    aggregator.assert_service_check('process.up', count=1, tags=expected_tags)
Ejemplo n.º 13
0
    def collect_metrics_psutil(self):
        """
        Submit the usage metrics of every partition reported by psutil that
        passes the exclusion rules, optionally followed by a read/write
        service check per partition, then collect the latency metrics.

        Partitions whose usage cannot be read within 5 seconds, whose usage
        lookup fails, or whose total size is 0 are skipped.
        """
        self._valid_disks = {}
        for part in psutil.disk_partitions(all=True):
            # apply every exclusion rule first
            if self._exclude_disk_psutil(part):
                continue

            # The usage is fetched here so that the partition can also be
            # excluded based on its total size
            try:
                bounded_disk_usage = timeout(5)(psutil.disk_usage)
                disk_usage = bounded_disk_usage(part.mountpoint)
            except TimeoutException:
                self.log.warn(
                    u"Timeout while retrieving the disk usage of `%s` mountpoint. Skipping...",
                    part.mountpoint)
                continue
            except Exception as e:
                self.log.warn("Unable to get disk metrics for %s: %s",
                              part.mountpoint, e)
                continue

            # Disks with a total size of 0 are left out
            if disk_usage.total == 0:
                continue

            # Remembered for the latency metrics collected at the end
            self._valid_disks[part.device] = (part.fstype, part.mountpoint)
            self.log.debug('Passed: {0}'.format(part.device))

            if self._tag_by_filesystem:
                tags = [part.fstype, 'filesystem:{}'.format(part.fstype)]
            else:
                tags = []

            if self._use_mount:
                device_name = part.mountpoint
            else:
                device_name = part.device

            # tags specific to the device/mountpoint
            for pattern, extra_tags in self._device_tag_re:
                if pattern.match(device_name):
                    tags.extend(extra_tags)

            tags.extend(self._custom_tags)

            # legacy check names c: vs psutil name C:\\
            if Platform.is_win32():
                device_name = device_name.strip('\\').lower()

            part_metrics = self._collect_part_metrics(part, disk_usage)
            for metric_name, metric_value in part_metrics.iteritems():
                self.gauge(metric_name,
                           metric_value,
                           tags=tags,
                           device_name=device_name)

            # Optional read-write / read-only service check
            if self._service_check_rw:
                mount_options = set(part.opts.split(','))
                rwro = list(set(['rw', 'ro']) & mount_options)
                if len(rwro) != 1:
                    # neither or both of 'rw' and 'ro' are listed
                    status = AgentCheck.UNKNOWN
                elif rwro[0] == 'rw':
                    status = AgentCheck.OK
                else:
                    status = AgentCheck.CRITICAL
                self.service_check('disk.read_write',
                                   status,
                                   tags=tags + ['device:%s' % (device_name)])

        self.collect_latency_metrics()
Ejemplo n.º 14
0
# (C) Datadog, Inc. 2019-present
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)
import psutil

from datadog_checks.dev import get_here
from datadog_checks.utils.platform import Platform

# Value returned by `get_here()` -- presumably the directory containing this
# file; verify against datadog_checks.dev
HERE = get_here()
# Presumably the name of the check these fixtures belong to -- TODO confirm
CHECK_NAME = "system_core"

# Instance configuration carrying a single custom tag
INSTANCE = {"tags": ["tag1:value1"]}

if Platform.is_mac():
    CHECK_RATES = [
        'system.core.idle', 'system.core.nice', 'system.core.system',
        'system.core.user'
    ]
    MOCK_PSUTIL_CPU_TIMES = [
        psutil._psosx.scputimes(user=7877.29,
                                nice=0.0,
                                system=7469.72,
                                idle=38164.81),
        psutil._psosx.scputimes(user=3826.74,
                                nice=0.0,
                                system=2701.6,
                                idle=46981.39),
        psutil._psosx.scputimes(user=7486.51,
                                nice=0.0,
                                system=5991.36,
                                idle=40031.88),
Ejemplo n.º 15
0
    def _check_bsd(self, instance):
        """
        Collect per-interface counters and TCP statistics by parsing the
        output of `netstat`.

        Custom tags are read from the `tags` key of `instance`. Returns
        False when the interface table lacks one of the expected column
        headers (the TCP statistics are then not collected either);
        returns None otherwise.
        """
        custom_tags = instance.get('tags', [])

        iface_cmd = ["netstat", '-i', '-b']
        # FreeBSD's netstat truncates device names unless you pass '-W'
        if Platform.is_freebsd():
            iface_cmd.append('-W')

        try:
            output, _, _ = get_subprocess_output(iface_cmd, self.log)
            rows = output.splitlines()
            # Sample output (several rows may describe the same interface):
            # Name  Mtu   Network       Address            Ipkts Ierrs     Ibytes    Opkts Oerrs     Obytes  Coll
            # lo0   16384 <Link#1>                        318258     0  428252203   318258     0  428252203     0
            # lo0   16384 localhost   fe80:1::1           318258     -  428252203   318258     -  428252203     -
            # gif0* 1280  <Link#2>                             0     0          0        0     0          0     0
            # en0   1500  <Link#4>    04:0c:ce:db:4e:fa 20801309     0 13835457425 15149389     0 11508790198     0
            # en0   1500  192.168.1     192.168.1.63    20801309     - 13835457425 15149389     - 11508790198     -

            headers = rows[0].split()

            # The table is irregular (the Address column may be empty), so
            # values are read from the end of each row. Make sure the
            # trailing columns are the expected ones first.
            #                    -7       -6       -5        -4       -3       -2        -1
            expected_headers = ("Ipkts", "Ierrs", "Ibytes", "Opkts", "Oerrs", "Obytes", "Coll")
            for h in expected_headers:
                if h not in headers:
                    self.log.error("%s not found in %s; cannot parse" % (h, headers))
                    return False

            last_iface = None
            for row in rows[1:]:
                # Another header row, abort now, this is IPv6 land
                if "Name" in row:
                    break

                cols = row.split()
                if not cols:
                    break

                iface = cols[0]
                if iface.endswith("*"):
                    iface = iface[:-1]

                # only the first row of each interface is considered
                if iface == last_iface:
                    continue
                last_iface = iface

                # Filter inactive interfaces: no byte received nor sent
                if not (self._parse_value(cols[-5]) or self._parse_value(cols[-2])):
                    continue

                metrics = {
                    'bytes_rcvd': self._parse_value(cols[-5]),
                    'bytes_sent': self._parse_value(cols[-2]),
                    'packets_in.count': self._parse_value(cols[-7]),
                    'packets_in.error': self._parse_value(cols[-6]),
                    'packets_out.count': self._parse_value(cols[-4]),
                    'packets_out.error': self._parse_value(cols[-3]),
                }
                self._submit_devicemetrics(iface, metrics, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting connection stats.")

        try:
            # The protocol is passed attached to its option, as the single
            # argument "-ptcp"
            tcp_cmd = ["netstat", "-s", "-ptcp"]
            netstat, _, _ = get_subprocess_output(tcp_cmd, self.log)
            # Sample output:
            # 3651535 packets sent
            #         972097 data packets (615753248 bytes)
            #         5009 data packets (2832232 bytes) retransmitted
            #         2086952 ack-only packets (471 delayed)
            #         ...
            # 4807551 packets received
            #         1143534 acks (for 616095538 bytes)
            #         165400 duplicate acks
            #         ...

            self._submit_regexed_values(netstat, BSD_TCP_METRICS, custom_tags)
        except SubprocessOutputEmptyError:
            self.log.exception("Error collecting TCP stats.")
Ejemplo n.º 16
0
    def _check_linux(self, instance):
        """
        _check_linux can be run inside a container and still collects the network metrics from the host
        For that procfs_path can be set to something like "/host/proc"
        When a custom procfs_path is set, the collect_connection_state option is ignored
        """
        proc_location = self.agentConfig.get('procfs_path', '/proc').rstrip('/')
        custom_tags = instance.get('tags', [])

        if Platform.is_containerized() and proc_location != "/proc":
            proc_location = "%s/1" % proc_location

        if self._is_collect_cx_state_runnable(proc_location):
            try:
                self.log.debug("Using `ss` to collect connection state")
                # Try using `ss` for increased performance over `netstat`
                for ip_version in ['4', '6']:
                    for protocol in ['tcp', 'udp']:
                        # Call `ss` for each IP version because there's no built-in way of distinguishing
                        # between the IP versions in the output
                        # Also calls `ss` for each protocol, because on some systems (e.g. Ubuntu 14.04), there is a
                        # bug that print `tcp` even if it's `udp`
                        output, _, _ = get_subprocess_output(["ss", "-n", "-{0}".format(protocol[0]),
                                                              "-a", "-{0}".format(ip_version)], self.log)
                        lines = output.splitlines()

                        # State      Recv-Q Send-Q     Local Address:Port       Peer Address:Port
                        # UNCONN     0      0              127.0.0.1:8125                  *:*
                        # ESTAB      0      0              127.0.0.1:37036         127.0.0.1:8125
                        # UNCONN     0      0        fe80::a00:27ff:fe1c:3c4:123          :::*
                        # TIME-WAIT  0      0          90.56.111.177:56867        46.105.75.4:143
                        # LISTEN     0      0       ::ffff:127.0.0.1:33217  ::ffff:127.0.0.1:7199
                        # ESTAB      0      0       ::ffff:127.0.0.1:58975  ::ffff:127.0.0.1:2181

                        metrics = self._parse_linux_cx_state(lines[1:], self.tcp_states['ss'], 0, protocol=protocol,
                                                             ip_version=ip_version)
                        # Only send the metrics which match the loop iteration's ip version
                        for stat, metric in self.cx_state_gauge.iteritems():
                            if stat[0].endswith(ip_version) and stat[0].startswith(protocol):
                                self.gauge(metric, metrics.get(metric), tags=custom_tags)

            except OSError:
                self.log.info("`ss` not found: using `netstat` as a fallback")
                output, _, _ = get_subprocess_output(["netstat", "-n", "-u", "-t", "-a"], self.log)
                lines = output.splitlines()
                # Active Internet connections (w/o servers)
                # Proto Recv-Q Send-Q Local Address           Foreign Address         State
                # tcp        0      0 46.105.75.4:80          79.220.227.193:2032     SYN_RECV
                # tcp        0      0 46.105.75.4:143         90.56.111.177:56867     ESTABLISHED
                # tcp        0      0 46.105.75.4:50468       107.20.207.175:443      TIME_WAIT
                # tcp6       0      0 46.105.75.4:80          93.15.237.188:58038     FIN_WAIT2
                # tcp6       0      0 46.105.75.4:80          79.220.227.193:2029     ESTABLISHED
                # udp        0      0 0.0.0.0:123             0.0.0.0:*
                # udp6       0      0 :::41458                :::*

                metrics = self._parse_linux_cx_state(lines[2:], self.tcp_states['netstat'], 5)
                for metric, value in metrics.iteritems():
                    self.gauge(metric, value, tags=custom_tags)
            except SubprocessOutputEmptyError:
                self.log.exception("Error collecting connection stats.")

        proc_dev_path = "{}/net/dev".format(proc_location)
        with open(proc_dev_path, 'r') as proc:
            lines = proc.readlines()
        # Inter-|   Receive                                                 |  Transmit
        #  face |bytes     packets errs drop fifo frame compressed multicast|bytes       packets errs drop fifo colls carrier compressed # noqa: E501
        #     lo:45890956   112797   0    0    0     0          0         0    45890956   112797    0    0    0     0       0          0 # noqa: E501
        #   eth0:631947052 1042233   0   19    0   184          0      1206  1208625538  1320529    0    0    0     0       0          0 # noqa: E501
        #   eth1:       0        0   0    0    0     0          0         0           0        0    0    0    0     0       0          0 # noqa: E501
        for l in lines[2:]:
            cols = l.split(':', 1)
            x = cols[1].split()
            # Filter inactive interfaces
            if self._parse_value(x[0]) or self._parse_value(x[8]):
                iface = cols[0].strip()
                metrics = {
                    'bytes_rcvd': self._parse_value(x[0]),
                    'bytes_sent': self._parse_value(x[8]),
                    'packets_in.count': self._parse_value(x[1]),
                    'packets_in.error': self._parse_value(x[2]) + self._parse_value(x[3]),
                    'packets_out.count': self._parse_value(x[9]),
                    'packets_out.error': self._parse_value(x[10]) + self._parse_value(x[11]),
                }
                self._submit_devicemetrics(iface, metrics, custom_tags)

        netstat_data = {}
        for f in ['netstat', 'snmp']:
            proc_data_path = "{}/net/{}".format(proc_location, f)
            try:
                with open(proc_data_path, 'r') as netstat:
                    while True:
                        n_header = netstat.readline()
                        if not n_header:
                            break  # No more? Abort!
                        n_data = netstat.readline()

                        h_parts = n_header.strip().split(' ')
                        h_values = n_data.strip().split(' ')
                        ns_category = h_parts[0][:-1]
                        netstat_data[ns_category] = {}
                        # Turn the data into a dictionary
                        for idx, hpart in enumerate(h_parts[1:]):
                            netstat_data[ns_category][hpart] = h_values[idx + 1]
            except IOError:
                # On Openshift, /proc/net/snmp is only readable by root
                self.log.debug("Unable to read %s.", proc_data_path)

        nstat_metrics_names = {
            'Tcp': {
                'RetransSegs': 'system.net.tcp.retrans_segs',
                'InSegs': 'system.net.tcp.in_segs',
                'OutSegs': 'system.net.tcp.out_segs',
            },
            'TcpExt': {
                'ListenOverflows': 'system.net.tcp.listen_overflows',
                'ListenDrops': 'system.net.tcp.listen_drops',
                'TCPBacklogDrop': 'system.net.tcp.backlog_drops',
                'TCPRetransFail': 'system.net.tcp.failed_retransmits',
            },
            'Udp': {
                'InDatagrams': 'system.net.udp.in_datagrams',
                'NoPorts': 'system.net.udp.no_ports',
                'InErrors': 'system.net.udp.in_errors',
                'OutDatagrams': 'system.net.udp.out_datagrams',
                'RcvbufErrors': 'system.net.udp.rcv_buf_errors',
                'SndbufErrors': 'system.net.udp.snd_buf_errors',
                'InCsumErrors': 'system.net.udp.in_csum_errors'
            }
        }

        # Submit every known metric that was actually found in the parsed proc data
        for k in nstat_metrics_names:
            for met in nstat_metrics_names[k]:
                if met in netstat_data.get(k, {}):
                    self._submit_netmetric(nstat_metrics_names[k][met], self._parse_value(netstat_data[k][met]),
                                           tags=custom_tags)
Ejemplo n.º 17
0
    aggregator.assert_all_metrics_covered()


def test_check_ssl(aggregator, check, openldap_server, instance_ssl):
    """Check the SSL instance with certificate verification on, then off.

    The first run is expected to raise an ldap3 exception and report a
    CRITICAL ``openldap.can_connect`` service check; after disabling
    ``ssl_verify`` the run is expected to succeed and report OK.
    """
    tags = ["url:{}".format(instance_ssl["url"]), "test:integration"]
    # Should fail certificate verification
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance_ssl)
    # Asserted outside the `raises` block: any statement placed after the
    # raising call inside the context manager is never executed.
    aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags)
    instance_ssl["ssl_verify"] = False
    # Should work now
    check.check(instance_ssl)
    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)


def test_check_connection_failure(aggregator, check, openldap_server, instance):
    """Check that a bogus URL raises and reports a CRITICAL connection service check."""
    instance["url"] = "bad_url"
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    # Should fail to connect, since the URL is not a valid server address
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance)
    # Asserted outside the `raises` block: any statement placed after the
    # raising call inside the context manager is never executed.
    aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags)


@pytest.mark.skipif(not Platform.is_linux(), reason='Windows sockets are not file handles')
def test_check_socket(aggregator, check, openldap_server, instance):
    """Point the instance at an ``ldapi://`` URL and expect an OK connection service check."""
    socket_url = "ldapi://{}".format(openldap_server)
    instance["url"] = socket_url
    expected_tags = ["url:{}".format(socket_url), "test:integration"]

    check.check(instance)

    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=expected_tags)
Ejemplo n.º 18
0
    # Should work now
    check.check(instance_ssl)
    aggregator.assert_service_check("openldap.can_connect",
                                    check.OK,
                                    tags=tags)


@pytest.mark.usefixtures('dd_environment')
def test_check_connection_failure(aggregator, check, instance):
    """Check that a bogus URL raises and reports a CRITICAL connection service check."""
    instance["url"] = "bad_url"
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    # Should fail to connect, since the URL is not a valid server address
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance)
    # Asserted outside the `raises` block: any statement placed after the
    # raising call inside the context manager is never executed.
    aggregator.assert_service_check("openldap.can_connect",
                                    check.CRITICAL,
                                    tags=tags)


@pytest.mark.skipif(not Platform.is_linux(),
                    reason='Windows sockets are not file handles')
@pytest.mark.usefixtures('dd_environment')
def test_check_socket(aggregator, check, instance):
    """Connect through the ``ldapi`` socket under HOST_SOCKET_DIR and expect an OK service check."""
    socket_dir = os.environ['HOST_SOCKET_DIR']
    socket_url = "ldapi://{}".format(os.path.join(socket_dir, 'ldapi'))
    instance["url"] = socket_url
    expected_tags = ["url:{}".format(socket_url), "test:integration"]

    check.check(instance)

    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=expected_tags)
Ejemplo n.º 19
0
# All rights reserved
# Licensed under a 3-clause BSD style license (see LICENSE)

from __future__ import unicode_literals

import time

import dns.resolver

from datadog_checks.checks import NetworkCheck, Status
from datadog_checks.utils.platform import Platform

# These imports are necessary because otherwise dynamic type
# resolution will fail on windows without it.
# See more here: https://github.com/rthalley/dnspython/issues/39.
if Platform.is_win32():
    from dns.rdtypes.ANY import *  # noqa
    from dns.rdtypes.IN import *  # noqa

    # for tiny time deltas, time.time on Windows reports the same value
    # of the clock more than once, causing the computation of response_time
    # to be often 0, so a higher-resolution clock is used instead.
    # time.clock was removed in Python 3.8; prefer time.perf_counter where
    # it exists and only fall back to time.clock on interpreters lacking it.
    time_func = getattr(time, 'perf_counter', None) or time.clock
else:
    time_func = time.time


class BadConfException(Exception):
    """Custom exception type; presumably signals an invalid check configuration (usage not shown here)."""

Ejemplo n.º 20
0
def test_complex_config(aggregator, spin_up_mysql):
    """Run the MySQL check against the complex config and assert its service checks and metrics."""
    config = common_config.MYSQL_COMPLEX_CONFIG
    mysql_check = MySql(common.CHECK_NAME, {}, {}, instances=[config])
    mysql_check.check(config)

    # Test service check
    aggregator.assert_service_check(
        'mysql.can_connect', status=MySql.OK, tags=tags.SC_TAGS, count=1)
    aggregator.assert_service_check(
        'mysql.replication.slave_running', status=MySql.OK, tags=tags.SC_TAGS, at_least=1)

    # Version as a tuple of ints so it can be compared against (5, 6, 0)
    raw_version = mysql_check.mysql_version[mysql_check._get_host_key()]
    ver = tuple(int(part) for part in raw_version)

    testable_metrics = (
        variables.STATUS_VARS
        + variables.VARIABLES_VARS
        + variables.INNODB_VARS
        + variables.BINLOG_VARS
        + variables.SYSTEM_METRICS
        + variables.SCHEMA_VARS
        + variables.SYNTHETIC_VARS
    )

    if ver >= (5, 6, 0) and environ.get('MYSQL_FLAVOR') != 'mariadb':
        testable_metrics.extend(variables.PERFORMANCE_VARS)

    # These two are currently not guaranteed outside of a Linux
    # environment.
    linux_only_metrics = (
        'mysql.performance.user_time',
        'mysql.performance.kernel_time',
    )

    # Metrics expected exactly once per schema, with the extra tag for each schema
    per_schema_tags = {
        'mysql.performance.query_run_time.avg': (
            'schema:testdb',
            'schema:mysql',
        ),
        'mysql.info.schema.size': (
            'schema:testdb',
            'schema:information_schema',
            'schema:performance_schema',
        ),
    }

    # Test metrics
    for metric_name in testable_metrics:
        if metric_name in linux_only_metrics and not Platform.is_linux():
            continue
        if metric_name == 'mysql.performance.cpu_time' and Platform.is_windows():
            continue

        schema_tags = per_schema_tags.get(metric_name)
        if schema_tags is None:
            aggregator.assert_metric(metric_name, tags=tags.METRIC_TAGS, at_least=0)
            continue

        for schema_tag in schema_tags:
            aggregator.assert_metric(
                metric_name, tags=tags.METRIC_TAGS + [schema_tag], count=1)

    # TODO: test this if it is implemented
    # Assert service metadata
    # version_metadata = mysql_check.service_metadata['version']
    # assert len(version_metadata) == 1

    # test custom query metrics
    aggregator.assert_metric('alice.age', value=25)
    aggregator.assert_metric('bob.age', value=20)

    # test optional metrics
    optional_metrics = (
        variables.OPTIONAL_REPLICATION_METRICS
        + variables.OPTIONAL_INNODB_VARS
        + variables.OPTIONAL_STATUS_VARS
        + variables.OPTIONAL_STATUS_VARS_5_6_6
    )
    _test_optional_metrics(aggregator, optional_metrics, 1)

    # Raises when coverage < 100%
    aggregator.assert_all_metrics_covered()