def __init__(self, name, init_config, agentConfig, instances=None):
    AgentCheck.__init__(self, name, init_config, agentConfig, instances)

    # ad stands for access denied
    # We cache the PIDs getting this error and don't iterate on them
    # more often than `access_denied_cache_duration`
    # This cache is for all PIDs so it's global, but it should be refreshed by instance
    self.last_ad_cache_ts = {}
    self.ad_cache = set()
    self.access_denied_cache_duration = int(
        init_config.get('access_denied_cache_duration', DEFAULT_AD_CACHE_DURATION))

    # By default cache the PID list for a while
    # Sometimes it's not wanted b/c it can mess with no-data monitoring
    # This cache is indexed per instance
    self.last_pid_cache_ts = {}
    self.pid_cache = {}
    self.pid_cache_duration = int(
        init_config.get('pid_cache_duration', DEFAULT_PID_CACHE_DURATION))

    self._conflicting_procfs = False
    self._deprecated_init_procfs = False
    if Platform.is_linux():
        procfs_path = init_config.get('procfs_path')
        if procfs_path:
            if 'procfs_path' in agentConfig and \
                    procfs_path != agentConfig.get('procfs_path').rstrip('/'):
                self._conflicting_procfs = True
            else:
                self._deprecated_init_procfs = True
            psutil.PROCFS_PATH = procfs_path

    # Process cache, indexed by instance
    self.process_cache = defaultdict(dict)
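# A minimal sketch of the init_config knobs the constructor above reads; the
# duration values below are illustrative, and the instance keys ('name',
# 'search_string') mirror the ones used in the tests further down.
example_init_config = {
    'access_denied_cache_duration': 120,  # seconds before retrying access-denied PIDs
    'pid_cache_duration': 120,            # seconds before refreshing the matched-PID list
    # 'procfs_path': '/host/proc',        # Linux only; deprecated here in favor of
    #                                     # the agent-level setting
}
example_check = ProcessCheck('process', example_init_config, {},
                             instances=[{'name': 'ssh', 'search_string': ['ssh']}])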
def check(self, instance):
    if instance is None:
        instance = {}

    self._excluded_ifaces = instance.get('excluded_interfaces', [])
    self._collect_cx_state = instance.get('collect_connection_state', False)
    self._collect_rate_metrics = instance.get('collect_rate_metrics', True)
    self._collect_count_metrics = instance.get('collect_count_metrics', False)

    # This decides whether we should split or combine connection states,
    # along with a few other things
    self._setup_metrics(instance)

    self._exclude_iface_re = None
    exclude_re = instance.get('excluded_interface_re', None)
    if exclude_re:
        self.log.debug("Excluding network devices matching: %s", exclude_re)
        self._exclude_iface_re = re.compile(exclude_re)

    if Platform.is_linux():
        self._check_linux(instance)
    elif Platform.is_bsd():
        self._check_bsd(instance)
    elif Platform.is_solaris():
        self._check_solaris(instance)
    elif Platform.is_windows():
        self._check_psutil(instance)
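# A minimal sketch of an instance exercising the options parsed above;
# interface names and the regex are illustrative.
example_instance = {
    'excluded_interfaces': ['lo', 'lo0'],  # skip loopback devices by exact name
    'excluded_interface_re': 'eth1.*',     # ...or by regular expression
    'collect_connection_state': True,      # emit per-state connection metrics
    'collect_rate_metrics': True,
    'collect_count_metrics': False,
}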
@pytest.fixture
def spin_up_haproxy():
    env = os.environ
    env['HAPROXY_CONFIG_DIR'] = os.path.join(common.HERE, 'compose')
    env['HAPROXY_CONFIG'] = os.path.join(common.HERE, 'compose', 'haproxy.cfg')
    env['HAPROXY_CONFIG_OPEN'] = os.path.join(common.HERE, 'compose', 'haproxy-open.cfg')
    env['HAPROXY_SOCKET_DIR'] = common.UNIXSOCKET_DIR

    if Platform.is_linux() and not os.path.exists(common.UNIXSOCKET_DIR):
        # make the temp directory on linux
        os.makedirs(common.UNIXSOCKET_DIR)

    args = [
        "docker-compose",
        "-f", os.path.join(common.HERE, 'compose', 'haproxy.yaml')
    ]
    subprocess.check_call(args + ["down"], env=env)
    subprocess.check_call(args + ["up", "-d"], env=env)
    wait_for_haproxy()

    try:
        if Platform.is_linux():
            # on linux the check needs read access to the stats socket;
            # it won't work without it
            chown_args = []
            user = getpass.getuser()
            if user != 'root':
                chown_args += ['sudo']
            chown_args += ["chown", user, common.UNIXSOCKET_PATH]
            subprocess.check_call(chown_args, env=env)
    except subprocess.CalledProcessError:
        # it's not always bad if this fails
        pass

    time.sleep(20)
    yield
    subprocess.check_call(args + ["down"], env=env)

    if Platform.is_linux():
        # clean up the temp directory on linux
        try:
            os.removedirs(common.UNIXSOCKET_DIR)
        except OSError:
            pass
def test_unixsocket_config(aggregator, spin_up_haproxy):
    if not Platform.is_linux():
        return

    haproxy_check = HAProxy(common.CHECK_NAME, {}, {})
    haproxy_check.check(common.CONFIG_UNIXSOCKET)

    shared_tag = ["instance_url:{0}".format(common.UNIXSOCKET_URL)]

    _test_frontend_metrics(aggregator, shared_tag)
    _test_backend_metrics(aggregator, shared_tag)
    _test_service_checks(aggregator)

    aggregator.assert_all_metrics_covered()
def get_pagefault_stats(self, pid):
    if not Platform.is_linux():
        return None

    def file_to_string(path):
        with open(path, 'r') as f:
            res = f.read()
        return res

    # http://man7.org/linux/man-pages/man5/proc.5.html
    try:
        data = file_to_string('/{}/{}/stat'.format(psutil.PROCFS_PATH, pid))
    except Exception:
        self.log.debug('error getting proc stats: file_to_string failed for /%s/%s/stat',
                       psutil.PROCFS_PATH, pid)
        return None
    # fields 10-13 of /proc/[pid]/stat: minflt, cminflt, majflt, cmajflt
    return (int(i) for i in data.split()[9:13])
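# A minimal sketch of consuming the generator returned above; the metric
# names are illustrative, not necessarily the ones the real check submits.
def report_pagefaults(check, pid, tags):
    stats = check.get_pagefault_stats(pid)
    if stats is None:  # non-Linux platform or unreadable /proc entry
        return
    # unpacking order matches fields 10-13 of /proc/[pid]/stat
    minflt, cminflt, majflt, cmajflt = stats
    check.rate('system.processes.mem.page_faults.minor_faults', minflt, tags=tags)
    check.rate('system.processes.mem.page_faults.major_faults', majflt, tags=tags)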
def test_relocated_procfs(aggregator):
    from datadog_checks.utils.platform import Platform
    import tempfile
    import shutil
    import uuid

    already_linux = Platform.is_linux()
    unique_process_name = str(uuid.uuid4())
    my_procfs = tempfile.mkdtemp()

    def _fake_procfs(arg, root=my_procfs):
        for key, val in arg.iteritems():
            path = os.path.join(root, key)
            if isinstance(val, dict):
                os.mkdir(path)
                _fake_procfs(val, path)
            else:
                with open(path, "w") as f:
                    f.write(str(val))

    _fake_procfs({
        '1': {
            'status': ("Name:\t{}\nThreads:\t1\n").format(unique_process_name),
            'stat': ('1 ({}) S 0 1 1 ' + ' 0' * 46).format(unique_process_name),
            'cmdline': unique_process_name,
        },
        'stat': ("cpu 13034 0 18596 380856797 2013 2 2962 0 0 0\n"
                 "btime 1448632481\n"),
    })

    config = {
        'init_config': {
            'procfs_path': my_procfs
        },
        'instances': [{
            'name': 'moved_procfs',
            'search_string': [unique_process_name],
            'exact_match': False,
            'ignored_denied_access': True,
            'thresholds': {'warning': [1, 10], 'critical': [1, 100]},
        }]
    }

    version = int(psutil.__version__.replace(".", ""))
    process = ProcessCheck(common.CHECK_NAME, config['init_config'], {}, config['instances'])

    try:
        def import_mock(name, i_globals={}, i_locals={}, fromlist=[], level=-1,
                        orig_import=__import__):
            # _psutil_linux and _psutil_posix are the
            # C bindings; use a mock for those
            if name in ('_psutil_linux', '_psutil_posix') or (
                    level >= 1 and
                    ('_psutil_linux' in fromlist or '_psutil_posix' in fromlist)):
                m = MagicMock()
                # the import system will ask us for our own name
                m._psutil_linux = m
                m._psutil_posix = m
                # there's a version safety check in psutil/__init__.py;
                # this skips it
                m.version = version
                return m
            return orig_import(name, i_globals, i_locals, fromlist, level)

        # contextlib.nested is deprecated in favor of `with MGR1, MGR2, ...`,
        # but we have too many mocks to fit on one line and apparently \ line
        # continuation is not flake8 compliant, even when semantically
        # required (as here). Patch is unlikely to throw errors that are
        # suppressed, so the main downside of contextlib is avoided.
        with contextlib.nested(
                patch('sys.platform', 'linux'),
                patch('socket.AF_PACKET', create=True),
                patch('__builtin__.__import__', side_effect=import_mock)):
            if not already_linux:
                # Reloading psutil fails on linux, but we only
                # need to do so if we didn't start out on a linux platform
                reload(psutil)
            assert Platform.is_linux()

            process.check(config["instances"][0])
    finally:
        shutil.rmtree(my_procfs)
        if not already_linux:
            # restore the original psutil that doesn't have our mocks
            reload(psutil)
        else:
            psutil.PROCFS_PATH = '/proc'

    expected_tags = generate_expected_tags(config['instances'][0])
    expected_tags += ['process:moved_procfs']
    aggregator.assert_service_check('process.up', count=1, tags=expected_tags)
def test_complex_config(aggregator, spin_up_mysql):
    mysql_check = MySql(common.CHECK_NAME, {}, {},
                        instances=[common_config.MYSQL_COMPLEX_CONFIG])
    mysql_check.check(common_config.MYSQL_COMPLEX_CONFIG)

    # Test service check
    aggregator.assert_service_check('mysql.can_connect', status=MySql.OK,
                                    tags=tags.SC_TAGS, count=1)
    aggregator.assert_service_check('mysql.replication.slave_running', status=MySql.OK,
                                    tags=tags.SC_TAGS, at_least=1)

    ver = tuple(int(x) for x in mysql_check.mysql_version[mysql_check._get_host_key()])

    testable_metrics = (variables.STATUS_VARS + variables.VARIABLES_VARS +
                        variables.INNODB_VARS + variables.BINLOG_VARS +
                        variables.SYSTEM_METRICS + variables.SCHEMA_VARS +
                        variables.SYNTHETIC_VARS)

    if ver >= (5, 6, 0) and environ.get('MYSQL_FLAVOR') != 'mariadb':
        testable_metrics.extend(variables.PERFORMANCE_VARS)

    # Test metrics
    for mname in testable_metrics:
        # These two are currently not guaranteed outside of a Linux
        # environment.
        if mname == 'mysql.performance.user_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.kernel_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
            continue

        if mname == 'mysql.performance.query_run_time.avg':
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS + ['schema:testdb'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS + ['schema:mysql'], count=1)
        elif mname == 'mysql.info.schema.size':
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS + ['schema:testdb'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS + ['schema:information_schema'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS + ['schema:performance_schema'], count=1)
        else:
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS, at_least=0)

    # TODO: test this if it is implemented
    # Assert service metadata
    # version_metadata = mysql_check.service_metadata['version']
    # assert len(version_metadata) == 1

    # test custom query metrics
    aggregator.assert_metric('alice.age', value=25)
    aggregator.assert_metric('bob.age', value=20)

    # test optional metrics
    optional_metrics = (variables.OPTIONAL_REPLICATION_METRICS + variables.OPTIONAL_INNODB_VARS +
                        variables.OPTIONAL_STATUS_VARS + variables.OPTIONAL_STATUS_VARS_5_6_6)
    _test_optional_metrics(aggregator, optional_metrics, 1)

    # Raises when coverage < 100%
    aggregator.assert_all_metrics_covered()
def test_complex_config_replica(aggregator, spin_up_mysql):
    mysql_check = MySql(common.CHECK_NAME, {}, {})
    config = copy.deepcopy(common_config.MYSQL_COMPLEX_CONFIG)
    config['port'] = common.SLAVE_PORT
    mysql_check.check(config)

    # Test service check
    aggregator.assert_service_check('mysql.can_connect', status=MySql.OK,
                                    tags=tags.SC_TAGS_REPLICA, count=1)

    # Travis MySQL not running replication - FIX in flavored test.
    aggregator.assert_service_check('mysql.replication.slave_running', status=MySql.OK,
                                    tags=tags.SC_TAGS_REPLICA, at_least=1)

    ver = tuple(int(x) for x in mysql_check.mysql_version[mysql_check._get_host_key()])

    testable_metrics = (variables.STATUS_VARS + variables.VARIABLES_VARS +
                        variables.INNODB_VARS + variables.BINLOG_VARS +
                        variables.SYSTEM_METRICS + variables.SCHEMA_VARS +
                        variables.SYNTHETIC_VARS)

    # Test metrics
    for mname in testable_metrics:
        # These two are currently not guaranteed outside of a Linux
        # environment.
        if mname == 'mysql.performance.user_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.kernel_time' and not Platform.is_linux():
            continue
        if mname == 'mysql.performance.cpu_time' and Platform.is_windows():
            continue

        if mname == 'mysql.performance.query_run_time.avg':
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS + ['schema:testdb'], count=1)
        elif mname == 'mysql.info.schema.size':
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS + ['schema:testdb'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS + ['schema:information_schema'], count=1)
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS + ['schema:performance_schema'], count=1)
        else:
            aggregator.assert_metric(mname, tags=tags.METRIC_TAGS, at_least=0)

    # test custom query metrics
    aggregator.assert_metric('alice.age', value=25)
    aggregator.assert_metric('bob.age', value=20)

    # test optional metrics
    optional_metrics = (variables.OPTIONAL_REPLICATION_METRICS + variables.OPTIONAL_INNODB_VARS +
                        variables.OPTIONAL_STATUS_VARS + variables.OPTIONAL_STATUS_VARS_5_6_6)
    _test_optional_metrics(aggregator, optional_metrics, 1)

    # Raises when coverage < 100%
    aggregator.assert_all_metrics_covered()
    aggregator.assert_all_metrics_covered()


def test_check_ssl(aggregator, check, openldap_server, instance_ssl):
    tags = ["url:{}".format(instance_ssl["url"]), "test:integration"]
    # Should fail certificate verification
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance_ssl)
    aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags)
    instance_ssl["ssl_verify"] = False
    # Should work now
    check.check(instance_ssl)
    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)


def test_check_connection_failure(aggregator, check, openldap_server, instance):
    instance["url"] = "bad_url"
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    # Should fail to connect
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance)
    aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags)


@pytest.mark.skipif(not Platform.is_linux(), reason='Windows sockets are not file handles')
def test_check_socket(aggregator, check, openldap_server, instance):
    instance["url"] = "ldapi://{}".format(openldap_server)
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    check.check(instance)
    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)
    # Should work now
    check.check(instance_ssl)
    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)


@pytest.mark.usefixtures('dd_environment')
def test_check_connection_failure(aggregator, check, instance):
    instance["url"] = "bad_url"
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    # Should fail to connect
    with pytest.raises(ldap3.core.exceptions.LDAPExceptionError):
        check.check(instance)
    aggregator.assert_service_check("openldap.can_connect", check.CRITICAL, tags=tags)


@pytest.mark.skipif(not Platform.is_linux(), reason='Windows sockets are not file handles')
@pytest.mark.usefixtures('dd_environment')
def test_check_socket(aggregator, check, instance):
    host_socket_path = os.path.join(os.environ['HOST_SOCKET_DIR'], 'ldapi')
    instance["url"] = "ldapi://{}".format(host_socket_path)
    tags = ["url:{}".format(instance["url"]), "test:integration"]
    check.check(instance)
    aggregator.assert_service_check("openldap.can_connect", check.OK, tags=tags)
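# A minimal sketch of the instance shapes the tests above exercise; the
# hostnames, ports and socket path below are illustrative placeholders.
instance = {"url": "ldap://localhost:389"}
instance_ssl = {"url": "ldaps://localhost:636", "ssl_verify": True}
instance_socket = {"url": "ldapi:///var/run/slapd/ldapi"}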