Example #1
0
    def test_parse_subsystem(self):
        """Check ``DockerUtil._parse_subsystem`` against known cgroup layouts.

        Each case pairs a three-item list (hierarchy id, subsystem name,
        cgroup path -- presumably one ``/proc/<pid>/cgroup`` line already
        split on ``:``; confirm against the parser) with the value the parser
        is expected to return for it.
        """
        lines = [
            # (line, expected_result)
            (
                # Kubernetes < 1.6
                ['10', 'memory', '/2ea504688cad325b9105f183b0d7831266a05f95b513c7327a6e9989ce8a450a'],
                '2ea504688cad325b9105f183b0d7831266a05f95b513c7327a6e9989ce8a450a'
            ), (
                # New CoreOS / most systems
                ['10', 'memory', '/docker/2ea504688cad325b9105f183b0d7831266a05f95b513c7327a6e9989ce8a450a'],
                'docker/2ea504688cad325b9105f183b0d7831266a05f95b513c7327a6e9989ce8a450a'
            ), (
                # Unidentified legacy system?
                ['10', 'memory', '2ea504688cad325b9105f183b0d7831266a05f95b513c7327a6e9989ce8a450a'],
                '2ea504688cad325b9105f183b0d7831266a05f95b513c7327a6e9989ce8a450a'
            ), (
                # Rancher
                ['10', 'memory', '/docker/864daa0a0b19aa4703231b6c76f85c6f369b2452a5a7f777f0c9101c0fd5772a/docker/3bac629503293d1bb61e74f3e25b6c525f0c262f22974634c5d6988bb4b07927'],
                'docker/3bac629503293d1bb61e74f3e25b6c525f0c262f22974634c5d6988bb4b07927'
            ), (
                # Legacy CoreOS 7xx
                ['7', 'memory', '/system.slice/docker-71116698eb215f2a5819f11ece7ea721f0e8d45169c7484d1cd7812596fad454.scope'],
                'system.slice/docker-71116698eb215f2a5819f11ece7ea721f0e8d45169c7484d1cd7812596fad454.scope'
            ), (
                # Kubernetes >= 1.6 QoS cgroups
                ['7', 'memory', '/kubepods/burstable/poda0f63163-3fa8-11e7-a098-42010a840216/7e071d0086ebe623dcbf3a7e0005f23eb08d7ea4df4bb42075df43c9359ce078'],
                'kubepods/burstable/poda0f63163-3fa8-11e7-a098-42010a840216/7e071d0086ebe623dcbf3a7e0005f23eb08d7ea4df4bb42075df43c9359ce078'
            )
        ]

        du = DockerUtil()
        for line, exp_res in lines:
            # assertEqual is the supported spelling; assertEquals is a
            # deprecated alias.
            self.assertEqual(du._parse_subsystem(line), exp_res)
Example #2
0
def agent_container_inspect():
    """Print the inspection data of the container this process runs in.

    The container ID is taken from the first line of ``/proc/self/cgroup``
    that ends with ``/`` followed by a 64-character ``[a-z0-9]`` identifier.
    The container is then inspected through ``DockerUtil`` and dumped as
    indented JSON, with the value of every ``Config.Env`` entry containing
    ``API_KEY`` replaced by ``redacted``.

    Returns:
        0 when the inspection was printed, 1 when the container ID could not
        be determined or the inspection failed.
    """
    # Self inspection based on cgroups
    # On all platforms, the container ID is the last part of the path.
    REGEX_PATTERN = '(.*/)+([a-z0-9]{64})$'

    dockerutil = DockerUtil()
    cgroup_path = '/proc/self/cgroup'
    container_id = None

    with open(cgroup_path, 'r') as f:
        for cgroup_line in f:
            id_match = re.search(REGEX_PATTERN, cgroup_line)
            if id_match:
                container_id = id_match.group(2)
                break
    if container_id is None:
        print("The container_id could not be found. Refer to the docker log of the container running the agent")
        return 1
    try:
        inspect = dockerutil.inspect_container(container_id)
        env = inspect['Config']['Env']
        for index, entry in enumerate(env):
            if 'API_KEY' in entry:
                # Keep the variable name, hide its value.
                env[index] = '%s=%s' % (entry.split('=', 1)[0], 'redacted')
        print(json.dumps(inspect, indent=4))
        return 0
    except Exception as e:
        print("Could not inspect container: %s" % e)
        # Report the failure instead of falling through and returning None,
        # which is indistinguishable from "no status" for the caller.
        return 1
Example #3
0
    def test_healthcheck(self):
        """Check ``docker.container_health`` honours the whitelist option.

        With two images whitelisted the service check is expected twice; with
        an empty whitelist it is expected not to be emitted at all.
        """
        whitelisted_instance = {
            "url": "unix://var/run/docker.sock",
            "health_service_check_whitelist": ["docker_image:nginx", "docker_image:redis"],
        }
        whitelisted_config = {"init_config": {}, "instances": [whitelisted_instance]}

        DockerUtil().set_docker_settings(whitelisted_config['init_config'], whitelisted_instance)

        self.run_check(whitelisted_config, force_reload=True)
        self.assertServiceCheck('docker.container_health', count=2)

        empty_instance = {
            "url": "unix://var/run/docker.sock",
            "health_service_check_whitelist": [],
        }
        empty_config = {"init_config": {}, "instances": [empty_instance]}

        # Rebuild DockerUtil from scratch with the second configuration.
        DockerUtil._drop()
        DockerUtil(init_config=empty_config['init_config'], instance=empty_instance)

        self.run_check(empty_config, force_reload=True)
        self.assertServiceCheck('docker.container_health', count=0)
Example #4
0
    def test_docker_host_tags_ok(self):
        """Expect no host tags when ``is_swarm`` reports False."""
        du = DockerUtil()
        du._client = mock.MagicMock()
        du.is_swarm = mock.MagicMock(name='is_swarm', return_value=False)

        # NOTE(review): a second DockerUtil() is queried, presumably relying
        # on it being the same shared instance as ``du`` -- confirm.
        host_tags = DockerUtil().get_host_tags()
        self.assertEqual([], host_tags)
Example #5
0
    def init(self):
        """Set up the check from its first configured instance.

        Creates the Docker helpers, resets the internal counters and reads
        tagging, filtering and collection options from ``self.instances[0]``.
        On success ``self.init_success`` is set to True. Any exception is
        logged as critical and reported through ``self.warning`` without
        being re-raised.
        """
        try:
            instance = self.instances[0]

            def enabled(option, default):
                # Instance option normalised to a boolean by _is_affirmative.
                return _is_affirmative(instance.get(option, default))

            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            if Platform.is_k8s():
                self.kubeutil = KubeUtil()

            # The cgroup settings for this host only need to be resolved once.
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = enabled('use_histogram', False)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS),
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS),
            }

            # Filtering is only active when an exclude section is present.
            self._filtering_enabled = bool(instance.get("exclude"))
            if self._filtering_enabled:
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names
            elif instance.get("include"):
                self.log.warning("You must specify an exclude section to enable filtering")

            # Collection toggles
            self.collect_image_stats = enabled('collect_images_stats', False)
            self.collect_container_size = enabled('collect_container_size', False)
            self.collect_events = enabled('collect_events', True)
            self.collect_image_size = enabled('collect_image_size', False)
            self.collect_disk_stats = enabled('collect_disk_stats', False)
            # ECS tags need both the option and an ECS host.
            self.collect_ecs_tags = enabled('ecs_tags', True) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
Example #6
0
    def test_image_name_from_image_repodigests(self):
        """Expect the image name to come from ``RepoDigests`` when ``RepoTags`` is empty."""
        inspected_image = {
            'RepoTags': [],
            'RepoDigests': ['alpine@sha256:4f2d8bbad359e3e6f23c0498e009aaa3e2f31996cbea7269b78f92ee43647811'],
        }

        du = DockerUtil()
        du._client = mock.MagicMock()
        du._client.inspect_image = mock.MagicMock(name='inspect_image', return_value=inspected_image)

        # The container only references its image by sha256 digest.
        co = {'Image': 'sha256:e48e77eee11b6d9ac9fc35a23992b4158355a8ec3fd3725526eba3f467e4b6d9'}
        extracted_name = du.image_name_extractor(co)
        self.assertEqual('alpine', extracted_name)
Example #7
0
 def test_docker_host_metadata_ok(self):
     """Expect the docker version and swarm state in the host metadata."""
     du = DockerUtil()
     mock_version = mock.MagicMock(name='version', return_value={'Version': '1.13.1'})
     du._client = mock.MagicMock()
     du._client.version = mock_version
     du.swarm_node_state = 'inactive'

     expected_metadata = {'docker_version': '1.13.1', 'docker_swarm': 'inactive'}
     self.assertEqual(expected_metadata, du.get_host_metadata())
     # The version endpoint must be queried exactly once.
     mock_version.assert_called_once()
Example #8
0
 def test_docker_host_metadata_invalid_response(self):
     """Expect only the swarm state when the version call returns None."""
     du = DockerUtil()
     mock_version = mock.MagicMock(name='version', return_value=None)
     du._client = mock.MagicMock()
     du._client.version = mock_version
     du.swarm_node_state = 'inactive'

     # NOTE(review): queried through a fresh DockerUtil() call, presumably
     # the same shared instance as ``du`` -- confirm.
     metadata = DockerUtil().get_host_metadata()
     self.assertEqual({'docker_swarm': 'inactive'}, metadata)
     mock_version.assert_called_once()
Example #9
0
    def test_docker_host_tags_swarm_ok(self):
        """Expect the manager role tag when swarm reports ``ControlAvailable``."""
        swarm_info = {'Swarm': {'ControlAvailable' : True}}
        mock_info = mock.MagicMock(name='info', return_value=swarm_info)

        du = DockerUtil()
        du._client = mock.MagicMock()
        du._client.info = mock_info
        du.is_swarm = mock.MagicMock(name='is_swarm', return_value=True)

        host_tags = DockerUtil().get_host_tags()
        self.assertEqual(['docker_swarm_node_role:manager'], host_tags)
        # The info endpoint must be queried exactly once.
        mock_info.assert_called_once()
Example #10
0
    def test_docker_host_metadata_swarm_ok(self):
        """Expect ``docker_swarm: active`` in the metadata when ``is_swarm`` is True."""
        mock_version = mock.MagicMock(name='version', return_value={'Version': '1.13.1'})

        du = DockerUtil()
        du._client = mock.MagicMock()
        du._client.version = mock_version
        du.is_swarm = mock.MagicMock(name='is_swarm', return_value=True)

        expected_metadata = {'docker_version': '1.13.1', 'docker_swarm': 'active'}
        self.assertEqual(expected_metadata, DockerUtil().get_host_metadata())
        mock_version.assert_called_once()
Example #11
0
    def test_auto_inspect(self):
        """Expect ``get_container_tags`` to trigger exactly one container inspection."""
        inspect_result = {'RepoTags': ["redis:3.2"], 'RepoDigests': []}
        mock_inspect = mock.MagicMock(name='inspect_container', return_value=inspect_result)

        du = DockerUtil()
        du._client = mock.MagicMock()
        du._client.inspect_container = mock_inspect

        # Start from an empty cache so the lookup below cannot be served from it.
        dummy = self.NeedLabelsUtil()
        dummy.reset_cache()

        dummy.get_container_tags(cid=CO_ID)
        mock_inspect.assert_called_once()
Example #12
0
    def test_include_filter(self):
        """Check that ``include``/``exclude`` keep redis and drop nginx metrics.

        The instance excludes everything (``.*``) and re-includes
        ``image_name:redis``. Every expected metric must be reported, and
        none of the per-container performance metrics may carry the nginx
        container tags.
        """
        expected_metrics = [
            ('docker.containers.running', ['docker_image:nginx:latest', 'image_name:nginx', 'image_tag:latest']),
            ('docker.containers.running', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']),
            ('docker.containers.stopped', ['docker_image:nginx:latest', 'image_name:nginx', 'image_tag:latest']),
            ('docker.containers.stopped', ['docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']),
            ('docker.cpu.system', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']),
            ('docker.cpu.user', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']),
            ('docker.image.size', ['image_name:redis', 'image_tag:latest']),
            ('docker.image.size', ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size', ['image_name:redis', 'image_tag:latest']),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.io.read_bytes', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']),
            ('docker.io.write_bytes', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']),
            ('docker.mem.cache', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']),
            ('docker.mem.rss', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']),
            ('docker.net.bytes_rcvd', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest']),
            ('docker.net.bytes_sent', ['container_name:test-new-redis-latest', 'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest'])
        ]
        config = {
            "init_config": {},
            "instances": [{
                "url": "unix://var/run/docker.sock",
                "include": ["image_name:redis"],
                "exclude": [".*"],
                "collect_images_stats": True,
                "collect_image_size": True,
            },
            ],
        }
        # Rebuild DockerUtil with this test's configuration.
        DockerUtil._drop()
        DockerUtil(init_config=config['init_config'], instance=config['instances'][0])

        self.run_check_twice(config, force_reload=True)

        for mname, tags in expected_metrics:
            self.assertMetric(mname, tags=tags, count=1, at_least=1)

        perf_metrics = [
            "docker.cpu.system",
            "docker.cpu.user",
            "docker.io.read_bytes",
            "docker.io.write_bytes",
            "docker.mem.cache",
            "docker.mem.rss",
            "docker.net.bytes_rcvd",
            "docker.net.bytes_sent"
        ]

        nginx_tags = ['container_name:test-new-nginx-latest', 'docker_image:nginx:latest', 'image_name:nginx', 'image_tag:latest']
        for perf_metric in perf_metrics:
            # Assert on the metric of THIS iteration; the previous code reused
            # the stale ``mname`` left over from the loop above, so only the
            # last expected metric was ever checked.
            self.assertMetric(perf_metric, tags=nginx_tags, count=0)
Example #13
0
def print_containers():
    """Print a short listing (ID, image, name) of the containers Docker reports.

    Containers are fetched through ``DockerUtil().client.containers()``. The
    ID is shortened to its first 12 characters and only the first extracted
    container name is shown.
    """
    dockerutil = DockerUtil()
    containers = dockerutil.client.containers()

    print("\nContainers info:\n")
    print("Number of containers found: %s" % len(containers))

    for co in containers:
        summary = (
            'ID: %s' % co.get('Id')[:12],
            'image: %s' % dockerutil.image_name_extractor(co),
            'name: %s' % dockerutil.container_name_extractor(co)[0],
        )
        print("\t- %s %s %s" % summary)

    print('\n')
Example #14
0
    def test_image_name_from_image_repotags(self):
        """Expect the image name to come from ``RepoTags`` and to be cached.

        The second extraction for the same container must not call
        ``inspect_image`` again.
        """
        sha = 'sha256:e48e77eee11b6d9ac9fc35a23992b4158355a8ec3fd3725526eba3f467e4b6c9'
        co = {'Image': sha}
        inspected_image = {'RepoTags': ["redis:3.2"], 'RepoDigests': []}
        mock_img = mock.MagicMock(name='inspect_image', return_value=inspected_image)

        du = DockerUtil()
        du._client = mock.MagicMock()
        du._client.inspect_image = mock_img

        first_lookup = DockerUtil().image_name_extractor(co)
        self.assertEqual('redis:3.2', first_lookup)
        mock_img.assert_called_once_with(sha)

        # Make sure the cache is used instead of calling inspect_image again
        DockerUtil().image_name_extractor(co)
        mock_img.assert_called_once()
Example #15
0
 def test_image_tags_extraction(self):
     """Check ``DockerUtil.image_tag_extractor`` for image names (key 0) and tags (key 1)."""
     cases = [
         # ({'Image': image_name}, [expected_image_name, expected_image_tag])
         ({'Image': 'nginx:latest'}, [['nginx'], ['latest']]),
         ({'Image': 'localhost/nginx:latest'}, [['localhost/nginx'], ['latest']]),
         ({'Image': 'localhost:5000/nginx:latest'}, [['localhost:5000/nginx'], ['latest']]),
         ({'RepoTags': ['redis:latest']}, [['redis'], ['latest']]),
         ({'RepoTags': ['localhost/redis:latest']}, [['localhost/redis'], ['latest']]),
         ({'RepoTags': ['localhost:5000/redis:latest']}, [['localhost:5000/redis'], ['latest']]),
         ({'RepoTags': ['localhost:5000/redis:latest', 'localhost:5000/redis:v1.1']}, [['localhost:5000/redis'], ['latest', 'v1.1']]),
     ]
     for entity, (expected_names, expected_tags) in cases:
         # Results are compared order-insensitively.
         extracted_names = DockerUtil.image_tag_extractor(entity, 0)
         self.assertEqual(sorted(extracted_names), sorted(expected_names))
         extracted_tags = DockerUtil.image_tag_extractor(entity, 1)
         self.assertEqual(sorted(extracted_tags), sorted(expected_tags))
Example #16
0
class TestProxy(AsyncTestCase):
    """Integration test sending an agent transaction through a proxy container.

    ``setUp`` pulls and starts ``CONTAINER_TO_RUN`` under ``CONTAINER_NAME``
    with the proxy port published; ``tearDown`` force-removes that container.
    """

    @attr(requires='core_integration')
    def test_proxy(self):
        """Flush one transaction and check it went through the proxy.

        The proxy access log must contain a ``CONNECT`` entry and the
        transaction manager must have recorded exactly one endpoint error.
        """
        config = {
            "endpoints": {"https://app.datadoghq.com": ["foo"]},
            "proxy_settings": {
                "host": "localhost",
                "port": PROXY_PORT,
                "user": None,
                "password": None
            }
        }

        app = Application()
        app.skip_ssl_validation = True
        app._agentConfig = config

        trManager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE, THROTTLING_DELAY)
        trManager._flush_without_ioloop = True  # Use blocking API to emulate tornado ioloop
        CustomAgentTransaction.set_tr_manager(trManager)
        app.use_simple_http_client = False  # We need proxy capabilities
        app.agent_dns_caching = False
        # _test is the instance of this class. It is needed to call the method stop() and deal with the asynchronous
        # calls as described here : http://www.tornadoweb.org/en/stable/testing.html
        CustomAgentTransaction._test = self
        try:
            CustomAgentTransaction.set_application(app)
            CustomAgentTransaction.set_endpoints(config['endpoints'])

            CustomAgentTransaction('body', {}, "")  # Create and flush the transaction
            self.wait()
        finally:
            # Always drop the back-reference to this test case, even when the
            # flush or the wait raises, so it does not linger on the class.
            del CustomAgentTransaction._test

        access_log = self.docker_client.exec_start(
            self.docker_client.exec_create(CONTAINER_NAME, 'cat /var/log/squid/access.log')['Id'])
        self.assertIn("CONNECT", access_log)  # There should be an entry in the proxy access log
        self.assertEqual(len(trManager._endpoints_errors), 1)  # There should be an error since we gave a bogus api_key

    def setUp(self):
        """Pull, create and start the proxy container, then wait until it is ready."""
        super(TestProxy, self).setUp()
        self.docker_client = DockerUtil().client

        self.docker_client.pull(CONTAINER_TO_RUN)

        self.container = self.docker_client.create_container(CONTAINER_TO_RUN, detach=True, name=CONTAINER_NAME,
            ports=[PROXY_PORT], host_config=self.docker_client.create_host_config(port_bindings={3128: PROXY_PORT}))
        log.info("Starting container: {0}".format(CONTAINER_TO_RUN))
        self.docker_client.start(CONTAINER_NAME)
        for line in self.docker_client.logs(CONTAINER_NAME, stdout=True, stream=True):
            if "Accepting HTTP Socket connections" in line:
                break  # Wait for the container to properly start, otherwise we get 'Proxy CONNECT aborted'

    def tearDown(self):
        """Force-remove the proxy container."""
        log.info("Stopping container: {0}".format(CONTAINER_TO_RUN))
        self.docker_client.remove_container(CONTAINER_NAME, force=True)
        super(TestProxy, self).tearDown()
Example #17
0
    def _is_container_excluded(self, container):
        """Tell whether ``container`` was filtered out by the exclude/include rules.

        The lookup uses the first name returned by
        ``DockerUtil.container_name_extractor`` and relies on
        ``self._filtered_containers``, so ``_filter_containers`` has to run
        before this method.
        """
        extracted_names = DockerUtil.container_name_extractor(container)
        return extracted_names[0] in self._filtered_containers
Example #18
0
    def __init__(self, instance=None):
        """Build the kubelet, cAdvisor and API server URLs.

        When ``instance`` is None the first instance of the kubernetes check
        configuration file is used. If that file cannot be read or parsed,
        an error is logged and default settings apply.
        """
        self.docker_util = DockerUtil()
        if instance is None:
            # Start from defaults; replaced only if the config file loads.
            instance = {}
            try:
                instance = check_yaml(get_conf_path(KUBERNETES_CHECK_NAME))['instances'][0]
            except IOError as ex:
                # kubernetes.yaml was not found
                log.error(ex.message)
            except Exception:
                log.error('Kubernetes configuration file is invalid. '
                          'Trying connecting to kubelet with default settings anyway...')

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self.host = instance.get("host") or self.docker_util.get_hostname()
        # Both are resolved lazily.
        self._node_ip = None
        self._node_name = None
        self.host_name = os.environ.get('HOSTNAME')

        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.kubelet_port = instance.get('kubelet_port', KubeUtil.DEFAULT_KUBELET_PORT)

        self.kubelet_api_url = '%s://%s:%d' % (self.method, self.host, self.kubelet_port)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.host, self.cadvisor_port)
        master_host = os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME
        self.kubernetes_api_url = 'https://%s/api/v1' % (master_host)

        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url, 'healthz')

        # Timestamp of the latest k8s event collected and posted.
        # It defaults to 0, but the TTL for k8s events is one hour anyway.
        self.last_event_collection_ts = defaultdict(int)
Example #19
0
    def _report_performance_metrics(self, containers_by_id):
        """Report cgroup and network metrics for every running, non-excluded container.

        Containers without a ``_proc_root`` entry get no network metrics.
        Their names are collected and reported in one message at the end:
        as a check warning, or at debug level on Kubernetes.
        """
        missing_proc_root = []

        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue
            if not self._is_container_running(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)

            try:
                self._report_cgroup_metrics(container, tags)
                if "_proc_root" in container:
                    self._report_net_metrics(container, tags)
                else:
                    missing_proc_root.append(DockerUtil.container_name_extractor(container)[0])
            except BogusPIDException as e:
                self.log.warning('Unable to report cgroup metrics: %s', e)

        if not missing_proc_root:
            return

        message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
            ", ".join(missing_proc_root))
        if Platform.is_k8s():
            # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
            self.log.debug(message)
        else:
            self.warning(message)
Example #20
0
class TestDockerutil(unittest.TestCase):
    def setUp(self):
        """Give every test a ``DockerUtil`` instance."""
        self.dockerutil = DockerUtil()

    @mock.patch("utils.dockerutil.DockerUtil.client")
    def test_get_events(self, mocked_client):
        """Expect well-formed events to be returned and malformed ones dropped."""
        stop_event = {"status": "stop", "id": "1234567890", "from": "1234567890", "time": 1423247867}
        mocked_client.events.return_value = [stop_event]
        collected, _ = self.dockerutil.get_events()
        self.assertEqual(len(collected), 1)

        # bug in dockerpy, we should be resilient
        malformed_event = u"an error from Docker API here"
        mocked_client.events.return_value = [malformed_event]
        collected, _ = self.dockerutil.get_events()
        self.assertEqual(len(list(collected)), 0)
    def test_event_attributes_tag(self):
        """Check that configured event attributes are turned into event tags.

        A container with entrypoint ``/bin/false`` is started and left to
        exit. The single event tagged with its container name must carry
        ``exitCode:1`` and must not carry ``name:test-exit-fail``.
        """
        from time import sleep

        self.docker_client = DockerUtil().client
        instance = {
            "url": "unix://var/run/docker.sock",
            "event_attributes_as_tags": ["exitCode", "name"],
        }
        config = {"init_config": {}, "instances": [instance]}

        DockerUtil().set_docker_settings(config['init_config'], instance)

        container_fail = self.docker_client.create_container(
            "nginx:latest", detach=True, name='event-tags-test', entrypoint='/bin/false')
        log.debug('start nginx:latest with entrypoint /bin/false')
        self.docker_client.start(container_fail)
        exit_status = self.docker_client.wait(container_fail, 1)
        log.debug('container exited with %s' % exit_status)
        # Wait 1 second after exit so the event will be picked up
        sleep(1)
        self.run_check(config, force_reload=True)
        self.docker_client.remove_container(container_fail)

        # Previous tests might have left unprocessed events, to be ignored
        filtered_events = [
            event for event in self.events
            if 'container_name:event-tags-test' in event.get('tags', [])
        ]

        self.assertEqual(len(filtered_events), 1)
        event_tags = filtered_events[0]["tags"]
        self.assertIn("exitCode:1", event_tags)
        self.assertNotIn("name:test-exit-fail", event_tags)
Example #22
0
    def __init__(self):
        """Build the cAdvisor metrics, kubelet pod list and kubelet health URLs.

        Settings come from the first instance of the kubernetes check
        configuration file. If that file cannot be read or parsed, an error
        is logged and default settings apply.
        """
        self.docker_util = DockerUtil()

        # Start from defaults; replaced only if the config file loads.
        instance = {}
        try:
            instance = check_yaml(get_conf_path(KUBERNETES_CHECK_NAME))['instances'][0]
        except IOError as ex:
            # kubernetes.yaml was not found
            log.error(ex.message)
        except Exception:
            log.error('Kubernetes configuration file is invalid. '
                      'Trying connecting to kubelet with default settings anyway...')

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self.host = instance.get("host") or self.docker_util.get_hostname()

        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.kubelet_port = instance.get('kubelet_port', KubeUtil.DEFAULT_KUBELET_PORT)

        cadvisor_base = '%s://%s:%d' % (self.method, self.host, self.cadvisor_port)
        self.metrics_url = urljoin(cadvisor_base, KubeUtil.METRICS_PATH)

        kubelet_base = '%s://%s:%d' % (self.method, self.host, self.kubelet_port)
        self.pods_list_url = urljoin(kubelet_base, KubeUtil.PODS_LIST_PATH)

        self.kube_health_url = '%s://%s:%d/healthz' % (self.method, self.host, self.kubelet_port)
Example #23
0
    def _format_events(self, aggregated_events, containers_by_id):
        """Turn Docker events grouped by image into a list of Datadog events.

        For each image up to two events are created through
        ``_create_dd_event``: a ``Low`` priority one for ``exec_create:`` and
        ``exec_start:`` events, and a ``Normal`` priority one for the rest.
        Falsy results from ``_create_dd_event`` are skipped.
        """
        events = []
        for image_name, event_group in aggregated_events.iteritems():
            container_tags = set()
            exec_events = []
            other_events = []

            for event in event_group:
                event_id = event['id']
                # Fall back to a truncated ID when the container is unknown.
                # NOTE(review): this keeps 11 characters, whereas
                # print_containers shortens IDs to 12 -- confirm intent.
                container_name = event_id[:11]

                if event_id in containers_by_id:
                    cont = containers_by_id[event_id]
                    container_name = DockerUtil.container_name_extractor(cont)[0]
                    container_tags.update(self._get_tags(cont, PERFORMANCE))
                    container_tags.add('container_name:%s' % container_name)

                # health checks generate tons of these so we treat them separately and lower their priority
                if event['status'].startswith(('exec_create:', 'exec_start:')):
                    exec_events.append((event, container_name))
                else:
                    other_events.append((event, container_name))

            for priority, grouped in (('Low', exec_events), ('Normal', other_events)):
                dd_event = self._create_dd_event(grouped, image_name, container_tags, priority=priority)
                if dd_event:
                    events.append(dd_event)

        return events
Example #24
0
    def setUp(self):
        """Pull missing images, then create and start one container per entry of CONTAINERS_TO_RUN.

        Containers are named ``test-new-<image>`` with ``:`` replaced by
        ``-``. The ``nginx:latest`` container additionally gets a memory
        setting and two labels.
        """
        self.docker_client = DockerUtil().client

        for c in CONTAINERS_TO_RUN:
            repository = c.split(":")[0]
            local_tags = []
            for image in self.docker_client.images(repository):
                first_tag = image["RepoTags"][0]
                if first_tag.startswith(c):
                    local_tags.append(first_tag)
            # Only pull when no local image matches.
            if not local_tags:
                for line in self.docker_client.pull(c, stream=True):
                    print(line)

        self.containers = []
        for c in CONTAINERS_TO_RUN:
            name = "test-new-{0}".format(c.replace(":", "-"))
            if c == "nginx:latest":
                # 137438953472 == 2**37
                host_config = {"Memory": 137438953472}
                labels = {"label1": "nginx", "foo": "bar"}
            else:
                host_config = None
                labels = None

            self.containers.append(self.docker_client.create_container(
                c, detach=True, name=name, host_config=host_config, labels=labels
            ))

        for created in self.containers:
            log.info("Starting container: {0}".format(created))
            self.docker_client.start(created)
Example #25
0
    def _get_and_count_containers(self, custom_cgroups=False):
        """List the containers from the API, count them per tag set and filter them.

        Emits the ``SERVICE_CHECK_NAME`` service check (CRITICAL when listing
        fails, OK otherwise) and the ``docker.containers.running`` and
        ``docker.containers.stopped`` gauges.

        Args:
            custom_cgroups: when true, each kept container is inspected and
                its ``State.Pid`` stored under the ``_pid`` key.

        Returns:
            dict mapping container ``Id`` to the container, excluded
            containers left out.

        Raises:
            Exception: when the container list cannot be retrieved.
        """
        # Querying the size of containers is slow, we don't do it at each run
        must_query_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE

        running_by_tags = Counter()
        total_by_tags = Counter()

        try:
            containers = self.docker_client.containers(all=True, size=must_query_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               message=message)
            raise Exception(message)

        # Only reached when listing succeeded.
        self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

        # Filter containers according to the exclude/include rules
        self._filter_containers(containers)

        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            # Excluded containers are counted too, so count before filtering.
            tag_key = tuple(sorted(self._get_tags(container, CONTAINER)))
            total_by_tags[tag_key] += 1
            if self._is_container_running(container):
                running_by_tags[tag_key] += 1

            # Check if the container is included/excluded via its tags
            if self._is_container_excluded(container):
                self.log.debug("Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            if not custom_cgroups:
                continue

            # grab pid via API if custom cgroups - otherwise we won't find process when
            # crawling for pids.
            try:
                inspect_dict = self.docker_client.inspect_container(container_name)
                container['_pid'] = inspect_dict['State']['Pid']
            except Exception as e:
                self.log.debug("Unable to inspect Docker container: %s", e)

        for tags, count in running_by_tags.iteritems():
            self.gauge("docker.containers.running", count, tags=list(tags))

        for tags, count in total_by_tags.iteritems():
            stopped_count = count - running_by_tags[tags]
            self.gauge("docker.containers.stopped", stopped_count, tags=list(tags))

        return containers_by_id
Example #26
0
    def __init__(self, agentConfig):
        """Create the config store and the platform helpers used by this backend.

        If the config store cannot be created, the error is logged,
        ``agentConfig['sd_config_backend']`` is set to None and creation is
        retried. A failing ``KubeUtil()`` is logged and leaves
        ``self.kubeutil`` set to None.
        """
        try:
            store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            store = get_config_store(agentConfig=agentConfig)
        self.config_store = store

        docker_util = DockerUtil(config_store=store)
        self.dockerutil = docker_util
        self.docker_client = docker_util.client

        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                self.kubeutil = None
                log.error("Couldn't instantiate the kubernetes client, "
                    "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

        # Nomad takes precedence over ECS; at most one of the two is created.
        if Platform.is_nomad():
            self.nomadutil = NomadUtil()
        elif Platform.is_ecs_instance():
            self.ecsutil = ECSUtil()

        # Maps a variable name to the method that produces its value.
        self.VAR_MAPPING = dict(
            host=self._get_host_address,
            port=self._get_port,
            tags=self._get_additional_tags,
        )

        AbstractSDBackend.__init__(self, agentConfig)
Example #27
0
 def test_image_tags_extraction(self):
     """Check ``DockerUtil.image_tag_extractor`` for image names (key 0) and tags (key 1)."""
     entities = [
         # ({'Image': image_name}, [expected_image_name, expected_image_tag])
         ({"Image": "nginx:latest"}, [["nginx"], ["latest"]]),
         ({"Image": "localhost/nginx:latest"}, [["localhost/nginx"], ["latest"]]),
         ({"Image": "localhost:5000/nginx:latest"}, [["localhost:5000/nginx"], ["latest"]]),
         ({"RepoTags": ["redis:latest"]}, [["redis"], ["latest"]]),
         ({"RepoTags": ["localhost/redis:latest"]}, [["localhost/redis"], ["latest"]]),
         ({"RepoTags": ["localhost:5000/redis:latest"]}, [["localhost:5000/redis"], ["latest"]]),
         (
             {"RepoTags": ["localhost:5000/redis:latest", "localhost:5000/redis:v1.1"]},
             [["localhost:5000/redis"], ["latest", "v1.1"]],
         ),
     ]
     for entity, expectations in entities:
         # Key 0 selects the image name(s), key 1 the image tag(s).
         for key in (0, 1):
             extracted = DockerUtil.image_tag_extractor(entity, key)
             self.assertEqual(sorted(extracted), sorted(expectations[key]))
Example #28
0
    def test_histogram(self):
        """Run the check with `use_histogram` on and verify the suffixed memory metrics."""

        def container_tags(image):
            # Tags expected on per-container metrics for `<image>:latest`.
            return ['docker_image:%s:latest' % image, 'image_name:%s' % image, 'image_tag:latest']

        def image_tags(image):
            # Tags expected on per-image metrics for `<image>:latest`.
            return ['image_name:%s' % image, 'image_tag:latest']

        expected_metrics = [
            ('docker.containers.running', container_tags('nginx')),
            ('docker.containers.running', container_tags('redis')),
            ('docker.containers.stopped', container_tags('redis')),
            ('docker.containers.stopped', container_tags('nginx')),
            ('docker.image.size', image_tags('redis')),
            ('docker.image.size', image_tags('nginx')),
            ('docker.image.virtual_size', image_tags('nginx')),
            ('docker.image.virtual_size', image_tags('redis')),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
        ]

        # Each of these is asserted once per histogram suffix.
        histo_metrics = [
            ('docker.mem.cache', container_tags('nginx')),
            ('docker.mem.cache', container_tags('redis')),
            ('docker.mem.rss', container_tags('nginx')),
            ('docker.mem.rss', container_tags('redis')),
            ('docker.mem.limit', container_tags('nginx')),
            ('docker.mem.in_use', container_tags('nginx')),
        ]
        metric_suffix = ["count", "avg", "median", "max", "95percentile"]

        instance = {
            "url": "unix://var/run/docker.sock",
            "collect_image_size": True,
            "collect_images_stats": True,
            "use_histogram": True,
        }
        config = {"init_config": {}, "instances": [instance]}

        # Reset DockerUtil and rebuild it from this test's settings.
        DockerUtil._drop()
        DockerUtil(init_config=config['init_config'], instance=instance)

        self.run_check(config, force_reload=True)
        for metric_name, metric_tags in expected_metrics:
            self.assertMetric(metric_name, tags=metric_tags, count=1, at_least=1)

        for metric_name, metric_tags in histo_metrics:
            for suffix in metric_suffix:
                self.assertMetric(metric_name + "." + suffix, tags=metric_tags, at_least=1)
Example #29
0
    def test_tags_options(self):
        """Run the check with `container_command` as container and performance tag."""
        nginx_cmd = "container_command:nginx -g 'daemon off;'"
        redis_cmd = 'container_command:docker-entrypoint.sh redis-server'

        expected_metrics = [
            ('docker.containers.running', [nginx_cmd]),
            ('docker.containers.running', [redis_cmd]),
            ('docker.containers.stopped', [nginx_cmd]),
            ('docker.containers.stopped', [redis_cmd]),
            ('docker.cpu.system', [nginx_cmd]),
            ('docker.cpu.system', [redis_cmd]),
            ('docker.cpu.user', [redis_cmd]),
            ('docker.cpu.user', [nginx_cmd]),
            ('docker.image.size', ['image_name:redis', 'image_tag:latest']),
            ('docker.image.size', ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size', ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size', ['image_name:redis', 'image_tag:latest']),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.io.read_bytes', [nginx_cmd]),
            ('docker.io.read_bytes', [redis_cmd]),
            ('docker.io.write_bytes', [redis_cmd]),
            ('docker.io.write_bytes', [nginx_cmd]),
            ('docker.mem.cache', [nginx_cmd]),
            ('docker.mem.cache', [redis_cmd]),
            ('docker.mem.rss', [redis_cmd]),
            ('docker.mem.rss', [nginx_cmd]),
            ('docker.net.bytes_rcvd', [redis_cmd]),
            ('docker.net.bytes_rcvd', [nginx_cmd]),
            ('docker.net.bytes_sent', [nginx_cmd]),
            ('docker.net.bytes_sent', [redis_cmd]),
        ]

        instance = {
            "url": "unix://var/run/docker.sock",
            "performance_tags": ["container_command"],
            "container_tags": ["container_command"],
            "collect_images_stats": True,
            "collect_image_size": True,
        }
        config = {"init_config": {}, "instances": [instance]}

        # Reset DockerUtil and rebuild it from this test's settings.
        DockerUtil._drop()
        DockerUtil(init_config=config['init_config'], instance=instance)

        self.run_check_twice(config, force_reload=True)
        for metric_name, metric_tags in expected_metrics:
            self.assertMetric(metric_name, tags=metric_tags, count=1, at_least=1)
Example #30
0
    def _get_cgroup_file(self, cgroup, container_id, filename):
        """Return the cgroup file path for `filename` of the given container.

        The path comes from the pattern returned by
        `DockerUtil.find_cgroup_filename_pattern`, filled in with the
        cgroup's mountpoint, the container id and the file name.
        """
        # Resolve the mountpoint first so an unknown cgroup fails before the pattern lookup.
        substitutions = dict(
            mountpoint=self._mountpoints[cgroup],
            id=container_id,
            file=filename,
        )
        pattern = DockerUtil.find_cgroup_filename_pattern(self._mountpoints, container_id)
        return pattern % substitutions
Example #31
0
 def is_ecs_instance():
     """Return whatever `DockerUtil().is_ecs()` reports."""
     # Imported at call time rather than at module import time.
     from utils.dockerutil import DockerUtil
     docker_util = DockerUtil()
     return docker_util.is_ecs()
Example #32
0
 def is_rancher():
     """Return whatever `DockerUtil().is_rancher()` reports."""
     # Imported at call time rather than at module import time.
     from utils.dockerutil import DockerUtil
     docker_util = DockerUtil()
     return docker_util.is_rancher()
Example #33
0
]

# Default tag names for performance metrics (presumably used when the
# instance sets no `performance_tags` -- confirm against the check code).
DEFAULT_PERFORMANCE_TAGS = [
    "container_name",
    "docker_image",
    "image_name",
    "image_tag",
]

# Default tag names for image metrics.
DEFAULT_IMAGE_TAGS = ['image_name', 'image_tag']

# Labels turned into tags by default.
DEFAULT_LABELS_AS_TAGS = [SWARM_SVC_LABEL]

# Maps a tag name to a callable that takes an entity dict `c` and returns
# the values for that tag. The inline lambdas return one-element lists; the
# DockerUtil extractors are defined elsewhere.
TAG_EXTRACTORS = {
    "docker_image": lambda c: [c["Image"]],
    "image_name": lambda c: DockerUtil.image_tag_extractor(c, 0),
    "image_tag": lambda c: DockerUtil.image_tag_extractor(c, 1),
    "container_command": lambda c: [c["Command"]],
    "container_name": DockerUtil.container_name_extractor,
    "container_id": lambda c: [c["Id"]],
}

# String identifiers (presumably tag/filter categories -- confirm at call sites).
CONTAINER = "container"
PERFORMANCE = "performance"
FILTERED = "filtered"
HEALTHCHECK = "healthcheck"
IMAGE = "image"

# NOTE(review): presumably the default port of the ECS agent introspection API -- confirm.
ECS_INTROSPECT_DEFAULT_PORT = 51678

# NOTE(review): presumably the event types reported with an error alert type -- confirm.
ERROR_ALERT_TYPE = ['oom', 'kill']
    def test_histogram(self):
        """Run the check with `use_histogram` on and verify the suffixed memory metrics."""

        def container_tags(image):
            # Tags expected on per-container metrics for `<image>:latest`.
            return [
                'docker_image:%s:latest' % image,
                'image_name:%s' % image,
                'image_tag:latest',
            ]

        def image_tags(image):
            # Tags expected on per-image metrics for `<image>:latest`.
            return ['image_name:%s' % image, 'image_tag:latest']

        expected_metrics = [
            ('docker.containers.running', container_tags('nginx')),
            ('docker.containers.running', container_tags('redis')),
            ('docker.containers.stopped', container_tags('redis')),
            ('docker.containers.stopped', container_tags('nginx')),
            ('docker.image.size', image_tags('redis')),
            ('docker.image.size', image_tags('nginx')),
            ('docker.image.virtual_size', image_tags('nginx')),
            ('docker.image.virtual_size', image_tags('redis')),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
        ]

        # Each of these is asserted once per histogram suffix.
        histo_metrics = [
            ('docker.mem.cache', container_tags('nginx')),
            ('docker.mem.cache', container_tags('redis')),
            ('docker.mem.rss', container_tags('nginx')),
            ('docker.mem.rss', container_tags('redis')),
            ('docker.mem.limit', container_tags('nginx')),
            ('docker.mem.in_use', container_tags('nginx')),
        ]
        metric_suffix = ["count", "avg", "median", "max", "95percentile"]

        instance = {
            "url": "unix://var/run/docker.sock",
            "collect_image_size": True,
            "collect_images_stats": True,
            "use_histogram": True,
        }
        config = {"init_config": {}, "instances": [instance]}

        # Reset DockerUtil and rebuild it from this test's settings.
        DockerUtil._drop()
        DockerUtil(init_config=config['init_config'], instance=instance)

        self.run_check(config, force_reload=True)
        for metric_name, metric_tags in expected_metrics:
            self.assertMetric(metric_name, tags=metric_tags, count=1, at_least=1)

        for metric_name, metric_tags in histo_metrics:
            for suffix in metric_suffix:
                self.assertMetric(metric_name + "." + suffix,
                                  tags=metric_tags,
                                  at_least=1)
    def test_tags_options(self):
        """Run the check with `container_command` as container and performance tag."""
        nginx_cmd = "container_command:nginx -g 'daemon off;'"
        redis_cmd = 'container_command:docker-entrypoint.sh redis-server'
        # Network metrics carry the network name in addition to the command.
        bridge = 'docker_network:bridge'

        expected_metrics = [
            ('docker.containers.running', [nginx_cmd]),
            ('docker.containers.running', [redis_cmd]),
            ('docker.containers.stopped', [nginx_cmd]),
            ('docker.containers.stopped', [redis_cmd]),
            ('docker.cpu.system', [nginx_cmd]),
            ('docker.cpu.system', [redis_cmd]),
            ('docker.cpu.user', [redis_cmd]),
            ('docker.cpu.user', [nginx_cmd]),
            ('docker.image.size', ['image_name:redis', 'image_tag:latest']),
            ('docker.image.size', ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size',
             ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size',
             ['image_name:redis', 'image_tag:latest']),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.io.read_bytes', [nginx_cmd]),
            ('docker.io.read_bytes', [redis_cmd]),
            ('docker.io.write_bytes', [redis_cmd]),
            ('docker.io.write_bytes', [nginx_cmd]),
            ('docker.mem.cache', [nginx_cmd]),
            ('docker.mem.cache', [redis_cmd]),
            ('docker.mem.rss', [redis_cmd]),
            ('docker.mem.rss', [nginx_cmd]),
            ('docker.net.bytes_rcvd', [redis_cmd, bridge]),
            ('docker.net.bytes_rcvd', [nginx_cmd, bridge]),
            ('docker.net.bytes_sent', [nginx_cmd, bridge]),
            ('docker.net.bytes_sent', [redis_cmd, bridge]),
        ]

        instance = {
            "url": "unix://var/run/docker.sock",
            "performance_tags": ["container_command"],
            "container_tags": ["container_command"],
            "collect_images_stats": True,
            "collect_image_size": True,
        }
        config = {"init_config": {}, "instances": [instance]}

        # Reset DockerUtil and rebuild it from this test's settings.
        DockerUtil._drop()
        DockerUtil(init_config=config['init_config'], instance=instance)

        self.run_check_twice(config, force_reload=True)
        for metric_name, metric_tags in expected_metrics:
            self.assertMetric(metric_name, tags=metric_tags, count=1, at_least=1)
class TestCheckDockerDaemon(AgentCheckTest):
    """Basic Test for docker_daemon integration."""
    CHECK_NAME = 'docker_daemon'

    # Mock tests #

    def mock_normal_get_info(self):
        """Return a fake `info` payload with used/available/total for data and metadata."""
        driver_status = [
            ['Data Space Used', '1 GB'],
            ['Data Space Available', '9 GB'],
            ['Data Space Total', '10 GB'],
            ['Metadata Space Used', '1 MB'],
            ['Metadata Space Available', '9 MB'],
            ['Metadata Space Total', '10 MB'],
        ]
        return {'DriverStatus': driver_status}

    def mock_get_info_no_used(self):
        """Return a fake `info` payload without any 'Space Used' entry."""
        driver_status = [
            ['Data Space Available', '9 GB'],
            ['Data Space Total', '10 GB'],
            ['Metadata Space Available', '9 MB'],
            ['Metadata Space Total', '10 MB'],
        ]
        return {'DriverStatus': driver_status}

    def mock_get_info_no_data(self):
        """Return a fake `info` payload with metadata entries only."""
        driver_status = [
            ['Metadata Space Available', '9 MB'],
            ['Metadata Space Total', '10 MB'],
            ['Metadata Space Used', '1 MB'],
        ]
        return {'DriverStatus': driver_status}

    def mock_get_info_invalid_values(self):
        """Return a fake `info` payload whose metadata 'used' exceeds 'total'."""
        driver_status = [
            ['Metadata Space Available', '9 MB'],
            ['Metadata Space Total', '10 MB'],
            ['Metadata Space Used', '11 MB'],
        ]
        return {'DriverStatus': driver_status}

    def mock_get_info_all_zeros(self):
        """Return a fake `info` payload where every data space value is zero."""
        driver_status = [
            ['Data Space Available', '0 MB'],
            ['Data Space Total', '0 GB'],
            ['Data Space Used', '0 KB'],
        ]
        return {'DriverStatus': driver_status}

    @mock.patch('docker.Client.info')
    def test_devicemapper_disk_metrics(self, mock_info):
        """All data and metadata disk metrics are reported from a complete payload"""
        mock_info.return_value = self.mock_normal_get_info()
        self.run_check(MOCK_CONFIG, force_reload=True)

        expected_values = [
            ('docker.data.free', 9e9),
            ('docker.data.used', 1e9),
            ('docker.data.total', 10e9),
            ('docker.data.percent', 10.0),
            ('docker.metadata.free', 9e6),
            ('docker.metadata.used', 1e6),
            ('docker.metadata.total', 10e6),
            ('docker.metadata.percent', 10.0),
        ]
        for metric_name, expected in expected_values:
            self.assertMetric(metric_name, value=expected)

    @mock.patch('docker.Client.info')
    def test_devicemapper_no_used_info(self, mock_info):
        """Without 'used' entries, collection still works and `percent` is still computed"""
        mock_info.return_value = self.mock_get_info_no_used()
        self.run_check(MOCK_CONFIG, force_reload=True)

        expected_values = [
            ('docker.data.free', 9e9),
            ('docker.data.total', 10e9),
            ('docker.data.percent', 10.0),
            ('docker.metadata.free', 9e6),
            ('docker.metadata.total', 10e6),
            ('docker.metadata.percent', 10.0),
        ]
        for metric_name, expected in expected_values:
            self.assertMetric(metric_name, value=expected)

    @mock.patch('docker.Client.info')
    def test_devicemapper_no_data_info(self, mock_info):
        """Without data entries, the metadata metrics are still collected"""
        mock_info.return_value = self.mock_get_info_no_data()
        self.run_check(MOCK_CONFIG, force_reload=True)

        expected_values = [
            ('docker.metadata.free', 9e6),
            ('docker.metadata.total', 10e6),
            ('docker.metadata.percent', 10.0),
        ]
        for metric_name, expected in expected_values:
            self.assertMetric(metric_name, value=expected)

    @mock.patch('docker.Client.info')
    def test_devicemapper_invalid_values(self, mock_info):
        """When 'used' exceeds 'total', 'percent' is based on 'free'+'used' instead of 'total'"""
        mock_info.return_value = self.mock_get_info_invalid_values()
        self.run_check(MOCK_CONFIG, force_reload=True)

        expected_values = [
            ('docker.metadata.free', 9e6),
            ('docker.metadata.used', 11e6),
            ('docker.metadata.total', 10e6),
            # 11 MB used out of 9 MB free + 11 MB used.
            ('docker.metadata.percent', 55),
        ]
        for metric_name, expected in expected_values:
            self.assertMetric(metric_name, value=expected)

    @mock.patch('docker.Client.info')
    def test_devicemapper_all_zeros(self, mock_info):
        """With all-zero values no percentage is reported; the other metrics still are"""
        mock_info.return_value = self.mock_get_info_all_zeros()
        self.run_check(MOCK_CONFIG, force_reload=True)

        # Snapshot the reported metric names before running the assertions.
        reported_names = [entry[0] for entry in self.metrics]
        for metric_name in ('docker.data.free', 'docker.data.used', 'docker.data.total'):
            self.assertMetric(metric_name, value=0)
        self.assertNotIn('docker.data.percent', reported_names)

    # integration tests #

    def setUp(self):
        """Create the "second" network, pull missing images, then create and start the containers."""
        client = DockerUtil().client
        self.docker_client = client

        network = client.create_network("second", driver="bridge")
        self.second_network = network['Id']

        # Pull every image that has no matching local repo tag yet.
        for image in CONTAINERS_TO_RUN:
            repository = image.split(":")[0]
            local_tags = [
                candidate["RepoTags"][0]
                for candidate in client.images(repository)
                if candidate["RepoTags"][0].startswith(image)
            ]
            if not local_tags:
                for line in client.pull(image, stream=True):
                    print(line)

        self.containers = []
        for image in CONTAINERS_TO_RUN:
            is_nginx = image == "nginx:latest"
            name = "test-new-{0}".format(image.replace(":", "-"))
            # Only the nginx container gets a memory limit and labels.
            host_config = {"Memory": 137438953472} if is_nginx else None
            labels = {"label1": "nginx", "foo": "bar"} if is_nginx else None

            cont = client.create_container(image,
                                           detach=True,
                                           name=name,
                                           host_config=host_config,
                                           labels=labels)
            self.containers.append(cont)

            # nginx is additionally attached to the second network.
            if is_nginx:
                client.connect_container_to_network(cont['Id'],
                                                    self.second_network)

        for cont in self.containers:
            log.info("Starting container: {0}".format(cont))
            client.start(cont)

    def tearDown(self):
        """Force-remove the test containers, then remove the second network."""
        client = self.docker_client
        for cont in self.containers:
            log.info("Stopping container: {0}".format(cont))
            client.remove_container(cont, force=True)
        client.remove_network(self.second_network)

    def test_basic_config_single(self):
        """Run the check once with image stats enabled and verify the reported metrics."""

        def count_tags(image):
            # Tags expected on the container count metrics.
            return [
                'docker_image:%s:latest' % image,
                'image_name:%s' % image,
                'image_tag:latest',
            ]

        def image_tags(image):
            # Tags expected on the image metrics.
            return ['image_name:%s' % image, 'image_tag:latest']

        def perf_tags(image):
            # Performance metrics additionally carry the container name.
            return ['container_name:test-new-%s-latest' % image] + count_tags(image)

        expected_metrics = [
            ('docker.containers.running', count_tags('nginx')),
            ('docker.containers.running', count_tags('redis')),
            ('docker.containers.stopped', count_tags('redis')),
            ('docker.containers.stopped', count_tags('nginx')),
            ('docker.image.size', image_tags('nginx')),
            ('docker.image.size', image_tags('redis')),
            ('docker.image.virtual_size', image_tags('nginx')),
            ('docker.image.virtual_size', image_tags('redis')),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.mem.cache', perf_tags('nginx')),
            ('docker.mem.cache', perf_tags('redis')),
            ('docker.mem.rss', perf_tags('nginx')),
            ('docker.mem.rss', perf_tags('redis')),
        ]

        instance = {
            "url": "unix://var/run/docker.sock",
            "collect_image_size": True,
            "collect_images_stats": True
        }
        config = {"init_config": {}, "instances": [instance]}
        DockerUtil().set_docker_settings(config['init_config'], instance)

        self.run_check(config, force_reload=True)
        for metric_name, metric_tags in expected_metrics:
            self.assertMetric(metric_name, tags=metric_tags, count=1, at_least=1)

    def test_basic_config_twice(self):
        """Run the check twice with custom instance tags and verify the reported metrics."""

        def count_tags(image):
            # Tags expected on the container count metrics.
            return [
                'docker_image:%s:latest' % image,
                'image_name:%s' % image,
                'image_tag:latest',
            ]

        def perf_tags(image):
            # Performance metrics additionally carry the container name.
            return ['container_name:test-new-%s-latest' % image] + count_tags(image)

        def net_tags(image):
            # Network metrics additionally carry the network name.
            return perf_tags(image) + ['docker_network:bridge']

        expected_metrics = [
            ('docker.containers.running', count_tags('nginx')),
            ('docker.containers.running', count_tags('redis')),
            ('docker.containers.stopped', count_tags('redis')),
            ('docker.containers.stopped', count_tags('nginx')),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.cpu.system', perf_tags('nginx')),
            ('docker.cpu.system', perf_tags('redis')),
            ('docker.cpu.user', perf_tags('nginx')),
            ('docker.cpu.user', perf_tags('redis')),
            ('docker.io.read_bytes', perf_tags('nginx')),
            ('docker.io.read_bytes', perf_tags('redis')),
            ('docker.io.write_bytes', perf_tags('nginx')),
            ('docker.io.write_bytes', perf_tags('redis')),
            ('docker.mem.cache', perf_tags('nginx')),
            ('docker.mem.cache', perf_tags('redis')),
            ('docker.mem.rss', perf_tags('nginx')),
            ('docker.mem.rss', perf_tags('redis')),
            ('docker.net.bytes_rcvd', net_tags('redis')),
            ('docker.net.bytes_rcvd', net_tags('nginx')),
            ('docker.net.bytes_sent', net_tags('redis')),
            ('docker.net.bytes_sent', net_tags('nginx')),
        ]

        custom_tags = ["extra_tag", "env:testing"]
        instance = {
            "url": "unix://var/run/docker.sock",
            "tags": custom_tags,
            "collect_image_size": True,
            "collect_images_stats": True,
        }
        config = {"init_config": {}, "instances": [instance]}
        DockerUtil().set_docker_settings(config['init_config'], instance)

        self.run_check_twice(config, force_reload=True)
        for metric_name, metric_tags in expected_metrics:
            # The instance-level tags come first, followed by the metric's own tags.
            extra = metric_tags if metric_tags is not None else []
            expected_tags = custom_tags + extra
            self.assertMetric(metric_name, tags=expected_tags, count=1, at_least=1)

    def test_exclude_filter(self):
        """Exclude nginx: only redis gets performance metrics; counts and image metrics cover both."""

        def count_tags(image):
            # Tags expected on the container count metrics.
            return [
                'docker_image:%s:latest' % image,
                'image_name:%s' % image,
                'image_tag:latest',
            ]

        def image_tags(image):
            # Tags expected on the image metrics.
            return ['image_name:%s' % image, 'image_tag:latest']

        def perf_tags(image):
            # Performance metrics additionally carry the container name.
            return ['container_name:test-new-%s-latest' % image] + count_tags(image)

        def net_tags(image):
            # Network metrics additionally carry the network name.
            return perf_tags(image) + ['docker_network:bridge']

        expected_metrics = [
            ('docker.containers.running', count_tags('nginx')),
            ('docker.containers.running', count_tags('redis')),
            ('docker.containers.stopped', count_tags('nginx')),
            ('docker.containers.stopped', count_tags('redis')),
            ('docker.cpu.system', perf_tags('redis')),
            ('docker.cpu.user', perf_tags('redis')),
            ('docker.image.size', image_tags('redis')),
            ('docker.image.size', image_tags('nginx')),
            ('docker.image.virtual_size', image_tags('redis')),
            ('docker.image.virtual_size', image_tags('nginx')),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.io.read_bytes', perf_tags('redis')),
            ('docker.io.write_bytes', perf_tags('redis')),
            ('docker.mem.cache', perf_tags('redis')),
            ('docker.mem.rss', perf_tags('redis')),
            ('docker.net.bytes_rcvd', net_tags('redis')),
            ('docker.net.bytes_sent', net_tags('redis')),
        ]

        instance = {
            "url": "unix://var/run/docker.sock",
            "exclude": ["docker_image:nginx"],
            "collect_images_stats": True,
            "collect_image_size": True,
        }
        config = {"init_config": {}, "instances": [instance]}

        # Reset DockerUtil and rebuild it from this test's settings.
        DockerUtil._drop()
        DockerUtil(init_config=config['init_config'], instance=instance)

        self.run_check_twice(config, force_reload=True)

        for metric_name, metric_tags in expected_metrics:
            self.assertMetric(metric_name, tags=metric_tags, count=1, at_least=1)

        # None of the performance metrics may be reported with the nginx tags.
        perf_metrics = [
            "docker.cpu.system", "docker.cpu.user", "docker.io.read_bytes",
            "docker.io.write_bytes", "docker.mem.cache", "docker.mem.rss",
            "docker.net.bytes_rcvd", "docker.net.bytes_sent"
        ]
        nginx_tags = perf_tags('nginx')
        for metric_name in perf_metrics:
            self.assertMetric(metric_name, tags=nginx_tags, count=0)

    def test_include_filter(self):
        """Check that only the included image reports performance metrics.

        The instance excludes everything (``.*``) and includes
        ``image_name:redis``. Container counts and image metrics are expected
        for both nginx and redis, per-container performance metrics are
        expected for redis only, and none may be reported for nginx.
        """
        expected_metrics = [
            ('docker.containers.running', [
                'docker_image:nginx:latest', 'image_name:nginx',
                'image_tag:latest'
            ]),
            ('docker.containers.running', [
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.containers.stopped', [
                'docker_image:nginx:latest', 'image_name:nginx',
                'image_tag:latest'
            ]),
            ('docker.containers.stopped', [
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.cpu.system', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.cpu.user', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.image.size', ['image_name:redis', 'image_tag:latest']),
            ('docker.image.size', ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size',
             ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size',
             ['image_name:redis', 'image_tag:latest']),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.io.read_bytes', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.io.write_bytes', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.mem.cache', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.mem.rss', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.net.bytes_rcvd', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest', 'docker_network:bridge'
            ]),
            ('docker.net.bytes_sent', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest', 'docker_network:bridge'
            ])
        ]
        config = {
            "init_config": {},
            "instances": [
                {
                    "url": "unix://var/run/docker.sock",
                    "include": ["image_name:redis"],
                    "exclude": [".*"],
                    "collect_images_stats": True,
                    "collect_image_size": True,
                },
            ],
        }
        # Rebuild DockerUtil from this test's configuration before running.
        DockerUtil._drop()
        DockerUtil(init_config=config['init_config'],
                   instance=config['instances'][0])

        self.run_check_twice(config, force_reload=True)

        for mname, tags in expected_metrics:
            self.assertMetric(mname, tags=tags, count=1, at_least=1)

        perf_metrics = [
            "docker.cpu.system", "docker.cpu.user", "docker.io.read_bytes",
            "docker.io.write_bytes", "docker.mem.cache", "docker.mem.rss",
            "docker.net.bytes_rcvd", "docker.net.bytes_sent"
        ]

        nginx_tags = [
            'container_name:test-new-nginx-latest',
            'docker_image:nginx:latest', 'image_name:nginx', 'image_tag:latest'
        ]
        # The loop variable must be the name used in the assertion. Iterating
        # with a different name would keep re-checking the stale ``mname``
        # left over from the loop above and never verify the other metrics.
        for mname in perf_metrics:
            self.assertMetric(mname, tags=nginx_tags, count=0)

    def test_tags_options(self):
        """Check metric tagging when ``container_command`` is the tag option.

        Both ``performance_tags`` and ``container_tags`` are set to
        ``container_command``; container and performance metrics are then
        expected to carry the command tag of the nginx or redis container.
        """
        nginx_cmd = "container_command:nginx -g 'daemon off;'"
        redis_cmd = 'container_command:docker-entrypoint.sh redis-server'
        bridge = 'docker_network:bridge'
        nginx_size = ['image_name:nginx', 'image_tag:latest']
        redis_size = ['image_name:redis', 'image_tag:latest']

        # (metric name, expected tags); None is handed to assertMetric as-is.
        spec = [
            ('docker.containers.running', [nginx_cmd]),
            ('docker.containers.running', [redis_cmd]),
            ('docker.containers.stopped', [nginx_cmd]),
            ('docker.containers.stopped', [redis_cmd]),
            ('docker.cpu.system', [nginx_cmd]),
            ('docker.cpu.system', [redis_cmd]),
            ('docker.cpu.user', [redis_cmd]),
            ('docker.cpu.user', [nginx_cmd]),
            ('docker.image.size', redis_size),
            ('docker.image.size', nginx_size),
            ('docker.image.virtual_size', nginx_size),
            ('docker.image.virtual_size', redis_size),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.io.read_bytes', [nginx_cmd]),
            ('docker.io.read_bytes', [redis_cmd]),
            ('docker.io.write_bytes', [redis_cmd]),
            ('docker.io.write_bytes', [nginx_cmd]),
            ('docker.mem.cache', [nginx_cmd]),
            ('docker.mem.cache', [redis_cmd]),
            ('docker.mem.rss', [redis_cmd]),
            ('docker.mem.rss', [nginx_cmd]),
            ('docker.net.bytes_rcvd', [redis_cmd, bridge]),
            ('docker.net.bytes_rcvd', [nginx_cmd, bridge]),
            ('docker.net.bytes_sent', [nginx_cmd, bridge]),
            ('docker.net.bytes_sent', [redis_cmd, bridge]),
        ]

        init_config = {}
        instance = {
            "url": "unix://var/run/docker.sock",
            "performance_tags": ["container_command"],
            "container_tags": ["container_command"],
            "collect_images_stats": True,
            "collect_image_size": True,
        }
        config = {"init_config": init_config, "instances": [instance]}

        # Rebuild DockerUtil from this test's configuration before running.
        DockerUtil._drop()
        DockerUtil(init_config=init_config, instance=instance)

        self.run_check_twice(config, force_reload=True)
        for metric_name, metric_tags in spec:
            if metric_tags is not None:
                # Give every assertion its own list, as the literals did.
                metric_tags = list(metric_tags)
            self.assertMetric(
                metric_name, tags=metric_tags, count=1, at_least=1)

    def test_set_docker_settings(self):
        """Test a client settings update.

        Checks that the initial ``version`` setting is ``"auto"``, applies a
        TLS-enabled configuration pointing at an HTTPS URL, and verifies that
        the resulting client exposes the CA certificate (``verify``) and the
        client certificate/key pair (``cert``).

        ``reset_docker_settings()`` runs in a ``finally`` clause so the
        cleanup still happens when the update or an assertion fails.
        """
        self.assertEqual(DockerUtil().settings["version"], "auto")

        # Any existing file path works for the TLS options; reuse this module.
        cur_loc = __file__
        init_config = {
            "api_version": "foobar",
            "timeout": "42",
            "tls_client_cert": cur_loc,
            "tls_client_key": cur_loc,
            "tls_cacert": cur_loc,
            "tls": True
        }

        instance = {
            "url": "https://foo.bar:42",
        }

        try:
            DockerUtil().set_docker_settings(init_config, instance)
            client = DockerUtil().client
            self.assertEqual(client.verify, cur_loc)
            self.assertEqual(client.cert, (cur_loc, cur_loc))
        finally:
            # Without this guard a failure above would skip the reset and
            # leave the modified settings in place for the following tests.
            reset_docker_settings()

    def test_labels_collection(self):
        """Check that the ``label1`` container label is reported as a tag.

        With ``collect_labels_as_tags`` set to ``["label1"]``, the metrics of
        the nginx container are expected to carry ``label1:nginx`` while the
        redis metrics are expected without it.
        """
        label = 'label1:nginx'
        nginx_img = [
            'docker_image:nginx:latest', 'image_name:nginx', 'image_tag:latest'
        ]
        redis_img = [
            'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest'
        ]
        nginx_labeled = nginx_img + [label]
        nginx_ctr = ['container_name:test-new-nginx-latest'] + nginx_labeled
        redis_ctr = ['container_name:test-new-redis-latest'] + redis_img
        nginx_size = ['image_name:nginx', 'image_tag:latest']
        redis_size = ['image_name:redis', 'image_tag:latest']

        # (metric name, expected tags); None is handed to assertMetric as-is.
        spec = [
            ('docker.containers.running', nginx_labeled),
            ('docker.containers.running', redis_img),
            ('docker.containers.stopped', redis_img),
            ('docker.containers.stopped', nginx_labeled),
            ('docker.image.size', redis_size),
            ('docker.image.size', nginx_size),
            ('docker.image.virtual_size', redis_size),
            ('docker.image.virtual_size', nginx_size),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.mem.cache', nginx_ctr),
            ('docker.mem.cache', redis_ctr),
            ('docker.mem.rss', nginx_ctr),
            ('docker.mem.rss', redis_ctr),
            ('docker.mem.limit', nginx_ctr),
            ('docker.mem.in_use', nginx_ctr),
        ]

        init_config = {}
        instance = {
            "url": "unix://var/run/docker.sock",
            "collect_labels_as_tags": ["label1"],
            "collect_image_size": True,
            "collect_images_stats": True,
            "collect_container_count": True,
            "collect_dead_container_count": True,
            "collect_exited_container_count": True,
            "collect_volume_count": True,
            "collect_dangling_volume_count": True,
        }
        config = {"init_config": init_config, "instances": [instance]}

        # Rebuild DockerUtil from this test's configuration before running.
        DockerUtil._drop()
        DockerUtil(init_config=init_config, instance=instance)

        self.run_check(config, force_reload=True)
        for metric_name, metric_tags in spec:
            if metric_tags is not None:
                # Give every assertion its own list, as the literals did.
                metric_tags = list(metric_tags)
            self.assertMetric(
                metric_name, tags=metric_tags, count=1, at_least=1)

    def test_histogram(self):
        """Check the metrics emitted when ``use_histogram`` is enabled.

        Container counts and image metrics are asserted under their plain
        names. Memory metrics are asserted once per histogram suffix
        (``count``, ``avg``, ``median``, ``max``, ``95percentile``) with
        image-level tags.
        """
        suffixes = ["count", "avg", "median", "max", "95percentile"]

        nginx_img = [
            'docker_image:nginx:latest', 'image_name:nginx', 'image_tag:latest'
        ]
        redis_img = [
            'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest'
        ]
        nginx_size = ['image_name:nginx', 'image_tag:latest']
        redis_size = ['image_name:redis', 'image_tag:latest']

        # Metrics asserted under their plain name, exactly once each.
        plain_spec = [
            ('docker.containers.running', nginx_img),
            ('docker.containers.running', redis_img),
            ('docker.containers.stopped', redis_img),
            ('docker.containers.stopped', nginx_img),
            ('docker.image.size', redis_size),
            ('docker.image.size', nginx_size),
            ('docker.image.virtual_size', nginx_size),
            ('docker.image.virtual_size', redis_size),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
        ]

        # Metrics asserted once per histogram suffix.
        histo_spec = [
            ('docker.mem.cache', nginx_img),
            ('docker.mem.cache', redis_img),
            ('docker.mem.rss', nginx_img),
            ('docker.mem.rss', redis_img),
            ('docker.mem.limit', nginx_img),
            ('docker.mem.in_use', nginx_img),
        ]

        init_config = {}
        instance = {
            "url": "unix://var/run/docker.sock",
            "collect_image_size": True,
            "collect_images_stats": True,
            "use_histogram": True,
        }
        config = {"init_config": init_config, "instances": [instance]}

        # Rebuild DockerUtil from this test's configuration before running.
        DockerUtil._drop()
        DockerUtil(init_config=init_config, instance=instance)

        self.run_check(config, force_reload=True)
        for metric_name, metric_tags in plain_spec:
            if metric_tags is not None:
                metric_tags = list(metric_tags)
            self.assertMetric(
                metric_name, tags=metric_tags, count=1, at_least=1)

        for base_name, base_tags in histo_spec:
            histo_tags = list(base_tags)
            for suffix in suffixes:
                full_name = ".".join((base_name, suffix))
                self.assertMetric(full_name, tags=histo_tags, at_least=1)

    def test_events(self):
        """Run the check once and expect exactly two collected events."""
        init_config = {}
        instance = {
            "url": "unix://var/run/docker.sock",
            "collect_images_stats": True,
        }
        config = {"init_config": init_config, "instances": [instance]}

        DockerUtil().set_docker_settings(init_config, instance)

        self.run_check(config, force_reload=True)

        event_count = len(self.events)
        self.assertEqual(event_count, 2)

    def test_healthcheck(self):
        """Exercise the ``docker.container_health`` service check whitelist.

        First run: the nginx and redis images are whitelisted and two service
        checks are expected. Second run: the whitelist is empty and no
        service check is expected.
        """
        # --- Run 1: both images whitelisted --------------------------------
        whitelisted_instance = {
            "url":
            "unix://var/run/docker.sock",
            "health_service_check_whitelist":
            ["docker_image:nginx", "docker_image:redis"],
        }
        whitelisted_config = {
            "init_config": {},
            "instances": [whitelisted_instance],
        }

        DockerUtil().set_docker_settings(whitelisted_config['init_config'],
                                         whitelisted_instance)
        DockerUtil().filtering_enabled = False

        self.run_check(whitelisted_config, force_reload=True)
        self.assertServiceCheck('docker.container_health', count=2)

        # --- Run 2: empty whitelist ----------------------------------------
        empty_instance = {
            "url": "unix://var/run/docker.sock",
            "health_service_check_whitelist": [],
        }
        empty_config = {
            "init_config": {},
            "instances": [empty_instance],
        }

        # Rebuild DockerUtil from the second configuration.
        DockerUtil._drop()
        DockerUtil(init_config=empty_config['init_config'],
                   instance=empty_instance)

        self.run_check(empty_config, force_reload=True)
        self.assertServiceCheck('docker.container_health', count=0)

    def test_container_size(self):
        """Check that container size metrics are reported when enabled.

        With ``collect_container_size`` on, ``docker.container.size_rootfs``
        and ``docker.container.size_rw`` are expected in addition to the
        container count, image and memory metrics.
        """
        nginx_img = [
            'docker_image:nginx:latest', 'image_name:nginx', 'image_tag:latest'
        ]
        redis_img = [
            'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest'
        ]
        nginx_ctr = ['container_name:test-new-nginx-latest'] + nginx_img
        redis_ctr = ['container_name:test-new-redis-latest'] + redis_img
        nginx_size = ['image_name:nginx', 'image_tag:latest']
        redis_size = ['image_name:redis', 'image_tag:latest']

        # (metric name, expected tags); None is handed to assertMetric as-is.
        spec = [
            ('docker.containers.running', nginx_img),
            ('docker.containers.running', redis_img),
            ('docker.containers.stopped', redis_img),
            ('docker.containers.stopped', nginx_img),
            ('docker.image.size', redis_size),
            ('docker.image.size', nginx_size),
            ('docker.image.virtual_size', redis_size),
            ('docker.image.virtual_size', nginx_size),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.mem.cache', nginx_ctr),
            ('docker.mem.cache', redis_ctr),
            ('docker.mem.rss', nginx_ctr),
            ('docker.mem.rss', redis_ctr),
            ('docker.mem.limit', nginx_ctr),
            ('docker.mem.in_use', nginx_ctr),
            # Container size metrics
            ("docker.container.size_rootfs", nginx_ctr),
            ("docker.container.size_rootfs", redis_ctr),
            ("docker.container.size_rw", nginx_ctr),
        ]

        init_config = {}
        instance = {
            "url": "unix://var/run/docker.sock",
            "collect_container_size": True,
            "collect_image_size": True,
            "collect_images_stats": True,
        }
        config = {"init_config": init_config, "instances": [instance]}

        DockerUtil().set_docker_settings(init_config, instance)

        self.run_check(config, force_reload=True)
        for metric_name, metric_tags in spec:
            if metric_tags is not None:
                # Give every assertion its own list, as the literals did.
                metric_tags = list(metric_tags)
            self.assertMetric(
                metric_name, tags=metric_tags, count=1, at_least=1)

    def test_image_tags_extraction(self):
        """Verify ``DockerUtil.image_tag_extractor`` on sample entities.

        Each case pairs an entity dict (using ``Image``, ``RepoTags`` or
        ``RepoDigests``) with the expected image names (requested with key 0)
        and image tags (requested with key 1). List results are compared
        after sorting; a non-list expectation is compared as-is.
        """
        cases = [
            # (entity, [expected_image_names, expected_image_tags])
            ({'Image': 'nginx:latest'}, [['nginx'], ['latest']]),
            ({'Image': 'localhost/nginx:latest'},
             [['localhost/nginx'], ['latest']]),
            ({'Image': 'localhost:5000/nginx:latest'},
             [['localhost:5000/nginx'], ['latest']]),
            ({'RepoTags': ['redis:latest']}, [['redis'], ['latest']]),
            ({'RepoTags': ['localhost/redis:latest']},
             [['localhost/redis'], ['latest']]),
            ({'RepoTags': ['localhost:5000/redis:latest']},
             [['localhost:5000/redis'], ['latest']]),
            ({'RepoTags': ['localhost:5000/redis:latest',
                           'localhost:5000/redis:v1.1']},
             [['localhost:5000/redis'], ['latest', 'v1.1']]),
            ({'RepoTags': [],
              'RepoDigests': [
                  u'datadog/docker-dd-agent@sha256:47a59c2ea4f6d9555884aacc608b303f18bde113b1a3a6743844bfc364d73b44'
              ]},
             [['datadog/docker-dd-agent'], None]),
        ]
        for entity, (want_names, want_tags) in cases:
            got_names = DockerUtil.image_tag_extractor(entity, 0)
            self.assertEqual(sorted(got_names), sorted(want_names))

            got_tags = DockerUtil.image_tag_extractor(entity, 1)
            if not isinstance(want_tags, list):
                # e.g. None when only a digest is available: compare directly.
                self.assertEqual(got_tags, want_tags)
            else:
                self.assertEqual(sorted(got_tags), sorted(want_tags))

    def test_container_name_extraction(self):
        """Verify ``DockerUtil.container_name_extractor`` on sample containers.

        The cases expect the ``Id`` when no ``Names`` are present, and the
        name without a nested path (``mongo`` rather than ``redis/mongo``)
        regardless of the order of ``Names``.
        """
        cases = [
            # (container, expected names)
            ({'Id': 'deadbeef'}, ['deadbeef']),
            ({'Names': ['/redis'], 'Id': 'deadbeef'}, ['redis']),
            ({'Names': ['/mongo', '/redis/mongo'], 'Id': 'deadbeef'},
             ['mongo']),
            ({'Names': ['/redis/mongo', '/mongo'], 'Id': 'deadbeef'},
             ['mongo']),
        ]
        for container, expected_names in cases:
            extracted = DockerUtil.container_name_extractor(container)
            self.assertEqual(extracted, expected_names)

    def test_network_tagging(self):
        """Check that network metrics are tagged with each docker network.

        The nginx container is expected to report bytes sent and received
        once for the ``bridge`` network and once for the ``second`` network,
        each prefixed with the instance's custom tags.
        """
        nginx_ctr = [
            'container_name:test-new-nginx-latest',
            'docker_image:nginx:latest', 'image_name:nginx',
            'image_tag:latest'
        ]
        networks = ('docker_network:bridge', 'docker_network:second')

        # Same order as a hand-written table: rcvd/bridge, rcvd/second,
        # sent/bridge, sent/second.
        spec = [
            (metric_name, nginx_ctr + [network])
            for metric_name in ('docker.net.bytes_rcvd',
                                'docker.net.bytes_sent')
            for network in networks
        ]

        custom_tags = ["extra_tag", "env:testing"]
        init_config = {}
        instance = {
            "url": "unix://var/run/docker.sock",
            "tags": custom_tags,
            "collect_image_size": True,
            "collect_images_stats": True,
        }
        config = {"init_config": init_config, "instances": [instance]}

        DockerUtil().set_docker_settings(init_config, instance)

        self.run_check_twice(config, force_reload=True)
        for metric_name, metric_tags in spec:
            # Custom tags come first, followed by the metric's own tags.
            full_tags = list(custom_tags)
            if metric_tags is not None:
                full_tags.extend(metric_tags)
            self.assertMetric(
                metric_name, tags=full_tags, count=1, at_least=1)
    def test_basic_config_twice(self):
        """Run the check twice with custom tags and verify the basic metrics.

        Container counts, image counts, and the cpu/io/mem/net metrics of
        the nginx and redis containers are each expected once, with the
        instance's custom tags prepended to the metric's own tags.
        """
        nginx_img = [
            'docker_image:nginx:latest', 'image_name:nginx', 'image_tag:latest'
        ]
        redis_img = [
            'docker_image:redis:latest', 'image_name:redis', 'image_tag:latest'
        ]
        nginx_ctr = ['container_name:test-new-nginx-latest'] + nginx_img
        redis_ctr = ['container_name:test-new-redis-latest'] + redis_img
        bridge = 'docker_network:bridge'

        # (metric name, expected tags); None means no metric-specific tags
        # are appended to the custom tags.
        spec = [
            ('docker.containers.running', nginx_img),
            ('docker.containers.running', redis_img),
            ('docker.containers.stopped', redis_img),
            ('docker.containers.stopped', nginx_img),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
        ]
        # Per-container metrics: nginx first, then redis.
        for metric_name in ('docker.cpu.system', 'docker.cpu.user',
                            'docker.io.read_bytes', 'docker.io.write_bytes',
                            'docker.mem.cache', 'docker.mem.rss'):
            spec.append((metric_name, nginx_ctr))
            spec.append((metric_name, redis_ctr))
        # Network metrics: redis first, then nginx, both on the bridge.
        for metric_name in ('docker.net.bytes_rcvd', 'docker.net.bytes_sent'):
            spec.append((metric_name, redis_ctr + [bridge]))
            spec.append((metric_name, nginx_ctr + [bridge]))

        custom_tags = ["extra_tag", "env:testing"]
        init_config = {}
        instance = {
            "url": "unix://var/run/docker.sock",
            "tags": custom_tags,
            "collect_image_size": True,
            "collect_images_stats": True,
        }
        config = {"init_config": init_config, "instances": [instance]}

        DockerUtil().set_docker_settings(init_config, instance)

        self.run_check_twice(config, force_reload=True)
        for metric_name, metric_tags in spec:
            # Custom tags come first, followed by the metric's own tags.
            full_tags = list(custom_tags)
            if metric_tags is not None:
                full_tags.extend(metric_tags)
            self.assertMetric(
                metric_name, tags=full_tags, count=1, at_least=1)
Example #38
0
class KubeUtil:
    """
    Helper used to query the kubelet, cAdvisor and the Kubernetes API server.

    The class declares the ``Singleton`` metaclass and exposes the default
    paths, ports and file locations used by its methods as class attributes.
    """
    __metaclass__ = Singleton

    # Scheme used to build the cAdvisor URL unless the instance overrides it.
    DEFAULT_METHOD = 'http'
    # Kubelet and cAdvisor endpoint paths.
    KUBELET_HEALTH_PATH = '/healthz'
    MACHINE_INFO_PATH = '/api/v1.3/machine/'
    METRICS_PATH = '/api/v1.3/subcontainers/'
    PODS_LIST_PATH = '/pods/'
    # Default ports for cAdvisor, the kubelet (HTTP / HTTPS) and the master.
    DEFAULT_CADVISOR_PORT = 4194
    DEFAULT_HTTP_KUBELET_PORT = 10255
    DEFAULT_HTTPS_KUBELET_PORT = 10250
    DEFAULT_MASTER_PORT = 443
    DEFAULT_MASTER_NAME = 'kubernetes'  # DNS name to reach the master from a pod.
    # Default prefix prepended to pod labels when they are turned into tags.
    DEFAULT_LABEL_PREFIX = 'kube_'
    DEFAULT_COLLECT_SERVICE_TAG = True
    # Service account files used for apiserver TLS validation and auth.
    CA_CRT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
    AUTH_TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token'

    # Label keys; presumably the labels Kubernetes sets on containers
    # (not referenced by the methods of this class) -- TODO confirm.
    POD_NAME_LABEL = "io.kubernetes.pod.name"
    NAMESPACE_LABEL = "io.kubernetes.pod.namespace"
    CONTAINER_NAME_LABEL = "io.kubernetes.container.name"

    def __init__(self, **kwargs):
        """
        Build the client settings and make a first attempt at locating kubelet.

        Configuration comes from the ``init_config`` and ``instance`` keyword
        arguments when both are given. Otherwise it is read from the
        kubernetes check configuration file, falling back to empty settings
        when that file is missing or invalid.

        :param kwargs: optional ``init_config`` and ``instance`` dicts
        """
        self.docker_util = DockerUtil()
        if 'init_config' in kwargs and 'instance' in kwargs:
            # Tolerate explicit None values: both dicts are read with .get()
            # further down.
            init_config = kwargs.get('init_config') or {}
            instance = kwargs.get('instance') or {}
        else:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                init_config = check_config['init_config'] or {}
                instance = check_config['instances'][0] or {}
            # kubernetes.yaml was not found
            except IOError as ex:
                # str(ex) instead of ex.message: the latter is deprecated and
                # is empty for errno-style IOErrors, which hid the reason.
                log.error(str(ex))
                init_config, instance = {}, {}
            except Exception:
                log.error(
                    'Kubernetes configuration file is invalid. '
                    'Trying connecting to kubelet with default settings anyway...'
                )
                init_config, instance = {}, {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')
        self.pod_name = os.environ.get('KUBERNETES_POD_NAME') or self.host_name
        self.tls_settings = self._init_tls_settings(instance)

        # apiserver
        if 'api_server_url' in instance:
            self.kubernetes_api_root_url = instance.get('api_server_url')
        else:
            master_host = os.environ.get(
                'KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME
            master_port = os.environ.get(
                'KUBERNETES_SERVICE_PORT') or self.DEFAULT_MASTER_PORT
            self.kubernetes_api_root_url = 'https://%s:%s' % (master_host,
                                                              master_port)

        self.kubernetes_api_url = '%s/api/v1' % self.kubernetes_api_root_url

        # Service mapping helper class
        self._service_mapper = PodServiceMapper(self)
        from config import _is_affirmative
        self.collect_service_tag = _is_affirmative(
            instance.get('collect_service_tags',
                         KubeUtil.DEFAULT_COLLECT_SERVICE_TAG))

        # leader status triggers event collection
        self.is_leader = False
        self.leader_elector = None
        self.leader_lease_duration = instance.get('leader_lease_duration')

        # kubelet
        # If kubelet_api_url is None, init_kubelet didn't succeed yet.
        self.init_success = False
        self.kubelet_api_url = None
        self.init_retry_interval = init_config.get('init_retry_interval',
                                                   DEFAULT_RETRY_INTERVAL)
        self.last_init_retry = None
        # +1 because the first attempt below also consumes one "retry".
        self.left_init_retries = init_config.get('init_retries',
                                                 DEFAULT_INIT_RETRIES) + 1
        self.init_kubelet(instance)

        self.kube_label_prefix = instance.get('label_to_tag_prefix',
                                              KubeUtil.DEFAULT_LABEL_PREFIX)
        self.kube_node_labels = instance.get('node_labels_to_host_tags', {})

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0

    def _init_tls_settings(self, instance):
        """
        Initialize TLS settings for connection to apiserver and kubelet.
        """
        tls_settings = {}

        # apiserver
        client_crt = instance.get('apiserver_client_crt')
        client_key = instance.get('apiserver_client_key')
        apiserver_cacert = instance.get('apiserver_ca_cert')

        if client_crt and client_key and os.path.exists(
                client_crt) and os.path.exists(client_key):
            tls_settings['apiserver_client_cert'] = (client_crt, client_key)

        if apiserver_cacert and os.path.exists(apiserver_cacert):
            tls_settings['apiserver_cacert'] = apiserver_cacert

        # kubelet
        kubelet_client_crt = instance.get('kubelet_client_crt')
        kubelet_client_key = instance.get('kubelet_client_key')
        if kubelet_client_crt and kubelet_client_key and os.path.exists(
                kubelet_client_crt) and os.path.exists(kubelet_client_key):
            tls_settings['kubelet_client_cert'] = (kubelet_client_crt,
                                                   kubelet_client_key)

        cert = instance.get('kubelet_cert')
        if cert:
            tls_settings['kubelet_verify'] = cert
        else:
            tls_settings['kubelet_verify'] = instance.get(
                'kubelet_tls_verify', DEFAULT_TLS_VERIFY)

        if ('apiserver_client_cert'
                not in tls_settings) or ('kubelet_client_cert'
                                         not in tls_settings):
            # Only lookup token if we don't have client certs for both
            token = self.get_auth_token(instance)
            if token:
                tls_settings['bearer_token'] = token

        return tls_settings

    def init_kubelet(self, instance):
        """
        Handles the retry logic around _locate_kubelet.
        Once _locate_kubelet succeeds, initialize all kubelet-related
        URLs and settings.
        """
        if self.left_init_retries == 0:
            raise Exception(
                "Kubernetes client initialization failed permanently. "
                "Kubernetes-related features will fail.")

        now = time.time()

        # last retry was less than retry_interval ago
        if self.last_init_retry and now <= self.last_init_retry + self.init_retry_interval:
            return
        # else it's the first try, or last retry was long enough ago
        self.last_init_retry = now
        self.left_init_retries -= 1

        try:
            self.kubelet_api_url = self._locate_kubelet(instance)
        except Exception as ex:
            log.error(
                "Failed to initialize kubelet connection. Will retry %s time(s). Error: %s"
                % (self.left_init_retries, str(ex)))
            return
        if not self.kubelet_api_url:
            log.error(
                "Failed to initialize kubelet connection. Will retry %s time(s)."
                % self.left_init_retries)
            return

        self.init_success = True

        self.kubelet_host = self.kubelet_api_url.split(':')[1].lstrip('/')
        self.pods_list_url = urljoin(self.kubelet_api_url,
                                     KubeUtil.PODS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url,
                                       KubeUtil.KUBELET_HEALTH_PATH)

        # namespace of the agent pod
        try:
            self.self_namespace = self.get_self_namespace()
        except Exception:
            log.warning(
                "Failed to get the agent pod namespace, defaulting to default."
            )
            self.self_namespace = DEFAULT_NAMESPACE

        # cadvisor
        self.cadvisor_port = instance.get('port',
                                          KubeUtil.DEFAULT_CADVISOR_PORT)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host,
                                            self.cadvisor_port)
        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.machine_info_url = urljoin(self.cadvisor_url,
                                        KubeUtil.MACHINE_INFO_PATH)

    def _locate_kubelet(self, instance):
        """
        Kubelet may or may not accept un-authenticated http requests.
        If it doesn't we need to use its HTTPS API that may or may not
        require auth.

        The host comes from the KUBERNETES_KUBELET_HOST environment variable,
        then the ``host`` instance option, then the docker hostname (or the
        kubernetes node hostname when kubelet TLS verification is enabled).

        Returns the kubelet URL, or re-raises the error of the HTTPS probe
        when neither endpoint answers.
        """
        host = os.environ.get('KUBERNETES_KUBELET_HOST') or instance.get(
            "host")
        if not host:
            # if no hostname was provided, use the docker hostname if cert
            # validation is not required, the kubernetes hostname otherwise.
            docker_hostname = self.docker_util.get_hostname(
                should_resolve=True)
            if self.tls_settings.get('kubelet_verify'):
                try:
                    k8s_hostname = self.get_node_hostname(docker_hostname)
                    host = k8s_hostname or docker_hostname
                except Exception as ex:
                    log.error(str(ex))
                    host = docker_hostname
            else:
                host = docker_hostname

        # check if the no-auth endpoint is enabled
        port = instance.get('kubelet_port', KubeUtil.DEFAULT_HTTP_KUBELET_PORT)
        no_auth_url = 'http://%s:%s' % (host, port)
        test_url = urljoin(no_auth_url, KubeUtil.KUBELET_HEALTH_PATH)
        try:
            self.perform_kubelet_query(test_url)
            return no_auth_url
        except Exception:
            log.debug(
                "Couldn't query kubelet over HTTP, assuming it's not in no_auth mode."
            )

        # fall back to the HTTPS endpoint
        port = instance.get('kubelet_port',
                            KubeUtil.DEFAULT_HTTPS_KUBELET_PORT)
        https_url = 'https://%s:%s' % (host, port)
        test_url = urljoin(https_url, KubeUtil.KUBELET_HEALTH_PATH)
        try:
            self.perform_kubelet_query(test_url)
            return https_url
        except Exception as ex:
            # This is the HTTPS attempt: say so and include the actual error.
            log.warning("Couldn't query kubelet over HTTPS either: %s",
                        str(ex))
            # Bare raise keeps the original traceback.
            raise

    def get_self_namespace(self):
        pods = self.retrieve_pods_list()
        for pod in pods.get('items', []):
            if pod.get('metadata', {}).get('name') == self.pod_name:
                return pod['metadata']['namespace']
        log.warning(
            "Couldn't find the agent pod and namespace, using the default.")
        return DEFAULT_NAMESPACE

    def get_node_hostname(self, host):
        """
        Ask the API server for the kubernetes hostname of the node whose
        ``kubernetes.io/hostname`` label equals ``host``.

        Returns the node's ``Hostname`` address, or None when the query does
        not match exactly one node or the node has no such address.
        """
        query = urlencode({'labelSelector': 'kubernetes.io/hostname=%s' % host})
        nodes = self.retrieve_json_auth(
            self.kubernetes_api_url + '/nodes?%s' % query).json()

        matches = nodes['items']
        if len(matches) != 1:
            log.error(
                'Error while getting node hostname: expected 1 node, got %s.' %
                len(matches))
            return None

        for address in matches[0].get('status', {}).get('addresses', []):
            if address.get('type') == 'Hostname':
                return address['address']
        return None

    def get_kube_pod_tags(self, excluded_keys=None):
        """
        Gets pods' labels as tags + creator and service tags.
        Returns a dict{namespace/podname: [tags]}
        """
        if not self.init_success:
            log.warning(
                "Kubernetes client is not initialized, can't get pod tags.")
            return {}
        pods = self.retrieve_pods_list()
        return self.extract_kube_pod_tags(pods, excluded_keys=excluded_keys)

    def extract_kube_pod_tags(self,
                              pods_list,
                              excluded_keys=None,
                              label_prefix=None):
        """
        Extract labels + creator and service tags from a list of
        pods coming from the kubelet API.

        :param excluded_keys: labels to skip
        :param label_prefix: prefix for label->tag conversion, None defaults
        to the configuration option label_to_tag_prefix
        Returns a dict{namespace/podname: [tags]}
        """
        excluded_keys = excluded_keys or []
        kube_labels = defaultdict(list)
        pod_items = pods_list.get("items") or []
        label_prefix = label_prefix or self.kube_label_prefix
        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            namespace = metadata.get("namespace")
            labels = metadata.get("labels", {})
            if name and namespace:
                key = "%s/%s" % (namespace, name)

                # Extract creator tags
                podtags = self.get_pod_creator_tags(metadata)

                # Extract services tags
                if self.collect_service_tag:
                    for service in self.match_services_for_pod(metadata):
                        if service is not None:
                            podtags.append(u'kube_service:%s' % service)

                # Extract labels
                for k, v in labels.iteritems():
                    if k in excluded_keys:
                        continue
                    podtags.append(u"%s%s:%s" % (label_prefix, k, v))

                kube_labels[key] = podtags

        return kube_labels

    def retrieve_pods_list(self):
        """
        Retrieve the list of pods for this cluster querying the kubelet API.

        TODO: the list of pods could be cached with some policy to be decided.
        """
        return self.perform_kubelet_query(self.pods_list_url).json()

    def retrieve_machine_info(self):
        """
        Fetch the machine info payload from the cAdvisor machine endpoint.
        """
        machine_info = retrieve_json(self.machine_info_url)
        return machine_info

    def retrieve_metrics(self):
        """
        Fetch the container metrics payload from the cAdvisor metrics endpoint.
        """
        metrics = retrieve_json(self.metrics_url)
        return metrics

    def get_deployment_for_replicaset(self, rs_name):
        """
        Guess the deployment name from a replicaset name, or return None.

        The replicaset name is expected to be "<deployment>-<hash>", see
        https://github.com/kubernetes/kubernetes/blob/release-1.6/pkg/controller/deployment/sync.go#L299
        Two suffix shapes are accepted:
          - all digits (k8s before 1.8)
          - 10 characters taken from ALLOWED_ENCODESTRING_ALPHANUMS (1.8+,
            see https://github.com/kubernetes/kubernetes/pull/51538/files)

        This naming scheme might change in a future k8s version; the other
        way to match RS and deployments is to parse and cache
        /apis/extensions/v1beta1/replicasets, mirroring PodServiceMapper.

        Both patterns are matched without checking the apiserver version, so
        false positives are possible. For agent6, the plan is to do the
        pod->replicaset->deployment matching in the cluster agent, with
        replicaset data from the apiserver, which addresses that risk.
        """
        dash = rs_name.rfind("-")
        if dash <= 0:
            # no separator, or nothing before it
            return None

        deployment, suffix = rs_name[0:dash], rs_name[dash + 1:]
        if suffix.isdigit():
            # k8s before 1.8
            return deployment
        if len(suffix) != 10:
            return None
        # k8s 1.8+ maybe? Check contents
        if all(char in ALLOWED_ENCODESTRING_ALPHANUMS for char in suffix):
            return deployment
        return None

    def perform_kubelet_query(self, url, verbose=True, timeout=10):
        """
        Send a GET request to kubelet and return the response object.

        TLS verification and client cert come from ``self.tls_settings``.
        The bearer token is only sent over https and only when no client
        cert is configured.
        """
        settings = self.tls_settings
        client_cert = settings.get('kubelet_client_cert')

        # if cert-based auth is enabled, don't use the token.
        send_token = (not client_cert
                      and url.lower().startswith('https')
                      and 'bearer_token' in settings)
        auth_headers = None
        if send_token:
            auth_headers = {
                'Authorization':
                'Bearer {}'.format(settings.get('bearer_token'))
            }

        return requests.get(url,
                            params={'verbose': verbose},
                            headers=auth_headers,
                            cert=client_cert,
                            verify=settings.get('kubelet_verify',
                                                DEFAULT_TLS_VERIFY),
                            timeout=timeout)

    def get_apiserver_auth_settings(self):
        """
        Compute the (cert, headers, verify) triple used for apiserver calls.

        Kubernetes API requires authentication using a token available in
        every pod, or with a client X509 cert/key pair. The bearer token is
        used by default; it is left out when a client cert/key pair was
        provided in the instance.

        Server TLS validation uses the configured CA cert, else the service
        account CA file when it exists, else it is disabled.
        """
        verify = self.tls_settings.get('apiserver_cacert')
        if not verify:
            if os.path.exists(self.CA_CRT_PATH):
                verify = self.CA_CRT_PATH
            else:
                verify = False
        log.debug('tls validation: {}'.format(verify))

        cert = self.tls_settings.get('apiserver_client_cert')

        headers = {}
        if not cert:
            token = self.tls_settings.get('bearer_token')
            if token:
                headers['Authorization'] = 'Bearer {}'.format(token)
        headers['content-type'] = 'application/json'
        return cert, headers, verify

    def retrieve_json_auth(self, url, params=None, timeout=3):
        """
        GET ``url`` on the apiserver with the apiserver auth settings.

        Returns the response object; raises on HTTP error statuses.
        """
        client_cert, auth_headers, tls_verify = self.get_apiserver_auth_settings()
        response = requests.get(url,
                                params=params,
                                headers=auth_headers,
                                cert=client_cert,
                                verify=tls_verify,
                                timeout=timeout)
        response.raise_for_status()
        return response

    def post_json_to_apiserver(self, url, data, timeout=3):
        """
        POST ``data`` serialized as JSON to ``url`` with the apiserver auth
        settings.

        Returns the response object; raises on HTTP error statuses.
        """
        client_cert, auth_headers, tls_verify = self.get_apiserver_auth_settings()
        payload = json.dumps(data)
        response = requests.post(url,
                                 data=payload,
                                 headers=auth_headers,
                                 cert=client_cert,
                                 verify=tls_verify,
                                 timeout=timeout)
        response.raise_for_status()
        return response

    def put_json_to_apiserver(self, url, data, timeout=3):
        """
        PUT ``data`` serialized as JSON to ``url`` with the apiserver auth
        settings.

        Returns the response object; raises on HTTP error statuses.
        """
        client_cert, auth_headers, tls_verify = self.get_apiserver_auth_settings()
        payload = json.dumps(data)
        response = requests.put(url,
                                data=payload,
                                headers=auth_headers,
                                cert=client_cert,
                                verify=tls_verify,
                                timeout=timeout)
        response.raise_for_status()
        return response

    def delete_to_apiserver(self, url, timeout=3):
        """
        Send a DELETE request to ``url`` with the apiserver auth settings.

        Returns the response object; raises on HTTP error statuses.
        """
        client_cert, auth_headers, tls_verify = self.get_apiserver_auth_settings()
        response = requests.delete(url,
                                   headers=auth_headers,
                                   cert=client_cert,
                                   verify=tls_verify,
                                   timeout=timeout)
        response.raise_for_status()
        return response

    def get_node_info(self):
        """
        Return the IP address and the hostname of the node where the pod is running.
        """
        if None in (self._node_ip, self._node_name):
            self._fetch_host_data()
        return self._node_ip, self._node_name

    def get_node_metadata(self):
        """Returns host metadata about the local k8s node"""
        meta = {}

        # API server version
        try:
            request_url = "%s/version" % self.kubernetes_api_root_url
            master_info = self.retrieve_json_auth(request_url).json()
            version = master_info.get("gitVersion")
            meta['kube_master_version'] = version[1:]
        except Exception as ex:
            # Intentional use of non-safe lookups to get the exception in the debug logs
            # if the parsing were to fail
            log.debug("Error getting Kube master version: %s" % str(ex))

        # Kubelet version & labels
        if not self.init_success:
            log.warning(
                "Kubelet client failed to initialize, kubelet host tags will be missing for now."
            )
            return meta
        try:
            _, node_name = self.get_node_info()
            if not node_name:
                raise ValueError("node name missing or empty")
            request_url = "%s/nodes/%s" % (self.kubernetes_api_url, node_name)
            node_info = self.retrieve_json_auth(request_url).json()
            version = node_info.get("status").get("nodeInfo").get(
                "kubeletVersion")
            meta['kubelet_version'] = version[1:]
        except Exception as ex:
            log.debug("Error getting Kubelet version: %s" % str(ex))

        return meta

    def get_node_hosttags(self):
        """
        Returns node labels as tags. Tag name is transformed as defined
        in node_labels_to_host_tags in the kubernetes check configuration.
        Note: queries the API server for node info. Configure RBAC accordingly.
        """
        tags = []

        try:
            _, node_name = self.get_node_info()
            if not node_name:
                raise ValueError("node name missing or empty")

            request_url = "%s/nodes/%s" % (self.kubernetes_api_url, node_name)
            node_info = self.retrieve_json_auth(request_url).json()
            node_labels = node_info.get('metadata', {}).get('labels', {})

            for l_name, t_name in self.kube_node_labels.iteritems():
                if l_name in node_labels:
                    tags.append('%s:%s' % (t_name, node_labels[l_name]))

        except Exception as ex:
            log.debug("Error getting node labels: %s" % str(ex))

        return tags

    def _fetch_host_data(self):
        """
        Retrieve host name and IP address from the payload returned by the listing
        pods endpoints from kubelet.

        The host IP address is different from the default router for the pod.
        """
        try:
            pod_items = self.retrieve_pods_list().get("items") or []
        except Exception as e:
            log.warning(
                "Unable to retrieve pod list %s. Not fetching host data",
                str(e))
            return

        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            if name == self.pod_name:
                status = pod.get('status', {})
                spec = pod.get('spec', {})
                # if not found, use an empty string - we use None as "not initialized"
                self._node_ip = status.get('hostIP', '')
                self._node_name = spec.get('nodeName', '')
                break

    def extract_event_tags(self, event):
        """
        Build the list of tags describing a kube event object.

        Only fields present in the event produce a tag. The reason, object
        kind, object name and source component are lower-cased.
        """
        tags = []

        if 'reason' in event:
            tags.append('reason:%s' % event.get('reason', '').lower())

        # (event section, key, tag name, lower-case the value?)
        nested_fields = (
            ('metadata', 'namespace', 'namespace', False),
            ('source', 'host', 'node_name', False),
            ('involvedObject', 'kind', 'object_type', True),
            ('involvedObject', 'name', 'object_name', True),
            ('source', 'component', 'source_component', True),
        )
        for section, field, tag_name, lowered in nested_fields:
            container = event.get(section, {})
            if field not in container:
                continue
            value = container[field]
            if lowered:
                value = value.lower()
            tags.append('%s:%s' % (tag_name, value))

        return tags

    def are_tags_filtered(self, tags):
        """
        Because it is a pain to call it from the kubernetes check otherwise.
        """
        return self.docker_util.are_tags_filtered(tags)

    @classmethod
    def get_auth_token(cls, instance):
        """
        Return a string containing the authorization token for the pod.
        """

        token_path = instance.get('bearer_token_path', cls.AUTH_TOKEN_PATH)
        try:
            with open(token_path) as f:
                return f.read().strip()
        except IOError as e:
            log.error('Unable to read token from {}: {}'.format(token_path, e))

        return None

    def match_services_for_pod(self, pod_metadata, refresh=False):
        """
        Match the pods labels with services' label selectors to determine the list
        of services that point to that pod. Returns an array of service names.

        Pass refresh=True if you want to bypass the cached cid->services mapping (after a service change)
        """
        s = self._service_mapper.match_services_for_pod(pod_metadata,
                                                        refresh,
                                                        names=True)
        #log.warning("Matches for %s: %s" % (pod_metadata.get('name'), str(s)))
        return s

    def get_event_retriever(self, namespaces=None, kinds=None, delay=None):
        """
        Build and return a KubeEventRetriever bound to this KubeUtil,
        forwarding the namespaces, kinds and delay arguments.
        """
        retriever = KubeEventRetriever(self, namespaces, kinds, delay)
        return retriever

    def match_containers_for_pods(self, pod_uids, podlist=None):
        """
        Reads a set of pod uids and returns the set of docker
        container ids they manage
        podlist should be a recent self.retrieve_pods_list return value,
        if not given that method will be called
        """
        cids = set()

        if not isinstance(pod_uids, set) or len(pod_uids) < 1:
            return cids

        if podlist is None:
            podlist = self.retrieve_pods_list()

        for pod in podlist.get('items', {}):
            uid = pod.get('metadata', {}).get('uid', None)
            if uid in pod_uids:
                for container in pod.get('status',
                                         {}).get('containerStatuses', None):
                    id = container.get('containerID', "")
                    if id.startswith("docker://"):
                        cids.add(id[9:])

        return cids

    def get_pod_creator(self, pod_metadata):
        """
        Read the pod's creator from its ``kubernetes.io/created-by``
        annotation and return a tuple (creator_kind, creator_name).

        Returns (None, None) when the annotation is missing or can't be
        parsed. This allows for consistency across code paths.
        """
        try:
            annotation = pod_metadata['annotations']['kubernetes.io/created-by']
            reference = json.loads(annotation).get('reference', {})
            return (reference.get('kind'), reference.get('name'))
        except Exception:
            log.debug('Could not parse creator for pod ' +
                      pod_metadata.get('name', ''))
            return (None, None)

    def get_pod_creator_tags(self,
                             pod_metadata,
                             legacy_rep_controller_tag=False):
        """
        Get the pod's creator from its metadata and returns a list of tags
        in the form kube_$kind:$name, ready to add to the metrics.

        For a ReplicaSet creator, the deployment tag is added as well when a
        deployment name can be derived from the replicaset name.

        :param pod_metadata: the pod's ``metadata`` dict
        :param legacy_rep_controller_tag: also emit a
        kube_replication_controller tag for creators that are not a
        ReplicationController
        Returns a list of tags, empty if anything goes wrong.
        """
        try:
            tags = []
            creator_kind, creator_name = self.get_pod_creator(pod_metadata)
            if creator_kind in CREATOR_KIND_TO_TAG and creator_name:
                tags.append("%s:%s" %
                            (CREATOR_KIND_TO_TAG[creator_kind], creator_name))
                if creator_kind == 'ReplicaSet':
                    deployment = self.get_deployment_for_replicaset(
                        creator_name)
                    if deployment:
                        tags.append(
                            "%s:%s" %
                            (CREATOR_KIND_TO_TAG['Deployment'], deployment))
            if legacy_rep_controller_tag and creator_kind != 'ReplicationController' and creator_name:
                tags.append(
                    'kube_replication_controller:{0}'.format(creator_name))

            return tags
        except Exception:
            # Don't concatenate the name: it may be missing (None), which
            # used to raise from inside this handler.
            pod_name = ''
            if isinstance(pod_metadata, dict):
                pod_name = pod_metadata.get('name', '')
            log.warning('Could not parse creator tags for pod %s', pod_name)
            return []

    def process_events(self, event_array, podlist=None):
        """
        Reads a list of kube events, invalidates caches and and computes a set
        of containers impacted by the changes, to refresh service discovery
        Pod creation/deletion events are ignored for now, as docker_daemon already
        sends container creation/deletion events to SD

        Pod->containers matching is done using match_containers_for_pods
        """
        try:
            pods = set()
            if self._service_mapper:
                pods.update(self._service_mapper.process_events(event_array))
            return self.match_containers_for_pods(pods, podlist)
        except Exception as e:
            log.warning("Error processing events %s: %s" %
                        (str(event_array), e))
            return set()

    def refresh_leader(self):
        if not self.init_success:
            log.warning(
                "Kubelet client is not initialized, leader election is disabled."
            )
            return
        if not self.leader_elector:
            self.leader_elector = LeaderElector(self)
        self.leader_elector.try_acquire_or_refresh()

    def image_name_resolver(self, image):
        """
        Wraps around the sibling dockerutil method and catches exceptions
        """
        if image is None:
            return None
        try:
            return self.docker_util.image_name_resolver(image)
        except Exception as e:
            log.warning("Error resolving image name: %s", str(e))
            return image
Example #39
0
    def init(self):
        """
        Set up the check from its first configured instance: docker client,
        cgroup mountpoints, tagging, filtering and collection options.

        Any failure is logged as critical and reported as a warning, so the
        initialization can be retried later.
        """
        try:
            instance = self.instances[0]

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            if self.is_k8s():
                self.kubeutil = KubeUtil()
            self._mountpoints = self.docker_util.get_mountpoints(
                CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get(
                "collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(
                instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags",
                                            DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags",
                                        DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            # Filtering is only enabled when an exclude section is present.
            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning(
                        "You must specify an exclude section to enable filtering"
                    )
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(
                    include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(
                instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(
                instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(
                instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(
                instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(
                instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(
                instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        # "as" form: valid on Python 2.6+ and 3, unlike "except Exception, e".
        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
Example #40
0
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""
    def __init__(self, name, init_config, agentConfig, instances=None):
        """Build the check, run init() once and record the service discovery mode.

        Raises an Exception when more than one instance is configured.
        """
        has_several_instances = instances is not None and len(instances) > 1
        if has_several_instances:
            raise Exception(
                "Docker check only supports one configured instance.")

        AgentCheck.__init__(
            self, name, init_config, agentConfig, instances=instances)

        self.init_success = False
        self.init()

        # Truthy only when service discovery is on AND backed by docker.
        sd_enabled = agentConfig.get('service_discovery')
        self._service_discovery = sd_enabled and \
            agentConfig.get('service_discovery_backend') == 'docker'

    def is_k8s(self):
        """Tell whether the KUBERNETES_PORT environment variable is set."""
        return os.environ.get('KUBERNETES_PORT') is not None

    def init(self):
        try:
            instance = self.instances[0]

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            if self.is_k8s():
                self.kubeutil = KubeUtil()
            self._mountpoints = self.docker_util.get_mountpoints(
                CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get(
                "collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(
                instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags",
                                            DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags",
                                        DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning(
                        "You must specify an exclude section to enable filtering"
                    )
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(
                    include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(
                instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(
                instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(
                instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(
                instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(
                instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(
                instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception, e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
Example #41
0
    "image_name",
    "image_tag",
]

# Tag names used when the instance does not set the "performance_tags" option.
DEFAULT_PERFORMANCE_TAGS = [
    "container_name",
    "docker_image",
    "image_name",
    "image_tag",
]

# Tag names used when the instance does not set the "image_tags" option.
DEFAULT_IMAGE_TAGS = ['image_name', 'image_tag']

# Maps a tag name to a callable taking a container description `c`.
# The inline lambdas return a one-element list; the DockerUtil helpers are
# presumably list-returning as well -- TODO confirm against DockerUtil.
TAG_EXTRACTORS = {
    "docker_image": lambda c: [c["Image"]],
    "image_name": lambda c: DockerUtil.image_tag_extractor(c, 0),
    "image_tag": lambda c: DockerUtil.image_tag_extractor(c, 1),
    "container_command": lambda c: [c["Command"]],
    "container_name": DockerUtil.container_name_extractor,
}

# Keys of the check's `tag_names` dict, one per family of tags.
CONTAINER = "container"
PERFORMANCE = "performance"
FILTERED = "filtered"
IMAGE = "image"


def get_filters(include, exclude):
    # The reasoning is to check exclude first, so we can skip if there is no exclude
    if not exclude:
        return
Example #42
0
class TestProxy(AsyncTestCase):
    """Integration test sending a transaction through a proxy container.

    setUp() starts the proxy container and waits for it to accept
    connections, test_proxy() flushes one transaction through it and reads
    the squid access log, tearDown() force-removes the container.
    """

    @attr(requires='core_integration')
    def test_proxy(self):
        """Flush one transaction via the proxy and check it was logged."""
        config = {
            "endpoints": {
                "https://app.datadoghq.com": ["foo"]
            },
            "proxy_settings": {
                "host": "localhost",
                "port": PROXY_PORT,
                "user": None,
                "password": None
            }
        }

        app = Application()
        app.skip_ssl_validation = True
        app._agentConfig = config

        trManager = TransactionManager(MAX_WAIT_FOR_REPLAY, MAX_QUEUE_SIZE,
                                       THROTTLING_DELAY)
        trManager._flush_without_ioloop = True  # Use blocking API to emulate tornado ioloop
        CustomAgentTransaction.set_tr_manager(trManager)
        app.use_simple_http_client = False  # We need proxy capabilities
        app.agent_dns_caching = False
        # _test is the instance of this class. It is needed to call the method stop() and deal with the asynchronous
        # calls as described here : http://www.tornadoweb.org/en/stable/testing.html
        CustomAgentTransaction._test = self
        CustomAgentTransaction.set_application(app)
        CustomAgentTransaction.set_endpoints(config['endpoints'])

        CustomAgentTransaction('body', {},
                               "")  # Create and flush the transaction
        try:
            self.wait(timeout=30)
        finally:
            # Drop the class-level back-reference even when wait() raises,
            # so a failed run does not leave this test case attached to
            # CustomAgentTransaction.
            del CustomAgentTransaction._test

        access_log = self.docker_client.exec_start(
            self.docker_client.exec_create(
                CONTAINER_NAME, 'cat /var/log/squid/access.log')['Id'])
        # There should be an entry in the proxy access log
        self.assertIn("CONNECT", access_log)
        # There should be an error since we gave a bogus api_key
        self.assertEqual(len(trManager._endpoints_errors), 1)

    def setUp(self):
        """Pull, create and start the proxy container, then wait until it is ready."""
        super(TestProxy, self).setUp()
        self.docker_client = DockerUtil().client

        self.docker_client.pull(CONTAINER_TO_RUN)

        self.container = self.docker_client.create_container(
            CONTAINER_TO_RUN,
            detach=True,
            name=CONTAINER_NAME,
            ports=[PROXY_PORT],
            host_config=self.docker_client.create_host_config(
                port_bindings={3128: PROXY_PORT}))
        log.info("Starting container: {0}".format(CONTAINER_TO_RUN))
        self.docker_client.start(CONTAINER_NAME)
        for line in self.docker_client.logs(CONTAINER_NAME,
                                            stdout=True,
                                            stream=True):
            if "Accepting HTTP Socket connections" in line:
                break  # Wait for the container to properly start, otherwise we get 'Proxy CONNECT aborted'

    def tearDown(self):
        """Force-remove the proxy container."""
        log.info("Stopping container: {0}".format(CONTAINER_TO_RUN))
        self.docker_client.remove_container(CONTAINER_NAME, force=True)
        super(TestProxy, self).tearDown()
Example #43
0
 def setUp(self):
     """Create a DockerUtil instance and keep it on self.dockerutil."""
     self.dockerutil = DockerUtil()
Example #44
0
def get_hostname(config=None):
    """
    Return the canonical host name this agent should identify as. This is
    the authoritative source of the host name for the agent.

    Sources are consulted in this order:

      * agent config (datadog.conf, "hostname:")
      * GCE instance name
      * docker hostname, when the agent is dockerized
      * 'hostname -f' (on unix)
      * EC2 instance id, on ECS or when the name found so far is an EC2 default
      * socket.gethostname()

    Raises an Exception when no valid host name could be determined.
    """
    from utils.dockerutil import DockerUtil

    # first, try the config
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)
    configured_name = config.get('hostname')
    if configured_name and is_valid_hostname(configured_name):
        return configured_name

    # Try to get GCE instance name
    gce_name = GCE.get_hostname(config)
    if gce_name is not None and is_valid_hostname(gce_name):
        return gce_name

    hostname = None

    # Try to get the docker hostname
    docker_util = DockerUtil(agentConfig=config)
    if docker_util.is_dockerized():
        docker_name = docker_util.get_hostname()
        if docker_name is not None and is_valid_hostname(docker_name):
            hostname = docker_name

    # then move on to os-specific detection
    if hostname is None and get_os() in ['mac', 'freebsd', 'linux', 'solaris']:
        try:
            # try fqdn
            out, _, rtcode = get_subprocess_output(['/bin/hostname', '-f'], log)
            unix_name = out.strip() if rtcode == 0 else None
        except Exception:
            unix_name = None
        if unix_name and is_valid_hostname(unix_name):
            hostname = unix_name

    # if we have an ec2 default hostname, see if there's an instance-id available
    on_ecs = Platform.is_ecs_instance()
    if on_ecs or (hostname is not None and EC2.is_default(hostname)):
        instance_id = EC2.get_instance_id(config)
        if instance_id:
            hostname = instance_id

    # fall back on socket.gethostname(), socket.getfqdn() is too unreliable
    if hostname is None:
        try:
            socket_name = socket.gethostname()
        except socket.error:
            socket_name = None
        if socket_name and is_valid_hostname(socket_name):
            hostname = socket_name

    if hostname is not None:
        return hostname

    failure_msg = 'Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file'
    log.critical(failure_msg)
    raise Exception(failure_msg)
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""
    def __init__(self, agentConfig):
        """Wire up the config store, docker/kubernetes clients and template variable resolvers."""
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as err:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(err))
            # Retry with the config backend disabled.
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)

        # kubeutil stays None outside Kubernetes or when the client fails to start.
        self.kubeutil = None
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as err:
                log.error("Couldn't instantiate the kubernetes client, "
                          "subsequent kubernetes calls will fail as well. Error: %s"
                          % str(err))

        self.metadata_collector = MetadataCollector()

        # Template variable prefix -> method computing its value.
        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'pid': self._get_container_pid,
            'port': self._get_port,
            'container-name': self._get_container_name,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)

    def _make_fetch_state(self):
        """Build a fetch state from the docker inspect function and, on Kubernetes, the pod list.

        The pod list stays empty when not on Kubernetes, when the kubelet
        client is not initialized, or when retrieving the pods fails.
        """
        pods = []
        if Platform.is_k8s():
            kube_ready = self.kubeutil and self.kubeutil.init_success
            if kube_ready:
                try:
                    pods = self.kubeutil.retrieve_pods_list().get('items', [])
                except Exception as err:
                    log.warning("Failed to retrieve pod list: %s" % str(err))
            else:
                log.error(
                    "kubelet client not initialized, cannot retrieve pod list.")

        inspect_fn = self.dockerutil.client.inspect_container
        return _SDDockerBackendConfigFetchState(inspect_fn, pods)

    def update_checks(self, changed_containers):
        """
        Take a list of container IDs that changed recently and mark their
        corresponding checks as needing a configuration reload.

        The result is stored in self.reload_check_configs: the set of check
        names to reload, or True when _get_checks_to_refresh asked for a
        full reload. Nothing is done when the docker client is unavailable.
        """
        if not self.dockerutil.client:
            log.warning(
                "Docker client is not initialized, pausing auto discovery.")
            return

        state = self._make_fetch_state()

        conf_reload_set = set()
        for c_id in changed_containers:
            checks = self._get_checks_to_refresh(state, c_id)
            if checks:
                conf_reload_set.update(checks)

        # _get_checks_to_refresh sets reload_check_configs to True when every
        # check must be reloaded; a partial set must not downgrade that request.
        full_reload_pending = getattr(self, 'reload_check_configs', False) is True
        if conf_reload_set and not full_reload_pending:
            self.reload_check_configs = conf_reload_set

    def _get_checks_to_refresh(self, state, c_id):
        """Look up, in the config store, the checks applied to a container.

        The container is identified by its STACKSTATE_ID label, or by its
        image name when the label is missing. Returns None after flagging a
        full reload when the affected checks cannot be determined.
        """
        c_inspect = state.inspect_container(c_id)

        # If the container was removed we can't tell which check is concerned
        # so we have to reload everything.
        # Same thing if it's stopped and we're on Kubernetes in auto_conf mode
        # because the pod was deleted and its template could have been in the annotations.
        reload_everything = not c_inspect
        if not reload_everything:
            stopped = not c_inspect.get('State', {}).get('Running')
            reload_everything = stopped and Platform.is_k8s() \
                and not self.agentConfig.get('sd_config_backend')
        if reload_everything:
            self.reload_check_configs = True
            return None

        labels = c_inspect.get('Config', {}).get('Labels', {})
        identifier = labels.get(STACKSTATE_ID)
        if not identifier:
            identifier = self.dockerutil.image_name_extractor(c_inspect)

        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs['kube_annotations'] = kube_metadata.get('annotations')
            platform_kwargs['kube_container_name'] = state.get_kube_container_name(c_id)
        if labels:
            platform_kwargs['docker_labels'] = labels

        return self.config_store.get_checks_to_refresh(
            identifier, **platform_kwargs)

    def _get_container_pid(self, state, cid, tpl_var):
        """Extract the host-namespace pid of the container pid 0"""
        pid = state.inspect_container(cid).get('State', {}).get('Pid')
        if not pid:
            return None

        return str(pid)

    def _get_host_address(self, state, c_id, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_inspect = state.inspect_container(c_id)
        c_id = c_inspect.get('Id', '')
        c_img = self.dockerutil.image_name_extractor(c_inspect)

        networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
        ip_dict = {}
        for net_name, net_desc in networks.iteritems():
            ip = net_desc.get('IPAddress')
            if ip:
                ip_dict[net_name] = ip
        ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
        if ip_addr:
            return ip_addr

        # try to get the bridge (default) IP address
        log.debug("No IP address was found in container %s (%s) "
                  "networks, trying with the IPAddress field" %
                  (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if Platform.is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_ip = state.get_kube_config(c_id, 'status').get('podIP')
            if pod_ip:
                return pod_ip

        if Platform.is_rancher():
            # try to get the rancher IP address
            log.debug("No IP address was found in container %s (%s) "
                      "trying with the Rancher label" % (c_id[:12], c_img))

            ip_addr = c_inspect.get('Config',
                                    {}).get('Labels',
                                            {}).get(RANCHER_CONTAINER_IP)
            if ip_addr:
                return ip_addr.split('/')[0]

        log.error("No IP address was found for container %s (%s)" %
                  (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_', 1)

        # no specifier
        if len(tpl_parts) < 2:
            log.debug("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        else:
            res = ip_dict.get(tpl_parts[-1])
            if res is None:
                log.warning(
                    "The key passed for template variable %s was not found." %
                    tpl_var)
                return self._get_fallback_ip(ip_dict)
            else:
                return res

    def _get_fallback_ip(self, ip_dict):
        """Return the IP of the 'bridge' network, else the IP of the last network name in sorted order."""
        if 'bridge' not in ip_dict:
            last_key = sorted(ip_dict)[-1]
            log.debug("Trying with the last (sorted) network: '%s'." %
                      last_key)
            return ip_dict[last_key]

        log.debug("Using the bridge network.")
        return ip_dict['bridge']

    def _get_port(self, state, c_id, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        container_inspect = state.inspect_container(c_id)

        try:
            ports = map(lambda x: x.split('/')[0],
                        container_inspect['NetworkSettings']['Ports'].keys())
            if len(
                    ports
            ) == 0:  # There might be a key Port in NetworkSettings but no ports so we raise IndexError to check in ExposedPorts
                raise IndexError
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            ports = map(
                lambda x: x.split('/')[0],
                container_inspect['Config'].get('ExposedPorts', {}).keys())

            # if it failed, try with the kubernetes API
            if not ports and Platform.is_k8s():
                log.debug(
                    "Didn't find the port for container %s (%s), trying the kubernetes way."
                    % (c_id[:12], container_inspect.get('Config', {}).get(
                        'Image', '')))
                spec = state.get_kube_container_spec(c_id)
                if spec:
                    ports = [
                        str(x.get('containerPort'))
                        for x in spec.get('ports', [])
                    ]
        ports = sorted(ports, key=int)
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None

        tpl_parts = tpl_var.split('_', 1)

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error(
                "Port index is not an integer. Using the last element instead."
            )
        except IndexError:
            log.error(
                "Port index is out of range. Using the last element instead.")
        return ports[-1]

    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default.

        Starts from the tags computed by dockerutil, then adds
        platform-specific tags (Kubernetes, Swarm or Rancher) and finally
        the tags of the metadata collector when it has detected something.
        On Kubernetes, when the kubelet client or the pod metadata is
        unavailable, the tags gathered so far are returned as is.
        """
        c_inspect = state.inspect_container(c_id)
        tags = self.dockerutil.extract_container_tags(c_inspect)

        if Platform.is_k8s():
            # self.kubeutil is None when the client could not be
            # instantiated, so test it before reading init_success.
            if not self.kubeutil or not self.kubeutil.init_success:
                log.warning(
                    "kubelet client not initialized, kubernetes tags will be missing."
                )
                return tags

            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags will be missing." % c_id[:12])
                return tags

            # get pod labels
            kube_labels = pod_metadata.get('labels') or {}
            for label, value in kube_labels.items():
                tags.append('%s:%s' % (label, value))

            # get kubernetes namespace
            namespace = pod_metadata.get('namespace')
            tags.append('kube_namespace:%s' % namespace)

            # add creator tags
            creator_tags = self.kubeutil.get_pod_creator_tags(pod_metadata)
            tags.extend(creator_tags)

            # add services tags
            if self.kubeutil.collect_service_tag:
                services = self.kubeutil.match_services_for_pod(pod_metadata)
                for s in services:
                    if s is not None:
                        tags.append('kube_service:%s' % s)

        elif Platform.is_swarm():
            # 'Labels' may be null in the inspect output.
            c_labels = (c_inspect.get('Config') or {}).get('Labels') or {}
            swarm_svc = c_labels.get(SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        elif Platform.is_rancher():
            c_labels = (c_inspect.get('Config') or {}).get('Labels') or {}
            service_name = c_labels.get(RANCHER_SVC_NAME)
            stack_name = c_labels.get(RANCHER_STACK_NAME)
            container_name = c_labels.get(RANCHER_CONTAINER_NAME)
            if service_name:
                tags.append('rancher_service:%s' % service_name)
            if stack_name:
                tags.append('rancher_stack:%s' % stack_name)
            if container_name:
                tags.append('rancher_container:%s' % container_name)

        if self.metadata_collector.has_detected():
            orch_tags = self.metadata_collector.get_container_tags(
                co=c_inspect)
            tags.extend(orch_tags)

        return tags

    def _get_container_name(self, state, c_id, tpl_var):
        container_inspect = state.inspect_container(c_id)
        return container_inspect.get('Name', '').lstrip('/')

    def _get_additional_tags(self, state, c_id, *args):
        """Build the extra Kubernetes tags (node, pod and container name) of a container.

        Returns an empty list outside Kubernetes, or when the pod metadata
        or the pod spec cannot be fetched.
        """
        if not Platform.is_k8s():
            return []

        pod_metadata = state.get_kube_config(c_id, 'metadata')
        pod_spec = state.get_kube_config(c_id, 'spec')
        if pod_metadata is None or pod_spec is None:
            log.warning(
                "Failed to fetch pod metadata or pod spec for container %s."
                " Additional Kubernetes tags may be missing." % c_id[:12])
            return []

        tags = [
            'node_name:%s' % pod_spec.get('nodeName'),
            'pod_name:%s' % pod_metadata.get('name'),
        ]

        container_labels = state.inspect_container(c_id).get(
            'Config', {}).get('Labels', {})
        c_name = container_labels.get(KubeUtil.CONTAINER_NAME_LABEL)
        if c_name:
            tags.append('kube_container_name:%s' % c_name)
        return tags

    def get_configs(self):
        """Build the check configurations of every container listed by docker.

        Returns a dict {check name: (source, (init_config, [instances]))}.
        Instances of the same check coming from several containers are
        merged; the first init_config seen for a check is kept. The dict is
        empty when the docker client is unavailable.
        """
        configs = {}
        if not self.dockerutil.client:
            log.warning(
                "Docker client is not initialized, pausing auto discovery.")
            return configs

        state = self._make_fetch_state()
        containers = []
        for container in self.dockerutil.client.containers():
            containers.append((self.dockerutil.image_name_extractor(container),
                               container.get('Id'),
                               container.get('Labels')))

        for image, cid, labels in containers:
            try:
                # value of the STACKSTATE_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                found = self._get_check_configs(state, cid, identifier, labels)
                for source, (check_name, init_config, instance) in found or []:
                    # normalise mono-instance and multi-instances results
                    new_instances = instance if isinstance(instance, list) else [instance]

                    if configs.get(check_name) is None:
                        configs[check_name] = (source, (init_config, new_instances))
                        continue

                    kept_init_config, kept_instances = configs[check_name][1]
                    if kept_init_config != init_config:
                        log.warning(
                            'Different versions of `init_config` found for check {}. '
                            'Keeping the first one found.'.format(check_name))
                    kept_instances.extend(new_instances)
            except Exception:
                log.exception(
                    'Building config for container %s based on image %s using service '
                    'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Return the value of the STACKSTATE_ID label, or the image name when it is missing or empty."""
        custom_id = labels.get(STACKSTATE_ID)
        return custom_id if custom_id else image

    def _get_check_configs(self, state, c_id, identifier, labels=None):
        """Retrieve configuration templates and fill them with data pulled from docker and tags.

        Returns a list of (source, (check_name, init_config, instances))
        tuples, one per template found for the identifier, or None when no
        template is available.
        """
        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_container_name': state.get_kube_container_name(c_id),
                'kube_annotations': kube_metadata.get('annotations'),
            }
        if labels:
            platform_kwargs['docker_labels'] = labels

        config_templates = self._get_config_templates(identifier,
                                                      **platform_kwargs)
        if not config_templates:
            return None

        check_configs = []
        tags = self.get_tags(state, c_id)
        for source, config_tpl in config_templates:
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # covering mono-instance and multi-instances cases
            if isinstance(instance_tpl, list):
                tmpl_array = instance_tpl
            else:
                tmpl_array = [instance_tpl]

            # insert tags in each instance template and process values for template variables
            result_instances = []
            result_init_config = None
            for inst_tmpl in tmpl_array:
                filled_tpl, var_values = self._fill_tpl(
                    state, c_id, inst_tmpl, variables, tags)
                tpl = self._render_template(init_config_tpl or {},
                                            filled_tpl or {}, var_values)
                if tpl and len(tpl) == 2:
                    init_config, instance = tpl
                    result_instances.append(instance)
                    if not result_init_config:
                        result_init_config = init_config
                    elif result_init_config != init_config:
                        # Report the actual check name through the
                        # module-level logger used by the rest of the class.
                        log.warning(
                            "Different versions of `init_config` found for "
                            "check {}. Keeping the first one found.".format(
                                check_name))
            check_configs.append(
                (source, (check_name, result_init_config, result_instances)))

        return check_configs

    def _get_config_templates(self, identifier, **platform_kwargs):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
        else:
            auto_conf = False

        # format [(source, ('ident', {init_tpl}, {instance_tpl}))]
        raw_tpls = self.config_store.get_check_tpls(identifier,
                                                    auto_conf=auto_conf,
                                                    **platform_kwargs)
        for tpl in raw_tpls:
            # each template can come from either auto configuration or user-supplied templates
            try:
                source, (check_name, init_config_tpl, instance_tpl) = tpl
            except (TypeError, IndexError, ValueError):
                log.debug(
                    'No template was found for identifier %s, leaving it alone: %s'
                    % (identifier, tpl))
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict) and not isinstance(
                        instance_tpl, list):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception(
                    'Failed to decode the JSON template fetched for check {0}. Its configuration'
                    ' by service discovery failed for ident  {1}.'.format(
                        check_name, identifier))
                return None

            templates.append((source, (check_name, init_config_tpl,
                                       instance_tpl, variables)))

        return templates

    def _fill_tpl(self, state, c_id, instance_tpl, variables, c_tags=None):
        """Add container tags to instance templates and build a
           dict from template variable names and their values."""
        var_values = {}
        c_image = state.inspect_container(c_id).get('Config',
                                                    {}).get('Image', '')

        # add only default c_tags to the instance to avoid duplicate tags from conf
        if c_tags:
            tags = c_tags[:]  # shallow copy of the c_tags array
        else:
            tags = []
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            if isinstance(tpl_tags, dict):
                for key, val in tpl_tags.iteritems():
                    tags.append("{}:{}".format(key, val))
            else:
                tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](state, c_id, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." %
                                         var)
                    var_values[var] = res
                except Exception as ex:
                    log.error(
                        "Could not find a value for the template variable %s for container %s "
                        "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error(
                    "No method was found to interpolate template variable %s for container %s "
                    "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
Example #46
0
    def __init__(self, **kwargs):
        """Configure the Kubernetes helper from kwargs or from the check's config file.

        When both ``init_config`` and ``instance`` keyword arguments are given
        they are used directly (``None`` is treated as an empty dict).
        Otherwise the configuration file located by
        ``get_conf_path(KUBERNETES_CHECK_NAME)`` is loaded and its first
        instance is used. If that file is missing or invalid, an error is
        logged and empty settings are used instead.

        The constructor then derives the API server URL, the service-tag and
        leader-election options and the kubelet retry settings, and calls
        ``init_kubelet(instance)``.
        """
        self.docker_util = DockerUtil()
        if 'init_config' in kwargs and 'instance' in kwargs:
            # An explicit None would otherwise break the .get() calls below.
            init_config = kwargs.get('init_config') or {}
            instance = kwargs.get('instance') or {}
        else:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                init_config = check_config['init_config'] or {}
                instance = check_config['instances'][0] or {}
            # kubernetes.yaml was not found
            except IOError as ex:
                # str(ex) rather than ex.message: the latter is deprecated
                # (PEP 352) and empty when the error carries several arguments.
                log.error(str(ex))
                init_config, instance = {}, {}
            except Exception:
                log.error(
                    'Kubernetes configuration file is invalid. '
                    'Trying connecting to kubelet with default settings anyway...'
                )
                init_config, instance = {}, {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')
        # fall back on the host name when no pod name is exported
        self.pod_name = os.environ.get('KUBERNETES_POD_NAME') or self.host_name
        self.tls_settings = self._init_tls_settings(instance)

        # apiserver
        if 'api_server_url' in instance:
            self.kubernetes_api_root_url = instance.get('api_server_url')
        else:
            # build the URL from the service environment variables, falling
            # back on the class defaults
            master_host = os.environ.get(
                'KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME
            master_port = os.environ.get(
                'KUBERNETES_SERVICE_PORT') or self.DEFAULT_MASTER_PORT
            self.kubernetes_api_root_url = 'https://%s:%s' % (master_host,
                                                              master_port)

        self.kubernetes_api_url = '%s/api/v1' % self.kubernetes_api_root_url

        # Service mapping helper class
        self._service_mapper = PodServiceMapper(self)
        from config import _is_affirmative
        self.collect_service_tag = _is_affirmative(
            instance.get('collect_service_tags',
                         KubeUtil.DEFAULT_COLLECT_SERVICE_TAG))

        # leader status triggers event collection
        self.is_leader = False
        self.leader_elector = None
        self.leader_lease_duration = instance.get('leader_lease_duration')

        # kubelet
        # If kubelet_api_url is None, init_kubelet didn't succeed yet.
        self.init_success = False
        self.kubelet_api_url = None
        self.init_retry_interval = init_config.get('init_retry_interval',
                                                   DEFAULT_RETRY_INTERVAL)
        self.last_init_retry = None
        # one initial attempt plus the configured number of retries
        self.left_init_retries = init_config.get('init_retries',
                                                 DEFAULT_INIT_RETRIES) + 1
        self.init_kubelet(instance)

        self.kube_label_prefix = instance.get('label_to_tag_prefix',
                                              KubeUtil.DEFAULT_LABEL_PREFIX)
        self.kube_node_labels = instance.get('node_labels_to_host_tags', {})

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0
def reset_docker_settings():
    """Reset DockerUtil by applying an empty init_config and an empty instance."""
    empty_init_config, empty_instance = {}, {}
    docker_util = DockerUtil()
    docker_util.set_docker_settings(empty_init_config, empty_instance)
Example #48
0
    def init(self):
        """Set up the check's state from the first configured instance.

        Any exception raised along the way is logged as critical and reported
        through ``self.warning``; ``init_success`` is only set to True when
        every step completed.
        """
        try:
            instance = self.instances[0]

            def enabled(option, default):
                # instance option normalised through _is_affirmative
                return _is_affirmative(instance.get(option, default))

            # Docker helpers
            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            # Kubernetes helper, only attempted on k8s platforms
            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as kube_err:
                    self.kubeutil = None
                    self.log.error(
                        "Couldn't instantiate the kubernetes client, "
                        "subsequent kubernetes calls will fail as well. Error: %s"
                        % str(kube_err))

            # cgroup settings for this host, computed only once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags",
                                                       DEFAULT_LABELS_AS_TAGS)
            self.kube_labels = {}

            self.use_histogram = enabled('use_histogram', False)
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS),
            }

            # filtering settings come from docker_util
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # container network mapping cache
            self.network_mappings = {}

            # health check whitelist
            self.whitelist_patterns = None
            health_scs_whitelist = instance.get('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # feature toggles
            self.collect_image_stats = enabled('collect_images_stats', False)
            self.collect_container_size = enabled('collect_container_size', False)
            self.collect_container_count = enabled('collect_container_count', False)
            self.collect_volume_count = enabled('collect_volume_count', False)
            self.collect_events = enabled('collect_events', True)
            self.collect_image_size = enabled('collect_image_size', False)
            self.collect_disk_stats = enabled('collect_disk_stats', False)
            self.collect_exit_codes = enabled('collect_exit_codes', False)
            self.collect_ecs_tags = enabled('ecs_tags', True) and Platform.is_ecs_instance()

            self.ecs_tags = {}
            self.ecs_agent_local = None

        except Exception as init_err:
            self.log.critical(init_err)
            self.warning("Initialization failed. Will retry at next iteration")
            return

        self.init_success = True
    def test_include_filter(self):
        """Check metrics when everything is excluded except the redis image.

        The instance excludes ``.*`` and includes ``image_name:redis``. The
        redis container must therefore report performance metrics, while the
        nginx container must report none of them.
        """
        expected_metrics = [
            ('docker.containers.running', [
                'docker_image:nginx:latest', 'image_name:nginx',
                'image_tag:latest'
            ]),
            ('docker.containers.running', [
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.containers.stopped', [
                'docker_image:nginx:latest', 'image_name:nginx',
                'image_tag:latest'
            ]),
            ('docker.containers.stopped', [
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.cpu.system', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.cpu.user', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.image.size', ['image_name:redis', 'image_tag:latest']),
            ('docker.image.size', ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size',
             ['image_name:nginx', 'image_tag:latest']),
            ('docker.image.virtual_size',
             ['image_name:redis', 'image_tag:latest']),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.io.read_bytes', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.io.write_bytes', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.mem.cache', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.mem.rss', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest'
            ]),
            ('docker.net.bytes_rcvd', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest', 'docker_network:bridge'
            ]),
            ('docker.net.bytes_sent', [
                'container_name:test-new-redis-latest',
                'docker_image:redis:latest', 'image_name:redis',
                'image_tag:latest', 'docker_network:bridge'
            ])
        ]
        config = {
            "init_config": {},
            "instances": [
                {
                    "url": "unix://var/run/docker.sock",
                    "include": ["image_name:redis"],
                    "exclude": [".*"],
                    "collect_images_stats": True,
                    "collect_image_size": True,
                },
            ],
        }
        # start from a fresh DockerUtil built with this test's settings
        DockerUtil._drop()
        DockerUtil(init_config=config['init_config'],
                   instance=config['instances'][0])

        self.run_check_twice(config, force_reload=True)

        for mname, tags in expected_metrics:
            self.assertMetric(mname, tags=tags, count=1, at_least=1)

        perf_metrics = [
            "docker.cpu.system", "docker.cpu.user", "docker.io.read_bytes",
            "docker.io.write_bytes", "docker.mem.cache", "docker.mem.rss",
            "docker.net.bytes_rcvd", "docker.net.bytes_sent"
        ]

        nginx_tags = [
            'container_name:test-new-nginx-latest',
            'docker_image:nginx:latest', 'image_name:nginx', 'image_tag:latest'
        ]
        # The excluded nginx container must not report any performance metric.
        # Assert on the loop variable: the previous code reused the stale
        # `mname` from the loop above and so only ever checked one metric.
        for perf_metric in perf_metrics:
            self.assertMetric(perf_metric, tags=nginx_tags, count=0)
Example #50
0
 def is_swarm():
     """Return what ``DockerUtil().is_swarm()`` reports."""
     # imported at call time rather than at module import
     from utils.dockerutil import DockerUtil
     docker_util = DockerUtil()
     return docker_util.is_swarm()
    def test_labels_collection(self):
        """Run the check with ``collect_labels_as_tags: ["label1"]`` and verify tags.

        Every expected (metric, tags) pair must be matched exactly once; the
        nginx entries are expected to carry the extra ``label1:nginx`` tag.
        """
        # tag building blocks, in the order the expectations list them
        nginx_tags = ['docker_image:nginx:latest', 'image_name:nginx',
                      'image_tag:latest', 'label1:nginx']
        redis_tags = ['docker_image:redis:latest', 'image_name:redis',
                      'image_tag:latest']
        nginx_container = ['container_name:test-new-nginx-latest'] + nginx_tags
        redis_container = ['container_name:test-new-redis-latest'] + redis_tags
        nginx_image = ['image_name:nginx', 'image_tag:latest']
        redis_image = ['image_name:redis', 'image_tag:latest']

        # each entry gets its own list copy
        expectations = [
            ('docker.containers.running', list(nginx_tags)),
            ('docker.containers.running', list(redis_tags)),
            ('docker.containers.stopped', list(redis_tags)),
            ('docker.containers.stopped', list(nginx_tags)),
            ('docker.image.size', list(redis_image)),
            ('docker.image.size', list(nginx_image)),
            ('docker.image.virtual_size', list(redis_image)),
            ('docker.image.virtual_size', list(nginx_image)),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.mem.cache', list(nginx_container)),
            ('docker.mem.cache', list(redis_container)),
            ('docker.mem.rss', list(nginx_container)),
            ('docker.mem.rss', list(redis_container)),
            ('docker.mem.limit', list(nginx_container)),
            ('docker.mem.in_use', list(nginx_container)),
        ]

        instance = {
            "url": "unix://var/run/docker.sock",
            "collect_labels_as_tags": ["label1"],
            "collect_image_size": True,
            "collect_images_stats": True,
            "collect_container_count": True,
            "collect_dead_container_count": True,
            "collect_exited_container_count": True,
            "collect_volume_count": True,
            "collect_dangling_volume_count": True,
        }
        config = {"init_config": {}, "instances": [instance]}

        # rebuild DockerUtil with this test's settings
        DockerUtil._drop()
        DockerUtil(init_config=config['init_config'],
                   instance=config['instances'][0])

        self.run_check(config, force_reload=True)
        for metric_name, metric_tags in expectations:
            self.assertMetric(metric_name, tags=metric_tags, count=1, at_least=1)
Example #52
0
def get_hostname(config=None):
    """
    Return the canonical host name this agent should identify as.

    Candidates are tried in this order:

      * the ``hostname`` entry of the agent config (loaded with
        ``get_config`` when ``config`` is None)
      * the GCE host name
      * when containerized: the Docker host name, else (on k8s) the node
        name reported by the kubelet
      * ``_get_hostname_unix()`` on unix/solaris
      * the EC2 instance id, on non-Windows hosts when nothing was found,
        when running on ECS, or when the name is an EC2 default one
      * socket.gethostname()

    Raises ``Exception`` when no valid host name could be determined.
    """
    # the agent config wins over everything else
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)
    configured_name = config.get('hostname')
    if configured_name and is_valid_hostname(configured_name):
        return configured_name

    # GCE instance name
    gce_name = GCE.get_hostname(config)
    if gce_name is not None and is_valid_hostname(gce_name):
        return gce_name

    hostname = None

    # container platforms
    if Platform.is_containerized():
        # Docker API first
        docker_name = DockerUtil().get_hostname(use_default_gw=False)
        if docker_name is not None and is_valid_hostname(docker_name):
            hostname = docker_name
        elif Platform.is_k8s():
            # then the kubelet
            try:
                kube_util = KubeUtil()
            except Exception as ex:
                log.error("Couldn't instantiate the kubernetes client, "
                          "getting the k8s hostname won't work. Error: %s" %
                          str(ex))
            else:
                _, kube_name = kube_util.get_node_info()
                if kube_name is not None and is_valid_hostname(kube_name):
                    hostname = kube_name

    # os-specific detection
    if hostname is None and (Platform.is_unix() or Platform.is_solaris()):
        unix_name = _get_hostname_unix()
        if unix_name and is_valid_hostname(unix_name):
            hostname = unix_name

    # without a hostname, or with an ec2 default one, prefer the instance id
    if not Platform.is_windows():
        if hostname is None or Platform.is_ecs_instance() or EC2.is_default(hostname):
            instance_id = EC2.get_instance_id(config)
            if instance_id:
                hostname = instance_id

    # last resort: socket.gethostname(), socket.getfqdn() is too unreliable
    if hostname is None:
        try:
            socket_name = socket.gethostname()
        except socket.error:
            socket_name = None
        if socket_name and is_valid_hostname(socket_name):
            hostname = socket_name

    if hostname is not None:
        return hostname

    failure = 'Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file'
    log.critical(failure)
    raise Exception(failure)
    def test_container_size(self):
        """Run the check with ``collect_container_size`` enabled and verify metrics.

        Besides the usual container, image and memory metrics, the container
        size metrics (``size_rootfs`` and ``size_rw``) must each be matched
        exactly once with the expected tags.
        """
        # tag building blocks, in the order the expectations list them
        nginx_tags = ['docker_image:nginx:latest', 'image_name:nginx',
                      'image_tag:latest']
        redis_tags = ['docker_image:redis:latest', 'image_name:redis',
                      'image_tag:latest']
        nginx_container = ['container_name:test-new-nginx-latest'] + nginx_tags
        redis_container = ['container_name:test-new-redis-latest'] + redis_tags
        nginx_image = ['image_name:nginx', 'image_tag:latest']
        redis_image = ['image_name:redis', 'image_tag:latest']

        # each entry gets its own list copy
        expectations = [
            ('docker.containers.running', list(nginx_tags)),
            ('docker.containers.running', list(redis_tags)),
            ('docker.containers.stopped', list(redis_tags)),
            ('docker.containers.stopped', list(nginx_tags)),
            ('docker.image.size', list(redis_image)),
            ('docker.image.size', list(nginx_image)),
            ('docker.image.virtual_size', list(redis_image)),
            ('docker.image.virtual_size', list(nginx_image)),
            ('docker.images.available', None),
            ('docker.images.intermediate', None),
            ('docker.mem.cache', list(nginx_container)),
            ('docker.mem.cache', list(redis_container)),
            ('docker.mem.rss', list(nginx_container)),
            ('docker.mem.rss', list(redis_container)),
            ('docker.mem.limit', list(nginx_container)),
            ('docker.mem.in_use', list(nginx_container)),
            # Container size metrics
            ("docker.container.size_rootfs", list(nginx_container)),
            ("docker.container.size_rootfs", list(redis_container)),
            ("docker.container.size_rw", list(nginx_container)),
        ]

        instance = {
            "url": "unix://var/run/docker.sock",
            "collect_container_size": True,
            "collect_image_size": True,
            "collect_images_stats": True,
        }
        config = {"init_config": {}, "instances": [instance]}

        # push this test's settings into DockerUtil
        docker_util = DockerUtil()
        docker_util.set_docker_settings(config['init_config'],
                                        config['instances'][0])

        self.run_check(config, force_reload=True)
        for metric_name, metric_tags in expectations:
            self.assertMetric(metric_name, tags=metric_tags, count=1, at_least=1)
Example #54
0
 def _get_cgroup_from_proc(self, cgroup, pid, filename):
     """Return the cgroup file path for ``pid``, with ``filename`` filled in.

     The template returned by ``DockerUtil.find_cgroup_from_proc`` is
     %-formatted with a mapping that holds ``filename`` under the "file" key.
     """
     path_template = DockerUtil.find_cgroup_from_proc(
         self._mountpoints, pid, cgroup, self.docker_util._docker_root)
     return path_template % {"file": filename}
Example #55
0
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""
    def __init__(self, name, init_config, agentConfig, instances=None):
        """Build the check, refusing more than one instance, then run ``init()``.

        Raises ``Exception`` when ``instances`` holds more than one entry.
        """
        has_several_instances = instances is not None and len(instances) > 1
        if has_several_instances:
            raise Exception(
                "Docker check only supports one configured instance.")

        AgentCheck.__init__(self, name, init_config, agentConfig,
                            instances=instances)

        self.init_success = False

        # service discovery counts only when enabled AND backed by docker
        sd_setting = agentConfig.get('service_discovery')
        self._service_discovery = (
            sd_setting and
            agentConfig.get('service_discovery_backend') == 'docker')

        self.init()

    def init(self):
        """Set up the check's state from the first configured instance.

        Any exception raised along the way is logged as critical and reported
        through ``self.warning``; ``init_success`` is only set to True when
        every step completed.
        """
        try:
            instance = self.instances[0]

            def enabled(option, default):
                # instance option normalised through _is_affirmative
                return _is_affirmative(instance.get(option, default))

            # Docker helpers
            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            # Kubernetes helper, only attempted on k8s platforms
            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as kube_err:
                    self.kubeutil = None
                    self.log.error(
                        "Couldn't instantiate the kubernetes client, "
                        "subsequent kubernetes calls will fail as well. Error: %s"
                        % str(kube_err))

            # cgroup settings for this host, computed only once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags",
                                                       DEFAULT_LABELS_AS_TAGS)
            self.kube_labels = {}

            self.use_histogram = enabled('use_histogram', False)
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS),
            }

            # filtering settings come from docker_util
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # container network mapping cache
            self.network_mappings = {}

            # health check whitelist
            self.whitelist_patterns = None
            health_scs_whitelist = instance.get('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # feature toggles
            self.collect_image_stats = enabled('collect_images_stats', False)
            self.collect_container_size = enabled('collect_container_size', False)
            self.collect_container_count = enabled('collect_container_count', False)
            self.collect_volume_count = enabled('collect_volume_count', False)
            self.collect_events = enabled('collect_events', True)
            self.collect_image_size = enabled('collect_image_size', False)
            self.collect_disk_stats = enabled('collect_disk_stats', False)
            self.collect_exit_codes = enabled('collect_exit_codes', False)
            self.collect_ecs_tags = enabled('ecs_tags', True) and Platform.is_ecs_instance()

            self.ecs_tags = {}
            self.ecs_agent_local = None

        except Exception as init_err:
            self.log.critical(init_err)
            self.warning("Initialization failed. Will retry at next iteration")
            return

        self.init_success = True

    def check(self, instance):
        """Run one iteration of the Docker check for ``instance``.

        Initialization is retried first when it has not succeeded yet; the
        iteration is skipped if it still fails. The enabled collections
        (images, ECS tags, kube labels, events, performance, sizes, counts,
        disk stats, health service checks) are then run in turn.
        """
        if not self.init_success:
            # Initialization can fail if cgroups are not ready. So we retry if needed
            # https://github.com/DataDog/dd-agent/issues/1896
            self.init()
        if not self.init_success:
            # Initialization failed, will try later
            return

        # image metrics
        if self.collect_image_stats:
            self._count_and_weigh_images()

        if self.collect_ecs_tags:
            self.refresh_ecs_tags()

        if Platform.is_k8s():
            # labels are reset on every run, then refreshed when a client exists
            self.kube_labels = {}
            if self.kubeutil:
                try:
                    self.kube_labels = self.kubeutil.get_kube_labels()
                except Exception as label_err:
                    self.log.warning(
                        'Could not retrieve kubernetes labels: %s' % str(label_err))

        # containers running with custom cgroups?
        custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False))

        # health service checks are only sent when a whitelist is configured
        health_service_checks = bool(self.whitelist_patterns)

        # list the containers, then resolve their pids
        containers_by_id = self._get_and_count_containers(custom_cgroups,
                                                          health_service_checks)
        containers_by_id = self._crawl_container_pids(containers_by_id,
                                                      custom_cgroups)

        # Docker API events feed several features, not only event collection
        needs_events = (self.collect_events
                        or self._service_discovery
                        or not self._disable_net_metrics
                        or self.collect_exit_codes)
        if needs_events:
            self._process_events(containers_by_id)

        # performance container metrics (cpu, mem, net, io)
        self._report_performance_metrics(containers_by_id)

        if self.collect_container_size:
            self._report_container_size(containers_by_id)

        if self.collect_container_count:
            self._report_container_count(containers_by_id)

        if self.collect_volume_count:
            self._report_volume_count()

        # disk stats from the Docker info command
        if self.collect_disk_stats:
            self._report_disk_stats()

        if health_service_checks:
            self._send_container_healthcheck_sc(containers_by_id)

    def _count_and_weigh_images(self):
        try:
            tags = self._get_tags()
            active_images = self.docker_client.images(all=False)
            active_images_len = len(active_images)
            all_images_len = len(
                self.docker_client.images(quiet=True, all=True))
            self.gauge("docker.images.available", active_images_len, tags=tags)
            self.gauge("docker.images.intermediate",
                       (all_images_len - active_images_len),
                       tags=tags)

            if self.collect_image_size:
                self._report_image_size(active_images)

        except Exception as e:
            # It's not an important metric, keep going if it fails
            self.warning(
                "Failed to count Docker images. Exception: {0}".format(e))

    def _get_and_count_containers(self,
                                  custom_cgroups=False,
                                  healthchecks=False):
        """List all the containers from the API, filter and count them.

        A CRITICAL service check is sent (and an exception raised) when the
        containers cannot be listed, an OK one otherwise. When `custom_cgroups`
        or `healthchecks` is set, each kept container is inspected to attach
        its `_pid` and `health` entries.

        Returns a dict of the non-excluded containers, keyed by container id.
        """
        # Querying the size of containers is slow, we don't do it at each run
        with_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query +
                                   1) % SIZE_REFRESH_RATE

        running_by_tags = Counter()
        total_by_tags = Counter()

        try:
            containers = self.docker_client.containers(all=True,
                                                       size=with_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               message=message)
            raise Exception(message)

        self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

        # Create a set of filtered containers based on the exclude/include rules
        # and cache these rules in docker_util
        self._filter_containers(containers)

        needs_inspect = custom_cgroups or healthchecks
        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            # Containers are counted per distinct (sorted) set of tags
            tags_key = tuple(sorted(self._get_tags(container, CONTAINER)))
            total_by_tags[tags_key] += 1
            if self._is_container_running(container):
                running_by_tags[tags_key] += 1

            # Check if the container is included/excluded via its tags
            if self._is_container_excluded(container):
                self.log.debug(
                    "Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            if not needs_inspect:
                continue

            # grab pid via API if custom cgroups - otherwise we won't find process when
            # crawling for pids.
            try:
                state = self.docker_client.inspect_container(
                    container_name)['State']
                container['_pid'] = state['Pid']
                container['health'] = state.get('Health', {})
            except Exception as e:
                self.log.debug("Unable to inspect Docker container: %s", e)

        # TODO: deprecate these 2, they should be replaced by _report_container_count
        for tags, running in running_by_tags.iteritems():
            self.gauge("docker.containers.running", running, tags=list(tags))

        for tags, total in total_by_tags.iteritems():
            self.gauge("docker.containers.stopped",
                       total - running_by_tags[tags],
                       tags=list(tags))

        return containers_by_id

    def _is_container_running(self, container):
        """Tell if a container is running, according to its status.

        There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated.
        See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35
        """
        return container["Status"].startswith(
            "Up") or container["Status"].startswith("Restarting")

    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names.

        The result starts with the custom tags. When an entity is given, the
        tags built from its labels, the entity specific tags selected by
        `tag_type` (a key of `self.tag_names`), the ECS tags and the kube
        labels are appended.

        Side effect: the kubernetes/rancher label names are appended to
        `self.collect_labels_as_tags` when running on those platforms.

        Returns a new list of tag strings.
        """
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if Platform.is_k8s(
        ) and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)

        # Collect container names as tags on rancher
        if Platform.is_rancher():
            for rancher_label in (RANCHER_CONTAINER_NAME, RANCHER_SVC_NAME,
                                  RANCHER_STACK_NAME):
                if rancher_label not in self.collect_labels_as_tags:
                    self.collect_labels_as_tags.append(rancher_label)

        if entity is not None:
            pod_name = None
            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k in labels:
                        v = labels[k]
                        if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                            pod_name = v
                            k = "pod_name"
                            if "-" in pod_name:
                                # The namespace stays unknown when the pod name has
                                # no "namespace/" prefix and the namespace label is
                                # missing; it used to be left unbound in that case.
                                namespace = None
                                replication_controller = "-".join(
                                    pod_name.split("-")[:-1])
                                if "/" in replication_controller:  # k8s <= 1.1
                                    namespace, replication_controller = replication_controller.split(
                                        "/", 1)

                                elif KubeUtil.NAMESPACE_LABEL in labels:  # k8s >= 1.2
                                    namespace = labels[
                                        KubeUtil.NAMESPACE_LABEL]
                                    pod_name = "{0}/{1}".format(
                                        namespace, pod_name)

                                if namespace is not None:
                                    tags.append("kube_namespace:%s" %
                                                namespace)
                                tags.append("kube_replication_controller:%s" %
                                            replication_controller)
                                tags.append("pod_name:%s" % pod_name)

                        elif k == SWARM_SVC_LABEL and Platform.is_swarm():
                            if v:
                                tags.append("swarm_service:%s" % v)
                        elif k == RANCHER_CONTAINER_NAME and Platform.is_rancher(
                        ):
                            if v:
                                tags.append('rancher_container:%s' % v)
                        elif k == RANCHER_SVC_NAME and Platform.is_rancher():
                            if v:
                                tags.append('rancher_service:%s' % v)
                        elif k == RANCHER_STACK_NAME and Platform.is_rancher():
                            if v:
                                tags.append('rancher_stack:%s' % v)

                        elif not v:
                            # Labels without a value are sent as bare tags
                            tags.append(k)

                        else:
                            tags.append("%s:%s" % (k, v))

                    # The pod name label is expected on kubernetes: flag its absence
                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s(
                    ) and k not in labels:
                        tags.append("pod_name:no_pod")

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add ECS tags
            if self.collect_ecs_tags:
                entity_id = entity.get("Id")
                if entity_id in self.ecs_tags:
                    ecs_tags = self.ecs_tags[entity_id]
                    tags.extend(ecs_tags)

            # Add kube labels
            if Platform.is_k8s():
                kube_tags = self.kube_labels.get(pod_name)
                if kube_tags:
                    tags.extend(list(kube_tags))

        return tags

    def _extract_tag_value(self, entity, tag_name):
        """Extract the value of `tag_name` from an API result (container or image).

        Extracted values are memoized in the entity itself, under the
        "_tag_values" key. A warning is emitted and None is returned when no
        extractor exists for `tag_name`.
        """
        if tag_name in TAG_EXTRACTORS:
            # Create the per-entity cache on first use
            entity.setdefault("_tag_values", {})

            if tag_name not in entity["_tag_values"]:
                extractor = TAG_EXTRACTORS[tag_name]
                entity["_tag_values"][tag_name] = extractor(entity)

            return entity["_tag_values"][tag_name]

        self.warning("{0} isn't a supported tag".format(tag_name))
        return

    def _is_ecs_agent_local(self):
        """Return True if we can reach the ecs-agent over localhost, False otherwise.
        This is needed because if the ecs-agent is started with --net=host it won't have an IP address attached.
        """
        if self.ecs_agent_local is not None:
            return self.ecs_agent_local

        self.ecs_agent_local = False
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(5)
        try:
            result = sock.connect_ex(
                ('localhost', ECS_INTROSPECT_DEFAULT_PORT))
        except Exception as e:
            self.log.debug(
                "Unable to connect to ecs-agent. Exception: {0}".format(e))
        else:
            if result == 0:
                self.ecs_agent_local = True
            else:
                self.log.debug(
                    "ecs-agent is not available locally, encountered error code: {0}"
                    .format(result))
        sock.close()
        return self.ecs_agent_local

    def refresh_ecs_tags(self):
        """Rebuild `self.ecs_tags` from the task list exposed by the ecs-agent.

        The agent address is read from the inspection of the 'ecs-agent'
        container. When it has no IP address, localhost and then the docker
        gateway are tried on the default introspection port. If no address can
        be found, a warning is logged and the current tags are left untouched.

        On success `self.ecs_tags` maps each docker container id to its
        `task_name` / `task_version` tags. When the query fails, the failure
        is logged and only the entries built so far (possibly none) are kept.
        """
        ecs_config = self.docker_client.inspect_container('ecs-agent')
        network_settings = ecs_config.get('NetworkSettings', {})
        ip = network_settings.get('IPAddress')
        ports = network_settings.get('Ports')
        # Port keys look like "<port>/<protocol>"; use the first one
        port = next(iter(ports)).split('/')[0] if ports else None
        if not ip:
            port = ECS_INTROSPECT_DEFAULT_PORT
            if self._is_ecs_agent_local():
                ip = "localhost"
            elif Platform.is_containerized() and self.docker_gateway:
                ip = self.docker_gateway
            else:
                self.log.warning(
                    "Unable to determine ecs-agent IP address, skipping task tagging"
                )
                return

        ecs_tags = {}
        try:
            if ip and port:
                tasks = requests.get('http://%s:%s/v1/tasks' %
                                     (ip, port)).json()
                for task in tasks.get('Tasks', []):
                    for container in task.get('Containers', []):
                        tags = [
                            'task_name:%s' % task['Family'],
                            'task_version:%s' % task['Version']
                        ]
                        ecs_tags[container['DockerId']] = tags
        except (requests.exceptions.RequestException, ValueError) as e:
            # RequestException covers connection errors and timeouts as well as
            # HTTP errors; ValueError is raised by .json() on an invalid body.
            self.log.warning("Unable to collect ECS task names: %s" % e)

        self.ecs_tags = ecs_tags

    def _filter_containers(self, containers):
        """Rebuild the set of container names rejected by the filtering rules.

        Nothing is done when filtering is disabled in docker_util. Otherwise
        `self._filtered_containers` is reset and filled with the name of every
        container whose tags are filtered.
        """
        if not self.docker_util.filtering_enabled:
            return

        rejected = self._filtered_containers = set()
        for container in containers:
            # exclude/include patterns are stored in docker_util to share them with other container-related checks
            tags = self._get_tags(container, FILTERED)
            if not self.docker_util.are_tags_filtered(tags):
                continue

            name = DockerUtil.container_name_extractor(container)[0]
            rejected.add(name)
            self.log.debug("Container {0} is filtered".format(name))

    def _is_container_excluded(self, container):
        """Tell whether the filter rules exclude this container.

        The lookup is done by container name in `self._filtered_containers`,
        so _filter_containers has to run first.
        """
        return DockerUtil.container_name_extractor(
            container)[0] in self._filtered_containers

    def _report_container_size(self, containers_by_id):
        """Report the SizeRw / SizeRootFs values of each non-excluded container.

        A metric is only sent for the size fields present in the container.
        """
        size_metrics = (
            ("SizeRw", 'docker.container.size_rw'),
            ("SizeRootFs", 'docker.container.size_rootfs'),
        )

        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            submit = FUNC_MAP[GAUGE][self.use_histogram]
            for field, metric_name in size_metrics:
                if field in container:
                    submit(self, metric_name, container[field], tags=tags)

    def _send_container_healthcheck_sc(self, containers_by_id):
        """Send health service checks for containers.

        A container gets one service check as soon as any of its healthcheck
        tags matches any of the whitelist patterns.
        """
        for container in containers_by_id.itervalues():
            healthcheck_tags = self._get_tags(container, HEALTHCHECK)
            # any() stops at the first (tag, rule) pair that matches
            whitelisted = any(
                re.match(rule, tag) for tag in healthcheck_tags
                for rule in self.whitelist_patterns)
            if whitelisted:
                self._submit_healthcheck_sc(container)

    def _submit_healthcheck_sc(self, container):
        """Send the health service check of one container.

        The status is CRITICAL for 'unhealthy', OK for 'healthy' and UNKNOWN
        for anything else, including a container with no health data.
        """
        reported = container.get('health', {})
        health_state = reported.get('Status', '') if reported else ''

        if health_state == 'healthy':
            status = AgentCheck.OK
        elif health_state == 'unhealthy':
            status = AgentCheck.CRITICAL
        else:
            status = AgentCheck.UNKNOWN

        self.service_check(HEALTHCHECK_SERVICE_CHECK_NAME,
                           status,
                           tags=self._get_tags(container, CONTAINER))

    def _report_container_count(self, containers_by_id):
        """Report container count per state

        Excluded containers are ignored, and so are containers whose 'State'
        is missing or empty.
        """
        m_func = FUNC_MAP[GAUGE][self.use_histogram]

        per_state_count = defaultdict(int)
        for ctr in containers_by_id.values():
            if self._is_container_excluded(ctr):
                continue
            per_state_count[ctr.get('State', '')] += 1

        for state, count in per_state_count.items():
            if not state:
                continue
            m_func(self,
                   'docker.container.count',
                   count,
                   tags=['container_state:%s' % state.lower()])

    def _report_volume_count(self):
        """Report volume count per state (dangling or not)

        Two values of 'docker.volume.count' are sent, tagged with
        'volume_state:attached' and 'volume_state:dangling'.
        """
        m_func = FUNC_MAP[GAUGE][self.use_histogram]

        attached_volumes = self.docker_client.volumes(
            filters={'dangling': False})
        dangling_volumes = self.docker_client.volumes(
            filters={'dangling': True})
        # The 'Volumes' entry may be null (None) or missing when no volume
        # matches the filter: count that as zero instead of failing on len().
        attached_count = len(attached_volumes.get('Volumes') or [])
        dangling_count = len(dangling_volumes.get('Volumes') or [])
        m_func(self,
               'docker.volume.count',
               attached_count,
               tags=['volume_state:attached'])
        m_func(self,
               'docker.volume.count',
               dangling_count,
               tags=['volume_state:dangling'])

    def _report_image_size(self, images):
        """Send the 'VirtualSize' and 'Size' gauges of each image, when present."""
        size_metrics = (
            ('VirtualSize', 'docker.image.virtual_size'),
            ('Size', 'docker.image.size'),
        )
        for image in images:
            image_tags = self._get_tags(image, IMAGE)
            for field, metric_name in size_metrics:
                if field in image:
                    self.gauge(metric_name, image[field], tags=image_tags)

    # Performance metrics

    def _report_performance_metrics(self, containers_by_id):
        """Report cgroup and network metrics of the running, non-excluded containers.

        Network metrics need the container's "_proc_root" entry. The
        containers lacking it are listed in a single message, sent as a
        warning, or only logged at debug level on kubernetes.
        """
        missing_proc_root = []

        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue
            if not self._is_container_running(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)

            try:
                self._report_cgroup_metrics(container, tags)
                if "_proc_root" in container:
                    self._report_net_metrics(container, tags)
                else:
                    missing_proc_root.append(
                        DockerUtil.container_name_extractor(container)[0])
            except BogusPIDException as e:
                self.log.warning('Unable to report cgroup metrics: %s', e)

        if not missing_proc_root:
            return

        message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
            ", ".join(missing_proc_root))
        if Platform.is_k8s():
            # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
            self.log.debug(message)
        else:
            self.warning(message)

    def _report_cgroup_metrics(self, container, tags):
        """Report the metrics read from the cgroup stat files of a container.

        For each entry of CGROUP_METRICS the stat file is located from the
        container pid and parsed. The raw values listed under 'metrics' are
        sent, followed by the values computed from 'to_compute'.

        Raises BogusPIDException when the container has no usable '_pid'.
        """
        if not container.get('_pid'):
            raise BogusPIDException('Cannot report on bogus pid(0)')

        missing_files = 0
        for cgroup in CGROUP_METRICS:
            try:
                stat_file = self._get_cgroup_from_proc(cgroup["cgroup"],
                                                       container['_pid'],
                                                       cgroup['file'])
            except MountException as e:
                # We can't find a stat file
                self.warning(str(e))
                missing_files += 1
                if missing_files >= len(CGROUP_METRICS):
                    self.warning(
                        "Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now."
                    )
                continue

            stats = self._parse_cgroup_file(stat_file)
            if not stats:
                continue

            # Raw metrics, read straight from the stat file
            for key, (dd_key, metric_type) in cgroup['metrics'].iteritems():
                submit = FUNC_MAP[metric_type][self.use_histogram]
                if key in stats:
                    submit(self, dd_key, int(stats[key]), tags=tags)

            # Computed metrics
            computed = cgroup.get('to_compute', {})
            for mname, (key_list, fct, metric_type) in computed.iteritems():
                values = [stats[key] for key in key_list if key in stats]
                if len(values) != len(key_list):
                    self.log.debug(
                        "Couldn't compute {0}, some keys were missing."
                        .format(mname))
                    continue

                value = fct(*values)
                submit = FUNC_MAP[metric_type][self.use_histogram]
                if value is not None:
                    submit(self, mname, value, tags=tags)

    def _report_net_metrics(self, container, tags):
        """Find container network metrics by looking at /proc/$PID/net/dev of the container process.

        The interface to docker network mapping is cached per container id in
        `self.network_mappings`. Only the interfaces found in that mapping are
        reported, each tagged with its 'docker_network'.
        """
        if self._disable_net_metrics:
            self.log.debug("Network metrics are disabled. Skipping")
            return

        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')

        try:
            if container['Id'] not in self.network_mappings:
                self.network_mappings[container[
                    'Id']] = self.docker_util.get_container_network_mapping(
                        container)
            networks = self.network_mappings[container['Id']]
        except Exception as e:
            # Revert to previous behaviour if the method is missing or failing
            # Debug message will only appear once per container, then the cache is used
            self.log.debug(
                "Failed to build docker network mapping, using failsafe. Exception: {0}"
                .format(e))
            networks = {'eth0': 'bridge'}
            self.network_mappings[container['Id']] = networks

        try:
            with open(proc_net_file, 'r') as fp:
                lines = fp.readlines()

            # Two first lines are headers:
            # Inter-|   Receive                                                |  Transmit
            #  face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
            for line in lines[2:]:
                cols = line.split(':', 1)
                interface_name = str(cols[0]).strip()
                if interface_name not in networks:
                    continue

                net_tags = tags + [
                    'docker_network:' + networks[interface_name]
                ]
                counters = cols[1].split()
                m_func = FUNC_MAP[RATE][self.use_histogram]
                # Column 0 is the received bytes, column 8 the transmitted bytes
                m_func(self, "docker.net.bytes_rcvd", long(counters[0]),
                       net_tags)
                m_func(self, "docker.net.bytes_sent", long(counters[8]),
                       net_tags)

        except Exception as e:
            # It is possible that the container got stopped between the API call and now
            self.warning(
                "Failed to report IO metrics from file {0}. Exception: {1}".
                format(proc_net_file, e))

    def _invalidate_network_mapping_cache(self, api_events):
        for ev in api_events:
            try:
                if ev.get('Type') == 'network' and ev.get('Action').endswith(
                        'connect'):
                    container_id = ev.get('Actor').get('Attributes').get(
                        'container')
                    if container_id in self.network_mappings:
                        self.log.debug(
                            "Removing network mapping cache for container %s" %
                            container_id)
                        del self.network_mappings[container_id]
            except Exception:
                self.log.warning('Malformed network event: %s' % str(ev))

    def _process_events(self, containers_by_id):
        api_events = self._get_events()

        if self.collect_exit_codes:
            self._report_exit_codes(api_events, containers_by_id)

        if self.collect_events:
            try:
                aggregated_events = self._pre_aggregate_events(
                    api_events, containers_by_id)
                events = self._format_events(aggregated_events,
                                             containers_by_id)
            except (socket.timeout, urllib2.URLError):
                self.warning(
                    'Timeout when collecting events. Events will be missing.')
                return
            except Exception as e:
                self.warning(
                    "Unexpected exception when collecting events: {0}. "
                    "Events will be missing".format(e))
                return

            for ev in events:
                self.log.debug("Creating event: %s" % ev['msg_title'])
                self.event(ev)

    def _get_events(self):
        """Get the list of events."""
        events, changed_container_ids = self.docker_util.get_events()
        if not self._disable_net_metrics:
            self._invalidate_network_mapping_cache(events)
        if changed_container_ids and self._service_discovery:
            get_sd_backend(
                self.agentConfig).update_checks(changed_container_ids)
        return events

    def _pre_aggregate_events(self, api_events, containers_by_id):
        # Aggregate events, one per image. Put newer events first.
        events = defaultdict(deque)
        for event in api_events:
            # Skip events related to filtered containers
            container = containers_by_id.get(event.get('id'))
            if container is not None and self._is_container_excluded(
                    container):
                self.log.debug(
                    "Excluded event: container {0} status changed to {1}".
                    format(event['id'], event['status']))
                continue
            # from may be missing (for network events for example)
            if 'from' in event:
                events[event['from']].appendleft(event)
        return events

    def _format_events(self, aggregated_events, containers_by_id):
        """Turn the events grouped per image into agent events.

        Each image yields at most two events: a 'Low' priority one for the
        exec_create/exec_start statuses and a 'Normal' priority one for the
        others. Both carry the tags of all the known containers of the group.
        """
        formatted = []
        for image_name, event_group in aggregated_events.iteritems():
            container_tags = set()
            exec_events = []
            other_events = []

            for event in event_group:
                # Fall back on a short id for containers we know nothing about
                container_name = event['id'][:11]

                if event['id'] in containers_by_id:
                    cont = containers_by_id[event['id']]
                    container_name = DockerUtil.container_name_extractor(
                        cont)[0]
                    container_tags.update(self._get_tags(cont, PERFORMANCE))
                    container_tags.add('container_name:%s' % container_name)

                # health checks generate tons of these so we treat them separately and lower their priority
                is_exec = event['status'].startswith(
                    ('exec_create:', 'exec_start:'))
                bucket = exec_events if is_exec else other_events
                bucket.append((event, container_name))

            for priority, bucket in (('Low', exec_events),
                                     ('Normal', other_events)):
                dd_event = self._create_dd_event(bucket,
                                                 image_name,
                                                 container_tags,
                                                 priority=priority)
                if dd_event:
                    formatted.append(dd_event)

        return formatted

    def _report_exit_codes(self, events, containers_by_id):
        """Send a service check for each 'die' event of a known container.

        The check is OK for a zero exit code and CRITICAL otherwise. Events of
        unknown or excluded containers are ignored.
        """
        for event in events:
            container = containers_by_id.get(event.get('id'))
            if container is None:
                continue
            # Skip events related to filtered containers
            if self._is_container_excluded(container):
                continue
            # Report the exit code in case of a DIE event
            if event['status'] != 'die':
                continue

            container_name = DockerUtil.container_name_extractor(container)[0]
            container_tags = set(self._get_tags(container, CONTAINER))
            container_tags.add('container_name:%s' % container_name)
            try:
                exit_code = int(event['Actor']['Attributes']['exitCode'])
                if exit_code == 0:
                    status = AgentCheck.OK
                else:
                    status = AgentCheck.CRITICAL
                message = 'Container %s exited with %s' % (container_name,
                                                           exit_code)
                self.service_check(EXIT_SERVICE_CHECK_NAME,
                                   status,
                                   tags=list(container_tags),
                                   message=message)
            except KeyError:
                self.log.warning(
                    'Unable to collect the exit code for container %s' %
                    container_name)

    def _create_dd_event(self, events, image, c_tags, priority='Normal'):
        """Create the actual event to submit from a list of similar docker events"""
        if not events:
            return

        max_timestamp = 0
        status = defaultdict(int)
        status_change = []

        for ev, c_name in events:
            max_timestamp = max(max_timestamp, int(ev['time']))
            status[ev['status']] += 1
            status_change.append([c_name, ev['status']])

        status_text = ", ".join(
            ["%d %s" % (count, st) for st, count in status.iteritems()])
        msg_title = "%s %s on %s" % (image, status_text, self.hostname)
        msg_body = ("%%%\n"
                    "{image_name} {status} on {hostname}\n"
                    "```\n{status_changes}\n```\n"
                    "%%%").format(image_name=image,
                                  status=status_text,
                                  hostname=self.hostname,
                                  status_changes="\n".join([
                                      "%s \t%s" %
                                      (change[1].upper(), change[0])
                                      for change in status_change
                                  ]))

        if any(error in status_text for error in ERROR_ALERT_TYPE):
            alert_type = "error"
        else:
            alert_type = None

        return {
            'timestamp': max_timestamp,
            'host': self.hostname,
            'event_type': EVENT_TYPE,
            'msg_title': msg_title,
            'msg_text': msg_body,
            'source_type_name': EVENT_TYPE,
            'event_object': 'docker:%s' % image,
            'tags': list(c_tags),
            'alert_type': alert_type,
            'priority': priority
        }

    def _report_disk_stats(self):
        """Report metrics about the volume space usage"""
        stats = {
            'docker.data.used': None,
            'docker.data.total': None,
            'docker.data.free': None,
            'docker.metadata.used': None,
            'docker.metadata.total': None,
            'docker.metadata.free': None
            # these two are calculated by _calc_percent_disk_stats
            # 'docker.data.percent': None,
            # 'docker.metadata.percent': None
        }
        info = self.docker_client.info()
        driver_status = info.get('DriverStatus', [])
        if not driver_status:
            self.log.warning(
                'Disk metrics collection is enabled but docker info did not'
                ' report any. Your storage driver might not support them, skipping.'
            )
            return
        for metric in driver_status:
            # only consider metrics about disk space
            if len(metric) == 2 and 'Space' in metric[0]:
                # identify Data and Metadata metrics
                mtype = 'data'
                if 'Metadata' in metric[0]:
                    mtype = 'metadata'

                if 'Used' in metric[0]:
                    stats['docker.{0}.used'.format(mtype)] = metric[1]
                elif 'Space Total' in metric[0]:
                    stats['docker.{0}.total'.format(mtype)] = metric[1]
                elif 'Space Available' in metric[0]:
                    stats['docker.{0}.free'.format(mtype)] = metric[1]
        stats = self._format_disk_metrics(stats)
        stats.update(self._calc_percent_disk_stats(stats))
        tags = self._get_tags()
        for name, val in stats.iteritems():
            if val is not None:
                self.gauge(name, val, tags)

    def _format_disk_metrics(self, metrics):
        """Cast the disk stats to float and convert them to bytes

        `metrics` maps metric names to raw strings made of a number followed
        by a unit, with or without whitespace in between ("1.82 GB" or
        "1.82GB"). The dict is updated in place and returned: each raw value
        is replaced by its size in bytes, or by None when it cannot be parsed
        or its unit is not in UNIT_MAP. Empty values are left as they are.
        """
        for name, raw_val in metrics.iteritems():
            if not raw_val:
                continue

            # Do not rely on a single space between the number and its unit:
            # splitting on ' ' used to raise on values such as "22.61GB".
            parsed = re.match(r'^\s*(\S+?)\s*([a-zA-Z]+)\s*$', raw_val)
            if parsed is None:
                self.log.error(
                    'Unable to parse value %s for disk metric %s. Dropping it.'
                    % (raw_val, name))
                metrics[name] = None
                continue

            val, unit = parsed.groups()
            # by default some are uppercased others lowercased. That's error prone.
            unit = unit.lower()
            try:
                metrics[name] = int(float(val) * UNIT_MAP[unit])
            except KeyError:
                self.log.error(
                    'Unrecognized unit %s for disk metric %s. Dropping it.'
                    % (unit, name))
                metrics[name] = None
            except ValueError:
                # The part in front of the unit is not a number
                self.log.error(
                    'Unable to parse value %s for disk metric %s. Dropping it.'
                    % (raw_val, name))
                metrics[name] = None
        return metrics

    def _calc_percent_disk_stats(self, stats):
        """Calculate a percentage of used disk space for data and metadata"""
        mtypes = ['data', 'metadata']
        percs = {}
        for mtype in mtypes:
            used = stats.get('docker.{0}.used'.format(mtype))
            total = stats.get('docker.{0}.total'.format(mtype))
            free = stats.get('docker.{0}.free'.format(mtype))
            if used and total and free and ceil(total) < free + used:
                self.log.debug(
                    'used, free, and total disk metrics may be wrong, '
                    'used: %s, free: %s, total: %s', used, free, total)
                total = used + free
            try:
                if isinstance(used, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(
                        100 * float(used) / float(total), 2)
                elif isinstance(free, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(
                        100 * (1.0 - (float(free) / float(total))), 2)
            except ZeroDivisionError:
                self.log.error(
                    'docker.{0}.total is 0, calculating docker.{1}.percent'
                    ' is not possible.'.format(mtype, mtype))
        return percs

    # Cgroups
    def _get_cgroup_from_proc(self, cgroup, pid, filename):
        """Build the path of a cgroup file holding metrics to extract.

        The value returned by ``DockerUtil.find_cgroup_from_proc`` is used as
        a ``%``-format template and interpolated with ``{"file": filename}``.
        """
        path_template = DockerUtil.find_cgroup_from_proc(
            self._mountpoints,
            pid,
            cgroup,
            self.docker_util._docker_root,
        )
        return path_template % {"file": filename}

    def _parse_cgroup_file(self, stat_file):
        """Parse a cgroup pseudo file for key/values."""
        self.log.debug("Opening cgroup file: %s" % stat_file)
        try:
            with open(stat_file, 'r') as fp:
                if 'blkio' in stat_file:
                    return self._parse_blkio_metrics(fp.read().splitlines())
                else:
                    return dict(
                        map(lambda x: x.split(' ', 1),
                            fp.read().splitlines()))
        except IOError:
            # It is possible that the container got stopped between the API call and now.
            # Some files can also be missing (like cpu.stat) and that's fine.
            self.log.debug("Can't open %s. Its metrics will be missing." %
                           stat_file)

    def _parse_blkio_metrics(self, stats):
        """Parse the blkio metrics."""
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics

    def _is_container_cgroup(self, line, selinux_policy):
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu',
                           'cpuacct') or line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]:  # general case
            return True
        if 'docker' in selinux_policy:  # selinux
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE,
                                                line[2][1:]):  # kubernetes
            return True
        if line[2].startswith('/') and re.match(
                CONTAINER_ID_RE,
                line[2].split('/')[-1]):  # kube 1.6+ qos hierarchy
            return True
        return False

    # proc files
    def _crawl_container_pids(self, container_dict, custom_cgroups=False):
        """Crawl `/proc` to find container PIDs and add them to `containers_by_id`."""
        proc_path = os.path.join(self.docker_util._docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]

        if len(pid_dirs) == 0:
            self.warning(
                "Unable to find any pid directory in {0}. "
                "If you are running the agent in a container, make sure to "
                'share the volume properly: "/proc:/host/proc:ro". '
                "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. "
                "Network metrics will be missing".format(proc_path))
            self._disable_net_metrics = True
            return container_dict

        self._disable_net_metrics = False

        for folder in pid_dirs:
            try:
                path = os.path.join(proc_path, folder, 'cgroup')
                with open(path, 'r') as f:
                    content = [
                        line.strip().split(':') for line in f.readlines()
                    ]

                selinux_policy = ''
                path = os.path.join(proc_path, folder, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError, e:
                #  Issue #2074
                self.log.debug(
                    "Cannot read %s, process likely raced to finish : %s",
                    path, e)
            except Exception as e:
                self.warning("Cannot read %s : %s" % (path, str(e)))
                continue

            try:
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
                else:
                    continue

                matches = re.findall(CONTAINER_ID_RE, cpuacct)
                if matches:
                    container_id = matches[-1]
                    if container_id not in container_dict:
                        self.log.debug(
                            "Container %s not in container_dict, it's likely excluded",
                            container_id)
                        continue
                    container_dict[container_id]['_pid'] = folder
                    container_dict[container_id]['_proc_root'] = os.path.join(
                        proc_path, folder)
                elif custom_cgroups:  # if we match by pid that should be enough (?) - O(n) ugh!
                    for _, container in container_dict.iteritems():
                        if container.get('_pid') == int(folder):
                            container['_proc_root'] = os.path.join(
                                proc_path, folder)
                            break

            except Exception, e:
                self.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue
Example #56
0
def get_hostname(config=None):
    """
    Get the canonical host name this agent should identify as. This is
    the authoritative source of the host name for the agent.

    Tries, in order:

      * agent config (datadog.conf, "hostname:")
      * the GCE instance name
      * the docker hostname, when the agent is dockerized
      * 'hostname -f' (on unix)
      * the EC2 instance id (ECS instance, EC2-looking hostname, or Windows
        with the EC2 config service folder), which overrides the unix name
      * socket.gethostname()

    The first three sources return immediately when they yield a valid name.

    :param config: agent configuration mapping; loaded with
        ``get_config(parse_args=True)`` when omitted.
    :return: the host name.
    :raises Exception: when no source yields a usable host name.
    """
    # first, try the config
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)
    config_hostname = config.get('hostname')
    if config_hostname and is_valid_hostname(config_hostname):
        return config_hostname

    # Try to get GCE instance name
    gce_hostname = GCE.get_hostname(config)
    if gce_hostname is not None and is_valid_hostname(gce_hostname):
        return gce_hostname

    # Try to get the docker hostname
    docker_util = DockerUtil()
    if docker_util.is_dockerized():
        docker_hostname = docker_util.get_hostname()
        if docker_hostname is not None and is_valid_hostname(docker_hostname):
            return docker_hostname

    hostname = None

    # then move on to os-specific detection
    def _get_hostname_unix():
        try:
            # try fqdn
            out, _, rtcode = get_subprocess_output(['/bin/hostname', '-f'], log)
            if rtcode == 0:
                return out.strip()
        except Exception:
            pass
        return None

    os_name = get_os()
    if os_name in ['mac', 'freebsd', 'linux', 'solaris']:
        unix_hostname = _get_hostname_unix()
        if unix_hostname and is_valid_hostname(unix_hostname):
            hostname = unix_hostname

    # if the host is an ECS worker, or has an EC2 hostname
    # or it's a windows machine and the EC2 config service folder exists
    # try and find an EC2 instance ID
    looks_like_ec2 = (
        hostname is not None and
        any(hostname.lower().startswith(p) for p in [u'ip-', u'domu'])
    )
    # Raw string: same path as before, without relying on invalid escapes.
    if Platform.is_ecs_instance() or looks_like_ec2 or \
       (os_name == 'windows' and os.path.exists(r'C:\Program Files\Amazon\Ec2ConfigService')):
        instanceid = EC2.get_instance_id(config)
        if instanceid:
            hostname = instanceid

    # fall back on socket.gethostname(), socket.getfqdn() is too unreliable
    if hostname is None:
        try:
            socket_hostname = socket.gethostname()
        except socket.error:
            socket_hostname = None
        if socket_hostname and is_valid_hostname(socket_hostname):
            hostname = socket_hostname

    if hostname is None:
        log.critical('Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file')
        raise Exception('Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file')

    return hostname
Example #57
0
    def _get_and_count_containers(self,
                                  custom_cgroups=False,
                                  healthchecks=False):
        """List the containers through the API, filter them and count them.

        Reports the ``docker.containers.running`` and
        ``docker.containers.stopped`` gauges per set of container tags, and
        sends the service check describing whether the listing succeeded.

        :param custom_cgroups: inspect each kept container to store its pid.
        :param healthchecks: inspect each kept container to store its health.
        :return: dict of container id -> container for non-excluded containers.
        :raises Exception: when the containers cannot be listed.
        """
        # Querying the size of containers is slow, we don't do it at each run
        must_query_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE

        try:
            containers = self.docker_client.containers(all=True,
                                                       size=must_query_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               message=message)
            raise Exception(message)
        self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

        # Create a set of filtered containers based on the exclude/include rules
        # and cache these rules in docker_util
        self._filter_containers(containers)

        needs_inspect = custom_cgroups or healthchecks
        running_containers_count = Counter()
        all_containers_count = Counter()
        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]
            tag_key = tuple(sorted(self._get_tags(container, CONTAINER)))

            all_containers_count[tag_key] += 1
            if self._is_container_running(container):
                running_containers_count[tag_key] += 1

            # Check if the container is included/excluded via its tags
            if self._is_container_excluded(container):
                self.log.debug(
                    "Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            # grab pid via API if custom cgroups - otherwise we won't find process when
            # crawling for pids.
            if not needs_inspect:
                continue
            try:
                inspect_dict = self.docker_client.inspect_container(
                    container_name)
                state = inspect_dict['State']
                container['_pid'] = state['Pid']
                container['health'] = state.get('Health', {})
            except Exception as e:
                self.log.debug("Unable to inspect Docker container: %s", e)

        # TODO: deprecate these 2, they should be replaced by _report_container_count
        for tags, count in running_containers_count.iteritems():
            self.gauge("docker.containers.running", count, tags=list(tags))

        for tags, total in all_containers_count.iteritems():
            self.gauge("docker.containers.stopped",
                       total - running_containers_count[tags],
                       tags=list(tags))

        return containers_by_id
Example #58
0
class KubeUtil:
    """Helper gathering the access to the kubelet, cAdvisor and Kubernetes APIs.

    The class is declared with the ``Singleton`` metaclass (Python 2
    ``__metaclass__`` syntax).
    """
    __metaclass__ = Singleton

    DEFAULT_METHOD = 'http'
    MACHINE_INFO_PATH = '/api/v1.3/machine/'
    METRICS_PATH = '/api/v1.3/subcontainers/'
    PODS_LIST_PATH = '/pods/'
    DEFAULT_CADVISOR_PORT = 4194
    DEFAULT_KUBELET_PORT = 10255
    DEFAULT_MASTER_PORT = 8080
    DEFAULT_MASTER_NAME = 'kubernetes'  # DNS name to reach the master from a pod.
    CA_CRT_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/ca.crt'
    AUTH_TOKEN_PATH = '/var/run/secrets/kubernetes.io/serviceaccount/token'

    POD_NAME_LABEL = "io.kubernetes.pod.name"
    NAMESPACE_LABEL = "io.kubernetes.pod.namespace"

    def __init__(self, instance=None):
        """
        Build the kubelet/cAdvisor/apiserver URLs and the TLS settings.

        :param instance: check instance configuration (dict). When omitted,
            the first instance of the kubernetes check configuration file is
            used, falling back on an empty configuration (defaults) if that
            file is missing or invalid.
        """
        self.docker_util = DockerUtil()
        if instance is None:
            try:
                config_file_path = get_conf_path(KUBERNETES_CHECK_NAME)
                check_config = check_yaml(config_file_path)
                instance = check_config['instances'][0]
            # kubernetes.yaml was not found
            except IOError as ex:
                # str(ex) rather than the deprecated ex.message, which is
                # empty for errors built from (errno, strerror).
                log.error(str(ex))
                instance = {}
            except Exception:
                log.error('Kubernetes configuration file is invalid. '
                          'Trying connecting to kubelet with default settings anyway...')
                instance = {}

        self.method = instance.get('method', KubeUtil.DEFAULT_METHOD)
        self.host = instance.get("host") or self.docker_util.get_hostname()
        self.kubelet_host = os.environ.get('KUBERNETES_KUBELET_HOST') or self.host
        self._node_ip = self._node_name = None  # lazy evaluation
        self.host_name = os.environ.get('HOSTNAME')

        self.cadvisor_port = instance.get('port', KubeUtil.DEFAULT_CADVISOR_PORT)
        self.kubelet_port = instance.get('kubelet_port', KubeUtil.DEFAULT_KUBELET_PORT)

        self.kubelet_api_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.kubelet_port)
        self.cadvisor_url = '%s://%s:%d' % (self.method, self.kubelet_host, self.cadvisor_port)
        self.kubernetes_api_url = 'https://%s/api/v1' % (os.environ.get('KUBERNETES_SERVICE_HOST') or self.DEFAULT_MASTER_NAME)
        self.tls_settings = self._init_tls_settings(instance)

        self.metrics_url = urljoin(self.cadvisor_url, KubeUtil.METRICS_PATH)
        self.machine_info_url = urljoin(self.cadvisor_url, KubeUtil.MACHINE_INFO_PATH)
        self.pods_list_url = urljoin(self.kubelet_api_url, KubeUtil.PODS_LIST_PATH)
        self.kube_health_url = urljoin(self.kubelet_api_url, 'healthz')

        # keep track of the latest k8s event we collected and posted
        # default value is 0 but TTL for k8s events is one hour anyways
        self.last_event_collection_ts = 0

    def _init_tls_settings(self, instance):
        """
        Initialize TLS settings for connection to apiserver and kubelet.

        Returns a dict that may contain ``apiserver_client_cert`` (cert/key
        tuple, only when both files exist), ``apiserver_cacert`` (only when
        the file exists) and ``bearer_token`` (when a token could be read).
        """
        tls_settings = {}

        client_crt = instance.get('apiserver_client_crt')
        client_key = instance.get('apiserver_client_key')
        apiserver_cacert = instance.get('apiserver_ca_cert')

        if client_crt and client_key and os.path.exists(client_crt) and os.path.exists(client_key):
            tls_settings['apiserver_client_cert'] = (client_crt, client_key)

        if apiserver_cacert and os.path.exists(apiserver_cacert):
            tls_settings['apiserver_cacert'] = apiserver_cacert

        token = self.get_auth_token()
        if token:
            tls_settings['bearer_token'] = token

        return tls_settings

    def get_kube_labels(self, excluded_keys=None):
        """
        Retrieve the pods list and extract the labels from it, see
        `extract_kube_labels`.
        """
        pods = self.retrieve_pods_list()
        return self.extract_kube_labels(pods, excluded_keys=excluded_keys)

    def extract_kube_labels(self, pods_list, excluded_keys=None):
        """
        Extract labels from a list of pods coming from
        the kubelet API.

        Returns a dict mapping "<namespace>/<pod name>" to a list of
        "kube_<label>:<value>" tags; labels whose key is in `excluded_keys`
        are skipped.
        """
        excluded_keys = excluded_keys or []
        kube_labels = defaultdict(list)
        pod_items = pods_list.get("items") or []
        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            namespace = metadata.get("namespace")
            labels = metadata.get("labels")
            if name and labels and namespace:
                key = "%s/%s" % (namespace, name)

                for k, v in labels.iteritems():
                    if k in excluded_keys:
                        continue

                    kube_labels[key].append(u"kube_%s:%s" % (k, v))

        return kube_labels

    def extract_meta(self, pods_list, field_name):
        """
        Extract fields like `uid` or `name` from the `metadata` section of a
        list of pods coming from the kubelet API.

        TODO: currently not in use, was added to support events filtering, consider to remove it.
        """
        uids = []
        pods = pods_list.get("items") or []
        for p in pods:
            value = p.get('metadata', {}).get(field_name)
            if value is not None:
                uids.append(value)
        return uids

    def retrieve_pods_list(self):
        """
        Retrieve the list of pods for this cluster querying the kubelet API.

        TODO: the list of pods could be cached with some policy to be decided.
        """
        return retrieve_json(self.pods_list_url)

    def retrieve_machine_info(self):
        """
        Retrieve machine info from Cadvisor.
        """
        return retrieve_json(self.machine_info_url)

    def retrieve_metrics(self):
        """
        Retrieve metrics from Cadvisor.
        """
        return retrieve_json(self.metrics_url)

    def filter_pods_list(self, pods_list, host_ip):
        """
        Filter out (in place) pods that are not running on the given host.

        TODO: currently not in use, was added to support events filtering, consider to remove it.
        """
        pod_items = pods_list.get('items') or []
        log.debug('Found {} pods to filter'.format(len(pod_items)))

        filtered_pods = []
        for pod in pod_items:
            status = pod.get('status', {})
            if status.get('hostIP') == host_ip:
                filtered_pods.append(pod)
        log.debug('Pods after filtering: {}'.format(len(filtered_pods)))

        pods_list['items'] = filtered_pods
        return pods_list

    def retrieve_json_auth(self, url, timeout=10):
        """
        Kubernetes API requires authentication using a token available in
        every pod, or with a client X509 cert/key pair.
        We authenticate using the service account token by default
        and replace this behavior with cert authentication if the user provided
        a cert/key pair in the instance.

        We try to verify the server TLS cert if the public cert is available.
        """
        verify = self.tls_settings.get('apiserver_cacert')
        if not verify:
            verify = self.CA_CRT_PATH if os.path.exists(self.CA_CRT_PATH) else False
        log.debug('ssl validation: {}'.format(verify))

        cert = self.tls_settings.get('apiserver_client_cert')
        # the bearer token is only sent when no client cert is configured
        bearer_token = self.tls_settings.get('bearer_token') if not cert else None
        headers = {'Authorization': 'Bearer {}'.format(bearer_token)} if bearer_token else None

        r = requests.get(url, timeout=timeout, headers=headers, verify=verify, cert=cert)
        r.raise_for_status()
        return r.json()

    def get_node_info(self):
        """
        Return the IP address and the hostname of the node where the pod is running.
        """
        if None in (self._node_ip, self._node_name):
            self._fetch_host_data()
        return self._node_ip, self._node_name

    def _fetch_host_data(self):
        """
        Retrieve host name and IP address from the payload returned by the listing
        pods endpoints from kubelet or kubernetes API.

        The host IP address is different from the default router for the pod.
        """
        try:
            pod_items = self.retrieve_pods_list().get("items") or []
        except Exception as e:
            log.warning("Unable to retrieve pod list %s. Not fetching host data", str(e))
            return

        for pod in pod_items:
            metadata = pod.get("metadata", {})
            name = metadata.get("name")
            # the pod running the agent is the one named after $HOSTNAME
            if name == self.host_name:
                status = pod.get('status', {})
                spec = pod.get('spec', {})
                # if not found, use an empty string - we use None as "not initialized"
                self._node_ip = status.get('hostIP', '')
                self._node_name = spec.get('nodeName', '')
                break

    def extract_event_tags(self, event):
        """
        Return a list of tags extracted from an event object
        """
        tags = []

        if 'reason' in event:
            tags.append('reason:%s' % event.get('reason', '').lower())
        if 'namespace' in event.get('metadata', {}):
            tags.append('namespace:%s' % event['metadata']['namespace'])
        if 'host' in event.get('source', {}):
            tags.append('node_name:%s' % event['source']['host'])
        if 'kind' in event.get('involvedObject', {}):
            tags.append('object_type:%s' % event['involvedObject'].get('kind', '').lower())

        return tags

    def are_tags_filtered(self, tags):
        """
        Because it is a pain to call it from the kubernetes check otherwise.
        """
        return self.docker_util.are_tags_filtered(tags)

    @classmethod
    def get_auth_token(cls):
        """
        Return a string containing the authorization token for the pod.

        Returns None (after logging an error) when the token file cannot be read.
        """
        try:
            with open(cls.AUTH_TOKEN_PATH) as f:
                return f.read()
        except IOError as e:
            log.error('Unable to read token from {}: {}'.format(cls.AUTH_TOKEN_PATH, e))

        return None
Example #59
0
 def test_image_name_from_container(self):
     """The image name is taken from the container's 'Image' entry."""
     container = {'Image': 'redis:3.2'}
     extracted = DockerUtil().image_name_extractor(container)
     self.assertEqual('redis:3.2', extracted)
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""

    def __init__(self, agentConfig):
        """Set up the docker client, the config store and the template variable resolvers.

        If the config store cannot be instantiated, ``sd_config_backend`` is
        reset to ``None`` in ``agentConfig`` and the store is requested again.
        """
        self.docker_client = DockerUtil().client
        if is_k8s():
            self.kubeutil = KubeUtil()

        try:
            config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            config_store = get_config_store(agentConfig=agentConfig)
        self.config_store = config_store

        # template variable name -> method resolving it
        self.VAR_MAPPING = dict(
            host=self._get_host_address,
            port=self._get_port,
            tags=self._get_additional_tags,
        )

        AbstractSDBackend.__init__(self, agentConfig)

    def _get_host_address(self, c_inspect, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API.

        Lookup order:
          1. when ``tpl_var`` carries a specifier (``host_<network>``), the
             per-network addresses in ``NetworkSettings.Networks``;
          2. the ``NetworkSettings.IPAddress`` field;
          3. on Kubernetes, the ``podIP`` of the pod listing this container.

        Null values in the inspect or kubelet payloads (``NetworkSettings``,
        ``items``, ``containerStatuses``, ``containerID``) are treated as
        missing instead of raising.

        :return: the IP address, or ``None`` when none was found.
        """
        c_id, c_img = c_inspect.get('Id', ''), c_inspect.get('Config', {}).get('Image', '')
        network_settings = c_inspect.get('NetworkSettings') or {}
        tpl_parts = tpl_var.split('_')

        # a specifier was given
        if len(tpl_parts) > 1:
            networks = network_settings.get('Networks') or {}
            ip_dict = {}
            for net_name, net_desc in networks.items():
                ip = net_desc.get('IPAddress')
                if ip:
                    ip_dict[net_name] = ip
            ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
            if ip_addr:
                return ip_addr

        # try to get the bridge IP address
        log.debug("No network found for container %s (%s), trying with IPAddress field" % (c_id[:12], c_img))
        ip_addr = network_settings.get('IPAddress')
        if ip_addr:
            return ip_addr

        if is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_list = self.kubeutil.retrieve_pods_list().get('items') or []
            for pod in pod_list:
                pod_status = pod.get('status') or {}
                pod_ip = pod_status.get('podIP')
                if pod_ip is None:
                    continue
                for status in pod_status.get('containerStatuses') or []:
                    # compare the container id with those of containers in the current pod
                    if c_id == (status.get('containerID') or '').split('//')[-1]:
                        return pod_ip

        log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_')

        # no specifier
        if len(tpl_parts) < 2:
            log.warning("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        else:
            res = ip_dict.get(tpl_parts[-1])
            if res is None:
                log.warning("The key passed for template variable %s was not found." % tpl_var)
                return self._get_fallback_ip(ip_dict)
            else:
                return res

    def _get_fallback_ip(self, ip_dict):
        """Return the IP of the 'bridge' network, else the IP stored under the last key in sorted order."""
        if 'bridge' not in ip_dict:
            last_key = sorted(ip_dict)[-1]
            log.warning("Trying with the last key: '%s'." % last_key)
            return ip_dict[last_key]

        log.warning("Using the bridge network.")
        return ip_dict['bridge']

    def _get_port(self, container_inspect, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        c_id = container_inspect.get('Id', '')

        try:
            ports = map(lambda x: x.split('/')[0], container_inspect['NetworkSettings']['Ports'].keys())
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            ports = map(lambda x: x.split('/')[0], container_inspect['Config'].get('ExposedPorts', {}).keys())

            # if it failed, try with the kubernetes API
            if not ports and is_k8s():
                log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                          (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
                co_statuses = self._get_kube_config(c_id, 'status').get('containerStatuses', [])
                c_name = None
                for co in co_statuses:
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                containers = self._get_kube_config(c_id, 'spec').get('containers', [])
                for co in containers:
                    if co.get('name') == c_name:
                        ports = map(lambda x: str(x.get('containerPort')), co.get('ports', []))
        ports = sorted(ports, key=lambda x: int(x))
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None

        tpl_parts = tpl_var.split('_')

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error("Port index is not an integer. Using the last element instead.")
        except IndexError:
            log.error("Port index is out of range. Using the last element instead.")
        return ports[-1]

    def get_tags(self, c_inspect):
        """Collect the default tags for a container from the platform APIs.

        Outside Kubernetes no tag is produced. On Kubernetes the pod labels,
        the replication controller name (when the pod was created by one) and
        the namespace are turned into tags; an empty list is returned when the
        pod metadata cannot be fetched.
        """
        if not is_k8s():
            return []

        pod_metadata = self._get_kube_config(c_inspect.get('Id'), 'metadata')
        if pod_metadata is None:
            log.warning("Failed to fetch pod metadata for container %s."
                        " Kubernetes tags may be missing." % c_inspect.get('Id', '')[:12])
            return []

        # get labels
        tags = [
            '%s:%s' % (label, value)
            for label, value in pod_metadata.get('labels', {}).iteritems()
        ]

        # get replication controller
        annotations = pod_metadata.get('annotations', {})
        created_by = json.loads(annotations.get('kubernetes.io/created-by', '{}'))
        reference = created_by.get('reference', {})
        if reference.get('kind') == 'ReplicationController':
            tags.append('kube_replication_controller:%s' % reference.get('name'))

        # get kubernetes namespace
        tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

        return tags

    def _get_additional_tags(self, container_inspect, *args):
        """Return the ``node_name`` and ``pod_name`` tags of a container on Kubernetes.

        An empty list is returned outside Kubernetes, or when the pod metadata
        or spec cannot be fetched. Extra positional arguments are ignored.
        """
        if not is_k8s():
            return []

        c_id = container_inspect.get('Id')
        pod_metadata = self._get_kube_config(c_id, 'metadata')
        pod_spec = self._get_kube_config(c_id, 'spec')
        if pod_metadata is None or pod_spec is None:
            log.warning("Failed to fetch pod metadata or pod spec for container %s."
                        " Additional Kubernetes tags may be missing." % container_inspect.get('Id', '')[:12])
            return []

        return [
            'node_name:%s' % pod_spec.get('nodeName'),
            'pod_name:%s' % pod_metadata.get('name'),
        ]

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API"""
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})

    def get_configs(self):
        """Build the check configurations for the containers listed by docker.

        Returns a dict keyed by check name. Each value is
        ``(init_config, [instances])``, or ``(source, (init_config, [instances]))``
        when config tracing is enabled. For a given check the first
        ``init_config`` found is kept and a warning is logged when a later one
        differs. A failure while handling one container is logged and the
        container is left alone.
        """
        containers = [
            (container.get('Image'), container.get('Id'), container.get('Labels'))
            for container in self.docker_client.containers()
        ]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        conflict_init_msg = 'Different versions of `init_config` found for check {0}. ' \
            'Keeping the first one found.'

        configs = {}
        for image, cid, labels in containers:
            try:
                # value of the DATADOG_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(cid, identifier, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf

                    check_name, init_config, instance = conf

                    known = configs.get(check_name)
                    if known is None:
                        # first config for this check: start its instances list
                        if trace_config:
                            configs[check_name] = (source, (init_config, [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                        continue

                    # when tracing, the (init_config, instances) pair is nested
                    # behind the source
                    known_init, instances = known[1] if trace_config else known
                    if known_init != init_config:
                        log.warning(conflict_init_msg.format(check_name))
                    instances.append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service '
                              'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Look for a DATADOG_ID label, return its value or the image name if missing.

        :param image: image name, used as the fallback identifier
        :param labels: mapping of the container labels; may be None or empty
        :return: the value of the DATADOG_ID label when present and truthy,
                 `image` otherwise
        """
        # `labels` is None when the container description carries no labels
        # (get_configs reads it with a plain `.get('Labels')`): fall back on the
        # image name instead of raising AttributeError
        if not labels:
            return image
        return labels.get(DATADOG_ID) or image

    def _get_check_configs(self, c_id, identifier, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        inspect = self.docker_client.inspect_container(c_id)
        config_templates = self._get_config_templates(identifier, trace_config=trace_config)
        if not config_templates:
            log.debug('No config template for container %s with identifier %s. '
                      'It will be left unconfigured.' % (c_id[:12], identifier))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                init_config, instance = tpl
                if trace_config:
                    check_configs.append((source, (check_name, init_config, instance)))
                else:
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, identifier, trace_config=False):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning('No supported configuration backend was provided, using auto-config only.')
        else:
            auto_conf = False

        # format: [('ident', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('ident', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(
            identifier, auto_conf=auto_conf, trace_config=trace_config)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug('No template was found for identifier %s, leaving it alone.' % identifier)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for ident  {1}.'.format(check_name, identifier))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl, instance_tpl, variables)))
            else:
                templates.append((check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a
           dict from template variable names and their values."""
        var_values = {}
        c_id, c_image = inspect.get('Id', ''), inspect.get('Config', {}).get('Image', '')

        # add default tags to the instance
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](inspect, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." % var)
                    var_values[var] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s for container %s "
                              "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s for container %s "
                          "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values