Example #1
    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)

        if not self.kubeutil.kubelet_api_url:
            raise Exception('Unable to reach kubelet. Try setting the host parameter.')

        if agentConfig.get('service_discovery') and \
           agentConfig.get('service_discovery_backend') == 'docker':
            self._sd_backend = get_sd_backend(agentConfig)
        else:
            self._sd_backend = None

        self.leader_candidate = inst.get(LEADER_CANDIDATE) if inst else None
        if self.leader_candidate:
            self.kubeutil.refresh_leader()

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning('Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s' % str(e))

            self.event_retriever = None
            self._configure_event_collection(inst)
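A minimal sketch of an instance dict exercising the options this __init__ reads, in the same Python-dict form the tests later in this file use; 'host' and 'namespace_name_regexp' appear verbatim above, while 'leader_candidate' is only a guess at the value of the LEADER_CANDIDATE constant:

    # Hypothetical instance dict; key names hedged as noted above.
    instance = {
        'host': 'localhost',
        'namespace_name_regexp': 'team-.*',
        'leader_candidate': True,  # assumed spelling of LEADER_CANDIDATE
    }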
Example #2
 def test_pod_name(self):
     with patch.object(KubeUtil,
                       '_locate_kubelet',
                       return_value='http://localhost:10255'):
         kube = KubeUtil()
         kube.__init__()  # It's a singleton, force re-init
         self.assertEqual('test', kube.pod_name)
Example #3
class KubeTestCase(unittest.TestCase):
    # Patch _locate_kubelet, which is used by KubeUtil.__init__
    def setUp(self):
        with patch.object(KubeUtil,
                          '_locate_kubelet',
                          return_value='http://localhost:10255'):
            self.kube = KubeUtil()
            self.kube.__init__()  # It's a singleton, force re-init

    def tearDown(self):
        self.kube = None

    @classmethod
    def _load_json_array(cls, names):
        json_array = []
        for filename in names:
            path = os.path.join(os.path.dirname(__file__), 'fixtures',
                                'kubeutil', filename)
            with open(path) as data_file:
                json_array.append(json.load(data_file))
        return json_array

    @classmethod
    def _load_resp_array(cls, names):
        json_array = cls._load_json_array(names)
        return map(lambda x: MockResponse(x, 200), json_array)
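A hedged usage sketch for the fixture helpers above; the patched method name and fixture file are illustrative, not taken from this codebase:

    # Hypothetical test pairing _load_resp_array with a mock side_effect;
    # 'retrieve_pods_list' and 'pods_list.json' are assumed names.
    def test_pods(self):
        with patch.object(KubeUtil, 'retrieve_pods_list',
                          side_effect=self._load_resp_array(['pods_list.json'])):
            resp = self.kube.retrieve_pods_list()  # a MockResponse(fixture, 200)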
Example #4
    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)
        self.docker_client = self.dockerutil.client
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                self.kubeutil = None
                log.error(
                    "Couldn't instantiate the kubernetes client, "
                    "subsequent kubernetes calls will fail as well. Error: %s"
                    % str(ex))

        if Platform.is_nomad():
            self.nomadutil = NomadUtil()
        elif Platform.is_ecs_instance():
            self.ecsutil = ECSUtil()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)
Example #5
    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)
        self.kubeutil = None
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                log.error(
                    "Couldn't instantiate the kubernetes client, "
                    "subsequent kubernetes calls will fail as well. Error: %s"
                    % str(ex))

        self.metadata_collector = MetadataCollector()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'pid': self._get_container_pid,
            'port': self._get_port,
            'container-name': self._get_container_name,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)
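VAR_MAPPING is the table service discovery uses to resolve template variables (host, port, ...) to per-container values. A minimal sketch of the lookup step; the argument the mapped getters take is an assumption:

    # Hypothetical resolution of one variable for a discovered container.
    getter = self.VAR_MAPPING['host']
    host = getter(container_inspect)  # getter signature is an assumption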
Example #6
    def init(self):
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()

            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            if Platform.is_k8s():
                self.kubeutil = KubeUtil()

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
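A sketch of an instance dict toggling the boolean options this init parses; _is_affirmative is assumed to accept affirmative strings such as 'yes' or 'true' as well as booleans:

    # Illustrative values; every key below is read in init() above.
    instance = {
        'tags': ['env:dev'],
        'use_histogram': False,
        'collect_events': 'yes',        # assumed affirmative string
        'collect_container_size': True,
        'collect_disk_stats': False,
    }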
Example #7
    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception(
                'Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.host:
            raise Exception(
                'Unable to retrieve Docker hostname and host parameter is not set'
            )
Example #8
    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(init_config=init_config, instance=inst, use_kubelet=False)

        if not self.kubeutil.init_success:
            if self.kubeutil.left_init_retries > 0:
                self.log.warning("Kubelet client failed to initialized for now, pausing the Kubernetes check.")
            else:
                raise Exception('Unable to initialize Kubelet client. Try setting the host parameter. The Kubernetes check failed permanently.')
Example #9
    def check(self, instance):
        url = instance.get('url', self.DEFAULT_KUBERNETES_URL)
        instance_key = {'type': self.INSTANCE_TYPE, 'url': url}
        msg = None
        status = None

        kubeutil = KubeUtil(instance=instance)
        if not kubeutil.host:
            raise Exception(
                'Unable to retrieve Docker hostname and host parameter is not set'
            )

        self.start_snapshot(instance_key)
        try:
            self._extract_topology(kubeutil, instance_key)
        except requests.exceptions.Timeout:
            msg = "%s seconds timeout when hitting %s" % (
                kubeutil.timeoutSeconds, url)
            status = AgentCheck.CRITICAL
        except Exception as e:
            msg = str(e)
            status = AgentCheck.CRITICAL
        finally:
            if status is AgentCheck.CRITICAL:
                self.service_check(self.SERVICE_CHECK_NAME,
                                   status,
                                   message=msg)

        self.stop_snapshot(instance_key)
Example #10
    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)
        self.docker_client = self.dockerutil.client
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                self.kubeutil = None
                log.error("Couldn't instantiate the kubernetes client, "
                    "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

        if Platform.is_nomad():
            self.nomadutil = NomadUtil()
        elif Platform.is_ecs_instance():
            self.ecsutil = ECSUtil()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)
Example #11
    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception(
                'Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.kubelet_api_url:
            raise Exception(
                'Unable to reach kubelet. Try setting the host parameter.')

        if agentConfig.get('service_discovery') and \
           agentConfig.get('service_discovery_backend') == 'docker':
            self._sd_backend = get_sd_backend(agentConfig)
        else:
            self._sd_backend = None

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning(
                        'Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s'
                        % str(e))

            self._collect_events = _is_affirmative(
                inst.get('collect_events', DEFAULT_COLLECT_EVENTS))
            if self._collect_events:
                self.event_retriever = self.kubeutil.get_event_retriever()
            elif self.kubeutil.collect_service_tag:
                # Only fetch service and pod events for service mapping
                event_delay = inst.get('service_tag_update_freq',
                                       DEFAULT_SERVICE_EVENT_FREQ)
                self.event_retriever = self.kubeutil.get_event_retriever(
                    kinds=['Service', 'Pod'], delay=event_delay)
            else:
                self.event_retriever = None
        else:
            self._collect_events = None
            self.event_retriever = None
Example #12
class KubeTestCase(unittest.TestCase):
    # Patch _locate_kubelet, which is used by KubeUtil.__init__
    def setUp(self):
        with patch.object(KubeUtil, '_locate_kubelet', return_value='http://localhost:10255'):
            self.kube = KubeUtil()
            self.kube.__init__()    # It's a singleton, force re-init

    def tearDown(self):
        self.kube = None

    @classmethod
    def _load_json_array(cls, names):
        json_array = []
        for filename in names:
            path = os.path.join(os.path.dirname(__file__), 'fixtures', 'kubeutil', filename)
            with open(path) as data_file:
                json_array.append(json.load(data_file))
        return json_array
Example #13
    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.kubelet_api_url:
            raise Exception('Unable to reach kubelet. Try setting the host parameter.')

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning('Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s' % str(e))
Example #14
    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.host:
            raise Exception('Unable to retrieve Docker hostname and host parameter is not set')
Example #15
    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.docker_client = DockerUtil(config_store=self.config_store).client
        if Platform.is_k8s():
            self.kubeutil = KubeUtil()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)
Example #16
    def test_extract_event_tags(self):
        events = json.loads(Fixtures.read_file("events.json", string_escape=False))['items']
        for ev in events:
            tags = KubeUtil().extract_event_tags(ev)
            # there should be 4 tags except for some events where source.host is missing
            self.assertTrue(len(tags) >= 3)

            tag_names = [tag.split(':')[0] for tag in tags]
            self.assertIn('reason', tag_names)
            self.assertIn('namespace', tag_names)
            self.assertIn('object_type', tag_names)
            if len(tags) == 4:
                self.assertIn('node_name', tag_names)
Example #17
    def test_namespaced_events(self, *args):
        # reset last event pulling time
        KubeUtil().last_event_collection_ts = 0

        # Verify that we are backward compatible with the old 'namespace' configuration key
        config = {'instances': [{'host': 'bar', 'collect_events': True, 'namespace': 'test-namespace-1'}]}
        self.run_check(config, force_reload=True)
        self.assertEvent('dd-agent-a769 SuccessfulDelete on Bar', count=1, exact_match=False)
        self.assertEvent('hello-node-47289321-91tfd Scheduled on Bar', count=1, exact_match=False)

        # reset last event pulling time
        KubeUtil().last_event_collection_ts = 0

        # Using 'namespaces' list
        config = {'instances': [{'host': 'bar', 'collect_events': True, 'namespaces': ['test-namespace-1', 'test-namespace-2']}]}
        self.run_check(config, force_reload=True)
        self.assertEvent('dd-agent-a769 SuccessfulDelete on Bar', count=1, exact_match=False)
        self.assertEvent('hello-node-47289321-91tfd Scheduled on Bar', count=0, exact_match=False)

        # reset last event pulling time
        KubeUtil().last_event_collection_ts = 0

        # Using 'namespace_name_regexp' (since 'namespaces' is not set, it should
        # fall back to ['default'] and add any namespaces matching the regexp)
        config = {'instances': [{'host': 'bar', 'collect_events': True, 'namespace_name_regexp': 'test-namespace.*'}]}
        self.run_check(config, force_reload=True)
        self.assertEvent('dd-agent-a769 SuccessfulDelete on Bar', count=1, exact_match=False)
        self.assertEvent('hello-node-47289321-91tfd Scheduled on Bar', count=1, exact_match=False)

        # reset last event pulling time
        KubeUtil().last_event_collection_ts = 0

        # Muting the 'default' namespace by passing an empty 'namespaces' list
        config = {'instances': [{'host': 'bar', 'collect_events': True, 'namespaces': [], 'namespace_name_regexp': 'test-namespace.*'}]}
        self.run_check(config, force_reload=True)
        self.assertEvent('dd-agent-a769 SuccessfulDelete on Bar', count=1, exact_match=False)
        self.assertEvent('hello-node-47289321-91tfd Scheduled on Bar', count=0, exact_match=False)
Example #18
    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.host:
            raise Exception('Unable to retrieve Docker hostname and host parameter is not set')

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning('Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s' % str(e))
Example #19
    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.docker_client = DockerUtil(config_store=self.config_store).client
        if Platform.is_k8s():
            self.kubeutil = KubeUtil()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)
Example #20
    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.kubelet_api_url:
            raise Exception('Unable to reach kubelet. Try setting the host parameter.')

        if agentConfig.get('service_discovery') and \
           agentConfig.get('service_discovery_backend') == 'docker':
            self._sd_backend = get_sd_backend(agentConfig)
        else:
            self._sd_backend = None

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning('Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s' % str(e))

            self._collect_events = _is_affirmative(inst.get('collect_events', DEFAULT_COLLECT_EVENTS))
            if self._collect_events:
                self.event_retriever = self.kubeutil.get_event_retriever()
            elif self.kubeutil.collect_service_tag:
                # Only fetch service and pod events for service mapping
                event_delay = inst.get('service_tag_update_freq', DEFAULT_SERVICE_EVENT_FREQ)
                self.event_retriever = self.kubeutil.get_event_retriever(kinds=['Service', 'Pod'],
                                                                         delay=event_delay)
            else:
                self.event_retriever = None
        else:
            self._collect_events = None
            self.event_retriever = None
Example #21
    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)
        self.kubeutil = None
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                log.error("Couldn't instantiate the kubernetes client, "
                          "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

        self.metadata_collector = MetadataCollector()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'pid': self._get_container_pid,
            'port': self._get_port,
            'container-name': self._get_container_name,
            'tags': self._get_additional_tags,
        }

        # docker labels we'll add as tags to all instances SD configures
        self.docker_labels_as_tags = agentConfig.get('docker_labels_as_tags', '')
        if self.docker_labels_as_tags:
            self.docker_labels_as_tags = [label.strip() for label in self.docker_labels_as_tags.split(',')]
        else:
            self.docker_labels_as_tags = []

        AbstractSDBackend.__init__(self, agentConfig)
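A quick illustration of the docker_labels_as_tags parsing above:

    # agentConfig = {'docker_labels_as_tags': 'com.example.team, env'}
    # -> self.docker_labels_as_tags == ['com.example.team', 'env']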
Example #22
    def init(self):
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()

            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            self.metadata_collector = MetadataCollector()

            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as ex:
                    self.kubeutil = None
                    self.log.error("Couldn't instantiate the kubernetes client, "
                        "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", DEFAULT_LABELS_AS_TAGS)
            self.kube_pod_tags = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # Container network mapping cache
            self.network_mappings = {}

            # get the health check whitelist
            self.whitelist_patterns = None
            health_scs_whitelist = instance.get('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)


            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_container_count = _is_affirmative(instance.get('collect_container_count', False))
            self.collect_volume_count = _is_affirmative(instance.get('collect_volume_count', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.event_attributes_as_tags = instance.get('event_attributes_as_tags', [])
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_exit_codes = _is_affirmative(instance.get('collect_exit_codes', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.capped_metrics = instance.get('capped_metrics')

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
Example #23
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception("Docker check only supports one configured instance.")
        AgentCheck.__init__(self, name, init_config,
                            agentConfig, instances=instances)
        self.init_success = False
        self._service_discovery = agentConfig.get('service_discovery') and \
            agentConfig.get('service_discovery_backend') == 'docker'

        global_labels_as_tags = agentConfig.get('docker_labels_as_tags')
        if global_labels_as_tags:
            self.collect_labels_as_tags = [label.strip() for label in global_labels_as_tags.split(',')]
        else:
            self.collect_labels_as_tags = DEFAULT_LABELS_AS_TAGS
        self.init()

    def init(self):
        try:
            instance = self.instances[0]

            # Getting custom tags for service checks when docker is down
            self.custom_tags = instance.get("tags", [])

            self.docker_util = DockerUtil()
            if not self.docker_util.client:
                raise Exception("Failed to initialize Docker client.")

            self.docker_gateway = DockerUtil.get_gateway()
            self.metadata_collector = MetadataCollector()

            self.kubeutil = None
            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as ex:
                    self.log.error("Couldn't instantiate the kubernetes client, "
                                   "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            # collect_labels_as_tags is legacy and only tags docker metrics.
            # It is replaced by docker_labels_as_tags in config.cfg.
            # We keep this line for backward compatibility.
            if "collect_labels_as_tags" in instance:
                self.collect_labels_as_tags = instance.get("collect_labels_as_tags")

            self.kube_pod_tags = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # Container network mapping cache
            self.network_mappings = {}

            # get the health check whitelist
            self.whitelist_patterns = None
            health_scs_whitelist = instance.get('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_container_count = _is_affirmative(instance.get('collect_container_count', False))
            self.collect_volume_count = _is_affirmative(instance.get('collect_volume_count', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.event_attributes_as_tags = instance.get('event_attributes_as_tags', [])
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_exit_codes = _is_affirmative(instance.get('collect_exit_codes', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.filtered_event_types = tuple(instance.get("filtered_event_types", DEFAULT_FILTERED_EVENT_TYPES))

            self.capped_metrics = instance.get('capped_metrics')

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True

    def check(self, instance):
        """Run the Docker check for one instance."""
        if not self.init_success:
            # Initialization can fail if cgroups are not ready or docker daemon is down. So we retry if needed
            self.init()

            try:
                if self.docker_util.client is None:
                    message = "Unable to connect to Docker daemon"
                    self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                       message=message, tags=self.custom_tags)
                    return
            except Exception as ex:
                self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                                   message=str(ex), tags=self.custom_tags)
                return

            if not self.init_success:
                # Initialization failed, will try later
                return

        try:
            # Report image metrics
            if self.collect_image_stats:
                self._count_and_weigh_images()

            if Platform.is_k8s():
                self.kube_pod_tags = {}
                if self.kubeutil:
                    try:
                        self.kube_pod_tags = self.kubeutil.get_kube_pod_tags()
                    except Exception as e:
                        self.log.warning('Could not retrieve kubernetes labels: %s' % str(e))

            # containers running with custom cgroups?
            custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False))

            # Get the list of containers and the index of their names
            health_service_checks = bool(self.whitelist_patterns)
            containers_by_id = self._get_and_count_containers(custom_cgroups, health_service_checks)
            containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups)

            # Send events from Docker API
            if self.collect_events or self._service_discovery or not self._disable_net_metrics or self.collect_exit_codes:
                self._process_events(containers_by_id)

            # Report performance container metrics (cpu, mem, net, io)
            self._report_performance_metrics(containers_by_id)

            if self.collect_container_size:
                self._report_container_size(containers_by_id)

            if self.collect_container_count:
                self._report_container_count(containers_by_id)

            if self.collect_volume_count:
                self._report_volume_count()

            # Collect disk stats from Docker info command
            if self.collect_disk_stats:
                self._report_disk_stats()

            if health_service_checks:
                self._send_container_healthcheck_sc(containers_by_id)
        except:
            self.log.exception("Docker_daemon check failed")
            self.warning("Check failed. Will retry at next iteration")

        if self.capped_metrics:
            self.filter_capped_metrics()

    def _count_and_weigh_images(self):
        try:
            tags = self._get_tags()
            active_images = self.docker_util.client.images(all=False)
            active_images_len = len(active_images)
            all_images_len = len(self.docker_util.client.images(quiet=True, all=True))
            self.gauge("docker.images.available", active_images_len, tags=tags)
            self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags)

            if self.collect_image_size:
                self._report_image_size(active_images)

        except Exception as e:
            # It's not an important metric, keep going if it fails
            self.warning("Failed to count Docker images. Exception: {0}".format(e))

    def _get_and_count_containers(self, custom_cgroups=False, healthchecks=False):
        """List all the containers from the API, filter and count them."""

        # Querying the size of containers is slow, we don't do it at each run
        must_query_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE
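        # Net effect: sizes are queried when the counter is at 0, i.e. on the
        # first run and then once every SIZE_REFRESH_RATE runs.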

        running_containers_count = Counter()
        all_containers_count = Counter()

        try:
            containers = self.docker_util.client.containers(all=True, size=must_query_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               message=message, tags=self.custom_tags)
            raise Exception(message)

        else:
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK, tags=self.custom_tags)

        # Create a set of filtered containers based on the exclude/include rules
        # and cache these rules in docker_util
        self._filter_containers(containers)

        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            container_status_tags = self._get_tags(container, CONTAINER)

            all_containers_count[tuple(sorted(container_status_tags))] += 1
            if self._is_container_running(container):
                running_containers_count[tuple(sorted(container_status_tags))] += 1

            # Check if the container is included/excluded via its tags
            if self._is_container_excluded(container):
                self.log.debug("Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            # Grab the pid via the API if custom cgroups are used - otherwise we
            # won't find the process when crawling for pids.
            if custom_cgroups or healthchecks:
                try:
                    inspect_dict = self.docker_util.client.inspect_container(container_name)
                    container['_pid'] = inspect_dict['State']['Pid']
                    container['health'] = inspect_dict['State'].get('Health', {})
                except Exception as e:
                    self.log.debug("Unable to inspect Docker container: %s", e)

        total_count = 0
        # TODO: deprecate these 2, they should be replaced by _report_container_count
        for tags, count in running_containers_count.iteritems():
            total_count += count
            self.gauge("docker.containers.running", count, tags=list(tags))
        self.gauge("docker.containers.running.total", total_count, tags=self.custom_tags)

        total_count = 0
        for tags, count in all_containers_count.iteritems():
            stopped_count = count - running_containers_count[tags]
            total_count += stopped_count
            self.gauge("docker.containers.stopped", stopped_count, tags=list(tags))
        self.gauge("docker.containers.stopped.total", total_count, tags=self.custom_tags)

        return containers_by_id

    def _is_container_running(self, container):
        """Tell if a container is running, according to its status.

        There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated.
        See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35
        """
        return container["Status"].startswith("Up") or container["Status"].startswith("Restarting")

    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names."""
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)
            self.collect_labels_as_tags.append(KubeUtil.CONTAINER_NAME_LABEL)

        # Collect container names as tags on rancher
        if Platform.is_rancher():
            if RANCHER_CONTAINER_NAME not in self.collect_labels_as_tags:
                self.collect_labels_as_tags.append(RANCHER_CONTAINER_NAME)
            if RANCHER_SVC_NAME not in self.collect_labels_as_tags:
                self.collect_labels_as_tags.append(RANCHER_SVC_NAME)
            if RANCHER_STACK_NAME not in self.collect_labels_as_tags:
                self.collect_labels_as_tags.append(RANCHER_STACK_NAME)

        if entity is not None:
            pod_name = None
            namespace = None
            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k in labels:
                        v = labels[k]
                        if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                            pod_name = v
                            k = "pod_name"
                            if "-" in pod_name:
                                replication_controller = "-".join(pod_name.split("-")[:-1])
                                if "/" in replication_controller:  # k8s <= 1.1
                                    namespace, replication_controller = replication_controller.split("/", 1)

                                elif KubeUtil.NAMESPACE_LABEL in labels:  # k8s >= 1.2
                                    namespace = labels[KubeUtil.NAMESPACE_LABEL]

                                tags.append("kube_namespace:%s" % namespace)
                                tags.append("kube_replication_controller:%s" % replication_controller)
                                tags.append("pod_name:%s" % pod_name)

                        elif k == KubeUtil.CONTAINER_NAME_LABEL and Platform.is_k8s():
                            if v:
                                tags.append("kube_container_name:%s" % v)
                        elif k == SWARM_SVC_LABEL and Platform.is_swarm():
                            if v:
                                tags.append("swarm_service:%s" % v)
                        elif k == RANCHER_CONTAINER_NAME and Platform.is_rancher():
                            if v:
                                tags.append('rancher_container:%s' % v)
                        elif k == RANCHER_SVC_NAME and Platform.is_rancher():
                            if v:
                                tags.append('rancher_service:%s' % v)
                        elif k == RANCHER_STACK_NAME and Platform.is_rancher():
                            if v:
                                tags.append('rancher_stack:%s' % v)

                        elif not v:
                            tags.append(k)

                        else:
                            tags.append("%s:%s" % (k, v))

                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels:
                        tags.append("pod_name:no_pod")

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add kube labels and creator/service tags
            if Platform.is_k8s() and namespace and pod_name:
                kube_tags = self.kube_pod_tags.get("{0}/{1}".format(namespace, pod_name))
                if kube_tags:
                    tags.extend(list(kube_tags))

            if self.metadata_collector.has_detected():
                orch_tags = self.metadata_collector.get_container_tags(co=entity)
                tags.extend(orch_tags)

        return tags

    def _extract_tag_value(self, entity, tag_name):
        """Extra tag information from the API result (containers or images).
        Cache extracted tags inside the entity object.
        """
        if tag_name not in TAG_EXTRACTORS:
            self.warning("{0} isn't a supported tag".format(tag_name))
            return

        # Check for already extracted tags
        if "_tag_values" not in entity:
            entity["_tag_values"] = {}

        if tag_name not in entity["_tag_values"]:
            entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity)

        return entity["_tag_values"][tag_name]

    def _filter_containers(self, containers):
        if not self.docker_util.filtering_enabled:
            return

        self._filtered_containers = set()
        for container in containers:
            container_tags = self._get_tags(container, FILTERED)
            # exclude/include patterns are stored in docker_util to share them with other container-related checks
            if self.docker_util.are_tags_filtered(container_tags):
                container_name = DockerUtil.container_name_extractor(container)[0]
                self._filtered_containers.add(container_name)
                self.log.debug("Container {0} is filtered".format(container_name))

    def _is_container_excluded(self, container):
        """Check if a container is excluded according to the filter rules.

        Requires _filter_containers to run first.
        """
        container_name = DockerUtil.container_name_extractor(container)[0]
        return container_name in self._filtered_containers

    def _report_container_size(self, containers_by_id):
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            m_func = FUNC_MAP[GAUGE][self.use_histogram]
            if "SizeRw" in container:
                m_func(self, 'docker.container.size_rw', container['SizeRw'],
                       tags=tags)
            if "SizeRootFs" in container:
                m_func(
                    self, 'docker.container.size_rootfs', container['SizeRootFs'],
                    tags=tags)

    def _send_container_healthcheck_sc(self, containers_by_id):
        """Send health service checks for containers."""
        for container in containers_by_id.itervalues():
            healthcheck_tags = self._get_tags(container, HEALTHCHECK)
            match = False
            for tag in healthcheck_tags:
                for rule in self.whitelist_patterns:
                    if re.match(rule, tag):
                        match = True

                        self._submit_healthcheck_sc(container)
                        break

                if match:
                    break

    def _submit_healthcheck_sc(self, container):
        health = container.get('health', {})
        status = AgentCheck.UNKNOWN
        if health:
            _health = health.get('Status', '')
            if _health == 'unhealthy':
                status = AgentCheck.CRITICAL
            elif _health == 'healthy':
                status = AgentCheck.OK
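        # Docker's Health.Status values are "starting", "healthy" and
        # "unhealthy"; "starting" (or no health data) stays UNKNOWN here.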

        tags = self._get_tags(container, CONTAINER)
        self.service_check(HEALTHCHECK_SERVICE_CHECK_NAME, status, tags=tags)

    def _report_container_count(self, containers_by_id):
        """Report container count per state"""
        m_func = FUNC_MAP[GAUGE][self.use_histogram]

        per_state_count = defaultdict(int)

        containers = [ctr for ctr in containers_by_id.values()
                      if not self._is_container_excluded(ctr)]

        for ctr in containers:
            per_state_count[ctr.get('State', '')] += 1

        for state in per_state_count:
            if state:
                m_func(self, 'docker.container.count', per_state_count[state], tags=['container_state:%s' % state.lower()])

    def _report_volume_count(self):
        """Report volume count per state (dangling or not)"""
        m_func = FUNC_MAP[GAUGE][self.use_histogram]

        attached_volumes = self.docker_util.client.volumes(filters={'dangling': False})
        dangling_volumes = self.docker_util.client.volumes(filters={'dangling': True})
        attached_count = len(attached_volumes.get('Volumes', []) or [])
        dangling_count = len(dangling_volumes.get('Volumes', []) or [])
        m_func(self, 'docker.volume.count', attached_count, tags=['volume_state:attached'])
        m_func(self, 'docker.volume.count', dangling_count, tags=['volume_state:dangling'])

    def _report_image_size(self, images):
        for image in images:
            tags = self._get_tags(image, IMAGE)
            if 'VirtualSize' in image:
                self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags)
            if 'Size' in image:
                self.gauge('docker.image.size', image['Size'], tags=tags)

    # Performance metrics

    def _report_performance_metrics(self, containers_by_id):

        containers_without_proc_root = []
        for container_id, container in containers_by_id.iteritems():
            if self._is_container_excluded(container) or not self._is_container_running(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)

            try:
                self._report_cgroup_metrics(container, tags)
                if "_proc_root" not in container:
                    containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0])
                    continue
                self._report_net_metrics(container, tags)
            except BogusPIDException as e:
                self.log.warning('Unable to report cgroup metrics for container %s: %s', container_id[:12], e)

        if containers_without_proc_root:
            message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
                ", ".join(containers_without_proc_root))
            if not Platform.is_k8s():
                self.warning(message)
            else:
                # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
                self.log.debug(message)

    def _report_cgroup_metrics(self, container, tags):
        cgroup_stat_file_failures = 0
        if not container.get('_pid'):
            raise BogusPIDException('Cannot report on bogus pid(0)')

        for cgroup in CGROUP_METRICS:
            try:
                stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file'])
            except MountException as e:
                # We can't find a stat file
                self.warning(str(e))
                cgroup_stat_file_failures += 1
                if cgroup_stat_file_failures >= len(CGROUP_METRICS):
                    self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now.")
            except IOError as e:
                self.log.debug("Cannot read cgroup file, container likely raced to finish : %s", e)
            else:
                stats = self._parse_cgroup_file(stat_file)
                if stats:
                    for key, (dd_key, metric_func) in cgroup['metrics'].iteritems():
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if key in stats:
                            metric_func(self, dd_key, int(stats[key]), tags=tags)

                    # Computed metrics
                    for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems():
                        values = [stats[key] for key in key_list if key in stats]
                        if len(values) != len(key_list):
                            self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname))
                            continue
                        value = fct(*values)
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if value is not None:
                            metric_func(self, mname, value, tags=tags)

    def _report_net_metrics(self, container, tags):
        """Find container network metrics by looking at /proc/$PID/net/dev of the container process."""
        if self._disable_net_metrics:
            self.log.debug("Network metrics are disabled. Skipping")
            return

        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')

        try:
            if container['Id'] in self.network_mappings:
                networks = self.network_mappings[container['Id']]
            else:
                networks = self.docker_util.get_container_network_mapping(container)
                if not networks:
                    networks = {'eth0': 'bridge'}
                self.network_mappings[container['Id']] = networks
        except Exception as e:
            # Revert to previous behaviour if the method is missing or failing
            # Debug message will only appear once per container, then the cache is used
            self.log.debug("Failed to build docker network mapping, using failsafe. Exception: {0}".format(e))
            networks = {'eth0': 'bridge'}
            self.network_mappings[container['Id']] = networks

        try:
            with open(proc_net_file, 'r') as fp:
                lines = fp.readlines()
                """Two first lines are headers:
                Inter-|   Receive                                                |  Transmit
                 face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
                """
                for l in lines[2:]:
                    cols = l.split(':', 1)
                    interface_name = str(cols[0]).strip()
                    if interface_name in networks:
                        net_tags = tags + ['docker_network:'+networks[interface_name]]
                        x = cols[1].split()
                        m_func = FUNC_MAP[RATE][self.use_histogram]
                        m_func(self, "docker.net.bytes_rcvd", long(x[0]), net_tags)
                        m_func(self, "docker.net.bytes_sent", long(x[8]), net_tags)

        except Exception as e:
            # It is possible that the container got stopped between the API call and now
            self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e))

    def _invalidate_network_mapping_cache(self, api_events):
        for ev in api_events:
            try:
                if ev.get('Type') == 'network' and ev.get('Action').endswith('connect'):
                    container_id = ev.get('Actor').get('Attributes').get('container')
                    if container_id in self.network_mappings:
                        self.log.debug("Removing network mapping cache for container %s" % container_id)
                        del self.network_mappings[container_id]
            except Exception:
                self.log.warning('Malformed network event: %s' % str(ev))

    def _process_events(self, containers_by_id):
        api_events = self._get_events()

        if self.collect_exit_codes:
            self._report_exit_codes(api_events, containers_by_id)

        if self.collect_events:
            try:
                aggregated_events = self._pre_aggregate_events(api_events, containers_by_id)
                events = self._format_events(aggregated_events, containers_by_id)
            except (socket.timeout, urllib2.URLError):
                self.warning('Timeout when collecting events. Events will be missing.')
                return
            except Exception as e:
                self.warning("Unexpected exception when collecting events: {0}. "
                             "Events will be missing".format(e))
                return

            for ev in events:
                self.log.debug("Creating event: %s" % ev['msg_title'])
                self.event(ev)

    def _get_events(self):
        """Get the list of events."""
        events, changed_container_ids = self.docker_util.get_events()
        if not self._disable_net_metrics:
            self._invalidate_network_mapping_cache(events)
        if changed_container_ids and self._service_discovery:
            get_sd_backend(self.agentConfig).update_checks(changed_container_ids)
        if changed_container_ids:
            self.metadata_collector.invalidate_cache(events)
        return events

    def _pre_aggregate_events(self, api_events, containers_by_id):
        # Aggregate events, one per image. Put newer events first.
        events = defaultdict(deque)
        for event in api_events:
            # Skip events related to filtered containers
            container = containers_by_id.get(event.get('id'))
            if container is not None and self._is_container_excluded(container):
                self.log.debug("Excluded event: container {0} status changed to {1}".format(
                    event['id'], event['status']))
                continue
            # from may be missing (for network events for example)
            if 'from' in event:
                image_name = event['from']
                if image_name.startswith('sha256:'):
                    image_name = self.docker_util.image_name_extractor({'Image': image_name})
                events[image_name].appendleft(event)
        return events
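
    # Illustrative sketch (hypothetical helper with fabricated events): the
    # aggregation above keyed by image, where appendleft() puts the newest
    # event first because the docker API returns events oldest-first.
    @staticmethod
    def _demo_pre_aggregate():
        from collections import defaultdict, deque
        api_events = [
            {'from': 'redis', 'status': 'start', 'id': 'a' * 64},
            {'from': 'redis', 'status': 'die', 'id': 'a' * 64},
        ]
        events = defaultdict(deque)
        for event in api_events:
            events[event['from']].appendleft(event)
        return events  # {'redis': deque([<die event>, <start event>])}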

    def _format_events(self, aggregated_events, containers_by_id):
        events = []
        for image_name, event_group in aggregated_events.iteritems():
            container_tags = set()
            filtered_events_count = 0
            normal_prio_events = []

            for event in event_group:
                # Only keep events that are not configured to be filtered out
                if event['status'].startswith(self.filtered_event_types):
                    filtered_events_count += 1
                    continue
                container_name = event['id'][:11]

                if event['id'] in containers_by_id:
                    cont = containers_by_id[event['id']]
                    container_name = DockerUtil.container_name_extractor(cont)[0]
                    container_tags.update(self._get_tags(cont, PERFORMANCE))
                    container_tags.add('container_name:%s' % container_name)
                    # Add additional docker event attributes as tags
                    for attr in self.event_attributes_as_tags:
                        if attr in event['Actor']['Attributes'] and attr not in EXCLUDED_ATTRIBUTES:
                            container_tags.add('%s:%s' % (attr, event['Actor']['Attributes'][attr]))

                normal_prio_events.append((event, container_name))
            if filtered_events_count:
                self.log.debug('%d events were filtered out because of ignored event type' % filtered_events_count)

            normal_event = self._create_dd_event(normal_prio_events, image_name, container_tags, priority='Normal')
            if normal_event:
                events.append(normal_event)

        return events

    def _report_exit_codes(self, events, containers_by_id):
        for event in events:
            container_tags = set()
            container = containers_by_id.get(event.get('id'))
            # Skip events related to filtered containers
            if container is not None and self._is_container_excluded(container):
                continue

            # Report the exit code in case of a DIE event
            if container is not None and event['status'] == 'die':
                container_name = DockerUtil.container_name_extractor(container)[0]
                container_tags.update(self._get_tags(container, CONTAINER))
                container_tags.add('container_name:%s' % container_name)
                try:
                    exit_code = int(event['Actor']['Attributes']['exitCode'])
                    message = 'Container %s exited with %s' % (container_name, exit_code)
                    status = AgentCheck.OK if exit_code == 0 else AgentCheck.CRITICAL
                    self.service_check(EXIT_SERVICE_CHECK_NAME, status, tags=list(container_tags), message=message)
                except KeyError:
                    self.log.warning('Unable to collect the exit code for container %s' % container_name)

    def _create_dd_event(self, events, image, c_tags, priority='Normal'):
        """Create the actual event to submit from a list of similar docker events"""
        if not events:
            return

        max_timestamp = 0
        status = defaultdict(int)
        status_change = []

        for ev, c_name in events:
            max_timestamp = max(max_timestamp, int(ev['time']))
            status[ev['status']] += 1
            status_change.append([c_name, ev['status']])

        status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()])
        msg_title = "%s %s on %s" % (image, status_text, self.hostname)
        msg_body = (
            "%%%\n"
            "{image_name} {status} on {hostname}\n"
            "```\n{status_changes}\n```\n"
            "%%%"
        ).format(
            image_name=image,
            status=status_text,
            hostname=self.hostname,
            status_changes="\n".join(
                ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change])
        )

        if any(error in status_text for error in ERROR_ALERT_TYPE):
            alert_type = "error"
        else:
            alert_type = None

        return {
            'timestamp': max_timestamp,
            'host': self.hostname,
            'event_type': EVENT_TYPE,
            'msg_title': msg_title,
            'msg_text': msg_body,
            'source_type_name': EVENT_TYPE,
            'event_object': 'docker:%s' % image,
            'tags': list(c_tags),
            'alert_type': alert_type,
            'priority': priority
        }
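
    # Worked example (fabricated events): for one image 'redis' with two 'die'
    # and one 'start' event, the aggregation above yields a status_text of
    # '2 die, 1 start' (dict ordering may vary) and a msg_title such as
    # 'redis 2 die, 1 start on my-host'. alert_type becomes 'error' only if
    # one of the statuses appears in ERROR_ALERT_TYPE.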

    def _report_disk_stats(self):
        """Report metrics about the volume space usage"""
        stats = {
            'docker.data.used': None,
            'docker.data.total': None,
            'docker.data.free': None,
            'docker.metadata.used': None,
            'docker.metadata.total': None,
            'docker.metadata.free': None
            # these two are calculated by _calc_percent_disk_stats
            # 'docker.data.percent': None,
            # 'docker.metadata.percent': None
        }
        info = self.docker_util.client.info()
        driver_status = info.get('DriverStatus', [])
        if not driver_status:
            self.log.warning('Disk metrics collection is enabled but docker info did not'
                             ' report any. Your storage driver might not support them, skipping.')
            return
        for metric in driver_status:
            # only consider metrics about disk space
            if len(metric) == 2 and 'Space' in metric[0]:
                # identify Data and Metadata metrics
                mtype = 'data'
                if 'Metadata' in metric[0]:
                    mtype = 'metadata'

                if 'Used' in metric[0]:
                    stats['docker.{0}.used'.format(mtype)] = metric[1]
                elif 'Space Total' in metric[0]:
                    stats['docker.{0}.total'.format(mtype)] = metric[1]
                elif 'Space Available' in metric[0]:
                    stats['docker.{0}.free'.format(mtype)] = metric[1]
        stats = self._format_disk_metrics(stats)
        stats.update(self._calc_percent_disk_stats(stats))
        tags = self._get_tags()
        for name, val in stats.iteritems():
            if val is not None:
                self.gauge(name, val, tags)

    def _format_disk_metrics(self, metrics):
        """Cast the disk stats to float and convert them to bytes"""
        for name, raw_val in metrics.iteritems():
            if raw_val:
                match = DISK_STATS_RE.search(raw_val)
                if match is None or len(match.groups()) != 2:
                    self.log.warning('Can\'t parse value %s for disk metric %s. Dropping it.' % (raw_val, name))
                    metrics[name] = None
                    continue
                val, unit = match.groups()
                # by default some are uppercased others lowercased. That's error prone.
                unit = unit.lower()
                try:
                    val = int(float(val) * UNIT_MAP[unit])
                    metrics[name] = val
                except KeyError:
                    self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' % (unit, name))
                    metrics[name] = None
        return metrics
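
    # Illustrative sketch (hypothetical helper): the conversion above, assuming
    # DISK_STATS_RE and UNIT_MAP look roughly like the demo values below; the
    # real definitions live at module level next to the check.
    @staticmethod
    def _demo_disk_value_to_bytes(raw_val):
        import re
        demo_re = re.compile(r'([0-9.]+)\s?([a-zA-Z]+)')
        demo_unit_map = {'b': 1, 'kb': 1000, 'mb': 1000 ** 2, 'gb': 1000 ** 3, 'tb': 1000 ** 4}
        val, unit = demo_re.search(raw_val).groups()
        return int(float(val) * demo_unit_map[unit.lower()])  # '4.38 GB' -> 4380000000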

    def _calc_percent_disk_stats(self, stats):
        """Calculate a percentage of used disk space for data and metadata"""
        mtypes = ['data', 'metadata']
        percs = {}
        for mtype in mtypes:
            used = stats.get('docker.{0}.used'.format(mtype))
            total = stats.get('docker.{0}.total'.format(mtype))
            free = stats.get('docker.{0}.free'.format(mtype))
            if used and total and free and ceil(total) < free + used:
                self.log.debug('used, free, and total disk metrics may be wrong, '
                               'used: %s, free: %s, total: %s',
                               used, free, total)
                total = used + free
            try:
                if isinstance(used, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2)
                elif isinstance(free, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2)
            except ZeroDivisionError:
                self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent'
                               ' is not possible.'.format(mtype, mtype))
        return percs
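
    # Worked example (hypothetical numbers): with docker.data.used = 9e9 bytes
    # and docker.data.total = 12e9 bytes, the computation above yields
    # docker.data.percent = round(100 * 9e9 / 12e9, 2) = 75.0.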

    # Cgroups
    def _get_cgroup_from_proc(self, cgroup, pid, filename):
        """Find a specific cgroup file, containing metrics to extract."""
        params = {
            "file": filename,
        }
        return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % (params)

    def _parse_cgroup_file(self, stat_file):
        """Parse a cgroup pseudo file for key/values."""
        self.log.debug("Opening cgroup file: %s" % stat_file)
        try:
            with open(stat_file, 'r') as fp:
                if 'blkio' in stat_file:
                    return self._parse_blkio_metrics(fp.read().splitlines())
                elif 'cpuacct.usage' in stat_file:
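                    # cpuacct.usage is cumulative CPU time in nanoseconds;
                    # dividing by 1e7 converts it to 1/100ths of a second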
                    return dict({'usage': str(int(fp.read())/10000000)})
                elif 'memory.soft_limit_in_bytes' in stat_file:
                    value = int(fp.read())
                    # do not report kernel max default value (uint64 * 4096)
                    # see https://github.com/torvalds/linux/blob/5b36577109be007a6ecf4b65b54cbc9118463c2b/mm/memcontrol.c#L2844-L2845
                    # 2 ** 60 is kept for consistency of other cgroups metrics
                    if value < 2 ** 60:
                        return dict({'softlimit': value})
                else:
                    return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines()))
        except IOError:
            # It is possible that the container got stopped between the API call and now.
            # Some files can also be missing (like cpu.stat) and that's fine.
            self.log.debug("Can't open %s. Its metrics will be missing." % stat_file)

    def _parse_blkio_metrics(self, stats):
        """Parse the blkio metrics."""
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics
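
    # Illustrative run (fabricated device and counters) of the parser above,
    # fed lines in the blkio.throttle.io_service_bytes format:
    #   _parse_blkio_metrics(['8:0 Read 4096', '8:0 Write 8192', '8:0 Total 12288'])
    #   -> {'io_read': 4096, 'io_write': 8192}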

    def _is_container_cgroup(self, line, selinux_policy):
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]:  # general case
            return True
        if 'docker' in selinux_policy:  # selinux
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]):  # kubernetes
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2].split('/')[-1]): # kube 1.6+ qos hierarchy
            return True
        return False
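
    # Worked examples (fabricated cgroup lines, already split on ':'):
    #   ['4', 'cpu,cpuacct', '/docker/3726a1...']  -> True  ('docker' in the path)
    #   ['4', 'cpuacct,cpu', '/docker-daemon']     -> False (daemon cgroup)
    #   ['4', 'memory', '/docker/3726a1...']       -> False (not a cpuacct hierarchy)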

    # proc files
    def _crawl_container_pids(self, container_dict, custom_cgroups=False):
        """Crawl `/proc` to find container PIDs and add them to `containers_by_id`."""
        proc_path = os.path.join(self.docker_util._docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]

        if len(pid_dirs) == 0:
            self.warning("Unable to find any pid directory in {0}. "
                         "If you are running the agent in a container, make sure to "
                         'share the volume properly: "/proc:/host/proc:ro". '
                         "See https://github.com/serverdensity/docker-sd-agent/blob/master/README.md for more information. "
                         "Network metrics will be missing".format(proc_path))
            self._disable_net_metrics = True
            return container_dict

        self._disable_net_metrics = False

        for folder in pid_dirs:
            try:
                path = os.path.join(proc_path, folder, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]

                selinux_policy = ''
                path = os.path.join(proc_path, folder, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError as e:
                #  Issue #2074
                self.log.debug("Cannot read %s, process likely raced to finish : %s", path, e)
            except Exception as e:
                self.warning("Cannot read %s : %s" % (path, str(e)))
                continue

            try:
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
                else:
                    continue

                matches = re.findall(CONTAINER_ID_RE, cpuacct)
                if matches:
                    container_id = matches[-1]
                    if container_id not in container_dict:
                        self.log.debug(
                            "Container %s not in container_dict, it's likely excluded", container_id
                        )
                        continue
                    container_dict[container_id]['_pid'] = folder
                    container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder)
                elif custom_cgroups:  # if we match by pid that should be enough (?) - O(n) ugh!
                    for _, container in container_dict.iteritems():
                        if container.get('_pid') == int(folder):
                            container['_proc_root'] = os.path.join(proc_path, folder)
                            break

            except Exception as e:
                self.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue
Example #24
    def init(self):
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()

            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as ex:
                    self.kubeutil = None
                    self.log.error(
                        "Couldn't instantiate the kubernetes client, "
                        "subsequent kubernetes calls will fail as well. Error: %s"
                        % str(ex))

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(
                CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get(
                "collect_labels_as_tags", DEFAULT_LABELS_AS_TAGS)
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(
                instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags",
                                            DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags",
                                        DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # Container network mapping cache
            self.network_mappings = {}

            # get the health check whitelist
            self.whitelist_patterns = None
            health_scs_whitelist = instance.get(
                'health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(
                    health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # Other options
            self.collect_image_stats = _is_affirmative(
                instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(
                instance.get('collect_container_size', False))
            self.collect_container_count = _is_affirmative(
                instance.get('collect_container_count', False))
            self.collect_volume_count = _is_affirmative(
                instance.get('collect_volume_count', False))
            self.collect_events = _is_affirmative(
                instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(
                instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(
                instance.get('collect_disk_stats', False))
            self.collect_exit_codes = _is_affirmative(
                instance.get('collect_exit_codes', False))
            self.collect_ecs_tags = _is_affirmative(
                instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}
            self.ecs_agent_local = None

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
Example #25
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""
    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)
        self.kubeutil = None
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                log.error(
                    "Couldn't instantiate the kubernetes client, "
                    "subsequent kubernetes calls will fail as well. Error: %s"
                    % str(ex))

        self.metadata_collector = MetadataCollector()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'pid': self._get_container_pid,
            'port': self._get_port,
            'container-name': self._get_container_name,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)

    def _make_fetch_state(self):
        pod_list = []
        if Platform.is_k8s():
            if not self.kubeutil or not self.kubeutil.init_success:
                log.error(
                    "kubelet client not initialized, cannot retrieve pod list."
                )
            else:
                try:
                    pod_list = self.kubeutil.retrieve_pods_list().get(
                        'items', [])
                except Exception as ex:
                    log.warning("Failed to retrieve pod list: %s" % str(ex))
        return _SDDockerBackendConfigFetchState(
            self.dockerutil.client.inspect_container, pod_list)

    def update_checks(self, changed_containers):
        """
        Takes a list of container IDs that changed recently
        and marks their corresponding checks as
        """
        if not self.dockerutil.client:
            log.warning(
                "Docker client is not initialized, pausing auto discovery.")
            return

        state = self._make_fetch_state()

        conf_reload_set = set()
        for c_id in changed_containers:
            checks = self._get_checks_to_refresh(state, c_id)
            if checks:
                conf_reload_set.update(set(checks))

        if conf_reload_set:
            self.reload_check_configs = conf_reload_set

    def _get_checks_to_refresh(self, state, c_id):
        """Get the list of checks applied to a container from the identifier_to_checks cache in the config store.
        Use the STACKSTATE_ID label or the image."""
        inspect = state.inspect_container(c_id)

        # If the container was removed we can't tell which check is concerned
        # so we have to reload everything.
        # Same thing if it's stopped and we're on Kubernetes in auto_conf mode
        # because the pod was deleted and its template could have been in the annotations.
        if not inspect or \
                (not inspect.get('State', {}).get('Running')
                    and Platform.is_k8s() and not self.agentConfig.get('sd_config_backend')):
            self.reload_check_configs = True
            return

        labels = inspect.get('Config', {}).get('Labels', {})
        identifier = labels.get(STACKSTATE_ID) or \
            self.dockerutil.image_name_extractor(inspect)

        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_annotations': kube_metadata.get('annotations'),
                'kube_container_name': state.get_kube_container_name(c_id),
            }
        if labels:
            platform_kwargs['docker_labels'] = labels
        return self.config_store.get_checks_to_refresh(identifier,
                                                       **platform_kwargs)

    def _get_container_pid(self, state, cid, tpl_var):
        """Extract the host-namespace pid of the container pid 0"""
        pid = state.inspect_container(cid).get('State', {}).get('Pid')
        if not pid:
            return None

        return str(pid)

    def _get_host_address(self, state, c_id, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_inspect = state.inspect_container(c_id)
        c_id = c_inspect.get('Id', '')
        c_img = self.dockerutil.image_name_extractor(c_inspect)

        networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
        ip_dict = {}
        for net_name, net_desc in networks.iteritems():
            ip = net_desc.get('IPAddress')
            if ip:
                ip_dict[net_name] = ip
        ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
        if ip_addr:
            return ip_addr

        # try to get the bridge (default) IP address
        log.debug("No IP address was found in container %s (%s) "
                  "networks, trying with the IPAddress field" %
                  (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if Platform.is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_ip = state.get_kube_config(c_id, 'status').get('podIP')
            if pod_ip:
                return pod_ip

        if Platform.is_rancher():
            # try to get the rancher IP address
            log.debug("No IP address was found in container %s (%s) "
                      "trying with the Rancher label" % (c_id[:12], c_img))

            ip_addr = c_inspect.get('Config',
                                    {}).get('Labels',
                                            {}).get(RANCHER_CONTAINER_IP)
            if ip_addr:
                return ip_addr.split('/')[0]

        log.error("No IP address was found for container %s (%s)" %
                  (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_', 1)

        # no specifier
        if len(tpl_parts) < 2:
            log.debug("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        else:
            res = ip_dict.get(tpl_parts[-1])
            if res is None:
                log.warning(
                    "The key passed for template variable %s was not found." %
                    tpl_var)
                return self._get_fallback_ip(ip_dict)
            else:
                return res

    def _get_fallback_ip(self, ip_dict):
        """try to pick the bridge key, falls back to the value of the last key"""
        if 'bridge' in ip_dict:
            log.debug("Using the bridge network.")
            return ip_dict['bridge']
        else:
            last_key = sorted(ip_dict.iterkeys())[-1]
            log.debug("Trying with the last (sorted) network: '%s'." %
                      last_key)
            return ip_dict[last_key]
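
    # Illustrative usage (fabricated addresses) of the two helpers above:
    #   _extract_ip_from_networks({'bridge': '172.17.0.2', 'mynet': '10.0.0.3'}, 'host_mynet')
    #       -> '10.0.0.3' (the part after the first underscore selects the network)
    #   _extract_ip_from_networks({'bridge': '172.17.0.2', 'mynet': '10.0.0.3'}, 'host')
    #       -> '172.17.0.2' (no key given, _get_fallback_ip picks the bridge network)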

    def _get_port(self, state, c_id, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        container_inspect = state.inspect_container(c_id)

        try:
            ports = map(lambda x: x.split('/')[0],
                        container_inspect['NetworkSettings']['Ports'].keys())
            if len(ports) == 0:
                # A 'Ports' key may exist in NetworkSettings with no ports in
                # it, so we raise IndexError to fall back to ExposedPorts below
                raise IndexError
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            ports = map(
                lambda x: x.split('/')[0],
                container_inspect['Config'].get('ExposedPorts', {}).keys())

            # if it failed, try with the kubernetes API
            if not ports and Platform.is_k8s():
                log.debug(
                    "Didn't find the port for container %s (%s), trying the kubernetes way."
                    % (c_id[:12], container_inspect.get('Config', {}).get(
                        'Image', '')))
                spec = state.get_kube_container_spec(c_id)
                if spec:
                    ports = [
                        str(x.get('containerPort'))
                        for x in spec.get('ports', [])
                    ]
        ports = sorted(ports, key=int)
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None

        tpl_parts = tpl_var.split('_', 1)

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error(
                "Port index is not an integer. Using the last element instead."
            )
        except IndexError:
            log.error(
                "Port index is out of range. Using the last element instead.")
        return ports[-1]
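
    # Illustrative usage (fabricated ports) of the resolution above:
    #   _extract_port_from_list(['3306', '8080'], 'port')    -> '8080' (no index, last element)
    #   _extract_port_from_list(['3306', '8080'], 'port_0')  -> '3306' (index 0)
    #   _extract_port_from_list(['3306', '8080'], 'port_9')  -> '8080' (out of range, last element)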

    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        c_inspect = state.inspect_container(c_id)
        tags = self.dockerutil.extract_container_tags(c_inspect)

        if Platform.is_k8s():
            if not self.kubeutil or not self.kubeutil.init_success:
                log.warning(
                    "kubelet client not initialized, kubernetes tags will be missing."
                )
                return tags

            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags will be missing." % c_id[:12])
                return tags

            # get pod labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get kubernetes namespace
            namespace = pod_metadata.get('namespace')
            tags.append('kube_namespace:%s' % namespace)

            # add creator tags (self.kubeutil is guaranteed to be set here by
            # the guard at the top of this method)
            creator_tags = self.kubeutil.get_pod_creator_tags(pod_metadata)
            tags.extend(creator_tags)

            # add services tags
            if self.kubeutil.collect_service_tag:
                services = self.kubeutil.match_services_for_pod(pod_metadata)
                for s in services:
                    if s is not None:
                        tags.append('kube_service:%s' % s)

        elif Platform.is_swarm():
            c_labels = c_inspect.get('Config', {}).get('Labels', {})
            swarm_svc = c_labels.get(SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        elif Platform.is_rancher():
            service_name = c_inspect.get('Config',
                                         {}).get('Labels',
                                                 {}).get(RANCHER_SVC_NAME)
            stack_name = c_inspect.get('Config',
                                       {}).get('Labels',
                                               {}).get(RANCHER_STACK_NAME)
            container_name = c_inspect.get('Config', {}).get(
                'Labels', {}).get(RANCHER_CONTAINER_NAME)
            if service_name:
                tags.append('rancher_service:%s' % service_name)
            if stack_name:
                tags.append('rancher_stack:%s' % stack_name)
            if container_name:
                tags.append('rancher_container:%s' % container_name)

        if self.metadata_collector.has_detected():
            orch_tags = self.metadata_collector.get_container_tags(
                co=c_inspect)
            tags.extend(orch_tags)

        return tags

    def _get_container_name(self, state, c_id, tpl_var):
        container_inspect = state.inspect_container(c_id)
        return container_inspect.get('Name', '').lstrip('/')

    def _get_additional_tags(self, state, c_id, *args):
        tags = []

        if Platform.is_k8s():
            pod_metadata = state.get_kube_config(c_id, 'metadata')
            pod_spec = state.get_kube_config(c_id, 'spec')
            if pod_metadata is None or pod_spec is None:
                log.warning(
                    "Failed to fetch pod metadata or pod spec for container %s."
                    " Additional Kubernetes tags may be missing." % c_id[:12])
                return []
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))

            c_inspect = state.inspect_container(c_id)
            c_name = c_inspect.get('Config', {}).get('Labels', {}).get(
                KubeUtil.CONTAINER_NAME_LABEL)
            if c_name:
                tags.append('kube_container_name:%s' % c_name)
        return tags

    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        if not self.dockerutil.client:
            log.warning(
                "Docker client is not initialized, pausing auto discovery.")
            return configs

        state = self._make_fetch_state()
        containers = [(self.dockerutil.image_name_extractor(container),
                       container.get('Id'), container.get('Labels'))
                      for container in self.dockerutil.client.containers()]

        for image, cid, labels in containers:
            try:
                # value of the STACKSTATE_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(
                    state, cid, identifier, labels) or []
                for conf in check_configs:
                    source, (check_name, init_config, instance) = conf

                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if isinstance(instance, list):
                            configs[check_name] = (source, (init_config,
                                                            instance))
                        else:
                            configs[check_name] = (source, (init_config,
                                                            [instance]))
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {}. ' \
                            'Keeping the first one found.'
                        if configs[check_name][1][0] != init_config:
                            log.warning(conflict_init_msg.format(check_name))
                        if isinstance(instance, list):
                            for inst in instance:
                                configs[check_name][1][1].append(inst)
                        else:
                            configs[check_name][1][1].append(instance)
            except Exception:
                log.exception(
                    'Building config for container %s based on image %s using service '
                    'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Look for a STACKSTATE_ID label, return its value or the image name if missing"""
        return labels.get(STACKSTATE_ID) or image

    def _get_check_configs(self, state, c_id, identifier, labels=None):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_container_name': state.get_kube_container_name(c_id),
                'kube_annotations': kube_metadata.get('annotations'),
            }
        if labels:
            platform_kwargs['docker_labels'] = labels

        config_templates = self._get_config_templates(identifier,
                                                      **platform_kwargs)
        if not config_templates:
            return None

        check_configs = []
        tags = self.get_tags(state, c_id)
        for config_tpl in config_templates:
            source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # covering mono-instance and multi-instance cases
            tmpl_array = instance_tpl
            if not isinstance(instance_tpl, list):
                tmpl_array = [instance_tpl]

            # insert tags in instance_tpl and process values for template variables
            result_instances = []
            result_init_config = None
            for inst_tmpl in tmpl_array:
                instance_tpl, var_values = self._fill_tpl(
                    state, c_id, inst_tmpl, variables, tags)
                tpl = self._render_template(init_config_tpl or {}, instance_tpl
                                            or {}, var_values)
                if tpl and len(tpl) == 2:
                    init_config, instance = tpl
                    result_instances.append(instance)
                    if not result_init_config:
                        result_init_config = init_config
                    elif result_init_config != init_config:
                        log.warning(
                            "Different versions of `init_config` found for "
                            "check {}. Keeping the first one found.".format(
                                check_name))
            check_configs.append(
                (source, (check_name, result_init_config, result_instances)))

        return check_configs

    def _get_config_templates(self, identifier, **platform_kwargs):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        auto_conf = config_backend is None

        # format [(source, ('ident', {init_tpl}, {instance_tpl}))]
        raw_tpls = self.config_store.get_check_tpls(identifier,
                                                    auto_conf=auto_conf,
                                                    **platform_kwargs)
        for tpl in raw_tpls:
            # each template can come from either auto configuration or user-supplied templates
            try:
                source, (check_name, init_config_tpl, instance_tpl) = tpl
            except (TypeError, IndexError, ValueError):
                log.debug(
                    'No template was found for identifier %s, leaving it alone: %s'
                    % (identifier, tpl))
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict) and not isinstance(
                        instance_tpl, list):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception(
                    'Failed to decode the JSON template fetched for check {0}. Its'
                    ' configuration by service discovery failed for identifier {1}.'.format(
                        check_name, identifier))
                return None

            templates.append((source, (check_name, init_config_tpl,
                                       instance_tpl, variables)))

        return templates

    def _fill_tpl(self, state, c_id, instance_tpl, variables, c_tags=None):
        """Add container tags to instance templates and build a
           dict from template variable names and their values."""
        var_values = {}
        c_image = state.inspect_container(c_id).get('Config',
                                                    {}).get('Image', '')

        # add only default c_tags to the instance to avoid duplicate tags from conf
        if c_tags:
            tags = c_tags[:]  # shallow copy of the c_tags array
        else:
            tags = []
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            if isinstance(tpl_tags, dict):
                for key, val in tpl_tags.iteritems():
                    tags.append("{}:{}".format(key, val))
            else:
                tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](state, c_id, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." %
                                         var)
                    var_values[var] = res
                except Exception as ex:
                    log.error(
                        "Could not find a value for the template variable %s for container %s "
                        "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error(
                    "No method was found to interpolate template variable %s for container %s "
                    "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
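
    # Illustrative sketch (fabricated template): for an instance template like
    #   {'server': '%%host%%', 'port': '%%port_0%%', 'tags': ['env:demo']}
    # on a container whose bridge IP is 172.17.0.2 and whose lowest exposed
    # port is 3306, _fill_tpl returns
    #   var_values == {'host': '172.17.0.2', 'port_0': '3306'}
    # and _render_template then substitutes those values into the final
    # instance configuration.
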
Example #26
 def get_host_tags(self):
     return KubeUtil().get_node_hosttags()
Example #27
 def test_get_auth_token(self):
     KubeUtil.AUTH_TOKEN_PATH = '/foo/bar'
     self.assertIsNone(KubeUtil.get_auth_token())
     KubeUtil.AUTH_TOKEN_PATH = Fixtures.file('events.json')  # any file could do the trick
     self.assertIsNotNone(KubeUtil.get_auth_token())
Example #28
 def setUp(self):
     self.kubeutil = KubeUtil()
Example #29
class TestKubeutil(unittest.TestCase):
    def setUp(self):
        self.kubeutil = KubeUtil()

    @mock.patch('utils.kubernetes.KubeUtil.retrieve_pods_list', side_effect=['foo'])
    @mock.patch('utils.kubernetes.KubeUtil.extract_kube_labels')
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        self.kubeutil.get_kube_labels(excluded_keys='bar')
        retrieve_pods_list.assert_called_once()
        extract_kube_labels.assert_called_once_with('foo', excluded_keys='bar')

    def test_extract_kube_labels(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_kube_labels({}, ['foo'])
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 8)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)

    def test_extract_meta(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_meta({}, 'foo')
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 4)

    @mock.patch('utils.kubernetes.kubeutil.retrieve_json')
    def test_retrieve_pods_list(self, retrieve_json):
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch('utils.kubernetes.kubeutil.retrieve_json')
    def test_retrieve_machine_info(self, retrieve_json):
        self.kubeutil.retrieve_machine_info()
        retrieve_json.assert_called_once_with(self.kubeutil.machine_info_url)

    @mock.patch('utils.kubernetes.kubeutil.retrieve_json')
    def test_retrieve_metrics(self, retrieve_json):
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.filter_pods_list({}, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.9')
        self.assertEqual(len(res.get('items')), 5)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.5')
        self.assertEqual(len(res.get('items')), 1)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

    @mock.patch('utils.kubernetes.kubeutil.requests')
    def test_retrieve_json_auth(self, r):
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_once_with('url', verify=False, timeout=10, headers={'Authorization': 'Bearer foo_tok'})

        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_with('url', verify=__file__, timeout=10, headers={'Authorization': 'Bearer foo_tok'})

    def test_get_node_info(self):
        with mock.patch('utils.kubernetes.KubeUtil._fetch_host_data') as f:
            self.kubeutil.get_node_info()
            f.assert_called_once()

            f.reset_mock()

            self.kubeutil._node_ip = 'foo'
            self.kubeutil._node_name = 'bar'
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, 'foo')
            self.assertEqual(name, 'bar')
            f.assert_not_called()

    def test__fetch_host_data(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        with mock.patch('utils.kubernetes.KubeUtil.retrieve_pods_list') as mock_pods:
            self.kubeutil.host_name = 'dd-agent-1rxlh'
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'kubernetes-massi-minion-k23m')

            self.kubeutil.host_name = 'heapster-v11-l8sh1'
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'gke-cluster-1-8046fdfa-node-ld35')

    def test_get_auth_token(self):
        KubeUtil.AUTH_TOKEN_PATH = '/foo/bar'
        self.assertIsNone(KubeUtil.get_auth_token())
        KubeUtil.AUTH_TOKEN_PATH = Fixtures.file('events.json')  # any file could do the trick
        self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        os.environ.pop('KUBERNETES_PORT', None)  # os.unsetenv() would not update os.environ
        self.assertFalse(Platform.is_k8s())
        os.environ['KUBERNETES_PORT'] = '999'
        self.assertTrue(Platform.is_k8s())

    def test_extract_event_tags(self):
        events = json.loads(Fixtures.read_file("events.json", string_escape=False))['items']
        for ev in events:
            tags = KubeUtil().extract_event_tags(ev)
            # there should be 4 tags except for some events where source.host is missing
            self.assertTrue(len(tags) >= 3)

            tag_names = [tag.split(':')[0] for tag in tags]
            self.assertIn('reason', tag_names)
            self.assertIn('namespace', tag_names)
            self.assertIn('object_type', tag_names)
            if len(tags) == 4:
                self.assertIn('node_name', tag_names)
Example #30
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception("Docker check only supports one configured instance.")
        AgentCheck.__init__(self, name, init_config,
                            agentConfig, instances=instances)

        self.init_success = False
        self._service_discovery = agentConfig.get('service_discovery') and \
            agentConfig.get('service_discovery_backend') == 'docker'
        self.init()

    def init(self):
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()

            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            if Platform.is_k8s():
                self.kubeutil = KubeUtil()

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True

    def check(self, instance):
        """Run the Docker check for one instance."""
        if not self.init_success:
            # Initialization can fail if cgroups are not ready. So we retry if needed
            # https://github.com/DataDog/dd-agent/issues/1896
            self.init()
            if not self.init_success:
                # Initialization failed, will try later
                return

        # Report image metrics
        if self.collect_image_stats:
            self._count_and_weigh_images()

        if self.collect_ecs_tags:
            self.refresh_ecs_tags()

        if Platform.is_k8s():
            try:
                self.kube_labels = self.kubeutil.get_kube_labels()
            except Exception as e:
                self.log.warning('Could not retrieve kubernetes labels: %s' % str(e))
                self.kube_labels = {}

        # containers running with custom cgroups?
        custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False))

        # Get the list of containers and the index of their names
        containers_by_id = self._get_and_count_containers(custom_cgroups)
        containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups)

        # Send events from Docker API
        if self.collect_events or self._service_discovery:
            self._process_events(containers_by_id)

        # Report performance container metrics (cpu, mem, net, io)
        self._report_performance_metrics(containers_by_id)

        if self.collect_container_size:
            self._report_container_size(containers_by_id)

        # Collect disk stats from Docker info command
        if self.collect_disk_stats:
            self._report_disk_stats()

    def _count_and_weigh_images(self):
        try:
            tags = self._get_tags()
            active_images = self.docker_client.images(all=False)
            active_images_len = len(active_images)
            all_images_len = len(self.docker_client.images(quiet=True, all=True))
            self.gauge("docker.images.available", active_images_len, tags=tags)
            self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags)

            if self.collect_image_size:
                self._report_image_size(active_images)

        except Exception as e:
            # It's not an important metric, keep going if it fails
            self.warning("Failed to count Docker images. Exception: {0}".format(e))

    def _get_and_count_containers(self, custom_cgroups=False):
        """List all the containers from the API, filter and count them."""

        # Querying the size of containers is slow, we don't do it at each run
        must_query_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE

        running_containers_count = Counter()
        all_containers_count = Counter()

        try:
            containers = self.docker_client.containers(all=True, size=must_query_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               message=message)
            raise Exception(message)

        else:
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

        # Create a set of filtered containers based on the exclude/include rules
        # and cache these rules in docker_util
        self._filter_containers(containers)

        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            container_status_tags = self._get_tags(container, CONTAINER)

            all_containers_count[tuple(sorted(container_status_tags))] += 1
            if self._is_container_running(container):
                running_containers_count[tuple(sorted(container_status_tags))] += 1

            # Check if the container is included/excluded via its tags
            if self._is_container_excluded(container):
                self.log.debug("Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            # Grab the pid via the API when custom cgroups are in use - otherwise
            # we won't find the process when crawling /proc for pids.
            if custom_cgroups:
                try:
                    inspect_dict = self.docker_client.inspect_container(container_name)
                    container['_pid'] = inspect_dict['State']['Pid']
                except Exception as e:
                    self.log.debug("Unable to inspect Docker container: %s", e)


        for tags, count in running_containers_count.iteritems():
            self.gauge("docker.containers.running", count, tags=list(tags))

        for tags, count in all_containers_count.iteritems():
            stopped_count = count - running_containers_count[tags]
            self.gauge("docker.containers.stopped", stopped_count, tags=list(tags))

        return containers_by_id

    def _is_container_running(self, container):
        """Tell if a container is running, according to its status.

        There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated.
        See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35
        """
        return container["Status"].startswith("Up") or container["Status"].startswith("Restarting")
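
    # For illustration (sample values, not from the Docker docs): typical
    # "Status" strings and how this predicate treats them:
    #   {"Status": "Up 2 hours"}                     -> True
    #   {"Status": "Restarting (137) 3 seconds ago"} -> True
    #   {"Status": "Exited (0) 5 minutes ago"}       -> False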

    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names."""
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)

        if entity is not None:
            pod_name = None

            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k in labels:
                        v = labels[k]
                        if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                            pod_name = v
                            k = "pod_name"
                            if "-" in pod_name:
                                replication_controller = "-".join(pod_name.split("-")[:-1])
                                if "/" in replication_controller: # k8s <= 1.1
                                    namespace, replication_controller = replication_controller.split("/", 1)

                                elif KubeUtil.NAMESPACE_LABEL in labels: # k8s >= 1.2
                                    namespace = labels[KubeUtil.NAMESPACE_LABEL]
                                    pod_name = "{0}/{1}".format(namespace, pod_name)

                                tags.append("kube_namespace:%s" % namespace)
                                tags.append("kube_replication_controller:%s" % replication_controller)
                                tags.append("pod_name:%s" % pod_name)

                        elif not v:
                            tags.append(k)

                        else:
                            tags.append("%s:%s" % (k,v))

                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels:
                        tags.append("pod_name:no_pod")

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add ECS tags
            if self.collect_ecs_tags:
                entity_id = entity.get("Id")
                if entity_id in self.ecs_tags:
                    ecs_tags = self.ecs_tags[entity_id]
                    tags.extend(ecs_tags)

            # Add kube labels
            if Platform.is_k8s():
                kube_tags = self.kube_labels.get(pod_name)
                if kube_tags:
                    tags.extend(list(kube_tags))

        return tags
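
    # Worked example (label value assumed, k8s <= 1.1 style): a container
    # labeled {KubeUtil.POD_NAME_LABEL: "default/dd-agent-1a2b3"} yields
    #   kube_namespace:default
    #   kube_replication_controller:dd-agent
    #   pod_name:default/dd-agent-1a2b3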

    def _extract_tag_value(self, entity, tag_name):
        """Extra tag information from the API result (containers or images).
        Cache extracted tags inside the entity object.
        """
        if tag_name not in TAG_EXTRACTORS:
            self.warning("{0} isn't a supported tag".format(tag_name))
            return

        # Check for already extracted tags
        if "_tag_values" not in entity:
            entity["_tag_values"] = {}

        if tag_name not in entity["_tag_values"]:
            entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity)

        return entity["_tag_values"][tag_name]

    def refresh_ecs_tags(self):
        ecs_config = self.docker_client.inspect_container('ecs-agent')
        ip = ecs_config.get('NetworkSettings', {}).get('IPAddress')
        ports = ecs_config.get('NetworkSettings', {}).get('Ports')
        port = ports.keys()[0].split('/')[0] if ports else None
        if not ip:
            port = ECS_INTROSPECT_DEFAULT_PORT
            if Platform.is_containerized() and self.docker_gateway:
                ip = self.docker_gateway
            else:
                ip = "localhost"

        ecs_tags = {}
        try:
            if ip and port:
                tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json()
                for task in tasks.get('Tasks', []):
                    for container in task.get('Containers', []):
                        tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']]
                        ecs_tags[container['DockerId']] = tags
        except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e:
            self.log.warning("Unable to collect ECS task names: %s" % e)

        self.ecs_tags = ecs_tags
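
    # The ECS introspection payload parsed above looks roughly like this
    # (shape assumed for illustration, abridged):
    #   {"Tasks": [{"Family": "my-task", "Version": "3",
    #               "Containers": [{"DockerId": "1a2b...", "Name": "web"}]}]}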

    def _filter_containers(self, containers):
        if not self.docker_util.filtering_enabled:
            return

        self._filtered_containers = set()
        for container in containers:
            container_tags = self._get_tags(container, FILTERED)
            # exclude/include patterns are stored in docker_util to share them with other container-related checks
            if self.docker_util.are_tags_filtered(container_tags):
                container_name = DockerUtil.container_name_extractor(container)[0]
                self._filtered_containers.add(container_name)
                self.log.debug("Container {0} is filtered".format(container_name))

    def _is_container_excluded(self, container):
        """Check if a container is excluded according to the filter rules.

        Requires _filter_containers to run first.
        """
        container_name = DockerUtil.container_name_extractor(container)[0]
        return container_name in self._filtered_containers

    def _report_container_size(self, containers_by_id):
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            m_func = FUNC_MAP[GAUGE][self.use_histogram]
            if "SizeRw" in container:

                m_func(self, 'docker.container.size_rw', container['SizeRw'],
                       tags=tags)
            if "SizeRootFs" in container:
                m_func(
                    self, 'docker.container.size_rootfs', container['SizeRootFs'],
                    tags=tags)

    def _report_image_size(self, images):
        for image in images:
            tags = self._get_tags(image, IMAGE)
            if 'VirtualSize' in image:
                self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags)
            if 'Size' in image:
                self.gauge('docker.image.size', image['Size'], tags=tags)

    # Performance metrics

    def _report_performance_metrics(self, containers_by_id):

        containers_without_proc_root = []
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container) or not self._is_container_running(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            self._report_cgroup_metrics(container, tags)
            if "_proc_root" not in container:
                containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0])
                continue
            self._report_net_metrics(container, tags)

        if containers_without_proc_root:
            message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
                ", ".join(containers_without_proc_root))
            if not Platform.is_k8s():
                self.warning(message)
            else:
                # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
                self.log.debug(message)

    def _report_cgroup_metrics(self, container, tags):
        try:
            for cgroup in CGROUP_METRICS:
                stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file'])
                stats = self._parse_cgroup_file(stat_file)
                if stats:
                    for key, (dd_key, metric_func) in cgroup['metrics'].iteritems():
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if key in stats:
                            metric_func(self, dd_key, int(stats[key]), tags=tags)

                    # Computed metrics
                    for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems():
                        values = [stats[key] for key in key_list if key in stats]
                        if len(values) != len(key_list):
                            self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname))
                            continue
                        value = fct(*values)
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if value is not None:
                            metric_func(self, mname, value, tags=tags)

        except MountException as ex:
            if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES:
                raise ex
            else:
                self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now."
                             "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries))
                self.cgroup_listing_retries += 1
        else:
            self.cgroup_listing_retries = 0
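
    # CGROUP_METRICS is defined elsewhere in this check; an entry consumed by
    # the loops above looks roughly like this (structure assumed, abridged):
    #   {"cgroup": "memory",
    #    "file": "memory.stat",
    #    "metrics": {"cache": ("docker.mem.cache", GAUGE),
    #                "rss": ("docker.mem.rss", GAUGE)},
    #    "to_compute": {"docker.mem.sw_in_use": (["swap", "limit"],
    #                                            lambda x, y: float(x) / y, GAUGE)}}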

    def _report_net_metrics(self, container, tags):
        """Find container network metrics by looking at /proc/$PID/net/dev of the container process."""
        if self._disable_net_metrics:
            self.log.debug("Network metrics are disabled. Skipping")
            return

        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')
        try:
            with open(proc_net_file, 'r') as fp:
                lines = fp.readlines()
                """Two first lines are headers:
                Inter-|   Receive                                                |  Transmit
                 face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
                """
                for l in lines[2:]:
                    cols = l.split(':', 1)
                    interface_name = str(cols[0]).strip()
                    if interface_name == 'eth0':
                        x = cols[1].split()
                        m_func = FUNC_MAP[RATE][self.use_histogram]
                        m_func(self, "docker.net.bytes_rcvd", long(x[0]), tags)
                        m_func(self, "docker.net.bytes_sent", long(x[8]), tags)
                        break
        except Exception as e:
            # It is possible that the container got stopped between the API call and now
            self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e))

    def _process_events(self, containers_by_id):
        if self.collect_events is False:
            # Crawl events for service discovery only
            self._get_events()
            return
        try:
            api_events = self._get_events()
            aggregated_events = self._pre_aggregate_events(api_events, containers_by_id)
            events = self._format_events(aggregated_events, containers_by_id)
        except (socket.timeout, urllib2.URLError):
            self.warning('Timeout when collecting events. Events will be missing.')
            return
        except Exception as e:
            self.warning("Unexpected exception when collecting events: {0}. "
                         "Events will be missing".format(e))
            return

        for ev in events:
            self.log.debug("Creating event: %s" % ev['msg_title'])
            self.event(ev)

    def _get_events(self):
        """Get the list of events."""
        events, changed_container_ids = self.docker_util.get_events()
        if changed_container_ids and self._service_discovery:
            get_sd_backend(self.agentConfig).update_checks(changed_container_ids)
        return events

    def _pre_aggregate_events(self, api_events, containers_by_id):
        # Aggregate events, one per image. Put newer events first.
        events = defaultdict(deque)
        for event in api_events:
            # Skip events related to filtered containers
            container = containers_by_id.get(event.get('id'))
            if container is not None and self._is_container_excluded(container):
                self.log.debug("Excluded event: container {0} status changed to {1}".format(
                    event['id'], event['status']))
                continue
            # from may be missing (for network events for example)
            if 'from' in event:
                events[event['from']].appendleft(event)
        return events

    def _format_events(self, aggregated_events, containers_by_id):
        events = []
        for image_name, event_group in aggregated_events.iteritems():
            container_tags = set()
            low_prio_events = []
            normal_prio_events = []

            for event in event_group:
                container_name = event['id'][:11]

                if event['id'] in containers_by_id:
                    cont = containers_by_id[event['id']]
                    container_name = DockerUtil.container_name_extractor(cont)[0]
                    container_tags.update(self._get_tags(cont, PERFORMANCE))
                    container_tags.add('container_name:%s' % container_name)

                # health checks generate tons of these so we treat them separately and lower their priority
                if event['status'].startswith('exec_create:') or event['status'].startswith('exec_start:'):
                    low_prio_events.append((event, container_name))
                else:
                    normal_prio_events.append((event, container_name))

            exec_event = self._create_dd_event(low_prio_events, image_name, container_tags, priority='Low')
            if exec_event:
                events.append(exec_event)

            normal_event = self._create_dd_event(normal_prio_events, image_name, container_tags, priority='Normal')
            if normal_event:
                events.append(normal_event)

        return events
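
    # Worked example (names assumed): two "die" events and one "start" event
    # for image "redis:3.2" aggregate, via the _create_dd_event helper below,
    # into a single Datadog event titled
    #   redis:3.2 2 die, 1 start on <hostname>
    # with the per-container status changes listed in msg_text.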

    def _create_dd_event(self, events, image, c_tags, priority='Normal'):
        """Create the actual event to submit from a list of similar docker events"""
        if not events:
            return

        max_timestamp = 0
        status = defaultdict(int)
        status_change = []

        for ev, c_name in events:
            max_timestamp = max(max_timestamp, int(ev['time']))
            status[ev['status']] += 1
            status_change.append([c_name, ev['status']])

        status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()])
        msg_title = "%s %s on %s" % (image, status_text, self.hostname)
        msg_body = (
            "%%%\n"
            "{image_name} {status} on {hostname}\n"
            "```\n{status_changes}\n```\n"
            "%%%"
        ).format(
            image_name=image,
            status=status_text,
            hostname=self.hostname,
            status_changes="\n".join(
                ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change])
        )

        if any(error in status_text for error in ERROR_ALERT_TYPE):
            alert_type = "error"
        else:
            alert_type = None

        return {
            'timestamp': max_timestamp,
            'host': self.hostname,
            'event_type': EVENT_TYPE,
            'msg_title': msg_title,
            'msg_text': msg_body,
            'source_type_name': EVENT_TYPE,
            'event_object': 'docker:%s' % image,
            'tags': list(c_tags),
            'alert_type': alert_type,
            'priority': priority
        }

    def _report_disk_stats(self):
        """Report metrics about the volume space usage"""
        stats = {
            'docker.data.used': None,
            'docker.data.total': None,
            'docker.data.free': None,
            'docker.metadata.used': None,
            'docker.metadata.total': None,
            'docker.metadata.free': None
            # these two are calculated by _calc_percent_disk_stats
            # 'docker.data.percent': None,
            # 'docker.metadata.percent': None
        }
        info = self.docker_client.info()
        driver_status = info.get('DriverStatus', [])
        if not driver_status:
            self.log.warning('Disk metrics collection is enabled but docker info did not'
                             ' report any. Your storage driver might not support them, skipping.')
            return
        for metric in driver_status:
            # only consider metrics about disk space
            if len(metric) == 2 and 'Space' in metric[0]:
                # identify Data and Metadata metrics
                mtype = 'data'
                if 'Metadata' in metric[0]:
                    mtype = 'metadata'

                if 'Used' in metric[0]:
                    stats['docker.{0}.used'.format(mtype)] = metric[1]
                elif 'Space Total' in metric[0]:
                    stats['docker.{0}.total'.format(mtype)] = metric[1]
                elif 'Space Available' in metric[0]:
                    stats['docker.{0}.free'.format(mtype)] = metric[1]
        stats = self._format_disk_metrics(stats)
        stats.update(self._calc_percent_disk_stats(stats))
        tags = self._get_tags()
        for name, val in stats.iteritems():
            if val is not None:
                self.gauge(name, val, tags)

    def _format_disk_metrics(self, metrics):
        """Cast the disk stats to float and convert them to bytes"""
        for name, raw_val in metrics.iteritems():
            if raw_val:
                val, unit = raw_val.split(' ')
                # by default some are uppercased others lowercased. That's error prone.
                unit = unit.lower()
                try:
                    val = int(float(val) * UNIT_MAP[unit])
                    metrics[name] = val
                except KeyError:
                    self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' % (unit, name))
                    metrics[name] = None
        return metrics
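
    # UNIT_MAP is defined elsewhere in this check; assuming the usual binary
    # factors ({'kb': 1024, 'mb': 1024 ** 2, 'gb': 1024 ** 3, ...}), a raw
    # value of "90.2 mb" becomes int(90.2 * 1048576) == 94581555 bytes.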

    def _calc_percent_disk_stats(self, stats):
        """Calculate a percentage of used disk space for data and metadata"""
        mtypes = ['data', 'metadata']
        percs = {}
        for mtype in mtypes:
            used = stats.get('docker.{0}.used'.format(mtype))
            total = stats.get('docker.{0}.total'.format(mtype))
            free = stats.get('docker.{0}.free'.format(mtype))
            if used and total and free and ceil(total) < free + used:
                self.log.debug('used, free, and total disk metrics may be wrong, '
                               'used: %s, free: %s, total: %s',
                               used, free, total)
                total = used + free
            try:
                if isinstance(used, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2)
                elif isinstance(free, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2)
            except ZeroDivisionError:
                self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent'
                               ' is not possible.'.format(mtype, mtype))
        return percs

    # Cgroups
    def _get_cgroup_from_proc(self, cgroup, pid, filename):
        """Find a specific cgroup file, containing metrics to extract."""
        params = {
            "file": filename,
        }
        return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % (params)

    def _parse_cgroup_file(self, stat_file):
        """Parse a cgroup pseudo file for key/values."""
        self.log.debug("Opening cgroup file: %s" % stat_file)
        try:
            with open(stat_file, 'r') as fp:
                if 'blkio' in stat_file:
                    return self._parse_blkio_metrics(fp.read().splitlines())
                else:
                    return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines()))
        except IOError:
            # It is possible that the container got stopped between the API call and now.
            # Some files can also be missing (like cpu.stat) and that's fine.
            self.log.info("Can't open %s. Some metrics for this container may be missing." % stat_file)

    def _parse_blkio_metrics(self, stats):
        """Parse the blkio metrics."""
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics
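
    # Sample blkio counter file content this parser handles (values
    # illustrative):
    #   8:0 Read 4096
    #   8:0 Write 1540096
    #   Total 1544192
    # -> {'io_read': 4096, 'io_write': 1540096}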

    def _is_container_cgroup(self, line, selinux_policy):
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]: # general case
            return True
        if 'docker' in selinux_policy: # selinux
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]): # kubernetes
            return True
        return False
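
    # Typical /proc/<pid>/cgroup lines, split on ':' into [id, subsystems,
    # path] (container ids shortened for display):
    #   4:cpu,cpuacct:/docker/d5a2...   -> matched, 'docker' in the path
    #   5:cpuacct,cpu:/d5a2...          -> matched via CONTAINER_ID_RE
    #   6:memory:/user.slice            -> ignored, wrong subsystem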

    # proc files
    def _crawl_container_pids(self, container_dict, custom_cgroups=False):
        """Crawl `/proc` to find container PIDs and add them to `containers_by_id`."""
        proc_path = os.path.join(self.docker_util._docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]

        if len(pid_dirs) == 0:
            self.warning("Unable to find any pid directory in {0}. "
                "If you are running the agent in a container, make sure to "
                'share the volume properly: "/proc:/host/proc:ro". '
                "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. "
                "Network metrics will be missing".format(proc_path))
            self._disable_net_metrics = True
            return container_dict

        self._disable_net_metrics = False

        for folder in pid_dirs:

            try:
                path = os.path.join(proc_path, folder, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]

                selinux_policy = ''
                path = os.path.join(proc_path, folder, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError as e:
                #  Issue #2074
                self.log.debug("Cannot read %s, "
                               "process likely raced to finish: %s" %
                               (path, str(e)))
                continue
            except Exception as e:
                self.warning("Cannot read %s : %s" % (path, str(e)))
                continue

            try:
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
                else:
                    continue

                matches = re.findall(CONTAINER_ID_RE, cpuacct)
                if matches:
                    container_id = matches[-1]
                    if container_id not in container_dict:
                        self.log.debug("Container %s not in container_dict, it's likely excluded", container_id)
                        continue
                    container_dict[container_id]['_pid'] = folder
                    container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder)
                elif custom_cgroups: # if we match by pid that should be enough (?) - O(n) ugh!
                    for _, container in container_dict.iteritems():
                        if container.get('_pid') == int(folder):
                            container['_proc_root'] = os.path.join(proc_path, folder)
                            break

            except Exception as e:
                self.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue
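
A minimal, standalone sketch of the /proc crawl performed by
_crawl_container_pids above. CONTAINER_ID_RE and the proc mount point are
assumptions for illustration; the real check derives them from its
configuration:

import os
import re

CONTAINER_ID_RE = re.compile('[0-9a-f]{64}')  # assumed id pattern

def find_container_pids(proc_root='/proc'):
    """Map container ids to the pid of one of their processes."""
    pids_by_container = {}
    for entry in os.listdir(proc_root):
        if not entry.isdigit():
            continue
        try:
            with open(os.path.join(proc_root, entry, 'cgroup')) as f:
                content = f.read()
        except IOError:
            continue  # the process may have exited while we were crawling
        match = CONTAINER_ID_RE.search(content)
        if match:
            pids_by_container.setdefault(match.group(0), entry)
    return pids_by_container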
Example #31
0
 def setUp(self):
     with patch.object(KubeUtil, '_locate_kubelet', return_value='http://localhost:10255'):
         self.kube = KubeUtil()
         self.kube.__init__()    # It's a singleton, force re-init
Example #32
0
 def test_get_auth_token(self):
     KubeUtil.AUTH_TOKEN_PATH = '/foo/bar'
     self.assertIsNone(KubeUtil.get_auth_token())
     KubeUtil.AUTH_TOKEN_PATH = Fixtures.file(
         'events.json')  # any file could do the trick
     self.assertIsNotNone(KubeUtil.get_auth_token())
Example #33
0
 def test_pod_name(self):
     with patch.object(KubeUtil, '_locate_kubelet', return_value='http://localhost:10255'):
         kube = KubeUtil()
         kube.__init__()
         self.assertEqual('test', kube.pod_name)
Example #34
0
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception(
                'Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.kubelet_api_url:
            raise Exception(
                'Unable to reach kubelet. Try setting the host parameter.')

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning(
                        'Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s'
                        % str(e))

    def _perform_kubelet_checks(self, url):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            req = self.kubeutil.perform_kubelet_query(url)
            for line in req.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(
                    2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base,
                               AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' %
                               (url, str(e)))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)
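
    # The kubelet /healthz?verbose output parsed above looks roughly like
    # (format assumed for illustration):
    #   [+]ping ok
    #   [+]syncloop ok
    #   [-]hostname failed: reason withheld
    # A '+' maps to AgentCheck.OK, anything else to AgentCheck.CRITICAL.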

    def check(self, instance):
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges
        ]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_rates
        ]

        self.publish_aliases = _is_affirmative(
            instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(
            instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        pods_list = self.kubeutil.retrieve_pods_list()

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance, pods_list)

        # kubelet events
        if _is_affirmative(
                instance.get('collect_events', DEFAULT_COLLECT_EVENTS)):
            try:
                self._process_events(instance, pods_list)
            except Exception as ex:
                self.log.error("Event collection failed: %s" % str(ex))

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any(
                [fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any(
                [fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags,
                                          depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split(
                    "/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" %
                            (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish metrics for a subcontainer and handle filtering on tags"""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        container_image = subcontainer['spec'].get('image')
        if container_image:
            tags.append('container_image:%s' % container_image)

            split = container_image.split(":")
            if len(split) > 2:
                # if the repo is in the image name and has the form 'docker.clearbit:5000'
                # the split will be like [repo_url, repo_port/image_name, image_tag]. Let's avoid that
                split = [':'.join(split[:-1]), split[-1]]

            tags.append('image_name:%s' % split[0])
            if len(split) == 2:
                tags.append('image_tag:%s' % split[1])

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer,
                                            kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer,
                                           kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        # if the container should be filtered we return its tags without publishing its metrics
        is_filtered = self.kubeutil.are_tags_filtered(tags)
        if is_filtered:
            self._filtered_containers.add(subcontainer['id'])
            return tags

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage']) / float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct',
                               fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS), tags)

        return tags

    def _update_metrics(self, instance, pods_list):
        def parse_quantity(s):
            number = ''
            unit = ''
            for c in s:
                if c.isdigit() or c == '.':
                    number += c
                else:
                    unit += c
            return float(number) * FACTORS.get(unit, 1)

        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(
            pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved cmd=%s' % self.metrics_cmd)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            try:
                tags = self._update_container_metrics(instance, subcontainer,
                                                      kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception as e:
                self.log.error(
                    "Unable to collect metrics for container: {0} ({1})".format(
                        c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list['items']:
            try:
                containers = pod['spec']['containers']
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug(
                    "Pod %s does not have containers specs, skipping...",
                    pod['metadata'].get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                c_id = name2id.get(c_name)

                if c_id in self._filtered_containers:
                    self.log.debug('Container {} is excluded'.format(c_name))
                    continue

                _tags = container_tags.get(c_id, [])

                # limits
                try:
                    for limit, value_str in container['resources'][
                            'limits'].iteritems():
                        values = [
                            parse_quantity(s)
                            for s in QUANTITY_EXP.findall(value_str)
                        ]
                        if len(values) != 1:
                            self.log.warning(
                                "Error parsing limits value string: %s",
                                value_str)
                            continue
                        self.publish_gauge(
                            self, '{}.{}.limits'.format(NAMESPACE, limit),
                            values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container limits for %s: %s",
                        c_name, e)
                    self.log.debug("Container object for {}: {}".format(
                        c_name, container))

                # requests
                try:
                    for request, value_str in container['resources'][
                            'requests'].iteritems():
                        values = [
                            parse_quantity(s)
                            for s in QUANTITY_EXP.findall(value_str)
                        ]
                        if len(values) != 1:
                            self.log.warning(
                                "Error parsing requests value string: %s",
                                value_str)
                            continue
                        self.publish_gauge(
                            self, '{}.{}.requests'.format(NAMESPACE, request),
                            values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container requests for %s: %s",
                        c_name, e)
                    self.log.debug("Container object for {}: {}".format(
                        c_name, container))

        self._update_pods_metrics(instance, pods_list)
        self._update_node(instance)
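
parse_quantity above depends on a FACTORS map defined elsewhere in the check.
An illustrative version (assumed constants) and the conversions it implies:

FACTORS = {'m': 0.001, 'K': 1000, 'Ki': 1024, 'M': 1000 ** 2, 'Mi': 1024 ** 2,
           'G': 1000 ** 3, 'Gi': 1024 ** 3}

# '250m' -> 250 * 0.001     == 0.25 (CPU cores)
# '64Mi' -> 64 * 1024 ** 2  == 67108864 (bytes)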
Example #35
0
 def setUp(self):
     self.kubeutil = KubeUtil()
Example #36
0
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.host:
            raise Exception('Unable to retrieve Docker hostname and host parameter is not set')

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning('Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s' % str(e))

    def _perform_kubelet_checks(self, url):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            r = requests.get(url, params={'verbose': True})
            for line in r.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' % (url, str(e)))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        pods_list = self.kubeutil.retrieve_pods_list()

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance, pods_list)

        # kubelet events
        if _is_affirmative(instance.get('collect_events', DEFAULT_COLLECT_EVENTS)):
            try:
                self._process_events(instance, pods_list)
            except Exception as ex:
                self.log.error("Event collection failed: %s" % str(ex))

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any([fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any([fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split("/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" % (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish metrics for a subcontainer and handle filtering on tags"""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        container_image = subcontainer['spec'].get('image')
        if container_image:
            tags.append('container_image:%s' % container_image)

            split = container_image.split(":")
            if len(split) > 2:
                # if the repo is in the image name and has the form 'docker.clearbit:5000'
                # the split will be like [repo_url, repo_port/image_name, image_tag]. Let's avoid that
                split = [':'.join(split[:-1]), split[-1]]

            tags.append('image_name:%s' % split[0])
            if len(split) == 2:
                tags.append('image_tag:%s' % split[1])

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer, kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer, kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        # if the container should be filtered we return its tags without publishing its metrics
        is_filtered = self.kubeutil.are_tags_filtered(tags)
        if is_filtered:
            self._filtered_containers.add(subcontainer['id'])
            return tags

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage'])/float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS),
                              tags)

        return tags

    def _update_metrics(self, instance, pods_list):
        def parse_quantity(s):
            number = ''
            unit = ''
            for c in s:
                if c.isdigit() or c == '.':
                    number += c
                else:
                    unit += c
            return float(number) * FACTORS.get(unit, 1)

        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved cmd=%s' % self.metrics_cmd)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            try:
                tags = self._update_container_metrics(instance, subcontainer, kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception as e:
                self.log.error("Unable to collect metrics for container: {0} ({1})".format(c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list['items']:
            try:
                containers = pod['spec']['containers']
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug("Pod %s does not have containers specs, skipping...", pod['metadata'].get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                c_id = name2id.get(c_name)

                if c_id in self._filtered_containers:
                    self.log.debug('Container {} is excluded'.format(c_name))
                    continue

                _tags = container_tags.get(c_id, [])

                # limits
                try:
                    for limit, value_str in container['resources']['limits'].iteritems():
                        values = [parse_quantity(s) for s in QUANTITY_EXP.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing limits value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.limits'.format(NAMESPACE, limit), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug("Unable to retrieve container limits for %s: %s", c_name, e)
                    self.log.debug("Container object for {}: {}".format(c_name, container))

                # requests
                try:
                    for request, value_str in container['resources']['requests'].iteritems():
                        values = [parse_quantity(s) for s in QUANTITY_EXP.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing requests value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.requests'.format(NAMESPACE, request), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.error("Unable to retrieve container requests for %s: %s", c_name, e)
                    self.log.debug("Container object for {}: {}".format(c_name, container))

        self._update_pods_metrics(instance, pods_list)
        self._update_node(instance)
Example #37
0
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""

    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.dockerutil = DockerUtil(config_store=self.config_store)
        self.docker_client = self.dockerutil.client
        if Platform.is_k8s():
            try:
                self.kubeutil = KubeUtil()
            except Exception as ex:
                self.kubeutil = None
                log.error("Couldn't instantiate the kubernetes client, "
                    "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

        if Platform.is_nomad():
            self.nomadutil = NomadUtil()
        elif Platform.is_ecs_instance():
            self.ecsutil = ECSUtil()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)

    def _make_fetch_state(self):
        pod_list = []
        if Platform.is_k8s():
            if not self.kubeutil:
                log.error("kubelet client not created, cannot retrieve pod list.")
            else:
                try:
                    pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
                except Exception as ex:
                    log.warning("Failed to retrieve pod list: %s" % str(ex))
        return _SDDockerBackendConfigFetchState(self.docker_client.inspect_container, pod_list)

    def update_checks(self, changed_containers):
        state = self._make_fetch_state()

        if Platform.is_k8s():
            self.kubeutil.check_services_cache_freshness()

        conf_reload_set = set()
        for c_id in changed_containers:
            checks = self._get_checks_to_refresh(state, c_id)
            if checks:
                conf_reload_set.update(set(checks))

        if conf_reload_set:
            self.reload_check_configs = conf_reload_set

    def _get_checks_to_refresh(self, state, c_id):
        """Get the list of checks applied to a container from the identifier_to_checks cache in the config store.
        Use the DATADOG_ID label or the image."""
        inspect = state.inspect_container(c_id)

        # If the container was removed we can't tell which check is concerned
        # so we have to reload everything.
        # Same thing if it's stopped and we're on Kubernetes in auto_conf mode
        # because the pod was deleted and its template could have been in the annotations.
        if not inspect or \
                (not inspect.get('State', {}).get('Running')
                    and Platform.is_k8s() and not self.agentConfig.get('sd_config_backend')):
            self.reload_check_configs = True
            return

        identifier = inspect.get('Config', {}).get('Labels', {}).get(DATADOG_ID) or \
            self.dockerutil.image_name_extractor(inspect)

        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_annotations': kube_metadata.get('annotations'),
                'kube_container_name': state.get_kube_container_name(c_id),
            }

        return self.config_store.get_checks_to_refresh(identifier, **platform_kwargs)

    def _get_host_address(self, state, c_id, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_inspect = state.inspect_container(c_id)
        c_id, c_img = c_inspect.get('Id', ''), c_inspect.get('Config', {}).get('Image', '')

        networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
        ip_dict = {}
        for net_name, net_desc in networks.iteritems():
            ip = net_desc.get('IPAddress')
            if ip:
                ip_dict[net_name] = ip
        ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
        if ip_addr:
            return ip_addr

        # try to get the bridge (default) IP address
        log.debug("No IP address was found in container %s (%s) "
            "networks, trying with the IPAddress field" % (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if Platform.is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_ip = state.get_kube_config(c_id, 'status').get('podIP')
            if pod_ip:
                return pod_ip

        if Platform.is_rancher():
            # try to get the rancher IP address
            log.debug("No IP address was found in container %s (%s) "
                "trying with the Rancher label" % (c_id[:12], c_img))

            ip_addr = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_CONTAINER_IP)
            if ip_addr:
                return ip_addr.split('/')[0]

        log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_', 1)

        # no specifier
        if len(tpl_parts) < 2:
            log.debug("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        else:
            res = ip_dict.get(tpl_parts[-1])
            if res is None:
                log.warning("The key passed for template variable %s was not found." % tpl_var)
                return self._get_fallback_ip(ip_dict)
            else:
                return res

    def _get_fallback_ip(self, ip_dict):
        """try to pick the bridge key, falls back to the value of the last key"""
        if 'bridge' in ip_dict:
            log.debug("Using the bridge network.")
            return ip_dict['bridge']
        else:
            last_key = sorted(ip_dict.iterkeys())[-1]
            log.debug("Trying with the last (sorted) network: '%s'." % last_key)
            return ip_dict[last_key]
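To summarize the resolution order, here is a condensed, standalone sketch of _extract_ip_from_networks plus _get_fallback_ip; extract_ip is a hypothetical helper written for this example only.

def extract_ip(ip_dict, tpl_var):
    # '%%host%%' has no specifier -> fallback; '%%host_mynet%%' names a network
    parts = tpl_var.split('_', 1)
    if len(parts) == 2 and parts[1] in ip_dict:
        return ip_dict[parts[1]]
    if 'bridge' in ip_dict:                            # prefer the bridge network
        return ip_dict['bridge']
    return ip_dict[sorted(ip_dict)[-1]] if ip_dict else None  # last sorted key

ips = {'bridge': '172.17.0.2', 'mynet': '10.0.0.3'}
assert extract_ip(ips, 'host') == '172.17.0.2'        # no specifier
assert extract_ip(ips, 'host_mynet') == '10.0.0.3'    # named network
assert extract_ip(ips, 'host_nope') == '172.17.0.2'   # unknown key -> fallback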

    def _get_port(self, state, c_id, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        container_inspect = state.inspect_container(c_id)

        try:
            ports = map(lambda x: x.split('/')[0], container_inspect['NetworkSettings']['Ports'].keys())
            if len(ports) == 0: # NetworkSettings may contain a 'Ports' key with no ports; raise IndexError to fall back to ExposedPorts
                raise IndexError
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            ports = map(lambda x: x.split('/')[0], container_inspect['Config'].get('ExposedPorts', {}).keys())

            # if it failed, try with the kubernetes API
            if not ports and Platform.is_k8s():
                log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                          (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
                spec = state.get_kube_container_spec(c_id)
                if spec:
                    ports = [str(x.get('containerPort')) for x in spec.get('ports', [])]
        ports = sorted(ports, key=int)
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None

        tpl_parts = tpl_var.split('_', 1)

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error("Port index is not an integer. Using the last element instead.")
        except IndexError:
            log.error("Port index is out of range. Using the last element instead.")
        return ports[-1]
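The port template variables follow the same indexing convention; below is a condensed, hypothetical mirror of _extract_port_from_list to make the behavior concrete.

def extract_port(ports, tpl_var):
    # condensed mirror of _extract_port_from_list above, for illustration only
    if not ports:
        return None
    parts = tpl_var.split('_', 1)
    if len(parts) == 1:                 # '%%port%%' -> last element
        return ports[-1]
    try:
        return ports[int(parts[1])]     # '%%port_N%%' -> N-th of the sorted list
    except (ValueError, IndexError):
        return ports[-1]                # bad or out-of-range index -> last element

ports = sorted(['8080', '443', '9000'], key=int)      # ['443', '8080', '9000']
assert extract_port(ports, 'port') == '9000'
assert extract_port(ports, 'port_0') == '443'
assert extract_port(ports, 'port_9') == '9000'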

    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []

        ctr = state.inspect_container(c_id)
        # TODO: extend with labels, container ID, etc.
        tags.append('docker_image:%s' % self.dockerutil.image_name_extractor(ctr))
        tags.append('image_name:%s' % self.dockerutil.image_tag_extractor(ctr, 0)[0])
        tags.append('image_tag:%s' % self.dockerutil.image_tag_extractor(ctr, 1)[0])

        if Platform.is_k8s():
            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_id[:12])
                return []

            # get pod labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get kubernetes namespace
            namespace = pod_metadata.get('namespace')
            tags.append('kube_namespace:%s' % namespace)

            # add creator tags
            creator_tags = self.kubeutil.get_pod_creator_tags(pod_metadata)
            tags.extend(creator_tags)

            # add services tags
            services = self.kubeutil.match_services_for_pod(pod_metadata)
            for s in services:
                if s is not None:
                    tags.append('kube_service:%s' % s)

        elif Platform.is_swarm():
            c_labels = state.inspect_container(c_id).get('Config', {}).get('Labels', {})
            swarm_svc = c_labels.get(SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        elif Platform.is_rancher():
            c_inspect = state.inspect_container(c_id)
            service_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_SVC_NAME)
            stack_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_STACK_NAME)
            container_name = c_inspect.get('Config', {}).get('Labels', {}).get(RANCHER_CONTAINER_NAME)
            if service_name:
                tags.append('rancher_service:%s' % service_name)
            if stack_name:
                tags.append('rancher_stack:%s' % stack_name)
            if container_name:
                tags.append('rancher_container:%s' % container_name)

        elif Platform.is_nomad():
            nomad_tags = self.nomadutil.extract_container_tags(state.inspect_container(c_id))
            if nomad_tags:
                tags.extend(nomad_tags)

        elif Platform.is_ecs_instance():
            ecs_tags = self.ecsutil.extract_container_tags(state.inspect_container(c_id))
            tags.extend(ecs_tags)

        return tags
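To make the output concrete, this is the kind of tag list get_tags can produce for a container in a Kubernetes pod; the values are invented for illustration and the exact set depends on pod labels, creators, and matched services.

tags = [
    'docker_image:nginx:1.9',
    'image_name:nginx',
    'image_tag:1.9',
    'app:web',                           # pod label
    'kube_namespace:default',
    'kube_replication_controller:web',   # creator tag
    'kube_service:web-svc',
]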

    def _get_additional_tags(self, state, c_id, *args):
        tags = []
        if Platform.is_k8s():
            pod_metadata = state.get_kube_config(c_id, 'metadata')
            pod_spec = state.get_kube_config(c_id, 'spec')
            if pod_metadata is None or pod_spec is None:
                log.warning("Failed to fetch pod metadata or pod spec for container %s."
                            " Additional Kubernetes tags may be missing." % c_id[:12])
                return []
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        state = self._make_fetch_state()
        containers = [(
            self.dockerutil.image_name_extractor(container),
            container.get('Id'), container.get('Labels')
        ) for container in self.docker_client.containers()]

        if Platform.is_k8s():
            self.kubeutil.check_services_cache_freshness()

        for image, cid, labels in containers:
            try:
                # value of the DATADOG_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(state, cid, identifier) or []
                for conf in check_configs:
                    source, (check_name, init_config, instance) = conf

                    # build instances list if needed
                    if configs.get(check_name) is None:
                        configs[check_name] = (source, (init_config, [instance]))
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {}. ' \
                            'Keeping the first one found.'
                        if configs[check_name][1][0] != init_config:
                            log.warning(conflict_init_msg.format(check_name))
                        configs[check_name][1][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service '
                              'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Look for a DATADOG_ID label, return its value or the image name if missing"""
        return labels.get(DATADOG_ID) or image

    def _get_check_configs(self, state, c_id, identifier):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_container_name': state.get_kube_container_name(c_id),
                'kube_annotations': kube_metadata.get('annotations'),
            }
        config_templates = self._get_config_templates(identifier, **platform_kwargs)
        if not config_templates:
            return None

        check_configs = []
        tags = self.get_tags(state, c_id)
        for config_tpl in config_templates:
            source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(state, c_id, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                init_config, instance = tpl
                check_configs.append((source, (check_name, init_config, instance)))

        return check_configs

    def _get_config_templates(self, identifier, **platform_kwargs):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
        else:
            auto_conf = False

        # format [(source, ('ident', {init_tpl}, {instance_tpl}))]
        raw_tpls = self.config_store.get_check_tpls(identifier, auto_conf=auto_conf, **platform_kwargs)
        for tpl in raw_tpls:
            # each template can come from either auto configuration or user-supplied templates
            try:
                source, (check_name, init_config_tpl, instance_tpl) = tpl
            except (TypeError, IndexError, ValueError):
                log.debug('No template was found for identifier %s, leaving it alone: %s' % (identifier, tpl))
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for identifier {1}.'.format(check_name, identifier))
                return None

            templates.append((source,
                              (check_name, init_config_tpl, instance_tpl, variables)))

        return templates

    def _fill_tpl(self, state, c_id, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a
           dict from template variable names and their values."""
        var_values = {}
        c_image = state.inspect_container(c_id).get('Config', {}).get('Image', '')

        # add default tags to the instance
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](state, c_id, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." % var)
                    var_values[var] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s for container %s "
                              "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s for container %s "
                          "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
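Tying _get_config_templates and _fill_tpl together, a rough sketch of how a template turns into a concrete instance. The PLACEHOLDER_REGEX pattern below is an assumption (the real one lives on AbstractSDBackend) and the resolved values are invented.

import re

PLACEHOLDER_REGEX = re.compile(r'%%.+?%%')   # assumed pattern

instance_tpl = {'host': '%%host%%', 'port': '%%port_0%%', 'tags': ['env:prod']}
variables = [v.strip('%') for v in PLACEHOLDER_REGEX.findall(str(instance_tpl))]
# e.g. ['host', 'port_0'] (order may vary); the prefix before '_' selects the
# VAR_MAPPING resolver ('host' or 'port'), which runs against the live container.

var_values = {'host': '172.17.0.2', 'port_0': '443'}  # invented resolved values
# _render_template would then substitute these into instance_tpl, yielding
# {'host': '172.17.0.2', 'port': '443', 'tags': ['env:prod', ...]}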
Example #38
0
def get_hostname(config=None):
    """
    Get the canonical host name this agent should identify as. This is
    the authoritative source of the host name for the agent.

    Tries, in order:

      * agent config (datadog.conf, "hostname:")
      * 'hostname -f' (on unix)
      * socket.gethostname()
    """
    hostname = None

    # first, try the config
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)
    config_hostname = config.get('hostname')
    if config_hostname and is_valid_hostname(config_hostname):
        return config_hostname

    # Try to get GCE instance name
    gce_hostname = GCE.get_hostname(config)
    if gce_hostname is not None:
        if is_valid_hostname(gce_hostname):
            return gce_hostname

    # Try to get the docker hostname
    if Platform.is_containerized():

        # First we try from the Docker API
        docker_util = DockerUtil()
        docker_hostname = docker_util.get_hostname(use_default_gw=False)
        if docker_hostname is not None and is_valid_hostname(docker_hostname):
            hostname = docker_hostname

        elif Platform.is_k8s():  # Let's try from the kubelet
            try:
                kube_util = KubeUtil()
            except Exception as ex:
                log.error("Couldn't instantiate the kubernetes client, "
                          "getting the k8s hostname won't work. Error: %s" %
                          str(ex))
            else:
                _, kube_hostname = kube_util.get_node_info()
                if kube_hostname is not None and is_valid_hostname(
                        kube_hostname):
                    hostname = kube_hostname

    # then move on to os-specific detection
    if hostname is None:
        if Platform.is_unix() or Platform.is_solaris():
            unix_hostname = _get_hostname_unix()
            if unix_hostname and is_valid_hostname(unix_hostname):
                hostname = unix_hostname

    # if we don't have a hostname, or we have an ec2 default hostname,
    # see if there's an instance-id available
    if not Platform.is_windows() and (hostname is None
                                      or Platform.is_ecs_instance()
                                      or EC2.is_default(hostname)):
        instanceid = EC2.get_instance_id(config)
        if instanceid:
            hostname = instanceid

    # fall back on socket.gethostname(), socket.getfqdn() is too unreliable
    if hostname is None:
        try:
            socket_hostname = socket.gethostname()
        except socket.error:
            socket_hostname = None
        if socket_hostname and is_valid_hostname(socket_hostname):
            hostname = socket_hostname

    if hostname is None:
        log.critical(
            'Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file'
        )
        raise Exception(
            'Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file'
        )

    return hostname
Example #39
0
class Kubernetes(AgentCheck):
    """ Collect metrics and events from Kubernetes """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception(
                'Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(init_config=init_config, instance=inst)

        if not self.kubeutil.init_success:
            if self.kubeutil.left_init_retries > 0:
                self.log.warning(
                    "Kubelet client failed to initialize for now, pausing the Kubernetes check."
                )
            else:
                raise Exception(
                    'Unable to initialize Kubelet client. Try setting the host parameter. The Kubernetes check failed permanently.'
                )

        if agentConfig.get('service_discovery') and \
                agentConfig.get('service_discovery_backend') == 'docker':
            self._sd_backend = get_sd_backend(agentConfig)
        else:
            self._sd_backend = None

        self.leader_candidate = inst.get(LEADER_CANDIDATE)
        if self.leader_candidate:
            self.kubeutil.refresh_leader()

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning(
                        'Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s'
                        % str(e))

            self.event_retriever = None
            self._configure_event_collection(inst)

    def _perform_kubelet_checks(self, url, instance):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            req = self.kubeutil.perform_kubelet_query(url)
            for line in req.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(
                    2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name,
                                       AgentCheck.OK,
                                       tags=instance.get('tags', []))
                else:
                    self.service_check(service_check_name,
                                       AgentCheck.CRITICAL,
                                       tags=instance.get('tags', []))
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base,
                               AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' %
                               (url, str(e)),
                               tags=instance.get('tags', []))
        else:
            if is_ok:
                self.service_check(service_check_base,
                                   AgentCheck.OK,
                                   tags=instance.get('tags', []))
            else:
                self.service_check(service_check_base,
                                   AgentCheck.CRITICAL,
                                   tags=instance.get('tags', []))

    def _configure_event_collection(self, instance):
        self._collect_events = self.kubeutil.is_leader or _is_affirmative(
            instance.get('collect_events', DEFAULT_COLLECT_EVENTS))
        if self._collect_events:
            if self.event_retriever:
                self.event_retriever.set_kinds(None)
                self.event_retriever.set_delay(None)
            else:
                self.event_retriever = self.kubeutil.get_event_retriever()
        elif self.kubeutil.collect_service_tag:
            # Only fetch service and pod events for service mapping
            event_delay = instance.get('service_tag_update_freq',
                                       DEFAULT_SERVICE_EVENT_FREQ)
            if self.event_retriever:
                self.event_retriever.set_kinds(['Service', 'Pod'])
                self.event_retriever.set_delay(event_delay)
            else:
                self.event_retriever = self.kubeutil.get_event_retriever(
                    kinds=['Service', 'Pod'], delay=event_delay)
        else:
            self.event_retriever = None

    def check(self, instance):
        if not self.kubeutil.init_success:
            if self.kubeutil.left_init_retries > 0:
                self.kubeutil.init_kubelet(instance)
                self.log.warning(
                    "Kubelet client is not initialized, Kubernetes check is paused."
                )
                return
            else:
                raise Exception(
                    "Unable to initialize Kubelet client. Try setting the host parameter. The Kubernetes check failed permanently."
                )

        # Leader election
        self.refresh_leader_status(instance)

        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges
        ]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_rates
        ]

        self.publish_aliases = _is_affirmative(
            instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(
            instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        try:
            pods_list = self.kubeutil.retrieve_pods_list()
        except:
            pods_list = None

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url, instance)

        if pods_list is not None:
            # Will not fail if cAdvisor is not available
            self._update_pods_metrics(instance, pods_list)
            # cAdvisor & kubelet metrics, will fail if port 4194 is not open
            try:
                if int(instance.get('port',
                                    KubeUtil.DEFAULT_CADVISOR_PORT)) > 0:
                    self._update_metrics(instance, pods_list)
            except ConnectionError:
                self.warning(
                    '''Can't access the cAdvisor metrics; performance metrics and'''
                    ''' limits/requests will not be collected. Please set up'''
                    ''' your kubelet with the --cadvisor-port=4194 option, or set port to 0'''
                    ''' in this check's configuration to disable cAdvisor lookup.'''
                )
            except Exception as err:
                self.log.warning(
                    "Error while getting performance metrics: %s" % str(err))

        # kubernetes events
        if self.event_retriever is not None:
            try:
                events = self.event_retriever.get_event_array()
                changed_cids = self.kubeutil.process_events(events,
                                                            podlist=pods_list)
                if (changed_cids and self._sd_backend):
                    self._sd_backend.update_checks(changed_cids)
                if events and self._collect_events:
                    self._update_kube_events(instance, pods_list, events)
            except Exception as ex:
                self.log.error("Event collection failed: %s" % str(ex))

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any(
                [fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any(
                [fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags,
                                          depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        # kube_container_name is the name of the Kubernetes container resource,
        # not the name of the docker container (that's tagged as container_name)
        kube_container_name = cont_labels[KubeUtil.CONTAINER_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))
        tags.append(u"kube_container_name:{0}".format(kube_container_name))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags
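A quick illustration of the pod-name heuristic above: everything before the last dash is taken as the replication controller name. The pod name is invented.

pod_name = 'web-2957012512-1a2b3'
replication_controller = '-'.join(pod_name.split('-')[:-1])
assert replication_controller == 'web-2957012512'
# -> tag 'kube_replication_controller:web-2957012512'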

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split(
                    "/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" %
                            (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish metrics for a subcontainer and handle filtering on tags"""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            self.log.debug("Subcontainer doesn't have a name, skipping.")
            return

        tags.append('container_name:%s' % container_name)

        container_image = self.kubeutil.image_name_resolver(
            subcontainer['spec'].get('image'))
        if container_image:
            tags.append('container_image:%s' % container_image)

            split = container_image.split(":")
            if len(split) > 2:
                # if the repo is in the image name and has the form 'docker.clearbit:5000'
                # the split will be like [repo_url, repo_port/image_name, image_tag]. Let's avoid that
                split = [':'.join(split[:-1]), split[-1]]

            tags.append('image_name:%s' % split[0])
            if len(split) == 2:
                tags.append('image_tag:%s' % split[1])

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer,
                                            kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer,
                                           kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        # if the container should be filtered we return its tags without publishing its metrics
        is_filtered = self.kubeutil.are_tags_filtered(tags)
        if is_filtered:
            self._filtered_containers.add(subcontainer['id'])
            return tags

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get(
                "spec",
            {}).get("has_filesystem") and stats.get('filesystem', []) != []:
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage']) / float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct',
                               fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS), tags)

        return tags

    def _update_metrics(self, instance, pods_list):
        def parse_quantity(s):
            number = ''
            unit = ''
            for c in s:
                if c.isdigit() or c == '.':
                    number += c
                else:
                    unit += c
            return float(number) * FACTORS.get(unit, 1)

        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_pod_tags(
            pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved cmd=%s' % self.metrics_cmd)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            if 'aliases' not in subcontainer:
                # it means the subcontainer is about a higher-level entity than a container
                continue
            try:
                tags = self._update_container_metrics(instance, subcontainer,
                                                      kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception as e:
                self.log.error(
                    "Unable to collect metrics for container: {0} ({1})".
                    format(c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list['items']:
            try:
                containers = pod['spec']['containers']
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug(
                    "Pod %s does not have container specs, skipping...",
                    pod['metadata'].get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                c_id = name2id.get(c_name)

                if c_id in self._filtered_containers:
                    self.log.debug('Container {} is excluded'.format(c_name))
                    continue

                _tags = container_tags.get(c_id, [])

                # limits
                try:
                    for limit, value_str in container['resources'][
                            'limits'].iteritems():
                        values = [
                            parse_quantity(s)
                            for s in QUANTITY_EXP.findall(value_str)
                        ]
                        if len(values) != 1:
                            self.log.warning(
                                "Error parsing limits value string: %s",
                                value_str)
                            continue
                        self.publish_gauge(
                            self, '{}.{}.limits'.format(NAMESPACE, limit),
                            values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container limits for %s: %s",
                        c_name, e)

                # requests
                try:
                    for request, value_str in container['resources'][
                            'requests'].iteritems():
                        values = [
                            parse_quantity(s)
                            for s in QUANTITY_EXP.findall(value_str)
                        ]
                        if len(values) != 1:
                            self.log.warning(
                                "Error parsing requests value string: %s",
                                value_str)
                            continue
                        self.publish_gauge(
                            self, '{}.{}.requests'.format(NAMESPACE, request),
                            values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container requests for %s: %s",
                        c_name, e)

        self._update_node(instance)

    def _update_node(self, instance):
        machine_info = self.kubeutil.retrieve_machine_info()
        num_cores = machine_info.get('num_cores', 0)
        memory_capacity = machine_info.get('memory_capacity', 0)

        tags = instance.get('tags', [])
        self.publish_gauge(self, NAMESPACE + '.cpu.capacity', float(num_cores),
                           tags)
        self.publish_gauge(self, NAMESPACE + '.memory.capacity',
                           float(memory_capacity), tags)
        # TODO(markine): Report 'allocatable' which is capacity minus capacity
        # reserved for system/Kubernetes.

    def _update_pods_metrics(self, instance, pods):
        """
        Reports the number of running pods on this node, tagged by service and creator

        We go though all the pods, extract tags then count them by tag list, sorted and
        serialized in a pipe-separated string (it is an illegar character for tags)
        """
        tags_map = defaultdict(int)
        for pod in pods['items']:
            pod_meta = pod.get('metadata', {})
            pod_tags = self.kubeutil.get_pod_creator_tags(
                pod_meta, legacy_rep_controller_tag=True)
            services = self.kubeutil.match_services_for_pod(pod_meta)
            if isinstance(services, list):
                for service in services:
                    pod_tags.append('kube_service:%s' % service)
            if 'namespace' in pod_meta:
                pod_tags.append('kube_namespace:%s' % pod_meta['namespace'])

            tags_map[frozenset(pod_tags)] += 1

        common_tags = instance.get('tags', [])
        for pod_tags, pod_count in tags_map.iteritems():
            tags = list(pod_tags)
            tags.extend(common_tags)
            self.publish_gauge(self, NAMESPACE + '.pods.running', pod_count,
                               tags)
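A minimal sketch of the grouping trick used by _update_pods_metrics: a frozenset of tags is hashable and order-insensitive, so pods carrying the same tag set increment one counter. The tag lists are invented.

from collections import defaultdict

tags_map = defaultdict(int)
pods_tags = [
    ['kube_namespace:default', 'kube_service:web'],
    ['kube_service:web', 'kube_namespace:default'],  # same set, different order
    ['kube_namespace:kube-system'],
]
for pod_tags in pods_tags:
    tags_map[frozenset(pod_tags)] += 1

assert tags_map[frozenset(['kube_service:web', 'kube_namespace:default'])] == 2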

    def _update_kube_events(self, instance, pods_list, event_items):
        """
        Process kube events and send ddog events
        The namespace filtering is done here instead of KubeEventRetriever
        to avoid interfering with service discovery
        """
        node_ip, node_name = self.kubeutil.get_node_info()
        self.log.debug('Processing events on {} [{}]'.format(
            node_name, node_ip))

        k8s_namespaces = instance.get('namespaces', DEFAULT_NAMESPACES)
        if not isinstance(k8s_namespaces, list):
            self.log.warning(
                'Configuration key "namespaces" is not a list: falling back to the default value'
            )
            k8s_namespaces = DEFAULT_NAMESPACES

        # handle old config value
        if 'namespace' in instance and instance.get('namespace') not in (
                None, 'default'):
            self.log.warning(
                '''The 'namespace' parameter is deprecated and will stop being supported starting '''
                '''from 5.13. Please use 'namespaces' and/or 'namespace_name_regexp' instead.'''
            )
            k8s_namespaces.append(instance.get('namespace'))

        if self.k8s_namespace_regexp:
            namespaces_endpoint = '{}/namespaces'.format(
                self.kubeutil.kubernetes_api_url)
            self.log.debug('Kubernetes API endpoint to query namespaces: %s' %
                           namespaces_endpoint)

            namespaces = self.kubeutil.retrieve_json_auth(
                namespaces_endpoint).json()
            for namespace in namespaces.get('items', []):
                name = namespace.get('metadata', {}).get('name', None)
                if name and self.k8s_namespace_regexp.match(name):
                    k8s_namespaces.append(name)

        k8s_namespaces = set(k8s_namespaces)

        for event in event_items:
            event_ts = calendar.timegm(
                time.strptime(event.get('lastTimestamp'),
                              '%Y-%m-%dT%H:%M:%SZ'))
            involved_obj = event.get('involvedObject', {})

            # filter events by whitelisted namespaces (an empty namespace belongs to the 'default' one)
            if involved_obj.get('namespace', 'default') not in k8s_namespaces:
                continue

            tags = self.kubeutil.extract_event_tags(event)
            tags.extend(instance.get('tags', []))

            title = '{} {} on {}'.format(involved_obj.get('name'),
                                         event.get('reason'), node_name)
            message = event.get('message')
            source = event.get('source')
            k8s_event_type = event.get('type')
            alert_type = K8S_ALERT_MAP.get(k8s_event_type, 'info')

            if source:
                message += '\nSource: {} {}\n'.format(
                    source.get('component', ''), source.get('host', ''))
            msg_body = "%%%\n{}\n```\n{}\n```\n%%%".format(title, message)
            dd_event = {
                'timestamp': event_ts,
                'host': node_ip,
                'event_type': EVENT_TYPE,
                'msg_title': title,
                'msg_text': msg_body,
                'source_type_name': EVENT_TYPE,
                'alert_type': alert_type,
                'event_object':
                'kubernetes:{}'.format(involved_obj.get('name')),
                'tags': tags,
            }
            self.event(dd_event)
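For clarity, the event timestamp conversion in isolation: lastTimestamp is a UTC string and calendar.timegm turns the parsed struct_time into epoch seconds without applying the local timezone. The sample value is invented.

import calendar
import time

last_ts = '2017-02-14T12:00:00Z'
event_ts = calendar.timegm(time.strptime(last_ts, '%Y-%m-%dT%H:%M:%SZ'))
assert event_ts == 1487073600   # epoch seconds, UTC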

    def refresh_leader_status(self, instance):
        """
        calls kubeutil.refresh_leader and compares the resulting
        leader status with the previous one.
        If it changed, update the event collection logic
        """
        if not self.leader_candidate:
            return

        leader_status = self.kubeutil.is_leader
        self.kubeutil.refresh_leader()

        # nothing changed, no-op
        if leader_status == self.kubeutil.is_leader:
            return
        # else, reset the event collection config
        else:
            self.log.info(
                "Leader status changed, updating event collection config...")
            self._configure_event_collection(instance)
Example #40
0
class TestKubeutil(unittest.TestCase):
    def setUp(self):
        self.kubeutil = KubeUtil()

    @mock.patch('utils.kubernetes.KubeUtil.retrieve_pods_list',
                side_effect=['foo'])
    @mock.patch('utils.kubernetes.KubeUtil.extract_kube_labels')
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        self.kubeutil.get_kube_labels(excluded_keys='bar')
        retrieve_pods_list.assert_called_once()
        extract_kube_labels.assert_called_once_with('foo', excluded_keys='bar')

    @mock.patch('utils.kubernetes.kubeutil.KubeUtil.get_auth_token',
                return_value=None)
    def test_init_tls_settings(self, get_tkn):
        self.assertEqual(self.kubeutil._init_tls_settings({}), {})
        self.assertEqual(
            self.kubeutil._init_tls_settings(
                {'apiserver_client_crt': 'foo.crt'}), {})

        instance = {
            'apiserver_client_crt': 'foo.crt',
            'apiserver_client_key': 'foo.key',
            'apiserver_ca_cert': 'ca.crt'
        }
        with mock.patch('utils.kubernetes.kubeutil.os.path.exists',
                        return_value=True):
            expected_res = {
                'apiserver_client_cert': ('foo.crt', 'foo.key'),
                'apiserver_cacert': 'ca.crt'
            }
            self.assertEqual(self.kubeutil._init_tls_settings(instance),
                             expected_res)

        with mock.patch('utils.kubernetes.kubeutil.os.path.exists',
                        return_value=False):
            self.assertEqual(self.kubeutil._init_tls_settings(instance), {})

    def test_extract_kube_labels(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_kube_labels({}, ['foo'])
        self.assertEqual(len(res), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 8)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 6)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)

    def test_extract_meta(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_meta({}, 'foo')
        self.assertEqual(len(res), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 6)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 4)

    @mock.patch('utils.kubernetes.kubeutil.retrieve_json')
    def test_retrieve_pods_list(self, retrieve_json):
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch('utils.kubernetes.kubeutil.retrieve_json')
    def test_retrieve_machine_info(self, retrieve_json):
        self.kubeutil.retrieve_machine_info()
        retrieve_json.assert_called_once_with(self.kubeutil.machine_info_url)

    @mock.patch('utils.kubernetes.kubeutil.retrieve_json')
    def test_retrieve_metrics(self, retrieve_json):
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.filter_pods_list({}, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.9')
        self.assertEqual(len(res.get('items')), 5)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.5')
        self.assertEqual(len(res.get('items')), 1)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

    @mock.patch('utils.kubernetes.kubeutil.requests')
    def test_retrieve_json_auth(self, r):
        instances = [
            # tls_settings, expected_params
            ({}, {
                'verify': False,
                'timeout': 10,
                'headers': None,
                'cert': None
            }),
            ({
                'bearer_token': 'foo_tok'
            }, {
                'verify': False,
                'timeout': 10,
                'headers': {
                    'Authorization': 'Bearer foo_tok'
                },
                'cert': None
            }),
            ({
                'bearer_token': 'foo_tok',
                'apiserver_client_cert': ('foo.crt', 'foo.key')
            }, {
                'verify': False,
                'timeout': 10,
                'headers': None,
                'cert': ('foo.crt', 'foo.key')
            }),
        ]

        for tls_settings, expected_params in instances:
            r.get.reset_mock()
            self.kubeutil.tls_settings = tls_settings
            self.kubeutil.retrieve_json_auth('url')
            r.get.assert_called_once_with('url', **expected_params)

        r.get.reset_mock()
        self.kubeutil.tls_settings = {'bearer_token': 'foo_tok'}
        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth('url')
        r.get.assert_called_with('url',
                                 verify=__file__,
                                 timeout=10,
                                 headers={'Authorization': 'Bearer foo_tok'},
                                 cert=None)

    def test_get_node_info(self):
        with mock.patch('utils.kubernetes.KubeUtil._fetch_host_data') as f:
            self.kubeutil._node_ip = None
            self.kubeutil._node_name = None
            self.kubeutil.get_node_info()
            f.assert_called_once()

            f.reset_mock()

            self.kubeutil._node_ip = 'foo'
            self.kubeutil._node_name = 'bar'
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, 'foo')
            self.assertEqual(name, 'bar')
            f.assert_not_called()

    def test__fetch_host_data(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        with mock.patch(
                'utils.kubernetes.KubeUtil.retrieve_pods_list') as mock_pods:
            self.kubeutil.host_name = 'dd-agent-1rxlh'
            mock_pods.return_value = json.loads(
                Fixtures.read_file("pods_list_1.2.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name,
                             'kubernetes-massi-minion-k23m')

            self.kubeutil.host_name = 'heapster-v11-l8sh1'
            mock_pods.return_value = json.loads(
                Fixtures.read_file("pods_list_1.1.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name,
                             'gke-cluster-1-8046fdfa-node-ld35')

    def test_get_auth_token(self):
        KubeUtil.AUTH_TOKEN_PATH = '/foo/bar'
        self.assertIsNone(KubeUtil.get_auth_token())
        KubeUtil.AUTH_TOKEN_PATH = Fixtures.file(
            'events.json')  # any file could do the trick
        self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        os.unsetenv('KUBERNETES_PORT')
        self.assertFalse(Platform.is_k8s())
        os.environ['KUBERNETES_PORT'] = '999'
        self.assertTrue(Platform.is_k8s())

    def test_extract_event_tags(self):
        events = json.loads(
            Fixtures.read_file("events.json", string_escape=False))['items']
        for ev in events:
            tags = KubeUtil().extract_event_tags(ev)
            # there should be 4 tags except for some events where source.host is missing
            self.assertTrue(len(tags) >= 3)

            tag_names = [tag.split(':')[0] for tag in tags]
            self.assertIn('reason', tag_names)
            self.assertIn('namespace', tag_names)
            self.assertIn('object_type', tag_names)
            if len(tags) == 4:
                self.assertIn('node_name', tag_names)
Example #41
0
def get_hostname(config=None):
    """
    Get the canonical host name this agent should identify as. This is
    the authoritative source of the host name for the agent.

    Tries, in order:

      * agent config (datadog.conf, "hostname:")
      * 'hostname -f' (on unix)
      * socket.gethostname()
    """
    hostname = None

    # first, try the config
    if config is None:
        from config import get_config
        config = get_config(parse_args=True)
    config_hostname = config.get('hostname')
    if config_hostname and is_valid_hostname(config_hostname):
        return config_hostname

    # Try to get GCE instance name
    gce_hostname = GCE.get_hostname(config)
    if gce_hostname is not None:
        if is_valid_hostname(gce_hostname):
            return gce_hostname

    # Try to get the docker hostname
    if Platform.is_containerized():

        # First we try from the Docker API
        docker_util = DockerUtil()
        docker_hostname = docker_util.get_hostname(use_default_gw=False)
        if docker_hostname is not None and is_valid_hostname(docker_hostname):
            hostname = docker_hostname

        elif Platform.is_k8s(): # Let's try from the kubelet
            kube_util = KubeUtil()
            _, kube_hostname = kube_util.get_node_info()
            if kube_hostname is not None and is_valid_hostname(kube_hostname):
                hostname = kube_hostname

    # then move on to os-specific detection
    if hostname is None:
        if Platform.is_unix() or Platform.is_solaris():
            unix_hostname = _get_hostname_unix()
            if unix_hostname and is_valid_hostname(unix_hostname):
                hostname = unix_hostname

    # if we have an ec2 default hostname, see if there's an instance-id available
    if (Platform.is_ecs_instance()) or (hostname is not None and EC2.is_default(hostname)):
        instanceid = EC2.get_instance_id(config)
        if instanceid:
            hostname = instanceid

    # fall back on socket.gethostname(), socket.getfqdn() is too unreliable
    if hostname is None:
        try:
            socket_hostname = socket.gethostname()
        except socket.error:
            socket_hostname = None
        if socket_hostname and is_valid_hostname(socket_hostname):
            hostname = socket_hostname

    if hostname is None:
        log.critical('Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file')
        raise Exception('Unable to reliably determine host name. You can define one in datadog.conf or in your hosts file')

    return hostname
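For reference, here is a condensed, runnable sketch of the fallback chain above, with stub lambdas standing in for the real config/GCE/Docker/unix lookups (the EC2 step in the real code replaces a default EC2 hostname rather than acting as a plain fallback, so it is omitted here):

def resolve_hostname(sources):
    # walk the detectors in priority order; the real code also validates
    # every candidate with is_valid_hostname()
    for _name, lookup in sources:
        candidate = lookup()
        if candidate:
            return candidate
    raise Exception('Unable to reliably determine host name.')

sources = [
    ('config', lambda: None),          # no "hostname:" entry in datadog.conf
    ('gce', lambda: None),             # not running on GCE
    ('docker/kubelet', lambda: None),  # not containerized
    ('unix', lambda: 'ip-10-0-0-1'),   # what 'hostname -f' might return
    ('socket', lambda: 'fallback.local'),
]
print(resolve_hostname(sources))       # -> ip-10-0-0-1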
Example #42
0
class KubernetesTopology(AgentCheck):
    INSTANCE_TYPE = "kubernetes"
    SERVICE_CHECK_NAME = "kubernetes.topology_information"

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(init_config=init_config, instance=inst, use_kubelet=False)

        if not self.kubeutil.init_success:
            if self.kubeutil.left_init_retries > 0:
                self.log.warning("Kubelet client failed to initialize for now; pausing the Kubernetes check.")
            else:
                raise Exception('Unable to initialize Kubelet client. Try setting the host parameter. The Kubernetes check failed permanently.')

    def check(self, instance):
        instance_key = {'type': self.INSTANCE_TYPE, 'url': self.kubeutil.kubernetes_api_root_url}
        msg = None
        status = None
        url = self.kubeutil.kubernetes_api_url

        if not url:
            raise Exception('Unable to reach kubernetes. Try setting the master_name and master_port parameters.')

        self.start_snapshot(instance_key)
        try:
            self._extract_topology(instance_key)
        except requests.exceptions.Timeout:
            msg = "%s seconds timeout when hitting %s" % (self.kubeutil.timeoutSeconds, url)
            status = AgentCheck.CRITICAL
        except Exception as e:
            self.log.warning('kubernetes topology check %s failed: %s' % (url, str(e)))
            msg = str(e)
            status = AgentCheck.CRITICAL
        finally:
            if status == AgentCheck.CRITICAL:
                self.service_check(self.SERVICE_CHECK_NAME, status, message=msg)

        self.stop_snapshot(instance_key)

    def _extract_topology(self, instance_key):
        self._extract_services(instance_key)
        self._extract_nodes(instance_key)
        self._extract_pods(instance_key)
        self._link_pods_to_services(instance_key)
        self._extract_deployments(instance_key)

    def _extract_services(self, instance_key):
        for service in self.kubeutil.retrieve_services_list()['items']:
            data = dict()
            data['type'] = service['spec']['type']
            data['namespace'] = service['metadata']['namespace']
            data['ports'] = service['spec'].get('ports', [])
            data['labels'] = self._make_labels(service['metadata'])
            if 'clusterIP' in service['spec']:
                data['cluster_ip'] = service['spec']['clusterIP']
            self.component(instance_key, service['metadata']['name'], {'name': 'KUBERNETES_SERVICE'}, data)

    def _extract_nodes(self, instance_key):
        for node in self.kubeutil.retrieve_nodes_list()['items']:
            status_addresses = node['status'].get('addresses', [])
            addresses = {item['type']: item['address'] for item in status_addresses}

            data = dict()
            data['labels'] = self._make_labels(node['metadata'])
            data['internal_ip'] = addresses.get('InternalIP', None)
            data['legacy_host_ip'] = addresses.get('LegacyHostIP', None)
            data['hostname'] = addresses.get('Hostname', None)
            data['external_ip'] = addresses.get('ExternalIP', None)

            self.component(instance_key, node['metadata']['name'], {'name': 'KUBERNETES_NODE'}, data)

    def _extract_deployments(self, instance_key):
        for deployment in self.kubeutil.retrieve_deployments_list()['items']:
            data = dict()
            externalId = "deployment: %s" % deployment['metadata']['name']
            data['namespace'] = deployment['metadata']['namespace']
            data['name'] = deployment['metadata']['name']
            data['labels'] = self._make_labels(deployment['metadata'])

            deployment_template = deployment['spec']['template']
            if deployment_template and deployment_template['metadata'].get('labels'):
                data['template_labels'] = self._make_labels(deployment_template['metadata'])
                replicasets = self.kubeutil.retrieve_replicaset_filtered_list(deployment['metadata']['namespace'], deployment_template['metadata']['labels'])
                if replicasets['items']:
                    for replicaset in replicasets['items']:
                        self.relation(instance_key, externalId, replicaset['metadata']['name'], {'name': 'CREATED'}, dict())

            self.component(instance_key, externalId, {'name': 'KUBERNETES_DEPLOYMENT'}, data)

    def _extract_pods(self, instance_key):
        replicasets_to_pods = defaultdict(list)
        replicaset_to_data = dict()
        for pod in self.kubeutil.retrieve_master_pods_list()['items']:
            data = dict()
            pod_name = pod['metadata']['name']
            data['uid'] = pod['metadata']['uid']
            data['namespace'] = pod['metadata']['namespace']
            data['labels'] = self._make_labels(pod['metadata'])

            self.component(instance_key, pod_name, {'name': 'KUBERNETES_POD'}, data)

            relation_data = dict()
            if 'nodeName' in pod['spec']:
                self.relation(instance_key, pod_name, pod['spec']['nodeName'], {'name': 'PLACED_ON'}, relation_data)

            if 'containerStatuses' in pod['status']:
                if 'nodeName' in pod['spec']:
                    pod_node_name = pod['spec']['nodeName']
                    pod_ip = pod['status'].get('podIP')
                    host_ip = pod['status'].get('hostIP')

                    self._extract_containers(instance_key, pod_name, pod_ip, host_ip, pod_node_name, pod['metadata']['namespace'], pod['status']['containerStatuses'])

            if 'ownerReferences' in pod['metadata']:
                for reference in pod['metadata']['ownerReferences']:
                    if reference['kind'] == 'ReplicaSet':
                        data = dict()
                        data['name'] = pod_name
                        replicasets_to_pods[reference['name']].append(data)
                        if reference['name'] not in replicaset_to_data:
                            replicaset_data = dict()
                            replicaset_data['labels'] = self._make_labels(pod['metadata'])
                            replicaset_data['namespace'] = pod['metadata']['namespace']
                            replicaset_to_data[reference['name']] = replicaset_data

        for replicaset_name in replicasets_to_pods:
            self.component(instance_key, replicaset_name, {'name': 'KUBERNETES_REPLICASET'}, replicaset_to_data[replicaset_name])
            for pod in replicasets_to_pods[replicaset_name]:
                self.relation(instance_key, replicaset_name, pod['name'], {'name': 'CONTROLS'}, dict())

    def _extract_containers(self, instance_key, pod_name, pod_ip, host_ip, host_name, namespace, statuses):
        for containerStatus in statuses:
            container_id = containerStatus['containerID']
            data = dict()
            data['pod_ip'] = pod_ip
            data['host_ip'] = host_ip
            data['namespace'] = namespace
            data['labels'] = ["namespace:%s" % namespace]
            data['docker'] = {
                'image': containerStatus['image'],
                'container_id': container_id
            }
            self.component(instance_key, container_id, {'name': 'KUBERNETES_CONTAINER'}, data)

            relation_data = dict()
            self.relation(instance_key, pod_name, container_id, {'name': 'CONSISTS_OF'}, relation_data)
            self.relation(instance_key, container_id, host_name, {'name': 'HOSTED_ON'}, relation_data)

    def _link_pods_to_services(self, instance_key):
        for endpoint in self.kubeutil.retrieve_endpoints_list()['items']:
            service_name = endpoint['metadata']['name']
            if 'subsets' in endpoint:
                for subset in endpoint['subsets']:
                    if 'addresses' in subset:
                        for address in subset['addresses']:
                            if 'targetRef' in address and address['targetRef']['kind'] == 'Pod':
                                data = dict()
                                pod_name = address['targetRef']['name']
                                self.relation(instance_key, service_name, pod_name, {'name': 'EXPOSES'}, data)

    @staticmethod
    def extract_metadata_labels(metadata):
        """
        Extract labels from metadata section coming from the kubelet API.
        """
        kube_labels = defaultdict(list)
        name = metadata.get("name")
        namespace = metadata.get("namespace")
        labels = metadata.get("labels")
        if name and labels:
            if namespace:
                key = "%s/%s" % (namespace, name)
            else:
                key = name

            for k, v in labels.iteritems():
                kube_labels[key].append(u"%s:%s" % (k, v))

        return kube_labels

    def _make_labels(self, metadata):
        original_labels = self._flatten_dict(KubernetesTopology.extract_metadata_labels(metadata=metadata))
        if 'namespace' in metadata:
            original_labels.append("namespace:%s" % metadata['namespace'])
        return original_labels

    def _flatten_dict(self, dict_of_list):
        from itertools import chain
        return sorted(set(chain.from_iterable(dict_of_list.itervalues())))
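A quick worked example of what _make_labels produces for one pod, using a hand-written metadata payload (the field names match what the check reads from the API; the values are hypothetical):

from itertools import chain

metadata = {
    'name': 'nginx-2035384211-7ci7o',
    'namespace': 'default',
    'labels': {'app': 'nginx', 'pod-template-hash': '2035384211'},
}

# extract_metadata_labels keys the tag list by "namespace/name"
key = '%s/%s' % (metadata['namespace'], metadata['name'])
kube_labels = {key: ['%s:%s' % (k, v) for k, v in metadata['labels'].items()]}

# _flatten_dict merges the per-pod lists, dedupes and sorts them;
# _make_labels then appends the namespace tag
flat = sorted(set(chain.from_iterable(kube_labels.values())))
flat.append('namespace:%s' % metadata['namespace'])
print(flat)
# ['app:nginx', 'pod-template-hash:2035384211', 'namespace:default']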
Example #43
0
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.kubelet_api_url:
            raise Exception('Unable to reach kubelet. Try setting the host parameter.')

        if agentConfig.get('service_discovery') and \
           agentConfig.get('service_discovery_backend') == 'docker':
            self._sd_backend = get_sd_backend(agentConfig)
        else:
            self._sd_backend = None

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning('Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s' % str(e))

            self._collect_events = _is_affirmative(inst.get('collect_events', DEFAULT_COLLECT_EVENTS))
            if self._collect_events:
                self.event_retriever = self.kubeutil.get_event_retriever()
            elif self.kubeutil.collect_service_tag:
                # Only fetch service and pod events for service mapping
                event_delay = inst.get('service_tag_update_freq', DEFAULT_SERVICE_EVENT_FREQ)
                self.event_retriever = self.kubeutil.get_event_retriever(kinds=['Service', 'Pod'],
                                                                         delay=event_delay)
            else:
                self.event_retriever = None
        else:
            self._collect_events = None
            self.event_retriever = None

    def _perform_kubelet_checks(self, url):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            req = self.kubeutil.perform_kubelet_query(url)
            for line in req.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if 'hostname' in line:
                    continue

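                # kubelet healthz lines look like '[+]ping ok' or '[-]syncloop failed';
                # capture the status flag and the check name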
                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' % (url, str(e)))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        try:
            pods_list = self.kubeutil.retrieve_pods_list()
        except Exception:
            pods_list = None

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        if pods_list is not None:
            # kubelet metrics
            self._update_metrics(instance, pods_list)

        # kubelet events
        if self.event_retriever is not None:
            try:
                events = self.event_retriever.get_event_array()
                changed_cids = self.kubeutil.process_events(events, podlist=pods_list)
                if (changed_cids and self._sd_backend):
                    self._sd_backend.update_checks(changed_cids)
                if events and self._collect_events:
                    self._update_kube_events(instance, pods_list, events)
            except Exception as ex:
                self.log.error("Event collection failed: %s" % str(ex))

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any(fnmatch(metric, pat) for pat in self.enabled_rates):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any(fnmatch(metric, pat) for pat in self.enabled_gauges):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
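            # e.g. 'web-2891696001-abcde' -> 'web-2891696001'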
            tags.append("kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split("/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" % (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish metrics for a subcontainer and handle filtering on tags"""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        container_image = subcontainer['spec'].get('image')
        if container_image:
            tags.append('container_image:%s' % container_image)

            split = container_image.split(":")
            if len(split) > 2:
                # if the repo is in the image name and has the form 'docker.clearbit:5000'
                # the split will be like [repo_url, repo_port/image_name, image_tag]. Let's avoid that
                split = [':'.join(split[:-1]), split[-1]]

            tags.append('image_name:%s' % split[0])
            if len(split) == 2:
                tags.append('image_tag:%s' % split[1])

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer, kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer, kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        # if the container should be filtered we return its tags without publishing its metrics
        is_filtered = self.kubeutil.are_tags_filtered(tags)
        if is_filtered:
            self._filtered_containers.add(subcontainer['id'])
            return tags

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem") and stats.get('filesystem', []) != []:
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage'])/float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS),
                              tags)

        return tags

    def _update_metrics(self, instance, pods_list):
        def parse_quantity(s):
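            # split a quantity string into its numeric part and unit suffix,
            # then scale, e.g. parse_quantity('64Mi') == 64 * FACTORS.get('Mi', 1)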
            number = ''
            unit = ''
            for c in s:
                if c.isdigit() or c == '.':
                    number += c
                else:
                    unit += c
            return float(number) * FACTORS.get(unit, 1)

        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_pod_tags(pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved cmd=%s' % self.metrics_cmd)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            try:
                tags = self._update_container_metrics(instance, subcontainer, kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception as e:
                self.log.error("Unable to collect metrics for container: {0} ({1})".format(c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list['items']:
            try:
                containers = pod['spec']['containers']
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug("Pod %s does not have containers specs, skipping...", pod['metadata'].get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                c_id = name2id.get(c_name)

                if c_id in self._filtered_containers:
                    self.log.debug('Container {} is excluded'.format(c_name))
                    continue

                _tags = container_tags.get(c_id, [])

                # limits
                try:
                    for limit, value_str in container['resources']['limits'].iteritems():
                        values = [parse_quantity(s) for s in QUANTITY_EXP.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing limits value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.limits'.format(NAMESPACE, limit), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug("Unable to retrieve container limits for %s: %s", c_name, e)

                # requests
                try:
                    for request, value_str in container['resources']['requests'].iteritems():
                        values = [parse_quantity(s) for s in QUANTITY_EXP.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing requests value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.requests'.format(NAMESPACE, request), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug("Unable to retrieve container requests for %s: %s", c_name, e)

        self._update_pods_metrics(instance, pods_list)
        self._update_node(instance)

    def _update_node(self, instance):
        machine_info = self.kubeutil.retrieve_machine_info()
        num_cores = machine_info.get('num_cores', 0)
        memory_capacity = machine_info.get('memory_capacity', 0)

        tags = instance.get('tags', [])
        self.publish_gauge(self, NAMESPACE + '.cpu.capacity', float(num_cores), tags)
        self.publish_gauge(self, NAMESPACE + '.memory.capacity', float(memory_capacity), tags)
        # TODO(markine): Report 'allocatable' which is capacity minus capacity
        # reserved for system/Kubernetes.

    def _update_pods_metrics(self, instance, pods):
        """
        Reports the number of running pods, tagged by service and creator.

        We go through all the pods, extract their tags, then count pods per
        unique tag set, keyed by a frozenset of the tags.
        """
        tags_map = defaultdict(int)
        for pod in pods['items']:
            pod_meta = pod.get('metadata', {})
            pod_tags = self.kubeutil.get_pod_creator_tags(pod_meta, legacy_rep_controller_tag=True)
            services = self.kubeutil.match_services_for_pod(pod_meta)
            if isinstance(services, list):
                for service in services:
                    pod_tags.append('kube_service:%s' % service)
            if 'namespace' in pod_meta:
                pod_tags.append('kube_namespace:%s' % pod_meta['namespace'])

            tags_map[frozenset(pod_tags)] += 1

        common_tags = instance.get('tags', [])
        for pod_tags, pod_count in tags_map.iteritems():
            tags = list(pod_tags)
            tags.extend(common_tags)
            self.publish_gauge(self, NAMESPACE + '.pods.running', pod_count, tags)

    def _update_kube_events(self, instance, pods_list, event_items):
        """
        Process kube events and send ddog events
        The namespace filtering is done here instead of KubeEventRetriever
        to avoid interfering with service discovery
        """
        node_ip, node_name = self.kubeutil.get_node_info()
        self.log.debug('Processing events on {} [{}]'.format(node_name, node_ip))

        k8s_namespaces = instance.get('namespaces', DEFAULT_NAMESPACES)
        if not isinstance(k8s_namespaces, list):
            self.log.warning('Configuration key "namespaces" is not a list: fallback to the default value')
            k8s_namespaces = DEFAULT_NAMESPACES

        # handle old config value
        if 'namespace' in instance and instance.get('namespace') not in (None, 'default'):
            self.log.warning('''The 'namespace' parameter is deprecated and will stop being supported starting '''
                             '''from 5.13. Please use 'namespaces' and/or 'namespace_name_regexp' instead.''')
            k8s_namespaces.append(instance.get('namespace'))

        if self.k8s_namespace_regexp:
            namespaces_endpoint = '{}/namespaces'.format(self.kubeutil.kubernetes_api_url)
            self.log.debug('Kubernetes API endpoint to query namespaces: %s' % namespaces_endpoint)

            namespaces = self.kubeutil.retrieve_json_auth(namespaces_endpoint)
            for namespace in namespaces.get('items', []):
                name = namespace.get('metadata', {}).get('name', None)
                if name and self.k8s_namespace_regexp.match(name):
                    k8s_namespaces.append(name)

        k8s_namespaces = set(k8s_namespaces)

        for event in event_items:
            event_ts = calendar.timegm(time.strptime(event.get('lastTimestamp'), '%Y-%m-%dT%H:%M:%SZ'))
            involved_obj = event.get('involvedObject', {})

            # filter events by whitelisted namespaces (an empty namespace belongs to 'default')
            if involved_obj.get('namespace', 'default') not in k8s_namespaces:
                continue

            tags = self.kubeutil.extract_event_tags(event)
            tags.extend(instance.get('tags', []))

            title = '{} {} on {}'.format(involved_obj.get('name'), event.get('reason'), node_name)
            message = event.get('message', '')
            source = event.get('source')
            if source:
                message += '\nSource: {} {}\n'.format(source.get('component', ''), source.get('host', ''))
            msg_body = "%%%\n{}\n```\n{}\n```\n%%%".format(title, message)
            dd_event = {
                'timestamp': event_ts,
                'host': node_ip,
                'event_type': EVENT_TYPE,
                'msg_title': title,
                'msg_text': msg_body,
                'source_type_name': EVENT_TYPE,
                'event_object': 'kubernetes:{}'.format(involved_obj.get('name')),
                'tags': tags,
            }
            self.event(dd_event)
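The pod counting in _update_pods_metrics works because frozensets compare by content, so pods carrying the same tags in any order land in the same bucket. A minimal standalone illustration with hypothetical tag lists:

from collections import defaultdict

pods = [
    ['kube_namespace:default', 'kube_service:web'],
    ['kube_service:web', 'kube_namespace:default'],   # same set, different order
    ['kube_namespace:kube-system', 'kube_service:dns'],
]

tags_map = defaultdict(int)
for pod_tags in pods:
    tags_map[frozenset(pod_tags)] += 1

for tag_set, count in tags_map.items():
    print('{} {}'.format(sorted(tag_set), count))
# ['kube_namespace:default', 'kube_service:web'] 2
# ['kube_namespace:kube-system', 'kube_service:dns'] 1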
Example #45
0
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception("Docker check only supports one configured instance.")
        AgentCheck.__init__(self, name, init_config,
                            agentConfig, instances=instances)

        self.init_success = False
        self._service_discovery = agentConfig.get('service_discovery') and \
            agentConfig.get('service_discovery_backend') == 'docker'
        self.init()

    def init(self):
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()

            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            if Platform.is_k8s():
                self.kubeutil = KubeUtil()

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names


            # get the health check whitelist
            self.whitelist_patterns = set()
            health_scs_whitelist = instance.get('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True

    def check(self, instance):
        """Run the Docker check for one instance."""
        if not self.init_success:
            # Initialization can fail if cgroups are not ready. So we retry if needed
            # https://github.com/DataDog/dd-agent/issues/1896
            self.init()
            if not self.init_success:
                # Initialization failed, will try later
                return

        # Report image metrics
        if self.collect_image_stats:
            self._count_and_weigh_images()

        if self.collect_ecs_tags:
            self.refresh_ecs_tags()

        if Platform.is_k8s():
            try:
                self.kube_labels = self.kubeutil.get_kube_labels()
            except Exception as e:
                self.log.warning('Could not retrieve kubernetes labels: %s' % str(e))
                self.kube_labels = {}

        # containers running with custom cgroups?
        custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False))

        # Get the list of containers and the index of their names
        health_service_checks = bool(self.whitelist_patterns)
        containers_by_id = self._get_and_count_containers(custom_cgroups, health_service_checks)
        containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups)

        # Send events from Docker API
        if self.collect_events or self._service_discovery:
            self._process_events(containers_by_id)

        # Report performance container metrics (cpu, mem, net, io)
        self._report_performance_metrics(containers_by_id)

        if self.collect_container_size:
            self._report_container_size(containers_by_id)

        # Collect disk stats from Docker info command
        if self.collect_disk_stats:
            self._report_disk_stats()

        if health_service_checks:
            self._send_container_healthcheck_sc(containers_by_id)

    def _count_and_weigh_images(self):
        try:
            tags = self._get_tags()
            active_images = self.docker_client.images(all=False)
            active_images_len = len(active_images)
            all_images_len = len(self.docker_client.images(quiet=True, all=True))
            self.gauge("docker.images.available", active_images_len, tags=tags)
            self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags)

            if self.collect_image_size:
                self._report_image_size(active_images)

        except Exception as e:
            # It's not an important metric, keep going if it fails
            self.warning("Failed to count Docker images. Exception: {0}".format(e))

    def _get_and_count_containers(self, custom_cgroups=False, healthchecks=False):
        """List all the containers from the API, filter and count them."""

        # Querying the size of containers is slow, we don't do it at each run
        must_query_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE
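        # e.g. with SIZE_REFRESH_RATE == 10, container sizes are queried once every 10 runs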

        running_containers_count = Counter()
        all_containers_count = Counter()

        try:
            containers = self.docker_client.containers(all=True, size=must_query_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               message=message)
            raise Exception(message)

        else:
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

        # Create a set of filtered containers based on the exclude/include rules
        # and cache these rules in docker_util
        self._filter_containers(containers)

        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            container_status_tags = self._get_tags(container, CONTAINER)

            all_containers_count[tuple(sorted(container_status_tags))] += 1
            if self._is_container_running(container):
                running_containers_count[tuple(sorted(container_status_tags))] += 1

            # Check if the container is included/excluded via its tags
            if self._is_container_excluded(container):
                self.log.debug("Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            # grab pid via API if custom cgroups - otherwise we won't find process when
            # crawling for pids.
            if custom_cgroups or healthchecks:
                try:
                    inspect_dict = self.docker_client.inspect_container(container_name)
                    container['_pid'] = inspect_dict['State']['Pid']
                    container['health'] = inspect_dict['State'].get('Health', {})
                except Exception as e:
                    self.log.debug("Unable to inspect Docker container: %s", e)


        for tags, count in running_containers_count.iteritems():
            self.gauge("docker.containers.running", count, tags=list(tags))

        for tags, count in all_containers_count.iteritems():
            stopped_count = count - running_containers_count[tags]
            self.gauge("docker.containers.stopped", stopped_count, tags=list(tags))

        return containers_by_id

    def _is_container_running(self, container):
        """Tell if a container is running, according to its status.

        There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated.
        See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35
        """
        return container["Status"].startswith("Up") or container["Status"].startswith("Restarting")

    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names."""
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)

        if entity is not None:
            pod_name = None

            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k in labels:
                        v = labels[k]
                        if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                            pod_name = v
                            k = "pod_name"
                            if "-" in pod_name:
                                replication_controller = "-".join(pod_name.split("-")[:-1])
                                if "/" in replication_controller: # k8s <= 1.1
                                    namespace, replication_controller = replication_controller.split("/", 1)

                                elif KubeUtil.NAMESPACE_LABEL in labels: # k8s >= 1.2
                                    namespace = labels[KubeUtil.NAMESPACE_LABEL]
                                    pod_name = "{0}/{1}".format(namespace, pod_name)

                                tags.append("kube_namespace:%s" % namespace)
                                tags.append("kube_replication_controller:%s" % replication_controller)
                                tags.append("pod_name:%s" % pod_name)

                        elif not v:
                            tags.append(k)

                        else:
                            tags.append("%s:%s" % (k,v))

                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels:
                        tags.append("pod_name:no_pod")

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add ECS tags
            if self.collect_ecs_tags:
                entity_id = entity.get("Id")
                if entity_id in self.ecs_tags:
                    ecs_tags = self.ecs_tags[entity_id]
                    tags.extend(ecs_tags)

            # Add kube labels
            if Platform.is_k8s():
                kube_tags = self.kube_labels.get(pod_name)
                if kube_tags:
                    tags.extend(list(kube_tags))

        return tags

    def _extract_tag_value(self, entity, tag_name):
        """Extract tag information from the API result (containers or images).
        Cache extracted tags inside the entity object.
        """
        if tag_name not in TAG_EXTRACTORS:
            self.warning("{0} isn't a supported tag".format(tag_name))
            return

        # Check for already extracted tags
        if "_tag_values" not in entity:
            entity["_tag_values"] = {}

        if tag_name not in entity["_tag_values"]:
            entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity)

        return entity["_tag_values"][tag_name]

    def refresh_ecs_tags(self):
        ecs_config = self.docker_client.inspect_container('ecs-agent')
        ip = ecs_config.get('NetworkSettings', {}).get('IPAddress')
        ports = ecs_config.get('NetworkSettings', {}).get('Ports')
        port = ports.keys()[0].split('/')[0] if ports else None
        if not ip:
            port = ECS_INTROSPECT_DEFAULT_PORT
            if Platform.is_containerized() and self.docker_gateway:
                ip = self.docker_gateway
            else:
                ip = "localhost"

        ecs_tags = {}
        try:
            if ip and port:
                tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json()
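                # tasks looks roughly like (hypothetical values):
                # {'Tasks': [{'Family': 'my-app', 'Version': '3',
                #             'Containers': [{'DockerId': 'abc123'}]}]}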
                for task in tasks.get('Tasks', []):
                    for container in task.get('Containers', []):
                        tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']]
                        ecs_tags[container['DockerId']] = tags
        except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e:
            self.log.warning("Unable to collect ECS task names: %s" % e)

        self.ecs_tags = ecs_tags

    def _filter_containers(self, containers):
        if not self.docker_util.filtering_enabled:
            return

        self._filtered_containers = set()
        for container in containers:
            container_tags = self._get_tags(container, FILTERED)
            # exclude/include patterns are stored in docker_util to share them with other container-related checks
            if self.docker_util.are_tags_filtered(container_tags):
                container_name = DockerUtil.container_name_extractor(container)[0]
                self._filtered_containers.add(container_name)
                self.log.debug("Container {0} is filtered".format(container_name))

    def _is_container_excluded(self, container):
        """Check if a container is excluded according to the filter rules.

        Requires _filter_containers to run first.
        """
        container_name = DockerUtil.container_name_extractor(container)[0]
        return container_name in self._filtered_containers

    def _report_container_size(self, containers_by_id):
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            m_func = FUNC_MAP[GAUGE][self.use_histogram]
            if "SizeRw" in container:

                m_func(self, 'docker.container.size_rw', container['SizeRw'],
                       tags=tags)
            if "SizeRootFs" in container:
                m_func(
                    self, 'docker.container.size_rootfs', container['SizeRootFs'],
                    tags=tags)

    def _send_container_healthcheck_sc(self, containers_by_id):
        """Send health service checks for containers."""
        for container in containers_by_id.itervalues():
            healthcheck_tags = self._get_tags(container, HEALTHCHECK)
            match = False
            for tag in healthcheck_tags:
                for rule in self.whitelist_patterns:
                    if re.match(rule, tag):
                        match = True

                        self._submit_healthcheck_sc(container)
                        break

                if match:
                    break

    def _submit_healthcheck_sc(self, container):
        health = container.get('health', {})
        status = AgentCheck.UNKNOWN
        if health:
            _health = health.get('Status', '')
            if _health == 'unhealthy':
                status = AgentCheck.CRITICAL
            elif _health == 'healthy':
                status = AgentCheck.OK

        tags = self._get_tags(container, CONTAINER)
        self.service_check(HEALTHCHECK_SERVICE_CHECK_NAME, status, tags=tags)

    def _report_image_size(self, images):
        for image in images:
            tags = self._get_tags(image, IMAGE)
            if 'VirtualSize' in image:
                self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags)
            if 'Size' in image:
                self.gauge('docker.image.size', image['Size'], tags=tags)

    # Performance metrics

    def _report_performance_metrics(self, containers_by_id):

        containers_without_proc_root = []
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container) or not self._is_container_running(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            self._report_cgroup_metrics(container, tags)
            if "_proc_root" not in container:
                containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0])
                continue
            self._report_net_metrics(container, tags)

        if containers_without_proc_root:
            message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
                ", ".join(containers_without_proc_root))
            if not Platform.is_k8s():
                self.warning(message)
            else:
                # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
                self.log.debug(message)

    def _report_cgroup_metrics(self, container, tags):
        try:
            for cgroup in CGROUP_METRICS:
                stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file'])
                stats = self._parse_cgroup_file(stat_file)
                if stats:
                    for key, (dd_key, metric_func) in cgroup['metrics'].iteritems():
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if key in stats:
                            metric_func(self, dd_key, int(stats[key]), tags=tags)

                    # Computed metrics
                    for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems():
                        values = [stats[key] for key in key_list if key in stats]
                        if len(values) != len(key_list):
                            self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname))
                            continue
                        value = fct(*values)
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if value is not None:
                            metric_func(self, mname, value, tags=tags)

        except MountException as ex:
            if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES:
                raise ex
            else:
                self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now. "
                             "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries))
                self.cgroup_listing_retries += 1
        else:
            self.cgroup_listing_retries = 0

    def _report_net_metrics(self, container, tags):
        """Find container network metrics by looking at /proc/$PID/net/dev of the container process."""
        if self._disable_net_metrics:
            self.log.debug("Network metrics are disabled. Skipping")
            return

        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')
        try:
            with open(proc_net_file, 'r') as fp:
                lines = fp.readlines()
                """The first two lines are headers:
                Inter-|   Receive                                                |  Transmit
                 face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
                """
                for l in lines[2:]:
                    cols = l.split(':', 1)
                    interface_name = str(cols[0]).strip()
                    if interface_name == 'eth0':
                        x = cols[1].split()
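                        # per the header above: x[0] is bytes received, x[8] is bytes sent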
                        m_func = FUNC_MAP[RATE][self.use_histogram]
                        m_func(self, "docker.net.bytes_rcvd", long(x[0]), tags)
                        m_func(self, "docker.net.bytes_sent", long(x[8]), tags)
                        break
        except Exception as e:
            # It is possible that the container got stopped between the API call and now
            self.warning("Failed to report network metrics from file {0}. Exception: {1}".format(proc_net_file, e))

    def _process_events(self, containers_by_id):
        if self.collect_events is False:
            # Crawl events for service discovery only
            self._get_events()
            return
        try:
            api_events = self._get_events()
            aggregated_events = self._pre_aggregate_events(api_events, containers_by_id)
            events = self._format_events(aggregated_events, containers_by_id)
        except (socket.timeout, urllib2.URLError):
            self.warning('Timeout when collecting events. Events will be missing.')
            return
        except Exception as e:
            self.warning("Unexpected exception when collecting events: {0}. "
                         "Events will be missing".format(e))
            return

        for ev in events:
            self.log.debug("Creating event: %s" % ev['msg_title'])
            self.event(ev)

    def _get_events(self):
        """Get the list of events."""
        events, changed_container_ids = self.docker_util.get_events()
        if changed_container_ids and self._service_discovery:
            get_sd_backend(self.agentConfig).update_checks(changed_container_ids)
        return events

    def _pre_aggregate_events(self, api_events, containers_by_id):
        # Aggregate events, one per image. Put newer events first.
        events = defaultdict(deque)
        for event in api_events:
            # Skip events related to filtered containers
            container = containers_by_id.get(event.get('id'))
            if container is not None and self._is_container_excluded(container):
                self.log.debug("Excluded event: container {0} status changed to {1}".format(
                    event['id'], event['status']))
                continue
            # from may be missing (for network events for example)
            if 'from' in event:
                events[event['from']].appendleft(event)
        return events

    def _format_events(self, aggregated_events, containers_by_id):
        events = []
        for image_name, event_group in aggregated_events.iteritems():
            container_tags = set()
            low_prio_events = []
            normal_prio_events = []

            for event in event_group:
                container_name = event['id'][:11]

                if event['id'] in containers_by_id:
                    cont = containers_by_id[event['id']]
                    container_name = DockerUtil.container_name_extractor(cont)[0]
                    container_tags.update(self._get_tags(cont, PERFORMANCE))
                    container_tags.add('container_name:%s' % container_name)

                # health checks generate tons of these so we treat them separately and lower their priority
                if event['status'].startswith('exec_create:') or event['status'].startswith('exec_start:'):
                    low_prio_events.append((event, container_name))
                else:
                    normal_prio_events.append((event, container_name))

            exec_event = self._create_dd_event(low_prio_events, image_name, container_tags, priority='Low')
            if exec_event:
                events.append(exec_event)

            normal_event = self._create_dd_event(normal_prio_events, image_name, container_tags, priority='Normal')
            if normal_event:
                events.append(normal_event)

        return events

    def _create_dd_event(self, events, image, c_tags, priority='Normal'):
        """Create the actual event to submit from a list of similar docker events"""
        if not events:
            return

        max_timestamp = 0
        status = defaultdict(int)
        status_change = []

        for ev, c_name in events:
            max_timestamp = max(max_timestamp, int(ev['time']))
            status[ev['status']] += 1
            status_change.append([c_name, ev['status']])

        status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()])
        msg_title = "%s %s on %s" % (image, status_text, self.hostname)
        msg_body = (
            "%%%\n"
            "{image_name} {status} on {hostname}\n"
            "```\n{status_changes}\n```\n"
            "%%%"
        ).format(
            image_name=image,
            status=status_text,
            hostname=self.hostname,
            status_changes="\n".join(
                ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change])
        )

        if any(error in status_text for error in ERROR_ALERT_TYPE):
            alert_type = "error"
        else:
            alert_type = None

        return {
            'timestamp': max_timestamp,
            'host': self.hostname,
            'event_type': EVENT_TYPE,
            'msg_title': msg_title,
            'msg_text': msg_body,
            'source_type_name': EVENT_TYPE,
            'event_object': 'docker:%s' % image,
            'tags': list(c_tags),
            'alert_type': alert_type,
            'priority': priority
        }


    def _report_disk_stats(self):
        """Report metrics about the volume space usage"""
        stats = {
            'docker.data.used': None,
            'docker.data.total': None,
            'docker.data.free': None,
            'docker.metadata.used': None,
            'docker.metadata.total': None,
            'docker.metadata.free': None
            # these two are calculated by _calc_percent_disk_stats
            # 'docker.data.percent': None,
            # 'docker.metadata.percent': None
        }
        info = self.docker_client.info()
        driver_status = info.get('DriverStatus', [])
        if not driver_status:
            self.log.warning('Disk metrics collection is enabled but docker info did not'
                             ' report any disk metrics. Your storage driver may not support them; skipping.')
            return
        for metric in driver_status:
            # only consider metrics about disk space
            if len(metric) == 2 and 'Space' in metric[0]:
                # identify Data and Metadata metrics
                mtype = 'data'
                if 'Metadata' in metric[0]:
                    mtype = 'metadata'

                if 'Used' in metric[0]:
                    stats['docker.{0}.used'.format(mtype)] = metric[1]
                elif 'Space Total' in metric[0]:
                    stats['docker.{0}.total'.format(mtype)] = metric[1]
                elif 'Space Available' in metric[0]:
                    stats['docker.{0}.free'.format(mtype)] = metric[1]
        stats = self._format_disk_metrics(stats)
        stats.update(self._calc_percent_disk_stats(stats))
        tags = self._get_tags()
        for name, val in stats.iteritems():
            if val is not None:
                self.gauge(name, val, tags)

    def _format_disk_metrics(self, metrics):
        """Cast the disk stats to float and convert them to bytes"""
        for name, raw_val in metrics.iteritems():
            if raw_val:
                val, unit = raw_val.split(' ')
                # units come back inconsistently cased, so normalize to lowercase
                unit = unit.lower()
                try:
                    val = int(float(val) * UNIT_MAP[unit])
                    metrics[name] = val
                except KeyError:
                    self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' % (unit, name))
                    metrics[name] = None
        return metrics

    def _calc_percent_disk_stats(self, stats):
        """Calculate a percentage of used disk space for data and metadata"""
        mtypes = ['data', 'metadata']
        percs = {}
        for mtype in mtypes:
            used = stats.get('docker.{0}.used'.format(mtype))
            total = stats.get('docker.{0}.total'.format(mtype))
            free = stats.get('docker.{0}.free'.format(mtype))
            if used and total and free and ceil(total) < free + used:
                self.log.debug('used, free, and total disk metrics may be wrong, '
                               'used: %s, free: %s, total: %s',
                               used, free, total)
                total = used + free
            try:
                if isinstance(used, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2)
                elif isinstance(free, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2)
            except ZeroDivisionError:
                self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent'
                               ' is not possible.'.format(mtype, mtype))
        return percs

    # Cgroups
    def _get_cgroup_from_proc(self, cgroup, pid, filename):
        """Find a specific cgroup file, containing metrics to extract."""
        params = {
            "file": filename,
        }
        return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % (params)

    def _parse_cgroup_file(self, stat_file):
        """Parse a cgroup pseudo file for key/values."""
        self.log.debug("Opening cgroup file: %s" % stat_file)
        try:
            with open(stat_file, 'r') as fp:
                if 'blkio' in stat_file:
                    return self._parse_blkio_metrics(fp.read().splitlines())
                else:
                    return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines()))
        except IOError:
            # It is possible that the container got stopped between the API call and now.
            # Some files can also be missing (like cpu.stat) and that's fine.
            self.log.info("Can't open %s. Some metrics for this container may be missing." % stat_file)

    def _parse_blkio_metrics(self, stats):
        """Parse the blkio metrics."""
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics

    def _is_container_cgroup(self, line, selinux_policy):
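        # `line` is a /proc/<pid>/cgroup entry split on ':': [hierarchy_id, subsystems, path]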
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]: # general case
            return True
        if 'docker' in selinux_policy: # selinux
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]): # kubernetes
            return True
        return False

    # proc files
    def _crawl_container_pids(self, container_dict, custom_cgroups=False):
        """Crawl `/proc` to find container PIDs and add them to `containers_by_id`."""
        proc_path = os.path.join(self.docker_util._docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]

        if len(pid_dirs) == 0:
            self.warning("Unable to find any pid directory in {0}. "
                "If you are running the agent in a container, make sure to "
                'share the volume properly: "/proc:/host/proc:ro". '
                "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. "
                "Network metrics will be missing".format(proc_path))
            self._disable_net_metrics = True
            return container_dict

        self._disable_net_metrics = False

        for folder in pid_dirs:

            try:
                path = os.path.join(proc_path, folder, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]

                selinux_policy = ''
                path = os.path.join(proc_path, folder, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError as e:
                #  Issue #2074
                self.log.debug("Cannot read %s, "
                               "process likely raced to finish: %s" %
                               (path, str(e)))
                continue
            except Exception as e:
                self.warning("Cannot read %s : %s" % (path, str(e)))
                continue

            try:
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
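                # for/else: the else branch runs only when the loop finished
                # without finding a container cgroup line (no break)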
                else:
                    continue

                matches = re.findall(CONTAINER_ID_RE, cpuacct)
                if matches:
                    container_id = matches[-1]
                    if container_id not in container_dict:
                        self.log.debug("Container %s not in container_dict, it's likely excluded", container_id)
                        continue
                    container_dict[container_id]['_pid'] = folder
                    container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder)
                elif custom_cgroups: # fall back to matching by pid; O(n) scan over containers
                    for _, container in container_dict.iteritems():
                        if container.get('_pid') == int(folder):
                            container['_proc_root'] = os.path.join(proc_path, folder)
                            break

            except Exception as e:
                self.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue
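
The disk stats above (devicemapper's DriverStatus entries) arrive as strings such as "11.8 GB"; _format_disk_metrics converts them to bytes and _calc_percent_disk_stats derives usage percentages from them. A minimal standalone sketch of that conversion, assuming a simplified UNIT_MAP (the real constant is defined elsewhere in the check):

# Hypothetical stand-in for the check's UNIT_MAP constant.
UNIT_MAP = {'b': 1, 'kb': 1000, 'mb': 1000 ** 2, 'gb': 1000 ** 3, 'tb': 1000 ** 4}

def to_bytes(raw_val):
    """Convert a `docker info` value like '11.8 GB' to an integer byte count."""
    val, unit = raw_val.split(' ')
    return int(float(val) * UNIT_MAP[unit.lower()])

def percent_used(used, total):
    """Used-space percentage, rounded the same way as the check."""
    return round(100 * float(used) / float(total), 2)

used = to_bytes('11.8 GB')     # 11800000000
total = to_bytes('107.4 GB')   # 107400000000
print(percent_used(used, total))  # 10.99
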
Example #46
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""

    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.docker_client = DockerUtil(config_store=self.config_store).client
        if Platform.is_k8s():
            self.kubeutil = KubeUtil()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)

    def update_checks(self, changed_containers):
        conf_reload_set = set()
        for id_ in changed_containers:
            try:
                inspect = self.docker_client.inspect_container(id_)
            except (NullResource, NotFound):
                # if the container was removed we can't tell which check is concerned
                # so we have to reload everything
                self.reload_check_configs = True
                return

            checks = self._get_checks_from_inspect(inspect)
            conf_reload_set.update(set(checks))

        if conf_reload_set:
            self.reload_check_configs = conf_reload_set

    def _get_checks_from_inspect(self, inspect):
        """Get the list of checks applied to a container from the identifier_to_checks cache in the config store.
        Use the DATADOG_ID label or the image."""
        identifier = inspect.get('Config', {}).get('Labels', {}).get(DATADOG_ID) or \
            inspect.get('Config', {}).get('Image')
        annotations = (self._get_kube_config(inspect.get('Id'), 'metadata') or {}).get('annotations') if Platform.is_k8s() else None

        return self.config_store.get_checks_to_refresh(identifier, kube_annotations=annotations)

    def _get_host_address(self, c_inspect, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_id, c_img = c_inspect.get('Id', ''), c_inspect.get('Config', {}).get('Image', '')

        networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
        ip_dict = {}
        for net_name, net_desc in networks.iteritems():
            ip = net_desc.get('IPAddress')
            if ip:
                ip_dict[net_name] = ip
        ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
        if ip_addr:
            return ip_addr

        # try to get the bridge (default) IP address
        log.debug("No IP address was found in container %s (%s) "
            "networks, trying with the IPAddress field" % (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if Platform.is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
            for pod in pod_list:
                pod_ip = pod.get('status', {}).get('podIP')
                if pod_ip is None:
                    continue
                else:
                    c_statuses = pod.get('status', {}).get('containerStatuses', [])
                    for status in c_statuses:
                        # compare the container id with those of containers in the current pod
                        if c_id == status.get('containerID', '').split('//')[-1]:
                            return pod_ip

        log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_', 1)

        # no specifier
        if len(tpl_parts) < 2:
            log.warning("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        else:
            res = ip_dict.get(tpl_parts[-1])
            if res is None:
                log.warning("The key passed for template variable %s was not found." % tpl_var)
                return self._get_fallback_ip(ip_dict)
            else:
                return res

    def _get_fallback_ip(self, ip_dict):
        """try to pick the bridge key, falls back to the value of the last key"""
        if 'bridge' in ip_dict:
            log.warning("Using the bridge network.")
            return ip_dict['bridge']
        else:
            last_key = sorted(ip_dict.iterkeys())[-1]
            log.warning("Trying with the last (sorted) network: '%s'." % last_key)
            return ip_dict[last_key]

    def _get_port(self, container_inspect, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        c_id = container_inspect.get('Id', '')

        try:
            ports = map(lambda x: x.split('/')[0], container_inspect['NetworkSettings']['Ports'].keys())
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            ports = map(lambda x: x.split('/')[0], container_inspect['Config'].get('ExposedPorts', {}).keys())

            # if it failed, try with the kubernetes API
            if not ports and Platform.is_k8s():
                log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                          (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
                co_statuses = self._get_kube_config(c_id, 'status').get('containerStatuses', [])
                c_name = None
                for co in co_statuses:
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                containers = self._get_kube_config(c_id, 'spec').get('containers', [])
                for co in containers:
                    if co.get('name') == c_name:
                        ports = map(lambda x: str(x.get('containerPort')), co.get('ports', []))
        ports = sorted(ports, key=lambda x: int(x))
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None

        tpl_parts = tpl_var.split('_', 1)

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error("Port index is not an integer. Using the last element instead.")
        except IndexError:
            log.error("Port index is out of range. Using the last element instead.")
        return ports[-1]

    def get_tags(self, c_inspect):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if Platform.is_k8s():
            pod_metadata = self._get_kube_config(c_inspect.get('Id'), 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_inspect.get('Id', '')[:12])
                return []
            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            if created_by.get('reference', {}).get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % created_by.get('reference', {}).get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

        return tags

    def _get_additional_tags(self, container_inspect, *args):
        tags = []
        if Platform.is_k8s():
            pod_metadata = self._get_kube_config(container_inspect.get('Id'), 'metadata')
            pod_spec = self._get_kube_config(container_inspect.get('Id'), 'spec')
            if pod_metadata is None or pod_spec is None:
                log.warning("Failed to fetch pod metadata or pod spec for container %s."
                            " Additional Kubernetes tags may be missing." % container_inspect.get('Id', '')[:12])
                return []
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API"""
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})

    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        containers = [(
            container.get('Image'),
            container.get('Id'), container.get('Labels')
        ) for container in self.docker_client.containers()]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        for image, cid, labels in containers:
            try:
                # value of the DATADOG_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(cid, identifier, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf

                    check_name, init_config, instance = conf
                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if trace_config:
                            configs[check_name] = (source, (init_config, [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {}. ' \
                            'Keeping the first one found.'
                        if trace_config:
                            if configs[check_name][1][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1][1].append(instance)
                        else:
                            if configs[check_name][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service '
                              'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Look for a DATADOG_ID label, return its value or the image name if missing"""
        return labels.get(DATADOG_ID) or image

    def _get_check_configs(self, c_id, identifier, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        inspect = self.docker_client.inspect_container(c_id)
        annotations = (self._get_kube_config(inspect.get('Id'), 'metadata') or {}).get('annotations') if Platform.is_k8s() else None
        config_templates = self._get_config_templates(identifier, trace_config=trace_config, kube_annotations=annotations)
        if not config_templates:
            log.debug('No config template for container %s with identifier %s. '
                      'It will be left unconfigured.' % (c_id[:12], identifier))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                init_config, instance = tpl
                if trace_config:
                    check_configs.append((source, (check_name, init_config, instance)))
                else:
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, identifier, trace_config=False, kube_annotations=None):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning('No supported configuration backend was provided, using auto-config only.')
        else:
            auto_conf = False

        # format: [('ident', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('ident', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(
            identifier, auto_conf=auto_conf, trace_config=trace_config, kube_annotations=kube_annotations)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug('No template was found for identifier %s, leaving it alone.' % identifier)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' via service discovery failed for identifier {1}.'.format(check_name, identifier))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl, instance_tpl, variables)))
            else:
                templates.append((check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a
           dict from template variable names and their values."""
        var_values = {}
        c_id, c_image = inspect.get('Id', ''), inspect.get('Config', {}).get('Image', '')

        # add default tags to the instance
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](inspect, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." % var)
                    var_values[var] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s for container %s "
                              "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s for container %s "
                          "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
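
The template-filling machinery above hinges on two steps: PLACEHOLDER_REGEX pulls %%var%% markers out of the raw templates, and the prefix before the first underscore selects a resolver in VAR_MAPPING (the suffix, if any, acts as a key or index). A small sketch of both steps, assuming dd-agent's %%...%% marker syntax:

import re

# Assumed equivalent of the class-level PLACEHOLDER_REGEX.
PLACEHOLDER_REGEX = re.compile(r'%%.+?%%')

instance_tpl = '{"server": "%%host%%", "port": "%%port_1%%"}'
variables = [v.strip('%') for v in PLACEHOLDER_REGEX.findall(instance_tpl)]
print(variables)  # ['host', 'port_1']

# Dispatch on the prefix, like VAR_MAPPING, and use the suffix as an index
# the way _extract_port_from_list does.
ports = ['6379', '8080']
for var in variables:
    prefix, _, suffix = var.partition('_')
    if prefix == 'port':
        print(ports[int(suffix)] if suffix else ports[-1])  # '8080'
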
Example #47
    def init(self):
        try:
            instance = self.instances[0]

            # Getting custom tags for service checks when docker is down
            self.custom_tags = instance.get("tags", [])

            self.docker_util = DockerUtil()
            if not self.docker_util.client:
                raise Exception("Failed to initialize Docker client.")

            self.docker_gateway = DockerUtil.get_gateway()
            self.metadata_collector = MetadataCollector()

            self.kubeutil = None
            if Platform.is_k8s():
                try:
                    self.kubeutil = KubeUtil()
                except Exception as ex:
                    self.log.error("Couldn't instantiate the kubernetes client, "
                                   "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))

            # We configure the check with the right cgroup settings for this host
            # Just needs to be done once
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            # collect_labels_as_tags is legacy and only tags docker metrics.
            # It is replaced by docker_labels_as_tags in config.cfg;
            # we keep this for backward compatibility.
            if "collect_labels_as_tags" in instance:
                self.collect_labels_as_tags = instance.get("collect_labels_as_tags")

            self.kube_pod_tags = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if self.docker_util.filtering_enabled:
                self.tag_names[FILTERED] = self.docker_util.filtered_tag_names

            # Container network mapping cache
            self.network_mappings = {}

            # get the health check whitelist
            self.whitelist_patterns = None
            health_scs_whitelist = instance.get('health_service_check_whitelist', [])
            if health_scs_whitelist:
                patterns, whitelist_tags = compile_filter_rules(health_scs_whitelist)
                self.whitelist_patterns = set(patterns)
                self.tag_names[HEALTHCHECK] = set(whitelist_tags)

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_container_count = _is_affirmative(instance.get('collect_container_count', False))
            self.collect_volume_count = _is_affirmative(instance.get('collect_volume_count', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.event_attributes_as_tags = instance.get('event_attributes_as_tags', [])
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_exit_codes = _is_affirmative(instance.get('collect_exit_codes', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.filtered_event_types = tuple(instance.get("filtered_event_types", DEFAULT_FILTERED_EVENT_TYPES))

            self.capped_metrics = instance.get('capped_metrics')

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True
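
Most of the flags above go through _is_affirmative so that config values like "yes", "true" or 1 all enable a feature. A rough sketch of the coercion this assumes (the real helper lives in the agent's utils, so treat this as illustrative):

def _is_affirmative(value):
    # Accept real booleans/ints as well as common truthy strings from config.
    if value is None:
        return False
    return str(value).lower() in ('yes', 'true', '1')

print(_is_affirmative('yes'))  # True
print(_is_affirmative(True))   # True
print(_is_affirmative('0'))    # False
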
Example #48
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""
    def __init__(self, agentConfig):
        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.docker_client = DockerUtil(config_store=self.config_store).client
        if Platform.is_k8s():
            self.kubeutil = KubeUtil()

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)

    def _make_fetch_state(self):
        pod_list = []
        if Platform.is_k8s():
            try:
                pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
            except Exception as ex:
                log.warning("Failed to retrieve pod list: %s" % str(ex))
        return _SDDockerBackendConfigFetchState(
            self.docker_client.inspect_container, pod_list)

    def update_checks(self, changed_containers):
        state = self._make_fetch_state()

        conf_reload_set = set()
        for c_id in changed_containers:
            checks = self._get_checks_to_refresh(state, c_id)
            if checks:
                conf_reload_set.update(set(checks))

        if conf_reload_set:
            self.reload_check_configs = conf_reload_set

    def _get_checks_to_refresh(self, state, c_id):
        """Get the list of checks applied to a container from the identifier_to_checks cache in the config store.
        Use the DATADOG_ID label or the image."""
        inspect = state.inspect_container(c_id)

        # If the container was removed we can't tell which check is concerned
        # so we have to reload everything.
        # Same thing if it's stopped and we're on Kubernetes in auto_conf mode
        # because the pod was deleted and its template could have been in the annotations.
        if not inspect or \
                (not inspect.get('State', {}).get('Running')
                    and Platform.is_k8s() and not self.agentConfig.get('sd_config_backend')):
            self.reload_check_configs = True
            return

        identifier = inspect.get('Config', {}).get('Labels', {}).get(DATADOG_ID) or \
            inspect.get('Config', {}).get('Image')

        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_annotations': kube_metadata.get('annotations'),
                'kube_container_name': state.get_kube_container_name(c_id),
            }

        return self.config_store.get_checks_to_refresh(identifier,
                                                       **platform_kwargs)

    def _get_host_address(self, state, c_id, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_inspect = state.inspect_container(c_id)
        c_id, c_img = c_inspect.get('Id', ''), c_inspect.get('Config', {}).get('Image', '')

        networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
        ip_dict = {}
        for net_name, net_desc in networks.iteritems():
            ip = net_desc.get('IPAddress')
            if ip:
                ip_dict[net_name] = ip
        ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
        if ip_addr:
            return ip_addr

        # try to get the bridge (default) IP address
        log.debug("No IP address was found in container %s (%s) "
                  "networks, trying with the IPAddress field" %
                  (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if Platform.is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_ip = state.get_kube_config(c_id, 'status').get('podIP')
            if pod_ip:
                return pod_ip

        log.error("No IP address was found for container %s (%s)" %
                  (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_', 1)

        # no specifier
        if len(tpl_parts) < 2:
            log.debug("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        else:
            res = ip_dict.get(tpl_parts[-1])
            if res is None:
                log.warning(
                    "The key passed for template variable %s was not found." %
                    tpl_var)
                return self._get_fallback_ip(ip_dict)
            else:
                return res

    def _get_fallback_ip(self, ip_dict):
        """try to pick the bridge key, falls back to the value of the last key"""
        if 'bridge' in ip_dict:
            log.debug("Using the bridge network.")
            return ip_dict['bridge']
        else:
            last_key = sorted(ip_dict.iterkeys())[-1]
            log.debug("Trying with the last (sorted) network: '%s'." %
                      last_key)
            return ip_dict[last_key]

    def _get_port(self, state, c_id, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        container_inspect = state.inspect_container(c_id)

        try:
            ports = map(lambda x: x.split('/')[0],
                        container_inspect['NetworkSettings']['Ports'].keys())
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            ports = map(
                lambda x: x.split('/')[0],
                container_inspect['Config'].get('ExposedPorts', {}).keys())

            # if it failed, try with the kubernetes API
            if not ports and Platform.is_k8s():
                log.debug(
                    "Didn't find the port for container %s (%s), trying the kubernetes way."
                    % (c_id[:12], container_inspect.get('Config', {}).get(
                        'Image', '')))
                spec = state.get_kube_container_spec(c_id)
                if spec:
                    ports = [
                        str(x.get('containerPort'))
                        for x in spec.get('ports', [])
                    ]
        ports = sorted(ports, key=int)
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None

        tpl_parts = tpl_var.split('_', 1)

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error(
                "Port index is not an integer. Using the last element instead."
            )
        except IndexError:
            log.error(
                "Port index is out of range. Using the last element instead.")
        return ports[-1]

    def get_tags(self, state, c_id):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if Platform.is_k8s():
            pod_metadata = state.get_kube_config(c_id, 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_id[:12])
                return []

            # get pod labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get kubernetes namespace
            namespace = pod_metadata.get('namespace')
            tags.append('kube_namespace:%s' % namespace)

            # get created-by
            created_by = json.loads(
                pod_metadata.get('annotations',
                                 {}).get('kubernetes.io/created-by', '{}'))
            creator_kind = created_by.get('reference', {}).get('kind')
            creator_name = created_by.get('reference', {}).get('name')

            # add creator tags
            if creator_name:
                if creator_kind == 'ReplicationController':
                    tags.append('kube_replication_controller:%s' %
                                creator_name)
                elif creator_kind == 'DaemonSet':
                    tags.append('kube_daemon_set:%s' % creator_name)
                elif creator_kind == 'ReplicaSet':
                    tags.append('kube_replica_set:%s' % creator_name)
            else:
                log.debug(
                    'creator-name for pod %s is empty, this should not happen'
                    % pod_metadata.get('name'))

            # FIXME haissam: for service and deployment we need to store a list of these guys
            # that we query from the apiserver and to compare their selectors with the pod labels.
            # For service it's straight forward.
            # For deployment we only need to do it if the pod creator is a ReplicaSet.
            # Details: https://kubernetes.io/docs/user-guide/deployments/#selector

        elif Platform.is_swarm():
            c_labels = state.inspect_container(c_id).get('Config', {}).get('Labels', {})
            swarm_svc = c_labels.get(SWARM_SVC_LABEL)
            if swarm_svc:
                tags.append('swarm_service:%s' % swarm_svc)

        return tags

    def _get_additional_tags(self, state, c_id, *args):
        tags = []
        if Platform.is_k8s():
            pod_metadata = state.get_kube_config(c_id, 'metadata')
            pod_spec = state.get_kube_config(c_id, 'spec')
            if pod_metadata is None or pod_spec is None:
                log.warning(
                    "Failed to fetch pod metadata or pod spec for container %s."
                    " Additional Kubernetes tags may be missing." % c_id[:12])
                return []
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        state = self._make_fetch_state()
        containers = [(container.get('Image'), container.get('Id'),
                       container.get('Labels'))
                      for container in self.docker_client.containers()]

        for image, cid, labels in containers:
            try:
                # value of the DATADOG_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(
                    state, cid, identifier) or []
                for conf in check_configs:
                    source, (check_name, init_config, instance) = conf

                    # build instances list if needed
                    if configs.get(check_name) is None:
                        configs[check_name] = (source, (init_config,
                                                        [instance]))
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {}. ' \
                            'Keeping the first one found.'
                        if configs[check_name][1][0] != init_config:
                            log.warning(conflict_init_msg.format(check_name))
                        configs[check_name][1][1].append(instance)
            except Exception:
                log.exception(
                    'Building config for container %s based on image %s using service '
                    'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Look for a DATADOG_ID label, return its value or the image name if missing"""
        return labels.get(DATADOG_ID) or image

    def _get_check_configs(self, state, c_id, identifier):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        platform_kwargs = {}
        if Platform.is_k8s():
            kube_metadata = state.get_kube_config(c_id, 'metadata') or {}
            platform_kwargs = {
                'kube_container_name': state.get_kube_container_name(c_id),
                'kube_annotations': kube_metadata.get('annotations'),
            }
        config_templates = self._get_config_templates(identifier,
                                                      **platform_kwargs)
        if not config_templates:
            return None

        check_configs = []
        tags = self.get_tags(state, c_id)
        for config_tpl in config_templates:
            source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(state, c_id,
                                                      instance_tpl, variables,
                                                      tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl
                                        or {}, var_values)
            if tpl and len(tpl) == 2:
                init_config, instance = tpl
                check_configs.append(
                    (source, (check_name, init_config, instance)))

        return check_configs

    def _get_config_templates(self, identifier, **platform_kwargs):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
        else:
            auto_conf = False

        # format [(source, ('ident', {init_tpl}, {instance_tpl}))]
        raw_tpls = self.config_store.get_check_tpls(identifier,
                                                    auto_conf=auto_conf,
                                                    **platform_kwargs)
        for tpl in raw_tpls:
            # each template can come from either auto configuration or user-supplied templates
            try:
                source, (check_name, init_config_tpl, instance_tpl) = tpl
            except (TypeError, IndexError, ValueError):
                log.debug(
                    'No template was found for identifier %s, leaving it alone: %s'
                    % (identifier, tpl))
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception(
                    'Failed to decode the JSON template fetched for check {0}. Its configuration'
                    ' via service discovery failed for identifier {1}.'.format(
                        check_name, identifier))
                return None

            templates.append((source, (check_name, init_config_tpl,
                                       instance_tpl, variables)))

        return templates

    def _fill_tpl(self, state, c_id, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a
           dict from template variable names and their values."""
        var_values = {}
        c_image = state.inspect_container(c_id).get('Config', {}).get('Image', '')

        # add default tags to the instance
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](state, c_id, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." %
                                         var)
                    var_values[var] = res
                except Exception as ex:
                    log.error(
                        "Could not find a value for the template variable %s for container %s "
                        "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error(
                    "No method was found to interpolate template variable %s for container %s "
                    "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
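
The %%host%% resolution above degrades gracefully: the network named in the template variable first, then the default bridge network, then the last network in sorted order. A standalone sketch of that fallback (hypothetical helper name, mirroring _get_fallback_ip):

def pick_fallback_ip(ip_dict):
    # Prefer the default 'bridge' network, otherwise take the last sorted key.
    if 'bridge' in ip_dict:
        return ip_dict['bridge']
    return ip_dict[sorted(ip_dict)[-1]]

print(pick_fallback_ip({'bridge': '172.17.0.2', 'mynet': '10.0.0.5'}))  # 172.17.0.2
print(pick_fallback_ip({'net-a': '10.0.0.5', 'net-b': '10.0.1.7'}))     # 10.0.1.7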