Example #1
class TestKubeutil(unittest.TestCase):
    def setUp(self):
        self.kubeutil = KubeUtil()

    @mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list',
                side_effect=['foo'])
    @mock.patch('utils.kubeutil.KubeUtil.extract_kube_labels')
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        self.kubeutil.get_kube_labels(excluded_keys='bar')
        retrieve_pods_list.assert_called_once()
        extract_kube_labels.assert_called_once_with('foo', excluded_keys='bar')

    def test_extract_kube_labels(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_kube_labels({}, ['foo'])
        self.assertEqual(len(res), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 8)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 6)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)

    def test_extract_meta(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_meta({}, 'foo')
        self.assertEqual(len(res), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 6)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 4)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_pods_list(self, retrieve_json):
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_metrics(self, retrieve_json):
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.filter_pods_list({}, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.9')
        self.assertEqual(len(res.get('items')), 5)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.5')
        self.assertEqual(len(res.get('items')), 1)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

    @mock.patch('utils.kubeutil.requests')
    def test_retrieve_json_auth(self, r):
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_once_with(
            'url',
            verify=False,
            timeout=10,
            headers={'Authorization': 'Bearer foo_tok'})

        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_with('url',
                                 verify=__file__,
                                 timeout=10,
                                 headers={'Authorization': 'Bearer foo_tok'})

    def test_get_node_info(self):
        with mock.patch('utils.kubeutil.KubeUtil._fetch_host_data') as f:
            self.kubeutil.get_node_info()
            f.assert_called_once()

            f.reset_mock()

            self.kubeutil._node_ip = 'foo'
            self.kubeutil._node_name = 'bar'
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, 'foo')
            self.assertEqual(name, 'bar')
            f.assert_not_called()

    def test__fetch_host_data(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        with mock.patch(
                'utils.kubeutil.KubeUtil.retrieve_pods_list') as mock_pods:
            self.kubeutil.host_name = 'dd-agent-1rxlh'
            mock_pods.return_value = json.loads(
                Fixtures.read_file("pods_list_1.2.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name,
                             'kubernetes-massi-minion-k23m')

            self.kubeutil.host_name = 'heapster-v11-l8sh1'
            mock_pods.return_value = json.loads(
                Fixtures.read_file("pods_list_1.1.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name,
                             'gke-cluster-1-8046fdfa-node-ld35')

    def test_get_auth_token(self):
        KubeUtil.AUTH_TOKEN_PATH = '/foo/bar'
        self.assertIsNone(KubeUtil.get_auth_token())
        KubeUtil.AUTH_TOKEN_PATH = Fixtures.file(
            'events.json')  # any file could do the trick
        self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        os.environ.pop('KUBERNETES_PORT', None)  # os.unsetenv doesn't update os.environ, so clear the mapping directly
        self.assertFalse(Platform.is_k8s())
        os.environ['KUBERNETES_PORT'] = '999'
        self.assertTrue(Platform.is_k8s())

    def test_extract_event_tags(self):
        events = json.loads(
            Fixtures.read_file("events.json", string_escape=False))['items']
        for ev in events:
            tags = KubeUtil().extract_event_tags(ev)
            # each event should yield 4 tags, except those missing source.host which yield only 3
            self.assertTrue(len(tags) >= 3)

            tag_names = [tag.split(':')[0] for tag in tags]
            self.assertIn('reason', tag_names)
            self.assertIn('namespace', tag_names)
            self.assertIn('object_type', tag_names)
            if len(tags) == 4:
                self.assertIn('node_name', tag_names)
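
A note on the mock wiring used throughout these tests: stacked mock.patch decorators apply bottom-up, so the patch closest to the test method arrives as the first mock argument, and side_effect=['foo'] makes the patched method return 'foo' on its first call. A minimal, self-contained sketch of that convention (the Calc class is hypothetical, not part of the code under test):

import mock
import unittest

class Calc(object):
    def add(self, a, b):
        return a + b

    def mul(self, a, b):
        return a * b

class TestDecoratorOrder(unittest.TestCase):
    # The bottom-most patch becomes the first mock argument,
    # mirroring test_get_kube_labels above.
    @mock.patch.object(Calc, 'add', return_value=3)
    @mock.patch.object(Calc, 'mul', side_effect=[6])
    def test_order(self, mul, add):
        c = Calc()
        self.assertEqual(c.mul(2, 3), 6)  # first call consumes the side_effect
        self.assertEqual(c.add(1, 2), 3)
        mul.assert_called_once_with(2, 3)  # the mock is called without self
        add.assert_called_once_with(1, 2)
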
Example #2
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.host:
            raise Exception('Unable to retrieve Docker hostname and host parameter is not set')

    def _perform_kubelet_checks(self, url):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            r = requests.get(url)
            for line in r.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' % (url, str(e)))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):

        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

        pods_list = self.kubeutil.retrieve_pods_list()

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance, pods_list)

        # kubelet events
        if _is_affirmative(instance.get('collect_events', DEFAULT_COLLECT_EVENTS)):
            self._process_events(instance, pods_list)

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any([fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any([fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split("/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" % (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer, kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer, kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage'])/float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS),
                              tags)

        return tags

    def _update_metrics(self, instance, pods_list):
        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved from %s' % self.kubeutil.metrics_url)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            try:
                tags = self._update_container_metrics(instance, subcontainer, kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception as e:
                self.log.error("Unable to collect metrics for container: {0} ({1})".format(c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list['items']:
            try:
                containers = pod['spec']['containers']
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug("Pod %s does not have containers specs, skipping...", pod['metadata'].get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                _tags = container_tags.get(name2id.get(c_name), [])

                prog = re.compile(r'[-+]?\d+[\.]?\d*')

                # limits
                try:
                    for limit, value_str in container['resources']['limits'].iteritems():
                        values = [float(s) for s in prog.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing limits value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.limits'.format(NAMESPACE, limit), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug("Unable to retrieve container limits for %s: %s", c_name, e)
                    self.log.debug("Container object for {}: {}".format(c_name, container))

                # requests
                try:
                    for request, value_str in container['resources']['requests'].iteritems():
                        values = [float(s) for s in prog.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing requests value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.requests'.format(NAMESPACE, request), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.error("Unable to retrieve container requests for %s: %s", c_name, e)
                    self.log.debug("Container object for {}: {}".format(c_name, container))

        self._update_pods_metrics(instance, pods_list)
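
The limits and requests parsing above relies on the regex [-+]?\d+[\.]?\d* to pull a single number out of a Kubernetes quantity string, silently dropping unit suffixes such as m or Mi. A small standalone sketch of that behavior, using made-up sample values:

import re

# same pattern as the limits/requests loops above
prog = re.compile(r'[-+]?\d+[\.]?\d*')

for value_str in ('0.5', '100m', '256Mi'):
    values = [float(s) for s in prog.findall(value_str)]
    # e.g. '100m' -> [100.0]: the millicore suffix is discarded, so the
    # published gauge carries the raw number, not a normalized quantity
    print('%s -> %s' % (value_str, values))
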
Example #3
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""

    def __init__(self, agentConfig):
        self.docker_client = DockerUtil().client
        if is_k8s():
            self.kubeutil = KubeUtil()

        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.VAR_MAPPING = {
            'host': self._get_host,
            'port': self._get_ports,
            'tags': self._get_additional_tags,
        }
        AbstractSDBackend.__init__(self, agentConfig)

    def _get_host(self, container_inspect):
        """Extract the host IP from a docker inspect object, or the kubelet API."""
        ip_addr = container_inspect.get('NetworkSettings', {}).get('IPAddress')
        if not ip_addr:
            if not is_k8s():
                return
            # kubernetes case
            log.debug("Didn't find the IP address for container %s (%s), using the kubernetes way." %
                      (container_inspect.get('Id', '')[:12], container_inspect.get('Config', {}).get('Image', '')))
            pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
            c_id = container_inspect.get('Id')
            for pod in pod_list:
                pod_ip = pod.get('status', {}).get('podIP')
                if pod_ip is None:
                    continue
                else:
                    c_statuses = pod.get('status', {}).get('containerStatuses', [])
                    for status in c_statuses:
                        # compare the container id with those of containers in the current pod
                        if c_id == status.get('containerID', '').split('//')[-1]:
                            ip_addr = pod_ip

        return ip_addr

    def _get_ports(self, container_inspect):
        """Extract a list of available ports from a docker inspect object. Sort them numerically."""
        c_id = container_inspect.get('Id', '')
        try:
            ports = map(lambda x: x.split('/')[0], container_inspect['NetworkSettings']['Ports'].keys())
        except (IndexError, KeyError, AttributeError):
            log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                      (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
            # first we try to get it from the docker API
            # it works if the image has an EXPOSE instruction
            ports = map(lambda x: x.split('/')[0], container_inspect['Config'].get('ExposedPorts', {}).keys())
            # if it failed, try with the kubernetes API
            if not ports and is_k8s():
                co_statuses = self._get_kube_config(c_id, 'status').get('containerStatuses', [])
                c_name = None
                for co in co_statuses:
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                containers = self._get_kube_config(c_id, 'spec').get('containers', [])
                for co in containers:
                    if co.get('name') == c_name:
                        ports = map(lambda x: str(x.get('containerPort')), co.get('ports', []))
        ports = sorted(ports, key=lambda x: int(x))
        return ports

    def get_tags(self, c_inspect):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(c_inspect.get('Id'), 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_inspect.get('Id', '')[:12])
                return []
            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            if created_by.get('reference', {}).get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % created_by.get('reference', {}).get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

        return tags

    def _get_additional_tags(self, container_inspect):
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(container_inspect.get('Id'), 'metadata')
            pod_spec = self._get_kube_config(container_inspect.get('Id'), 'spec')
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API"""
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})

    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        containers = [(
            container.get('Image').split(':')[0].split('/')[-1],
            container.get('Id'), container.get('Labels')
        ) for container in self.docker_client.containers()]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        for image, cid, labels in containers:
            try:
                check_configs = self._get_check_configs(cid, image, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf

                    check_name, init_config, instance = conf
                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if trace_config:
                            configs[check_name] = (source, (init_config, [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {0}. ' \
                            'Keeping the first one found.'
                        if trace_config:
                            if configs[check_name][1][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1][1].append(instance)
                        else:
                            if configs[check_name][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service'
                              ' discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def _get_check_configs(self, c_id, image, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        inspect = self.docker_client.inspect_container(c_id)
        config_templates = self._get_config_templates(image, trace_config=trace_config)
        if not config_templates:
            log.debug('No config template for container %s with image %s. '
                      'It will be left unconfigured.' % (c_id[:12], image))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                if trace_config and len(tpl[1]) == 2:
                    source, (init_config, instance) = tpl
                    check_configs.append((source, (check_name, init_config, instance)))
                elif not trace_config:
                    init_config, instance = tpl
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, image_name, trace_config=False):
        """Extract config templates for an image from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning('No supported configuration backend was provided, using auto-config only.')
        else:
            auto_conf = False

        # format: [('image', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('image', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(image_name, auto_conf=auto_conf, trace_config=trace_config)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug('No template was found for image %s, leaving it alone.' % image_name)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for {1}.'.format(check_name, image_name))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl, instance_tpl, variables)))
            else:
                templates.append((check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a """
        """dict from template variable names and their values."""
        var_values = {}

        # add default tags to the instance
        if tags:
            tags += instance_tpl.get('tags', [])
            instance_tpl['tags'] = list(set(tags))

        for v in variables:
            # variables can be suffixed with an index in case a list is found
            var_parts = v.split('_')
            if var_parts[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var_parts[0]](inspect)
                    if not res:
                        raise ValueError("Invalid value for variable %s." % var_parts[0])
                    # if an index is found in the variable, use it to select a value
                    if len(var_parts) > 1 and isinstance(res, list) and int(var_parts[-1]) < len(res):
                        var_values[v] = res[int(var_parts[-1])]
                    # if no valid index was found but we have a list, return the last element
                    elif isinstance(res, list):
                        var_values[v] = res[-1]
                    else:
                        var_values[v] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s: %s" % (v, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s." % v)

        return instance_tpl, var_values
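
The VAR_MAPPING/_fill_tpl machinery above resolves template variables such as port or port_0: a numeric suffix indexes into the list returned by the mapped getter, and without a valid index the last element wins. A toy re-implementation under hypothetical data (the lambda stands in for _get_ports):

def resolve(var, var_mapping, inspect):
    # mirrors the suffix handling in _fill_tpl above
    var_parts = var.split('_')
    res = var_mapping[var_parts[0]](inspect)
    if len(var_parts) > 1 and isinstance(res, list) and int(var_parts[-1]) < len(res):
        return res[int(var_parts[-1])]
    if isinstance(res, list):
        return res[-1]  # no valid index: fall back to the last element
    return res

var_mapping = {'port': lambda inspect: ['8080', '9090']}
print(resolve('port_0', var_mapping, {}))  # 8080
print(resolve('port', var_mapping, {}))    # 9090
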
Example #4
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""

    def __init__(self, agentConfig):
        self.docker_client = DockerUtil().client
        if is_k8s():
            self.kubeutil = KubeUtil()

        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Auto-config only will be used. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.VAR_MAPPING = {
            'host': self._get_host_address,
            'port': self._get_port,
            'tags': self._get_additional_tags,
        }

        AbstractSDBackend.__init__(self, agentConfig)

    def _get_host_address(self, c_inspect, tpl_var):
        """Extract the container IP from a docker inspect object, or the kubelet API."""
        c_id, c_img = c_inspect.get('Id', ''), c_inspect.get('Config', {}).get('Image', '')
        tpl_parts = tpl_var.split('_')

        # a specifier was given
        if len(tpl_parts) > 1:
            networks = c_inspect.get('NetworkSettings', {}).get('Networks') or {}
            ip_dict = {}
            for net_name, net_desc in networks.iteritems():
                ip = net_desc.get('IPAddress')
                if ip:
                    ip_dict[net_name] = ip
            ip_addr = self._extract_ip_from_networks(ip_dict, tpl_var)
            if ip_addr:
                return ip_addr

        # try to get the bridge IP address
        log.debug("No network found for container %s (%s), trying with IPAddress field" % (c_id[:12], c_img))
        ip_addr = c_inspect.get('NetworkSettings', {}).get('IPAddress')
        if ip_addr:
            return ip_addr

        if is_k8s():
            # kubernetes case
            log.debug("Couldn't find the IP address for container %s (%s), "
                      "using the kubernetes way." % (c_id[:12], c_img))
            pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
            for pod in pod_list:
                pod_ip = pod.get('status', {}).get('podIP')
                if pod_ip is None:
                    continue
                else:
                    c_statuses = pod.get('status', {}).get('containerStatuses', [])
                    for status in c_statuses:
                        # compare the container id with those of containers in the current pod
                        if c_id == status.get('containerID', '').split('//')[-1]:
                            return pod_ip

        log.error("No IP address was found for container %s (%s)" % (c_id[:12], c_img))
        return None

    def _extract_ip_from_networks(self, ip_dict, tpl_var):
        """Extract a single IP from a dictionary made of network names and IPs."""
        if not ip_dict:
            return None
        tpl_parts = tpl_var.split('_')

        # no specifier
        if len(tpl_parts) < 2:
            log.warning("No key was passed for template variable %s." % tpl_var)
            return self._get_fallback_ip(ip_dict)
        else:
            res = ip_dict.get(tpl_parts[-1])
            if res is None:
                log.warning("The key passed for template variable %s was not found." % tpl_var)
                return self._get_fallback_ip(ip_dict)
            else:
                return res

    def _get_fallback_ip(self, ip_dict):
        """try to pick the bridge key, falls back to the value of the last key"""
        if 'bridge' in ip_dict:
            log.warning("Using the bridge network.")
            return ip_dict['bridge']
        else:
            last_key = sorted(ip_dict.iterkeys())[-1]
            log.warning("Trying with the last key: '%s'." % last_key)
            return ip_dict[last_key]

    def _get_port(self, container_inspect, tpl_var):
        """Extract a port from a container_inspect or the k8s API given a template variable."""
        c_id = container_inspect.get('Id', '')

        try:
            ports = map(lambda x: x.split('/')[0], container_inspect['NetworkSettings']['Ports'].keys())
        except (IndexError, KeyError, AttributeError):
            # try to get ports from the docker API. Works if the image has an EXPOSE instruction
            ports = map(lambda x: x.split('/')[0], container_inspect['Config'].get('ExposedPorts', {}).keys())

            # if it failed, try with the kubernetes API
            if not ports and is_k8s():
                log.debug("Didn't find the port for container %s (%s), trying the kubernetes way." %
                          (c_id[:12], container_inspect.get('Config', {}).get('Image', '')))
                co_statuses = self._get_kube_config(c_id, 'status').get('containerStatuses', [])
                c_name = None
                for co in co_statuses:
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                containers = self._get_kube_config(c_id, 'spec').get('containers', [])
                for co in containers:
                    if co.get('name') == c_name:
                        ports = map(lambda x: str(x.get('containerPort')), co.get('ports', []))
        ports = sorted(ports, key=lambda x: int(x))
        return self._extract_port_from_list(ports, tpl_var)

    def _extract_port_from_list(self, ports, tpl_var):
        if not ports:
            return None

        tpl_parts = tpl_var.split('_')

        if len(tpl_parts) == 1:
            log.debug("No index was passed for template variable %s. "
                      "Trying with the last element." % tpl_var)
            return ports[-1]

        try:
            idx = tpl_parts[-1]
            return ports[int(idx)]
        except ValueError:
            log.error("Port index is not an integer. Using the last element instead.")
        except IndexError:
            log.error("Port index is out of range. Using the last element instead.")
        return ports[-1]

    def get_tags(self, c_inspect):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(c_inspect.get('Id'), 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." % c_inspect.get('Id', '')[:12])
                return []
            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(pod_metadata.get('annotations', {}).get('kubernetes.io/created-by', '{}'))
            if created_by.get('reference', {}).get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' % created_by.get('reference', {}).get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

        return tags

    def _get_additional_tags(self, container_inspect, *args):
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(container_inspect.get('Id'), 'metadata')
            pod_spec = self._get_kube_config(container_inspect.get('Id'), 'spec')
            if pod_metadata is None or pod_spec is None:
                log.warning("Failed to fetch pod metadata or pod spec for container %s."
                            " Additional Kubernetes tags may be missing." % container_inspect.get('Id', '')[:12])
                return []
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API"""
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})

    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        containers = [(
            container.get('Image'),
            container.get('Id'), container.get('Labels')
        ) for container in self.docker_client.containers()]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        for image, cid, labels in containers:
            try:
                # value of the DATADOG_ID tag or the image name if the label is missing
                identifier = self.get_config_id(image, labels)
                check_configs = self._get_check_configs(cid, identifier, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf

                    check_name, init_config, instance = conf
                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if trace_config:
                            configs[check_name] = (source, (init_config, [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {0}. ' \
                            'Keeping the first one found.'
                        if trace_config:
                            if configs[check_name][1][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1][1].append(instance)
                        else:
                            if configs[check_name][0] != init_config:
                                log.warning(conflict_init_msg.format(check_name))
                            configs[check_name][1].append(instance)
            except Exception:
                log.exception('Building config for container %s based on image %s using service '
                              'discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def get_config_id(self, image, labels):
        """Look for a DATADOG_ID label, return its value or the image name if missing"""
        return labels.get(DATADOG_ID) or image

    def _get_check_configs(self, c_id, identifier, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        inspect = self.docker_client.inspect_container(c_id)
        config_templates = self._get_config_templates(identifier, trace_config=trace_config)
        if not config_templates:
            log.debug('No config template for container %s with identifier %s. '
                      'It will be left unconfigured.' % (c_id[:12], identifier))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl, variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl or {}, var_values)
            if tpl and len(tpl) == 2:
                init_config, instance = tpl
                if trace_config:
                    check_configs.append((source, (check_name, init_config, instance)))
                else:
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, identifier, trace_config=False):
        """Extract config templates for an identifier from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning('No supported configuration backend was provided, using auto-config only.')
        else:
            auto_conf = False

        # format: [('ident', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('ident', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(
            identifier, auto_conf=auto_conf, trace_config=trace_config)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug('No template was found for identifier %s, leaving it alone.' % identifier)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except json.JSONDecodeError:
                log.exception('Failed to decode the JSON template fetched for check {0}. Its configuration'
                              ' by service discovery failed for identifier {1}.'.format(check_name, identifier))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl, instance_tpl, variables)))
            else:
                templates.append((check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a
           dict from template variable names and their values."""
        var_values = {}
        c_id, c_image = inspect.get('Id', ''), inspect.get('Config', {}).get('Image', '')

        # add default tags to the instance
        if tags:
            tpl_tags = instance_tpl.get('tags', [])
            tags += tpl_tags if isinstance(tpl_tags, list) else [tpl_tags]
            instance_tpl['tags'] = list(set(tags))

        for var in variables:
            # variables can be suffixed with an index in case several values are found
            if var.split('_')[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var.split('_')[0]](inspect, var)
                    if res is None:
                        raise ValueError("Invalid value for variable %s." % var)
                    var_values[var] = res
                except Exception as ex:
                    log.error("Could not find a value for the template variable %s for container %s "
                              "(%s): %s" % (var, c_id[:12], c_image, str(ex)))
            else:
                log.error("No method was found to interpolate template variable %s for container %s "
                          "(%s)." % (var, c_id[:12], c_image))

        return instance_tpl, var_values
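
In the later revision above, _get_port normalizes docker's NetworkSettings.Ports keys, which carry a protocol suffix, before _extract_port_from_list applies the index from the template variable. A short sketch with a made-up inspect fragment:

# hypothetical NetworkSettings.Ports payload from a docker inspect object
network_ports = {'8080/tcp': None, '53/udp': None, '443/tcp': None}

ports = [p.split('/')[0] for p in network_ports.keys()]  # strip '/tcp' and '/udp'
ports = sorted(ports, key=lambda x: int(x))              # numeric, not lexical, order
print(ports)  # ['53', '443', '8080']
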
Example #5
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception(
                'Kubernetes check only supports one configured instance.')
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.kubeutil = KubeUtil()
        if not self.kubeutil.host:
            raise Exception(
                'Unable to get default router and host parameter is not set')

    def _perform_kubelet_checks(self, url):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            r = requests.get(url)
            for line in r.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(
                    2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base,
                               AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' %
                               (url, str(e)))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):

        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges
        ]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_rates
        ]

        self.publish_aliases = _is_affirmative(
            instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(
            instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance)

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any(
                [fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any(
                [fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags,
                                          depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split(
                    "/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" %
                            (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer,
                                            kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer,
                                           kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage']) / float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct',
                               fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS), tags)

    def _retrieve_metrics(self, url):
        return retrieve_json(url)

    def _update_metrics(self, instance):
        pods_list = self.kubeutil.retrieve_pods_list()
        metrics = self._retrieve_metrics(self.kubeutil.metrics_url)

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(
            pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved from %s' % self.kubeutil.metrics_url)

        for subcontainer in metrics:
            try:
                self._update_container_metrics(instance, subcontainer,
                                               kube_labels)
            except Exception as e:
                self.log.error(
                    "Unable to collect metrics for container: {0} ({1}".format(
                        subcontainer.get('name'), e))

        self._update_pods_metrics(instance, pods_list)

    def _update_pods_metrics(self, instance, pods):
        supported_kinds = [
            "DaemonSet",
            "Deployment",
            "Job",
            "ReplicationController",
            "ReplicaSet",
        ]

        controllers_map = defaultdict(int)
        for pod in pods['items']:
            try:
                created_by = json.loads(
                    pod['metadata']['annotations']['kubernetes.io/created-by'])
                kind = created_by['reference']['kind']
                if kind in supported_kinds:
                    controllers_map[created_by['reference']['name']] += 1
            except KeyError:
                continue

        tags = instance.get('tags', [])
        for ctrl, pod_count in controllers_map.iteritems():
            _tags = tags[:]  # copy base tags
            _tags.append('kube_replication_controller:{0}'.format(ctrl))
            self.publish_gauge(self, NAMESPACE + '.pods.running', pod_count,
                               _tags)
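
The aggregation above keys pod counts on the controller name found in the
kubernetes.io/created-by annotation. A minimal, self-contained sketch of that
logic, using a hypothetical pods payload shaped like the API's pod list:

import json
from collections import defaultdict

created_by = json.dumps({'reference': {'kind': 'ReplicationController',
                                       'name': 'dd-agent'}})
pods = {'items': [
    {'metadata': {'annotations': {'kubernetes.io/created-by': created_by}}},
    {'metadata': {'annotations': {'kubernetes.io/created-by': created_by}}},
    {'metadata': {}},  # pods without the annotation are skipped via KeyError
]}

controllers_map = defaultdict(int)
for pod in pods['items']:
    try:
        ref = json.loads(pod['metadata']['annotations']['kubernetes.io/created-by'])
        if ref['reference']['kind'] == 'ReplicationController':
            controllers_map[ref['reference']['name']] += 1
    except KeyError:
        continue

print(dict(controllers_map))  # {'dd-agent': 2}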
Example #6
class TestKubeutil(unittest.TestCase):
    def setUp(self):
        self.kubeutil = KubeUtil()

    @mock.patch("utils.kubeutil.KubeUtil.retrieve_pods_list", side_effect=["foo"])
    @mock.patch("utils.kubeutil.KubeUtil.extract_kube_labels")
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        self.kubeutil.get_kube_labels(excluded_keys="bar")
        retrieve_pods_list.assert_called_once()
        extract_kube_labels.assert_called_once_with("foo", excluded_keys="bar")

    def test_extract_kube_labels(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_kube_labels({}, ["foo"])
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ["foo"])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 8)
        res = self.kubeutil.extract_kube_labels(pods, ["k8s-app"])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ["foo"])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)
        res = self.kubeutil.extract_kube_labels(pods, ["k8s-app"])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)

    def test_extract_meta(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_meta({}, "foo")
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, "foo")
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, "uid")
        self.assertEqual(len(res), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, "foo")
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, "uid")
        self.assertEqual(len(res), 4)

    @mock.patch("utils.kubeutil.retrieve_json")
    def test_retrieve_pods_list(self, retrieve_json):
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch("utils.kubeutil.retrieve_json")
    def test_retrieve_metrics(self, retrieve_json):
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.filter_pods_list({}, "foo")
        self.assertEqual(len(res.get("items")), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, "10.240.0.9")
        self.assertEqual(len(res.get("items")), 5)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, "foo")
        self.assertEqual(len(res.get("items")), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, "10.240.0.5")
        self.assertEqual(len(res.get("items")), 1)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, "foo")
        self.assertEqual(len(res.get("items")), 0)

    @mock.patch("utils.kubeutil.requests")
    def test_retrieve_json_auth(self, r):
        self.kubeutil.retrieve_json_auth("url", "foo_tok")
        r.get.assert_called_once_with("url", verify=False, timeout=10, headers={"Authorization": "Bearer foo_tok"})

        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth("url", "foo_tok")
        r.get.assert_called_with("url", verify=__file__, timeout=10, headers={"Authorization": "Bearer foo_tok"})

    def test_get_node_info(self):
        with mock.patch("utils.kubeutil.KubeUtil._fetch_host_data") as f:
            self.kubeutil.get_node_info()
            f.assert_called_once()

            f.reset_mock()

            self.kubeutil._node_ip = "foo"
            self.kubeutil._node_name = "bar"
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, "foo")
            self.assertEqual(name, "bar")
            f.assert_not_called()

    def test__fetch_host_data(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        with mock.patch("utils.kubeutil.KubeUtil.retrieve_pods_list") as mock_pods:
            self.kubeutil.host_name = "dd-agent-1rxlh"
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, "10.240.0.9")
            self.assertEqual(self.kubeutil._node_name, "kubernetes-massi-minion-k23m")

            self.kubeutil.host_name = "heapster-v11-l8sh1"
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, "10.240.0.9")
            self.assertEqual(self.kubeutil._node_name, "gke-cluster-1-8046fdfa-node-ld35")

    def test_get_auth_token(self):
        KubeUtil.AUTH_TOKEN_PATH = "/foo/bar"
        self.assertIsNone(KubeUtil.get_auth_token())
        KubeUtil.AUTH_TOKEN_PATH = Fixtures.file("events.json")  # any file could do the trick
        self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        os.environ.pop("KUBERNETES_PORT", None)
        self.assertFalse(is_k8s())
        os.environ["KUBERNETES_PORT"] = "999"
        self.assertTrue(is_k8s())
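
A caveat on the retrieve_pods_list.assert_called_once() calls above: on mock
libraries that predate assert_called_once() (it was added to unittest.mock in
Python 3.6), an unknown assert_* attribute on a plain Mock silently passes. A
version-proof sketch of the same check, assuming the same KubeUtil import and
patch targets as the tests above:

import unittest
import mock

class TestCallCount(unittest.TestCase):
    @mock.patch("utils.kubeutil.KubeUtil.retrieve_pods_list", side_effect=["foo"])
    @mock.patch("utils.kubeutil.KubeUtil.extract_kube_labels")
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        KubeUtil().get_kube_labels(excluded_keys="bar")
        # call_count exists on every mock version, unlike assert_called_once()
        self.assertEqual(retrieve_pods_list.call_count, 1)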
Example #7
class SDDockerBackend(AbstractSDBackend):
    """Docker-based service discovery"""
    def __init__(self, agentConfig):
        self.docker_client = DockerUtil().client
        if is_k8s():
            self.kubeutil = KubeUtil()

        try:
            self.config_store = get_config_store(agentConfig=agentConfig)
        except Exception as e:
            log.error('Failed to instantiate the config store client. '
                      'Falling back to auto-config only. %s' % str(e))
            agentConfig['sd_config_backend'] = None
            self.config_store = get_config_store(agentConfig=agentConfig)

        self.VAR_MAPPING = {
            'host': self._get_host,
            'port': self._get_ports,
            'tags': self._get_additional_tags,
        }
        AbstractSDBackend.__init__(self, agentConfig)

    def _get_host(self, container_inspect):
        """Extract the host IP from a docker inspect object, or the kubelet API."""
        ip_addr = container_inspect.get('NetworkSettings', {}).get('IPAddress')
        if not ip_addr:
            if not is_k8s():
                return
            # kubernetes case
            log.debug(
                "Didn't find the IP address for container %s (%s), using the kubernetes way."
                % (container_inspect.get('Id', '')[:12],
                   container_inspect.get('Config', {}).get('Image', '')))
            pod_list = self.kubeutil.retrieve_pods_list().get('items', [])
            c_id = container_inspect.get('Id')
            for pod in pod_list:
                pod_ip = pod.get('status', {}).get('podIP')
                if pod_ip is None:
                    continue
                else:
                    c_statuses = pod.get('status',
                                         {}).get('containerStatuses', [])
                    for status in c_statuses:
                        # compare the container id with those of containers in the current pod
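                        # containerID values look like "docker://<64-hex-id>";
                        # split('//')[-1] keeps the bare id for the comparison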
                        if c_id == status.get('containerID',
                                              '').split('//')[-1]:
                            ip_addr = pod_ip

        return ip_addr

    def _get_ports(self, container_inspect):
        """Extract a list of available ports from a docker inspect object. Sort them numerically."""
        c_id = container_inspect.get('Id', '')
        try:
            ports = map(lambda x: x.split('/')[0],
                        container_inspect['NetworkSettings']['Ports'].keys())
        except (IndexError, KeyError, AttributeError):
            log.debug(
                "Didn't find the port for container %s (%s), trying the kubernetes way."
                % (c_id[:12], container_inspect.get('Config', {}).get(
                    'Image', '')))
            # first we try to get it from the docker API
            # it works if the image has an EXPOSE instruction
            ports = map(
                lambda x: x.split('/')[0],
                container_inspect['Config'].get('ExposedPorts', {}).keys())
            # if it failed, try with the kubernetes API
            if not ports and is_k8s():
                co_statuses = self._get_kube_config(c_id, 'status').get(
                    'containerStatuses', [])
                c_name = None
                for co in co_statuses:
                    if co.get('containerID', '').split('//')[-1] == c_id:
                        c_name = co.get('name')
                        break
                containers = self._get_kube_config(c_id, 'spec').get(
                    'containers', [])
                for co in containers:
                    if co.get('name') == c_name:
                        ports = map(lambda x: str(x.get('containerPort')),
                                    co.get('ports', []))
        ports = sorted(ports, key=lambda x: int(x))
        return ports

    def get_tags(self, c_inspect):
        """Extract useful tags from docker or platform APIs. These are collected by default."""
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(c_inspect.get('Id'),
                                                 'metadata')

            if pod_metadata is None:
                log.warning("Failed to fetch pod metadata for container %s."
                            " Kubernetes tags may be missing." %
                            c_inspect.get('Id', '')[:12])
                return []
            # get labels
            kube_labels = pod_metadata.get('labels', {})
            for label, value in kube_labels.iteritems():
                tags.append('%s:%s' % (label, value))

            # get replication controller
            created_by = json.loads(
                pod_metadata.get('annotations',
                                 {}).get('kubernetes.io/created-by', '{}'))
            if created_by.get('reference',
                              {}).get('kind') == 'ReplicationController':
                tags.append('kube_replication_controller:%s' %
                            created_by.get('reference', {}).get('name'))

            # get kubernetes namespace
            tags.append('kube_namespace:%s' % pod_metadata.get('namespace'))

        return tags

    def _get_additional_tags(self, container_inspect):
        tags = []
        if is_k8s():
            pod_metadata = self._get_kube_config(container_inspect.get('Id'),
                                                 'metadata')
            pod_spec = self._get_kube_config(container_inspect.get('Id'),
                                             'spec')
            if pod_metadata is None or pod_spec is None:
                log.warning("Failed to fetch pod metadata or spec for container %s."
                            " node_name and pod_name tags will be missing." %
                            container_inspect.get('Id', '')[:12])
                return tags
            tags.append('node_name:%s' % pod_spec.get('nodeName'))
            tags.append('pod_name:%s' % pod_metadata.get('name'))
        return tags

    def _get_kube_config(self, c_id, key):
        """Get a part of a pod config from the kubernetes API"""
        pods = self.kubeutil.retrieve_pods_list().get('items', [])
        for pod in pods:
            c_statuses = pod.get('status', {}).get('containerStatuses', [])
            for status in c_statuses:
                if c_id == status.get('containerID', '').split('//')[-1]:
                    return pod.get(key, {})

    def get_configs(self):
        """Get the config for all docker containers running on the host."""
        configs = {}
        containers = [(container.get('Image').split(':')[0].split('/')[-1],
                       container.get('Id'), container.get('Labels'))
                      for container in self.docker_client.containers()]

        # used by the configcheck agent command to trace where check configs come from
        trace_config = self.agentConfig.get(TRACE_CONFIG, False)

        for image, cid, labels in containers:
            try:
                check_configs = self._get_check_configs(
                    cid, image, trace_config=trace_config) or []
                for conf in check_configs:
                    if trace_config and conf is not None:
                        source, conf = conf

                    check_name, init_config, instance = conf
                    # build instances list if needed
                    if configs.get(check_name) is None:
                        if trace_config:
                            configs[check_name] = (source, (init_config,
                                                            [instance]))
                        else:
                            configs[check_name] = (init_config, [instance])
                    else:
                        conflict_init_msg = 'Different versions of `init_config` found for check {0}. ' \
                            'Keeping the first one found.'
                        if trace_config:
                            if configs[check_name][1][0] != init_config:
                                log.warning(
                                    conflict_init_msg.format(check_name))
                            configs[check_name][1][1].append(instance)
                        else:
                            if configs[check_name][0] != init_config:
                                log.warning(
                                    conflict_init_msg.format(check_name))
                            configs[check_name][1].append(instance)
            except Exception:
                log.exception(
                    'Building config for container %s based on image %s using service'
                    ' discovery failed, leaving it alone.' % (cid[:12], image))
        return configs

    def _get_check_configs(self, c_id, image, trace_config=False):
        """Retrieve configuration templates and fill them with data pulled from docker and tags."""
        inspect = self.docker_client.inspect_container(c_id)
        config_templates = self._get_config_templates(
            image, trace_config=trace_config)
        if not config_templates:
            log.debug('No config template for container %s with image %s. '
                      'It will be left unconfigured.' % (c_id[:12], image))
            return None

        check_configs = []
        tags = self.get_tags(inspect)
        for config_tpl in config_templates:
            if trace_config:
                source, config_tpl = config_tpl
            check_name, init_config_tpl, instance_tpl, variables = config_tpl

            # insert tags in instance_tpl and process values for template variables
            instance_tpl, var_values = self._fill_tpl(inspect, instance_tpl,
                                                      variables, tags)

            tpl = self._render_template(init_config_tpl or {}, instance_tpl
                                        or {}, var_values)
            if tpl and len(tpl) == 2:
                if trace_config and len(tpl[1]) == 2:
                    source, (init_config, instance) = tpl
                    check_configs.append(
                        (source, (check_name, init_config, instance)))
                elif not trace_config:
                    init_config, instance = tpl
                    check_configs.append((check_name, init_config, instance))

        return check_configs

    def _get_config_templates(self, image_name, trace_config=False):
        """Extract config templates for an image from a K/V store and returns it as a dict object."""
        config_backend = self.agentConfig.get('sd_config_backend')
        templates = []
        if config_backend is None:
            auto_conf = True
            log.warning(
                'No supported configuration backend was provided, using auto-config only.'
            )
        else:
            auto_conf = False

        # format: [('image', {init_tpl}, {instance_tpl})] without trace_config
        # or      [(source, ('image', {init_tpl}, {instance_tpl}))] with trace_config
        raw_tpls = self.config_store.get_check_tpls(image_name,
                                                    auto_conf=auto_conf,
                                                    trace_config=trace_config)
        for tpl in raw_tpls:
            if trace_config and tpl is not None:
                # each template can come from either auto configuration or user-supplied templates
                source, tpl = tpl
            if tpl is not None and len(tpl) == 3:
                check_name, init_config_tpl, instance_tpl = tpl
            else:
                log.debug(
                    'No template was found for image %s, leaving it alone.' %
                    image_name)
                return None
            try:
                # build a list of all variables to replace in the template
                variables = self.PLACEHOLDER_REGEX.findall(str(init_config_tpl)) + \
                    self.PLACEHOLDER_REGEX.findall(str(instance_tpl))
                variables = map(lambda x: x.strip('%'), variables)
                if not isinstance(init_config_tpl, dict):
                    init_config_tpl = json.loads(init_config_tpl or '{}')
                if not isinstance(instance_tpl, dict):
                    instance_tpl = json.loads(instance_tpl or '{}')
            except ValueError:  # json.JSONDecodeError subclasses ValueError
                log.exception(
                    'Failed to decode the JSON template fetched for check {0}. Its configuration'
                    ' by service discovery failed for {1}.'.format(
                        check_name, image_name))
                return None

            if trace_config:
                templates.append((source, (check_name, init_config_tpl,
                                           instance_tpl, variables)))
            else:
                templates.append(
                    (check_name, init_config_tpl, instance_tpl, variables))

        return templates

    def _fill_tpl(self, inspect, instance_tpl, variables, tags=None):
        """Add container tags to instance templates and build a dict
        from template variable names and their values."""
        var_values = {}

        # add default tags to the instance
        if tags:
            tags += instance_tpl.get('tags', [])
            instance_tpl['tags'] = list(set(tags))

        for v in variables:
            # variables can be suffixed with an index in case a list is found
            var_parts = v.split('_')
            if var_parts[0] in self.VAR_MAPPING:
                try:
                    res = self.VAR_MAPPING[var_parts[0]](inspect)
                    if not res:
                        raise ValueError("Invalid value for variable %s." %
                                         var_parts[0])
                    # if an index is found in the variable, use it to select a value
                    if len(var_parts) > 1 and isinstance(
                            res, list) and int(var_parts[-1]) < len(res):
                        var_values[v] = res[int(var_parts[-1])]
                    # if no valid index was found but we have a list, return the last element
                    elif isinstance(res, list):
                        var_values[v] = res[-1]
                    else:
                        var_values[v] = res
                except Exception as ex:
                    log.error(
                        "Could not find a value for the template variable %s: %s"
                        % (v, str(ex)))
            else:
                log.error(
                    "No method was found to interpolate template variable %s."
                    % v)

        return instance_tpl, var_values
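
The indexed-variable handling in _fill_tpl can be exercised in isolation. A
minimal sketch of the resolution rule, with hypothetical port values (the
VAR_MAPPING lookup and placeholder parsing are elided):

def resolve(var, res):
    # variables may carry an index suffix, e.g. 'port_1' selects res[1]
    parts = var.split('_')
    if len(parts) > 1 and isinstance(res, list) and int(parts[-1]) < len(res):
        return res[int(parts[-1])]
    if isinstance(res, list):
        return res[-1]  # no valid index: fall back to the last element
    return res

ports = ['6379', '8080']
assert resolve('port_0', ports) == '6379'
assert resolve('port_1', ports) == '8080'
assert resolve('port', ports) == '8080'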
Example #8
class TestKubeutil(unittest.TestCase):
    def setUp(self):
        self.kubeutil = KubeUtil()

    @mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list', side_effect=['foo'])
    @mock.patch('utils.kubeutil.KubeUtil.extract_kube_labels')
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        self.kubeutil.get_kube_labels(excluded_keys='bar')
        retrieve_pods_list.assert_called_once()
        extract_kube_labels.assert_called_once_with('foo', excluded_keys='bar')

    def test_extract_kube_labels(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_kube_labels({}, ['foo'])
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 8)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)

    def test_extract_meta(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_meta({}, 'foo')
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 4)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_pods_list(self, retrieve_json):
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_metrics(self, retrieve_json):
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.filter_pods_list({}, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.9')
        self.assertEqual(len(res.get('items')), 5)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.5')
        self.assertEqual(len(res.get('items')), 1)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

    @mock.patch('utils.kubeutil.requests')
    def test_retrieve_json_auth(self, r):
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_once_with('url', verify=False, timeout=10, headers={'Authorization': 'Bearer foo_tok'})

        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_with('url', verify=__file__, timeout=10, headers={'Authorization': 'Bearer foo_tok'})

    def test_get_node_info(self):
        with mock.patch('utils.kubeutil.KubeUtil._fetch_host_data') as f:
            self.kubeutil.get_node_info()
            f.assert_called_once()

            f.reset_mock()

            self.kubeutil._node_ip = 'foo'
            self.kubeutil._node_name = 'bar'
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, 'foo')
            self.assertEqual(name, 'bar')
            f.assert_not_called()

    def test__fetch_host_data(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        with mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list') as mock_pods:
            self.kubeutil.host_name = 'dd-agent-1rxlh'
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'kubernetes-massi-minion-k23m')

            self.kubeutil.host_name = 'heapster-v11-l8sh1'
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'gke-cluster-1-8046fdfa-node-ld35')

    def test_get_auth_token(self):
        KubeUtil.AUTH_TOKEN_PATH = '/foo/bar'
        self.assertIsNone(KubeUtil.get_auth_token())
        KubeUtil.AUTH_TOKEN_PATH = Fixtures.file('events.json')  # any file could do the trick
        self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        os.environ.pop('KUBERNETES_PORT', None)
        self.assertFalse(Platform.is_k8s())
        os.environ['KUBERNETES_PORT'] = '999'
        self.assertTrue(Platform.is_k8s())
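
The environment toggling in test_is_k8s works because os.environ.pop mutates
the mapping that Platform.is_k8s is assumed to read (os.unsetenv alone would
not update os.environ). A more hermetic sketch of the same toggle, using
mock.patch.dict so the variable is restored after the block:

import os
import mock

with mock.patch.dict(os.environ, {'KUBERNETES_PORT': '999'}):
    assert 'KUBERNETES_PORT' in os.environ  # visible inside the block
# on exit, os.environ is restored, so later tests see the original state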
Example #9
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')
        AgentCheck.__init__(self, name, init_config, agentConfig, instances)
        self.kubeutil = KubeUtil()
        if not self.kubeutil.host:
            raise Exception('Unable to get default router and host parameter is not set')

    def _perform_kubelet_checks(self, url):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            r = requests.get(url)
            for line in r.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check failed: %s' % str(e))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check failed: %s' % str(e))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):

        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance)

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any([fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any([fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split("/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" % (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer, kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer, kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage'])/float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS),
                              tags)

    def _retrieve_metrics(self, url):
        return retrieve_json(url)

    def _update_metrics(self, instance):
        pods_list = self.kubeutil.retrieve_pods_list()
        metrics = self._retrieve_metrics(self.kubeutil.metrics_url)

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved from %s' % self.kubeutil.metrics_url)

        for subcontainer in metrics:
            try:
                self._update_container_metrics(instance, subcontainer, kube_labels)
            except Exception as e:
                self.log.error("Unable to collect metrics for container: {0} ({1}".format(
                    subcontainer.get('name'), e))

        self._update_pods_metrics(instance, pods_list)

    def _update_pods_metrics(self, instance, pods):
        supported_kinds = [
            "DaemonSet",
            "Deployment",
            "Job",
            "ReplicationController",
            "ReplicaSet",
        ]

        controllers_map = defaultdict(int)
        for pod in pods['items']:
            try:
                created_by = json.loads(pod['metadata']['annotations']['kubernetes.io/created-by'])
                kind = created_by['reference']['kind']
                if kind in supported_kinds:
                    controllers_map[created_by['reference']['name']] += 1
            except KeyError:
                continue

        tags = instance.get('tags', [])
        for ctrl, pod_count in controllers_map.iteritems():
            _tags = tags[:]  # copy base tags
            _tags.append('kube_replication_controller:{0}'.format(ctrl))
            self.publish_gauge(self, NAMESPACE + '.pods.running', pod_count, _tags)
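
The kubelet health parsing in _perform_kubelet_checks above can also be
exercised standalone. A small sketch with sample lines that mimic the
kubelet's verbose /healthz output (the exact payload format is an assumption):

import re

sample = ['[+]ping ok', '[-]syncloop failed: reason withheld']
for line in sample:
    matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
    if not matches or len(matches.groups()) < 2:
        continue
    status, name = matches.group(1), matches.group(2)
    print('%s %s' % (name, 'OK' if status == '+' else 'CRITICAL'))
# prints: "ping OK" then "syncloop CRITICAL"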