Exemple #1
0
class Kubernetes(AgentCheck):
    """ Collect metrics and events from Kubernetes """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception(
                'Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(init_config=init_config, instance=inst)

        if not self.kubeutil.init_success:
            if self.kubeutil.left_init_retries > 0:
                self.log.warning(
                    "Kubelet client failed to initialized for now, pausing the Kubernetes check."
                )
            else:
                raise Exception(
                    'Unable to initialize Kubelet client. Try setting the host parameter. The Kubernetes check failed permanently.'
                )

        if agentConfig.get('service_discovery') and \
                agentConfig.get('service_discovery_backend') == 'docker':
            self._sd_backend = get_sd_backend(agentConfig)
        else:
            self._sd_backend = None

        self.leader_candidate = inst.get(LEADER_CANDIDATE)
        if self.leader_candidate:
            self.kubeutil.refresh_leader()

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning(
                        'Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s'
                        % str(e))

            self.event_retriever = None
            self._configure_event_collection(inst)

    def _perform_kubelet_checks(self, url, instance):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            req = self.kubeutil.perform_kubelet_query(url)
            for line in req.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(
                    2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name,
                                       AgentCheck.OK,
                                       tags=instance.get('tags', []))
                else:
                    self.service_check(service_check_name,
                                       AgentCheck.CRITICAL,
                                       tags=instance.get('tags', []))
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base,
                               AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' %
                               (url, str(e)),
                               tags=instance.get('tags', []))
        else:
            if is_ok:
                self.service_check(service_check_base,
                                   AgentCheck.OK,
                                   tags=instance.get('tags', []))
            else:
                self.service_check(service_check_base,
                                   AgentCheck.CRITICAL,
                                   tags=instance.get('tags', []))

    def _configure_event_collection(self, instance):
        self._collect_events = self.kubeutil.is_leader or _is_affirmative(
            instance.get('collect_events', DEFAULT_COLLECT_EVENTS))
        if self._collect_events:
            if self.event_retriever:
                self.event_retriever.set_kinds(None)
                self.event_retriever.set_delay(None)
            else:
                self.event_retriever = self.kubeutil.get_event_retriever()
        elif self.kubeutil.collect_service_tag:
            # Only fetch service and pod events for service mapping
            event_delay = instance.get('service_tag_update_freq',
                                       DEFAULT_SERVICE_EVENT_FREQ)
            if self.event_retriever:
                self.event_retriever.set_kinds(['Service', 'Pod'])
                self.event_retriever.set_delay(event_delay)
            else:
                self.event_retriever = self.kubeutil.get_event_retriever(
                    kinds=['Service', 'Pod'], delay=event_delay)
        else:
            self.event_retriever = None

    def check(self, instance):
        if not self.kubeutil.init_success:
            if self.kubeutil.left_init_retries > 0:
                self.kubeutil.init_kubelet(instance)
                self.log.warning(
                    "Kubelet client is not initialized, Kubernetes check is paused."
                )
                return
            else:
                raise Exception(
                    "Unable to initialize Kubelet client. Try setting the host parameter. The Kubernetes check failed permanently."
                )

        # Leader election
        self.refresh_leader_status(instance)

        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges
        ]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_rates
        ]

        self.publish_aliases = _is_affirmative(
            instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(
            instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        try:
            pods_list = self.kubeutil.retrieve_pods_list()
        except:
            pods_list = None

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url, instance)

        if pods_list is not None:
            # Will not fail if cAdvisor is not available
            self._update_pods_metrics(instance, pods_list)
            # cAdvisor & kubelet metrics, will fail if port 4194 is not open
            try:
                if int(instance.get('port',
                                    KubeUtil.DEFAULT_CADVISOR_PORT)) > 0:
                    self._update_metrics(instance, pods_list)
            except ConnectionError:
                self.warning(
                    '''Can't access the cAdvisor metrics, performance metrics and'''
                    ''' limits/requests will not be collected. Please setup'''
                    ''' your kubelet with the --cadvisor-port=4194 option, or set port to 0'''
                    ''' in this check's configuration to disable cAdvisor lookup.'''
                )
            except Exception as err:
                self.log.warning(
                    "Error while getting performance metrics: %s" % str(err))

        # kubernetes events
        if self.event_retriever is not None:
            try:
                events = self.event_retriever.get_event_array()
                changed_cids = self.kubeutil.process_events(events,
                                                            podlist=pods_list)
                if (changed_cids and self._sd_backend):
                    self._sd_backend.update_checks(changed_cids)
                if events and self._collect_events:
                    self._update_kube_events(instance, pods_list, events)
            except Exception as ex:
                self.log.error("Event collection failed: %s" % str(ex))

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any(
                [fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any(
                [fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags,
                                          depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        # kube_container_name is the name of the Kubernetes container resource,
        # not the name of the docker container (that's tagged as container_name)
        kube_container_name = cont_labels[KubeUtil.CONTAINER_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))
        tags.append(u"kube_container_name:{0}".format(kube_container_name))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split(
                    "/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" %
                            (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish metrics for a subcontainer and handle filtering on tags"""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            self.log.debug("Subcontainer doesn't have a name, skipping.")
            return

        tags.append('container_name:%s' % container_name)

        container_image = self.kubeutil.image_name_resolver(
            subcontainer['spec'].get('image'))
        if container_image:
            tags.append('container_image:%s' % container_image)

            split = container_image.split(":")
            if len(split) > 2:
                # if the repo is in the image name and has the form 'docker.clearbit:5000'
                # the split will be like [repo_url, repo_port/image_name, image_tag]. Let's avoid that
                split = [':'.join(split[:-1]), split[-1]]

            tags.append('image_name:%s' % split[0])
            if len(split) == 2:
                tags.append('image_tag:%s' % split[1])

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer,
                                            kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer,
                                           kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        # if the container should be filtered we return its tags without publishing its metrics
        is_filtered = self.kubeutil.are_tags_filtered(tags)
        if is_filtered:
            self._filtered_containers.add(subcontainer['id'])
            return tags

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get(
                "spec",
            {}).get("has_filesystem") and stats.get('filesystem', []) != []:
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage']) / float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct',
                               fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS), tags)

        return tags

    def _update_metrics(self, instance, pods_list):
        def parse_quantity(s):
            number = ''
            unit = ''
            for c in s:
                if c.isdigit() or c == '.':
                    number += c
                else:
                    unit += c
            return float(number) * FACTORS.get(unit, 1)

        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_pod_tags(
            pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved cmd=%s' % self.metrics_cmd)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            if 'aliases' not in subcontainer:
                # it means the subcontainer is about a higher-level entity than a container
                continue
            try:
                tags = self._update_container_metrics(instance, subcontainer,
                                                      kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception as e:
                self.log.error(
                    "Unable to collect metrics for container: {0} ({1})".
                    format(c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list['items']:
            try:
                containers = pod['spec']['containers']
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug(
                    "Pod %s does not have containers specs, skipping...",
                    pod['metadata'].get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                c_id = name2id.get(c_name)

                if c_id in self._filtered_containers:
                    self.log.debug('Container {} is excluded'.format(c_name))
                    continue

                _tags = container_tags.get(c_id, [])

                # limits
                try:
                    for limit, value_str in container['resources'][
                            'limits'].iteritems():
                        values = [
                            parse_quantity(s)
                            for s in QUANTITY_EXP.findall(value_str)
                        ]
                        if len(values) != 1:
                            self.log.warning(
                                "Error parsing limits value string: %s",
                                value_str)
                            continue
                        self.publish_gauge(
                            self, '{}.{}.limits'.format(NAMESPACE, limit),
                            values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container limits for %s: %s",
                        c_name, e)

                # requests
                try:
                    for request, value_str in container['resources'][
                            'requests'].iteritems():
                        values = [
                            parse_quantity(s)
                            for s in QUANTITY_EXP.findall(value_str)
                        ]
                        if len(values) != 1:
                            self.log.warning(
                                "Error parsing requests value string: %s",
                                value_str)
                            continue
                        self.publish_gauge(
                            self, '{}.{}.requests'.format(NAMESPACE, request),
                            values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container requests for %s: %s",
                        c_name, e)

        self._update_node(instance)

    def _update_node(self, instance):
        machine_info = self.kubeutil.retrieve_machine_info()
        num_cores = machine_info.get('num_cores', 0)
        memory_capacity = machine_info.get('memory_capacity', 0)

        tags = instance.get('tags', [])
        self.publish_gauge(self, NAMESPACE + '.cpu.capacity', float(num_cores),
                           tags)
        self.publish_gauge(self, NAMESPACE + '.memory.capacity',
                           float(memory_capacity), tags)
        # TODO(markine): Report 'allocatable' which is capacity minus capacity
        # reserved for system/Kubernetes.

    def _update_pods_metrics(self, instance, pods):
        """
        Reports the number of running pods on this node, tagged by service and creator

        We go though all the pods, extract tags then count them by tag list, sorted and
        serialized in a pipe-separated string (it is an illegar character for tags)
        """
        tags_map = defaultdict(int)
        for pod in pods['items']:
            pod_meta = pod.get('metadata', {})
            pod_tags = self.kubeutil.get_pod_creator_tags(
                pod_meta, legacy_rep_controller_tag=True)
            services = self.kubeutil.match_services_for_pod(pod_meta)
            if isinstance(services, list):
                for service in services:
                    pod_tags.append('kube_service:%s' % service)
            if 'namespace' in pod_meta:
                pod_tags.append('kube_namespace:%s' % pod_meta['namespace'])

            tags_map[frozenset(pod_tags)] += 1

        commmon_tags = instance.get('tags', [])
        for pod_tags, pod_count in tags_map.iteritems():
            tags = list(pod_tags)
            tags.extend(commmon_tags)
            self.publish_gauge(self, NAMESPACE + '.pods.running', pod_count,
                               tags)

    def _update_kube_events(self, instance, pods_list, event_items):
        """
        Process kube events and send ddog events
        The namespace filtering is done here instead of KubeEventRetriever
        to avoid interfering with service discovery
        """
        node_ip, node_name = self.kubeutil.get_node_info()
        self.log.debug('Processing events on {} [{}]'.format(
            node_name, node_ip))

        k8s_namespaces = instance.get('namespaces', DEFAULT_NAMESPACES)
        if not isinstance(k8s_namespaces, list):
            self.log.warning(
                'Configuration key "namespaces" is not a list: fallback to the default value'
            )
            k8s_namespaces = DEFAULT_NAMESPACES

        # handle old config value
        if 'namespace' in instance and instance.get('namespace') not in (
                None, 'default'):
            self.log.warning(
                '''The 'namespace' parameter is deprecated and will stop being supported starting '''
                '''from 5.13. Please use 'namespaces' and/or 'namespace_name_regexp' instead.'''
            )
            k8s_namespaces.append(instance.get('namespace'))

        if self.k8s_namespace_regexp:
            namespaces_endpoint = '{}/namespaces'.format(
                self.kubeutil.kubernetes_api_url)
            self.log.debug('Kubernetes API endpoint to query namespaces: %s' %
                           namespaces_endpoint)

            namespaces = self.kubeutil.retrieve_json_auth(
                namespaces_endpoint).json()
            for namespace in namespaces.get('items', []):
                name = namespace.get('metadata', {}).get('name', None)
                if name and self.k8s_namespace_regexp.match(name):
                    k8s_namespaces.append(name)

        k8s_namespaces = set(k8s_namespaces)

        for event in event_items:
            event_ts = calendar.timegm(
                time.strptime(event.get('lastTimestamp'),
                              '%Y-%m-%dT%H:%M:%SZ'))
            involved_obj = event.get('involvedObject', {})

            # filter events by white listed namespaces (empty namespace belong to the 'default' one)
            if involved_obj.get('namespace', 'default') not in k8s_namespaces:
                continue

            tags = self.kubeutil.extract_event_tags(event)
            tags.extend(instance.get('tags', []))

            title = '{} {} on {}'.format(involved_obj.get('name'),
                                         event.get('reason'), node_name)
            message = event.get('message')
            source = event.get('source')
            k8s_event_type = event.get('type')
            alert_type = K8S_ALERT_MAP.get(k8s_event_type, 'info')

            if source:
                message += '\nSource: {} {}\n'.format(
                    source.get('component', ''), source.get('host', ''))
            msg_body = "%%%\n{}\n```\n{}\n```\n%%%".format(title, message)
            dd_event = {
                'timestamp': event_ts,
                'host': node_ip,
                'event_type': EVENT_TYPE,
                'msg_title': title,
                'msg_text': msg_body,
                'source_type_name': EVENT_TYPE,
                'alert_type': alert_type,
                'event_object':
                'kubernetes:{}'.format(involved_obj.get('name')),
                'tags': tags,
            }
            self.event(dd_event)

    def refresh_leader_status(self, instance):
        """
        calls kubeutil.refresh_leader and compares the resulting
        leader status with the previous one.
        If it changed, update the event collection logic
        """
        if not self.leader_candidate:
            return

        leader_status = self.kubeutil.is_leader
        self.kubeutil.refresh_leader()

        # nothing changed, no-op
        if leader_status == self.kubeutil.is_leader:
            return
        # else, reset the event collection config
        else:
            self.log.info(
                "Leader status changed, updating event collection config...")
            self._configure_event_collection(instance)
Exemple #2
0
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception(
                'Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.kubelet_api_url:
            raise Exception(
                'Unable to reach kubelet. Try setting the host parameter.')

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning(
                        'Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s'
                        % str(e))

    def _perform_kubelet_checks(self, url):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            req = self.kubeutil.perform_kubelet_query(url)
            for line in req.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match('\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(
                    2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base,
                               AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' %
                               (url, str(e)))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges
        ]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = [
            "{0}.{1}".format(NAMESPACE, x) for x in enabled_rates
        ]

        self.publish_aliases = _is_affirmative(
            instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(
            instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        pods_list = self.kubeutil.retrieve_pods_list()

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance, pods_list)

        # kubelet events
        if _is_affirmative(
                instance.get('collect_events', DEFAULT_COLLECT_EVENTS)):
            try:
                self._process_events(instance, pods_list)
            except Exception as ex:
                self.log.error("Event collection failed: %s" % str(ex))

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any(
                [fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any(
                [fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags,
                                          depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split(
                    "/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" %
                        replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" %
                            (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish metrics for a subcontainer and handle filtering on tags"""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        container_image = subcontainer['spec'].get('image')
        if container_image:
            tags.append('container_image:%s' % container_image)

            split = container_image.split(":")
            if len(split) > 2:
                # if the repo is in the image name and has the form 'docker.clearbit:5000'
                # the split will be like [repo_url, repo_port/image_name, image_tag]. Let's avoid that
                split = [':'.join(split[:-1]), split[-1]]

            tags.append('image_name:%s' % split[0])
            if len(split) == 2:
                tags.append('image_tag:%s' % split[1])

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer,
                                            kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer,
                                           kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        # if the container should be filtered we return its tags without publishing its metrics
        is_filtered = self.kubeutil.are_tags_filtered(tags)
        if is_filtered:
            self._filtered_containers.add(subcontainer['id'])
            return tags

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage']) / float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct',
                               fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS), tags)

        return tags

    def _update_metrics(self, instance, pods_list):
        def parse_quantity(s):
            number = ''
            unit = ''
            for c in s:
                if c.isdigit() or c == '.':
                    number += c
                else:
                    unit += c
            return float(number) * FACTORS.get(unit, 1)

        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(
            pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved cmd=%s' % self.metrics_cmd)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            try:
                tags = self._update_container_metrics(instance, subcontainer,
                                                      kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception, e:
                self.log.error(
                    "Unable to collect metrics for container: {0} ({1}".format(
                        c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list['items']:
            try:
                containers = pod['spec']['containers']
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug(
                    "Pod %s does not have containers specs, skipping...",
                    pod['metadata'].get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                c_id = name2id.get(c_name)

                if c_id in self._filtered_containers:
                    self.log.debug('Container {} is excluded'.format(c_name))
                    continue

                _tags = container_tags.get(c_id, [])

                # limits
                try:
                    for limit, value_str in container['resources'][
                            'limits'].iteritems():
                        values = [
                            parse_quantity(s)
                            for s in QUANTITY_EXP.findall(value_str)
                        ]
                        if len(values) != 1:
                            self.log.warning(
                                "Error parsing limits value string: %s",
                                value_str)
                            continue
                        self.publish_gauge(
                            self, '{}.{}.limits'.format(NAMESPACE, limit),
                            values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container limits for %s: %s",
                        c_name, e)
                    self.log.debug("Container object for {}: {}".format(
                        c_name, container))

                # requests
                try:
                    for request, value_str in container['resources'][
                            'requests'].iteritems():
                        values = [
                            parse_quantity(s)
                            for s in QUANTITY_EXP.findall(value_str)
                        ]
                        if len(values) != 1:
                            self.log.warning(
                                "Error parsing requests value string: %s",
                                value_str)
                            continue
                        self.publish_gauge(
                            self, '{}.{}.requests'.format(NAMESPACE, request),
                            values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container requests for %s: %s",
                        c_name, e)
                    self.log.debug("Container object for {}: {}".format(
                        c_name, container))

        self._update_pods_metrics(instance, pods_list)
        self._update_node(instance)
Exemple #3
0
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.host:
            raise Exception('Unable to retrieve Docker hostname and host parameter is not set')

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning('Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s' % str(e))

    def _perform_kubelet_checks(self, url):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            r = requests.get(url, params={'verbose': True})
            for line in r.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match('\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' % (url, str(e)))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        pods_list = self.kubeutil.retrieve_pods_list()

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        # kubelet metrics
        self._update_metrics(instance, pods_list)

        # kubelet events
        if _is_affirmative(instance.get('collect_events', DEFAULT_COLLECT_EVENTS)):
            try:
                self._process_events(instance, pods_list)
            except Exception as ex:
                self.log.error("Event collection failed: %s" % str(ex))

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any([fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any([fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split("/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" % (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish metrics for a subcontainer and handle filtering on tags"""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        container_image = subcontainer['spec'].get('image')
        if container_image:
            tags.append('container_image:%s' % container_image)

            split = container_image.split(":")
            if len(split) > 2:
                # if the repo is in the image name and has the form 'docker.clearbit:5000'
                # the split will be like [repo_url, repo_port/image_name, image_tag]. Let's avoid that
                split = [':'.join(split[:-1]), split[-1]]

            tags.append('image_name:%s' % split[0])
            if len(split) == 2:
                tags.append('image_tag:%s' % split[1])

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer, kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer, kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        # if the container should be filtered we return its tags without publishing its metrics
        is_filtered = self.kubeutil.are_tags_filtered(tags)
        if is_filtered:
            self._filtered_containers.add(subcontainer['id'])
            return tags

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem"):
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage'])/float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS),
                              tags)

        return tags

    def _update_metrics(self, instance, pods_list):
        def parse_quantity(s):
            number = ''
            unit = ''
            for c in s:
                if c.isdigit() or c == '.':
                    number += c
                else:
                    unit += c
            return float(number) * FACTORS.get(unit, 1)

        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_labels(pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved cmd=%s' % self.metrics_cmd)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            try:
                tags = self._update_container_metrics(instance, subcontainer, kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception, e:
                self.log.error("Unable to collect metrics for container: {0} ({1}".format(c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list['items']:
            try:
                containers = pod['spec']['containers']
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug("Pod %s does not have containers specs, skipping...", pod['metadata'].get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                c_id = name2id.get(c_name)

                if c_id in self._filtered_containers:
                    self.log.debug('Container {} is excluded'.format(c_name))
                    continue

                _tags = container_tags.get(c_id, [])

                # limits
                try:
                    for limit, value_str in container['resources']['limits'].iteritems():
                        values = [parse_quantity(s) for s in QUANTITY_EXP.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing limits value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.limits'.format(NAMESPACE, limit), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug("Unable to retrieve container limits for %s: %s", c_name, e)
                    self.log.debug("Container object for {}: {}".format(c_name, container))

                # requests
                try:
                    for request, value_str in container['resources']['requests'].iteritems():
                        values = [parse_quantity(s) for s in QUANTITY_EXP.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing requests value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.requests'.format(NAMESPACE, request), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.error("Unable to retrieve container requests for %s: %s", c_name, e)
                    self.log.debug("Container object for {}: {}".format(c_name, container))

        self._update_pods_metrics(instance, pods_list)
        self._update_node(instance)
class Kubernetes(AgentCheck):
    """ Collect metrics and events from kubelet """

    pod_names_by_container = {}

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception('Kubernetes check only supports one configured instance.')

        AgentCheck.__init__(self, name, init_config, agentConfig, instances)

        inst = instances[0] if instances is not None else None
        self.kubeutil = KubeUtil(instance=inst)
        if not self.kubeutil.kubelet_api_url:
            raise Exception('Unable to reach kubelet. Try setting the host parameter.')

        if agentConfig.get('service_discovery') and \
           agentConfig.get('service_discovery_backend') == 'docker':
            self._sd_backend = get_sd_backend(agentConfig)
        else:
            self._sd_backend = None

        self.k8s_namespace_regexp = None
        if inst:
            regexp = inst.get('namespace_name_regexp', None)
            if regexp:
                try:
                    self.k8s_namespace_regexp = re.compile(regexp)
                except re.error as e:
                    self.log.warning('Invalid regexp for "namespace_name_regexp" in configuration (ignoring regexp): %s' % str(e))

            self._collect_events = _is_affirmative(inst.get('collect_events', DEFAULT_COLLECT_EVENTS))
            if self._collect_events:
                self.event_retriever = self.kubeutil.get_event_retriever()
            elif self.kubeutil.collect_service_tag:
                # Only fetch service and pod events for service mapping
                event_delay = inst.get('service_tag_update_freq', DEFAULT_SERVICE_EVENT_FREQ)
                self.event_retriever = self.kubeutil.get_event_retriever(kinds=['Service', 'Pod'],
                                                                         delay=event_delay)
            else:
                self.event_retriever = None
        else:
            self._collect_events = None
            self.event_retriever = None

    def _perform_kubelet_checks(self, url):
        service_check_base = NAMESPACE + '.kubelet.check'
        is_ok = True
        try:
            req = self.kubeutil.perform_kubelet_query(url)
            for line in req.iter_lines():

                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue

                matches = re.match('\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue

                service_check_name = service_check_base + '.' + matches.group(2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL)
                    is_ok = False

        except Exception as e:
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' % (url, str(e)))

        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL)

    def check(self, instance):
        self.max_depth = instance.get('max_depth', DEFAULT_MAX_DEPTH)
        enabled_gauges = instance.get('enabled_gauges', DEFAULT_ENABLED_GAUGES)
        self.enabled_gauges = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_gauges]
        enabled_rates = instance.get('enabled_rates', DEFAULT_ENABLED_RATES)
        self.enabled_rates = ["{0}.{1}".format(NAMESPACE, x) for x in enabled_rates]

        self.publish_aliases = _is_affirmative(instance.get('publish_aliases', DEFAULT_PUBLISH_ALIASES))
        self.use_histogram = _is_affirmative(instance.get('use_histogram', DEFAULT_USE_HISTOGRAM))
        self.publish_rate = FUNC_MAP[RATE][self.use_histogram]
        self.publish_gauge = FUNC_MAP[GAUGE][self.use_histogram]
        # initialized by _filter_containers
        self._filtered_containers = set()

        try:
            pods_list = self.kubeutil.retrieve_pods_list()
        except:
            pods_list = None

        # kubelet health checks
        self._perform_kubelet_checks(self.kubeutil.kube_health_url)

        if pods_list is not None:
            # kubelet metrics
            self._update_metrics(instance, pods_list)

        # kubelet events
        if self.event_retriever is not None:
            try:
                events = self.event_retriever.get_event_array()
                changed_cids = self.kubeutil.process_events(events, podlist=pods_list)
                if (changed_cids and self._sd_backend):
                    self._sd_backend.update_checks(changed_cids)
                if events and self._collect_events:
                    self._update_kube_events(instance, pods_list, events)
            except Exception as ex:
                self.log.error("Event collection failed: %s" % str(ex))

    def _publish_raw_metrics(self, metric, dat, tags, depth=0):
        if depth >= self.max_depth:
            self.log.warning('Reached max depth on metric=%s' % metric)
            return

        if isinstance(dat, numbers.Number):
            if self.enabled_rates and any([fnmatch(metric, pat) for pat in self.enabled_rates]):
                self.publish_rate(self, metric, float(dat), tags)
            elif self.enabled_gauges and any([fnmatch(metric, pat) for pat in self.enabled_gauges]):
                self.publish_gauge(self, metric, float(dat), tags)

        elif isinstance(dat, dict):
            for k, v in dat.iteritems():
                self._publish_raw_metrics(metric + '.%s' % k.lower(), v, tags, depth + 1)

        elif isinstance(dat, list):
            self._publish_raw_metrics(metric, dat[-1], tags, depth + 1)

    @staticmethod
    def _shorten_name(name):
        # shorten docker image id
        return re.sub('([0-9a-fA-F]{64,})', lambda x: x.group(1)[0:12], name)

    def _get_post_1_2_tags(self, cont_labels, subcontainer, kube_labels):
        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        pod_namespace = cont_labels[KubeUtil.NAMESPACE_LABEL]
        tags.append(u"pod_name:{0}/{1}".format(pod_namespace, pod_name))
        tags.append(u"kube_namespace:{0}".format(pod_namespace))

        kube_labels_key = "{0}/{1}".format(pod_namespace, pod_name)

        pod_labels = kube_labels.get(kube_labels_key)
        if pod_labels:
            tags += list(pod_labels)

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            tags.append("kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append('container_alias:%s' % (self._shorten_name(alias)))

        return tags

    def _get_pre_1_2_tags(self, cont_labels, subcontainer, kube_labels):

        tags = []

        pod_name = cont_labels[KubeUtil.POD_NAME_LABEL]
        tags.append(u"pod_name:{0}".format(pod_name))

        pod_labels = kube_labels.get(pod_name)
        if pod_labels:
            tags.extend(list(pod_labels))

        if "-" in pod_name:
            replication_controller = "-".join(pod_name.split("-")[:-1])
            if "/" in replication_controller:
                namespace, replication_controller = replication_controller.split("/", 1)
                tags.append(u"kube_namespace:%s" % namespace)

            tags.append(u"kube_replication_controller:%s" % replication_controller)

        if self.publish_aliases and subcontainer.get("aliases"):
            for alias in subcontainer['aliases'][1:]:
                # we don't add the first alias as it will be the container_name
                tags.append(u"container_alias:%s" % (self._shorten_name(alias)))

        return tags

    def _update_container_metrics(self, instance, subcontainer, kube_labels):
        """Publish metrics for a subcontainer and handle filtering on tags"""
        tags = list(instance.get('tags', []))  # add support for custom tags

        if len(subcontainer.get('aliases', [])) >= 1:
            # The first alias seems to always match the docker container name
            container_name = subcontainer['aliases'][0]
        else:
            # We default to the container id
            container_name = subcontainer['name']

        tags.append('container_name:%s' % container_name)

        container_image = subcontainer['spec'].get('image')
        if container_image:
            tags.append('container_image:%s' % container_image)

            split = container_image.split(":")
            if len(split) > 2:
                # if the repo is in the image name and has the form 'docker.clearbit:5000'
                # the split will be like [repo_url, repo_port/image_name, image_tag]. Let's avoid that
                split = [':'.join(split[:-1]), split[-1]]

            tags.append('image_name:%s' % split[0])
            if len(split) == 2:
                tags.append('image_tag:%s' % split[1])

        try:
            cont_labels = subcontainer['spec']['labels']
        except KeyError:
            self.log.debug("Subcontainer, doesn't have any labels")
            cont_labels = {}

        # Collect pod names, namespaces, rc...
        if KubeUtil.NAMESPACE_LABEL in cont_labels and KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes >= 1.2
            tags += self._get_post_1_2_tags(cont_labels, subcontainer, kube_labels)

        elif KubeUtil.POD_NAME_LABEL in cont_labels:
            # Kubernetes <= 1.1
            tags += self._get_pre_1_2_tags(cont_labels, subcontainer, kube_labels)

        else:
            # Those are containers that are not part of a pod.
            # They are top aggregate views and don't have the previous metadata.
            tags.append("pod_name:no_pod")

        # if the container should be filtered we return its tags without publishing its metrics
        is_filtered = self.kubeutil.are_tags_filtered(tags)
        if is_filtered:
            self._filtered_containers.add(subcontainer['id'])
            return tags

        stats = subcontainer['stats'][-1]  # take the latest
        self._publish_raw_metrics(NAMESPACE, stats, tags)

        if subcontainer.get("spec", {}).get("has_filesystem") and stats.get('filesystem', []) != []:
            fs = stats['filesystem'][-1]
            fs_utilization = float(fs['usage'])/float(fs['capacity'])
            self.publish_gauge(self, NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags)

        if subcontainer.get("spec", {}).get("has_network"):
            net = stats['network']
            self.publish_rate(self, NAMESPACE + '.network_errors',
                              sum(float(net[x]) for x in NET_ERRORS),
                              tags)

        return tags

    def _update_metrics(self, instance, pods_list):
        def parse_quantity(s):
            number = ''
            unit = ''
            for c in s:
                if c.isdigit() or c == '.':
                    number += c
                else:
                    unit += c
            return float(number) * FACTORS.get(unit, 1)

        metrics = self.kubeutil.retrieve_metrics()

        excluded_labels = instance.get('excluded_labels')
        kube_labels = self.kubeutil.extract_kube_pod_tags(pods_list, excluded_keys=excluded_labels)

        if not metrics:
            raise Exception('No metrics retrieved cmd=%s' % self.metrics_cmd)

        # container metrics from Cadvisor
        container_tags = {}
        for subcontainer in metrics:
            c_id = subcontainer.get('id')
            try:
                tags = self._update_container_metrics(instance, subcontainer, kube_labels)
                if c_id:
                    container_tags[c_id] = tags
                # also store tags for aliases
                for alias in subcontainer.get('aliases', []):
                    container_tags[alias] = tags
            except Exception as e:
                self.log.error("Unable to collect metrics for container: {0} ({1})".format(c_id, e))

        # container metrics from kubernetes API: limits and requests
        for pod in pods_list['items']:
            try:
                containers = pod['spec']['containers']
                name2id = {}
                for cs in pod['status'].get('containerStatuses', []):
                    c_id = cs.get('containerID', '').split('//')[-1]
                    name = cs.get('name')
                    if name:
                        name2id[name] = c_id
            except KeyError:
                self.log.debug("Pod %s does not have containers specs, skipping...", pod['metadata'].get('name'))
                continue

            for container in containers:
                c_name = container.get('name')
                c_id = name2id.get(c_name)

                if c_id in self._filtered_containers:
                    self.log.debug('Container {} is excluded'.format(c_name))
                    continue

                _tags = container_tags.get(c_id, [])

                # limits
                try:
                    for limit, value_str in container['resources']['limits'].iteritems():
                        values = [parse_quantity(s) for s in QUANTITY_EXP.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing limits value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.limits'.format(NAMESPACE, limit), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug("Unable to retrieve container limits for %s: %s", c_name, e)

                # requests
                try:
                    for request, value_str in container['resources']['requests'].iteritems():
                        values = [parse_quantity(s) for s in QUANTITY_EXP.findall(value_str)]
                        if len(values) != 1:
                            self.log.warning("Error parsing requests value string: %s", value_str)
                            continue
                        self.publish_gauge(self, '{}.{}.requests'.format(NAMESPACE, request), values[0], _tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug("Unable to retrieve container requests for %s: %s", c_name, e)

        self._update_pods_metrics(instance, pods_list)
        self._update_node(instance)

    def _update_node(self, instance):
        machine_info = self.kubeutil.retrieve_machine_info()
        num_cores = machine_info.get('num_cores', 0)
        memory_capacity = machine_info.get('memory_capacity', 0)

        tags = instance.get('tags', [])
        self.publish_gauge(self, NAMESPACE + '.cpu.capacity', float(num_cores), tags)
        self.publish_gauge(self, NAMESPACE + '.memory.capacity', float(memory_capacity), tags)
        # TODO(markine): Report 'allocatable' which is capacity minus capacity
        # reserved for system/Kubernetes.

    def _update_pods_metrics(self, instance, pods):
        """
        Reports the number of running pods, tagged by service and creator

        We go though all the pods, extract tags then count them by tag list, sorted and
        serialized in a pipe-separated string (it is an illegar character for tags)
        """
        tags_map = defaultdict(int)
        for pod in pods['items']:
            pod_meta = pod.get('metadata', {})
            pod_tags = self.kubeutil.get_pod_creator_tags(pod_meta, legacy_rep_controller_tag=True)
            services = self.kubeutil.match_services_for_pod(pod_meta)
            if isinstance(services, list):
                for service in services:
                    pod_tags.append('kube_service:%s' % service)
            if 'namespace' in pod_meta:
                pod_tags.append('kube_namespace:%s' % pod_meta['namespace'])

            tags_map[frozenset(pod_tags)] += 1

        commmon_tags = instance.get('tags', [])
        for pod_tags, pod_count in tags_map.iteritems():
            tags = list(pod_tags)
            tags.extend(commmon_tags)
            self.publish_gauge(self, NAMESPACE + '.pods.running', pod_count, tags)

    def _update_kube_events(self, instance, pods_list, event_items):
        """
        Process kube events and send ddog events
        The namespace filtering is done here instead of KubeEventRetriever
        to avoid interfering with service discovery
        """
        node_ip, node_name = self.kubeutil.get_node_info()
        self.log.debug('Processing events on {} [{}]'.format(node_name, node_ip))

        k8s_namespaces = instance.get('namespaces', DEFAULT_NAMESPACES)
        if not isinstance(k8s_namespaces, list):
            self.log.warning('Configuration key "namespaces" is not a list: fallback to the default value')
            k8s_namespaces = DEFAULT_NAMESPACES

        # handle old config value
        if 'namespace' in instance and instance.get('namespace') not in (None, 'default'):
            self.log.warning('''The 'namespace' parameter is deprecated and will stop being supported starting '''
                             '''from 5.13. Please use 'namespaces' and/or 'namespace_name_regexp' instead.''')
            k8s_namespaces.append(instance.get('namespace'))

        if self.k8s_namespace_regexp:
            namespaces_endpoint = '{}/namespaces'.format(self.kubeutil.kubernetes_api_url)
            self.log.debug('Kubernetes API endpoint to query namespaces: %s' % namespaces_endpoint)

            namespaces = self.kubeutil.retrieve_json_auth(namespaces_endpoint)
            for namespace in namespaces.get('items', []):
                name = namespace.get('metadata', {}).get('name', None)
                if name and self.k8s_namespace_regexp.match(name):
                    k8s_namespaces.append(name)

        k8s_namespaces = set(k8s_namespaces)

        for event in event_items:
            event_ts = calendar.timegm(time.strptime(event.get('lastTimestamp'), '%Y-%m-%dT%H:%M:%SZ'))
            involved_obj = event.get('involvedObject', {})

            # filter events by white listed namespaces (empty namespace belong to the 'default' one)
            if involved_obj.get('namespace', 'default') not in k8s_namespaces:
                continue

            tags = self.kubeutil.extract_event_tags(event)
            tags.extend(instance.get('tags', []))

            title = '{} {} on {}'.format(involved_obj.get('name'), event.get('reason'), node_name)
            message = event.get('message')
            source = event.get('source')
            if source:
                message += '\nSource: {} {}\n'.format(source.get('component', ''), source.get('host', ''))
            msg_body = "%%%\n{}\n```\n{}\n```\n%%%".format(title, message)
            dd_event = {
                'timestamp': event_ts,
                'host': node_ip,
                'event_type': EVENT_TYPE,
                'msg_title': title,
                'msg_text': msg_body,
                'source_type_name': EVENT_TYPE,
                'event_object': 'kubernetes:{}'.format(involved_obj.get('name')),
                'tags': tags,
            }
            self.event(dd_event)