Example #1
class TestKubeutil(unittest.TestCase):
    def setUp(self):
        self.kubeutil = KubeUtil()

    @mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list',
                side_effect=['foo'])
    @mock.patch('utils.kubeutil.KubeUtil.extract_kube_labels')
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        self.kubeutil.get_kube_labels(excluded_keys='bar')
        retrieve_pods_list.assert_called_once()
        extract_kube_labels.assert_called_once_with('foo', excluded_keys='bar')

    def test_extract_kube_labels(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_kube_labels({}, ['foo'])
        self.assertEqual(len(res), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 8)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 6)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)

    def test_extract_meta(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_meta({}, 'foo')
        self.assertEqual(len(res), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 6)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 4)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_pods_list(self, retrieve_json):
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_metrics(self, retrieve_json):
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.filter_pods_list({}, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.9')
        self.assertEqual(len(res.get('items')), 5)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.5')
        self.assertEqual(len(res.get('items')), 1)

        pods = json.loads(
            Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

    @mock.patch('utils.kubeutil.requests')
    def test_retrieve_json_auth(self, r):
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_once_with(
            'url',
            verify=False,
            timeout=10,
            headers={'Authorization': 'Bearer foo_tok'})

        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_with('url',
                                 verify=__file__,
                                 timeout=10,
                                 headers={'Authorization': 'Bearer foo_tok'})

    def test_get_node_info(self):
        with mock.patch('utils.kubeutil.KubeUtil._fetch_host_data') as f:
            self.kubeutil.get_node_info()
            f.assert_called_once()

            f.reset_mock()

            self.kubeutil._node_ip = 'foo'
            self.kubeutil._node_name = 'bar'
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, 'foo')
            self.assertEqual(name, 'bar')
            f.assert_not_called()

    def test__fetch_host_data(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        with mock.patch(
                'utils.kubeutil.KubeUtil.retrieve_pods_list') as mock_pods:
            self.kubeutil.host_name = 'dd-agent-1rxlh'
            mock_pods.return_value = json.loads(
                Fixtures.read_file("pods_list_1.2.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name,
                             'kubernetes-massi-minion-k23m')

            self.kubeutil.host_name = 'heapster-v11-l8sh1'
            mock_pods.return_value = json.loads(
                Fixtures.read_file("pods_list_1.1.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name,
                             'gke-cluster-1-8046fdfa-node-ld35')

    def test_get_auth_token(self):
        KubeUtil.AUTH_TOKEN_PATH = '/foo/bar'
        self.assertIsNone(KubeUtil.get_auth_token())
        KubeUtil.AUTH_TOKEN_PATH = Fixtures.file(
            'events.json')  # any file could do the trick
        self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        os.unsetenv('KUBERNETES_PORT')
        self.assertFalse(Platform.is_k8s())
        os.environ['KUBERNETES_PORT'] = '999'
        self.assertTrue(Platform.is_k8s())

    def test_extract_event_tags(self):
        events = json.loads(
            Fixtures.read_file("events.json", string_escape=False))['items']
        for ev in events:
            tags = KubeUtil().extract_event_tags(ev)
            # there should be 4 tags except for some events where source.host is missing
            self.assertTrue(len(tags) >= 3)

            tag_names = [tag.split(':')[0] for tag in tags]
            self.assertIn('reason', tag_names)
            self.assertIn('namespace', tag_names)
            self.assertIn('object_type', tag_names)
            if len(tags) == 4:
                self.assertIn('node_name', tag_names)
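A minimal, self-contained sketch of the mock.patch stacking that test_get_kube_labels above relies on: decorators apply bottom-up, so the mock for the decorator nearest the function arrives first in the argument list, and side_effect=['foo'] makes the patched method return 'foo' on its first call. The Kube class below is a made-up stand-in, not part of the dd-agent code.

import mock

class Kube(object):
    def pods(self):
        return {}

    def labels(self, pods):
        return {}

    def get(self):
        return self.labels(self.pods())

@mock.patch.object(Kube, 'pods', side_effect=['fake_pods'])
@mock.patch.object(Kube, 'labels')
def demo(labels, pods):  # bottom decorator's mock first, top decorator's last
    Kube().get()
    pods.assert_called_once_with()                # side_effect yields 'fake_pods'
    labels.assert_called_once_with('fake_pods')   # return value flows downstream

demo()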
Example #2
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception("Docker check only supports one configured instance.")
        AgentCheck.__init__(self, name, init_config,
                            agentConfig, instances=instances)

        self.init_success = False
        self._service_discovery = agentConfig.get('service_discovery') and \
            agentConfig.get('service_discovery_backend') == 'docker'
        self.init()

    def init(self):
        try:
            instance = self.instances[0]

            self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            if Platform.is_k8s():
                self.kubeutil = KubeUtil()
            # Configure the check with the right cgroup settings for this host.
            # This only needs to be done once.
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning("You must specify an exclude section to enable filtering")
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True

    def check(self, instance):
        """Run the Docker check for one instance."""
        if not self.init_success:
            # Initialization can fail if cgroups are not ready. So we retry if needed
            # https://github.com/DataDog/dd-agent/issues/1896
            self.init()
            if not self.init_success:
                # Initialization failed, will try later
                return

        # Report image metrics
        if self.collect_image_stats:
            self._count_and_weigh_images()

        if self.collect_ecs_tags:
            self.refresh_ecs_tags()

        if Platform.is_k8s():
            try:
                self.kube_labels = self.kubeutil.get_kube_labels()
            except Exception as e:
                self.log.warning('Could not retrieve kubernetes labels: %s' % str(e))
                self.kube_labels = {}

        # containers running with custom cgroups?
        custom_cgroups = _is_affirmative(instance.get('custom_cgroups', False))

        # Get the list of containers and the index of their names
        containers_by_id = self._get_and_count_containers(custom_cgroups)
        containers_by_id = self._crawl_container_pids(containers_by_id, custom_cgroups)

        # Send events from Docker API
        if self.collect_events or self._service_discovery:
            self._process_events(containers_by_id)

        # Report performance container metrics (cpu, mem, net, io)
        self._report_performance_metrics(containers_by_id)

        if self.collect_container_size:
            self._report_container_size(containers_by_id)

        # Collect disk stats from Docker info command
        if self.collect_disk_stats:
            self._report_disk_stats()

    def _count_and_weigh_images(self):
        try:
            tags = self._get_tags()
            active_images = self.docker_client.images(all=False)
            active_images_len = len(active_images)
            all_images_len = len(self.docker_client.images(quiet=True, all=True))
            self.gauge("docker.images.available", active_images_len, tags=tags)
            self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags)

            if self.collect_image_size:
                self._report_image_size(active_images)

        except Exception as e:
            # It's not an important metric, keep going if it fails
            self.warning("Failed to count Docker images. Exception: {0}".format(e))

    def _get_and_count_containers(self, custom_cgroups=False):
        """List all the containers from the API, filter and count them."""

        # Querying the size of containers is slow, so we don't do it on every run
        must_query_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE

        running_containers_count = Counter()
        all_containers_count = Counter()

        try:
            containers = self.docker_client.containers(all=True, size=must_query_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               message=message)
            raise Exception(message)

        else:
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

        # Filter containers according to the exclude/include rules
        self._filter_containers(containers)

        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            container_status_tags = self._get_tags(container, CONTAINER)

            all_containers_count[tuple(sorted(container_status_tags))] += 1
            if self._is_container_running(container):
                running_containers_count[tuple(sorted(container_status_tags))] += 1

            # Check if the container is included/excluded via its tags
            if self._is_container_excluded(container):
                self.log.debug("Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            # Grab the pid via the API when custom cgroups are used - otherwise
            # we won't find the process when crawling for pids.
            if custom_cgroups:
                try:
                    inspect_dict = self.docker_client.inspect_container(container_name)
                    container['_pid'] = inspect_dict['State']['Pid']
                except Exception as e:
                    self.log.debug("Unable to inspect Docker container: %s", e)

        for tags, count in running_containers_count.iteritems():
            self.gauge("docker.containers.running", count, tags=list(tags))

        for tags, count in all_containers_count.iteritems():
            stopped_count = count - running_containers_count[tags]
            self.gauge("docker.containers.stopped", stopped_count, tags=list(tags))

        return containers_by_id

    def _is_container_running(self, container):
        """Tell if a container is running, according to its status.

        There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated.
        See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35
        """
        return container["Status"].startswith("Up") or container["Status"].startswith("Restarting")

    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names."""
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if Platform.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)

        if entity is not None:
            pod_name = None

            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k in labels:
                        v = labels[k]
                        if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s():
                            pod_name = v
                            k = "pod_name"
                            if "-" in pod_name:
                                replication_controller = "-".join(pod_name.split("-")[:-1])
                                if "/" in replication_controller: # k8s <= 1.1
                                    namespace, replication_controller = replication_controller.split("/", 1)

                                elif KubeUtil.NAMESPACE_LABEL in labels: # k8s >= 1.2
                                    namespace = labels[KubeUtil.NAMESPACE_LABEL]
                                    pod_name = "{0}/{1}".format(namespace, pod_name)

                                tags.append("kube_namespace:%s" % namespace)
                                tags.append("kube_replication_controller:%s" % replication_controller)
                                tags.append("pod_name:%s" % pod_name)

                        elif not v:
                            tags.append(k)

                        else:
                            tags.append("%s:%s" % (k,v))

                    if k == KubeUtil.POD_NAME_LABEL and Platform.is_k8s() and k not in labels:
                        tags.append("pod_name:no_pod")

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add ECS tags
            if self.collect_ecs_tags:
                entity_id = entity.get("Id")
                if entity_id in self.ecs_tags:
                    ecs_tags = self.ecs_tags[entity_id]
                    tags.extend(ecs_tags)

            # Add kube labels
            if Platform.is_k8s():
                kube_tags = self.kube_labels.get(pod_name)
                if kube_tags:
                    tags.extend(list(kube_tags))

        return tags

    def _extract_tag_value(self, entity, tag_name):
        """Extra tag information from the API result (containers or images).
        Cache extracted tags inside the entity object.
        """
        if tag_name not in TAG_EXTRACTORS:
            self.warning("{0} isn't a supported tag".format(tag_name))
            return

        # Check for already extracted tags
        if "_tag_values" not in entity:
            entity["_tag_values"] = {}

        if tag_name not in entity["_tag_values"]:
            entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity)

        return entity["_tag_values"][tag_name]

    def refresh_ecs_tags(self):
        ecs_config = self.docker_client.inspect_container('ecs-agent')
        ip = ecs_config.get('NetworkSettings', {}).get('IPAddress')
        ports = ecs_config.get('NetworkSettings', {}).get('Ports')
        port = ports.keys()[0].split('/')[0] if ports else None
        if not ip:
            port = ECS_INTROSPECT_DEFAULT_PORT
            if Platform.is_containerized() and self.docker_gateway:
                ip = self.docker_gateway
            else:
                ip = "localhost"

        ecs_tags = {}
        try:
            if ip and port:
                tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json()
                for task in tasks.get('Tasks', []):
                    for container in task.get('Containers', []):
                        tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']]
                        ecs_tags[container['DockerId']] = tags
        except requests.exceptions.HTTPError as e:
            self.log.warning("Unable to collect ECS task names: %s" % e)

        self.ecs_tags = ecs_tags

    def _filter_containers(self, containers):
        if not self._filtering_enabled:
            return

        self._filtered_containers = set()
        for container in containers:
            container_tags = self._get_tags(container, FILTERED)
            if self._are_tags_filtered(container_tags):
                container_name = DockerUtil.container_name_extractor(container)[0]
                self._filtered_containers.add(container_name)
                self.log.debug("Container {0} is filtered".format(container_name))

    def _are_tags_filtered(self, tags):
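        # A container is filtered out when its tags match an exclude pattern
        # and no include pattern matches them back.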
        if self._tags_match_patterns(tags, self._exclude_patterns):
            if self._tags_match_patterns(tags, self._include_patterns):
                return False
            return True
        return False

    def _tags_match_patterns(self, tags, filters):
        for rule in filters:
            for tag in tags:
                if re.match(rule, tag):
                    return True
        return False

    def _is_container_excluded(self, container):
        """Check if a container is excluded according to the filter rules.

        Requires _filter_containers to run first.
        """
        container_name = DockerUtil.container_name_extractor(container)[0]
        return container_name in self._filtered_containers

    def _report_container_size(self, containers_by_id):
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            m_func = FUNC_MAP[GAUGE][self.use_histogram]
            if "SizeRw" in container:

                m_func(self, 'docker.container.size_rw', container['SizeRw'],
                       tags=tags)
            if "SizeRootFs" in container:
                m_func(
                    self, 'docker.container.size_rootfs', container['SizeRootFs'],
                    tags=tags)

    def _report_image_size(self, images):
        for image in images:
            tags = self._get_tags(image, IMAGE)
            if 'VirtualSize' in image:
                self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags)
            if 'Size' in image:
                self.gauge('docker.image.size', image['Size'], tags=tags)

    # Performance metrics

    def _report_performance_metrics(self, containers_by_id):
        containers_without_proc_root = []
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container) or not self._is_container_running(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            self._report_cgroup_metrics(container, tags)
            if "_proc_root" not in container:
                containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0])
                continue
            self._report_net_metrics(container, tags)

        if containers_without_proc_root:
            message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
                ", ".join(containers_without_proc_root))
            if not Platform.is_k8s():
                self.warning(message)
            else:
                # On kubernetes, this is kind of expected. Network metrics will be collected by the kubernetes integration anyway
                self.log.debug(message)

    def _report_cgroup_metrics(self, container, tags):
        try:
            for cgroup in CGROUP_METRICS:
                stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file'])
                stats = self._parse_cgroup_file(stat_file)
                if stats:
                    for key, (dd_key, metric_func) in cgroup['metrics'].iteritems():
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if key in stats:
                            metric_func(self, dd_key, int(stats[key]), tags=tags)

                    # Computed metrics
                    for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems():
                        values = [stats[key] for key in key_list if key in stats]
                        if len(values) != len(key_list):
                            self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname))
                            continue
                        value = fct(*values)
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if value is not None:
                            metric_func(self, mname, value, tags=tags)

        except MountException as ex:
            if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES:
                raise ex
            else:
                self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now."
                             "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries))
                self.cgroup_listing_retries += 1
        else:
            self.cgroup_listing_retries = 0

    def _report_net_metrics(self, container, tags):
        """Find container network metrics by looking at /proc/$PID/net/dev of the container process."""
        if self._disable_net_metrics:
            self.log.debug("Network metrics are disabled. Skipping")
            return

        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')
        try:
            with open(proc_net_file, 'r') as fp:
                lines = fp.readlines()
                """Two first lines are headers:
                Inter-|   Receive                                                |  Transmit
                 face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
                """
                for l in lines[2:]:
                    cols = l.split(':', 1)
                    interface_name = str(cols[0]).strip()
                    if interface_name == 'eth0':
                        x = cols[1].split()
                        m_func = FUNC_MAP[RATE][self.use_histogram]
                        m_func(self, "docker.net.bytes_rcvd", long(x[0]), tags)
                        m_func(self, "docker.net.bytes_sent", long(x[8]), tags)
                        break
        except Exception as e:
            # It is possible that the container got stopped between the API call and now
            self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e))

    def _process_events(self, containers_by_id):
        if self.collect_events is False:
            # Crawl events for service discovery only
            self._get_events()
            return
        try:
            api_events = self._get_events()
            aggregated_events = self._pre_aggregate_events(api_events, containers_by_id)
            events = self._format_events(aggregated_events, containers_by_id)
        except (socket.timeout, urllib2.URLError):
            self.warning('Timeout when collecting events. Events will be missing.')
            return
        except Exception as e:
            self.warning("Unexpected exception when collecting events: {0}. "
                         "Events will be missing".format(e))
            return

        for ev in events:
            self.log.debug("Creating event: %s" % ev['msg_title'])
            self.event(ev)

    def _get_events(self):
        """Get the list of events."""
        events, changed_container_ids = self.docker_util.get_events()
        if changed_container_ids and self._service_discovery:
            get_sd_backend(self.agentConfig).update_checks(changed_container_ids)
        return events

    def _pre_aggregate_events(self, api_events, containers_by_id):
        # Aggregate events, one per image. Put newer events first.
        events = defaultdict(deque)
        for event in api_events:
            # Skip events related to filtered containers
            container = containers_by_id.get(event.get('id'))
            if container is not None and self._is_container_excluded(container):
                self.log.debug("Excluded event: container {0} status changed to {1}".format(
                    event['id'], event['status']))
                continue
            # the 'from' field may be missing (e.g. for network events)
            if 'from' in event:
                events[event['from']].appendleft(event)
        return events

    def _format_events(self, aggregated_events, containers_by_id):
        events = []
        for image_name, event_group in aggregated_events.iteritems():
            container_tags = set()
            low_prio_events = []
            normal_prio_events = []

            for event in event_group:
                container_name = event['id'][:11]

                if event['id'] in containers_by_id:
                    cont = containers_by_id[event['id']]
                    container_name = DockerUtil.container_name_extractor(cont)[0]
                    container_tags.update(self._get_tags(cont, PERFORMANCE))
                    container_tags.add('container_name:%s' % container_name)

                # health checks generate tons of these so we treat them separately and lower their priority
                if event['status'].startswith('exec_create:') or event['status'].startswith('exec_start:'):
                    low_prio_events.append((event, container_name))
                else:
                    normal_prio_events.append((event, container_name))

            exec_event = self._create_dd_event(low_prio_events, image_name, container_tags, priority='Low')
            events.append(exec_event)

            normal_event = self._create_dd_event(normal_prio_events, image_name, container_tags, priority='Normal')
            events.append(normal_event)

        return events

    def _create_dd_event(self, events, image, c_tags, priority='Normal'):
        """Create the actual event to submit from a list of similar docker events"""
        max_timestamp = 0
        status = defaultdict(int)
        status_change = []

        for ev, c_name in events:
            max_timestamp = max(max_timestamp, int(ev['time']))
            status[ev['status']] += 1
            status_change.append([c_name, ev['status']])

        status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()])
        msg_title = "%s %s on %s" % (image, status_text, self.hostname)
        msg_body = (
            "%%%\n"
            "{image_name} {status} on {hostname}\n"
            "```\n{status_changes}\n```\n"
            "%%%"
        ).format(
            image_name=image,
            status=status_text,
            hostname=self.hostname,
            status_changes="\n".join(
                ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change])
        )

        if any(error in status_text for error in ERROR_ALERT_TYPE):
            alert_type = "error"
        else:
            alert_type = None

        return {
            'timestamp': max_timestamp,
            'host': self.hostname,
            'event_type': EVENT_TYPE,
            'msg_title': msg_title,
            'msg_text': msg_body,
            'source_type_name': EVENT_TYPE,
            'event_object': 'docker:%s' % image,
            'tags': list(c_tags),
            'alert_type': alert_type,
            'priority': priority
        }


    def _report_disk_stats(self):
        """Report metrics about the volume space usage"""
        stats = {
            'docker.data.used': None,
            'docker.data.total': None,
            'docker.data.free': None,
            'docker.metadata.used': None,
            'docker.metadata.total': None,
            'docker.metadata.free': None
            # these two are calculated by _calc_percent_disk_stats
            # 'docker.data.percent': None,
            # 'docker.metadata.percent': None
        }
        info = self.docker_client.info()
        driver_status = info.get('DriverStatus', [])
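        # 'DriverStatus' is a list of [name, value] pairs from `docker info`;
        # the devicemapper driver, for instance, reports entries such as
        # ['Data Space Used', '1.5 GB'].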
        if not driver_status:
            self.log.warning('Disk metrics collection is enabled but docker info did not'
                             ' report any. Your storage driver might not support them, skipping.')
            return
        for metric in driver_status:
            # only consider metrics about disk space
            if len(metric) == 2 and 'Space' in metric[0]:
                # identify Data and Metadata metrics
                mtype = 'data'
                if 'Metadata' in metric[0]:
                    mtype = 'metadata'

                if 'Used' in metric[0]:
                    stats['docker.{0}.used'.format(mtype)] = metric[1]
                elif 'Space Total' in metric[0]:
                    stats['docker.{0}.total'.format(mtype)] = metric[1]
                elif 'Space Available' in metric[0]:
                    stats['docker.{0}.free'.format(mtype)] = metric[1]
        stats = self._format_disk_metrics(stats)
        stats.update(self._calc_percent_disk_stats(stats))
        tags = self._get_tags()
        for name, val in stats.iteritems():
            if val is not None:
                self.gauge(name, val, tags)

    def _format_disk_metrics(self, metrics):
        """Cast the disk stats to float and convert them to bytes"""
        for name, raw_val in metrics.iteritems():
            if raw_val:
                val, unit = raw_val.split(' ')
                # Some units are uppercase and others lowercase by default, which is error-prone.
                unit = unit.lower()
                try:
                    val = int(float(val) * UNIT_MAP[unit])
                    metrics[name] = val
                except KeyError:
                    self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' % (unit, name))
                    metrics[name] = None
        return metrics

    def _calc_percent_disk_stats(self, stats):
        """Calculate a percentage of used disk space for data and metadata"""
        mtypes = ['data', 'metadata']
        percs = {}
        for mtype in mtypes:
            used = stats.get('docker.{0}.used'.format(mtype))
            total = stats.get('docker.{0}.total'.format(mtype))
            free = stats.get('docker.{0}.free'.format(mtype))
            if used and total and free and ceil(total) < free + used:
                self.log.debug('used, free, and total disk metrics may be wrong, '
                               'used: %s, free: %s, total: %s',
                               used, free, total)
                total = used + free
            try:
                if isinstance(used, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2)
                elif isinstance(free, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2)
            except ZeroDivisionError:
                self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent'
                               ' is not possible.'.format(mtype, mtype))
        return percs

    # Cgroups
    def _get_cgroup_from_proc(self, cgroup, pid, filename):
        """Find a specific cgroup file, containing metrics to extract."""
        params = {
            "file": filename,
        }
        return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % (params)

    def _parse_cgroup_file(self, stat_file):
        """Parse a cgroup pseudo file for key/values."""
        self.log.debug("Opening cgroup file: %s" % stat_file)
        try:
            with open(stat_file, 'r') as fp:
                if 'blkio' in stat_file:
                    return self._parse_blkio_metrics(fp.read().splitlines())
                elif 'cpuacct.usage' in stat_file:
                    return dict({"usage": str(int(fp.read())/10000000)})
                else:
                    return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines()))
        except IOError:
            # It is possible that the container got stopped between the API call and now
            self.log.info("Can't open %s. Metrics for this container are skipped." % stat_file)

    def _parse_blkio_metrics(self, stats):
        """Parse the blkio metrics."""
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics

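    # Each line of /proc/<pid>/cgroup reads "hierarchy-id:subsystems:path", so
    # after the split(':') done in _crawl_container_pids, line[1] holds the
    # subsystem list and line[2] the cgroup path,
    # e.g. ['4', 'cpu,cpuacct', '/docker/<container-id>'].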
    def _is_container_cgroup(self, line, selinux_policy):
        if line[1] not in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') or line[2] == '/docker-daemon':
            return False
        if 'docker' in line[2]: # general case
            return True
        if 'docker' in selinux_policy: # selinux
            return True
        if line[2].startswith('/') and re.match(CONTAINER_ID_RE, line[2][1:]): # kubernetes
            return True
        return False

    # proc files
    def _crawl_container_pids(self, container_dict, custom_cgroups=False):
        """Crawl `/proc` to find container PIDs and add them to `containers_by_id`."""
        proc_path = os.path.join(self.docker_util._docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]

        if len(pid_dirs) == 0:
            self.warning("Unable to find any pid directory in {0}. "
                "If you are running the agent in a container, make sure to "
                'share the volume properly: "/proc:/host/proc:ro". '
                "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. "
                "Network metrics will be missing".format(proc_path))
            self._disable_net_metrics = True
            return container_dict

        self._disable_net_metrics = False

        for folder in pid_dirs:

            try:
                path = os.path.join(proc_path, folder, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]

                selinux_policy = ''
                path = os.path.join(proc_path, folder, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError as e:
                # Issue #2074
                self.log.debug("Cannot read %s, "
                               "process likely raced to finish: %s" %
                               (path, str(e)))
                continue
            except Exception as e:
                self.warning("Cannot read %s : %s" % (path, str(e)))
                continue

            try:
                for line in content:
                    if self._is_container_cgroup(line, selinux_policy):
                        cpuacct = line[2]
                        break
                else:
                    continue

                matches = re.findall(CONTAINER_ID_RE, cpuacct)
                if matches:
                    container_id = matches[-1]
                    if container_id not in container_dict:
                        self.log.debug("Container %s not in container_dict, it's likely excluded", container_id)
                        continue
                    container_dict[container_id]['_pid'] = folder
                    container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder)
                elif custom_cgroups: # if we match by pid that should be enough (?) - O(n) ugh!
                    for _, container in container_dict.iteritems():
                        if container.get('_pid') == int(folder):
                            container['_proc_root'] = os.path.join(proc_path, folder)
                            break

            except Exception as e:
                self.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue
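As background for _report_net_metrics above, here is a standalone sketch of the /proc/<pid>/net/dev parsing it performs: skip the two header lines, then for eth0 read the receive-bytes and transmit-bytes columns (indices 0 and 8 after the interface name). The sample rows are synthetic and abbreviated.

def read_eth0_counters(lines):
    # Skip the two header lines; each remaining row is
    # "iface: <8 receive columns> <8 transmit columns>".
    for line in lines[2:]:
        iface, _, data = line.partition(':')
        if iface.strip() == 'eth0':
            cols = data.split()
            return int(cols[0]), int(cols[8])  # bytes received, bytes sent
    return None

sample = [
    'Inter-|   Receive                        |  Transmit',
    ' face |bytes packets errs drop fifo frame compressed multicast|bytes packets errs drop fifo colls carrier compressed',
    '  eth0: 1296 16 0 0 0 0 0 0 816 10 0 0 0 0 0 0',
    '    lo: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0',
]
print(read_eth0_counters(sample))  # (1296, 816)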
Example #3
class TestKubeutil(unittest.TestCase):
    def setUp(self):
        self.kubeutil = KubeUtil()

    @mock.patch("utils.kubeutil.KubeUtil.retrieve_pods_list", side_effect=["foo"])
    @mock.patch("utils.kubeutil.KubeUtil.extract_kube_labels")
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        self.kubeutil.get_kube_labels(excluded_keys="bar")
        retrieve_pods_list.assert_called_once()
        extract_kube_labels.assert_called_once_with("foo", excluded_keys="bar")

    def test_extract_kube_labels(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_kube_labels({}, ["foo"])
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ["foo"])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 8)
        res = self.kubeutil.extract_kube_labels(pods, ["k8s-app"])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ["foo"])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)
        res = self.kubeutil.extract_kube_labels(pods, ["k8s-app"])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)

    def test_extract_meta(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_meta({}, "foo")
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, "foo")
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, "uid")
        self.assertEqual(len(res), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, "foo")
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, "uid")
        self.assertEqual(len(res), 4)

    @mock.patch("utils.kubeutil.retrieve_json")
    def test_retrieve_pods_list(self, retrieve_json):
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch("utils.kubeutil.retrieve_json")
    def test_retrieve_metrics(self, retrieve_json):
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.filter_pods_list({}, "foo")
        self.assertEqual(len(res.get("items")), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, "10.240.0.9")
        self.assertEqual(len(res.get("items")), 5)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, "foo")
        self.assertEqual(len(res.get("items")), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, "10.240.0.5")
        self.assertEqual(len(res.get("items")), 1)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, "foo")
        self.assertEqual(len(res.get("items")), 0)

    @mock.patch("utils.kubeutil.requests")
    def test_retrieve_json_auth(self, r):
        self.kubeutil.retrieve_json_auth("url", "foo_tok")
        r.get.assert_called_once_with("url", verify=False, timeout=10, headers={"Authorization": "Bearer foo_tok"})

        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth("url", "foo_tok")
        r.get.assert_called_with("url", verify=__file__, timeout=10, headers={"Authorization": "Bearer foo_tok"})

    def test_get_node_info(self):
        with mock.patch("utils.kubeutil.KubeUtil._fetch_host_data") as f:
            self.kubeutil.get_node_info()
            f.assert_called_once()

            f.reset_mock()

            self.kubeutil._node_ip = "foo"
            self.kubeutil._node_name = "bar"
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, "foo")
            self.assertEqual(name, "bar")
            f.assert_not_called()

    def test__fetch_host_data(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        with mock.patch("utils.kubeutil.KubeUtil.retrieve_pods_list") as mock_pods:
            self.kubeutil.host_name = "dd-agent-1rxlh"
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, "10.240.0.9")
            self.assertEqual(self.kubeutil._node_name, "kubernetes-massi-minion-k23m")

            self.kubeutil.host_name = "heapster-v11-l8sh1"
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, "10.240.0.9")
            self.assertEqual(self.kubeutil._node_name, "gke-cluster-1-8046fdfa-node-ld35")

    def test_get_auth_token(self):
        KubeUtil.AUTH_TOKEN_PATH = "/foo/bar"
        self.assertIsNone(KubeUtil.get_auth_token())
        KubeUtil.AUTH_TOKEN_PATH = Fixtures.file("events.json")  # any file could do the trick
        self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        os.unsetenv("KUBERNETES_PORT")
        self.assertFalse(is_k8s())
        os.environ["KUBERNETES_PORT"] = "999"
        self.assertTrue(is_k8s())
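Examples #1 and #3 differ mainly in where is_k8s() lives (Platform.is_k8s() versus a module-level helper), but the detection is the same environment-variable check, sketched below. One caveat relevant to test_is_k8s: in CPython, os.unsetenv() does not update the os.environ mapping, so deleting the key from os.environ directly is the more reliable way to clear it in a test.

import os

def is_k8s():
    # Kubernetes injects service variables such as KUBERNETES_PORT into
    # every pod's environment, which is what this heuristic keys on.
    return 'KUBERNETES_PORT' in os.environ

os.environ['KUBERNETES_PORT'] = 'tcp://10.0.0.1:443'
assert is_k8s()
del os.environ['KUBERNETES_PORT']  # also calls unsetenv() under the hood
assert not is_k8s()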
Example #4
class DockerDaemon(AgentCheck):
    """Collect metrics and events from Docker API and cgroups."""

    def __init__(self, name, init_config, agentConfig, instances=None):
        if instances is not None and len(instances) > 1:
            raise Exception("Docker check only supports one configured instance.")
        AgentCheck.__init__(self, name, init_config,
                            agentConfig, instances=instances)

        self.init_success = False
        self._service_discovery = agentConfig.get('service_discovery') and \
            agentConfig.get('service_discovery_backend') == 'docker'
        self.init()
        self._custom_cgroups = _is_affirmative(init_config.get('custom_cgroups', False))

    def is_k8s(self):
        return 'KUBERNETES_PORT' in os.environ

    def init(self):
        try:
            instance = self.instances[0]

            # If service discovery is enabled, dockerutil needs a reference to the config store
            if self._service_discovery:
                self.docker_util = DockerUtil(
                    agentConfig=self.agentConfig,
                    config_store=get_config_store(self.agentConfig)
                )
            else:
                self.docker_util = DockerUtil()
            self.docker_client = self.docker_util.client
            self.docker_gateway = DockerUtil.get_gateway()

            if self.is_k8s():
                self.kubeutil = KubeUtil()
            # Configure the check with the right cgroup settings for this host.
            # This only needs to be done once.
            self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
            self.cgroup_listing_retries = 0
            self._latest_size_query = 0
            self._filtered_containers = set()
            self._disable_net_metrics = False

            # Set tagging options
            self.custom_tags = instance.get("tags", [])
            self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
            self.kube_labels = {}

            self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
            performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

            self.tag_names = {
                CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
                PERFORMANCE: performance_tags,
                IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
            }

            # Set filtering settings
            if not instance.get("exclude"):
                self._filtering_enabled = False
                if instance.get("include"):
                    self.log.warning("You must specify an exclude section to enable filtering")
            else:
                self._filtering_enabled = True
                include = instance.get("include", [])
                exclude = instance.get("exclude", [])
                self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
                self.tag_names[FILTERED] = _filtered_tag_names

            # Other options
            self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
            self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
            self.collect_events = _is_affirmative(instance.get('collect_events', True))
            self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
            self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
            self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()

            self.ecs_tags = {}

        except Exception as e:
            self.log.critical(e)
            self.warning("Initialization failed. Will retry at next iteration")
        else:
            self.init_success = True

    def check(self, instance):
        """Run the Docker check for one instance."""
        if not self.init_success:
            # Initialization can fail if cgroups are not ready. So we retry if needed
            # https://github.com/DataDog/dd-agent/issues/1896
            self.init()
            if not self.init_success:
                # Initialization failed, will try later
                return

        # Report image metrics
        if self.collect_image_stats:
            self._count_and_weigh_images()

        if self.collect_ecs_tags:
            self.refresh_ecs_tags()

        if self.is_k8s():
            try:
                self.kube_labels = self.kubeutil.get_kube_labels()
            except Exception as e:
                self.log.warning('Could not retrieve kubernetes labels: %s' % str(e))
                self.kube_labels = {}

        # containers running with custom cgroups?
        custom_cgroups = _is_affirmative(instance.get('custom_cgroups', self._custom_cgroups))

        # Get the list of containers and the index of their names
        containers_by_id = self._get_and_count_containers(custom_cgroups)
        containers_by_id = self._crawl_container_pids(containers_by_id)

        # Send events from Docker API
        if self.collect_events or self._service_discovery:
            self._process_events(containers_by_id)

        # Report performance container metrics (cpu, mem, net, io)
        self._report_performance_metrics(containers_by_id)

        if self.collect_container_size:
            self._report_container_size(containers_by_id)

        # Collect disk stats from Docker info command
        if self.collect_disk_stats:
            self._report_disk_stats()

    def _count_and_weigh_images(self):
        try:
            tags = self._get_tags()
            active_images = self.docker_client.images(all=False)
            active_images_len = len(active_images)
            all_images_len = len(self.docker_client.images(quiet=True, all=True))
            self.gauge("docker.images.available", active_images_len, tags=tags)
            self.gauge("docker.images.intermediate", (all_images_len - active_images_len), tags=tags)

            if self.collect_image_size:
                self._report_image_size(active_images)

        except Exception as e:
            # It's not an important metric, keep going if it fails
            self.warning("Failed to count Docker images. Exception: {0}".format(e))

    def _get_and_count_containers(self, custom_cgroups=False):
        """List all the containers from the API, filter and count them."""

        # Querying the size of containers is slow, so we don't do it on every run
        must_query_size = self.collect_container_size and self._latest_size_query == 0
        self._latest_size_query = (self._latest_size_query + 1) % SIZE_REFRESH_RATE

        running_containers_count = Counter()
        all_containers_count = Counter()

        try:
            containers = self.docker_client.containers(all=True, size=must_query_size)
        except Exception as e:
            message = "Unable to list Docker containers: {0}".format(e)
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.CRITICAL,
                               message=message)
            raise Exception(message)

        else:
            self.service_check(SERVICE_CHECK_NAME, AgentCheck.OK)

        # Filter containers according to the exclude/include rules
        self._filter_containers(containers)

        containers_by_id = {}

        for container in containers:
            container_name = DockerUtil.container_name_extractor(container)[0]

            container_status_tags = self._get_tags(container, CONTAINER)

            all_containers_count[tuple(sorted(container_status_tags))] += 1
            if self._is_container_running(container):
                running_containers_count[tuple(sorted(container_status_tags))] += 1

            # Check if the container is included/excluded via its tags
            if self._is_container_excluded(container):
                self.log.debug("Container {0} is excluded".format(container_name))
                continue

            containers_by_id[container['Id']] = container

            # Grab the pid via the API when custom cgroups are used - otherwise
            # we won't find the process when crawling for pids.
            if custom_cgroups:
                try:
                    inspect_dict = self.docker_client.inspect_container(container_name)
                    container['_pid'] = inspect_dict['State']['Pid']
                except Exception as e:
                    self.log.debug("Unable to inspect Docker container: %s", e)

        for tags, count in running_containers_count.iteritems():
            self.gauge("docker.containers.running", count, tags=list(tags))

        for tags, count in all_containers_count.iteritems():
            stopped_count = count - running_containers_count[tags]
            self.gauge("docker.containers.stopped", stopped_count, tags=list(tags))

        return containers_by_id

    def _is_container_running(self, container):
        """Tell if a container is running, according to its status.

        There is no "nice" API field to figure it out. We just look at the "Status" field, knowing how it is generated.
        See: https://github.com/docker/docker/blob/v1.6.2/daemon/state.go#L35
        """
        return container["Status"].startswith("Up") or container["Status"].startswith("Restarting")

    def _get_tags(self, entity=None, tag_type=None):
        """Generate the tags for a given entity (container or image) according to a list of tag names."""
        # Start with custom tags
        tags = list(self.custom_tags)

        # Collect pod names as tags on kubernetes
        if self.is_k8s() and KubeUtil.POD_NAME_LABEL not in self.collect_labels_as_tags:
            self.collect_labels_as_tags.append(KubeUtil.POD_NAME_LABEL)

        if entity is not None:
            pod_name = None

            # Get labels as tags
            labels = entity.get("Labels")
            if labels is not None:
                for k in self.collect_labels_as_tags:
                    if k in labels:
                        v = labels[k]
                        if k == KubeUtil.POD_NAME_LABEL and self.is_k8s():
                            pod_name = v
                            k = "pod_name"
                            if "-" in pod_name:
                                replication_controller = "-".join(pod_name.split("-")[:-1])
                                if "/" in replication_controller: # k8s <= 1.1
                                    namespace, replication_controller = replication_controller.split("/", 1)

                                elif KubeUtil.NAMESPACE_LABEL in labels: # k8s >= 1.2
                                    namespace = labels[KubeUtil.NAMESPACE_LABEL]
                                    pod_name = "{0}/{1}".format(namespace, pod_name)

                                tags.append("kube_namespace:%s" % namespace)
                                tags.append("kube_replication_controller:%s" % replication_controller)
                                tags.append("pod_name:%s" % pod_name)

                        elif not v:
                            tags.append(k)

                        else:
                            tags.append("%s:%s" % (k,v))

                    if k == KubeUtil.POD_NAME_LABEL and self.is_k8s() and k not in labels:
                        tags.append("pod_name:no_pod")

            # Get entity specific tags
            if tag_type is not None:
                tag_names = self.tag_names[tag_type]
                for tag_name in tag_names:
                    tag_value = self._extract_tag_value(entity, tag_name)
                    if tag_value is not None:
                        for t in tag_value:
                            tags.append('%s:%s' % (tag_name, str(t).strip()))

            # Add ECS tags
            if self.collect_ecs_tags:
                entity_id = entity.get("Id")
                if entity_id in self.ecs_tags:
                    ecs_tags = self.ecs_tags[entity_id]
                    tags.extend(ecs_tags)

            # Add kube labels
            if self.is_k8s():
                kube_tags = self.kube_labels.get(pod_name)
                if kube_tags:
                    tags.extend(list(kube_tags))

        return tags

    def _extract_tag_value(self, entity, tag_name):
        """Extra tag information from the API result (containers or images).
        Cache extracted tags inside the entity object.
        """
        if tag_name not in TAG_EXTRACTORS:
            self.warning("{0} isn't a supported tag".format(tag_name))
            return

        # Check for already extracted tags
        if "_tag_values" not in entity:
            entity["_tag_values"] = {}

        if tag_name not in entity["_tag_values"]:
            entity["_tag_values"][tag_name] = TAG_EXTRACTORS[tag_name](entity)

        return entity["_tag_values"][tag_name]

    def refresh_ecs_tags(self):
        ecs_config = self.docker_client.inspect_container('ecs-agent')
        ip = ecs_config.get('NetworkSettings', {}).get('IPAddress')
        ports = ecs_config.get('NetworkSettings', {}).get('Ports')
        port = ports.keys()[0].split('/')[0] if ports else None
        if not ip:
            port = ECS_INTROSPECT_DEFAULT_PORT
            if DockerUtil.is_dockerized() and self.docker_gateway:
                ip = self.docker_gateway
            else:
                ip = "localhost"

        ecs_tags = {}
        try:
            if ip and port:
                tasks = requests.get('http://%s:%s/v1/tasks' % (ip, port)).json()
                for task in tasks.get('Tasks', []):
                    for container in task.get('Containers', []):
                        tags = ['task_name:%s' % task['Family'], 'task_version:%s' % task['Version']]
                        ecs_tags[container['DockerId']] = tags
        except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError) as e:
            self.log.warning("Unable to collect ECS task names: %s" % e)

        self.ecs_tags = ecs_tags
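
    # Sketch of the mapping above, given a minimal (hypothetical) /v1/tasks payload:
    #   {"Tasks": [{"Family": "web", "Version": "3",
    #               "Containers": [{"DockerId": "abc123"}]}]}
    # refresh_ecs_tags would then set
    #   self.ecs_tags == {'abc123': ['task_name:web', 'task_version:3']}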

    def _filter_containers(self, containers):
        if not self._filtering_enabled:
            return

        self._filtered_containers = set()
        for container in containers:
            container_tags = self._get_tags(container, FILTERED)
            if self._are_tags_filtered(container_tags):
                container_name = DockerUtil.container_name_extractor(container)[0]
                self._filtered_containers.add(container_name)
                self.log.debug("Container {0} is filtered".format(container_name))

    def _are_tags_filtered(self, tags):
        if self._tags_match_patterns(tags, self._exclude_patterns):
            if self._tags_match_patterns(tags, self._include_patterns):
                return False
            return True
        return False
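
    # Example with hypothetical patterns: exclude ['docker_image:redis.*'] plus
    # include ['docker_image:redis-important.*'] filters out every redis container
    # except the "important" ones; include rules only rescue containers that
    # already matched an exclude rule.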

    def _tags_match_patterns(self, tags, filters):
        for rule in filters:
            for tag in tags:
                if re.match(rule, tag):
                    return True
        return False

    def _is_container_excluded(self, container):
        """Check if a container is excluded according to the filter rules.

        Requires _filter_containers to run first.
        """
        container_name = DockerUtil.container_name_extractor(container)[0]
        return container_name in self._filtered_containers

    def _report_container_size(self, containers_by_id):
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            m_func = FUNC_MAP[GAUGE][self.use_histogram]
            if "SizeRw" in container:

                m_func(self, 'docker.container.size_rw', container['SizeRw'],
                       tags=tags)
            if "SizeRootFs" in container:
                m_func(
                    self, 'docker.container.size_rootfs', container['SizeRootFs'],
                    tags=tags)

    def _report_image_size(self, images):
        for image in images:
            tags = self._get_tags(image, IMAGE)
            if 'VirtualSize' in image:
                self.gauge('docker.image.virtual_size', image['VirtualSize'], tags=tags)
            if 'Size' in image:
                self.gauge('docker.image.size', image['Size'], tags=tags)

    # Performance metrics

    def _report_performance_metrics(self, containers_by_id):

        containers_without_proc_root = []
        for container in containers_by_id.itervalues():
            if self._is_container_excluded(container) or not self._is_container_running(container):
                continue

            tags = self._get_tags(container, PERFORMANCE)
            self._report_cgroup_metrics(container, tags)
            if "_proc_root" not in container:
                containers_without_proc_root.append(DockerUtil.container_name_extractor(container)[0])
                continue
            self._report_net_metrics(container, tags)

        if containers_without_proc_root:
            message = "Couldn't find pid directory for containers: {0}. They'll be missing network metrics".format(
                ", ".join(containers_without_proc_root))
            if not self.is_k8s():
                self.warning(message)
            else:
                # On Kubernetes this is expected; the kubernetes integration collects network metrics anyway.
                self.log.debug(message)

    def _report_cgroup_metrics(self, container, tags):
        try:
            for cgroup in CGROUP_METRICS:
                stat_file = self._get_cgroup_from_proc(cgroup["cgroup"], container['_pid'], cgroup['file'])
                stats = self._parse_cgroup_file(stat_file)
                if stats:
                    for key, (dd_key, metric_func) in cgroup['metrics'].iteritems():
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if key in stats:
                            metric_func(self, dd_key, int(stats[key]), tags=tags)

                    # Computed metrics
                    for mname, (key_list, fct, metric_func) in cgroup.get('to_compute', {}).iteritems():
                        values = [stats[key] for key in key_list if key in stats]
                        if len(values) != len(key_list):
                            self.log.debug("Couldn't compute {0}, some keys were missing.".format(mname))
                            continue
                        value = fct(*values)
                        metric_func = FUNC_MAP[metric_func][self.use_histogram]
                        if value is not None:
                            metric_func(self, mname, value, tags=tags)

        except MountException as ex:
            if self.cgroup_listing_retries > MAX_CGROUP_LISTING_RETRIES:
                raise ex
            else:
                self.warning("Couldn't find the cgroup files. Skipping the CGROUP_METRICS for now."
                             "Will retry {0} times before failing.".format(MAX_CGROUP_LISTING_RETRIES - self.cgroup_listing_retries))
                self.cgroup_listing_retries += 1
        else:
            self.cgroup_listing_retries = 0
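
    # Sketch of one 'to_compute' entry as consumed above (the real CGROUP_METRICS
    # table lives at the top of this file; this exact shape is an assumption):
    #   'docker.mem.in_use': (
    #       ['rss', 'hierarchical_memory_limit'],          # key_list read from stats
    #       lambda rss, limit: float(rss) / float(limit),  # fct combining the values
    #       GAUGE,                                         # metric type key for FUNC_MAP
    #   )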

    def _report_net_metrics(self, container, tags):
        """Find container network metrics by looking at /proc/$PID/net/dev of the container process."""
        if self._disable_net_metrics:
            self.log.debug("Network metrics are disabled. Skipping")
            return

        proc_net_file = os.path.join(container['_proc_root'], 'net/dev')
        try:
            with open(proc_net_file, 'r') as fp:
                lines = fp.readlines()
                """Two first lines are headers:
                Inter-|   Receive                                                |  Transmit
                 face |bytes    packets errs drop fifo frame compressed multicast|bytes    packets errs drop fifo colls carrier compressed
                """
                for l in lines[2:]:
                    cols = l.split(':', 1)
                    interface_name = str(cols[0]).strip()
                    if interface_name == 'eth0':
                        x = cols[1].split()
                        m_func = FUNC_MAP[RATE][self.use_histogram]
                        m_func(self, "docker.net.bytes_rcvd", long(x[0]), tags)
                        m_func(self, "docker.net.bytes_sent", long(x[8]), tags)
                        break
        except Exception as e:
            # It is possible that the container got stopped between the API call and now
            self.warning("Failed to report IO metrics from file {0}. Exception: {1}".format(proc_net_file, e))

    def _process_events(self, containers_by_id):
        if self.collect_events is False:
            # Crawl events for service discovery only
            self._get_events()
            return
        try:
            api_events = self._get_events()
            aggregated_events = self._pre_aggregate_events(api_events, containers_by_id)
            events = self._format_events(aggregated_events, containers_by_id)
        except (socket.timeout, urllib2.URLError):
            self.warning('Timeout when collecting events. Events will be missing.')
            return
        except Exception as e:
            self.warning("Unexpected exception when collecting events: {0}. "
                         "Events will be missing".format(e))
            return

        for ev in events:
            self.log.debug("Creating event: %s" % ev['msg_title'])
            self.event(ev)

    def _get_events(self):
        """Get the list of events."""
        events, conf_reload_set = self.docker_util.get_events()
        if conf_reload_set and self._service_discovery:
            get_sd_backend(self.agentConfig).reload_check_configs = conf_reload_set
        return events

    def _pre_aggregate_events(self, api_events, containers_by_id):
        # Aggregate events, one per image. Put newer events first.
        events = defaultdict(deque)
        for event in api_events:
            # Skip events related to filtered containers
            container = containers_by_id.get(event.get('id'))
            if container is not None and self._is_container_excluded(container):
                self.log.debug("Excluded event: container {0} status changed to {1}".format(
                    event['id'], event['status']))
                continue
            # 'from' may be missing (for network events, for example)
            if 'from' in event:
                events[event['from']].appendleft(event)
        return events
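
    # e.g. two (hypothetical) API events carrying 'from': 'redis:3' both land in
    # events['redis:3'], newest first thanks to appendleft().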

    def _format_events(self, aggregated_events, containers_by_id):
        events = []
        for image_name, event_group in aggregated_events.iteritems():
            max_timestamp = 0
            status = defaultdict(int)
            status_change = []
            container_tags = set()
            for event in event_group:
                max_timestamp = max(max_timestamp, int(event['time']))
                status[event['status']] += 1
                container_name = event['id'][:11]
                if event['id'] in containers_by_id:
                    cont = containers_by_id[event['id']]
                    container_name = DockerUtil.container_name_extractor(cont)[0]
                    container_tags.update(self._get_tags(cont, PERFORMANCE))
                    container_tags.add('container_name:%s' % container_name)

                status_change.append([container_name, event['status']])

            status_text = ", ".join(["%d %s" % (count, st) for st, count in status.iteritems()])
            msg_title = "%s %s on %s" % (image_name, status_text, self.hostname)
            msg_body = (
                "%%%\n"
                "{image_name} {status} on {hostname}\n"
                "```\n{status_changes}\n```\n"
                "%%%"
            ).format(
                image_name=image_name,
                status=status_text,
                hostname=self.hostname,
                status_changes="\n".join(
                    ["%s \t%s" % (change[1].upper(), change[0]) for change in status_change])
            )
            events.append({
                'timestamp': max_timestamp,
                'host': self.hostname,
                'event_type': EVENT_TYPE,
                'msg_title': msg_title,
                'msg_text': msg_body,
                'source_type_name': EVENT_TYPE,
                'event_object': 'docker:%s' % image_name,
                'tags': list(container_tags)
            })

        return events

    def _report_disk_stats(self):
        """Report metrics about the volume space usage"""
        stats = {
            'docker.data.used': None,
            'docker.data.total': None,
            'docker.data.free': None,
            'docker.metadata.used': None,
            'docker.metadata.total': None,
            'docker.metadata.free': None
            # these two are calculated by _calc_percent_disk_stats
            # 'docker.data.percent': None,
            # 'docker.metadata.percent': None
        }
        info = self.docker_client.info()
        driver_status = info.get('DriverStatus', [])
        if not driver_status:
            self.log.warning('Disk metrics collection is enabled but docker info did not'
                             ' report any disk metrics. Your storage driver may not support them; skipping.')
            return
        for metric in driver_status:
            # only consider metrics about disk space
            if len(metric) == 2 and 'Space' in metric[0]:
                # identify Data and Metadata metrics
                mtype = 'data'
                if 'Metadata' in metric[0]:
                    mtype = 'metadata'

                if 'Used' in metric[0]:
                    stats['docker.{0}.used'.format(mtype)] = metric[1]
                elif 'Space Total' in metric[0]:
                    stats['docker.{0}.total'.format(mtype)] = metric[1]
                elif 'Space Available' in metric[0]:
                    stats['docker.{0}.free'.format(mtype)] = metric[1]
        stats = self._format_disk_metrics(stats)
        stats.update(self._calc_percent_disk_stats(stats))
        tags = self._get_tags()
        for name, val in stats.iteritems():
            if val is not None:
                self.gauge(name, val, tags)

    def _format_disk_metrics(self, metrics):
        """Cast the disk stats to float and convert them to bytes"""
        for name, raw_val in metrics.iteritems():
            if raw_val:
                val, unit = raw_val.split(' ')
                # Units are reported with inconsistent casing, so normalize to lowercase.
                unit = unit.lower()
                try:
                    val = int(float(val) * UNIT_MAP[unit])
                    metrics[name] = val
                except KeyError:
                    self.log.error('Unrecognized unit %s for disk metric %s. Dropping it.' % (unit, name))
                    metrics[name] = None
        return metrics
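
    # e.g. a raw value of '12.5 GB' becomes int(12.5 * UNIT_MAP['gb']) bytes; UNIT_MAP
    # (defined at the top of this file) is assumed to map lowercased unit names to
    # byte multipliers such as 'gb' -> 1000000000.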

    def _calc_percent_disk_stats(self, stats):
        """Calculate a percentage of used disk space for data and metadata"""
        mtypes = ['data', 'metadata']
        percs = {}
        for mtype in mtypes:
            used = stats.get('docker.{0}.used'.format(mtype))
            total = stats.get('docker.{0}.total'.format(mtype))
            free = stats.get('docker.{0}.free'.format(mtype))
            if used and total and free and ceil(total) < free + used:
                self.log.debug('used, free, and total disk metrics may be wrong, '
                               'used: %s, free: %s, total: %s',
                               used, free, total)
                total = used + free
            try:
                if isinstance(used, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * float(used) / float(total), 2)
                elif isinstance(free, int):
                    percs['docker.{0}.percent'.format(mtype)] = round(100 * (1.0 - (float(free) / float(total))), 2)
            except ZeroDivisionError:
                self.log.error('docker.{0}.total is 0, calculating docker.{1}.percent'
                               ' is not possible.'.format(mtype, mtype))
        return percs
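
    # Worked example: used == 9000000000 and total == 12000000000 gives
    # docker.data.percent == round(100 * 9e9 / 12e9, 2) == 75.0.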

    # Cgroups
    def _get_cgroup_from_proc(self, cgroup, pid, filename):
        """Find a specific cgroup file, containing metrics to extract."""
        params = {
            "file": filename,
        }
        return DockerUtil.find_cgroup_from_proc(self._mountpoints, pid, cgroup, self.docker_util._docker_root) % params

    def _parse_cgroup_file(self, stat_file):
        """Parse a cgroup pseudo file for key/values."""
        self.log.debug("Opening cgroup file: %s" % stat_file)
        try:
            with open(stat_file, 'r') as fp:
                if 'blkio' in stat_file:
                    return self._parse_blkio_metrics(fp.read().splitlines())
                else:
                    return dict(map(lambda x: x.split(' ', 1), fp.read().splitlines()))
        except IOError:
            # It is possible that the container got stopped between the API call and now
            self.log.info("Can't open %s. Metrics for this container are skipped." % stat_file)

    def _parse_blkio_metrics(self, stats):
        """Parse the blkio metrics."""
        metrics = {
            'io_read': 0,
            'io_write': 0,
        }
        for line in stats:
            if 'Read' in line:
                metrics['io_read'] += int(line.split()[2])
            if 'Write' in line:
                metrics['io_write'] += int(line.split()[2])
        return metrics
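
    # Sample (hypothetical) blkio.throttle.io_service_bytes content:
    #   8:0 Read 428032
    #   8:0 Write 4096
    #   Total 432128
    # parses to {'io_read': 428032, 'io_write': 4096}; the 'Total' line matches
    # neither 'Read' nor 'Write' and is ignored.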

    # proc files
    def _crawl_container_pids(self, container_dict):
        """Crawl `/proc` to find container PIDs and add them to `containers_by_id`."""
        proc_path = os.path.join(self.docker_util._docker_root, 'proc')
        pid_dirs = [_dir for _dir in os.listdir(proc_path) if _dir.isdigit()]

        if not pid_dirs:
            self.warning("Unable to find any pid directory in {0}. "
                         "If you are running the agent in a container, make sure to "
                         'share the volume properly: "/proc:/host/proc:ro". '
                         "See https://github.com/DataDog/docker-dd-agent/blob/master/README.md for more information. "
                         "Network metrics will be missing".format(proc_path))
            self._disable_net_metrics = True
            return container_dict

        self._disable_net_metrics = False

        for folder in pid_dirs:
            try:
                path = os.path.join(proc_path, folder, 'cgroup')
                with open(path, 'r') as f:
                    content = [line.strip().split(':') for line in f.readlines()]
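                # e.g. a cgroup line '4:cpu,cpuacct:/docker/<container_id>' splits
                # into ['4', 'cpu,cpuacct', '/docker/<container_id>'].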

                selinux_policy = ''
                path = os.path.join(proc_path, folder, 'attr', 'current')
                if os.path.exists(path):
                    with open(path, 'r') as f:
                        selinux_policy = f.readlines()[0]
            except IOError as e:
                # Issue #2074: the process may have exited between listing and reading
                self.log.debug("Cannot read %s, "
                               "process likely raced to finish: %s" %
                               (path, str(e)))
                continue
            except Exception as e:
                self.warning("Cannot read %s: %s" % (path, str(e)))
                continue

            try:
                for line in content:
                    if line[1] in ('cpu,cpuacct', 'cpuacct,cpu', 'cpuacct') and \
                            ('docker' in line[2] or 'docker' in selinux_policy):
                        cpuacct = line[2]
                        break
                else:
                    continue

                matches = re.findall(CONTAINER_ID_RE, cpuacct)
                if matches:
                    container_id = matches[-1]
                    if container_id not in container_dict:
                        self.log.debug("Container %s not in container_dict, it's likely excluded", container_id)
                        continue
                    container_dict[container_id]['_pid'] = folder
                    container_dict[container_id]['_proc_root'] = os.path.join(proc_path, folder)
                elif self._custom_cgroups:  # fall back to matching by pid: a linear scan over containers
                    for _, container in container_dict.iteritems():
                        if container.get('_pid') == int(folder):
                            container['_proc_root'] = os.path.join(proc_path, folder)
                            break

            except Exception as e:
                self.warning("Cannot parse %s content: %s" % (path, str(e)))
                continue
Example No. 5
0
class TestKubeutil(unittest.TestCase):
    def setUp(self):
        self.kubeutil = KubeUtil()

    @mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list', side_effect=['foo'])
    @mock.patch('utils.kubeutil.KubeUtil.extract_kube_labels')
    def test_get_kube_labels(self, extract_kube_labels, retrieve_pods_list):
        self.kubeutil.get_kube_labels(excluded_keys='bar')
        retrieve_pods_list.assert_called_once()
        extract_kube_labels.assert_called_once_with('foo', excluded_keys='bar')

    def test_extract_kube_labels(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_kube_labels({}, ['foo'])
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 8)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_kube_labels(pods, ['foo'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)
        res = self.kubeutil.extract_kube_labels(pods, ['k8s-app'])
        labels = set(inn for out in res.values() for inn in out)
        self.assertEqual(len(labels), 3)

    def test_extract_meta(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.extract_meta({}, 'foo')
        self.assertEqual(len(res), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 6)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.extract_meta(pods, 'foo')
        self.assertEqual(len(res), 0)
        res = self.kubeutil.extract_meta(pods, 'uid')
        self.assertEqual(len(res), 4)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_pods_list(self, retrieve_json):
        self.kubeutil.retrieve_pods_list()
        retrieve_json.assert_called_once_with(self.kubeutil.pods_list_url)

    @mock.patch('utils.kubeutil.retrieve_json')
    def test_retrieve_metrics(self, retrieve_json):
        self.kubeutil.retrieve_metrics()
        retrieve_json.assert_called_once_with(self.kubeutil.metrics_url)

    def test_filter_pods_list(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        res = self.kubeutil.filter_pods_list({}, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.9')
        self.assertEqual(len(res.get('items')), 5)

        pods = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, '10.240.0.5')
        self.assertEqual(len(res.get('items')), 1)

        pods = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
        res = self.kubeutil.filter_pods_list(pods, 'foo')
        self.assertEqual(len(res.get('items')), 0)

    @mock.patch('utils.kubeutil.requests')
    def test_retrieve_json_auth(self, r):
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_once_with('url', verify=False, timeout=10, headers={'Authorization': 'Bearer foo_tok'})

        self.kubeutil.CA_CRT_PATH = __file__
        self.kubeutil.retrieve_json_auth('url', 'foo_tok')
        r.get.assert_called_with('url', verify=__file__, timeout=10, headers={'Authorization': 'Bearer foo_tok'})

    def test_get_node_info(self):
        with mock.patch('utils.kubeutil.KubeUtil._fetch_host_data') as f:
            self.kubeutil.get_node_info()
            f.assert_called_once()

            f.reset_mock()

            self.kubeutil._node_ip = 'foo'
            self.kubeutil._node_name = 'bar'
            ip, name = self.kubeutil.get_node_info()
            self.assertEqual(ip, 'foo')
            self.assertEqual(name, 'bar')
            f.assert_not_called()

    def test__fetch_host_data(self):
        """
        Test with both 1.1 and 1.2 version payloads
        """
        with mock.patch('utils.kubeutil.KubeUtil.retrieve_pods_list') as mock_pods:
            self.kubeutil.host_name = 'dd-agent-1rxlh'
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.2.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'kubernetes-massi-minion-k23m')

            self.kubeutil.host_name = 'heapster-v11-l8sh1'
            mock_pods.return_value = json.loads(Fixtures.read_file("pods_list_1.1.json", string_escape=False))
            self.kubeutil._fetch_host_data()
            self.assertEqual(self.kubeutil._node_ip, '10.240.0.9')
            self.assertEqual(self.kubeutil._node_name, 'gke-cluster-1-8046fdfa-node-ld35')

    def test_get_auth_token(self):
        KubeUtil.AUTH_TOKEN_PATH = '/foo/bar'
        self.assertIsNone(KubeUtil.get_auth_token())
        KubeUtil.AUTH_TOKEN_PATH = Fixtures.file('events.json')  # any file could do the trick
        self.assertIsNotNone(KubeUtil.get_auth_token())

    def test_is_k8s(self):
        os.environ.pop('KUBERNETES_PORT', None)  # os.unsetenv alone would not update os.environ
        self.assertFalse(Platform.is_k8s())
        os.environ['KUBERNETES_PORT'] = '999'
        self.assertTrue(Platform.is_k8s())