def _process_container_metric(self, type, metric_name, metric, scraper_config):
    """Report a simple container metric as a rate or gauge.

    If several series are found for a given container, values are summed
    before submission.

    :param type: submission type, "rate" or "gauge"
    :param metric_name: name under which the value is submitted
    :param metric: prometheus metric object (type, name, samples)
    :param scraper_config: scraper configuration dict; 'custom_tags' is read
    """
    if metric.type not in METRIC_TYPES:
        # Lazy %-args logging: the message is only rendered if actually emitted.
        self.log.error("Metric type %s unsupported for metric %s", metric.type, metric.name)
        return

    samples = self._sum_values_by_context(metric, self._get_entity_id_if_container_metric)
    for c_id, sample in iteritems(samples):
        pod_uid = self._get_pod_uid(sample[self.SAMPLE_LABELS])
        if self.pod_list_utils.is_excluded(c_id, pod_uid):
            continue

        # tagger.tag can return None for unknown entities (see the `or []`
        # guards in sibling methods); guard so `+=` cannot raise TypeError.
        tags = tagger.tag(c_id, tagger.HIGH) or []
        tags += scraper_config['custom_tags']

        # FIXME we are forced to do that because the Kubelet PodList isn't updated
        # for static pods, see https://github.com/kubernetes/kubernetes/pull/59948
        pod = self._get_pod_by_metric_label(sample[self.SAMPLE_LABELS])
        if pod is not None and is_static_pending_pod(pod):
            tags += tagger.tag('kubernetes_pod://%s' % pod["metadata"]["uid"], tagger.HIGH) or []
            tags += self._get_kube_container_name(sample[self.SAMPLE_LABELS])
            tags = list(set(tags))

        val = sample[self.SAMPLE_VALUE]

        if "rate" == type:
            self.rate(metric_name, val, tags)
        elif "gauge" == type:
            self.gauge(metric_name, val, tags)
def _process_usage_metric(self, m_name, metric, cache, scraper_config, labels=None):
    """Submit a usage metric and cache its value per container name.

    Stores container_name -> (value, tags) in *cache* so that
    _process_limit_metric can later compute a usage_pct, and submits the
    value as a gauge under *m_name*. Cache entries whose container no
    longer reports a sample are purged.
    """
    labels = [] if labels is None else labels

    # Track which cached containers are still present in this scrape.
    still_alive = dict.fromkeys(cache, False)

    samples = self._sum_values_by_context(metric, self._get_entity_id_if_container_metric)
    for container_id, sample in iteritems(samples):
        sample_labels = sample[self.SAMPLE_LABELS]
        container_name = self._get_container_label(sample_labels, 'name')
        if not container_name:
            continue
        pod_uid = self._get_pod_uid(sample_labels)
        if self.pod_list_utils.is_excluded(container_id, pod_uid):
            continue

        tags = scraper_config['custom_tags'][:]
        tags += tagger.tag(replace_container_rt_prefix(container_id), tagger.HIGH) or []

        # FIXME we are forced to do that because the Kubelet PodList isn't updated
        # for static pods, see https://github.com/kubernetes/kubernetes/pull/59948
        pod = self._get_pod_by_metric_label(sample_labels)
        if pod is not None and is_static_pending_pod(pod):
            tags += tagger.tag('kubernetes_pod_uid://%s' % pod["metadata"]["uid"], tagger.HIGH) or []
            tags += self._get_kube_container_name(sample_labels)
            tags = list(set(tags))

        for label in labels:
            label_value = sample_labels.get(label)
            if label_value:
                tags.append('%s:%s' % (label, label_value))

        value = sample[self.SAMPLE_VALUE]
        cache[container_name] = (value, tags)
        still_alive[container_name] = True
        self.gauge(m_name, value, tags)

    # Purge entries for containers that disappeared since the last scrape.
    for name, alive in iteritems(still_alive):
        if not alive:
            del cache[name]
def _report_pods_running(self, pods, instance_tags):
    """Report counts of running pods and running containers on this node,
    tagged by service and creator.

    :param pods: pod list object
    :param instance_tags: list of tags
    """
    pod_counter = defaultdict(int)
    container_counter = defaultdict(int)

    for pod in pods.get('items', []):
        # Containers reporting
        statuses = pod.get('status', {}).get('containerStatuses', [])
        any_running = False
        for status in statuses:
            cid = status.get('containerID')
            if not cid:
                self.log.debug('skipping container with no id')
                continue
            if "running" not in status.get('state', {}):
                continue
            any_running = True
            ctags = tagger.tag(replace_container_rt_prefix(cid), tagger.LOW) or None
            if not ctags:
                continue
            ctags += instance_tags
            container_counter[tuple(sorted(ctags))] += 1

        # Pod reporting: only pods with at least one running container count.
        if not any_running:
            continue
        uid = pod.get('metadata', {}).get('uid')
        if not uid:
            self.log.debug('skipping pod with no uid')
            continue
        ptags = tagger.tag('kubernetes_pod_uid://%s' % uid, tagger.LOW) or None
        if not ptags:
            continue
        ptags += instance_tags
        pod_counter[tuple(sorted(ptags))] += 1

    for tag_key, count in iteritems(pod_counter):
        self.gauge(self.NAMESPACE + '.pods.running', count, list(tag_key))
    for tag_key, count in iteritems(container_counter):
        self.gauge(self.NAMESPACE + '.containers.running', count, list(tag_key))
def _process_limit_metric(self, m_name, metric, cache, scraper_config, pct_m_name=None):
    """Report limit metrics and, optionally, usage percentage.

    If *m_name* is non-empty, the limit is submitted as a gauge. If
    *pct_m_name* is set and the limit is > 0, the usage cached by
    _process_usage_metric for the same container name is looked up and
    usage/limit is submitted under *pct_m_name*.
    """
    samples = self._sum_values_by_context(metric, self._get_entity_id_if_container_metric)
    for c_id, sample in iteritems(samples):
        limit = sample[self.SAMPLE_VALUE]
        pod_uid = self._get_pod_uid(sample[self.SAMPLE_LABELS])
        if self.pod_list_utils.is_excluded(c_id, pod_uid):
            continue

        # tagger.tag can return None for unknown entities; guard so `+=`
        # below cannot raise TypeError.
        tags = tagger.tag(c_id, tagger.HIGH) or []
        tags += scraper_config['custom_tags']

        if m_name:
            self.gauge(m_name, limit, tags)

        if pct_m_name and limit > 0:
            c_name = self._get_container_label(sample[self.SAMPLE_LABELS], 'name')
            if not c_name:
                continue
            usage, tags = cache.get(c_name, (None, None))
            if usage:
                # float(a) / float(b) is already a float; the outer float()
                # wrapper of the original was redundant.
                self.gauge(pct_m_name, float(usage) / float(limit), tags)
            else:
                # Best effort: the usage series may simply not have been
                # scraped yet for this container. Lazy %-args logging.
                self.log.debug(
                    "No corresponding usage found for metric %s and "
                    "container %s, skipping usage_pct for now.",
                    pct_m_name,
                    c_name,
                )
def _process_pod_rate(self, metric_name, metric, scraper_config, labels=None):
    """Report a simple pod metric as a rate.

    If several series are found for a given pod, values are summed before
    submission. Network metrics are skipped for host-networked pods.
    """
    if labels is None:
        labels = []

    if metric.type not in METRIC_TYPES:
        # Lazy %-args logging: the message is only rendered if emitted.
        self.log.error("Metric type %s unsupported for metric %s", metric.type, metric.name)
        return

    samples = self._sum_values_by_context(metric, self._get_pod_uid_if_pod_metric)
    for pod_uid, sample in iteritems(samples):
        # Network counters are meaningless for pods sharing the host
        # network namespace.
        if '.network.' in metric_name and self._is_pod_host_networked(pod_uid):
            continue
        # tagger.tag can return None for unknown entities; guard so `+=`
        # cannot raise TypeError.
        tags = tagger.tag('kubernetes_pod://%s' % pod_uid, tagger.HIGH) or []
        tags += scraper_config['custom_tags']
        for label in labels:
            value = sample[self.SAMPLE_LABELS].get(label)
            if value:
                tags.append('%s:%s' % (label, value))
        val = sample[self.SAMPLE_VALUE]
        self.rate(metric_name, val, tags)
def _report_ephemeral_storage_usage(self, pod_list, instance_tags):
    """Report per-pod ephemeral storage usage from the kubelet stats endpoint."""
    stats = self._retrieve_stats()

    # Index usedBytes by pod uid for every pod the stats endpoint reports.
    usage_by_uid = {}
    for stat_pod in stats.get('pods', []):
        uid = stat_pod.get('podRef', {}).get('uid')
        used_bytes = stat_pod.get('ephemeral-storage', {}).get('usedBytes')
        if uid and used_bytes:
            usage_by_uid[uid] = used_bytes

    for pod in pod_list['items']:
        uid = pod.get('metadata', {}).get('uid')
        if uid is None:
            continue
        used_bytes = usage_by_uid.get(uid)
        if used_bytes is None:
            continue
        tags = tagger.tag('kubernetes_pod_uid://{}'.format(uid), tagger.ORCHESTRATOR)
        if not tags:
            continue
        tags += instance_tags
        self.gauge(self.NAMESPACE + '.ephemeral_storage.usage', used_bytes, tags)
def _update_container_metrics(self, instance, subcontainer, pod_list, pod_list_utils):
    """Publish cadvisor metrics for one subcontainer (raw metrics, and
    filesystem/network metrics when the spec advertises them).

    :param instance: check instance dict; 'tags' is read
    :param subcontainer: cadvisor subcontainer dict (id, labels, spec, stats)
    :param pod_list: kubelet pod list object
    :param pod_list_utils: helper used for cid lookup and exclusion filtering
    """
    is_pod = False
    in_static_pod = False
    subcontainer_id = subcontainer.get('id')
    pod_uid = subcontainer.get('labels', {}).get('io.kubernetes.pod.uid')
    k_container_name = subcontainer.get('labels', {}).get('io.kubernetes.container.name')

    # We want to collect network metrics at the pod level
    if k_container_name == "POD" and pod_uid:
        is_pod = True

    # FIXME we are forced to do that because the Kubelet PodList isn't updated
    # for static pods, see https://github.com/kubernetes/kubernetes/pull/59948
    pod = get_pod_by_uid(pod_uid, pod_list)
    if pod is not None and is_static_pending_pod(pod):
        in_static_pod = True

    # Let's see who we have here
    if is_pod:
        tags = tags_for_pod(pod_uid, tagger.HIGH)
    elif in_static_pod and k_container_name:
        # FIXME static pods don't have container statuses so we can't
        # get the container id with the scheme, assuming docker here
        tags = tags_for_docker(subcontainer_id, tagger.HIGH)
        tags += tags_for_pod(pod_uid, tagger.HIGH)
        tags.append("kube_container_name:%s" % k_container_name)
    else:  # Standard container
        # NOTE(review): if the pod lookup above returned None, pod.get below
        # would raise AttributeError — presumably pod_uid always resolves
        # for standard containers; confirm with callers.
        cid = pod_list_utils.get_cid_by_name_tuple(
            (pod.get('metadata', {}).get('namespace', ""),
             pod.get('metadata', {}).get('name', ""),
             k_container_name))
        if pod_list_utils.is_excluded(cid):
            self.log.debug("Filtering out " + cid)
            return
        tags = tagger.tag(cid, tagger.HIGH)

    if not tags:
        self.log.debug("Subcontainer {} doesn't have tags, skipping.".format(subcontainer_id))
        return
    # Merge instance tags and deduplicate (order is not preserved).
    tags = list(set(tags + instance.get('tags', [])))

    stats = subcontainer['stats'][-1]  # take the latest
    self._publish_raw_metrics(NAMESPACE, stats, tags, is_pod)

    # Filesystem utilization only makes sense for real containers, not pods.
    if is_pod is False and subcontainer.get("spec", {}).get("has_filesystem") and stats.get('filesystem'):
        fs = stats['filesystem'][-1]
        fs_utilization = float(fs['usage']) / float(fs['capacity'])
        self.gauge(NAMESPACE + '.filesystem.usage_pct', fs_utilization, tags=tags)

    # Network counters are collected at the pod level (see is_pod above).
    if is_pod and subcontainer.get("spec", {}).get("has_network"):
        net = stats['network']
        self.rate(NAMESPACE + '.network_errors',
                  sum(float(net[x]) for x in NET_ERRORS),
                  tags=tags)
def _report_container_state_metrics(self, pod_list, instance_tags):
    """Reports container state & reasons by looking at container statuses."""
    if pod_list.get('expired_count'):
        self.gauge(self.NAMESPACE + '.pods.expired', pod_list.get('expired_count'), tags=instance_tags)

    for pod in pod_list['items']:
        pod_name = pod.get('metadata', {}).get('name')
        pod_uid = pod.get('metadata', {}).get('uid')

        if not pod_name or not pod_uid:
            continue

        for ctr_status in pod['status'].get('containerStatuses', []):
            c_name = ctr_status.get('name')
            cid = ctr_status.get('containerID')
            if not c_name or not cid:
                continue
            if self.pod_list_utils.is_excluded(cid, pod_uid):
                continue

            # tagger.tag may return None for unknown entities; guard so the
            # list concatenation cannot raise TypeError. The original also
            # wrapped cid in a no-op '%s' % — dropped.
            tags = (tagger.tag(cid, tagger.ORCHESTRATOR) or []) + instance_tags

            restart_count = ctr_status.get('restartCount', 0)
            self.gauge(self.NAMESPACE + '.containers.restarts', restart_count, tags)

            # Submit both the current state and the last known state.
            for (metric_name, field_name) in [('state', 'state'), ('last_state', 'lastState')]:
                c_state = ctr_status.get(field_name, {})
                for state_name in ['terminated', 'waiting']:
                    state_reasons = WHITELISTED_CONTAINER_STATE_REASONS.get(state_name, [])
                    self._submit_container_state_metric(metric_name, state_name, c_state, state_reasons, tags)
def check(self, _):
    """Run the check: resolve the kubelet endpoint, then in fargate mode
    report per-pod heartbeat and capacity metrics.

    :raises CheckException: when the kubelet URL cannot be detected
    """
    kubelet_conn_info = get_connection_info()
    endpoint = kubelet_conn_info.get('url')
    if endpoint is None:
        raise CheckException("Unable to detect the kubelet URL automatically: " + kubelet_conn_info.get('err', ''))

    self.pod_list_url = endpoint.strip("/") + POD_LIST_PATH
    self.kubelet_credentials = KubeletCredentials(kubelet_conn_info)

    if self.fargate_mode:
        pod_list = self.retrieve_pod_list()
        for pod in pod_list.get('items', []):
            pod_id = pod.get('metadata', {}).get('uid')
            tagger_tags = tagger.tag('kubernetes_pod_uid://%s' % pod_id, tagger.ORCHESTRATOR) or []
            tagger_tags.extend(self.tags)
            tags = set(tagger_tags)
            # Submit the heartbeat metric for fargate virtual nodes.
            self.gauge(self.NAMESPACE + '.pods.running', 1, tags)
            # Default to {}: pods without annotations return None here, and
            # `in None` would raise TypeError.
            pod_annotations = pod.get('metadata', {}).get('annotations') or {}
            if CAPACITY_ANNOTATION_KEY not in pod_annotations:
                continue
            cpu_val, mem_val = extract_resource_values(pod_annotations.get(CAPACITY_ANNOTATION_KEY))
            if cpu_val == 0 or mem_val == 0:
                continue
            self.gauge(self.NAMESPACE + '.cpu.capacity', cpu_val, tags)
            self.gauge(self.NAMESPACE + '.memory.capacity', mem_val, tags)
def tags_for_docker(cid, cardinality, with_prefix=False):
    """Query the tagger for a given container id.

    If with_prefix is True, `cid` is used verbatim; otherwise the
    `container_id://` scheme is prepended before the lookup.
    :return: string array, empty if container not found
    """
    entity = cid if with_prefix else 'container_id://%s' % cid
    return tagger.tag(entity, cardinality) or []
def _report_container_spec_metrics(self, pod_list, instance_tags):
    """Reports pod requests & limits by looking at pod specs.

    :param pod_list: kubelet pod list object
    :param instance_tags: list of tags appended to every submitted metric
    """
    for pod in pod_list.get('items', []):
        pod_name = pod.get('metadata', {}).get('name')
        pod_phase = pod.get('status', {}).get('phase')
        if self._should_ignore_pod(pod_name, pod_phase):
            continue

        for ctr in pod['spec']['containers']:
            # Skip containers with no resources section at all.
            if not ctr.get('resources'):
                continue

            c_name = ctr.get('name', '')
            cid = None
            # Recover the container id by matching the spec's container
            # name against the status entries.
            for ctr_status in pod['status'].get('containerStatuses', []):
                if ctr_status.get('name') == c_name:
                    # it is already prefixed with 'runtime://'
                    cid = ctr_status.get('containerID')
                    break
            if not cid:
                continue

            pod_uid = pod.get('metadata', {}).get('uid')
            if self.pod_list_utils.is_excluded(cid, pod_uid):
                continue

            tags = tagger.tag(replace_container_rt_prefix(cid), tagger.HIGH)
            if not tags:
                continue
            tags += instance_tags

            # Requests and limits are reported best-effort: a malformed
            # quantity aborts only the current resource group.
            try:
                for resource, value_str in iteritems(ctr.get('resources', {}).get('requests', {})):
                    value = self.parse_quantity(value_str)
                    self.gauge('{}.{}.requests'.format(self.NAMESPACE, resource), value, tags)
            except (KeyError, AttributeError) as e:
                self.log.debug("Unable to retrieve container requests for %s: %s", c_name, e)

            try:
                for resource, value_str in iteritems(ctr.get('resources', {}).get('limits', {})):
                    value = self.parse_quantity(value_str)
                    self.gauge('{}.{}.limits'.format(self.NAMESPACE, resource), value, tags)
            except (KeyError, AttributeError) as e:
                self.log.debug("Unable to retrieve container limits for %s: %s", c_name, e)
def _create_pod_tags_by_pvc(self, pod_list):
    """
    Return a map, e.g.
        {
          "<kube_namespace>/<persistentvolumeclaim>": [<list_of_pod_tags>],
          "<kube_namespace1>/<persistentvolumeclaim1>": [<list_of_pod_tags1>],
        }
    that can be used to add pod tags to associated volume metrics
    """
    mapping = defaultdict(set)
    if pod_list is None:
        return mapping

    for pod in pod_list.get('items', []):
        metadata = pod.get('metadata', {})

        # kubernetes namespace of the PVC
        kube_ns = metadata.get('namespace')
        if not kube_ns:
            continue

        volumes = pod.get('spec', {}).get('volumes')
        if not volumes:
            continue

        pod_id = metadata.get('uid')
        if not pod_id:
            self.log.debug('skipping pod with no uid')
            continue

        tags = tagger.tag('kubernetes_pod_uid://%s' % pod_id, tagger.ORCHESTRATOR) or None
        if not tags:
            continue

        # Drop tag keys that don't apply to PVCs.
        for excluded in self.VOLUME_TAG_KEYS_TO_EXCLUDE:
            prefix = excluded + ':'
            tags = [t for t in tags if not t.startswith(prefix)]

        for volume in volumes:
            claim = volume.get('persistentVolumeClaim', {}).get('claimName')
            if claim:
                mapping['{}/{}'.format(kube_ns, claim)].update(tags)

    return mapping
def check(self, instance):
    """In fargate mode, report per-pod heartbeat and capacity metrics."""
    if self.fargate_mode:
        pod_list = self.get_pod_list()
        for pod in pod_list.get('items', []):
            pod_id = pod.get('metadata', {}).get('uid')
            tagger_tags = tagger.tag('kubernetes_pod_uid://%s' % pod_id, tagger.ORCHESTRATOR) or []
            tagger_tags.extend(self.tags)
            tags = set(tagger_tags)
            # Submit the heartbeat metric for fargate virtual nodes.
            self.gauge(self.NAMESPACE + '.pods.running', 1, tags)
            # Default to {}: pods without annotations return None here, and
            # `in None` would raise TypeError.
            pod_annotations = pod.get('metadata', {}).get('annotations') or {}
            if CAPACITY_ANNOTATION_KEY not in pod_annotations:
                continue
            cpu_val, mem_val = extract_resource_values(pod_annotations.get(CAPACITY_ANNOTATION_KEY))
            if cpu_val == 0 or mem_val == 0:
                continue
            self.gauge(self.NAMESPACE + '.cpu.capacity', cpu_val, tags)
            self.gauge(self.NAMESPACE + '.memory.capacity', mem_val, tags)
def tags_for_pod(pod_id, cardinality):
    """Look up tagger tags for a pod uid.

    :return: string array, empty if pod not found
    """
    entity_id = 'kubernetes_pod_uid://%s' % pod_id
    return tagger.tag(entity_id, cardinality) or []
def tags_for_docker(cid, cardinality):
    """
    Queries the tagger for a given container id
    :return: string array, empty if container not found
    """
    # Normalize a None result from the tagger to [] so the return matches
    # the documented contract (and the sibling tags_for_pod helper).
    return tagger.tag('docker://%s' % cid, cardinality) or []