def __init__(self, name, init_config, agentConfig, instances=None):
    """Initialize the kubelet check and its two prometheus scrapers.

    Only a single configured instance is supported; passing more than
    one raises at construction time.
    """
    super(KubeletCheck, self).__init__(name, init_config, agentConfig, instances)
    self.NAMESPACE = 'kubernetes'
    if instances is not None and len(instances) > 1:
        raise Exception(
            'Kubelet check only supports one configured instance.')

    first_instance = instances[0] if instances else None
    self.cadvisor_legacy_port = first_instance.get('cadvisor_port', CADVISOR_DEFAULT_PORT)
    self.cadvisor_legacy_url = None

    # Dedicated scraper for the cadvisor prometheus endpoint.
    self.cadvisor_scraper = CadvisorPrometheusScraper(self)

    # Plain prometheus scraper for the kubelet's own /metrics endpoint,
    # restricted to a fixed metric-name mapping.
    kubelet_scraper = PrometheusScraper(self)
    kubelet_scraper.NAMESPACE = 'kubernetes'
    kubelet_scraper.metrics_mapper = {
        'apiserver_client_certificate_expiration_seconds': 'apiserver.certificate.expiration',
        'rest_client_requests_total': 'rest.client.requests',
        'kubelet_runtime_operations': 'kubelet.runtime.operations',
        'kubelet_runtime_operations_errors': 'kubelet.runtime.errors',
    }
    self.kubelet_scraper = kubelet_scraper
def _get_istio_mesh_scraper(self, instance):
    """Return the istio mesh scraper for this instance's endpoint.

    The scraper is cached per endpoint in ``self._scrapers``; on first
    use it is created, registered, and configured.
    """
    endpoint = instance.get('istio_mesh_endpoint')
    cached = self._scrapers.get(endpoint, None)
    if cached:
        return cached

    scraper = PrometheusScraper(self)
    self._scrapers[endpoint] = scraper
    scraper.NAMESPACE = self.MESH_NAMESPACE
    scraper.metrics_mapper = {
        # Istio 1.0 metric names
        'istio_requests_total': 'request.count',
        'istio_request_duration_seconds': 'request.duration',
        'istio_request_bytes': 'request.size',
        'istio_response_bytes': 'response.size',
        # Istio 0.8 metric names
        'istio_request_count': 'request.count',
        'istio_request_duration': 'request.duration',
        'istio_request_size': 'request.size',
        'istio_response_size': 'response.size',
    }
    scraper.label_to_hostname = endpoint
    return self._shared_scraper_config(scraper, instance)
def test_credentials_token_noverify():
    """Token auth with TLS verification disabled: the bearer token must
    only ever be attached to https requests."""
    expected_headers = {'Authorization': 'Bearer mytoken'}
    credentials = KubeletCredentials({
        "verify_tls": "false",
        "ca_cert": "ca_cert",
        "client_crt": "ignore_me",
        "token": "mytoken",
    })

    assert credentials.verify() is False
    assert credentials.cert_pair() is None
    assert credentials.headers("https://dummy") == expected_headers
    # Make sure we don't leak the token over http
    assert credentials.headers("http://dummy") is None

    prom_scraper = PrometheusScraper(None)

    credentials.configure_scraper(prom_scraper, "https://dummy")
    assert prom_scraper.ssl_ca_cert is False
    assert prom_scraper.ssl_cert is None
    assert prom_scraper.ssl_private_key is None
    assert prom_scraper.extra_headers == expected_headers

    # Make sure we don't leak the token over http
    credentials.configure_scraper(prom_scraper, "http://dummy")
    assert prom_scraper.ssl_ca_cert is False
    assert prom_scraper.ssl_cert is None
    assert prom_scraper.ssl_private_key is None
    assert prom_scraper.extra_headers == {}
def test_credentials_empty():
    """An empty credential dict yields no TLS settings and no auth headers."""
    credentials = KubeletCredentials({})
    assert credentials.verify() is None
    assert credentials.cert_pair() is None
    assert credentials.headers("https://dummy") is None

    prom_scraper = PrometheusScraper(None)
    credentials.configure_scraper(prom_scraper, "https://dummy")
    # No ssl material must have been configured on the scraper.
    for attribute in ('ssl_ca_cert', 'ssl_cert', 'ssl_private_key'):
        assert getattr(prom_scraper, attribute) is None
    assert prom_scraper.extra_headers == {}
def test_credentials_certificates():
    """Client-certificate auth: the cert pair is used and the token is ignored."""
    credentials = KubeletCredentials({
        "verify_tls": "true",
        "ca_cert": "ca_cert",
        "client_crt": "crt",
        "client_key": "key",
        "token": "ignore_me",
    })

    assert credentials.verify() == "ca_cert"
    assert credentials.cert_pair() == ("crt", "key")
    # No bearer header when certificates are configured.
    assert credentials.headers("https://dummy") is None

    prom_scraper = PrometheusScraper(None)
    credentials.configure_scraper(prom_scraper, "https://dummy")
    assert prom_scraper.ssl_ca_cert == "ca_cert"
    assert prom_scraper.ssl_cert == "crt"
    assert prom_scraper.ssl_private_key == "key"
    assert prom_scraper.extra_headers == {}
class KubeletCheck(AgentCheck, CadvisorScraper):
    """
    Collect metrics from Kubelet.

    Pulls the pod list, node spec and health checks from the kubelet's
    HTTP endpoints, and scrapes two prometheus endpoints (cadvisor and
    kubelet metrics).  When a legacy cadvisor port is detected, it is
    used instead of the cadvisor prometheus endpoint.
    """

    def __init__(self, name, init_config, agentConfig, instances=None):
        super(KubeletCheck, self).__init__(name, init_config, agentConfig, instances)
        self.NAMESPACE = 'kubernetes'
        # Only a single configured instance is supported.
        if instances is not None and len(instances) > 1:
            raise Exception(
                'Kubelet check only supports one configured instance.')
        inst = instances[0] if instances else None
        self.cadvisor_legacy_port = inst.get('cadvisor_port', CADVISOR_DEFAULT_PORT)
        self.cadvisor_legacy_url = None
        # Dedicated scraper for the cadvisor prometheus endpoint.
        self.cadvisor_scraper = CadvisorPrometheusScraper(self)
        # Plain prometheus scraper for the kubelet's own metrics endpoint;
        # only metrics listed in metrics_mapper are reported (the check
        # passes ignore_unmapped=True when processing).
        self.kubelet_scraper = PrometheusScraper(self)
        self.kubelet_scraper.NAMESPACE = 'kubernetes'
        self.kubelet_scraper.metrics_mapper = {
            'apiserver_client_certificate_expiration_seconds': 'apiserver.certificate.expiration',
            'rest_client_requests_total': 'rest.client.requests',
            'kubelet_runtime_operations': 'kubelet.runtime.operations',
            'kubelet_runtime_operations_errors': 'kubelet.runtime.errors',
        }

    def check(self, instance):
        """Main entry point: resolve kubelet connection info, configure
        URLs and credentials, then run all sub-collectors (service checks,
        node metrics, pod/container metrics, cadvisor and kubelet
        prometheus metrics)."""
        kubelet_conn_info = get_connection_info()
        endpoint = kubelet_conn_info.get('url')
        if endpoint is None:
            raise CheckException(
                "Unable to detect the kubelet URL automatically.")

        if 'cadvisor_metrics_endpoint' in instance:
            self.cadvisor_metrics_url = \
                instance.get('cadvisor_metrics_endpoint',
                             urljoin(endpoint, CADVISOR_METRICS_PATH))
        else:
            # Fall back to the deprecated 'metrics_endpoint' option.
            self.cadvisor_metrics_url = instance.get(
                'metrics_endpoint', urljoin(endpoint, CADVISOR_METRICS_PATH))
            if 'metrics_endpoint' in instance:
                self.log.warning(
                    'metrics_endpoint is deprecated, please specify cadvisor_metrics_endpoint instead.'
                )

        self.kubelet_metrics_url = instance.get(
            'kubelet_metrics_endpoint', urljoin(endpoint, KUBELET_METRICS_PATH))
        self.kube_health_url = urljoin(endpoint, KUBELET_HEALTH_PATH)
        self.node_spec_url = urljoin(endpoint, NODE_SPEC_PATH)
        self.pod_list_url = urljoin(endpoint, POD_LIST_PATH)

        # Kubelet credentials handling: propagate auth/TLS settings to
        # both prometheus scrapers.
        self.kubelet_credentials = KubeletCredentials(kubelet_conn_info)
        self.kubelet_credentials.configure_scraper(self.cadvisor_scraper,
                                                   self.cadvisor_metrics_url)
        self.kubelet_credentials.configure_scraper(self.kubelet_scraper,
                                                   self.kubelet_metrics_url)

        # Legacy cadvisor support
        try:
            self.cadvisor_legacy_url = self.detect_cadvisor(
                endpoint, self.cadvisor_legacy_port)
        except Exception as e:
            self.log.debug(
                'cAdvisor not found, running in prometheus mode: %s' % str(e))

        # By default we send the buckets.
        # NOTE(review): any value other than the string 'false'
        # (case-insensitive) — including None — enables buckets.
        send_buckets = instance.get('send_histograms_buckets', True)
        if send_buckets is not None and str(send_buckets).lower() == 'false':
            send_buckets = False
        else:
            send_buckets = True

        self.pod_list = self.retrieve_pod_list()
        self.container_filter = ContainerFilter(self.pod_list)
        self.instance_tags = instance.get('tags', [])
        self._perform_kubelet_check(self.instance_tags)
        self._report_node_metrics(self.instance_tags)
        self._report_pods_running(self.pod_list, self.instance_tags)
        self._report_container_spec_metrics(self.pod_list, self.instance_tags)

        if self.cadvisor_legacy_url:  # Legacy cAdvisor
            self.log.debug('processing legacy cadvisor metrics')
            self.process_cadvisor(instance, self.cadvisor_legacy_url,
                                  self.pod_list, self.container_filter)
        elif self.cadvisor_metrics_url:  # Prometheus
            self.log.debug('processing cadvisor metrics')
            self.cadvisor_scraper.process(
                self.cadvisor_metrics_url,
                send_histograms_buckets=send_buckets,
                instance=instance,
                pod_list=self.pod_list,
                container_filter=self.container_filter)
        if self.kubelet_metrics_url:  # Prometheus
            self.log.debug('processing kubelet metrics')
            self.kubelet_scraper.process(self.kubelet_metrics_url,
                                         send_histograms_buckets=send_buckets,
                                         instance=instance,
                                         ignore_unmapped=True)

        # Free up memory
        self.pod_list = None
        self.container_filter = None

    def perform_kubelet_query(self, url, verbose=True, timeout=10):
        """
        Perform and return a GET request against kubelet. Support auth and TLS validation.

        :param url: full URL to query
        :param verbose: forwarded as the 'verbose' query parameter
        :param timeout: request timeout in seconds
        :return: the requests.Response object
        """
        return requests.get(url, timeout=timeout,
                            verify=self.kubelet_credentials.verify(),
                            cert=self.kubelet_credentials.cert_pair(),
                            headers=self.kubelet_credentials.headers(url),
                            params={'verbose': verbose})

    def retrieve_pod_list(self):
        """Fetch and return the pod list from the kubelet, with 'items'
        normalized to a list. Returns None on any failure (best effort)."""
        try:
            pod_list = self.perform_kubelet_query(self.pod_list_url).json()
            if pod_list.get("items") is None:
                # Sanitize input: if no pod are running, 'items' is a NoneObject
                pod_list['items'] = []
            return pod_list
        except Exception as e:
            self.log.debug(
                'failed to retrieve pod list from the kubelet at %s : %s'
                % (self.pod_list_url, str(e)))
            return None

    def _retrieve_node_spec(self):
        """
        Retrieve node spec from kubelet.
        """
        node_spec = self.perform_kubelet_query(self.node_spec_url).json()
        # TODO: report allocatable for cpu, mem, and pod capacity
        # if we can get it locally or thru the DCA instead of the /nodes endpoint directly
        return node_spec

    def _report_node_metrics(self, instance_tags):
        """Report node-level capacity gauges (cpu cores, memory bytes —
        units as exposed by the kubelet spec)."""
        node_spec = self._retrieve_node_spec()
        num_cores = node_spec.get('num_cores', 0)
        memory_capacity = node_spec.get('memory_capacity', 0)
        tags = instance_tags
        self.gauge(self.NAMESPACE + '.cpu.capacity', float(num_cores), tags)
        self.gauge(self.NAMESPACE + '.memory.capacity', float(memory_capacity),
                   tags)

    def _perform_kubelet_check(self, instance_tags):
        """Runs local service checks"""
        service_check_base = self.NAMESPACE + '.kubelet.check'
        is_ok = True
        url = self.kube_health_url
        try:
            req = self.perform_kubelet_query(url)
            # Each healthz line looks like '[+]name ok' or '[-]name failed'.
            # NOTE(review): line is whatever iter_lines() yields — str on
            # py2; on py3 this would be bytes and .find('hostname') would
            # raise. Confirm runtime is py2 before porting.
            for line in req.iter_lines():
                # avoid noise; this check is expected to fail since we override the container hostname
                if line.find('hostname') != -1:
                    continue
                matches = re.match(r'\[(.)\]([^\s]+) (.*)?', line)
                if not matches or len(matches.groups()) < 2:
                    continue
                service_check_name = service_check_base + '.' + matches.group(
                    2)
                status = matches.group(1)
                if status == '+':
                    self.service_check(service_check_name, AgentCheck.OK,
                                       tags=instance_tags)
                else:
                    self.service_check(service_check_name, AgentCheck.CRITICAL,
                                       tags=instance_tags)
                    is_ok = False
        except Exception as e:
            # Query itself failed: report the base service check critical.
            self.log.warning('kubelet check %s failed: %s' % (url, str(e)))
            self.service_check(service_check_base, AgentCheck.CRITICAL,
                               message='Kubelet check %s failed: %s' %
                               (url, str(e)), tags=instance_tags)
        else:
            if is_ok:
                self.service_check(service_check_base, AgentCheck.OK,
                                   tags=instance_tags)
            else:
                self.service_check(service_check_base, AgentCheck.CRITICAL,
                                   tags=instance_tags)

    def _report_pods_running(self, pods, instance_tags):
        """
        Reports the number of running pods on this node
        tagged by service and creator.

        :param pods: pod list object
        :param instance_tags: list of tags
        """
        tag_counter = {}
        for pod in pods['items']:
            pod_id = pod.get('metadata', {}).get('uid')
            # Pods with no tagger entry are skipped.
            tags = get_tags('kubernetes_pod://%s' % pod_id, False) or None
            if not tags:
                continue
            tags += instance_tags
            # Count pods per unique (sorted) tag set.
            hash_tags = tuple(sorted(tags))
            if hash_tags in tag_counter.keys():
                tag_counter[hash_tags] += 1
            else:
                tag_counter[hash_tags] = 1
        for tags, count in tag_counter.iteritems():
            self.gauge(self.NAMESPACE + '.pods.running', count, list(tags))

    def _report_container_spec_metrics(self, pod_list, instance_tags):
        """Reports pod requests & limits by looking at pod specs."""
        for pod in pod_list['items']:
            pod_name = pod.get('metadata', {}).get('name')
            if not pod_name:
                continue
            for ctr in pod['spec']['containers']:
                if not ctr.get('resources'):
                    continue
                c_name = ctr.get('name', '')
                cid = None
                # Resolve the container ID from the matching status entry.
                for ctr_status in pod['status'].get('containerStatuses', []):
                    if ctr_status.get('name') == c_name:
                        # it is already prefixed with 'docker://'
                        cid = ctr_status.get('containerID')
                        break
                if not cid:
                    continue
                pod_uid = pod.get('metadata', {}).get('uid')
                if self.container_filter.is_excluded(cid, pod_uid):
                    continue
                tags = get_tags('%s' % cid, True) + instance_tags
                try:
                    for resource, value_str in ctr.get(
                            'resources', {}).get('requests', {}).iteritems():
                        value = self.parse_quantity(value_str)
                        self.gauge(
                            '{}.{}.requests'.format(self.NAMESPACE, resource),
                            value, tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container requests for %s: %s",
                        c_name, e)
                try:
                    for resource, value_str in ctr.get(
                            'resources', {}).get('limits', {}).iteritems():
                        value = self.parse_quantity(value_str)
                        self.gauge(
                            '{}.{}.limits'.format(self.NAMESPACE, resource),
                            value, tags)
                except (KeyError, AttributeError) as e:
                    self.log.debug(
                        "Unable to retrieve container limits for %s: %s",
                        c_name, e)

    @staticmethod
    def parse_quantity(string):
        """
        Parse quantity allows to convert the value in the resources spec like:
        resources:
          requests:
            cpu: "100m"
            memory: "200Mi"
          limits:
            memory: "300Mi"

        Splits the leading numeric part (digits and '.') from the unit
        suffix and multiplies by the unit factor from FACTORS (1 for
        unknown or missing units).

        :param string: str
        :return: float
        """
        number, unit = '', ''
        for char in string:
            if char.isdigit() or char == '.':
                number += char
            else:
                unit += char
        return float(number) * FACTORS.get(unit, 1)
def _get_mixer_scraper(self, instance):
    """
    Grab the mixer scraper from the dict and return it if it exists,
    otherwise create the scraper and add it to the dict

    :param instance: check instance config; 'mixer_endpoint' keys the cache
    :return: a configured PrometheusScraper for the mixer endpoint
    """
    endpoint = instance.get('mixer_endpoint')
    # Return the cached scraper for this endpoint when one exists.
    if self._scrapers.get(endpoint, None):
        return self._scrapers.get(endpoint)

    scraper = PrometheusScraper(self)
    self._scrapers[endpoint] = scraper
    scraper.NAMESPACE = self.MIXER_NAMESPACE
    # Static map from raw mixer/go/grpc/process metric names to the
    # metric names reported under MIXER_NAMESPACE.
    scraper.metrics_mapper = {
        'go_gc_duration_seconds': 'go.gc_duration_seconds',
        'go_goroutines': 'go.goroutines',
        'go_info': 'go.info',
        'go_memstats_alloc_bytes': 'go.memstats.alloc_bytes',
        'go_memstats_alloc_bytes_total': 'go.memstats.alloc_bytes_total',
        'go_memstats_buck_hash_sys_bytes': 'go.memstats.buck_hash_sys_bytes',
        'go_memstats_frees_total': 'go.memstats.frees_total',
        'go_memstats_gc_cpu_fraction': 'go.memstats.gc_cpu_fraction',
        'go_memstats_gc_sys_bytes': 'go.memstats.gc_sys_bytes',
        'go_memstats_heap_alloc_bytes': 'go.memstats.heap_alloc_bytes',
        'go_memstats_heap_idle_bytes': 'go.memstats.heap_idle_bytes',
        'go_memstats_heap_inuse_bytes': 'go.memstats.heap_inuse_bytes',
        'go_memstats_heap_objects': 'go.memstats.heap_objects',
        'go_memstats_heap_released_bytes': 'go.memstats.heap_released_bytes',
        'go_memstats_heap_sys_bytes': 'go.memstats.heap_sys_bytes',
        'go_memstats_last_gc_time_seconds': 'go.memstats.last_gc_time_seconds',
        'go_memstats_lookups_total': 'go.memstats.lookups_total',
        'go_memstats_mallocs_total': 'go.memstats.mallocs_total',
        'go_memstats_mcache_inuse_bytes': 'go.memstats.mcache_inuse_bytes',
        'go_memstats_mcache_sys_bytes': 'go.memstats.mcache_sys_bytes',
        'go_memstats_mspan_inuse_bytes': 'go.memstats.mspan_inuse_bytes',
        'go_memstats_mspan_sys_bytes': 'go.memstats.mspan_sys_bytes',
        'go_memstats_next_gc_bytes': 'go.memstats.next_gc_bytes',
        'go_memstats_other_sys_bytes': 'go.memstats.other_sys_bytes',
        'go_memstats_stack_inuse_bytes': 'go.memstats.stack_inuse_bytes',
        'go_memstats_stack_sys_bytes': 'go.memstats.stack_sys_bytes',
        'go_memstats_sys_bytes': 'go.memstats.sys_bytes',
        'go_threads': 'go.threads',
        'grpc_server_handled_total': 'grpc.server.handled_total',
        'grpc_server_handling_seconds': 'grpc.server.handling_seconds',
        'grpc_server_msg_received_total': 'grpc.server.msg_received_total',
        'grpc_server_msg_sent_total': 'grpc.server.msg_sent_total',
        'grpc_server_started_total': 'grpc.server.started_total',
        'mixer_adapter_dispatch_count': 'adapter.dispatch_count',
        'mixer_adapter_dispatch_duration': 'adapter.dispatch_duration',
        'mixer_adapter_old_dispatch_count': 'adapter.old_dispatch_count',
        'mixer_adapter_old_dispatch_duration': 'adapter.old_dispatch_duration',
        'mixer_config_resolve_actions': 'config.resolve_actions',
        'mixer_config_resolve_count': 'config.resolve_count',
        'mixer_config_resolve_duration': 'config.resolve_duration',
        'mixer_config_resolve_rules': 'config.resolve_rules',
        'process_cpu_seconds_total': 'process.cpu_seconds_total',
        'process_max_fds': 'process.max_fds',
        'process_open_fds': 'process.open_fds',
        'process_resident_memory_bytes': 'process.resident_memory_bytes',
        'process_start_time_seconds': 'process.start_time_seconds',
        'process_virtual_memory_bytes': 'process.virtual_memory_bytes',
    }
    # Apply options shared by all scrapers of this check.
    scraper = self._shared_scraper_config(scraper, instance)
    return scraper