def info(self) -> ClusterStats:  # pylint: disable=too-many-locals
    """Retrieve Kubernetes cluster statistics.

    Builds one NodeStats per cluster node, keyed by the node's resolved IP
    address, then walks all pods and accumulates their resource requests
    onto the node that hosts them.

    :return: a ClusterStats with one entry per known node
    """
    pl_status = ClusterStats()
    node_list = pykube.Node.objects(self.api).filter(namespace=pykube.all).iterator()
    node_dict = {}

    # Get basic information from nodes.
    for node in node_list:
        nss = NodeStats(node.name)
        # NOTE(review): indexing the allocatable cpu string with [1] was
        # annotated "Bug found on GKE" upstream — presumably a workaround
        # for the value format returned there; confirm before changing.
        nss.cores_total = float(node.obj['status']['allocatable']['cpu'][1])  ## Bug found on GKE
        nss.memory_total = humanfriendly.parse_size(node.obj['status']['allocatable']['memory'])
        nss.labels = node.obj['metadata']['labels']
        nss.status = 'online'
        # Pods report their host by IP, so key the node table by resolved IP.
        node_dict[str(socket.gethostbyname(node.name))] = nss

    # Get information from all running pods, then accumulate to nodes.
    pod_list = pykube.Pod.objects(self.api).filter(namespace=pykube.all).iterator()
    for pod in pod_list:
        try:
            host_ip = pod.obj['status']['hostIP']
        except KeyError:
            continue  # pod not scheduled on any node yet
        nss = node_dict.get(host_ip)
        if nss is None:
            continue  # host unknown (e.g. node joined after the node scan above)
        nss.container_count += 1
        # NOTE(review): only the first container of each pod is examined —
        # confirm pods are single-container in this deployment.
        spec_cont = pod.obj['spec']['containers'][0]
        requests = spec_cont.get('resources', {}).get('requests', {})
        if 'memory' in requests:
            nss.memory_reserved = nss.memory_reserved + humanfriendly.parse_size(requests['memory'])
        if 'cpu' in requests:
            cpu = requests['cpu']
            # cpu can be expressed in millicores ('100m') or as a plain
            # number ('0.1', '2').
            if cpu.endswith('m'):
                cpu_float = int(cpu[:-1]) / 1000
            else:
                # BUG FIX: this used int(), which raises ValueError on
                # fractional requests such as '0.1'.
                cpu_float = float(cpu)
            nss.cores_reserved = round(nss.cores_reserved + cpu_float, 3)

    for node_ip in node_dict:
        pl_status.nodes.append(node_dict[node_ip])
    return pl_status
def _host_subthread(self, host_config: DockerHostConfig):
    """Periodically poll one Docker host and refresh its NodeStats entry.

    Runs until self.stop is set: every CHECK_INTERVAL seconds it contacts
    the host's engine, updates counters and reservations in
    self.host_stats[host_config.name] and reconciles the containers found
    there against the services recorded in self.state.

    :param host_config: connection details and labels of the host to poll
    """
    log.info("Synchro thread for host {} started".format(host_config.name))
    self.host_stats[host_config.name] = NodeStats(host_config.name)
    while True:
        time_start = time.time()
        try:
            my_engine = DockerClient(host_config)
            container_list = my_engine.list(only_label={'zoe_deployment_name': get_conf().deployment_name})
            info = my_engine.info()
        except ZoeException as e:
            self.host_stats[host_config.name].status = 'offline'
            log.error(str(e))
            log.info('Node {} is offline'.format(host_config.name))
        else:
            if self.host_stats[host_config.name].status == 'offline':
                log.info('Node {} is now online'.format(host_config.name))
            self.host_stats[host_config.name].status = 'online'
            self.host_stats[host_config.name].container_count = info['Containers']
            self.host_stats[host_config.name].cores_total = info['NCPU']
            self.host_stats[host_config.name].memory_total = info['MemTotal']
            # BUG FIX: the old code called labels.union(...) and discarded the
            # result (set.union returns a new set), so engine labels were never
            # recorded. Build a fresh merged set; this also avoids mutating or
            # aliasing host_config.labels (treated as a set, as the old .union
            # call implied).
            labels = set(host_config.labels)
            if info['Labels'] is not None:
                labels.update(info['Labels'])
            self.host_stats[host_config.name].labels = labels
            # memory_soft_limit equal to the host total means "no limit set".
            self.host_stats[host_config.name].memory_allocated = sum(
                cont['memory_soft_limit'] for cont in container_list
                if cont['memory_soft_limit'] != info['MemTotal'])
            # cpu_period == 0 means "no CPU limit set".
            self.host_stats[host_config.name].cores_allocated = sum(
                cont['cpu_quota'] / cont['cpu_period'] for cont in container_list
                if cont['cpu_period'] != 0)
            stats = {}
            self.host_stats[host_config.name].memory_reserved = 0
            self.host_stats[host_config.name].cores_reserved = 0
            for cont in container_list:
                service = self.state.services.select(only_one=True, backend_host=host_config.name, backend_id=cont['id'])
                if service is None:
                    log.warning('Container {} on host {} has no corresponding service'.format(cont['name'], host_config.name))
                    if cont['state'] == Service.BACKEND_DIE_STATUS:
                        log.warning('Terminating dead and orphan container {}'.format(cont['name']))
                        my_engine.terminate_container(cont['id'], delete=True)
                    continue
                self._update_service_status(service, cont)
                self.host_stats[host_config.name].memory_reserved += service.resource_reservation.memory.min
                self.host_stats[host_config.name].cores_reserved += service.resource_reservation.cores.min
                stats[service.id] = {
                    # BUG FIX: guard the division; cpu_period can be 0 when no
                    # CPU limit is set (the sums above already filter on it).
                    'core_limit': cont['cpu_quota'] / cont['cpu_period'] if cont['cpu_period'] != 0 else 0,
                    'mem_limit': cont['memory_soft_limit']
                }
            self.host_stats[host_config.name].service_stats = stats
            images = []
            for dk_image in my_engine.list_images():
                image = {
                    'id': dk_image.attrs['Id'],
                    'size': dk_image.attrs['Size'],
                    'names': dk_image.tags  # type: list
                }
                for name in image['names']:
                    if name[-7:] == ':latest':
                        # add an image with the name without 'latest' to fake
                        # Docker image lookup algorithm
                        image['names'].append(name[:-7])
                        break
                images.append(image)
            self.host_stats[host_config.name].images = images
        # Sleep the remainder of the interval, waking early if stop is set.
        sleep_time = CHECK_INTERVAL - (time.time() - time_start)
        if sleep_time <= 0:
            log.warning('synchro thread for host {} is late by {:.2f} seconds'.format(host_config.name, sleep_time * -1))
            sleep_time = 0
        if self.stop.wait(timeout=sleep_time):
            break
    log.info("Synchro thread for host {} stopped".format(host_config.name))
def _update_node_state(self, host_conf: DockerHostConfig, node_stats: NodeStats, get_usage_stats: bool):
    """Refresh node_stats in place with data polled from one Docker host.

    :param host_conf: connection details and labels of the host to poll
    :param node_stats: the stats object to update in place
    :param get_usage_stats: also gather per-container usage figures (slow
        when KairosDB is disabled: one engine stats call per container)
    """
    node_stats.labels = host_conf.labels
    try:
        my_engine = DockerClient(host_conf)
    except ZoeException as e:
        log.error(str(e))
        node_stats.status = 'offline'
        log.info('Node {} is offline'.format(host_conf.name))
        return
    else:
        node_stats.status = 'online'

    try:
        container_list = my_engine.list(only_label={'zoe_deployment_name': get_conf().deployment_name})
        info = my_engine.info()
    except ZoeException:
        return  # transient engine error: keep previous stats, retry next cycle

    node_stats.container_count = len(container_list)
    node_stats.cores_total = info['NCPU']
    node_stats.memory_total = info['MemTotal']
    if info['Labels'] is not None:
        # BUG FIX: this used 'labels += set(...)', which raises TypeError when
        # labels is a set (sets define no '+'); merge explicitly instead,
        # consistent with the set handling in the host synchro thread.
        node_stats.labels = set(node_stats.labels) | set(info['Labels'])
    # memory_soft_limit equal to the host total means "no limit set".
    node_stats.memory_reserved = sum(
        cont['memory_soft_limit'] for cont in container_list
        if cont['memory_soft_limit'] != node_stats.memory_total)
    # cpu_period == 0 means "no CPU limit set".
    node_stats.cores_reserved = sum(
        cont['cpu_quota'] / cont['cpu_period'] for cont in container_list
        if cont['cpu_period'] != 0)

    stats = {}
    for cont in container_list:
        stats[cont['id']] = {
            # BUG FIX: guard the division, cpu_period can be 0 (see above).
            'core_limit': cont['cpu_quota'] / cont['cpu_period'] if cont['cpu_period'] != 0 else 0,
            'mem_limit': cont['memory_soft_limit'],
        }
    node_stats.service_stats = stats

    if not get_usage_stats:
        node_stats.memory_in_use = 0
        node_stats.cores_in_use = 0
        return

    if get_conf().kairosdb_enable:
        kdb = KairosDBInMetrics()
        for cont in container_list:
            stats[cont['id']].update(kdb.get_service_usage(cont['name']))
    else:
        for cont in container_list:
            try:
                aux = my_engine.stats(cont['id'], stream=False)  # this call is very slow (>~1sec)
                if 'usage' in aux['memory_stats']:
                    stats[cont['id']]['mem_usage'] = aux['memory_stats']['usage']
                else:
                    stats[cont['id']]['mem_usage'] = 0
                stats[cont['id']]['cpu_usage'] = self._get_core_usage(aux)
            except ZoeException:
                continue
    # BUG FIX: use .get() with a default — containers whose stats call failed
    # (or whose KairosDB record lacks the keys) would otherwise KeyError here.
    node_stats.memory_in_use = sum(stat.get('mem_usage', 0) for stat in stats.values())
    node_stats.cores_in_use = sum(stat.get('cpu_usage', 0) for stat in stats.values())
def info(self) -> ClusterStats:
    """Retrieve Swarm statistics. The Docker API returns a mess difficult to parse.

    Walks the legacy Swarm-standalone ``SystemStatus`` table, a flat list of
    ``[key, value]`` rows: a few header rows (role, strategy, filters, node
    count) followed by a fixed-size group of rows per node.
    """
    info = self.cli.info()
    pl_status = ClusterStats()

    # SystemStatus is a list...
    # idx tracks the current header row; each assert guards against the
    # row layout shifting between Swarm versions.
    idx = 0
    # Role, skip
    idx += 1
    assert 'Strategy' in info["SystemStatus"][idx][0]
    pl_status.placement_strategy = info["SystemStatus"][idx][1]
    idx += 1
    assert 'Filters' in info["SystemStatus"][idx][0]
    pl_status.active_filters = [x.strip() for x in info["SystemStatus"][idx][1].split(", ")]
    idx += 1
    assert 'Nodes' in info["SystemStatus"][idx][0]
    node_count = int(info["SystemStatus"][idx][1])
    idx += 1
    # At index 4 the nodes begin
    # Each node occupies one name/endpoint row plus 8 attribute rows; idx2
    # counts attribute rows within the current node's group, and the
    # 'idx += idx2' at the end of the loop (together with 'node' advancing
    # by 1) steps idx+node to the next node's first row.
    for node in range(node_count):
        idx2 = 0
        node_stats = NodeStats(info["SystemStatus"][idx + node][0].strip())
        node_stats.docker_endpoint = info["SystemStatus"][idx + node][1]
        idx2 += 1
        # ID, skip
        idx2 += 1
        # Status
        if info["SystemStatus"][idx + node + idx2][1] == 'Healthy':
            node_stats.status = 'online'
        else:
            node_stats.status = 'offline'
        idx2 += 1
        # Containers
        # value looks like "<running> (...)" — keep the leading count only;
        # presumably "N Paused, M Stopped" follows — TODO confirm format
        node_stats.container_count = int(info["SystemStatus"][idx + node + idx2][1].split(' ')[0])
        idx2 += 1
        # CPUs
        # value is "<reserved> / <total>"
        node_stats.cores_reserved = int(info["SystemStatus"][idx + node + idx2][1].split(' / ')[0])
        node_stats.cores_total = int(info["SystemStatus"][idx + node + idx2][1].split(' / ')[1])
        idx2 += 1
        # Memory
        # value is "<reserved> / <total>" in human-readable units,
        # converted to bytes below
        node_stats.memory_reserved = info["SystemStatus"][idx + node + idx2][1].split(' / ')[0]
        node_stats.memory_total = info["SystemStatus"][idx + node + idx2][1].split(' / ')[1]
        idx2 += 1
        # Labels
        node_stats.labels = info["SystemStatus"][idx + node + idx2][1].split(', ')
        idx2 += 1
        # Last update
        node_stats.last_update = info["SystemStatus"][idx + node + idx2][1]
        idx2 += 1
        # Docker version
        node_stats.server_version = info["SystemStatus"][idx + node + idx2][1]
        # Convert the human-readable memory strings (e.g. "1.0 GiB") to bytes.
        node_stats.memory_reserved = humanfriendly.parse_size(node_stats.memory_reserved)
        node_stats.memory_total = humanfriendly.parse_size(node_stats.memory_total)
        pl_status.nodes.append(node_stats)
        idx += idx2
    pl_status.timestamp = time.time()
    return pl_status