def _update_node_state(self, host_conf: DockerHostConfig, node_stats: NodeStats, get_usage_stats: bool):
    """Refresh ``node_stats`` in place with the current state of one Docker host.

    Marks the node offline (and returns early) if the engine cannot be
    contacted, otherwise fills in totals, reservations and, optionally,
    per-container usage figures.

    :param host_conf: connection details and static labels for the host
    :param node_stats: the NodeStats object to update in place
    :param get_usage_stats: when True also gather live usage (slow without KairosDB)
    """
    node_stats.labels = host_conf.labels
    try:
        my_engine = DockerClient(host_conf)
    except ZoeException as e:
        log.error(str(e))
        node_stats.status = 'offline'
        log.info('Node {} is offline'.format(host_conf.name))
        return
    else:
        node_stats.status = 'online'

    try:
        container_list = my_engine.list(only_label={'zoe_deployment_name': get_conf().deployment_name})
        info = my_engine.info()
    except ZoeException:
        # Transient engine failure: keep the previous stats untouched.
        return

    node_stats.container_count = len(container_list)
    node_stats.cores_total = info['NCPU']
    node_stats.memory_total = info['MemTotal']
    if info['Labels'] is not None:
        node_stats.labels += set(info['Labels'])

    # A soft limit equal to the node total means "unlimited", so do not count it
    # as a reservation; a cpu_period of 0 means "no CPU limit set".
    node_stats.memory_reserved = sum(cont['memory_soft_limit'] for cont in container_list
                                     if cont['memory_soft_limit'] != node_stats.memory_total)
    node_stats.cores_reserved = sum(cont['cpu_quota'] / cont['cpu_period'] for cont in container_list
                                    if cont['cpu_period'] != 0)

    stats = {}
    for cont in container_list:
        stats[cont['id']] = {
            # BUG FIX: guard against cpu_period == 0 (unlimited), which the
            # original code divided by unconditionally.
            'core_limit': cont['cpu_quota'] / cont['cpu_period'] if cont['cpu_period'] != 0 else 0,
            'mem_limit': cont['memory_soft_limit'],
        }
    node_stats.service_stats = stats

    if not get_usage_stats:
        node_stats.memory_in_use = 0
        node_stats.cores_in_use = 0
        return

    if get_conf().kairosdb_enable:
        kdb = KairosDBInMetrics()
        for cont in container_list:
            stats[cont['id']].update(kdb.get_service_usage(cont['name']))
    else:
        for cont in container_list:
            try:
                aux = my_engine.stats(cont['id'], stream=False)  # this call is very slow (>~1sec)
            except ZoeException:
                continue  # entry keeps only its limits; the sums below tolerate that
            stats[cont['id']]['mem_usage'] = aux['memory_stats'].get('usage', 0)
            stats[cont['id']]['cpu_usage'] = self._get_core_usage(aux)

    # BUG FIX: use .get() with a default — entries for containers whose stats
    # call failed (or whose KairosDB record lacks a metric) have no
    # 'mem_usage'/'cpu_usage' key and made these sums raise KeyError.
    node_stats.memory_in_use = sum(stat.get('mem_usage', 0) for stat in stats.values())
    node_stats.cores_in_use = sum(stat.get('cpu_usage', 0) for stat in stats.values())
@staticmethod
def _parse_k8s_cpu(quantity) -> float:
    """Parse a Kubernetes CPU quantity into a float number of cores.

    Handles all the formats Kubernetes emits: whole cores ('2'),
    fractional cores ('0.1') and millicores ('100m').
    """
    quantity = str(quantity)
    if quantity.endswith('m'):
        return int(quantity[:-1]) / 1000
    return float(quantity)

def info(self) -> ClusterStats:  # pylint: disable=too-many-locals
    """Retrieve Kubernetes cluster statistics.

    Builds one NodeStats per cluster node, then walks every running pod
    and accumulates its resource requests onto the node it is scheduled on.
    """
    pl_status = ClusterStats()

    node_list = pykube.Node.objects(self.api).filter(namespace=pykube.all).iterator()
    node_dict = {}
    # Get basic information from nodes.
    for node in node_list:
        nss = NodeStats(node.name)
        # BUG FIX: the old code did float(cpu[1]) — a workaround for a GKE
        # quirk ("Bug found on GKE") that crashed on single-digit counts
        # ('2') and truncated multi-digit ones ('24'). Parse the quantity
        # properly instead, including millicore values like '3920m'.
        nss.cores_total = self._parse_k8s_cpu(node.obj['status']['allocatable']['cpu'])
        nss.memory_total = humanfriendly.parse_size(node.obj['status']['allocatable']['memory'])
        nss.labels = node.obj['metadata']['labels']
        nss.status = 'online'
        node_dict[str(socket.gethostbyname(node.name))] = nss

    # Get information from all running pods, then accumulate to nodes.
    pod_list = pykube.Pod.objects(self.api).filter(namespace=pykube.all).iterator()
    for pod in pod_list:
        try:
            host_ip = pod.obj['status']['hostIP']
        except KeyError:
            continue  # pod not scheduled yet
        nss = node_dict.get(host_ip)
        if nss is None:
            # BUG FIX: a pod on a host we could not resolve above used to
            # raise KeyError; skip it instead.
            continue
        nss.container_count += 1
        spec_cont = pod.obj['spec']['containers'][0]
        requests = spec_cont.get('resources', {}).get('requests', {})
        if 'memory' in requests:
            nss.memory_reserved = nss.memory_reserved + humanfriendly.parse_size(requests['memory'])
        if 'cpu' in requests:
            # BUG FIX: the old code did int() on the value, crashing on
            # fractional requests like '0.1' that its own comment promised
            # to support; _parse_k8s_cpu handles both forms.
            nss.cores_reserved = round(nss.cores_reserved + self._parse_k8s_cpu(requests['cpu']), 3)

    for node_ip in node_dict:
        pl_status.nodes.append(node_dict[node_ip])
    return pl_status
def info(self) -> ClusterStats:
    """Retrieve Swarm statistics. The Docker API returns a mess difficult to parse.

    The Swarm (standalone) /info endpoint encodes everything in
    ``SystemStatus``, a flat list of ``[label, value]`` string pairs:
    a fixed header (Role, Strategy, Filters, Nodes) followed by a run of
    entries per node (name/endpoint, ID, Status, Containers, CPUs, Memory,
    Labels, last update, Docker version). This method walks that list
    positionally; the asserts sanity-check that the layout still matches.
    NOTE(review): asserts are stripped under ``python -O`` — confirm that
    is acceptable for this validation.
    """
    info = self.cli.info()
    pl_status = ClusterStats()

    # SystemStatus is a list...
    idx = 0  # index 0 is the Role entry, skip it
    idx += 1
    assert 'Strategy' in info["SystemStatus"][idx][0]
    pl_status.placement_strategy = info["SystemStatus"][idx][1]
    idx += 1
    assert 'Filters' in info["SystemStatus"][idx][0]
    # Filters come as one comma-separated string.
    pl_status.active_filters = [x.strip() for x in info["SystemStatus"][idx][1].split(", ")]
    idx += 1
    assert 'Nodes' in info["SystemStatus"][idx][0]
    node_count = int(info["SystemStatus"][idx][1])
    idx += 1
    # At index 4 the nodes begin. Each node occupies a run of consecutive
    # entries; idx2 is the offset of the current field within that run and
    # idx is advanced past the run at the end of each iteration.
    for node in range(node_count):
        idx2 = 0
        # First entry of the run: node name (padded) and its endpoint.
        node_stats = NodeStats(info["SystemStatus"][idx + node][0].strip())
        node_stats.docker_endpoint = info["SystemStatus"][idx + node][1]
        idx2 += 1  # ID, skip
        idx2 += 1  # Status
        if info["SystemStatus"][idx + node + idx2][1] == 'Healthy':
            node_stats.status = 'online'
        else:
            node_stats.status = 'offline'
        idx2 += 1  # Containers, e.g. "5 (3 Running, ...)" -> take the total
        node_stats.container_count = int(info["SystemStatus"][idx + node + idx2][1].split(' ')[0])
        idx2 += 1  # CPUs, formatted "reserved / total"
        node_stats.cores_reserved = int(info["SystemStatus"][idx + node + idx2][1].split(' / ')[0])
        node_stats.cores_total = int(info["SystemStatus"][idx + node + idx2][1].split(' / ')[1])
        idx2 += 1  # Memory, formatted "reserved / total" in human units
        node_stats.memory_reserved = info["SystemStatus"][idx + node + idx2][1].split(' / ')[0]
        node_stats.memory_total = info["SystemStatus"][idx + node + idx2][1].split(' / ')[1]
        idx2 += 1  # Labels, comma-separated
        node_stats.labels = info["SystemStatus"][idx + node + idx2][1].split(', ')
        idx2 += 1  # Last update timestamp
        node_stats.last_update = info["SystemStatus"][idx + node + idx2][1]
        idx2 += 1  # Docker version
        node_stats.server_version = info["SystemStatus"][idx + node + idx2][1]
        # Convert the human-readable sizes ("1.5 GiB") captured above to bytes.
        node_stats.memory_reserved = humanfriendly.parse_size(node_stats.memory_reserved)
        node_stats.memory_total = humanfriendly.parse_size(node_stats.memory_total)
        pl_status.nodes.append(node_stats)
        idx += idx2  # skip past this node's run of entries
    pl_status.timestamp = time.time()
    return pl_status