from json import loads


def select_pod_form_set(api, list_set, namespace):
    pre_pod = Pod.objects(api).filter(namespace=namespace)
    list_pods = []
    for p in pre_pod.response['items']:
        pod_obj = Pod(api, p)
        pod_name = pod_obj.name
        try:
            # the created-by annotation holds a serialised reference to the
            # controller (ReplicaSet/ReplicationController) that owns this pod
            set_name = loads(p["metadata"]["annotations"]
                             ["kubernetes.io/created-by"])["reference"]["name"]
        except (KeyError, ValueError):
            # pod has no created-by annotation (or it is malformed); skip it
            continue
        for num, e in enumerate(list_set):
            if e["name"] == set_name:
                dic_pod = {
                    "name": pod_name,
                    "set_name": set_name,
                    "podIP": p.get("status", {}).get("podIP", "0.0.0.0"),
                }
                list_pods.append(dic_pod)
                list_set[num]["list_pods"] = list_pods
    return list_set
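# Usage sketch for select_pod_form_set (assumptions: a reachable cluster via
# ~/.kube/config, and that each list_set entry carries a "name" key naming a
# ReplicaSet/ReplicationController; the set names below are illustrative):
import pykube

api = pykube.HTTPClient(pykube.KubeConfig.from_file())
sets = [{'name': 'frontend'}, {'name': 'backend'}]
enriched = select_pod_form_set(api, sets, namespace='default')
for s in enriched:
    print(s['name'], [p['podIP'] for p in s.get('list_pods', [])])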
def run_item(self, item_name) -> dict:
    """
    run_item

    Execute an item job Pod with the spec details from the
    appropriate OaatType object.
    """
    # TODO: check oaatType
    spec = self.oaattype.podspec()
    contspec = spec['container']
    del spec['container']
    contspec.setdefault('env', []).append({
        'name': 'OAAT_ITEM',
        'value': item_name
    })
    for idx in range(len(contspec.get('command', []))):
        contspec['command'][idx] = (
            contspec['command'][idx].replace('%%oaat_item%%', item_name))
    for idx in range(len(contspec.get('args', []))):
        contspec['args'][idx] = (
            contspec['args'][idx].replace('%%oaat_item%%', item_name))
    for env in contspec['env']:
        env['value'] = (
            env.get('value', '').replace('%%oaat_item%%', item_name))

    # TODO: currently only supports a single container. Do we want
    # multi-container?
    doc = {
        'apiVersion': 'v1',
        'kind': 'Pod',
        'metadata': {
            'generateName': self.name + '-' + item_name + '-',
            'labels': {
                'parent-name': self.name,
                'oaat-name': item_name,
                'app': 'oaat-operator'
            }
        },
        'spec': {
            'containers': [contspec],
            **spec,
            'restartPolicy': 'Never'
        },
    }

    kopf.adopt(doc)

    pod = Pod(self.api, doc)
    try:
        pod.create()
    except pykube.exceptions.KubernetesError as exc:
        self.items.mark_failed(item_name)
        raise ProcessingComplete(
            error=f'could not create pod {doc}: {exc}',
            message=f'error creating pod for {item_name}')
    return pod
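# A sketch of the podspec shape run_item() consumes, inferred from the code
# above (an assumption: podspec() returns a dict whose 'container' entry is a
# single Kubernetes container spec, and all other keys are merged into the
# Pod's spec; the command, env value, and serviceAccountName are illustrative):
example_podspec = {
    'container': {
        'name': 'worker',
        'image': 'busybox',
        # %%oaat_item%% placeholders are substituted with the item name
        'command': ['sh', '-c', 'process-item %%oaat_item%%'],
        'env': [{'name': 'MODE', 'value': 'item=%%oaat_item%%'}],
    },
    'serviceAccountName': 'oaat-runner',
}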
def create_worker(self, player_id):
    pod = Pod(
        self.api,
        {
            'kind': 'Pod',
            'apiVersion': 'v1',
            'metadata': {
                'generateName': "aimmo-%s-worker-%s-" % (self.game_name, player_id),
                'labels': {
                    'app': 'aimmo-game-worker',
                    'game': self.game_name,
                    'player': str(player_id),
                },
            },
            'spec': {
                'containers': [
                    {
                        'env': [
                            {
                                'name': 'DATA_URL',
                                'value': "%s/player/%d" % (self.game_url, player_id),
                            },
                        ],
                        'name': 'aimmo-game-worker',
                        'image': 'ocadotechnology/aimmo-game-worker:%s' % os.environ.get('IMAGE_SUFFIX', 'latest'),
                        'ports': [
                            {
                                'containerPort': 5000,
                                'protocol': 'TCP'
                            }
                        ],
                        'resources': {
                            'limits': {
                                'cpu': '10m',
                                'memory': '64Mi',
                            },
                        },
                    },
                ],
            },
        }
    )
    pod.create()
    iterations = 0
    while pod.obj['status']['phase'] == 'Pending':
        if iterations > 30:
            raise EnvironmentError('Could not start worker %s, details %s' % (player_id, pod.obj))
        LOGGER.debug('Waiting for worker %s', player_id)
        time.sleep(5)
        pod.reload()
        iterations += 1
    worker_url = "http://%s:5000" % pod.obj['status']['podIP']
    LOGGER.info("Worker started for %s, listening at %s", player_id, worker_url)
    return worker_url
def create_worker(self, player_id):
    pod = Pod(
        self.api,
        {
            'kind': 'Pod',
            'apiVersion': 'v1',
            'metadata': {
                'generateName': "aimmo-%s-worker-%s-" % (self.game_name, player_id),
                'labels': {
                    'app': 'aimmo-game-worker',
                    'game': self.game_name,
                    'player': str(player_id),
                },
            },
            'spec': {
                'containers': [
                    {
                        'env': [
                            {
                                'name': 'DATA_URL',
                                'value': "%s/player/%d" % (self.game_url, player_id),
                            },
                        ],
                        'name': 'aimmo-game-worker',
                        'image': 'ocadotechnology/aimmo-game-worker:latest',
                        'ports': [
                            {
                                'containerPort': 5000,
                                'protocol': 'TCP'
                            }
                        ],
                        'resources': {
                            'limits': {
                                'cpu': '10m',
                                'memory': '64Mi',
                            },
                        },
                    },
                ],
            },
        }
    )
    pod.create()
    time.sleep(20)
    pod.reload()
    worker_url = "http://%s:5000" % pod.obj['status']['podIP']
    LOGGER.info("Worker started for %s, listening at %s", player_id, worker_url)
    return worker_url
def remove_worker(self, player_id):
    for pod in Pod.objects(self.api).filter(selector={
        'app': 'aimmo-game-worker',
        'game': self.game_name,
        'player': str(player_id),
    }):
        pod.delete()
def start(self, object_type, object_file_path, force=False):
    if not self.api:
        logging.info('API Client does not exist')
        return
    with open(object_file_path) as json_data:
        json_file = json.load(json_data)
    if object_type is KubernetesObjects.POD:
        pod = Pod(self.api, json_file)
        self._recreate_object(pod, force)
        self._add_object_to_kube_objects_dict('pods', pod)
    elif object_type is KubernetesObjects.SERVICE:
        service = Service(self.api, json_file)
        self._recreate_object(service, force)
        self._add_object_to_kube_objects_dict('services', service)
    elif object_type is KubernetesObjects.REPLICATION_CONTROLLER:
        rc = ReplicationController(self.api, json_file)
        self._recreate_object(rc, force)
        self._add_object_to_kube_objects_dict('rcs', rc)
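# start() compares object_type with `is`, which implies enum members; a
# minimal sketch of the KubernetesObjects enum assumed above (the real
# definition and member values may differ):
from enum import Enum


class KubernetesObjects(Enum):
    POD = 'pod'
    SERVICE = 'service'
    REPLICATION_CONTROLLER = 'replication_controller'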
def find_backend_application(client, ingress, rule):
    '''
    The Ingress object might not have an "application" label, so let's try
    to find the application by looking at the backend service and its pods
    '''
    paths = rule.get('http', {}).get('paths', [])
    selectors = []
    for path in paths:
        service_name = path.get('backend', {}).get('serviceName')
        if service_name:
            try:
                service = Service.objects(
                    client, namespace=ingress.namespace).get(name=service_name)
            except ObjectDoesNotExist:
                logger.debug(
                    f'Referenced service does not exist: {ingress.namespace}/{service_name}'
                )
            else:
                selector = service.obj['spec'].get('selector', {})
                selectors.append(selector)
                application = get_application_from_labels(selector)
                if application:
                    return application
    # we still haven't found the application, let's look up pods by label selectors
    for selector in selectors:
        application_candidates = set()
        for pod in Pod.objects(client).filter(namespace=ingress.namespace,
                                              selector=selector):
            application = get_application_from_labels(pod.labels)
            if application:
                application_candidates.add(application)
        if len(application_candidates) == 1:
            return application_candidates.pop()
    return ''
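# Hypothetical sketch of the get_application_from_labels helper the function
# above relies on; the label key 'application' is an assumption:
def get_application_from_labels(labels) -> str:
    return (labels or {}).get('application', '')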
def remove_worker(self, player_id):
    for pod in Pod.objects(self.api).filter(
            selector={
                'app': 'aimmo-game-worker',
                'game': self.game_name,
                'player': str(player_id),
            }):
        pod.delete()
def remove_worker(self, player_id):
    for pod in Pod.objects(self.api).filter(selector={
        'app': 'aimmo-game-worker',
        'game': self.game_name,
        'player': str(player_id),
    }):
        LOGGER.debug('Removing pod %s', pod.obj['spec'])
        pod.delete()
def remove_worker(self, player_id):
    for pod in Pod.objects(self.api).filter(selector={
        'app': 'aimmo-game-worker',
        'game': self.game_id,
        'player': str(player_id),
    }):
        LOGGER.debug('Removing pod %s', pod.obj['spec'])
        pod.delete()
def validate_running_pod(self) -> None:
    """
    validate_running_pod

    Check whether the Pod we previously started is still running. If not,
    assume the job was killed without being processed by the operator (or
    was never started) and clean up. Mark as failed.

    If the Pod is still running, update the status details.

    Outcomes:
        - returns None if no pod is expected
        - raises ProcessingComplete if a pod is expected but not running
        - raises ProcessingComplete if a pod is expected and is running
    """
    # TODO: what if a pod is running, but the operator doesn't expect one?
    curpod = self.get_status('pod')
    curitem = self.get_status('currently_running')
    if curpod:
        try:
            pod = Pod.objects(
                self.api, namespace=self.namespace).get_by_name(curpod).obj
        except pykube.exceptions.ObjectDoesNotExist:
            self.info(f'pod {curpod} missing/deleted, cleaning up')
            self.set_status('currently_running')
            self.set_status('pod')
            self.set_status('state', 'missing')
            self.items.mark_failed(curitem)
            self.items.set_item_status(curitem, 'pod_detail')
            raise ProcessingComplete(
                info='Cleaned up missing/deleted item')

        podphase = pod.get('status', {}).get('phase', 'unknown')
        self.info(f'validated that pod {curpod} is '
                  f'still running (phase={podphase})')

        recorded_phase = self.items.status(curitem, 'podphase', 'unknown')

        # valid phases are Pending, Running, Succeeded, Failed, Unknown
        # 'started' is the phase the pods start with when created by
        # the operator.
        if recorded_phase in ('started', 'Pending', 'Running', 'Failed'):
            self.info(f'item {curitem} status for '
                      f'{curpod}: {recorded_phase}')
            raise ProcessingComplete(
                message=f'item {curitem} {recorded_phase.lower()}')

        if recorded_phase == 'Succeeded':
            self.info(f'item {curitem} podphase={recorded_phase} but '
                      f'not yet acknowledged: {curpod}')
            raise ProcessingComplete(message=f'item {curitem} succeeded, '
                                     'awaiting acknowledgement')

        raise ProcessingComplete(
            error=f'item {curitem} unexpected state: '
                  f'recorded_phase={recorded_phase}, '
                  f'status={str(self.status)}',
            message=f'item {curitem} unexpected state')
def get_application_label_from_pods(client: pykube.HTTPClient, namespace,
                                    selector):
    application_candidates = set()
    for pod in Pod.objects(client).filter(namespace=namespace,
                                          selector=selector):
        application = get_application_from_labels(pod.labels)
        if application:
            application_candidates.add(application)
    if len(application_candidates) == 1:
        return application_candidates.pop()
    return ""
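# Usage sketch (assumes in-cluster service-account credentials; the namespace
# and selector are illustrative):
import pykube

client = pykube.HTTPClient(pykube.KubeConfig.from_service_account())
app = get_application_label_from_pods(client, 'default', {'app': 'my-service'})
print(app or 'no unambiguous application label found')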
def create_worker(self, player_id):
    pod = Pod(
        self.api,
        {
            'kind': 'Pod',
            'apiVersion': 'v1',
            'metadata': {
                'generateName': "aimmo-%s-worker-%s-" % (self.game_id, player_id),
                'labels': {
                    'app': 'aimmo-game-worker',
                    'game': self.game_id,
                    'player': str(player_id),
                },
            },
            'spec': {
                'containers': [
                    {
                        'env': [
                            {
                                'name': 'DATA_URL',
                                'value': "%s/player/%d" % (self.game_url, player_id),
                            },
                        ],
                        'name': 'aimmo-game-worker',
                        'image': 'ocadotechnology/aimmo-game-worker:%s' % os.environ.get('IMAGE_SUFFIX', 'latest'),
                        'ports': [{
                            'containerPort': 5000,
                            'protocol': 'TCP'
                        }],
                        'resources': {
                            'limits': {
                                'cpu': '10m',
                                'memory': '64Mi',
                            },
                        },
                    },
                ],
            },
        })
    pod.create()
    iterations = 0
    while pod.obj['status']['phase'] == 'Pending':
        if iterations > 30:
            raise EnvironmentError(
                'Could not start worker %s, details %s' % (player_id, pod.obj))
        LOGGER.debug('Waiting for worker %s', player_id)
        time.sleep(5)
        pod.reload()
        iterations += 1
    worker_url = "http://%s:5000" % pod.obj['status']['podIP']
    LOGGER.info("Worker started for %s, listening at %s",
                player_id, worker_url)
    return worker_url
def validate_expected_pod_is_running(self) -> None:
    """
    validate_expected_pod_is_running

    Validate that the pod we expect to be running (based on the
    `oaatgroup` status fields `pod` and `currently_running`) actually
    exists.

    Check whether the Pod we previously started is still running. If not,
    assume the job was killed without being processed by the operator (or
    was never started) and clean up. Mark as failed.

    Raises:
        ProcessingComplete:
            - Cleaned up missing/deleted item
            - Pod exists and is in state: <state>
    """
    curpod = self.get_status('pod')
    curitem = self.get_status('currently_running')

    try:
        pod = Pod.objects(self.api,
                          namespace=self.namespace).get_by_name(curpod).obj
    except pykube.exceptions.ObjectDoesNotExist:
        self.info(f'pod {curpod} missing/deleted, cleaning up')
        self.set_status('currently_running')
        self.set_status('pod')
        self.set_status('state', 'missing')
        self.items.mark_failed(curitem)
        self.items.set_item_status(curitem, 'pod_detail')
        raise ProcessingComplete(
            message=f'item {curitem} failed during validation',
            info='Cleaned up missing/deleted item')

    podphase = pod.get('status', {}).get('phase', 'unknown')
    self.info(f'validated that pod {curpod} exists '
              f'(phase={podphase})')

    recorded_phase = self.items.status(curitem, 'podphase', 'unknown')

    # if there is a mismatch in phase, then the pod phase handlers
    # have not yet picked it up and updated the oaatgroup phase.
    # Note it here, but take no further action
    if podphase != recorded_phase:
        self.info(f'mismatch in phase for pod {curpod}: '
                  f'pod={podphase}, oaatgroup={recorded_phase}')

    # valid phases are Pending, Running, Succeeded, Failed, Unknown
    # 'started' is the phase the pods start with when created by
    # the operator.
    raise ProcessingComplete(
        message=f'Pod {curpod} exists and is in state {podphase}')
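# Sketch of how a caller might consume these validators (an assumption:
# ProcessingComplete is the operator's flow-control exception and exposes the
# keyword arguments it was raised with):
def check_pod(overseer):
    try:
        overseer.validate_expected_pod_is_running()
    except ProcessingComplete as exc:
        # surface the human-readable outcome to the handler framework
        return getattr(exc, 'message', str(exc))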
def validate_no_rogue_pods_are_running(self) -> None:
    found_rogue = 0
    for pod in Pod.objects(self.api, namespace=self.namespace).iterator():
        if pod.name == self.get_status('pod'):
            continue
        if pod.labels.get('parent-name', '') == self.name:
            if pod.labels.get('app', '') == 'oaat-operator':
                podphase = pod.obj['status'].get('phase', 'unknown')
                if podphase in ['Running', 'Pending']:
                    self.warning(
                        f'rogue pod {pod.name} found (phase={podphase})')
                    found_rogue += 1

    if found_rogue > 0:
        raise ProcessingComplete(
            message='rogue pods running',
            error=f'found {found_rogue} rogue pods running')
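# An equivalent, narrower query (a sketch): the same labels checked above can
# be pushed into a pykube label selector so that only candidate pods are
# fetched from the API server, instead of iterating every pod in the namespace:
def rogue_pod_candidates(api, namespace, parent_name):
    return Pod.objects(api, namespace=namespace).filter(selector={
        'app': 'oaat-operator',
        'parent-name': parent_name,
    })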
def create_worker(self, player_id):
    pod = Pod(
        self.api,
        {
            'kind': 'Pod',
            'apiVersion': 'v1',
            'metadata': {
                'generateName': "aimmo-%s-worker-%s-" % (self.game_name, player_id),
                'labels': {
                    'app': 'aimmo-game-worker',
                    'game': self.game_name,
                    'player': str(player_id),
                },
            },
            'spec': {
                'containers': [
                    {
                        'env': [
                            {
                                'name': 'DATA_URL',
                                'value': "%s/player/%d" % (self.game_url, player_id),
                            },
                        ],
                        'name': 'aimmo-game-worker',
                        'image': 'ocadotechnology/aimmo-game-worker:%s' % os.environ.get('IMAGE_SUFFIX', 'latest'),
                        'ports': [{
                            'containerPort': 5000,
                            'protocol': 'TCP'
                        }],
                        'resources': {
                            'limits': {
                                'cpu': '10m',
                                'memory': '64Mi',
                            },
                        },
                    },
                ],
            },
        })
    pod.create()
    time.sleep(20)
    pod.reload()
    worker_url = "http://%s:5000" % pod.obj['status']['podIP']
    LOGGER.info("Worker started for %s, listening at %s", player_id, worker_url)
    return worker_url
def find_backend_application(client: pykube.HTTPClient, ingress: Ingress, rule):
    """
    Find the application ID for a given Ingress object.

    The Ingress object might not have an "application" label, so let's try
    to find the application by looking at the backend service and its pods
    """
    paths = rule.get("http", {}).get("paths", [])
    selectors = []
    for path in paths:
        service_name = path.get("backend", {}).get("serviceName")
        if service_name:
            try:
                service = Service.objects(client, namespace=ingress.namespace).get(
                    name=service_name
                )
            except ObjectDoesNotExist:
                logger.debug(
                    f"Referenced service does not exist: {ingress.namespace}/{service_name}"
                )
            else:
                selector = service.obj["spec"].get("selector", {})
                selectors.append(selector)
                application = get_application_from_labels(selector)
                if application:
                    return application
    # we still haven't found the application, let's look up pods by label selectors
    for selector in selectors:
        application_candidates = set()
        for pod in Pod.objects(client).filter(
            namespace=ingress.namespace, selector=selector
        ):
            application = get_application_from_labels(pod.labels)
            if application:
                application_candidates.add(application)
        if len(application_candidates) == 1:
            return application_candidates.pop()
    return ""
def _get_pods(self):
    return Pod.objects(self.api).filter(selector=self.labels)
def query_kubernetes_cluster(cluster):
    cluster_id = cluster.id
    api_server_url = cluster.api_server_url
    nodes = {}
    pods_by_namespace_name = {}
    unassigned_pods = {}
    for node in Node.objects(cluster.client):
        obj = map_node(node.obj)
        nodes[obj['name']] = obj
    now = time.time()
    for pod in Pod.objects(cluster.client, namespace=pykube.all):
        obj = map_pod(pod.obj)
        if 'deletionTimestamp' in pod.metadata:
            obj['deleted'] = parse_time(pod.metadata['deletionTimestamp'])
        for cont in pod.obj['spec']['containers']:
            obj['containers'].append(map_container(cont, pod.obj))
        if obj['phase'] in ('Succeeded', 'Failed'):
            last_termination_time = 0
            for container in obj['containers']:
                termination_time = container.get('state', {}).get(
                    'terminated', {}).get('finishedAt')
                if termination_time:
                    termination_time = parse_time(termination_time)
                    if termination_time > last_termination_time:
                        last_termination_time = termination_time
            if (last_termination_time and
                    last_termination_time < now - 3600) or (
                        obj.get('reason') == 'Evicted'):
                # the job/pod finished more than an hour ago or it was
                # evicted by cgroup limits => filter out
                continue
        pods_by_namespace_name[(pod.namespace, pod.name)] = obj
        pod_key = f'{pod.namespace}/{pod.name}'
        node_name = pod.obj['spec'].get('nodeName')
        if node_name in nodes:
            nodes[node_name]['pods'][pod_key] = obj
        else:
            unassigned_pods[pod_key] = obj

    try:
        for node_metrics in NodeMetrics.objects(cluster.client):
            key = node_metrics.name
            nodes[key]['usage'] = node_metrics.obj.get('usage', {})
    except Exception as e:
        logger.warning('Failed to query node metrics {}: {}'.format(
            cluster.id, get_short_error_message(e)))
    try:
        for pod_metrics in PodMetrics.objects(cluster.client,
                                              namespace=pykube.all):
            key = (pod_metrics.namespace, pod_metrics.name)
            pod = pods_by_namespace_name.get(key)
            if pod:
                for container in pod['containers']:
                    for container_metrics in pod_metrics.obj.get(
                            'containers', []):
                        if container['name'] == container_metrics['name']:
                            container['resources'][
                                'usage'] = container_metrics['usage']
    except Exception as e:
        logger.warning('Failed to query pod metrics for cluster {}: {}'.format(
            cluster.id, get_short_error_message(e)))
    return {
        'id': cluster_id,
        'api_server_url': api_server_url,
        'nodes': nodes,
        'unassigned_pods': unassigned_pods
    }
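# Hypothetical sketch of the parse_time helper used above (an assumption):
# converts a Kubernetes RFC 3339 timestamp into epoch seconds.
import datetime


def parse_time(timestamp: str) -> float:
    return datetime.datetime.strptime(
        timestamp, '%Y-%m-%dT%H:%M:%SZ'
    ).replace(tzinfo=datetime.timezone.utc).timestamp()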
def query_cluster(cluster, executor, system_namespaces,
                  additional_cost_per_cluster, no_ingress_status, node_label):
    logger.info(f"Querying cluster {cluster.id} ({cluster.api_server_url})..")
    pods = {}
    nodes = {}
    namespaces = {}

    for namespace in Namespace.objects(cluster.client):
        email = namespace.annotations.get('email')
        namespaces[namespace.name] = {
            "status": namespace.obj['status']['phase'],
            "email": email,
        }

    cluster_capacity = collections.defaultdict(float)
    cluster_allocatable = collections.defaultdict(float)
    cluster_requests = collections.defaultdict(float)
    user_requests = collections.defaultdict(float)
    node_count = collections.defaultdict(int)
    cluster_cost = additional_cost_per_cluster

    for _node in Node.objects(cluster.client):
        node = _node.obj
        nodes[_node.name] = node
        node["capacity"] = {}
        node["allocatable"] = {}
        node["requests"] = new_resources()
        node["usage"] = new_resources()

        for k, v in node["status"].get("capacity", {}).items():
            parsed = parse_resource(v)
            node["capacity"][k] = parsed
            cluster_capacity[k] += parsed
        for k, v in node["status"].get("allocatable", {}).items():
            parsed = parse_resource(v)
            node["allocatable"][k] = parsed
            cluster_allocatable[k] += parsed

        role = _node.labels.get(NODE_LABEL_ROLE) or "worker"
        node_count[role] += 1
        region = _node.labels.get(NODE_LABEL_REGION, "unknown")
        instance_type = _node.labels.get(NODE_LABEL_INSTANCE_TYPE, "unknown")
        is_spot = _node.labels.get(NODE_LABEL_SPOT) == "true"
        node["spot"] = is_spot
        node["kubelet_version"] = (
            node["status"].get("nodeInfo", {}).get("kubeletVersion", ""))
        node["role"] = role
        node["instance_type"] = instance_type
        node["cost"] = pricing.get_node_cost(region, instance_type, is_spot)
        cluster_cost += node["cost"]

    get_node_usage(cluster, nodes)

    cluster_usage = collections.defaultdict(float)
    for node in nodes.values():
        for k, v in node['usage'].items():
            cluster_usage[k] += v

    cost_per_cpu = cluster_cost / cluster_allocatable["cpu"]
    cost_per_memory = cluster_cost / cluster_allocatable["memory"]

    for pod in Pod.objects(cluster.client, namespace=pykube.all):
        if pod.obj["status"].get("phase") != "Running":
            # ignore unschedulable/completed pods
            continue
        application = get_application_from_labels(pod.labels)
        component = get_component_from_labels(pod.labels)
        requests = collections.defaultdict(float)
        ns = pod.namespace
        container_images = []
        for container in pod.obj["spec"]["containers"]:
            # note that the "image" field is optional according to Kubernetes docs
            image = container.get("image")
            if image:
                container_images.append(image)
            for k, v in container["resources"].get("requests", {}).items():
                pv = parse_resource(v)
                requests[k] += pv
                cluster_requests[k] += pv
                if ns not in system_namespaces:
                    user_requests[k] += pv
        if "nodeName" in pod.obj["spec"] and pod.obj["spec"]["nodeName"] in nodes:
            for k in ("cpu", "memory"):
                nodes[pod.obj["spec"]["nodeName"]]["requests"][k] += requests.get(k, 0)
        cost = max(requests["cpu"] * cost_per_cpu,
                   requests["memory"] * cost_per_memory)
        pods[(ns, pod.name)] = {
            "requests": requests,
            "application": application,
            "component": component,
            "container_images": container_images,
            "cost": cost,
            "usage": new_resources(),
        }

    hourly_cost = cluster_cost / HOURS_PER_MONTH

    cluster_summary = {
        "cluster": cluster,
        "nodes": nodes,
        "pods": pods,
        "namespaces": namespaces,
        "user_pods": len([p for ns, p in pods if ns not in system_namespaces]),
        "master_nodes": node_count["master"],
        "worker_nodes": node_count[node_label],
        "kubelet_versions": set([
            n["kubelet_version"] for n in nodes.values()
            if n["role"] == node_label
        ]),
        "worker_instance_types": set([
            n["instance_type"] for n in nodes.values()
            if n["role"] == node_label
        ]),
        "worker_instance_is_spot": any(
            [n["spot"] for n in nodes.values() if n["role"] == node_label]),
        "capacity": cluster_capacity,
        "allocatable": cluster_allocatable,
        "requests": cluster_requests,
        "user_requests": user_requests,
        "usage": cluster_usage,
        "cost": cluster_cost,
        "cost_per_user_request_hour": {
            "cpu": 0.5 * hourly_cost / max(user_requests["cpu"],
                                           MIN_CPU_USER_REQUESTS),
            "memory": 0.5 * hourly_cost / max(user_requests["memory"] / ONE_GIBI,
                                              MIN_MEMORY_USER_REQUESTS),
        },
        "ingresses": [],
    }

    get_pod_usage(cluster, pods)

    cluster_slack_cost = 0
    for pod in pods.values():
        usage_cost = max(
            pod["usage"]["cpu"] * cost_per_cpu,
            pod["usage"]["memory"] * cost_per_memory,
        )
        pod["slack_cost"] = pod["cost"] - usage_cost
        cluster_slack_cost += pod["slack_cost"]

    cluster_summary["slack_cost"] = min(cluster_cost, cluster_slack_cost)

    with FuturesSession(max_workers=10, session=session) as futures_session:
        futures_by_host = {}  # hostname -> future
        futures = collections.defaultdict(list)  # future -> [ingress]

        for _ingress in Ingress.objects(cluster.client, namespace=pykube.all):
            application = get_application_from_labels(_ingress.labels)
            for rule in _ingress.obj["spec"].get("rules", []):
                host = rule.get('host', '')
                if not application:
                    # find the application by getting labels from pods
                    backend_application = find_backend_application(
                        cluster.client, _ingress, rule)
                else:
                    backend_application = None
                ingress = [
                    _ingress.namespace, _ingress.name,
                    application or backend_application, host, 0
                ]
                if host and not no_ingress_status:
                    try:
                        future = futures_by_host[host]
                    except KeyError:
                        future = futures_session.get(f"https://{host}/",
                                                     timeout=5)
                        futures_by_host[host] = future
                    futures[future].append(ingress)
                cluster_summary["ingresses"].append(ingress)

        if not no_ingress_status:
            logger.info(
                f'Waiting for ingress status for {cluster.id} ({cluster.api_server_url})..'
            )
            for future in concurrent.futures.as_completed(futures):
                ingresses = futures[future]
                try:
                    response = future.result()
                    status = response.status_code
                except Exception:
                    # any connection/timeout error maps to status 999
                    status = 999
                for ingress in ingresses:
                    ingress[4] = status

    return cluster_summary
def query_cluster(
    cluster,
    executor,
    system_namespaces,
    additional_cost_per_cluster,
    alpha_ema,
    prev_cluster_summaries,
    no_ingress_status,
    node_labels,
):
    logger.info(f"Querying cluster {cluster.id} ({cluster.api_server_url})..")
    pods = {}
    nodes = {}
    namespaces = {}

    for namespace in Namespace.objects(cluster.client):
        email = namespace.annotations.get("email")
        namespaces[namespace.name] = {
            "status": namespace.obj["status"]["phase"],
            "email": email,
        }

    cluster_capacity = collections.defaultdict(float)
    cluster_allocatable = collections.defaultdict(float)
    cluster_requests = collections.defaultdict(float)
    user_requests = collections.defaultdict(float)
    cluster_cost = additional_cost_per_cluster

    for _node in Node.objects(cluster.client):
        node = map_node(_node)
        nodes[_node.name] = node

        for k, v in node["capacity"].items():
            cluster_capacity[k] += v
        for k, v in node["allocatable"].items():
            cluster_allocatable[k] += v

        cluster_cost += node["cost"]

    metrics.get_node_usage(cluster, nodes,
                           prev_cluster_summaries.get("nodes", {}), alpha_ema)

    cluster_usage = collections.defaultdict(float)
    for node in nodes.values():
        for k, v in node["usage"].items():
            cluster_usage[k] += v

    try:
        vpas_by_namespace_label = get_vpas_by_match_labels(cluster.client)
    except Exception as e:
        logger.warning(f"Failed to query VPAs in cluster {cluster.id}: {e}")
        vpas_by_namespace_label = collections.defaultdict(list)

    cost_per_cpu = cluster_cost / cluster_allocatable["cpu"]
    cost_per_memory = cluster_cost / cluster_allocatable["memory"]

    for pod in Pod.objects(cluster.client, namespace=pykube.all):
        # ignore unschedulable/completed pods
        if not pod_active(pod):
            continue
        pod_ = map_pod(pod, cost_per_cpu, cost_per_memory)
        for k, v in pod_["requests"].items():
            cluster_requests[k] += v
            if pod.namespace not in system_namespaces:
                user_requests[k] += v
        node_name = pod.obj["spec"].get("nodeName")
        if node_name and node_name in nodes:
            for k in ("cpu", "memory"):
                nodes[node_name]["requests"][k] += pod_["requests"].get(k, 0)
        found_vpa = False
        for k, v in pod.labels.items():
            vpas = vpas_by_namespace_label[(pod.namespace, k, v)]
            for vpa in vpas:
                if vpa.matches_pod(pod):
                    recommendation = new_resources()
                    container_names = set()
                    for container in pod.obj["spec"]["containers"]:
                        container_names.add(container["name"])
                    for container in vpa.container_recommendations:
                        # VPA might contain recommendations for containers
                        # which are no longer there!
                        if container["containerName"] in container_names:
                            for k in ("cpu", "memory"):
                                recommendation[k] += parse_resource(
                                    container["target"][k])
                    pod_["recommendation"] = recommendation
                    found_vpa = True
                    break
            if found_vpa:
                break
        pods[(pod.namespace, pod.name)] = pod_

    hourly_cost = cluster_cost / HOURS_PER_MONTH

    cluster_summary = {
        "cluster": cluster,
        "nodes": nodes,
        "pods": pods,
        "namespaces": namespaces,
        "user_pods": len([p for ns, p in pods if ns not in system_namespaces]),
        "master_nodes": len([n for n in nodes.values() if n["role"] == "master"]),
        "worker_nodes": len([n for n in nodes.values() if n["role"] in node_labels]),
        "kubelet_versions": set([
            n["kubelet_version"] for n in nodes.values()
            if n["role"] in node_labels
        ]),
        "worker_instance_types": set([
            n["instance_type"] for n in nodes.values()
            if n["role"] in node_labels
        ]),
        "worker_instance_is_spot": any(
            [n["spot"] for n in nodes.values() if n["role"] in node_labels]),
        "capacity": cluster_capacity,
        "allocatable": cluster_allocatable,
        "requests": cluster_requests,
        "user_requests": user_requests,
        "usage": cluster_usage,
        "cost": cluster_cost,
        "cost_per_user_request_hour": {
            "cpu": 0.5 * hourly_cost / max(user_requests["cpu"],
                                           MIN_CPU_USER_REQUESTS),
            "memory": 0.5 * hourly_cost / max(user_requests["memory"] / ONE_GIBI,
                                              MIN_MEMORY_USER_REQUESTS),
        },
        "ingresses": [],
    }

    metrics.get_pod_usage(cluster, pods,
                          prev_cluster_summaries.get("pods", {}), alpha_ema)

    cluster_slack_cost = 0
    for pod in pods.values():
        usage_cost = max(
            pod["usage"]["cpu"] * cost_per_cpu,
            pod["usage"]["memory"] * cost_per_memory,
        )
        pod["slack_cost"] = pod["cost"] - usage_cost
        cluster_slack_cost += pod["slack_cost"]

    cluster_summary["slack_cost"] = min(cluster_cost, cluster_slack_cost)

    with FuturesSession(max_workers=10, session=session) as futures_session:
        futures_by_host = {}  # hostname -> future
        futures = collections.defaultdict(list)  # future -> [ingress]

        for _ingress in Ingress.objects(cluster.client, namespace=pykube.all):
            application = get_application_from_labels(_ingress.labels)
            for rule in _ingress.obj["spec"].get("rules", []):
                host = rule.get("host", "")
                if not application:
                    # find the application by getting labels from pods
                    backend_application = find_backend_application(
                        cluster.client, _ingress, rule)
                else:
                    backend_application = None
                ingress = [
                    _ingress.namespace,
                    _ingress.name,
                    application or backend_application,
                    host,
                    0,
                ]
                if host and not no_ingress_status:
                    try:
                        future = futures_by_host[host]
                    except KeyError:
                        future = futures_session.get(f"https://{host}/",
                                                     timeout=5)
                        futures_by_host[host] = future
                    futures[future].append(ingress)
                cluster_summary["ingresses"].append(ingress)

        if not no_ingress_status:
            logger.info(
                f"Waiting for ingress status for {cluster.id} ({cluster.api_server_url}).."
            )
            for future in concurrent.futures.as_completed(futures):
                ingresses = futures[future]
                try:
                    response = future.result()
                    status = response.status_code
                except Exception:
                    status = 999
                for ingress in ingresses:
                    ingress[4] = status

    return cluster_summary
def query_kubernetes_cluster(cluster):
    cluster_id = cluster.id
    api_server_url = cluster.api_server_url
    nodes = {}
    pods_by_namespace_name = {}
    unassigned_pods = {}
    for node in Node.objects(cluster.client):
        obj = map_node(node.obj)
        nodes[obj["name"]] = obj
    now = time.time()
    for pod in Pod.objects(cluster.client, namespace=pykube.all):
        obj = map_pod(pod.obj)
        if "deletionTimestamp" in pod.metadata:
            obj["deleted"] = parse_time(pod.metadata["deletionTimestamp"])
        for cont in pod.obj["spec"]["containers"]:
            obj["containers"].append(map_container(cont, pod.obj))
        if obj["phase"] in ("Succeeded", "Failed"):
            last_termination_time = 0
            for container in obj["containers"]:
                termination_time = (container.get("state", {}).get(
                    "terminated", {}).get("finishedAt"))
                if termination_time:
                    termination_time = parse_time(termination_time)
                    if termination_time > last_termination_time:
                        last_termination_time = termination_time
            if (last_termination_time and
                    last_termination_time < now - 3600) or (
                        obj.get("reason") == "Evicted"):
                # the job/pod finished more than an hour ago or it was
                # evicted by cgroup limits => filter out
                continue
        pods_by_namespace_name[(pod.namespace, pod.name)] = obj
        pod_key = f"{pod.namespace}/{pod.name}"
        node_name = pod.obj["spec"].get("nodeName")
        if node_name in nodes:
            nodes[node_name]["pods"][pod_key] = obj
        else:
            unassigned_pods[pod_key] = obj

    try:
        for node_metrics in NodeMetrics.objects(cluster.client):
            key = node_metrics.name
            nodes[key]["usage"] = node_metrics.obj.get("usage", {})
    except Exception as e:
        logger.warning("Failed to query node metrics {}: {}".format(
            cluster.id, get_short_error_message(e)))
    try:
        for pod_metrics in PodMetrics.objects(cluster.client,
                                              namespace=pykube.all):
            key = (pod_metrics.namespace, pod_metrics.name)
            pod = pods_by_namespace_name.get(key)
            if pod:
                for container in pod["containers"]:
                    for container_metrics in pod_metrics.obj.get(
                            "containers", []):
                        if container["name"] == container_metrics["name"]:
                            container["resources"][
                                "usage"] = container_metrics["usage"]
    except Exception as e:
        logger.warning("Failed to query pod metrics for cluster {}: {}".format(
            cluster.id, get_short_error_message(e)))
    return {
        "id": cluster_id,
        "api_server_url": api_server_url,
        "nodes": nodes,
        "unassigned_pods": unassigned_pods,
    }