Example #1
def select_pod_form_set(api, list_set, namespace):
    pre_pod = Pod.objects(api).filter(namespace=namespace)
    list_pods = []
    for p in pre_pod.response['items']:
        pod_obj = Pod(api, p)
        pod_name = pod_obj.name

        # The "created-by" annotation carries a JSON document naming the
        # set (e.g. ReplicaSet) that created this pod.
        try:
            set_name = loads(p["metadata"]["annotations"]
                             ["kubernetes.io/created-by"])["reference"]["name"]
        except (KeyError, ValueError):
            # Pod was not created by a set; skip it instead of reusing a
            # stale set_name from a previous iteration.
            continue

        num = 0
        dic_pod = {}
        for e in list_set:
            if e["name"] == set_name:
                dic_pod["name"] = pod_name
                dic_pod["set_name"] = set_name
                dic_pod["podIP"] = p.get("status", {}).get("podIP", "0.0.0.0")
                list_pods.append(dic_pod)
                list_set[num]["list_pods"] = list_pods
            num += 1
    return list_set
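A minimal sketch of how the helper above might be called; the client construction and the shape of list_set are illustrative assumptions, not part of the original snippet:

import pykube
from json import loads  # used by select_pod_form_set above
from pykube import Pod

api = pykube.HTTPClient(pykube.KubeConfig.from_file())  # assumes a local kubeconfig
replica_sets = [{"name": "frontend"}, {"name": "backend"}]  # hypothetical set list
enriched = select_pod_form_set(api, replica_sets, namespace="default")
for s in enriched:
    print(s["name"], [p["podIP"] for p in s.get("list_pods", [])])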
Example #2
    def run_item(self, item_name) -> Pod:
        """
        run_item

        Execute an item job Pod with the spec details from the appropriate
        OaatType object.
        """
        # TODO: check oaatType
        spec = self.oaattype.podspec()
        contspec = spec['container']
        del spec['container']
        contspec.setdefault('env', []).append({
            'name': 'OAAT_ITEM',
            'value': item_name
        })
        for idx in range(len(contspec.get('command', []))):
            contspec['command'][idx] = (contspec['command'][idx].replace(
                '%%oaat_item%%', item_name))
        for idx in range(len(contspec.get('args', []))):
            contspec['args'][idx] = (contspec['args'][idx].replace(
                '%%oaat_item%%', item_name))
        for env in contspec['env']:
            env['value'] = (env.get('value',
                                    '').replace('%%oaat_item%%', item_name))

        # TODO: currently only supports a single container. Do we want
        # multi-container?
        doc = {
            'apiVersion': 'v1',
            'kind': 'Pod',
            'metadata': {
                'generateName': self.name + '-' + item_name + '-',
                'labels': {
                    'parent-name': self.name,
                    'oaat-name': item_name,
                    'app': 'oaat-operator'
                }
            },
            'spec': {
                'containers': [contspec],
                **spec, 'restartPolicy': 'Never'
            },
        }

        kopf.adopt(doc)
        pod = Pod(self.api, doc)

        try:
            pod.create()
        except pykube.exceptions.KubernetesError as exc:
            self.items.mark_failed(item_name)
            raise ProcessingComplete(
                error=f'could not create pod {doc}: {exc}',
                message=f'error creating pod for {item_name}')
        return pod
Example #3
 def create_worker(self, player_id):
     pod = Pod(
         self.api,
         {
          'kind': 'Pod',
          'apiVersion': 'v1',
          'metadata': {
             'generateName': "aimmo-%s-worker-%s-" % (self.game_name, player_id),
             'labels': {
                 'app': 'aimmo-game-worker',
                 'game': self.game_name,
                 'player': str(player_id),
                 },
             },
          'spec': {
             'containers': [
                 {
                     'env': [
                         {
                             'name': 'DATA_URL',
                             'value': "%s/player/%d" % (self.game_url, player_id),
                         },
                     ],
                     'name': 'aimmo-game-worker',
                     'image': 'ocadotechnology/aimmo-game-worker:%s' % os.environ.get('IMAGE_SUFFIX', 'latest'),
                     'ports': [
                         {
                             'containerPort': 5000,
                             'protocol': 'TCP'
                         }
                     ],
                     'resources': {
                         'limits': {
                             'cpu': '10m',
                             'memory': '64Mi',
                         },
                     },
                 },
             ],
          },
         }
     )
     pod.create()
     iterations = 0
     while pod.obj['status']['phase'] == 'Pending':
         if iterations > 30:
             raise EnvironmentError('Could not start worker %s, details %s' % (player_id, pod.obj))
         LOGGER.debug('Waiting for worker %s', player_id)
         time.sleep(5)
         pod.reload()
         iterations += 1
     worker_url = "http://%s:5000" % pod.obj['status']['podIP']
     LOGGER.info("Worker started for %s, listening at %s", player_id, worker_url)
     return worker_url
Example #4
 def create_worker(self, player_id):
     pod = Pod(
         self.api,
         {
          'kind': 'Pod',
          'apiVersion': 'v1',
          'metadata': {
             'generateName': "aimmo-%s-worker-%s-" % (self.game_name, player_id),
             'labels': {
                 'app': 'aimmo-game-worker',
                 'game': self.game_name,
                 'player': str(player_id),
                 },
             },
          'spec': {
             'containers': [
                 {
                     'env': [
                         {
                             'name': 'DATA_URL',
                             'value': "%s/player/%d" % (self.game_url, player_id),
                         },
                     ],
                     'name': 'aimmo-game-worker',
                     'image': 'ocadotechnology/aimmo-game-worker:latest',
                     'ports': [
                         {
                             'containerPort': 5000,
                             'protocol': 'TCP'
                         }
                     ],
                     'resources': {
                         'limits': {
                             'cpu': '10m',
                             'memory': '64Mi',
                         },
                     },
                 },
             ],
          },
         }
     )
     pod.create()
     time.sleep(20)
     pod.reload()
     worker_url = "http://%s:5000" % pod.obj['status']['podIP']
     LOGGER.info("Worker started for %s, listening at %s", player_id, worker_url)
     return worker_url
Example #5
 def remove_worker(self, player_id):
     for pod in Pod.objects(self.api).filter(selector={
         'app': 'aimmo-game-worker',
         'game': self.game_name,
         'player': str(player_id),
     }):
         pod.delete()
Example #6
    def start(self, object_type, object_file_path, force=False):

        if not self.api:
            logging.info('API Client does not exist')
            return

        with open(object_file_path) as json_data:
            json_file = json.load(json_data)

        if object_type is KubernetesObjects.POD:

            pod = Pod(self.api, json_file)
            self._recreate_object(pod, force)
            self._add_object_to_kube_objects_dict('pods', pod)

        elif object_type is KubernetesObjects.SERVICE:

            service = Service(self.api, json_file)
            self._recreate_object(service, force)
            self._add_object_to_kube_objects_dict('services', service)

        elif object_type is KubernetesObjects.REPLICATION_CONTROLLER:

            rc = ReplicationController(self.api, json_file)
            self._recreate_object(rc, force)
            self._add_object_to_kube_objects_dict('rcs', rc)
Example #7
def find_backend_application(client, ingress, rule):
    '''
    The Ingress object might not have an "application" label, so let's try to find the application by looking at the backend service and its pods.
    '''
    paths = rule.get('http', {}).get('paths', [])
    selectors = []
    for path in paths:
        service_name = path.get('backend', {}).get('serviceName')
        if service_name:
            try:
                service = Service.objects(
                    client, namespace=ingress.namespace).get(name=service_name)
            except ObjectDoesNotExist:
                logger.debug(
                    f'Referenced service does not exist: {ingress.namespace}/{service_name}'
                )
            else:
                selector = service.obj['spec'].get('selector', {})
                selectors.append(selector)
                application = get_application_from_labels(selector)
                if application:
                    return application
    # we still haven't found the application, let's look up pods by label selectors
    for selector in selectors:
        application_candidates = set()
        for pod in Pod.objects(client).filter(namespace=ingress.namespace,
                                              selector=selector):
            application = get_application_from_labels(pod.labels)
            if application:
                application_candidates.add(application)

        if len(application_candidates) == 1:
            return application_candidates.pop()
    return ''
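The get_application_from_labels helper used above is not part of this listing; a minimal sketch of what it could look like, assuming the application name is stored in an "application"-style label (the exact label keys are an assumption):

def get_application_from_labels(labels) -> str:
    # Hypothetical helper: return the first "application"-style label value found.
    labels = labels or {}
    return labels.get("application") or labels.get("app.kubernetes.io/name") or ""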
Example #8
 def remove_worker(self, player_id):
     for pod in Pod.objects(self.api).filter(
             selector={
                 'app': 'aimmo-game-worker',
                 'game': self.game_name,
                 'player': str(player_id),
             }):
         pod.delete()
Example #9
 def remove_worker(self, player_id):
     for pod in Pod.objects(self.api).filter(selector={
         'app': 'aimmo-game-worker',
         'game': self.game_name,
         'player': str(player_id),
     }):
         LOGGER.debug('Removing pod %s', pod.obj['spec'])
         pod.delete()
Example #10
 def remove_worker(self, player_id):
     for pod in Pod.objects(self.api).filter(selector={
         'app': 'aimmo-game-worker',
         'game': self.game_id,
         'player': str(player_id),
     }):
         LOGGER.debug('Removing pod %s', pod.obj['spec'])
         pod.delete()
Example #11
    def validate_running_pod(self) -> None:
        """
        validate_running_pod

        Check whether the Pod we previously started is still running. If not,
        assume the job was killed without being processed by the
        operator (or was never started) and clean up. Mark as failed.

        If Pod is still running, update the status details.

        Returns:
        - None if no pod is expected

        Raises:
        - ProcessingComplete if a pod is expected but is not running
        - ProcessingComplete if a pod is expected and is still running
        """
        # TODO: what if a pod is running, but the operator doesn't expect one?
        curpod = self.get_status('pod')
        curitem = self.get_status('currently_running')
        if curpod:
            try:
                pod = Pod.objects(
                    self.api, namespace=self.namespace).get_by_name(curpod).obj
            except pykube.exceptions.ObjectDoesNotExist:
                self.info(f'pod {curpod} missing/deleted, cleaning up')
                self.set_status('currently_running')
                self.set_status('pod')
                self.set_status('state', 'missing')
                self.items.mark_failed(curitem)
                self.items.set_item_status(curitem, 'pod_detail')
                raise ProcessingComplete(
                    info='Cleaned up missing/deleted item')

            podphase = pod.get('status', {}).get('phase', 'unknown')
            self.info(f'validated that pod {curpod} is '
                      f'still running (phase={podphase})')

            recorded_phase = self.items.status(curitem, 'podphase', 'unknown')

            # valid phases are Pending, Running, Succeeded, Failed, Unknown
            # 'started' is the phase the pods start with when created by
            # operator.
            if recorded_phase in ('started', 'Pending', 'Running', 'Failed'):
                self.info(f'item {curitem} status for '
                          f'{curpod}: {recorded_phase}')
                raise ProcessingComplete(
                    message=f'item {curitem} {recorded_phase.lower()}')

            if recorded_phase == 'Succeeded':
                self.info(f'item {curitem} podphase={recorded_phase} but '
                          f'not yet acknowledged: {curpod}')
                raise ProcessingComplete(message=f'item {curitem} succeeded, '
                                         'awaiting acknowledgement')

            raise ProcessingComplete(
                error=f'item {curitem} unexpected state: '
                f'recorded_phase={recorded_phase}, '
                f'status={str(self.status)}',
                message=f'item {curitem} unexpected state')
Example #12
def get_application_label_from_pods(client: pykube.HTTPClient, namespace,
                                    selector):
    application_candidates = set()
    for pod in Pod.objects(client).filter(namespace=namespace,
                                          selector=selector):
        application = get_application_from_labels(pod.labels)
        if application:
            application_candidates.add(application)

    if len(application_candidates) == 1:
        return application_candidates.pop()
    return ""
Example #13
 def create_worker(self, player_id):
     pod = Pod(
         self.api, {
             'kind': 'Pod',
             'apiVersion': 'v1',
             'metadata': {
                 'generateName': "aimmo-%s-worker-%s-" % (self.game_id, player_id),
                 'labels': {
                     'app': 'aimmo-game-worker',
                     'game': self.game_id,
                     'player': str(player_id),
                 },
             },
             'spec': {
                 'containers': [
                     {
                         'env': [
                             {
                                 'name': 'DATA_URL',
                                 'value': "%s/player/%d" % (self.game_url, player_id),
                             },
                         ],
                         'name': 'aimmo-game-worker',
                         'image': 'ocadotechnology/aimmo-game-worker:%s' %
                                  os.environ.get('IMAGE_SUFFIX', 'latest'),
                         'ports': [{
                             'containerPort': 5000,
                             'protocol': 'TCP'
                         }],
                         'resources': {
                             'limits': {
                                 'cpu': '10m',
                                 'memory': '64Mi',
                             },
                         },
                     },
                 ],
             },
         })
     pod.create()
     iterations = 0
     while pod.obj['status']['phase'] == 'Pending':
         if iterations > 30:
             raise EnvironmentError(
                 'Could not start worker %s, details %s' %
                 (player_id, pod.obj))
         LOGGER.debug('Waiting for worker %s', player_id)
         time.sleep(5)
         pod.reload()
         iterations += 1
     worker_url = "http://%s:5000" % pod.obj['status']['podIP']
     LOGGER.info("Worker started for %s, listening at %s", player_id,
                 worker_url)
     return worker_url
Example #14
    def validate_expected_pod_is_running(self) -> None:
        """
        validate_expected_pod_is_running

        Validate the pod which we expect to be running (based on the
        `oaatgroup` status fields `pod` and `currently_running`).

        Check whether the Pod we previously started is still running. If not,
        assume the job was killed without being processed by the
        operator (or was never started) and clean up. Mark as failed.

        Raises:
        - ProcessingComplete with one of:
            - Cleaned up missing/deleted item
            - Pod exists and is in state: <state>
        """
        curpod = self.get_status('pod')
        curitem = self.get_status('currently_running')
        try:
            pod = Pod.objects(self.api,
                              namespace=self.namespace).get_by_name(curpod).obj
        except pykube.exceptions.ObjectDoesNotExist:
            self.info(f'pod {curpod} missing/deleted, cleaning up')
            self.set_status('currently_running')
            self.set_status('pod')
            self.set_status('state', 'missing')
            self.items.mark_failed(curitem)
            self.items.set_item_status(curitem, 'pod_detail')
            raise ProcessingComplete(
                message=f'item {curitem} failed during validation',
                info='Cleaned up missing/deleted item')

        podphase = pod.get('status', {}).get('phase', 'unknown')
        self.info(f'validated that pod {curpod} exists ' f'(phase={podphase})')
        recorded_phase = self.items.status(curitem, 'podphase', 'unknown')

        # if there is a mismatch in phase, then the pod phase handlers
        # have not yet picked it up and updated the oaatgroup phase.
        # Note it here, but take no further action
        if podphase != recorded_phase:
            self.info(f'mismatch in phase for pod {curpod}: '
                      f'pod={podphase}, oaatgroup={recorded_phase}')

        # valid phases are Pending, Running, Succeeded, Failed, Unknown
        # 'started' is the phase the pods start with when created by
        # operator.

        raise ProcessingComplete(
            message=f'Pod {curpod} exists and is in state {podphase}')
Example #15
    def validate_no_rogue_pods_are_running(self) -> None:
        found_rogue = 0
        for pod in Pod.objects(self.api, namespace=self.namespace).iterator():
            if pod.name == self.get_status('pod'):
                continue
            if pod.labels.get('parent-name', '') == self.name:
                if pod.labels.get('app', '') == 'oaat-operator':
                    podphase = (pod.obj['status'].get('phase', 'unknown'))
                    if podphase in ['Running', 'Pending']:
                        self.warning(
                            f'rogue pod {pod.name} found (phase={podphase})')
                        found_rogue += 1

        if found_rogue > 0:
            raise ProcessingComplete(
                message='rogue pods running',
                error=f'found {found_rogue} rogue pods running')
Example #16
 def create_worker(self, player_id):
     pod = Pod(
         self.api, {
             'kind': 'Pod',
             'apiVersion': 'v1',
             'metadata': {
                 'generateName': "aimmo-%s-worker-%s-" % (self.game_name, player_id),
                 'labels': {
                     'app': 'aimmo-game-worker',
                     'game': self.game_name,
                     'player': str(player_id),
                 },
             },
             'spec': {
                 'containers': [
                     {
                         'env': [
                             {
                                 'name': 'DATA_URL',
                                 'value': "%s/player/%d" % (self.game_url, player_id),
                             },
                         ],
                         'name': 'aimmo-game-worker',
                         'image': 'ocadotechnology/aimmo-game-worker:%s' %
                                  os.environ.get('IMAGE_SUFFIX', 'latest'),
                         'ports': [{
                             'containerPort': 5000,
                             'protocol': 'TCP'
                         }],
                         'resources': {
                             'limits': {
                                 'cpu': '10m',
                                 'memory': '64Mi',
                             },
                         },
                     },
                 ],
             },
         })
     pod.create()
     time.sleep(20)
     pod.reload()
     worker_url = "http://%s:5000" % pod.obj['status']['podIP']
     LOGGER.info("Worker started for %s, listening at %s", player_id,
                 worker_url)
     return worker_url
Example #17
def find_backend_application(client: pykube.HTTPClient, ingress: Ingress, rule):
    """
    Find the application ID for a given Ingress object.

    The Ingress object might not have an "application" label, so let's try to find the application by looking at the backend service and its pods.
    """
    paths = rule.get("http", {}).get("paths", [])
    selectors = []
    for path in paths:
        service_name = path.get("backend", {}).get("serviceName")
        if service_name:
            try:
                service = Service.objects(client, namespace=ingress.namespace).get(
                    name=service_name
                )
            except ObjectDoesNotExist:
                logger.debug(
                    f"Referenced service does not exist: {ingress.namespace}/{service_name}"
                )
            else:
                selector = service.obj["spec"].get("selector", {})
                selectors.append(selector)
                application = get_application_from_labels(selector)
                if application:
                    return application
    # we still haven't found the application, let's look up pods by label selectors
    for selector in selectors:
        application_candidates = set()
        for pod in Pod.objects(client).filter(
            namespace=ingress.namespace, selector=selector
        ):
            application = get_application_from_labels(pod.labels)
            if application:
                application_candidates.add(application)

        if len(application_candidates) == 1:
            return application_candidates.pop()
    return ""
Example #18
 def _get_pods(self):
     return Pod.objects(self.api).filter(selector=self.labels)
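All of the snippets above need a configured pykube.HTTPClient; a minimal, self-contained sketch of how such a client is typically created and used with the same selector-based filtering (the kubeconfig handling and label values are assumptions):

import pykube
from pykube import Pod

# Load credentials from a local kubeconfig; inside a cluster,
# pykube.KubeConfig.from_service_account() would be used instead.
api = pykube.HTTPClient(pykube.KubeConfig.from_file())

for pod in Pod.objects(api).filter(namespace="default",
                                   selector={"app": "aimmo-game-worker"}):
    print(pod.name, pod.obj["status"].get("phase"))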
Example #19
def query_kubernetes_cluster(cluster):
    cluster_id = cluster.id
    api_server_url = cluster.api_server_url
    nodes = {}
    pods_by_namespace_name = {}
    unassigned_pods = {}
    for node in Node.objects(cluster.client):
        obj = map_node(node.obj)
        nodes[obj['name']] = obj
    now = time.time()
    for pod in Pod.objects(cluster.client, namespace=pykube.all):
        obj = map_pod(pod.obj)
        if 'deletionTimestamp' in pod.metadata:
            obj['deleted'] = parse_time(pod.metadata['deletionTimestamp'])
        for cont in pod.obj['spec']['containers']:
            obj['containers'].append(map_container(cont, pod.obj))
        if obj['phase'] in ('Succeeded', 'Failed'):
            last_termination_time = 0
            for container in obj['containers']:
                termination_time = container.get('state',
                                                 {}).get('terminated',
                                                         {}).get('finishedAt')
                if termination_time:
                    termination_time = parse_time(termination_time)
                    if termination_time > last_termination_time:
                        last_termination_time = termination_time
            if (last_termination_time and
                    last_termination_time < now - 3600) or (obj.get('reason')
                                                            == 'Evicted'):
                # the job/pod finished more than an hour ago, or it was
                # evicted (e.g. due to cgroup limits) => filter out
                continue
        pods_by_namespace_name[(pod.namespace, pod.name)] = obj
        pod_key = f'{pod.namespace}/{pod.name}'
        node_name = pod.obj['spec'].get('nodeName')
        if node_name in nodes:
            nodes[node_name]['pods'][pod_key] = obj
        else:
            unassigned_pods[pod_key] = obj

    try:
        for node_metrics in NodeMetrics.objects(cluster.client):
            key = node_metrics.name
            nodes[key]['usage'] = node_metrics.obj.get('usage', {})
    except Exception as e:
        logger.warning('Failed to query node metrics {}: {}'.format(
            cluster.id, get_short_error_message(e)))
    try:
        for pod_metrics in PodMetrics.objects(cluster.client,
                                              namespace=pykube.all):
            key = (pod_metrics.namespace, pod_metrics.name)
            pod = pods_by_namespace_name.get(key)
            if pod:
                for container in pod['containers']:
                    for container_metrics in pod_metrics.obj.get(
                            'containers', []):
                        if container['name'] == container_metrics['name']:
                            container['resources'][
                                'usage'] = container_metrics['usage']
    except Exception as e:
        logger.warning('Failed to query pod metrics for cluster {}: {}'.format(
            cluster.id, get_short_error_message(e)))
    return {
        'id': cluster_id,
        'api_server_url': api_server_url,
        'nodes': nodes,
        'unassigned_pods': unassigned_pods
    }
Example #20
def query_cluster(cluster, executor, system_namespaces,
                  additional_cost_per_cluster, no_ingress_status, node_label):
    logger.info(f"Querying cluster {cluster.id} ({cluster.api_server_url})..")
    pods = {}
    nodes = {}
    namespaces = {}

    for namespace in Namespace.objects(cluster.client):
        email = namespace.annotations.get('email')
        namespaces[namespace.name] = {
            "status": namespace.obj['status']['phase'],
            "email": email,
        }

    cluster_capacity = collections.defaultdict(float)
    cluster_allocatable = collections.defaultdict(float)
    cluster_requests = collections.defaultdict(float)
    user_requests = collections.defaultdict(float)
    node_count = collections.defaultdict(int)
    cluster_cost = additional_cost_per_cluster

    for _node in Node.objects(cluster.client):
        node = _node.obj
        nodes[_node.name] = node
        node["capacity"] = {}
        node["allocatable"] = {}
        node["requests"] = new_resources()
        node["usage"] = new_resources()
        for k, v in node["status"].get("capacity", {}).items():
            parsed = parse_resource(v)
            node["capacity"][k] = parsed
            cluster_capacity[k] += parsed
        for k, v in node["status"].get("allocatable", {}).items():
            parsed = parse_resource(v)
            node["allocatable"][k] = parsed
            cluster_allocatable[k] += parsed
        role = _node.labels.get(NODE_LABEL_ROLE) or "worker"
        node_count[role] += 1
        region = _node.labels.get(NODE_LABEL_REGION, "unknown")
        instance_type = _node.labels.get(NODE_LABEL_INSTANCE_TYPE, "unknown")
        is_spot = _node.labels.get(NODE_LABEL_SPOT) == "true"
        node["spot"] = is_spot
        node["kubelet_version"] = (node["status"].get("nodeInfo", {}).get(
            "kubeletVersion", ""))
        node["role"] = role
        node["instance_type"] = instance_type
        node["cost"] = pricing.get_node_cost(region, instance_type, is_spot)
        cluster_cost += node["cost"]

    get_node_usage(cluster, nodes)

    cluster_usage = collections.defaultdict(float)
    for node in nodes.values():
        for k, v in node['usage'].items():
            cluster_usage[k] += v

    cost_per_cpu = cluster_cost / cluster_allocatable["cpu"]
    cost_per_memory = cluster_cost / cluster_allocatable["memory"]

    for pod in Pod.objects(cluster.client, namespace=pykube.all):
        if pod.obj["status"].get("phase") != "Running":
            # ignore unschedulable/completed pods
            continue
        application = get_application_from_labels(pod.labels)
        component = get_component_from_labels(pod.labels)
        requests = collections.defaultdict(float)
        ns = pod.namespace
        container_images = []
        for container in pod.obj["spec"]["containers"]:
            # note that the "image" field is optional according to Kubernetes docs
            image = container.get("image")
            if image:
                container_images.append(image)
            for k, v in container["resources"].get("requests", {}).items():
                pv = parse_resource(v)
                requests[k] += pv
                cluster_requests[k] += pv
                if ns not in system_namespaces:
                    user_requests[k] += pv
        if "nodeName" in pod.obj["spec"] and pod.obj["spec"][
                "nodeName"] in nodes:
            for k in ("cpu", "memory"):
                nodes[pod.obj["spec"]
                      ["nodeName"]]["requests"][k] += requests.get(k, 0)
        cost = max(requests["cpu"] * cost_per_cpu,
                   requests["memory"] * cost_per_memory)
        pods[(ns, pod.name)] = {
            "requests": requests,
            "application": application,
            "component": component,
            "container_images": container_images,
            "cost": cost,
            "usage": new_resources(),
        }

    hourly_cost = cluster_cost / HOURS_PER_MONTH

    cluster_summary = {
        "cluster": cluster,
        "nodes": nodes,
        "pods": pods,
        "namespaces": namespaces,
        "user_pods": len([p for ns, p in pods if ns not in system_namespaces]),
        "master_nodes": node_count["master"],
        "worker_nodes": node_count[node_label],
        "kubelet_versions": set([
            n["kubelet_version"] for n in nodes.values()
            if n["role"] == node_label
        ]),
        "worker_instance_types": set([
            n["instance_type"] for n in nodes.values()
            if n["role"] == node_label
        ]),
        "worker_instance_is_spot": any(
            [n["spot"] for n in nodes.values() if n["role"] == node_label]),
        "capacity": cluster_capacity,
        "allocatable": cluster_allocatable,
        "requests": cluster_requests,
        "user_requests": user_requests,
        "usage": cluster_usage,
        "cost": cluster_cost,
        "cost_per_user_request_hour": {
            "cpu": 0.5 * hourly_cost / max(
                user_requests["cpu"], MIN_CPU_USER_REQUESTS),
            "memory": 0.5 * hourly_cost / max(
                user_requests["memory"] / ONE_GIBI, MIN_MEMORY_USER_REQUESTS),
        },
        "ingresses": [],
    }

    get_pod_usage(cluster, pods)

    cluster_slack_cost = 0
    for pod in pods.values():
        usage_cost = max(
            pod["usage"]["cpu"] * cost_per_cpu,
            pod["usage"]["memory"] * cost_per_memory,
        )
        pod["slack_cost"] = pod["cost"] - usage_cost
        cluster_slack_cost += pod["slack_cost"]

    cluster_summary["slack_cost"] = min(cluster_cost, cluster_slack_cost)

    with FuturesSession(max_workers=10, session=session) as futures_session:
        futures_by_host = {}  # hostname -> future
        futures = collections.defaultdict(list)  # future -> [ingress]

        for _ingress in Ingress.objects(cluster.client, namespace=pykube.all):
            application = get_application_from_labels(_ingress.labels)
            for rule in _ingress.obj["spec"].get("rules", []):
                host = rule.get('host', '')
                if not application:
                    # find the application by getting labels from pods
                    backend_application = find_backend_application(
                        cluster.client, _ingress, rule)
                else:
                    backend_application = None
                ingress = [
                    _ingress.namespace, _ingress.name, application
                    or backend_application, host, 0
                ]
                if host and not no_ingress_status:
                    try:
                        future = futures_by_host[host]
                    except KeyError:
                        future = futures_session.get(f"https://{host}/",
                                                     timeout=5)
                        futures_by_host[host] = future
                    futures[future].append(ingress)
                cluster_summary["ingresses"].append(ingress)

        if not no_ingress_status:
            logger.info(
                f'Waiting for ingress status for {cluster.id} ({cluster.api_server_url})..'
            )
            for future in concurrent.futures.as_completed(futures):
                ingresses = futures[future]
                try:
                    response = future.result()
                    status = response.status_code
                except Exception:
                    status = 999
                for ingress in ingresses:
                    ingress[4] = status

    return cluster_summary
Example #21
def query_cluster(
    cluster,
    executor,
    system_namespaces,
    additional_cost_per_cluster,
    alpha_ema,
    prev_cluster_summaries,
    no_ingress_status,
    node_labels,
):
    logger.info(f"Querying cluster {cluster.id} ({cluster.api_server_url})..")
    pods = {}
    nodes = {}
    namespaces = {}

    for namespace in Namespace.objects(cluster.client):
        email = namespace.annotations.get("email")
        namespaces[namespace.name] = {
            "status": namespace.obj["status"]["phase"],
            "email": email,
        }

    cluster_capacity = collections.defaultdict(float)
    cluster_allocatable = collections.defaultdict(float)
    cluster_requests = collections.defaultdict(float)
    user_requests = collections.defaultdict(float)
    cluster_cost = additional_cost_per_cluster

    for _node in Node.objects(cluster.client):
        node = map_node(_node)
        nodes[_node.name] = node

        for k, v in node["capacity"].items():
            cluster_capacity[k] += v
        for k, v in node["allocatable"].items():
            cluster_allocatable[k] += v
        cluster_cost += node["cost"]

    metrics.get_node_usage(cluster, nodes,
                           prev_cluster_summaries.get("nodes", {}), alpha_ema)

    cluster_usage = collections.defaultdict(float)
    for node in nodes.values():
        for k, v in node["usage"].items():
            cluster_usage[k] += v

    try:
        vpas_by_namespace_label = get_vpas_by_match_labels(cluster.client)
    except Exception as e:
        logger.warning(f"Failed to query VPAs in cluster {cluster.id}: {e}")
        vpas_by_namespace_label = collections.defaultdict(list)

    cost_per_cpu = cluster_cost / cluster_allocatable["cpu"]
    cost_per_memory = cluster_cost / cluster_allocatable["memory"]

    for pod in Pod.objects(cluster.client, namespace=pykube.all):
        # ignore unschedulable/completed pods
        if not pod_active(pod):
            continue
        pod_ = map_pod(pod, cost_per_cpu, cost_per_memory)
        for k, v in pod_["requests"].items():
            cluster_requests[k] += v
            if pod.namespace not in system_namespaces:
                user_requests[k] += v
        node_name = pod.obj["spec"].get("nodeName")
        if node_name and node_name in nodes:
            for k in ("cpu", "memory"):
                nodes[node_name]["requests"][k] += pod_["requests"].get(k, 0)
        found_vpa = False
        for k, v in pod.labels.items():
            vpas = vpas_by_namespace_label[(pod.namespace, k, v)]
            for vpa in vpas:
                if vpa.matches_pod(pod):
                    recommendation = new_resources()
                    container_names = set()
                    for container in pod.obj["spec"]["containers"]:
                        container_names.add(container["name"])
                    for container in vpa.container_recommendations:
                        # VPA might contain recommendations for containers which are no longer there!
                        if container["containerName"] in container_names:
                            for k in ("cpu", "memory"):
                                recommendation[k] += parse_resource(
                                    container["target"][k])
                    pod_["recommendation"] = recommendation
                    found_vpa = True
                    break
            if found_vpa:
                break
        pods[(pod.namespace, pod.name)] = pod_

    hourly_cost = cluster_cost / HOURS_PER_MONTH

    cluster_summary = {
        "cluster": cluster,
        "nodes": nodes,
        "pods": pods,
        "namespaces": namespaces,
        "user_pods": len([p for ns, p in pods if ns not in system_namespaces]),
        "master_nodes": len([n for n in nodes.values() if n["role"] == "master"]),
        "worker_nodes": len([n for n in nodes.values() if n["role"] in node_labels]),
        "kubelet_versions": set([
            n["kubelet_version"] for n in nodes.values()
            if n["role"] in node_labels
        ]),
        "worker_instance_types": set([
            n["instance_type"] for n in nodes.values()
            if n["role"] in node_labels
        ]),
        "worker_instance_is_spot": any(
            [n["spot"] for n in nodes.values() if n["role"] in node_labels]),
        "capacity": cluster_capacity,
        "allocatable": cluster_allocatable,
        "requests": cluster_requests,
        "user_requests": user_requests,
        "usage": cluster_usage,
        "cost": cluster_cost,
        "cost_per_user_request_hour": {
            "cpu": 0.5 * hourly_cost / max(
                user_requests["cpu"], MIN_CPU_USER_REQUESTS),
            "memory": 0.5 * hourly_cost / max(
                user_requests["memory"] / ONE_GIBI, MIN_MEMORY_USER_REQUESTS),
        },
        "ingresses": [],
    }

    metrics.get_pod_usage(cluster, pods,
                          prev_cluster_summaries.get("pods", {}), alpha_ema)

    cluster_slack_cost = 0
    for pod in pods.values():
        usage_cost = max(
            pod["usage"]["cpu"] * cost_per_cpu,
            pod["usage"]["memory"] * cost_per_memory,
        )
        pod["slack_cost"] = pod["cost"] - usage_cost
        cluster_slack_cost += pod["slack_cost"]

    cluster_summary["slack_cost"] = min(cluster_cost, cluster_slack_cost)

    with FuturesSession(max_workers=10, session=session) as futures_session:
        futures_by_host = {}  # hostname -> future
        futures = collections.defaultdict(list)  # future -> [ingress]

        for _ingress in Ingress.objects(cluster.client, namespace=pykube.all):
            application = get_application_from_labels(_ingress.labels)
            for rule in _ingress.obj["spec"].get("rules", []):
                host = rule.get("host", "")
                if not application:
                    # find the application by getting labels from pods
                    backend_application = find_backend_application(
                        cluster.client, _ingress, rule)
                else:
                    backend_application = None
                ingress = [
                    _ingress.namespace,
                    _ingress.name,
                    application or backend_application,
                    host,
                    0,
                ]
                if host and not no_ingress_status:
                    try:
                        future = futures_by_host[host]
                    except KeyError:
                        future = futures_session.get(f"https://{host}/",
                                                     timeout=5)
                        futures_by_host[host] = future
                    futures[future].append(ingress)
                cluster_summary["ingresses"].append(ingress)

        if not no_ingress_status:
            logger.info(
                f"Waiting for ingress status for {cluster.id} ({cluster.api_server_url}).."
            )
            for future in concurrent.futures.as_completed(futures):
                ingresses = futures[future]
                try:
                    response = future.result()
                    status = response.status_code
                except Exception:
                    status = 999
                for ingress in ingresses:
                    ingress[4] = status

    return cluster_summary
Example #22
def query_kubernetes_cluster(cluster):
    cluster_id = cluster.id
    api_server_url = cluster.api_server_url
    nodes = {}
    pods_by_namespace_name = {}
    unassigned_pods = {}
    for node in Node.objects(cluster.client):
        obj = map_node(node.obj)
        nodes[obj["name"]] = obj
    now = time.time()
    for pod in Pod.objects(cluster.client, namespace=pykube.all):
        obj = map_pod(pod.obj)
        if "deletionTimestamp" in pod.metadata:
            obj["deleted"] = parse_time(pod.metadata["deletionTimestamp"])
        for cont in pod.obj["spec"]["containers"]:
            obj["containers"].append(map_container(cont, pod.obj))
        if obj["phase"] in ("Succeeded", "Failed"):
            last_termination_time = 0
            for container in obj["containers"]:
                termination_time = (container.get("state", {}).get(
                    "terminated", {}).get("finishedAt"))
                if termination_time:
                    termination_time = parse_time(termination_time)
                    if termination_time > last_termination_time:
                        last_termination_time = termination_time
            if (last_termination_time and
                    last_termination_time < now - 3600) or (obj.get("reason")
                                                            == "Evicted"):
                # the job/pod finished more than an hour ago, or it was
                # evicted (e.g. due to cgroup limits) => filter out
                continue
        pods_by_namespace_name[(pod.namespace, pod.name)] = obj
        pod_key = f"{pod.namespace}/{pod.name}"
        node_name = pod.obj["spec"].get("nodeName")
        if node_name in nodes:
            nodes[node_name]["pods"][pod_key] = obj
        else:
            unassigned_pods[pod_key] = obj

    try:
        for node_metrics in NodeMetrics.objects(cluster.client):
            key = node_metrics.name
            nodes[key]["usage"] = node_metrics.obj.get("usage", {})
    except Exception as e:
        logger.warning("Failed to query node metrics {}: {}".format(
            cluster.id, get_short_error_message(e)))
    try:
        for pod_metrics in PodMetrics.objects(cluster.client,
                                              namespace=pykube.all):
            key = (pod_metrics.namespace, pod_metrics.name)
            pod = pods_by_namespace_name.get(key)
            if pod:
                for container in pod["containers"]:
                    for container_metrics in pod_metrics.obj.get(
                            "containers", []):
                        if container["name"] == container_metrics["name"]:
                            container["resources"][
                                "usage"] = container_metrics["usage"]
    except Exception as e:
        logger.warning("Failed to query pod metrics for cluster {}: {}".format(
            cluster.id, get_short_error_message(e)))
    return {
        "id": cluster_id,
        "api_server_url": api_server_url,
        "nodes": nodes,
        "unassigned_pods": unassigned_pods,
    }