Esempio n. 1
0
def map_node(_node: Node):
    """Build the internal node dictionary for a Kubernetes Node object.

    The result carries the parsed ``capacity`` and ``allocatable``
    quantities from the node status, fresh ``requests``/``usage`` records
    from ``new_resources()``, an empty ``pods`` mapping, a zero
    ``slack_cost`` and the ``spot``, ``kubelet_version``, ``role``,
    ``instance_type`` and ``cost`` fields derived from labels and status.
    """
    info: Dict[str, Any] = {
        "capacity": {},
        "allocatable": {},
        "requests": new_resources(),
        "usage": new_resources(),
        "pods": {},
        "slack_cost": 0,
    }

    node_status = _node.obj["status"]
    # Both status sections map a resource name to a quantity string.
    for section in ("capacity", "allocatable"):
        for resource, quantity in node_status.get(section, {}).items():
            info[section][resource] = parse_resource(quantity)

    labels = _node.labels
    spot = labels.get(NODE_LABEL_SPOT) == NODE_LABEL_SPOT_VALUE
    preemptible = labels.get(NODE_LABEL_PREEMPTIBLE, "false") == "true"
    machine_type = labels.get(NODE_LABEL_INSTANCE_TYPE, "unknown")
    if preemptible:
        # The suffixed name is both stored on the node and used for the
        # price lookup below.
        machine_type += "-preemptible"

    info["spot"] = spot or preemptible
    info["kubelet_version"] = node_status.get("nodeInfo", {}).get(
        "kubeletVersion", "")
    # An absent or empty role label counts as "worker".
    info["role"] = labels.get(NODE_LABEL_ROLE) or "worker"
    info["instance_type"] = machine_type

    capacity = info["capacity"]
    info["cost"] = pricing.get_node_cost(
        labels.get(NODE_LABEL_REGION, "unknown"),
        machine_type,
        spot,
        cpu=capacity.get("cpu"),
        memory=capacity.get("memory"),
    )
    return info
Esempio n. 2
0
def _fetch_metrics_items(cluster, urls):
    """Return the "items" list of the first metrics endpoint that answers.

    The URLs are tried in order. A failure on any URL but the last one is
    logged as a warning and the next URL is tried; a failure on the last
    URL is re-raised to the caller.
    """
    last_index = len(urls) - 1
    for index, url in enumerate(urls):
        try:
            response = request(cluster, url)
            response.raise_for_status()
        except Exception as e:
            if index == last_index:
                raise
            logger.warning("Failed to query metrics: %s", e)
            # Go straight to the fallback URL; there is no usable response
            # object to inspect when request() itself raised.
            continue
        return response.json()["items"]
    return []


def query_cluster(cluster, executor, system_namespaces,
                  additional_cost_per_cluster, no_ingress_status):
    """Collect nodes, pods, usage metrics, costs and ingresses of a cluster.

    Args:
        cluster: Cluster to query. ``cluster.id`` and
            ``cluster.api_server_url`` are logged and the object is passed
            to ``request()`` for every API call.
        executor: Not used by this function.
        system_namespaces: Container of namespace names; pods in these
            namespaces do not count towards ``user_requests`` and
            ``user_pods``.
        additional_cost_per_cluster: Starting value of the cluster cost, to
            which the cost of every node is added.
        no_ingress_status: When true, ingress hosts are not probed and the
            status field of every ingress entry stays ``0``.

    Returns:
        dict: Cluster summary with the keys ``cluster``, ``nodes``,
        ``pods``, ``user_pods``, ``master_nodes``, ``worker_nodes``,
        ``kubelet_versions``, ``worker_instance_types``,
        ``worker_instance_is_spot``, ``capacity``, ``allocatable``,
        ``requests``, ``user_requests``, ``usage``, ``cost``,
        ``cost_per_user_request_hour``, ``ingresses`` and ``slack_cost``.
        Each ingress entry is a list
        ``[namespace, name, application, host, status]``.

    Raises:
        Exception: Whatever ``request()`` or ``raise_for_status()`` raise
            for the node, pod or ingress listings. Failures of the metrics
            endpoints are logged and do not abort the query.
    """
    logger.info("Querying cluster %s (%s)..", cluster.id,
                cluster.api_server_url)
    pods = {}
    nodes = {}

    response = request(cluster, "/api/v1/nodes")
    response.raise_for_status()
    cluster_capacity = collections.defaultdict(float)
    cluster_allocatable = collections.defaultdict(float)
    cluster_requests = collections.defaultdict(float)
    user_requests = collections.defaultdict(float)
    cluster_usage = collections.defaultdict(float)
    node_count = collections.defaultdict(int)
    cluster_cost = additional_cost_per_cluster
    for node in response.json()["items"]:
        # The raw API object is enriched in place and stored by name.
        nodes[node["metadata"]["name"]] = node
        node["capacity"] = {}
        node["allocatable"] = {}
        node["requests"] = {"cpu": 0, "memory": 0}
        node["usage"] = {"cpu": 0, "memory": 0}
        for k, v in node["status"].get("capacity", {}).items():
            parsed = parse_resource(v)
            node["capacity"][k] = parsed
            cluster_capacity[k] += parsed
        for k, v in node["status"].get("allocatable", {}).items():
            parsed = parse_resource(v)
            node["allocatable"][k] = parsed
            cluster_allocatable[k] += parsed
        labels = node["metadata"].get("labels", {})
        # An absent or empty role label counts as "worker".
        role = labels.get("kubernetes.io/role") or "worker"
        node_count[role] += 1
        region = labels.get("failure-domain.beta.kubernetes.io/region",
                            "unknown")
        instance_type = labels.get("beta.kubernetes.io/instance-type",
                                   "unknown")
        is_spot = labels.get(NODE_LABEL_SPOT) == "true"
        node["spot"] = is_spot
        node["kubelet_version"] = node["status"].get("nodeInfo", {}).get(
            "kubeletVersion", "")
        node["role"] = role
        node["instance_type"] = instance_type
        node["cost"] = pricing.get_node_cost(region, instance_type, is_spot)
        cluster_cost += node["cost"]

    try:
        # https://github.com/kubernetes/community/blob/master/contributors/design-proposals/instrumentation/resource-metrics-api.md
        node_metrics = _fetch_metrics_items(cluster, (
            "/apis/metrics.k8s.io/v1beta1/nodes",
            "/api/v1/namespaces/kube-system/services/heapster/proxy/apis/metrics/v1alpha1/nodes",
        ))
        for item in node_metrics:
            node = nodes.get(item["metadata"]["name"])
            if node:
                usage = collections.defaultdict(float)
                for k, v in item.get("usage", {}).items():
                    parsed = parse_resource(v)
                    usage[k] += parsed
                    cluster_usage[k] += parsed
                node["usage"] = usage
    except Exception:
        # Metrics are best-effort: nodes keep their zero usage.
        logger.exception("Failed to query Heapster metrics")

    # A cluster without allocatable resources (e.g. no nodes) must not
    # abort the report with ZeroDivisionError; its pods then cost nothing.
    allocatable_cpu = cluster_allocatable["cpu"]
    allocatable_memory = cluster_allocatable["memory"]
    cost_per_cpu = cluster_cost / allocatable_cpu if allocatable_cpu else 0.0
    cost_per_memory = (cluster_cost / allocatable_memory
                       if allocatable_memory else 0.0)

    response = request(cluster, "/api/v1/pods")
    response.raise_for_status()
    for pod in response.json()["items"]:
        if pod["status"].get("phase") in ("Succeeded", "Failed"):
            # ignore completed pods
            continue
        labels = pod["metadata"].get("labels", {})
        application = labels.get("application", labels.get("app", ""))
        pod_requests = collections.defaultdict(float)
        ns = pod["metadata"]["namespace"]
        for container in pod["spec"]["containers"]:
            for k, v in container["resources"].get("requests", {}).items():
                pv = parse_resource(v)
                pod_requests[k] += pv
                cluster_requests[k] += pv
                if ns not in system_namespaces:
                    user_requests[k] += pv
        node_name = pod["spec"].get("nodeName")
        if "nodeName" in pod["spec"] and node_name in nodes:
            for k in ("cpu", "memory"):
                nodes[node_name]["requests"][k] += pod_requests.get(k, 0)
        # A pod is charged for whichever of its requests is more expensive.
        cost = max(pod_requests["cpu"] * cost_per_cpu,
                   pod_requests["memory"] * cost_per_memory)
        pods[(ns, pod["metadata"]["name"])] = {
            "requests": pod_requests,
            "application": application,
            "cost": cost,
            "usage": {
                "cpu": 0,
                "memory": 0
            },
        }

    hourly_cost = cluster_cost / HOURS_PER_MONTH
    workers = [n for n in nodes.values() if n["role"] == "worker"]

    cluster_summary = {
        "cluster": cluster,
        "nodes": nodes,
        "pods": pods,
        "user_pods": sum(1 for ns, _ in pods if ns not in system_namespaces),
        "master_nodes": node_count["master"],
        "worker_nodes": node_count["worker"],
        "kubelet_versions": {n["kubelet_version"] for n in workers},
        "worker_instance_types": {n["instance_type"] for n in workers},
        "worker_instance_is_spot": any(n["spot"] for n in workers),
        "capacity": cluster_capacity,
        "allocatable": cluster_allocatable,
        "requests": cluster_requests,
        "user_requests": user_requests,
        "usage": cluster_usage,
        "cost": cluster_cost,
        # Half of the hourly cost is spread over the requested CPUs, the
        # other half over the requested GiB of memory (divisor at least 1).
        "cost_per_user_request_hour": {
            "cpu":
            0.5 * hourly_cost / max(user_requests["cpu"], 1),
            "memory":
            0.5 * hourly_cost / max(user_requests["memory"] / ONE_GIBI, 1),
        },
        "ingresses": [],
    }

    cluster_slack_cost = 0

    try:
        # https://github.com/kubernetes/community/blob/master/contributors/design-proposals/instrumentation/resource-metrics-api.md
        pod_metrics = _fetch_metrics_items(cluster, (
            "/apis/metrics.k8s.io/v1beta1/pods",
            "/api/v1/namespaces/kube-system/services/heapster/proxy/apis/metrics/v1alpha1/pods",
        ))
        for item in pod_metrics:
            key = (item["metadata"]["namespace"], item["metadata"]["name"])
            pod = pods.get(key)
            if pod:
                usage = collections.defaultdict(float)
                for container in item["containers"]:
                    for k, v in container.get("usage", {}).items():
                        usage[k] += parse_resource(v)
                pod["usage"] = usage
                usage_cost = max(
                    usage["cpu"] * cost_per_cpu,
                    usage["memory"] * cost_per_memory,
                )
                # Slack is the part of the requested cost that is not used.
                pod["slack_cost"] = pod["cost"] - usage_cost
                cluster_slack_cost += pod["slack_cost"]
    except Exception:
        # Metrics are best-effort: remaining pods keep their zero usage.
        logger.exception("Failed to query Heapster metrics")

    cluster_summary["slack_cost"] = min(cluster_cost, cluster_slack_cost)

    response = request(cluster, "/apis/extensions/v1beta1/ingresses")
    response.raise_for_status()

    with FuturesSession(max_workers=10, session=session) as futures_session:
        futures = {}  # future -> ingress entry
        for item in response.json()["items"]:
            namespace = item["metadata"]["namespace"]
            name = item["metadata"]["name"]
            labels = item["metadata"].get("labels", {})
            application = labels.get("application", labels.get("app", ""))
            # Both "rules" and a rule's "host" are optional in the API.
            for rule in item["spec"].get("rules", []):
                host = rule.get("host", "")
                ingress = [namespace, name, application, host, 0]
                if host and not no_ingress_status:
                    future = futures_session.get("https://{}/".format(host),
                                                 timeout=5)
                    futures[future] = ingress
                cluster_summary["ingresses"].append(ingress)

        if not no_ingress_status:
            logger.info("Waiting for ingress status..")
            for future in concurrent.futures.as_completed(futures):
                ingress = futures[future]
                try:
                    status = future.result().status_code
                except Exception:
                    # 999 marks a host whose probe raised instead of
                    # returning an HTTP response.
                    status = 999
                ingress[4] = status

    return cluster_summary
Esempio n. 3
0
def query_cluster(cluster, executor, system_namespaces,
                  additional_cost_per_cluster, no_ingress_status, node_label):
    """Collect namespaces, nodes, pods, costs and ingresses of a cluster.

    Args:
        cluster: Cluster to query. ``cluster.client`` is used for all object
            listings; ``cluster.id`` and ``cluster.api_server_url`` are
            logged.
        executor: Not used by this function.
        system_namespaces: Container of namespace names; pods in these
            namespaces do not count towards ``user_requests`` and
            ``user_pods``.
        additional_cost_per_cluster: Starting value of the cluster cost, to
            which the cost of every node is added.
        no_ingress_status: When true, ingress hosts are not probed and the
            status field of every ingress entry stays ``0``.
        node_label: Role value that identifies the nodes counted and
            summarised as workers.

    Returns:
        dict: Cluster summary with the keys ``cluster``, ``nodes``,
        ``pods``, ``namespaces``, ``user_pods``, ``master_nodes``,
        ``worker_nodes``, ``kubelet_versions``, ``worker_instance_types``,
        ``worker_instance_is_spot``, ``capacity``, ``allocatable``,
        ``requests``, ``user_requests``, ``usage``, ``cost``,
        ``cost_per_user_request_hour``, ``ingresses`` and ``slack_cost``.
        Each ingress entry is a list
        ``[namespace, name, application, host, status]``.
    """
    logger.info(f"Querying cluster {cluster.id} ({cluster.api_server_url})..")
    pods = {}
    nodes = {}
    namespaces = {}

    for namespace in Namespace.objects(cluster.client):
        namespaces[namespace.name] = {
            "status": namespace.obj['status']['phase'],
            "email": namespace.annotations.get('email'),
        }

    cluster_capacity = collections.defaultdict(float)
    cluster_allocatable = collections.defaultdict(float)
    cluster_requests = collections.defaultdict(float)
    user_requests = collections.defaultdict(float)
    node_count = collections.defaultdict(int)
    cluster_cost = additional_cost_per_cluster

    for _node in Node.objects(cluster.client):
        # The raw API object is enriched in place and stored by name.
        node = _node.obj
        nodes[_node.name] = node
        node["capacity"] = {}
        node["allocatable"] = {}
        node["requests"] = new_resources()
        node["usage"] = new_resources()
        for k, v in node["status"].get("capacity", {}).items():
            parsed = parse_resource(v)
            node["capacity"][k] = parsed
            cluster_capacity[k] += parsed
        for k, v in node["status"].get("allocatable", {}).items():
            parsed = parse_resource(v)
            node["allocatable"][k] = parsed
            cluster_allocatable[k] += parsed
        # An absent or empty role label counts as "worker".
        role = _node.labels.get(NODE_LABEL_ROLE) or "worker"
        node_count[role] += 1
        region = _node.labels.get(NODE_LABEL_REGION, "unknown")
        instance_type = _node.labels.get(NODE_LABEL_INSTANCE_TYPE, "unknown")
        is_spot = _node.labels.get(NODE_LABEL_SPOT) == "true"
        node["spot"] = is_spot
        node["kubelet_version"] = node["status"].get("nodeInfo", {}).get(
            "kubeletVersion", "")
        node["role"] = role
        node["instance_type"] = instance_type
        node["cost"] = pricing.get_node_cost(region, instance_type, is_spot)
        cluster_cost += node["cost"]

    get_node_usage(cluster, nodes)

    cluster_usage = collections.defaultdict(float)
    for node in nodes.values():
        for k, v in node['usage'].items():
            cluster_usage[k] += v

    # A cluster without allocatable resources (e.g. no nodes) must not
    # abort the report with ZeroDivisionError; its pods then cost nothing.
    allocatable_cpu = cluster_allocatable["cpu"]
    allocatable_memory = cluster_allocatable["memory"]
    cost_per_cpu = cluster_cost / allocatable_cpu if allocatable_cpu else 0.0
    cost_per_memory = (cluster_cost / allocatable_memory
                       if allocatable_memory else 0.0)

    for pod in Pod.objects(cluster.client, namespace=pykube.all):
        if pod.obj["status"].get("phase") != "Running":
            # ignore unschedulable/completed pods
            continue
        application = get_application_from_labels(pod.labels)
        component = get_component_from_labels(pod.labels)
        pod_requests = collections.defaultdict(float)
        ns = pod.namespace
        container_images = []
        for container in pod.obj["spec"]["containers"]:
            # note that the "image" field is optional according to Kubernetes docs
            image = container.get("image")
            if image:
                container_images.append(image)
            for k, v in container["resources"].get("requests", {}).items():
                pv = parse_resource(v)
                pod_requests[k] += pv
                cluster_requests[k] += pv
                if ns not in system_namespaces:
                    user_requests[k] += pv
        node_name = pod.obj["spec"].get("nodeName")
        if "nodeName" in pod.obj["spec"] and node_name in nodes:
            for k in ("cpu", "memory"):
                nodes[node_name]["requests"][k] += pod_requests.get(k, 0)
        # A pod is charged for whichever of its requests is more expensive.
        cost = max(pod_requests["cpu"] * cost_per_cpu,
                   pod_requests["memory"] * cost_per_memory)
        pods[(ns, pod.name)] = {
            "requests": pod_requests,
            "application": application,
            "component": component,
            "container_images": container_images,
            "cost": cost,
            "usage": new_resources(),
        }

    hourly_cost = cluster_cost / HOURS_PER_MONTH
    workers = [n for n in nodes.values() if n["role"] == node_label]

    cluster_summary = {
        "cluster": cluster,
        "nodes": nodes,
        "pods": pods,
        "namespaces": namespaces,
        "user_pods": sum(1 for ns, _ in pods if ns not in system_namespaces),
        "master_nodes": node_count["master"],
        "worker_nodes": node_count[node_label],
        "kubelet_versions": {n["kubelet_version"] for n in workers},
        "worker_instance_types": {n["instance_type"] for n in workers},
        "worker_instance_is_spot": any(n["spot"] for n in workers),
        "capacity": cluster_capacity,
        "allocatable": cluster_allocatable,
        "requests": cluster_requests,
        "user_requests": user_requests,
        "usage": cluster_usage,
        "cost": cluster_cost,
        # Half of the hourly cost is spread over the requested CPUs, the
        # other half over the requested GiB of memory; the MIN_* constants
        # bound the divisors from below.
        "cost_per_user_request_hour": {
            "cpu":
            0.5 * hourly_cost /
            max(user_requests["cpu"], MIN_CPU_USER_REQUESTS),
            "memory":
            0.5 * hourly_cost /
            max(user_requests["memory"] / ONE_GIBI, MIN_MEMORY_USER_REQUESTS),
        },
        "ingresses": [],
    }

    get_pod_usage(cluster, pods)

    cluster_slack_cost = 0
    for pod in pods.values():
        usage_cost = max(
            pod["usage"]["cpu"] * cost_per_cpu,
            pod["usage"]["memory"] * cost_per_memory,
        )
        # Slack is the part of the requested cost that is not used.
        pod["slack_cost"] = pod["cost"] - usage_cost
        cluster_slack_cost += pod["slack_cost"]

    cluster_summary["slack_cost"] = min(cluster_cost, cluster_slack_cost)

    with FuturesSession(max_workers=10, session=session) as futures_session:
        futures_by_host = {}  # hostname -> future
        futures = collections.defaultdict(list)  # future -> [ingress]

        for _ingress in Ingress.objects(cluster.client, namespace=pykube.all):
            application = get_application_from_labels(_ingress.labels)
            for rule in _ingress.obj["spec"].get("rules", []):
                host = rule.get('host', '')
                if not application:
                    # find the application by getting labels from pods
                    backend_application = find_backend_application(
                        cluster.client, _ingress, rule)
                else:
                    backend_application = None
                ingress = [
                    _ingress.namespace, _ingress.name, application
                    or backend_application, host, 0
                ]
                if host and not no_ingress_status:
                    # Probe every distinct host only once; all ingresses
                    # sharing the host receive the same status.
                    future = futures_by_host.get(host)
                    if future is None:
                        future = futures_session.get(f"https://{host}/",
                                                     timeout=5)
                        futures_by_host[host] = future
                    futures[future].append(ingress)
                cluster_summary["ingresses"].append(ingress)

        if not no_ingress_status:
            logger.info(
                f'Waiting for ingress status for {cluster.id} ({cluster.api_server_url})..'
            )
            for future in concurrent.futures.as_completed(futures):
                try:
                    status = future.result().status_code
                except Exception:
                    # 999 marks a host whose probe raised instead of
                    # returning an HTTP response.
                    status = 999
                for ingress in futures[future]:
                    ingress[4] = status

    return cluster_summary
Esempio n. 4
0
def query_cluster(cluster, executor, system_namespaces,
                  additional_cost_per_cluster, no_ingress_status, node_label):
    """Collect namespaces, nodes, pods, costs and ingresses of a cluster.

    Args:
        cluster: Cluster to query. The object is passed to ``request()`` for
            every API call; ``cluster.id`` and ``cluster.api_server_url``
            are logged.
        executor: Not used by this function.
        system_namespaces: Container of namespace names; pods in these
            namespaces do not count towards ``user_requests`` and
            ``user_pods``.
        additional_cost_per_cluster: Starting value of the cluster cost, to
            which the cost of every node is added.
        no_ingress_status: When true, ingress hosts are not probed and the
            status field of every ingress entry stays ``0``.
        node_label: Role value that identifies the nodes counted and
            summarised as workers.

    Returns:
        dict: Cluster summary with the keys ``cluster``, ``nodes``,
        ``pods``, ``namespaces``, ``user_pods``, ``master_nodes``,
        ``worker_nodes``, ``kubelet_versions``, ``worker_instance_types``,
        ``worker_instance_is_spot``, ``capacity``, ``allocatable``,
        ``requests``, ``user_requests``, ``usage``, ``cost``,
        ``cost_per_user_request_hour``, ``ingresses`` and ``slack_cost``.
        Each ingress entry is a list
        ``[namespace, name, application, host, status]``.

    Raises:
        Exception: Whatever ``request()`` or ``raise_for_status()`` raise
            for the namespace, node, pod or ingress listings.
    """
    logger.info(f"Querying cluster {cluster.id} ({cluster.api_server_url})..")
    pods = {}
    nodes = {}
    namespaces = {}

    response = request(cluster, "/api/v1/namespaces")
    response.raise_for_status()

    for item in response.json()["items"]:
        metadata = item["metadata"]
        # The email annotation is optional; it is None when absent.
        annotations = metadata.get("annotations") or {}
        namespaces[metadata["name"]] = {
            "status": item["status"]["phase"],
            "email": annotations.get("email"),
        }

    response = request(cluster, "/api/v1/nodes")
    response.raise_for_status()
    cluster_capacity = collections.defaultdict(float)
    cluster_allocatable = collections.defaultdict(float)
    cluster_requests = collections.defaultdict(float)
    user_requests = collections.defaultdict(float)
    node_count = collections.defaultdict(int)
    cluster_cost = additional_cost_per_cluster
    for node in response.json()["items"]:
        # The raw API object is enriched in place and stored by name.
        nodes[node["metadata"]["name"]] = node
        node["capacity"] = {}
        node["allocatable"] = {}
        node["requests"] = new_resources()
        node["usage"] = new_resources()
        for k, v in node["status"].get("capacity", {}).items():
            parsed = parse_resource(v)
            node["capacity"][k] = parsed
            cluster_capacity[k] += parsed
        for k, v in node["status"].get("allocatable", {}).items():
            parsed = parse_resource(v)
            node["allocatable"][k] = parsed
            cluster_allocatable[k] += parsed
        labels = node["metadata"].get("labels", {})
        # An absent or empty role label counts as "worker".
        role = labels.get(NODE_LABEL_ROLE) or "worker"
        node_count[role] += 1
        region = labels.get(NODE_LABEL_REGION, "unknown")
        instance_type = labels.get(NODE_LABEL_INSTANCE_TYPE, "unknown")
        is_spot = labels.get(NODE_LABEL_SPOT) == "true"
        node["spot"] = is_spot
        node["kubelet_version"] = node["status"].get("nodeInfo", {}).get(
            "kubeletVersion", "")
        node["role"] = role
        node["instance_type"] = instance_type
        node["cost"] = pricing.get_node_cost(region, instance_type, is_spot)
        cluster_cost += node["cost"]

    get_node_usage(cluster, nodes)

    cluster_usage = collections.defaultdict(float)
    for node in nodes.values():
        for k, v in node['usage'].items():
            cluster_usage[k] += v

    # A cluster without allocatable resources (e.g. no nodes) must not
    # abort the report with ZeroDivisionError; its pods then cost nothing.
    allocatable_cpu = cluster_allocatable["cpu"]
    allocatable_memory = cluster_allocatable["memory"]
    cost_per_cpu = cluster_cost / allocatable_cpu if allocatable_cpu else 0.0
    cost_per_memory = (cluster_cost / allocatable_memory
                       if allocatable_memory else 0.0)

    response = request(cluster, "/api/v1/pods")
    response.raise_for_status()
    for pod in response.json()["items"]:
        if pod["status"].get("phase") != "Running":
            # ignore unschedulable/completed pods
            continue
        labels = pod["metadata"].get("labels", {})
        application = get_application_from_labels(labels)
        pod_requests = collections.defaultdict(float)
        ns = pod["metadata"]["namespace"]
        for container in pod["spec"]["containers"]:
            for k, v in container["resources"].get("requests", {}).items():
                pv = parse_resource(v)
                pod_requests[k] += pv
                cluster_requests[k] += pv
                if ns not in system_namespaces:
                    user_requests[k] += pv
        node_name = pod["spec"].get("nodeName")
        if "nodeName" in pod["spec"] and node_name in nodes:
            for k in ("cpu", "memory"):
                nodes[node_name]["requests"][k] += pod_requests.get(k, 0)
        # A pod is charged for whichever of its requests is more expensive.
        cost = max(pod_requests["cpu"] * cost_per_cpu,
                   pod_requests["memory"] * cost_per_memory)
        pods[(ns, pod["metadata"]["name"])] = {
            "requests": pod_requests,
            "application": application,
            "cost": cost,
            "usage": new_resources(),
        }

    hourly_cost = cluster_cost / HOURS_PER_MONTH
    workers = [n for n in nodes.values() if n["role"] == node_label]

    cluster_summary = {
        "cluster": cluster,
        "nodes": nodes,
        "pods": pods,
        "namespaces": namespaces,
        "user_pods": sum(1 for ns, _ in pods if ns not in system_namespaces),
        "master_nodes": node_count["master"],
        "worker_nodes": node_count[node_label],
        "kubelet_versions": {n["kubelet_version"] for n in workers},
        "worker_instance_types": {n["instance_type"] for n in workers},
        "worker_instance_is_spot": any(n["spot"] for n in workers),
        "capacity": cluster_capacity,
        "allocatable": cluster_allocatable,
        "requests": cluster_requests,
        "user_requests": user_requests,
        "usage": cluster_usage,
        "cost": cluster_cost,
        # Half of the hourly cost is spread over the requested CPUs, the
        # other half over the requested GiB of memory; the MIN_* constants
        # bound the divisors from below.
        "cost_per_user_request_hour": {
            "cpu":
            0.5 * hourly_cost /
            max(user_requests["cpu"], MIN_CPU_USER_REQUESTS),
            "memory":
            0.5 * hourly_cost /
            max(user_requests["memory"] / ONE_GIBI, MIN_MEMORY_USER_REQUESTS),
        },
        "ingresses": [],
    }

    get_pod_usage(cluster, pods)

    cluster_slack_cost = 0
    for pod in pods.values():
        usage_cost = max(
            pod["usage"]["cpu"] * cost_per_cpu,
            pod["usage"]["memory"] * cost_per_memory,
        )
        # Slack is the part of the requested cost that is not used.
        pod["slack_cost"] = pod["cost"] - usage_cost
        cluster_slack_cost += pod["slack_cost"]

    cluster_summary["slack_cost"] = min(cluster_cost, cluster_slack_cost)

    response = request(cluster, "/apis/extensions/v1beta1/ingresses")
    response.raise_for_status()

    with FuturesSession(max_workers=10, session=session) as futures_session:
        futures = {}  # future -> ingress entry
        for item in response.json()["items"]:
            namespace = item["metadata"]["namespace"]
            name = item["metadata"]["name"]
            labels = item["metadata"].get("labels", {})
            application = get_application_from_labels(labels)
            for rule in item["spec"].get("rules", []):
                host = rule.get('host', '')
                ingress = [namespace, name, application, host, 0]
                # Rules without a host cannot be probed.
                if host and not no_ingress_status:
                    future = futures_session.get(f"https://{host}/",
                                                 timeout=5)
                    futures[future] = ingress
                cluster_summary["ingresses"].append(ingress)

        if not no_ingress_status:
            logger.info("Waiting for ingress status..")
            for future in concurrent.futures.as_completed(futures):
                ingress = futures[future]
                try:
                    status = future.result().status_code
                except Exception:
                    # 999 marks a host whose probe raised instead of
                    # returning an HTTP response.
                    status = 999
                ingress[4] = status

    return cluster_summary