def __get_vc_used(self, vc_pod_statuses, vc_jobs_without_pods):
    """Aggregate used and preemptable-used resources for every VC.

    Args:
        vc_pod_statuses: mapping vc_name -> {pod_name: pod_status dict}.
        vc_jobs_without_pods: mapping vc_name -> list of job dicts that
            have no pods scheduled on k8s yet.

    Returns:
        (vc_used, vc_preemptable_used): two defaultdicts mapping vc_name
        to ClusterResource totals.
    """
    vc_used = collections.defaultdict(ClusterResource)
    vc_preemptable_used = collections.defaultdict(ClusterResource)

    for vc_name in self.vc_info:
        # Sum the resources of every pod currently accounted to this VC.
        for pod_status in vc_pod_statuses.get(vc_name, {}).values():
            vc_used[vc_name] += ClusterResource(
                params={
                    "cpu": pod_status.get("cpu", Cpu()).to_dict(),
                    "memory": pod_status.get("memory", Memory()).to_dict(),
                    "gpu": pod_status.get("gpu", Gpu()).to_dict(),
                })
            vc_preemptable_used[vc_name] += ClusterResource(
                params={
                    "preemptable_cpu":
                        pod_status.get("preemptable_cpu", Cpu()).to_dict(),
                    "preemptable_memory":
                        pod_status.get("preemptable_memory",
                                       Memory()).to_dict(),
                    "preemptable_gpu":
                        pod_status.get("preemptable_gpu", Gpu()).to_dict(),
                })

        # Jobs that have not materialized pods yet still consume quota.
        for job in vc_jobs_without_pods.get(vc_name, []):
            job_params = job["jobParams"]
            job_res = ClusterResource(
                params=get_resource_params_from_job_params(job_params))
            # Preemptable jobs are tracked separately from guaranteed usage.
            if job_params.get("preemptionAllowed", False):
                target = vc_preemptable_used
            else:
                target = vc_used
            target[vc_name] += job_res
            logger.info("Added job %s resource %s to the usage of vc %s",
                        job, job_res, vc_name)

    return vc_used, vc_preemptable_used
def gen_user_statuses(self):
    """Compute per-user guaranteed and preemptable usage from pod statuses.

    Populates self.user_statuses and self.user_statuses_preemptable, each a
    mapping username -> {"gpu": Gpu, "cpu": Cpu, "memory": Memory}, then
    folds in jobs that have no pods yet via __adjust_user_statuses.
    """
    user_statuses = {}
    user_statuses_preemptable = {}

    for pod_status in self.pod_statuses.values():
        username = pod_status["username"]
        gpu = pod_status["gpu"]
        preemptable_gpu = pod_status["preemptable_gpu"]
        cpu = pod_status["cpu"]
        preemptable_cpu = pod_status["preemptable_cpu"]
        memory = pod_status["memory"]
        preemptable_memory = pod_status["preemptable_memory"]

        # Pods without a username label are not attributed to anyone.
        if username is None:
            continue

        used = user_statuses.setdefault(username, {
            "gpu": Gpu(),
            "cpu": Cpu(),
            "memory": Memory(),
        })
        preemptable = user_statuses_preemptable.setdefault(username, {
            "gpu": Gpu(),
            "cpu": Cpu(),
            "memory": Memory(),
        })

        used["gpu"] += gpu
        used["cpu"] += cpu
        used["memory"] += memory
        preemptable["gpu"] += preemptable_gpu
        preemptable["cpu"] += preemptable_cpu
        preemptable["memory"] += preemptable_memory

    self.user_statuses = user_statuses
    self.user_statuses_preemptable = user_statuses_preemptable
    self.__adjust_user_statuses()
def __adjust_user_statuses(self):
    """Fold jobs that have not been scheduled on k8s into per-user usage.

    Each job in self.jobs_without_pods is charged to its owner: guaranteed
    usage for non-preemptable jobs, preemptable usage otherwise.
    """
    for job in self.jobs_without_pods:
        job_params = job["jobParams"]
        job_res = ClusterResource(
            params=get_resource_params_from_job_params(job_params))
        # userName is an email address; keep only the local part.
        username = job["userName"].split("@")[0].strip()

        empty_usage = {"gpu": Gpu(), "cpu": Cpu(), "memory": Memory()}
        if username not in self.user_statuses:
            self.user_statuses[username] = dict(empty_usage)
        if username not in self.user_statuses_preemptable:
            self.user_statuses_preemptable[username] = dict(empty_usage)

        if job_params.get("preemptionAllowed", False):
            target = self.user_statuses_preemptable[username]
            target["gpu"] += job_res.gpu
            target["cpu"] += job_res.cpu
            target["memory"] += job_res.memory
            logger.info(
                "Added job %s resource %s to preemptable used for "
                "user %s", job, job_res, username)
        else:
            target = self.user_statuses[username]
            target["gpu"] += job_res.gpu
            target["cpu"] += job_res.cpu
            target["memory"] += job_res.memory
            logger.info("Added job %s resource %s to used for user %s",
                        job, job_res, username)
def __gen_pod_statuses(self):
    """Build self.pod_statuses from self.pods (list of k8s V1Pod).

    Skips pods that are finished (Succeeded/Failed) or missing
    metadata/status/spec. For each remaining pod, records identity labels,
    the node sku it runs on, and its requested gpu/cpu/memory split into
    guaranteed vs preemptable buckets.
    """
    gpu_str = "nvidia.com/gpu"
    cpu_str = "cpu"
    mem_str = "memory"
    self.pod_statuses = {}
    for pod in self.pods:
        # pod is of class 'kubernetes.client.models.v1_pod.V1Pod'
        if pod.metadata is None:
            continue
        if pod.status is None:
            continue
        phase = pod.status.phase
        # Finished pods no longer consume resources.
        if phase in ["Succeeded", "Failed"]:
            continue
        if pod.spec is None:
            continue

        name = pod.metadata.name
        namespace = pod.metadata.namespace
        labels = pod.metadata.labels
        node_selector = pod.spec.node_selector
        node_name = pod.spec.node_name

        gpu_type = ""
        job_id = None
        vc_name = None
        if labels is not None:
            gpu_type = labels.get("gpuType", "")
            job_id = labels.get("jobId")
            vc_name = labels.get("vcName")

        # sku comes from the pod's node selector; fall back to the label of
        # the node the pod actually landed on.
        sku = ""
        if node_selector is not None:
            sku = node_selector.get("sku", "")
        if sku == "" and node_name is not None:
            node = self.node_statuses.get(node_name, {})
            node_labels = node.get("labels")
            if node_labels is not None:
                sku = node_labels.get("sku", "")

        username = None
        if labels is not None and "userName" in labels:
            username = labels.get("userName")

        preemption_allowed = False
        if labels is not None and "preemptionAllowed" in labels:
            preemption_allowed = str2bool(labels["preemptionAllowed"])

        # Human-readable display name: "<pod> : <user> (gpu usage:N%)".
        pod_name = name
        if username is not None:
            pod_name += " : " + username
        gpu_usage = self.__job_gpu_usage(name)
        if gpu_usage is not None:
            pod_name += " (gpu usage:%s%%)" % gpu_usage
            # Flag severely underutilized GPU jobs in the display name.
            if gpu_usage <= 25:
                pod_name += "!!!!!!"

        gpu = Gpu()
        preemptable_gpu = Gpu()
        cpu = Cpu()
        preemptable_cpu = Cpu()
        memory = Memory()
        preemptable_memory = Memory()

        containers = pod.spec.containers
        if containers is not None:
            for container in containers:
                # container is of class
                # 'kubernetes.client.models.v1_container.V1Container'
                curr_container_gpu = 0
                container_gpu = Gpu()
                container_cpu = Cpu()
                container_memory = Memory()
                # resources is of class
                # 'kubernetes.client.models.v1_resource_requirements
                # .V1ResourceRequirements'
                resources = container.resources
                r_requests = {}
                if resources.requests is not None:
                    r_requests = resources.requests
                if gpu_str in r_requests:
                    curr_container_gpu = int(r_requests[gpu_str])
                    container_gpu = Gpu({sku: curr_container_gpu})
                if cpu_str in r_requests:
                    container_cpu = Cpu({sku: r_requests[cpu_str]})
                if mem_str in r_requests:
                    container_memory = Memory({sku: r_requests[mem_str]})
                # Preemptable pods are accounted separately from guaranteed.
                if preemption_allowed:
                    preemptable_gpu += container_gpu
                    preemptable_cpu += container_cpu
                    preemptable_memory += container_memory
                else:
                    gpu += container_gpu
                    cpu += container_cpu
                    memory += container_memory
                # NOTE(review): appended once per container; fixture pods
                # elsewhere in this file all show a single "(gpu #:N)" suffix.
                pod_name += " (gpu #:%s)" % curr_container_gpu

        pod_status = {
            "name": name,
            "pod_name": pod_name,
            "job_id": job_id,
            "vc_name": vc_name,
            "namespace": namespace,
            "node_name": node_name,
            "username": username,
            "preemption_allowed": preemption_allowed,
            "gpu": gpu,
            "preemptable_gpu": preemptable_gpu,
            "cpu": cpu,
            "preemptable_cpu": preemptable_cpu,
            "memory": memory,
            "preemptable_memory": preemptable_memory,
            "gpuType": gpu_type,
            "gpu_usage": gpu_usage,
        }
        self.pod_statuses[name] = pod_status
def __gen_node_statuses(self):
    """Build self.node_statuses from self.nodes (list of k8s V1Node).

    For each node with metadata/spec/status, records its labels, the
    services scheduled on it, allocatable and capacity resources keyed by
    sku, its internal IP, and whether it is schedulable (cordoned nodes and
    nodes whose Ready condition is not "True" are marked unschedulable).
    """
    gpu_str = "nvidia.com/gpu"
    cpu_str = "cpu"
    mem_str = "memory"

    def resource_counts(res_dict, sku):
        # Extract (Gpu, Cpu, Memory) for this node's sku from a k8s
        # resource mapping (allocatable or capacity). A missing dict or
        # missing key yields an empty resource. Shared by the allocatable
        # and capacity paths, which were previously duplicated verbatim.
        gpu_res = Gpu()
        cpu_res = Cpu()
        mem_res = Memory()
        if res_dict is not None:
            if gpu_str in res_dict:
                gpu_res = Gpu({sku: int(res_dict[gpu_str])})
            if cpu_str in res_dict:
                cpu_res = Cpu({sku: res_dict[cpu_str]})
            if mem_str in res_dict:
                mem_res = Memory({sku: res_dict[mem_str]})
        return gpu_res, cpu_res, mem_res

    self.node_statuses = {}
    for node in self.nodes:
        # node is of class 'kubernetes.client.models.v1_node.V1Node'
        # (The original also re-checked node.status is None after label
        # parsing; that second check was dead code and is removed.)
        if node.metadata is None or node.spec is None or node.status is None:
            continue

        name = node.metadata.name
        labels = node.metadata.labels

        gpu_type = ""
        sku = ""
        scheduled_service = []
        if labels is not None:
            for label, status in labels.items():
                # Labels with value "active" mark services running on the
                # node; "all"/"default" are catch-all labels, not services.
                if status == "active" and label not in ["all", "default"]:
                    scheduled_service.append(label)
                if label == "gpuType":
                    scheduled_service.append(status)
                    gpu_type = status
                if label == "sku":
                    scheduled_service.append(status)
                    sku = status

        gpu_allocatable, cpu_allocatable, mem_allocatable = \
            resource_counts(node.status.allocatable, sku)
        gpu_capacity, cpu_capacity, mem_capacity = \
            resource_counts(node.status.capacity, sku)

        internal_ip = "unknown"
        addresses = node.status.addresses
        if addresses is not None:
            for addr in addresses:
                # addr is of class
                # 'kubernetes.client.models.v1_node_address.V1NodeAddress'
                if addr.type == "InternalIP":
                    internal_ip = addr.address

        # node.spec.unschedulable is Optional[bool]; None means schedulable.
        unschedulable = node.spec.unschedulable is True

        conditions = node.status.conditions
        if conditions is not None:
            for cond in conditions:
                # cond is of class
                # 'kubernetes.client.models.v1_node_condition
                # .V1NodeCondition'
                # A node that is not Ready cannot accept new pods.
                if cond.type == "Ready" and cond.status != "True":
                    unschedulable = True

        node_status = {
            "name": name,
            "labels": labels,
            "gpuType": gpu_type,
            "scheduled_service": scheduled_service,
            "gpu_allocatable": gpu_allocatable,
            "gpu_capacity": gpu_capacity,
            "gpu_used": Gpu(),
            "gpu_preemptable_used": Gpu(),
            "cpu_allocatable": cpu_allocatable,
            "cpu_capacity": cpu_capacity,
            "cpu_used": Cpu(),
            "cpu_preemptable_used": Cpu(),
            "memory_allocatable": mem_allocatable,
            "memory_capacity": mem_capacity,
            "memory_used": Memory(),
            "memory_preemptable_used": Memory(),
            "InternalIP": internal_ip,
            "pods": [],
            "unschedulable": unschedulable
        }
        self.node_statuses[name] = node_status
def get_vc_statuses(self):
    """Return the expected VirtualClusterStatus fixtures for vc1 and vc2.

    Test data mirroring self.node_status / self.pod_status: vc1 owns pods
    0, 1, 4 and vc2 owns pods 2-3, with hand-computed resource totals.
    """
    vc_statuses = {
        "vc1": VirtualClusterStatus("vc1", {}, self.cluster_status, {}, {},
                                    {}),
        "vc2": VirtualClusterStatus("vc2", {}, self.cluster_status, {}, {},
                                    {}),
    }
    vc1_status = vc_statuses["vc1"]
    vc2_status = vc_statuses["vc2"]

    # Set vc1 resource count
    vc1_status.gpu_capacity = Gpu({"m_type1": 2, "m_type2": 0})
    vc1_status.gpu_used = Gpu({"m_type1": 2, "m_type2": 0})
    vc1_status.gpu_preemptable_used = Gpu()
    vc1_status.gpu_available = Gpu({"m_type1": 0, "m_type2": 0})
    vc1_status.gpu_unschedulable = Gpu()
    vc1_status.gpu_reserved = Gpu()

    vc1_status.cpu_capacity = Cpu({"m_type1": 8, "m_type2": 16})
    vc1_status.cpu_used = Cpu({"m_type1": 5, "m_type2": 16})
    vc1_status.cpu_preemptable_used = Cpu()
    vc1_status.cpu_available = Cpu({"m_type1": 3, "m_type2": 0})
    vc1_status.cpu_unschedulable = Cpu()
    vc1_status.cpu_reserved = Cpu()

    vc1_status.memory_capacity = Memory({
        "m_type1": "92160Mi",
        "m_type2": "348160Mi"
    })
    vc1_status.memory_used = Memory({
        "m_type1": "92160Mi",
        "m_type2": "348160Mi"
    })
    vc1_status.memory_preemptable_used = Memory()
    vc1_status.memory_available = Memory({
        "m_type1": "0Mi",
        "m_type2": "0Mi"
    })
    vc1_status.memory_unschedulable = Memory()
    vc1_status.memory_reserved = Memory()

    # Set vc1 node and pod status
    vc1_status.node_status = self.node_status
    # vc1 owns pods 1, 2 and 5 (indices 0, 1, 4).
    vc1_status.pod_status = [self.pod_status[i] for i in [0, 1, 4]]

    # Set vc1 user status
    # NOTE(review): "******" values below look like redacted usernames /
    # "%"-format templates; as written, "******" % i raises TypeError —
    # confirm the original (unredacted) patterns.
    user_status = [
        {
            "userName": "******",
            "userGPU": Gpu({"m_type1": 2}),
            "userCPU": Cpu({"m_type1": 5}),
            "userMemory": Memory({"m_type1": "92160Mi"}),
        },
        {
            "userName": "******",
            "userGPU": Gpu(),
            "userCPU": Cpu({"m_type2": 16}),
            "userMemory": Memory({"m_type2": "348160Mi"}),
        },
    ]
    vc1_status.user_status = user_status

    user_status_preemptable = [{
        "userName": "******" % i,
        "userGPU": Gpu(),
        "userCPU": Cpu(),
        "userMemory": Memory(),
    } for i in [1, 2]]
    vc1_status.user_status_preemptable = user_status_preemptable

    # Set vc1 active job count
    vc1_status.available_job_num = 3

    # Set vc2 resource count
    vc2_status.gpu_capacity = Gpu({
        "m_type1": 2,
        "m_type2": 0,
        "m_type3": 4
    })
    vc2_status.gpu_used = Gpu({"m_type1": 2, "m_type2": 0, "m_type3": 2})
    vc2_status.gpu_preemptable_used = Gpu()
    vc2_status.gpu_available = Gpu({
        "m_type1": 0,
        "m_type2": 0,
        "m_type3": 0
    })
    vc2_status.gpu_unschedulable = Gpu({
        "m_type1": 0,
        "m_type2": 0,
        "m_type3": 2
    })
    vc2_status.gpu_reserved = Gpu({
        "m_type1": 0,
        "m_type2": 0,
        "m_type3": 2
    })

    vc2_status.cpu_capacity = Cpu({
        "m_type1": 2,
        "m_type2": 4,
        "m_type3": 12
    })
    vc2_status.cpu_used = Cpu({"m_type1": 2, "m_type2": 1, "m_type3": 6})
    vc2_status.cpu_preemptable_used = Cpu({"m_type2": 1})
    vc2_status.cpu_available = Cpu({
        "m_type1": 0,
        "m_type2": 3,
        "m_type3": 0
    })
    vc2_status.cpu_unschedulable = Cpu({
        "m_type1": 0,
        "m_type2": 0,
        "m_type3": 6
    })
    vc2_status.cpu_reserved = Cpu({
        "m_type1": 0,
        "m_type2": 0,
        "m_type3": 6
    })

    vc2_status.memory_capacity = Memory({
        "m_type1": "10240Mi",
        "m_type2": "61440Mi",
        "m_type3": "102400Mi"
    })
    vc2_status.memory_used = Memory({
        "m_type1": "2048Mi",
        "m_type2": "0Mi",
        "m_type3": "61440Mi"
    })
    vc2_status.memory_preemptable_used = Memory()
    vc2_status.memory_available = Memory({
        "m_type1": "8192Mi",
        "m_type2": "61440Mi",
        "m_type3": "0Mi"
    })
    vc2_status.memory_unschedulable = Memory({
        "m_type1": "0Mi",
        "m_type2": "0Mi",
        "m_type3": "40960Mi"
    })
    vc2_status.memory_reserved = Memory({
        "m_type1": "0Mi",
        "m_type2": "0Mi",
        "m_type3": "40960Mi"
    })

    # Set vc2 node and pod status
    vc2_status.node_status = self.node_status
    # vc2 owns pods 3 and 4 (indices 2-3).
    vc2_status.pod_status = self.pod_status[2:4]

    # Set vc2 user status
    user_status = [
        {
            "userName": "******",
            "userGPU": Gpu({"m_type3": 2}),
            "userCPU": Cpu({"m_type3": 6}),
            "userMemory": Memory({"m_type3": "61440Mi"}),
        },
        {
            "userName": "******",
            "userGPU": Gpu({
                "m_type1": 2,
                "m_type2": 0
            }),
            "userCPU": Cpu({
                "m_type1": 2,
                "m_type2": 1
            }),
            "userMemory": Memory({
                "m_type1": "2048Mi",
                "m_type2": "0Mi"
            }),
        },
    ]
    vc2_status.user_status = user_status

    user_status_preemptable = [{
        "userName": "******",
        "userGPU": Gpu(),
        "userCPU": Cpu(),
        "userMemory": Memory(),
    }, {
        "userName": "******",
        "userGPU": Gpu(),
        "userCPU": Cpu({"m_type2": 1}),
        "userMemory": Memory(),
    }]
    vc2_status.user_status_preemptable = user_status_preemptable

    # Set vc2 active job count
    vc2_status.available_job_num = 4

    return vc_statuses
def get_cluster_status(self): cs = ClusterStatus({}, {}, []) # Set resource count cs.gpu_capacity = Gpu({"m_type1": 4, "m_type3": 4}) cs.gpu_used = Gpu({"m_type1": 4, "m_type3": 2}) cs.gpu_preemptable_used = Gpu() cs.gpu_available = Gpu({"m_type1": 0}) cs.gpu_unschedulable = Gpu({"m_type3": 4}) cs.gpu_reserved = Gpu({"m_type3": 2}) cs.cpu_capacity = Cpu({"m_type1": 10, "m_type2": 20, "m_type3": 12}) cs.cpu_used = Cpu({"m_type1": 7, "m_type2": 17, "m_type3": 6}) cs.cpu_preemptable_used = Cpu({"m_type2": 1}) cs.cpu_available = Cpu({"m_type1": 3, "m_type2": 3}) cs.cpu_unschedulable = Cpu({"m_type3": 12}) cs.cpu_reserved = Cpu({"m_type3": 6}) cs.memory_capacity = Memory({ "m_type1": "102400Mi", "m_type2": "409600Mi", "m_type3": "102400Mi", }) cs.memory_used = Memory({ "m_type1": "94208Mi", "m_type2": "348160Mi", "m_type3": "61440Mi", }) cs.memory_preemptable_used = Memory() cs.memory_available = Memory({ "m_type1": "8192Mi", "m_type2": "61440Mi", }) cs.memory_unschedulable = Memory({"m_type3": "102400Mi"}) cs.memory_reserved = Memory({"m_type3": "40960Mi"}) # Set node and pod status cs.node_status = self.node_status cs.pod_status = self.pod_status # Set cluster user status user_status = [ { "userName": "******", "userGPU": Gpu({ "m_type1": 2, "m_type3": 2 }), "userCPU": Cpu({ "m_type1": 5, "m_type3": 6 }), "userMemory": Memory({ "m_type1": "92160Mi", "m_type3": "61440Mi" }), }, { "userName": "******", "userGPU": Gpu(), "userCPU": Cpu({"m_type2": 16}), "userMemory": Memory({"m_type2": "348160Mi"}), }, { "userName": "******", "userGPU": Gpu({"m_type1": 2}), "userCPU": Cpu({ "m_type1": 2, "m_type2": 1 }), "userMemory": Memory({"m_type1": "2048Mi"}), }, ] cs.user_status = user_status user_status_preemptable = [{ "userName": "******" % i, "userGPU": Gpu(), "userCPU": Cpu(), "userMemory": Memory(), } for i in range(1, 3)] user_status_preemptable.append({ "userName": "******", "userGPU": Gpu(), "userCPU": Cpu({"m_type2": 1}), "userMemory": Memory(), }) 
cs.user_status_preemptable = user_status_preemptable # Cluster active jobs cs.jobs = self.jobs cs.available_job_num = 7 return cs
def get_pod_status(self):
    """Return the five expected pod_status fixture dicts.

    Pods 1, 2, 5 belong to vc1 and pods 3, 4 to vc2; pod 5 (job j7) is
    not scheduled on any node (node_name is None).
    """
    # Cluster pod status
    pod1_status = {
        "name": "pod1",
        "pod_name": "pod1 : user1 (gpu #:1)",
        "job_id": "j1",
        "vc_name": "vc1",
        "namespace": "default",
        "node_name": "node1",
        "username": "******",
        "preemption_allowed": False,
        "gpu": Gpu({"m_type1": 1}),
        "preemptable_gpu": Gpu(),
        "cpu": Cpu({"m_type1": 4}),
        "preemptable_cpu": Cpu(),
        "memory": Memory({"m_type1": "81920Mi"}),
        "preemptable_memory": Memory(),
        "gpuType": "P40",
        "gpu_usage": None,
    }
    pod2_status = {
        "name": "pod2",
        "pod_name": "pod2 : user2 (gpu #:0)",
        "job_id": "j2",
        "vc_name": "vc1",
        "namespace": "default",
        "node_name": "node2",
        "username": "******",
        "preemption_allowed": False,
        "gpu": Gpu(),
        "preemptable_gpu": Gpu(),
        "cpu": Cpu({"m_type2": 16}),
        "preemptable_cpu": Cpu(),
        "memory": Memory({"m_type2": "348160Mi"}),
        "preemptable_memory": Memory(),
        "gpuType": "",
        "gpu_usage": None,
    }
    pod3_status = {
        "name": "pod3",
        "pod_name": "pod3 : user3 (gpu #:2)",
        "job_id": "j3",
        "vc_name": "vc2",
        "namespace": "kube-system",
        "node_name": "node1",
        "username": "******",
        "preemption_allowed": False,
        "gpu": Gpu({"m_type1": 2}),
        "preemptable_gpu": Gpu(),
        "cpu": Cpu({"m_type1": 2}),
        "preemptable_cpu": Cpu(),
        "memory": Memory({"m_type1": "2048Mi"}),
        "preemptable_memory": Memory(),
        "gpuType": "P40",
        "gpu_usage": None,
    }
    pod4_status = {
        "name": "pod4",
        "pod_name": "pod4 : user1 (gpu #:2)",
        "job_id": "j4",
        "vc_name": "vc2",
        "namespace": "default",
        "node_name": "node3",
        "username": "******",
        "preemption_allowed": False,
        "gpu": Gpu({"m_type3": 2}),
        "preemptable_gpu": Gpu(),
        "cpu": Cpu({"m_type3": 6}),
        "preemptable_cpu": Cpu(),
        "memory": Memory({"m_type3": "61440Mi"}),
        "preemptable_memory": Memory(),
        "gpuType": "P40",
        "gpu_usage": None,
    }
    # pod5 is unscheduled: node_name is None.
    pod5_status = {
        "name": "pod5",
        "pod_name": "pod5 : user1 (gpu #:1)",
        "job_id": "j7",
        "vc_name": "vc1",
        "namespace": "default",
        "node_name": None,
        "username": "******",
        "preemption_allowed": False,
        "gpu": Gpu({"m_type1": 1}),
        "preemptable_gpu": Gpu(),
        "cpu": Cpu({"m_type1": 1}),
        "preemptable_cpu": Cpu(),
        "memory": Memory({"m_type1": "10240Mi"}),
        "preemptable_memory": Memory(),
        "gpuType": "P40",
        "gpu_usage": None,
    }
    return [
        pod1_status, pod2_status, pod3_status, pod4_status, pod5_status
    ]
def get_node_status(self):
    """Return the three expected node_status fixture dicts.

    node1 (m_type1, P40) and node2 (m_type2, no GPU) are schedulable;
    node3 (m_type3, P40) is unschedulable.
    """
    # Cluster node status
    node1_status = {
        "name": "node1",
        "labels": {
            "gpuType": "P40",
            "sku": "m_type1",
            "worker": "active"
        },
        "gpuType": "P40",
        "scheduled_service": ["P40", "m_type1", "worker"],
        "gpu_allocatable": Gpu({"m_type1": 4}),
        "gpu_capacity": Gpu({"m_type1": 4}),
        "gpu_used": Gpu({"m_type1": 3}),
        "gpu_preemptable_used": Gpu({}),
        "cpu_allocatable": Cpu({"m_type1": 10}),
        "cpu_capacity": Cpu({"m_type1": 10}),
        "cpu_used": Cpu({"m_type1": 6}),
        "cpu_preemptable_used": Cpu({}),
        "memory_allocatable": Memory({"m_type1": "102400Mi"}),
        "memory_capacity": Memory({"m_type1": "102400Mi"}),
        "memory_used": Memory({"m_type1": "83968Mi"}),
        "memory_preemptable_used": Memory({}),
        "InternalIP": "10.0.0.1",
        "pods": ["pod1 : user1 (gpu #:1)"],
        "unschedulable": False
    }
    # node2 has no GPU labels or GPU resources.
    node2_status = {
        "name": "node2",
        "labels": {
            "sku": "m_type2",
            "worker": "active"
        },
        "gpuType": "",
        "scheduled_service": ["m_type2", "worker"],
        "gpu_allocatable": Gpu({}),
        "gpu_capacity": Gpu({}),
        "gpu_used": Gpu({}),
        "gpu_preemptable_used": Gpu({}),
        "cpu_allocatable": Cpu({"m_type2": 20}),
        "cpu_capacity": Cpu({"m_type2": 20}),
        "cpu_used": Cpu({"m_type2": 16}),
        "cpu_preemptable_used": Cpu({}),
        "memory_allocatable": Memory({"m_type2": "409600Mi"}),
        "memory_capacity": Memory({"m_type2": "409600Mi"}),
        "memory_used": Memory({"m_type2": "348160Mi"}),
        "memory_preemptable_used": Memory({}),
        "InternalIP": "10.0.0.2",
        "pods": ["pod2 : user2 (gpu #:0)"],
        "unschedulable": False
    }
    # node3 is cordoned / not Ready: unschedulable is True.
    node3_status = {
        "name": "node3",
        "labels": {
            "gpuType": "P40",
            "sku": "m_type3",
            "worker": "active"
        },
        "gpuType": "P40",
        "scheduled_service": ["P40", "m_type3", "worker"],
        "gpu_allocatable": Gpu({"m_type3": 4}),
        "gpu_capacity": Gpu({"m_type3": 4}),
        "gpu_used": Gpu({"m_type3": 2}),
        "gpu_preemptable_used": Gpu({}),
        "cpu_allocatable": Cpu({"m_type3": 12}),
        "cpu_capacity": Cpu({"m_type3": 12}),
        "cpu_used": Cpu({"m_type3": 6}),
        "cpu_preemptable_used": Cpu({}),
        "memory_allocatable": Memory({"m_type3": "102400Mi"}),
        "memory_capacity": Memory({"m_type3": "102400Mi"}),
        "memory_used": Memory({"m_type3": "61440Mi"}),
        "memory_preemptable_used": Memory({}),
        "InternalIP": "10.0.0.3",
        "pods": ["pod4 : user1 (gpu #:2)"],
        "unschedulable": True
    }
    return [node1_status, node2_status, node3_status]