def test_ceil(self):
    v = ClusterResource(
        params={
            "cpu": {"r1": "1.5"},
            "memory": {"r1": "100.2"},
            "gpu": {"r1": "10.4"},
            "gpu_memory": {"r1": "199.9"},
        }).ceil
    expected = ClusterResource(
        params={
            "cpu": {"r1": "2"},
            "memory": {"r1": "101"},
            "gpu": {"r1": "11"},
            "gpu_memory": {"r1": "200"},
        })
    self.assertEqual(expected, v)
def test_floor(self):
    v = ClusterResource(
        params={
            "cpu": {"r1": "1.5"},
            "memory": {"r1": "100.2"},
            "gpu": {"r1": "10.4"},
            "gpu_memory": {"r1": "199.9"},
        }).floor
    expected = ClusterResource(
        params={
            "cpu": {"r1": "1"},
            "memory": {"r1": "100"},
            "gpu": {"r1": "10"},
            "gpu_memory": {"r1": "199"},
        })
    self.assertEqual(expected, v)
def setUp(self):
    v1_params = {
        "cpu": {"r1": "2", "r2": "4"},
        "memory": {"r1": "100Ki", "r2": "200Ki"},
        "gpu": {"r1": "1", "r2": "2"},
        "gpu_memory": {"r1": "100Ki", "r2": "200Ki"},
    }
    self.v1 = ClusterResource(params=v1_params)

    v2_params = {
        "cpu": {"r1": "2", "r2": "2"},
        "memory": {"r1": "400Ki", "r2": "100Ki"},
        "gpu": {"r1": "4", "r2": "4"},
        "gpu_memory": {"r1": "400Ki", "r2": "400Ki"},
    }
    self.v2 = ClusterResource(params=v2_params)

    v3_params = {
        "cpu": {"r1": "0.5"},
        "memory": {"r1": "0.5"},
        "gpu": {"r1": "1"},
        "gpu_memory": {"r1": "1"},
    }
    self.v3 = ClusterResource(params=v3_params)

    self.scalar = 0.5
def __get_vc_used(self, vc_pod_statuses, vc_jobs_without_pods):
    vc_used = collections.defaultdict(lambda: ClusterResource())
    vc_preemptable_used = collections.defaultdict(lambda: ClusterResource())

    for vc_name in self.vc_info:
        # Account for all pods in the vc
        pod_statuses = vc_pod_statuses.get(vc_name, {})
        for _, pod_status in pod_statuses.items():
            pod_res = ClusterResource(
                params={
                    "cpu": pod_status.get("cpu", Cpu()).to_dict(),
                    "memory": pod_status.get("memory", Memory()).to_dict(),
                    "gpu": pod_status.get("gpu", Gpu()).to_dict(),
                })
            vc_used[vc_name] += pod_res

            pod_preemptable_res = ClusterResource(
                params={
                    "preemptable_cpu":
                        pod_status.get("preemptable_cpu", Cpu()).to_dict(),
                    "preemptable_memory":
                        pod_status.get("preemptable_memory", Memory()).to_dict(),
                    "preemptable_gpu":
                        pod_status.get("preemptable_gpu", Gpu()).to_dict(),
                })
            vc_preemptable_used[vc_name] += pod_preemptable_res

        # Account for all jobs without pods in the vc
        jobs_without_pods = vc_jobs_without_pods.get(vc_name, [])
        for job in jobs_without_pods:
            job_params = job["jobParams"]
            job_res_params = get_resource_params_from_job_params(job_params)
            job_res = ClusterResource(params=job_res_params)

            preemption_allowed = job_params.get("preemptionAllowed", False)
            if not preemption_allowed:
                vc_used[vc_name] += job_res
            else:
                vc_preemptable_used[vc_name] += job_res
            logger.info("Added job %s resource %s to the usage of vc %s",
                        job, job_res, vc_name)

    return vc_used, vc_preemptable_used
def get_cluster_schedulable_from_unschedulable(cluster_status):
    # Compute cluster schedulable resource
    cluster_capacity = ClusterResource(
        params={
            "cpu": cluster_status["cpu_capacity"],
            "memory": cluster_status["memory_capacity"],
            "gpu": cluster_status["gpu_capacity"],
        })
    cluster_unschedulable = ClusterResource(
        params={
            "cpu": cluster_status["cpu_unschedulable"],
            "memory": cluster_status["memory_unschedulable"],
            "gpu": cluster_status["gpu_unschedulable"],
        })
    cluster_schedulable = cluster_capacity - cluster_unschedulable
    cluster_schedulable = discount_cluster_resource(cluster_schedulable)
    return cluster_schedulable
def get_vc_info(vc_list):
    vc_info = {}
    for vc in vc_list:
        resource_quota = {}
        try:
            resource_quota = json.loads(vc["resourceQuota"])
        except Exception:
            logger.exception("Parsing resourceQuota failed for %s", vc)
        vc_info[vc["vcName"]] = ClusterResource(params=resource_quota)
    return vc_info
def test_idiv(self):
    v = copy.deepcopy(self.v1)
    v /= self.scalar
    expected = ClusterResource(
        params={
            "cpu": {"r1": "4", "r2": "8"},
            "memory": {"r1": "200Ki", "r2": "400Ki"},
            "gpu": {"r1": "2", "r2": "4"},
            "gpu_memory": {"r1": "200Ki", "r2": "400Ki"},
        })
    self.assertEqual(expected, v)

    v = copy.deepcopy(self.v1)
    v /= self.v3
    expected = ClusterResource(
        params={
            "cpu": {"r1": "4"},
            "memory": {"r1": "200Ki"},
            "gpu": {"r1": "1"},
            "gpu_memory": {"r1": "100Ki"},
        })
    self.assertEqual(expected, v)
def get_cluster_schedulable(cluster_status):
    # Compute cluster schedulable resource
    cluster_capacity = ClusterResource(
        params={
            "cpu": cluster_status["cpu_capacity"],
            "memory": cluster_status["memory_capacity"],
            "gpu": cluster_status["gpu_capacity"],
        })
    # On 1 node, reserved = unschedulable - used
    cluster_reserved = ClusterResource(
        params={
            "cpu": cluster_status["cpu_reserved"],
            "memory": cluster_status["memory_reserved"],
            "gpu": cluster_status["gpu_reserved"],
        })
    cluster_schedulable = cluster_capacity - cluster_reserved
    cluster_schedulable = discount_cluster_resource(cluster_schedulable)
    logger.info("cluster schedulable: %s", cluster_schedulable)
    return cluster_schedulable
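# Hypothetical standalone sketch (not part of the scheduler above) of why
# "capacity - reserved" differs from "capacity - unschedulable" when reserved
# is computed per node as unschedulable - used. All numbers are made up.
gpu_capacity = 12
gpu_unschedulable = 4          # one 4-GPU node is cordoned off
gpu_used_on_that_node = 3      # a job is still running on it
gpu_reserved = gpu_unschedulable - gpu_used_on_that_node  # only 1 GPU truly lost

# Subtracting all unschedulable GPUs also removes the running job's 3 GPUs,
# even though that job still counts against the schedulable total when jobs
# are marked schedulable.
assert gpu_capacity - gpu_unschedulable == 8
assert gpu_capacity - gpu_reserved == 11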
def get_jobs_info(jobs):
    priority_dict = get_priority_dict()

    jobs_info = []
    for job in jobs:
        job_status = job.get("jobStatus")
        if job_status in ["queued", "scheduling", "running"]:
            job_params = json.loads(base64decode(job["jobParams"]))
            preemption_allowed = job_params.get("preemptionAllowed", False)
            job_id = job_params["jobId"]

            job_res = get_resource_params_from_job_params(job_params)
            job_resource = ClusterResource(params=job_res)

            # Jobs are sorted by the keys below, in order:
            # 1. non-preemptible precedes preemptible
            # 2. running precedes scheduling, which precedes queued
            # 3. higher priority value precedes lower priority value
            # 4. earlier job time precedes later job time

            # Non-preemptible jobs first
            preemptible = 1 if preemption_allowed else 0

            # Job status
            job_status_key = 0
            if job["jobStatus"] == "scheduling":
                job_status_key = 1
            elif job["jobStatus"] == "queued":
                job_status_key = 2

            # Priority value
            reverse_priority = get_job_priority(priority_dict, job_id)
            priority = 999999 - reverse_priority

            # Job time
            queue_time = int(datetime.datetime.timestamp(job["lastUpdated"]))

            sort_key = "{}_{}_{:06d}_{}".format(preemptible, job_status_key,
                                                priority, queue_time)

            single_job_info = {
                "job": job,
                "preemptionAllowed": preemption_allowed,
                "jobId": job_id,
                "job_resource": job_resource,
                "sort_key": sort_key,
                "allowed": False,
            }
            jobs_info.append(single_job_info)

    jobs_info.sort(key=lambda x: x["sort_key"])
    return jobs_info
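# Minimal illustration (hypothetical values, not taken from the code above) of
# how the composite sort_key string orders jobs: non-preemptible first, then
# running before scheduling before queued, then higher priority (smaller
# 999999 - priority), then earlier queue time.
example_keys = [
    "1_0_999899_1585641000",  # preemptible, running
    "0_2_999899_1585642000",  # non-preemptible, queued
    "0_0_999899_1585641000",  # non-preemptible, running, priority 100
    "0_0_999499_1585640000",  # non-preemptible, running, priority 500
]
assert sorted(example_keys) == [
    "0_0_999499_1585640000",
    "0_0_999899_1585641000",
    "0_2_999899_1585642000",
    "1_0_999899_1585641000",
]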
def __get_valid_vc_usage(vc_info, vc_usage):
    valid_vc_usage = collections.defaultdict(lambda: ClusterResource())

    for vc_name, usage in vc_usage.items():
        if vc_name not in vc_info:
            logger.warning(
                "ignore used resource in %s. vc quota does not have this vc, "
                "possibly due to a job template error", vc_name)
        else:
            valid_vc_usage[vc_name] = usage

    return valid_vc_usage
def __get_cluster_resource_count(self):
    cluster = self.cluster_status
    capacity = ClusterResource(
        params={
            "cpu": cluster.cpu_capacity,
            "memory": cluster.memory_capacity,
            "gpu": cluster.gpu_capacity,
        })
    avail = ClusterResource(
        params={
            "cpu": cluster.cpu_available,
            "memory": cluster.memory_available,
            "gpu": cluster.gpu_available,
        })
    reserved = ClusterResource(
        params={
            "cpu": cluster.cpu_reserved,
            "memory": cluster.memory_reserved,
            "gpu": cluster.gpu_reserved,
        })
    return capacity, avail, reserved
def test_mul(self):
    result = self.v1 * self.scalar
    expected = ClusterResource(
        params={
            "cpu": {"r1": "1", "r2": "2"},
            "memory": {"r1": "50Ki", "r2": "100Ki"},
            "gpu": {"r1": "0.5", "r2": "1"},
            "gpu_memory": {"r1": "50Ki", "r2": "100Ki"},
        })
    self.assertEqual(expected, result)

    result = self.v1 * self.v3
    expected = ClusterResource(
        params={
            "cpu": {"r1": "1"},
            "memory": {"r1": "50Ki"},
            "gpu": {"r1": "1"},
            "gpu_memory": {"r1": "100Ki"},
        })
    self.assertEqual(expected, result)
def test_truediv(self):
    result = self.v1 / self.scalar
    expected = ClusterResource(
        params={
            "cpu": {"r1": "4", "r2": "8"},
            "memory": {"r1": "200Ki", "r2": "400Ki"},
            "gpu": {"r1": "2", "r2": "4"},
            "gpu_memory": {"r1": "200Ki", "r2": "400Ki"},
        })
    self.assertEqual(expected, result)

    result = self.v1 / self.v3
    expected = ClusterResource(
        params={
            "cpu": {"r1": "4"},
            "memory": {"r1": "200Ki"},
            "gpu": {"r1": "1"},
            "gpu_memory": {"r1": "100Ki"},
        })
    self.assertEqual(expected, result)
def get_vc_schedulables(cluster_status):
    # Compute VC schedulable resources
    vc_statuses = cluster_status.get("vc_statuses", {})

    vc_schedulables = {}
    for vc_name, vc_status in vc_statuses.items():
        vc_capacity = ClusterResource(
            params={
                "cpu": vc_status["cpu_capacity"],
                "memory": vc_status["memory_capacity"],
                "gpu": vc_status["gpu_capacity"],
            })
        vc_unschedulable = ClusterResource(
            params={
                "cpu": vc_status["cpu_unschedulable"],
                "memory": vc_status["memory_unschedulable"],
                "gpu": vc_status["gpu_unschedulable"],
            })
        vc_schedulable = vc_capacity - vc_unschedulable
        vc_schedulables[vc_name] = discount_cluster_resource(vc_schedulable)

    logger.info("vc schedulables: %s", vc_schedulables)
    return vc_schedulables
def __adjust_resource_status(self):
    # Adjust with jobs that have not been scheduled on k8s:
    # subtract from cluster available, add to cluster used.
    for job in self.jobs_without_pods:
        job_params = job["jobParams"]
        job_res_params = get_resource_params_from_job_params(job_params)
        job_res = ClusterResource(params=job_res_params)

        preemption_allowed = job_params.get("preemptionAllowed", False)
        if not preemption_allowed:
            self.gpu_available -= job_res.gpu
            self.cpu_available -= job_res.cpu
            self.memory_available -= job_res.memory

            self.gpu_used += job_res.gpu
            self.cpu_used += job_res.cpu
            self.memory_used += job_res.memory
            logger.info("Added job %s resource %s to used", job, job_res)
        else:
            self.gpu_preemptable_used += job_res.gpu
            self.cpu_preemptable_used += job_res.cpu
            self.memory_preemptable_used += job_res.memory
            logger.info("Added job %s resource %s to preemptable used",
                        job, job_res)

    # Account for pods without node assignment. This occurs when
    # fragmentation happens and the job manager still lets jobs through
    # because there is remaining quota.
    for name, pod_status in self.pods_without_node_assignment.items():
        if pod_status["preemption_allowed"] is False:
            self.gpu_used += pod_status["gpu"]
            self.cpu_used += pod_status["cpu"]
            self.memory_used += pod_status["memory"]

            self.gpu_available -= pod_status["gpu"]
            self.cpu_available -= pod_status["cpu"]
            self.memory_available -= pod_status["memory"]
        else:
            self.gpu_preemptable_used += pod_status["preemptable_gpu"]
            self.cpu_preemptable_used += pod_status["preemptable_cpu"]
            self.memory_preemptable_used += pod_status["preemptable_memory"]
def test_repr(self):
    v = ClusterResource(
        params={
            "cpu": {"r1": "1m"},
            "memory": {"r1": "100Ki"},
            "gpu": {"r1": "4"},
            "gpu_memory": {"r1": "200Ki"},
        })
    self.assertEqual(
        "{'cpu': {'r1': %s}, 'memory': {'r1': %s}, "
        "'gpu': {'r1': %s}, 'gpu_memory': {'r1': %s}}" %
        (0.001, float(102400), float(4), float(204800)), repr(v))
def test_iadd(self):
    self.v1 += self.v2
    expected = ClusterResource(
        params={
            "cpu": {"r1": "4", "r2": "6"},
            "memory": {"r1": "500Ki", "r2": "300Ki"},
            "gpu": {"r1": "5", "r2": "6"},
            "gpu_memory": {"r1": "500Ki", "r2": "600Ki"},
        })
    self.assertEqual(expected, self.v1)
def test_isub(self):
    self.v1 -= self.v2
    expected = ClusterResource(
        params={
            "cpu": {"r1": "0", "r2": "2"},
            "memory": {"r1": "0", "r2": "100Ki"},
            "gpu": {"r1": "0", "r2": "0"},
            "gpu_memory": {"r1": "0", "r2": "0"},
        })
    self.assertEqual(expected, self.v1)
def __adjust_user_statuses(self):
    # Adjust with jobs that have not been scheduled on k8s:
    # add to the corresponding user's usage.
    for job in self.jobs_without_pods:
        job_params = job["jobParams"]
        job_res_params = get_resource_params_from_job_params(job_params)
        job_res = ClusterResource(params=job_res_params)

        username = job["userName"].split("@")[0].strip()
        if username not in self.user_statuses:
            self.user_statuses[username] = {
                "gpu": Gpu(),
                "cpu": Cpu(),
                "memory": Memory(),
            }
        if username not in self.user_statuses_preemptable:
            self.user_statuses_preemptable[username] = {
                "gpu": Gpu(),
                "cpu": Cpu(),
                "memory": Memory(),
            }

        preemption_allowed = job_params.get("preemptionAllowed", False)
        if not preemption_allowed:
            self.user_statuses[username]["gpu"] += job_res.gpu
            self.user_statuses[username]["cpu"] += job_res.cpu
            self.user_statuses[username]["memory"] += job_res.memory
            logger.info("Added job %s resource %s to used for user %s",
                        job, job_res, username)
        else:
            self.user_statuses_preemptable[username]["gpu"] += job_res.gpu
            self.user_statuses_preemptable[username]["cpu"] += job_res.cpu
            self.user_statuses_preemptable[username]["memory"] += \
                job_res.memory
            logger.info(
                "Added job %s resource %s to preemptable used for user %s",
                job, job_res, username)
def test_gpu_accounting_real_case2_in_cs(self):
    # This is the same test as test_gpu_accounting_real_case2.
    # The purpose of this test is to make sure ClusterResource accounting
    # has the same logic as pure GPU accounting.
    vc_info = {
        "quantus": ClusterResource(params={"gpu": {"P40": 150}}),
        "relevance2": ClusterResource(params={"gpu": {"P40": 234}}),
        "relevance2-inf": ClusterResource(params={"gpu": {"P40": 40}}),
    }
    vc_usage = {
        "quantus": ClusterResource(params={"gpu": {"P40": 125}}),
        "relevance2": ClusterResource(params={"gpu": {"P40": 231}}),
        "relevance2-inf": ClusterResource(params={"gpu": {"P40": 0}}),
    }
    cluster_total = ClusterResource(params={"gpu": {"P40": 424}})
    cluster_avail = ClusterResource(params={"gpu": {"P40": 68}})
    cluster_unschedulable = ClusterResource(params={"gpu": {"P40": 1}})

    result = quota.calculate_vc_resources(cluster_total, cluster_avail,
                                          cluster_unschedulable, vc_info,
                                          vc_usage)
    vc_total, vc_used, vc_avail, vc_unschedulable = result

    self.assertEqual(vc_info, vc_total)
    self.assertEqual(vc_usage, vc_used)

    target_vc_available = {
        "quantus": ClusterResource(params={"gpu": {"P40": 25}}),
        "relevance2": ClusterResource(params={"gpu": {"P40": 2}}),
        "relevance2-inf": ClusterResource(params={"gpu": {"P40": 40}}),
    }
    self.assertEqual(target_vc_available, vc_avail)

    target_vc_unschedulable = {
        "quantus": ClusterResource(params={"gpu": {"P40": 0}}),
        "relevance2": ClusterResource(params={"gpu": {"P40": 1}}),
        "relevance2-inf": ClusterResource(params={"gpu": {"P40": 0}}),
    }
    self.assertEqual(target_vc_unschedulable, vc_unschedulable)
def test_gpu_accounting_real_case_in_cs(self):
    # This is the same test as test_gpu_accounting_real_case.
    # The purpose of this test is to make sure ClusterResource accounting
    # has the same logic as pure GPU accounting.
    vc_info = {
        "platform": ClusterResource(params={"gpu": {"P40": 48}}),
        "relevance": ClusterResource(params={"gpu": {"P40": 200}}),
        "quantus": ClusterResource(params={"gpu": {"P40": 100}}),
        "AU": ClusterResource(params={"gpu": {"P40": 20}}),
    }
    vc_usage = {
        "platform": ClusterResource(params={"gpu": {"P40": 57}}),
        "relevance": ClusterResource(params={"gpu": {"P40": 164}}),
        "quantus": ClusterResource(params={"gpu": {"P40": 93}}),
        "AU": ClusterResource(params={"gpu": {"P40": 0}}),
    }
    cluster_total = ClusterResource(params={"gpu": {"P40": 368}})
    cluster_avail = ClusterResource(params={"gpu": {"P40": 54}})
    cluster_unschedulable = ClusterResource()

    result = quota.calculate_vc_resources(cluster_total, cluster_avail,
                                          cluster_unschedulable, vc_info,
                                          vc_usage)
    vc_total, vc_used, vc_avail, vc_unschedulable = result

    self.assertEqual(vc_info, vc_total)
    self.assertEqual(vc_usage, vc_used)

    target_vc_avail = {
        "platform": ClusterResource(params={"gpu": {"P40": 0}}),
        "relevance": ClusterResource(params={"gpu": {"P40": 30}}),
        "quantus": ClusterResource(params={"gpu": {"P40": 6}}),
        "AU": ClusterResource(params={"gpu": {"P40": 17}}),
    }
    self.assertEqual(target_vc_avail, vc_avail)

    target_vc_unschedulable = {
        "platform": ClusterResource(params={"gpu": {"P40": 0}}),
        "relevance": ClusterResource(params={"gpu": {"P40": 6}}),
        "quantus": ClusterResource(params={"gpu": {"P40": 1}}),
        "AU": ClusterResource(params={"gpu": {"P40": 3}}),
    }
    self.assertEqual(target_vc_unschedulable, vc_unschedulable)
def test_gpu_accounting_move_quota_from_one_vc_to_another_in_cs(self):
    # This is the same test as
    # test_gpu_accounting_move_quota_from_one_vc_to_another.
    # The purpose of this test is to make sure ClusterResource accounting
    # has the same logic as pure GPU accounting.
    vc_info = {
        "A": ClusterResource(params={"gpu": {"P40": 20}}),
        "B": ClusterResource(params={"gpu": {"P40": 20}}),
    }
    # Previously A had a quota of 30 and used it all; later the admin moved
    # 10 to B.
    vc_usage = {
        "A": ClusterResource(params={"gpu": {"P40": 30}}),
        "B": ClusterResource(params={"gpu": {"P40": 5}}),
    }
    cluster_total = ClusterResource(params={"gpu": {"P40": 40}})
    cluster_avail = ClusterResource(params={"gpu": {"P40": 5}})
    cluster_unschedulable = ClusterResource()

    result = quota.calculate_vc_resources(cluster_total, cluster_avail,
                                          cluster_unschedulable, vc_info,
                                          vc_usage)
    vc_total, vc_used, vc_avail, vc_unschedulable = result

    self.assertEqual(vc_info, vc_total)
    self.assertEqual(vc_usage, vc_used)

    target_vc_avail = {
        "A": ClusterResource(params={"gpu": {"P40": 0}}),
        "B": ClusterResource(params={"gpu": {"P40": 5}}),
    }
    self.assertEqual(target_vc_avail, vc_avail)

    target_vc_unschedulable = {
        "A": ClusterResource(params={"gpu": {"P40": 0}}),
        "B": ClusterResource(params={"gpu": {"P40": 10}}),
    }
    self.assertEqual(target_vc_unschedulable, vc_unschedulable)
def test_gpu_accounting_idle_gpus_become_unscheduable_in_cs(self):
    # This is the same test as
    # test_gpu_accounting_idle_gpus_become_unscheduable.
    # The purpose of this test is to make sure ClusterResource accounting
    # has the same logic as pure GPU accounting.
    vc_info = {
        "A": ClusterResource(params={"gpu": {"P40": 40}}),
        "B": ClusterResource(params={"gpu": {"P40": 40}}),
        "C": ClusterResource(params={"gpu": {"P40": 40}}),
    }
    vc_usage = {
        "A": ClusterResource(params={"gpu": {"P40": 40}}),
        "B": ClusterResource(params={"gpu": {"P40": 31}}),
        "C": ClusterResource(params={"gpu": {"P40": 0}}),
    }
    cluster_total = ClusterResource(params={"gpu": {"P40": 120}})
    cluster_avail = ClusterResource(params={"gpu": {"P40": 29}})
    cluster_unschedulable = ClusterResource(params={"gpu": {"P40": 20}})

    result = quota.calculate_vc_resources(cluster_total, cluster_avail,
                                          cluster_unschedulable, vc_info,
                                          vc_usage)
    vc_total, vc_used, vc_avail, vc_unschedulable = result

    self.assertEqual(vc_info, vc_total)
    self.assertEqual(vc_usage, vc_used)

    target_vc_avail = {
        "A": ClusterResource(params={"gpu": {"P40": 0}}),
        "B": ClusterResource(params={"gpu": {"P40": 1}}),
        "C": ClusterResource(params={"gpu": {"P40": 27}}),
    }
    self.assertEqual(target_vc_avail, vc_avail)

    target_vc_unschedulable = {
        "A": ClusterResource(params={"gpu": {"P40": 0}}),
        "B": ClusterResource(params={"gpu": {"P40": 8}}),
        "C": ClusterResource(params={"gpu": {"P40": 13}}),
    }
    self.assertEqual(target_vc_unschedulable, vc_unschedulable)
def calculate_vc_resources(cluster_capacity, cluster_avail, cluster_reserved,
                           vc_info, vc_usage):
    """Calculates vc resources based on cluster resources and vc info.

    Qi' = Qi - R * (Qi / sum(Qi))
    Qi'' = max(Qi' - Ui, 0)
    Ai = A * (Qi'' / sum(Qi''))

    Where
    - R: cluster reserved
    - A: cluster avail
    - Qi: vc quota
    - Ui: vc used

    Args:
        cluster_capacity: Total resource capacity in the cluster
        cluster_avail: Currently available resource in the cluster
        cluster_reserved: Currently reserved resource in the cluster
        vc_info: VC quota information
        vc_usage: Currently used resource by VC in the cluster

    Returns:
        Qi: vc_total
        Ui: vc_used
        Ai: vc_avail
        max(Qi - Ui - Ai, 0): vc_unschedulable
    """
    logger.debug("cluster_capacity %s, cluster_avail %s, cluster_reserved %s",
                 cluster_capacity, cluster_avail, cluster_reserved)
    logger.debug("vc_info %s, vc_usage %s", vc_info, vc_usage)

    vc_usage = __get_valid_vc_usage(vc_info, vc_usage)

    vc_total = collections.defaultdict(lambda: ClusterResource())
    vc_used = collections.defaultdict(lambda: ClusterResource())
    vc_avail = collections.defaultdict(lambda: ClusterResource())
    vc_unschedulable = collections.defaultdict(lambda: ClusterResource())

    # vc total == assigned quota
    for vc_name, quota in vc_info.items():
        vc_total[vc_name] = copy.deepcopy(quota)

    quota_sum = ClusterResource()
    for vc_name, quota in vc_info.items():
        quota_sum += quota

    # Ratios for calculating vc avail
    # Qi' = Qi - R * (Qi / sum(Qi))
    # Qi'' = max(Qi' - Ui, 0)
    ratios = collections.defaultdict(lambda: ClusterResource())
    for vc_name, quota in vc_info.items():
        reserved = (cluster_reserved * quota / quota_sum).ceil  # over-reserve
        used = vc_usage.get(vc_name, ClusterResource())
        ratio = quota - reserved
        ratios[vc_name] = ratio - used

    ratio_sum = ClusterResource()
    for vc_name, ratio in ratios.items():
        ratio_sum += ratio

    logger.debug("ratios %s, ratio_sum %s", ratios, ratio_sum)

    # Calculate avail and unschedulable
    # Ai = A * (Qi'' / sum(Qi''))
    # max(Qi - Ui - Ai, 0)
    for vc_name, ratio in ratios.items():
        used = copy.deepcopy(vc_usage.get(vc_name, ClusterResource()))
        avail = (cluster_avail * ratio / ratio_sum).floor  # under-avail
        quota = vc_total.get(vc_name, ClusterResource())
        vc_used[vc_name] = used
        vc_avail[vc_name] = avail
        vc_unschedulable[vc_name] = quota - used - avail

    logger.debug("vc_total %s, vc_used %s, vc_avail %s, vc_unschedulable %s",
                 vc_total, vc_used, vc_avail, vc_unschedulable)

    return vc_total, vc_used, vc_avail, vc_unschedulable
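# Hedged worked example of the formulas in calculate_vc_resources, using plain
# floats rather than ClusterResource (whose subtraction, judging from the
# tests, clamps at zero). Quotas, usage, reserved and avail below are made up.
Q = {"A": 20.0, "B": 20.0}       # Qi: vc quota
U = {"A": 30.0, "B": 5.0}        # Ui: vc used
R_total, A_total = 5.0, 5.0      # cluster reserved / cluster avail

q_sum = sum(Q.values())
# Qi'' = max(Qi - R * Qi / sum(Qi) - Ui, 0)
ratios = {vc: max(Q[vc] - R_total * Q[vc] / q_sum - U[vc], 0.0) for vc in Q}
r_sum = sum(ratios.values())
# Ai = A * Qi'' / sum(Qi'')
avail = {vc: A_total * ratios[vc] / r_sum for vc in Q}
unschedulable = {vc: max(Q[vc] - U[vc] - avail[vc], 0.0) for vc in Q}

# VC "A" is over quota, so its ratio is 0 and all available GPUs go to "B":
assert avail == {"A": 0.0, "B": 5.0}
assert unschedulable == {"A": 0.0, "B": 10.0}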
def test_mark_schedulable_non_preemptable_gpu_jobs(self):
    # job1 is running on an unschedulable node
    job1_info = {
        "job": {
            "vcName": "platform",
            "jobId": "job1",
        },
        "jobId": "job1",
        "job_resource": ClusterResource(
            params={
                "cpu": {"Standard_ND24rs": 1},
                "memory": {"Standard_ND24rs": 0},
                "gpu": {"Standard_ND24rs": 3},
                "gpu_memory": {"Standard_ND24rs": 0},
            }),
        "preemptionAllowed": False,
        "sort_key": "0_0_999899_2020-03-31 08:07:46",
        "allowed": False,
    }
    # job2 is running on a good node
    job2_info = {
        "job": {
            "vcName": "platform",
            "jobId": "job2",
        },
        "jobId": "job2",
        "job_resource": ClusterResource(
            params={
                "cpu": {"Standard_ND24rs": 1},
                "memory": {"Standard_ND24rs": 0},
                "gpu": {"Standard_ND24rs": 4},
                "gpu_memory": {"Standard_ND24rs": 0},
            }),
        "preemptionAllowed": False,
        "sort_key": "0_0_999899_2020-03-31 08:08:49",
        "allowed": False,
    }
    # job3 was submitted just now
    job3_info = {
        "job": {
            "vcName": "platform",
            "jobId": "job3",
        },
        "jobId": "job3",
        "job_resource": ClusterResource(
            params={
                "cpu": {"Standard_ND24rs": 1},
                "memory": {"Standard_ND24rs": 0},
                "gpu": {"Standard_ND24rs": 4},
                "gpu_memory": {"Standard_ND24rs": 0},
            }),
        "preemptionAllowed": False,
        "sort_key": "0_2_999899_2020-03-31 09:00:10",
        "allowed": False,
    }
    jobs_info = [job1_info, job2_info, job3_info]

    cluster_status = {
        "gpu_capacity": {"Standard_ND24rs": 12},
        "gpu_reserved": {"Standard_ND24rs": 0},
        "gpu_unschedulable": {"Standard_ND24rs": 4},
        "cpu_capacity": {"Standard_ND24rs": 72},
        "cpu_reserved": {"Standard_ND24rs": 23},
        "cpu_unschedulable": {"Standard_ND24rs": 24},
        "memory_capacity": {"Standard_ND24rs": "1344Gi"},
        "memory_reserved": {"Standard_ND24rs": "448Gi"},
        "memory_unschedulable": {"Standard_ND24rs": "448Gi"},
    }

    cluster_capacity = ClusterResource(
        params={
            "cpu": cluster_status["cpu_capacity"],
            "memory": cluster_status["memory_capacity"],
            "gpu": cluster_status["gpu_capacity"],
        })
    cluster_reserved = ClusterResource(
        params={
            "cpu": cluster_status["cpu_reserved"],
            "memory": cluster_status["memory_reserved"],
            "gpu": cluster_status["gpu_reserved"],
        })
    cluster_unschedulable = ClusterResource(
        params={
            "cpu": cluster_status["cpu_unschedulable"],
            "memory": cluster_status["memory_unschedulable"],
            "gpu": cluster_status["gpu_unschedulable"],
        })

    vc_capacity = ClusterResource(
        params={
            "cpu": cluster_status["cpu_capacity"],
            "memory": cluster_status["memory_capacity"],
            "gpu": cluster_status["gpu_capacity"],
        })
    vc_unschedulable = ClusterResource(
        params={
            "cpu": cluster_status["cpu_reserved"],
            "memory": cluster_status["memory_reserved"],
            "gpu": cluster_status["gpu_reserved"],
        })
    vc_schedulable = discount_cluster_resource(vc_capacity - vc_unschedulable)
    vc_schedulables = {"platform": vc_schedulable}

    # job3 should be schedulable, but is not marked as such when using
    # cluster_schedulable = cluster_capacity - cluster_unschedulable
    c_schedulable = discount_cluster_resource(cluster_capacity -
                                              cluster_unschedulable)
    jobs_info_list = copy.deepcopy(jobs_info)
    mark_schedulable_non_preemptable_jobs(jobs_info_list, c_schedulable,
                                          copy.deepcopy(vc_schedulables))
    self.assertTrue(jobs_info_list[0]["allowed"])
    self.assertTrue(jobs_info_list[1]["allowed"])
    self.assertFalse(jobs_info_list[2]["allowed"])

    # job3 is marked schedulable, as it should be, when using
    # cluster_schedulable = cluster_capacity - cluster_reserved
    c_schedulable = discount_cluster_resource(cluster_capacity -
                                              cluster_reserved)
    jobs_info_list = copy.deepcopy(jobs_info)
    mark_schedulable_non_preemptable_jobs(jobs_info_list, c_schedulable,
                                          copy.deepcopy(vc_schedulables))
    self.assertTrue(jobs_info_list[0]["allowed"])
    self.assertTrue(jobs_info_list[1]["allowed"])
    self.assertTrue(jobs_info_list[2]["allowed"])
def test_calculate_vc_resources(self):
    cluster_capacity = ClusterResource(
        params={
            "cpu": {"r1": 30, "r2": 40, "": 4},
            "memory": {"r1": "300Gi", "r2": "400Gi", "": "16Gi"},
            "gpu": {"r1": 16, "": 4},
            "gpu_memory": {"r1": "256Gi", "": "64Gi"},
        })
    cluster_avail = ClusterResource(
        params={
            "cpu": {"r1": 17, "r2": 2, "": 2},
            "memory": {"r1": "230Gi", "r2": "100Gi", "": "8Gi"},
            "gpu": {"r1": 7},
            "gpu_memory": {"r1": "112Gi"},
        })
    cluster_reserved = ClusterResource(
        params={
            "cpu": {"r1": 4},
            "memory": {"r1": "20Gi"},
        })

    vc_info = {
        "vc1": ClusterResource(
            params={
                "cpu": {"r1": 10, "r2": 40},
                "memory": {"r1": "100Gi", "r2": "400Gi"},
                "gpu": {"r1": 12},
                "gpu_memory": {"r1": "192Gi"},
            }),
        "vc2": ClusterResource(
            params={
                "cpu": {"r1": 20, "": 4},
                "memory": {"r1": "200Gi", "": "16Gi"},
                "gpu": {"r1": 4, "": 4},
                "gpu_memory": {"r1": "64Gi", "": "64Gi"},
            }),
    }
    vc_usage = {
        "vc1": ClusterResource(
            params={
                "cpu": {"r1": 9, "r2": 38},
                "memory": {"r1": "50Gi", "r2": "300Gi"},
                "gpu": {"r1": 8},
                "gpu_memory": {"r1": "128Gi"},
            }),
        "vc2": ClusterResource(
            params={
                "cpu": {"": 2},
                "memory": {"": "8Gi"},
                "gpu": {"r1": 1, "": 4},
                "gpu_memory": {"r1": "16Gi", "": "64Gi"},
            }),
    }

    result = quota.calculate_vc_resources(cluster_capacity, cluster_avail,
                                          cluster_reserved, vc_info, vc_usage)
    vc_total, vc_used, vc_avail, vc_unschedulable = result

    self.assertEqual(vc_info, vc_total)
    self.assertEqual(vc_usage, vc_used)

    expected_vc_avail = {
        "vc1": ClusterResource(
            params={
                "cpu": {"r1": 0, "r2": 2},
                "memory": {"r1": "46528812373", "r2": "100Gi"},
                "gpu": {"r1": 4},
                "gpu_memory": {"r1": "64Gi"},
            }),
        "vc2": ClusterResource(
            params={
                "cpu": {"r1": 17, "": 2},
                "memory": {"r1": "200431807146", "": "8Gi"},
                "gpu": {"r1": 3},
                "gpu_memory": {"r1": "48Gi"},
            }),
    }
    self.assertEqual(expected_vc_avail, vc_avail)

    expected_vc_unschedulable = {
        "vc1": ClusterResource(
            params={
                "cpu": {"r1": 1, "r2": 0},
                "memory": {"r1": "7158278827", "r2": "0"},
            }),
        "vc2": ClusterResource(
            params={
                "cpu": {"r1": 3, "": 0},
                "memory": {"r1": "14316557654", "": "0"},
            }),
    }
    self.assertEqual(expected_vc_unschedulable, vc_unschedulable)