    def test_ceil(self):
        v = ClusterResource(
            params={
                "cpu": {
                    "r1": "1.5",
                },
                "memory": {
                    "r1": "100.2",
                },
                "gpu": {
                    "r1": "10.4",
                },
                "gpu_memory": {
                    "r1": "199.9",
                },
            }).ceil
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "2",
                },
                "memory": {
                    "r1": "101",
                },
                "gpu": {
                    "r1": "11",
                },
                "gpu_memory": {
                    "r1": "200",
                },
            })
        self.assertEqual(expected, v)

    def test_floor(self):
        v = ClusterResource(
            params={
                "cpu": {
                    "r1": "1.5",
                },
                "memory": {
                    "r1": "100.2",
                },
                "gpu": {
                    "r1": "10.4",
                },
                "gpu_memory": {
                    "r1": "199.9",
                },
            }).floor
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "1",
                },
                "memory": {
                    "r1": "100",
                },
                "gpu": {
                    "r1": "10",
                },
                "gpu_memory": {
                    "r1": "199",
                },
            })
        self.assertEqual(expected, v)
    def setUp(self):
        v1_params = {
            "cpu": {
                "r1": "2",
                "r2": "4",
            },
            "memory": {
                "r1": "100Ki",
                "r2": "200Ki",
            },
            "gpu": {
                "r1": "1",
                "r2": "2",
            },
            "gpu_memory": {
                "r1": "100Ki",
                "r2": "200Ki",
            },
        }
        self.v1 = ClusterResource(params=v1_params)

        v2_params = {
            "cpu": {
                "r1": "2",
                "r2": "2",
            },
            "memory": {
                "r1": "400Ki",
                "r2": "100Ki",
            },
            "gpu": {
                "r1": "4",
                "r2": "4",
            },
            "gpu_memory": {
                "r1": "400Ki",
                "r2": "400Ki",
            },
        }
        self.v2 = ClusterResource(params=v2_params)

        v3_params = {
            "cpu": {
                "r1": "0.5",
            },
            "memory": {
                "r1": "0.5",
            },
            "gpu": {
                "r1": "1",
            },
            "gpu_memory": {
                "r1": "1",
            },
        }
        self.v3 = ClusterResource(params=v3_params)

        self.scalar = 0.5
    def __get_vc_used(self, vc_pod_statuses, vc_jobs_without_pods):
        vc_used = collections.defaultdict(lambda: ClusterResource())
        vc_preemptable_used = collections.defaultdict(lambda: ClusterResource())

        for vc_name in self.vc_info:
            # Account all pods in vc
            pod_statuses = vc_pod_statuses.get(vc_name, {})

            for _, pod_status in pod_statuses.items():
                pod_res = ClusterResource(
                    params={
                        "cpu": pod_status.get("cpu", Cpu()).to_dict(),
                        "memory": pod_status.get("memory", Memory()).to_dict(),
                        "gpu": pod_status.get("gpu", Gpu()).to_dict(),
                    })
                vc_used[vc_name] += pod_res

                pod_preemptable_res = ClusterResource(
                    params={
                        "preemptable_cpu":
                            pod_status.get("preemptable_cpu", Cpu()).to_dict(),
                        "preemptable_memory":
                            pod_status.get("preemptable_memory", Memory()
                                          ).to_dict(),
                        "preemptable_gpu":
                            pod_status.get("preemptable_gpu", Gpu()).to_dict(),
                    })
                vc_preemptable_used[vc_name] += pod_preemptable_res

            # Account all jobs without pods in vc
            jobs_without_pods = vc_jobs_without_pods.get(vc_name, [])
            for job in jobs_without_pods:
                job_params = job["jobParams"]
                job_res_params = get_resource_params_from_job_params(job_params)
                job_res = ClusterResource(params=job_res_params)

                preemption_allowed = job_params.get("preemptionAllowed", False)
                if not preemption_allowed:
                    vc_used[vc_name] += job_res
                else:
                    vc_preemptable_used[vc_name] += job_res
                logger.info("Added job %s resource %s to the usage of vc %s",
                            job, job_res, vc_name)

        return vc_used, vc_preemptable_used
def get_cluster_schedulable_from_unschedulable(cluster_status):
    # Compute cluster schedulable resource
    cluster_capacity = ClusterResource(
        params={
            "cpu": cluster_status["cpu_capacity"],
            "memory": cluster_status["memory_capacity"],
            "gpu": cluster_status["gpu_capacity"],
        })
    cluster_unschedulable = ClusterResource(
        params={
            "cpu": cluster_status["cpu_unschedulable"],
            "memory": cluster_status["memory_unschedulable"],
            "gpu": cluster_status["gpu_unschedulable"],
        })

    cluster_schedulable = cluster_capacity - cluster_unschedulable
    cluster_schedulable = discount_cluster_resource(cluster_schedulable)
    return cluster_schedulable
def get_vc_info(vc_list):
    vc_info = {}
    for vc in vc_list:
        resource_quota = {}
        try:
            resource_quota = json.loads(vc["resourceQuota"])
        except Exception:
            logger.exception("Parsing resourceQuota failed for %s", vc)
        vc_info[vc["vcName"]] = ClusterResource(params=resource_quota)
    return vc_info
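
A minimal usage sketch for get_vc_info with hypothetical rows; "vcName" and "resourceQuota" are the keys the function reads, and a row whose resourceQuota fails to parse falls back to an empty quota:

vc_list = [
    {"vcName": "platform", "resourceQuota": '{"gpu": {"P40": 48}}'},
    {"vcName": "relevance", "resourceQuota": "not-json"},  # parse failure is logged
]
vc_info = get_vc_info(vc_list)
# vc_info["platform"] == ClusterResource(params={"gpu": {"P40": 48}})
# vc_info["relevance"] == ClusterResource(params={})   # empty quota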
    def test_idiv(self):
        v = copy.deepcopy(self.v1)
        v /= self.scalar
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "4",
                    "r2": "8",
                },
                "memory": {
                    "r1": "200Ki",
                    "r2": "400Ki",
                },
                "gpu": {
                    "r1": "2",
                    "r2": "4",
                },
                "gpu_memory": {
                    "r1": "200Ki",
                    "r2": "400Ki",
                },
            })
        self.assertEqual(expected, v)

        v = copy.deepcopy(self.v1)
        v /= self.v3
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "4",
                },
                "memory": {
                    "r1": "200Ki",
                },
                "gpu": {
                    "r1": "1",
                },
                "gpu_memory": {
                    "r1": "100Ki",
                },
            })
        self.assertEqual(expected, v)
Example 8
def get_cluster_schedulable(cluster_status):
    # Compute cluster schedulable resource
    cluster_capacity = ClusterResource(
        params={
            "cpu": cluster_status["cpu_capacity"],
            "memory": cluster_status["memory_capacity"],
            "gpu": cluster_status["gpu_capacity"],
        })
    # On 1 node, reserved = unschedulable - used
    cluster_reserved = ClusterResource(
        params={
            "cpu": cluster_status["cpu_reserved"],
            "memory": cluster_status["memory_reserved"],
            "gpu": cluster_status["gpu_reserved"],
        })

    cluster_schedulable = cluster_capacity - cluster_reserved
    cluster_schedulable = discount_cluster_resource(cluster_schedulable)
    logger.info("cluster schedulable: %s", cluster_schedulable)
    return cluster_schedulable
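
A scalar sketch, with assumed numbers, of why this variant subtracts reserved rather than unschedulable. Per the comment above, on a single node reserved = unschedulable - used, so capacity - reserved keeps counting resources that running jobs already hold on cordoned nodes (see test_mark_schedulable_non_preemptable_gpu_jobs below):

# Assumed numbers: a 4-GPU node is cordoned while a job still holds 3 GPUs.
capacity = 4
unschedulable = 4                 # the whole node is marked unschedulable
used = 3                          # GPUs held by the running job
reserved = unschedulable - used   # 1 GPU genuinely idle on the bad node

print(capacity - unschedulable)   # 0: the running job's GPUs are excluded
print(capacity - reserved)        # 3: GPUs in use remain schedulable quota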
Example 9
def get_jobs_info(jobs):
    priority_dict = get_priority_dict()

    jobs_info = []
    for job in jobs:
        job_status = job.get("jobStatus")
        if job_status in ["queued", "scheduling", "running"]:
            job_params = json.loads(base64decode(job["jobParams"]))
            preemption_allowed = job_params.get("preemptionAllowed", False)
            job_id = job_params["jobId"]

            job_res = get_resource_params_from_job_params(job_params)
            job_resource = ClusterResource(params=job_res)

            # Jobs are sorted by the following keys, in this order:
            # 1. non-preemptible precedes preemptible
            # 2. running precedes scheduling, which precedes queued
            # 3. higher priority value precedes lower priority value
            # 4. earlier queue time precedes later queue time

            # Non-Preemptible jobs first
            preemptible = 1 if preemption_allowed else 0

            # Job status
            job_status_key = 0
            if job["jobStatus"] == "scheduling":
                job_status_key = 1
            elif job["jobStatus"] == "queued":
                job_status_key = 2

            # Priority value, inverted so higher-priority jobs sort first
            reverse_priority = get_job_priority(priority_dict, job_id)
            priority = 999999 - reverse_priority

            # Job time
            queue_time = int(datetime.datetime.timestamp(job["lastUpdated"]))

            sort_key = "{}_{}_{:06d}_{}".format(preemptible, job_status_key,
                                                priority, queue_time)

            single_job_info = {
                "job": job,
                "preemptionAllowed": preemption_allowed,
                "jobId": job_id,
                "job_resource": job_resource,
                "sort_key": sort_key,
                "allowed": False,
            }

            jobs_info.append(single_job_info)

    jobs_info.sort(key=lambda x: x["sort_key"])
    return jobs_info
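
Since sort_key is a plain string, lexicographic sorting realizes the four rules above. A small, self-contained illustration with hypothetical keys (priority 100 becomes 999999 - 100 = 999899; queue times are epoch seconds):

keys = [
    "1_0_999899_1585642066",  # preemptible, running
    "0_2_999899_1585645210",  # non-preemptible, queued
    "0_0_999899_1585642129",  # non-preemptible, running, later
    "0_0_999899_1585642066",  # non-preemptible, running, earlier
]
print(sorted(keys))
# ['0_0_999899_1585642066', '0_0_999899_1585642129',
#  '0_2_999899_1585645210', '1_0_999899_1585642066']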
Example 10
def __get_valid_vc_usage(vc_info, vc_usage):
    valid_vc_usage = collections.defaultdict(lambda: ClusterResource())

    for vc_name, usage in vc_usage.items():
        if vc_name not in vc_info:
            logger.warning(
                "Ignoring used resource in %s. vc quota does not have this "
                "vc, possibly due to a job template error", vc_name)
        else:
            valid_vc_usage[vc_name] = usage

    return valid_vc_usage
    def __get_cluster_resource_count(self):
        cluster = self.cluster_status
        capacity = ClusterResource(
            params={
                "cpu": cluster.cpu_capacity,
                "memory": cluster.memory_capacity,
                "gpu": cluster.gpu_capacity,
            })
        avail = ClusterResource(
            params={
                "cpu": cluster.cpu_available,
                "memory": cluster.memory_available,
                "gpu": cluster.gpu_available,
            })
        reserved = ClusterResource(
            params={
                "cpu": cluster.cpu_reserved,
                "memory": cluster.memory_reserved,
                "gpu": cluster.gpu_reserved,
            })
        return capacity, avail, reserved
    def test_mul(self):
        result = self.v1 * self.scalar
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "1",
                    "r2": "2",
                },
                "memory": {
                    "r1": "50Ki",
                    "r2": "100Ki",
                },
                "gpu": {
                    "r1": "0.5",
                    "r2": "1",
                },
                "gpu_memory": {
                    "r1": "50Ki",
                    "r2": "100Ki",
                },
            })
        self.assertEqual(expected, result)

        result = self.v1 * self.v3
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "1",
                },
                "memory": {
                    "r1": "50Ki",
                },
                "gpu": {
                    "r1": "1",
                },
                "gpu_memory": {
                    "r1": "100Ki",
                },
            })
        self.assertEqual(expected, result)
    def test_truediv(self):
        result = self.v1 / self.scalar
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "4",
                    "r2": "8",
                },
                "memory": {
                    "r1": "200Ki",
                    "r2": "400Ki",
                },
                "gpu": {
                    "r1": "2",
                    "r2": "4",
                },
                "gpu_memory": {
                    "r1": "200Ki",
                    "r2": "400Ki",
                },
            })
        self.assertEqual(expected, result)

        result = self.v1 / self.v3
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "4",
                },
                "memory": {
                    "r1": "200Ki",
                },
                "gpu": {
                    "r1": "1",
                },
                "gpu_memory": {
                    "r1": "100Ki",
                },
            })
        self.assertEqual(expected, result)
Example 14
def get_vc_schedulables(cluster_status):
    # Compute VC schedulable resources
    vc_statuses = cluster_status.get("vc_statuses", {})
    vc_schedulables = {}
    for vc_name, vc_status in vc_statuses.items():
        vc_capacity = ClusterResource(
            params={
                "cpu": vc_status["cpu_capacity"],
                "memory": vc_status["memory_capacity"],
                "gpu": vc_status["gpu_capacity"],
            })
        vc_unschedulable = ClusterResource(
            params={
                "cpu": vc_status["cpu_unschedulable"],
                "memory": vc_status["memory_unschedulable"],
                "gpu": vc_status["gpu_unschedulable"],
            })
        vc_schedulable = vc_capacity - vc_unschedulable
        vc_schedulables[vc_name] = discount_cluster_resource(vc_schedulable)

    logger.info("vc schedulables: %s", vc_schedulables)
    return vc_schedulables
Example 15
    def __adjust_resource_status(self):
        # Adjust for jobs that have not yet been scheduled on k8s:
        # subtract their resources from cluster available and add them
        # to cluster used.
        for job in self.jobs_without_pods:
            job_params = job["jobParams"]
            job_res_params = get_resource_params_from_job_params(job_params)
            job_res = ClusterResource(params=job_res_params)

            preemption_allowed = job_params.get("preemptionAllowed", False)
            if not preemption_allowed:
                self.gpu_available -= job_res.gpu
                self.cpu_available -= job_res.cpu
                self.memory_available -= job_res.memory

                self.gpu_used += job_res.gpu
                self.cpu_used += job_res.cpu
                self.memory_used += job_res.memory
                logger.info("Added job %s resource %s to used", job, job_res)
            else:
                self.gpu_preemptable_used += job_res.gpu
                self.cpu_preemptable_used += job_res.cpu
                self.memory_preemptable_used += job_res.memory
                logger.info("Added job %s resource %s to preemptable used",
                            job, job_res)

        # Account for pods without node assignment.
        # This occurs when fragmentation happens and the job manager still
        # lets jobs through because there is remaining quota.
        for name, pod_status in self.pods_without_node_assignment.items():
            if pod_status["preemption_allowed"] is False:
                self.gpu_used += pod_status["gpu"]
                self.cpu_used += pod_status["cpu"]
                self.memory_used += pod_status["memory"]

                self.gpu_available -= pod_status["gpu"]
                self.cpu_available -= pod_status["cpu"]
                self.memory_available -= pod_status["memory"]
            else:
                self.gpu_preemptable_used += pod_status["preemptable_gpu"]
                self.cpu_preemptable_used += pod_status["preemptable_cpu"]
                self.memory_preemptable_used += pod_status[
                    "preemptable_memory"]
    def test_repr(self):
        v = ClusterResource(
            params={
                "cpu": {
                    "r1": "1m",
                },
                "memory": {
                    "r1": "100Ki",
                },
                "gpu": {
                    "r1": "4",
                },
                "gpu_memory": {
                    "r1": "200Ki",
                },
            })
        self.assertEqual(
            "{'cpu': {'r1': %s}, 'memory': {'r1': %s}, "
            "'gpu': {'r1': %s}, 'gpu_memory': {'r1': %s}}" %
            (0.001, float(102400), float(4), float(204800)), repr(v))

    def test_iadd(self):
        self.v1 += self.v2
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "4",
                    "r2": "6",
                },
                "memory": {
                    "r1": "500Ki",
                    "r2": "300Ki",
                },
                "gpu": {
                    "r1": "5",
                    "r2": "6",
                },
                "gpu_memory": {
                    "r1": "500Ki",
                    "r2": "600Ki",
                },
            })
        self.assertEqual(expected, self.v1)

    def test_isub(self):
        self.v1 -= self.v2
        expected = ClusterResource(
            params={
                "cpu": {
                    "r1": "0",
                    "r2": "2",
                },
                "memory": {
                    "r1": "0",
                    "r2": "100Ki",
                },
                "gpu": {
                    "r1": "0",
                    "r2": "0",
                },
                "gpu_memory": {
                    "r1": "0",
                    "r2": "0",
                },
            })
        self.assertEqual(expected, self.v1)
Example 19
    def __adjust_user_statuses(self):
        # Adjust for jobs that have not yet been scheduled on k8s:
        # add their resources to the corresponding user's usage.
        for job in self.jobs_without_pods:
            job_params = job["jobParams"]
            job_res_params = get_resource_params_from_job_params(job_params)
            job_res = ClusterResource(params=job_res_params)
            username = job["userName"].split("@")[0].strip()

            if username not in self.user_statuses:
                self.user_statuses[username] = {
                    "gpu": Gpu(),
                    "cpu": Cpu(),
                    "memory": Memory()
                }
            if username not in self.user_statuses_preemptable:
                self.user_statuses_preemptable[username] = {
                    "gpu": Gpu(),
                    "cpu": Cpu(),
                    "memory": Memory()
                }

            preemption_allowed = job_params.get("preemptionAllowed", False)
            if not preemption_allowed:
                self.user_statuses[username]["gpu"] += job_res.gpu
                self.user_statuses[username]["cpu"] += job_res.cpu
                self.user_statuses[username]["memory"] += job_res.memory
                logger.info("Added job %s resource %s to used for user %s",
                            job, job_res, username)
            else:
                self.user_statuses_preemptable[username]["gpu"] += job_res.gpu
                self.user_statuses_preemptable[username]["cpu"] += job_res.cpu
                self.user_statuses_preemptable[username]["memory"] += \
                    job_res.memory
                logger.info(
                    "Added job %s resource %s to preemptable used for "
                    "user %s", job, job_res, username)
Example 20
    def test_gpu_accounting_real_case2_in_cs(self):
        # This is the same test as test_gpu_accounting_real_case2.
        # The purpose of this test is to make sure ClusterResource accounting
        # has the same logic as pure GPU accounting.
        vc_info = {
            "quantus": ClusterResource(params={"gpu": {
                "P40": 150
            }}),
            "relevance2": ClusterResource(params={"gpu": {
                "P40": 234
            }}),
            "relevance2-inf": ClusterResource(params={"gpu": {
                "P40": 40
            }}),
        }

        vc_usage = {
            "quantus": ClusterResource(params={"gpu": {
                "P40": 125
            }}),
            "relevance2": ClusterResource(params={"gpu": {
                "P40": 231
            }}),
            "relevance2-inf": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
        }

        cluster_total = ClusterResource(params={"gpu": {"P40": 424}})
        cluster_avail = ClusterResource(params={"gpu": {"P40": 68}})
        cluster_unschedulable = ClusterResource(params={"gpu": {"P40": 1}})

        result = quota.calculate_vc_resources(cluster_total, cluster_avail,
                                              cluster_unschedulable, vc_info,
                                              vc_usage)

        vc_total, vc_used, vc_avail, vc_unschedulable = result

        self.assertEqual(vc_info, vc_total)
        self.assertEqual(vc_usage, vc_used)

        target_vc_available = {
            "quantus": ClusterResource(params={"gpu": {
                "P40": 25
            }}),
            "relevance2": ClusterResource(params={"gpu": {
                "P40": 2
            }}),
            "relevance2-inf": ClusterResource(params={"gpu": {
                "P40": 40
            }}),
        }

        self.assertEqual(target_vc_available, vc_avail)

        target_vc_unschedulable = {
            "quantus": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
            "relevance2": ClusterResource(params={"gpu": {
                "P40": 1
            }}),
            "relevance2-inf": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
        }

        self.assertEqual(target_vc_unschedulable, vc_unschedulable)
Example 21
    def test_gpu_accounting_real_case_in_cs(self):
        # This is the same test as test_gpu_accounting_real_case.
        # The purpose of this test is to make sure ClusterResource accounting
        # has the same logic as pure GPU accounting.
        vc_info = {
            "platform": ClusterResource(params={"gpu": {
                "P40": 48
            }}),
            "relevance": ClusterResource(params={"gpu": {
                "P40": 200
            }}),
            "quantus": ClusterResource(params={"gpu": {
                "P40": 100
            }}),
            "AU": ClusterResource(params={"gpu": {
                "P40": 20
            }}),
        }

        vc_usage = {
            "platform": ClusterResource(params={"gpu": {
                "P40": 57
            }}),
            "relevance": ClusterResource(params={"gpu": {
                "P40": 164
            }}),
            "quantus": ClusterResource(params={"gpu": {
                "P40": 93
            }}),
            "AU": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
        }

        cluster_total = ClusterResource(params={"gpu": {"P40": 368}})
        cluster_avail = ClusterResource(params={"gpu": {"P40": 54}})
        cluster_unschedulable = ClusterResource()

        result = quota.calculate_vc_resources(cluster_total, cluster_avail,
                                              cluster_unschedulable, vc_info,
                                              vc_usage)

        vc_total, vc_used, vc_avail, vc_unschedulable = result

        self.assertEqual(vc_info, vc_total)
        self.assertEqual(vc_usage, vc_used)

        target_vc_avail = {
            "platform": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
            "relevance": ClusterResource(params={"gpu": {
                "P40": 30
            }}),
            "quantus": ClusterResource(params={"gpu": {
                "P40": 6
            }}),
            "AU": ClusterResource(params={"gpu": {
                "P40": 17
            }}),
        }

        self.assertEqual(target_vc_avail, vc_avail)

        target_vc_unschedulable = {
            "platform": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
            "relevance": ClusterResource(params={"gpu": {
                "P40": 6
            }}),
            "quantus": ClusterResource(params={"gpu": {
                "P40": 1
            }}),
            "AU": ClusterResource(params={"gpu": {
                "P40": 3
            }}),
        }

        self.assertEqual(target_vc_unschedulable, vc_unschedulable)
Example 22
    def test_gpu_accounting_move_quota_from_one_vc_to_another_in_cs(self):
        # This is the same test as
        # test_gpu_accounting_move_quota_from_one_vc_to_another.
        # The purpose of this test is to make sure ClusterResource accounting
        # has the same logic as pure GPU accounting.
        vc_info = {
            "A": ClusterResource(params={"gpu": {
                "P40": 20
            }}),
            "B": ClusterResource(params={"gpu": {
                "P40": 20
            }}),
        }

        # Previously A had a quota of 30 and used it all; later the admin
        # moved 10 of it to B.
        vc_usage = {
            "A": ClusterResource(params={"gpu": {
                "P40": 30
            }}),
            "B": ClusterResource(params={"gpu": {
                "P40": 5
            }}),
        }

        cluster_total = ClusterResource(params={"gpu": {"P40": 40}})
        cluster_avail = ClusterResource(params={"gpu": {"P40": 5}})
        cluster_unschedulable = ClusterResource()

        result = quota.calculate_vc_resources(cluster_total, cluster_avail,
                                              cluster_unschedulable, vc_info,
                                              vc_usage)

        vc_total, vc_used, vc_avail, vc_unschedulable = result

        self.assertEqual(vc_info, vc_total)
        self.assertEqual(vc_usage, vc_used)

        target_vc_avail = {
            "A": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
            "B": ClusterResource(params={"gpu": {
                "P40": 5
            }}),
        }

        self.assertEqual(target_vc_avail, vc_avail)

        target_vc_unschedulable = {
            "A": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
            "B": ClusterResource(params={"gpu": {
                "P40": 10
            }}),
        }

        self.assertEqual(target_vc_unschedulable, vc_unschedulable)
Example 23
    def test_gpu_accounting_idle_gpus_become_unscheduable_in_cs(self):
        # This is the same test as
        # test_gpu_accounting_idle_gpus_become_unscheduable.
        # The purpose of this test is to make sure ClusterResource accounting
        # has the same logic as pure GPU accounting.
        vc_info = {
            "A": ClusterResource(params={"gpu": {
                "P40": 40
            }}),
            "B": ClusterResource(params={"gpu": {
                "P40": 40
            }}),
            "C": ClusterResource(params={"gpu": {
                "P40": 40
            }}),
        }

        vc_usage = {
            "A": ClusterResource(params={"gpu": {
                "P40": 40
            }}),
            "B": ClusterResource(params={"gpu": {
                "P40": 31
            }}),
            "C": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
        }

        cluster_total = ClusterResource(params={"gpu": {"P40": 120}})
        cluster_avail = ClusterResource(params={"gpu": {"P40": 29}})
        cluster_unschedulable = ClusterResource(params={"gpu": {"P40": 20}})

        result = quota.calculate_vc_resources(cluster_total, cluster_avail,
                                              cluster_unschedulable, vc_info,
                                              vc_usage)

        vc_total, vc_used, vc_avail, vc_unschedulable = result

        self.assertEqual(vc_info, vc_total)
        self.assertEqual(vc_usage, vc_used)

        target_vc_avail = {
            "A": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
            "B": ClusterResource(params={"gpu": {
                "P40": 1
            }}),
            "C": ClusterResource(params={"gpu": {
                "P40": 27
            }}),
        }

        self.assertEqual(target_vc_avail, vc_avail)

        target_vc_unschedulable = {
            "A": ClusterResource(params={"gpu": {
                "P40": 0
            }}),
            "B": ClusterResource(params={"gpu": {
                "P40": 8
            }}),
            "C": ClusterResource(params={"gpu": {
                "P40": 13
            }}),
        }

        self.assertEqual(target_vc_unschedulable, vc_unschedulable)
Example 24
def calculate_vc_resources(cluster_capacity, cluster_avail, cluster_reserved,
                           vc_info, vc_usage):
    """Calculates vc resources based on cluster resources and vc info.

    Qi' = Qi - ceil(R * Qi / sum(Qi))
    Qi'' = max(Qi' - Ui, 0)
    Ai = floor(A * Qi'' / sum(Qi''))

    Where
    - R: cluster reserved
    - A: cluster avail
    - Qi: vc quota
    - Ui: vc used

    Args:
        cluster_capacity: Total resource capacity in the cluster
        cluster_avail: Currently available resource in the cluster
        cluster_reserved: Currently reserved resource in the cluster
        vc_info: VC quota information
        vc_usage: Currently used resource by VC in the cluster

    Returns:
        Qi: vc_total
        Ui: vc_used
        Ai: vc_avail
        max(Qi - Ui - Ai, 0): vc_unschedulable
    """
    logger.debug("cluster_capacity %s, cluster_avail %s, cluster_reserved %s",
                 cluster_capacity, cluster_avail, cluster_reserved)
    logger.debug("vc_info %s, vc_usage %s", vc_info, vc_usage)

    vc_usage = __get_valid_vc_usage(vc_info, vc_usage)

    vc_total = collections.defaultdict(lambda: ClusterResource())
    vc_used = collections.defaultdict(lambda: ClusterResource())
    vc_avail = collections.defaultdict(lambda: ClusterResource())
    vc_unschedulable = collections.defaultdict(lambda: ClusterResource())

    # vc total == assigned quota
    for vc_name, quota in vc_info.items():
        vc_total[vc_name] = copy.deepcopy(quota)

    quota_sum = ClusterResource()
    for vc_name, quota in vc_info.items():
        quota_sum += quota

    # ratios for calculating vc avail
    #   Qi' = Qi - R * (Qi / sum(Qi))
    #   Qi'' = max(Qi' - Ui, 0)
    ratios = collections.defaultdict(lambda: ClusterResource())
    for vc_name, quota in vc_info.items():
        reserved = (cluster_reserved * quota / quota_sum).ceil # over-reserve
        used = vc_usage.get(vc_name, ClusterResource())
        ratio = quota - reserved
        ratios[vc_name] = ratio - used

    ratio_sum = ClusterResource()
    for vc_name, ratio in ratios.items():
        ratio_sum += ratio

    logger.debug("ratios %s, ratio_sum %s", ratios, ratio_sum)

    # calculate avail and unschedulable
    # Ai = A * (Qi'' / sum(Qi''))
    # max(Qi - Ui - Ai, 0)
    for vc_name, ratio in ratios.items():
        used = copy.deepcopy(vc_usage.get(vc_name, ClusterResource()))
        avail = (cluster_avail * ratio / ratio_sum).floor # under-avail
        quota = vc_total.get(vc_name, ClusterResource())

        vc_used[vc_name] = used
        vc_avail[vc_name] = avail
        vc_unschedulable[vc_name] = quota - used - avail

    logger.debug("vc_total %s, vc_used %s, vc_avail %s, vc_unschedulable %s",
                 vc_total, vc_used, vc_avail, vc_unschedulable)
    return vc_total, vc_used, vc_avail, vc_unschedulable
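
A stdlib-only walk-through of the formulas above, using the numbers from test_gpu_accounting_idle_gpus_become_unscheduable_in_cs; scalar max() stands in for ClusterResource's zero-clamped subtraction (cf. test_isub):

import math

quota = {"A": 40, "B": 40, "C": 40}   # Qi
used = {"A": 40, "B": 31, "C": 0}     # Ui
R, A_avail = 20, 29                   # cluster reserved, cluster avail
quota_sum = sum(quota.values())       # sum(Qi) = 120

# Qi' = Qi - ceil(R * Qi / sum(Qi)); Qi'' = max(Qi' - Ui, 0)
ratios = {vc: max(q - math.ceil(R * q / quota_sum) - used[vc], 0)
          for vc, q in quota.items()} # {"A": 0, "B": 2, "C": 33}
ratio_sum = sum(ratios.values())      # 35

# Ai = floor(A * Qi'' / sum(Qi'')); unschedulable = max(Qi - Ui - Ai, 0)
for vc, q in quota.items():
    avail = math.floor(A_avail * ratios[vc] / ratio_sum)
    print(vc, avail, max(q - used[vc] - avail, 0))
# A 0 0 / B 1 8 / C 27 13 -- matching target_vc_avail and
# target_vc_unschedulable in that test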
Example 25
    def test_mark_schedulable_non_preemptable_gpu_jobs(self):
        # job1 is running on an unschedulable node
        job1_info = {
            "job": {
                "vcName": "platform",
                "jobId": "job1",
            },
            "jobId":
            "job1",
            "job_resource":
            ClusterResource(
                params={
                    "cpu": {
                        "Standard_ND24rs": 1
                    },
                    "memory": {
                        "Standard_ND24rs": 0
                    },
                    "gpu": {
                        "Standard_ND24rs": 3
                    },
                    "gpu_memory": {
                        "Standard_ND24rs": 0
                    },
                }),
            "preemptionAllowed":
            False,
            "sort_key":
            "0_0_999899_2020-03-31 08:07:46",
            "allowed":
            False,
        }

        # job2 is running on a good node
        job2_info = {
            "job": {
                "vcName": "platform",
                "jobId": "job2",
            },
            "jobId":
            "job2",
            "job_resource":
            ClusterResource(
                params={
                    "cpu": {
                        "Standard_ND24rs": 1
                    },
                    "memory": {
                        "Standard_ND24rs": 0
                    },
                    "gpu": {
                        "Standard_ND24rs": 4
                    },
                    "gpu_memory": {
                        "Standard_ND24rs": 0
                    },
                }),
            "preemptionAllowed":
            False,
            "sort_key":
            "0_0_999899_2020-03-31 08:08:49",
            "allowed":
            False,
        }

        # job3 is submitted just now
        job3_info = {
            "job": {
                "vcName": "platform",
                "jobId": "job3",
            },
            "jobId":
            "job3",
            "job_resource":
            ClusterResource(
                params={
                    "cpu": {
                        "Standard_ND24rs": 1
                    },
                    "memory": {
                        "Standard_ND24rs": 0
                    },
                    "gpu": {
                        "Standard_ND24rs": 4
                    },
                    "gpu_memory": {
                        "Standard_ND24rs": 0
                    },
                }),
            "preemptionAllowed":
            False,
            "sort_key":
            "0_2_999899_2020-03-31 09:00:10",
            "allowed":
            False,
        }

        jobs_info = [job1_info, job2_info, job3_info]

        cluster_status = {
            "gpu_capacity": {
                "Standard_ND24rs": 12
            },
            "gpu_reserved": {
                "Standard_ND24rs": 0
            },
            "gpu_unschedulable": {
                "Standard_ND24rs": 4
            },
            "cpu_capacity": {
                "Standard_ND24rs": 72
            },
            "cpu_reserved": {
                "Standard_ND24rs": 23
            },
            "cpu_unschedulable": {
                "Standard_ND24rs": 24
            },
            "memory_capacity": {
                "Standard_ND24rs": "1344Gi"
            },
            "memory_reserved": {
                "Standard_ND24rs": "448Gi"
            },
            "memory_unschedulable": {
                "Standard_ND24rs": "448Gi"
            },
        }

        cluster_capacity = ClusterResource(
            params={
                "cpu": cluster_status["cpu_capacity"],
                "memory": cluster_status["memory_capacity"],
                "gpu": cluster_status["gpu_capacity"],
            })
        cluster_reserved = ClusterResource(
            params={
                "cpu": cluster_status["cpu_reserved"],
                "memory": cluster_status["memory_reserved"],
                "gpu": cluster_status["gpu_reserved"],
            })
        cluster_unschedulable = ClusterResource(
            params={
                "cpu": cluster_status["cpu_unschedulable"],
                "memory": cluster_status["memory_unschedulable"],
                "gpu": cluster_status["gpu_unschedulable"],
            })

        vc_capacity = ClusterResource(
            params={
                "cpu": cluster_status["cpu_capacity"],
                "memory": cluster_status["memory_capacity"],
                "gpu": cluster_status["gpu_capacity"],
            })
        vc_unschedulable = ClusterResource(
            params={
                "cpu": cluster_status["cpu_reserved"],
                "memory": cluster_status["memory_reserved"],
                "gpu": cluster_status["gpu_reserved"],
            })
        vc_schedulable = discount_cluster_resource(vc_capacity -
                                                   vc_unschedulable)
        vc_schedulables = {"platform": vc_schedulable}

        # job3 should be scheduled, but will not be, when using
        # cluster_schedulable = cluster_capacity - cluster_unschedulable
        c_schedulable = discount_cluster_resource(cluster_capacity -
                                                  cluster_unschedulable)

        jobs_info_list = copy.deepcopy(jobs_info)
        mark_schedulable_non_preemptable_jobs(jobs_info_list, c_schedulable,
                                              copy.deepcopy(vc_schedulables))

        self.assertTrue(jobs_info_list[0]["allowed"])
        self.assertTrue(jobs_info_list[1]["allowed"])
        self.assertFalse(jobs_info_list[2]["allowed"])

        # job3 will be scheduled, as it should be, when using
        # cluster_schedulable = cluster_capacity - cluster_reserved
        c_schedulable = discount_cluster_resource(cluster_capacity -
                                                  cluster_reserved)

        jobs_info_list = copy.deepcopy(jobs_info)
        mark_schedulable_non_preemptable_jobs(jobs_info_list, c_schedulable,
                                              copy.deepcopy(vc_schedulables))

        self.assertTrue(jobs_info_list[0]["allowed"])
        self.assertTrue(jobs_info_list[1]["allowed"])
        self.assertTrue(jobs_info_list[2]["allowed"])
Example 26
    def test_calculate_vc_resources(self):
        cluster_capacity = ClusterResource(
            params={
                "cpu": {
                    "r1": 30,
                    "r2": 40,
                    "": 4,
                },
                "memory": {
                    "r1": "300Gi",
                    "r2": "400Gi",
                    "": "16Gi",
                },
                "gpu": {
                    "r1": 16,
                    "": 4,
                },
                "gpu_memory": {
                    "r1": "256Gi",
                    "": "64Gi",
                },
            })
        cluster_avail = ClusterResource(
            params={
                "cpu": {
                    "r1": 17,
                    "r2": 2,
                    "": 2,
                },
                "memory": {
                    "r1": "230Gi",
                    "r2": "100Gi",
                    "": "8Gi",
                },
                "gpu": {
                    "r1": 7,
                },
                "gpu_memory": {
                    "r1": "112Gi",
                },
            })
        cluster_reserved = ClusterResource(params={
            "cpu": {
                "r1": 4,
            },
            "memory": {
                "r1": "20Gi",
            },
        })
        vc_info = {
            "vc1":
            ClusterResource(
                params={
                    "cpu": {
                        "r1": 10,
                        "r2": 40,
                    },
                    "memory": {
                        "r1": "100Gi",
                        "r2": "400Gi",
                    },
                    "gpu": {
                        "r1": 12,
                    },
                    "gpu_memory": {
                        "r1": "192Gi",
                    },
                }),
            "vc2":
            ClusterResource(
                params={
                    "cpu": {
                        "r1": 20,
                        "": 4,
                    },
                    "memory": {
                        "r1": "200Gi",
                        "": "16Gi",
                    },
                    "gpu": {
                        "r1": 4,
                        "": 4,
                    },
                    "gpu_memory": {
                        "r1": "64Gi",
                        "": "64Gi",
                    },
                })
        }
        vc_usage = {
            "vc1":
            ClusterResource(
                params={
                    "cpu": {
                        "r1": 9,
                        "r2": 38,
                    },
                    "memory": {
                        "r1": "50Gi",
                        "r2": "300Gi",
                    },
                    "gpu": {
                        "r1": 8,
                    },
                    "gpu_memory": {
                        "r1": "128Gi",
                    },
                }),
            "vc2":
            ClusterResource(
                params={
                    "cpu": {
                        "": 2,
                    },
                    "memory": {
                        "": "8Gi",
                    },
                    "gpu": {
                        "r1": 1,
                        "": 4,
                    },
                    "gpu_memory": {
                        "r1": "16Gi",
                        "": "64Gi",
                    },
                })
        }

        result = quota.calculate_vc_resources(cluster_capacity, cluster_avail,
                                              cluster_reserved, vc_info,
                                              vc_usage)
        vc_total, vc_used, vc_avail, vc_unschedulable = result

        self.assertEqual(vc_info, vc_total)
        self.assertEqual(vc_usage, vc_used)

        expected_vc_avail = {
            "vc1":
            ClusterResource(
                params={
                    "cpu": {
                        "r1": 0,
                        "r2": 2,
                    },
                    "memory": {
                        "r1": "46528812373",
                        "r2": "100Gi",
                    },
                    "gpu": {
                        "r1": 4,
                    },
                    "gpu_memory": {
                        "r1": "64Gi",
                    },
                }),
            "vc2":
            ClusterResource(
                params={
                    "cpu": {
                        "r1": 17,
                        "": 2,
                    },
                    "memory": {
                        "r1": "200431807146",
                        "": "8Gi",
                    },
                    "gpu": {
                        "r1": 3,
                    },
                    "gpu_memory": {
                        "r1": "48Gi",
                    },
                })
        }
        self.assertEqual(expected_vc_avail, vc_avail)

        expected_vc_unschedulable = {
            "vc1":
            ClusterResource(
                params={
                    "cpu": {
                        "r1": 1,
                        "r2": 0,
                    },
                    "memory": {
                        "r1": "7158278827",
                        "r2": "0",
                    }
                }),
            "vc2":
            ClusterResource(
                params={
                    "cpu": {
                        "r1": 3,
                        "": 0,
                    },
                    "memory": {
                        "r1": "14316557654",
                        "": "0",
                    }
                })
        }
        self.assertEqual(expected_vc_unschedulable, vc_unschedulable)