Example no. 1
0
    def test_gpu_accounting_move_quota_from_one_vc_to_another(self):
        """Quota moved between VCs: the over-using VC absorbs the shortfall."""
        quota_by_vc = {"A": {"P40": 20}, "B": {"P40": 20}}

        # A previously had a quota of 30 and used all of it; the admin later
        # moved 10 of those GPUs to B.
        usage_by_vc = {"A": {"P40": 30}, "B": {"P40": 5}}

        total = {"P40": 40}
        available = {"P40": 5}
        unschedulable = {}

        vc_total, vc_used, vc_available, vc_unschedulable = \
            quota.calculate_vc_gpu_counts(total, available, unschedulable,
                                          quota_by_vc, usage_by_vc)

        self.assertEqual(quota_by_vc, vc_total)
        self.assertEqual(usage_by_vc, vc_used)

        # A is over quota, so it gets nothing; B receives what is left.
        self.assertEqual({"A": {"P40": 0}, "B": {"P40": 5}}, vc_available)

        # B's remaining entitlement (20 - 5 - 5) is currently unschedulable.
        self.assertEqual({"A": {"P40": 0}, "B": {"P40": 10}},
                         vc_unschedulable)
Example no. 2
0
    def test_gpu_accounting_real_case2(self):
        """Accounting numbers taken from a real production cluster."""
        quota_by_vc = {
            "quantus": {"P40": 150},
            "relevance2": {"P40": 234},
            "relevance2-inf": {"P40": 40},
        }

        usage_by_vc = {
            "quantus": {"P40": 125},
            "relevance2": {"P40": 231},
            "relevance2-inf": {"P40": 0},
        }

        total = {"P40": 424}
        available = {"P40": 68}
        unschedulable = {"P40": 1}

        vc_total, vc_used, vc_available, vc_unschedulable = \
            quota.calculate_vc_gpu_counts(total, available, unschedulable,
                                          quota_by_vc, usage_by_vc)

        self.assertEqual(quota_by_vc, vc_total)
        self.assertEqual(usage_by_vc, vc_used)

        self.assertEqual(
            {
                "quantus": {"P40": 25},
                "relevance2": {"P40": 2},
                "relevance2-inf": {"P40": 40},
            }, vc_available)

        # The single unschedulable GPU is attributed to relevance2.
        self.assertEqual(
            {
                "quantus": {"P40": 0},
                "relevance2": {"P40": 1},
                "relevance2-inf": {"P40": 0},
            }, vc_unschedulable)
Example no. 3
0
    def test_gpu_accounting_idle_gpus_become_unscheduable(self):
        """Cluster-wide unschedulable GPUs are shared among idle VCs."""
        quota_by_vc = {
            "A": {"P40": 40},
            "B": {"P40": 40},
            "C": {"P40": 40},
        }

        usage_by_vc = {
            "A": {"P40": 40},
            "B": {"P40": 31},
            "C": {"P40": 0},
        }

        total = {"P40": 120}
        available = {"P40": 29}
        unschedulable = {"P40": 20}

        vc_total, vc_used, vc_available, vc_unschedulable = \
            quota.calculate_vc_gpu_counts(total, available, unschedulable,
                                          quota_by_vc, usage_by_vc)

        self.assertEqual(quota_by_vc, vc_total)
        self.assertEqual(usage_by_vc, vc_used)

        # A is fully used; B and C split the 29 schedulable-but-idle GPUs
        # in proportion to their idle quota.
        self.assertEqual(
            {
                "A": {"P40": 0},
                "B": {"P40": 1},
                "C": {"P40": 27},
            }, vc_available)

        # The 20 unschedulable GPUs land on the VCs with idle quota.
        self.assertEqual(
            {
                "A": {"P40": 0},
                "B": {"P40": 8},
                "C": {"P40": 13},
            }, vc_unschedulable)
Example no. 4
0
    def test_gpu_accounting_real_case(self):
        """Real-cluster case including a VC using more than its quota."""
        quota_by_vc = {
            "platform": {"P40": 48},
            "relevance": {"P40": 200},
            "quantus": {"P40": 100},
            "AU": {"P40": 20},
        }

        # Note: "platform" uses 57 against a quota of 48.
        usage_by_vc = {
            "platform": {"P40": 57},
            "relevance": {"P40": 164},
            "quantus": {"P40": 93},
            "AU": {"P40": 0},
        }

        total = {"P40": 368}
        available = {"P40": 54}
        unschedulable = {}

        vc_total, vc_used, vc_available, vc_unschedulable = \
            quota.calculate_vc_gpu_counts(total, available, unschedulable,
                                          quota_by_vc, usage_by_vc)

        self.assertEqual(quota_by_vc, vc_total)
        self.assertEqual(usage_by_vc, vc_used)

        self.assertEqual(
            {
                "platform": {"P40": 0},
                "relevance": {"P40": 30},
                "quantus": {"P40": 6},
                "AU": {"P40": 17},
            }, vc_available)

        self.assertEqual(
            {
                "platform": {"P40": 0},
                "relevance": {"P40": 6},
                "quantus": {"P40": 1},
                "AU": {"P40": 3},
            }, vc_unschedulable)
Example no. 5
0
def GetVC(userName, vcName):
    """Return the status dict for virtual cluster `vcName`, or None.

    Returns None when no VC with that name exists or `userName` lacks
    User permission on it. On success, augments the VC record with GPU
    accounting (capacity / used / available / unschedulable), per-user
    GPU usage for running jobs, node status, and (best-effort) idle-GPU
    data fetched from the external gpu-reporter service.
    """
    ret = None

    data_handler = DataHandler()

    cluster_status, _ = data_handler.GetClusterStatus()
    cluster_total = cluster_status["gpu_capacity"]
    # NOTE: "avaliable" is a long-standing typo baked into the data schema.
    cluster_available = cluster_status["gpu_avaliable"]
    cluster_reserved = cluster_status["gpu_reserved"]

    # Per-user GPU usage of running jobs, split by preemptibility.
    # defaultdict auto-creates a fresh ResourceInfo on first access.
    user_status = collections.defaultdict(lambda: ResourceInfo())
    user_status_preemptable = collections.defaultdict(lambda: ResourceInfo())

    vc_list = getClusterVCs()
    vc_info = {}
    vc_usage = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0))
    vc_preemptable_usage = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0))

    for vc in vc_list:
        vc_info[vc["vcName"]] = json.loads(vc["quota"])

    # Aggregate GPU usage per VC across all active jobs (any VC, any user):
    # this feeds the cluster-wide quota calculation below.
    active_job_list = data_handler.GetActiveJobList()
    for job in active_job_list:
        jobParam = json.loads(base64.b64decode(job["jobParams"]))
        if "gpuType" in jobParam:
            if not jobParam["preemptionAllowed"]:
                vc_usage[job["vcName"]][jobParam["gpuType"]] += GetJobTotalGpu(
                    jobParam)
            else:
                vc_preemptable_usage[job["vcName"]][
                    jobParam["gpuType"]] += GetJobTotalGpu(jobParam)

    result = quota.calculate_vc_gpu_counts(cluster_total, cluster_available,
                                           cluster_reserved, vc_info, vc_usage)

    vc_total, vc_used, vc_available, vc_unschedulable = result

    for vc in vc_list:
        if vc["vcName"] == vcName and AuthorizationManager.HasAccess(
                userName, ResourceType.VC, vcName, Permission.User):

            # Tally running jobs in this VC and their per-user GPU usage.
            num_active_jobs = 0
            for job in active_job_list:
                if job["vcName"] == vcName and job["jobStatus"] == "running":
                    num_active_jobs += 1
                    username = job["userName"]
                    jobParam = json.loads(base64.b64decode(job["jobParams"]))
                    if "gpuType" in jobParam:
                        gpu_res = ResourceInfo({
                            jobParam["gpuType"]: GetJobTotalGpu(jobParam)
                        })
                        # defaultdict creates the ResourceInfo on demand;
                        # no explicit membership check is needed.
                        if not jobParam["preemptionAllowed"]:
                            user_status[username].Add(gpu_res)
                        else:
                            user_status_preemptable[username].Add(gpu_res)

            vc["gpu_capacity"] = vc_total[vcName]
            vc["gpu_used"] = vc_used[vcName]
            vc["gpu_preemptable_used"] = vc_preemptable_usage[vcName]
            vc["gpu_unschedulable"] = vc_unschedulable[vcName]
            vc["gpu_avaliable"] = vc_available[vcName]
            # NOTE(review): despite the key name, this is the count of
            # *running* jobs in the VC, not available job slots.
            vc["AvaliableJobNum"] = num_active_jobs
            vc["node_status"] = cluster_status["node_status"]
            vc["user_status"] = []
            # .items() (not the Python-2-only .iteritems()) works on both
            # Python 2 and 3.
            for user_name, user_gpu in user_status.items():
                # TODO: job_manager.getAlias should be put in a util file
                user_name = user_name.split("@")[0].strip()
                vc["user_status"].append({
                    "userName": user_name,
                    "userGPU": user_gpu.ToSerializable()
                })

            vc["user_status_preemptable"] = []
            for user_name, user_gpu in user_status_preemptable.items():
                user_name = user_name.split("@")[0].strip()
                vc["user_status_preemptable"].append({
                    "userName":
                    user_name,
                    "userGPU":
                    user_gpu.ToSerializable()
                })

            # Best-effort: idle-GPU data comes from an external reporter;
            # failure to reach it must not fail the whole request.
            try:
                gpu_idle_url = config["gpu_reporter"] + '/gpu_idle'
                gpu_idle_params = {"vc": vcName}
                gpu_idle_response = requests.get(gpu_idle_url,
                                                 params=gpu_idle_params)
                gpu_idle_json = gpu_idle_response.json()
                vc["gpu_idle"] = gpu_idle_json
            except Exception:
                logger.exception("Failed to fetch gpu_idle from gpu-exporter")

            ret = vc
            break
    return ret
Example no. 6
0
def TakeJobActions(data_handler, redis_conn, launcher, jobs):
    """Decide which jobs to submit or preempt, and act on the decisions.

    Policy (two passes over jobs sorted by a composite key):
      1. Non-preemptible jobs are admitted first, each against its own
         VC's schedulable quota (quota minus unschedulable GPUs).
      2. Preemptible jobs then compete for leftover cluster-wide
         ("bonus") capacity in no particular FIFO order.
    Finally, queued jobs that were admitted are submitted, and
    preemptible jobs that are scheduling/running but were NOT admitted
    are killed back to the queue.
    """
    vc_list = data_handler.ListVCs()
    cluster_status, _ = data_handler.GetClusterStatus()
    cluster_total = cluster_status["gpu_capacity"]
    # NOTE: "avaliable" is a long-standing typo baked into the data schema.
    cluster_available = cluster_status["gpu_avaliable"]
    cluster_reserved = cluster_status["gpu_reserved"]

    vc_info = {}
    vc_usage = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0))

    for vc in vc_list:
        vc_info[vc["vcName"]] = json.loads(vc["quota"])

    # Current (preemptible and not) GPU usage per VC, from active jobs.
    active_job_list = data_handler.GetActiveJobList()
    for job in active_job_list:
        jobParam = json.loads(base64.b64decode(job["jobParams"]))
        if "gpuType" in jobParam:
            vc_usage[job["vcName"]][jobParam["gpuType"]] += GetJobTotalGpu(
                jobParam)

    result = quota.calculate_vc_gpu_counts(cluster_total, cluster_available,
                                           cluster_reserved, vc_info, vc_usage)
    # Only totals and unschedulable counts are needed below.
    vc_total, _vc_used, _vc_available, vc_unschedulable = result

    cluster_gpu_capacity = cluster_status["gpu_capacity"]
    cluster_gpu_unschedulable = cluster_status["gpu_unschedulable"]
    global_total = ResourceInfo(cluster_gpu_capacity)
    global_unschedulable = ResourceInfo(cluster_gpu_unschedulable)

    vc_resources = {}
    # Cluster-wide schedulable capacity, used as the "bonus" pool for
    # preemptible jobs.
    globalResInfo = ResourceInfo.Difference(global_total, global_unschedulable)

    priority_dict = get_priority_dict()
    logging.info("Job priority dict: {}".format(priority_dict))

    # Per-VC schedulable capacity = quota - unschedulable.
    for vc in vc_list:
        vc_name = vc["vcName"]
        vc_schedulable = {}
        for gpu_type, total in vc_total[vc_name].items():
            vc_schedulable[
                gpu_type] = total - vc_unschedulable[vc_name][gpu_type]
        vc_resources[vc_name] = ResourceInfo(vc_schedulable)

    jobsInfo = []
    for job in jobs:
        if job["jobStatus"] in ["queued", "scheduling", "running"]:
            singleJobInfo = {}
            singleJobInfo["job"] = job
            job_params = json.loads(base64.b64decode(job["jobParams"]))
            singleJobInfo["preemptionAllowed"] = job_params[
                "preemptionAllowed"]
            singleJobInfo["jobId"] = job_params["jobId"]
            # Jobs without an explicit gpuType draw from the "any" bucket.
            jobGpuType = "any"
            if "gpuType" in job_params:
                jobGpuType = job_params["gpuType"]
            singleJobInfo["globalResInfo"] = ResourceInfo(
                {jobGpuType: GetJobTotalGpu(job_params)})

            # Job lists will be sorted based on and in the order of below
            # 1. non-preemptible precedes preemptible
            # 2. running precedes scheduling, precedes queued
            # 3. larger priority value precedes lower priority value
            # 4. early job time precedes later job time

            # Non-Preemptible jobs first
            preemptible = 1 if singleJobInfo["preemptionAllowed"] else 0

            # Job status
            job_status = 0
            if job["jobStatus"] == "scheduling":
                job_status = 1
            elif job["jobStatus"] == "queued":
                job_status = 2

            # Priority value (inverted so lexicographic sort puts higher
            # priority first).
            reverse_priority = get_job_priority(priority_dict,
                                                singleJobInfo["jobId"])
            priority = 999999 - reverse_priority

            # Job time
            job_time = str(job["jobTime"])

            singleJobInfo["sortKey"] = "{}_{}_{:06d}_{}".format(
                preemptible, job_status, priority, job_time)

            singleJobInfo["allowed"] = False
            jobsInfo.append(singleJobInfo)

    jobsInfo.sort(key=lambda x: x["sortKey"])

    logging.info("TakeJobActions : local resources : %s" % (vc_resources))
    logging.info("TakeJobActions : global resources : %s" %
                 (globalResInfo.CategoryToCountMap))

    # Pass 1: admit non-preemptible jobs against their own VC's quota.
    for sji in jobsInfo:
        logging.info("TakeJobActions : job : %s : %s : %s" %
                     (sji["jobId"], sji["globalResInfo"].CategoryToCountMap,
                      sji["sortKey"]))
        vc_name = sji["job"]["vcName"]
        vc_resource = vc_resources[vc_name]

        if (not sji["preemptionAllowed"]) and (vc_resource.CanSatisfy(
                sji["globalResInfo"])):
            vc_resource.Subtract(sji["globalResInfo"])
            globalResInfo.Subtract(sji["globalResInfo"])
            sji["allowed"] = True
            logging.info(
                "TakeJobActions : local assignment : %s : %s" %
                (sji["jobId"], sji["globalResInfo"].CategoryToCountMap))

    # Pass 2: preemptible jobs take whatever global capacity remains.
    for sji in jobsInfo:
        if sji["preemptionAllowed"] and (sji["allowed"] is False):
            if globalResInfo.CanSatisfy(sji["globalResInfo"]):
                logging.info(
                    "TakeJobActions : job : %s : %s" %
                    (sji["jobId"], sji["globalResInfo"].CategoryToCountMap))
                # Strict FIFO policy not required for global (bonus) tokens since these jobs are anyway pre-emptible.
                globalResInfo.Subtract(sji["globalResInfo"])
                sji["allowed"] = True
                logging.info(
                    "TakeJobActions : global assignment : %s : %s" %
                    (sji["jobId"], sji["globalResInfo"].CategoryToCountMap))

    logging.info("TakeJobActions : global resources : %s" %
                 (globalResInfo.CategoryToCountMap))

    # Act: submit admitted queued jobs; preempt non-admitted preemptible
    # jobs that currently hold resources. Per-job failures are logged and
    # must not stop processing of the remaining jobs.
    for sji in jobsInfo:
        try:
            if sji["job"]["jobStatus"] == "queued" and (sji["allowed"] is
                                                        True):
                launcher.submit_job(sji["job"])
                update_job_state_latency(redis_conn, sji["jobId"],
                                         "scheduling")
                logging.info("TakeJobActions : submitting job : %s : %s" %
                             (sji["jobId"], sji["sortKey"]))
            elif sji["preemptionAllowed"] and (
                    sji["job"]["jobStatus"] == "scheduling"
                    or sji["job"]["jobStatus"]
                    == "running") and (sji["allowed"] is False):
                launcher.kill_job(sji["job"]["jobId"], "queued")
                logging.info("TakeJobActions : pre-empting job : %s : %s" %
                             (sji["jobId"], sji["sortKey"]))
        except Exception:
            # exc_info=True captures the traceback; no need to bind the
            # exception to an unused name.
            logging.error("Process job failed {}".format(sji["job"]),
                          exc_info=True)

    logging.info("TakeJobActions : job desired actions taken")