Example #1
0
def _decode_job_params(job):
    """Return the job's parameters decoded from base64-encoded JSON."""
    return json.loads(base64.b64decode(job["jobParams"]))


def _is_preemptable(job_param):
    """Return True when the job parameters allow preemption.

    A missing "preemptionAllowed" key is treated as False so that one job
    without the flag does not fail the whole VC query with a KeyError.
    """
    return bool(job_param.get("preemptionAllowed", False))


def _serialize_user_status(user_status):
    """Turn a {userName: ResourceInfo} mapping into a list of plain dicts."""
    # TODO: job_manager.getAlias should be put in a util file
    return [{
        "userName": user_name.split("@")[0].strip(),
        "userGPU": user_gpu.ToSerializable()
    } for user_name, user_gpu in user_status.items()]


def _fetch_gpu_idle(vc_name, timeout=10):
    """Query the configured gpu reporter for the idle-GPU report of a VC.

    The timeout (seconds) bounds how long the request may block; without it
    an unresponsive reporter would stall the caller indefinitely.
    """
    gpu_idle_url = config["gpu_reporter"] + '/gpu_idle'
    gpu_idle_response = requests.get(gpu_idle_url,
                                     params={"vc": vc_name},
                                     timeout=timeout)
    return gpu_idle_response.json()


def GetVC(userName, vcName):
    """Return the VC entry named vcName, enriched with GPU usage details.

    Args:
        userName: user on whose behalf the VC is requested; must have
            Permission.User on the VC.
        vcName: name of the VC to look up.

    Returns:
        The matching entry from getClusterVCs(), updated in place with the
        keys "gpu_capacity", "gpu_used", "gpu_preemptable_used",
        "gpu_unschedulable", "gpu_avaliable", "AvaliableJobNum",
        "node_status", "user_status", "user_status_preemptable" and, when
        the gpu reporter could be reached, "gpu_idle". Returns None when no
        VC has that name or the user has no access to it.
    """
    # NOTE(review): data_handler is never released in this function; confirm
    # whether DataHandler needs explicit cleanup.
    data_handler = DataHandler()

    cluster_status, _ = data_handler.GetClusterStatus()
    cluster_total = cluster_status["gpu_capacity"]
    cluster_available = cluster_status["gpu_avaliable"]
    cluster_reserved = cluster_status["gpu_reserved"]

    vc_list = getClusterVCs()
    vc_info = {vc["vcName"]: json.loads(vc["quota"]) for vc in vc_list}

    # Decode every job's parameters once; both passes below reuse them.
    active_jobs = [(job, _decode_job_params(job))
                   for job in data_handler.GetActiveJobList()]

    # GPU usage per VC and GPU type, split by whether the job is preemptable.
    vc_usage = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0))
    vc_preemptable_usage = collections.defaultdict(
        lambda: collections.defaultdict(lambda: 0))
    for job, job_param in active_jobs:
        if "gpuType" not in job_param:
            continue
        usage = (vc_preemptable_usage
                 if _is_preemptable(job_param) else vc_usage)
        usage[job["vcName"]][job_param["gpuType"]] += GetJobTotalGpu(
            job_param)

    # Only the non-preemptable usage is passed to the quota calculation.
    result = quota.calculate_vc_gpu_counts(cluster_total, cluster_available,
                                           cluster_reserved, vc_info, vc_usage)
    vc_total, vc_used, vc_available, vc_unschedulable = result

    vc = next((item for item in vc_list if item["vcName"] == vcName), None)
    if vc is None or not AuthorizationManager.HasAccess(
            userName, ResourceType.VC, vcName, Permission.User):
        return None

    # Per-user GPU usage of the running jobs in this VC.
    user_status = collections.defaultdict(ResourceInfo)
    user_status_preemptable = collections.defaultdict(ResourceInfo)
    num_active_jobs = 0
    for job, job_param in active_jobs:
        if job["vcName"] != vcName or job["jobStatus"] != "running":
            continue
        num_active_jobs += 1
        username = job["userName"]
        if "gpuType" not in job_param:
            continue
        per_user = (user_status_preemptable
                    if _is_preemptable(job_param) else user_status)
        # The defaultdict creates an empty ResourceInfo for unseen users.
        per_user[username].Add(
            ResourceInfo({job_param["gpuType"]: GetJobTotalGpu(job_param)}))

    vc["gpu_capacity"] = vc_total[vcName]
    vc["gpu_used"] = vc_used[vcName]
    vc["gpu_preemptable_used"] = vc_preemptable_usage[vcName]
    vc["gpu_unschedulable"] = vc_unschedulable[vcName]
    vc["gpu_avaliable"] = vc_available[vcName]
    # Despite the key name, this is the number of running jobs in the VC.
    vc["AvaliableJobNum"] = num_active_jobs
    vc["node_status"] = cluster_status["node_status"]
    vc["user_status"] = _serialize_user_status(user_status)
    vc["user_status_preemptable"] = _serialize_user_status(
        user_status_preemptable)

    # Best effort: a failure here is logged and "gpu_idle" is simply omitted.
    try:
        vc["gpu_idle"] = _fetch_gpu_idle(vcName)
    except Exception:
        logger.exception("Failed to fetch gpu_idle from gpu-exporter")

    return vc