Beispiel #1
0
def RunCommand(command):
    """Run a queued command through ``kubectl exec`` and mark it finished.

    Args:
        command: mapping providing ``"jobId"`` (used as the exec target),
            ``"command"`` (the text appended after the target) and ``"id"``
            (the identifier handed to ``DataHandler.FinishCommand``).

    Returns:
        Always ``True``.
    """
    dataHandler = DataHandler()
    try:
        k8sUtils.kubectl_exec("exec %s %s" %
                              (command["jobId"], command["command"]))
        dataHandler.FinishCommand(command["id"])
    finally:
        # Close the handler even when the exec or the bookkeeping call
        # raises, so a failing command cannot leak it.
        dataHandler.Close()
    return True
Beispiel #2
0
def setup_tensorboard(user_name, pod_name):
    """Install and launch tensorboard inside a pod on a random port.

    Args:
        user_name: account passed to ``runuser -l`` for the tensorboard
            process.
        pod_name: target of the ``kubectl exec`` call.

    Returns:
        int: the port (40000-49999) given to ``tensorboard --port``.

    Raises:
        Exception: if ``kubectl_exec`` returns an empty string.
    """
    tensorboard_port = random.randint(40000, 49999)
    # "\\$" spells out the literal backslash that the script needs in front
    # of ${DLWS_JOB_ID}; the previous "\$" was an invalid escape sequence
    # that only happened to yield the same characters.
    bash_script = (
        "bash -c 'export DEBIAN_FRONTEND=noninteractive; "
        "pip install tensorboard; "
        "runuser -l %s -c \""
        "mkdir -p ~/tensorboard/\\${DLWS_JOB_ID}/logs; "
        "nohup tensorboard --logdir=~/tensorboard/\\${DLWS_JOB_ID}/logs "
        "--port=%d &>/dev/null &\"'" % (user_name, tensorboard_port))
    output = k8sUtils.kubectl_exec("exec %s %s" %
                                   (pod_name, " -- " + bash_script))
    if output == "":
        raise Exception(
            "Failed to start tensorboard in container. JobId: %s " % pod_name)
    return tensorboard_port
Beispiel #3
0
def start_ssh_server(pod_name, user_name, host_network=False, ssh_port=22):
    """Install and restart the ssh server inside a pod.

    Args:
        pod_name: target of the ``kubectl exec`` call.
        user_name: owner whose ``/home/<user>/.ssh`` permissions are fixed.
        host_network: when truthy, ``sshd_config`` is rewritten to listen on
            ``ssh_port`` instead of 22.
        ssh_port: requested port; with ``host_network`` a value equal to 22
            is replaced by a random port in 40000-49999.

    Returns:
        The (possibly re-drawn) ``ssh_port``.

    Raises:
        Exception: if ``kubectl_exec`` returns an empty string.
    """
    fix_permissions = ("cd /home/" + user_name + " && (chown " + user_name +
                       " -R .ssh; chmod 600 -R .ssh/*; chmod 700 .ssh; true)")
    steps = ["apt-get update && apt-get install -y openssh-server"]

    if host_network:
        # The default port is swapped for a random one on the host network.
        if ssh_port == 22:
            ssh_port = random.randint(40000, 49999)
        # TODO refine the script later
        steps.append("sed -i \"s/^Port 22/Port " + str(ssh_port) +
                     "/\" /etc/ssh/sshd_config")

    steps.append(fix_permissions)
    steps.append("service ssh restart")
    bash_script = "sudo bash -c '" + " && ".join(steps) + "'"

    # TODO setup reasonable timeout
    exec_args = "exec %s %s" % (pod_name, " -- " + bash_script)
    if k8sUtils.kubectl_exec(exec_args) == "":
        raise Exception("Failed to setup ssh server in container. JobId: %s " %
                        pod_name)
    return ssh_port
Beispiel #4
0
def is_ssh_server_ready(pod_name):
    """Tell whether ``sudo service ssh status`` produced any output in a pod.

    Returns ``False`` when ``kubectl_exec`` yields an empty string and
    ``True`` otherwise.
    """
    status_cmd = "sudo service ssh status"
    exec_args = "exec %s %s" % (pod_name, " -- " + status_cmd)
    return k8sUtils.kubectl_exec(exec_args) != ""
Beispiel #5
0
def is_user_ready(pod_name):
    """Tell whether listing ``/dlws/USER_READY`` in a pod produced output.

    Returns ``False`` when ``kubectl_exec`` yields an empty string and
    ``True`` otherwise.
    """
    probe_cmd = "bash -c 'ls /dlws/USER_READY'"
    exec_args = "exec %s %s" % (pod_name, " -- " + probe_cmd)
    return k8sUtils.kubectl_exec(exec_args) != ""
Beispiel #6
0
def setup_jupyter_server(user_name, pod_name):
    """Install jupyter in a pod and launch a notebook server on a random port.

    Args:
        user_name: account used for ``/home/<user>`` and ``runuser -l``.
        pod_name: target of the ``kubectl exec`` call.

    Returns:
        int: the port (40000-49999) passed to ``jupyter notebook --port``.

    Raises:
        Exception: if ``kubectl_exec`` returns an empty string.
    """
    jupyter_port = random.randint(40000, 49999)

    install_steps = ("export DEBIAN_FRONTEND=noninteractive; "
                     "apt-get update && apt-get install -y python3-pip && "
                     "python3 -m pip install --upgrade pip && "
                     "python3 -m pip install jupyter")
    notebook_cmd = ("jupyter notebook --no-browser --ip=0.0.0.0 "
                    "--NotebookApp.token= --port=" + str(jupyter_port) +
                    " &>/dev/null &")
    bash_script = ("bash -c '" + install_steps + " && cd /home/" + user_name +
                   " && runuser -l " + user_name + " -c \"" + notebook_cmd +
                   "\"'")

    exec_args = "exec %s %s" % (pod_name, " -- " + bash_script)
    if k8sUtils.kubectl_exec(exec_args) == "":
        raise Exception(
            "Failed to start jupyter server in container. JobId: %s " %
            pod_name)
    return jupyter_port
Beispiel #7
0
def start_ssh_server(pod_name):
    '''Start the ssh service inside a pod; nothing is returned.

    The ssh server is assumed to be installed already. An Exception is
    raised when kubectl_exec returns an empty string.
    '''
    start_cmd = "service ssh start"

    # TODO setup reasonable timeout
    exec_args = "exec %s %s" % (pod_name, " -- " + start_cmd)
    if k8sUtils.kubectl_exec(exec_args) == "":
        raise Exception(
            "Failed to setup ssh server in container. JobId: %s " % pod_name)
def RunCommand(command):
    """Run a queued command through ``kubectl exec`` and store its output.

    Args:
        command: mapping providing ``"jobId"`` (used as the exec target),
            ``"command"`` (the text appended after the target) and ``"id"``
            (the identifier handed to ``DataHandler.FinishCommand``).

    Returns:
        Always ``True``.
    """
    dataHandler = DataHandler()
    try:
        logging.info("Job %s exec command: [%s]", command["jobId"],
                     command["command"])
        output = k8sUtils.kubectl_exec("exec %s %s" %
                                       (command["jobId"], command["command"]))
        logging.info("exec output:\n %s", output)
        dataHandler.FinishCommand(command["id"], output)
    finally:
        # Close the handler even when the exec or the bookkeeping call
        # raises, so a failing command cannot leak it.
        dataHandler.Close()
    return True
def query_ssh_port(pod_name):
    """Read the ``Port`` entry of ``/usr/etc/sshd_config`` inside a pod.

    Returns:
        int: the parsed port, or 22 when the exec result is falsy but not
        the empty string.

    Raises:
        RuntimeError: if ``kubectl_exec`` returns an empty string.
    """
    grep_script = "\"grep ^Port /usr/etc/sshd_config | cut -d' ' -f2\""
    exec_args = "exec %s %s" % (pod_name, " -- /bin/bash -c " + grep_script)
    output = k8sUtils.kubectl_exec(exec_args)

    if output == "":
        raise RuntimeError("Query ssh port failed: {}".format(pod_name))

    # The empty string was rejected above, so the default of 22 only applies
    # to other falsy results (for example None).
    return int(output) if output else 22
def is_server_ready(endpoint):
    """Check whether the process behind a known endpoint shows up in ``ps``.

    Args:
        endpoint: mapping with ``"podName"`` and ``"name"``. Only the names
            ``"ipython"``, ``"tensorboard"`` and ``"vscode"`` trigger a probe.

    Returns:
        ``False`` when the probe ran and ``kubectl_exec`` returned an empty
        string; ``True`` in every other case, including unknown names.
    """
    pod_name = endpoint["podName"]
    port_name = endpoint["name"]

    probes = (
        ("ipython", "ps -ef|grep jupyter-lab"),
        ("tensorboard", "ps -ef|grep tensorboard"),
        ("vscode", "ps -ef|grep code-server"),
    )
    for known_name, probe_cmd in probes:
        if port_name == known_name:
            exec_args = "exec %s %s" % (pod_name, " -- " + probe_cmd)
            return k8sUtils.kubectl_exec(exec_args) != ""

    # No probe is defined for this endpoint name: treat it as ready.
    return True
Beispiel #11
0
def query_ssh_port(pod_name):
    """Return the ``Port`` value of ``/etc/ssh/sshd_config`` in a pod as int.

    The exec output is passed straight to ``int``, so a result that is not a
    valid integer makes ``int`` raise.
    """
    grep_script = "grep ^Port /etc/ssh/sshd_config | cut -d' ' -f2"
    exec_args = "exec %s %s" % (pod_name, " -- " + grep_script)
    return int(k8sUtils.kubectl_exec(exec_args))
Beispiel #12
0
def get_cluster_status():
    """Collect GPU usage for the cluster and persist it when it changed.

    Node and pod descriptions are read with ``kubectl get ... -o yaml``.
    From them the function derives one status entry per node, a GPU total
    per user (pods carrying a ``userName`` label) and cluster-wide GPU
    counters. Any exception raised while collecting is logged and swallowed,
    so the returned dict may lack some or all of the GPU related keys.

    The job counters from ``DataHandler`` are always added. The result is
    written through ``DataHandler.UpdateClusterStatus`` only when ``config``
    already holds a ``"cluster_status"`` entry and
    ``check_cluster_status_change`` reports a difference; a deep copy of the
    result is then cached in ``config["cluster_status"]``.

    Returns:
        dict: the assembled cluster status.
    """
    cluster_status = {}
    gpuStr = "alpha.kubernetes.io/nvidia-gpu"
    try:
        output = k8sUtils.kubectl_exec(" get nodes -o yaml")
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; consider yaml.safe_load for this output.
        nodeInfo = yaml.load(output)
        nodes_status = {}
        user_status = {}

        if "items" in nodeInfo:
            for node in nodeInfo["items"]:
                node_state = node["status"]
                node_status = {}
                node_status["name"] = node["metadata"]["name"]
                node_status["labels"] = node["metadata"]["labels"]
                node_status["gpu_allocatable"] = int(
                    node_state["allocatable"].get(gpuStr, 0))
                node_status["gpu_capacity"] = int(
                    node_state["capacity"].get(gpuStr, 0))
                node_status["gpu_used"] = 0
                node_status["InternalIP"] = "unknown"
                node_status["pods"] = []

                for addr in node_state.get("addresses", []):
                    if addr["type"] == "InternalIP":
                        node_status["InternalIP"] = addr["address"]

                # Labels set to "active" name the services scheduled on the
                # node; the "all" and "default" labels are left out.
                node_status["scheduled_service"] = [
                    label for label, state in node_status["labels"].items()
                    if state == "active" and label not in ("all", "default")
                ]

                node_status["unschedulable"] = bool(
                    node["spec"].get("unschedulable"))

                # A node whose "Ready" condition is "Unknown" is counted as
                # unschedulable. The conditions list must be iterated here;
                # iterating node["status"] itself only yields its keys.
                for condi in node_state.get("conditions", []):
                    if (condi.get("type") == "Ready"
                            and condi.get("status") == "Unknown"):
                        node_status["unschedulable"] = True

                nodes_status[node_status["name"]] = node_status

        output = k8sUtils.kubectl_exec(" get pods -o yaml")
        podsInfo = yaml.load(output)
        if "items" in podsInfo:
            for pod in podsInfo["items"]:
                gpus = 0
                username = None
                if "metadata" in pod and "labels" in pod[
                        "metadata"] and "userName" in pod["metadata"]["labels"]:
                    username = pod["metadata"]["labels"]["userName"]
                if "spec" in pod and "nodeName" in pod["spec"]:
                    node_name = pod["spec"]["nodeName"]
                    # pod_name doubles as the display text stored per node.
                    pod_name = pod["metadata"]["name"]
                    if username is not None:
                        pod_name += " : " + username
                    gpuUsage = get_job_gpu_usage(pod["metadata"]["name"])
                    if gpuUsage is not None:
                        pod_name += " (gpu usage:" + str(gpuUsage) + "%)"
                        if gpuUsage <= 25:
                            pod_name += "!!!!!!"
                    if "containers" in pod["spec"]:
                        for container in pod["spec"]["containers"]:
                            if "resources" in container and "requests" in container[
                                    "resources"] and gpuStr in container[
                                        "resources"]["requests"]:
                                requested = container["resources"]["requests"][
                                    gpuStr]
                                gpus += int(requested)
                                # str() because YAML may parse the request
                                # as an int rather than a string.
                                pod_name += " (gpu #:" + str(requested) + ")"
                    if node_name in nodes_status:
                        nodes_status[node_name]["gpu_used"] += gpus
                        nodes_status[node_name]["pods"].append(pod_name)

                if username is not None:
                    user_status[username] = user_status.get(username,
                                                            0) + gpus

        gpu_avaliable = 0
        gpu_reserved = 0
        gpu_capacity = 0
        gpu_unschedulable = 0
        gpu_used = 0

        for node_status in nodes_status.values():
            reserved = (node_status["gpu_capacity"] -
                        node_status["gpu_allocatable"])
            if node_status["unschedulable"]:
                gpu_unschedulable += node_status["gpu_capacity"]
            else:
                gpu_avaliable += (node_status["gpu_allocatable"] -
                                  node_status["gpu_used"])
                gpu_unschedulable += reserved

            gpu_reserved += reserved
            gpu_used += node_status["gpu_used"]
            gpu_capacity += node_status["gpu_capacity"]

        cluster_status["user_status"] = [{
            "userName": user_name,
            "userGPU": user_gpu
        } for user_name, user_gpu in user_status.items()]

        cluster_status["gpu_avaliable"] = gpu_avaliable
        cluster_status["gpu_capacity"] = gpu_capacity
        cluster_status["gpu_unschedulable"] = gpu_unschedulable
        cluster_status["gpu_used"] = gpu_used
        cluster_status["gpu_reserved"] = gpu_reserved
        cluster_status["node_status"] = list(nodes_status.values())

    except Exception:
        # Collection is best effort: keep going with whatever was gathered,
        # but record the traceback instead of printing only the message.
        logging.exception("get cluster status")

    dataHandler = DataHandler()
    try:
        cluster_status["AvaliableJobNum"] = dataHandler.GetActiveJobsCount()
        cluster_status["TotalJobNum"] = dataHandler.GetALLJobsCount()
        if "cluster_status" in config and check_cluster_status_change(
                config["cluster_status"], cluster_status):
            logging.info("updating the cluster status...")
            dataHandler.UpdateClusterStatus(cluster_status)
        else:
            logging.info(
                "nothing changed in cluster, skipping the cluster status update..."
            )
        config["cluster_status"] = copy.deepcopy(cluster_status)
    finally:
        # Release the handler even if one of the calls above raises.
        dataHandler.Close()
    return cluster_status
Beispiel #13
0
def get_cluster_status():
    """Collect GPU usage for the cluster and persist it when it changed.

    Node and pod descriptions are read with ``kubectl get ... -o yaml``.
    From them the function derives one status entry per node, a GPU total
    per user (pods carrying a ``userName`` label) and cluster-wide GPU
    aggregates, all expressed through ``ResourceInfo`` and stored in their
    ``ToSerializable()`` form. Any exception raised while collecting is
    logged and swallowed, so the returned dict may lack some or all of the
    GPU related keys.

    ``"AvaliableJobNum"`` is always added. The result is written through
    ``DataHandler.UpdateClusterStatus`` only when ``config`` already holds a
    ``"cluster_status"`` entry and ``check_cluster_status_change`` reports a
    difference; a deep copy of the result is then cached in
    ``config["cluster_status"]``.

    Returns:
        dict: the assembled cluster status.
    """
    cluster_status = {}
    gpuStr = "nvidia.com/gpu"
    try:
        output = k8sUtils.kubectl_exec(" get nodes -o yaml")
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; consider yaml.safe_load for this output.
        nodeInfo = yaml.load(output)
        nodes_status = {}
        user_status = {}

        if "items" in nodeInfo:
            for node in nodeInfo["items"]:
                node_status = {}
                node_status["name"] = node["metadata"]["name"]
                node_status["labels"] = node["metadata"]["labels"]
                node_status["gpuType"] = ""

                # Labels set to "active" (other than "all"/"default") are
                # listed as scheduled services; the value of a "gpuType"
                # label is listed too and remembered as the node's GPU type.
                node_status["scheduled_service"] = []
                for l, s in node_status["labels"].iteritems():
                    if s == "active" and l != "all" and l != "default":
                        node_status["scheduled_service"].append(l)
                    if l == "gpuType":
                        node_status["scheduled_service"].append(s)
                        node_status["gpuType"] = s

                # GPU counts are keyed by the node's GPU type; an empty
                # ResourceInfo is used when the node reports no GPUs.
                if (gpuStr in node["status"]["allocatable"]):
                    node_status["gpu_allocatable"] = ResourceInfo({
                        node_status["gpuType"]:
                        int(node["status"]["allocatable"][gpuStr])
                    }).ToSerializable()
                else:
                    node_status["gpu_allocatable"] = ResourceInfo(
                    ).ToSerializable()
                if (gpuStr in node["status"]["capacity"]):
                    node_status["gpu_capacity"] = ResourceInfo({
                        node_status["gpuType"]:
                        int(node["status"]["capacity"][gpuStr])
                    }).ToSerializable()
                else:
                    node_status["gpu_capacity"] = ResourceInfo(
                    ).ToSerializable()
                node_status["gpu_used"] = ResourceInfo().ToSerializable()
                node_status["InternalIP"] = "unknown"
                node_status["pods"] = []
                # The "node.alpha/DeviceInformation" annotation (JSON) wins
                # over the status fields whenever it reports more GPUs.
                if "annotations" in node["metadata"]:
                    if "node.alpha/DeviceInformation" in node["metadata"][
                            "annotations"]:
                        node_info = json.loads(
                            node["metadata"]["annotations"]
                            ["node.alpha/DeviceInformation"])
                        if (int(node_info["capacity"]["alpha.gpu/numgpu"]) >
                                ResourceInfo(
                                    node_status["gpu_capacity"]).TotalCount()):
                            node_status["gpu_capacity"] = ResourceInfo({
                                node_status["gpuType"]:
                                int(node_info["capacity"]["alpha.gpu/numgpu"])
                            }).ToSerializable()
                        if (int(node_info["allocatable"]["alpha.gpu/numgpu"]) >
                                ResourceInfo(node_status["gpu_allocatable"]
                                             ).TotalCount()):
                            node_status["gpu_allocatable"] = ResourceInfo({
                                node_status["gpuType"]:
                                int(node_info["allocatable"]
                                    ["alpha.gpu/numgpu"])
                            }).ToSerializable()

                if "addresses" in node["status"]:
                    for addr in node["status"]["addresses"]:
                        if addr["type"] == "InternalIP":
                            node_status["InternalIP"] = addr["address"]

                if "unschedulable" in node["spec"] and node["spec"][
                        "unschedulable"]:
                    node_status["unschedulable"] = True
                else:
                    node_status["unschedulable"] = False

                # A node whose "Ready" condition is "Unknown" is also counted
                # as unschedulable.
                if "status" in node and "conditions" in node["status"]:
                    for condi in node["status"]["conditions"]:
                        if "type" in condi and condi[
                                "type"] == "Ready" and "status" in condi and condi[
                                    "status"] == "Unknown":
                            node_status["unschedulable"] = True

                nodes_status[node_status["name"]] = node_status

        output = k8sUtils.kubectl_exec(" get pods -o yaml")
        podsInfo = yaml.load(output)
        if "items" in podsInfo:
            for pod in podsInfo["items"]:
                gpus = 0
                username = None
                if "metadata" in pod and "labels" in pod[
                        "metadata"] and "userName" in pod["metadata"]["labels"]:
                    username = pod["metadata"]["labels"]["userName"]
                # Only pods that have been assigned to a node are counted.
                if "spec" in pod and "nodeName" in pod["spec"]:
                    node_name = pod["spec"]["nodeName"]
                    # pod_name doubles as the display text stored per node.
                    pod_name = pod["metadata"]["name"]
                    if username is not None:
                        pod_name += " : " + username
                    gpuUsage = get_job_gpu_usage(pod["metadata"]["name"])
                    if gpuUsage is not None:
                        pod_name += " (gpu usage:" + str(gpuUsage) + "%)"
                        if gpuUsage <= 25:
                            pod_name += "!!!!!!"
                    # Per-container details from the
                    # "pod.alpha/DeviceInformation" annotation (JSON).
                    # pod_info_initcont is filled in but not read again in
                    # this function.
                    pod_info_cont = {}
                    pod_info_initcont = {}
                    if "annotations" in pod["metadata"]:
                        if "pod.alpha/DeviceInformation" in pod["metadata"][
                                "annotations"]:
                            pod_info = json.loads(
                                pod["metadata"]["annotations"]
                                ["pod.alpha/DeviceInformation"])
                            if "runningcontainer" in pod_info:
                                pod_info_cont = pod_info["runningcontainer"]
                            if "initcontainer" in pod_info:
                                pod_info_initcont = pod_info["initcontainer"]
                    if "containers" in pod["spec"]:
                        for container in pod["spec"]["containers"]:
                            # Use the larger of the resource request and the
                            # count reported by the annotation.
                            containerGPUs = 0
                            if "resources" in container and "requests" in container[
                                    "resources"] and gpuStr in container[
                                        "resources"]["requests"]:
                                containerGPUs = int(
                                    container["resources"]["requests"][gpuStr])
                            if container["name"] in pod_info_cont:
                                if "requests" in pod_info_cont[container[
                                        "name"]] and "alpha.gpu/numgpu" in pod_info_cont[
                                            container["name"]]["requests"]:
                                    containerGPUs = max(
                                        int(pod_info_cont[container["name"]]
                                            ["requests"]["alpha.gpu/numgpu"]),
                                        containerGPUs)
                            gpus += containerGPUs
                            pod_name += " (gpu #:" + str(containerGPUs) + ")"

                    # Usage is attributed only when the pod's node is known;
                    # the per-user totals below share that condition.
                    if node_name in nodes_status:
                        nodes_status[node_name]["gpu_used"] = ResourceInfo(
                            nodes_status[node_name]["gpu_used"]).Add(
                                ResourceInfo(
                                    {nodes_status[node_name]["gpuType"]:
                                     gpus})).ToSerializable()
                        nodes_status[node_name]["pods"].append(pod_name)

                        if username is not None:
                            if username not in user_status:
                                user_status[username] = ResourceInfo(
                                    {nodes_status[node_name]["gpuType"]: gpus})
                            else:
                                # NOTE(review): the result of Add is discarded
                                # here, so this presumably relies on Add
                                # updating the receiver in place - confirm.
                                user_status[username].Add(
                                    ResourceInfo({
                                        nodes_status[node_name]["gpuType"]:
                                        gpus
                                    }))

        # Cluster-wide aggregates. gpu_schedulable is accumulated below but
        # is not written into cluster_status.
        gpu_avaliable = ResourceInfo()
        gpu_reserved = ResourceInfo()
        gpu_capacity = ResourceInfo()
        gpu_unschedulable = ResourceInfo()
        gpu_schedulable = ResourceInfo()
        gpu_used = ResourceInfo()

        for node_name, node_status in nodes_status.iteritems():
            if node_status["unschedulable"]:
                # Unschedulable node: its whole capacity is unschedulable and
                # capacity minus used is added to the reserved total.
                gpu_unschedulable.Add(ResourceInfo(
                    node_status["gpu_capacity"]))
                gpu_reserved.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_capacity"]),
                        ResourceInfo(node_status["gpu_used"])))
            else:
                # Schedulable node: allocatable minus used is available, and
                # capacity minus allocatable is added to both the
                # unschedulable and the reserved totals.
                gpu_avaliable.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_allocatable"]),
                        ResourceInfo(node_status["gpu_used"])))
                gpu_schedulable.Add(ResourceInfo(node_status["gpu_capacity"]))
                gpu_unschedulable.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_capacity"]),
                        ResourceInfo(node_status["gpu_allocatable"])))
                gpu_reserved.Add(
                    ResourceInfo.Difference(
                        ResourceInfo(node_status["gpu_capacity"]),
                        ResourceInfo(node_status["gpu_allocatable"])))

            gpu_used.Add(ResourceInfo(node_status["gpu_used"]))
            gpu_capacity.Add(ResourceInfo(node_status["gpu_capacity"]))

        cluster_status["user_status"] = []
        for user_name, user_gpu in user_status.iteritems():
            cluster_status["user_status"].append({
                "userName":
                user_name,
                "userGPU":
                user_gpu.ToSerializable()
            })

        cluster_status["gpu_avaliable"] = gpu_avaliable.ToSerializable()
        cluster_status["gpu_capacity"] = gpu_capacity.ToSerializable()
        cluster_status["gpu_unschedulable"] = gpu_unschedulable.ToSerializable(
        )
        cluster_status["gpu_used"] = gpu_used.ToSerializable()
        cluster_status["gpu_reserved"] = gpu_reserved.ToSerializable()
        cluster_status["node_status"] = [
            node_status for node_name, node_status in nodes_status.iteritems()
        ]

    except Exception as e:
        # Collection is best effort: the traceback is logged and the
        # function continues with whatever cluster_status holds so far.
        logging.exception("get cluster status")

    dataHandler = DataHandler()
    cluster_status["AvaliableJobNum"] = dataHandler.GetActiveJobsCount()

    # Persist only when a previous snapshot exists in config and
    # check_cluster_status_change returns a truthy result for the pair.
    if "cluster_status" in config and check_cluster_status_change(
            config["cluster_status"], cluster_status):
        logging.info("updating the cluster status...")
        dataHandler.UpdateClusterStatus(cluster_status)
    else:
        logging.info(
            "nothing changed in cluster, skipping the cluster status update..."
        )

    # Cache an independent copy as the snapshot for the next comparison.
    config["cluster_status"] = copy.deepcopy(cluster_status)
    dataHandler.Close()
    return cluster_status
Beispiel #14
0
def launch_ps_dist_job(jobParams):
    """Prepare all pods of a distributed job and mark the job as running.

    Nothing happens unless the number of pods labelled ``run=<jobId>``
    equals ``numpsworker + numps`` and every pod reports ``"Running"``.
    Once that holds, the function, for every pod:

    * starts the ssh server,
    * overwrites ``/home/<user>/.ssh/config`` with one ``Host`` entry per
      pod (named ``<distRole>-<distRoleIdx>``),
    * copies a hostfile listing the non-``ps`` pods and their GPU request to
      ``/job/hostfile``,
    * touches ``/opt/run_dist_job``.

    Finally the job's ``jobStatus`` field is set to ``"running"``.

    Args:
        jobParams: mapping with at least ``"jobId"``, ``"numpsworker"``,
            ``"numps"`` and ``"userName"``; ``"hostNetwork"`` is optional.

    Returns:
        None.
    """
    job_id = jobParams["jobId"]
    pods = k8sUtils.GetPod("run=" + job_id)

    # if any pod is not up, return
    if "items" not in pods or len(pods["items"]) != (
            int(jobParams["numpsworker"]) + int(jobParams["numps"])):
        return
    pod_items = pods["items"]

    # if any pod is not ready, return
    pod_status = [k8sUtils.check_pod_status(pod) for pod in pod_items]
    if any(status != "Running" for status in pod_status):
        return

    user_name = getAlias(jobParams["userName"])
    host_network = bool(jobParams.get("hostNetwork"))

    # setup ssh server; an exception here aborts the launch
    for pod in pod_items:
        start_ssh_server(pod["metadata"]["name"], user_name, host_network,
                         pod["metadata"]["labels"]["distPort"])

    # generate ssh config: one Host entry per pod; the last template line
    # is 16 spaces of padding
    ssh_config = ("\n"
                  "Host %s\n"
                  "  HostName %s\n"
                  "  Port %s\n"
                  "  User %s\n"
                  "  StrictHostKeyChecking no\n"
                  "  UserKnownHostsFile /dev/null\n" + " " * 16)
    entries = []
    for pod in pod_items:
        pod_ip = pod["status"]["podIP"]
        labels = pod["metadata"]["labels"]
        dist_port = labels["distPort"]
        host_alias = labels["distRole"] + "-" + str(labels["distRoleIdx"])
        # TODO hostNetwork
        # With host networking sshd listens on the pod's distPort, otherwise
        # on the default port 22.
        port = str(dist_port) if host_network else 22
        entries.append(ssh_config % (host_alias, pod_ip, port, user_name) +
                       "\n")
    sshconfigstr = "".join(entries)

    # config ssh client; the script is the same for every pod
    bash_script = ("cat > /home/" + user_name + "/.ssh/config <<EOF " +
                   sshconfigstr + "\nEOF")
    for pod in pod_items:
        pod_name = pod["metadata"]["name"]
        print("override ssh client config: %s" % bash_script)
        # NOTE(review): the parts after ";" follow the exec arguments; where
        # they run depends on how kubectl_exec invokes kubectl - confirm.
        k8sUtils.kubectl_exec(
            "exec %s -- bash -c '%s' ; chown -R %s /home/%s/.ssh/config" %
            (pod_name, bash_script, user_name, user_name))

        # fix ~/.ssh/ folder permission
        k8sUtils.kubectl_exec(
            "exec %s -- chmod 600 -R /home/%s/.ssh; chmod 700 /home/%s/.ssh; chown -R %s /home/%s/.ssh/config"
            % (pod_name, user_name, user_name, user_name, user_name))

    # generate hostfile: one line per non-ps pod with its GPU request
    hostfile_lines = []
    for pod in pod_items:
        labels = pod["metadata"]["labels"]
        if labels["distRole"] == "ps":
            continue
        role_idx = labels["distRoleIdx"]
        worker_gpu_num = pod["spec"]["containers"][0]["resources"]["requests"][
            "nvidia.com/gpu"]
        hostfile_lines.append("%s  slots=%s\n" %
                              ("worker-" + str(role_idx), worker_gpu_num))
    tmp_hostfile = "/tmp/" + job_id + ".hostfile"
    with open(tmp_hostfile, 'w') as f:
        f.write("".join(hostfile_lines) + "\n")

    # write the hostfile
    for pod in pod_items:
        k8sUtils.kubectl_exec("cp %s %s:/job/hostfile" %
                              (tmp_hostfile, pod["metadata"]["name"]))

    for pod in pod_items:
        k8sUtils.kubectl_exec("exec %s touch /opt/run_dist_job" %
                              pod["metadata"]["name"])

    # update job status
    dataHandler = DataHandler()
    try:
        dataHandler.UpdateJobTextField(job_id, "jobStatus", "running")
    finally:
        # Release the handler even when the status update raises.
        dataHandler.Close()
Beispiel #15
0
def get_k8s_endpoint(endpoint_description_path):
    """Return ``kubectl get -o json -f <file>`` output for a description file.

    ``endpoint_description_path`` is joined onto
    ``config["storage-mount-path"]`` before it is handed to kubectl.
    """
    description_file = os.path.join(config["storage-mount-path"],
                                    endpoint_description_path)
    kubectl_args = "get -o json -f %s" % description_file
    return k8sUtils.kubectl_exec(kubectl_args)
Beispiel #16
0
def _deploy_dist_script(pod_name, script_dir, command, jobParams):
    """Write ``command`` to a local run_dist_job.sh and copy it into a pod."""
    os.system("mkdir -p %s" % script_dir)
    script_path = os.path.join(script_dir, "run_dist_job.sh")
    with open(script_path, 'w') as f:
        f.write(command + "\n")
    if "userId" in jobParams:
        os.system("chown -R %s %s" % (jobParams["userId"], script_path))
    k8sUtils.kubectl_exec("cp %s %s:/opt/run_dist_job.sh" %
                          (script_path, pod_name))
    # Touched only after the script has been copied into the pod.
    k8sUtils.kubectl_exec("exec %s touch /opt/run_dist_job" % pod_name)


def launch_ps_dist_job(jobParams):
    """Push the ps/worker launch scripts of a distributed job to its pods.

    Nothing happens unless the number of ``distRole=worker`` and
    ``distRole=ps`` pods of the job equals ``numpsworker`` and ``numps``
    and every pod reports ``"Running"``. Once that holds, each pod receives
    a ``/opt/run_dist_job.sh`` that runs ``jobParams["cmd"]`` with the
    ``--ps_hosts``, ``--worker_hosts``, ``--job_name`` and ``--task_index``
    arguments for its role, followed by a touch of ``/opt/run_dist_job``.
    Finally the job's ``jobStatus`` field is set to ``"running"``.

    Args:
        jobParams: mapping with at least ``"jobId"``, ``"numpsworker"``,
            ``"numps"`` and ``"cmd"``; ``"userId"`` is optional and, when
            present, becomes the owner of the generated script files.

    Returns:
        None.
    """
    jobId = jobParams["jobId"]
    workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId)
    psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId)

    # Wait until every expected pod exists ...
    if ("items" not in workerPodInfo
            or len(workerPodInfo["items"]) != int(jobParams["numpsworker"])
            or "items" not in psPodInfo
            or len(psPodInfo["items"]) != int(jobParams["numps"])):
        return
    worker_pods = workerPodInfo["items"]
    ps_pods = psPodInfo["items"]

    # ... and every one of them is running.
    podStatus = [
        k8sUtils.check_pod_status(pod) for pod in worker_pods + ps_pods
    ]
    if any(status != "Running" for status in podStatus):
        return

    # Host lists are "<podIP>:<distPort>" pairs joined by commas.
    ps_hosts = ",".join(
        "%s:%s" %
        (pod["status"]["podIP"], int(pod["metadata"]["labels"]["distPort"]))
        for pod in ps_pods)
    worker_hosts = ",".join(
        "%s:%s" %
        (pod["status"]["podIP"], int(pod["metadata"]["labels"]["distPort"]))
        for pod in worker_pods)

    cmd_template = ("%s --ps_hosts=%s --worker_hosts=%s --job_name=%s "
                    "--task_index=%d 2>&1 | tee %s")
    for role, role_pods in (("ps", ps_pods), ("worker", worker_pods)):
        for task_index, pod in enumerate(role_pods):
            # The same /tmp/<uuid> path is the tee target inside the command
            # and the local directory that holds the generated script.
            script_dir = "/tmp/" + str(uuid.uuid4())
            command = cmd_template % (jobParams["cmd"], ps_hosts,
                                      worker_hosts, role, task_index,
                                      script_dir)
            _deploy_dist_script(pod["metadata"]["name"], script_dir, command,
                                jobParams)

    dataHandler = DataHandler()
    try:
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "running")
    finally:
        # The handler was previously never closed in this function.
        dataHandler.Close()
def SubmitRegularJob(job):
    """Render, submit and record a regular (non-distributed) job.

    The job parameters are decoded from ``job["jobParams"]`` (base64-encoded
    JSON).  The function then:

    1. validates that ``jobPath`` and ``workPath`` are present and non-blank,
    2. creates the local job directory and, when a ``cmd`` string is given,
       writes it to a ``launch-<jobId>.sh`` script in that directory,
    3. fills in the host paths, mount points and per-pod settings, renders
       ``RegularJob.yaml.template`` once per pod (several pods when the four
       ``hyperparameter*`` keys are present) plus one ``KeyubeSvc``-style
       service description per interactive port,
    4. writes the combined description to disk and runs ``kubectl create``,
    5. when a ``cmd`` was given, copies the launch script into the pod and
       executes it there,
    6. records status, description and metadata through ``DataHandler``.

    Args:
        job: mapping-like object whose ``"jobParams"`` entry holds the
            base64-encoded JSON job parameters.

    Returns:
        ``False`` when ``jobPath`` or ``workPath`` is missing or blank.
        Otherwise a dict: on success it holds ``"output"`` (result of the
        last kubectl call) and ``"jobId"``; on failure it holds ``"error"``
        with the exception text.

    Note:
        This is Python 2 code (it relies on ``basestring``).
    """
    ret = {}
    # NOTE(review): this handler is never closed in this function; confirm
    # whether DataHandler needs an explicit Close() call.
    dataHandler = DataHandler()
    # Bound before the try block so the except handler can tell whether the
    # parameters were decoded at all.
    jobParams = None
    logging.info("start to submit regular job...")

    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]

        # jobPath and workPath are mandatory; dataPath is not validated here.
        for pathKey, pathLabel in (("jobPath", "job-path"),
                                   ("workPath", "work-path")):
            if pathKey not in jobParams or len(
                    jobParams[pathKey].strip()) == 0:
                dataHandler.SetJobError(
                    jobParams["jobId"],
                    "ERROR: %s does not exist" % pathLabel)
                logging.error("ERROR: %s does not exist. jobid: %s" %
                              (pathLabel, jobParams["jobId"]))
                return False

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)

        if not os.path.exists(localJobPath):
            # Directories are owned by the submitting user when known,
            # otherwise by uid "0".
            owner = jobParams["userId"] if "userId" in jobParams else "0"
            mkdirsAsUser(localJobPath, owner)
            mkdirsAsUser(os.path.join(localJobPath, "models"), owner)

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""

        # True when the user supplied a non-empty command string; decides
        # both whether a launch script is written and whether it is run.
        hasLaunchCmd = (isinstance(jobParams["cmd"], basestring)
                        and jobParams["cmd"] != "")

        if hasLaunchCmd:
            launchScriptPath = os.path.join(
                localJobPath, "launch-%s.sh" % jobParams["jobId"])

            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            logging.info("write cmd(%s) to file: %s" %
                         (jobParams["cmd"], launchScriptPath))

            if "userId" in jobParams:
                cmd = "chown -R %s %s" % (jobParams["userId"],
                                          launchScriptPath)
                os.system(cmd)
                logging.info(cmd)

            # TODO: the pod is meant to run the shell script once started,
            # which requires the script to already be in the job directory on
            # the node hosting the pod, e.g.
            # /dlwsdata/work/user-nanme/jobs/191225/6f81459e-42ea-447e-9380-f545da2517e9/
            # (that directory is mounted at /job/ after the pod starts):
            # jobParams["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"]
            # For now the container just sleeps and the script is copied in
            # and executed explicitly after submission (see below).
            jobParams["LaunchCMD"] = (
                '["/bin/sh", "-ec", "sleep 6000315360000"]')

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
        jobParams["jobNameLabel"] = ''.join(
            ch for ch in jobParams["jobName"] if ch.isalnum())
        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"],
                                                jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"],
                                                 workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"],
                                                 dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath

        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []

        # Derive a volume name from each user-supplied container path.
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace(
                "/", "").replace(".", "").replace("_", "-")

        # Built-in mounts: /job, /work and /data map to the matching host
        # paths; each is added only if CheckMountPoints accepts it.
        for mountName in ("job", "work", "data"):
            mp = {
                "name": mountName,
                "containerPath": "/" + mountName,
                "hostPath": jobParams["host%sPath" % mountName],
                "enabled": True
            }
            if CheckMountPoints(jobParams["mountpoints"], mp):
                jobParams["mountpoints"].append(mp)

        userAlias = getAlias(jobParams["userName"])

        # Read-only mount of the user's .ssh directory.
        mp = {
            "name": "sshkey",
            "containerPath": "/home/%s/.ssh" % userAlias,
            "hostPath": os.path.join(config["storage-mount-path"],
                                     GetWorkPath(userAlias) + "/.ssh"),
            "readOnly": True,
            "enabled": True
        }
        if CheckMountPoints(jobParams["mountpoints"], mp):
            jobParams["mountpoints"].append(mp)

        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        msg = ("Render Job: %s" % jobParams)
        print(msg)
        logging.info(msg)

        jobDescriptionList = []
        pods = []

        sweepKeys = ("hyperparametername", "hyperparameterstartvalue",
                     "hyperparameterendvalue", "hyperparameterstep")
        if all(key in jobParams for key in sweepKeys):
            # Hyperparameter sweep: one pod per value in [start, end].
            value = int(jobParams["hyperparameterstartvalue"])
            end = int(jobParams["hyperparameterendvalue"])
            step = int(jobParams["hyperparameterstep"])
            if step <= 0 and value <= end:
                # A non-positive step would never pass `end` and the loop
                # below would spin forever.
                raise ValueError(
                    "hyperparameterstep must be positive, got %d" % step)

            podIndex = 0
            while value <= end:
                pods.append({
                    "podName": jobParams["jobId"] + "-pod-" + str(podIndex),
                    "envs": [{
                        "name": jobParams["hyperparametername"],
                        "value": value
                    }]
                })
                value += step
                podIndex += 1
        else:
            pods.append({"podName": jobParams["jobId"], "envs": []})

        if "env" not in jobParams:
            jobParams["env"] = []

        jobParams["commonenv"] = copy.copy(jobParams["env"])

        useCustomScheduler = ("kube_custom_scheduler" in config
                              and config["kube_custom_scheduler"])
        # GPU count requested by the user.  Captured once (lazily, so the key
        # is only required when the custom scheduler is on) because
        # jobParams["resourcegpu"] is zeroed after the first pod is prepared;
        # re-reading it would annotate every later pod with 0 GPUs.
        requestedGpus = None

        for pod in pods:
            jobParams["podName"] = pod["podName"]
            jobParams["env"] = jobParams["commonenv"] + pod["envs"]

            if useCustomScheduler:
                if requestedGpus is None:
                    requestedGpus = int(jobParams["resourcegpu"])

                container = {
                    "requests": {
                        "alpha.gpu/numgpu": requestedGpus
                    }
                }
                podInfo = {"podname": jobParams["podName"]}
                if "useGPUTopology" in jobParams and jobParams[
                        "useGPUTopology"]:
                    # Explicit per-card topology constraints could be added
                    # to container["requests"] here for testing.
                    podInfo["requests"] = {
                        "alpha.gpu/gpu-generate-topology": 1
                    }
                else:
                    # for cases when desired topology is explicitly given or
                    # not desired
                    podInfo["requests"] = {
                        "alpha.gpu/gpu-generate-topology": 0
                    }
                podInfo["runningcontainer"] = {jobParams["podName"]: container}

                if "annotations" not in jobParams:
                    jobParams["annotations"] = {}
                jobParams["annotations"][
                    "pod.alpha/DeviceInformation"] = "'" + json.dumps(
                        podInfo) + "'"
                # gpu requests specified through annotation
                jobParams["resourcegpu"] = 0

            template = ENV.get_template(os.path.abspath(jobTemp))
            jobDescriptionList.append(template.render(job=jobParams))

            if ("interactivePort" in jobParams
                    and len(jobParams["interactivePort"].strip()) > 0):
                # Ports are separated by "," or ";"; non-numeric entries are
                # ignored.
                ports = [
                    p.strip()
                    for p in re.split(",|;", jobParams["interactivePort"])
                    if len(p.strip()) > 0 and p.strip().isdigit()
                ]
                for portNum in ports:
                    jobParams["serviceId"] = "interactive-" + jobParams[
                        "podName"] + "-" + portNum
                    jobParams["port"] = portNum
                    jobParams["port-name"] = "interactive"
                    jobParams["port-type"] = "TCP"

                    serviceTemplate = ENV.get_template(
                        os.path.join(jobTempDir, "KubeSvc.yaml.template"))
                    jobDescriptionList.append(
                        serviceTemplate.render(svc=jobParams))

        jobDescription = "\n---\n".join(jobDescriptionList)
        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])

        descriptionDir = os.path.dirname(os.path.realpath(jobDescriptionPath))
        if not os.path.exists(descriptionDir):
            os.makedirs(descriptionDir)

        # A description file left over from an earlier attempt means the
        # resources may already exist; delete them before re-creating.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath + " output: " +
                         str(output))

        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("kubectl create " + jobDescriptionPath + " output: " +
                     str(output))

        msg = "Submitted job %s to k8s, returned with status %s" % (
            jobParams["jobId"], output)
        logging.info(msg)

        msg = "JobParams: \n" + json.dumps(jobParams)
        logging.info(msg)

        if hasLaunchCmd:
            # Wait for the container to finish starting before copying files.
            time.sleep(15)
            launchFileName = "launch-%s.sh" % jobParams["jobId"]
            # NOTE(review): jobParams["podName"] is the last pod rendered, so
            # for a hyperparameter sweep only that pod receives and runs the
            # script.  All three commands now target the same pod; the exec
            # commands previously used jobId, which is not a pod name in the
            # sweep case.
            targetPod = jobParams["podName"]

            remoteCommands = (
                # Copy the script into <pod>:/tmp/.  /job/ needs root
                # permission, so it cannot be copied there directly.
                "cp %s %s:%s" % (launchScriptPath, targetPod, "/tmp/"),
                # Make /tmp/launch-<jobId>.sh executable.
                "exec %s -- bash -c \"chmod 777 /tmp/%s\"" %
                (targetPod, launchFileName),
                # Run /tmp/launch-<jobId>.sh.
                "exec %s -- bash -c \"/tmp/%s\"" %
                (targetPod, launchFileName),
            )
            for remotecmd in remoteCommands:
                output = k8sUtils.kubectl_exec(remotecmd)
                logging.info("remotecmd[" + remotecmd + "]" + " output[" +
                             str(output) + "]")

        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {
            "jobDescriptionPath": jobParams["jobDescriptionPath"],
            "jobPath": jobParams["jobPath"],
            "workPath": jobParams["workPath"],
            "LaunchCMD": jobParams["LaunchCMD"],
        }

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)

        msg = "update job text field %s, returned with status" % (
            jobParams["jobId"])
        logging.info(msg)

    except Exception as e:
        print(e)
        logging.exception("failed to submit regular job")
        ret["error"] = str(e)

        # If the parameters could not be decoded (or carry no jobId) there is
        # no job to book the retry against; report the error instead of
        # raising a second exception from inside this handler.
        if not isinstance(jobParams, dict) or "jobId" not in jobParams:
            logging.error(
                "cannot record submit failure: job id is unavailable")
            return ret

        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])

        # Give up and mark the job as failed after 5 attempts.
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret