def _extract_job_log(jobId, logPath, userId):
    """Incrementally pull new pod logs for *jobId* and append them to disk.

    Reads the last log cursor from the job record, fetches only log text
    produced since that cursor via GetJobLog, appends one file per pod
    ("log-pod-<name>.txt") under the job's log directory, chowns the files
    to *userId*, and persists the new cursor back to the job record.

    All failures are logged and swallowed; the DataHandler connection is
    always closed.
    """
    dataHandler = None
    try:
        dataHandler = DataHandler()
        old_cursor = dataHandler.GetJobTextField(jobId, "jobLogCursor")
        # An empty-string cursor means "no cursor yet" - normalize to None.
        if old_cursor is not None and len(old_cursor) == 0:
            old_cursor = None
        (pod_logs, new_cursor) = GetJobLog(jobId, cursor=old_cursor)

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)

        for (pod_name, log_text) in pod_logs.items():
            try:
                podLogPath = os.path.join(jobLogDir,
                                          "log-pod-" + pod_name + ".txt")
                # Append mode: earlier chunks of this pod's log are already
                # on disk; we only ever add the newly fetched tail.
                with open(podLogPath, 'a', encoding="utf-8") as f:
                    f.write(log_text)
                # Best-effort ownership fix so the job owner can read the log.
                os.system("chown -R %s %s" % (userId, podLogPath))
            except Exception:
                logger.exception("write pod log of %s failed", jobId)

        # Fix: use the module logger consistently (was logging.info) with
        # lazy %-style arguments.
        logger.info("cursor of job %s: %s", jobId, new_cursor)
        if new_cursor is not None:
            dataHandler.UpdateJobTextFields({"jobId": jobId},
                                            {"jobLogCursor": new_cursor})
    except Exception:
        # Fix: was logging.error(e), which dropped the traceback.
        logger.exception("extracting log of job %s failed", jobId)
    finally:
        if dataHandler is not None:
            dataHandler.Close()
def extract_job_log(jobId, logPath, userId):
    """Dump full pod logs for *jobId* into paginated files via save_log.

    Concatenates every container's log (plus the jupyter info blob) with
    banner separators, then stores it either as a single page (<= 2000
    lines) or split into 2000-line pages with a "max_page" marker file.
    """
    dataHandler = None
    try:
        dataHandler = DataHandler()
        # logs = k8sUtils.GetLog(jobId)
        # logs = k8sUtils.getJobConsoleDetail(jobId)
        jupyterLog = k8sUtils.getJupyterInfo(jobId)
        # TODO: Replace joblog manager with elastic search
        logs = k8sUtils.GetLog(jobId, tail=None)

        # Do not overwrite existing logs with an empty log.
        # DLTS bootstrap will generate logs for all containers;
        # if one container has an empty log, skip writing entirely.
        if not logs:
            return
        for log in logs:
            if "containerLog" in log and log["containerLog"] == "":
                return

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)

        sep = "=========================================================\n"
        logStr = ""
        for log in logs:
            if "podName" in log and "containerID" in log and "containerLog" in log:
                logStr += sep * 3
                logStr += " logs from pod: %s\n" % log["podName"]
                logStr += sep * 3
                logStr += log["containerLog"]
                logStr += jupyterLog
                logStr += "\n\n\n"
                logStr += sep
                logStr += " end of logs from pod: %s\n" % log["podName"]
                logStr += sep
                logStr += "\n\n\n"

        logLines = logStr.split('\n')
        length = len(logLines)
        if len(logStr.strip()) > 0:
            if length <= 2000:
                # Log now fits in a single page: drop any stale paginated
                # directory before writing (save_log presumably recreates
                # the directory - TODO confirm).
                if os.path.exists(os.path.join(jobLogDir, "max_page")):
                    os.system("rm -rf %s" % (jobLogDir))
                save_log(jobLogDir, str(jobId), userId, logStr)
            else:
                with open(os.path.join(jobLogDir, "max_page"), 'w') as f:
                    f.write(str(length // 2000 + 1))
                for i in range(1, length // 2000 + 2):
                    page = "\n".join(logLines[(i - 1) * 2000:i * 2000])
                    save_log(jobLogDir, str(jobId), userId, page, i)
    except Exception:
        # Fix: was logger.error(e), which dropped the traceback.
        logger.exception("extract log of job %s failed", jobId)
    finally:
        # Fix: the DataHandler connection was created but never closed.
        if dataHandler is not None:
            dataHandler.Close()
def generate_launch_script(job_id, path_to_save, user_id, gpu_num, user_script):
    """Write *user_script* to <path_to_save>/job_command.sh and return the
    container launch command.

    job_id and gpu_num are currently unused here but kept for interface
    compatibility with existing callers.
    """
    if not os.path.exists(path_to_save):
        mkdirsAsUser(path_to_save, user_id)
    file_name = "job_command.sh"
    launch_script_file = os.path.join(path_to_save, file_name)
    with open(launch_script_file, 'w') as f:
        f.write(user_script)
    # Best-effort: hand ownership of the script to the job's user.
    os.system("sudo chown %s %s" % (user_id, launch_script_file))
    # Fix: typo "luanch_cmd" -> "launch_cmd" (local name only).
    launch_cmd = ["bash", "/pod/scripts/bootstrap.sh"]
    return launch_cmd
def generate_launch_script(dist_role, dist_role_idx, user_id, job_path, cmd):
    """Write *cmd* to the per-replica job_command.sh of one distributed-job
    role instance and return the container launch command.

    The script lands under <storage-mount-path>/work/<job_path>/<role>-<idx>.

    NOTE (from original author): ssh folder permissions are changed
    elsewhere because the setup-permission script in launch_ps_job may race
    with init_user.sh, resulting in a "no such user" error.
    """
    local_pod_path = os.path.join(config["storage-mount-path"], "work/",
                                  job_path,
                                  "{}-{}".format(dist_role, dist_role_idx))
    if not os.path.exists(local_pod_path):
        mkdirsAsUser(local_pod_path, user_id)
    file_name = "job_command.sh"
    launch_script_file = os.path.join(local_pod_path, file_name)
    # Fix: removed the explicit f.close() inside the with-block - the
    # context manager already closes the file.
    with open(launch_script_file, 'w') as f:
        f.write(cmd)
    launchCMD = ["bash", "/pod/scripts/bootstrap.sh"]
    return launchCMD
def SubmitRegularJob(job):
    """Submit a single-node (regular) job to Kubernetes.

    Decodes the base64/JSON parameters in job["jobParams"], validates the
    job/work paths, writes the user's command into a launch script on
    shared storage, renders the RegularJob YAML template (plus one KubeSvc
    template per interactive port), applies it with kubectl, and records
    status and metadata through DataHandler.

    Returns a dict with "output" and "jobId" on success, "error" on
    failure, or False when a required path is missing.
    """
    ret = {}
    dataHandler = DataHandler()
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        # Per-job PVC names, derived from the job id.
        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]

        # Required storage paths; mark the job errored if missing.
        if "jobPath" not in jobParams or len(
                jobParams["jobPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: job-path does not exist")
            return False
        if "workPath" not in jobParams or len(
                jobParams["workPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: work-path does not exist")
            return False
        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
        #    dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #    return False
        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])

        # Create the job directory (and models/) owned by the submitting user.
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        if not os.path.exists(localJobPath):
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath, jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath, "models"),
                             jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath, "0")
                mkdirsAsUser(os.path.join(localJobPath, "models"), "0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""
        # Write the user's command into a launch script inside the job folder.
        if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(
                localJobPath, "launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            f.close()  # redundant: "with" already closed the file
            if "userId" in jobParams:
                os.system("chown -R %s %s" % (jobParams["userId"],
                                              launchScriptPath))
            # JSON-array string consumed by the pod template as the command.
            jobParams[
                "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams[
                    "jobId"]

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
        # k8s label values: keep alphanumerics only.
        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"]
                                            if e.isalnum())
        ENV = Environment(loader=FileSystemLoader("/"))
        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        # Host-side paths mounted into the container.
        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"],
                                                jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"],
                                                 workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"],
                                                 dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath
        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        # Normalize user-supplied mounts, then add the standard ones.
        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/", "")
        jobParams["mountpoints"].append({
            "name": "nvidia-driver",
            "containerPath": "/usr/local/nvidia",
            "hostPath": nvidiaDriverPath
        })
        jobParams["mountpoints"].append({
            "name": "job",
            "containerPath": "/job",
            "hostPath": jobParams["hostjobPath"]
        })
        jobParams["mountpoints"].append({
            "name": "work",
            "containerPath": "/work",
            "hostPath": jobParams["hostworkPath"]
        })
        jobParams["mountpoints"].append({
            "name": "data",
            "containerPath": "/data",
            "hostPath": jobParams["hostdataPath"]
        })
        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print("Render Job: %s" % jobParams)
        template = ENV.get_template(os.path.abspath(jobTemp))
        job_description = template.render(job=jobParams)
        jobDescriptionList = []
        jobDescriptionList.append(job_description)

        # One service definition per requested interactive port.
        if ("interactivePort" in jobParams
                and len(jobParams["interactivePort"].strip()) > 0):
            ports = [
                p.strip()
                for p in re.split(",|;", jobParams["interactivePort"])
                if len(p.strip()) > 0 and p.strip().isdigit()
            ]
            for portNum in ports:
                jobParams["serviceId"] = "interactive-" + jobParams[
                    "jobId"] + "-" + portNum
                jobParams["port"] = portNum
                jobParams["port-name"] = "interactive"
                jobParams["port-type"] = "TCP"
                serviceTemplate = ENV.get_template(
                    os.path.join(jobTempDir, "KubeSvc.yaml.template"))
                # NOTE(review): a Template object is passed back into
                # get_template here; this relies on jinja2 returning
                # Template objects unchanged - confirm against the jinja2
                # version in use.
                template = ENV.get_template(serviceTemplate)
                interactiveMeta = template.render(svc=jobParams)
                jobDescriptionList.append(interactiveMeta)

        jobDescription = "\n---\n".join(jobDescriptionList)
        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        # Re-submission: tear down the previous deployment first.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)
        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("Submitted job %s to k8s, returned with status %s" %
                     (job["jobId"], output))

        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]
        if "userName" not in jobParams:
            jobParams["userName"] = ""

        # Persist status, the rendered description, and a compact job-meta
        # blob (base64 of JSON) for later restart/cleanup.
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))
        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        # Mark the job errored only after repeated submission failures.
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    return ret
def SubmitPSDistJob(job):
    """Submit a parameter-server style distributed job to Kubernetes.

    Expands the job into one pod spec per "ps" / "worker" replica (counts
    from numps / numpsworker). Each replica gets its own sub-folder of the
    job path and a launch script that blocks until /opt/run_dist_job
    appears, then the DistJob template is rendered per replica, the
    combined YAML is applied with kubectl, and status/metadata are
    recorded through DataHandler.

    Returns a dict with "output" and "jobId" on success, "error" on
    failure, or False when a required path is missing.
    """
    ret = {}
    dataHandler = DataHandler()
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        # Pin all replicas of the job to one randomly chosen rack, if any.
        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])
        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    # Shallow copy: replicas share nested structures of
                    # jobParams (e.g. the mountpoints list).
                    distJobParam = copy.copy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role
                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False
                    # Each replica works in its own sub-folder of the job path.
                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])
                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False
                    if "dataPath" not in distJobParam or len(
                            distJobParam["dataPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: data-path does not exist")
                        return False
                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"],
                        distJobParam["workPath"], distJobParam["dataPath"])
                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            mkdirsAsUser(localJobPath, 0)
                    distJobParam["LaunchCMD"] = ""
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""
                    ################One choice is that we only wait for certain time.
                    # launchCMD = """
                    ##!/bin/bash
                    #mkdir -p /opt
                    #echo "[DLWorkspace System]: Waiting for all containers are ready..."
                    ## wait for at most 10 mins.
                    #for i in {1..200}; do
                    #    if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
                    #        sleep 3
                    #    else
                    #        break
                    #    fi
                    #done
                    #if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
                    #    echo "[DLWorkspace System]: Waiting for containers: timeout! Restarting..."
                    #    exit 1
                    #else
                    #    echo "[DLWorkspace System]: All containers are ready, launching training job..."
                    #    chmod +x /opt/run_dist_job.sh
                    #    /opt/run_dist_job.sh
                    #fi
                    #"""

                    # Block forever until the dist-job control files appear,
                    # then run the generated per-job script.
                    launchCMD = """
#!/bin/bash
mkdir -p /opt
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; do
sleep 3
done
echo "[DLWorkspace System]: All containers are ready, launching training job..."
chmod +x /opt/run_dist_job.sh
/opt/run_dist_job.sh
"""

                    launchScriptPath = os.path.join(
                        localJobPath, "launch-%s.sh" % distJobParam["jobId"])
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)
                    f.close()  # redundant: "with" already closed the file
                    # JSON-array string consumed by the pod template.
                    distJobParam[
                        "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % distJobParam[
                            "jobId"]

                    # k8s label values: keep alphanumerics only.
                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())
                    distJobParam["userNameLabel"] = getAlias(
                        jobParams["userName"])
                    ENV = Environment(loader=FileSystemLoader("/"))
                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")
                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath
                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []
                    distJobParam["mountpoints"].append({
                        "name": "nvidia-driver",
                        "containerPath": "/usr/local/nvidia",
                        "hostPath": nvidiaDriverPath
                    })
                    distJobParam["mountpoints"].append({
                        "name": "job",
                        "containerPath": "/job",
                        "hostPath": distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name": "work",
                        "containerPath": "/work",
                        "hostPath": distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name": "data",
                        "containerPath": "/data",
                        "hostPath": distJobParam["hostdataPath"]
                    })
                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config and config[
                            "usefreeflow"] == "True":
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False
                    # Random high port (3000-3999) for inter-container
                    # communication; seeding per replica with the wall clock.
                    random.seed(datetime.datetime.now())
                    distJobParam["containerPort"] = int(random.random() *
                                                        1000 + 3000)
                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack
                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)
                    jobDescriptionList.append(job_description)
                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams[
                    "jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        # NOTE(review): for training types other than PSDistJob the names
        # jobDescription / jobParams["jobDescriptionPath"] are never set,
        # so the code below raises and falls into the except branch.
        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        # Re-submission: tear down the previous deployment first.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)
        output = k8sUtils.kubectl_create(jobDescriptionPath)

        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]
        if "userName" not in jobParams:
            jobParams["userName"] = ""

        # Persist status, the rendered description, and a job-meta blob
        # (base64 of JSON, including the per-replica parameters).
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))
        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]
        jobMeta["distJobParams"] = distJobParams
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        # Mark the job errored only after repeated submission failures.
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    return ret
def extract_job_log(jobId, logPath, userId):
    """Write full and trimmed pod logs for *jobId*.

    Fetches all container logs, writes the full concatenation to *logPath*
    plus one file per container ("log-container-<id>.txt"), and stores a
    trimmed version (last 2000 lines per pod) in the job's "jobLog" DB
    field for display in the webpage.
    """
    dataHandler = None
    try:
        dataHandler = DataHandler()
        # TODO: Replace joblog manager with elastic search
        logs = k8sUtils.GetLog(jobId, tail=None)

        # Do not overwrite existing logs with an empty log.
        # DLTS bootstrap will generate logs for all containers;
        # if one container has an empty log, skip writing entirely.
        for log in logs:
            if "containerLog" in log and log["containerLog"] == "":
                return

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)

        sep = "=========================================================\n"
        logStr = ""
        trimlogstr = ""
        for log in logs:
            if "podName" in log and "containerID" in log and "containerLog" in log:
                # Full log: banner, complete container log, footer.
                logStr += sep * 3
                logStr += " logs from pod: %s\n" % log["podName"]
                logStr += sep * 3
                logStr += log["containerLog"]
                logStr += "\n\n\n"
                logStr += sep
                logStr += " end of logs from pod: %s\n" % log["podName"]
                logStr += sep
                logStr += "\n\n\n"

                # Trimmed log: cap each pod at its last 2000 lines.
                trimlogstr += sep * 3
                trimlogstr += " logs from pod: %s\n" % log["podName"]
                trimlogstr += sep * 3
                logLines = log["containerLog"].split('\n')
                if len(logLines) < 3000:
                    trimlogstr += log["containerLog"]
                    trimlogstr += "\n\n\n"
                    trimlogstr += sep
                    trimlogstr += " end of logs from pod: %s\n" % log["podName"]
                    trimlogstr += sep
                    trimlogstr += "\n\n\n"
                else:
                    trimlogstr += "\n".join(logLines[-2000:])
                    trimlogstr += "\n\n\n"
                    trimlogstr += sep
                    trimlogstr += " end of logs from pod: %s\n" % log["podName"]
                    trimlogstr += " Note: the log is too long to display in the webpage.\n"
                    trimlogstr += " Only the last 2000 lines are shown here.\n"
                    trimlogstr += " Please check the log file (in Job Folder) for the full logs.\n"
                    trimlogstr += sep
                    trimlogstr += "\n\n\n"

                # Per-container log file is best-effort; keep going on failure.
                try:
                    containerLogPath = os.path.join(
                        jobLogDir,
                        "log-container-" + log["containerID"] + ".txt")
                    # Fix: removed redundant f.close() inside the with-block.
                    with open(containerLogPath, 'w') as f:
                        f.write(log["containerLog"])
                    os.system("chown -R %s %s" % (userId, containerLogPath))
                except Exception:
                    logger.exception("write container log failed")

        if len(trimlogstr.strip()) > 0:
            dataHandler.UpdateJobTextField(jobId, "jobLog",
                                           base64.b64encode(trimlogstr))
            with open(logPath, 'w') as f:
                f.write(logStr)
            os.system("chown -R %s %s" % (userId, logPath))
    except Exception:
        # Fix: was logger.error(e), which dropped the traceback.
        logger.exception("extract log of job %s failed", jobId)
    finally:
        # Fix: the DataHandler connection was never closed.
        if dataHandler is not None:
            dataHandler.Close()
def SubmitRegularJob(job):
    """Submit a regular job to Kubernetes (hyperparameter-sweep variant).

    Decodes the base64/JSON parameters in job["jobParams"], validates the
    job/work paths, writes the user's command into a launch script, builds
    one pod per hyperparameter value (or a single pod), optionally attaches
    custom-scheduler GPU-topology annotations, renders the RegularJob and
    KubeSvc templates, applies the combined YAML with kubectl, and records
    status/metadata through DataHandler.

    Returns a dict with "output" and "jobId" on success, "error" on
    failure, or False when a required path is missing.
    """
    ret = {}
    dataHandler = DataHandler()
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        # Per-job PVC names, derived from the job id.
        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]

        # Required storage paths; mark the job errored if missing.
        if "jobPath" not in jobParams or len(jobParams["jobPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: job-path does not exist")
            return False
        if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: work-path does not exist")
            return False
        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
        #    dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #    return False
        jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"])

        # Create the job directory (and models/) owned by the submitting user.
        localJobPath = os.path.join(config["storage-mount-path"],jobPath)
        if not os.path.exists(localJobPath):
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath,jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath,"models"),jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath,"0")
                mkdirsAsUser(os.path.join(localJobPath,"models"),"0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""
        # Write the user's command into a launch script inside the job folder.
        if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(localJobPath,"launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            f.close()  # redundant: "with" already closed the file
            if "userId" in jobParams:
                os.system("chown -R %s %s" % (jobParams["userId"], launchScriptPath))
            # JSON-array string consumed by the pod template as the command.
            jobParams["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"]

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
        # k8s label values: keep alphanumerics only.
        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"] if e.isalnum())
        ENV = Environment(loader=FileSystemLoader("/"))
        jobTempDir = os.path.join(config["root-path"],"Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        # Host-side paths mounted into the container.
        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath
        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        # Normalize user-supplied mounts, then add the standard ones.
        # CheckMountPoints guards against duplicates before each append.
        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/","")
        mp = {"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)
        mp = {"name":"job","containerPath":"/job","hostPath":jobParams["hostjobPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)
        mp = {"name":"work","containerPath":"/work","hostPath":jobParams["hostworkPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)
        mp = {"name":"data","containerPath":"/data","hostPath":jobParams["hostdataPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)
        # Mount the user's ssh keys read-only into the container home.
        userAlias = getAlias(jobParams["userName"])
        mp = {"name":"sshkey","containerPath":"/home/%s/.ssh" % userAlias,"hostPath":os.path.join(config["storage-mount-path"], GetWorkPath(userAlias)+"/.ssh"), "readOnly":True, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False
        print ("Render Job: %s" % jobParams)

        # Build the pod list: one pod per hyperparameter value when a sweep
        # is requested, otherwise a single pod named after the job.
        jobDescriptionList = []
        pods = []
        if "hyperparametername" in jobParams and "hyperparameterstartvalue" in jobParams and "hyperparameterendvalue" in jobParams and "hyperparameterstep" in jobParams:
            i = int(jobParams["hyperparameterstartvalue"])
            end = int(jobParams["hyperparameterendvalue"])
            step = int(jobParams["hyperparameterstep"])
            c = 0
            while (i <= end):
                pod = {}
                pod["podName"] = jobParams["jobId"]+"-pod-"+str(c)
                # The swept value is injected as an environment variable.
                pod["envs"] = [{"name":jobParams["hyperparametername"],"value":i}]
                i += step
                c += 1
                pods.append(pod)
        else:
            pod = {}
            pod["podName"] = jobParams["jobId"]
            pod["envs"] = []
            pods.append(pod)

        if "env" not in jobParams:
            jobParams["env"] = []
        jobParams["commonenv"] = copy.copy(jobParams["env"])

        for pod in pods:
            jobParams["podName"] = pod["podName"]
            jobParams["env"] = jobParams["commonenv"] + pod["envs"]
            # Custom GPU scheduler: express GPU requests via a pod
            # annotation instead of the resource field.
            if "kube_custom_scheduler" in config and config["kube_custom_scheduler"]:
                container = {}
                container["requests"] = {"alpha.gpu/numgpu" : int(jobParams["resourcegpu"])}
                podInfo = {}
                podInfo["podname"] = jobParams["podName"]
                if "useGPUTopology" in jobParams and jobParams["useGPUTopology"]:
                    # add topology constraints explicitly - for testing
                    # if (jobParams["resourcegpu"] >= 2):
                    #     # both cards in same inner group
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/0/cards"] = 1
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/1/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 3):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/2/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 4):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/3/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 5):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/4/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 6):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/5/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 7):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/6/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 8):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/7/cards"] = 1
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 1}
                else:
                    # for cases when desired topology is explictly given or not desired
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 0}
                podInfo["runningcontainer"] = {jobParams["podName"] : container}
                if "annotations" not in jobParams:
                    jobParams["annotations"] = {}
                jobParams["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'"
                jobParams["resourcegpu"] = 0 # gpu requests specified through annotation

            template = ENV.get_template(os.path.abspath(jobTemp))
            job_description = template.render(job=jobParams)
            jobDescriptionList.append(job_description)

            # One service definition per requested interactive port, per pod.
            if ("interactivePort" in jobParams and len(jobParams["interactivePort"].strip()) > 0):
                ports = [p.strip() for p in re.split(",|;",jobParams["interactivePort"]) if len(p.strip()) > 0 and p.strip().isdigit()]
                for portNum in ports:
                    jobParams["serviceId"] = "interactive-" + jobParams["podName"] + "-" + portNum
                    jobParams["port"] = portNum
                    jobParams["port-name"] = "interactive"
                    jobParams["port-type"] = "TCP"
                    serviceTemplate = ENV.get_template(os.path.join(jobTempDir,"KubeSvc.yaml.template"))
                    # NOTE(review): a Template object is passed back into
                    # get_template here; this relies on jinja2 returning
                    # Template objects unchanged - confirm against the
                    # jinja2 version in use.
                    stemplate = ENV.get_template(serviceTemplate)
                    interactiveMeta = stemplate.render(svc=jobParams)
                    jobDescriptionList.append(interactiveMeta)

        jobDescription = "\n---\n".join(jobDescriptionList)
        jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"])
        if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        # Re-submission: tear down the previous deployment first.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)
        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("Submitted job %s to k8s, returned with status %s" %(job["jobId"], output))

        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]
        if "userName" not in jobParams:
            jobParams["userName"] = ""

        # Persist status, the rendered description, and a compact job-meta
        # blob (base64 of JSON) for later restart/cleanup.
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription))
        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        # Mark the job errored only after repeated submission failures.
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error")
            dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e))
    return ret
def extract_job_log(jobId, logPath, userId):
    """Write full and trimmed pod logs for *jobId* (legacy variant).

    Fetches container logs via k8sUtils.GetLog(jobId), writes the full
    concatenation to *logPath* plus one file per container, and stores a
    trimmed version (last 2000 lines per pod) in the job's "jobLog" DB
    field for display in the webpage.
    """
    dataHandler = None
    try:
        dataHandler = DataHandler()
        logs = k8sUtils.GetLog(jobId)

        jobLogDir = os.path.dirname(logPath)
        if not os.path.exists(jobLogDir):
            mkdirsAsUser(jobLogDir, userId)

        sep = "=========================================================\n"
        logStr = ""
        trimlogstr = ""
        for log in logs:
            if "podName" in log and "containerID" in log and "containerLog" in log:
                # Full log: banner, complete container log, footer.
                logStr += sep * 3
                logStr += " logs from pod: %s\n" % log["podName"]
                logStr += sep * 3
                logStr += log["containerLog"]
                logStr += "\n\n\n"
                logStr += sep
                logStr += " end of logs from pod: %s\n" % log["podName"]
                logStr += sep
                logStr += "\n\n\n"

                # Trimmed log: cap each pod at its last 2000 lines.
                trimlogstr += sep * 3
                trimlogstr += " logs from pod: %s\n" % log["podName"]
                trimlogstr += sep * 3
                logLines = log["containerLog"].split('\n')
                if len(logLines) < 3000:
                    trimlogstr += log["containerLog"]
                    trimlogstr += "\n\n\n"
                    trimlogstr += sep
                    trimlogstr += " end of logs from pod: %s\n" % log["podName"]
                    trimlogstr += sep
                    trimlogstr += "\n\n\n"
                else:
                    trimlogstr += "\n".join(logLines[-2000:])
                    trimlogstr += "\n\n\n"
                    trimlogstr += sep
                    trimlogstr += " end of logs from pod: %s\n" % log["podName"]
                    trimlogstr += " Note: the log is too long to display in the webpage.\n"
                    trimlogstr += " Only the last 2000 lines are shown here.\n"
                    trimlogstr += " Please check the log file (in Job Folder) for the full logs.\n"
                    trimlogstr += sep
                    trimlogstr += "\n\n\n"

                # Per-container log file is best-effort; keep going on failure.
                try:
                    containerLogPath = os.path.join(
                        jobLogDir,
                        "log-container-" + log["containerID"] + ".txt")
                    # Fix: removed redundant f.close() inside the with-block.
                    with open(containerLogPath, 'w') as f:
                        f.write(log["containerLog"])
                    os.system("chown -R %s %s" % (userId, containerLogPath))
                except Exception:
                    # Fix: was "print e", which bypassed the logging setup.
                    logging.exception("write container log failed")

        if len(trimlogstr.strip()) > 0:
            dataHandler.UpdateJobTextField(jobId, "jobLog",
                                           base64.b64encode(trimlogstr))
            with open(logPath, 'w') as f:
                f.write(logStr)
            os.system("chown -R %s %s" % (userId, logPath))
    except Exception:
        # Fix: was logging.error(e), which dropped the traceback.
        logging.exception("extract log of job %s failed", jobId)
    finally:
        # Fix: the DataHandler connection was never closed.
        if dataHandler is not None:
            dataHandler.Close()
def SubmitPSDistJob(job):
    """Submit a parameter-server style distributed job to Kubernetes.

    Decodes the base64/JSON ``jobParams`` payload of ``job``, expands one pod
    description per "ps" and per "worker" replica from the
    Jobs_Templete/DistJob.yaml.template Jinja template, writes a per-replica
    launch script into the job folder, kubectl-creates the combined YAML and
    records status/metadata through DataHandler.

    Returns a dict with "output" and "jobId" on success, a dict with "error"
    set when an exception occurred, or False when job-path/work-path is
    missing from the decoded params.
    """
    ret = {}
    dataHandler = DataHandler()
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]

        # Fully expanded per-replica parameter dicts, keyed by role; persisted
        # in jobMeta at the end so the job can later be inspected/killed.
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []

        # Pin the whole job to one randomly chosen rack when racks exist.
        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])

        userAlias = getAlias(jobParams["userName"])
        jobParams["user_email"] = jobParams["userName"]
        jobParams["homeFolderHostpath"] = os.path.join(
            config["storage-mount-path"], GetWorkPath(userAlias))

        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    # Each replica gets its own deep copy of the job params,
                    # specialized with a distId like "ps0" / "worker1".
                    distJobParam = copy.deepcopy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role
                    distJobParam["distRoleIdx"] = i

                    # NOTE(review): these early exits return False while the
                    # success path returns a dict — callers must handle both.
                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False
                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False
                    #if "dataPath" not in distJobParam or len(distJobParam["dataPath"].strip()) == 0:
                    #    dataHandler.SetJobError(distJobParam["jobId"],"ERROR: data-path does not exist")
                    #    return False

                    # Per-replica job directory: <jobPath>/<distId>.
                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])
                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"], distJobParam["workPath"],
                        distJobParam["dataPath"])
                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            # TODO ??? falls back to uid 0 (root-owned dir)
                            mkdirsAsUser(localJobPath, 0)

                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""

                    #change ssh folder permission here because the setup permission script in launch_ps_job function may have race condition with init_user.sh script. results in no such user error
                    if role == "ps":
                        # The ps replica blocks until /opt/run_dist_job
                        # appears, fixes ssh permissions, then waits for every
                        # worker's WORKER_READY marker before running the
                        # user's command (last %s).
                        launchCMD = """
#!/bin/bash
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ]; do
    sleep 3
done
sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh &>/dev/null;
sudo mkdir -p /root/.ssh &>/dev/null ;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;
sudo mkdir -p /opt &>/dev/null;
sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;

JOB_DIR='/home/%s'
WORKER_NUM=%s
echo $JOB_DIR $WORKER_NUM

all_workers_ready=false
while [ "$all_workers_ready" != true ]
do
    # update it to false if any woker is not ready
    all_workers_ready=true
    for i in $(seq 0 $(( ${WORKER_NUM} - 1)) )
    do
        worker="worker${i}"
        file="$JOB_DIR/${worker}/WORKER_READY"
        #echo $file
        if [ ! -f $file ]; then
            echo "${worker} not ready!"
            all_workers_ready=false
            sleep 10
        fi
    done
done
echo "[DLWorkspace System]: All containers are ready, launching training job..."
%s
""" % (userAlias, userAlias, userAlias, userAlias, userAlias,
                            distJobParam["jobPath"], jobParams["numpsworker"],
                            distJobParam["cmd"])
                    else:
                        # Workers only fix ssh permissions, advertise
                        # readiness via /job/WORKER_READY, and idle forever;
                        # the ps replica drives the actual training run.
                        launchCMD = """
while [ ! -f /opt/run_dist_job ]; do
    sleep 3
done
sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh &>/dev/null;
sudo mkdir -p /root/.ssh &>/dev/null;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;
sudo mkdir -p /opt && sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;
# TODO mark the worker as 'READY', better to change to '/pod/READY' later
sudo touch /job/WORKER_READY
sleep infinity
""" % (userAlias, userAlias, userAlias, userAlias, userAlias)

                    launchScriptPath = os.path.join(
                        localJobPath,
                        "launch-%s-%s%d.sh" % (distJobParam["jobId"], role, i))
                    # TODO need to set up user for distribute jobs
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)
                        f.close()
                    launchScriptInContainer = "bash /job/launch-%s-%s%d.sh" % (
                        distJobParam["jobId"], role, i)
                    # Container entrypoint: run init_user.sh, then the launch
                    # script as the job owner.
                    distJobParam[
                        "LaunchCMD"] = '["bash", "-c", "bash /dlws/init_user.sh &> /job/init_user_script.log && runuser -l ${DLWS_USER_NAME} -c \'%s\'"]' % launchScriptInContainer
                    # Strip to alphanumerics for use as a k8s label value.
                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())

                    ENV = Environment(loader=FileSystemLoader("/"))
                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")

                    # Host-side paths backing the /job, /work and /data mounts.
                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath
                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []
                    # distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath})
                    distJobParam["mountpoints"].append({
                        "name": "job",
                        "containerPath": "/job",
                        "hostPath": distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name": "work",
                        "containerPath": "/work",
                        "hostPath": distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name": "data",
                        "containerPath": "/data",
                        "hostPath": distJobParam["hostdataPath"]
                    })
                    # Any caller-supplied mountpoint without a name gets a
                    # random one (volume names must be unique in the pod spec).
                    for idx in range(len(distJobParam["mountpoints"])):
                        if "name" not in distJobParam["mountpoints"][idx]:
                            distJobParam["mountpoints"][idx]["name"] = str(
                                uuid.uuid4()).replace("-", "")

                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config:
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

                    distJobParam["numworker"] = int(jobParams["numpsworker"])
                    distJobParam["numps"] = int(jobParams["numps"])

                    # NOTE(review): reseeds the global RNG per replica and
                    # draws ports at random — collisions between replicas look
                    # possible; confirm the template/scheduler tolerates that.
                    random.seed(datetime.datetime.now())
                    if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
                        distJobParam["containerPort"] = random.randint(
                            40000, 49999)
                    else:
                        distJobParam["containerPort"] = int(random.random() *
                                                            1000 + 3000)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack
                    if "gpuType" in distJobParam:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["gpuType"] = distJobParam[
                            "gpuType"]

                    # inject gid, uid and user
                    # TODO it should return only one entry
                    user_info = dataHandler.GetIdentityInfo(
                        jobParams["userName"])[0]
                    distJobParam["gid"] = user_info["gid"]
                    distJobParam["uid"] = user_info["uid"]
                    distJobParam["user"] = userAlias

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)
                    jobDescriptionList.append(job_description)
                    distJobParams[role].append(distJobParam)

            # All replica YAMLs are concatenated into one multi-document file.
            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        # Resubmission: tear down resources from a previous attempt first.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)
        output = k8sUtils.kubectl_create(jobDescriptionPath)
        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]
        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        # Compact metadata blob stored base64(JSON) alongside the job record.
        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["cmd"]
        jobMeta["distJobParams"] = distJobParams
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)
        ret["error"] = str(e)
        # NOTE(review): if decoding job["jobParams"] itself raised, jobParams
        # is unbound here and this handler raises NameError — verify callers
        # always pass a decodable payload.
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    dataHandler.Close()
    return ret