def cleanup_endpoints():
    """Tear down dead endpoints in Kubernetes and persist their final state.

    For every endpoint reported dead by the DB, delete its k8s resources
    (if still present) and write back ``status``/``lastUpdated``.
    Best-effort: any failure is printed and the function returns normally.
    """
    try:
        data_handler = DataHandler()
        dead_endpoints = data_handler.GetDeadEndpoints()
        for endpoint_id, dead_endpoint in dead_endpoints.items():
            print("\n\n\n\n\n\n----------------Begin to cleanup endpoint %s" %
                  endpoint_id)
            endpoint_description_path = os.path.join(
                config["storage-mount-path"],
                dead_endpoint["endpointDescriptionPath"])
            still_running = get_k8s_endpoint(endpoint_description_path)
            # empty mean not existing
            if still_running == "":
                print("Endpoint already gone %s" % endpoint_id)
                status = "stopped"
            else:
                output = k8sUtils.kubectl_delete(endpoint_description_path)
                # 0 for success
                if output == 0:
                    status = "stopped"
                    print("Succeed cleanup endpoint %s" % endpoint_id)
                else:
                    # FIX: previously `status` was left unset on this path,
                    # raising NameError (or silently reusing the previous
                    # iteration's value) at the assignment below.  Mirror the
                    # newer cleanup_endpoints variant and mark it "unknown".
                    status = "unknown"
                    print("Clean dead endpoint %s failed, endpoints: %s" %
                          (endpoint_id, dead_endpoint))
            dead_endpoint["status"] = status
            dead_endpoint["lastUpdated"] = datetime.datetime.now().isoformat()
            data_handler.UpdateEndpoint(dead_endpoint)
    except Exception:
        # Best-effort cleanup: log the traceback and move on.
        traceback.print_exc()
def KillJob(job):
    """Kill a running job by deleting its Kubernetes description.

    Records the last observed k8s status detail, then deletes the job's
    YAML from the cluster.  Returns True when the delete succeeds (job
    marked "killed"); otherwise records an errorMsg, marks the job
    "error", and returns False.
    """
    dataHandler = DataHandler()
    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    # Persist the final status detail (JSON, base64-encoded) before killing.
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))
    logging.info("Killing job %s, with status %s, %s" %
                 (job["jobId"], result, detail))
    if "jobDescriptionPath" in job and job["jobDescriptionPath"] is not None:
        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          job["jobDescriptionPath"])
        if os.path.isfile(jobDescriptionPath):
            # kubectl_delete returns 0 on success.
            if k8sUtils.kubectl_delete(jobDescriptionPath) == 0:
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "killed")
                return True
            else:
                dataHandler.UpdateJobTextField(
                    job["jobId"], "errorMsg",
                    "Cannot delete job from Kubernetes Cluster!")
        else:
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot find job description file!")
    # Fall-through: any failure path above (or a missing description path)
    # ends with the job in "error" state.
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
    return False
def cleanup_endpoints():
    """Tear down dead endpoints in Kubernetes and persist their final state.

    Newer variant of the cleanup loop: per-endpoint failures are isolated
    so one bad endpoint cannot stop the sweep, endpoints still "pending"
    keep that status (they are planned to be set up later), and the
    DataHandler is always closed.
    """
    try:
        data_handler = DataHandler()
        try:
            dead_endpoints = data_handler.GetDeadEndpoints()
            for endpoint_id, dead_endpoint in dead_endpoints.items():
                try:
                    logger.info(
                        "\n\n\n\n\n\n----------------Begin to cleanup endpoint %s",
                        endpoint_id)
                    endpoint_description_path = os.path.join(
                        config["storage-mount-path"],
                        dead_endpoint["endpointDescriptionPath"])
                    still_running = get_k8s_endpoint(endpoint_description_path)
                    # empty mean not existing
                    if still_running == "":
                        logger.info("Endpoint already gone %s", endpoint_id)
                        status = "stopped"
                    else:
                        output = k8sUtils.kubectl_delete(
                            endpoint_description_path)
                        # 0 for success
                        if output == 0:
                            status = "stopped"
                            logger.info("Succeed cleanup endpoint %s",
                                        endpoint_id)
                        else:
                            # TODO will need to clean it up eventually
                            status = "unknown"
                            logger.info(
                                "Clean dead endpoint %s failed, endpoints: %s",
                                endpoint_id, dead_endpoint)
                    # we are not changing status from "pending", "pending"
                    # endpoints are planed to setup later
                    if dead_endpoint["status"] != "pending":
                        dead_endpoint["status"] = status
                    dead_endpoint["lastUpdated"] = datetime.datetime.now(
                    ).isoformat()
                    data_handler.UpdateEndpoint(dead_endpoint)
                except Exception:
                    # FIX: message typo ("Clanup" -> "Cleanup") and use lazy
                    # %s formatting, consistent with the other logger calls.
                    logger.warning("Cleanup endpoint failed %s",
                                   dead_endpoint, exc_info=True)
        except Exception:
            logger.exception("cleanup endpoint failed")
        finally:
            data_handler.Close()
    except Exception:
        logger.exception("close data handler failed")
def UpdateDistJobStatus(job):
    """Poll and update the status of a PS-style distributed job.

    Once all ps/worker pods exist, launches the user command (when the job
    is still "scheduling") and, for running jobs, maps the aggregated
    distributed status onto the job record: finished / running / failed /
    unknown (with bounded retries and resubmission).
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    if "userId" not in jobParams:
        jobParams["userId"] = "0"
    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    # FIX: detail is a structure, not a string -- serialize it before
    # base64-encoding, exactly as KillJob and UpdateJobStatus do.
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))
    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None
    jobId = jobParams["jobId"]
    workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId)
    psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId)
    # Proceed only when every expected ps and worker pod has been created.
    if "items" in workerPodInfo and len(workerPodInfo["items"]) == int(
            jobParams["numpsworker"]) and "items" in psPodInfo and len(
                psPodInfo["items"]) == int(jobParams["numps"]):
        if job["jobStatus"] == "scheduling":
            launch_ps_dist_job(jobParams)
        if job["jobStatus"] == "running":
            result, detail = GetDistJobStatus(job["jobId"])
            # NOTE(review): GetDistJobStatus's detail type is not visible
            # here; it is assumed to be a structure like GetJobStatus's.
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                           base64.b64encode(json.dumps(detail)))
            printlog("job %s status: %s" % (job["jobId"], result))

            jobDescriptionPath = os.path.join(
                config["storage-mount-path"], job["jobDescriptionPath"]
            ) if "jobDescriptionPath" in job else None

            if result.strip() == "Succeeded":
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "finished")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)
            elif result.strip() == "Running":
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                if job["jobStatus"] != "running":
                    dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                                   "running")
                if "interactivePort" in jobParams:
                    serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
                    serviceAddress = base64.b64encode(
                        json.dumps(serviceAddress))
                    dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                                   serviceAddress)
            elif result.strip() == "Failed":
                printlog("Job %s fails, cleaning..." % job["jobId"])
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "failed")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               detail)
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)
            elif result.strip() == "Unknown":
                # First sighting: remember when the job became unusual;
                # after 300s of "Unknown", retry or abort.
                if job["jobId"] not in UnusualJobs:
                    UnusualJobs[job["jobId"]] = datetime.datetime.now()
                elif (datetime.datetime.now() -
                      UnusualJobs[job["jobId"]]).seconds > 300:
                    del UnusualJobs[job["jobId"]]
                    retries = dataHandler.AddandGetJobRetries(job["jobId"])
                    if retries >= 5:
                        printlog("Job %s fails for more than 5 times, abort" %
                                 job["jobId"])
                        dataHandler.UpdateJobTextField(job["jobId"],
                                                       "jobStatus", "error")
                        dataHandler.UpdateJobTextField(
                            job["jobId"], "errorMsg",
                            "cannot launch the job.")
                        if jobDescriptionPath is not None and os.path.isfile(
                                jobDescriptionPath):
                            k8sUtils.kubectl_delete(jobDescriptionPath)
                    else:
                        printlog(
                            "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                            % (job["jobId"], retries))
                        SubmitJob(job)
            if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
                del UnusualJobs[job["jobId"]]
def SubmitRegularJob(job):
    """Render and submit a single-pod (regular) job to Kubernetes.

    Validates job/work paths, creates the job directory, writes the launch
    script, renders the RegularJob YAML template (plus one Service per
    requested interactive port), applies it with kubectl, and records
    status/metadata in the DB.  Returns a dict with "output"/"jobId" on
    success or "error" on failure; returns False on path validation errors.
    """
    ret = {}
    dataHandler = DataHandler()
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        # Per-job PVC names derived from the job id.
        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]
        if "jobPath" not in jobParams or len(
                jobParams["jobPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: job-path does not exist")
            return False
        if "workPath" not in jobParams or len(
                jobParams["workPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],
                                    "ERROR: work-path does not exist")
            return False
        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
        #    dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #    return False
        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        if not os.path.exists(localJobPath):
            # Create the job dir (and models subdir) owned by the user.
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath, jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath, "models"),
                             jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath, "0")
                mkdirsAsUser(os.path.join(localJobPath, "models"), "0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""
        # Wrap the user command in a launch script inside the job dir.
        if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(
                localJobPath, "launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            f.close()
            if "userId" in jobParams:
                os.system("chown -R %s %s" %
                          (jobParams["userId"], launchScriptPath))
            jobParams[
                "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams[
                    "jobId"]

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
        # k8s labels only allow alphanumerics; strip everything else.
        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"]
                                            if e.isalnum())
        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"],
                                                jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"],
                                                 workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"],
                                                 dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath
        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/", "")
        # Standard mounts appended unconditionally (no de-dup in this
        # variant; the newer SubmitRegularJob uses CheckMountPoints).
        jobParams["mountpoints"].append({
            "name": "nvidia-driver",
            "containerPath": "/usr/local/nvidia",
            "hostPath": nvidiaDriverPath
        })
        jobParams["mountpoints"].append({
            "name": "job",
            "containerPath": "/job",
            "hostPath": jobParams["hostjobPath"]
        })
        jobParams["mountpoints"].append({
            "name": "work",
            "containerPath": "/work",
            "hostPath": jobParams["hostworkPath"]
        })
        jobParams["mountpoints"].append({
            "name": "data",
            "containerPath": "/data",
            "hostPath": jobParams["hostdataPath"]
        })
        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print("Render Job: %s" % jobParams)
        template = ENV.get_template(os.path.abspath(jobTemp))
        job_description = template.render(job=jobParams)

        jobDescriptionList = []
        jobDescriptionList.append(job_description)
        # One Kubernetes Service per requested interactive port.
        if ("interactivePort" in jobParams
                and len(jobParams["interactivePort"].strip()) > 0):
            ports = [
                p.strip()
                for p in re.split(",|;", jobParams["interactivePort"])
                if len(p.strip()) > 0 and p.strip().isdigit()
            ]
            for portNum in ports:
                jobParams["serviceId"] = "interactive-" + jobParams[
                    "jobId"] + "-" + portNum
                jobParams["port"] = portNum
                jobParams["port-name"] = "interactive"
                jobParams["port-type"] = "TCP"
                serviceTemplate = ENV.get_template(
                    os.path.join(jobTempDir, "KubeSvc.yaml.template"))
                # NOTE(review): get_template is called again on an already
                # loaded Template object -- presumably relies on Jinja2
                # passing Template instances through unchanged; verify
                # against the installed Jinja2 version.
                template = ENV.get_template(serviceTemplate)
                interactiveMeta = template.render(svc=jobParams)
                jobDescriptionList.append(interactiveMeta)

        # Concatenate pod + services into one multi-document YAML.
        jobDescription = "\n---\n".join(jobDescriptionList)
        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        # Replace any stale deployment of the same job before creating.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("Submitted job %s to k8s, returned with status %s" %
                     (job["jobId"], output))
        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        # After 5 failed submissions the job is marked terminally errored.
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret
def SubmitPSDistJob(job):
    """Render and submit a parameter-server distributed job to Kubernetes.

    Expands the job into one pod spec per ps/worker replica (each with its
    own sub-path, launch script, mounts, and a randomized container port),
    concatenates all rendered templates into one YAML, applies it with
    kubectl, and records status/metadata in the DB.  Returns a dict with
    "output"/"jobId" on success or "error" on failure; returns False on
    path validation errors.
    """
    ret = {}
    dataHandler = DataHandler()
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        assignedRack = None
        # Pin all replicas of one job to a single rack when racks exist.
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])

        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    # Shallow copy: each replica customizes its own params.
                    distJobParam = copy.copy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role

                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        return False

                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])

                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        return False

                    if "dataPath" not in distJobParam or len(
                            distJobParam["dataPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: data-path does not exist")
                        return False

                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"],
                        distJobParam["workPath"], distJobParam["dataPath"])

                    localJobPath = os.path.join(config["storage-mount-path"],
                                                jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath, distJobParam["userId"])
                        else:
                            # NOTE(review): passes int 0 here, while the
                            # regular-job path passes "0" -- confirm
                            # mkdirsAsUser accepts both.
                            mkdirsAsUser(localJobPath, 0)

                    distJobParam["LaunchCMD"] = ""
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""

################One choice is that we only wait for certain time.
#                    launchCMD = """
##!/bin/bash
#mkdir -p /opt
#echo "[DLWorkspace System]: Waiting for all containers are ready..."
## wait for at most 10 mins.
#for i in {1..200}; do
#    if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#        sleep 3
#    else
#        break
#    fi
#done
#if [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; then
#    echo "[DLWorkspace System]: Waiting for containers: timeout! Restarting..."
#    exit 1
#else
#    echo "[DLWorkspace System]: All containers are ready, launching training job..."
#    chmod +x /opt/run_dist_job.sh
#    /opt/run_dist_job.sh
#fi
#"""

                    # Launch script blocks forever until the job manager
                    # drops the run_dist_job marker files into the pod.
                    launchCMD = """
#!/bin/bash
mkdir -p /opt
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ] || [ ! -f /opt/run_dist_job.sh ]; do
    sleep 3
done
echo "[DLWorkspace System]: All containers are ready, launching training job..."
chmod +x /opt/run_dist_job.sh
/opt/run_dist_job.sh
"""

                    launchScriptPath = os.path.join(
                        localJobPath,
                        "launch-%s.sh" % distJobParam["jobId"])
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)
                    f.close()
                    distJobParam[
                        "LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % distJobParam[
                            "jobId"]

                    # k8s labels only allow alphanumerics.
                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())
                    distJobParam["userNameLabel"] = getAlias(
                        jobParams["userName"])
                    ENV = Environment(loader=FileSystemLoader("/"))

                    jobTempDir = os.path.join(config["root-path"],
                                              "Jobs_Templete")
                    jobTemp = os.path.join(jobTempDir,
                                           "DistJob.yaml.template")

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []
                    distJobParam["mountpoints"].append({
                        "name": "nvidia-driver",
                        "containerPath": "/usr/local/nvidia",
                        "hostPath": nvidiaDriverPath
                    })
                    distJobParam["mountpoints"].append({
                        "name": "job",
                        "containerPath": "/job",
                        "hostPath": distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name": "work",
                        "containerPath": "/work",
                        "hostPath": distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name": "data",
                        "containerPath": "/data",
                        "hostPath": distJobParam["hostdataPath"]
                    })
                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config and config[
                            "usefreeflow"] == "True":
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

                    # Random port in [3000, 4000) for the replica container.
                    random.seed(datetime.datetime.now())
                    distJobParam["containerPort"] = int(random.random() *
                                                        1000 + 3000)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)
                    jobDescriptionList.append(job_description)
                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
            # Multi-document YAML: one document per replica pod.
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        # Replace any stale deployment of the same job before creating.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]
        jobMeta["distJobParams"] = distJobParams

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        # After 5 failed submissions the job is marked terminally errored.
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(jobParams["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    return ret
def SubmitRegularJob(job):
    """Render and submit a regular job to Kubernetes (newer variant).

    Adds over the earlier variant: de-duplicated mounts via
    CheckMountPoints, an sshkey mount, optional hyperparameter sweeps
    (one pod per value), and custom-scheduler GPU-topology annotations.
    Returns a dict with "output"/"jobId" on success or "error" on
    failure; returns False on path validation errors.
    """
    ret = {}
    dataHandler = DataHandler()
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        # Per-job PVC names derived from the job id.
        jobParams["pvc_job"] = "jobs-" + jobParams["jobId"]
        jobParams["pvc_work"] = "work-" + jobParams["jobId"]
        jobParams["pvc_data"] = "storage-" + jobParams["jobId"]
        if "jobPath" not in jobParams or len(jobParams["jobPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: job-path does not exist")
            return False
        if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0:
            dataHandler.SetJobError(jobParams["jobId"],"ERROR: work-path does not exist")
            return False
        #if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
        #    dataHandler.SetJobError(jobParams["jobId"],"ERROR: data-path does not exist")
        #    return False
        jobPath,workPath,dataPath = GetStoragePath(jobParams["jobPath"],jobParams["workPath"],jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"],jobPath)
        if not os.path.exists(localJobPath):
            # Create the job dir (and models subdir) owned by the user.
            if "userId" in jobParams:
                mkdirsAsUser(localJobPath,jobParams["userId"])
                mkdirsAsUser(os.path.join(localJobPath,"models"),jobParams["userId"])
            else:
                mkdirsAsUser(localJobPath,"0")
                mkdirsAsUser(os.path.join(localJobPath,"models"),"0")

        jobParams["LaunchCMD"] = ""
        if "cmd" not in jobParams:
            jobParams["cmd"] = ""
        # Wrap the user command in a launch script inside the job dir.
        if isinstance(jobParams["cmd"], basestring) and not jobParams["cmd"] == "":
            launchScriptPath = os.path.join(localJobPath,"launch-%s.sh" % jobParams["jobId"])
            with open(launchScriptPath, 'w') as f:
                f.write("#!/bin/bash -x\n")
                f.write(jobParams["cmd"] + "\n")
            f.close()
            if "userId" in jobParams:
                os.system("chown -R %s %s" % (jobParams["userId"], launchScriptPath))
            jobParams["LaunchCMD"] = "[\"bash\", \"/job/launch-%s.sh\"]" % jobParams["jobId"]

        jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams["jobId"] + ".yaml"
        # k8s labels only allow alphanumerics; strip everything else.
        jobParams["jobNameLabel"] = ''.join(e for e in jobParams["jobName"] if e.isalnum())
        ENV = Environment(loader=FileSystemLoader("/"))

        jobTempDir = os.path.join(config["root-path"],"Jobs_Templete")
        jobTemp = os.path.join(jobTempDir, "RegularJob.yaml.template")

        jobParams["hostjobPath"] = os.path.join(config["storage-mount-path"], jobPath)
        jobParams["hostworkPath"] = os.path.join(config["storage-mount-path"], workPath)
        jobParams["hostdataPath"] = os.path.join(config["storage-mount-path"], dataPath)
        jobParams["nvidiaDriverPath"] = nvidiaDriverPath
        jobParams["userNameLabel"] = getAlias(jobParams["userName"])
        jobParams["rest-api"] = config["rest-api"]

        if "mountpoints" not in jobParams:
            jobParams["mountpoints"] = []
        for onemount in jobParams["mountpoints"]:
            onemount["name"] = onemount["containerPath"].replace("/","")
        # Standard mounts, each appended only if CheckMountPoints allows it
        # (avoids duplicates with user-specified mountpoints).
        mp = {"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"job","containerPath":"/job","hostPath":jobParams["hostjobPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"work","containerPath":"/work","hostPath":jobParams["hostworkPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        mp = {"name":"data","containerPath":"/data","hostPath":jobParams["hostdataPath"], "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        # Mount the user's ssh keys read-only into the container home dir.
        userAlias = getAlias(jobParams["userName"])
        mp = {"name":"sshkey","containerPath":"/home/%s/.ssh" % userAlias,"hostPath":os.path.join(config["storage-mount-path"], GetWorkPath(userAlias)+"/.ssh"), "readOnly":True, "enabled":True}
        if CheckMountPoints(jobParams["mountpoints"],mp):
            jobParams["mountpoints"].append(mp)

        jobParams["pod_ip_range"] = config["pod_ip_range"]
        if "usefreeflow" in config:
            jobParams["usefreeflow"] = config["usefreeflow"]
        else:
            jobParams["usefreeflow"] = False

        print ("Render Job: %s" % jobParams)

        jobDescriptionList = []
        pods = []
        # Hyperparameter sweep: one pod per value in [start, end] by step,
        # each receiving the value through an env var.
        if "hyperparametername" in jobParams and "hyperparameterstartvalue" in jobParams and "hyperparameterendvalue" in jobParams and "hyperparameterstep" in jobParams:
            i = int(jobParams["hyperparameterstartvalue"])
            end = int(jobParams["hyperparameterendvalue"])
            step = int(jobParams["hyperparameterstep"])
            c = 0
            while (i <= end):
                pod = {}
                pod["podName"] = jobParams["jobId"]+"-pod-"+str(c)
                pod["envs"] = [{"name":jobParams["hyperparametername"],"value":i}]
                i += step
                c += 1
                pods.append(pod)
        else:
            pod = {}
            pod["podName"] = jobParams["jobId"]
            pod["envs"] = []
            pods.append(pod)

        if "env" not in jobParams:
            jobParams["env"] = []
        # Keep the shared envs aside; each pod gets common + its own.
        jobParams["commonenv"] = copy.copy(jobParams["env"])

        for pod in pods:
            jobParams["podName"] = pod["podName"]
            jobParams["env"] = jobParams["commonenv"] + pod["envs"]

            if "kube_custom_scheduler" in config and config["kube_custom_scheduler"]:
                # GPU requests conveyed via a device-information annotation
                # instead of the resourcegpu field.
                container = {}
                container["requests"] = {"alpha.gpu/numgpu" : int(jobParams["resourcegpu"])}
                podInfo = {}
                podInfo["podname"] = jobParams["podName"]
                if "useGPUTopology" in jobParams and jobParams["useGPUTopology"]:
                    # add topology constraints explicitly - for testing
                    # if (jobParams["resourcegpu"] >= 2):
                    #     # both cards in same inner group
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/0/cards"] = 1
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/0/gpu/1/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 3):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/2/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 4):
                    #     container["requests"]["alpha/grpresource/gpugrp1/0/gpugrp0/1/gpu/3/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 5):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/4/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 6):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/2/gpu/5/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 7):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/6/cards"] = 1
                    # if (jobParams["resourcegpu"] >= 8):
                    #     container["requests"]["alpha/grpresource/gpugrp1/1/gpugrp0/3/gpu/7/cards"] = 1
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 1}
                else:
                    # for cases when desired topology is explictly given or not desired
                    podInfo["requests"] = {"alpha.gpu/gpu-generate-topology" : 0}
                podInfo["runningcontainer"] = {jobParams["podName"] : container}

                if "annotations" not in jobParams:
                    jobParams["annotations"] = {}
                jobParams["annotations"]["pod.alpha/DeviceInformation"] = "'" + json.dumps(podInfo) + "'"
                jobParams["resourcegpu"] = 0 # gpu requests specified through annotation

            template = ENV.get_template(os.path.abspath(jobTemp))
            job_description = template.render(job=jobParams)
            jobDescriptionList.append(job_description)

            # One Kubernetes Service per requested interactive port, per pod.
            if ("interactivePort" in jobParams and len(jobParams["interactivePort"].strip()) > 0):
                ports = [p.strip() for p in re.split(",|;",jobParams["interactivePort"]) if len(p.strip()) > 0 and p.strip().isdigit()]
                for portNum in ports:
                    jobParams["serviceId"] = "interactive-" + jobParams["podName"] + "-" + portNum
                    jobParams["port"] = portNum
                    jobParams["port-name"] = "interactive"
                    jobParams["port-type"] = "TCP"
                    serviceTemplate = ENV.get_template(os.path.join(jobTempDir,"KubeSvc.yaml.template"))
                    # NOTE(review): get_template is called again on an
                    # already loaded Template object -- presumably relies on
                    # Jinja2 passing Template instances through unchanged;
                    # verify against the installed Jinja2 version.
                    stemplate = ENV.get_template(serviceTemplate)
                    interactiveMeta = stemplate.render(svc=jobParams)
                    jobDescriptionList.append(interactiveMeta)

        # Concatenate all pods + services into one multi-document YAML.
        jobDescription = "\n---\n".join(jobDescriptionList)
        jobDescriptionPath = os.path.join(config["storage-mount-path"], jobParams["jobDescriptionPath"])
        if not os.path.exists(os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        # Replace any stale deployment of the same job before creating.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        logging.info("Submitted job %s to k8s, returned with status %s" %(job["jobId"], output))
        ret["output"] = output

        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescriptionPath",jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobDescription",base64.b64encode(jobDescription))

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]
        jobMeta["LaunchCMD"] = jobParams["LaunchCMD"]

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"],"jobMeta",jobMetaStr)
    except Exception as e:
        print e
        ret["error"] = str(e)
        # After 5 failed submissions the job is marked terminally errored.
        retries = dataHandler.AddandGetJobRetries(jobParams["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(jobParams["jobId"],"jobStatus","error")
            dataHandler.UpdateJobTextField(jobParams["jobId"],"errorMsg","Cannot submit job!" + str(e))

    return ret
def UpdateJobStatus(job, notifier=None, dataHandlerOri=None):
    """Reconcile one scheduling/running job with its Kubernetes status.

    Maps check_job_status's result onto the job record:
    Succeeded -> finished (logs extracted, YAML deleted, notify),
    Running -> running (first transition recorded with a timestamp),
    Failed -> failed (logs extracted, YAML deleted, notify),
    Unknown/NotFound -> after a 30s grace period, endpoints reset to
    "pending" and the job re-queued.

    notifier: optional; receives job-state-change messages.
    dataHandlerOri: optional shared DataHandler; when None a private one
    is created and closed before returning.
    """
    assert (job["jobStatus"] == "scheduling" or job["jobStatus"] == "running")
    if dataHandlerOri is None:
        dataHandler = DataHandler()
    else:
        dataHandler = dataHandlerOri
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    result = check_job_status(job["jobId"])
    logging.info("++++++++ Job status: {} {}".format(job["jobId"], result))

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

        if notifier is not None:
            notifier.notify(
                notify.new_job_state_change_message(job["userName"],
                                                    job["jobId"],
                                                    result.strip()))
    elif result == "Running":
        # Record the start timestamp only on the first transition.
        if job["jobStatus"] != "running":
            started_at = datetime.datetime.now().isoformat()
            detail = [{
                "startedAt": started_at,
                "message": "started at: {}".format(started_at)
            }]
            dataHandler.UpdateJobTextField(
                job["jobId"], "jobStatusDetail",
                base64.b64encode(json.dumps(detail)))
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")
    elif result == "Failed":
        logging.warning("Job %s fails, cleaning...", job["jobId"])

        if notifier is not None:
            notifier.notify(
                notify.new_job_state_change_message(job["userName"],
                                                    job["jobId"],
                                                    result.strip()))

        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "pod failed")

        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)
    elif result == "Unknown" or result == "NotFound":
        if job["jobId"] not in UnusualJobs:
            logging.warning("!!! Job status ---{}---, job: {}".format(
                result, job["jobId"]))
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        # TODO
        # 1) May need to reduce the timeout.
        #    It takes minutes before pod turns into "Unknown", we may don't need to wait so long.
        # 2) If node resume before we resubmit the job, the job will end in status 'NotFound'.
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 30:
            del UnusualJobs[job["jobId"]]

            # TODO refine later
            # before resubmit the job, reset the endpoints
            # update all endpoint to status 'pending', so it would restart when job is ready
            endpoints = dataHandler.GetJobEndpoints(job["jobId"])
            for endpoint_id, endpoint in endpoints.items():
                endpoint["status"] = "pending"
                logging.info("Reset endpoint status to 'pending': {}".format(
                    endpoint_id))
                dataHandler.UpdateEndpoint(endpoint)

            logging.warning(
                "Job {} fails in Kubernetes as {}, delete and re-submit.".
                format(job["jobId"], result))
            # NOTE(review): this passes (jobId, "queued") while the KillJob
            # defined earlier in this file takes a single job dict --
            # presumably a different KillJob overload/version is in scope
            # here; verify which definition this module actually binds.
            KillJob(job["jobId"], "queued")
    if result != "Unknown" and result != "NotFound" and job[
            "jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]

    if dataHandlerOri is None:
        dataHandler.Close()
def UpdateJobStatus(job):
    """Refresh one job's status from Kubernetes and persist it to the DB.

    For a PS distributed job still in "scheduling", the user command is only
    launched once every pod is ready. Otherwise the Kubernetes status is
    mapped onto the job record ("finished" / "running" / "failed"), with a
    bounded retry policy for jobs stuck in "Unknown" and a re-submit for
    jobs that could not obtain host ports.
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        # Launch the user command only when all pods are ready.
        result, detail = k8sUtils.GetJobStatus(job["jobId"])
        if result in ["Failed", "Succeeded"]:
            # TODO shouldn't be here, update status
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", result)
        else:
            # Previously status is 'scheduling', and now all pods are ready.
            if k8sUtils.all_pod_ready(job["jobId"]):
                try:
                    launch_ps_dist_job(jobParams)
                except Exception:
                    # Best-effort launch: log and try again on the next poll
                    # (previously this was a bare print(e)).
                    logging.exception("launch_ps_dist_job failed for job %s",
                                      job["jobId"])
            # BUGFIX: close the DB handler before the early return so the
            # connection is not leaked on every scheduling poll.
            dataHandler.Close()
            return

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))
    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    # YAML the job was submitted with; None if it was never recorded.
    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None
    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result.strip() == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)
    elif result.strip() == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")
    elif result.strip() == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)
    elif result.strip() == "Unknown":
        # Grace period before treating "Unknown" as a failure.
        if job["jobId"] not in UnusualJobs:
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 300:
            del UnusualJobs[job["jobId"]]
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)
    elif result.strip() == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (job["jobId"]))
        SubmitJob(job)

    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]
    dataHandler.Close()
def SubmitPSDistJob(job):
    """Render and submit the Kubernetes YAML for a PS distributed job.

    One pod description is rendered per "ps" and per "worker" replica from
    the DistJob template; the concatenated YAML is written under the storage
    mount and created with kubectl. On success the job record is moved to
    "scheduling" and the generated metadata is stored on it.

    Returns:
        dict with "output" and "jobId" (plus "error" on failure), or False
        when a required path parameter is missing (kept for backward
        compatibility with existing callers).
    """
    ret = {}
    dataHandler = DataHandler()
    jobParams = None  # BUGFIX: keep bound for the except-handler below
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))
        jobParams["rest-api"] = config["rest-api"]
        distJobParams = {}
        distJobParams["ps"] = []
        distJobParams["worker"] = []
        # Pin every replica of this job to one randomly chosen rack, if any
        # racks are configured.
        assignedRack = None
        if len(config["racks"]) > 0:
            assignedRack = random.choice(config["racks"])

        userAlias = getAlias(jobParams["userName"])
        jobParams["user_email"] = jobParams["userName"]
        jobParams["homeFolderHostpath"] = os.path.join(
            config["storage-mount-path"], GetWorkPath(userAlias))

        if jobParams["jobtrainingtype"] == "PSDistJob":
            jobDescriptionList = []
            nums = {
                "ps": int(jobParams["numps"]),
                "worker": int(jobParams["numpsworker"])
            }
            # Template environment and path are loop-invariant: build once
            # instead of per replica (was inside the inner loop).
            ENV = Environment(loader=FileSystemLoader("/"))
            jobTempDir = os.path.join(config["root-path"], "Jobs_Templete")
            jobTemp = os.path.join(jobTempDir, "DistJob.yaml.template")
            for role in ["ps", "worker"]:
                for i in range(nums[role]):
                    distJobParam = copy.deepcopy(jobParams)
                    distJobParam["distId"] = "%s%d" % (role, i)
                    distJobParam["distRole"] = role
                    distJobParam["distRoleIdx"] = i

                    if "jobPath" not in distJobParam or len(
                            distJobParam["jobPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: job-path does not exist")
                        # BUGFIX: close the handler before the early return
                        # so the connection is not leaked.
                        dataHandler.Close()
                        return False
                    if "workPath" not in distJobParam or len(
                            distJobParam["workPath"].strip()) == 0:
                        dataHandler.SetJobError(
                            distJobParam["jobId"],
                            "ERROR: work-path does not exist")
                        dataHandler.Close()  # BUGFIX: avoid handler leak
                        return False
                    #if "dataPath" not in distJobParam or len(distJobParam["dataPath"].strip()) == 0:
                    #    dataHandler.SetJobError(distJobParam["jobId"],"ERROR: data-path does not exist")
                    #    return False

                    # Each replica gets its own sub-directory of the job path.
                    distJobParam["distJobPath"] = os.path.join(
                        distJobParam["jobPath"], distJobParam["distId"])
                    jobPath, workPath, dataPath = GetStoragePath(
                        distJobParam["distJobPath"],
                        distJobParam["workPath"], distJobParam["dataPath"])
                    localJobPath = os.path.join(
                        config["storage-mount-path"], jobPath)
                    if not os.path.exists(localJobPath):
                        if "userId" in distJobParam:
                            mkdirsAsUser(localJobPath,
                                         distJobParam["userId"])
                        else:
                            # TODO ??? fall back to uid 0 ownership
                            mkdirsAsUser(localJobPath, 0)
                    if "cmd" not in distJobParam:
                        distJobParam["cmd"] = ""

                    # Change ssh folder permission here because the setup
                    # permission script in the launch_ps_job function may
                    # have a race condition with the init_user.sh script,
                    # resulting in a "no such user" error.
                    if role == "ps":
                        # The ps container waits for every worker to touch
                        # its WORKER_READY flag before running the user
                        # command.
                        launchCMD = """
#!/bin/bash
echo "[DLWorkspace System]: Waiting for all containers are ready..."
while [ ! -f /opt/run_dist_job ]; do
sleep 3
done
sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh &>/dev/null;
sudo mkdir -p /root/.ssh &>/dev/null ;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;
sudo mkdir -p /opt &>/dev/null;
sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;

JOB_DIR='/home/%s'
WORKER_NUM=%s
echo $JOB_DIR $WORKER_NUM

all_workers_ready=false
while [ "$all_workers_ready" != true ]
do
    # update it to false if any woker is not ready
    all_workers_ready=true
    for i in $(seq 0 $(( ${WORKER_NUM} - 1)) )
    do
        worker="worker${i}"
        file="$JOB_DIR/${worker}/WORKER_READY"
        #echo $file
        if [ ! -f $file ]; then
            echo "${worker} not ready!"
            all_workers_ready=false
            sleep 10
        fi
    done
done
echo "[DLWorkspace System]: All containers are ready, launching training job..."
%s
""" % (userAlias, userAlias, userAlias, userAlias, userAlias,
                        distJobParam["jobPath"], jobParams["numpsworker"],
                        distJobParam["cmd"])
                    else:
                        # Workers just fix ssh permissions, signal readiness
                        # and idle; the ps container drives the job.
                        launchCMD = """
while [ ! -f /opt/run_dist_job ]; do
sleep 3
done
sudo chmod 600 -R /home/%s/.ssh &>/dev/null;
sudo chmod 700 /home/%s/.ssh &>/dev/null;
sudo chown -R %s /home/%s/.ssh &>/dev/null;
sudo mkdir -p /root/.ssh &>/dev/null;
sudo ln -s /home/%s/.ssh/config /root/.ssh/config &>/dev/null;
sudo mkdir -p /opt && sudo ln -s /job/hostfile /opt/hostfile &>/dev/null;

# TODO mark the worker as 'READY', better to change to '/pod/READY' later
sudo touch /job/WORKER_READY

sleep infinity
""" % (userAlias, userAlias, userAlias, userAlias, userAlias)

                    launchScriptPath = os.path.join(
                        localJobPath, "launch-%s-%s%d.sh" %
                        (distJobParam["jobId"], role, i))
                    # TODO need to set up user for distribute jobs
                    # (redundant f.close() after the with-block removed; the
                    # context manager already closes the file)
                    with open(launchScriptPath, 'w') as f:
                        f.write(launchCMD)

                    launchScriptInContainer = "bash /job/launch-%s-%s%d.sh" % (
                        distJobParam["jobId"], role, i)
                    distJobParam[
                        "LaunchCMD"] = '["bash", "-c", "bash /dlws/init_user.sh &> /job/init_user_script.log && runuser -l ${DLWS_USER_NAME} -c \'%s\'"]' % launchScriptInContainer

                    distJobParam["jobNameLabel"] = ''.join(
                        e for e in distJobParam["jobName"] if e.isalnum())

                    distJobParam["hostjobPath"] = os.path.join(
                        config["storage-mount-path"], jobPath)
                    distJobParam["hostworkPath"] = os.path.join(
                        config["storage-mount-path"], workPath)
                    distJobParam["hostdataPath"] = os.path.join(
                        config["storage-mount-path"], dataPath)
                    distJobParam["nvidiaDriverPath"] = nvidiaDriverPath

                    if "mountpoints" not in distJobParam:
                        distJobParam["mountpoints"] = []
                    # distJobParam["mountpoints"].append({"name":"nvidia-driver","containerPath":"/usr/local/nvidia","hostPath":nvidiaDriverPath})
                    distJobParam["mountpoints"].append({
                        "name": "job",
                        "containerPath": "/job",
                        "hostPath": distJobParam["hostjobPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name": "work",
                        "containerPath": "/work",
                        "hostPath": distJobParam["hostworkPath"]
                    })
                    distJobParam["mountpoints"].append({
                        "name": "data",
                        "containerPath": "/data",
                        "hostPath": distJobParam["hostdataPath"]
                    })
                    # Every mountpoint needs a name in the pod spec; invent
                    # one for entries that lack it.
                    for idx in range(len(distJobParam["mountpoints"])):
                        if "name" not in distJobParam["mountpoints"][idx]:
                            distJobParam["mountpoints"][idx]["name"] = str(
                                uuid.uuid4()).replace("-", "")

                    distJobParam["pod_ip_range"] = config["pod_ip_range"]
                    if "usefreeflow" in config:
                        distJobParam["usefreeflow"] = config["usefreeflow"]
                    else:
                        distJobParam["usefreeflow"] = False

                    distJobParam["numworker"] = int(jobParams["numpsworker"])
                    distJobParam["numps"] = int(jobParams["numps"])

                    # Pick a container port; a wider range is used with host
                    # networking, presumably to reduce collisions on the
                    # node — TODO confirm.
                    random.seed(datetime.datetime.now())
                    if "hostNetwork" in jobParams and jobParams["hostNetwork"]:
                        distJobParam["containerPort"] = random.randint(
                            40000, 49999)
                    else:
                        distJobParam["containerPort"] = int(
                            random.random() * 1000 + 3000)

                    if assignedRack is not None:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["rack"] = assignedRack
                    if "gpuType" in distJobParam:
                        if "nodeSelector" not in distJobParam:
                            distJobParam["nodeSelector"] = {}
                        distJobParam["nodeSelector"]["gpuType"] = distJobParam[
                            "gpuType"]

                    # Inject gid, uid and user.
                    # TODO it should return only one entry
                    user_info = dataHandler.GetIdentityInfo(
                        jobParams["userName"])[0]
                    distJobParam["gid"] = user_info["gid"]
                    distJobParam["uid"] = user_info["uid"]
                    distJobParam["user"] = userAlias

                    template = ENV.get_template(os.path.abspath(jobTemp))
                    job_description = template.render(job=distJobParam)
                    jobDescriptionList.append(job_description)
                    distJobParams[role].append(distJobParam)

            jobParams["jobDescriptionPath"] = "jobfiles/" + time.strftime(
                "%y%m%d") + "/" + jobParams["jobId"] + "/" + jobParams[
                    "jobId"] + ".yaml"
            jobDescription = "\n---\n".join(jobDescriptionList)

        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          jobParams["jobDescriptionPath"])
        if not os.path.exists(
                os.path.dirname(os.path.realpath(jobDescriptionPath))):
            os.makedirs(os.path.dirname(os.path.realpath(jobDescriptionPath)))
        # Re-submitting: remove any previously created k8s objects first.
        if os.path.isfile(jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
        with open(jobDescriptionPath, 'w') as f:
            f.write(jobDescription)

        output = k8sUtils.kubectl_create(jobDescriptionPath)
        ret["output"] = output
        ret["jobId"] = jobParams["jobId"]

        if "userName" not in jobParams:
            jobParams["userName"] = ""

        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(jobParams["jobId"],
                                       "jobDescriptionPath",
                                       jobParams["jobDescriptionPath"])
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobDescription",
                                       base64.b64encode(jobDescription))

        # Metadata needed later to relaunch / inspect the job.
        jobMeta = {}
        jobMeta["jobDescriptionPath"] = jobParams["jobDescriptionPath"]
        jobMeta["jobPath"] = jobParams["jobPath"]  # duplicate assignment removed
        jobMeta["workPath"] = jobParams["workPath"]
        jobMeta["LaunchCMD"] = jobParams["cmd"]
        jobMeta["distJobParams"] = distJobParams
        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(jobParams["jobId"], "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)
        ret["error"] = str(e)
        # BUGFIX: jobParams is None when decoding job["jobParams"] itself
        # failed; previously this dereferenced jobParams["jobId"] and raised
        # a NameError that masked the original error. Fall back to the raw
        # job record for the id.
        job_id = (jobParams or {}).get("jobId", job.get("jobId"))
        if job_id is not None:
            retries = dataHandler.AddandGetJobRetries(job_id)
            if retries >= 5:
                dataHandler.UpdateJobTextField(job_id, "jobStatus", "error")
                dataHandler.UpdateJobTextField(job_id, "errorMsg",
                                               "Cannot submit job!" + str(e))
    dataHandler.Close()
    return ret
def UpdateJobStatus(job):
    """Refresh one job's status from Kubernetes and persist it to the DB.

    Scheduling PS distributed jobs get their user command launched first;
    then the Kubernetes status is mapped onto the job record, interactive
    service endpoints are recorded while running, and jobs stuck in
    "Unknown" are retried a bounded number of times before being marked as
    errors.
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    logging.info("start to update job status...")

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        launch_ps_dist_job(jobParams)

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))
    msg = "job %s status, result: %s, detail: %s" % (job["jobId"], result,
                                                     json.dumps(detail))
    logging.info(msg)

    # YAML the job was submitted with; None if it was never recorded.
    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None
    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result.strip() == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath +
                         " output: " + str(output))
    elif result.strip() == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")
        # Record the service address for interactive jobs.
        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                           serviceAddress)
    elif result.strip() == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath +
                         " output: " + str(output))
    elif result.strip() == "Unknown":
        # Grace period before treating "Unknown" as a failure.
        if job["jobId"] not in UnusualJobs:
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 300:
            del UnusualJobs[job["jobId"]]
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    output = k8sUtils.kubectl_delete(jobDescriptionPath)
                    logging.info("kubectl delete " + jobDescriptionPath +
                                 " output: " + str(output))
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)
    elif result.strip() == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (job["jobId"]))
        SubmitJob(job)

    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]
    # BUGFIX: close the DB handler; the other UpdateJobStatus variants do,
    # and omitting it leaks a connection on every poll.
    dataHandler.Close()