def _delete_dist_job_description(jobDescriptionPath):
    """Run ``k8sUtils.kubectl_delete`` on the description file if it exists on disk."""
    if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath):
        k8sUtils.kubectl_delete(jobDescriptionPath)


def UpdateDistJobStatus(job):
    """Refresh the stored status of a distributed (ps/worker) job.

    The status reported by ``k8sUtils.GetJobStatus`` is always written to the
    ``jobStatusDetail`` field. Further processing only happens once the number
    of worker and ps pods returned by ``k8sUtils.GetPod`` equals
    ``jobParams["numpsworker"]`` and ``jobParams["numps"]``:

    * a job whose ``jobStatus`` is ``"scheduling"`` is handed to
      ``launch_ps_dist_job``;
    * a job whose ``jobStatus`` is ``"running"`` is polled through
      ``GetDistJobStatus`` and its ``jobStatus`` / ``errorMsg`` / ``endpoints``
      fields are updated according to the result.

    Args:
        job: mapping with the keys ``"jobId"``, ``"jobParams"`` (base64-encoded
            JSON) and ``"jobStatus"``; ``"jobDescriptionPath"`` is optional.

    Returns:
        None.
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    # Only the job path is needed; the work/data paths are not used here.
    jobPath, _, _ = GetStoragePath(jobParams["jobPath"],
                                   jobParams["workPath"],
                                   jobParams["dataPath"])
    logPath = os.path.join(config["storage-mount-path"], jobPath,
                           "logs/joblog.txt")

    jobId = job["jobId"]
    result, detail = k8sUtils.GetJobStatus(jobId)
    # NOTE(review): UpdateJobStatus stores base64(json.dumps(detail)) for the
    # result of the same k8sUtils.GetJobStatus call, while this function
    # encodes ``detail`` directly -- confirm ``detail`` is a string here.
    dataHandler.UpdateJobTextField(jobId, "jobStatusDetail",
                                   base64.b64encode(detail))
    logging.info("job %s status: %s,%s", jobId, result, json.dumps(detail))

    if "jobDescriptionPath" in job:
        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          job["jobDescriptionPath"])
    else:
        jobDescriptionPath = None

    # Pod selectors use the job id stored inside the job parameters.
    distJobId = jobParams["jobId"]
    workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + distJobId)
    psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + distJobId)
    allPodsPresent = (
        "items" in workerPodInfo
        and len(workerPodInfo["items"]) == int(jobParams["numpsworker"])
        and "items" in psPodInfo
        and len(psPodInfo["items"]) == int(jobParams["numps"])
    )
    if not allPodsPresent:
        return

    if job["jobStatus"] == "scheduling":
        launch_ps_dist_job(jobParams)
    if job["jobStatus"] != "running":
        return

    result, detail = GetDistJobStatus(jobId)
    dataHandler.UpdateJobTextField(jobId, "jobStatusDetail",
                                   base64.b64encode(detail))
    printlog("job %s status: %s" % (jobId, result))
    status = result.strip()

    if status == "Succeeded":
        joblog_manager.extract_job_log(jobId, logPath, jobParams["userId"])
        dataHandler.UpdateJobTextField(jobId, "jobStatus", "finished")
        _delete_dist_job_description(jobDescriptionPath)
    elif status == "Running":
        joblog_manager.extract_job_log(jobId, logPath, jobParams["userId"])
        # NOTE(review): jobStatus appears to always be "running" on this path,
        # so this update looks like a no-op; kept as in the original.
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(jobId, "jobStatus", "running")
        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(jobId)
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(jobId, "endpoints", serviceAddress)
    elif status == "Failed":
        printlog("Job %s fails, cleaning..." % jobId)
        joblog_manager.extract_job_log(jobId, logPath, jobParams["userId"])
        dataHandler.UpdateJobTextField(jobId, "jobStatus", "failed")
        dataHandler.UpdateJobTextField(jobId, "errorMsg", detail)
        _delete_dist_job_description(jobDescriptionPath)
    elif status == "Unknown":
        now = datetime.datetime.now()
        if jobId not in UnusualJobs:
            # First time the job is seen as Unknown: start the grace period.
            UnusualJobs[jobId] = now
        # total_seconds() rather than .seconds: .seconds is only the seconds
        # component of the delta, so it wraps after one day and is ~86399 for
        # a negative delta (clock stepped backwards).
        elif (now - UnusualJobs[jobId]).total_seconds() > 300:
            del UnusualJobs[jobId]
            retries = dataHandler.AddandGetJobRetries(jobId)
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" % jobId)
                dataHandler.UpdateJobTextField(jobId, "jobStatus", "error")
                dataHandler.UpdateJobTextField(jobId, "errorMsg",
                                               "cannot launch the job.")
                _delete_dist_job_description(jobDescriptionPath)
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (jobId, retries))
                SubmitJob(job)

    # Any status other than Unknown ends the grace period for this job.
    if status != "Unknown":
        UnusualJobs.pop(jobId, None)
def _delete_job_description_logged(jobDescriptionPath):
    """Run ``k8sUtils.kubectl_delete`` on the description file, if present, and log its output."""
    if jobDescriptionPath is not None and os.path.isfile(jobDescriptionPath):
        output = k8sUtils.kubectl_delete(jobDescriptionPath)
        logging.info("kubectl delete %s output: %s", jobDescriptionPath,
                     str(output))


def UpdateJobStatus(job):
    """Refresh the stored status of a job from ``k8sUtils.GetJobStatus``.

    A job whose ``jobStatus`` is ``"scheduling"`` and whose
    ``jobtrainingtype`` parameter is ``"PSDistJob"`` is first handed to
    ``launch_ps_dist_job``. The reported status detail is then written to the
    ``jobStatusDetail`` field (JSON, base64-encoded) and the job record is
    updated according to the result string:

    * ``Succeeded``: extract the job log, mark ``finished``, delete the job
      description via kubectl.
    * ``Running``: mark ``running`` if not already, and publish the service
      address to ``endpoints`` when ``interactivePort`` is in the parameters.
    * ``Failed``: extract the job log, mark ``failed``, store the detail as
      ``errorMsg``, delete the job description.
    * ``Unknown``: remember when the job was first seen in this state; once it
      has been Unknown for more than 300 seconds, bump the retry counter and
      either resubmit or, after 5 retries, mark the job as ``error``.
    * ``PendingHostPort``: resubmit the job.

    Args:
        job: mapping with the keys ``"jobId"``, ``"jobParams"`` (base64-encoded
            JSON) and ``"jobStatus"``; ``"jobDescriptionPath"`` is optional.

    Returns:
        None.
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    logging.info("start to update job status...")

    if (job["jobStatus"] == "scheduling"
            and jobParams["jobtrainingtype"] == "PSDistJob"):
        launch_ps_dist_job(jobParams)

    # Only the job path is needed; the work/data paths are not used here.
    jobPath, _, _ = GetStoragePath(jobParams["jobPath"],
                                   jobParams["workPath"],
                                   jobParams["dataPath"])
    logPath = os.path.join(config["storage-mount-path"], jobPath,
                           "logs/joblog.txt")

    jobId = job["jobId"]
    result, detail = k8sUtils.GetJobStatus(jobId)
    detailJson = json.dumps(detail)
    dataHandler.UpdateJobTextField(jobId, "jobStatusDetail",
                                   base64.b64encode(detailJson))
    logging.info("job %s status, result: %s, detail: %s", jobId, result,
                 detailJson)

    if "jobDescriptionPath" in job:
        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          job["jobDescriptionPath"])
    else:
        jobDescriptionPath = None

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    status = result.strip()

    if status == "Succeeded":
        joblog_manager.extract_job_log(jobId, logPath, jobParams["userId"])
        dataHandler.UpdateJobTextField(jobId, "jobStatus", "finished")
        _delete_job_description_logged(jobDescriptionPath)
    elif status == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(jobId, "jobStatus", "running")
        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(jobId)
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(jobId, "endpoints", serviceAddress)
    elif status == "Failed":
        printlog("Job %s fails, cleaning..." % jobId)
        joblog_manager.extract_job_log(jobId, logPath, jobParams["userId"])
        dataHandler.UpdateJobTextField(jobId, "jobStatus", "failed")
        dataHandler.UpdateJobTextField(jobId, "errorMsg", detail)
        _delete_job_description_logged(jobDescriptionPath)
    elif status == "Unknown":
        now = datetime.datetime.now()
        if jobId not in UnusualJobs:
            # First time the job is seen as Unknown: start the grace period.
            UnusualJobs[jobId] = now
        # total_seconds() rather than .seconds: .seconds is only the seconds
        # component of the delta, so it wraps after one day and is ~86399 for
        # a negative delta (clock stepped backwards).
        elif (now - UnusualJobs[jobId]).total_seconds() > 300:
            del UnusualJobs[jobId]
            retries = dataHandler.AddandGetJobRetries(jobId)
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" % jobId)
                dataHandler.UpdateJobTextField(jobId, "jobStatus", "error")
                dataHandler.UpdateJobTextField(jobId, "errorMsg",
                                               "cannot launch the job.")
                _delete_job_description_logged(jobDescriptionPath)
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (jobId, retries))
                SubmitJob(job)
    elif status == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (jobId))
        SubmitJob(job)

    # Any status other than Unknown ends the grace period for this job.
    if status != "Unknown":
        UnusualJobs.pop(jobId, None)