def UpdateDistJobStatus(job):
    """Sync the DB record of a distributed (ps/worker) job with Kubernetes.

    Looks up the job's ps/worker pods, launches the user command once all
    expected pods exist, and transitions the stored jobStatus to
    finished/running/failed. A job stuck in "Unknown" for more than 300s is
    either resubmitted or aborted after 5 retries.

    :param job: job row dict; must contain "jobId", "jobStatus" and the
        base64-encoded JSON blob "jobParams".
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    # Fix: `detail` is a JSON-serializable object, not a string — serialize it
    # before base64-encoding (consistent with the other UpdateJobStatus
    # variants in this file).
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))
    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    jobId = jobParams["jobId"]
    workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId)
    psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId)
    # Proceed only once every expected ps and worker pod has been created.
    if "items" in workerPodInfo and len(workerPodInfo["items"]) == int(
            jobParams["numpsworker"]) and "items" in psPodInfo and len(
                psPodInfo["items"]) == int(jobParams["numps"]):
        if job["jobStatus"] == "scheduling":
            # All pods are up: launch the user command.
            launch_ps_dist_job(jobParams)
        if job["jobStatus"] == "running":
            result, detail = GetDistJobStatus(job["jobId"])
            dataHandler.UpdateJobTextField(
                job["jobId"], "jobStatusDetail",
                base64.b64encode(json.dumps(detail)))
            printlog("job %s status: %s" % (job["jobId"], result))

            if result.strip() == "Succeeded":
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "finished")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)
            elif result.strip() == "Running":
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                if job["jobStatus"] != "running":
                    dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                                   "running")
                    # Publish the service endpoints the first time the job is
                    # seen running, if an interactive port was requested.
                    if "interactivePort" in jobParams:
                        serviceAddress = k8sUtils.GetServiceAddress(
                            job["jobId"])
                        serviceAddress = base64.b64encode(
                            json.dumps(serviceAddress))
                        dataHandler.UpdateJobTextField(job["jobId"],
                                                       "endpoints",
                                                       serviceAddress)
            elif result.strip() == "Failed":
                printlog("Job %s fails, cleaning..." % job["jobId"])
                joblog_manager.extract_job_log(job["jobId"], logPath,
                                               jobParams["userId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "failed")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               detail)
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)
            elif result.strip() == "Unknown":
                if job["jobId"] not in UnusualJobs:
                    # First sighting: start the 300s grace timer.
                    UnusualJobs[job["jobId"]] = datetime.datetime.now()
                elif (datetime.datetime.now() -
                      UnusualJobs[job["jobId"]]).seconds > 300:
                    del UnusualJobs[job["jobId"]]
                    retries = dataHandler.AddandGetJobRetries(job["jobId"])
                    if retries >= 5:
                        printlog("Job %s fails for more than 5 times, abort" %
                                 job["jobId"])
                        dataHandler.UpdateJobTextField(job["jobId"],
                                                       "jobStatus", "error")
                        dataHandler.UpdateJobTextField(
                            job["jobId"], "errorMsg",
                            "cannot launch the job.")
                        if jobDescriptionPath is not None and os.path.isfile(
                                jobDescriptionPath):
                            k8sUtils.kubectl_delete(jobDescriptionPath)
                    else:
                        printlog(
                            "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                            % (job["jobId"], retries))
                        SubmitJob(job)

    # Any non-Unknown status clears the grace timer.
    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]
    # Fix: close the locally created handler (was leaked; sibling variants
    # call Close()).
    dataHandler.Close()
def UpdateJobStatus(redis_conn, launcher, job, notifier=None, dataHandlerOri=None):
    """Advance a scheduling/running job's DB state from the launcher's view.

    :param redis_conn: redis connection used for state-latency bookkeeping.
    :param launcher: object exposing get_job_status/scale_job/delete_job/kill_job.
    :param job: job row dict ("jobId", "jobStatus", "jobParams", "userName", ...).
    :param notifier: optional notifier for user-visible state-change messages.
    :param dataHandlerOri: optional shared DataHandler; when None a local one
        is created and closed before returning.
    """
    assert (job["jobStatus"] == "scheduling" or job["jobStatus"] == "running")
    if dataHandlerOri is None:
        dataHandler = DataHandler()
    else:
        dataHandler = dataHandlerOri
    jobParams = json.loads(b64decode(job["jobParams"]))
    result, details, diagnostics = launcher.get_job_status(job["jobId"])
    logger.info("Job status: %s %s", job["jobId"], result)

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])

        # TODO: Refactor
        detail = get_job_status_detail(job)
        detail = job_status_detail_with_finished_time(detail, "finished")

        dataFields = {
            "jobStatusDetail": b64encode(json.dumps(detail)),
            "jobStatus": "finished"
        }
        conditionFields = {"jobId": job["jobId"]}
        dataHandler.UpdateJobTextFields(conditionFields, dataFields)
        launcher.delete_job(job["jobId"], force=True)

        if notifier is not None:
            notifier.notify(
                notify.new_job_state_change_message(job["userName"],
                                                    job["jobId"],
                                                    result.strip()))
    elif result == "Running":
        update_job_state_latency(redis_conn, job["jobId"], "running")
        launcher.scale_job(job)
        if job["jobStatus"] != "running":
            # First transition into "running": record the start time.
            started_at = k8sUtils.localize_time(datetime.datetime.now())
            detail = [{
                "startedAt": started_at,
                "message": "started at: {}".format(started_at)
            }]

            dataFields = {
                "jobStatusDetail": b64encode(json.dumps(detail)),
                "jobStatus": "running"
            }
            conditionFields = {"jobId": job["jobId"]}
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)
            if notifier is not None:
                notifier.notify(
                    notify.new_job_state_change_message(
                        job["userName"], job["jobId"], result.strip()))
    elif result == "Failed":
        now = datetime.datetime.now()
        # Fix: reuse the already-decoded jobParams instead of re-decoding the
        # blob via the undefined name `base64decode` (this function decodes
        # with b64decode above).
        if jobParams.get("debug") is True and (now -
                                               job["jobTime"]).seconds < 60:
            logger.info("leave job %s there for debug for 60s", job["jobId"])
            # Fix: close a locally created DataHandler before the early
            # return so the connection is not leaked.
            if dataHandlerOri is None:
                dataHandler.Close()
            return

        logger.warning("Job %s fails, cleaning...", job["jobId"])

        if notifier is not None:
            notifier.notify(
                notify.new_job_state_change_message(job["userName"],
                                                    job["jobId"],
                                                    result.strip()))

        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])

        # TODO: Refactor
        detail = get_job_status_detail(job)
        detail = job_status_detail_with_finished_time(detail, "failed")

        dataFields = {
            "jobStatusDetail": b64encode(json.dumps(detail)),
            "jobStatus": "failed",
            "errorMsg": diagnostics
        }
        conditionFields = {"jobId": job["jobId"]}
        dataHandler.UpdateJobTextFields(conditionFields, dataFields)
        launcher.delete_job(job["jobId"], force=True)
    elif result == "Unknown" or result == "NotFound":
        if job["jobId"] not in UnusualJobs:
            logger.warning("!!! Job status ---{}---, job: {}".format(
                result, job["jobId"]))
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        # TODO
        # 1) May need to reduce the timeout.
        #    It takes minutes before pod turns into "Unknown", we may don't
        #    need to wait so long.
        # 2) If node resume before we resubmit the job, the job will end in
        #    status 'NotFound'.
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 30:
            del UnusualJobs[job["jobId"]]

            # TODO refine later
            # before resubmit the job, reset the endpoints
            # update all endpoint to status 'pending', so it would restart
            # when job is ready
            endpoints = dataHandler.GetJobEndpoints(job["jobId"])
            for endpoint_id, endpoint in list(endpoints.items()):
                endpoint["status"] = "pending"
                logger.debug("Reset endpoint status to 'pending': {}".format(
                    endpoint_id))
                dataHandler.UpdateEndpoint(endpoint)

            logger.warning(
                "Job {} fails in Kubernetes as {}, delete and re-submit.".
                format(job["jobId"], result))
            launcher.kill_job(job["jobId"], "queued")
            if notifier is not None:
                notifier.notify(
                    notify.new_job_state_change_message(
                        job["userName"], job["jobId"], result.strip()))
    elif result == "Pending":
        _, detail = k8sUtils.GetJobStatus(job["jobId"])
        dataHandler.UpdateJobTextFields(
            {"jobId": job["jobId"]},
            {"jobStatusDetail": b64encode(json.dumps(detail))})

    # Any status other than Unknown/NotFound clears the grace timer.
    if result != "Unknown" and result != "NotFound" and job[
            "jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]

    if dataHandlerOri is None:
        dataHandler.Close()
def UpdateJobStatus(job):
    """Sync a job's DB record with its Kubernetes status.

    For a PSDistJob still in "scheduling", waits until all pods are ready and
    then launches the user command. Otherwise transitions the stored
    jobStatus to finished/running/failed, and resubmits jobs stuck in
    "Unknown" (up to 5 retries) or "PendingHostPort".

    :param job: job row dict; must contain "jobId", "jobStatus" and the
        base64-encoded JSON blob "jobParams".
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        # launch user command only all pods are ready
        result, detail = k8sUtils.GetJobStatus(job["jobId"])
        if result in ["Failed", "Succeeded"]:
            # TODO shouldn't be here, update status
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", result)
            pass
        else:
            # previously status is 'scheduling', and now all pods are ready
            # TODO check all pods are ready
            if k8sUtils.all_pod_ready(job["jobId"]):
                try:
                    launch_ps_dist_job(jobParams)
                except Exception:
                    # Fix: log with traceback instead of print(e), which
                    # silently dropped the stack trace to stdout.
                    logging.exception("launching ps distributed job failed")
            return

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))
    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result.strip() == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)
    elif result.strip() == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")
    elif result.strip() == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)
    elif result.strip() == "Unknown":
        if job["jobId"] not in UnusualJobs:
            # First sighting: start the 300s grace timer.
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 300:
            del UnusualJobs[job["jobId"]]
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    k8sUtils.kubectl_delete(jobDescriptionPath)
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)
    elif result.strip() == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (job["jobId"]))
        SubmitJob(job)

    # Any non-Unknown status clears the grace timer.
    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]

    dataHandler.Close()
def UpdateJobStatus(job, notifier=None, dataHandlerOri=None):
    """Advance a scheduling/running job's DB state from check_job_status().

    Transitions the stored jobStatus to finished/running/failed, cleans up
    the kubectl job description file on terminal states, and requeues jobs
    stuck in "Unknown"/"NotFound" for more than 30s.

    :param job: job row dict ("jobId", "jobStatus", "jobParams",
        "userName", and optionally "jobDescriptionPath").
    :param notifier: optional notifier for user-visible state-change messages.
    :param dataHandlerOri: optional shared DataHandler; when None a local one
        is created and closed before returning.
    """
    assert (job["jobStatus"] == "scheduling" or job["jobStatus"] == "running")
    if dataHandlerOri is None:
        dataHandler = DataHandler()
    else:
        dataHandler = dataHandlerOri
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    result = check_job_status(job["jobId"])
    logging.info("++++++++ Job status: {} {}".format(job["jobId"], result))

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    # Path of the submitted kubectl job description, if recorded; deleted on
    # terminal states below.
    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

        if notifier is not None:
            notifier.notify(
                notify.new_job_state_change_message(job["userName"],
                                                    job["jobId"],
                                                    result.strip()))
    elif result == "Running":
        if job["jobStatus"] != "running":
            # First transition into "running": record a start-time detail.
            # NOTE(review): naive local time via isoformat() — presumably
            # consistent with how other timestamps in this deployment are
            # stored; confirm.
            started_at = datetime.datetime.now().isoformat()
            detail = [{
                "startedAt": started_at,
                "message": "started at: {}".format(started_at)
            }]
            dataHandler.UpdateJobTextField(
                job["jobId"], "jobStatusDetail",
                base64.b64encode(json.dumps(detail)))
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")
    elif result == "Failed":
        logging.warning("Job %s fails, cleaning...", job["jobId"])

        if notifier is not None:
            notifier.notify(
                notify.new_job_state_change_message(job["userName"],
                                                    job["jobId"],
                                                    result.strip()))

        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", "pod failed")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)
    elif result == "Unknown" or result == "NotFound":
        if job["jobId"] not in UnusualJobs:
            # First sighting: start the 30s grace timer.
            logging.warning("!!! Job status ---{}---, job: {}".format(
                result, job["jobId"]))
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        # TODO
        # 1) May need to reduce the timeout.
        #    It takes minutes before pod turns into "Unknown", we may don't need to wait so long.
        # 2) If node resume before we resubmit the job, the job will end in status 'NotFound'.
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 30:
            del UnusualJobs[job["jobId"]]

            # TODO refine later
            # before resubmit the job, reset the endpoints
            # update all endpoint to status 'pending', so it would restart when job is ready
            endpoints = dataHandler.GetJobEndpoints(job["jobId"])
            for endpoint_id, endpoint in endpoints.items():
                endpoint["status"] = "pending"
                logging.info("Reset endpoint status to 'pending': {}".format(
                    endpoint_id))
                dataHandler.UpdateEndpoint(endpoint)

            logging.warning(
                "Job {} fails in Kubernetes as {}, delete and re-submit.".
                format(job["jobId"], result))
            KillJob(job["jobId"], "queued")

    # Any status other than Unknown/NotFound clears the grace timer.
    if result != "Unknown" and result != "NotFound" and job[
            "jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]

    if dataHandlerOri is None:
        dataHandler.Close()
def UpdateJobStatus(job):
    """Sync a job's DB record with its Kubernetes status.

    Launches a PSDistJob's user command while the job is still "scheduling",
    then transitions the stored jobStatus to finished/running/failed,
    resubmitting jobs stuck in "Unknown" (up to 5 retries) or that hit a
    host-port conflict ("PendingHostPort").

    :param job: job row dict; must contain "jobId", "jobStatus" and the
        base64-encoded JSON blob "jobParams".
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    logging.info("start to update job status...")

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        launch_ps_dist_job(jobParams)

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    msg = "job %s status, result: %s, detail: %s" % (job["jobId"], result,
                                                     json.dumps(detail))
    logging.info(msg)

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    if result.strip() == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath +
                         " output: " + str(output))
    elif result.strip() == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")
            # Publish the service endpoints the first time the job is seen
            # running, if an interactive port was requested.
            if "interactivePort" in jobParams:
                serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
                serviceAddress = base64.b64encode(json.dumps(serviceAddress))
                dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                               serviceAddress)
    elif result.strip() == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath +
                         " output: " + str(output))
    elif result.strip() == "Unknown":
        if job["jobId"] not in UnusualJobs:
            # First sighting: start the 300s grace timer.
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        elif (datetime.datetime.now() -
              UnusualJobs[job["jobId"]]).seconds > 300:
            del UnusualJobs[job["jobId"]]
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")
                if jobDescriptionPath is not None and os.path.isfile(
                        jobDescriptionPath):
                    output = k8sUtils.kubectl_delete(jobDescriptionPath)
                    logging.info("kubectl delete " + jobDescriptionPath +
                                 " output: " + str(output))
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)
    elif result.strip() == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (job["jobId"]))
        SubmitJob(job)

    # Any non-Unknown status clears the grace timer.
    if result.strip() != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]

    # Fix: close the locally created handler (was leaked; the sibling
    # variant closes it).
    dataHandler.Close()