Beispiel #1
0
def UpdateDistJobStatus(job):
    """Refresh the stored status of a distributed (ps/worker) job.

    The Kubernetes-level status detail of the job is always recorded.  The
    rest of the work only happens once the number of worker pods and ps pods
    returned by ``k8sUtils.GetPod`` matches ``numpsworker`` / ``numps`` from
    the job parameters:

    * a job whose ``jobStatus`` is ``"scheduling"`` is launched through
      ``launch_ps_dist_job``;
    * a job whose ``jobStatus`` is ``"running"`` is reconciled against the
      result of ``GetDistJobStatus`` (finished / running / failed / unknown).

    Args:
        job: Mapping describing the job.  Reads ``jobId``, ``jobStatus``,
            ``jobParams`` (base64-encoded JSON) and, when present,
            ``jobDescriptionPath``.
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    # Stored as base64(JSON), the same encoding UpdateJobStatus uses for the
    # result of the very same k8sUtils.GetJobStatus call.
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    logging.info("job %s status: %s,%s" %
                 (job["jobId"], result, json.dumps(detail)))

    # Computed once; only the "running" reconciliation below consumes it.
    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    def _delete_job_description():
        # Remove the Kubernetes objects created from the job description,
        # if that description file is still present on disk.
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

    jobId = jobParams["jobId"]
    workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId)
    psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId)
    allPodsPresent = (
        "items" in workerPodInfo
        and len(workerPodInfo["items"]) == int(jobParams["numpsworker"])
        and "items" in psPodInfo
        and len(psPodInfo["items"]) == int(jobParams["numps"]))
    if not allPodsPresent:
        # Nothing to launch or reconcile until every pod exists.
        return

    if job["jobStatus"] == "scheduling":
        launch_ps_dist_job(jobParams)
    if job["jobStatus"] != "running":
        return

    result, detail = GetDistJobStatus(job["jobId"])
    # NOTE(review): encoded without json.dumps, unlike the write above;
    # presumably GetDistJobStatus returns a string detail -- confirm.
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(detail))

    printlog("job %s status: %s" % (job["jobId"], result))

    status = result.strip()

    if status == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        _delete_job_description()

    elif status == "Running":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        # jobStatus is already "running" here (guard above), so there is no
        # status transition to record.
        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                           serviceAddress)

    elif status == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        _delete_job_description()

    elif status == "Unknown":
        if job["jobId"] not in UnusualJobs:
            # First sighting: start the five-minute grace period.
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        else:
            unknownFor = datetime.datetime.now() - UnusualJobs[job["jobId"]]
            # total_seconds() covers the whole interval; timedelta.seconds
            # is only the seconds component and ignores whole days.
            if unknownFor.total_seconds() > 300:
                del UnusualJobs[job["jobId"]]
                retries = dataHandler.AddandGetJobRetries(job["jobId"])
                if retries >= 5:
                    printlog("Job %s fails for more than 5 times, abort" %
                             job["jobId"])
                    dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                                   "error")
                    dataHandler.UpdateJobTextField(
                        job["jobId"], "errorMsg", "cannot launch the job.")
                    _delete_job_description()
                else:
                    printlog(
                        "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                        % (job["jobId"], retries))
                    SubmitJob(job)

    # Any definite status clears a pending "unknown" timer.
    if status != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]
def UpdateJobStatus(job):
    """Refresh the stored status of a job from its Kubernetes status.

    Queries ``k8sUtils.GetJobStatus`` for the job, records the returned
    detail, and then updates the job record according to the result:

    * ``Succeeded`` -- extract the log, mark ``finished``, delete the job
      description from Kubernetes.
    * ``Running`` -- mark ``running`` if not already, and publish the
      service endpoints when the job declares an ``interactivePort``.
    * ``Failed`` -- extract the log, mark ``failed`` with the detail as the
      error message, delete the job description.
    * ``Unknown`` -- after more than five minutes in this state, either
      re-submit the job or, from the fifth retry on, mark it ``error``.
    * ``PendingHostPort`` -- re-submit the job.

    A ``scheduling`` job whose ``jobtrainingtype`` is ``PSDistJob`` is
    additionally launched through ``launch_ps_dist_job`` up front.

    Args:
        job: Mapping describing the job.  Reads ``jobId``, ``jobStatus``,
            ``jobParams`` (base64-encoded JSON) and, when present,
            ``jobDescriptionPath``.
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    logging.info("start to update job status...")

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        launch_ps_dist_job(jobParams)

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    msg = "job %s status, result: %s, detail: %s" % (job["jobId"], result,
                                                     json.dumps(detail))
    logging.info(msg)

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    def _delete_job_description():
        # Remove the Kubernetes objects created from the job description,
        # if that description file is still present on disk, and log the
        # kubectl output.
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath + " output: " +
                         str(output))

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    status = result.strip()

    if status == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        _delete_job_description()

    elif status == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")

        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                           serviceAddress)

    elif status == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        _delete_job_description()

    elif status == "Unknown":
        if job["jobId"] not in UnusualJobs:
            # First sighting: start the five-minute grace period.
            UnusualJobs[job["jobId"]] = datetime.datetime.now()
        else:
            unknownFor = datetime.datetime.now() - UnusualJobs[job["jobId"]]
            # total_seconds() covers the whole interval; timedelta.seconds
            # is only the seconds component and ignores whole days.
            if unknownFor.total_seconds() > 300:
                del UnusualJobs[job["jobId"]]

                retries = dataHandler.AddandGetJobRetries(job["jobId"])
                if retries >= 5:
                    printlog("Job %s fails for more than 5 times, abort" %
                             job["jobId"])
                    dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                                   "error")
                    dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                                   "cannot launch the job.")
                    _delete_job_description()
                else:
                    printlog(
                        "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                        % (job["jobId"], retries))
                    SubmitJob(job)

    elif status == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (job["jobId"]))

        SubmitJob(job)

    # Any definite status clears a pending "unknown" timer.
    if status != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]