Esempio n. 1
0
def job_status_detail_with_finished_time(job_status_detail, status, msg=""):
    """Return a job status detail stamped with a terminal state change.

    Called when a job succeeds, fails, is killed or hits an error.

    Args:
        job_status_detail: ``None`` or a list of per-pod status dicts. Any
            other value is handed back untouched.
        status: Label of the state change; it prefixes the recorded message
            (callers in this file pass e.g. "finished" or "failed").
        msg: Optional extra text placed after the timestamp in the message.

    Returns:
        A new list holding one dict per pod. Each dict has "startedAt",
        "finishedAt" and "message" keys: existing timestamps are kept,
        missing ones are set to the current localized time, and the state
        change message is appended (on a new line) to any existing message.
        The input list and its dicts are left unmodified. If the input is
        neither ``None`` nor a list, the input object itself is returned.
    """
    # job_status_detail must be None or a list; anything else passes through.
    if job_status_detail is not None and not isinstance(
            job_status_detail, list):
        return job_status_detail

    # Force a single empty entry so a job without pod details still gets
    # its timestamps and message recorded.
    pods = job_status_detail or [{}]

    finished_at = k8sUtils.localize_time(datetime.datetime.now())
    status_change_message = "{} at {}. {}".format(status, finished_at, msg)

    new_job_status_detail = []
    for pod_status_detail in pods:
        # Work on a shallow copy so the caller's dicts are not mutated.
        stamped = dict(pod_status_detail)

        # A fast-finishing job may never have recorded a start time; use the
        # finish time for it in that case.
        stamped.setdefault("startedAt", finished_at)
        stamped.setdefault("finishedAt", finished_at)

        if "message" in stamped:
            stamped["message"] += "\n" + status_change_message
        else:
            stamped["message"] = status_change_message
        new_job_status_detail.append(stamped)

    return new_job_status_detail
Esempio n. 2
0
def _notify_job_state_change(notifier, job, state):
    """Send a job state-change notification if a notifier was supplied."""
    if notifier is not None:
        notifier.notify(
            notify.new_job_state_change_message(job["userName"],
                                                job["jobId"],
                                                state.strip()))


def UpdateJobStatus(redis_conn,
                    launcher,
                    job,
                    notifier=None,
                    dataHandlerOri=None):
    """Query the launcher for a job's state and persist the transition.

    Depending on the status reported by ``launcher.get_job_status``:

    * "Succeeded": extract the job log, store the finished status detail,
      mark the job "finished", delete it from the launcher and notify.
    * "Running": record the state latency and call ``launcher.scale_job``;
      if the stored status was not yet "running", store a started-at detail,
      mark the job "running" and notify.
    * "Failed": if the job params have ``debug`` set to ``True`` and less
      than 60 seconds have passed since ``job["jobTime"]``, leave the job in
      place and return. Otherwise notify, extract the job log, store the
      failed status detail plus diagnostics as ``errorMsg`` and delete the
      job from the launcher.
    * "Unknown" / "NotFound": remember the first sighting in ``UnusualJobs``;
      once the job has stayed unusual for more than 30 seconds, reset its
      endpoints to "pending", call ``launcher.kill_job(jobId, "queued")``
      and notify.
    * "Pending": refresh the stored status detail from
      ``k8sUtils.GetJobStatus``.

    Args:
        redis_conn: Passed through to ``update_job_state_latency``.
        launcher: Object providing ``get_job_status``, ``scale_job``,
            ``delete_job`` and ``kill_job``.
        job: Job record; reads "jobId", "jobStatus", "jobParams",
            "userName" and (for failed jobs) "jobTime".
        notifier: Optional object with a ``notify`` method.
        dataHandlerOri: Optional data handler to reuse. When ``None`` a
            ``DataHandler`` is created here and closed before returning,
            including on the early return and when an exception propagates.

    Raises:
        AssertionError: If ``job["jobStatus"]`` is neither "scheduling" nor
            "running".
    """
    assert (job["jobStatus"] == "scheduling" or job["jobStatus"] == "running")

    owns_data_handler = dataHandlerOri is None
    dataHandler = DataHandler() if owns_data_handler else dataHandlerOri
    try:
        jobParams = json.loads(b64decode(job["jobParams"]))

        # NOTE(review): `details` is not used below; the "Pending" branch
        # queries k8sUtils again instead -- confirm this is intentional.
        result, details, diagnostics = launcher.get_job_status(job["jobId"])
        logger.info("Job status: %s %s", job["jobId"], result)

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        logPath = os.path.join(localJobPath, "logs/joblog.txt")

        jobParams.setdefault("userId", "0")

        if result == "Succeeded":
            joblog_manager.extract_job_log(job["jobId"], logPath,
                                           jobParams["userId"])

            # TODO: Refactor
            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(detail, "finished")

            dataFields = {
                "jobStatusDetail": b64encode(json.dumps(detail)),
                "jobStatus": "finished"
            }
            conditionFields = {"jobId": job["jobId"]}
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)

            launcher.delete_job(job["jobId"], force=True)

            _notify_job_state_change(notifier, job, result)
        elif result == "Running":
            update_job_state_latency(redis_conn, job["jobId"], "running")
            launcher.scale_job(job)
            # Only record the transition the first time the job is seen
            # running.
            if job["jobStatus"] != "running":
                started_at = k8sUtils.localize_time(datetime.datetime.now())
                detail = [{
                    "startedAt": started_at,
                    "message": "started at: {}".format(started_at)
                }]

                dataFields = {
                    "jobStatusDetail": b64encode(json.dumps(detail)),
                    "jobStatus": "running"
                }
                conditionFields = {"jobId": job["jobId"]}
                dataHandler.UpdateJobTextFields(conditionFields, dataFields)
                _notify_job_state_change(notifier, job, result)

        elif result == "Failed":
            now = datetime.datetime.now()
            # total_seconds() covers the whole elapsed time; `.seconds`
            # drops the days component and would wrap after 24 hours.
            job_age = (now - job["jobTime"]).total_seconds()
            if jobParams.get("debug") is True and job_age < 60:
                logger.info("leave job %s there for debug for 60s",
                            job["jobId"])
                return
            logger.warning("Job %s fails, cleaning...", job["jobId"])

            _notify_job_state_change(notifier, job, result)

            joblog_manager.extract_job_log(job["jobId"], logPath,
                                           jobParams["userId"])

            # TODO: Refactor
            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(detail, "failed")

            dataFields = {
                "jobStatusDetail": b64encode(json.dumps(detail)),
                "jobStatus": "failed",
                "errorMsg": diagnostics
            }
            conditionFields = {"jobId": job["jobId"]}
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)

            launcher.delete_job(job["jobId"], force=True)
        elif result in ("Unknown", "NotFound"):
            if job["jobId"] not in UnusualJobs:
                logger.warning("!!! Job status ---{}---, job: {}".format(
                    result, job["jobId"]))
                UnusualJobs[job["jobId"]] = datetime.datetime.now()
            # TODO
            # 1) May need to reduce the timeout.
            #     It takes minutes before a pod turns into "Unknown", so we
            #     may not need to wait this long.
            # 2) If the node resumes before we resubmit the job, the job
            #    will end in status 'NotFound'.
            elif (datetime.datetime.now() -
                  UnusualJobs[job["jobId"]]).total_seconds() > 30:
                del UnusualJobs[job["jobId"]]

                # TODO refine later
                # Before resubmitting the job, reset the endpoints: update
                # all of them to 'pending' so they restart when the job is
                # ready.
                endpoints = dataHandler.GetJobEndpoints(job["jobId"])
                for endpoint_id, endpoint in list(endpoints.items()):
                    endpoint["status"] = "pending"
                    logger.debug(
                        "Reset endpoint status to 'pending': {}".format(
                            endpoint_id))
                    dataHandler.UpdateEndpoint(endpoint)

                logger.warning(
                    "Job {} fails in Kubernetes as {}, delete and re-submit.".
                    format(job["jobId"], result))
                launcher.kill_job(job["jobId"], "queued")
                _notify_job_state_change(notifier, job, result)

        elif result == "Pending":
            _, detail = k8sUtils.GetJobStatus(job["jobId"])
            dataHandler.UpdateJobTextFields(
                {"jobId": job["jobId"]},
                {"jobStatusDetail": b64encode(json.dumps(detail))})

        # The job is back in a regular state: forget any earlier unusual
        # sighting.
        if result not in ("Unknown", "NotFound") and job[
                "jobId"] in UnusualJobs:
            del UnusualJobs[job["jobId"]]
    finally:
        # Only close a handler created here; a caller-supplied one stays
        # open for the caller.
        if owns_data_handler:
            dataHandler.Close()