Ejemplo n.º 1
0
def KillJob(job):
    """Delete a job from the Kubernetes cluster and record the outcome.

    The current Kubernetes status detail is stored in the job's
    ``jobStatusDetail`` field first.  The job is then deleted by running
    ``kubectl delete`` on its description file, which is looked up under
    ``config["storage-mount-path"]``.

    Args:
        job: Mapping describing the job.  ``job["jobId"]`` is required;
            ``job["jobDescriptionPath"]`` is optional and may be ``None``.

    Returns:
        True if ``k8sUtils.kubectl_delete`` returned 0 (the job status is
        then set to "killed").  False otherwise; the job status is set to
        "error" and ``errorMsg`` explains why.
    """
    dataHandler = DataHandler()
    job_id = job["jobId"]

    result, detail = k8sUtils.GetJobStatus(job_id)
    dataHandler.UpdateJobTextField(job_id, "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))
    logging.info("Killing job %s, with status %s, %s", job_id, result, detail)

    # Default failure reason.  It covers both "no description path recorded"
    # and "path recorded but the file is not on disk"; the latter used to
    # mark the job as "error" without any message.
    error_msg = "Cannot find job description file!"

    if "jobDescriptionPath" in job and job["jobDescriptionPath"] is not None:
        jobDescriptionPath = os.path.join(config["storage-mount-path"],
                                          job["jobDescriptionPath"])
        if os.path.isfile(jobDescriptionPath):
            if k8sUtils.kubectl_delete(jobDescriptionPath) == 0:
                dataHandler.UpdateJobTextField(job_id, "jobStatus", "killed")
                return True
            error_msg = "Cannot delete job from Kubernetes Cluster!"

    dataHandler.UpdateJobTextField(job_id, "errorMsg", error_msg)
    dataHandler.UpdateJobTextField(job_id, "jobStatus", "error")
    return False
Ejemplo n.º 2
0
def KillJob(job_id, desiredState="killed", dataHandlerOri=None):
    """Force-delete a job and persist its resulting state.

    Args:
        job_id: Identifier of the job to delete.
        desiredState: Value written to ``jobStatus`` when deletion reports
            no errors.
        dataHandlerOri: Optional existing data handler to reuse.  When it is
            ``None`` a new ``DataHandler`` is created here and closed before
            returning; a caller-supplied handler is never closed.

    Returns:
        True if ``JobDeployer.delete_job`` returned no errors, else False
        (in which case ``jobStatus`` is set to "error").
    """
    owns_handler = dataHandlerOri is None
    dataHandler = DataHandler() if owns_handler else dataHandlerOri

    # try/finally so that a handler created here is closed even when one of
    # the Kubernetes or database calls raises.
    try:
        result, detail = k8sUtils.GetJobStatus(job_id)
        dataHandler.UpdateJobTextField(job_id, "jobStatusDetail",
                                       base64.b64encode(json.dumps(detail)))
        logging.info("Killing job %s, with status %s, %s", job_id, result,
                     detail)

        errors = JobDeployer().delete_job(job_id, force=True)
        succeeded = len(errors) == 0

        dataHandler.UpdateJobTextField(
            job_id, "jobStatus", desiredState if succeeded else "error")
        dataHandler.UpdateJobTextField(job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())
    finally:
        if owns_handler:
            dataHandler.Close()

    if not succeeded:
        logging.error("Kill job failed with errors: {}".format(errors))
    return succeeded
Ejemplo n.º 3
0
    def kill_job(self, job_id, desired_state="killed"):
        """Force-delete a job and persist its resulting state.

        The status detail returned by Kubernetes is passed through
        ``job_status_detail_with_finished_time`` and stored before the
        deletion is attempted, then stored again together with the final
        ``jobStatus`` and a ``lastUpdated`` timestamp.

        Args:
            job_id: Identifier of the job to delete.
            desired_state: Value written to ``jobStatus`` when
                ``self.delete_job`` reports no errors.

        Returns:
            True if deletion reported no errors, else False (``jobStatus``
            is then set to "error").
        """
        dataHandler = DataHandler()

        # try/finally so the handler is closed even if a call below raises.
        try:
            result, detail = k8sUtils.GetJobStatus(job_id)
            detail = job_status_detail_with_finished_time(
                detail, desired_state)
            encoded_detail = b64encode(json.dumps(detail))

            dataHandler.UpdateJobTextFields(
                {"jobId": job_id}, {"jobStatusDetail": encoded_detail})
            logger.info("Killing job %s, with status %s, %s", job_id, result,
                        detail)

            errors = self.delete_job(job_id, force=True)
            succeeded = len(errors) == 0

            dataFields = {
                "jobStatusDetail": encoded_detail,
                "lastUpdated": datetime.datetime.now().isoformat(),
                "jobStatus": desired_state if succeeded else "error",
            }
            dataHandler.UpdateJobTextFields({"jobId": job_id}, dataFields)
        finally:
            dataHandler.Close()

        if not succeeded:
            logger.error("Kill job failed with errors: {}".format(errors))
        return succeeded
Ejemplo n.º 4
0
def UpdateDistJobStatus(job):
    """Refresh the stored status of a distributed (ps/worker) job.

    The Kubernetes status detail is always recorded.  Further processing
    only happens once the number of worker and ps pods found matches
    ``numpsworker`` and ``numps`` from the job parameters:

    * a job in state "scheduling" has its user command launched via
      ``launch_ps_dist_job``;
    * a job in state "running" is inspected through ``GetDistJobStatus`` and
      moved to "finished" / "failed" / "error", or re-submitted, depending
      on the result.

    Args:
        job: Mapping with at least ``jobId``, ``jobStatus`` and the
            base64-encoded JSON ``jobParams``; ``jobDescriptionPath`` is
            optional.
    """
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    # NOTE(review): `detail` is base64-encoded here without json.dumps,
    # while the log line below serialises it with json.dumps -- confirm the
    # type returned by k8sUtils.GetJobStatus.
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(detail))

    logging.info("job %s status: %s,%s", job["jobId"], result,
                 json.dumps(detail))

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    def delete_job_description():
        # Remove the job's Kubernetes resources if its description exists.
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            k8sUtils.kubectl_delete(jobDescriptionPath)

    jobId = jobParams["jobId"]
    workerPodInfo = k8sUtils.GetPod("distRole=worker,run=" + jobId)
    psPodInfo = k8sUtils.GetPod("distRole=ps,run=" + jobId)
    all_pods_present = (
        "items" in workerPodInfo and
        len(workerPodInfo["items"]) == int(jobParams["numpsworker"]) and
        "items" in psPodInfo and
        len(psPodInfo["items"]) == int(jobParams["numps"]))
    if not all_pods_present:
        return

    if job["jobStatus"] == "scheduling":
        launch_ps_dist_job(jobParams)
    if job["jobStatus"] != "running":
        return

    result, detail = GetDistJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(detail))

    printlog("job %s status: %s" % (job["jobId"], result))

    status = result.strip()

    if status == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        delete_job_description()

    elif status == "Running":
        # The job is already recorded as "running" at this point, so only
        # the log and the endpoints need refreshing.
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                           serviceAddress)

    elif status == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        delete_job_description()

    elif status == "Unknown":
        now = datetime.datetime.now()
        if job["jobId"] not in UnusualJobs:
            # First sighting: remember when the job became "Unknown".
            UnusualJobs[job["jobId"]] = now
        # total_seconds() rather than .seconds: .seconds drops the days
        # component and misbehaves for negative deltas.
        elif (now - UnusualJobs[job["jobId"]]).total_seconds() > 300:
            del UnusualJobs[job["jobId"]]
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")
                delete_job_description()
            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)

    # Any definite status clears the "Unknown since" bookkeeping.
    if status != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]
Ejemplo n.º 5
0
def UpdateJobStatus(redis_conn,
                    launcher,
                    job,
                    notifier=None,
                    dataHandlerOri=None):
    """Synchronise a job's stored state with the status reported by launcher.

    Handles the results "Succeeded", "Running", "Failed",
    "Unknown"/"NotFound" and "Pending" from ``launcher.get_job_status``.

    Args:
        redis_conn: Passed to ``update_job_state_latency`` when the job is
            running.
        launcher: Object providing ``get_job_status``, ``delete_job``,
            ``scale_job`` and ``kill_job``.
        job: Mapping with ``jobId``, ``jobStatus`` ("scheduling" or
            "running"), ``userName``, ``jobTime`` and base64-encoded JSON
            ``jobParams``.
        notifier: Optional object whose ``notify`` receives job state change
            messages.
        dataHandlerOri: Optional existing data handler to reuse.  When it is
            ``None`` a new ``DataHandler`` is created and closed on every
            exit path; a caller-supplied handler is never closed.
    """
    assert (job["jobStatus"] == "scheduling" or job["jobStatus"] == "running")
    owns_handler = dataHandlerOri is None
    dataHandler = DataHandler() if owns_handler else dataHandlerOri

    # try/finally so that the early return in the "Failed" debug branch and
    # any exception still close a handler created here.
    try:
        jobParams = json.loads(b64decode(job["jobParams"]))

        result, details, diagnostics = launcher.get_job_status(job["jobId"])
        logger.info("Job status: %s %s", job["jobId"], result)

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        logPath = os.path.join(localJobPath, "logs/joblog.txt")

        if "userId" not in jobParams:
            jobParams["userId"] = "0"

        conditionFields = {"jobId": job["jobId"]}

        if result == "Succeeded":
            joblog_manager.extract_job_log(job["jobId"], logPath,
                                           jobParams["userId"])

            # TODO: Refactor
            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(detail, "finished")

            dataFields = {
                "jobStatusDetail": b64encode(json.dumps(detail)),
                "jobStatus": "finished"
            }
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)

            launcher.delete_job(job["jobId"], force=True)

            if notifier is not None:
                notifier.notify(
                    notify.new_job_state_change_message(
                        job["userName"], job["jobId"], result.strip()))

        elif result == "Running":
            update_job_state_latency(redis_conn, job["jobId"], "running")
            launcher.scale_job(job)
            if job["jobStatus"] != "running":
                # First time the job is seen running: record the start time.
                started_at = k8sUtils.localize_time(datetime.datetime.now())
                detail = [{
                    "startedAt": started_at,
                    "message": "started at: {}".format(started_at)
                }]

                dataFields = {
                    "jobStatusDetail": b64encode(json.dumps(detail)),
                    "jobStatus": "running"
                }
                dataHandler.UpdateJobTextFields(conditionFields, dataFields)
                if notifier is not None:
                    notifier.notify(
                        notify.new_job_state_change_message(
                            job["userName"], job["jobId"], result.strip()))

        elif result == "Failed":
            now = datetime.datetime.now()
            # jobParams already holds the decoded parameters, so there is no
            # need to decode job["jobParams"] a second time.
            # total_seconds() rather than .seconds: .seconds drops the days
            # component, so an old job would look "recent" once a day.
            if (jobParams.get("debug") is True and
                    (now - job["jobTime"]).total_seconds() < 60):
                logger.info("leave job %s there for debug for 60s",
                            job["jobId"])
                return
            logger.warning("Job %s fails, cleaning...", job["jobId"])

            if notifier is not None:
                notifier.notify(
                    notify.new_job_state_change_message(
                        job["userName"], job["jobId"], result.strip()))

            joblog_manager.extract_job_log(job["jobId"], logPath,
                                           jobParams["userId"])

            # TODO: Refactor
            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(detail, "failed")

            dataFields = {
                "jobStatusDetail": b64encode(json.dumps(detail)),
                "jobStatus": "failed",
                "errorMsg": diagnostics
            }
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)

            launcher.delete_job(job["jobId"], force=True)

        elif result == "Unknown" or result == "NotFound":
            if job["jobId"] not in UnusualJobs:
                logger.warning("!!! Job status ---{}---, job: {}".format(
                    result, job["jobId"]))
                UnusualJobs[job["jobId"]] = datetime.datetime.now()
            # TODO
            # 1) May need to reduce the timeout.
            #     It takes minutes before a pod turns into "Unknown"; we may
            #     not need to wait so long.
            # 2) If the node resumes before we resubmit the job, the job
            #    will end in status 'NotFound'.
            elif (datetime.datetime.now() -
                  UnusualJobs[job["jobId"]]).total_seconds() > 30:
                del UnusualJobs[job["jobId"]]

                # TODO refine later
                # Before resubmitting the job, reset all endpoints to
                # 'pending' so they restart when the job is ready.
                endpoints = dataHandler.GetJobEndpoints(job["jobId"])
                for endpoint_id, endpoint in list(endpoints.items()):
                    endpoint["status"] = "pending"
                    logger.debug(
                        "Reset endpoint status to 'pending': {}".format(
                            endpoint_id))
                    dataHandler.UpdateEndpoint(endpoint)

                logger.warning(
                    "Job {} fails in Kubernetes as {}, delete and re-submit.".
                    format(job["jobId"], result))
                launcher.kill_job(job["jobId"], "queued")
                if notifier is not None:
                    notifier.notify(
                        notify.new_job_state_change_message(
                            job["userName"], job["jobId"], result.strip()))

        elif result == "Pending":
            _, detail = k8sUtils.GetJobStatus(job["jobId"])
            dataHandler.UpdateJobTextFields(
                conditionFields,
                {"jobStatusDetail": b64encode(json.dumps(detail))})

        # Any definite status clears the "unusual since" bookkeeping.
        if result != "Unknown" and result != "NotFound" and job[
                "jobId"] in UnusualJobs:
            del UnusualJobs[job["jobId"]]
    finally:
        if owns_handler:
            dataHandler.Close()
Ejemplo n.º 6
0
def UpdateJobStatus(job):
    """Synchronise a job's stored state with its Kubernetes status.

    A "PSDistJob" that is still "scheduling" gets its user command launched
    once ``k8sUtils.all_pod_ready`` reports the pods ready, unless
    Kubernetes already reports it as "Failed" or "Succeeded".  All other
    jobs are moved to "finished" / "running" / "failed" / "error", or
    re-submitted, according to the status returned by
    ``k8sUtils.GetJobStatus``.

    Args:
        job: Mapping with at least ``jobId``, ``jobStatus`` and the
            base64-encoded JSON ``jobParams``; ``jobDescriptionPath`` is
            optional.
    """
    dataHandler = DataHandler()

    # try/finally so that the early return in the scheduling branch and any
    # exception still close the handler.
    try:
        jobParams = json.loads(base64.b64decode(job["jobParams"]))

        if job["jobStatus"] == "scheduling" and jobParams[
                "jobtrainingtype"] == "PSDistJob":
            # Launch the user command only when all pods are ready.
            result, detail = k8sUtils.GetJobStatus(job["jobId"])
            if result in ["Failed", "Succeeded"]:
                # TODO shouldn't be here, update status
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               result)
            else:
                # Previously the status was 'scheduling'; launch now if all
                # pods are ready.
                # TODO check all pods are ready
                if k8sUtils.all_pod_ready(job["jobId"]):
                    try:
                        launch_ps_dist_job(jobParams)
                    except Exception:
                        # Best effort: the launch is attempted again on a
                        # later call, but keep the traceback in the log.
                        logging.exception(
                            "Failed to launch PS dist job %s", job["jobId"])
                return

        jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                     jobParams["workPath"],
                                                     jobParams["dataPath"])
        localJobPath = os.path.join(config["storage-mount-path"], jobPath)
        logPath = os.path.join(localJobPath, "logs/joblog.txt")

        result, detail = k8sUtils.GetJobStatus(job["jobId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                       base64.b64encode(json.dumps(detail)))

        logging.info("job %s status: %s,%s", job["jobId"], result,
                     json.dumps(detail))

        jobDescriptionPath = os.path.join(
            config["storage-mount-path"],
            job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

        def delete_job_description():
            # Remove the job's Kubernetes resources if its description exists.
            if jobDescriptionPath is not None and os.path.isfile(
                    jobDescriptionPath):
                k8sUtils.kubectl_delete(jobDescriptionPath)

        if "userId" not in jobParams:
            jobParams["userId"] = "0"

        status = result.strip()

        if status == "Succeeded":
            joblog_manager.extract_job_log(job["jobId"], logPath,
                                           jobParams["userId"])
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "finished")
            delete_job_description()

        elif status == "Running":
            if job["jobStatus"] != "running":
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "running")

        elif status == "Failed":
            printlog("Job %s fails, cleaning..." % job["jobId"])
            joblog_manager.extract_job_log(job["jobId"], logPath,
                                           jobParams["userId"])
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "failed")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
            delete_job_description()

        elif status == "Unknown":
            now = datetime.datetime.now()
            if job["jobId"] not in UnusualJobs:
                # First sighting: remember when the job became "Unknown".
                UnusualJobs[job["jobId"]] = now
            # total_seconds() rather than .seconds: .seconds drops the days
            # component and misbehaves for negative deltas.
            elif (now - UnusualJobs[job["jobId"]]).total_seconds() > 300:
                del UnusualJobs[job["jobId"]]
                retries = dataHandler.AddandGetJobRetries(job["jobId"])
                if retries >= 5:
                    printlog("Job %s fails for more than 5 times, abort" %
                             job["jobId"])
                    dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                                   "error")
                    dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                                   "cannot launch the job.")
                    delete_job_description()
                else:
                    printlog(
                        "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                        % (job["jobId"], retries))
                    SubmitJob(job)

        elif status == "PendingHostPort":
            printlog(
                "Cannot find host ports for job :%s, re-launch the job with different host ports "
                % (job["jobId"]))

            SubmitJob(job)

        # Any definite status clears the "Unknown since" bookkeeping.
        if status != "Unknown" and job["jobId"] in UnusualJobs:
            del UnusualJobs[job["jobId"]]
    finally:
        dataHandler.Close()
Ejemplo n.º 7
0
def UpdateJobStatus(job):
    """Synchronise a job's stored state with its Kubernetes status.

    A "PSDistJob" that is still "scheduling" has ``launch_ps_dist_job``
    invoked first.  The status detail from ``k8sUtils.GetJobStatus`` is then
    recorded and the job is moved to "finished" / "running" / "failed" /
    "error", or re-submitted, according to the reported result.

    Args:
        job: Mapping with at least ``jobId``, ``jobStatus`` and the
            base64-encoded JSON ``jobParams``; ``jobDescriptionPath`` is
            optional.
    """
    # NOTE(review): the handler is never closed in this function -- confirm
    # whether DataHandler needs an explicit Close() here.
    dataHandler = DataHandler()
    jobParams = json.loads(base64.b64decode(job["jobParams"]))
    logging.info("start to update job status...")

    if job["jobStatus"] == "scheduling" and jobParams[
            "jobtrainingtype"] == "PSDistJob":
        launch_ps_dist_job(jobParams)

    jobPath, workPath, dataPath = GetStoragePath(jobParams["jobPath"],
                                                 jobParams["workPath"],
                                                 jobParams["dataPath"])
    localJobPath = os.path.join(config["storage-mount-path"], jobPath)
    logPath = os.path.join(localJobPath, "logs/joblog.txt")

    result, detail = k8sUtils.GetJobStatus(job["jobId"])
    dataHandler.UpdateJobTextField(job["jobId"], "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))

    msg = "job %s status, result: %s, detail: %s" % (job["jobId"], result,
                                                     json.dumps(detail))
    logging.info(msg)

    jobDescriptionPath = os.path.join(
        config["storage-mount-path"],
        job["jobDescriptionPath"]) if "jobDescriptionPath" in job else None

    def delete_job_description():
        # Remove the job's Kubernetes resources if its description exists,
        # and log what kubectl returned.
        if jobDescriptionPath is not None and os.path.isfile(
                jobDescriptionPath):
            output = k8sUtils.kubectl_delete(jobDescriptionPath)
            logging.info("kubectl delete " + jobDescriptionPath + " output: " +
                         str(output))

    if "userId" not in jobParams:
        jobParams["userId"] = "0"

    status = result.strip()

    if status == "Succeeded":
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "finished")
        delete_job_description()

    elif status == "Running":
        if job["jobStatus"] != "running":
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "running")

        if "interactivePort" in jobParams:
            serviceAddress = k8sUtils.GetServiceAddress(job["jobId"])
            serviceAddress = base64.b64encode(json.dumps(serviceAddress))
            dataHandler.UpdateJobTextField(job["jobId"], "endpoints",
                                           serviceAddress)

    elif status == "Failed":
        printlog("Job %s fails, cleaning..." % job["jobId"])
        joblog_manager.extract_job_log(job["jobId"], logPath,
                                       jobParams["userId"])
        dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "failed")
        dataHandler.UpdateJobTextField(job["jobId"], "errorMsg", detail)
        delete_job_description()

    elif status == "Unknown":
        now = datetime.datetime.now()
        if job["jobId"] not in UnusualJobs:
            # First sighting: remember when the job became "Unknown".
            UnusualJobs[job["jobId"]] = now

        # total_seconds() rather than .seconds: .seconds drops the days
        # component and misbehaves for negative deltas.
        elif (now - UnusualJobs[job["jobId"]]).total_seconds() > 300:
            del UnusualJobs[job["jobId"]]

            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                printlog("Job %s fails for more than 5 times, abort" %
                         job["jobId"])
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "cannot launch the job.")
                delete_job_description()

            else:
                printlog(
                    "Job %s fails in Kubernetes, delete and re-submit the job. Retries %d"
                    % (job["jobId"], retries))
                SubmitJob(job)

    elif status == "PendingHostPort":
        printlog(
            "Cannot find host ports for job :%s, re-launch the job with different host ports "
            % (job["jobId"]))

        SubmitJob(job)

    # Any definite status clears the "Unknown since" bookkeeping.
    if status != "Unknown" and job["jobId"] in UnusualJobs:
        del UnusualJobs[job["jobId"]]