Example #1
def KillJob(job_id, desiredState="killed", dataHandlerOri=None):
    if dataHandlerOri is None:
        dataHandler = DataHandler()
    else:
        dataHandler = dataHandlerOri

    result, detail = k8sUtils.GetJobStatus(job_id)
    dataHandler.UpdateJobTextField(job_id, "jobStatusDetail",
                                   base64.b64encode(json.dumps(detail)))
    logging.info("Killing job %s, with status %s, %s" %
                 (job_id, result, detail))

    job_deployer = JobDeployer()
    errors = job_deployer.delete_job(job_id, force=True)

    if len(errors) == 0:
        dataHandler.UpdateJobTextField(job_id, "jobStatus", desiredState)
        dataHandler.UpdateJobTextField(job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())
        if dataHandlerOri is None:
            dataHandler.Close()
        return True
    else:
        dataHandler.UpdateJobTextField(job_id, "jobStatus", "error")
        dataHandler.UpdateJobTextField(job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())
        if dataHandlerOri is None:
            dataHandler.Close()
        logging.error("Kill job failed with errors: {}".format(errors))
        return False
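
The dataHandlerOri parameter lets a caller that already holds a database handle pass it in, so KillJob opens and closes a connection only when it created one itself. A minimal usage sketch of that pattern, assuming a caller that kills several jobs in one batch (the job IDs are hypothetical):

# Minimal sketch: reuse one DataHandler across several kills so the
# connection is opened and closed only once (job IDs are hypothetical).
handler = DataHandler()
try:
    for job_id in ["job-001", "job-002"]:
        KillJob(job_id, desiredState="killed", dataHandlerOri=handler)
finally:
    handler.Close()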
Example #2
    def status(self):
        """
        Return role status in ["NotFound", "Pending", "Running", "Succeeded", "Failed", "Unknown"]
        It's slightly different from pod phase, when pod is running:
            CONTAINER_READY -> WORKER_READY -> JOB_READY (then the job finally in "Running" status.)
        """
        # pod-phase: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
        # node condition: https://kubernetes.io/docs/concepts/architecture/nodes/#condition
        deployer = JobDeployer()
        pods = deployer.get_pods(
            field_selector="metadata.name={}".format(self.pod_name))
        logging.debug("Pods: {}".format(pods))
        if len(pods) < 1:
            return "NotFound"

        assert len(pods) == 1
        self.pod = pods[0]
        phase = self.pod.status.phase

        # Note: the pod being in "Running" phase does not mean the "Role" is ready and running.
        if phase == "Running":
            # In practice the phase never turns into "Unknown" even when
            # kubectl reports 'unknown', so a lost node is detected via the reason field.
            if self.pod.status.reason == "NodeLost":
                return "Unknown"

            # Check whether the user command has started running.
            if not self.isRoleReady():
                return "Pending"

        return phase
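
The readiness chain named in the docstring (CONTAINER_READY -> WORKER_READY -> JOB_READY) is what isRoleReady has to verify. A plausible sketch, assuming each stage is signaled by a marker file inside the pod and reusing the pod_exec-based isFileExisting helper from Example #8; the /pod/... paths are assumptions, not confirmed paths:

    # Hedged sketch of isRoleReady: assume each readiness stage drops a
    # marker file in the pod, checked via isFileExisting (Example #8).
    # The /pod/... marker paths are assumptions.
    def isRoleReady(self):
        for marker in ["/pod/CONTAINER_READY", "/pod/WORKER_READY", "/pod/JOB_READY"]:
            if not self.isFileExisting(marker):
                return False
        return True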
Example #3
    def get_job_roles(job_id):
        deployer = JobDeployer()
        pods = deployer.get_pods(label_selector="run={}".format(job_id))

        job_roles = []
        for pod in pods:
            pod_name = pod.metadata.name
            if "distRole" in pod.metadata.labels:
                role = pod.metadata.labels["distRole"]
            else:
                role = "master"
            job_role = JobRole(role, pod_name)
            job_roles.append(job_role)
        return job_roles
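
get_job_roles relies on two labeling conventions: every pod of a job carries the label run=<job_id> (matched by the label_selector above), and distributed pods additionally carry distRole; anything without distRole is treated as the master. Illustrative pod metadata consistent with both lookups (the values are hypothetical; the dict mirrors the Kubernetes metadata fields the code reads):

# Illustrative metadata for a pod that get_job_roles would map to a
# JobRole("worker", "job-001-worker-0"); values are hypothetical.
pod_metadata = {
    "name": "job-001-worker-0",     # read as pod.metadata.name
    "labels": {
        "run": "job-001",           # matched by label_selector="run=job-001"
        "distRole": "worker",       # omitted on non-distributed pods -> "master"
    },
}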
Example #4
def check_job_status(job_id):
    job_roles = JobRole.get_job_roles(job_id)

    if len(job_roles) < 1:
        return "NotFound"

    # Role status is one of ["NotFound", "Pending", "Running", "Succeeded", "Failed", "Unknown"].
    # TODO: revisit. If the ps/master role has "Succeeded", report the whole job as Succeeded.
    for job_role in job_roles:
        if job_role.role_name not in ["master", "ps"]:
            continue
        if job_role.status() == "Succeeded":
            logging.info("Job: {}, Succeeded!".format(job_id))
            return "Succeeded"

    statuses = [job_role.status() for job_role in job_roles]
    logging.info("Job: {}, status: {}".format(job_id, statuses))

    details = []
    for job_role in job_roles:
        details.append(job_role.pod_details().to_dict())
    logging.info("Job {}, details: {}".format(job_id, details))

    if "Failed" in statuses:
        return "Failed"
    if "Unknown" in statuses:
        return "Unknown"
    if "NotFound" in statuses:
        return "NotFound"
    if "Pending" in statuses:
        return "Pending"

    return "Running"
Example #5
def SubmitJob(job):
    # Check whether any pod with the label run=<job_id> still exists.
    assert "jobId" in job
    job_id = job["jobId"]
    if not all_pods_not_existing(job_id):
        logging.warning(
            "Waiting until pods from the previous run are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO: refine later.
        # Before resubmitting the job, reset its endpoints: set every endpoint
        # to 'pending' so it restarts once the job is ready.
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        assert isinstance(job_object, Job)

        job_object.params = json.loads(base64.b64decode(job["jobParams"]))

        # inject gid, uid and user
        # TODO: GetIdentityInfo should return exactly one entry.
        user_info = dataHandler.GetIdentityInfo(
            job_object.params["userName"])[0]
        job_object.params["gid"] = user_info["gid"]
        job_object.params["uid"] = user_info["uid"]
        job_object.params["user"] = job_object.get_alias()

        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        if job_object.params["jobtrainingtype"] == "RegularJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        elif job_object.params["jobtrainingtype"] == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template())
        elif job_object.params["jobtrainingtype"] == "InferenceJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        else:
            dataHandler.SetJobError(
                job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                job_object.params["jobtrainingtype"])
            dataHandler.Close()
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            dataHandler.Close()
            return False

        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
            os.makedirs(os.path.dirname(local_jobDescriptionPath))
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        job_deployer = JobDeployer()
        try:
            pods = job_deployer.create_pods(pods)
            ret["output"] = "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            ret["output"] = "Error: %s" % e.message
            logging.error(e, exc_info=True)

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = job_description_path
        jobMeta["jobPath"] = job_object.job_path
        jobMeta["workPath"] = job_object.work_path
        # the command of the first container
        jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    dataHandler.Close()
    return ret
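
SubmitJob expects job["jobParams"] to arrive as base64-encoded JSON (it is decoded with json.loads(base64.b64decode(...)) above) and writes jobMeta back out with the mirror-image encoding. A sketch of the producer side of that contract, assuming Python 2 str semantics to match the b64encode calls above; the values are hypothetical, and a real job dict carries many more fields consumed by JobSchema:

# Sketch of the jobParams encoding contract (values hypothetical; a real
# job dict carries many more fields consumed by JobSchema).
params = {"userName": "alice", "jobtrainingtype": "RegularJob"}
job = {
    "jobId": "job-001",
    "jobParams": base64.b64encode(json.dumps(params)),
}
SubmitJob(job)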
Example #6
def all_pods_not_existing(job_id):
    job_roles = JobRole.get_job_roles(job_id)
    statuses = [job_role.status() for job_role in job_roles]
    logging.info("Job: {}, status: {}".format(job_id, statuses))
    return all(status == "NotFound" for status in statuses)
Example #7
import os
import sys
import argparse
import logging
import logging.config

import yaml
from cluster_manager import setup_exporter_thread, manager_iteration_histogram, register_stack_trace_dump, update_file_modification_time

sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))
import k8sUtils
from config import config, GetStoragePath, GetWorkPath
from DataHandler import DataHandler

from job_deployer import JobDeployer

logger = logging.getLogger(__name__)
deployer = JobDeployer()


def is_ssh_server_ready(pod_name):
    bash_script = "sudo service ssh status"
    output = k8sUtils.kubectl_exec("exec %s %s" % (pod_name, " -- " + bash_script))
    if output == "":
        return False
    return True


def query_ssh_port(pod_name):
    bash_script = "grep ^Port /etc/ssh/sshd_config | cut -d' ' -f2"
    status_code, output = deployer.pod_exec(pod_name, ["/bin/bash", "-c", bash_script])
    if status_code != 0:
        raise RuntimeError("Query ssh port failed: {}".format(pod_name))
    # sshd_config stores the port as a decimal number; return it as an int.
    return int(output)
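
A usage sketch combining the two helpers: poll until sshd is up inside the pod, then read the port it listens on. The pod name and retry parameters are hypothetical:

# Hedged usage sketch; pod name and retry parameters are hypothetical.
import time

pod_name = "job-001-master"
for _ in range(30):
    if is_ssh_server_ready(pod_name):
        print("ssh port: {}".format(query_ssh_port(pod_name)))
        break
    time.sleep(10)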
Example #8
    def isFileExisting(self, file):
        deployer = JobDeployer()
        status_code, _ = deployer.pod_exec(
            self.pod_name, ["/bin/sh", "-c", "ls -lrt {}".format(file)])
        return status_code == 0
Example #9
    def create_job_deployer(self):
        job_deployer = JobDeployer()
        self.assertIsNotNone(job_deployer)
        return job_deployer