Example #1
    def test_job_id_schema(self):
        job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES)
        self.assertFalse(errors)

        # uppercase letters should be rejected
        attrs = VALID_JOB_ATTRIBUTES.copy()
        attrs.update({"jobId": "First-job"})
        job, errors = JobSchema().load(attrs)
        self.assertTrue("jobId" in errors)

        # spaces should be rejected
        attrs = VALID_JOB_ATTRIBUTES.copy()
        attrs.update({"jobId": "first job"})
        job, errors = JobSchema().load(attrs)
        self.assertTrue("jobId" in errors)
Example #2
    def test_loads(self):
        job_json = json.dumps(VALID_JOB_ATTRIBUTES)

        job, errors = JobSchema().loads(job_json)
        self.assertFalse(errors)
        self.assertEqual(job.job_id, VALID_JOB_ATTRIBUTES["jobId"])
        self.assertEqual(job.email, VALID_JOB_ATTRIBUTES["userName"])
Example #3
    def test_dump(self):
        job = Job(cluster=config, job_id="test-job", email="*****@*****.**")

        result, errors = JobSchema().dump(job)

        self.assertFalse(errors)
        self.assertEqual(result["jobId"], "test-job")
        self.assertEqual(result["userName"], "*****@*****.**")
Example #4
    def scale_job(self, job):
        assert ("jobId" in job)
        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        job_object.params = json.loads(b64decode(job["jobParams"]))
        if job_object.params["jobtrainingtype"] != "InferenceJob":
            return

        name = job_object.job_id + "-deployment"
        deployment = self._get_deployment(name=name)
        replicas = deployment.spec.replicas
        new_replicas = int(job_object.params["resourcegpu"])
        if replicas == new_replicas:
            return

        deployment.spec.replicas = new_replicas
        self._patch_deployment(name=name, body=deployment)
        logger.debug("Scale inference job %s from %d to %d." %
                     (job_object.job_id, replicas, new_replicas))
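
scale_job above decodes job["jobParams"] as base64-encoded JSON before reading jobtrainingtype and resourcegpu. A minimal sketch of how a caller might build that payload, with hypothetical values:

import base64
import json

# Hedged sketch: jobParams carries base64-encoded JSON, as the
# b64decode/json.loads calls above imply.
params = {"jobtrainingtype": "InferenceJob", "resourcegpu": 2}
job = {
    "jobId": "first-job",  # hypothetical id
    "jobParams": base64.b64encode(json.dumps(params).encode("utf-8")),
}
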
Example #5
def SubmitJob(job):
    # check whether any pod with label run=job_id already exists
    assert "jobId" in job
    job_id = job["jobId"]
    if not all_pods_not_existing(job_id):
        logging.warning(
            "Waiting until previously pods are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # before resubmitting the job, reset its endpoints:
        # set every endpoint's status to 'pending' so it restarts when the job is ready
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        assert isinstance(job_object, Job)

        job_object.params = json.loads(base64.b64decode(job["jobParams"]))

        # inject gid, uid and user
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(
            job_object.params["userName"])[0]
        job_object.params["gid"] = user_info["gid"]
        job_object.params["uid"] = user_info["uid"]
        job_object.params["user"] = job_object.get_alias()

        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        if job_object.params["jobtrainingtype"] == "RegularJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        elif job_object.params["jobtrainingtype"] == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template())
        elif job_object.params["jobtrainingtype"] == "InferenceJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        else:
            dataHandler.SetJobError(
                job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                job_object.params["jobtrainingtype"])
            dataHandler.Close()
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            dataHandler.Close()
            return False

        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d"
        ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
            os.makedirs(os.path.dirname(local_jobDescriptionPath))
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        job_deployer = JobDeployer()
        try:
            pods = job_deployer.create_pods(pods)
            ret["output"] = "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            ret["output"] = "Error: %s" % e.message
            logging.error(e, exc_info=True)

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = job_description_path
        jobMeta["jobPath"] = job_object.job_path
        jobMeta["workPath"] = job_object.work_path
        # the command of the first container
        jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus", "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))
    dataHandler.Close()
    return ret
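
The job description written by SubmitJob is a multi-document YAML file, one document per pod, joined with --- separators. A minimal sketch of reading such a file back with standard PyYAML:

import yaml

# Hedged sketch: the path follows the jobfiles/<date>/<job_id>/<job_id>.yaml
# layout used above; the date and id here are hypothetical.
with open("jobfiles/200101/first-job/first-job.yaml") as f:
    pod_specs = list(yaml.safe_load_all(f))  # one dict per pod document
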
Example #6
    def create_a_job(self):
        job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES)
        self.assertFalse(errors)
        return job
Example #7
    def submit_job_impl(self, job):
        # check whether any pod with label run=job_id already exists
        assert "jobId" in job
        job_id = job["jobId"]
        if not self._all_pods_not_existing(job_id):
            logger.warning(
                "Waiting until previously pods are cleaned up! Job {}".format(
                    job_id))
            errors = self.delete_job(job_id, force=True)
            if errors:
                logger.warning("Force delete job {}: {}".format(
                    job_id, errors))
            return

        ret = {}
        dataHandler = DataHandler()

        try:
            # TODO refine later
            # before resubmitting the job, reset its endpoints:
            # set every endpoint's status to 'pending' so it restarts when the job is ready
            endpoints = dataHandler.GetJobEndpoints(job_id)
            for endpoint_id, endpoint in list(endpoints.items()):
                endpoint["status"] = "pending"
                logger.debug("Reset endpoint status to 'pending': {}".format(
                    endpoint_id))
                dataHandler.UpdateEndpoint(endpoint)

            job["cluster"] = config
            job_object, errors = JobSchema().load(job)
            assert isinstance(job_object, Job), (
                "job_object is not a Job, but " + str(type(job_object)))

            job_object.params = json.loads(b64decode(job["jobParams"]))

            # inject gid, uid and user
            # TODO it should return only one entry
            user_info = dataHandler.GetIdentityInfo(
                job_object.params["userName"])[0]
            job_object.params["gid"] = user_info["gid"]
            job_object.params["uid"] = user_info["uid"]
            job_object.params["user"] = job_object.get_alias()
            job_object.params["private_key"] = user_info["private_key"]
            job_object.params["ssh_public_keys"] = job_object.params.get(
                "ssh_public_keys", [])
            job_object.params["ssh_public_keys"].append(
                user_info["public_key"])

            if "job_token" not in job_object.params:
                if "master_token" in config and config[
                        "master_token"] is not None and "userName" in job_object.params:
                    plain_token = job_object.params["userName"] + \
                        ":" + config["master_token"]
                    job_object.params["job_token"] = hashlib.md5(
                        plain_token.encode("utf-8")).hexdigest()
                else:
                    job_object.params["job_token"] = "tryme2017"

            if "envs" not in job_object.params:
                job_object.params["envs"] = []
            job_object.params["envs"].append({
                "name":
                "DLTS_JOB_TOKEN",
                "value":
                job_object.params["job_token"]
            })

            blobfuse_secret_template = job_object.get_blobfuse_secret_template()
            image_pull_secret_template = job_object.get_image_pull_secret_template()
            secret_templates = {
                "blobfuse": blobfuse_secret_template,
                "imagePull": image_pull_secret_template
            }
            if job_object.params["jobtrainingtype"] == "RegularJob":
                pod_template = RegularJobTemplate(
                    job_object.get_template(),
                    secret_templates=secret_templates)
            elif job_object.params["jobtrainingtype"] == "PSDistJob":
                pod_template = DistributeJobTemplate(
                    job_object.get_template(),
                    secret_templates=secret_templates)
            elif job_object.params["jobtrainingtype"] == "InferenceJob":
                pod_template = InferenceJobTemplate(
                    job_object.get_template(),
                    deployment_template=job_object.get_deployment_template(),
                    secret_templates=secret_templates)
            else:
                dataHandler.SetJobError(
                    job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                    job_object.params["jobtrainingtype"])
                dataHandler.Close()
                return False

            pods, error = pod_template.generate_pods(job_object)
            if error:
                dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
                dataHandler.Close()
                return False

            job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])

            secrets = pod_template.generate_secrets(job_object)

            try:
                secrets = self.create_secrets(secrets)
                ret["output"] = "Created secrets: {}. ".format(
                    [secret.metadata.name for secret in secrets])
                created_pods = self.create_pods(pods)
                ret["output"] += "Created pods: {}".format(
                    [pod.metadata.name for pod in created_pods])
            except Exception as e:
                ret["output"] = "Error: %s" % e.message
                logger.exception(e)

            ret["jobId"] = job_object.job_id

            jobMeta = {}
            jobMeta["jobPath"] = job_object.job_path
            jobMeta["workPath"] = job_object.work_path
            # the command of the first container
            jobMeta["LaunchCMD"] = job_object.params["cmd"]

            jobMetaStr = b64encode(json.dumps(jobMeta))

            dataFields = {
                "jobStatus": "scheduling",
                "jobDescription": b64encode(job_description),
                "lastUpdated": datetime.datetime.now().isoformat(),
                "jobMeta": jobMetaStr
            }
            conditionFields = {"jobId": job_object.job_id}
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)
        except Exception as e:
            logger.error("Submit job failed: %s" % job, exc_info=True)
            ret["error"] = str(e)
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                detail = get_job_status_detail(job)
                detail = job_status_detail_with_finished_time(
                    detail, "error", "Server error in job submission")

                dataFields = {
                    "jobStatus": "error",
                    "errorMsg": "Cannot submit job!" + str(e),
                    "jobStatusDetail": b64encode(json.dumps(detail))
                }
                conditionFields = {"jobId": job["jobId"]}
                dataHandler.UpdateJobTextFields(conditionFields, dataFields)
                # Try to clean up the job
                try:
                    self.delete_job(job_id, force=True)
                    logger.info(
                        "Cleaning up job %s succeeded after %d retries of job submission"
                        % (job["jobId"], retries))
                except Exception:
                    logger.warning(
                        "Cleaning up job %s failed after %d retries of job submission"
                        % (job["jobId"], retries))

        dataHandler.Close()
        return ret
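
The job_token assigned above is just the MD5 hex digest of "userName:master_token". A worked sketch with hypothetical values:

import hashlib

# Hedged sketch: both values below are made up; the real ones come from
# job_object.params["userName"] and config["master_token"].
plain_token = "alice@example.com" + ":" + "master-token-value"
job_token = hashlib.md5(plain_token.encode("utf-8")).hexdigest()
# job_token is a 32-character lowercase hex string
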
Example #8
import os
import sys
import unittest

from job import Job, JobSchema
from pod_template import PodTemplate

sys.path.append(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))
from config import config

VALID_JOB_ATTRIBUTES = {
    "cluster": config,
    "jobId": "ce7dca49-28df-450a-a03b-51b9c2ecc69c",
    "userName": "******",
    "jobPath": "user_alias/jobs/date/job_id"
}

job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES)
assert not errors


class TestPodTemplate(unittest.TestCase):
    def test_generate_launch_script(self):
        job_id = "ce7dca49-28df-450a-a03b-51b9c2ecc69c"
        path_to_save = "/tmp"
        user_id = "20000"
        gpu_num = 3
        user_script = "sleep infinity"

        script_file = PodTemplate.generate_launch_script(
            job_id, path_to_save, user_id, gpu_num, user_script)

        # generate_launch_script should return the container command;
        # its exact contents are not specified here, so a minimal
        # non-empty check is assumed:
        self.assertTrue(script_file)
Example #9
    def submit_job_impl(self, job):
        # check whether any pod with label run=job_id already exists
        assert "jobId" in job
        job_id = job["jobId"]
        if not self._all_pods_not_existing(job_id):
            logging.warning(
                "Waiting until previously pods are cleaned up! Job {}".format(
                    job_id))
            job_deployer = JobDeployer()
            errors = job_deployer.delete_job(job_id, force=True)
            if errors:
                logging.warning("Force delete job {}: {}".format(
                    job_id, errors))
            return

        ret = {}
        dataHandler = DataHandler()

        try:
            # TODO refine later
            # before resubmitting the job, reset its endpoints:
            # set every endpoint's status to 'pending' so it restarts when the job is ready
            endpoints = dataHandler.GetJobEndpoints(job_id)
            for endpoint_id, endpoint in endpoints.items():
                endpoint["status"] = "pending"
                logging.info("Reset endpoint status to 'pending': {}".format(
                    endpoint_id))
                dataHandler.UpdateEndpoint(endpoint)

            job["cluster"] = config
            job_object, errors = JobSchema().load(job)
            assert isinstance(job_object, Job), (
                "job_object is not a Job, but " + str(type(job_object)))

            job_object.params = json.loads(base64.b64decode(job["jobParams"]))

            # inject gid, uid and user
            # TODO it should return only one entry
            user_info = dataHandler.GetIdentityInfo(
                job_object.params["userName"])[0]
            job_object.params["gid"] = user_info["gid"]
            job_object.params["uid"] = user_info["uid"]
            job_object.params["user"] = job_object.get_alias()

            if "job_token" not in job_object.params:
                if "user_sign_token" in config and "userName" in job_object.params:
                    job_object.params["job_token"] = hashlib.md5(
                        job_object.params["userName"] + ":" +
                        config["user_sign_token"]).hexdigest()
                else:
                    job_object.params["job_token"] = "tryme2017"

            if "envs" not in job_object.params:
                job_object.params["envs"] = []
            job_object.params["envs"].append({
                "name":
                "DLTS_JOB_TOKEN",
                "value":
                job_object.params["job_token"]
            })

            enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
            secret_template = job_object.get_blobfuse_secret_template()
            if job_object.params["jobtrainingtype"] == "RegularJob":
                pod_template = PodTemplate(
                    job_object.get_template(),
                    enable_custom_scheduler=enable_custom_scheduler,
                    secret_template=secret_template)
            elif job_object.params["jobtrainingtype"] == "PSDistJob":
                pod_template = DistPodTemplate(job_object.get_template(),
                                               secret_template=secret_template)
            elif job_object.params["jobtrainingtype"] == "InferenceJob":
                pod_template = PodTemplate(
                    job_object.get_template(),
                    deployment_template=job_object.get_deployment_template(),
                    enable_custom_scheduler=False,
                    secret_template=secret_template)
            else:
                dataHandler.SetJobError(
                    job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                    job_object.params["jobtrainingtype"])
                dataHandler.Close()
                return False

            pods, error = pod_template.generate_pods(job_object)
            if error:
                dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
                dataHandler.Close()
                return False

            job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
            job_description_path = "jobfiles/" + time.strftime(
                "%y%m%d"
            ) + "/" + job_object.job_id + "/" + job_object.job_id + ".yaml"
            local_jobDescriptionPath = os.path.realpath(
                os.path.join(config["storage-mount-path"],
                             job_description_path))
            if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
                os.makedirs(os.path.dirname(local_jobDescriptionPath))
            with open(local_jobDescriptionPath, 'w') as f:
                f.write(job_description)

            secrets = pod_template.generate_secrets(job_object)

            job_deployer = JobDeployer()
            try:
                secrets = job_deployer.create_secrets(secrets)
                ret["output"] = "Created secrets: {}. ".format(
                    [secret.metadata.name for secret in secrets])
                pods = job_deployer.create_pods(pods)
                ret["output"] += "Created pods: {}".format(
                    [pod.metadata.name for pod in pods])
            except Exception as e:
                ret["output"] = "Error: %s" % e.message
                logging.error(e, exc_info=True)

            ret["jobId"] = job_object.job_id

            dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                           "scheduling")
            dataHandler.UpdateJobTextField(job_object.job_id,
                                           "jobDescriptionPath",
                                           job_description_path)
            dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                           base64.b64encode(job_description))
            dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                           datetime.datetime.now().isoformat())

            jobMeta = {}
            jobMeta["jobDescriptionPath"] = job_description_path
            jobMeta["jobPath"] = job_object.job_path
            jobMeta["workPath"] = job_object.work_path
            # the command of the first container
            jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

            jobMetaStr = base64.b64encode(json.dumps(jobMeta))
            dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                           jobMetaStr)
        except Exception as e:
            logging.error("Submit job failed: %s" % job, exc_info=True)
            ret["error"] = str(e)
            retries = dataHandler.AddandGetJobRetries(job["jobId"])
            if retries >= 5:
                dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                               "error")
                dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                               "Cannot submit job!" + str(e))

                detail = get_job_status_detail(job)
                detail = job_status_detail_with_finished_time(
                    detail, "error", "Server error in job submission")
                dataHandler.UpdateJobTextField(
                    job["jobId"], "jobStatusDetail",
                    base64.b64encode(json.dumps(detail)))

                # Try to clean up the job
                try:
                    job_deployer = JobDeployer()
                    job_deployer.delete_job(job_id, force=True)
                    logging.info(
                        "Cleaning up job %s succeeded after %d retries of job submission"
                        % (job["jobId"], retries))
                except Exception:
                    logging.warning(
                        "Cleaning up job %s failed after %d retries of job submission"
                        % (job["jobId"], retries))

        dataHandler.Close()
        return ret
Example #10
    def get(self):
        # List all jobs
        schema = JobSchema(many=True)
        jobs_dump = schema.dump(jobs)
        return jsonify(jobs_dump.data)
Example #11
def job_to_json(job):
    schema = JobSchema(many=False)
    job_dump = schema.dump(job)
    return jsonify(job_dump.data)
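
Taken together, the examples imply a JobSchema that maps camelCase JSON keys onto Job attributes (jobId to job_id, userName to email, jobPath to job_path) and that, in marshmallow 2.x style, returns (data, errors) pairs from load, loads, and dump. A minimal sketch under those assumptions; the field list and the jobId pattern are illustrative, not the real schema:

from marshmallow import Schema, fields, post_load, validate


class Job(object):
    # Hedged sketch: the real Job class takes more parameters.
    def __init__(self, cluster=None, job_id=None, email=None, job_path=""):
        self.cluster = cluster
        self.job_id = job_id
        self.email = email
        self.job_path = job_path


class JobSchema(Schema):
    cluster = fields.Dict(required=True)
    # attribute= maps the camelCase JSON key to the snake_case attribute
    # in both load and dump (marshmallow 2.x behavior)
    jobId = fields.String(
        required=True,
        attribute="job_id",
        validate=validate.Regexp(r"^[a-z0-9-]+$"))  # hypothetical pattern
    userName = fields.Email(attribute="email")
    jobPath = fields.String(attribute="job_path")

    @post_load
    def make_job(self, data):
        return Job(**data)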