def test_job_id_schema(self):
    job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES)
    self.assertFalse(errors)

    # uppercase
    attrs = VALID_JOB_ATTRIBUTES.copy()
    attrs.update({"jobId": "First-job"})
    job, errors = JobSchema().load(attrs)
    self.assertTrue("jobId" in errors)

    # space
    attrs = VALID_JOB_ATTRIBUTES.copy()
    attrs.update({"jobId": "first job"})
    job, errors = JobSchema().load(attrs)
    self.assertTrue("jobId" in errors)
def test_loads(self):
    job_json = json.dumps(VALID_JOB_ATTRIBUTES)
    job, errors = JobSchema().loads(job_json)
    self.assertFalse(errors)
    self.assertEqual(job.job_id, VALID_JOB_ATTRIBUTES["jobId"])
    self.assertEqual(job.email, VALID_JOB_ATTRIBUTES["userName"])
def test_dump(self):
    job = Job(cluster=config, job_id="test-job", email="*****@*****.**")
    result, errors = JobSchema().dump(job)
    self.assertFalse(errors)
    self.assertEqual(result["jobId"], "test-job")
    self.assertEqual(result["userName"], "*****@*****.**")
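# Context for the tests above: they use the marshmallow 2.x API, in which
# load()/loads()/dump() return a (data, errors) pair instead of raising.
# Below is a minimal, hypothetical sketch of the jobId validation that
# test_job_id_schema exercises; the real JobSchema in job.py may define
# its fields differently.
from marshmallow import Schema, fields, validate


class JobIdSchemaSketch(Schema):
    # Lowercase letters, digits and dashes only, so "First-job" (uppercase)
    # and "first job" (space) both fail validation.
    jobId = fields.String(required=True,
                          validate=validate.Regexp(r"^[a-z0-9-]+$"))


_, errors = JobIdSchemaSketch().load({"jobId": "first-job"})
assert not errors
_, errors = JobIdSchemaSketch().load({"jobId": "First-job"})
assert "jobId" in errors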
def scale_job(self, job):
    assert "jobId" in job
    job["cluster"] = config
    job_object, errors = JobSchema().load(job)
    job_object.params = json.loads(b64decode(job["jobParams"]))

    # Only inference jobs are backed by a deployment that can be scaled.
    if job_object.params["jobtrainingtype"] != "InferenceJob":
        return

    name = job_object.job_id + "-deployment"
    deployment = self._get_deployment(name=name)
    replicas = deployment.spec.replicas
    new_replicas = int(job_object.params["resourcegpu"])
    if replicas == new_replicas:
        return

    deployment.spec.replicas = new_replicas
    self._patch_deployment(name=name, body=deployment)
    logger.debug("Scale inference job %s from %d to %d." %
                 (job_object.job_id, replicas, new_replicas))
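# The deployment helpers used by scale_job above are defined elsewhere. A
# minimal sketch of what they might wrap in the official kubernetes Python
# client; the class name and the "default" namespace are assumptions, not
# the actual implementation.
from kubernetes import client as k8s_client, config as k8s_config


class DeploymentHelperSketch(object):
    def __init__(self, namespace="default"):
        # Assumes a kubeconfig is available; in-cluster code would call
        # k8s_config.load_incluster_config() instead.
        k8s_config.load_kube_config()
        self.apps_api = k8s_client.AppsV1Api()
        self.namespace = namespace

    def _get_deployment(self, name):
        # Returns a V1Deployment whose spec.replicas can be mutated.
        return self.apps_api.read_namespaced_deployment(
            name=name, namespace=self.namespace)

    def _patch_deployment(self, name, body):
        # Applies the mutated deployment object back to the cluster.
        return self.apps_api.patch_namespaced_deployment(
            name=name, namespace=self.namespace, body=body)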
def SubmitJob(job):
    # Check whether any pods with label run=job_id still exist.
    assert "jobId" in job
    job_id = job["jobId"]
    if not all_pods_not_existing(job_id):
        logging.warning(
            "Waiting until previous pods are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # Before resubmitting the job, reset all endpoints to status
        # 'pending' so they restart once the job is ready.
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info(
                "Reset endpoint status to 'pending': {}".format(endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        # TODO assert job_object is a Job
        assert isinstance(job_object, Job)

        job_object.params = json.loads(base64.b64decode(job["jobParams"]))

        # Inject gid, uid and user.
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(
            job_object.params["userName"])[0]
        job_object.params["gid"] = user_info["gid"]
        job_object.params["uid"] = user_info["uid"]
        job_object.params["user"] = job_object.get_alias()

        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        if job_object.params["jobtrainingtype"] == "RegularJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        elif job_object.params["jobtrainingtype"] == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template())
        elif job_object.params["jobtrainingtype"] == "InferenceJob":
            pod_template = PodTemplate(job_object.get_template(),
                                       enable_custom_scheduler)
        else:
            dataHandler.SetJobError(
                job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                job_object.params["jobtrainingtype"])
            dataHandler.Close()
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            dataHandler.Close()
            return False

        # Serialize all pods into one multi-document YAML job description.
        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d") + "/" + job_object.job_id + "/" + \
            job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
            os.makedirs(os.path.dirname(local_jobDescriptionPath))
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        job_deployer = JobDeployer()
        try:
            pods = job_deployer.create_pods(pods)
            ret["output"] = "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            ret["output"] = "Error: %s" % str(e)
            logging.error(e, exc_info=True)

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id,
                                       "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = job_description_path
        jobMeta["jobPath"] = job_object.job_path
        jobMeta["workPath"] = job_object.work_path
        # The command of the first container.
        jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

    dataHandler.Close()
    return ret
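# all_pods_not_existing is defined elsewhere; per the comment at the top of
# SubmitJob it checks for leftover pods labeled run=<job_id>. A hypothetical
# sketch with the kubernetes Python client (the "default" namespace and an
# already configured client are assumptions):
from kubernetes import client as k8s_client


def all_pods_not_existing_sketch(job_id, namespace="default"):
    pods = k8s_client.CoreV1Api().list_namespaced_pod(
        namespace=namespace, label_selector="run={}".format(job_id))
    return len(pods.items) == 0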
def create_a_job(self):
    job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES)
    self.assertFalse(errors)
    return job
def submit_job_impl(self, job):
    # Check whether any pods with label run=job_id still exist.
    assert "jobId" in job
    job_id = job["jobId"]
    if not self._all_pods_not_existing(job_id):
        logger.warning(
            "Waiting until previous pods are cleaned up! Job {}".format(
                job_id))
        errors = self.delete_job(job_id, force=True)
        if errors:
            logger.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # Before resubmitting the job, reset all endpoints to status
        # 'pending' so they restart once the job is ready.
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in list(endpoints.items()):
            endpoint["status"] = "pending"
            logger.debug("Reset endpoint status to 'pending': {}".format(
                endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        # TODO assert job_object is a Job
        assert isinstance(
            job_object,
            Job), "job_object is not of Job, but " + str(type(job_object))

        job_object.params = json.loads(b64decode(job["jobParams"]))

        # Inject gid, uid and user.
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(
            job_object.params["userName"])[0]
        job_object.params["gid"] = user_info["gid"]
        job_object.params["uid"] = user_info["uid"]
        job_object.params["user"] = job_object.get_alias()
        job_object.params["private_key"] = user_info["private_key"]
        job_object.params["ssh_public_keys"] = job_object.params.get(
            "ssh_public_keys", [])
        job_object.params["ssh_public_keys"].append(user_info["public_key"])

        if "job_token" not in job_object.params:
            if "master_token" in config and \
                    config["master_token"] is not None and \
                    "userName" in job_object.params:
                plain_token = job_object.params["userName"] + \
                    ":" + config["master_token"]
                job_object.params["job_token"] = hashlib.md5(
                    plain_token.encode("utf-8")).hexdigest()
            else:
                job_object.params["job_token"] = "tryme2017"

        if "envs" not in job_object.params:
            job_object.params["envs"] = []
        job_object.params["envs"].append({
            "name": "DLTS_JOB_TOKEN",
            "value": job_object.params["job_token"]
        })

        blobfuse_secret_template = job_object.get_blobfuse_secret_template()
        image_pull_secret_template = \
            job_object.get_image_pull_secret_template()
        secret_templates = {
            "blobfuse": blobfuse_secret_template,
            "imagePull": image_pull_secret_template
        }
        if job_object.params["jobtrainingtype"] == "RegularJob":
            pod_template = RegularJobTemplate(
                job_object.get_template(),
                secret_templates=secret_templates)
        elif job_object.params["jobtrainingtype"] == "PSDistJob":
            pod_template = DistributeJobTemplate(
                job_object.get_template(),
                secret_templates=secret_templates)
        elif job_object.params["jobtrainingtype"] == "InferenceJob":
            pod_template = InferenceJobTemplate(
                job_object.get_template(),
                deployment_template=job_object.get_deployment_template(),
                secret_templates=secret_templates)
        else:
            dataHandler.SetJobError(
                job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                job_object.params["jobtrainingtype"])
            dataHandler.Close()
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            dataHandler.Close()
            return False

        # Serialize all pods into one multi-document YAML job description.
        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])

        secrets = pod_template.generate_secrets(job_object)
        try:
            secrets = self.create_secrets(secrets)
            ret["output"] = "Created secrets: {}. ".format(
                [secret.metadata.name for secret in secrets])
            created_pods = self.create_pods(pods)
            ret["output"] += "Created pods: {}".format(
                [pod.metadata.name for pod in created_pods])
        except Exception as e:
            ret["output"] = "Error: %s" % str(e)
            logger.exception(e)

        ret["jobId"] = job_object.job_id

        jobMeta = {}
        jobMeta["jobPath"] = job_object.job_path
        jobMeta["workPath"] = job_object.work_path
        # The command of the first container.
        jobMeta["LaunchCMD"] = job_object.params["cmd"]

        jobMetaStr = b64encode(json.dumps(jobMeta))

        dataFields = {
            "jobStatus": "scheduling",
            "jobDescription": b64encode(job_description),
            "lastUpdated": datetime.datetime.now().isoformat(),
            "jobMeta": jobMetaStr
        }
        conditionFields = {"jobId": job_object.job_id}
        dataHandler.UpdateJobTextFields(conditionFields, dataFields)
    except Exception as e:
        logger.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(
                detail, "error", "Server error in job submission")

            dataFields = {
                "jobStatus": "error",
                "errorMsg": "Cannot submit job!" + str(e),
                "jobStatusDetail": b64encode(json.dumps(detail))
            }
            conditionFields = {"jobId": job["jobId"]}
            dataHandler.UpdateJobTextFields(conditionFields, dataFields)

            # Try to clean up the job.
            try:
                self.delete_job(job_id, force=True)
                logger.info(
                    "Cleaning up job %s succeeded after %d retries of "
                    "job submission" % (job["jobId"], retries))
            except Exception:
                logger.warning(
                    "Cleaning up job %s failed after %d retries of "
                    "job submission" % (job["jobId"], retries))

    dataHandler.Close()
    return ret
import os
import sys
import unittest

from job import Job, JobSchema
from pod_template import PodTemplate

sys.path.append(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "../utils"))

from config import config

VALID_JOB_ATTRIBUTES = {
    "cluster": config,
    "jobId": "ce7dca49-28df-450a-a03b-51b9c2ecc69c",
    "userName": "******",
    "jobPath": "user_alias/jobs/date/job_id"
}

job, errors = JobSchema().load(VALID_JOB_ATTRIBUTES)
assert not errors


class TestPodTemplate(unittest.TestCase):
    def test_generate_launch_script(self):
        job_id = "ce7dca49-28df-450a-a03b-51b9c2ecc69c"
        path_to_save = "/tmp"
        user_id = "20000"
        gpu_num = 3
        user_script = "sleep infinity"
        # generate_launch_script returns the container command.
        script_file = PodTemplate.generate_launch_script(
            job_id, path_to_save, user_id, gpu_num, user_script)
def submit_job_impl(self, job):
    # Check whether any pods with label run=job_id still exist.
    assert "jobId" in job
    job_id = job["jobId"]
    if not self._all_pods_not_existing(job_id):
        logging.warning(
            "Waiting until previous pods are cleaned up! Job {}".format(
                job_id))
        job_deployer = JobDeployer()
        errors = job_deployer.delete_job(job_id, force=True)
        if errors:
            logging.warning("Force delete job {}: {}".format(job_id, errors))
        return

    ret = {}
    dataHandler = DataHandler()

    try:
        # TODO refine later
        # Before resubmitting the job, reset all endpoints to status
        # 'pending' so they restart once the job is ready.
        endpoints = dataHandler.GetJobEndpoints(job_id)
        for endpoint_id, endpoint in endpoints.items():
            endpoint["status"] = "pending"
            logging.info("Reset endpoint status to 'pending': {}".format(
                endpoint_id))
            dataHandler.UpdateEndpoint(endpoint)

        job["cluster"] = config
        job_object, errors = JobSchema().load(job)
        # TODO assert job_object is a Job
        assert isinstance(
            job_object,
            Job), "job_object is not of Job, but " + str(type(job_object))

        job_object.params = json.loads(base64.b64decode(job["jobParams"]))

        # Inject gid, uid and user.
        # TODO it should return only one entry
        user_info = dataHandler.GetIdentityInfo(
            job_object.params["userName"])[0]
        job_object.params["gid"] = user_info["gid"]
        job_object.params["uid"] = user_info["uid"]
        job_object.params["user"] = job_object.get_alias()

        if "job_token" not in job_object.params:
            if "user_sign_token" in config and \
                    "userName" in job_object.params:
                job_object.params["job_token"] = hashlib.md5(
                    job_object.params["userName"] + ":" +
                    config["user_sign_token"]).hexdigest()
            else:
                job_object.params["job_token"] = "tryme2017"

        if "envs" not in job_object.params:
            job_object.params["envs"] = []
        job_object.params["envs"].append({
            "name": "DLTS_JOB_TOKEN",
            "value": job_object.params["job_token"]
        })

        enable_custom_scheduler = job_object.is_custom_scheduler_enabled()
        secret_template = job_object.get_blobfuse_secret_template()
        if job_object.params["jobtrainingtype"] == "RegularJob":
            pod_template = PodTemplate(
                job_object.get_template(),
                enable_custom_scheduler=enable_custom_scheduler,
                secret_template=secret_template)
        elif job_object.params["jobtrainingtype"] == "PSDistJob":
            pod_template = DistPodTemplate(job_object.get_template(),
                                           secret_template=secret_template)
        elif job_object.params["jobtrainingtype"] == "InferenceJob":
            pod_template = PodTemplate(
                job_object.get_template(),
                deployment_template=job_object.get_deployment_template(),
                enable_custom_scheduler=False,
                secret_template=secret_template)
        else:
            dataHandler.SetJobError(
                job_object.job_id, "ERROR: invalid jobtrainingtype: %s" %
                job_object.params["jobtrainingtype"])
            dataHandler.Close()
            return False

        pods, error = pod_template.generate_pods(job_object)
        if error:
            dataHandler.SetJobError(job_object.job_id, "ERROR: %s" % error)
            dataHandler.Close()
            return False

        # Serialize all pods into one multi-document YAML job description.
        job_description = "\n---\n".join([yaml.dump(pod) for pod in pods])
        job_description_path = "jobfiles/" + time.strftime(
            "%y%m%d") + "/" + job_object.job_id + "/" + \
            job_object.job_id + ".yaml"
        local_jobDescriptionPath = os.path.realpath(
            os.path.join(config["storage-mount-path"], job_description_path))
        if not os.path.exists(os.path.dirname(local_jobDescriptionPath)):
            os.makedirs(os.path.dirname(local_jobDescriptionPath))
        with open(local_jobDescriptionPath, 'w') as f:
            f.write(job_description)

        secrets = pod_template.generate_secrets(job_object)

        job_deployer = JobDeployer()
        try:
            secrets = job_deployer.create_secrets(secrets)
            ret["output"] = "Created secrets: {}. ".format(
                [secret.metadata.name for secret in secrets])
            pods = job_deployer.create_pods(pods)
            ret["output"] += "Created pods: {}".format(
                [pod.metadata.name for pod in pods])
        except Exception as e:
            ret["output"] = "Error: %s" % str(e)
            logging.error(e, exc_info=True)

        ret["jobId"] = job_object.job_id

        dataHandler.UpdateJobTextField(job_object.job_id, "jobStatus",
                                       "scheduling")
        dataHandler.UpdateJobTextField(job_object.job_id,
                                       "jobDescriptionPath",
                                       job_description_path)
        dataHandler.UpdateJobTextField(job_object.job_id, "jobDescription",
                                       base64.b64encode(job_description))
        dataHandler.UpdateJobTextField(job_object.job_id, "lastUpdated",
                                       datetime.datetime.now().isoformat())

        jobMeta = {}
        jobMeta["jobDescriptionPath"] = job_description_path
        jobMeta["jobPath"] = job_object.job_path
        jobMeta["workPath"] = job_object.work_path
        # The command of the first container.
        jobMeta["LaunchCMD"] = pods[0].spec.containers[0].command

        jobMetaStr = base64.b64encode(json.dumps(jobMeta))
        dataHandler.UpdateJobTextField(job_object.job_id, "jobMeta",
                                       jobMetaStr)
    except Exception as e:
        logging.error("Submit job failed: %s" % job, exc_info=True)
        ret["error"] = str(e)
        retries = dataHandler.AddandGetJobRetries(job["jobId"])
        if retries >= 5:
            dataHandler.UpdateJobTextField(job["jobId"], "jobStatus",
                                           "error")
            dataHandler.UpdateJobTextField(job["jobId"], "errorMsg",
                                           "Cannot submit job!" + str(e))

            detail = get_job_status_detail(job)
            detail = job_status_detail_with_finished_time(
                detail, "error", "Server error in job submission")
            dataHandler.UpdateJobTextField(
                job["jobId"], "jobStatusDetail",
                base64.b64encode(json.dumps(detail)))

            # Try to clean up the job.
            try:
                job_deployer = JobDeployer()
                job_deployer.delete_job(job_id, force=True)
                logging.info(
                    "Cleaning up job %s succeeded after %d retries of "
                    "job submission" % (job["jobId"], retries))
            except Exception:
                logging.warning(
                    "Cleaning up job %s failed after %d retries of "
                    "job submission" % (job["jobId"], retries))

    dataHandler.Close()
    return ret
def get(self):
    # List all jobs.
    schema = JobSchema(many=True)
    jobs_dump = schema.dump(jobs)
    return jsonify(jobs_dump.data)
def job_to_json(job):
    schema = JobSchema(many=False)
    job_dump = schema.dump(job)
    return jsonify(job_dump.data)
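# Both handlers above rely on the marshmallow 2.x dump() result, a
# MarshalResult named tuple with .data (the serialized dict or list) and
# .errors. A short illustration, reusing the Job constructor from the tests
# (the email value is made up):
job = Job(cluster=config, job_id="test-job", email="user@example.com")
result = JobSchema().dump(job)
result.data    # e.g. {"jobId": "test-job", "userName": "user@example.com"}
result.errors  # {} on success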