def store_function(self, session, function, name, project="", tag="", versioned=False):
    """Store (upsert) a function object in the DB and point ``tag`` at it.

    :param session:   DB session used for all queries/writes
    :param function:  function object (dict-like); its ``metadata.tag`` is
                      cleared and ``metadata.updated`` is set as a side effect
    :param name:      function name (part of the record's unique key)
    :param project:   project name; defaults to ``config.default_project``
    :param tag:       tag to apply; falls back to ``metadata.tag`` or "latest"
    :param versioned: store under the content hash (immutable record per
                      version) instead of a single reusable per-tag record
    :returns: the function's content hash key (computed before mutation)
    """
    project = project or config.default_project
    self._create_project_if_not_exists(session, project)
    tag = tag or get_in(function, "metadata.tag") or "latest"
    hash_key = fill_function_hash(function, tag)

    # clear tag from object in case another function will "take" that tag
    update_in(function, "metadata.tag", "")

    # versioned means whether we want to version this function object so that it will queryable by its hash key
    # to enable that we set the uid to the hash key so it will have a unique record (Unique constraint of function
    # is the set (project, name, uid))
    # when it's not enabled it means we want to have one unique function object for the set (project, name, tag)
    # that will be reused on every store function (cause we don't want to version each version e.g. create a new
    # record) so we set the uid to be unversioned-{tag}
    if versioned:
        uid = hash_key
    else:
        uid = f'unversioned-{tag}'

    updated = datetime.now(timezone.utc)
    update_in(function, "metadata.updated", updated)
    fn = self._get_function(session, name, project, uid)
    if not fn:
        # no record yet for this (project, name, uid) - create a new one
        fn = Function(name=name, project=project, uid=uid,)
    fn.updated = updated
    labels = get_in(function, "metadata.labels", {})
    update_labels(fn, labels)
    fn.struct = function
    self._upsert(session, fn)
    # (re-)point the tag at the stored record, after the upsert succeeded
    self.tag_objects_v2(session, [fn], project, tag)
    return hash_key
def build_status(name: str = "", project: str = "", tag: str = "", offset: int = 0, logs: str = "on", db_session: Session = Depends(deps.get_db_session)):
    """Return the build status (and optionally build-pod logs) of a function.

    The pod state is refreshed from k8s and persisted back to the function
    record in the DB. Log bytes are returned in the response body starting at
    ``offset``; status details are returned via response headers.
    """
    # "on"/"off" query-string flag -> bool
    logs = strtobool(logs)
    fn = get_db().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND, name=name, project=project, tag=tag)

    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        # no build pod recorded - nothing to poll, report the stored state
        if state == "ready":
            image = image or get_in(fn, "spec.image")
        return Response(content=out, media_type="text/plain", headers={
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        })

    # refresh the state from the live build pod
    logger.info("get pod {} status".format(pod))
    state = get_k8s().get_pod_status(pod)
    logger.info("pod state={}".format(state))

    if state == "succeeded":
        logger.info("build completed successfully")
        state = "ready"
    if state in ["failed", "error"]:
        logger.error("build {}, watch the build pod logs: {}".format(
            state, pod))

    # pod logs are only available once the pod left the pending phase
    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    # persist the refreshed state (and final image when ready) back to the DB
    update_in(fn, "status.state", state)
    if state == "ready":
        update_in(fn, "spec.image", image)
    get_db().store_function(db_session, fn, name, project, tag)

    return Response(content=out, media_type="text/plain", headers={
        "function_status": state,
        "function_image": image,
        "builder_pod": pod,
    })
def build_status():
    """Flask handler: return a function's build status and build-pod logs.

    Query-string params: name, project, tag, offset (into the logs),
    logs ("on"/"off"). Refreshes the pod state from k8s and stores it back
    on the function record; status details are returned via headers.
    """
    name = request.args.get('name', '')
    project = request.args.get('project', '')
    tag = request.args.get('tag', '')
    offset = int(request.args.get('offset', '0'))
    logs = strtobool(request.args.get('logs', 'on'))

    fn = _db.get_function(name, project, tag)
    if not fn:
        return json_error(HTTPStatus.NOT_FOUND, name=name,
                          project=project, tag=tag)

    state = get_in(fn, 'status.state', '')
    pod = get_in(fn, 'status.build_pod', '')
    image = get_in(fn, 'spec.build.image', '')
    out = b''
    if not pod:
        # no build pod recorded - report the stored state as-is
        if state == 'ready':
            image = image or get_in(fn, 'spec.image')
        return Response(out, mimetype='text/plain', headers={
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        })

    # refresh the state from the live build pod
    logger.info('get pod {} status'.format(pod))
    state = _k8s.get_pod_status(pod)
    logger.info('pod state={}'.format(state))

    if state == 'succeeded':
        logger.info('build completed successfully')
        state = 'ready'
    if state in ['failed', 'error']:
        logger.error('build {}, watch the build pod logs: {}'.format(
            state, pod))

    # logs are only available once the pod left the pending phase
    if logs and state != 'pending':
        resp = _k8s.logs(pod)
        if resp:
            out = resp.encode()[offset:]

    # persist the refreshed state (and final image when ready)
    update_in(fn, 'status.state', state)
    if state == 'ready':
        update_in(fn, 'spec.image', image)
    _db.store_function(fn, name, project, tag)

    return Response(out, mimetype='text/plain', headers={
        "function_status": state,
        "function_image": image,
        "builder_pod": pod,
    })
def _update_result_body(result_path, event_body, result): if result_path and event_body: if not hasattr(event_body, "__getitem__"): raise TypeError( "result_path parameter supports only dict-like event bodies") update_in(event_body, result_path, result) else: event_body = result return event_body
def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx, meta: client.V1ObjectMeta) -> typing.Dict:
    """Build an MPIJob resource dict from the job template and run metadata."""
    spec = self.spec
    mpijob = deepcopy(self._mpijob_template)

    # label the pods so they can be traced back to this mlrun job
    labels = deepcopy(meta.labels)
    labels['mlrun/job'] = meta.name

    update_in(mpijob, 'metadata', meta.to_dict())
    update_in(mpijob, 'spec.template.metadata.labels', labels)
    update_in(mpijob, 'spec.replicas', spec.replicas or 1)
    if spec.image:
        self._update_container(mpijob, 'image', self.full_image_path())
    update_in(mpijob, 'spec.template.spec.volumes', spec.volumes)
    self._update_container(mpijob, 'volumeMounts', spec.volume_mounts)

    # pass the serialized run config (and log level, when verbose) via env
    env_vars = {'MLRUN_EXEC_CONFIG': runobj.to_json()}
    if runobj.spec.verbose:
        env_vars['MLRUN_LOG_LEVEL'] = 'debug'
    env_list = [{'name': key, 'value': value} for key, value in env_vars.items()]
    self._update_container(mpijob, 'env', env_list + spec.env)

    # optional container-level settings, applied only when configured
    for field, value in (
        ('imagePullPolicy', spec.image_pull_policy),
        ('resources', spec.resources),
        ('workingDir', spec.workdir),
    ):
        if value:
            self._update_container(mpijob, field, value)

    if spec.image_pull_secret:
        update_in(
            mpijob,
            'spec.template.spec.imagePullSecrets',
            [{'name': spec.image_pull_secret}],
        )
    if spec.command:
        self._update_container(
            mpijob, 'command',
            ['mpirun', 'python', spec.command] + spec.args)
    return mpijob
def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx, meta: client.V1ObjectMeta) -> typing.Dict:
    """Build an MPIJob resource dict from the job template and run metadata."""
    spec = self.spec
    mpijob = deepcopy(self._mpijob_template)

    # label the pods so they can be traced back to this mlrun job
    labels = deepcopy(meta.labels)
    labels["mlrun/job"] = meta.name

    update_in(mpijob, "metadata", meta.to_dict())
    update_in(mpijob, "spec.template.metadata.labels", labels)
    update_in(mpijob, "spec.replicas", spec.replicas or 1)
    if spec.image:
        self._update_container(mpijob, "image", self.full_image_path())
    update_in(mpijob, "spec.template.spec.volumes", spec.volumes)
    self._update_container(mpijob, "volumeMounts", spec.volume_mounts)

    # runtime-provided env vars go before the user-configured ones
    runtime_env = self._generate_runtime_env(runobj)
    env_list = [{"name": key, "value": value} for key, value in runtime_env.items()]
    self._update_container(mpijob, "env", env_list + spec.env)

    # optional container-level settings, applied only when configured
    for field, value in (
        ("imagePullPolicy", spec.image_pull_policy),
        ("resources", spec.resources),
        ("workingDir", spec.workdir),
    ):
        if value:
            self._update_container(mpijob, field, value)

    if spec.image_pull_secret:
        update_in(
            mpijob,
            "spec.template.spec.imagePullSecrets",
            [{"name": spec.image_pull_secret}],
        )
    if spec.command:
        self._update_container(
            mpijob, "command",
            ["mpirun", "python", spec.command] + spec.args)
    return mpijob
def update_run(self, session, updates: dict, uid, project="", iter=0):
    """Apply field updates to a stored run and refresh its derived columns.

    :param session: DB session
    :param updates: mapping of dotted struct paths to new values
    :param uid:     run uid
    :param project: project name; defaults to ``config.default_project``
    :param iter:    hyper-param iteration number of the run record
    :raises DBError: if the run does not exist
    """
    project = project or config.default_project
    run = self._get_run(session, uid, project, iter)
    if not run:
        raise DBError(f"run {uid}:{project} not found")

    # apply the updates to the run body (dotted-path writes)
    struct = run.struct
    for key, val in updates.items():
        update_in(struct, key, val)
    run.struct = struct

    # refresh queryable columns derived from the body
    new_state = run_state(struct)
    if new_state:
        run.state = new_state
    start_time = run_start_time(struct)
    if start_time:
        run.start_time = start_time

    # rebuild the label rows from scratch to match the updated body
    run.labels.clear()
    for name, value in run_labels(struct).items():
        lbl = Run.Label(name=name, value=value, parent=run.id)
        run.labels.append(lbl)
    session.merge(run)
    session.commit()
    # drop label rows left orphaned by the clear() above
    self._delete_empty_labels(session, Run.Label)
def get_log(project, uid):
    """Flask handler: return a run's log bytes, from file or live k8s pod.

    Query-string params: size (bytes to read, -1 = all), offset (start
    position). Falls back to the run's logger pod when no log file exists,
    and updates the run's status in the DB based on the pod phase.
    The pod/run status is reported via the ``pod_status`` header.
    """
    size = int(request.args.get('size', '-1'))
    offset = int(request.args.get('offset', '0'))

    out = b''
    log_file = log_path(project, uid)
    if log_file.exists():
        # persisted log file - read the requested slice directly
        with log_file.open('rb') as fp:
            fp.seek(offset)
            out = fp.read(size)
        status = ''
    else:
        data = _db.read_run(uid, project)
        if not data:
            return json_error(HTTPStatus.NOT_FOUND,
                              project=project, uid=uid)
        status = get_in(data, 'status.state', '')
        if _k8s:
            pods = _k8s.get_logger_pods(uid)
            if pods:
                # take the first logger pod found for this run
                pod, new_status = list(pods.items())[0]
                new_status = new_status.lower()

                # TODO: handle in cron/tracking
                if new_status != 'pending':
                    resp = _k8s.logs(pod)
                    if resp:
                        out = resp.encode()[offset:]
                    if status == 'running':
                        # reconcile the stored run state with the pod phase
                        now = now_date().isoformat()
                        update_in(data, 'status.last_update', now)
                        if new_status == 'failed':
                            update_in(data, 'status.state', 'error')
                            update_in(data, 'status.error',
                                      'error, check logs')
                            _db.store_run(data, uid, project)
                        if new_status == 'succeeded':
                            update_in(data, 'status.state', 'completed')
                            _db.store_run(data, uid, project)
                status = new_status
            elif status == 'running':
                # run claims to be running but its pod is gone - mark failed
                update_in(data, 'status.state', 'error')
                update_in(data, 'status.error',
                          'pod not found, maybe terminated')
                _db.store_run(data, uid, project)
                status = 'failed'

    return Response(out, mimetype='text/plain',
                    headers={"pod_status": status})
def build_status(
    name: str = "",
    project: str = "",
    tag: str = "",
    offset: int = 0,
    logs: bool = True,
    last_log_timestamp: float = 0.0,
    verbose: bool = False,
    db_session: Session = Depends(deps.get_db_session),
):
    """Return the deploy/build status of a function.

    Nuclio runtimes are polled via the nuclio deploy status API; other
    runtimes are tracked through their k8s build pod. In both cases the
    refreshed state is stored back on the function record (versioned only
    once the deploy is ready). Logs go in the body, status in headers.
    """
    fn = get_db().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND.value, name=name, project=project, tag=tag)

    # nuclio deploy status
    if fn.get("kind") in RuntimeKinds.nuclio_runtimes():
        (
            state,
            address,
            nuclio_name,
            last_log_timestamp,
            text,
        ) = get_nuclio_deploy_status(name, project, tag,
                                     last_log_timestamp=last_log_timestamp,
                                     verbose=verbose)
        if state == "ready":
            logger.info("Nuclio function deployed successfully", name=name)
        if state == "error":
            logger.error(f"Nuclio deploy error, {text}", name=name)
        update_in(fn, "status.nuclio_name", nuclio_name)
        update_in(fn, "status.state", state)
        update_in(fn, "status.address", address)

        versioned = False
        if state == "ready":
            # Versioned means the version will be saved in the DB forever, we don't want to spam
            # the DB with intermediate or unusable versions, only successfully deployed versions
            versioned = True
        get_db().store_function(db_session, fn, name, project, tag, versioned=versioned)
        return Response(
            content=text,
            media_type="text/plain",
            headers={
                "x-mlrun-function-status": state,
                "x-mlrun-last-timestamp": str(last_log_timestamp),
                "x-mlrun-address": address,
                "x-mlrun-name": nuclio_name,
            },
        )

    # job deploy status
    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        # no build pod recorded - report the stored state as-is
        if state == "ready":
            image = image or get_in(fn, "spec.image")
        return Response(
            content=out,
            media_type="text/plain",
            headers={
                "function_status": state,
                "function_image": image,
                "builder_pod": pod,
            },
        )

    # refresh the state from the live build pod
    logger.info("get pod {} status".format(pod))
    state = get_k8s().get_pod_status(pod)
    logger.info("pod state={}".format(state))

    if state == "succeeded":
        logger.info("build completed successfully")
        state = "ready"
    if state in ["failed", "error"]:
        logger.error("build {}, watch the build pod logs: {}".format(
            state, pod))

    # pod logs are only available once the pod left the pending phase
    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    # persist the refreshed state; version only successfully built functions
    update_in(fn, "status.state", state)
    if state == "ready":
        update_in(fn, "spec.image", image)
    versioned = False
    if state == "ready":
        versioned = True
    get_db().store_function(db_session, fn, name, project, tag, versioned=versioned)

    return Response(
        content=out,
        media_type="text/plain",
        headers={
            "x-mlrun-function-status": state,
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        },
    )
def get_log(db_session: Session, project: str, uid: str, size: int = -1, offset: int = 0, source: LogSources = LogSources.AUTO):
    """Fetch a run's log bytes from persisted files and/or the live k8s pod.

    :param size:   number of bytes to read from a persisted file (-1 = all)
    :param offset: byte offset into the log
    :param source: where to look - AUTO tries the persisted file first, then
                   falls back to k8s; PERSISTENCY / K8S restrict the lookup
    :returns: (log bytes, run/pod status string or None)
    """
    out = b""
    log_file = log_path(project, uid)
    status = None
    if log_file.exists() and source in [
        LogSources.AUTO, LogSources.PERSISTENCY
    ]:
        # persisted log file - read the requested slice directly
        with log_file.open("rb") as fp:
            fp.seek(offset)
            out = fp.read(size)
        status = ""
    elif source in [LogSources.AUTO, LogSources.K8S]:
        data = get_db().read_run(db_session, uid, project)
        if not data:
            log_and_raise(HTTPStatus.NOT_FOUND, project=project, uid=uid)
        status = get_in(data, "status.state", "")
        if get_k8s():
            pods = get_k8s().get_logger_pods(uid)
            if pods:
                # take the first logger pod found for this run
                pod, new_status = list(pods.items())[0]
                new_status = new_status.lower()

                # TODO: handle in cron/tracking
                if new_status != "pending":
                    resp = get_k8s().logs(pod)
                    if resp:
                        out = resp.encode()[offset:]
                    if status == "running":
                        # reconcile the stored run state with the pod phase
                        now = now_date().isoformat()
                        update_in(data, "status.last_update", now)
                        if new_status == "failed":
                            update_in(data, "status.state", "error")
                            update_in(data, "status.error",
                                      "error, check logs")
                            get_db().store_run(db_session, data, uid, project)
                        if new_status == "succeeded":
                            update_in(data, "status.state", "completed")
                            get_db().store_run(db_session, data, uid, project)
                status = new_status
            elif status == "running":
                # run claims to be running but its pod is gone - mark failed
                update_in(data, "status.state", "error")
                update_in(data, "status.error",
                          "pod not found, maybe terminated")
                get_db().store_run(db_session, data, uid, project)
                status = "failed"
    return out, status
def build_status(
    name: str = "",
    project: str = "",
    tag: str = "",
    offset: int = 0,
    logs: bool = True,
    last_log_timestamp: float = 0.0,
    verbose: bool = False,
    auth_info: mlrun.api.schemas.AuthInfo = Depends(deps.authenticate_request),
    db_session: Session = Depends(deps.get_db_session),
):
    """Return the deploy/build status of a function (authorized variant).

    Requires ``store`` permission on the function, since querying the status
    also refreshes and stores the function record. Nuclio runtimes are polled
    via the nuclio deploy status API; other runtimes via their k8s build pod.
    Logs/deploy text go in the response body, status details in headers.
    """
    mlrun.api.utils.auth.verifier.AuthVerifier().query_project_resource_permissions(
        mlrun.api.schemas.AuthorizationResourceTypes.function,
        project or mlrun.mlconf.default_project,
        name,
        # store since with the current mechanism we update the status (and store the function) in the DB when a client
        # query for the status
        mlrun.api.schemas.AuthorizationAction.store,
        auth_info,
    )
    fn = mlrun.api.crud.Functions().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND.value, name=name, project=project, tag=tag)

    # nuclio deploy status
    if fn.get("kind") in RuntimeKinds.nuclio_runtimes():
        (
            state,
            address,
            nuclio_name,
            last_log_timestamp,
            text,
            status,
        ) = get_nuclio_deploy_status(
            name,
            project,
            tag,
            last_log_timestamp=last_log_timestamp,
            verbose=verbose,
            auth_info=auth_info,
        )
        if state == "ready":
            logger.info("Nuclio function deployed successfully", name=name)
        if state in ["error", "unhealthy"]:
            logger.error(f"Nuclio deploy error, {text}", name=name)

        internal_invocation_urls = status.get("internalInvocationUrls", [])
        external_invocation_urls = status.get("externalInvocationUrls", [])

        # on earlier versions of mlrun, address used to represent the nodePort external invocation url
        # now that functions can be not exposed (using service_type clusterIP) this no longer relevant
        # and hence, for BC it would be filled with the external invocation url first item
        # or completely empty.
        address = external_invocation_urls[
            0] if external_invocation_urls else ""

        update_in(fn, "status.nuclio_name", nuclio_name)
        update_in(fn, "status.internal_invocation_urls",
                  internal_invocation_urls)
        update_in(fn, "status.external_invocation_urls",
                  external_invocation_urls)
        update_in(fn, "status.state", state)
        update_in(fn, "status.address", address)

        versioned = False
        if state == "ready":
            # Versioned means the version will be saved in the DB forever, we don't want to spam
            # the DB with intermediate or unusable versions, only successfully deployed versions
            versioned = True
        mlrun.api.crud.Functions().store_function(
            db_session,
            fn,
            name,
            project,
            tag,
            versioned=versioned,
        )
        return Response(
            content=text,
            media_type="text/plain",
            headers={
                "x-mlrun-function-status": state,
                "x-mlrun-last-timestamp": str(last_log_timestamp),
                "x-mlrun-address": address,
                "x-mlrun-internal-invocation-urls":
                    ",".join(internal_invocation_urls),
                "x-mlrun-external-invocation-urls":
                    ",".join(external_invocation_urls),
                "x-mlrun-name": nuclio_name,
            },
        )

    # job deploy status
    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        # no build pod recorded - report the stored state as-is
        if state == "ready":
            image = image or get_in(fn, "spec.image")
        return Response(
            content=out,
            media_type="text/plain",
            headers={
                "function_status": state,
                "function_image": image,
                "builder_pod": pod,
            },
        )

    # refresh the state from the live build pod
    logger.info(f"get pod {pod} status")
    state = get_k8s().get_pod_status(pod)
    logger.info(f"pod state={state}")

    if state == "succeeded":
        logger.info("build completed successfully")
        state = mlrun.api.schemas.FunctionState.ready
    if state in ["failed", "error"]:
        logger.error(f"build {state}, watch the build pod logs: {pod}")
        state = mlrun.api.schemas.FunctionState.error

    # pod logs are only available once the pod left the pending phase
    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    # persist the refreshed state; version only successfully built functions
    update_in(fn, "status.state", state)
    if state == mlrun.api.schemas.FunctionState.ready:
        update_in(fn, "spec.image", image)
    versioned = False
    if state == mlrun.api.schemas.FunctionState.ready:
        versioned = True
    mlrun.api.crud.Functions().store_function(
        db_session,
        fn,
        name,
        project,
        tag,
        versioned=versioned,
    )

    return Response(
        content=out,
        media_type="text/plain",
        headers={
            "x-mlrun-function-status": state,
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        },
    )
def build_status(
    name: str = "",
    project: str = "",
    tag: str = "",
    offset: int = 0,
    logs: bool = True,
    last_log_timestamp: float = 0.0,
    verbose: bool = False,
    auth_verifier: deps.AuthVerifier = Depends(deps.AuthVerifier),
    db_session: Session = Depends(deps.get_db_session),
):
    """Return the deploy/build status of a function.

    Nuclio runtimes are polled via the nuclio deploy status API; other
    runtimes via their k8s build pod. The refreshed state is stored back on
    the function record (versioned only once ready), passing the caller's
    leader session through to the store. Logs go in the body, status in
    headers.
    """
    fn = get_db().get_function(db_session, name, project, tag)
    if not fn:
        log_and_raise(HTTPStatus.NOT_FOUND.value, name=name, project=project, tag=tag)

    # nuclio deploy status
    if fn.get("kind") in RuntimeKinds.nuclio_runtimes():
        (
            state,
            address,
            nuclio_name,
            last_log_timestamp,
            text,
            status,
        ) = get_nuclio_deploy_status(name, project, tag,
                                     last_log_timestamp=last_log_timestamp,
                                     verbose=verbose)
        if state == "ready":
            logger.info("Nuclio function deployed successfully", name=name)
        if state in ["error", "unhealthy"]:
            logger.error(f"Nuclio deploy error, {text}", name=name)

        # internal / external invocation urls were added on nuclio 1.6.x
        # and hence, it might be empty
        # to backward compatible with older nuclio versions, we use hard-coded default values
        internal_invocation_urls = status.get(
            "internalInvocationUrls",
            [resolve_function_internal_invocation_url(name)])
        external_invocation_urls = status.get("externalInvocationUrls",
                                              [address] if address else [])

        # on nuclio > 1.6.x we get the external invocation url on the status block
        if external_invocation_urls and not address:
            address = external_invocation_urls[0]

        update_in(fn, "status.nuclio_name", nuclio_name)
        update_in(fn, "status.internal_invocation_urls",
                  internal_invocation_urls)
        update_in(fn, "status.external_invocation_urls",
                  external_invocation_urls)
        update_in(fn, "status.state", state)
        update_in(fn, "status.address", address)

        versioned = False
        if state == "ready":
            # Versioned means the version will be saved in the DB forever, we don't want to spam
            # the DB with intermediate or unusable versions, only successfully deployed versions
            versioned = True
        get_db().store_function(
            db_session,
            fn,
            name,
            project,
            tag,
            versioned=versioned,
            leader_session=auth_verifier.auth_info.session,
        )
        return Response(
            content=text,
            media_type="text/plain",
            headers={
                "x-mlrun-function-status": state,
                "x-mlrun-last-timestamp": str(last_log_timestamp),
                "x-mlrun-address": address,
                "x-mlrun-internal-invocation-urls":
                    ",".join(internal_invocation_urls),
                "x-mlrun-external-invocation-urls":
                    ",".join(external_invocation_urls),
                "x-mlrun-name": nuclio_name,
            },
        )

    # job deploy status
    state = get_in(fn, "status.state", "")
    pod = get_in(fn, "status.build_pod", "")
    image = get_in(fn, "spec.build.image", "")
    out = b""
    if not pod:
        # no build pod recorded - report the stored state as-is
        if state == "ready":
            image = image or get_in(fn, "spec.image")
        return Response(
            content=out,
            media_type="text/plain",
            headers={
                "function_status": state,
                "function_image": image,
                "builder_pod": pod,
            },
        )

    # refresh the state from the live build pod
    logger.info(f"get pod {pod} status")
    state = get_k8s().get_pod_status(pod)
    logger.info(f"pod state={state}")

    if state == "succeeded":
        logger.info("build completed successfully")
        state = mlrun.api.schemas.FunctionState.ready
    if state in ["failed", "error"]:
        logger.error(f"build {state}, watch the build pod logs: {pod}")
        state = mlrun.api.schemas.FunctionState.error

    # pod logs are only available once the pod left the pending phase
    if logs and state != "pending":
        resp = get_k8s().logs(pod)
        if resp:
            out = resp.encode()[offset:]

    # persist the refreshed state; version only successfully built functions
    update_in(fn, "status.state", state)
    if state == mlrun.api.schemas.FunctionState.ready:
        update_in(fn, "spec.image", image)
    versioned = False
    if state == mlrun.api.schemas.FunctionState.ready:
        versioned = True
    get_db().store_function(
        db_session,
        fn,
        name,
        project,
        tag,
        versioned=versioned,
        leader_session=auth_verifier.auth_info.session,
    )

    return Response(
        content=out,
        media_type="text/plain",
        headers={
            "x-mlrun-function-status": state,
            "function_status": state,
            "function_image": image,
            "builder_pod": pod,
        },
    )
def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx, meta: client.V1ObjectMeta) -> dict:
    """Build an MPIJob resource dict with separate launcher and worker pod
    templates, both derived from the shared ``_mpijob_pod_template``."""
    pod_labels = deepcopy(meta.labels)
    pod_labels['mlrun/job'] = meta.name

    # Populate mpijob object

    # start by populating pod templates
    launcher_pod_template = deepcopy(self._mpijob_pod_template)
    worker_pod_template = deepcopy(self._mpijob_pod_template)

    # configuration for both launcher and workers
    for pod_template in [launcher_pod_template, worker_pod_template]:
        if self.spec.image:
            self._update_container(pod_template, 'image',
                                   self.full_image_path())
        self._update_container(pod_template, 'volumeMounts',
                               self.spec.volume_mounts)
        # pass the serialized run config to the container via env
        extra_env = {'MLRUN_EXEC_CONFIG': runobj.to_json()}
        # if self.spec.rundb:
        #     extra_env['MLRUN_DBPATH'] = self.spec.rundb
        extra_env = [{'name': k, 'value': v} for k, v in extra_env.items()]
        self._update_container(pod_template, 'env',
                               extra_env + self.spec.env)
        if self.spec.image_pull_policy:
            self._update_container(
                pod_template, 'imagePullPolicy', self.spec.image_pull_policy)
        if self.spec.workdir:
            self._update_container(pod_template, 'workingDir',
                                   self.spec.workdir)
        if self.spec.image_pull_secret:
            update_in(pod_template, 'spec.imagePullSecrets',
                      [{'name': self.spec.image_pull_secret}])
        update_in(pod_template, 'metadata.labels', pod_labels)
        update_in(pod_template, 'spec.volumes', self.spec.volumes)

    # configuration for workers only
    # update resources only for workers because the launcher doesn't require
    # special resources (like GPUs, Memory, etc..)
    self._enrich_worker_configurations(worker_pod_template)

    # configuration for launcher only
    self._enrich_launcher_configurations(launcher_pod_template)

    # generate mpi job using both pod templates
    job = self._generate_mpi_job_template(launcher_pod_template,
                                          worker_pod_template)

    # update the replicas only for workers
    update_in(job, 'spec.mpiReplicaSpecs.Worker.replicas',
              self.spec.replicas or 1)

    if execution.get_param('slots_per_worker'):
        update_in(job, 'spec.slotsPerWorker',
                  execution.get_param('slots_per_worker'))

    update_in(job, 'metadata', meta.to_dict())
    return job
def tag_test(spec, name):
    """Return a copy of *spec* with its name and ``labels.test`` label set to
    *name*; the input spec is left unmodified.
    """
    spec = deepcopy(spec)
    update_in(spec, 'metadata.name', name)
    # fix: path was misspelled 'metadata.lables.test', which created the
    # label under a bogus 'lables' key instead of 'metadata.labels'
    update_in(spec, 'metadata.labels.test', name)
    return spec
def _generate_mpi_job(
    self,
    runobj: RunObject,
    execution: MLClientCtx,
    meta: client.V1ObjectMeta,
) -> dict:
    """Build an MPIJob resource dict with separate launcher and worker pod
    templates, both derived from the shared ``_mpijob_pod_template``."""
    pod_labels = deepcopy(meta.labels)
    pod_labels["mlrun/job"] = meta.name

    # Populate mpijob object

    # start by populating pod templates
    launcher_pod_template = deepcopy(self._mpijob_pod_template)
    worker_pod_template = deepcopy(self._mpijob_pod_template)

    # configuration for both launcher and workers
    for pod_template in [launcher_pod_template, worker_pod_template]:
        if self.spec.image:
            self._update_container(pod_template, "image",
                                   self.full_image_path())
        self._update_container(pod_template, "volumeMounts",
                               self.spec.volume_mounts)
        # runtime-provided env vars go before the user-configured ones
        extra_env = self._generate_runtime_env(runobj)
        extra_env = [{"name": k, "value": v} for k, v in extra_env.items()]
        self._update_container(pod_template, "env",
                               extra_env + self.spec.env)
        if self.spec.image_pull_policy:
            self._update_container(
                pod_template,
                "imagePullPolicy",
                self.spec.image_pull_policy,
            )
        if self.spec.workdir:
            self._update_container(pod_template, "workingDir",
                                   self.spec.workdir)
        if self.spec.image_pull_secret:
            update_in(
                pod_template,
                "spec.imagePullSecrets",
                [{
                    "name": self.spec.image_pull_secret
                }],
            )
        update_in(pod_template, "metadata.labels", pod_labels)
        update_in(pod_template, "spec.volumes", self.spec.volumes)

    # configuration for workers only
    # update resources only for workers because the launcher
    # doesn't require special resources (like GPUs, Memory, etc..)
    self._enrich_worker_configurations(worker_pod_template)

    # configuration for launcher only
    self._enrich_launcher_configurations(launcher_pod_template)

    # generate mpi job using both pod templates
    job = self._generate_mpi_job_template(launcher_pod_template,
                                          worker_pod_template)

    # update the replicas only for workers
    update_in(
        job,
        "spec.mpiReplicaSpecs.Worker.replicas",
        self.spec.replicas or 1,
    )
    update_in(
        job,
        "spec.cleanPodPolicy",
        self.spec.clean_pod_policy,
    )
    if execution.get_param("slots_per_worker"):
        update_in(
            job,
            "spec.slotsPerWorker",
            execution.get_param("slots_per_worker"),
        )
    update_in(job, "metadata", meta.to_dict())
    return job
def _generate_mpi_job(
    self,
    runobj: RunObject,
    execution: MLClientCtx,
    meta: client.V1ObjectMeta,
) -> dict:
    """Build an MPIJob resource dict with separate launcher and worker pod
    templates, both derived from the shared ``_mpijob_pod_template``."""
    pod_labels = deepcopy(meta.labels)
    pod_labels["mlrun/job"] = meta.name

    # Populate mpijob object

    # start by populating pod templates
    launcher_pod_template = deepcopy(self._mpijob_pod_template)
    worker_pod_template = deepcopy(self._mpijob_pod_template)
    # command/args are applied to the launcher only (below); env to both
    command, args, extra_env = self._get_cmd_args(runobj)

    # configuration for both launcher and workers
    for pod_template in [launcher_pod_template, worker_pod_template]:
        if self.spec.image:
            self._update_container(pod_template, "image",
                                   self.full_image_path())
        self._update_container(pod_template, "volumeMounts",
                               self.spec.volume_mounts)
        self._update_container(pod_template, "env",
                               extra_env + self.spec.env)
        if self.spec.image_pull_policy:
            self._update_container(
                pod_template,
                "imagePullPolicy",
                self.spec.image_pull_policy,
            )
        if self.spec.workdir:
            self._update_container(pod_template, "workingDir",
                                   self.spec.workdir)
        if self.spec.image_pull_secret:
            update_in(
                pod_template,
                "spec.imagePullSecrets",
                [{
                    "name": self.spec.image_pull_secret
                }],
            )
        update_in(pod_template, "metadata.labels", pod_labels)
        update_in(pod_template, "spec.volumes", self.spec.volumes)
        # pod scheduling constraints
        update_in(pod_template, "spec.nodeName", self.spec.node_name)
        update_in(pod_template, "spec.nodeSelector",
                  self.spec.node_selector)
        update_in(pod_template, "spec.affinity",
                  self.spec._get_sanitized_affinity())
        # only set a priority class when it is one of the configured valid names
        if self.spec.priority_class_name and len(
                mlconf.get_valid_function_priority_class_names()):
            update_in(
                pod_template,
                "spec.priorityClassName",
                self.spec.priority_class_name,
            )

    # configuration for workers only
    # update resources only for workers because the launcher
    # doesn't require special resources (like GPUs, Memory, etc..)
    self._enrich_worker_configurations(worker_pod_template)

    # configuration for launcher only
    self._enrich_launcher_configurations(launcher_pod_template,
                                         [command] + args)

    # generate mpi job using both pod templates
    job = self._generate_mpi_job_template(launcher_pod_template,
                                          worker_pod_template)

    # update the replicas only for workers
    update_in(
        job,
        "spec.mpiReplicaSpecs.Worker.replicas",
        self.spec.replicas or 1,
    )
    update_in(
        job,
        "spec.cleanPodPolicy",
        self.spec.clean_pod_policy,
    )
    if execution.get_param("slots_per_worker"):
        update_in(
            job,
            "spec.slotsPerWorker",
            execution.get_param("slots_per_worker"),
        )
    update_in(job, "metadata", meta.to_dict())
    return job