def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx,
                      meta: client.V1ObjectMeta) -> dict:
    """Build the MPIJob resource dict for a run.

    The launcher and worker pod templates both start as deep copies of
    ``self._mpijob_pod_template`` and receive the same container and pod
    settings from ``self.spec``. Worker-only and launcher-only settings are
    then applied by ``_enrich_worker_configurations`` and
    ``_enrich_launcher_configurations`` before the two templates are combined
    by ``_generate_mpi_job_template``.

    :param runobj:    run object; its JSON form is handed to the containers
                      through the ``MLRUN_EXEC_CONFIG`` environment variable
    :param execution: execution context; a truthy ``slots_per_worker``
                      parameter is written to ``spec.slotsPerWorker``
    :param meta:      object metadata; provides the job name, the base pod
                      labels and the job's ``metadata`` section

    :return: the populated MPIJob dict
    """
    # meta.labels may be unset (None); fall back to an empty dict so the job
    # label can always be added instead of failing on item assignment
    pod_labels = deepcopy(meta.labels) if meta.labels else {}
    pod_labels['mlrun/job'] = meta.name

    # the serialized run is the same for both pods, so compute it once
    exec_config = runobj.to_json()

    # Populate mpijob object

    # start by populating pod templates
    launcher_pod_template = deepcopy(self._mpijob_pod_template)
    worker_pod_template = deepcopy(self._mpijob_pod_template)

    # configuration for both launcher and workers
    for pod_template in (launcher_pod_template, worker_pod_template):
        if self.spec.image:
            self._update_container(pod_template, 'image',
                                   self.full_image_path())
        self._update_container(pod_template, 'volumeMounts',
                               self.spec.volume_mounts)

        # build fresh env entries per template so the two pods never share
        # the same mutable dict objects
        extra_env = [{'name': 'MLRUN_EXEC_CONFIG', 'value': exec_config}]
        self._update_container(pod_template, 'env',
                               extra_env + self.spec.env)

        if self.spec.image_pull_policy:
            self._update_container(pod_template, 'imagePullPolicy',
                                   self.spec.image_pull_policy)
        if self.spec.workdir:
            self._update_container(pod_template, 'workingDir',
                                   self.spec.workdir)
        if self.spec.image_pull_secret:
            update_in(pod_template, 'spec.imagePullSecrets',
                      [{'name': self.spec.image_pull_secret}])
        update_in(pod_template, 'metadata.labels', pod_labels)
        update_in(pod_template, 'spec.volumes', self.spec.volumes)

    # configuration for workers only
    # update resources only for workers because the launcher doesn't require
    # special resources (like GPUs, Memory, etc..)
    self._enrich_worker_configurations(worker_pod_template)

    # configuration for launcher only
    self._enrich_launcher_configurations(launcher_pod_template)

    # generate mpi job using both pod templates
    job = self._generate_mpi_job_template(launcher_pod_template,
                                          worker_pod_template)

    # update the replicas only for workers
    update_in(job, 'spec.mpiReplicaSpecs.Worker.replicas',
              self.spec.replicas or 1)

    # read the parameter once; only a truthy value overrides the template
    slots_per_worker = execution.get_param('slots_per_worker')
    if slots_per_worker:
        update_in(job, 'spec.slotsPerWorker', slots_per_worker)

    update_in(job, 'metadata', meta.to_dict())

    return job
def _generate_mpi_job(
    self,
    runobj: RunObject,
    execution: MLClientCtx,
    meta: client.V1ObjectMeta,
) -> dict:
    """Build the MPIJob resource dict for a run.

    The launcher and worker pod templates both start as deep copies of
    ``self._mpijob_pod_template`` and receive the same container and pod
    settings from ``self.spec``. Worker-only and launcher-only settings are
    then applied by ``_enrich_worker_configurations`` and
    ``_enrich_launcher_configurations`` before the two templates are combined
    by ``_generate_mpi_job_template``.

    :param runobj:    run object; passed to ``_generate_runtime_env`` to
                      produce the extra container environment variables
    :param execution: execution context; a truthy ``slots_per_worker``
                      parameter is written to ``spec.slotsPerWorker``
    :param meta:      object metadata; provides the job name, the base pod
                      labels and the job's ``metadata`` section

    :return: the populated MPIJob dict
    """
    # meta.labels may be unset (None); fall back to an empty dict so the job
    # label can always be added instead of failing on item assignment
    pod_labels = deepcopy(meta.labels) if meta.labels else {}
    pod_labels["mlrun/job"] = meta.name

    # the runtime env depends only on the run object, so generate it once
    # NOTE(review): assumes _generate_runtime_env has no side effects that
    # must happen once per pod template - confirm
    runtime_env = self._generate_runtime_env(runobj)

    # Populate mpijob object

    # start by populating pod templates
    launcher_pod_template = deepcopy(self._mpijob_pod_template)
    worker_pod_template = deepcopy(self._mpijob_pod_template)

    # configuration for both launcher and workers
    for pod_template in (launcher_pod_template, worker_pod_template):
        if self.spec.image:
            self._update_container(
                pod_template, "image", self.full_image_path()
            )
        self._update_container(
            pod_template, "volumeMounts", self.spec.volume_mounts
        )

        # build fresh env entries per template so the two pods never share
        # the same mutable dict objects
        extra_env = [{"name": k, "value": v} for k, v in runtime_env.items()]
        self._update_container(pod_template, "env", extra_env + self.spec.env)

        if self.spec.image_pull_policy:
            self._update_container(
                pod_template,
                "imagePullPolicy",
                self.spec.image_pull_policy,
            )
        if self.spec.workdir:
            self._update_container(
                pod_template, "workingDir", self.spec.workdir
            )
        if self.spec.image_pull_secret:
            update_in(
                pod_template,
                "spec.imagePullSecrets",
                [{"name": self.spec.image_pull_secret}],
            )
        update_in(pod_template, "metadata.labels", pod_labels)
        update_in(pod_template, "spec.volumes", self.spec.volumes)

    # configuration for workers only
    # update resources only for workers because the launcher
    # doesn't require special resources (like GPUs, Memory, etc..)
    self._enrich_worker_configurations(worker_pod_template)

    # configuration for launcher only
    self._enrich_launcher_configurations(launcher_pod_template)

    # generate mpi job using both pod templates
    job = self._generate_mpi_job_template(
        launcher_pod_template, worker_pod_template
    )

    # update the replicas only for workers
    update_in(
        job,
        "spec.mpiReplicaSpecs.Worker.replicas",
        self.spec.replicas or 1,
    )

    update_in(
        job,
        "spec.cleanPodPolicy",
        self.spec.clean_pod_policy,
    )

    # read the parameter once; only a truthy value overrides the template
    slots_per_worker = execution.get_param("slots_per_worker")
    if slots_per_worker:
        update_in(job, "spec.slotsPerWorker", slots_per_worker)

    update_in(job, "metadata", meta.to_dict())

    return job
def _generate_mpi_job(
    self,
    runobj: RunObject,
    execution: MLClientCtx,
    meta: client.V1ObjectMeta,
) -> dict:
    """Build the MPIJob resource dict for a run.

    The launcher and worker pod templates both start as deep copies of
    ``self._mpijob_pod_template`` and receive the same container, scheduling
    and pod settings from ``self.spec``. Worker-only and launcher-only
    settings are then applied by ``_enrich_worker_configurations`` and
    ``_enrich_launcher_configurations`` before the two templates are combined
    by ``_generate_mpi_job_template``.

    :param runobj:    run object; passed to ``_get_cmd_args`` to obtain the
                      command, its arguments and the extra environment
    :param execution: execution context; a truthy ``slots_per_worker``
                      parameter is written to ``spec.slotsPerWorker``
    :param meta:      object metadata; provides the job name, the base pod
                      labels and the job's ``metadata`` section

    :return: the populated MPIJob dict
    """
    # meta.labels may be unset (None); fall back to an empty dict so the job
    # label can always be added instead of failing on item assignment
    pod_labels = deepcopy(meta.labels) if meta.labels else {}
    pod_labels["mlrun/job"] = meta.name

    # Populate mpijob object

    # start by populating pod templates
    launcher_pod_template = deepcopy(self._mpijob_pod_template)
    worker_pod_template = deepcopy(self._mpijob_pod_template)

    command, args, extra_env = self._get_cmd_args(runobj)

    # configuration for both launcher and workers
    for pod_template in (launcher_pod_template, worker_pod_template):
        if self.spec.image:
            self._update_container(
                pod_template, "image", self.full_image_path()
            )
        self._update_container(
            pod_template, "volumeMounts", self.spec.volume_mounts
        )
        self._update_container(pod_template, "env", extra_env + self.spec.env)
        if self.spec.image_pull_policy:
            self._update_container(
                pod_template,
                "imagePullPolicy",
                self.spec.image_pull_policy,
            )
        if self.spec.workdir:
            self._update_container(
                pod_template, "workingDir", self.spec.workdir
            )
        if self.spec.image_pull_secret:
            update_in(
                pod_template,
                "spec.imagePullSecrets",
                [{"name": self.spec.image_pull_secret}],
            )
        update_in(pod_template, "metadata.labels", pod_labels)
        update_in(pod_template, "spec.volumes", self.spec.volumes)
        update_in(pod_template, "spec.nodeName", self.spec.node_name)
        update_in(pod_template, "spec.nodeSelector", self.spec.node_selector)
        update_in(
            pod_template,
            "spec.affinity",
            self.spec._get_sanitized_affinity(),
        )
        # the priority class is applied only when one is requested and the
        # configuration reports at least one valid priority class name
        if (
            self.spec.priority_class_name
            and mlconf.get_valid_function_priority_class_names()
        ):
            update_in(
                pod_template,
                "spec.priorityClassName",
                self.spec.priority_class_name,
            )

    # configuration for workers only
    # update resources only for workers because the launcher
    # doesn't require special resources (like GPUs, Memory, etc..)
    self._enrich_worker_configurations(worker_pod_template)

    # configuration for launcher only
    # the launcher also receives the full command line (command + args)
    self._enrich_launcher_configurations(
        launcher_pod_template, [command] + args
    )

    # generate mpi job using both pod templates
    job = self._generate_mpi_job_template(
        launcher_pod_template, worker_pod_template
    )

    # update the replicas only for workers
    update_in(
        job,
        "spec.mpiReplicaSpecs.Worker.replicas",
        self.spec.replicas or 1,
    )

    update_in(
        job,
        "spec.cleanPodPolicy",
        self.spec.clean_pod_policy,
    )

    # read the parameter once; only a truthy value overrides the template
    slots_per_worker = execution.get_param("slots_per_worker")
    if slots_per_worker:
        update_in(job, "spec.slotsPerWorker", slots_per_worker)

    update_in(job, "metadata", meta.to_dict())

    return job