Example #1
0
    def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx,
                          meta: client.V1ObjectMeta) -> typing.Dict:
        """Assemble the MPIJob resource dict for one run.

        Works on a deep copy of ``self._mpijob_template`` and fills in the
        job metadata, pod labels, replica count, pod-level placement fields
        and container settings taken from ``self.spec``.

        :param runobj:    run object, forwarded to ``_generate_runtime_env``
                          to obtain the runtime environment variables
        :param execution: execution context (not read by this method)
        :param meta:      object metadata used as the job metadata; its
                          labels, plus an ``mlrun/job`` label holding
                          ``meta.name``, become the pod labels
        :return: the populated job dict
        """
        spec = self.spec
        mpijob = deepcopy(self._mpijob_template)

        # pod labels = metadata labels + a label carrying the object name
        labels = deepcopy(meta.labels)
        labels["mlrun/job"] = meta.name

        update_in(mpijob, "metadata", meta.to_dict())
        update_in(mpijob, "spec.template.metadata.labels", labels)
        # a falsy replica count falls back to a single replica
        update_in(mpijob, "spec.replicas", spec.replicas or 1)
        if spec.image:
            self._update_container(mpijob, "image", self.full_image_path())
        update_in(mpijob, "spec.template.spec.volumes", spec.volumes)
        self._update_container(mpijob, "volumeMounts", spec.volume_mounts)

        # placement fields are written unconditionally, whatever their value
        for path, value in (
            ("spec.template.spec.nodeName", spec.node_name),
            ("spec.template.spec.nodeSelector", spec.node_selector),
            ("spec.template.spec.affinity", spec._get_sanitized_affinity()),
        ):
            update_in(mpijob, path, value)

        # the priority class is only written when one is requested and the
        # configuration reports a non-empty set of valid class names
        priority_class = spec.priority_class_name
        if priority_class and len(
                mlconf.get_valid_function_priority_class_names()):
            update_in(mpijob, "spec.template.spec.priorityClassName",
                      priority_class)

        # runtime-generated variables come first, user-defined ones after
        runtime_env = self._generate_runtime_env(runobj)
        env_entries = [{"name": key, "value": val}
                       for key, val in runtime_env.items()]
        self._update_container(mpijob, "env", env_entries + spec.env)

        # optional container fields: written only when set on the spec
        for field, value in (
            ("imagePullPolicy", spec.image_pull_policy),
            ("resources", spec.resources),
            ("workingDir", spec.workdir),
        ):
            if value:
                self._update_container(mpijob, field, value)

        pull_secret = spec.image_pull_secret
        if pull_secret:
            update_in(mpijob, "spec.template.spec.imagePullSecrets",
                      [{"name": pull_secret}])

        if spec.command:
            launch_cmd = ["mpirun", "python", spec.command] + spec.args
            self._update_container(mpijob, "command", launch_cmd)

        return mpijob
Example #2
0
    def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx, meta: client.V1ObjectMeta) -> dict:
        """Build the MPIJob resource dict from launcher and worker pod templates.

        Both templates start as deep copies of ``self._mpijob_pod_template`` and
        receive the same image, volumes, environment, pull settings and labels.
        Worker-only and launcher-only settings are then applied by
        ``_enrich_worker_configurations`` and ``_enrich_launcher_configurations``.

        :param runobj:    run object; its JSON form is exposed to the containers
                          through the ``MLRUN_EXEC_CONFIG`` environment variable
        :param execution: execution context; its optional ``slots_per_worker``
                          parameter sets ``spec.slotsPerWorker``
        :param meta:      object metadata used as the job metadata; its labels,
                          plus an ``mlrun/job`` label holding ``meta.name``,
                          become the pod labels
        :return: the populated job dict
        """
        pod_labels = deepcopy(meta.labels)
        pod_labels['mlrun/job'] = meta.name

        # Populate mpijob object

        # start by populating pod templates
        launcher_pod_template = deepcopy(self._mpijob_pod_template)
        worker_pod_template = deepcopy(self._mpijob_pod_template)

        # values identical for both templates are computed once, not per template
        image = self.full_image_path() if self.spec.image else None
        exec_config = runobj.to_json()

        # configuration for both launcher and workers
        for pod_template in (launcher_pod_template, worker_pod_template):
            if self.spec.image:
                self._update_container(pod_template, 'image', image)
            self._update_container(pod_template, 'volumeMounts', self.spec.volume_mounts)
            # the entry dict is rebuilt per template so the templates do not share it
            extra_env = [{'name': 'MLRUN_EXEC_CONFIG', 'value': exec_config}]
            self._update_container(pod_template, 'env', extra_env + self.spec.env)
            if self.spec.image_pull_policy:
                self._update_container(
                    pod_template, 'imagePullPolicy', self.spec.image_pull_policy)
            if self.spec.workdir:
                self._update_container(pod_template, 'workingDir', self.spec.workdir)
            if self.spec.image_pull_secret:
                update_in(pod_template, 'spec.imagePullSecrets',
                          [{'name': self.spec.image_pull_secret}])
            update_in(pod_template, 'metadata.labels', pod_labels)
            update_in(pod_template, 'spec.volumes', self.spec.volumes)

        # configuration for workers only
        # update resources only for workers because the launcher doesn't require
        # special resources (like GPUs, Memory, etc..)
        self._enrich_worker_configurations(worker_pod_template)

        # configuration for launcher only
        self._enrich_launcher_configurations(launcher_pod_template)

        # generate mpi job using both pod templates
        job = self._generate_mpi_job_template(launcher_pod_template, worker_pod_template)

        # update the replicas only for workers
        update_in(job, 'spec.mpiReplicaSpecs.Worker.replicas', self.spec.replicas or 1)

        # read the parameter once; a falsy value leaves the template's setting as is
        slots_per_worker = execution.get_param('slots_per_worker')
        if slots_per_worker:
            update_in(job, 'spec.slotsPerWorker', slots_per_worker)

        update_in(job, 'metadata', meta.to_dict())

        return job
Example #3
0
    def _generate_mpi_job(self, runobj: RunObject, execution: MLClientCtx,
                          meta: client.V1ObjectMeta) -> typing.Dict:
        """Create the MPIJob resource dict for one run.

        A deep copy of ``self._mpijob_template`` is populated with the job
        metadata, pod labels, replica count, volumes and container settings
        read from ``self.spec``.

        :param runobj:    run object; its JSON form is passed to the
                          container as ``MLRUN_EXEC_CONFIG`` and its
                          ``spec.verbose`` flag adds ``MLRUN_LOG_LEVEL``
        :param execution: execution context (not read by this method)
        :param meta:      object metadata used as the job metadata; its
                          labels, plus an ``mlrun/job`` label holding
                          ``meta.name``, become the pod labels
        :return: the populated job dict
        """
        spec = self.spec
        mpijob = deepcopy(self._mpijob_template)

        labels = deepcopy(meta.labels)
        labels['mlrun/job'] = meta.name

        # job-level and pod-level fields
        update_in(mpijob, 'metadata', meta.to_dict())
        update_in(mpijob, 'spec.template.metadata.labels', labels)
        # a falsy replica count falls back to a single replica
        update_in(mpijob, 'spec.replicas', spec.replicas or 1)
        if spec.image:
            self._update_container(mpijob, 'image', self.full_image_path())
        update_in(mpijob, 'spec.template.spec.volumes', spec.volumes)
        self._update_container(mpijob, 'volumeMounts', spec.volume_mounts)

        # runtime variables first, then whatever the user put on the spec
        env_entries = [
            {'name': 'MLRUN_EXEC_CONFIG', 'value': runobj.to_json()},
        ]
        if runobj.spec.verbose:
            env_entries.append({'name': 'MLRUN_LOG_LEVEL', 'value': 'debug'})
        self._update_container(mpijob, 'env', env_entries + spec.env)

        # optional container fields: written only when set on the spec
        pull_policy = spec.image_pull_policy
        if pull_policy:
            self._update_container(mpijob, 'imagePullPolicy', pull_policy)
        resources = spec.resources
        if resources:
            self._update_container(mpijob, 'resources', resources)
        workdir = spec.workdir
        if workdir:
            self._update_container(mpijob, 'workingDir', workdir)

        pull_secret = spec.image_pull_secret
        if pull_secret:
            secret_refs = [{'name': pull_secret}]
            update_in(mpijob, 'spec.template.spec.imagePullSecrets',
                      secret_refs)

        command = spec.command
        if command:
            self._update_container(mpijob, 'command',
                                   ['mpirun', 'python', command] + spec.args)

        return mpijob
Example #4
0
    def _generate_mpi_job(
        self, runobj: RunObject, execution: MLClientCtx, meta: client.V1ObjectMeta
    ) -> typing.Dict:
        """Populate a copy of the MPIJob template for one run and return it.

        The template (``self._mpijob_template``) is deep-copied, then filled with
        the job metadata, pod labels, replica count, volumes and the container
        settings found on ``self.spec``.

        :param runobj:    run object; its JSON form is passed to the container as
                          ``MLRUN_EXEC_CONFIG`` and its ``spec.verbose`` flag adds
                          ``MLRUN_LOG_LEVEL``
        :param execution: execution context (not read by this method)
        :param meta:      object metadata used as the job metadata; its labels,
                          plus an ``mlrun/job`` label holding ``meta.name``, become
                          the pod labels
        :return: the populated job dict
        """
        resource = deepcopy(self._mpijob_template)

        def set_on_container(field, value):
            # thin local wrapper so each container field reads as one short line
            self._update_container(resource, field, value)

        template_labels = deepcopy(meta.labels)
        template_labels["mlrun/job"] = meta.name

        update_in(resource, "metadata", meta.to_dict())
        update_in(resource, "spec.template.metadata.labels", template_labels)
        # a falsy replica count falls back to a single replica
        update_in(resource, "spec.replicas", self.spec.replicas or 1)
        if self.spec.image:
            set_on_container("image", self.full_image_path())
        update_in(resource, "spec.template.spec.volumes", self.spec.volumes)
        set_on_container("volumeMounts", self.spec.volume_mounts)

        # runtime variables are listed ahead of the ones defined on the spec
        env_vars = {"MLRUN_EXEC_CONFIG": runobj.to_json()}
        if runobj.spec.verbose:
            env_vars["MLRUN_LOG_LEVEL"] = "debug"
        runtime_entries = [
            {"name": name, "value": value} for name, value in env_vars.items()
        ]
        set_on_container("env", runtime_entries + self.spec.env)

        # the remaining container fields are optional and skipped when unset
        if self.spec.image_pull_policy:
            set_on_container("imagePullPolicy", self.spec.image_pull_policy)
        if self.spec.resources:
            set_on_container("resources", self.spec.resources)
        if self.spec.workdir:
            set_on_container("workingDir", self.spec.workdir)

        if self.spec.image_pull_secret:
            pull_secrets = [{"name": self.spec.image_pull_secret}]
            update_in(resource, "spec.template.spec.imagePullSecrets", pull_secrets)

        if self.spec.command:
            mpirun_cmd = ["mpirun", "python", self.spec.command] + self.spec.args
            set_on_container("command", mpirun_cmd)

        return resource
Example #5
0
    def _generate_mpi_job(
        self,
        runobj: RunObject,
        execution: MLClientCtx,
        meta: client.V1ObjectMeta,
    ) -> dict:
        """Build the MPIJob resource dict from launcher and worker templates.

        Both templates start as deep copies of ``self._mpijob_pod_template``
        and receive the same image, volumes, environment, pull settings and
        labels. Worker-only and launcher-only settings are then applied by
        ``_enrich_worker_configurations`` and
        ``_enrich_launcher_configurations``.

        :param runobj:    run object, forwarded to ``_generate_runtime_env``
                          to obtain the runtime environment variables
        :param execution: execution context; its optional
                          ``slots_per_worker`` parameter sets
                          ``spec.slotsPerWorker``
        :param meta:      object metadata used as the job metadata; its
                          labels, plus an ``mlrun/job`` label holding
                          ``meta.name``, become the pod labels
        :return: the populated job dict
        """
        pod_labels = deepcopy(meta.labels)
        pod_labels["mlrun/job"] = meta.name

        # Populate mpijob object

        # start by populating pod templates
        launcher_pod_template = deepcopy(self._mpijob_pod_template)
        worker_pod_template = deepcopy(self._mpijob_pod_template)

        # values identical for both templates are computed once,
        # not once per template
        image = self.full_image_path() if self.spec.image else None
        runtime_env = self._generate_runtime_env(runobj)

        # configuration for both launcher and workers
        for pod_template in (launcher_pod_template, worker_pod_template):
            if self.spec.image:
                self._update_container(pod_template, "image", image)
            self._update_container(pod_template, "volumeMounts",
                                   self.spec.volume_mounts)
            # entry dicts are rebuilt per template so the two templates
            # never share them
            extra_env = [{"name": k, "value": v}
                         for k, v in runtime_env.items()]
            self._update_container(pod_template, "env",
                                   extra_env + self.spec.env)
            if self.spec.image_pull_policy:
                self._update_container(
                    pod_template,
                    "imagePullPolicy",
                    self.spec.image_pull_policy,
                )
            if self.spec.workdir:
                self._update_container(pod_template, "workingDir",
                                       self.spec.workdir)
            if self.spec.image_pull_secret:
                update_in(
                    pod_template,
                    "spec.imagePullSecrets",
                    [{
                        "name": self.spec.image_pull_secret
                    }],
                )
            update_in(pod_template, "metadata.labels", pod_labels)
            update_in(pod_template, "spec.volumes", self.spec.volumes)

        # configuration for workers only
        # update resources only for workers because the launcher
        # doesn't require special resources (like GPUs, Memory, etc..)
        self._enrich_worker_configurations(worker_pod_template)

        # configuration for launcher only
        self._enrich_launcher_configurations(launcher_pod_template)

        # generate mpi job using both pod templates
        job = self._generate_mpi_job_template(launcher_pod_template,
                                              worker_pod_template)

        # update the replicas only for workers
        update_in(
            job,
            "spec.mpiReplicaSpecs.Worker.replicas",
            self.spec.replicas or 1,
        )

        update_in(
            job,
            "spec.cleanPodPolicy",
            self.spec.clean_pod_policy,
        )

        # read the parameter once; a falsy value leaves the template's
        # setting as is
        slots_per_worker = execution.get_param("slots_per_worker")
        if slots_per_worker:
            update_in(job, "spec.slotsPerWorker", slots_per_worker)

        update_in(job, "metadata", meta.to_dict())

        return job
Example #6
0
    def _generate_mpi_job(
        self,
        runobj: RunObject,
        execution: MLClientCtx,
        meta: client.V1ObjectMeta,
    ) -> dict:
        """Build the MPIJob resource dict from launcher and worker templates.

        Both templates start as deep copies of ``self._mpijob_pod_template``
        and receive the same image, volumes, environment, pull settings,
        labels and placement fields. Worker-only and launcher-only settings
        are then applied by ``_enrich_worker_configurations`` and
        ``_enrich_launcher_configurations``.

        :param runobj:    run object, forwarded to ``_get_cmd_args`` to
                          obtain the command, its arguments and the extra
                          environment entries
        :param execution: execution context; its optional
                          ``slots_per_worker`` parameter sets
                          ``spec.slotsPerWorker``
        :param meta:      object metadata used as the job metadata; its
                          labels, plus an ``mlrun/job`` label holding
                          ``meta.name``, become the pod labels
        :return: the populated job dict
        """
        pod_labels = deepcopy(meta.labels)
        pod_labels["mlrun/job"] = meta.name

        # Populate mpijob object

        # start by populating pod templates
        launcher_pod_template = deepcopy(self._mpijob_pod_template)
        worker_pod_template = deepcopy(self._mpijob_pod_template)
        command, args, extra_env = self._get_cmd_args(runobj)

        # values identical for both templates are computed once,
        # not once per template
        image = self.full_image_path() if self.spec.image else None
        priority_class_name = self.spec.priority_class_name
        # the priority class is only applied when one is requested and the
        # configuration reports a non-empty set of valid class names
        apply_priority_class = bool(
            priority_class_name
            and mlconf.get_valid_function_priority_class_names())

        # configuration for both launcher and workers
        for pod_template in (launcher_pod_template, worker_pod_template):
            if self.spec.image:
                self._update_container(pod_template, "image", image)
            self._update_container(pod_template, "volumeMounts",
                                   self.spec.volume_mounts)
            self._update_container(pod_template, "env",
                                   extra_env + self.spec.env)
            if self.spec.image_pull_policy:
                self._update_container(
                    pod_template,
                    "imagePullPolicy",
                    self.spec.image_pull_policy,
                )
            if self.spec.workdir:
                self._update_container(pod_template, "workingDir",
                                       self.spec.workdir)
            if self.spec.image_pull_secret:
                update_in(
                    pod_template,
                    "spec.imagePullSecrets",
                    [{
                        "name": self.spec.image_pull_secret
                    }],
                )
            update_in(pod_template, "metadata.labels", pod_labels)
            update_in(pod_template, "spec.volumes", self.spec.volumes)
            update_in(pod_template, "spec.nodeName", self.spec.node_name)
            update_in(pod_template, "spec.nodeSelector",
                      self.spec.node_selector)
            # deliberately evaluated per template so the launcher and the
            # worker do not end up sharing one affinity object
            update_in(pod_template, "spec.affinity",
                      self.spec._get_sanitized_affinity())
            if apply_priority_class:
                update_in(
                    pod_template,
                    "spec.priorityClassName",
                    priority_class_name,
                )

        # configuration for workers only
        # update resources only for workers because the launcher
        # doesn't require special resources (like GPUs, Memory, etc..)
        self._enrich_worker_configurations(worker_pod_template)

        # configuration for launcher only
        self._enrich_launcher_configurations(launcher_pod_template,
                                             [command] + args)

        # generate mpi job using both pod templates
        job = self._generate_mpi_job_template(launcher_pod_template,
                                              worker_pod_template)

        # update the replicas only for workers
        update_in(
            job,
            "spec.mpiReplicaSpecs.Worker.replicas",
            self.spec.replicas or 1,
        )

        update_in(
            job,
            "spec.cleanPodPolicy",
            self.spec.clean_pod_policy,
        )

        # read the parameter once; a falsy value leaves the template's
        # setting as is
        slots_per_worker = execution.get_param("slots_per_worker")
        if slots_per_worker:
            update_in(job, "spec.slotsPerWorker", slots_per_worker)

        update_in(job, "metadata", meta.to_dict())

        return job