Beispiel #1
0
    def submit(self, task: Task) -> str:
        """Create a PyTorchJob custom object for a task and return its name.

        One pod template is built from the task and shared by a single
        ``Master`` replica and ``task.num_workers`` ``Worker`` replicas.
        ``task.num_gpus`` is overwritten with 1 before the pod spec is built.
        The job metadata uses ``generate_name``, so the returned name is read
        back from the API response rather than taken from ``task.name``.

        :param task: The task definition
        :type task: Task
        :return: The ``metadata.name`` of the created custom object
        :rtype: str
        """
        secret_refs = self._reference_secrets(task)
        config_maps = self._generate_configmaps(task)
        task.num_gpus = 1

        # Keyword order keeps the pod spec being built before the metadata.
        pod_template = client.V1PodTemplateSpec(
            spec=task_to_pod_spec(
                task,
                container_name="pytorch",
                secrets=secret_refs,
                configmaps=config_maps,
            ),
            metadata=client.V1ObjectMeta(name=task.name),
        )

        never_restart = PyTorchJobHandler.RESTART_NEVER
        replica_specs = {
            'Master': {
                'replicas': 1,
                'restartPolicy': never_restart,
                'template': pod_template,
            },
            'Worker': {
                'replicas': task.num_workers,
                'restartPolicy': never_restart,
                'template': pod_template,
            },
        }

        job_body = {
            'kind': "PyTorchJob",
            'apiVersion': 'kubeflow.org/' + PyTorchJobHandler.VERSION,
            'metadata': client.V1ObjectMeta(generate_name=task.name),
            'spec': {'pytorchReplicaSpecs': replica_specs},
        }

        created = self.api.create_namespaced_custom_object(
            PyTorchJobHandler.GROUP,
            PyTorchJobHandler.VERSION,
            self.namespace,
            PyTorchJobHandler.PLURAL,
            job_body,
        )
        return created['metadata']['name']
Beispiel #2
0
    def submit(self, task: Task) -> str:
        """Submit a multi-worker elastic PyTorch job Task

        The job is created with a fixed size: ``minReplicas`` and
        ``maxReplicas`` are both set to ``task.num_workers``, and
        ``task.num_gpus`` is overwritten with 1 before the pod spec is built.

        The etcd rendezvous host is read from the ``PYTORCH_ELASTIC_ETCD_SVC``
        environment variable. When that is unset or empty, the cluster IP of
        the service named ``etcd-service`` in the ``elastic-job`` namespace is
        used instead.

        :param task: The task definition
        :type task: Task
        :return: A string handle name
        :rtype: str
        :raises IndexError: If the environment variable is not set and no
            ``etcd-service`` exists in the ``elastic-job`` namespace
        """
        secrets = self._reference_secrets(task)
        configmaps = self._generate_configmaps(task)
        task.num_gpus = 1
        pod_spec = task_to_pod_spec(task, container_name="pytorch-elasticjob", secrets=secrets, configmaps=configmaps)
        template_metadata = client.V1ObjectMeta(name=task.name)

        template = client.V1PodTemplateSpec(metadata=template_metadata, spec=pod_spec)

        worker_replica_spec = {
            'replicas': task.num_workers,
            'restartPolicy': PyTorchElasticJobHandler.EXIT_CODE,
            'template': template,
        }

        spec = {
            'replicaSpecs': {'Worker': worker_replica_spec},
            'minReplicas': task.num_workers,
            'maxReplicas': task.num_workers,
        }

        etcd_svc = getenv('PYTORCH_ELASTIC_ETCD_SVC')
        if not etcd_svc:
            LOGGER.warning("No environment variable set for etcd service, looking for first available in elastic-job namespace")
            services = client.CoreV1Api().list_namespaced_service('elastic-job').items
            etcd_service = next((svc for svc in services if svc.metadata.name == 'etcd-service'), None)
            if etcd_service is None:
                # The previous `[...][0]` failed here with a bare
                # "list index out of range"; the exception type is kept so
                # existing handlers still match, but the message now says
                # what is missing and how to fix it.
                raise IndexError(
                    "No service named 'etcd-service' found in namespace 'elastic-job'; "
                    "set PYTORCH_ELASTIC_ETCD_SVC or deploy the etcd service"
                )
            etcd_svc = etcd_service.spec.cluster_ip
        LOGGER.info("Using etcd service on %s:%d", etcd_svc, PyTorchElasticJobHandler.ETCD_PORT)
        spec['rdzvEndpoint'] = f'{etcd_svc}:{PyTorchElasticJobHandler.ETCD_PORT}'

        pytorch_job_spec = {
            'kind': PyTorchElasticJobHandler.NAME,
            'apiVersion': f'{PyTorchElasticJobHandler.GROUP}/{PyTorchElasticJobHandler.VERSION}',
            'metadata': client.V1ObjectMeta(generate_name=task.name),
            'spec': spec,
        }

        pytorch_job = self.api.create_namespaced_custom_object(
            PyTorchElasticJobHandler.GROUP,
            PyTorchElasticJobHandler.VERSION,
            self.namespace,
            PyTorchElasticJobHandler.PLURAL,
            pytorch_job_spec,
        )
        return pytorch_job['metadata']['name']
Beispiel #3
0
    def submit(self, task: Task) -> str:
        """Create a single pod for the task in this handler's namespace.

        The pod is named ``task.name``, and that same name is returned as
        the handle.

        :param task: The task to run as a pod
        :type task: Task
        :return: The task name
        :rtype: str
        """
        # Arguments are evaluated left to right, so secrets are referenced
        # before configmaps are generated, and both before the metadata.
        pod_body = client.V1Pod(
            spec=task_to_pod_spec(
                task,
                secrets=self._reference_secrets(task),
                configmaps=self._generate_configmaps(task),
            ),
            metadata=client.V1ObjectMeta(name=task.name),
        )
        self.core_api.create_namespaced_pod(namespace=self.namespace, body=pod_body)
        return task.name
Beispiel #4
0
    def submit(self, task: Task) -> str:
        """Submit a new task as a Job

        The Job is named ``task.name`` and its pod template is named
        ``<task.name>-template``.

        :param task: A task definition
        :type task: Task
        :return: A string identifier for this task (the task name)
        :rtype: str
        """
        secrets = self._reference_secrets(task)
        configmaps = self._generate_configmaps(task)
        pod_spec = task_to_pod_spec(task, secrets=secrets, configmaps=configmaps)

        template = client.V1PodTemplateSpec(
            metadata=client.V1ObjectMeta(name=f'{task.name}-template'),
            spec=pod_spec,
        )
        job = client.V1Job(
            kind=JobHandler.NAME,
            metadata=client.V1ObjectMeta(name=task.name),
            spec=client.V1JobSpec(template=template),
        )

        self.api.create_namespaced_job(body=job, namespace=self.namespace)
        return task.name
Beispiel #5
0
    def submit(self, task: Task) -> str:
        """Create a TFJob custom object for a task and return its name.

        Only a ``Worker`` replica group is defined, with
        ``task.num_workers`` replicas. ``task.num_gpus`` is overwritten
        with 1 before the pod spec is built. The job metadata uses
        ``generate_name``, so the returned name is read back from the API
        response rather than taken from ``task.name``.

        :param task: A task definition to run
        :type task: Task
        :return: The ``metadata.name`` of the created custom object
        :rtype: str
        """
        secret_refs = self._reference_secrets(task)
        config_maps = self._generate_configmaps(task)
        task.num_gpus = 1

        # Keyword order keeps the pod spec being built before the metadata.
        pod_template = client.V1PodTemplateSpec(
            spec=task_to_pod_spec(
                task,
                container_name="tensorflow",
                secrets=secret_refs,
                configmaps=config_maps,
            ),
            metadata=client.V1ObjectMeta(name=task.name),
        )

        workers = {
            'replicas': task.num_workers,
            'restartPolicy': TFJobHandler.RESTART_NEVER,
            'template': pod_template,
        }

        job_body = {
            'kind': 'TFJob',
            'apiVersion': 'kubeflow.org/' + TFJobHandler.VERSION,
            'metadata': client.V1ObjectMeta(generate_name=task.name),
            'spec': {'tfReplicaSpecs': {'Worker': workers}},
        }

        created = self.api.create_namespaced_custom_object(
            TFJobHandler.GROUP,
            TFJobHandler.VERSION,
            self.namespace,
            TFJobHandler.PLURAL,
            job_body,
        )
        return created['metadata']['name']