def submit(self, task: Task) -> str:
    """Create a PyTorchJob custom object from a task

    A single pod template is built from the task and used for both the
    one-replica ``Master`` group and the ``task.num_workers``-replica
    ``Worker`` group.  Note that ``task.num_gpus`` is overwritten with 1.

    :param task: The task definition
    :type task: Task
    :return: The ``metadata.name`` of the object returned by the API
    :rtype: str
    """
    handler = PyTorchJobHandler

    secret_refs = self._reference_secrets(task)
    configmap_refs = self._generate_configmaps(task)
    # Forced before the pod spec is derived from the task
    # (presumably one GPU per replica -- TODO confirm).
    task.num_gpus = 1
    container_spec = task_to_pod_spec(
        task,
        container_name="pytorch",
        secrets=secret_refs,
        configmaps=configmap_refs,
    )
    pod_template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(name=task.name),
        spec=container_spec,
    )

    def replica_group(count):
        # Each group gets its own dict; the template object itself is shared.
        return {
            'replicas': count,
            'restartPolicy': handler.RESTART_NEVER,
            'template': pod_template,
        }

    workers = replica_group(task.num_workers)
    master = replica_group(1)

    body = {
        'kind': "PyTorchJob",
        'apiVersion': 'kubeflow.org/' + handler.VERSION,
        'metadata': client.V1ObjectMeta(generate_name=task.name),
        'spec': {
            'pytorchReplicaSpecs': {
                'Master': master,
                'Worker': workers,
            },
        },
    }

    created = self.api.create_namespaced_custom_object(
        handler.GROUP,
        handler.VERSION,
        self.namespace,
        handler.PLURAL,
        body,
    )
    # The name is read back from the response rather than from the task,
    # since the request only supplies generate_name.
    return created['metadata']['name']
def submit(self, task: Task) -> str:
    """Submit a multi-worker PyTorch ElasticJob Task

    The rendezvous endpoint is taken from the ``PYTORCH_ELASTIC_ETCD_SVC``
    environment variable.  When that is unset or empty, the cluster IP of
    the first service named ``etcd-service`` in the ``elastic-job``
    namespace is used instead.  Note that ``task.num_gpus`` is overwritten
    with 1, and both ``minReplicas`` and ``maxReplicas`` are set to
    ``task.num_workers``.

    :param task: The task definition
    :type task: Task
    :raises IndexError: If no etcd host is configured and no
        ``etcd-service`` can be found in the ``elastic-job`` namespace
    :return: A string handle name
    :rtype: str
    """
    secrets = self._reference_secrets(task)
    configmaps = self._generate_configmaps(task)
    # Forced before the pod spec is derived from the task
    # (presumably one GPU per replica -- TODO confirm).
    task.num_gpus = 1
    pod_spec = task_to_pod_spec(
        task,
        container_name="pytorch-elasticjob",
        secrets=secrets,
        configmaps=configmaps,
    )
    template_metadata = client.V1ObjectMeta(name=task.name)
    template = client.V1PodTemplateSpec(metadata=template_metadata, spec=pod_spec)

    spec = {
        'replicaSpecs': {
            'Worker': {
                'replicas': task.num_workers,
                'restartPolicy': PyTorchElasticJobHandler.EXIT_CODE,
                'template': template,
            },
        },
        'minReplicas': task.num_workers,
        'maxReplicas': task.num_workers,
    }

    etcd_svc = getenv('PYTORCH_ELASTIC_ETCD_SVC')
    if not etcd_svc:
        LOGGER.warning("No environment variable set for etcd service, looking for first available in elastic-job namespace")
        services = client.CoreV1Api().list_namespaced_service('elastic-job').items
        etcd_service = next(
            (svc for svc in services if svc.metadata.name == 'etcd-service'),
            None,
        )
        if etcd_service is None:
            # Same exception type the unguarded ``[0]`` lookup produced, but
            # with a message that tells the operator how to fix it.
            raise IndexError(
                "No service named 'etcd-service' found in namespace 'elastic-job'; "
                "set PYTORCH_ELASTIC_ETCD_SVC to the etcd host"
            )
        etcd_svc = etcd_service.spec.cluster_ip
    LOGGER.info("Using etcd service on %s:%d", etcd_svc, PyTorchElasticJobHandler.ETCD_PORT)
    spec['rdzvEndpoint'] = f'{etcd_svc}:{PyTorchElasticJobHandler.ETCD_PORT}'

    pytorch_job_spec = {
        'kind': PyTorchElasticJobHandler.NAME,
        'apiVersion': f'{PyTorchElasticJobHandler.GROUP}/{PyTorchElasticJobHandler.VERSION}',
        'metadata': client.V1ObjectMeta(generate_name=task.name),
        'spec': spec,
    }

    pytorch_job = self.api.create_namespaced_custom_object(
        PyTorchElasticJobHandler.GROUP,
        PyTorchElasticJobHandler.VERSION,
        self.namespace,
        PyTorchElasticJobHandler.PLURAL,
        pytorch_job_spec,
    )
    return pytorch_job['metadata']['name']
def submit(self, task: Task) -> str:
    """Create a single pod for a task

    The pod is named after the task and created in ``self.namespace``.

    :param task: A task/pod
    :type task: Task
    :return: The task name, which is also the pod name
    :rtype: str
    """
    spec = task_to_pod_spec(
        task,
        secrets=self._reference_secrets(task),
        configmaps=self._generate_configmaps(task),
    )
    pod_body = client.V1Pod(
        metadata=client.V1ObjectMeta(name=task.name),
        spec=spec,
    )
    self.core_api.create_namespaced_pod(body=pod_body, namespace=self.namespace)
    return task.name
def submit(self, task: Task) -> str:
    """Submit a new task as a Job

    The Job is named after the task and its pod template is named
    ``<task.name>-template``.  The Job is created in ``self.namespace``.

    :param task: A task definition
    :type task: Task
    :return: A string identifier for this task (the task name)
    :rtype: str
    """
    secrets = self._reference_secrets(task)
    configmaps = self._generate_configmaps(task)
    pod_spec = task_to_pod_spec(task, secrets=secrets, configmaps=configmaps)

    metadata = client.V1ObjectMeta(name=task.name)
    template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(name=f'{task.name}-template'),
        spec=pod_spec,
    )
    job = client.V1Job(
        kind=JobHandler.NAME,
        spec=client.V1JobSpec(template=template),
        metadata=metadata,
    )
    self.api.create_namespaced_job(body=job, namespace=self.namespace)
    return task.name
def submit(self, task: Task) -> str:
    """Create a TFJob custom object from a task

    The job has a single ``Worker`` replica group of ``task.num_workers``
    replicas built from the task's pod spec.  Note that ``task.num_gpus``
    is overwritten with 1.

    :param task: A task definition to run
    :type task: Task
    :return: The ``metadata.name`` of the object returned by the API
    :rtype: str
    """
    handler = TFJobHandler

    secret_refs = self._reference_secrets(task)
    configmap_refs = self._generate_configmaps(task)
    # Forced before the pod spec is derived from the task
    # (presumably one GPU per replica -- TODO confirm).
    task.num_gpus = 1
    container_spec = task_to_pod_spec(
        task,
        container_name="tensorflow",
        secrets=secret_refs,
        configmaps=configmap_refs,
    )
    pod_template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(name=task.name),
        spec=container_spec,
    )

    workers = {
        'replicas': task.num_workers,
        'restartPolicy': handler.RESTART_NEVER,
        'template': pod_template,
    }
    body = {
        'kind': 'TFJob',
        'apiVersion': 'kubeflow.org/' + handler.VERSION,
        'metadata': client.V1ObjectMeta(generate_name=task.name),
        'spec': {'tfReplicaSpecs': {'Worker': workers}},
    }

    created = self.api.create_namespaced_custom_object(
        handler.GROUP,
        handler.VERSION,
        self.namespace,
        handler.PLURAL,
        body,
    )
    # The name is read back from the response rather than from the task,
    # since the request only supplies generate_name.
    return created['metadata']['name']