def _submit_job(image_name, client_args, container_args):
    client = k8s.Client(
        image_name=image_name,
        namespace=client_args.namespace,
        job_name=client_args.job_name,
        event_callback=None,
        cluster_spec=client_args.cluster_spec,
    )

    client.create_master(
        resource_requests=client_args.master_resource_request,
        resource_limits=client_args.master_resource_limit,
        args=container_args,
        pod_priority=client_args.master_pod_priority,
        image_pull_policy=client_args.image_pull_policy,
        restart_policy=client_args.restart_policy,
        volume=client_args.volume,
        envs=parse_envs(client_args.envs),
    )
    logger.info(
        "ElasticDL job %s was successfully submitted. The master pod is: %s."
        % (client_args.job_name, client.get_master_pod_name())
    )
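# The parse_envs helper used above is not shown here. A minimal sketch of
# what it might look like, assuming it turns a comma-separated "NAME=value"
# string into a dict; the actual ElasticDL helper may instead build
# Kubernetes V1EnvVar objects, so treat this as illustrative only.
def parse_envs(arg):
    """Parse a string like "A=1,B=2" into {"A": "1", "B": "2"}."""
    envs = {}
    if not arg:
        return envs
    for env in arg.split(","):
        name, _, value = env.partition("=")
        envs[name.strip()] = value.strip()
    return envs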
def _submit_job(image_name, client_args, container_args):
    client = k8s.Client(
        image_name=image_name,
        namespace=client_args.namespace,
        job_name=client_args.job_name,
        event_callback=None,
        cluster_spec=client_args.cluster_spec,
        force_use_kube_config_file=client_args.force_use_kube_config_file,
    )

    if client_args.yaml:
        client.dump_master_yaml(
            resource_requests=client_args.master_resource_request,
            resource_limits=client_args.master_resource_limit,
            args=container_args,
            pod_priority=client_args.master_pod_priority,
            image_pull_policy=client_args.image_pull_policy,
            restart_policy=client_args.restart_policy,
            volume=client_args.volume,
            envs=parse_envs(client_args.envs),
            yaml=client_args.yaml,
        )
        logger.info(
            "ElasticDL job %s YAML has been dumped into file %s."
            % (client_args.job_name, client_args.yaml)
        )
    else:
        client.create_master(
            resource_requests=client_args.master_resource_request,
            resource_limits=client_args.master_resource_limit,
            args=container_args,
            pod_priority=client_args.master_pod_priority,
            image_pull_policy=client_args.image_pull_policy,
            restart_policy=client_args.restart_policy,
            volume=client_args.volume,
            envs=parse_envs(client_args.envs),
        )
        logger.info(
            "ElasticDL job %s was successfully submitted. "
            "The master pod is: %s."
            % (client_args.job_name, client.get_master_pod_name())
        )
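# A hypothetical invocation of the second _submit_job above, assuming the
# caller has built a Namespace carrying the same fields the function reads.
# All field values, the image name, and the container arguments below are
# illustrative, not ElasticDL defaults.
from argparse import Namespace

client_args = Namespace(
    namespace="default",
    job_name="mnist-train",
    cluster_spec="",
    force_use_kube_config_file=False,
    yaml="",  # empty: submit directly instead of dumping a YAML file
    master_resource_request="cpu=1,memory=2048Mi",
    master_resource_limit="cpu=1,memory=2048Mi",
    master_pod_priority=None,
    image_pull_policy="Always",
    restart_policy="Never",
    volume=None,
    envs="",
)
_submit_job(
    "gcr.io/my-project/elasticdl:train", client_args, ["--minibatch_size=64"]
)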
def start_embedding_pod_and_redis(
    self,
    command,
    args,
    embedding_service_id=0,
    resource_request="cpu=1,memory=4096Mi",
    resource_limit="cpu=1,memory=4096Mi",
    pod_priority=None,
    volume=None,
    image_pull_policy=None,
    restart_policy="Never",
    **kargs,
):
    logger.info("Starting pod for embedding service ...")
    self._k8s_client = k8s.Client(event_callback=None, **kargs)
    pod = self._k8s_client.create_embedding_service(
        worker_id=embedding_service_id,
        resource_requests=resource_request,
        resource_limits=resource_limit,
        pod_priority=pod_priority,
        volume=volume,
        image_pull_policy=image_pull_policy,
        command=command,
        args=args,
        restart_policy=restart_policy,
    )

    # TODO: assign the address with the pod's domain name instead of the
    # pod's IP, and do not use fixed ports.
    address_ip = pod.status.pod_ip
    while not address_ip:
        pod = self._k8s_client.get_embedding_service_pod(embedding_service_id)
        address_ip = pod.status.pod_ip
    self._embedding_service_endpoint = {
        address_ip: [30001 + i for i in range(6)]
    }
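# The endpoint mapping stored above ({ip: [ports]}) can be flattened into
# the "host:port" node list a Redis cluster client expects. A minimal
# sketch; _endpoint_to_startup_nodes is a hypothetical helper, and the
# rediscluster usage shown in the trailing comment is only one possible
# consumer of this structure.
def _endpoint_to_startup_nodes(endpoint):
    nodes = []
    for ip, ports in endpoint.items():
        for port in ports:
            nodes.append({"host": ip, "port": port})
    return nodes

# e.g., with the rediscluster package:
# rc = RedisCluster(
#     startup_nodes=_endpoint_to_startup_nodes(
#         self._embedding_service_endpoint
#     )
# )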
def __init__(
    self,
    task_d,
    num_workers=1,
    worker_command=None,
    worker_args=None,
    worker_resource_request="cpu=1,memory=4096Mi",
    worker_resource_limit="cpu=1,memory=4096Mi",
    worker_pod_priority=None,
    num_ps=0,
    ps_command=None,
    ps_args=None,
    ps_resource_request="cpu=1,memory=4096Mi",
    ps_resource_limit="cpu=1,memory=4096Mi",
    ps_pod_priority=None,
    volume=None,
    image_pull_policy=None,
    restart_policy="Never",
    envs=None,
    **kwargs,
):
    self._num_workers = num_workers
    self._worker_command = worker_command
    self._worker_args = worker_args
    self._worker_resource_request = worker_resource_request
    self._worker_resource_limit = worker_resource_limit
    self._worker_pod_priority = worker_pod_priority
    self._num_ps = num_ps
    self._ps_command = ps_command
    self._ps_args = ps_args
    self._ps_resource_request = ps_resource_request
    self._ps_resource_limit = ps_resource_limit
    self._ps_pod_priority = ps_pod_priority
    self._restart_policy = restart_policy
    self._volume = volume
    self._image_pull_policy = image_pull_policy
    self._envs = envs
    self._task_d = task_d
    self._next_worker_id = itertools.count().__next__

    # Protects the following variables, which are accessed from event_cb.
    self._lock = threading.Lock()
    # Worker id to (pod name, phase) mapping.
    # phase: None/Pending/Running/Succeeded/Failed/Unknown
    #   None: worker was just launched; no event has been received yet.
    #   Pending: worker pod has not started yet.
    #   Running: worker pod is running.
    #   Succeeded: worker pod finished all tasks and terminated with
    #       no issue.
    #   Failed: worker pod was killed for some reason.
    #   Unknown: unknown.
    self._worker_pods_phase = {}
    # Pod name to worker id mapping.
    self._worker_pod_name_to_id = {}

    self._relaunch_deleted_live_worker = True

    self._ps_pods_phase = {}
    self._ps_pod_name_to_id = {}
    self._relaunch_deleted_live_ps = True

    self._failed_pods = []

    self._k8s_client = k8s.Client(event_callback=self._event_cb, **kwargs)
    self._ps_addrs = self._get_addrs(
        self._num_ps, self._k8s_client.get_ps_service_address
    )
    # TODO: Select a worker address to be used for broadcasting model
    # parameters under allreduce-strategy.
    self._worker_addrs = self._get_addrs(
        self._num_workers, self._k8s_client.get_worker_service_address
    )
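# _get_addrs is called in both __init__ versions but is not shown here. A
# minimal sketch, under the assumption that it collects one service address
# per replica (via the passed getter) and joins them into a comma-separated
# string; the real method may return a different structure.
def _get_addrs(self, num_addrs, addr_get_fn):
    addrs = []
    for addr_id in range(num_addrs):
        addrs.append(addr_get_fn(addr_id))
    return ",".join(addrs)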
def test_client(self):
    tracker = WorkerTracker()

    c = k8s.Client(
        image_name="gcr.io/google-samples/hello-app:1.0",
        namespace="default",
        job_name="test-job-%d-%d"
        % (int(time.time()), random.randint(1, 101)),
        event_callback=tracker.event_cb,
    )

    # Start master
    resource = "cpu=100m,memory=64M"
    c.create_master(
        resource_requests=resource,
        resource_limits=resource,
        pod_priority=None,
        args=None,
        volume=None,
        image_pull_policy="Never",
        restart_policy="Never",
    )
    while tracker._count < 1:
        time.sleep(1)

    # Check master pod labels
    master = c.get_master_pod()
    self.assertEqual(
        master.metadata.labels[k8s.ELASTICDL_JOB_KEY], c.job_name
    )
    self.assertEqual(
        master.metadata.labels[k8s.ELASTICDL_REPLICA_TYPE_KEY], "master"
    )
    self.assertEqual(
        master.metadata.labels[k8s.ELASTICDL_REPLICA_INDEX_KEY], "0"
    )

    # Start 3 workers
    for i in range(3):
        _ = c.create_worker(
            worker_id=str(i),
            resource_requests=resource,
            resource_limits=resource,
            command=["echo"],
            pod_priority=None,
            args=None,
            volume=None,
            image_pull_policy="Never",
            restart_policy="Never",
            expose_ports=False,
        )
        time.sleep(5)

    # Wait for workers to be added
    while tracker._count < 4:
        time.sleep(1)

    # Check worker pod labels
    for i in range(3):
        worker = c.get_worker_pod(i)
        self.assertEqual(
            worker.metadata.labels[k8s.ELASTICDL_JOB_KEY], c.job_name
        )
        self.assertEqual(
            worker.metadata.labels[k8s.ELASTICDL_REPLICA_TYPE_KEY],
            "worker",
        )
        self.assertEqual(
            worker.metadata.labels[k8s.ELASTICDL_REPLICA_INDEX_KEY], str(i)
        )

    # Start 3 worker services
    for i in range(3):
        c.create_worker_service(i)

    # Check worker services
    for i in range(3):
        service = c.get_worker_service(i)
        self.assertIsNotNone(service)
        self.assertEqual(
            service.spec.selector[k8s.ELASTICDL_JOB_KEY], c.job_name
        )
        self.assertEqual(
            service.spec.selector[k8s.ELASTICDL_REPLICA_TYPE_KEY], "worker"
        )
        self.assertEqual(
            service.spec.selector[k8s.ELASTICDL_REPLICA_INDEX_KEY], str(i)
        )

    # Start 2 ps pods
    for i in range(2):
        _ = c.create_ps(
            ps_id=str(i),
            resource_requests=resource,
            resource_limits=resource,
            command=["echo"],
            pod_priority=None,
            args=None,
            volume=None,
            image_pull_policy="Never",
            restart_policy="Never",
            expose_ports=False,
        )
        time.sleep(5)

    # Wait for ps pods to be added
    while tracker._count < 6:
        time.sleep(1)

    # Check ps pod labels
    for i in range(2):
        ps = c.get_ps_pod(i)
        self.assertEqual(
            ps.metadata.labels[k8s.ELASTICDL_JOB_KEY], c.job_name
        )
        self.assertEqual(
            ps.metadata.labels[k8s.ELASTICDL_REPLICA_TYPE_KEY], "ps"
        )
        self.assertEqual(
            ps.metadata.labels[k8s.ELASTICDL_REPLICA_INDEX_KEY], str(i)
        )

    # Start 2 ps services
    for i in range(2):
        c.create_ps_service(i)

    # Check ps services
    for i in range(2):
        service = c.get_ps_service(i)
        self.assertIsNotNone(service)
        self.assertEqual(
            service.spec.selector[k8s.ELASTICDL_JOB_KEY], c.job_name
        )
        self.assertEqual(
            service.spec.selector[k8s.ELASTICDL_REPLICA_TYPE_KEY], "ps"
        )
        self.assertEqual(
            service.spec.selector[k8s.ELASTICDL_REPLICA_INDEX_KEY], str(i)
        )

    # Delete the master; all workers and ps pods should also be deleted
    c.delete_master()

    # Wait for all ps pods, workers, and services to be deleted
    while tracker._count > 0:
        time.sleep(1)
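# The test above polls tracker._count but WorkerTracker is not shown. A
# minimal sketch, inferred from how the counter is used: increment on pod
# ADDED events and decrement on DELETED events. This is an assumption about
# the helper's behavior, not its actual definition.
class WorkerTracker(object):
    def __init__(self):
        self._count = 0

    def event_cb(self, event):
        if event["type"] == "ADDED":
            self._count += 1
        elif event["type"] == "DELETED":
            self._count -= 1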
def __init__(
    self,
    task_d,
    rendezvous_server=None,
    num_workers=1,
    worker_command=None,
    worker_args=None,
    worker_resource_request="cpu=1,memory=4096Mi",
    worker_resource_limit="cpu=1,memory=4096Mi",
    worker_pod_priority=None,
    num_ps=0,
    ps_command=None,
    ps_args=None,
    ps_resource_request="cpu=1,memory=4096Mi",
    ps_resource_limit="cpu=1,memory=4096Mi",
    ps_pod_priority=None,
    volume=None,
    image_pull_policy=None,
    restart_policy="Never",
    envs=None,
    disable_relaunch=False,
    log_file_path=None,
    **kwargs,
):
    self._num_workers = num_workers
    self._worker_command = worker_command
    self._worker_args = worker_args
    self._worker_resource_request = worker_resource_request
    self._worker_resource_limit = worker_resource_limit
    self._worker_pod_priority = _parse_worker_pod_priority(
        self._num_workers, worker_pod_priority
    )
    self._num_ps = num_ps
    self._ps_command = ps_command
    self._ps_args = ps_args
    self._ps_resource_request = ps_resource_request
    self._ps_resource_limit = ps_resource_limit
    self._ps_pod_priority = ps_pod_priority
    self._restart_policy = restart_policy
    self._volume = volume
    self._image_pull_policy = image_pull_policy
    self._envs = envs
    self._task_d = task_d
    self._rendezvous_server = rendezvous_server
    self._next_worker_id = itertools.count().__next__
    self._log_file_path = log_file_path

    # Protects the following variables, which are accessed from event_cb.
    self._lock = threading.Lock()

    self._init_worker_pod_status()

    if disable_relaunch:
        self._k8s_client = k8s.Client(**kwargs)
    else:
        self._k8s_client = k8s.Client(
            event_callback=self._event_cb,
            periodic_call_func=self._process_worker,
            **kwargs
        )
    self._ps_addrs = self._get_addrs(
        self._num_ps, self._k8s_client.get_ps_service_address
    )
    self._worker_addrs = []
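# _parse_worker_pod_priority is called above but not shown. A minimal
# sketch, under the assumption that it simply expands a single priority
# value into a per-worker {worker_id: priority} mapping; the real helper
# may also support fractional high/low priority splits across workers.
def _parse_worker_pod_priority(num_workers, worker_pod_priority):
    return {
        worker_id: worker_pod_priority for worker_id in range(num_workers)
    }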