def testFailedWorkerPod(self):
    """Exercise the "Failed" worker count with pods that cannot run.

    Three workers are started with the command ``badcommand`` and
    ``restart_policy="Never"``.  The test polls the worker counter until
    it reports three "Failed" workers (or the polling budget is spent),
    asks the manager to stop relaunching and remove its workers, polls
    until the counter is empty, and finally checks that the mocked
    ``recover_tasks`` was called with 0, 1 and 2 in any order.
    """
    dispatcher = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
    dispatcher.recover_tasks = MagicMock()

    # Timestamp plus a random number keeps job names distinct across runs.
    name_parts = (int(time.time()), random.randint(1, 101))
    manager = InstanceManager(
        dispatcher,
        job_name="test-failed-worker-pod-%d-%d" % name_parts,
        image_name="gcr.io/google-samples/hello-app:1.0",
        worker_command=["badcommand"],
        worker_args=["badargs"],
        namespace="default",
        num_workers=3,
        restart_policy="Never",
    )
    manager.start_workers()

    poll_budget = 20
    poll_interval_secs = 3

    def wait_for(is_done):
        # Sleep, read the counter, and stop early once is_done accepts it;
        # give up silently when the polling budget is exhausted.
        polls_left = poll_budget
        while polls_left > 0:
            polls_left -= 1
            time.sleep(poll_interval_secs)
            if is_done(manager.get_worker_counter()):
                return

    wait_for(lambda counter: counter["Failed"] == 3)

    manager.stop_relaunch_and_remove_workers()
    wait_for(lambda counter: not counter)

    expected_calls = [call(0), call(1), call(2)]
    dispatcher.recover_tasks.assert_has_calls(expected_calls, any_order=True)
def testCreateDeleteWorkerPod(self):
    """Launch three short-lived workers, then remove them again.

    Each worker runs ``echo`` with no arguments.  The test polls the
    worker counter until three workers are reported as "Succeeded" (or
    the polling budget is used up), asks the manager to stop relaunching
    and remove its workers, polls again until the counter is empty, and
    then checks that the mocked ``recover_tasks`` was called with 0, 1
    and 2 in any order.
    """
    dispatcher = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
    dispatcher.recover_tasks = MagicMock()

    # Timestamp plus a random number keeps job names distinct across runs.
    job_name = "test-create-worker-pod-%d-%d" % (
        int(time.time()),
        random.randint(1, 101),
    )
    manager = InstanceManager(
        dispatcher,
        job_name=job_name,
        image_name="gcr.io/google-samples/hello-app:1.0",
        worker_command=["echo"],
        worker_args=[],
        namespace="default",
        num_workers=3,
    )
    manager.start_workers()

    poll_budget = 20

    # Phase 1: wait for all three workers to be counted as succeeded.
    polls_left = poll_budget
    while polls_left:
        polls_left -= 1
        time.sleep(3)
        if manager.get_worker_counter()["Succeeded"] == 3:
            break

    manager.stop_relaunch_and_remove_workers()

    # Phase 2: wait for the counter to become empty after removal.
    polls_left = poll_budget
    while polls_left:
        polls_left -= 1
        time.sleep(3)
        if not manager.get_worker_counter():
            break

    expected_calls = [call(n) for n in range(3)]
    dispatcher.recover_tasks.assert_has_calls(expected_calls, any_order=True)
def test_create_delete_worker_pod(self):
    """Create three workers that exit immediately and verify their removal.

    Workers run ``/bin/bash -c echo`` on the ``ubuntu:18.04`` image.  The
    test polls the worker counter until three workers are "Succeeded"
    (or the polling budget runs out), asks the manager to stop
    relaunching and remove its workers, polls until the counter is
    empty, and asserts that the last counter read is falsy.
    """
    dispatcher = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
    dispatcher.recover_tasks = MagicMock()

    # Timestamp plus a random number keeps job names distinct across runs.
    name_parts = (int(time.time()), random.randint(1, 101))
    manager = InstanceManager(
        dispatcher,
        job_name="test-create-worker-pod-%d-%d" % name_parts,
        image_name="ubuntu:18.04",
        worker_command=["/bin/bash"],
        worker_args=["-c", "echo"],
        namespace="default",
        num_workers=3,
    )
    manager.start_workers()

    poll_budget = 20

    def poll_counter(stop_when):
        # Sleep then read the counter, up to poll_budget times; return the
        # most recent reading whether or not stop_when ever accepted it.
        latest = None
        for _attempt in range(poll_budget):
            time.sleep(3)
            latest = manager.get_worker_counter()
            if stop_when(latest):
                break
        return latest

    poll_counter(lambda counter: counter["Succeeded"] == 3)

    manager.stop_relaunch_and_remove_workers()
    remaining = poll_counter(lambda counter: not counter)

    self.assertFalse(remaining)
def test_relaunch_worker_pod(self):
    """Verify that removing a live worker causes a new one to appear.

    Starts ``num_workers`` workers that each run ``sleep 10``, waits until
    at least one is "Running" or "Pending", removes one such worker via
    ``_remove_worker`` and then polls ``_worker_pods_ip_phase`` until it
    contains a key that was not present before the removal.  The test
    fails if no such key shows up within the polling budget.

    Workers are always cleaned up, even when an assertion fails.
    """
    num_workers = 3
    task_d = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
    instance_manager = InstanceManager(
        task_d,
        job_name="test-relaunch-worker-pod-%d-%d"
        % (int(time.time()), random.randint(1, 101)),
        image_name="ubuntu:18.04",
        worker_command=["/bin/bash"],
        worker_args=["-c", "sleep 10 #"],
        namespace="default",
        num_workers=num_workers,
    )

    instance_manager.start_workers()
    try:
        max_check_num = 60
        for _ in range(max_check_num):
            time.sleep(1)
            counters = instance_manager.get_worker_counter()
            if counters["Running"] + counters["Pending"] > 0:
                break

        # Note: There is a slight chance of race condition: the snapshot
        # below only contains the workers the manager knows about at this
        # moment.
        # Hack to find a worker to remove
        current_workers = set()
        live_workers = set()
        with instance_manager._lock:
            pods = instance_manager._worker_pods_ip_phase
            for k, (_, _, phase) in pods.items():
                current_workers.add(k)
                if phase in ["Running", "Pending"]:
                    live_workers.add(k)
        self.assertTrue(live_workers)

        instance_manager._remove_worker(live_workers.pop())

        # Verify a new worker gets launched: look for a key that was not in
        # the pre-removal snapshot.  Breaking right after the check means a
        # success on the very last poll is not misreported as a failure.
        for _ in range(max_check_num):
            time.sleep(1)
            with instance_manager._lock:
                found = any(
                    k not in current_workers
                    for k in instance_manager._worker_pods_ip_phase
                )
            if found:
                break
        else:
            self.fail("Failed to find newly launched worker.")
    finally:
        # Always remove the pods so a failed assertion does not leak them.
        instance_manager.stop_relaunch_and_remove_workers()
def test_get_worker_addrs(self):
    """Check that alive-worker addresses match the "Running" count.

    Starts three workers that each run ``sleep 5`` and polls the worker
    counter 20 times, three seconds apart.  Whenever the counter reports
    running workers, the number of addresses returned by
    ``_get_alive_worker_addr`` must equal that running count.  The loop
    has no early exit, so the check is repeated on every poll.

    Workers are always cleaned up, even when the assertion fails.
    """
    task_d = _TaskDispatcher({"f": (0, 10)}, {}, {}, 1, 1)
    instance_manager = InstanceManager(
        task_d,
        job_name="test-create-worker-pod-%d-%d"
        % (int(time.time()), random.randint(1, 101)),
        image_name="ubuntu:18.04",
        worker_command=["/bin/bash"],
        worker_args=["-c", "sleep 5 #"],
        namespace="default",
        num_workers=3,
    )

    instance_manager.start_workers()
    try:
        max_check_num = 20
        for _ in range(max_check_num):
            time.sleep(3)
            counters = instance_manager.get_worker_counter()
            if counters["Running"]:
                worker_addrs = instance_manager._get_alive_worker_addr()
                self.assertEqual(len(worker_addrs), counters["Running"])
    finally:
        # Always remove the pods so a failed assertion does not leak them.
        instance_manager.stop_relaunch_and_remove_workers()