def test__in_place_update_host_maintenance(stateless_job, maintenance):
    # add enough instances so each host should have some tasks running
    stateless_job.job_spec.instance_count = 9
    # need extra retry attempts, since the in-place update needs more time
    # to process while the agent is put in maintenance mode
    stateless_job.config = IntegrationTestConfig(
        max_retry_attempts=300,
        pool_file='test_stateless_respool.yaml',
    )
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 9

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    update.wait_for_state(goal_state="SUCCEEDED")


def test__auto_rollback_update_reduce_instances_with_bad_config(
        stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count - 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        roll_back_on_failure=True,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLED_BACK")

    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()
    # no instance should be removed
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count
    )
    assert_pod_spec_equal(old_instance_zero_spec, new_instance_zero_spec)


def test__update_with_sla_aware_host_maintenance(stateless_job, maintenance):
    """
    1. Create a stateless job with 3 instances.
    2. Create a job update that reduces the instance count to 2, adds a
       host-limit-1 constraint and defines an SLA with
       maximum_unavailable_instances=1.
    3. Start host maintenance on one of the hosts.
    4. The host should transition to DOWN and the update workflow should
       SUCCEED.
    """
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 2

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=1)
    update.create()

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    update.wait_for_state(goal_state="SUCCEEDED")
    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)


def test_auto_rollback_reduce_instances(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    job_spec_dump = load_test_config(
        UPDATE_STATELESS_JOB_BAD_HEALTH_CHECK_SPEC
    )
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    # increase the instance count
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        roll_back_on_failure=True,
        max_instance_attempts=1,
        max_failure_instances=1,
        batch_size=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLED_BACK")
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count
    )


def test__create_update_add_instances_with_bad_config(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # only one instance should be added
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count + 1
    )


def test__auto_rollback_update_add_instances_with_bad_config(stateless_job):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')

    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = \
        stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             roll_back_on_failure=True,
                             max_failure_instances=1,
                             max_instance_attempts=1)
    update.create()
    update.wait_for_state(goal_state='ROLLED_BACK')

    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()
    # no instance should be added
    assert len(stateless_job.query_pods()) == \
        stateless_job.job_spec.instance_count
    assert_pod_spec_equal(old_instance_zero_spec, new_instance_zero_spec)


def test__update_with_host_maintenance__bad_config(stateless_job, maintenance):
    """
    1. Create a stateless job with 6 instances. Wait for all instances to
       reach RUNNING state. This means that there is at least one host with
       2 or more instances on it.
    2. Start a bad job update with max failure tolerance of 1 and
       auto-rollback disabled.
    3. Start host maintenance on one of the hosts (say host A).
    4. Wait for the update to fail. There should be 2 instances unavailable.
    5. Since 2 instances are already unavailable and
       maximum_unavailable_instances=1, host maintenance should not proceed.
       Verify that host A doesn't transition to DOWN.
    """
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.job_spec.instance_count = 6
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(
            host_to_task_count.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
    ]

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 6
    updated_job_spec.sla.maximum_unavailable_instances = 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
        batch_size=2,
    )
    update.create()

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    maintenance["start"]([test_host])

    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # wait_for_host_state is expected to give up because the host must stay
    # DRAINING. Use try/except/else so the failure assertion is not swallowed
    # by the exception handler.
    try:
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    except Exception:
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    else:
        assert False, "Host should not transition to DOWN"


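# The try/except/else above amounts to "assert the host never reaches DOWN".
# A minimal sketch of how that intent could be made explicit, assuming only
# is_host_in_state() (already used above) and the time module; the helper name
# and timeout defaults are hypothetical and not part of this module.
def _assert_host_never_reaches_state(hostname, state, timeout_secs=60,
                                     poll_interval_secs=5):
    # poll for a bounded period and fail fast if the host hits the state
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        assert not is_host_in_state(hostname, state), (
            "host %s unexpectedly reached state %s" % (hostname, state))
        time.sleep(poll_interval_secs)

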
def test__in_place_update_success_rate_with_component_restart(
        stateless_job, jobmgr, resmgr, placement_engines):
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 30

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)

    # restart all components except hostmgr
    jobmgr.restart()
    time.sleep(random.randint(1, 10))
    resmgr.restart()
    time.sleep(random.randint(1, 10))
    placement_engines.restart()

    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    old_pod_dict = {}
    new_pod_dict = {}

    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host

    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    count = 0
    for pod_name, pod_id in old_pod_dict.items():
        if new_pod_dict[pod_name] != old_pod_dict[pod_name]:
            log.info("%s, prev:%s, cur:%s",
                     pod_name,
                     old_pod_dict[pod_name],
                     new_pod_dict[pod_name])
            count = count + 1
    log.info("total mismatch: %d", count)
    assert count == 0


def test__in_place_update_success_rate(stateless_job):
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(update_stateless_job_spec())
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 30
    if minicluster_type() == "k8s":
        updated_job_spec.default_spec.containers[0].resource.mem_limit_mb = 0.1

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)
    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    old_pod_dict = {}
    new_pod_dict = {}

    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host

    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    count = 0
    for pod_name, pod_id in old_pod_dict.items():
        if new_pod_dict[pod_name] != old_pod_dict[pod_name]:
            log.info("%s, prev:%s, cur:%s",
                     pod_name,
                     old_pod_dict[pod_name],
                     new_pod_dict[pod_name])
            count = count + 1
    log.info("total mismatch: %d", count)
    assert count == 0


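# The in-place success-rate tests above and below repeat the same
# pod_id -> pod_name -> host mapping. A minimal sketch of how that could be
# factored out; the helper name is hypothetical and it only assumes pod ids of
# the form "<pod_name>-<run_id>" with status.pod_id.value and status.host set,
# which the tests already rely on.
def _pod_name_to_host_map(pod_infos):
    pod_dict = {}
    for pod_info in pod_infos:
        pod_id = pod_info.status.pod_id.value
        pod_name = pod_id[:pod_id.rfind('-')]
        pod_dict[pod_name] = pod_info.status.host
    return pod_dict


# With such a helper, the mismatch check would reduce to something like:
#   old_hosts = _pod_name_to_host_map(old_pod_infos)
#   new_hosts = _pod_name_to_host_map(new_pod_infos)
#   assert all(new_hosts[name] == host for name, host in old_hosts.items())

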
def test__create_update_reduce_instances_with_bad_config(
        stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count - 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        batch_size=1,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    new_pod_infos = stateless_job.query_pods()
    assert len(old_pod_infos) == len(new_pod_infos)


def test__in_place_update_success_rate_with_component_restart(
        self, failure_tester):
    '''
    Test in-place update can finish after multiple components restart
    '''
    stateless_job = failure_tester.stateless_job()
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config("test_update_stateless_job_spec.yaml")
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 30

    update = failure_tester.stateless_update(
        stateless_job,
        updated_job_spec=updated_job_spec,
        batch_size=0)
    update.create(in_place=True)

    # restart all components except hostmgr
    leader1 = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert leader1
    assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader1)
    failure_tester.reset_client()
    stateless_job.client = failure_tester.client

    time.sleep(random.randint(1, 10))

    leader2 = failure_tester.fw.get_leader_info(failure_tester.resmgr)
    assert leader2
    assert 0 != failure_tester.fw.restart(failure_tester.resmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.resmgr, leader2)
    failure_tester.reset_client()
    stateless_job.client = failure_tester.client

    time.sleep(random.randint(1, 10))

    assert 0 != failure_tester.fw.restart(failure_tester.stateless_pe)

    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    old_pod_dict = {}
    new_pod_dict = {}

    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host

    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    count = 0
    for pod_name, pod_id in old_pod_dict.items():
        if new_pod_dict[pod_name] != old_pod_dict[pod_name]:
            log.info("%s, prev:%s, cur:%s",
                     pod_name,
                     old_pod_dict[pod_name],
                     new_pod_dict[pod_name])
            count = count + 1
    log.info("total mismatch: %d", count)
    assert count == 0