def test__auto_rollback_update_reduce_instances_with_bad_config(
        stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count - 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        roll_back_on_failure=True,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLED_BACK")

    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    # no instance should be removed
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count
    )
    assert_pod_spec_equal(old_instance_zero_spec, new_instance_zero_spec)


def test__host_maintenance_violate_sla(stateless_job, maintenance):
    """
    1. Create a stateless job(instance_count=4) with host-limit-1 constraint
       and MaximumUnavailableInstances=1. This means that one instance is
       already unavailable.
    2. Start host maintenance on one of the hosts (say A).
    3. Since one instance is already unavailable, no more instances should be
       killed due to host maintenance. Verify that host A does not transition
       to DOWN.
    """
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 4
    stateless_job.create()
    stateless_job.wait_for_all_pods_running(num_pods=3)

    # Pick a host that is UP and start maintenance on it
    test_host1 = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host1])
    assert resp

    try:
        wait_for_host_state(test_host1, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except:
        assert is_host_in_state(test_host1, host_pb2.HOST_STATE_DRAINING)
        assert len(
            stateless_job.query_pods(states=[pod_pb2.POD_STATE_RUNNING])
        ) == 3


def test__update_with_sla_aware_host_maintenance(stateless_job, maintenance):
    """
    1. Create a stateless job with 3 instances.
    2. Create a job update that reduces the instance count to 2, adds a
       host-limit-1 constraint and defines an SLA with
       maximum_unavailable_instances=1.
    3. Start host maintenance on one of the hosts.
    4. The host should transition to DOWN and the update workflow should
       SUCCEED.
    """
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 2
    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=1)
    update.create()

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    update.wait_for_state(goal_state="SUCCEEDED")
    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)


def test_auto_rollback_reduce_instances(stateless_job, in_place):
    """
    Rolling back a failed update that increased the instance count should
    restore the job to its original instance count.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    job_spec_dump = load_test_config(
        UPDATE_STATELESS_JOB_BAD_HEALTH_CHECK_SPEC
    )
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    # increase the instance count
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        roll_back_on_failure=True,
        max_instance_attempts=1,
        max_failure_instances=1,
        batch_size=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLED_BACK")
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count
    )


def test__in_place_update_host_maintenance(stateless_job, maintenance):
    # add enough instances so each host should have some tasks running
    stateless_job.job_spec.instance_count = 9
    # need extra retry attempts, since in-place update would need more time
    # to process given agent is put in maintenance mode
    stateless_job.config = IntegrationTestConfig(
        max_retry_attempts=300,
        pool_file='test_stateless_respool.yaml',
    )
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 9

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    update.wait_for_state(goal_state="SUCCEEDED")


def test__create_update_add_instances_with_bad_config(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # only one instance should be added
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count + 1
    )


def test__auto_rollback_update_add_instances_with_bad_config(stateless_job):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')

    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = \
        stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             roll_back_on_failure=True,
                             max_failure_instances=1,
                             max_instance_attempts=1)
    update.create()
    update.wait_for_state(goal_state='ROLLED_BACK')

    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    # no instance should be added
    assert len(stateless_job.query_pods()) == \
        stateless_job.job_spec.instance_count
    assert_pod_spec_equal(old_instance_zero_spec, new_instance_zero_spec)


def test__update_with_host_maintenance__bad_config(stateless_job, maintenance):
    """
    1. Create a stateless job with 6 instances. Wait for all instances to
       reach RUNNING state. This means that there is at least one host with
       2 or more instances on it.
    2. Start a bad job update with max failure tolerance of 1 and
       auto-rollback disabled.
    3. Start host maintenance on one of the hosts (say host A).
    4. Wait for the update to fail. There should be 2 instances unavailable.
    5. Since 2 instances are already unavailable and
       maximum_unavailable_instances=1, host maintenance should not proceed.
       Verify that host A doesn't transition to DOWN.
    """
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.job_spec.instance_count = 6
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(
            host_to_task_count.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
    ]

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 6
    updated_job_spec.sla.maximum_unavailable_instances = 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
        batch_size=2,
    )
    update.create()

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    maintenance["start"]([test_host])

    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    try:
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except:
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)


def test__in_place_update_success_rate_with_component_restart(
        stateless_job, jobmgr, resmgr, placement_engines):
    """
    Test that an in-place update succeeds and keeps every pod on its
    original host even when jobmgr, resmgr and the placement engines are
    restarted while the update is in progress.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 30

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)

    # restart all components except hostmgr
    jobmgr.restart()
    time.sleep(random.randint(1, 10))
    resmgr.restart()
    time.sleep(random.randint(1, 10))
    placement_engines.restart()

    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    # map pod name -> host for the pods before and after the update
    old_pod_dict = {}
    new_pod_dict = {}
    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host
    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    # count pods that changed host; an in-place update should move none
    count = 0
    for pod_name in old_pod_dict:
        if new_pod_dict[pod_name] != old_pod_dict[pod_name]:
            log.info("%s, prev:%s, cur:%s", pod_name,
                     old_pod_dict[pod_name], new_pod_dict[pod_name])
            count = count + 1
    log.info("total mismatch: %d", count)
    assert count == 0


def test__in_place_update_success_rate(stateless_job):
    """
    Test that an in-place update keeps every pod on its original host.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(update_stateless_job_spec())
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 30
    if minicluster_type() == "k8s":
        updated_job_spec.default_spec.containers[0].resource.mem_limit_mb = 0.1

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)
    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    # map pod name -> host for the pods before and after the update
    old_pod_dict = {}
    new_pod_dict = {}
    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host
    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    # count pods that changed host; an in-place update should move none
    count = 0
    for pod_name in old_pod_dict:
        if new_pod_dict[pod_name] != old_pod_dict[pod_name]:
            log.info("%s, prev:%s, cur:%s", pod_name,
                     old_pod_dict[pod_name], new_pod_dict[pod_name])
            count = count + 1
    log.info("total mismatch: %d", count)
    assert count == 0


def test__host_maintenance_violate_sla_restart_jobmgr(self, failure_tester, maintenance):
    """
    1. Create a stateless job(instance_count=4) with host-limit-1 constraint
       and MaximumUnavailableInstances=1. Since there are only 3 UP hosts,
       one of the instances will not get placed (hence unavailable).
    2. Start host maintenance on one of the hosts (say A).
    3. Restart job manager.
    4. Since one instance is already unavailable, no more instances should be
       killed due to host maintenance. Verify that host A does not transition
       to DOWN.
    """
    stateless_job = failure_tester.stateless_job()
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 4
    stateless_job.create()
    stateless_job.wait_for_all_pods_running(num_pods=3)

    # Pick a host that is UP and start maintenance on it
    test_host1 = get_host_in_state(
        host_pb2.HOST_STATE_UP, failure_tester.client)
    # update the client in maintenance fixture
    maintenance["update_client"](failure_tester.client)
    resp = maintenance["start"]([test_host1])
    assert resp

    leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert leader
    assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
    failure_tester.reset_client()
    stateless_job.client = failure_tester.client
    # update the client of the maintenance fixture
    maintenance["update_client"](failure_tester.client)

    try:
        wait_for_host_state(test_host1, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except:
        assert is_host_in_state(test_host1, host_pb2.HOST_STATE_DRAINING)
        assert len(stateless_job.query_pods(
            states=[pod_pb2.POD_STATE_RUNNING])) == 3


def test__stop_start_pod_on_sla_violated_job(stateless_job):
    """
    1. Create a stateless job(instance_count=5) with host-limit-1 constraint
       and MaximumUnavailableInstances=1. Since there are only 3 UP hosts,
       2 of the instances will not get placed (hence unavailable).
    2. Kill one of the running instances (say i). Instance should get killed.
    3. Start instance i. Instance i should transition to PENDING (due to the
       host-limit-1 constraint, the instance won't get placed).
    """
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 5
    stateless_job.create()
    stateless_job.wait_for_all_pods_running(num_pods=3)

    # pick a running instance to stop and restart
    test_instance = None
    for i in range(0, stateless_job.job_spec.instance_count):
        if stateless_job.get_pod_status(i).state == pod_pb2.POD_STATE_RUNNING:
            test_instance = i
            break
    assert test_instance is not None

    # "from" is a Python keyword, so it has to be set via setattr
    ranges = task_pb2.InstanceRange(to=test_instance + 1)
    setattr(ranges, "from", test_instance)
    stateless_job.stop(ranges=[ranges])

    def instance_killed():
        return stateless_job.get_pod_status(
            test_instance).state == pod_pb2.POD_STATE_KILLED
    stateless_job.wait_for_condition(instance_killed)

    stateless_job.start(ranges=[ranges])

    def instance_pending():
        return stateless_job.get_pod_status(
            test_instance).state == pod_pb2.POD_STATE_PENDING
    stateless_job.wait_for_condition(instance_pending)


def test__create_update_reduce_instances_with_bad_config(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count - 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        batch_size=1,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    new_pod_infos = stateless_job.query_pods()
    assert len(old_pod_infos) == len(new_pod_infos)


def test__in_place_update_success_rate_with_component_restart(self, failure_tester):
    '''
    Test in-place update can finish after multiple components restart
    '''
    stateless_job = failure_tester.stateless_job()
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config("test_update_stateless_job_spec.yaml")
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 30

    update = failure_tester.stateless_update(stateless_job,
                                             updated_job_spec=updated_job_spec,
                                             batch_size=0)
    update.create(in_place=True)

    # restart all components except hostmgr
    leader1 = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert leader1
    assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader1)
    failure_tester.reset_client()
    stateless_job.client = failure_tester.client

    time.sleep(random.randint(1, 10))

    leader2 = failure_tester.fw.get_leader_info(failure_tester.resmgr)
    assert leader2
    assert 0 != failure_tester.fw.restart(failure_tester.resmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.resmgr, leader2)
    failure_tester.reset_client()
    stateless_job.client = failure_tester.client

    time.sleep(random.randint(1, 10))

    assert 0 != failure_tester.fw.restart(failure_tester.stateless_pe)

    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    # map pod name -> host for the pods before and after the update
    old_pod_dict = {}
    new_pod_dict = {}
    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host
    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    # count pods that changed host; an in-place update should move none
    count = 0
    for pod_name in old_pod_dict:
        if new_pod_dict[pod_name] != old_pod_dict[pod_name]:
            log.info("%s, prev:%s, cur:%s", pod_name,
                     old_pod_dict[pod_name], new_pod_dict[pod_name])
            count = count + 1
    log.info("total mismatch: %d", count)
    assert count == 0