def test__update_with_host_maintenance__bad_config(stateless_job, maintenance):
    """
    1. Create a stateless job with 6 instances. Wait for all instances to
       reach RUNNING state. This means that there is at least one host with
       2 or more instances on it.
    2. Start a bad job update with max failure tolerance of 1 and
       auto-rollback disabled.
    3. Start host maintenance on one of the hosts (say host A).
    4. Wait for the update to fail. There should be 2 instances unavailable.
    5. Since 2 instances are already unavailable and
       maximum_unavailable_instances=1, host maintenance should not proceed.
       Verify that the host A doesn't transition to DOWN.
    """
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.job_spec.instance_count = 6
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    # Rank hosts by how many pods of this job they run, busiest first.
    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [t[0] for t in sorted(
        host_to_task_count.items(), key=operator.itemgetter(1), reverse=True)]

    # Build a known-bad update spec so the rollout is guaranteed to fail.
    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 6
    updated_job_spec.sla.maximum_unavailable_instances = 1
    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
        batch_size=2,
    )
    update.create()

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    maintenance["start"]([test_host])

    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # The wait is expected to time out (raise) because SLA enforcement
    # blocks the drain. NOTE: the failure assert lives in the `else` clause
    # so it is not swallowed by the exception handler, and we catch
    # Exception rather than a bare `except:` so Ctrl-C still works.
    try:
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    except Exception:
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    else:
        assert False, 'Host should not transition to DOWN'
def test__host_maintenance_and_agent_down(stateless_job, maintenance):
    """
    1. Create a large stateless job (that takes up more than two-thirds of
       the cluster resources) with MaximumUnavailableInstances=2.
    2. Start host maintenance on one of the hosts (say A) having pods of the
       job. MaximumUnavailableInstances=2 ensures that not more than 2 pods
       are unavailable due to host maintenance at a time.
    3. Take down another host which has pods running on it. This will cause
       TASK_LOST to be sent for all pods on the host after 75 seconds.
    4. Since TASK_LOST would cause the job SLA to be violated (due to
       insufficient resources), instances on the host A should not be killed
       once LOST event is received. Verify that host A does not transition
       to DOWN.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.job_spec.default_spec.containers[0].resource.cpu_limit = 0.3
    stateless_job.job_spec.sla.maximum_unavailable_instances = 2
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    # Rank hosts by how many pods of this job they run, busiest first.
    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    # take down another host which has pods of the job
    host_container = get_container([sorted_hosts[1]])

    # Pick a host that has pods running on it and start maintenance on it.
    test_host = sorted_hosts[0]
    try:
        host_container.stop()
        maintenance["start"]([test_host])
        # Expected to raise (time out): SLA enforcement keeps the host in
        # DRAINING. The failure assert is in the `else` clause so the
        # handler cannot swallow it; `except Exception` (not a bare
        # `except:`) keeps Ctrl-C working.
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    except Exception:
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    else:
        assert False, 'Host should not transition to DOWN'
    finally:
        # Always restore the agent we took down.
        host_container.start()
def test__host_maintenance_violate_sla_restart_jobmgr(self, failure_tester, maintenance):
    """
    1. Create a stateless job(instance_count=4) with host-limit-1 constraint
       and MaximumUnavailableInstances=1. Since there are only 3 UP hosts,
       one of the instances will not get placed (hence unavailable).
    2. Start host maintenance on one of the hosts (say A).
    3. Restart job manager.
    4. Since one instance is already unavailable, no more instances should
       be killed due to host maintenance. Verify that host A does not
       transition to DOWN.
    """
    stateless_job = failure_tester.stateless_job()
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 4
    stateless_job.create()
    # Only 3 of the 4 instances can be placed (host-limit-1, 3 UP hosts).
    stateless_job.wait_for_all_pods_running(num_pods=3)

    # Pick a host that is UP and start maintenance on it
    test_host1 = get_host_in_state(
        host_pb2.HOST_STATE_UP, failure_tester.client)
    # update the client in maintenance fixture
    maintenance["update_client"](failure_tester.client)
    resp = maintenance["start"]([test_host1])
    assert resp

    leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert leader
    assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
    failure_tester.reset_client()
    stateless_job.client = failure_tester.client

    # update the client of the maintenance fixture after leader change
    maintenance["update_client"](failure_tester.client)

    # Expected to raise (time out): with one instance already unavailable,
    # SLA enforcement must keep the host in DRAINING. The failure assert is
    # in the `else` clause so the handler cannot swallow it; catch
    # Exception, not a bare `except:`, so Ctrl-C still works.
    try:
        wait_for_host_state(test_host1, host_pb2.HOST_STATE_DOWN)
    except Exception:
        assert is_host_in_state(test_host1, host_pb2.HOST_STATE_DRAINING)
    else:
        assert False, 'Host should not transition to DOWN'

    assert len(stateless_job.query_pods(
        states=[pod_pb2.POD_STATE_RUNNING])) == 3
def clean_up():
    # Nothing to tear down if no hosts were put into maintenance.
    if draining_hosts:
        # Block until every draining host has fully transitioned to DOWN,
        # then bring all of them back up in one call.
        for host in draining_hosts:
            wait_for_host_state(host, hpb.HOST_STATE_DOWN)
        stop(draining_hosts)