def test__host_maintenance_within_sla_limit(stateless_job, maintenance):
    """Verify host maintenance drains pods one at a time within the job SLA.

    1. Create a stateless job (instance_count=4) with
       MaximumUnavailableInstances=1. Wait for all pods to reach RUNNING
       state. This means there is at least one host with more than one
       instance.
    2. Start host maintenance on a host (say A) with more than 1 instance.
    3. Pods on the host A should get killed in a way (1 at a time) that
       doesn't violate the SLA, and host A should transition to DOWN.
    """
    stateless_job.job_spec.instance_count = 4
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(
            host_to_task_count.items(), key=operator.itemgetter(1))
    ]

    # Pick a host that has pods running on it and start maintenance on it.
    test_host = sorted_hosts[0]
    resp = maintenance["start"]([test_host])
    assert resp

    # Poll until the host transitions to DOWN, verifying on every iteration
    # that the SLA is never violated along the way.
    max_retry_attempts = 20
    log.info(
        "%s waiting for state %s",
        test_host,
        host_pb2.HostState.Name(host_pb2.HOST_STATE_DOWN),
    )
    for _ in range(max_retry_attempts):
        try:
            if is_host_in_state(test_host, host_pb2.HOST_STATE_DOWN):
                break
            # If the number of available pods is less than 2
            # (instance_count - maximum_unavailable_instances), the SLA
            # has been violated: fail the test immediately.
            if len(stateless_job.query_pods(
                    states=[pod_pb2.POD_STATE_RUNNING])) < 2:
                assert False, "SLA violated during host maintenance"
        except AssertionError:
            # The SLA-violation failure must propagate; previously it was
            # swallowed by the broad exception handler below.
            raise
        except Exception as e:
            # Transient query errors are logged and the poll is retried.
            log.warning(e)
        time.sleep(5)
    else:
        # Loop exhausted without ever observing HOST_STATE_DOWN.
        log.info(
            "%s max attempts reached to wait for host state %s",
            test_host,
            host_pb2.HostState.Name(host_pb2.HOST_STATE_DOWN),
        )
        assert False, "host did not transition to DOWN"
def test__update_with_host_maintenance_and_agent_down(stateless_job, maintenance):
    """Verify a LOST-task SLA violation blocks host maintenance completion.

    1. Create a large stateless job (that takes up more than two-thirds of
       the cluster resources) with MaximumUnavailableInstances=2.
    2. Start host maintenance on one of the hosts (say A) having pods of
       the job. MaximumUnavailableInstances=2 ensures that not more than
       2 pods are unavailable due to host maintenance at a time.
    3. Take down another host which has pods running on it. This will cause
       TASK_LOST to be sent for all pods on the host after 75 seconds.
    4. Start an update to modify the instance spec of one of the pods.
    5. Since TASK_LOST would cause the job SLA to be violated, instances on
       the host A should not be killed once the LOST event is received.
       Verify that host A does not transition to DOWN.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.job_spec.default_spec.containers[0].resource.cpu_limit = 0.3
    stateless_job.job_spec.sla.maximum_unavailable_instances = 2
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(
            host_to_task_count.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
    ]

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    # Pick another host which has pods of the job to take down.
    host_container = get_container([sorted_hosts[1]])

    try:
        host_container.stop()
        maintenance["start"]([test_host])

        stateless_job.job_spec.instance_spec[10].containers.extend([
            pod_pb2.ContainerSpec(resource=pod_pb2.ResourceSpec(
                disk_limit_mb=20))
        ])
        update = StatelessUpdate(stateless_job,
                                 updated_job_spec=stateless_job.job_spec,
                                 batch_size=0)
        update.create()
        update.wait_for_state(goal_state="SUCCEEDED")

        stateless_job.stop()

        # This wait is expected to time out (raise): the host must stay
        # DRAINING because draining it would violate the job SLA.
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except AssertionError:
        # Do not let the deliberate failure above be swallowed; the
        # previous bare `except:` masked it with a confusing message.
        raise
    except Exception:
        # The wait timed out as expected; the host must still be DRAINING.
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    finally:
        host_container.start()
def get_host_in_state(state):
    """Return the hostname of some host currently in the given state.

    Note: the caller must ensure that at least one host is in the
    requested state.

    :param state: host_pb2.HostState
    :return: hostname of a host in the specified state
    """
    host_infos = query_hosts([state]).host_infos
    assert host_infos
    return host_infos[0].hostname
def test__update_with_host_maintenance__bad_config(stateless_job, maintenance):
    """Verify a failed update's unavailable instances block host maintenance.

    1. Create a stateless job with 6 instances. Wait for all instances to
       reach RUNNING state. This means that there is at least one host with
       2 or more instances on it.
    2. Start a bad job update with max failure tolerance of 1 and
       auto-rollback disabled.
    3. Start host maintenance on one of the hosts (say host A).
    4. Wait for the update to fail. There should be 2 instances unavailable.
    5. Since 2 instances are already unavailable and
       maximum_unavailable_instances=1, host maintenance should not proceed.
       Verify that the host A doesn't transition to DOWN.
    """
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.job_spec.instance_count = 6
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(
            host_to_task_count.items(),
            key=operator.itemgetter(1),
            reverse=True,
        )
    ]

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 6
    updated_job_spec.sla.maximum_unavailable_instances = 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
        batch_size=2,
    )
    update.create()

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    maintenance["start"]([test_host])

    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    try:
        # This wait is expected to time out (raise): draining the host
        # would exceed maximum_unavailable_instances.
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except AssertionError:
        # Do not let the deliberate failure above be swallowed; the
        # previous bare `except:` masked it with a confusing message.
        raise
    except Exception:
        # The wait timed out as expected; the host must still be DRAINING.
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
def test__host_maintenance_no_sla_defined_for_job(stateless_job, maintenance):
    """Verify host maintenance proceeds freely when the job has no SLA.

    1. Create a stateless job (instance_count=3) without SLA defined.
    2. Start host maintenance on all 3 hosts.
    3. The pods should get killed and all the hosts should transition
       to DOWN.
    """
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    # With no SLA, maintenance on every host at once is allowed.
    hostnames = [info.hostname for info in query_hosts([]).host_infos]
    maintenance["start"](hostnames)

    for hostname in hostnames:
        wait_for_host_state(hostname, host_pb2.HOST_STATE_DOWN)