def test__host_maintenance_lifecycle(host_affinity_job, maintenance):
    """Exercise the full host-maintenance cycle (UP -> DRAINING -> DOWN -> UP)
    on a host that a job is pinned to via a host-affinity label constraint.
    """
    # Choose any UP host as the maintenance target.
    test_host = get_host_in_state(hpb.HOST_STATE_UP)

    # Pin the job to the chosen host through its label constraint.
    default_config = host_affinity_job.job_config.defaultConfig
    default_config.constraint.labelConstraint.label.value = test_host

    host_affinity_job.create()

    # Begin draining the selected host.
    assert maintenance['start']([test_host])
    assert is_host_in_state(test_host, hpb.HOST_STATE_DRAINING)

    # A draining host eventually transitions to DOWN.
    wait_for_host_state(test_host, hpb.HOST_STATE_DOWN)

    # Completing maintenance brings the host out of DOWN and back to UP.
    assert maintenance['stop']([test_host])
    assert not is_host_in_state(test_host, hpb.HOST_STATE_DOWN)

    wait_for_host_state(test_host, hpb.HOST_STATE_UP)
# Example #2
def test__host_maintenance_violate_sla(stateless_job, maintenance):
    """
    1. Create a stateless job(instance_count=4) with host-limit-1 constraint and
       MaximumUnavailableInstances=1. This means that there is one instance that
       is unavailable.
    2. Start host maintenance on one of the hosts (say A).
    3. Since one instance is already unavailable, no more instances should be
       killed due to host maintenance. Verify that host A does not transition
       to DOWN.
    """
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 4
    stateless_job.create()
    # Only 3 of the 4 instances can run (host-limit-1 constraint), so one
    # instance is permanently unavailable.
    stateless_job.wait_for_all_pods_running(num_pods=3)

    # Pick a host that is UP and start maintenance on it
    test_host1 = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host1])
    assert resp

    # The host must NOT reach DOWN, so wait_for_host_state is expected to
    # time out (raise). The try/except/else shape keeps the AssertionError
    # of the failure branch out of the handler — the original bare `except:`
    # also caught the `assert False`, masking the real failure.
    try:
        wait_for_host_state(test_host1, host_pb2.HOST_STATE_DOWN)
    except Exception:
        # Timed out as expected: the host is still draining and no
        # additional pods were killed.
        assert is_host_in_state(test_host1, host_pb2.HOST_STATE_DRAINING)
        assert len(
            stateless_job.query_pods(states=[pod_pb2.POD_STATE_RUNNING])) == 3
    else:
        assert False, 'Host should not transition to DOWN'
# Example #3
def test__host_maintenance_within_sla_limit(stateless_job, maintenance):
    """
    1. Create a stateless job(instance_count=4) and MaximumUnavailableInstances=1.
       Wait for all pods to reach RUNNING state. This means there is at least one
       host with more than one instance.
    2. Start host maintenance on a host (say A) with more than 1 instance.
    3. Pods on the host A should get killed in a way (1 at a time)
       that doesn't violate the SLA and host A should transition to DOWN
    """
    stateless_job.job_spec.instance_count = 4
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    # Hosts ordered by ascending task count.
    sorted_hosts = [
        t[0]
        for t in sorted(host_to_task_count.items(), key=operator.itemgetter(1))
    ]

    # Pick a host that has pods running on it and start maintenance on it.
    # NOTE(review): with ascending sort this picks the LEAST-loaded host;
    # the docstring asks for a host with more than one instance — confirm
    # whether sorted_hosts[-1] (or reverse=True) was intended.
    test_host = sorted_hosts[0]

    resp = maintenance["start"]([test_host])
    assert resp

    # Poll until the host transitions to DOWN, verifying on each iteration
    # that the job SLA is not violated while pods are drained.
    attempts = 0
    max_retry_attempts = 20

    log.info(
        "%s waiting for state %s",
        test_host,
        host_pb2.HostState.Name(host_pb2.HOST_STATE_DOWN),
    )
    while attempts < max_retry_attempts:
        sla_violated = False
        try:
            if is_host_in_state(test_host, host_pb2.HOST_STATE_DOWN):
                break

            # if the number of available pods is less than 2 the SLA is
            # considered violated and the test must fail.
            # NOTE(review): the docstring implies a floor of
            # instance_count - maximum_unavailable_instances = 3; the
            # original check used 2 — confirm the intended threshold.
            if len(stateless_job.query_pods(
                    states=[pod_pb2.POD_STATE_RUNNING])) < 2:
                sla_violated = True
        except Exception as e:
            # Logger.warn is deprecated; warning() is the supported name.
            log.warning(e)
        finally:
            time.sleep(5)
            attempts += 1

        # Raised OUTSIDE the try block so the AssertionError is not swallowed
        # by the broad `except Exception` above — the original `assert False`
        # inside the try could never actually fail the test.
        assert not sla_violated, "job SLA violated during host maintenance"

    if attempts == max_retry_attempts:
        log.info(
            "%s max attempts reached to wait for host state %s",
            test_host,
            host_pb2.HostState.Name(host_pb2.HOST_STATE_DOWN),
        )
        assert False, "timed out waiting for host to transition to DOWN"
# Example #4
def test__update_with_host_maintenance_and_agent_down(stateless_job,
                                                      maintenance):
    """
    1. Create a large stateless job (that takes up more than two-thirds of
       the cluster resources) with MaximumUnavailableInstances=2.
    2. Start host maintenance on one of the hosts (say A) having pods of the job.
       MaximumUnavailableInstances=2 ensures that not more than 2 pods are
       unavailable due to host maintenance at a time.
    3. Take down another host which has pods running on it. This will cause
       TASK_LOST to be sent for all pods on the host after 75 seconds.
    4. Start an update to modify the instance spec of one of the pods.
    5. Since TASK_LOST would cause the job SLA to be violated, instances on the
       host A should not be killed once LOST event is received. Verify that
       host A does not transition to DOWN.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.job_spec.default_spec.containers[0].resource.cpu_limit = 0.3
    stateless_job.job_spec.sla.maximum_unavailable_instances = 2
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    # Hosts ordered by descending task count so the busiest hosts come first.
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    # pick another host which has pods of the job to take down
    host_container = get_container([sorted_hosts[1]])

    try:
        host_container.stop()
        maintenance["start"]([test_host])

        # Update instance 10 to request extra disk, forcing a pod restart.
        stateless_job.job_spec.instance_spec[10].containers.extend([
            pod_pb2.ContainerSpec(resource=pod_pb2.ResourceSpec(
                disk_limit_mb=20))
        ])
        update = StatelessUpdate(stateless_job,
                                 updated_job_spec=stateless_job.job_spec,
                                 batch_size=0)
        update.create()
        update.wait_for_state(goal_state="SUCCEEDED")

        stateless_job.stop()

        # Expect this wait to time out (raise): the host must never go DOWN.
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    except Exception:
        # try/except/else keeps the AssertionError of the failure branch out
        # of this handler — the original bare `except:` swallowed it.
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    else:
        assert False, 'Host should not transition to DOWN'
    finally:
        # Always bring the stopped agent back up for subsequent tests.
        host_container.start()
# Example #5
def test__update_with_host_maintenance__bad_config(stateless_job, maintenance):
    """
    1. Create a stateless job with 6 instances. Wait for all instances to reach
       RUNNING state. This means that there is at least one host with 2 or more
       instances on it
    2. Start a bad job update with max failure tolerance of 1 and auto-rollback
       disabled.
    3. Start host maintenance on one of the hosts (say host A).
    4. Wait for the update to fail. There should be 2 instances unavailable.
    5. Since 2 instances are already unavailable and
       maximum_unavailable_instances=1, host maintenance should not proceed.
       Verify that the host A doesn't transition to DOWN.
    """
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.job_spec.instance_count = 6
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    # Hosts ordered by descending task count so the busiest hosts come first.
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    # Build a deliberately bad update spec (auto-rollback disabled).
    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 6
    updated_job_spec.sla.maximum_unavailable_instances = 1
    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
        batch_size=2,
    )
    update.create()

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    maintenance["start"]([test_host])

    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # The host must NOT reach DOWN, so wait_for_host_state is expected to
    # time out (raise). try/except/else keeps the AssertionError of the
    # failure branch out of the handler — the original bare `except:`
    # swallowed the `assert False`, masking the real failure.
    try:
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    except Exception:
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    else:
        assert False, 'Host should not transition to DOWN'
0
    def test__host_maintenance_violate_sla_restart_jobmgr(self, failure_tester, maintenance):
        """
        1. Create a stateless job(instance_count=4) with host-limit-1 constraint and
        MaximumUnavailableInstances=1. Since there are only 3 UP hosts, one of
        the instances will not get placed (hence unavailable).
        2. Start host maintenance on one of the hosts (say A).
        3. Restart job manager.
        4. Since one instance is already unavailable, no more instances should be
        killed due to host maintenance. Verify that host A does not transition to DOWN
        """
        stateless_job = failure_tester.stateless_job()

        job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
        json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
        stateless_job.job_spec.instance_count = 4
        stateless_job.create()
        # Only 3 of the 4 instances can be placed (host-limit-1, 3 hosts).
        stateless_job.wait_for_all_pods_running(num_pods=3)

        # Pick a host that is UP and start maintenance on it
        test_host1 = get_host_in_state(
            host_pb2.HOST_STATE_UP, failure_tester.client)
        # update the client in maintenance fixture
        maintenance["update_client"](failure_tester.client)
        resp = maintenance["start"]([test_host1])
        assert resp

        # Restart the job manager leader and wait for a new leader to emerge.
        leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
        assert leader
        assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client
        # update the client of the maintenance fixture after the restart
        maintenance["update_client"](failure_tester.client)

        # The host must NOT reach DOWN, so wait_for_host_state is expected
        # to time out (raise). try/except/else keeps the AssertionError of
        # the failure branch out of the handler — the original bare
        # `except:` swallowed the `assert False`, masking the real failure.
        try:
            wait_for_host_state(test_host1, host_pb2.HOST_STATE_DOWN)
        except Exception:
            assert is_host_in_state(test_host1, host_pb2.HOST_STATE_DRAINING)
            assert len(stateless_job.query_pods(
                states=[pod_pb2.POD_STATE_RUNNING])) == 3
        else:
            assert False, 'Host should not transition to DOWN'