Example 1
def test__in_place_update_host_maintenance(stateless_job, maintenance):
    # add enough instances so that each host has some tasks running
    stateless_job.job_spec.instance_count = 9
    # need extra retry attempts, since the in-place update needs more time
    # to complete while the agent is in maintenance mode
    stateless_job.config = IntegrationTestConfig(
        max_retry_attempts=300,
        pool_file='test_stateless_respool.yaml',
    )
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = 9
    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    update.wait_for_state(goal_state="SUCCEEDED")
Example 2
def test__auto_rollback_update_reduce_instances_with_bad_config(
        stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = stateless_job.job_spec.instance_count - 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        roll_back_on_failure=True,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLED_BACK")
    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    # no instance should be removed
    assert (len(
        stateless_job.query_pods()) == stateless_job.job_spec.instance_count)
    assert_pod_spec_equal(old_instance_zero_spec, new_instance_zero_spec)
Example 3
def test__update_with_sla_aware_host_maintenance(stateless_job, maintenance):
    """
    1. Create a stateless job with 3 instances.
    2. Create a job update that reduces the instance count to 2, adds a
       host-limit-1 constraint, and defines an SLA with
       maximum_unavailable_instances=1.
    3. Start host maintenance on one of the hosts.
    4. The host should transition to DOWN and the update workflow should SUCCEED.
    """
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 2

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=1)
    update.create()

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    update.wait_for_state(goal_state="SUCCEEDED")
    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
Example 4
def test_auto_rollback_reduce_instances(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    job_spec_dump = load_test_config(
        UPDATE_STATELESS_JOB_BAD_HEALTH_CHECK_SPEC
    )
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    # increase the instance count
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        roll_back_on_failure=True,
        max_instance_attempts=1,
        max_failure_instances=1,
        batch_size=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLED_BACK")
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count
    )
Example 5
def test__create_update_add_instances_with_bad_config(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # only one instance should be added
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count + 1
    )
Example 6
def test__auto_rollback_update_add_instances_with_bad_config(stateless_job):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')
    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = \
        stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             roll_back_on_failure=True,
                             max_failure_instances=1,
                             max_instance_attempts=1)
    update.create()
    update.wait_for_state(goal_state='ROLLED_BACK')
    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    # no instance should be added
    assert len(stateless_job.query_pods()) == \
        stateless_job.job_spec.instance_count
    assert_pod_spec_equal(old_instance_zero_spec, new_instance_zero_spec)
Example 7
def test__update_with_host_maintenance__bad_config(stateless_job, maintenance):
    """
    1. Create a stateless job with 6 instances. Wait for all instances to reach
       RUNNING state. This means that there is at least one host with 2 or more
       instances on it.
    2. Start a bad job update with max failure tolerance of 1 and auto-rollback
       disabled.
    3. Start host maintenance on one of the hosts (say host A).
    4. Wait for the update to fail. There should be 2 instances unavailable.
    5. Since 2 instances are already unavailable and
       maximum_unavailable_instances=1, host maintenance should not proceed.
       Verify that host A doesn't transition to DOWN.
    """
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.job_spec.instance_count = 6
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 6
    updated_job_spec.sla.maximum_unavailable_instances = 1
    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
        batch_size=2,
    )
    update.create()

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    maintenance["start"]([test_host])

    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # wait_for_host_state should time out here: with 2 instances already
    # unavailable and maximum_unavailable_instances=1, the host must not be
    # drained further, so it stays DRAINING
    try:
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    except Exception:
        pass
    else:
        assert False, 'Host should not transition to DOWN'
    assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
Example 8
def test__in_place_update_success_rate_with_component_restart(
        stateless_job, jobmgr, resmgr, placement_engines):
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = 30
    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)

    update.create(in_place=True)

    # restart all components except hostmgr
    jobmgr.restart()
    time.sleep(random.randint(1, 10))
    resmgr.restart()
    time.sleep(random.randint(1, 10))
    placement_engines.restart()

    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    old_pod_dict = {}
    new_pod_dict = {}

    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host

    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    # count pods that landed on a different host after the in-place update
    count = 0
    for pod_name, old_host in old_pod_dict.items():
        if new_pod_dict[pod_name] != old_host:
            log.info("%s, prev:%s, cur:%s", pod_name, old_host,
                     new_pod_dict[pod_name])
            count += 1
    log.info("total mismatch: %d", count)
    assert count == 0
Example 9
    def __init__(self, job,
                 updated_job_file=None,
                 client=None,
                 config=None,
                 pool=None,
                 batch_size=None,
                 updated_job_spec=None,
                 roll_back_on_failure=None,
                 max_instance_attempts=None,
                 max_failure_instances=None,
                 start_paused=None,
                 ):
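        """If updated_job_spec is not provided, the spec is loaded and parsed
        from updated_job_file; unset options fall back to 0 / False defaults.
        """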

        self.config = config or IntegrationTestConfig()
        self.client = client or Client()
        self.pool = pool or Pool(self.config)
        if updated_job_spec is None:
            job_config_dump = load_test_config(updated_job_file)
            updated_job_spec = JobSpec()
            json_format.ParseDict(job_config_dump, updated_job_spec)
        self.updated_job_spec = updated_job_spec
        self.batch_size = batch_size or 0
        self.roll_back_on_failure = roll_back_on_failure or False
        self.max_instance_attempts = max_instance_attempts or 0
        self.max_failure_instances = max_failure_instances or 0
        self.start_paused = start_paused or False
        self.job = job
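
A minimal usage sketch of the constructor above, assuming the same StatelessUpdate class and stateless_job fixture used in the other examples; the spec file name is reused from Example 12 and the parameter values are illustrative:

# When no updated_job_spec is passed, the constructor loads the YAML file
# named by updated_job_file and parses it into a JobSpec before the update
# is created.
update = StatelessUpdate(
    stateless_job,
    updated_job_file="test_update_stateless_job_spec.yaml",
    batch_size=1,
)
update.create()
update.wait_for_state(goal_state="SUCCEEDED")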
Example 10
def test__in_place_update_success_rate(stateless_job):
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(update_stateless_job_spec())
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = 30
    if minicluster_type() == "k8s":
        updated_job_spec.default_spec.containers[0].resource.mem_limit_mb = 0.1

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)
    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    old_pod_dict = {}
    new_pod_dict = {}

    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host

    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    # count pods that landed on a different host after the in-place update
    count = 0
    for pod_name, old_host in old_pod_dict.items():
        if new_pod_dict[pod_name] != old_host:
            log.info("%s, prev:%s, cur:%s", pod_name, old_host,
                     new_pod_dict[pod_name])
            count += 1
    log.info("total mismatch: %d", count)
    assert count == 0
Example 11
def test__create_update_reduce_instances_with_bad_config(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = stateless_job.job_spec.instance_count - 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        batch_size=1,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")
    new_pod_infos = stateless_job.query_pods()
    assert len(old_pod_infos) == len(new_pod_infos)
Example 12
    def test__in_place_update_success_rate_with_component_restart(self, failure_tester):
        '''
        Test that an in-place update can finish after multiple component restarts.
        '''
        stateless_job = failure_tester.stateless_job()
        stateless_job.job_spec.instance_count = 30
        stateless_job.create()
        stateless_job.wait_for_all_pods_running()
        old_pod_infos = stateless_job.query_pods()

        job_spec_dump = load_test_config("test_update_stateless_job_spec.yaml")
        updated_job_spec = JobSpec()
        json_format.ParseDict(job_spec_dump, updated_job_spec)

        updated_job_spec.instance_count = 30
        update = failure_tester.stateless_update(stateless_job,
                                                 updated_job_spec=updated_job_spec,
                                                 batch_size=0)

        update.create(in_place=True)

        # restart all components except hostmgr
        leader1 = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
        assert leader1
        assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader1)
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client

        time.sleep(random.randint(1, 10))

        leader2 = failure_tester.fw.get_leader_info(failure_tester.resmgr)
        assert leader2
        assert 0 != failure_tester.fw.restart(failure_tester.resmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.resmgr, leader2)
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client

        time.sleep(random.randint(1, 10))

        assert 0 != failure_tester.fw.restart(failure_tester.stateless_pe)

        update.wait_for_state(goal_state='SUCCEEDED')

        new_pod_infos = stateless_job.query_pods()

        old_pod_dict = {}
        new_pod_dict = {}

        for old_pod_info in old_pod_infos:
            split_index = old_pod_info.status.pod_id.value.rfind('-')
            pod_name = old_pod_info.status.pod_id.value[:split_index]
            old_pod_dict[pod_name] = old_pod_info.status.host

        for new_pod_info in new_pod_infos:
            split_index = new_pod_info.status.pod_id.value.rfind('-')
            pod_name = new_pod_info.status.pod_id.value[:split_index]
            new_pod_dict[pod_name] = new_pod_info.status.host

        # count pods that landed on a different host after the in-place update
        count = 0
        for pod_name, old_host in old_pod_dict.items():
            if new_pod_dict[pod_name] != old_host:
                log.info("%s, prev:%s, cur:%s", pod_name,
                         old_host, new_pod_dict[pod_name])
                count += 1
        log.info("total mismatch: %d", count)
        assert count == 0