Code Example #1
File: test_update.py Project: zhaohc10/peloton
def test__auto_rollback_update_reduce_instances_with_bad_config(
        stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = stateless_job.job_spec.instance_count - 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        roll_back_on_failure=True,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLED_BACK")
    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    # no instance should be removed
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count
    )
    assert_pod_spec_equal(old_instance_zero_spec, new_instance_zero_spec)
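Note: assert_pod_spec_equal comes from Peloton's test utilities. As a minimal sketch of what such a check could look like (an assumption, not the project's actual implementation), Python protobuf messages already compare field by field:

# Hypothetical sketch; the real helper may normalize or skip volatile
# fields before comparing.
def assert_pod_spec_equal_sketch(expected_spec, actual_spec):
    # Python protobuf messages implement __eq__ as a field-by-field
    # comparison, so strict equality is a direct check.
    assert expected_spec == actual_spec, (
        "pod specs differ:\nexpected:\n%s\nactual:\n%s"
        % (expected_spec, actual_spec)
    )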
Code Example #2
def test__host_maintenance_violate_sla(stateless_job, maintenance):
    """
    1. Create a stateless job (instance_count=4) with host-limit-1 constraint and
       MaximumUnavailableInstances=1. Since there are only 3 UP hosts, one of the
       instances will not get placed (hence unavailable).
    2. Start host maintenance on one of the hosts (say A).
    3. Since one instance is already unavailable, no more instances should be
       killed due to host maintenance. Verify that host A does not transition
       to DOWN.
    """
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 4
    stateless_job.create()
    stateless_job.wait_for_all_pods_running(num_pods=3)

    # Pick a host that is UP and start maintenance on it
    test_host1 = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host1])
    assert resp

    try:
        # expected to time out: draining the host would violate the SLA,
        # so it must stay DRAINING instead of transitioning to DOWN
        wait_for_host_state(test_host1, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except Exception:
        assert is_host_in_state(test_host1, host_pb2.HOST_STATE_DRAINING)
        assert len(
            stateless_job.query_pods(states=[pod_pb2.POD_STATE_RUNNING])) == 3
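The try/except above relies on wait_for_host_state raising when the host never reaches the target state. A minimal polling sketch of that assumed contract, reusing is_host_in_state from the same test module (names and timeout values are illustrative, not the project's actual helper):

import time

# Hypothetical polling loop: raise on timeout so callers can assert on
# the timeout path, as the test above does.
def wait_for_host_state_sketch(hostname, state, timeout_secs=120, poll_secs=5):
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        if is_host_in_state(hostname, state):
            return
        time.sleep(poll_secs)
    raise Exception(
        "timed out waiting for %s to reach %s" % (hostname, state))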
Code Example #3
File: test_update.py Project: zhaohc10/peloton
def test__update_with_sla_aware_host_maintenance(stateless_job, maintenance):
    """
    1. Create a stateless job with 3 instances.
    2. Create a job update that reduces the instance count to 2, adds a
       host-limit-1 constraint, and defines an SLA with
       maximum_unavailable_instances=1.
    3. Start host maintenance on one of the hosts.
    4. The host should transition to DOWN and the update workflow should SUCCEED.
    """
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 2

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=1)
    update.create()

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    update.wait_for_state(goal_state="SUCCEEDED")
    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
Code Example #4
def test_auto_rollback_reduce_instances(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    job_spec_dump = load_test_config(
        UPDATE_STATELESS_JOB_BAD_HEALTH_CHECK_SPEC
    )
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    # increase the instance count
    updated_job_spec.instance_count = stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        roll_back_on_failure=True,
        max_instance_attempts=1,
        max_failure_instances=1,
        batch_size=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="ROLLED_BACK")
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count
    )
Code Example #5
File: test_update.py Project: zhaohc10/peloton
def test__in_place_update_host_maintenance(stateless_job, maintenance):
    # add enough instances so each host should have some tasks running
    stateless_job.job_spec.instance_count = 9
    # need extra retry attempts, since the in-place update needs more time
    # to process while an agent is in maintenance mode
    stateless_job.config = IntegrationTestConfig(
        max_retry_attempts=300,
        pool_file='test_stateless_respool.yaml',
    )
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = 9
    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    update.wait_for_state(goal_state="SUCCEEDED")
Code Example #6
def test__create_update_add_instances_with_bad_config(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # only one instance should be added
    assert (
        len(stateless_job.query_pods())
        == stateless_job.job_spec.instance_count + 1
    )
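Here wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED") inverts the usual criterion: the bad update is expected to fail, and reaching SUCCEEDED first would itself fail the test. A sketch of that assumed contract (not the actual StatelessUpdate method):

import time

# Hypothetical waiter: get_state is a zero-argument callable returning
# the workflow's current state string.
def wait_for_state_sketch(get_state, goal_state, failed_state=None,
                          timeout_secs=600, poll_secs=5):
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        state = get_state()
        if state == goal_state:
            return
        if failed_state is not None and state == failed_state:
            raise AssertionError(
                "reached %s while waiting for %s" % (state, goal_state))
        time.sleep(poll_secs)
    raise AssertionError("timed out; last state: %s" % get_state())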
Code Example #7
def test__auto_rollback_update_add_instances_with_bad_config(stateless_job):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')
    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = \
        stateless_job.job_spec.instance_count + 3

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             roll_back_on_failure=True,
                             max_failure_instances=1,
                             max_instance_attempts=1)
    update.create()
    update.wait_for_state(goal_state='ROLLED_BACK')
    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    # no instance should be added
    assert len(stateless_job.query_pods()) == \
        stateless_job.job_spec.instance_count
    assert_pod_spec_equal(old_instance_zero_spec, new_instance_zero_spec)
Code Example #8
File: test_update.py Project: zhaohc10/peloton
def test__update_with_host_maintenance__bad_config(stateless_job, maintenance):
    """
    1. Create a stateless job with 6 instances. Wait for all instances to reach
       RUNNING state. Since there are only 3 UP hosts, at least one host will
       have 2 or more instances on it.
    2. Start a bad job update with max failure tolerance of 1 and auto-rollback
       disabled.
    3. Start host maintenance on one of the hosts (say host A).
    4. Wait for the update to fail. There should be 2 instances unavailable.
    5. Since 2 instances are already unavailable and
       maximum_unavailable_instances=1, host maintenance should not proceed.
       Verify that host A does not transition to DOWN.
    """
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.job_spec.instance_count = 6
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 6
    updated_job_spec.sla.maximum_unavailable_instances = 1
    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
        batch_size=2,
    )
    update.create()

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    maintenance["start"]([test_host])

    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    try:
        # expected to time out: with 2 instances already unavailable,
        # draining this host would violate the SLA
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except Exception:
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
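The host picked for maintenance is the one running the most pods, so draining it would affect at least two instances. A plausible sketch of get_host_to_task_count (an assumption, not Peloton's actual helper):

# Hypothetical per-host pod counter: tally where each of the job's pods
# currently runs, defaulting every known host to zero.
def get_host_to_task_count_sketch(hosts, job):
    counts = dict((h, 0) for h in hosts)
    for pod_info in job.query_pods():
        host = pod_info.status.host
        if host in counts:
            counts[host] += 1
    return counts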
Code Example #9
File: test_update.py Project: benzei/peloton
def test__in_place_update_success_rate_with_component_restart(
        stateless_job, jobmgr, resmgr, placement_engines):
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = 30
    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)

    update.create(in_place=True)

    # restart all components except hostmgr
    jobmgr.restart()
    time.sleep(random.randint(1, 10))
    resmgr.restart()
    time.sleep(random.randint(1, 10))
    placement_engines.restart()

    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    old_pod_dict = {}
    new_pod_dict = {}

    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host

    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    count = 0
    for pod_name, old_host in old_pod_dict.items():
        if new_pod_dict[pod_name] != old_host:
            log.info("%s, prev:%s, cur:%s", pod_name, old_host,
                     new_pod_dict[pod_name])
            count += 1
    log.info("total mismatch: %d", count)
    assert count == 0
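The rfind('-') above strips the trailing run id from a pod id, leaving a per-instance pod name that stays stable across runs. Assuming ids of the form <job-id>-<instance-id>-<run-id> (inferred from the parsing, not documented here):

# Hypothetical pod id: job prefix "f5a1b2c3", instance 7, run 2.
pod_id = "f5a1b2c3-7-2"
pod_name = pod_id[:pod_id.rfind('-')]
assert pod_name == "f5a1b2c3-7"  # identical before and after the update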
Code Example #10
File: test_update.py Project: zhaohc10/peloton
def test__in_place_update_success_rate(stateless_job):
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(update_stateless_job_spec())
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = 30
    if minicluster_type() == "k8s":
        updated_job_spec.default_spec.containers[0].resource.mem_limit_mb = 0.1

    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)
    update.wait_for_state(goal_state='SUCCEEDED')

    new_pod_infos = stateless_job.query_pods()

    old_pod_dict = {}
    new_pod_dict = {}

    for old_pod_info in old_pod_infos:
        split_index = old_pod_info.status.pod_id.value.rfind('-')
        pod_name = old_pod_info.status.pod_id.value[:split_index]
        old_pod_dict[pod_name] = old_pod_info.status.host

    for new_pod_info in new_pod_infos:
        split_index = new_pod_info.status.pod_id.value.rfind('-')
        pod_name = new_pod_info.status.pod_id.value[:split_index]
        new_pod_dict[pod_name] = new_pod_info.status.host

    count = 0
    for pod_name, old_host in old_pod_dict.items():
        if new_pod_dict[pod_name] != old_host:
            log.info("%s, prev:%s, cur:%s", pod_name, old_host,
                     new_pod_dict[pod_name])
            count += 1
    log.info("total mismatch: %d", count)
    assert count == 0
Code Example #11
    def test__host_maintenance_violate_sla_restart_jobmgr(self, failure_tester, maintenance):
        """
        1. Create a stateless job (instance_count=4) with host-limit-1 constraint
           and MaximumUnavailableInstances=1. Since there are only 3 UP hosts, one
           of the instances will not get placed (hence unavailable).
        2. Start host maintenance on one of the hosts (say A).
        3. Restart job manager.
        4. Since one instance is already unavailable, no more instances should be
           killed due to host maintenance. Verify that host A does not transition
           to DOWN.
        """
        stateless_job = failure_tester.stateless_job()

        job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
        json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
        stateless_job.job_spec.instance_count = 4
        stateless_job.create()
        stateless_job.wait_for_all_pods_running(num_pods=3)

        # Pick a host that is UP and start maintenance on it
        test_host1 = get_host_in_state(
            host_pb2.HOST_STATE_UP, failure_tester.client)
        # update the client in maintenance fixture
        maintenance["update_client"](failure_tester.client)
        resp = maintenance["start"]([test_host1])
        assert resp

        leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
        assert leader
        assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client
        # update the client in the maintenance fixture
        maintenance["update_client"](failure_tester.client)

        try:
            # expected to time out: the host must stay DRAINING because
            # draining it would violate the SLA
            wait_for_host_state(test_host1, host_pb2.HOST_STATE_DOWN)
            assert False, 'Host should not transition to DOWN'
        except Exception:
            assert is_host_in_state(test_host1, host_pb2.HOST_STATE_DRAINING)
            assert len(stateless_job.query_pods(
                states=[pod_pb2.POD_STATE_RUNNING])) == 3
Code Example #12
def test__stop_start_pod_on_sla_violated_job(stateless_job):
    """
    1. Create a stateless job(instance_count=5) with host-limit-1 constraint and
       MaximumUnavailableInstances=1. Since there are only 3 UP hosts, 2 of
       the instances will not get placed (hence unavailable).
    2. Kill one of the running instances (say i). Instance should get killed.
    3. Start instance i. Instance i should transition to PENDING (due to the
       host-limit-1 constraint, it won't get placed).
    """
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 5
    stateless_job.create()
    stateless_job.wait_for_all_pods_running(num_pods=3)

    test_instance = None
    for i in range(0, stateless_job.job_spec.instance_count):
        if stateless_job.get_pod_status(i).state == pod_pb2.POD_STATE_RUNNING:
            test_instance = i
            break

    print(test_instance)
    assert test_instance is not None

    ranges = task_pb2.InstanceRange(to=test_instance + 1)
    # "from" is a Python keyword, so the field cannot be set with
    # attribute syntax; use setattr instead
    setattr(ranges, "from", test_instance)
    stateless_job.stop(ranges=[ranges])

    def instance_killed():
        return stateless_job.get_pod_status(
            test_instance).state == pod_pb2.POD_STATE_KILLED

    stateless_job.wait_for_condition(instance_killed)

    stateless_job.start(ranges=[ranges])

    def instance_pending():
        return stateless_job.get_pod_status(
            test_instance).state == pod_pb2.POD_STATE_PENDING

    stateless_job.wait_for_condition(instance_pending)
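setattr(ranges, "from", ...) is required because "from" is a Python keyword: writing ranges.from = ... is a syntax error even though the protobuf field exists. A standalone illustration with the same task_pb2.InstanceRange message used above:

# r.from = 2 would be a SyntaxError, so the generated field is set and
# read via setattr/getattr instead of attribute syntax.
r = task_pb2.InstanceRange(to=3)
setattr(r, "from", 2)
assert getattr(r, "from") == 2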
Code Example #13
def test__create_update_reduce_instances_with_bad_config(stateless_job, in_place):
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    old_pod_infos = stateless_job.query_pods()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    updated_job_spec.instance_count = stateless_job.job_spec.instance_count - 1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        batch_size=1,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")
    new_pod_infos = stateless_job.query_pods()
    assert len(old_pod_infos) == len(new_pod_infos)
Code Example #14
    def test__in_place_update_success_rate_with_component_restart(self, failure_tester):
        '''
        Test that an in-place update can finish after multiple component restarts
        '''
        stateless_job = failure_tester.stateless_job()
        stateless_job.job_spec.instance_count = 30
        stateless_job.create()
        stateless_job.wait_for_all_pods_running()
        old_pod_infos = stateless_job.query_pods()

        job_spec_dump = load_test_config("test_update_stateless_job_spec.yaml")
        updated_job_spec = JobSpec()
        json_format.ParseDict(job_spec_dump, updated_job_spec)

        updated_job_spec.instance_count = 30
        update = failure_tester.stateless_update(stateless_job,
                                                 updated_job_spec=updated_job_spec,
                                                 batch_size=0)

        update.create(in_place=True)

        # restart all components except hostmgr
        leader1 = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
        assert leader1
        assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader1)
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client

        time.sleep(random.randint(1, 10))

        leader2 = failure_tester.fw.get_leader_info(failure_tester.resmgr)
        assert leader2
        assert 0 != failure_tester.fw.restart(failure_tester.resmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.resmgr, leader2)
        failure_tester.reset_client()
        stateless_job.client = failure_tester.client

        time.sleep(random.randint(1, 10))

        assert 0 != failure_tester.fw.restart(failure_tester.stateless_pe)

        update.wait_for_state(goal_state='SUCCEEDED')

        new_pod_infos = stateless_job.query_pods()

        old_pod_dict = {}
        new_pod_dict = {}

        for old_pod_info in old_pod_infos:
            split_index = old_pod_info.status.pod_id.value.rfind('-')
            pod_name = old_pod_info.status.pod_id.value[:split_index]
            old_pod_dict[pod_name] = old_pod_info.status.host

        for new_pod_info in new_pod_infos:
            split_index = new_pod_info.status.pod_id.value.rfind('-')
            pod_name = new_pod_info.status.pod_id.value[:split_index]
            new_pod_dict[pod_name] = new_pod_info.status.host

        count = 0
        for pod_name, old_host in old_pod_dict.items():
            if new_pod_dict[pod_name] != old_host:
                log.info("%s, prev:%s, cur:%s", pod_name,
                         old_host, new_pod_dict[pod_name])
                count += 1
        log.info("total mismatch: %d", count)
        assert count == 0