Beispiel #1
0
def test__in_place_kill_job_release_host():
    """
    Two jobs are created from the same spec file and brought to RUNNING.
    An in-place update is created on the first job, which is then stopped
    immediately; a regular update is created on the second job. Both
    updates are expected to reach SUCCEEDED.
    """
    running_jobs = []
    for _ in range(2):
        job = StatelessJob(job_file="test_stateless_job_spec.yaml")
        job.create()
        job.wait_for_state(goal_state="RUNNING")
        running_jobs.append(job)
    first_job, second_job = running_jobs

    in_place_update = StatelessUpdate(
        first_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    in_place_update.create(in_place=True)
    # stop the job right after its in-place update has been created
    first_job.stop()

    regular_update = StatelessUpdate(
        second_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    regular_update.create()

    # both updates should complete
    for pending_update in (in_place_update, regular_update):
        pending_update.wait_for_state(goal_state="SUCCEEDED")
Beispiel #2
0
def test_allocation_update_job__add_instances_restart_hostmgr_and_placement_engine(
        stateless_job,
        in_place,
        hostmgr,
        placement_engines,
):
    """
    Check the allocation while hostmgr and the placement engines are
    restarted around job creation and around an instance-count increase.

    1. Create a job with 30 instances.
    2. Restart hostmgr and placement engines.
    3. Wait for all pods to run and verify the allocation.
    4. Update the job to 50 instances.
    5. Restart hostmgr and placement engines.
    6. Wait for the update to succeed and verify the allocation.
    7. Stop the job and expect the cpu allocation of its pool to be 0.
    """
    initial_count, grown_count = 30, 50

    stateless_job.job_spec.instance_count = initial_count
    stateless_job.create()
    restart_hostmgr_and_placement_engine(hostmgr, placement_engines)
    stateless_job.wait_for_all_pods_running()
    verify_allocation(stateless_job)

    stateless_job.job_spec.instance_count = grown_count
    scale_up = StatelessUpdate(
        stateless_job,
        updated_job_spec=stateless_job.job_spec,
    )
    scale_up.create(in_place=in_place)
    restart_hostmgr_and_placement_engine(hostmgr, placement_engines)
    scale_up.wait_for_state(goal_state="SUCCEEDED")
    verify_allocation(stateless_job)

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")

    # nothing may remain allocated once the job is killed
    remaining_cpu = stateless_job.pool.get_allocation("cpu")
    assert remaining_cpu == 0
Beispiel #3
0
def test_auto_rollback_reduce_instances(stateless_job, in_place):
    """
    Start an update built from the bad-health-check spec with rollback on
    failure enabled and expect it to end in ROLLED_BACK.

    Note: despite the test name, the update requests 3 instances more than
    the job currently has. After the rollback the number of pods must equal
    the instance count of the job's own spec.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    bad_spec_dict = load_test_config(UPDATE_STATELESS_JOB_BAD_HEALTH_CHECK_SPEC)
    bad_spec = JobSpec()
    json_format.ParseDict(bad_spec_dict, bad_spec)

    # request three instances more than the job currently has
    bad_spec.instance_count = stateless_job.job_spec.instance_count + 3

    rollback_update = StatelessUpdate(
        stateless_job,
        updated_job_spec=bad_spec,
        batch_size=1,
        max_failure_instances=1,
        max_instance_attempts=1,
        roll_back_on_failure=True,
    )
    rollback_update.create(in_place=in_place)
    rollback_update.wait_for_state(goal_state="ROLLED_BACK")

    pod_count = len(stateless_job.query_pods())
    assert pod_count == stateless_job.job_spec.instance_count
Beispiel #4
0
def test__auto_rollback_update_add_instances_with_bad_config(stateless_job):
    """
    Start an update that uses the bad job spec and asks for 3 additional
    instances, with rollback on failure enabled.

    The update must end in ROLLED_BACK, the pod count must equal the
    instance count of the job's own spec, and the pod spec of instance 0
    must be equal before and after.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    bad_spec_dict = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    bad_spec = JobSpec()
    json_format.ParseDict(bad_spec_dict, bad_spec)
    bad_spec.instance_count = stateless_job.job_spec.instance_count + 3

    rollback_update = StatelessUpdate(
        stateless_job,
        updated_job_spec=bad_spec,
        max_instance_attempts=1,
        max_failure_instances=1,
        roll_back_on_failure=True,
    )
    rollback_update.create()
    rollback_update.wait_for_state(goal_state='ROLLED_BACK')
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    # no instance should be added
    pod_count = len(stateless_job.query_pods())
    assert pod_count == stateless_job.job_spec.instance_count
    assert_pod_spec_equal(spec_before, spec_after)
Beispiel #5
0
def test__in_place_update_host_maintenance(stateless_job, maintenance):
    """
    Run an in-place update of a 9-instance job while one host is put into
    maintenance.

    After the update is created, a host in the UP state is picked and
    maintenance is started on it. The host is expected to reach DOWN and
    the update is expected to reach SUCCEEDED.
    """
    # add enough instances so each host should have some tasks running
    stateless_job.job_spec.instance_count = 9
    # need extra retry attempts, since in-place update would need more time
    # to process given agent is put in maintenance mode
    # (no trailing comma after the call: a trailing comma would assign a
    # 1-tuple instead of the config object)
    stateless_job.config = IntegrationTestConfig(
        max_retry_attempts=300,
        pool_file='test_stateless_respool.yaml',
    )
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)

    # keep the instance count unchanged so only the spec is updated
    updated_job_spec.instance_count = 9
    update = StatelessUpdate(stateless_job,
                             updated_job_spec=updated_job_spec,
                             batch_size=0)
    update.create(in_place=True)

    # Pick a host that is UP and start maintenance on it
    test_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    resp = maintenance["start"]([test_host])
    assert resp

    wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
    update.wait_for_state(goal_state="SUCCEEDED")
Beispiel #6
0
def test__create_update_stopped_job(stateless_job, in_place):
    """
    Create an update against a job that has been stopped, start the job
    again, and wait for the update to succeed and the job to be RUNNING.

    The test expects 3 pods before the update and 5 afterwards, with the
    pod ids and the pod spec of instance 0 changed by the update.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    old_pod_infos = stateless_job.query_pods()
    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    # Collect the names of the pods that existed before the update.
    # NOTE(review): despite its name this set holds pod names, and it is not
    # read anywhere below - confirm whether the final loop was meant to use it.
    old_pod_states = set()
    for pod_info in old_pod_infos:
        old_pod_states.add(pod_info.spec.pod_name.value)

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")
    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
        batch_size=1,
    )
    update.create(in_place=in_place)
    # the job is started again right after the update has been created
    stateless_job.start()
    update.wait_for_state(goal_state="SUCCEEDED")
    stateless_job.wait_for_state(goal_state="RUNNING")
    new_pod_infos = stateless_job.query_pods()
    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()
    assert len(old_pod_infos) == 3
    assert len(new_pod_infos) == 5
    assert_pod_id_changed(old_pod_infos, new_pod_infos)
    assert_pod_spec_changed(old_instance_zero_spec, new_instance_zero_spec)

    # Only new instances should be RUNNING
    # NOTE(review): the membership test below looks up a pod *name* in
    # new_pod_infos, which is the list returned by query_pods(), not a
    # collection of names. Presumably it never matches, so every pod ends up
    # being checked against RUNNING - confirm the intended check (the job is
    # also restarted above) before changing it.
    for pod_info in new_pod_infos:
        if pod_info.spec.pod_name.value in new_pod_infos:
            assert pod_info.status.state == pod_pb2.POD_STATE_KILLED
        else:
            assert pod_info.status.state == pod_pb2.POD_STATE_RUNNING
Beispiel #7
0
def test__update_with_sla_aware_host_maintenance(stateless_job, maintenance):
    """
    Run a job update built from the SLA spec file while a host is put into
    maintenance.

    1. Create the stateless job and wait for all pods to run.
    2. Create an update from 'test_stateless_job_spec_sla.yaml' with the
       instance count set to 2 and a batch size of 1.
    3. Start host maintenance on a host that is UP.
    4. The update should SUCCEED and the host should transition to DOWN.
    """
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    sla_spec_dict = load_test_config('test_stateless_job_spec_sla.yaml')
    sla_spec = JobSpec()
    json_format.ParseDict(sla_spec_dict, sla_spec)
    sla_spec.instance_count = 2

    sla_update = StatelessUpdate(
        stateless_job,
        updated_job_spec=sla_spec,
        batch_size=1,
    )
    sla_update.create()

    # Pick a host that is UP and start maintenance on it
    drained_host = get_host_in_state(host_pb2.HOST_STATE_UP)
    start_maintenance = maintenance["start"]
    response = start_maintenance([drained_host])
    assert response

    sla_update.wait_for_state(goal_state="SUCCEEDED")
    wait_for_host_state(drained_host, host_pb2.HOST_STATE_DOWN)
Beispiel #8
0
def test__update_reduce_instances_stopped_tasks(stateless_job, in_place):
    """
    Grow the job from 3 to 5 pods, stop the instance range from=3 / to=5,
    then run an update with the reduce-instances spec and expect 3 pods.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_at_start = stateless_job.query_pods()
    assert len(pods_at_start) == 3

    # first increase instances
    grow = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC,
    )
    grow.create(in_place=in_place)
    grow.wait_for_state(goal_state="SUCCEEDED")
    pods_after_grow = stateless_job.query_pods()
    assert len(pods_after_grow) == 5

    # now stop last 2 tasks; "from" is a Python keyword, so the field has to
    # be set through setattr
    added_range = task_pb2.InstanceRange(to=5)
    setattr(added_range, "from", 3)
    stateless_job.stop(ranges=[added_range])

    # now reduce instance count
    shrink = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
    )
    shrink.create(in_place=in_place)
    shrink.wait_for_state(goal_state="SUCCEEDED")
    pods_after_shrink = stateless_job.query_pods()
    assert len(pods_after_shrink) == 3
Beispiel #9
0
def test__auto_rollback_update_reduce_instances_with_bad_config(
        stateless_job, in_place):
    """
    Start an update that uses the bad job spec and asks for one instance
    fewer, with rollback on failure enabled.

    The update must end in ROLLED_BACK, the pod count must equal the
    instance count of the job's own spec, and the pod spec of instance 0
    must be equal before and after.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    bad_spec_dict = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    bad_spec = JobSpec()
    json_format.ParseDict(bad_spec_dict, bad_spec)
    bad_spec.instance_count = stateless_job.job_spec.instance_count - 1

    rollback_update = StatelessUpdate(
        stateless_job,
        updated_job_spec=bad_spec,
        max_instance_attempts=1,
        max_failure_instances=1,
        roll_back_on_failure=True,
    )
    rollback_update.create(in_place=in_place)
    rollback_update.wait_for_state(goal_state="ROLLED_BACK")
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    # no instance should be removed
    pod_count = len(stateless_job.query_pods())
    assert pod_count == stateless_job.job_spec.instance_count
    assert_pod_spec_equal(spec_before, spec_after)
Beispiel #10
0
def test_manual_rollback_increase_instances(stateless_job, in_place):
    """
    Apply the add-instances spec, then create an update with the
    reduce-instances spec and, without waiting for it, create another update
    with the add-instances spec again.

    Once the last update has succeeded, the number of pods and the pod spec
    of instance 0 must match what was observed after the first update.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    grow = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC,
    )
    grow.create(in_place=in_place)
    grow.wait_for_state(goal_state="SUCCEEDED")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    # reduce instance count and then roll it back
    shrink = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
    )
    shrink.create(in_place=in_place)

    restore = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC,
    )
    restore.create(in_place=in_place)
    restore.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    assert len(pods_before) == len(pods_after)
    spec_after = stateless_job.get_pod(0).get_pod_spec()
    assert_pod_spec_equal(spec_before, spec_after)
Beispiel #11
0
def test__create_update_add_instances_with_bad_config(stateless_job, in_place):
    """
    Start an update that uses the bad job spec and asks for 3 additional
    instances, without rollback.

    The update is expected to end in FAILED (SUCCEEDED is treated as the
    failure state of the wait), and exactly one pod more than the job's
    instance count must exist afterwards.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    bad_spec_dict = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    bad_spec = JobSpec()
    json_format.ParseDict(bad_spec_dict, bad_spec)
    bad_spec.instance_count = stateless_job.job_spec.instance_count + 3

    failing_update = StatelessUpdate(
        stateless_job,
        updated_job_spec=bad_spec,
        batch_size=1,
        max_instance_attempts=1,
        max_failure_instances=1,
    )
    failing_update.create(in_place=in_place)
    failing_update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    # only one instance should be added
    pod_count = len(stateless_job.query_pods())
    assert pod_count == stateless_job.job_spec.instance_count + 1
Beispiel #12
0
def test_allocation_update_job__add_and_remove_instances(stateless_job, in_place):
    """
    Verify the allocation while the instance count is changed by updates.

    1. Create a job with 30 instances and verify the allocation.
    2. Update the job to 50 instances and verify the allocation.
    3. Update the job to 10 instances and verify the allocation.
    4. Stop the job and expect the cpu allocation of its pool to be 0.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    verify_allocation(stateless_job)

    # first grow to 50 instances, then shrink to 10
    for target_count in (50, 10):
        stateless_job.job_spec.instance_count = target_count
        resize = StatelessUpdate(
            stateless_job,
            updated_job_spec=stateless_job.job_spec,
        )
        resize.create(in_place=in_place)
        resize.wait_for_state(goal_state="SUCCEEDED")
        verify_allocation(stateless_job)

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")

    remaining_cpu = stateless_job.pool.get_allocation("cpu")
    assert remaining_cpu == 0
Beispiel #13
0
def test__create_update_before_job_fully_created(stateless_job, in_place):
    """
    Create an update right after creating the job, without waiting for the
    job to reach any state first, and expect the update to succeed.

    Afterwards the command of the first container in the job's default spec
    must be the "echo updated" loop.
    """
    expected_command = "while :; do echo updated; sleep 10; done"

    stateless_job.create()
    early_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
    )
    early_update.create(in_place=in_place)
    early_update.wait_for_state(goal_state="SUCCEEDED")

    first_container = stateless_job.get_spec().default_spec.containers[0]
    assert first_container.command.value == expected_command
Beispiel #14
0
def test__update_with_host_maintenance_and_agent_down(stateless_job,
                                                      maintenance):
    """
    1. Create a large stateless job (that take up more than two-thirds of
       the cluster resources) with MaximumUnavailableInstances=2.
    2. Start host maintenance on one of the hosts (say A) having pods of the job.
       MaximumUnavailableInstances=2 ensures that not more than 2 pods are
       unavailable due to host maintenance at a time.
    3. Take down another host which has pods running on it. This will cause
       TASK_LOST to be sent for all pods on the host after 75 seconds.
    4. Start an update to modify the instance spec of one of the pods.
    5. Since TASK_LOST would cause the job SLA to be violated, instances on the
       host A should not be killed once LOST event is received. Verify that
       host A does not transition to DOWN.
    """
    stateless_job.job_spec.instance_count = 30
    stateless_job.job_spec.default_spec.containers[0].resource.cpu_limit = 0.3
    stateless_job.job_spec.sla.maximum_unavailable_instances = 2
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    # order the hosts by the number of tasks of this job, busiest first
    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    # pick another host which has pods of the job to take down
    host_container = get_container([sorted_hosts[1]])

    try:
        host_container.stop()
        maintenance["start"]([test_host])

        stateless_job.job_spec.instance_spec[10].containers.extend([
            pod_pb2.ContainerSpec(resource=pod_pb2.ResourceSpec(
                disk_limit_mb=20))
        ])
        update = StatelessUpdate(stateless_job,
                                 updated_job_spec=stateless_job.job_spec,
                                 batch_size=0)
        update.create()
        update.wait_for_state(goal_state="SUCCEEDED")

        stateless_job.stop()

        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except Exception:
        # Any failure raised above, including the deliberate `assert False`,
        # ends up here; the test then passes only if the host is still
        # DRAINING. `Exception` is caught instead of using a bare `except`
        # so that KeyboardInterrupt/SystemExit are not swallowed.
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
    finally:
        # always bring the stopped host container back
        host_container.start()
Beispiel #15
0
def test__create_update_with_failed_health_check(stateless_job):
    """
    Start an update from the bad-health-check spec without rollback and
    expect it to end in FAILED (SUCCEEDED is treated as the failure state
    of the wait).
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')

    failing_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_BAD_HEALTH_CHECK_SPEC,
        max_instance_attempts=1,
        max_failure_instances=1,
    )
    failing_update.create()
    failing_update.wait_for_state(goal_state='FAILED', failed_state='SUCCEEDED')
Beispiel #16
0
def test__create_update_update_job_config(stateless_job):
    """
    Apply the job-config-update spec and expect the update to succeed
    with the pod ids queried before and after the update being equal.
    """
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    pods_before = stateless_job.query_pods()

    config_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_JOB_CONFIG_UPDATE_SPEC,
    )
    config_update.create()
    config_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    assert_pod_id_equal(pods_before, pods_after)
Beispiel #17
0
def test__create_update_with_failed_health_check(stateless_job, in_place):
    """
    Start an update from the bad-health-check spec without rollback, using
    the given in_place mode, and expect it to end in FAILED (SUCCEEDED is
    treated as the failure state of the wait).
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    failing_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_BAD_HEALTH_CHECK_SPEC,
        max_instance_attempts=1,
        max_failure_instances=1,
    )
    failing_update.create(in_place=in_place)
    failing_update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")
Beispiel #18
0
def test__abort_update(stateless_job, in_place):
    """
    Start a batched update, wait until it is ROLLING_FORWARD, abort it and
    expect it to reach ABORTED.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    aborted_update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
    )
    aborted_update.create(in_place=in_place)
    aborted_update.wait_for_state(goal_state="ROLLING_FORWARD")

    aborted_update.abort()
    aborted_update.wait_for_state(goal_state="ABORTED")
Beispiel #19
0
def test__create_update_to_unset_health_check():
    """
    Create a job from the with-health-check spec, then update it to the
    plain update spec and expect the update to succeed.
    """
    test_config = IntegrationTestConfig(max_retry_attempts=100)
    health_checked_job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=test_config,
    )
    health_checked_job.create()
    health_checked_job.wait_for_state(goal_state='RUNNING')

    unset_update = StatelessUpdate(
        health_checked_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        max_instance_attempts=1,
        max_failure_instances=1,
    )
    unset_update.create()
    unset_update.wait_for_state(goal_state='SUCCEEDED')
Beispiel #20
0
def test__create_update_add_instances(stateless_job, in_place):
    """
    Apply the add-instances spec and expect the pod count to go from 3
    before the update to 5 after it has succeeded.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()

    grow = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC,
    )
    grow.create(in_place=in_place)
    grow.wait_for_state(goal_state="SUCCEEDED")
    pods_after = stateless_job.query_pods()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
Beispiel #21
0
def test__create_update(stateless_job, in_place):
    """
    Apply the update spec and expect the update to succeed, with both the
    pod ids and the pod spec of instance 0 changed afterwards.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    spec_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
    )
    spec_update.create(in_place=in_place)
    spec_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
Beispiel #22
0
def patch_job(job, job_spec):
    """
    Patch one job to the state described by ``job_spec``.

    A StatelessUpdate carrying ``job_spec`` is created and waited on with
    the default arguments of ``wait_for_state``; afterwards the function
    waits until all pods of the job are running.
    """
    log.info("patch job_name: %s, job_id: %s", job_spec.name, job.get_job_id())

    update_config = IntegrationTestConfig(
        pool_file=RESPOOL_FILE_NAME,
        max_retry_attempts=MAX_RETRY_ATTEMPTS,
    )
    patch = StatelessUpdate(
        job,
        updated_job_spec=job_spec,
        config=update_config,
    )
    patch.create()
    patch.wait_for_state()
    job.wait_for_all_pods_running()
Beispiel #23
0
def test__create_update_to_disable_health_check():
    """
    Create a job from the with-health-check spec, then update it with
    the liveness check of its first default container disabled and expect
    the update to succeed.
    """
    test_config = IntegrationTestConfig(max_retry_attempts=100)
    health_checked_job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=test_config,
    )
    health_checked_job.create()
    health_checked_job.wait_for_state(goal_state='RUNNING')

    # switch the liveness check off on the job's own spec and push that spec
    first_container = health_checked_job.job_spec.default_spec.containers[0]
    first_container.liveness_check.enabled = False
    disable_update = StatelessUpdate(
        health_checked_job,
        updated_job_spec=health_checked_job.job_spec,
        max_instance_attempts=1,
        max_failure_instances=1,
    )
    disable_update.create()
    disable_update.wait_for_state(goal_state='SUCCEEDED')
Beispiel #24
0
def test__update_with_host_maintenance__bad_config(stateless_job, maintenance):
    """
    1. Create a stateless job with 6 instances. Wait for all instances to reach
       RUNNING state. This means that there is at least one host with 2 or more
       instances on it
    2. Start a bad job update with max failure tolerance of 1 and auto-rollback
       disabled.
    3. Start host maintenance on one of the hosts (say host A).
    4. Wait for the update to fail. There should be 2 instances unavailable.
    5. Since 2 instances are already unavailable and
       maximum_unavailable_instances=1, host maintenance should not proceed.
       Verify that the host A doesn't transition to DOWN.
    """
    stateless_job.job_spec.sla.maximum_unavailable_instances = 1
    stateless_job.job_spec.instance_count = 6
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()

    # order the hosts by the number of tasks of this job, busiest first
    hosts = [h.hostname for h in query_hosts([]).host_infos]
    host_to_task_count = get_host_to_task_count(hosts, stateless_job)
    sorted_hosts = [
        t[0] for t in sorted(host_to_task_count.items(),
                             key=operator.itemgetter(1),
                             reverse=True)
    ]

    job_spec_dump = load_test_config(UPDATE_STATELESS_JOB_BAD_SPEC)
    updated_job_spec = JobSpec()
    json_format.ParseDict(job_spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 6
    updated_job_spec.sla.maximum_unavailable_instances = 1
    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
        batch_size=2,
    )
    update.create()

    # Pick a host that has pods running on it to start maintenance on it.
    test_host = sorted_hosts[0]
    maintenance["start"]([test_host])

    update.wait_for_state(goal_state="FAILED", failed_state="SUCCEEDED")

    try:
        wait_for_host_state(test_host, host_pb2.HOST_STATE_DOWN)
        assert False, 'Host should not transition to DOWN'
    except Exception:
        # Reached when the wait fails or when the deliberate `assert False`
        # fires; the test then passes only if the host is still DRAINING.
        # `Exception` is caught instead of using a bare `except` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        assert is_host_in_state(test_host, host_pb2.HOST_STATE_DRAINING)
Beispiel #25
0
def test__create_update_add_instances_with_batch_size(stateless_job):
    """
    Apply the add-instances spec with a batch size of 1 and expect the
    pod count to go from 3 before the update to 5 after it has succeeded.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')
    pods_before = stateless_job.query_pods()

    batched_grow = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC,
    )
    batched_grow.create()
    batched_grow.wait_for_state(goal_state='SUCCEEDED')
    pods_after = stateless_job.query_pods()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
Beispiel #26
0
def test__in_place_update_success_rate_with_component_restart(
        stateless_job, jobmgr, resmgr, placement_engines):
    """
    Run an in-place update of a 30-instance job while jobmgr, resmgr and
    the placement engines are restarted, and expect every pod to report the
    same host after the update as before it.

    Pods are matched by their pod id with everything from the last '-'
    onwards stripped off.
    """
    def host_by_pod_name(pod_infos):
        # map "<pod id up to the last '-'>" -> host reported in the status
        hosts = {}
        for info in pod_infos:
            pod_id = info.status.pod_id.value
            hosts[pod_id[:pod_id.rfind('-')]] = info.status.host
        return hosts

    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    pods_before = stateless_job.query_pods()

    new_spec_dict = load_test_config(UPDATE_STATELESS_JOB_SPEC)
    new_spec = JobSpec()
    json_format.ParseDict(new_spec_dict, new_spec)
    new_spec.instance_count = 30

    in_place_update = StatelessUpdate(
        stateless_job,
        updated_job_spec=new_spec,
        batch_size=0,
    )
    in_place_update.create(in_place=True)

    # restart all components except hostmgr
    jobmgr.restart()
    time.sleep(random.randint(1, 10))
    resmgr.restart()
    time.sleep(random.randint(1, 10))
    placement_engines.restart()

    in_place_update.wait_for_state(goal_state='SUCCEEDED')
    pods_after = stateless_job.query_pods()

    hosts_before = host_by_pod_name(pods_before)
    hosts_after = host_by_pod_name(pods_after)

    mismatches = 0
    for pod_name, previous_host in hosts_before.items():
        current_host = hosts_after[pod_name]
        if current_host != previous_host:
            log.info("%s, prev:%s, cur:%s", pod_name, previous_host,
                     current_host)
            mismatches += 1
    log.info("total mismatch: %d", mismatches)
    assert mismatches == 0
Beispiel #27
0
def test__auto_rollback_update_with_failed_health_check(stateless_job):
    """
    Start an update from the bad-health-check spec with rollback on failure
    enabled; it must end in ROLLED_BACK and leave the pod spec of instance 0
    equal to what it was before.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    rollback_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_BAD_HEALTH_CHECK_SPEC,
        max_instance_attempts=1,
        max_failure_instances=1,
        roll_back_on_failure=True,
    )
    rollback_update.create()
    rollback_update.wait_for_state(goal_state='ROLLED_BACK')

    spec_after = stateless_job.get_pod(0).get_pod_spec()
    assert_pod_spec_equal(spec_before, spec_after)
Beispiel #28
0
def test_update_killed_job(in_place):
    """
    Create a job from the add-instances spec, stop it, and then apply the
    reduce-instances spec to the killed job.

    The update must succeed, the job spec must report 3 instances and the
    job must still be in the KILLED state.
    """
    killed_job = StatelessJob(job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    killed_job.create()
    killed_job.wait_for_state(goal_state="RUNNING")

    killed_job.stop()
    killed_job.wait_for_state(goal_state="KILLED")

    shrink = StatelessUpdate(
        killed_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
    )
    shrink.create(in_place=in_place)
    shrink.wait_for_state(goal_state="SUCCEEDED")

    instance_count = killed_job.get_spec().instance_count
    assert instance_count == 3
    job_state = killed_job.get_status().state
    assert job_state == stateless_pb2.JOB_STATE_KILLED
Beispiel #29
0
def test__create_update_with_batch_size(stateless_job):
    """
    Apply the update spec with a batch size of 1 and expect the update to
    succeed, with both the pod ids and the pod spec of instance 0 changed.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state='RUNNING')
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    batched_update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
    )
    batched_update.create()
    batched_update.wait_for_state(goal_state='SUCCEEDED')

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
Beispiel #30
0
def test_stop_running_job_with_active_update_add_instances(stateless_job, in_place):
    """
    Stop a running job while an update that adds instances is
    ROLLING_FORWARD.

    The update is still expected to reach SUCCEEDED, and the job spec must
    report 5 instances afterwards.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    initial_pod_count = len(stateless_job.query_pods())
    assert initial_pod_count == 3

    active_update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
    )
    active_update.create(in_place=in_place)
    active_update.wait_for_state(goal_state="ROLLING_FORWARD")

    # stop the job while the update is still in progress
    stateless_job.stop()
    active_update.wait_for_state(goal_state="SUCCEEDED")

    final_instance_count = stateless_job.get_spec().instance_count
    assert final_instance_count == 5