Example #1
0
def test__job_create_manual_rollback(client):
    """
    Start a job update from test_dc_labrat_large_job.yaml, give it a few
    seconds to make progress, then roll it back manually.

    Both before and after the rollback, the update details must contain at
    least one update event and at least one instance event.
    """
    def assert_events_recorded(job):
        # Only the first entry of the returned details list is inspected.
        query = api.JobUpdateQuery(jobKey=job)
        details = client.get_job_update_details(None, query).detailsList[0]
        assert len(details.updateEvents) > 0
        assert len(details.instanceEvents) > 0

    request = get_job_update_request("test_dc_labrat_large_job.yaml")
    started = client.start_job_update(
        request, "start job update test/dc/labrat_large_job")
    update_key = started.key
    job_id = update_key.job

    # fixed pause so that a few instances get a chance to start running
    time.sleep(5)
    assert_events_recorded(job_id)

    # manually roll back the in-flight update and wait for it to finish
    client.rollback_job_update(update_key)
    wait_for_rolled_back(client, update_key)
    assert_events_recorded(job_id)
Example #2
0
def test__simple_auto_rolled_back(client):
    """
    Create a job from test_dc_labrat.yaml, then roll out
    test_dc_labrat_bad_config.yaml on top of it and wait for that update to
    be rolled back. Afterwards three tasks must be RUNNING and every
    resource entry must carry the expected cpu / ram / disk value.
    """
    # (attribute, expected value) pairs, probed in this order for each resource
    expected_resources = (
        ('numCpus', 0.25),
        ('ramMb', 128),
        ('diskMb', 128),
    )

    start_job_update(
        client, 'test_dc_labrat.yaml', 'start job update test/dc/labrat')

    # Add some wait time for lucene index to build
    time.sleep(10)

    bad_request = get_job_update_request('test_dc_labrat_bad_config.yaml')
    bad_update = client.start_job_update(bad_request, 'rollout bad config')
    wait_for_rolled_back(client, bad_update.key)

    # validate job is rolled back to previous config
    running_query = api.TaskQuery(
        jobKeys={bad_update.key.job},
        statuses={api.ScheduleStatus.RUNNING})
    tasks = client.get_tasks_without_configs(running_query).tasks
    assert len(tasks) == 3

    for task in tasks:
        for resource in task.assignedTask.task.resources:
            for field, expected in expected_resources:
                # the first field holding a positive value decides the check
                if getattr(resource, field) > 0:
                    assert getattr(resource, field) == expected
                    break
            else:
                assert False, 'unexpected resource {}'.format(resource)
0
def test__job_create_fail_manual_rollback(client):
    """
    Start a job update from test_dc_labrat_large_job_bad_config.yaml, wait
    until instance 0 is reported as failed, then trigger a manual rollback.

    Update and instance events must be present before and after the
    rollback, and no task of the job may be RUNNING once it completes.
    """
    def assert_events_recorded(job):
        # Only the first entry of the returned details list is inspected.
        query = api.JobUpdateQuery(jobKey=job)
        details = client.get_job_update_details(None, query).detailsList[0]
        assert len(details.updateEvents) > 0
        assert len(details.instanceEvents) > 0

    request = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    started = client.start_job_update(
        request, "start job update test/dc/labrat_large_job (failed)")
    update_key = started.key
    job_id = update_key.job

    # give the first instance time to start, then wait for it to fail
    time.sleep(5)
    wait_for_failed(client, job_id, instances=[0])
    assert_events_recorded(job_id)

    # manually roll back the update and wait for the rollback to finish
    client.rollback_job_update(update_key)
    wait_for_rolled_back(client, update_key)
    assert_events_recorded(job_id)

    # validate no tasks are running
    running_query = api.TaskQuery(
        jobKeys={job_id},
        statuses={api.ScheduleStatus.RUNNING})
    running = client.get_tasks_without_configs(running_query).tasks
    assert len(running) == 0
Example #4
0
def test__simple_manual_rollback(client):
    """
    Create a job through a first update, start a second update with
    different labels on it and, a few seconds in, roll that second update
    back manually.

    After each stage the update/instance events must be sorted, the first
    and last update statuses must match the stage, and the task config must
    carry the labels expected for that stage.
    """
    original_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2"
    }
    mid_update_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }

    def check_stage(job, first_status, last_status, labels):
        # Fetch the update details once and run all per-stage verifications
        # against the first entry of the details list.
        query = api.JobUpdateQuery(jobKey=job)
        details = client.get_job_update_details(None, query).detailsList[0]
        verify_events_sorted(details.updateEvents)
        verify_events_sorted(details.instanceEvents)
        verify_first_and_last_job_update_status(
            details.updateEvents, first_status, last_status)
        verify_task_config(client, job, labels)

    job_key = start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )
    check_stage(
        job_key,
        api.JobUpdateStatus.ROLLING_FORWARD,
        api.JobUpdateStatus.ROLLED_FORWARD,
        original_labels,
    )

    # Do update on previously created job
    second_request = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    second_update = client.start_job_update(
        second_request,
        "start job update test/dc/labrat_large_job_diff_labels",
    )
    job_update_key = second_update.key
    job_key = job_update_key.job

    # wait for few instances running
    time.sleep(5)
    check_stage(
        job_key,
        api.JobUpdateStatus.ROLLING_FORWARD,
        api.JobUpdateStatus.ROLLING_FORWARD,
        mid_update_labels,
    )

    # rollback update
    client.rollback_job_update(job_update_key)
    wait_for_rolled_back(client, job_update_key)

    # events stay sorted ascending, the last update event is ROLLED_BACK and
    # the task config is back to the labels of the first update
    check_stage(
        job_key,
        api.JobUpdateStatus.ROLLING_FORWARD,
        api.JobUpdateStatus.ROLLED_BACK,
        original_labels,
    )
Example #5
0
def test__auto_rollback_with_pinned_instances__remove_instances(client):
    """
    1. Create a job.
    2. Start a bad update pinned to a subset of instances that also lowers
       the instance count.
    3. The pinned instances should roll back to their previous version.
       No instances should be removed.
    """
    expected_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }

    def running_tasks(job):
        query = api.TaskQuery(jobKeys={job},
                              statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    def check_labels(task):
        metadata = task.assignedTask.task.metadata
        assert len(metadata) == 2
        for entry in metadata:
            assert entry.key in expected_labels, \
                "unexpected metadata %s" % entry
            assert entry.value == expected_labels[entry.key]

    create_req = get_job_update_request("test_dc_labrat_large_job.yaml")
    created = client.start_job_update(
        create_req, "start job update test/dc/labrat_large_job")
    wait_for_rolled_forward(client, created.key)
    job_key = created.key.job

    tasks = running_tasks(job_key)
    assert len(tasks) == 10
    for task in tasks:
        # the task id ends in "-<run id>"; a fresh job is on its first run
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        check_labels(task)

    # start a update with updateOnlyTheseInstances parameter,
    # and reduce instance count
    update_instances = {2, 3, 4, 5}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances
    }
    pinned_req.instanceCount = 8
    pinned_req.settings.maxFailedInstances = 3
    pinned_req.settings.updateGroupSize = 1

    bad_update = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_back(client, bad_update.key)

    tasks = running_tasks(job_key)
    assert len(tasks) == 10
    for task in tasks:
        check_labels(task)

        # pinned instances are expected on run 3 after the rollback,
        # all other instances must still be on their first run
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        if task.assignedTask.instanceId in update_instances:
            assert run_id == "3"
        else:
            assert run_id == "1"
Example #6
0
def test__auto_rollback_with_pinned_instances__stopped_instances(client):
    """
    1. Create a job (v1).
    2. Start update on the first subset of instances (v2).
    3. Start update on second subset of instances (v3).
    4. Stop some instances.
    5. Start a bad update on a subset consisting of at least
       one instance in each of v1, v2, v3 and stopped
    6. The instances should rollback to their respective previous good versions.
       The stopped instances in the bad update should transit to running.
    """
    # labels each version is expected to carry in its task metadata
    v1_labels = {"test_key_1": "test_value_1", "test_key_2": "test_value_2"}
    v2_labels = {"test_key_11": "test_value_11", "test_key_22": "test_value_22"}
    v3_labels = {"test_key_12": "test_value_12"}

    def running_tasks(job):
        # all tasks of the job currently in RUNNING state
        query = api.TaskQuery(jobKeys={job},
                              statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    def assert_labels(task, expected):
        # the metadata must consist of exactly the expected key/value pairs
        metadata = task.assignedTask.task.metadata
        assert len(metadata) == len(expected)
        for m in metadata:
            assert m.key in expected, "unexpected metadata %s" % m
            assert m.value == expected[m.key]

    def pinned_request(config_file, instances):
        # update request restricted to the given instance ids
        req = get_job_update_request(config_file)
        req.settings.updateOnlyTheseInstances = {
            api.Range(first=i, last=i) for i in instances
        }
        return req

    # range() instead of xrange() so the test runs on Python 2 and 3 alike
    all_instances = set(range(10))

    # Create a job
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    tasks = running_tasks(job_key)
    assert len(tasks) == 10
    for t in tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert_labels(t, v1_labels)

    # start a update on first subset of instances
    update_instances_1 = {4, 5, 6, 7}
    res = client.start_job_update(
        pinned_request("test_dc_labrat_large_job_diff_labels.yaml",
                       update_instances_1),
        "start job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, res.key)

    # Start another update on the second subset of instances
    update_instances_2 = {8, 9}
    res = client.start_job_update(
        pinned_request("test_dc_labrat_large_job_new_config.yaml",
                       update_instances_2),
        "start another job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, res.key)

    # Stop some instances
    stop_instances = {5, 8}
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing instance 5, 8 for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)

    # Start a bad update
    bad_update_instances = {0, 5, 6, 9}
    bad_req = pinned_request("test_dc_labrat_large_job_bad_config.yaml",
                             bad_update_instances)
    bad_req.settings.maxFailedInstances = 1
    bad_req.settings.maxPerInstanceFailures = 1
    bad_req.settings.updateGroupSize = 2

    res = client.start_job_update(
        bad_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_back(client, res.key)

    # Instances that were never stopped keep running; stopped instances
    # touched by the bad update are expected to be running again.
    restarted = bad_update_instances & stop_instances
    still_stopped = stop_instances - bad_update_instances
    tasks = running_tasks(job_key)
    assert len(tasks) == len(all_instances - stop_instances) + len(restarted)

    for t in tasks:
        instance_id = t.assignedTask.instanceId
        if instance_id in update_instances_1:
            assert_labels(t, v2_labels)
        elif instance_id in update_instances_2:
            assert_labels(t, v3_labels)
        else:
            assert_labels(t, v1_labels)

        # stopped instances outside the bad update must stay stopped
        assert instance_id not in still_stopped, \
            "unexpected start of stopped instance"
Example #7
0
def test__auto_rollback_with_pinned_instances(client):
    """
    1. Create a job.
    2. Start a bad update (version 2) targeting subset of instances.
    3. Wait for the instances to be auto-rolled back.
       Only the instances specified above should be affected.
    """
    expected_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }

    def running_tasks(job):
        # all tasks of the job currently in RUNNING state
        query = api.TaskQuery(jobKeys={job},
                              statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    def assert_labels(task, message):
        # the metadata must consist of exactly the two expected labels;
        # `message` is the %-format used when an unknown key is found
        metadata = task.assignedTask.task.metadata
        assert len(metadata) == 2
        for m in metadata:
            assert m.key in expected_labels, message % m
            assert m.value == expected_labels[m.key]

    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    tasks = running_tasks(job_key)
    assert len(tasks) == 10
    for t in tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert_labels(t, "unexpected metadata %s")

    # start a bad update with updateOnlyTheseInstances parameter
    update_instances = {0, 2, 3, 7, 9}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances
    }
    pinned_req.settings.updateGroupSize = 5
    pinned_req.settings.maxFailedInstances = 3

    res = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_back(client, res.key)

    tasks = running_tasks(job_key)
    # Without this count check the loop below would pass vacuously if the
    # rollback left fewer (or no) instances running.
    assert len(tasks) == 10

    # verify that the run-id of only the pinned instances has changed
    for t in tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        if t.assignedTask.instanceId in update_instances:
            assert run_id == "3"
        else:
            assert run_id == "1"

        assert_labels(t, "unexpected metadata %s for unaffected instances")