Exemple #1
0
def test__start_job_update_revocable_job(client):
    """
    Run a non-revocable and a revocable job side by side.

    Scenario (given 12 non-revocable cpus and 12 revocable cpus):
    - a non-revocable job of 3 instances, with 3 CPU per instance
    - a revocable job of 1 instance, with 4 CPU per instance
    Both jobs are expected to have all of their tasks RUNNING.
    """
    regular_key = start_job_update(
        client,
        "test_dc_labrat_cpus_large.yaml",
        "start job update test/dc/labrat_large",
    )
    revocable_key = start_job_update(
        client,
        "test_dc_labrat_revocable.yaml",
        "start job update test/dc/labrat_revocable",
    )

    # Give the lucene index some time to build before querying
    time.sleep(10)

    # (job key, expected number of RUNNING tasks); the revocable job is
    # checked first, then the non-revocable one
    expectations = (
        (revocable_key, 1),
        (regular_key, 3),
    )
    for key, expected_running in expectations:
        query = api.TaskQuery(jobKeys={key},
                              statuses={api.ScheduleStatus.RUNNING})
        running = client.get_tasks_without_configs(query).tasks
        assert len(running) == expected_running
Exemple #2
0
def test__host_limit_1(client, hostmgr):
    """
    Check that the host limit 1 constraint holds across updates.

    1. Create a job and verify host limit 1 on its tasks.
    2. Update the job and verify host limit 1 on the RUNNING tasks.
    3. Start another update, restart host manager while it is in flight,
       wait for the update to roll forward and verify host limit 1 on the
       RUNNING tasks once more.
    """

    def running_tasks(key):
        # RUNNING tasks of the given job, fetched fresh on every call
        query = api.TaskQuery(jobKeys={key},
                              statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    # Step 1: create the job
    created_key = start_job_update(client, "test_dc_labrat.yaml",
                                   "start job update test/dc/labrat")

    # Give the lucene index some time to build before querying
    time.sleep(10)

    # The first check looks at all tasks, regardless of status
    all_tasks = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={created_key})).tasks
    assert len(all_tasks) == 3
    verify_host_limit_1(all_tasks)

    # Step 2: update with host limit 1 constraint
    job_key = start_job_update(client, "test_dc_labrat_1.yaml",
                               "start job update test/dc/labrat_1")

    tasks = running_tasks(job_key)
    assert len(tasks) == 3
    verify_host_limit_1(tasks)

    # Step 3: start an update and restart host manager before it completes
    request = get_job_update_request("test_dc_labrat.yaml")
    in_flight = client.start_job_update(
        request,
        "start job update test/dc/labrat",
    )
    hostmgr.restart()
    wait_for_rolled_forward(client, in_flight.key)

    tasks = running_tasks(job_key)
    assert len(tasks) == 3
    verify_host_limit_1(tasks)
Exemple #3
0
def get_task_status(client, job_key, instances=None):
    """Query the current task status for a job.

    Tasks are grouped by instance id and, for each instance, only the
    status of the task with the highest run id is reported. The run id is
    the last "-"-separated field of the task id.

    Args:
        client: aurora client object
        job_key: aurora JobKey struct specifying the job to query for
        instances: a collection of instance ids to report on; all
            instances are reported when None

    Returns:
        a list of ScheduleStatus enum values, one per selected instance

    Raises:
        AssertionError: if the response carries no task list
        ValueError: if a task id cannot be split into three "-"-separated
            parts or its run id is not numeric
    """
    res = client.get_tasks_without_configs(api.TaskQuery(jobKeys=[job_key]))

    assert res.tasks is not None

    runs_per_instance = {}
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        # Compare run ids numerically: as strings "10" sorts before "9",
        # which would report a stale run as the latest one.
        runs_per_instance.setdefault(t.assignedTask.instanceId, []).append(
            (int(run_id), t.status))

    # grab task status from latest pod run; items() (instead of the
    # Python 2-only iteritems()) works on both Python 2 and Python 3
    return [
        max(runs)[1] for iid, runs in runs_per_instance.items()
        if instances is None or iid in instances
    ]
Exemple #4
0
def test__simple_update_tasks_reconcile(client, hostmgr, mesos_master):
    """
    Restart host manager and mesos master three times each while an
    update is in progress, to make sure mesos tasks are reconciled
    correctly: the update must still roll forward with 10 RUNNING tasks.
    """
    update = client.start_job_update(
        get_job_update_request('test_dc_labrat_large_job.yaml'),
        'start job update test/dc/labrat_large_job')

    # let the jobmgr goal state engine kick in first
    time.sleep(random.randint(1, 10))

    # three rounds of: restart host manager, short random pause,
    # restart mesos master
    for _ in range(3):
        hostmgr.restart()
        time.sleep(random.randint(1, 5))
        mesos_master.restart()

    wait_for_rolled_forward(client, update.key)

    query = api.TaskQuery(jobKeys={update.key.job},
                          statuses={api.ScheduleStatus.RUNNING})
    running = client.get_tasks_without_configs(query).tasks
    assert len(running) == 10
Exemple #5
0
def test__simple_update_with_restart_component(client, jobmgr, resmgr, hostmgr,
                                               mesos_master):
    """
    Restart jobmgr, resmgr, hostmgr and mesos master, in that order,
    while an update is running; the update must still roll forward with
    10 RUNNING tasks.
    """
    update = client.start_job_update(
        get_job_update_request('test_dc_labrat_large_job.yaml'),
        'start job update test/dc/labrat_large_job')

    # Every restart is preceded by a random pause of 1 to 10 seconds
    restart_order = (
        jobmgr,        # pause lets the jobmgr goal state engine kick in
        resmgr,        # pause lets gangs enqueue; restart clears admission and queues
        hostmgr,       # pause lets host locks be acquired; restart clears `placing` lock
        mesos_master,  # restart jumbles up host manager state
    )
    for component in restart_order:
        time.sleep(random.randint(1, 10))
        component.restart()

    wait_for_rolled_forward(client, update.key)

    query = api.TaskQuery(jobKeys={update.key.job},
                          statuses={api.ScheduleStatus.RUNNING})
    running = client.get_tasks_without_configs(query).tasks
    assert len(running) == 10
Exemple #6
0
def test__simple_auto_rolled_back(client):
    """
    Create a job, roll out a bad config and validate that the job is
    rolled back to the previous version: 3 RUNNING tasks whose resources
    are 0.25 cpus, 128 MB ram and 128 MB disk.
    """
    start_job_update(client, 'test_dc_labrat.yaml',
                     'start job update test/dc/labrat')

    # Give the lucene index some time to build
    time.sleep(10)

    bad_request = get_job_update_request('test_dc_labrat_bad_config.yaml')
    bad_update = client.start_job_update(bad_request, 'rollout bad config')
    wait_for_rolled_back(client, bad_update.key)

    # validate job is rolled back to previous config
    query = api.TaskQuery(jobKeys={bad_update.key.job},
                          statuses={api.ScheduleStatus.RUNNING})
    tasks = client.get_tasks_without_configs(query).tasks
    assert len(tasks) == 3

    for task in tasks:
        for resource in task.assignedTask.task.resources:
            # The first positive field decides which value is checked
            if resource.numCpus > 0:
                assert resource.numCpus == 0.25
                continue
            if resource.ramMb > 0:
                assert resource.ramMb == 128
                continue
            if resource.diskMb > 0:
                assert resource.diskMb == 128
                continue
            assert False, 'unexpected resource {}'.format(resource)
Exemple #7
0
def test__abort_auto_rollback_and_update(client):
    """
    Abort an auto-rollback and converge the job with a fresh update.

    1. Create a job.
    2. Start a bad update and wait for auto-rollback to kick in.
    3. Abort the update while it is rolling back.
    4. Start a new good update; all 10 instances must be RUNNING with the
       metadata entry of the new config and must carry an ancestor id.
    """
    start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )

    # Give the lucene index some time to build
    time.sleep(10)

    bad_request = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    bad_update = client.start_job_update(bad_request, "rollout bad config")

    # wait for auto-rollback to kick-in, then abort it
    wait_for_auto_rolling_back(client, bad_update.key)
    client.abort_job_update(bad_update.key, "abort update")
    wait_for_update_status(
        client,
        bad_update.key,
        {api.JobUpdateStatus.ROLLING_BACK},
        api.JobUpdateStatus.ABORTED,
    )

    new_config = get_job_update_request(
        "test_dc_labrat_large_job_new_config.yaml")
    good_update = client.start_job_update(new_config, "rollout good config")
    # Sleep for a while so that update gets triggered.
    time.sleep(5)
    wait_for_rolled_forward(client, good_update.key)

    query = api.TaskQuery(jobKeys={good_update.key.job},
                          statuses={api.ScheduleStatus.RUNNING})
    running = client.get_tasks_without_configs(query).tasks
    assert len(running) == 10

    for task in running:
        task_metadata = task.assignedTask.task.metadata
        assert len(task_metadata) == 1

        # Single metadata entry of the task vs. first entry of the config
        actual = list(task_metadata)[0]
        expected = list(new_config.taskConfig.metadata)[0]
        assert actual.key == expected.key
        assert actual.value == expected.value

        assert task.ancestorId
Exemple #8
0
def test__deploy_on_aborted_update(client):
    """
    Abort an update half-way, then deploy another update on top of it.

    After the abort fewer than 10 tasks are expected to exist. The second
    update must bring the job to 10 tasks, none of which carries an
    ancestor id, which validates that instances created by the aborted
    update were not restarted/redeployed.
    """
    first_request = get_job_update_request("test_dc_labrat_large_job.yaml")
    first_update = client.start_job_update(
        first_request,
        "start job update test/dc/labrat_large_job",
    )

    # Let a few instances start before aborting
    time.sleep(5)

    client.abort_job_update(first_update.key, "abort update")
    wait_for_update_status(
        client,
        first_update.key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ABORTED,
    )

    # Not all instances were created
    partial_tasks = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={first_update.key.job})).tasks
    assert len(partial_tasks) < 10

    # deploy the follow-up update, should impact remaining instances
    second_request = get_job_update_request(
        "test_dc_labrat_large_job_diff_executor.yaml")
    second_update = client.start_job_update(
        second_request,
        "start job update test/dc/labrat_large_job_diff_executor",
    )
    wait_for_rolled_forward(client, second_update.key)

    # All instances are created
    final_tasks = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={second_update.key.job})).tasks
    assert len(final_tasks) == 10

    # No instance may have an ancestor id
    for created in final_tasks:
        assert created.ancestorId is None
Exemple #9
0
def verify_task_config(client, job_key, metadata_dict):
    """Assert that RUNNING tasks of a job carry only the expected metadata.

    Args:
        client: aurora client object
        job_key: aurora job key of the job to inspect
        metadata_dict: mapping of expected metadata key to expected value

    Raises:
        AssertionError: if a metadata key is not in metadata_dict or its
            value differs from the expected one
    """
    query = api.TaskQuery(jobKeys={job_key},
                          statuses={api.ScheduleStatus.RUNNING})
    response = client.get_tasks_without_configs(query)

    for task in response.tasks:
        for entry in task.assignedTask.task.metadata:
            assert entry.key in metadata_dict, \
                "unexpected metadata {}".format(entry)
            assert entry.value == metadata_dict[entry.key]
Exemple #10
0
def get_running_tasks(client, job_key):
    """Fetch the RUNNING tasks of a job via getTasksWithoutConfigs.

    Args:
        client: aurora client object
        job_key: aurora job key

    Returns:
        the `tasks` field of the query response
    """
    running_query = api.TaskQuery(
        jobKeys={job_key},
        statuses={api.ScheduleStatus.RUNNING},
    )
    response = client.get_tasks_without_configs(running_query)
    return response.tasks
    def test__simple_update_tasks_reconcile(self, failure_tester):
        """
        Restart the host manager leader and mesos master three times each
        while an update is in progress, to make sure mesos tasks are
        reconciled correctly: the update must still roll forward with
        10 RUNNING tasks.
        """
        update_request = get_job_update_request(
            'test_dc_labrat_large_job.yaml')
        update = failure_tester.aurorabridge_client.start_job_update(
            update_request,
            'start job update test/dc/labrat_large_job')

        # let the jobmgr goal state engine kick in first
        time.sleep(random.randint(1, 10))

        for _ in range(3):
            # restart the current host manager leader and wait until
            # leadership has moved on
            old_leader = failure_tester.fw.get_leader_info(
                failure_tester.hostmgr)
            assert old_leader
            assert failure_tester.fw.restart(
                failure_tester.hostmgr, "leader") != 0
            failure_tester.wait_for_leader_change(
                failure_tester.hostmgr, old_leader)
            failure_tester.reset_client()

            time.sleep(random.randint(1, 5))

            assert failure_tester.fw.restart(
                failure_tester.mesos_master) != 0

        # Sleep to help the cluster to stabilize
        time.sleep(10)

        # The client attribute is read again here, after the resets above
        wait_for_rolled_forward(failure_tester.aurorabridge_client,
                                update.key)
        query = api.TaskQuery(jobKeys={update.key.job},
                              statuses={api.ScheduleStatus.RUNNING})
        running = failure_tester.aurorabridge_client \
            .get_tasks_without_configs(query).tasks
        assert len(running) == 10
    def test__simple_update_with_restart_component(self, failure_tester):
        """
        Start an update, then restart the leaders of jobmgr, resmgr and
        hostmgr followed by mesos master; the update must still roll
        forward with 10 RUNNING tasks.
        """
        update_request = get_job_update_request(
            'test_dc_labrat_large_job.yaml')
        update = failure_tester.aurorabridge_client.start_job_update(
            update_request,
            'start job update test/dc/labrat_large_job')

        # Each leader restart is preceded by a random 1-10 second pause:
        #   jobmgr  - pause lets the goal state engine kick in
        #   resmgr  - pause lets gangs enqueue; restart clears admission
        #             and queues
        #   hostmgr - pause lets host locks be acquired; restart clears
        #             the host `placing` lock
        for component_name in ("jobmgr", "resmgr", "hostmgr"):
            time.sleep(random.randint(1, 10))

            component = getattr(failure_tester, component_name)
            old_leader = failure_tester.fw.get_leader_info(component)
            assert old_leader
            assert failure_tester.fw.restart(component, "leader") != 0
            failure_tester.wait_for_leader_change(component, old_leader)
            failure_tester.reset_client()

        time.sleep(random.randint(1, 10))

        # restart mesos master to jumble up host manager state
        assert failure_tester.fw.restart(failure_tester.mesos_master) != 0

        # Sleep to help the cluster to stabilize
        time.sleep(10)

        # The client attribute is read again here, after the resets above
        wait_for_rolled_forward(failure_tester.aurorabridge_client,
                                update.key,
                                timeout_secs=200)
        query = api.TaskQuery(jobKeys={update.key.job},
                              statuses={api.ScheduleStatus.RUNNING})
        running = failure_tester.aurorabridge_client \
            .get_tasks_without_configs(query).tasks
        assert len(running) == 10
Exemple #13
0
def get_task_status(client, job_key):
    '''Collect the status of every task returned for a job.

    Args:
        client: aurora client object
        job_key: aurora JobKey struct specifying the job to query for

    Returns:
        a list with the status of each task, in response order

    Raises:
        AssertionError: if the response carries no task list
    '''
    query = api.TaskQuery(jobKeys=[job_key])
    tasks = client.get_tasks_without_configs(query).tasks

    assert tasks is not None

    statuses = []
    for task in tasks:
        statuses.append(task.status)
    return statuses
Exemple #14
0
def test__simple_update_with_diff(client):
    """
    Simple update use case where the second update changes the config.

    - The first update and the label-changing second update must both
      produce update events and instance events; after the second one all
      RUNNING tasks carry an ancestor id.
    - Re-applying the label-changing update must produce update events
      but no instance events.
    """

    def first_details(update_key):
        # First entry of the details list for the given update
        response = client.get_job_update_details(
            None, api.JobUpdateQuery(key=update_key))
        return response.detailsList[0]

    # Initial update
    initial = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, initial.key)

    details = first_details(initial.key)
    assert len(details.updateEvents) > 0
    assert len(details.instanceEvents) > 0

    # Do update with labels changed
    relabel = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    job_key = relabel.key.job
    wait_for_rolled_forward(client, relabel.key)

    details = first_details(relabel.key)
    assert len(details.updateEvents) > 0
    assert len(details.instanceEvents) > 0

    query = api.TaskQuery(jobKeys={job_key},
                          statuses={api.ScheduleStatus.RUNNING})
    for task in client.get_tasks_without_configs(query).tasks:
        assert task.ancestorId is not None

    # Same labels again
    repeat = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, repeat.key)

    details = first_details(repeat.key)
    assert len(details.updateEvents) > 0
    assert len(details.instanceEvents) == 0
Exemple #15
0
def get_task_status(client, job_key, instances=None):
    '''Collect task statuses for a job, optionally limited to instances.

    Args:
        client: aurora client object
        job_key: aurora JobKey struct specifying the job to query for
        instances: a collection of instance ids to report on; when it is
            None or empty, every task is reported

    Returns:
        a list with the status of each selected task, in response order

    Raises:
        AssertionError: if the response carries no task list
    '''
    query = api.TaskQuery(jobKeys=[job_key])
    tasks = client.get_tasks_without_configs(query).tasks

    assert tasks is not None

    statuses = []
    for task in tasks:
        # Skip tasks outside the requested instances (if any were given)
        if instances and task.assignedTask.instanceId not in instances:
            continue
        statuses.append(task.status)
    return statuses
Exemple #16
0
def test__simple_update_events_purge(client, jobmgr, resmgr):
    """
    Restart job manager and resource manager three times each while an
    update is in progress, to make sure mesos task status update events
    are purged correctly: the update must still roll forward with
    10 RUNNING tasks.
    """
    update = client.start_job_update(
        get_job_update_request('test_dc_labrat_large_job.yaml'),
        'start job update test/dc/labrat_large_job')

    # let the jobmgr goal state engine kick in first
    time.sleep(random.randint(1, 10))

    for round_no in range(3):
        # random pause between rounds, but not before the first one
        if round_no > 0:
            time.sleep(random.randint(1, 5))
        jobmgr.restart()
        time.sleep(random.randint(1, 5))
        resmgr.restart()

    # Sleep to ensure lucene index converges
    time.sleep(10)

    wait_for_rolled_forward(client, update.key)

    query = api.TaskQuery(jobKeys={update.key.job},
                          statuses={api.ScheduleStatus.RUNNING})
    running = client.get_tasks_without_configs(query).tasks
    assert len(running) == 10
Exemple #17
0
def test__job_create_fail_manual_rollback(client):
    """
    Start a failing job update and, once instance 0 has failed, trigger
    a manual rollback. Update and instance events must exist both before
    and after the rollback, and no task may be RUNNING at the end.
    """
    started = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_bad_config.yaml"),
        "start job update test/dc/labrat_large_job (failed)",
    )
    job_update_key = started.key
    job_key = started.key.job

    def assert_events_recorded():
        # The first details entry of the job must have both event kinds
        response = client.get_job_update_details(
            None, api.JobUpdateQuery(jobKey=job_key))
        details = response.detailsList[0]
        assert len(details.updateEvents) > 0
        assert len(details.instanceEvents) > 0

    # wait for the first instance starting
    time.sleep(5)
    wait_for_failed(client, job_key, instances=[0])
    assert_events_recorded()

    # rollback update
    client.rollback_job_update(job_update_key)
    wait_for_rolled_back(client, job_update_key)
    assert_events_recorded()

    # validate no tasks are running
    query = api.TaskQuery(jobKeys={job_key},
                          statuses={api.ScheduleStatus.RUNNING})
    running = client.get_tasks_without_configs(query).tasks
    assert len(running) == 0
Exemple #18
0
def test__abort_auto_rollback_with_pinned_instances_and_update(client):
    """
    1. Create a job.
    2. Start a bad update (version 2) targeting subset of instances.
    3. Wait for the auto-rollback to kick-in.
    4. Once auto-rollback kicks in, abort the update.
    5. Start a new good update and wait for all instances to converge to
       that update.
    """

    def running_tasks(key):
        # RUNNING tasks of the given job, fetched fresh on every call
        query = api.TaskQuery(jobKeys={key},
                              statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    # Create a job
    created = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, created.key)
    job_key = created.key.job

    initial_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    tasks = running_tasks(job_key)
    assert len(tasks) == 10
    for task in tasks:
        run_id = task.assignedTask.taskId.rsplit("-", 2)[-1]
        assert run_id == "1"
        assert len(task.assignedTask.task.metadata) == 2
        for entry in task.assignedTask.task.metadata:
            assert entry.key in initial_metadata, \
                "unexpected metadata %s" % entry
            assert entry.value == initial_metadata[entry.key]

    # start a bad update with updateOnlyTheseInstances parameter
    update_instances = [0, 2, 3, 7, 9]
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = set(
        api.Range(first=iid, last=iid) for iid in update_instances)
    pinned_req.settings.maxFailedInstances = 4

    bad_update = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    # wait for auto-rollback to kick-in
    wait_for_auto_rolling_back(client, bad_update.key, timeout_secs=150)

    # abort the update
    client.abort_job_update(bad_update.key, "abort update")
    wait_for_update_status(
        client,
        bad_update.key,
        {api.JobUpdateStatus.ROLLING_BACK},
        api.JobUpdateStatus.ABORTED,
    )

    # start a new good update
    good_update = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_new_config.yaml"),
        "start job update test/dc/labrat_large_job with a good config",
    )
    wait_for_rolled_forward(client, good_update.key)
    job_key = good_update.key.job

    tasks = running_tasks(job_key)
    assert len(tasks) == 10
    for task in tasks:
        run_id = task.assignedTask.taskId.rsplit("-", 2)[-1]
        assert len(task.assignedTask.task.metadata) == 1
        for entry in task.assignedTask.task.metadata:
            assert entry.key == "test_key_12", \
                "unexpected metadata %s" % entry
            assert entry.value == "test_value_12"

        if task.assignedTask.instanceId in update_instances:
            # only a few of the pinned instances might have rolled back
            assert run_id in ("3", "4")
        else:
            assert run_id == "2"
Exemple #19
0
def test__auto_rollback_with_pinned_instances__remove_instances(client):
    """
    1. Create a job.
    2. Start a bad update on a subset of instances which also lowers the
       instance count to 8.
    3. The instances should rollback to their previous version.
       No instances should be removed.
    """

    def running_tasks():
        # RUNNING tasks of the job under test, fetched fresh on every call
        query = api.TaskQuery(jobKeys={job_key},
                              statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    expected_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }

    def check_metadata(task):
        # Exactly two metadata entries, both matching the original config
        assert len(task.assignedTask.task.metadata) == 2
        for entry in task.assignedTask.task.metadata:
            assert entry.key in expected_metadata, \
                "unexpected metadata %s" % entry
            assert entry.value == expected_metadata[entry.key]

    req = get_job_update_request("test_dc_labrat_large_job.yaml")
    created = client.start_job_update(
        req, "start job update test/dc/labrat_large_job")
    wait_for_rolled_forward(client, created.key)
    job_key = created.key.job

    tasks = running_tasks()
    assert len(tasks) == 10
    for task in tasks:
        run_id = task.assignedTask.taskId.rsplit("-", 2)[-1]
        assert run_id == "1"
        check_metadata(task)

    # start a update with updateOnlyTheseInstances parameter,
    # and reduce instance count
    update_instances = set([2, 3, 4, 5])
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = set(
        api.Range(first=iid, last=iid) for iid in update_instances)
    pinned_req.instanceCount = 8
    pinned_req.settings.maxFailedInstances = 3
    pinned_req.settings.updateGroupSize = 1

    bad_update = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_back(client, bad_update.key)

    tasks = running_tasks()
    assert len(tasks) == 10
    for task in tasks:
        check_metadata(task)

        # Only the pinned instances went through additional runs
        run_id = task.assignedTask.taskId.rsplit("-", 2)[-1]
        if task.assignedTask.instanceId in update_instances:
            assert run_id == "3"
        else:
            assert run_id == "1"
Exemple #20
0
def test__auto_rollback_with_pinned_instances__stopped_instances(client):
    """
    1. Create a job (v1).
    2. Start update on the first subset of instances (v2).
    3. Start update on second subset of instances (v3).
    4. Stop some instances.
    5. Start a bad update on a subset consisting of at least
       one instance in each of v1, v2, v3 and stopped
    6. The instances should rollback to their respective previous good
       versions. The stopped instances in the bad update should transit
       to running.
    """
    # range() rather than the Python 2-only xrange() so the test also
    # runs on Python 3
    all_instances = set(range(10))

    # Expected metadata per config version
    v1_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    v2_metadata = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }
    v3_metadata = {"test_key_12": "test_value_12"}

    def running_tasks():
        # RUNNING tasks of the job under test, fetched fresh on every call
        query = api.TaskQuery(jobKeys={job_key},
                              statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    def check_metadata(task, expected):
        # The task must carry exactly the expected key/value entries
        assert len(task.assignedTask.task.metadata) == len(expected)
        for entry in task.assignedTask.task.metadata:
            assert entry.key in expected, "unexpected metadata %s" % entry
            assert entry.value == expected[entry.key]

    def pinned_request(config_file, instance_ids):
        # Update request restricted to the given instance ids
        request = get_job_update_request(config_file)
        request.settings.updateOnlyTheseInstances = set(
            api.Range(first=iid, last=iid) for iid in instance_ids)
        return request

    # Create a job
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    tasks = running_tasks()
    assert len(tasks) == 10
    for t in tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        check_metadata(t, v1_metadata)

    # start a update on first subset of instances
    update_instances_1 = set([4, 5, 6, 7])
    res = client.start_job_update(
        pinned_request("test_dc_labrat_large_job_diff_labels.yaml",
                       update_instances_1),
        "start job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, res.key)

    # Start another update on the second subset of instances
    update_instances_2 = set([8, 9])
    res = client.start_job_update(
        pinned_request("test_dc_labrat_large_job_new_config.yaml",
                       update_instances_2),
        "start another job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, res.key)

    # Stop some instances
    stop_instances = set([5, 8])
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing instance 5, 8 for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)

    # Start a bad update
    bad_update_instances = set([0, 5, 6, 9])
    pinned_req = pinned_request("test_dc_labrat_large_job_bad_config.yaml",
                                bad_update_instances)
    pinned_req.settings.maxFailedInstances = 1
    pinned_req.settings.maxPerInstanceFailures = 1
    pinned_req.settings.updateGroupSize = 2

    res = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_back(client, res.key)

    # Expected RUNNING: every instance that was never stopped, plus the
    # stopped instances that were part of the bad update
    tasks = running_tasks()
    assert len(tasks) == (
        len(all_instances - stop_instances) +
        len(bad_update_instances.intersection(stop_instances)))

    # Stopped instances the bad update did not touch must stay stopped
    untouched_stopped = stop_instances - bad_update_instances

    for t in tasks:
        instance_id = t.assignedTask.instanceId
        if instance_id in update_instances_1:
            check_metadata(t, v2_metadata)
        elif instance_id in update_instances_2:
            check_metadata(t, v3_metadata)
        else:
            check_metadata(t, v1_metadata)

        assert instance_id not in untouched_stopped, \
            "unexpected start of stopped instance"
Exemple #21
0
def test__auto_rollback_with_pinned_instances(client):
    """
    Auto-rollback of a failing update that is pinned to some instances.

    1. Create a job and wait for its first update to roll forward.
    2. Start a bad update restricted (via updateOnlyTheseInstances) to
       instances 0, 2, 3, 7 and 9.
    3. Wait for that update to be rolled back, then check that only the
       pinned instances changed run id and that every running task still
       carries the original metadata.
    """
    original_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }

    def running_tasks(key):
        # Fetch only the RUNNING tasks that belong to the given job.
        query = api.TaskQuery(jobKeys={key},
                              statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    initial_req = get_job_update_request("test_dc_labrat_large_job.yaml")
    created = client.start_job_update(
        initial_req,
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, created.key)
    job_key = created.key.job

    # Baseline: ten running tasks, all on their first run.
    tasks = running_tasks(job_key)
    assert len(tasks) == 10
    for task in tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        labels = task.assignedTask.task.metadata
        assert len(labels) == 2
        for label in labels:
            assert label.key in original_labels, (
                "unexpected metadata %s" % label)
            assert label.value == original_labels[label.key]

    # Bad update limited to the pinned instances.
    update_instances = [0, 2, 3, 7, 9]
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances
    }
    pinned_req.settings.updateGroupSize = 5
    pinned_req.settings.maxFailedInstances = 3

    bad_update = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_back(client, bad_update.key)

    # Pinned instances are expected at run id 3 (presumably run 2 was the
    # bad version and run 3 the rollback); all others stay at run id 1.
    for task in running_tasks(job_key):
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        pinned = task.assignedTask.instanceId in update_instances
        assert run_id == ("3" if pinned else "1")

        labels = task.assignedTask.task.metadata
        assert len(labels) == 2
        for label in labels:
            assert label.key in original_labels, (
                "unexpected metadata %s for unaffected instances" % label)
            assert label.value == original_labels[label.key]
Exemple #22
0
def test__get_tasks_without_configs_task_queries(client):
    """
    Exercise getTasksWithoutConfigs with several TaskQuery filters.

    Five jobs are created, one of them is killed, and each query is then
    checked for the number of returned tasks and the set of job keys
    those tasks belong to.
    """
    # Nothing has been created yet, so an unfiltered query returns no tasks.
    assert len(client.get_tasks_without_configs(api.TaskQuery()).tasks) == 0

    # Create jobs.
    test_dc_labrat_key = start_job_update(
        client, "test_dc_labrat_read.yaml", "start job update test/dc/labrat")
    test_dc_labrat_0_key = start_job_update(
        client, "test_dc_labrat0.yaml", "start job update test/dc/labrat0")
    test_dc_0_labrat_1_key = start_job_update(
        client, "test_dc0_labrat1.yaml", "start job update test/dc0/labrat1")
    test_dc_labrat_1_key = start_job_update(
        client, "test_dc_labrat1.yaml", "start job update test/dc/labrat1")
    test2_dc2_labrat2_key = start_job_update(
        client, "test2_dc2_labrat2.yaml", "start job update test2/dc2/labrat2")

    # Add some wait time for lucene index to build
    time.sleep(10)

    # Kill one of the jobs.
    client.kill_tasks(test_dc_labrat_1_key, None,
                      "killing all tasks test/dc/labrat1")
    wait_for_killed(client, test_dc_labrat_1_key)

    role = test_dc_labrat_key.role
    environment = test_dc_labrat_key.environment

    # Each case: (assertion message, query, job keys expected in the result).
    cases = [
        (
            "query job keys",
            api.TaskQuery(jobKeys={
                test_dc_labrat_key,
                test_dc_labrat_0_key,
                test2_dc2_labrat2_key,
            }),
            [test_dc_labrat_key, test_dc_labrat_0_key, test2_dc2_labrat2_key],
        ),
        (
            "query role + env + name",
            api.TaskQuery(
                role=role,
                environment=environment,
                jobName=test_dc_labrat_key.name,
            ),
            [test_dc_labrat_key],
        ),
        (
            "query role + env",
            api.TaskQuery(role=role, environment=environment),
            [test_dc_labrat_key, test_dc_labrat_0_key, test_dc_labrat_1_key],
        ),
        (
            "query role",
            api.TaskQuery(role=role),
            [
                test_dc_labrat_key,
                test_dc_labrat_0_key,
                test_dc_labrat_1_key,
                test_dc_0_labrat_1_key,
            ],
        ),
        (
            "query role + statuses",
            api.TaskQuery(
                role=role,
                statuses={api.ScheduleStatus.RUNNING},
            ),
            [test_dc_labrat_key, test_dc_labrat_0_key, test_dc_0_labrat_1_key],
        ),
    ]

    for message, query, expected_job_keys in cases:
        found = client.get_tasks_without_configs(query).tasks
        # Expect 2 tasks per job key.
        assert len(found) == len(expected_job_keys) * 2, message
        assert_keys_equal(
            remove_duplicate_keys(
                task.assignedTask.task.job for task in found),
            expected_job_keys,
            message=message,
        )
Exemple #23
0
def test__update_with_pinned_instances__deploy_stopped_instances(client):
    """
    test pinned instance deployment with stop / deploy instances:
    1. start a regular update (version 1) on all instances
    2. stop subset of instances
    3. start a new update (version 2) targeting subset of instances
       (stopped instances included), expect stopped instances to be
       brought up with new version and other targeted instances to
       be updated
    4. start regular update (version 1) again on all instances, expect
       only instances affected by previous step to be updated

    Args:
        client: test fixture exposing the job update, kill and task
            query calls used below.
    """
    # range() instead of xrange(): same set, and it also exists on Python 3.
    all_instances = set(range(10))

    # start a regular update
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances)
    for t in res.tasks:
        # The run id is the last dash-separated component of the task id.
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert len(t.assignedTask.task.metadata) == 2
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_1":
                assert m.value == "test_value_1"
            elif m.key == "test_key_2":
                assert m.value == "test_value_2"
            else:
                assert False, "unexpected metadata %s" % m

    # stop subset of instances
    stop_instances = set([2, 8])
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing instance 2, 8 for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)
    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances - stop_instances)
    for t in res.tasks:
        assert t.assignedTask.instanceId in (all_instances - stop_instances)

    # start a update with updateOnlyTheseInstances parameter
    # expect stopped instances to be started
    update_instances = set([2, 3, 5, 8])
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    pinned_req.settings.updateOnlyTheseInstances = set(
        [api.Range(first=i, last=i) for i in update_instances])

    res = client.start_job_update(
        pinned_req,
        "start second job update test/dc/labrat_large_job with pinned instances and label diff",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    # Only the pinned instances may show up in the update's instance events.
    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in update_instances

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances)
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert len(t.assignedTask.task.metadata) == 2
        if t.assignedTask.instanceId in update_instances:
            # Pinned instances: second run, new labels.
            assert run_id == "2"
            for m in t.assignedTask.task.metadata:
                if m.key == "test_key_11":
                    assert m.value == "test_value_11"
                elif m.key == "test_key_22":
                    assert m.value == "test_value_22"
                else:
                    assert False, (
                        "unexpected metadata %s for affected instances" % m)
        elif t.assignedTask.instanceId in (all_instances - update_instances):
            # Untouched instances: still first run, original labels.
            assert run_id == "1"
            for m in t.assignedTask.task.metadata:
                if m.key == "test_key_1":
                    assert m.value == "test_value_1"
                elif m.key == "test_key_2":
                    assert m.value == "test_value_2"
                else:
                    assert False, (
                        "unexpected metadata %s for unaffected instances" % m)
        else:
            assert False, ("unexpected instance id %s" %
                           t.assignedTask.instanceId)

    # start the regular update again same as the first one
    # expect changes only for instances updated by previous update
    # NOTE(review): this uses the *_diff_executor.yaml config rather than the
    # first yaml; presumably it is equivalent to version 1 - confirm.
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_executor.yaml"),
        "start third job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in update_instances

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances)
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert len(t.assignedTask.task.metadata) == 2
        if t.assignedTask.instanceId in update_instances:
            assert run_id == "3"
        elif t.assignedTask.instanceId in (all_instances - update_instances):
            assert run_id == "1"
        else:
            assert False, ("unexpected instance id %s" %
                           t.assignedTask.instanceId)

        # Every instance is expected to carry the original labels again.
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_1":
                assert m.value == "test_value_1"
            elif m.key == "test_key_2":
                assert m.value == "test_value_2"
            else:
                assert False, (
                    "unexpected metadata %s for affected instances" % m)
Exemple #24
0
def test__update_with_pinned_instances__start_stopped_instances_all(client):
    """
    test pinned instance deployment with stop / start all instances:
    1. start a regular update (version 1) on all instances
    2. stop all instances
    3. start the same update (version 1) on all instances (stopped
       instances included), expect all instances to be updated and
       start running
    4. start regular update (version 1) again on all instances, expect
       no change on all instances

    Args:
        client: test fixture exposing the job update, kill and task
            query calls used below.
    """
    # range() instead of xrange(): same set, and it also exists on Python 3.
    all_instances = set(range(10))

    # start a regular update
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances)
    for t in res.tasks:
        # The run id is the last dash-separated component of the task id.
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert len(t.assignedTask.task.metadata) == 2
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_1":
                assert m.value == "test_value_1"
            elif m.key == "test_key_2":
                assert m.value == "test_value_2"
            else:
                assert False, "unexpected metadata %s" % m

    # stop all instances
    stop_instances = set(range(10))
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing all instances for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)
    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == 0

    # start a update without updateOnlyTheseInstances parameter
    # expect all instances to be started
    update_instances = set(range(10))

    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_executor.yaml"),
        "start second job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in (update_instances & stop_instances)

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances)
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "2"

        assert len(t.assignedTask.task.metadata) == 2
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_1":
                assert m.value == "test_value_1"
            elif m.key == "test_key_2":
                assert m.value == "test_value_2"
            else:
                assert False, (
                    "unexpected metadata %s for affected instances" % m)

    # start the regular update again same as the first one
    # expect no change for all instances
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start third job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    # A no-op update is expected to report no instance events at all.
    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert res.detailsList[0].instanceEvents is None

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances)
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        if t.assignedTask.instanceId in stop_instances:
            assert run_id == "2"
        elif t.assignedTask.instanceId in (all_instances - stop_instances):
            assert run_id == "1"
        else:
            assert False, ("unexpected instance id %s" %
                           t.assignedTask.instanceId)

        assert len(t.assignedTask.task.metadata) == 2
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_1":
                assert m.value == "test_value_1"
            elif m.key == "test_key_2":
                assert m.value == "test_value_2"
            else:
                assert False, (
                    "unexpected metadata %s for affected instances" % m)
Exemple #25
0
def test__update_with_pinned_instances__stopped_instances(client):
    """
    test pinned instance deployment with stopped instances:
    1. start a regular update (version 1) on all instances
    2. stop subset of instances
    3. start another update (version 2) targeting subset of instances
       (stopped instances not included), expect only targeted instances
       to be updated and stopped instances remain stopped

    Args:
        client: test fixture exposing the job update, kill and task
            query calls used below.
    """
    # range() instead of xrange(): same set, and it also exists on Python 3.
    all_instances = set(range(10))

    # start a regular update
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances)
    for t in res.tasks:
        # The run id is the last dash-separated component of the task id.
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert len(t.assignedTask.task.metadata) == 2
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_1":
                assert m.value == "test_value_1"
            elif m.key == "test_key_2":
                assert m.value == "test_value_2"
            else:
                assert False, "unexpected metadata %s" % m

    # stop subset of instances
    stop_instances = set([1, 6])
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing instance 1, 6 for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)
    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances - stop_instances)
    for t in res.tasks:
        assert t.assignedTask.instanceId in (all_instances - stop_instances)

    # start a update with updateOnlyTheseInstances parameter
    update_instances = set([0, 2, 3, 7, 9])
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    pinned_req.settings.updateOnlyTheseInstances = set(
        [api.Range(first=i, last=i) for i in update_instances])

    res = client.start_job_update(
        pinned_req,
        "start job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    # Only the pinned instances may show up in the update's instance events.
    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in update_instances

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == len(all_instances - stop_instances)

    # expect instance 0, 2, 3, 7, 9 to be updated to newer version, with run id 2
    # expect instance 1, 6 remain at stopped
    # expect instance 4, 5, 8 remain at original version, with run id 1
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        if t.assignedTask.instanceId in update_instances:
            assert run_id == "2"
            assert len(t.assignedTask.task.metadata) == 2
            for m in t.assignedTask.task.metadata:
                if m.key == "test_key_11":
                    assert m.value == "test_value_11"
                elif m.key == "test_key_22":
                    assert m.value == "test_value_22"
                else:
                    assert False, (
                        "unexpected metadata %s for affected instances" % m)
        elif t.assignedTask.instanceId in (all_instances - stop_instances):
            assert run_id == "1"
            assert len(t.assignedTask.task.metadata) == 2
            for m in t.assignedTask.task.metadata:
                if m.key == "test_key_1":
                    assert m.value == "test_value_1"
                elif m.key == "test_key_2":
                    assert m.value == "test_value_2"
                else:
                    assert False, (
                        "unexpected metadata %s for unaffected instances" % m)
        else:
            # A stopped instance showed up in the RUNNING query.
            assert False, ("unexpected instance id %s: should be stopped" %
                           t.assignedTask.instanceId)
Exemple #26
0
def test__update_with_pinned_instances__add_remove_instance(client):
    """
    Pinned instance deployment combined with adding / removing instances.

    1. Deploy version 1 with 8 instances.
    2. Deploy version 2 pinned to instances 0, 2, 3, 8 and 9; instances 8
       and 9 do not exist yet, so they are added. Only pinned instances
       may be touched.
    3. Deploy again with 8 instances; the instances changed in step 2 are
       expected to be updated and the two extra instances removed.
    """
    original_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    updated_labels = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }

    def running_tasks(key):
        # Fetch only the RUNNING tasks that belong to the given job.
        query = api.TaskQuery(jobKeys={key},
                              statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    def check_labels(task, expected, failure_template):
        # Every metadata entry must be one of the expected key/value pairs.
        labels = task.assignedTask.task.metadata
        assert len(labels) == 2
        for label in labels:
            assert label.key in expected, failure_template % label
            assert label.value == expected[label.key]

    def check_instance_events(update_key, allowed_instances):
        # The update must have touched at least one instance, and only
        # instances from the allowed set.
        details = client.get_job_update_details(
            None, api.JobUpdateQuery(key=update_key))
        assert len(details.detailsList) == 1
        assert len(details.detailsList[0].instanceEvents) > 0
        for event in details.detailsList[0].instanceEvents:
            assert event.instanceId in allowed_instances

    all_instances = set(range(8))

    # start a regular update
    req = get_job_update_request("test_dc_labrat_large_job.yaml")
    req.instanceCount = 8
    started = client.start_job_update(
        req, "start job update test/dc/labrat_large_job")
    wait_for_rolled_forward(client, started.key)
    job_key = started.key.job

    tasks = running_tasks(job_key)
    assert len(tasks) == 8
    for task in tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        check_labels(task, original_labels, "unexpected metadata %s")

    # start a update with updateOnlyTheseInstances parameter,
    # and add instances
    update_instances = set([0, 2, 3, 8, 9])
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances
    }

    started = client.start_job_update(
        pinned_req,
        "start job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, started.key)
    job_key = started.key.job

    check_instance_events(started.key, update_instances)

    tasks = running_tasks(job_key)
    assert len(tasks) == 10
    for task in tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        instance_id = task.assignedTask.instanceId
        if instance_id not in update_instances:
            # Not pinned: still the first run with the original labels.
            assert run_id == "1"
            check_labels(
                task, original_labels,
                "unexpected metadata %s for unaffected instances")
            continue

        # Pinned: pre-existing instances are on their second run, newly
        # added ones on their first; both carry the new labels.
        assert run_id == ("2" if instance_id in all_instances else "1")
        check_labels(
            task, updated_labels,
            "unexpected metadata %s for affected instances")

    # start a regular update again should affect instances updated in
    # previous request, and remove instances
    req = get_job_update_request("test_dc_labrat_large_job_diff_executor.yaml")
    req.instanceCount = 8
    started = client.start_job_update(
        req,
        "start job update test/dc/labrat_large_job again (with executor data order diff)",
    )
    wait_for_rolled_forward(client, started.key)
    job_key = started.key.job

    check_instance_events(started.key, update_instances)

    tasks = running_tasks(job_key)
    assert len(tasks) == 8
    for task in tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        check_labels(task, original_labels, "unexpected metadata %s")

        # Surviving instances that were pinned before are on their third
        # run; the rest never left their first run.
        touched = task.assignedTask.instanceId in (
            update_instances & all_instances)
        assert run_id == ("3" if touched else "1")
Exemple #27
0
def test__manual_rollback_abort(client):
    """
    Abort a manually triggered rollback.

    - Create a job
    - Start an update on it
    - Manually roll back the update while it is rolling forward
    - Abort the update while it is rolling back
    - Stateless job will actually have two updates, but bridge will dedupe
      last two updates (as manual rollback was done)
    - Validate the task config (metadata) of every running instance
    """
    original_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    updated_labels = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }

    # Create a job and wait for it to complete
    start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )

    # Do update on previously created job
    update_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    started = client.start_job_update(
        update_req,
        "start job update test/dc/labrat_large_job_diff_labels",
    )
    job_update_key = started.key
    job_key = job_update_key.job

    # wait for few instances running
    time.sleep(10)

    # rollback update
    client.rollback_job_update(job_update_key)

    # wait for sometime to trigger manual rollback
    time.sleep(5)

    # abort the manual rollback
    client.abort_job_update(job_update_key, "abort update")

    # 2 updates must be present for this job
    details = client.get_job_update_details(
        None, api.JobUpdateQuery(jobKey=job_key))
    assert len(details.detailsList) == 2

    events = details.detailsList[0].updateEvents
    # first event is rolling forward
    assert events[0].status == api.JobUpdateStatus.ROLLING_FORWARD
    # second last element is rolling back, after manual rollback is triggered
    assert events[-2].status == api.JobUpdateStatus.ROLLING_BACK
    # most recent event is aborted, once manual rollback is aborted
    assert events[-1].status == api.JobUpdateStatus.ABORTED

    # wait for all tasks to be running after invoking abort: poll up to
    # six times, sleeping ten seconds after every unsuccessful attempt
    for _ in range(6):
        res = client.get_tasks_without_configs(
            api.TaskQuery(jobKeys={job_key},
                          statuses={api.ScheduleStatus.RUNNING}))
        if len(res.tasks) == 10:
            break
        time.sleep(10)

    # run-id == 1: Job Create
    # run-id == 2: Job Update with diff labels
    # run-id == 3: Update rollback to previous version
    labels_by_run = {
        "1": original_labels,
        "2": updated_labels,
        "3": original_labels,
    }
    for task in res.tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        assert run_id in labels_by_run, (
            "unexpected run id %s" % task.assignedTask.taskId)
        expected = labels_by_run[run_id]
        for label in task.assignedTask.task.metadata:
            assert label.key in expected, (
                "unexpected metadata %s for affected instances" % label)
            assert label.value == expected[label.key]
Exemple #28
0
def test__get_tasks_without_configs__previous_run(client):
    """
    test getTasksWithoutConfigs endpoint for tasks from previous runs:
    1. start a regular update (version 1) on all instances
    2. start a another update (version 2) on all instances
    3. keep alternating between the two versions, then check that the
       query returns the latest run as RUNNING, older runs as KILLED,
       and no longer returns the very first run

    Args:
        client: test fixture exposing get_tasks_without_configs; it is
            also handed to the start_job_update helper.
    """
    # Group size 10 is set on both requests (the task counts asserted below
    # imply 10 instances, so presumably each update moves them in one batch).
    req1 = get_job_update_request("test_dc_labrat_large_job.yaml")
    req1.settings.updateGroupSize = 10

    req2 = get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml")
    req2.settings.updateGroupSize = 10

    # start a regular update
    job_key = start_job_update(client, req1,
                               "start job update test/dc/labrat_large_job")

    res = client.get_tasks_without_configs(api.TaskQuery(jobKeys={job_key}))
    assert len(res.tasks) == 10
    for t in res.tasks:
        # The run id is the last dash-separated component of the task id.
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert len(t.assignedTask.task.metadata) == 2
        for m in t.assignedTask.task.metadata:
            if m.key == "test_key_1":
                assert m.value == "test_value_1"
            elif m.key == "test_key_2":
                assert m.value == "test_value_2"
            else:
                assert False, "unexpected metadata %s" % m

    # start 6 new updates (assuming pod_runs_depth is 6), expect run id 1
    # to be excluded
    start_job_update(client, req2, "start job update test/dc/labrat_large_job")
    start_job_update(client, req1, "start job update test/dc/labrat_large_job")
    start_job_update(client, req2, "start job update test/dc/labrat_large_job")
    start_job_update(client, req1, "start job update test/dc/labrat_large_job")
    start_job_update(client, req2, "start job update test/dc/labrat_large_job")
    start_job_update(client, req1, "start job update test/dc/labrat_large_job")

    res = client.get_tasks_without_configs(api.TaskQuery(jobKeys={job_key}))
    assert len(res.tasks) == 10 * 6
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert len(t.assignedTask.task.metadata) == 2

        # Plain equality: `run_id in ("7")` would be a substring test
        # against the string "7" and would also accept an empty run id.
        if run_id == "7":
            # Latest run (req1 labels) is the one still running.
            assert t.status == api.ScheduleStatus.RUNNING
            for m in t.assignedTask.task.metadata:
                if m.key == "test_key_1":
                    assert m.value == "test_value_1"
                elif m.key == "test_key_2":
                    assert m.value == "test_value_2"
                else:
                    assert False, "unexpected metadata %s" % m
        elif run_id in ("6", "4", "2"):
            # Even runs were created from req2 (diff labels).
            assert t.status == api.ScheduleStatus.KILLED
            for m in t.assignedTask.task.metadata:
                if m.key == "test_key_11":
                    assert m.value == "test_value_11"
                elif m.key == "test_key_22":
                    assert m.value == "test_value_22"
                else:
                    assert False, "unexpected metadata %s" % m
        elif run_id in ("5", "3"):
            # Earlier odd runs were created from req1 (original labels).
            assert t.status == api.ScheduleStatus.KILLED
            for m in t.assignedTask.task.metadata:
                if m.key == "test_key_1":
                    assert m.value == "test_value_1"
                elif m.key == "test_key_2":
                    assert m.value == "test_value_2"
                else:
                    assert False, "unexpected metadata %s" % m
        else:
            # run_id is a string, so it must be formatted with %s; %d
            # would raise TypeError instead of reporting the bad run id.
            assert False, "unexpected run id: %s" % run_id
Exemple #29
0
def test__get_tasks_without_configs(client):
    """
    Start a job from test_dc_labrat_read.yaml and verify the fields of the
    tasks returned by get_tasks_without_configs: status, latest task event,
    assignment info, task config (job key, owner, tier, priority, metadata,
    resources, constraints) and, finally, that no host runs more than one
    of the returned tasks.
    """
    # Create job.
    job_key = start_job_update(client, "test_dc_labrat_read.yaml",
                               "start job update test/dc/labrat")

    # Add some wait time for lucene index to build
    time.sleep(10)

    res = client.get_tasks_without_configs(api.TaskQuery(jobKeys={job_key}))
    assert len(res.tasks) == 2

    # Number of returned tasks seen per slave host.
    host_counts = defaultdict(int)

    for t in res.tasks:
        assigned = t.assignedTask
        config = assigned.task

        # ScheduledTask
        assert api.ScheduleStatus.RUNNING == t.status
        assert t.ancestorId is None

        # ScheduledTask.TaskEvent (only the most recent event is checked)
        assert api.ScheduleStatus.RUNNING == t.taskEvents[-1].status
        assert "peloton" == t.taskEvents[-1].scheduler

        # ScheduledTask.AssignedTask
        assert assigned.taskId is not None
        assert assigned.slaveId is not None
        assert assigned.slaveHost is not None
        assert assigned.instanceId in (0, 1)

        # ScheduledTask.AssignedTask.TaskConfig
        assert "test" == config.job.role
        assert "dc" == config.job.environment
        assert "labrat" == config.job.name
        assert "testuser" == config.owner.user
        assert config.isService
        assert 5 == config.priority
        assert "preemptible" == config.tier
        assert 2 == len(config.metadata)
        for m in config.metadata:
            if "test_key_1" == m.key:
                assert "test_value_1" == m.value
            elif "test_key_2" == m.key:
                assert "test_value_2" == m.value
            else:
                assert False, "unexpected metadata {}".format(m)
        assert 3 == len(config.resources)
        for r in config.resources:
            # Each resource entry is identified by whichever quantity is
            # positive on it.
            if r.numCpus > 0:
                assert 0.25 == r.numCpus
            elif r.ramMb > 0:
                assert 128 == r.ramMb
            elif r.diskMb > 0:
                assert 128 == r.diskMb
            else:
                assert False, "unexpected resource {}".format(r)
        assert 1 == len(config.constraints)
        # The constraints container is not indexed directly; materialize
        # it once to inspect its only element.
        constraint = list(config.constraints)[0]
        assert "host" == constraint.name
        assert 1 == constraint.constraint.limit.limit

        host_counts[assigned.slaveHost] += 1

    # Ensure the host limit is enforced.
    # dict.items() works on both Python 2 and 3 (iteritems() is 2-only).
    for host, count in host_counts.items():
        assert count == 1, "{host} has more than 1 task".format(host=host)
Exemple #30
0
def test__update_with_pinned_instances__deploy_stopped_instances_mixed(client):
    """
    test pinned instance deployment with mixed version and instance state
    1. start a regular update (version 1) on all instances
    2. stop subset of instances
    3. start a new update (version 2) targeting subset of instances
       (some of stopped instances included), expect targeted instances
       to be either brought up with newer version or updated with new
       version
    4. start regular update (version 1) again on another set of instances
       (some of previously stopped instances included, some of instances
       updated in previous step included), expect only stopped and
       instances affected previous step to be either brought up or updated
    """
    # Metadata expected on tasks of each job config version.
    v1_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    v2_metadata = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }

    def get_running_tasks(key):
        """Return the tasks of job `key` that are in RUNNING status."""
        return client.get_tasks_without_configs(
            api.TaskQuery(jobKeys={key},
                          statuses={api.ScheduleStatus.RUNNING})).tasks

    def get_run_id(task):
        """Return the last of the three "-"-separated fields of the task id."""
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        return run_id

    def assert_metadata(task, expected, message):
        """
        Assert every metadata entry of `task` has a key present in
        `expected` with the matching value; `message` is a %-format
        string that receives the offending entry.
        """
        for m in task.assignedTask.task.metadata:
            assert m.key in expected, message % m
            assert m.value == expected[m.key]

    def pin_instances(request, instances):
        """Restrict `request` to update only the given instance ids."""
        request.settings.updateOnlyTheseInstances = {
            api.Range(first=i, last=i) for i in instances
        }

    # range() instead of xrange() so the test also runs on Python 3.
    all_instances = set(range(10))

    # start a regular update
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        assert get_run_id(t) == "1"
        assert len(t.assignedTask.task.metadata) == 2
        assert_metadata(t, v1_metadata, "unexpected metadata %s")

    # stop subset of instances
    stop_instances = {2, 8}
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing instance 2, 8 for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)
    still_running = all_instances - stop_instances
    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(still_running)
    for t in tasks:
        assert t.assignedTask.instanceId in still_running

    # start a update with updateOnlyTheseInstances parameter
    # expected only instances which targeted by updateOnlyTheseInstances
    # to be updated, within which stopped ones are started.
    update_instances = {3, 5, 8}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    pin_instances(pinned_req, update_instances)

    res = client.start_job_update(
        pinned_req,
        "start second job update test/dc/labrat_large_job with pinned instances and label diff",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in update_instances

    untouched_instances = all_instances - stop_instances - update_instances
    tasks = get_running_tasks(job_key)
    assert len(tasks) == len((all_instances - stop_instances)
                             | update_instances)
    for t in tasks:
        run_id = get_run_id(t)
        assert len(t.assignedTask.task.metadata) == 2
        if t.assignedTask.instanceId in update_instances:
            # targeted by the pinned update: restarted on the new config
            assert run_id == "2"
            assert_metadata(
                t, v2_metadata,
                "unexpected metadata %s for affected instances")
        elif t.assignedTask.instanceId in untouched_instances:
            # never stopped nor targeted: still the original run
            assert run_id == "1"
            assert_metadata(
                t, v1_metadata,
                "unexpected metadata %s for affected instances")
        else:
            assert False, ("unexpected instance id %s: should be stopped" %
                           t.assignedTask.instanceId)

    # start the regular update again same as the first one, targeting
    # subset of instances.
    # expect instance start / updated iff the instance has different config
    # or instance is stopped.
    update_2_instances = {2, 3, 8, 9}
    pinned_req_2 = get_job_update_request(
        "test_dc_labrat_large_job_diff_executor.yaml")
    pin_instances(pinned_req_2, update_2_instances)

    res = client.start_job_update(
        pinned_req_2, "start third job update test/dc/labrat_large_job")
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    # exclude instances that are previously running and still on
    # the first update
    expected_event_instances = (
        update_2_instances -
        (all_instances - update_instances - stop_instances))
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in expected_event_instances

    # Expected instances for each corresponding state:
    #
    #   v1s  - instances on original job config (v1) and stopped
    #   v1r1 - instances on original job config (v1) and running with run id 1
    #   v1r2 - instances on original job config (v1) and running with run id 2
    #   v1r3 - instances on original job config (v1) and running with run id 3
    #   v2r2 - instances on updated job config (v2) and running with run id 2
    #
    # How did we calculate the instance ids?
    #
    # Let T1, T2, T3, T4 be each of the four operations, which are
    #   T1 - start original update (v1 job config) for all instances (let it be A)
    #   T2 - stop subset of instances (let it be S)
    #   T3 - start new update (v2 job config) on subset of instances (let it be U1)
    #   T4 - start origin update again (v1 job config) on subset of instances (let it be U2)
    #
    # At T1:
    #   v1r1 = A
    #
    # At T2:
    #   v1s = S
    #   v1r1' = v1r1 - S = A - S
    #
    # At T3:
    #   v1s' = v1s - U1 = S - U1
    #   v2r1 = (empty set)
    #   v2r2 = U1
    #   v1r1'' = A - v2r2 - v1s' = A - U1 - (S - U1)
    #
    # At T4:
    #   v1s'' = v1s' - U2 = S - U1 - U2
    #   v1r2 = U2 & v1s' = U2 & (S - U1)
    #   v1r3 = U1 & U2
    #   v2r2' = v2r2 - U2 = U1 - U2
    #   v1r1''' = A - v1s'' - v1r2 - v1r3 - v2r2'
    v1s = stop_instances - update_instances - update_2_instances
    v1r2 = update_2_instances & (stop_instances - update_instances)
    v1r3 = update_instances & update_2_instances
    v2r2 = update_instances - update_2_instances
    v1r1 = all_instances - v1s - v1r2 - v1r3 - v2r2

    assert not v1s, "should not be any instances remain as stopped"
    assert v1r1, "expect instances to be in version 1 run id 1"
    assert v1r2, "expect instances to be in version 1 run id 2"
    assert v1r3, "expect instances to be in version 1 run id 3"
    assert v2r2, "expect instances to be in version 2 run id 2"

    # (instance ids, expected run id, expected metadata); the first group
    # containing a task's instance id decides what is asserted for it.
    expectations = [
        (v1r1, "1", v1_metadata),  # version 1, run 1
        (v1r2, "2", v1_metadata),  # version 1, run 2
        (v1r3, "3", v1_metadata),  # version 1, run 3
        (v2r2, "2", v2_metadata),  # version 2, run 2
    ]

    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        run_id = get_run_id(t)
        assert len(t.assignedTask.task.metadata) == 2

        for instances, expected_run_id, expected_metadata in expectations:
            if t.assignedTask.instanceId in instances:
                assert run_id == expected_run_id
                assert_metadata(
                    t, expected_metadata,
                    "unexpected metadata %s for affected instances")
                break
        else:
            # instance id matched none of the expected groups
            assert False, ("unexpected instance id %s" %
                           t.assignedTask.instanceId)