Ejemplo n.º 1
0
def test__get_jobs__get_job_summary(client):
    """
    Check that get_job_summary and get_jobs only return the jobs of the
    queried role, and that the reported counts reflect one killed instance.
    """
    # Before anything is created both endpoints must come back empty.
    assert len(client.get_job_summary("").summaries) == 0
    assert len(client.get_jobs("").configs) == 0

    # Two jobs that share one role.
    labrat_key = start_job_update(
        client,
        "test_dc_labrat_read.yaml",
        "start job update test/dc/labrat",
    )
    labrat0_key = start_job_update(
        client,
        "test_dc_labrat0.yaml",
        "start job update test/dc/labrat0",
    )

    # A third job under another role; it must not leak into the results.
    start_job_update(
        client,
        "test2_dc2_labrat2.yaml",
        "start job update test2/dc2/labrat2",
    )

    # Give the lucene index some time to build.
    time.sleep(10)

    # Kill instance 0 of test/dc/labrat0 so it has one task less.
    client.kill_tasks(
        labrat0_key,
        {0},
        "killing instance 0 for task test/dc/labrat0",
    )
    wait_for_killed(client, labrat0_key, {0})

    # get_job_summary must list exactly the two jobs of the shared role.
    summaries = client.get_job_summary(labrat_key.role).summaries
    assert len(summaries) == 2, "{jobs}".format(
        jobs=[entry.job.key for entry in summaries])

    assert_keys_equal(
        [entry.job.key for entry in summaries],
        [labrat_key, labrat0_key],
    )

    for entry in summaries:
        # Only the job that lost an instance reports a single active task.
        expected_active = 1 if entry.job.key == labrat0_key else 2
        assert entry.stats.activeTaskCount == expected_active
        assert entry.job.instanceCount == 2

    # get_jobs must list the same two jobs.
    configs = client.get_jobs(labrat_key.role).configs
    assert len(configs) == 2

    assert_keys_equal(
        [config.taskConfig.job for config in configs],
        [labrat_key, labrat0_key],
    )

    for config in configs:
        expected_instances = 1 if config.key == labrat0_key else 2
        assert config.instanceCount == expected_instances
Ejemplo n.º 2
0
def test__start_job_update_revocable_job(client):
    """
    Given 12 non-revocable cpus, and 12 revocable cpus:
    - create a non-revocable job of 3 instances, with 3 CPU per instance
    - create a revocable job of 1 instance, with 4 CPU per instance
    and check that every instance of both jobs ends up RUNNING.
    """

    def running_tasks(job_key):
        # Fetch only the RUNNING tasks that belong to the given job.
        query = api.TaskQuery(
            jobKeys={job_key},
            statuses={api.ScheduleStatus.RUNNING},
        )
        return client.get_tasks_without_configs(query).tasks

    non_revocable_job = start_job_update(
        client,
        "test_dc_labrat_cpus_large.yaml",
        "start job update test/dc/labrat_large",
    )
    revocable_job = start_job_update(
        client,
        "test_dc_labrat_revocable.yaml",
        "start job update test/dc/labrat_revocable",
    )

    # Give the lucene index some time to build.
    time.sleep(10)

    # The single revocable task is running.
    assert len(running_tasks(revocable_job)) == 1

    # All three non-revocable tasks are running.
    assert len(running_tasks(non_revocable_job)) == 3
Ejemplo n.º 3
0
def test__get_job_update_details__deleted_job(client):
    """
    Exercise JobMgr's private API QueryJobCache (used by
    getJobUpdateDetails) and make sure it does not crash when the job
    has been deleted in between two updates.
    """
    update_message = "start job update test/dc/labrat_large_job"

    # First update; it creates the job.
    first_request = get_job_update_request("test_dc_labrat_large_job.yaml")
    first_request.settings.updateGroupSize = 10
    start_job_update(client, first_request, update_message)

    # Force delete the job.
    delete_jobs()

    # Second update, issued after the deletion.
    second_request = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    second_request.settings.updateGroupSize = 10
    job_key = start_job_update(client, second_request, update_message)

    # getJobUpdateDetails must answer with exactly one entry for the role.
    query = api.JobUpdateQuery(role=job_key.role)
    details = client.get_job_update_details(None, query)
    assert len(details.detailsList) == 1
Ejemplo n.º 4
0
def test__simple_auto_rolled_back(client):
    """
    Create a job, roll out a bad config on top of it, and validate that
    the job gets rolled back to the previous version.
    """
    start_job_update(
        client,
        'test_dc_labrat.yaml',
        'start job update test/dc/labrat',
    )

    # Give the lucene index some time to build.
    time.sleep(10)

    bad_request = get_job_update_request('test_dc_labrat_bad_config.yaml')
    bad_update = client.start_job_update(bad_request, 'rollout bad config')
    wait_for_rolled_back(client, bad_update.key)

    # After the rollback the running tasks must carry the previous config.
    running_query = api.TaskQuery(
        jobKeys={bad_update.key.job},
        statuses={api.ScheduleStatus.RUNNING},
    )
    tasks = client.get_tasks_without_configs(running_query).tasks
    assert len(tasks) == 3

    for task in tasks:
        for resource in task.assignedTask.task.resources:
            # The first positive field decides which kind of resource
            # this entry describes.
            if resource.numCpus > 0:
                assert resource.numCpus == 0.25
            elif resource.ramMb > 0:
                assert resource.ramMb == 128
            elif resource.diskMb > 0:
                assert resource.diskMb == 128
            else:
                assert False, 'unexpected resource {}'.format(resource)
Ejemplo n.º 5
0
def test__abort_auto_rollback_and_update(client):
    """
    1. Create a job.
    2. Start a bad update and wait for auto-rollback to kick in.
    3. Once auto-rollback has started, abort the update.
    4. Roll out a new good update; all the instances should converge to
       the new config.
    """
    start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )

    # Give the lucene index some time to build.
    time.sleep(10)

    bad_request = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    bad_update = client.start_job_update(bad_request, "rollout bad config")

    # Wait until the bad update starts rolling back on its own.
    wait_for_auto_rolling_back(client, bad_update.key)

    # Abort while rolling back and wait for the ABORTED status.
    client.abort_job_update(bad_update.key, "abort update")
    wait_for_update_status(
        client,
        bad_update.key,
        {api.JobUpdateStatus.ROLLING_BACK},
        api.JobUpdateStatus.ABORTED,
    )

    new_config = get_job_update_request(
        "test_dc_labrat_large_job_new_config.yaml")
    good_update = client.start_job_update(new_config, "rollout good config")
    # Sleep for a while so that the update gets triggered.
    time.sleep(5)
    wait_for_rolled_forward(client, good_update.key)

    running_query = api.TaskQuery(
        jobKeys={good_update.key.job},
        statuses={api.ScheduleStatus.RUNNING},
    )
    running = client.get_tasks_without_configs(running_query)
    assert len(running.tasks) == 10

    for task in running.tasks:
        task_metadata = task.assignedTask.task.metadata
        assert len(task_metadata) == 1

        # The single metadata entry must equal the first one of the
        # new config.
        actual = list(task_metadata)[0]
        expected = list(new_config.taskConfig.metadata)[0]
        assert actual.key == expected.key
        assert actual.value == expected.value

        assert task.ancestorId
Ejemplo n.º 6
0
def test__host_limit_1(client, hostmgr):
    """
    - Create a job with a host limit 1 constraint and validate that each
      pod is running on a different host.
    - Update a job, wait for it to complete and verify the host limit 1
      constraint.
    - Update a job, restart host manager, wait for the update to complete
      and verify the host limit 1 constraint once more.
    """

    def verify_running_tasks(key):
        # Three RUNNING tasks are expected, all honoring the host limit.
        query = api.TaskQuery(
            jobKeys={key},
            statuses={api.ScheduleStatus.RUNNING},
        )
        running = client.get_tasks_without_configs(query).tasks
        assert len(running) == 3

        verify_host_limit_1(running)

    # Create job.
    job_key = start_job_update(
        client,
        "test_dc_labrat.yaml",
        "start job update test/dc/labrat",
    )

    # Give the lucene index some time to build.
    time.sleep(10)

    # This first query is not filtered by status.
    created = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key})).tasks
    assert len(created) == 3

    verify_host_limit_1(created)

    # Start an update with the host limit 1 constraint.
    job_key = start_job_update(
        client,
        "test_dc_labrat_1.yaml",
        "start job update test/dc/labrat_1",
    )
    verify_running_tasks(job_key)

    # Start another update and restart host manager before it completes.
    update_request = get_job_update_request("test_dc_labrat.yaml")
    update = client.start_job_update(
        update_request,
        "start job update test/dc/labrat",
    )

    hostmgr.restart()

    wait_for_rolled_forward(client, update.key)
    verify_running_tasks(job_key)
Ejemplo n.º 7
0
def test__start_job_update_with_get_jobs(client, aurorabridge):
    """
    Check GetJobs returns the correct results when queried by role, and
    after a new job is created. This verifies that StartJobUpdate
    correctly invalidates the role to job id cache.
    The test also restarts the aurorabridge container at the end and
    verifies the cache is populated correctly afterwards.
    """
    first_job = start_job_update(
        client,
        "test_dc_labrat0.yaml",
        "start job update test/dc/labrat0",
    )

    configs = client.get_jobs(first_job.role).configs
    assert len(configs) == 1
    assert list(configs)[0].taskConfig.job == first_job

    # Starting a new job must invalidate the job role to id cache, so the
    # next query has to include it.
    second_job = start_job_update(
        client,
        "test_dc_labrat1.yaml",
        "start job update test/dc/labrat1",
    )

    def assert_both_jobs_listed():
        # get_jobs for the role must return exactly the two jobs.
        listed = client.get_jobs(second_job.role).configs
        assert len(listed) == 2
        assert_keys_equal(
            [config.taskConfig.job for config in listed],
            [first_job, second_job],
        )

    assert_both_jobs_listed()

    # After an aurorabridge container restart the cache must still be
    # populated correctly and return the same result.
    aurorabridge.restart()
    time.sleep(10)

    assert_both_jobs_listed()
Ejemplo n.º 8
0
def test__mesos_task_label(client):
    """
    Verify aurora metadata is correctly populated at the mesos task level.
    """
    start_job_update(
        client,
        'test_dc_labrat_uns.yaml',
        'start job update test/dc/labrat_uns')

    frameworks = get_mesos_maser_state()['frameworks']
    assert len(frameworks) == 1
    assert frameworks[0]['name'] == 'Peloton'

    tasks = frameworks[0]['tasks']
    assert len(tasks) == 1

    labels = tasks[0]['labels']
    assert len(labels) > 0

    # At least one label must carry the aurora "uns" metadata key.
    assert any(
        label['key'] == 'org.apache.aurora.metadata.uns' for label in labels
    ), 'expected label not found'
Ejemplo n.º 9
0
def test__get_jobs__get_job_summary(client):
    """
    Check that get_job_summary and get_jobs only return the jobs that
    belong to the queried role.
    """
    # Before anything is created both endpoints must come back empty.
    assert len(client.get_job_summary('').summaries) == 0
    assert len(client.get_jobs('').configs) == 0

    # Two jobs that share one role.
    labrat_key = start_job_update(
        client,
        'test_dc_labrat_read.yaml',
        'start job update test/dc/labrat',
    )
    labrat0_key = start_job_update(
        client,
        'test_dc_labrat0.yaml',
        'start job update test/dc/labrat0',
    )

    # A third job under another role; it must not leak into the results.
    start_job_update(
        client,
        'test2_dc2_labrat2.yaml',
        'start job update test2/dc2/labrat2',
    )

    # get_job_summary must list exactly the two jobs of the shared role.
    summaries = client.get_job_summary(labrat_key.role).summaries
    assert len(summaries) == 2

    assert_keys_equal(
        [summary.job.key for summary in summaries],
        [labrat_key, labrat0_key],
    )

    for summary in summaries:
        assert summary.stats.activeTaskCount == 2
        assert summary.job.instanceCount == 2

    # get_jobs must list the same two jobs.
    configs = client.get_jobs(labrat_key.role).configs
    assert len(configs) == 2

    assert_keys_equal(
        [config.taskConfig.job for config in configs],
        [labrat_key, labrat0_key],
    )

    for config in configs:
        assert config.instanceCount == 2
Ejemplo n.º 10
0
def test__mesos_task_label(client):
    """
    Verify aurora metadata is correctly populated at the mesos task level.
    """
    start_job_update(
        client,
        "test_dc_labrat_uns.yaml",
        "start job update test/dc/labrat_uns",
    )

    master_state = get_mesos_maser_state()
    assert len(master_state["frameworks"]) == 1
    assert master_state["frameworks"][0]["name"] == "Peloton"

    peloton = master_state["frameworks"][0]
    assert len(peloton["tasks"]) == 1

    only_task = peloton["tasks"][0]
    assert len(only_task["labels"]) > 0

    # Stop at the first label that carries the aurora "uns" metadata key.
    uns_label_found = any(
        entry["key"] == "org.apache.aurora.metadata.uns"
        for entry in only_task["labels"]
    )
    assert uns_label_found, "expected label not found"
Ejemplo n.º 11
0
def test__get_job_update_details__filter_non_update_workflow(client):
    """
    Test the getJobUpdateDetails endpoint for filtering out non-update
    workflows (here: a restart triggered through the peloton api between
    two regular updates).
    """
    update_message = "start job update test/dc/labrat_large_job"

    first_request = get_job_update_request("test_dc_labrat_large_job.yaml")
    first_request.settings.updateGroupSize = 10

    second_request = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    second_request.settings.updateGroupSize = 10

    # Start a regular update.
    job_key = start_job_update(client, first_request, update_message)

    # Trigger an unexpected restart through the peloton api.
    jobs = list_jobs()
    assert len(jobs) == 1

    stateless_job = StatelessJob(job_id=jobs[0].job_id.value)
    stateless_job.restart(batch_size=10)
    # Wait for the restart to finish.
    stateless_job.wait_for_workflow_state(goal_state="SUCCEEDED")

    # Start a new update.
    start_job_update(client, second_request, update_message)

    # Only the two updates are expected in the response.
    details = client.get_job_update_details(
        None, api.JobUpdateQuery(role=job_key.role))
    assert len(details.detailsList) == 2

    for index, detail in enumerate(details.detailsList):
        initial_state = detail.update.instructions.initialState
        if index != 0:
            # Every entry after the first has no initial state.
            assert len(initial_state) == 0
            continue

        # The first entry has an initial state whose tasks carry metadata.
        assert len(initial_state) > 0
        for initial in initial_state:
            assert initial.task.metadata, 'Expect metadata to be present'
Ejemplo n.º 12
0
def test__start_job_update_with_msg(client):
    """
    Start an update with a message and check that the update details
    expose sorted events, the message, and the expected first and last
    update status.
    """
    update_msg = 'update msg 1'
    job_key = start_job_update(client, 'test_dc_labrat.yaml', update_msg)

    query = api.JobUpdateQuery(jobKey=job_key)
    details_list = client.get_job_update_details(None, query).detailsList

    assert len(details_list) == 1

    # Update events exist and are sorted ascending by timestamp.
    update_events = details_list[0].updateEvents
    assert len(update_events) > 0
    update_timestamps = [event.timestampMs for event in update_events]
    assert update_timestamps == sorted(update_timestamps)

    # The same holds for the instance events.
    instance_events = details_list[0].instanceEvents
    assert len(instance_events) > 0
    instance_timestamps = [event.timestampMs for event in instance_events]
    assert instance_timestamps == sorted(instance_timestamps)

    # The first event carries the message; the last one is ROLLED_FORWARD.
    first_event = update_events[0]
    assert first_event.status == api.JobUpdateStatus.ROLLING_FORWARD
    assert first_event.message == update_msg
    assert update_events[-1].status == api.JobUpdateStatus.ROLLED_FORWARD
Ejemplo n.º 13
0
def test__manual_rollback_abort(client):
    """
    - Create a job
    - Start an update
    - Perform a manual rollback on the rolling_forward update
    - Abort the rolling_back update
    - The stateless job will actually have two updates, but bridge will
      dedupe the last two updates (as a manual rollback was done)
    - Validate the task config of each instance
    """
    # Create a job and wait for it to complete.
    start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )

    # Update the previously created job.
    update_request = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    update = client.start_job_update(
        update_request,
        "start job update test/dc/labrat_large_job_diff_labels",
    )
    job_update_key = update.key
    job_key = update.key.job

    # Wait for a few instances to be running.
    time.sleep(10)

    # Roll the update back.
    client.rollback_job_update(job_update_key)

    # Wait for some time so the manual rollback gets triggered.
    time.sleep(5)

    # Abort the manual rollback.
    client.abort_job_update(job_update_key, "abort update")

    # Two updates must be present for this job.
    details = client.get_job_update_details(
        None, api.JobUpdateQuery(jobKey=job_key))
    assert len(details.detailsList) == 2

    events = details.detailsList[0].updateEvents

    # The first event is rolling forward.
    assert events[0].status == api.JobUpdateStatus.ROLLING_FORWARD

    # The second last event is rolling back, after the manual rollback
    # was triggered.
    assert events[-2].status == api.JobUpdateStatus.ROLLING_BACK

    # The most recent event is aborted, once the manual rollback was
    # aborted.
    assert events[-1].status == api.JobUpdateStatus.ABORTED

    # Poll up to six times, ten seconds apart, for all ten tasks to be
    # running after the abort.
    for _ in range(6):
        res = client.get_tasks_without_configs(
            api.TaskQuery(
                jobKeys={job_key},
                statuses={api.ScheduleStatus.RUNNING},
            )
        )
        if len(res.tasks) == 10:
            break

        time.sleep(10)

    original_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }

    # run-id == 1: Job Create
    # run-id == 2: Job Update with diff labels
    # run-id == 3: Update rollback to previous version
    for task in res.tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)

        if run_id == "2":
            # The preference for which instances get updated has changed.
            # Previously the update started from instance_0:
            # run_1 (config_1) -> on_update run_2 (config_2)
            #                  -> rollback run_3 (config_1)
            #
            # Currently it starts with instances that are not available
            # or killed, so instances with an on-going update are rolled
            # back first. If an instance was in KILLED state (run-id=1)
            # when it was picked up for rollback, its run-id is bumped to
            # 2 and its config moves to the original one. An instance
            # with run-id=2 can therefore be on config_1 or on config_2
            # (due to the first update):
            # run_1 (config_1) -> (instance is in KILLED state, rollback
            #                      runs first on it) rollback run_2 (config_1)
            continue

        assert run_id in ("1", "3"), (
            "unexpected run id %s" % task.assignedTask.taskId
        )

        # Runs 1 and 3 must be on the original labels.
        for entry in task.assignedTask.task.metadata:
            assert entry.key in original_labels, (
                "unexpected metadata %s for affected instances" % entry
            )
            assert entry.value == original_labels[entry.key]
Ejemplo n.º 14
0
def test__simple_manual_rollback(client):
    """
    Start a job update which will create a job. Do another update on the
    job and, half-way through, do a manual rollback.
    """

    def verify_latest_update(key, last_status, labels):
        # The first entry of the update details must have sorted events,
        # start with ROLLING_FORWARD, end with `last_status`, and the
        # job's task config must match `labels`.
        details = client.get_job_update_details(
            None, api.JobUpdateQuery(jobKey=key))
        latest = details.detailsList[0]
        verify_events_sorted(latest.updateEvents)
        verify_events_sorted(latest.instanceEvents)
        verify_first_and_last_job_update_status(
            latest.updateEvents,
            api.JobUpdateStatus.ROLLING_FORWARD,
            last_status,
        )
        verify_task_config(client, key, labels)

    job_key = start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )
    verify_latest_update(
        job_key,
        api.JobUpdateStatus.ROLLED_FORWARD,
        {
            "test_key_1": "test_value_1",
            "test_key_2": "test_value_2",
        },
    )

    # Update the previously created job.
    update_request = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    update = client.start_job_update(
        update_request,
        "start job update test/dc/labrat_large_job_diff_labels",
    )
    job_update_key = update.key
    job_key = update.key.job

    # Wait for a few instances to be running.
    time.sleep(5)

    # Mid-update: still rolling forward, both label sets are accepted.
    verify_latest_update(
        job_key,
        api.JobUpdateStatus.ROLLING_FORWARD,
        {
            "test_key_1": "test_value_1",
            "test_key_2": "test_value_2",
            "test_key_11": "test_value_11",
            "test_key_22": "test_value_22",
        },
    )

    # Roll the update back.
    client.rollback_job_update(job_update_key)
    wait_for_rolled_back(client, job_update_key)

    # Events are sorted ascending, the last update event is ROLLED_BACK,
    # and the tasks are back on the previous task config.
    verify_latest_update(
        job_key,
        api.JobUpdateStatus.ROLLED_BACK,
        {
            "test_key_1": "test_value_1",
            "test_key_2": "test_value_2",
        },
    )
Ejemplo n.º 15
0
def test__get_tasks_without_configs(client):
    """
    Verify the fields returned by get_tasks_without_configs for a freshly
    created job, and that no host runs more than one of its tasks.
    """
    # Create job.
    job_key = start_job_update(client, "test_dc_labrat_read.yaml",
                               "start job update test/dc/labrat")

    # Add some wait time for lucene index to build
    time.sleep(10)

    res = client.get_tasks_without_configs(api.TaskQuery(jobKeys={job_key}))
    assert len(res.tasks) == 2

    # Number of tasks seen per slave host, used for the host limit check.
    host_counts = defaultdict(int)

    for t in res.tasks:
        # ScheduledTask
        assert api.ScheduleStatus.RUNNING == t.status
        assert t.ancestorId is None

        # ScheduledTask.TaskEvent
        assert api.ScheduleStatus.RUNNING == t.taskEvents[-1].status
        assert "peloton" == t.taskEvents[-1].scheduler

        # ScheduledTask.AssignedTask
        assert t.assignedTask.taskId is not None
        assert t.assignedTask.slaveId is not None
        assert t.assignedTask.slaveHost is not None
        assert t.assignedTask.instanceId in (0, 1)

        # ScheduledTask.AssignedTask.TaskConfig
        assert "test" == t.assignedTask.task.job.role
        assert "dc" == t.assignedTask.task.job.environment
        assert "labrat" == t.assignedTask.task.job.name
        assert "testuser" == t.assignedTask.task.owner.user
        assert t.assignedTask.task.isService
        assert 5 == t.assignedTask.task.priority
        assert "preemptible" == t.assignedTask.task.tier
        assert 2 == len(t.assignedTask.task.metadata)
        for m in t.assignedTask.task.metadata:
            if "test_key_1" == m.key:
                assert "test_value_1" == m.value
            elif "test_key_2" == m.key:
                assert "test_value_2" == m.value
            else:
                assert False, "unexpected metadata {}".format(m)
        assert 3 == len(t.assignedTask.task.resources)
        for r in t.assignedTask.task.resources:
            # The first positive field decides which kind of resource
            # this entry describes.
            if r.numCpus > 0:
                assert 0.25 == r.numCpus
            elif r.ramMb > 0:
                assert 128 == r.ramMb
            elif r.diskMb > 0:
                assert 128 == r.diskMb
            else:
                assert False, "unexpected resource {}".format(r)
        assert 1 == len(t.assignedTask.task.constraints)
        assert "host" == list(t.assignedTask.task.constraints)[0].name
        assert (1 == list(
            t.assignedTask.task.constraints)[0].constraint.limit.limit)

        host_counts[t.assignedTask.slaveHost] += 1

    # Ensure the host limit is enforced.
    # items() works on both Python 2 and 3; iteritems() exists only on
    # Python 2 dicts and raises AttributeError on Python 3.
    for host, count in host_counts.items():
        assert count == 1, "{host} has more than 1 task".format(host=host)
Ejemplo n.º 16
0
def test__manual_rollback_abort(client):
    """
    - Create a job
    - Start an update
    - Perform a manual rollback on the rolling_forward update
    - Abort the rolling_back update
    - The stateless job will actually have two updates, but bridge will
      dedupe the last two updates (as a manual rollback was done)
    - Validate the task config of each instance
    """
    # Create a job and wait for it to complete.
    start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )

    # Update the previously created job.
    update_request = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    update = client.start_job_update(
        update_request,
        "start job update test/dc/labrat_large_job_diff_labels",
    )
    job_update_key = update.key
    job_key = update.key.job

    # Wait for a few instances to be running.
    time.sleep(10)

    # Roll the update back.
    client.rollback_job_update(job_update_key)

    # Wait for some time so the manual rollback gets triggered.
    time.sleep(5)

    # Abort the manual rollback.
    client.abort_job_update(job_update_key, "abort update")

    # Two updates must be present for this job.
    details = client.get_job_update_details(
        None, api.JobUpdateQuery(jobKey=job_key))
    assert len(details.detailsList) == 2

    events = details.detailsList[0].updateEvents

    # The first event is rolling forward.
    assert events[0].status == api.JobUpdateStatus.ROLLING_FORWARD

    # The second last event is rolling back, after the manual rollback
    # was triggered.
    assert events[-2].status == api.JobUpdateStatus.ROLLING_BACK

    # The most recent event is aborted, once the manual rollback was
    # aborted.
    assert events[-1].status == api.JobUpdateStatus.ABORTED

    # Poll up to six times, ten seconds apart, for all ten tasks to be
    # running after the abort.
    for _ in range(6):
        res = client.get_tasks_without_configs(
            api.TaskQuery(
                jobKeys={job_key},
                statuses={api.ScheduleStatus.RUNNING},
            )
        )
        if len(res.tasks) == 10:
            break

        time.sleep(10)

    original_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    updated_labels = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }

    # run-id == 1: Job Create
    # run-id == 2: Job Update with diff labels
    # run-id == 3: Update rollback to previous version
    labels_by_run_id = {
        "1": original_labels,
        "2": updated_labels,
        "3": original_labels,
    }

    for task in res.tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        assert run_id in labels_by_run_id, (
            "unexpected run id %s" % task.assignedTask.taskId)

        # Every metadata entry must be one of the labels of that run.
        expected_labels = labels_by_run_id[run_id]
        for entry in task.assignedTask.task.metadata:
            assert entry.key in expected_labels, (
                "unexpected metadata %s for affected instances" % entry)
            assert entry.value == expected_labels[entry.key]
Ejemplo n.º 17
0
def test__get_tasks_without_configs__previous_run(client):
    """
    Test the getTasksWithoutConfigs endpoint for tasks from previous runs:
    1. start a regular update (version 1) on all instances and check that
       every task is on run id 1 with the version 1 labels
    2. start six more updates, alternating between version 2 and
       version 1, on all instances
    3. check that only run ids 2 through 7 are returned (run id 1 is
       excluded, assuming pod_runs_depth is 6), that only run id 7 is
       RUNNING, and that each run carries the labels of its version
    """
    update_message = "start job update test/dc/labrat_large_job"

    v1_labels = {"test_key_1": "test_value_1", "test_key_2": "test_value_2"}
    v2_labels = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }

    def verify_labels(task, expected_labels):
        # Every metadata entry of the task must be one of the expected
        # labels, with the expected value.
        for m in task.assignedTask.task.metadata:
            assert m.key in expected_labels, "unexpected metadata %s" % m
            assert m.value == expected_labels[m.key]

    req1 = get_job_update_request("test_dc_labrat_large_job.yaml")
    req1.settings.updateGroupSize = 10

    req2 = get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml")
    req2.settings.updateGroupSize = 10

    # start a regular update
    job_key = start_job_update(client, req1, update_message)

    res = client.get_tasks_without_configs(api.TaskQuery(jobKeys={job_key}))
    assert len(res.tasks) == 10
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert len(t.assignedTask.task.metadata) == 2
        verify_labels(t, v1_labels)

    # start 6 new updates (assuming pod_runs_depth is 6), expect run id 1
    # to be excluded
    for request in (req2, req1, req2, req1, req2, req1):
        start_job_update(client, request, update_message)

    # Expected status and labels per run id: even runs came from req2
    # (version 2), odd runs from req1 (version 1); only the latest run
    # is still RUNNING.
    running = api.ScheduleStatus.RUNNING
    killed = api.ScheduleStatus.KILLED
    expected_by_run_id = {
        "7": (running, v1_labels),
        "6": (killed, v2_labels),
        "5": (killed, v1_labels),
        "4": (killed, v2_labels),
        "3": (killed, v1_labels),
        "2": (killed, v2_labels),
    }

    res = client.get_tasks_without_configs(api.TaskQuery(jobKeys={job_key}))
    assert len(res.tasks) == 10 * 6
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert len(t.assignedTask.task.metadata) == 2

        # run_id is a string, so it is looked up as a dict key (not a
        # substring test) and formatted with %s (not %d).
        assert run_id in expected_by_run_id, (
            "unexpected run id: %s" % run_id)

        expected_status, expected_labels = expected_by_run_id[run_id]
        assert t.status == expected_status
        verify_labels(t, expected_labels)
Ejemplo n.º 18
0
def test__get_tasks_without_configs_task_queries(client):
    """Check get_tasks_without_configs against several TaskQuery filters.

    Five jobs are started and instances 0 and 1 of test/dc/labrat1 are
    killed. Each query is then checked for the number of tasks returned and
    for the set of job keys those tasks belong to.
    """
    # Nothing has been created yet, so an unfiltered query must be empty.
    before = client.get_tasks_without_configs(api.TaskQuery())
    assert len(before.tasks) == 0

    # Start the jobs that the queries below select from.
    labrat = start_job_update(client, 'test_dc_labrat_read.yaml',
                              'start job update test/dc/labrat')
    labrat0 = start_job_update(client, 'test_dc_labrat0.yaml',
                               'start job update test/dc/labrat0')
    dc0_labrat1 = start_job_update(client, 'test_dc0_labrat1.yaml',
                                   'start job update test/dc0/labrat1')
    labrat1 = start_job_update(client, 'test_dc_labrat1.yaml',
                               'start job update test/dc/labrat1')
    test2_labrat2 = start_job_update(client, 'test2_dc2_labrat2.yaml',
                                     'start job update test2/dc2/labrat2')

    # Kill instances 0 and 1 of test/dc/labrat1 and wait until they are gone.
    client.kill_tasks(labrat1, {0, 1}, 'killing all tasks test/dc/labrat1')
    wait_for_killed(client, labrat1)

    # The count assertion below expects this many tasks per expected job key.
    tasks_per_job = 2

    # Each case is (failure message, query, job keys the result must cover).
    cases = [
        (
            'query job keys',
            api.TaskQuery(jobKeys={labrat, labrat0, test2_labrat2}),
            [labrat, labrat0, test2_labrat2],
        ),
        (
            'query role + env + name',
            api.TaskQuery(
                role=labrat.role,
                environment=labrat.environment,
                jobName=labrat.name,
            ),
            [labrat],
        ),
        (
            'query role + env',
            api.TaskQuery(role=labrat.role, environment=labrat.environment),
            [labrat, labrat0, labrat1],
        ),
        (
            'query role',
            api.TaskQuery(role=labrat.role),
            [labrat, labrat0, labrat1, dc0_labrat1],
        ),
        (
            # labrat1 is not expected among RUNNING tasks; its instances
            # were killed above.
            'query role + statuses',
            api.TaskQuery(role=labrat.role,
                          statuses={api.ScheduleStatus.RUNNING}),
            [labrat, labrat0, dc0_labrat1],
        ),
    ]

    for label, task_query, wanted_keys in cases:
        found = client.get_tasks_without_configs(task_query)
        assert len(found.tasks) == len(wanted_keys) * tasks_per_job, label
        assert_keys_equal(
            remove_duplicate_keys(
                t.assignedTask.task.job for t in found.tasks),
            wanted_keys,
            message=label,
        )
Ejemplo n.º 19
0
def test__get_tasks_without_configs_task_queries(client):
    """Check get_tasks_without_configs against several TaskQuery filters.

    Five jobs are started and the tasks of test/dc/labrat1 are killed. Each
    query is then checked for the number of tasks returned and for the set
    of job keys those tasks belong to.
    """
    # Nothing has been created yet, so an unfiltered query must be empty.
    before = client.get_tasks_without_configs(api.TaskQuery())
    assert len(before.tasks) == 0

    # Start the jobs that the queries below select from.
    labrat = start_job_update(
        client, "test_dc_labrat_read.yaml", "start job update test/dc/labrat")
    labrat0 = start_job_update(
        client, "test_dc_labrat0.yaml", "start job update test/dc/labrat0")
    dc0_labrat1 = start_job_update(
        client, "test_dc0_labrat1.yaml", "start job update test/dc0/labrat1")
    labrat1 = start_job_update(
        client, "test_dc_labrat1.yaml", "start job update test/dc/labrat1")
    test2_labrat2 = start_job_update(
        client, "test2_dc2_labrat2.yaml", "start job update test2/dc2/labrat2")

    # Give the lucene index some time to build before querying.
    time.sleep(10)

    # Kill test/dc/labrat1 (no instance set is passed) and wait for the kill.
    client.kill_tasks(labrat1, None, "killing all tasks test/dc/labrat1")
    wait_for_killed(client, labrat1)

    role = labrat.role
    environment = labrat.environment

    # The count assertion below expects this many tasks per expected job key.
    tasks_per_job = 2

    # Each case is (failure message, query, job keys the result must cover).
    cases = [
        (
            "query job keys",
            api.TaskQuery(jobKeys={labrat, labrat0, test2_labrat2}),
            [labrat, labrat0, test2_labrat2],
        ),
        (
            "query role + env + name",
            api.TaskQuery(
                role=role,
                environment=environment,
                jobName=labrat.name,
            ),
            [labrat],
        ),
        (
            "query role + env",
            api.TaskQuery(role=role, environment=environment),
            [labrat, labrat0, labrat1],
        ),
        (
            "query role",
            api.TaskQuery(role=role),
            [labrat, labrat0, labrat1, dc0_labrat1],
        ),
        (
            # labrat1 is not expected among RUNNING tasks; its tasks were
            # killed above.
            "query role + statuses",
            api.TaskQuery(role=role, statuses={api.ScheduleStatus.RUNNING}),
            [labrat, labrat0, dc0_labrat1],
        ),
    ]

    for label, task_query, wanted_keys in cases:
        found = client.get_tasks_without_configs(task_query)
        assert len(found.tasks) == len(wanted_keys) * tasks_per_job, label
        seen_keys = remove_duplicate_keys(
            t.assignedTask.task.job for t in found.tasks)
        assert_keys_equal(seen_keys, wanted_keys, message=label)
Ejemplo n.º 20
0
def test__get_config_summary__with_pinned_instances(client):
    """
    Run a pinned-instance update that splits the job's instances into two
    sets of configs, then verify that the getConfigSummary endpoint reports
    both groups with the expected instances and metadata.
    """

    def check_metadata(metadata, expected):
        # Exactly two entries, each a known key carrying its expected value.
        assert len(metadata) == 2
        for m in metadata:
            assert m.key in expected, "unexpected metadata %s" % m
            assert m.value == expected[m.key]

    all_instances = set(range(10))
    update_instances = {0, 2, 3, 7, 9}
    untouched_instances = all_instances - update_instances

    # Metadata expected on each config group, keyed by metadata key.
    first_update_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    second_update_metadata = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }

    # Regular update: brings up all ten instances.
    job_key = start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )
    assert len(get_running_tasks(client, job_key)) == 10

    # Second update, restricted through updateOnlyTheseInstances.
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances
    }

    update_res = client.start_job_update(
        pinned_req,
        "start job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, update_res.key)
    job_key = update_res.key.job

    assert len(get_running_tasks(client, job_key)) == 10

    summary_res = client.get_config_summary(job_key)
    assert len(summary_res.summary.groups) == 2

    for group in summary_res.summary.groups:
        instances = set(expand_instance_range(group.instances))

        if instances == update_instances:
            # Instances touched by the second (pinned) update.
            check_metadata(group.config.metadata, second_update_metadata)
        elif instances == untouched_instances:
            # Instances still on the config from the first update.
            check_metadata(group.config.metadata, first_update_metadata)
        else:
            assert False, "unexpected instance range: %s" % group.instances
Ejemplo n.º 21
0
def test__get_tasks_without_configs(client):
    """Verify the fields of tasks returned by get_tasks_without_configs.

    Starts test/dc/labrat, expects three RUNNING tasks, checks the
    ScheduledTask / AssignedTask / TaskConfig fields of each one, and finally
    checks that no host was assigned more than one task.
    """
    # Create job.
    job_key = start_job_update(client, 'test_dc_labrat.yaml',
                               'start job update test/dc/labrat')

    res = client.get_tasks_without_configs(api.TaskQuery(jobKeys={job_key}))
    assert len(res.tasks) == 3

    # Number of tasks seen per slave host, used for the host-limit check.
    host_counts = defaultdict(int)

    for t in res.tasks:
        assigned = t.assignedTask
        config = assigned.task

        # ScheduledTask
        assert api.ScheduleStatus.RUNNING == t.status
        assert t.ancestorId is None

        # ScheduledTask.TaskEvent: the most recent event must be RUNNING.
        assert api.ScheduleStatus.RUNNING == t.taskEvents[-1].status
        assert "peloton" == t.taskEvents[-1].scheduler

        # ScheduledTask.AssignedTask
        assert assigned.taskId is not None
        assert assigned.slaveId is not None
        assert assigned.slaveHost is not None
        assert assigned.instanceId in (0, 1, 2)

        # ScheduledTask.AssignedTask.TaskConfig
        assert 'test' == config.job.role
        assert 'dc' == config.job.environment
        assert 'labrat' == config.job.name
        assert 'testuser' == config.owner.user
        assert config.isService
        assert 5 == config.priority
        assert 'preemptible' == config.tier

        assert 2 == len(config.metadata)
        for m in config.metadata:
            if 'test_key_1' == m.key:
                assert 'test_value_1' == m.value
            elif 'test_key_2' == m.key:
                assert 'test_value_2' == m.value
            else:
                assert False, 'unexpected metadata {}'.format(m)

        # NOTE(review): if unset fields of a resource are None, these `> 0`
        # comparisons raise TypeError on Python 3 -- confirm against the
        # generated api types.
        assert 3 == len(config.resources)
        for r in config.resources:
            if r.numCpus > 0:
                assert 0.25 == r.numCpus
            elif r.ramMb > 0:
                assert 32 == r.ramMb
            elif r.diskMb > 0:
                assert 128 == r.diskMb
            else:
                assert False, 'unexpected resource {}'.format(r)

        assert 1 == len(config.constraints)
        constraint = list(config.constraints)[0]
        assert 'host' == constraint.name
        assert 1 == constraint.constraint.limit.limit

        host_counts[assigned.slaveHost] += 1

    # Ensure the host limit is enforced.
    # dict.items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for host, count in host_counts.items():
        assert count == 1, '{host} has more than 1 task'.format(host=host)