Beispiel #1
0
def test__delete_running_job_with_force_flag():
    """Force-delete a RUNNING job, then expect get_job() to be NOT_FOUND."""
    running_job = StatelessJob()
    running_job.create()
    running_job.wait_for_state(goal_state="RUNNING")

    running_job.delete(force_delete=True)
    # Fixed pause before checking that the job is gone.
    time.sleep(10)

    try:
        running_job.get_job()
    except grpc.RpcError as rpc_err:
        assert rpc_err.code() == grpc.StatusCode.NOT_FOUND
    else:
        # The lookup succeeded, so the job was not deleted.
        raise Exception("job not found error not received")
Beispiel #2
0
def test__kill_sla_violated_job():
    """
    1. Create a stateless job (instance_count=5) with host-limit-1
       constraint and MaximumUnavailableInstances=1. Since there are only
       3 UP hosts, 2 of the instances will not get placed (hence
       unavailable).
    2. Kill the job and wait for it to reach the KILLED state.
    """
    sla_job = StatelessJob(job_file="test_stateless_job_spec_sla.yaml")
    sla_job.job_spec.instance_count = 5
    sla_job.create()
    # Only 3 of the 5 requested pods are waited for.
    sla_job.wait_for_all_pods_running(num_pods=3)

    sla_job.stop()
    sla_job.wait_for_state(goal_state="KILLED")
Beispiel #3
0
def stateless_job(request, peloton_client):
    """
    Build a StatelessJob for a test and register a finalizer that stops it.

    When util.minicluster_type() is "k8s" the job is built from
    "test_stateless_job_spec_k8s.yaml"; otherwise no job_file is passed.
    The job is returned without create() having been called.

    :param request: fixture request object; its addfinalizer() is used to
        register the teardown
    :param peloton_client: client handed to StatelessJob
    :return: the constructed StatelessJob
    """
    job_kwargs = {"client": peloton_client}
    if util.minicluster_type() == "k8s":
        job_kwargs["job_file"] = "test_stateless_job_spec_k8s.yaml"

    # Construct exactly one job object rather than building a default job
    # first and discarding it on k8s clusters.
    job = StatelessJob(**job_kwargs)

    # teardown
    def kill_stateless_job():
        print("\nstopping stateless job")
        job.stop()

    request.addfinalizer(kill_stateless_job)

    return job
Beispiel #4
0
def wait_for_deletion(client, timeout_secs):
    """
    Wait for job deletion to complete.

    Polls list_jobs() every 2 seconds until it returns no jobs or the
    timeout expires.

    :param client: client passed to each StatelessJob built from the listing
    :param timeout_secs: maximum number of seconds to keep polling
    :raises AssertionError: if the timeout expires first
    :raises grpc.RpcError: for any RPC failure other than NOT_FOUND
    """
    deadline = time.time() + timeout_secs
    while time.time() < deadline:
        try:
            jobs = [
                StatelessJob(job_id=s.job_id.value, client=client)
                for s in list_jobs()
            ]
        except grpc.RpcError as e:
            # Catch "not-found" error here because QueryJobs endpoint does
            # two db queries in sequence: "QueryJobs" and "GetUpdate".
            # However, when we delete a job, updates are deleted first,
            # there is a slight chance QueryJobs will fail to query the
            # update, returning "not-found" error.
            if e.code() != grpc.StatusCode.NOT_FOUND:
                raise
        else:
            if not jobs:
                return
        time.sleep(2)

    # Raise explicitly: `assert False` is removed under `python -O`, which
    # would turn a timeout into a silent success.
    raise AssertionError("timed out waiting for jobs to be deleted")
Beispiel #5
0
def patch_jobs(active_jobs=None, desired_jobs=None):
    """
    patch jobs check current state of the job and applies desired goal state
    for the job. It can yield to create a job or updating a job.

    :param active_jobs: dict of job name -> existing job object; None is
        treated as an empty dict
    :param desired_jobs: dict of job name -> desired job spec; None is
        treated as an empty dict
    :return: dict of job name -> job id for every desired job
    """
    # The signature advertises None defaults, so make them usable instead of
    # failing on `.items()` / membership tests.
    if active_jobs is None:
        active_jobs = {}
    if desired_jobs is None:
        desired_jobs = {}

    jobs = {}
    for job_name, job_spec in desired_jobs.items():
        if job_name in active_jobs:
            # job exists -> update to desired state
            patch_job(active_jobs[job_name], job_spec)
            jobs[job_name] = active_jobs[job_name].get_job_id()
        else:
            # job does not exist -> create
            job = StatelessJob(job_config=job_spec,
                               config=IntegrationTestConfig(
                                   pool_file=RESPOOL_FILE_NAME,
                                   max_retry_attempts=MAX_RETRY_ATTEMPTS))
            job.create()
            time.sleep(10)
            job.wait_for_all_pods_running()
            jobs[job_name] = job.get_job_id()

    # TODO: Kill any undesired active job running in the canary cluster

    return jobs
Beispiel #6
0
def test_stop_running_job_with_active_update_remove_instances(in_place):
    """
    Stop a running job while an instance-reducing update is rolling forward.

    The update is expected to still reach SUCCEEDED and the resulting job
    spec to have 3 instances. `in_place` is forwarded to update.create().
    """
    job = StatelessJob(job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    pods = job.query_pods()
    assert len(pods) == 5

    reduce_update = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
        batch_size=1,
    )
    reduce_update.create(in_place=in_place)
    reduce_update.wait_for_state(goal_state="ROLLING_FORWARD")

    # Stop the job while the update is still in progress.
    job.stop()
    reduce_update.wait_for_state(goal_state="SUCCEEDED")

    final_spec = job.get_spec()
    assert final_spec.instance_count == 3
Beispiel #7
0
def test__delete_running_job_without_force_flag():
    """Deleting a RUNNING job without the force flag must fail: ABORTED."""
    live_job = StatelessJob()
    live_job.create()
    live_job.wait_for_state(goal_state="RUNNING")

    try:
        live_job.delete()
    except grpc.RpcError as rpc_err:
        assert rpc_err.code() == grpc.StatusCode.ABORTED
    else:
        # delete() went through although the job was not terminal.
        raise Exception("job in non-terminal state error not received")
Beispiel #8
0
def test__delete_sla_violated_job():
    """
    1. Create a stateless job (instance_count=5) with host-limit-1
       constraint and MaximumUnavailableInstances=1. Since there are only
       3 UP hosts, 2 of the instances will not get placed (hence
       unavailable).
    2. Force delete the job and verify that the job is deleted.
    """
    sla_job = StatelessJob(job_file="test_stateless_job_spec_sla.yaml")
    sla_job.job_spec.instance_count = 5
    sla_job.create()
    sla_job.wait_for_all_pods_running(num_pods=3)

    sla_job.delete(force_delete=True)
    # Fixed pause before checking that the job is gone.
    time.sleep(10)

    try:
        sla_job.get_job()
    except grpc.RpcError as rpc_err:
        assert rpc_err.code() == grpc.StatusCode.NOT_FOUND
    else:
        raise Exception("job not found error not received")
Beispiel #9
0
def test__health_check_detects_healthy_tasks():
    """
    Run a single-instance job from the successful-health-check spec and
    wait until pod 0 has a pod event whose `healthy` field is 'HEALTHY'.
    """
    job = StatelessJob(
        job_file='test_stateless_job_successful_health_check_spec.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100))
    job.job_spec.instance_count = 1
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    def task_has_healthy_events():
        # Return an explicit bool; the previous loop fell off the end and
        # returned None when no matching event existed yet.
        # NOTE(review): the unhealthy-task test matches
        # "HEALTH_STATE_UNHEALTHY" -- confirm that 'HEALTHY' (rather than
        # 'HEALTH_STATE_HEALTHY') is the value reported here.
        return any(
            pod_event.healthy == 'HEALTHY'
            for pod_event in job.get_pod(0).get_pod_events()
        )

    job.wait_for_condition(task_has_healthy_events)
Beispiel #10
0
def test__delete_initialized_job_with_force_flag():
    """Force-delete a job right after create() and expect NOT_FOUND after."""
    fresh_job = StatelessJob()
    fresh_job.create()

    # No wait here: the job may already have moved to INITIALIZED/PENDING,
    # because the job transitions cannot be controlled precisely.
    fresh_job.delete(force_delete=True)
    time.sleep(10)

    try:
        fresh_job.get_job()
    except grpc.RpcError as rpc_err:
        assert rpc_err.code() == grpc.StatusCode.NOT_FOUND
    else:
        raise Exception("job not found error not received")
Beispiel #11
0
def get_unique_job(request):
    """
    Finds a unique job to run for a test.
    Job selected in random across multiple test suite runs.

    Under FILE_LOCK, the test/job map and the jobs-in-use map are re-read
    from their files, the known job names are shuffled, and the first job
    that is neither already paired with this test nor in use is reserved
    (both maps are written back). If nothing is free, the lock is released
    and the search is retried after 10 seconds.

    :param request: object whose `node.name` is the current test name
    :return: StatelessJob built around the reserved job's id
    """
    test_name = request.node.name
    # Strip any "[...]" suffix so all variants of a test share one prefix.
    test_prefix = test_name.split("[")[0]

    while True:
        job = None
        FILE_LOCK.acquire()
        try:
            pytest.test_job_map = read_from_file(TEST_JOB_MAP_FILE)
            pytest.job_in_use = read_from_file(JOB_IN_USE_FILE)

            job_list = list(pytest.jobs.keys())
            random.shuffle(job_list)

            for j in job_list:
                # `map_id` rather than `id`, to avoid shadowing the builtin.
                map_id = test_prefix + "_" + j

                # check if test && job are already matched
                if map_id in pytest.test_job_map or j in pytest.job_in_use:
                    continue

                pytest.test_job_map[map_id] = ""
                pytest.job_in_use[j] = ""

                write_to_file(TEST_JOB_MAP_FILE, pytest.test_job_map)
                write_to_file(JOB_IN_USE_FILE, pytest.job_in_use)

                log.info(
                    "test_job_mapping:: test_name: %s, map_id: %s",
                    test_name,
                    map_id,
                )

                # build a new StatelessJob object for the reserved job id
                job = StatelessJob(
                    job_id=pytest.jobs[j],
                    config=IntegrationTestConfig(
                        pool_file=RESPOOL_FILE_NAME,
                        max_retry_attempts=MAX_RETRY_ATTEMPTS,
                    ),
                )
                break
        finally:
            FILE_LOCK.release()

        if job is not None:
            break
        # Nothing free right now; retry after a pause, with the lock released.
        time.sleep(10)

    return job
Beispiel #12
0
def cleanup_jobs(client, timeout_secs=20):
    """
    Force-delete every job returned by list_jobs() through the peloton API,
    then wait up to `timeout_secs` for the deletions to complete.
    """
    doomed_jobs = []
    for summary in list_jobs():
        doomed_jobs.append(
            StatelessJob(job_id=summary.job_id.value, client=client)
        )

    for doomed_job in doomed_jobs:
        doomed_job.delete(force_delete=True)

    wait_for_deletion(client, timeout_secs)
Beispiel #13
0
def test__health_check_detects_unhealthy_tasks():
    """
    Run a single-instance job from the failed-health-check spec and wait
    until pod 0 has a pod event whose `healthy` field is
    "HEALTH_STATE_UNHEALTHY".
    """
    job = StatelessJob(
        job_file="test_stateless_job_failed_health_check_spec.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.job_spec.instance_count = 1
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    def task_has_unhealthy_events():
        # Return an explicit bool; the previous loop fell off the end and
        # returned None when no matching event existed yet.
        return any(
            pod_event.healthy == "HEALTH_STATE_UNHEALTHY"
            for pod_event in job.get_pod(0).get_pod_events()
        )

    job.wait_for_condition(task_has_unhealthy_events)
Beispiel #14
0
def stop_jobs(client):
    """
    Calls peloton API to stop every job returned by list_jobs() and waits
    for each one to terminate before moving on to the next.
    """
    for summary in list_jobs():
        stateless = StatelessJob(client=client, job_id=summary.job_id.value)
        # Raise the retry budget used while waiting for this job.
        stateless.config.max_retry_attempts = 100
        stateless.stop()
        stateless.wait_for_terminated()
Beispiel #15
0
def test__failed_task_throttled_by_exponential_backoff():
    """
    Let a job from the exit-with-error spec run for 40 seconds, then check
    that the run id parsed from the first pod event's pod id is between 1
    and 20 (exclusive).
    """
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = StatelessJob(
        job_file='test_stateless_job_exit_with_err_spec.yaml',
        config=retry_config,
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    time.sleep(40)

    events = job.get_pod(0).get_pod_events()
    # If throttling is effective the task should not have produced many
    # pod events in that window; otherwise it can generate many of them.
    first_pod_id = events[0].pod_id.value
    last_dash = first_pod_id.rindex('-')
    run_id = int(first_pod_id[last_dash + 1:])
    assert 1 < run_id < 20
Beispiel #16
0
def test__create_update_to_unset_health_check():
    """
    Start a job from the health-check spec, update it to
    UPDATE_STATELESS_JOB_SPEC and expect the update to reach SUCCEEDED.
    """
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=retry_config,
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    unset_update = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    unset_update.create()
    unset_update.wait_for_state(goal_state='SUCCEEDED')
Beispiel #17
0
def patch_jobs(active_jobs=None, desired_jobs=None):
    """
    patch jobs check current state of the job and applies desired goal state
    for the job. It can yield to create a job or updating a job.

    For every entry in desired_jobs:
      * not in active_jobs: the job is created and all its pods are waited
        for.
      * in active_jobs and env FAILFAST == "NO": the job is patched to the
        desired spec.
      * in active_jobs otherwise: the replace-diff against the desired spec
        is computed and the pytest session is aborted when it is non-empty.
        On this path job_spec.respool_id is overwritten with the active
        job's respool id.

    :param active_jobs: dict of job name -> existing job object; None is
        treated as an empty dict
    :param desired_jobs: dict of job name -> desired job spec; None is
        treated as an empty dict
    :return: dict of job name -> job id for every desired job
    """
    # The signature advertises None defaults, so make them usable instead of
    # failing on `.items()` / membership tests.
    if active_jobs is None:
        active_jobs = {}
    if desired_jobs is None:
        desired_jobs = {}

    jobs = {}
    for job_name, job_spec in desired_jobs.items():
        if job_name in active_jobs:
            j = active_jobs[job_name]

            # Dirty jobs are only patched back when FAILFAST is explicitly
            # "NO"; any other value (including unset) keeps the canary run
            # blocked until dirty jobs are restored manually.
            if os.getenv("FAILFAST") == "NO":
                # job exists -> update to desired state
                patch_job(j, job_spec)
                jobs[job_name] = j.get_job_id()
            else:
                # if job update diff has non-nil result means that previous
                # canary test run failed and we want more runs to block
                # until issue is manually debugged and state is restored.
                job_spec.respool_id.value = j.get_spec().respool_id.value
                resp = j.get_replace_job_diff(job_spec=job_spec)
                # Call form is valid on both Python 2 and Python 3.
                print(resp)
                if len(resp.instances_removed) > 0 or \
                   len(resp.instances_updated) > 0 or \
                   len(resp.instances_added) > 0:
                    pytest.exit(
                        "canary test run was aborted, since jobs are dirty!!")

                jobs[job_name] = j.get_job_id()
        else:
            # job does not exist -> create
            job = StatelessJob(
                job_config=job_spec,
                config=IntegrationTestConfig(
                    pool_file=RESPOOL_FILE_NAME,
                    max_retry_attempts=MAX_RETRY_ATTEMPTS,
                ),
            )
            job.create()
            time.sleep(10)
            job.wait_for_all_pods_running()
            jobs[job_name] = job.get_job_id()

    # TODO: Kill any undesired active job running in the canary cluster

    return jobs
Beispiel #18
0
def test__create_update_to_disable_health_check():
    """
    Start a job from the health-check spec, then update it with the same
    spec with the first container's liveness check disabled; the update is
    expected to reach SUCCEEDED.
    """
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=retry_config,
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    first_container = job.job_spec.default_spec.containers[0]
    first_container.liveness_check.enabled = False

    disable_update = StatelessUpdate(
        job,
        updated_job_spec=job.job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    disable_update.create()
    disable_update.wait_for_state(goal_state='SUCCEEDED')
Beispiel #19
0
def test__in_place_update_success_rate():
    """
    Run an in-place update on a 30-instance job and assert that every pod
    is reported on the same host before and after the update.
    """

    def hosts_by_pod_name(pod_infos):
        # Pod name = pod id up to (not including) its last '-'.
        hosts = {}
        for info in pod_infos:
            pod_id = info.status.pod_id.value
            hosts[pod_id[:pod_id.rfind('-')]] = info.status.host
        return hosts

    stateless_job = StatelessJob(job_file="test_stateless_job_spec_k8s.yaml")
    stateless_job.job_spec.instance_count = 30
    stateless_job.create()
    stateless_job.wait_for_all_pods_running()
    pods_before = stateless_job.query_pods()

    # Build the updated spec from the same file with a changed memory limit.
    spec_dump = load_test_config("test_stateless_job_spec_k8s.yaml")
    updated_job_spec = JobSpec()
    json_format.ParseDict(spec_dump, updated_job_spec)
    updated_job_spec.instance_count = 30
    updated_job_spec.default_spec.containers[0].resource.mem_limit_mb = 0.1

    update = StatelessUpdate(
        stateless_job,
        updated_job_spec=updated_job_spec,
        batch_size=0,
    )
    update.create(in_place=True)
    update.wait_for_state(goal_state='SUCCEEDED')

    pods_after = stateless_job.query_pods()

    old_hosts = hosts_by_pod_name(pods_before)
    new_hosts = hosts_by_pod_name(pods_after)

    moved = 0
    for pod_name in old_hosts:
        cur_host = new_hosts[pod_name]
        prev_host = old_hosts[pod_name]
        if cur_host != prev_host:
            log.info("%s, prev:%s, cur:%s", pod_name, prev_host, cur_host)
            moved += 1
    assert moved == 0
Beispiel #20
0
def test__create_update_to_set_health_check(in_place):
    """
    Start a job from UPDATE_STATELESS_JOB_SPEC, update it to the
    health-check spec and expect the update to reach SUCCEEDED.
    `in_place` is forwarded to update.create().
    """
    test_config = IntegrationTestConfig(
        max_retry_attempts=100,
        pool_file='test_stateless_respool.yaml',
    )
    job = StatelessJob(job_file=UPDATE_STATELESS_JOB_SPEC, config=test_config)
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    set_update = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    set_update.create(in_place=in_place)
    set_update.wait_for_state(goal_state="SUCCEEDED")
Beispiel #21
0
def test__create_update_to_change_health_check_config(in_place):
    """
    Start a job with the first container's liveness check disabled, then
    update it with initial_interval_secs set to 2 and expect the update to
    reach SUCCEEDED. `in_place` is forwarded to update.create().
    """
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_WITH_HEALTH_CHECK_SPEC,
        config=retry_config,
    )
    # Disable the liveness check before the job is created.
    job.job_spec.default_spec.containers[0].liveness_check.enabled = False
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    liveness = job.job_spec.default_spec.containers[0].liveness_check
    liveness.initial_interval_secs = 2

    config_update = StatelessUpdate(
        job,
        updated_job_spec=job.job_spec,
        max_failure_instances=1,
        max_instance_attempts=1,
    )
    config_update.create(in_place=in_place)
    config_update.wait_for_state(goal_state="SUCCEEDED")
Beispiel #22
0
def test__get_job_update_details__filter_non_update_workflow(client):
    """
    test getJobUpdateDetails endpoint for filtering non-update workflows

    Sequence: start an update, restart the job through the peloton API,
    start a second update, then expect exactly two entries in the
    getJobUpdateDetails response.
    """
    first_req = get_job_update_request("test_dc_labrat_large_job.yaml")
    first_req.settings.updateGroupSize = 10

    second_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    second_req.settings.updateGroupSize = 10

    # start a regular update
    job_key = start_job_update(
        client, first_req, "start job update test/dc/labrat_large_job")

    # trigger an unexpected restart through peloton api
    listed = list_jobs()
    assert len(listed) == 1

    restarted_job = StatelessJob(job_id=listed[0].job_id.value)
    restarted_job.restart(batch_size=10)
    # wait for restart
    restarted_job.wait_for_workflow_state(goal_state="SUCCEEDED")

    # start a new update
    start_job_update(
        client, second_req, "start job update test/dc/labrat_large_job")

    # verify getJobUpdateDetails response
    query = api.JobUpdateQuery(role=job_key.role)
    res = client.get_job_update_details(None, query)
    assert len(res.detailsList) == 2

    for position, detail in enumerate(res.detailsList):
        initial_state = detail.update.instructions.initialState
        if position != 0:
            # every entry after the first must have no initial state
            assert len(initial_state) == 0
            continue
        assert len(initial_state) > 0
        for initial in initial_state:
            assert initial.task.metadata, 'Expect metadata to be present'
Beispiel #23
0
def test__in_place_kill_job_release_host():
    """
    Start two jobs from the same spec, begin an in-place update on the
    first and stop that job, then start a regular update on the second.
    Both updates are expected to reach SUCCEEDED.
    """
    first_job = StatelessJob(job_file="test_stateless_job_spec.yaml")
    first_job.create()
    first_job.wait_for_state(goal_state="RUNNING")

    second_job = StatelessJob(job_file="test_stateless_job_spec.yaml")
    second_job.create()
    second_job.wait_for_state(goal_state="RUNNING")

    first_update = StatelessUpdate(
        first_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    first_update.create(in_place=True)
    # stop the job while its update is active
    first_job.stop()

    second_update = StatelessUpdate(
        second_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    second_update.create()

    # both updates should complete
    for pending_update in (first_update, second_update):
        pending_update.wait_for_state(goal_state="SUCCEEDED")
Beispiel #24
0
def test__delete_job_bad_version():
    """
    Deleting a KILLED job with entity version "1-2-3" must fail with
    ABORTED and the invalid-entity-version message.
    """
    killed_job = StatelessJob()
    killed_job.create()
    killed_job.wait_for_state(goal_state="RUNNING")

    killed_job.stop()
    killed_job.wait_for_state(goal_state="KILLED")

    try:
        killed_job.delete(entity_version="1-2-3")
    except grpc.RpcError as rpc_err:
        assert rpc_err.code() == grpc.StatusCode.ABORTED
        assert INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_err.details()
    else:
        raise Exception("entity version mismatch error not received")
Beispiel #25
0
def test__delete_killed_job():
    """Delete a KILLED job (no force flag) and expect NOT_FOUND afterwards."""
    killed_job = StatelessJob()
    killed_job.create()
    killed_job.wait_for_state(goal_state="RUNNING")

    killed_job.stop()
    killed_job.wait_for_state(goal_state="KILLED")

    killed_job.delete()
    # Fixed pause before checking that the job is gone.
    time.sleep(10)

    try:
        killed_job.get_job()
    except grpc.RpcError as rpc_err:
        assert rpc_err.code() == grpc.StatusCode.NOT_FOUND
    else:
        raise Exception("job not found error not received")
Beispiel #26
0
def _list_jobs():
    """Return a StatelessJob for every job summary from list_jobs()."""
    wrapped = []
    for summary in list_jobs():
        wrapped.append(StatelessJob(job_id=summary.job_id.value))
    return wrapped
Beispiel #27
0
def test__placement_exclusive_job(exclusive_host):
    """
    Run a job constrained to the "peloton/exclusive" host label and check
    that every pod that reports a host ran on a host whose name contains
    "exclusive".
    """
    exclusive_label = peloton_pb2_v1alpha.Label(
        key="peloton/exclusive",
        value="exclusive-test-label",
    )
    host_label_constraint = pod_pb2.LabelConstraint(
        kind=2,  # Host
        condition=2,  # Equal
        requirement=1,
        label=exclusive_label,
    )
    excl_constraint = pod_pb2.Constraint(
        type=1,  # Label constraint
        label_constraint=host_label_constraint,
    )

    # We have 1 exclusive host and 2 non-exclusive hosts. Set the number of
    # instances to be a few more than what can run simultaneously on a
    # single exclusive host.
    job = StatelessJob(job_file="test_stateless_job_cpus_large_spec.yaml")
    job.job_spec.default_spec.constraint.CopyFrom(excl_constraint)
    job.job_spec.instance_count = 6
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.wait_for_all_pods_running(num_pods=4)

    job.stop()
    job.wait_for_terminated()

    # check that all of them ran on exclusive host
    for summary in job.list_pods():
        if not summary.status.host:
            # pods without a host are skipped
            continue
        assert "exclusive" in summary.status.host
Beispiel #28
0
def test__placement_non_exclusive_job(exclusive_host):
    """
    Run an unconstrained job and check that no pod that reports a host ran
    on a host whose name contains "exclusive".
    """
    # We have 1 exclusive host and 2 non-exclusive hosts. Set the number of
    # instances to be a few more than what can run simultaneously on the
    # 2 non-exclusive hosts.
    plain_job = StatelessJob(
        job_file="test_stateless_job_cpus_large_spec.yaml")
    plain_job.job_spec.instance_count = 10
    plain_job.create()
    plain_job.wait_for_state(goal_state="RUNNING")
    plain_job.wait_for_all_pods_running(num_pods=5)

    plain_job.stop()
    plain_job.wait_for_terminated()

    # check that none of them ran on exclusive host
    for summary in plain_job.list_pods():
        if not summary.status.host:
            # pods without a host are skipped
            continue
        assert "exclusive" not in summary.status.host
Beispiel #29
0
def test__failed_task_automatically_restart():
    """
    Run a job from the exit-with-error spec, wait for the job to leave
    JOB_STATE_RUNNING, then wait until pod 0 reports a different pod id
    than the one recorded at the start.
    """
    retry_config = IntegrationTestConfig(max_retry_attempts=100)
    job = StatelessJob(
        job_file="test_stateless_job_exit_with_err_spec.yaml",
        config=retry_config,
    )
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # Pod id of instance 0 before the task fails.
    old_pod_id = job.get_pod(0).get_pod_status().pod_id.value

    def job_not_running():
        current_state = job.get_status().state
        return current_state != "JOB_STATE_RUNNING"

    def pod_id_changed():
        current_pod_id = job.get_pod(0).get_pod_status().pod_id.value
        return old_pod_id != current_pod_id

    job.wait_for_condition(job_not_running)
    job.wait_for_condition(pod_id_changed)
Beispiel #30
0
def test_update_killed_job(in_place):
    """
    Apply an instance-reducing update to a KILLED job and expect the update
    to reach SUCCEEDED, the spec to have 3 instances and the job to remain
    in JOB_STATE_KILLED. `in_place` is forwarded to update.create().
    """
    killed_job = StatelessJob(
        job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    killed_job.create()
    killed_job.wait_for_state(goal_state="RUNNING")

    killed_job.stop()
    killed_job.wait_for_state(goal_state="KILLED")

    reduce_update = StatelessUpdate(
        killed_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
    )
    reduce_update.create(in_place=in_place)
    reduce_update.wait_for_state(goal_state="SUCCEEDED")

    updated_spec = killed_job.get_spec()
    assert updated_spec.instance_count == 3
    final_status = killed_job.get_status()
    assert final_status.state == stateless_pb2.JOB_STATE_KILLED