Esempio n. 1
0
def test__delete_sla_violated_job():
    """
    Force-delete a stateless job whose SLA is already violated.

    1. Create a stateless job (instance_count=5) with a host-limit-1
       constraint and MaximumUnavailableInstances=1. With only 3 UP hosts,
       2 of the instances cannot be placed and are therefore unavailable.
    2. Force delete the job, then verify that fetching it fails with
       NOT_FOUND.
    """
    sla_job = StatelessJob(job_file="test_stateless_job_spec_sla.yaml")
    sla_job.job_spec.instance_count = 5
    sla_job.create()
    # only 3 of the 5 requested pods are expected to reach running
    sla_job.wait_for_all_pods_running(num_pods=3)

    sla_job.delete(force_delete=True)
    # fixed pause before checking that the job is gone
    time.sleep(10)

    try:
        sla_job.get_job()
    except grpc.RpcError as rpc_err:
        # the lookup must fail specifically because the job no longer exists
        assert rpc_err.code() == grpc.StatusCode.NOT_FOUND
    else:
        # the lookup succeeded, so the job was not deleted
        raise Exception("job not found error not received")
Esempio n. 2
0
def test__create_revocable_job():
    """
    Launch two jobs from the revocable spec and then one from the large-cpu
    (non-revocable) spec, one after another. Each job must reach RUNNING
    with all pods running before the next one is created. Finally stop all
    three jobs and wait for them to terminate.
    """
    spec_files = (
        "test_stateless_job_revocable_spec.yaml",   # revocable job 1
        "test_stateless_job_revocable_spec.yaml",   # revocable job 2
        "test_stateless_job_cpus_large_spec.yaml",  # non-revocable job
    )

    launched = []
    for spec_file in spec_files:
        current = StatelessJob(job_file=spec_file)
        current.create()
        current.wait_for_state(goal_state="RUNNING")
        current.wait_for_all_pods_running()
        launched.append(current)

    # cleanup jobs from jobmgr: issue every stop first, then wait on each
    for current in launched:
        current.stop()
    for current in launched:
        current.wait_for_terminated()
Esempio n. 3
0
def test__in_place_update_multi_component_restart(jobmgr, resmgr, hostmgr,
                                                  placement_engines,
                                                  batch_size):
    """
    Start updates on two running jobs, restart jobmgr, resmgr, hostmgr and
    the placement engines (with a random 1-10 second pause between
    restarts), and expect both updates to reach SUCCEEDED.

    The first update is created with in_place=True; the second is created
    with default arguments.
    """

    def _launch_running_job():
        # need extra retry attempts, since in-place update would need more
        # time to process given hostmgr would be restarted
        started = StatelessJob(
            job_file="test_stateless_job_spec.yaml",
            config=IntegrationTestConfig(max_retry_attempts=300),
        )
        started.create()
        started.wait_for_all_pods_running()
        return started

    def _build_update(target_job):
        return StatelessUpdate(
            target_job,
            updated_job_file=UPDATE_STATELESS_JOB_SPEC,
            batch_size=batch_size,
        )

    job1 = _launch_running_job()
    job2 = _launch_running_job()

    update1 = _build_update(job1)
    update1.create(in_place=True)

    update2 = _build_update(job2)
    update2.create()

    # restart the components one by one; pause after each except the last
    for component in (jobmgr, resmgr, hostmgr):
        component.restart()
        time.sleep(random.randint(1, 10))
    placement_engines.restart()

    for pending_update in (update1, update2):
        pending_update.wait_for_state(goal_state="SUCCEEDED")
Esempio n. 4
0
def test__revocable_tasks_move_to_revocable_queue():
    """
    Run one revocable job to completion of startup, then create a second
    revocable job (slack-limit spec) and wait until exactly one of its pods
    is running. A non-revocable job created afterwards must still get all
    of its pods running. All jobs are stopped at the end.
    """
    revocable_job1 = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml"
    )
    revocable_job1.create()
    revocable_job1.wait_for_state(goal_state="RUNNING")
    revocable_job1.wait_for_all_pods_running()

    revocable_job2 = StatelessJob(
        job_file="test_stateless_job_revocable_slack_limit_spec.yaml"
    )
    revocable_job2.create()

    # 1 task is running out of 3
    def partial_tasks_running():
        total_pods = revocable_job2.job_spec.instance_count
        observed_states = [
            revocable_job2.get_pod(index).get_pod_status().state
            for index in range(total_pods)
        ]
        return observed_states.count(pod.POD_STATE_RUNNING) == 1

    # sleep for 5 seconds to make sure job has enough time
    time.sleep(5)
    revocable_job2.wait_for_condition(partial_tasks_running)

    non_revocable_job = StatelessJob(job_file="test_stateless_job_spec.yaml")
    non_revocable_job.create()
    non_revocable_job.wait_for_state("RUNNING")
    non_revocable_job.wait_for_all_pods_running()

    # cleanup jobs from jobmgr: stop everything, then wait for termination
    all_jobs = (revocable_job1, revocable_job2, non_revocable_job)
    for running_job in all_jobs:
        running_job.stop()
    for running_job in all_jobs:
        running_job.wait_for_terminated()
Esempio n. 5
0
def test__stop_nonrevocable_job_to_free_resources_for_revocable_job():
    """
    With two large-memory non-revocable jobs fully running, a newly created
    revocable job must have no running pods. After the second non-revocable
    job is stopped and terminated, all pods of the revocable job must come
    up. The remaining jobs are stopped at the end.
    """
    non_revocable_job1 = StatelessJob(
        job_file="test_stateless_job_memory_large_spec.yaml"
    )
    non_revocable_job1.create()
    non_revocable_job1.wait_for_state("RUNNING")

    non_revocable_job2 = StatelessJob(
        job_file="test_stateless_preemptible_job_memory_large_spec.yaml"
    )
    non_revocable_job2.create()
    non_revocable_job2.wait_for_state("RUNNING")

    for big_job in (non_revocable_job1, non_revocable_job2):
        big_job.wait_for_all_pods_running()

    revocable_job = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml"
    )
    revocable_job.create()

    # no tasks should be running
    def no_task_running():
        # query every pod first, then check none of them is RUNNING
        pod_states = [
            revocable_job.get_pod(index).get_pod_status().state
            for index in range(revocable_job.job_spec.instance_count)
        ]
        return pod_states.count(pod.POD_STATE_RUNNING) == 0

    # give job 5 seconds to run, even after that no tasks should be running
    time.sleep(5)
    revocable_job.wait_for_condition(no_task_running)

    # stop non_revocable job to free up resources for revocable job
    non_revocable_job2.stop()
    non_revocable_job2.wait_for_terminated()

    # After non_revocable job is killed, all revocable tasks should be running
    revocable_job.wait_for_all_pods_running()

    # cleanup jobs from jobmgr
    leftovers = (non_revocable_job1, revocable_job)
    for leftover in leftovers:
        leftover.stop()
    for leftover in leftovers:
        leftover.wait_for_terminated()
Esempio n. 6
0
def test__preempt_revocable_job_to_run_non_revocable_job():
    """
    Bring up a preemptible large-memory job and a revocable job, then launch
    a second large-memory non-revocable job. Once that job is fully running,
    the revocable job must end up with zero running pods. All jobs are
    stopped at the end.
    """

    def _start_and_wait(spec_file):
        # create the job and block until it is RUNNING with all pods up
        started = StatelessJob(job_file=spec_file)
        started.create()
        started.wait_for_state(goal_state="RUNNING")
        started.wait_for_all_pods_running()
        return started

    non_revocable_job1 = _start_and_wait(
        "test_stateless_preemptible_job_memory_large_spec.yaml"
    )
    revocable_job = _start_and_wait(
        "test_stateless_job_revocable_spec.yaml"
    )

    # launch second non-revocable job which will pre-empt revocable job
    non_revocable_job2 = _start_and_wait(
        "test_stateless_job_memory_large_spec.yaml"
    )

    # no revocable job tasks should be running
    def zero_tasks_running():
        running_total = sum(
            1
            for index in range(revocable_job.job_spec.instance_count)
            if revocable_job.get_pod(index).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running_total == 0

    revocable_job.wait_for_condition(zero_tasks_running)

    # stop all jobs first, then wait for each to terminate
    teardown_order = (revocable_job, non_revocable_job1, non_revocable_job2)
    for finished in teardown_order:
        finished.stop()
    for finished in teardown_order:
        finished.wait_for_terminated()