def test__in_place_kill_job_release_host():
    """Stop a job right after starting its in-place update.

    Two jobs are created from the same spec file.  An in-place update is
    started on the first job, which is then stopped; afterwards an update
    with default create() arguments is started on the second job.  Both
    updates are expected to reach SUCCEEDED.
    """
    spec_file = "test_stateless_job_spec.yaml"

    first_job = StatelessJob(job_file=spec_file)
    first_job.create()
    first_job.wait_for_state(goal_state="RUNNING")

    second_job = StatelessJob(job_file=spec_file)
    second_job.create()
    second_job.wait_for_state(goal_state="RUNNING")

    in_place_update = StatelessUpdate(
        first_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    in_place_update.create(in_place=True)

    # Stop the job whose in-place update was just created.
    first_job.stop()

    default_update = StatelessUpdate(
        second_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=0,
    )
    default_update.create()

    # Both updates should complete.
    for update in (in_place_update, default_update):
        update.wait_for_state(goal_state="SUCCEEDED")
def test__placement_exclusive_job(exclusive_host):
    """Check that a job constrained to the exclusive label runs on it.

    Args:
        exclusive_host: not referenced in the body; presumably a fixture
            that prepares the exclusive host -- confirm against conftest.
    """
    exclusive_label = peloton_pb2_v1alpha.Label(
        key="peloton/exclusive",
        value="exclusive-test-label",
    )
    host_label_constraint = pod_pb2.LabelConstraint(
        kind=2,  # Host
        condition=2,  # Equal
        requirement=1,
        label=exclusive_label,
    )
    exclusive_constraint = pod_pb2.Constraint(
        type=1,  # Label constraint
        label_constraint=host_label_constraint,
    )

    # We have 1 exclusive host and 2 non-exclusive hosts.  Ask for a few
    # more instances than can run simultaneously on a single exclusive
    # host.
    job = StatelessJob(job_file="test_stateless_job_cpus_large_spec.yaml")
    job.job_spec.default_spec.constraint.CopyFrom(exclusive_constraint)
    job.job_spec.instance_count = 6

    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.wait_for_all_pods_running(num_pods=4)

    job.stop()
    job.wait_for_terminated()

    # Every pod that reports a host must have run on an exclusive host.
    for summary in job.list_pods():
        if not summary.status.host:
            continue
        assert "exclusive" in summary.status.host
def stop_jobs(client):
    """
    Stop every job returned by list_jobs() and wait for it to terminate.

    Each listed job is wrapped in a StatelessJob bound to ``client``, its
    retry budget is raised to 100 attempts, and it is stopped and awaited
    before moving on to the next one.
    """
    for listed_job in list_jobs():
        stateless_job = StatelessJob(
            client=client,
            job_id=listed_job.job_id.value,
        )
        stateless_job.config.max_retry_attempts = 100
        stateless_job.stop()
        stateless_job.wait_for_terminated()
def test__kill_sla_violated_job():
    """
    1. Create a stateless job (instance_count=5) with a host-limit-1
       constraint and MaximumUnavailableInstances=1. With only 3 UP hosts,
       2 of the instances cannot be placed and therefore stay unavailable.
    2. Kill the job and wait for it to reach the KILLED state.
    """
    sla_job = StatelessJob(job_file="test_stateless_job_spec_sla.yaml")
    sla_job.job_spec.instance_count = 5
    sla_job.create()

    # Only 3 of the 5 requested pods are expected to come up.
    sla_job.wait_for_all_pods_running(num_pods=3)

    sla_job.stop()
    sla_job.wait_for_state(goal_state="KILLED")
def test__delete_job_bad_version():
    """Deleting a killed job with entity version "1-2-3" must be rejected.

    The delete call is expected to raise a grpc.RpcError with status
    ABORTED whose details contain INVALID_ENTITY_VERSION_ERR_MESSAGE.
    """
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    try:
        job.delete(entity_version="1-2-3")
    except grpc.RpcError as rpc_error:
        assert rpc_error.code() == grpc.StatusCode.ABORTED
        assert INVALID_ENTITY_VERSION_ERR_MESSAGE in rpc_error.details()
    else:
        # The delete went through, which this test treats as a failure.
        raise Exception("entity version mismatch error not received")
def test__restart_killed_job():
    """Restart a killed job (not in place) and check the pod ids changed."""
    job = StatelessJob(job_file="test_stateless_job_spec_k8s.yaml")
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # Snapshot the pods before the job is killed, for later comparison.
    pods_before_restart = job.query_pods()

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    job.restart(in_place=False)
    job.wait_for_all_pods_running()

    pods_after_restart = job.query_pods()
    assert_pod_id_changed(pods_before_restart, pods_after_restart)
def test_update_killed_job(in_place):
    """Update a killed job and check it stays killed with the new spec.

    Args:
        in_place: forwarded to ``StatelessUpdate.create(in_place=...)``.
    """
    job = StatelessJob(job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    reduce_update = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
    )
    reduce_update.create(in_place=in_place)
    reduce_update.wait_for_state(goal_state="SUCCEEDED")

    # The update must have been applied to the spec ...
    updated_spec = job.get_spec()
    assert updated_spec.instance_count == 3

    # ... while the job itself remains killed.
    final_status = job.get_status()
    assert final_status.state == stateless_pb2.JOB_STATE_KILLED
def test__delete_killed_job():
    """Delete a killed job and expect a later lookup to fail NOT_FOUND."""
    job = StatelessJob()
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    job.stop()
    job.wait_for_state(goal_state="KILLED")

    job.delete()
    # Pause for 10 seconds before looking the job up again.
    time.sleep(10)

    try:
        job.get_job()
    except grpc.RpcError as rpc_error:
        assert rpc_error.code() == grpc.StatusCode.NOT_FOUND
    else:
        # The job is still retrievable, which this test treats as a failure.
        raise Exception("job not found error not received")
def test__placement_non_exclusive_job(exclusive_host):
    """Check that an unconstrained job never runs on the exclusive host.

    Args:
        exclusive_host: not referenced in the body; presumably a fixture
            that prepares the exclusive host -- confirm against conftest.
    """
    # We have 1 exclusive host and 2 non-exclusive hosts.  Ask for a few
    # more instances than can run simultaneously on the 2 non-exclusive
    # hosts.
    job = StatelessJob(job_file="test_stateless_job_cpus_large_spec.yaml")
    job.job_spec.instance_count = 10

    job.create()
    job.wait_for_state(goal_state="RUNNING")
    job.wait_for_all_pods_running(num_pods=5)

    job.stop()
    job.wait_for_terminated()

    # No pod that reports a host may have run on an exclusive host.
    for summary in job.list_pods():
        if not summary.status.host:
            continue
        assert "exclusive" not in summary.status.host
def test_stop_running_job_with_active_update_remove_instances(in_place):
    """Stop a job while an instance-reducing update is rolling forward.

    Args:
        in_place: forwarded to ``StatelessUpdate.create(in_place=...)``.
    """
    job = StatelessJob(job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC)
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    initial_pods = job.query_pods()
    assert len(initial_pods) == 5

    reduce_update = StatelessUpdate(
        job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
        batch_size=1,
    )
    reduce_update.create(in_place=in_place)
    reduce_update.wait_for_state(goal_state="ROLLING_FORWARD")

    # Stop the job while the update is still in progress; the update is
    # still expected to finish successfully.
    job.stop()
    reduce_update.wait_for_state(goal_state="SUCCEEDED")

    updated_spec = job.get_spec()
    assert updated_spec.instance_count == 3
def test__revocable_job_slack_limit():
    """Wait until exactly 2 pods of the slack-limited job are running."""
    slack_limited_job = StatelessJob(
        job_file="test_stateless_job_revocable_slack_limit_spec.yaml",
    )
    slack_limited_job.create()
    slack_limited_job.wait_for_state(goal_state="RUNNING")

    # 2 tasks are running out of 3
    def partial_tasks_running():
        instance_count = slack_limited_job.job_spec.instance_count
        running = sum(
            1
            for pod_id in range(instance_count)
            if slack_limited_job.get_pod(pod_id).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 2

    slack_limited_job.wait_for_condition(partial_tasks_running)

    # cleanup job from jobmgr
    slack_limited_job.stop()
    slack_limited_job.wait_for_terminated()
def test__create_revocable_job():
    """Bring up two revocable jobs and one large-cpu job, then stop them.

    The jobs are created one after another; each must reach RUNNING with
    all pods running before the next one is created.
    """
    spec_files = (
        "test_stateless_job_revocable_spec.yaml",
        "test_stateless_job_revocable_spec.yaml",
        "test_stateless_job_cpus_large_spec.yaml",
    )

    started_jobs = []
    for spec_file in spec_files:
        job = StatelessJob(job_file=spec_file)
        job.create()
        job.wait_for_state(goal_state="RUNNING")
        job.wait_for_all_pods_running()
        started_jobs.append(job)

    # cleanup jobs from jobmgr: issue every stop first, then wait for each
    for job in started_jobs:
        job.stop()
    for job in started_jobs:
        job.wait_for_terminated()
def test__revocable_tasks_move_to_revocable_queue():
    """Run a slack-limited revocable job next to a fully running one.

    With the first revocable job fully running, the second (slack-limited)
    job is expected to have exactly 1 running pod.  A job created from the
    plain stateless spec must still reach RUNNING with all pods running.
    """
    first_revocable = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml",
    )
    first_revocable.create()
    first_revocable.wait_for_state(goal_state="RUNNING")
    first_revocable.wait_for_all_pods_running()

    second_revocable = StatelessJob(
        job_file="test_stateless_job_revocable_slack_limit_spec.yaml",
    )

    # 1 task is running out of 3
    def partial_tasks_running():
        instance_count = second_revocable.job_spec.instance_count
        running = sum(
            1
            for pod_id in range(instance_count)
            if second_revocable.get_pod(pod_id).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 1

    second_revocable.create()
    # sleep for 5 seconds to make sure job has enough time
    time.sleep(5)
    second_revocable.wait_for_condition(partial_tasks_running)

    plain_job = StatelessJob(job_file="test_stateless_job_spec.yaml")
    plain_job.create()
    plain_job.wait_for_state("RUNNING")
    plain_job.wait_for_all_pods_running()

    # cleanup jobs from jobmgr: issue every stop first, then wait for each
    all_jobs = (first_revocable, second_revocable, plain_job)
    for job in all_jobs:
        job.stop()
    for job in all_jobs:
        job.wait_for_terminated()
def test__stop_nonrevocable_job_to_free_resources_for_revocable_job():
    """Stopping one memory-large job should let the revocable job run.

    While both memory-large jobs are running, the revocable job is
    expected to have no running pods.  After the job created from the
    preemptible memory-large spec is stopped, all revocable pods are
    expected to run.
    """
    memory_large_job = StatelessJob(
        job_file="test_stateless_job_memory_large_spec.yaml",
    )
    memory_large_job.create()
    memory_large_job.wait_for_state("RUNNING")

    preemptible_large_job = StatelessJob(
        job_file="test_stateless_preemptible_job_memory_large_spec.yaml",
    )
    preemptible_large_job.create()
    preemptible_large_job.wait_for_state("RUNNING")

    memory_large_job.wait_for_all_pods_running()
    preemptible_large_job.wait_for_all_pods_running()

    revocable_job = StatelessJob(
        job_file="test_stateless_job_revocable_spec.yaml",
    )
    revocable_job.create()

    # no tasks should be running
    def no_task_running():
        pod_states = [
            revocable_job.get_pod(pod_id).get_pod_status().state
            for pod_id in range(revocable_job.job_spec.instance_count)
        ]
        return pod_states.count(pod.POD_STATE_RUNNING) == 0

    # give job 5 seconds to run, even after that no tasks should be running
    time.sleep(5)
    revocable_job.wait_for_condition(no_task_running)

    # stop one memory-large job to free up resources for the revocable job
    preemptible_large_job.stop()
    preemptible_large_job.wait_for_terminated()

    # once that job is gone, all revocable tasks should be running
    revocable_job.wait_for_all_pods_running()

    # cleanup jobs from jobmgr: issue every stop first, then wait for each
    remaining_jobs = (memory_large_job, revocable_job)
    for job in remaining_jobs:
        job.stop()
    for job in remaining_jobs:
        job.wait_for_terminated()
def test__preempt_revocable_job_to_run_non_revocable_job():
    """A second memory-large job should push out the revocable job's pods.

    A preemptible memory-large job and a revocable job are brought up
    first.  After another memory-large job is fully running, the revocable
    job is expected to have zero running pods.
    """

    def start_and_wait(spec_file):
        # Create a job from spec_file and block until all pods are running.
        job = StatelessJob(job_file=spec_file)
        job.create()
        job.wait_for_state(goal_state="RUNNING")
        job.wait_for_all_pods_running()
        return job

    preemptible_large_job = start_and_wait(
        "test_stateless_preemptible_job_memory_large_spec.yaml"
    )
    revocable_job = start_and_wait("test_stateless_job_revocable_spec.yaml")

    # launch second non-revocable job which will pre-empt revocable job
    memory_large_job = start_and_wait(
        "test_stateless_job_memory_large_spec.yaml"
    )

    # no revocable job tasks should be running
    def zero_tasks_running():
        running = sum(
            1
            for pod_id in range(revocable_job.job_spec.instance_count)
            if revocable_job.get_pod(pod_id).get_pod_status().state
            == pod.POD_STATE_RUNNING
        )
        return running == 0

    revocable_job.wait_for_condition(zero_tasks_running)

    # cleanup: issue every stop first, then wait for each job to terminate
    all_jobs = (revocable_job, preemptible_large_job, memory_large_job)
    for job in all_jobs:
        job.stop()
    for job in all_jobs:
        job.wait_for_terminated()