def test__host_limit(peloton_client):
    job = Job(
        client=peloton_client,
        job_file="test_stateless_job_host_limit_1.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=2),
    )
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # All running tasks should be on different hosts
    def different_hosts_for_running_tasks():
        hosts = set()
        num_running, num_pending = 0, 0
        tasks = job.list_tasks().value
        for t in tasks.values():
            if t.runtime.state == task_pb2.TaskState.Value("RUNNING"):
                num_running += 1
                hosts.add(t.runtime.host)
            if t.runtime.state == task_pb2.TaskState.Value("PENDING"):
                num_pending += 1

        # the number of running tasks should equal the size of the hosts set,
        # and there should be exactly 1 task in PENDING
        return len(hosts) == num_running and num_pending == 1

    job.wait_for_condition(different_hosts_for_running_tasks)

    job.stop()
    job.wait_for_state(goal_state="KILLED")
def test__preemption_task_level(respool_a, respool_b):
    p_job_a = Job(
        job_file="test_preemptible_job_preemption_override.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=10),
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    # we should have all 12 tasks in RUNNING state
    def all_running():
        return all(t.state == task.RUNNING
                   for t in p_job_a.get_tasks().values())

    p_job_a.wait_for_condition(all_running)

    # odd instance ids should be preempted
    expected_preempted_tasks = set([1, 3, 5, 7, 9, 11])
    # even instance ids should keep running
    expected_running_tasks = set([0, 2, 4, 6, 8, 10])

    preempted_task_set, running_task_set = set(), set()

    # 6 tasks (6 CPUs worth) from p_job_a should be preempted
    def task_preempted():
        preempted_task_set.clear()
        running_task_set.clear()
        preempted_count, running_count = 0, 0
        for t in p_job_a.get_tasks().values():
            # tasks should be KILLED since killOnPreempt is set to true
            if t.state == task.KILLED:
                preempted_count += 1
                preempted_task_set.add(t.instance_id)
            if t.state == task.RUNNING:
                running_count += 1
                running_task_set.add(t.instance_id)
        return running_count == 6 and preempted_count == 6

    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(),
    )
    # starting the second job should change the entitlement calculation and
    # start preempting tasks from p_job_a
    p_job_b.create()

    # 6 tasks (odd instance ids) should be preempted from p_job_a to make
    # space for p_job_b
    p_job_a.wait_for_condition(task_preempted)

    # the preempted instance ids should be exactly the odd ones, and the
    # running instance ids exactly the even ones
    assert preempted_task_set == expected_preempted_tasks
    assert running_task_set == expected_running_tasks

    # wait for p_job_b to start running
    p_job_b.wait_for_state("RUNNING")

    kill_jobs([p_job_a, p_job_b])
def test_update_job_increase_instances(peloton_client):
    job = Job(
        client=peloton_client,
        job_file="long_running_job.yaml",
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.create()
    job.wait_for_state(goal_state="RUNNING")

    # wait for all 3 instances of the original job config to be
    # running or completed
    expected_count = 3

    def tasks_count():
        count = 0
        for t in job.get_tasks().values():
            # 8 == RUNNING, 9 == SUCCEEDED
            if t.state == 8 or t.state == 9:
                count += 1

        print("total instances running/completed: %d" % count)
        return count == expected_count

    job.wait_for_condition(tasks_count)

    # update the job with the new config
    job.update(new_job_file="long_running_job_update_instances.yaml")

    # number of tasks should increase to 4
    expected_count = 4
    job.wait_for_condition(tasks_count)

    job.wait_for_state(goal_state="RUNNING")

    kill_jobs([job])
def test__preemption_spark_goalstate(respool_a, respool_b):
    p_job_a = Job(
        job_file="test_preemptible_job_preemption_policy.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(
            max_retry_attempts=100, sleep_time_sec=10
        ),
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    # we should have all 12 tasks in RUNNING state
    def all_running():
        return all(t.state == task.RUNNING
                   for t in p_job_a.get_tasks().values())

    p_job_a.wait_for_condition(all_running)

    preempted_task_set = {}

    # 6 tasks (6 CPUs worth) from p_job_a should be preempted
    def task_preempted():
        count = 0
        for t in p_job_a.get_tasks().values():
            # tasks should be KILLED since killOnPreempt is set to true
            if t.state == task.KILLED:
                count += 1
                preempted_task_set[t] = True
        return count == 6

    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(),
    )
    # starting the second job should change the entitlement calculation
    p_job_b.create()

    # 6 tasks should be preempted from p_job_a to make space for p_job_b
    p_job_a.wait_for_condition(task_preempted)

    # check the runtime info of the preempted tasks
    for t in preempted_task_set:
        assert t.state == task.KILLED
        assert t.goal_state == task.PREEMPTING

    kill_jobs([p_job_a, p_job_b])
def test__preemption_tasks_reschedules_task(respool_a, respool_b):
    p_job_a = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=5),
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state="RUNNING")

    # we should have all 12 tasks in RUNNING state
    def all_running():
        return all(t.state == task.RUNNING
                   for t in p_job_a.get_tasks().values())

    p_job_a.wait_for_condition(all_running)

    # 6 tasks (6 CPUs worth) from p_job_a should be preempted
    def task_preempted():
        count = 0
        for t in p_job_a.get_tasks().values():
            # preempted tasks should be enqueued back by the jobmanager and,
            # once enqueued, they should transition to PENDING state
            if t.state == task.PENDING:
                count += 1
        return count == 6

    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(
            max_retry_attempts=100, sleep_time_sec=10
        ),
    )
    # starting the second job should change the entitlement calculation
    p_job_b.create()

    # 6 tasks should be preempted from p_job_a to make space for p_job_b
    p_job_a.wait_for_condition(task_preempted)

    # p_job_b should succeed
    p_job_b.wait_for_state(goal_state="SUCCEEDED")

    kill_jobs([p_job_a, p_job_b])
def test_task_killed_in_ready_succeeds_when_re_enqueued(placement_engines):
    # Tests that a task which is killed while in the READY state in the
    # resource manager succeeds once it is re-enqueued.

    # stop the placement engines to keep the tasks in READY state
    placement_engines.stop()

    # decorate the client to add peloton private API stubs
    c = with_private_stubs(Client())

    # create a long running job with 2 instances
    long_running_job = Job(
        job_file='long_running_job.yaml',
        options=[
            with_instance_count(2),
        ],
        client=c,
    )

    long_running_job.create()
    long_running_job.wait_for_state(goal_state='PENDING')

    task = long_running_job.get_task(0)
    # wait for the task to reach READY
    task.wait_for_pending_state(goal_state='READY')

    # kill the task
    task.stop()

    # re-enqueue the task
    task.start()

    # gentlemen, start your (placement) engines
    placement_engines.start()

    def wait_for_instance_to_run():
        return long_running_job.get_task(0).state_str == 'RUNNING'

    long_running_job.wait_for_condition(wait_for_instance_to_run)
def test__tasks_reserve_execution(hostreservepool, peloton_client):
    p_job_median = Job(
        client=peloton_client,
        job_file='test_hostreservation_job_median.yaml',
        pool=hostreservepool,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=1),
    )
    p_job_median.create()
    p_job_median.wait_for_state(goal_state='RUNNING')

    # we should have all 3 tasks in RUNNING state
    def all_running():
        return all(t.state == task.RUNNING
                   for t in p_job_median.get_tasks().values())

    p_job_median.wait_for_condition(all_running)

    # decorate the client to add peloton private API stubs
    client = with_private_stubs(peloton_client)

    p_job_large = Job(
        job_file='test_hostreservation_job_large.yaml',
        pool=hostreservepool,
        config=IntegrationTestConfig(
            sleep_time_sec=1,
            max_retry_attempts=300),
        options=[with_instance_count(1)],
        client=client,
    )
    p_job_large.create()
    p_job_large.wait_for_state(goal_state='PENDING')

    request = hostmgr.GetHostsByQueryRequest()

    # the task should get into RESERVED state and then RUNNING state
    t1 = p_job_large.get_task(0)
    t1.wait_for_pending_state(goal_state="RESERVED")

    # the task should be running on the reserved host
    def get_reserved_host():
        resp = client.hostmgr_svc.GetHostsByQuery(
            request,
            metadata=p_job_large.client.hostmgr_metadata,
            timeout=p_job_large.config.rpc_timeout_sec,
        )
        for h in resp.hosts:
            if h.status == 'reserved':
                return h.hostname
        return ''

    def is_reserved():
        return get_reserved_host() != ''

    p_job_large.wait_for_condition(is_reserved)
    reserved_host = get_reserved_host()

    t1.wait_for_pending_state(goal_state="RUNNING")
    assert reserved_host == t1.get_info().runtime.host

    # p_job_large should succeed
    p_job_large.wait_for_state()

    # no host should remain in reserved state
    response = client.hostmgr_svc.GetHostsByQuery(
        request,
        metadata=p_job_large.client.hostmgr_metadata,
        timeout=p_job_large.config.rpc_timeout_sec,
    )
    for host in response.hosts:
        assert host.status != 'reserved'

    kill_jobs([p_job_median, p_job_large])
def test__preemption_non_preemptible_task(respool_a, respool_b):
    # Create 2 jobs in respool A: 1 preemptible and 1 non-preemptible
    p_job_a = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=10),
    )
    p_job_a.update_instance_count(6)

    np_job_a = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_a,
        config=IntegrationTestConfig(),
    )
    np_job_a.job_config.sla.preemptible = False
    np_job_a.update_instance_count(6)

    # preemptible job takes 6 CPUs
    p_job_a.create()

    # non-preemptible job takes 6 reserved CPUs
    np_job_a.create()

    p_job_a.wait_for_state("RUNNING")
    np_job_a.wait_for_state("RUNNING")

    # pool allocation is more than reservation
    assert np_job_a.pool.get_reservation("cpu") < np_job_a.pool.get_allocation(
        "cpu")

    # Create another preemptible job in respool B
    p_job_b = Job(
        job_file="test_preemptible_job.yaml",
        pool=respool_b,
        config=IntegrationTestConfig(max_retry_attempts=100, sleep_time_sec=10),
    )
    p_job_b.update_instance_count(6)
    p_job_b.create()

    # p_job_b should remain PENDING since all resources are used by
    # p_job_a and np_job_a
    p_job_b.wait_for_state("PENDING")

    # p_job_a should be preempted and go back to PENDING
    p_job_a.wait_for_state(goal_state="PENDING")

    # np_job_a should keep RUNNING
    np_job_a.wait_for_state("RUNNING")

    def all_tasks_running():
        count = 0
        for t in p_job_b.get_tasks().values():
            if t.state == task.RUNNING:
                count += 1
        return count == 6

    # p_job_b should start running
    p_job_b.wait_for_condition(all_tasks_running)

    # pool A allocation is equal to reservation
    assert np_job_a.pool.get_reservation(
        "cpu") == np_job_a.pool.get_allocation("cpu")

    # pool B allocation is equal to reservation
    assert p_job_b.pool.get_reservation("cpu") == p_job_b.pool.get_allocation(
        "cpu")

    # wait for p_job_b to finish
    p_job_b.wait_for_state("SUCCEEDED")

    # make sure p_job_a starts running again
    p_job_a.wait_for_state("RUNNING")

    kill_jobs([p_job_a, np_job_a, p_job_b])