def test_controller_task_limit():
    """Check that a resource pool's controller limit gates admission.

    Scenario:
      1. Start a first controller job; it is expected to consume the whole
         controller limit of the pool.
      2. Submit a second controller job and verify it stays PENDING.
      3. Stop the first job and verify the second one reaches RUNNING.
    """
    controller_job_file = 'test_controller_job.yaml'
    limited_pool_file = 'test_respool_controller_limit.yaml'

    # The first job takes up the entire controller limit.
    first = Job(
        job_file=controller_job_file,
        config=IntegrationTestConfig(pool_file=limited_pool_file),
    )
    first.create()
    first.wait_for_state(goal_state='RUNNING')

    # With the limit exhausted, the second job must not be admitted.
    second = Job(
        job_file=controller_job_file,
        config=IntegrationTestConfig(pool_file=limited_pool_file),
    )
    second.create()

    # Leave the second job a few seconds in which it could (wrongly) be
    # admitted before checking that it is still PENDING.
    time.sleep(5)
    second.wait_for_state(goal_state='PENDING')

    # Releasing the first job frees the controller limit ...
    first.stop()
    first.wait_for_state(goal_state='KILLED')

    # ... so the second job is now expected to run.
    second.wait_for_state(goal_state='RUNNING')

    kill_jobs([second])
def test_non_preemptible_job(respool_a):
    """Check admission of non-preemptible jobs against the pool reservation.

    A first non-preemptible job fills the CPU reservation of the pool, so a
    second non-preemptible job has to wait while a preemptible job can still
    be admitted. Once the first job is stopped, the second non-preemptible
    job is expected to start running.

    Args:
        respool_a: resource pool fixture in which all three jobs are created.
    """
    non_preemptible_file = 'test_non_preemptible_job.yaml'

    # First non-preemptible job: expected to use the whole CPU reservation.
    first_np = Job(
        job_file=non_preemptible_file,
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    first_np.create()
    first_np.wait_for_state(goal_state='RUNNING')

    # The pool's CPU allocation must now match its reservation exactly.
    pool_info = first_np.pool.pool_info()
    reserved_cpu = get_reservation('cpu', pool_info)
    allocated_cpu = get_allocation('cpu', pool_info)
    assert reserved_cpu == allocated_cpu

    # Second non-preemptible job: no reservation is left, so it must not be
    # admitted and stays PENDING.
    second_np = Job(
        job_file=non_preemptible_file,
        pool=respool_a,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=5,
        ),
    )
    second_np.create()
    second_np.wait_for_state(goal_state='PENDING')

    # A preemptible job in the same pool is still expected to run.
    preemptible = Job(
        job_file='test_job.yaml',
        pool=respool_a,
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    preemptible.create()
    preemptible.wait_for_state(goal_state='RUNNING')

    # Stop the first non-preemptible job to release the reservation.
    first_np.stop()
    first_np.wait_for_state(goal_state='KILLED')

    # The second non-preemptible job should now be admitted and run.
    second_np.wait_for_state(goal_state='RUNNING')

    kill_jobs([second_np, preemptible])
def test_update_job_increase_instances():
    """Check that updating a job's config increases its instance count.

    The job is first expected to reach 3 running/completed tasks; after the
    update with 'long_running_job_update_instances.yaml' it is expected to
    reach 4.
    """
    job = Job(
        job_file='long_running_job.yaml',
        config=IntegrationTestConfig(max_retry_attempts=100),
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    def instances_reach(expected_count):
        """Build a poll condition that is true at `expected_count` tasks."""
        def tasks_count():
            # States 8 and 9 are the raw task-state values the original
            # test counts as "running/completed" (see the message below).
            count = sum(
                1 for t in job.get_tasks().values() if t.state in (8, 9)
            )
            print("total instances running/completed: %d" % count)
            return count == expected_count
        return tasks_count

    # Before the update the job is expected to have 3 tasks.
    job.wait_for_condition(instances_reach(3))

    # Apply the new job config.
    job.update(new_job_file='long_running_job_update_instances.yaml')

    # After the update the number of tasks should grow to 4.
    job.wait_for_condition(instances_reach(4))
    job.wait_for_state(goal_state='RUNNING')

    kill_jobs([job])
def test__create_a_stateless_job_with_3_tasks_on_3_different_hosts():
    """Check that a 3-instance stateless job is spread over 3 hosts.

    The job is labelled with `job.name` and given a constraint built by
    `_label_constraint` from that same label (presumably an anti-affinity
    constraint; its definition is outside this block).
    """
    label_key = "job.name"
    label_value = "peloton_stateless_job"

    config = IntegrationTestConfig(max_retry_attempts=100)
    options = [
        with_labels({label_key: label_value}),
        with_constraint(_label_constraint(label_key, label_value)),
        with_instance_count(3),
    ]
    job = Job(
        job_file='test_stateless_job.yaml',
        config=config,
        options=options,
    )
    job.create()
    job.wait_for_state(goal_state='RUNNING')

    # Collect the distinct hosts the tasks were placed on.
    hosts = {t.get_info().runtime.host for t in job.get_tasks().values()}

    # Clean up before asserting so the job is killed even when the check
    # below fails.
    kill_jobs([job])

    # Every task must have landed on its own host.
    assert len(hosts) == 3
def test_job_succeeds_if_controller_task_succeeds():
    """Check that the controller task alone decides the job's final state.

    In the job file used here only the controller task is meant to succeed
    while the other tasks fail; the job as a whole must still end up
    SUCCEEDED.
    """
    controller_job = Job(job_file='test_job_succecced_controller_task.yaml')
    controller_job.create()

    # The job's terminal state should follow the controller task.
    controller_job.wait_for_state(goal_state='SUCCEEDED')

    kill_jobs([controller_job])
def test__preemption_task_level(respool_a, respool_b):
    """Check which individual tasks are preempted when a second job arrives.

    A 12-task job runs in `respool_a`. Creating a job in `respool_b` is
    expected to trigger preemption of the odd instance ids of the first job
    while the even instance ids keep running.

    Args:
        respool_a: resource pool fixture hosting the job that gets preempted.
        respool_b: resource pool fixture hosting the job that triggers it.
    """
    p_job_a = Job(
        job_file='test_preemptible_job_preemption_override.yaml',
        pool=respool_a,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=10,
        ),
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state='RUNNING')

    # All 12 tasks should be in state 8 (the raw value the original test
    # uses for "running") before preemption is triggered.
    def all_running():
        for t in p_job_a.get_tasks().values():
            if t.state != 8:
                return False
        return True

    p_job_a.wait_for_condition(all_running)

    # Odd instance ids are expected to be preempted, even ones to survive.
    expected_preempted_tasks = set(range(1, 12, 2))
    expected_running_tasks = set(range(0, 12, 2))

    preempted_task_set = set()
    running_task_set = set()

    # 6 tasks (6 CPUs worth) of p_job_a should get preempted.
    def task_preempted():
        # Start from a clean slate on every poll so that only the latest
        # observation is kept.
        preempted_task_set.clear()
        running_task_set.clear()

        tasks = list(p_job_a.get_tasks().values())
        # Preempted tasks show up as KILLED (the original test attributes
        # this to killOnPreempt being set to true).
        killed_ids = [t.instance_id for t in tasks if t.state == task.KILLED]
        running_ids = [
            t.instance_id for t in tasks if t.state == task.RUNNING
        ]

        preempted_task_set.update(killed_ids)
        running_task_set.update(running_ids)
        return len(running_ids) == 6 and len(killed_ids) == 6

    p_job_b = Job(
        job_file='test_preemptible_job.yaml',
        pool=respool_b,
        config=IntegrationTestConfig(),
    )
    # Creating the second job should change the entitlement calculation and
    # start preempting tasks from p_job_a.
    p_job_b.create()

    # Wait until 6 tasks of p_job_a made room for p_job_b.
    p_job_a.wait_for_condition(task_preempted)

    # Exactly the odd instance ids must be gone, the even ones still running.
    assert preempted_task_set == expected_preempted_tasks
    assert running_task_set == expected_running_tasks

    # Finally p_job_b itself has to start running.
    p_job_b.wait_for_state('RUNNING')

    kill_jobs([p_job_a, p_job_b])
def test__create_a_batch_job_and_restart_jobmgr_completes_jobs(jobmgr):
    """Check that a batch job survives a job manager restart.

    Args:
        jobmgr: job manager fixture; only its `restart()` method is used.
    """
    config = IntegrationTestConfig(max_retry_attempts=100)
    job = Job(job_file='test_job_no_container.yaml', config=config)
    job.create()

    # Restart right after creation. That will leave some fraction of the
    # tasks unallocated and another fraction initialized.
    jobmgr.restart()

    # Wait for the default goal state of wait_for_state (not visible here;
    # judging by the test name it is the job's completion).
    job.wait_for_state()

    kill_jobs([job])
def test__preemption_spark_goalstate(respool_a, respool_b):
    """Check state and goal state of tasks preempted under the job's policy.

    A 12-task job runs in `respool_a`. Creating a job in `respool_b` is
    expected to preempt 6 of its tasks; each preempted task must then be in
    state KILLED with goal state PREEMPTING.

    Args:
        respool_a: resource pool fixture hosting the job that gets preempted.
        respool_b: resource pool fixture hosting the job that triggers it.
    """
    p_job_a = Job(
        job_file='test_preemptible_job_preemption_policy.yaml',
        pool=respool_a,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=10,
        ),
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state='RUNNING')

    # we should have all 12 tasks in running state (raw state value 8)
    def all_running():
        return all(t.state == 8 for t in p_job_a.get_tasks().values())

    p_job_a.wait_for_condition(all_running)

    # Tasks observed as preempted during the most recent poll.
    preempted_tasks = []

    # 6 (6 CPUs worth) tasks from p_job_a should be preempted
    def task_preempted():
        # Rebuild the snapshot on every poll. The condition is retried until
        # it holds, and keeping entries from earlier unsuccessful polls would
        # make the final checks run on stale/duplicate task objects.
        del preempted_tasks[:]
        preempted_tasks.extend(
            t for t in p_job_a.get_tasks().values()
            # preempted tasks are expected to show up as KILLED
            if t.state == task.KILLED
        )
        return len(preempted_tasks) == 6

    p_job_b = Job(
        job_file='test_preemptible_job.yaml',
        pool=respool_b,
        config=IntegrationTestConfig(),
    )
    # starting the second job should change the entitlement calculation
    p_job_b.create()

    # 6 tasks should be preempted from p_job_a to make space for p_job_b
    p_job_a.wait_for_condition(task_preempted)

    # check the runtime info of every preempted task
    for t in preempted_tasks:
        assert t.state == task.KILLED
        assert t.goal_state == task.PREEMPTING

    kill_jobs([p_job_a, p_job_b])
def test__preemption_tasks_reschedules_task(respool_a, respool_b):
    """Check that preempted tasks are rescheduled (go back to PENDING).

    A 12-task job runs in `respool_a`. Creating a job in `respool_b` is
    expected to preempt 6 of its tasks, which should then be seen in the
    PENDING state. The second job must run to completion.

    Args:
        respool_a: resource pool fixture hosting the job that gets preempted.
        respool_b: resource pool fixture hosting the job that triggers it.
    """
    preemptible_file = 'test_preemptible_job.yaml'

    p_job_a = Job(
        job_file=preemptible_file,
        pool=respool_a,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=5,
        ),
    )
    p_job_a.create()
    p_job_a.wait_for_state(goal_state='RUNNING')

    # All 12 tasks should be in state 8 (the raw value the original test
    # uses for "running") before preemption is triggered.
    def all_running():
        for t in p_job_a.get_tasks().values():
            if t.state != 8:
                return False
        return True

    p_job_a.wait_for_condition(all_running)

    # 6 tasks (6 CPUs worth) of p_job_a should get preempted.
    def task_preempted():
        # Preempted tasks should be enqueued back by the job manager and,
        # once enqueued, transition to PENDING.
        pending = [
            t for t in p_job_a.get_tasks().values()
            if t.state == task.PENDING
        ]
        return len(pending) == 6

    p_job_b = Job(
        job_file=preemptible_file,
        pool=respool_b,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=10,
        ),
    )
    # Creating the second job should change the entitlement calculation.
    p_job_b.create()

    # Wait until 6 tasks of p_job_a made room for p_job_b.
    p_job_a.wait_for_condition(task_preempted)

    # The second job is expected to finish successfully.
    p_job_b.wait_for_state(goal_state='SUCCEEDED')

    kill_jobs([p_job_a, p_job_b])
def test_controller_task_limit_executor_can_run():
    """Check that the controller limit does not block non-controller jobs.

    Scenario:
      1. Start a first controller job; it is expected to consume the whole
         controller limit of the pool.
      2. Submit a second controller job and verify it stays PENDING.
      3. Submit a regular (non-controller) job in the same pool and verify
         that it succeeds.
    """
    controller_job_file = 'test_controller_job.yaml'
    limited_pool_file = 'test_respool_controller_limit.yaml'

    # The first controller job takes up the entire controller limit.
    cjob1 = Job(
        job_file=controller_job_file,
        config=IntegrationTestConfig(pool_file=limited_pool_file),
    )
    cjob1.create()
    cjob1.wait_for_state(goal_state='RUNNING')

    # With the limit exhausted, the second controller job must not run.
    cjob2 = Job(
        job_file=controller_job_file,
        config=IntegrationTestConfig(pool_file=limited_pool_file),
    )
    cjob2.create()

    # Leave the second job a few seconds in which it could (wrongly) be
    # admitted before checking that it is still PENDING.
    time.sleep(5)
    cjob2.wait_for_state(goal_state='PENDING')

    # A normal executor job in the same pool is not subject to the limit.
    executor_job = Job(
        job_file='test_job.yaml',
        config=IntegrationTestConfig(pool_file=limited_pool_file),
    )
    executor_job.create()

    # It must be able to run and finish.
    executor_job.wait_for_state(goal_state='SUCCEEDED')

    kill_jobs([cjob1, cjob2])
def test__preemption_non_preemptible_task(respool_a, respool_b):
    """Check that only preemptible tasks are preempted from a pool.

    `respool_a` runs one preemptible and one non-preemptible job with 6
    instances each. A job created in `respool_b` is expected to push the
    preemptible job back to PENDING while the non-preemptible job keeps
    running. Once the job in `respool_b` has succeeded, the preempted job is
    expected to run again.

    Args:
        respool_a: resource pool fixture hosting both initial jobs.
        respool_b: resource pool fixture hosting the job that triggers
            preemption.
    """
    # Create 2 Jobs : 1 preemptible and 1 non-preemptible in respool A
    p_job_a = Job(
        job_file='test_preemptible_job.yaml',
        pool=respool_a,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=10,
        ),
    )
    p_job_a.update_instance_count(6)

    np_job_a = Job(
        job_file='test_preemptible_job.yaml',
        pool=respool_a,
        config=IntegrationTestConfig(),
    )
    # Same job file as above, turned non-preemptible through its SLA.
    np_job_a.job_config.sla.preemptible = False
    np_job_a.update_instance_count(6)

    # preemptible job takes 6 CPUs
    p_job_a.create()

    # non preemptible job takes 6 reserved CPUs
    np_job_a.create()

    p_job_a.wait_for_state('RUNNING')
    np_job_a.wait_for_state('RUNNING')

    # pool allocation is more than reservation
    pool_info = np_job_a.pool.pool_info()
    assert get_reservation('cpu', pool_info) < get_allocation('cpu', pool_info)

    # Create another job in respool B
    p_job_b = Job(
        job_file='test_preemptible_job.yaml',
        pool=respool_b,
        config=IntegrationTestConfig(
            max_retry_attempts=100,
            sleep_time_sec=10,
        ),
    )
    p_job_b.update_instance_count(6)
    p_job_b.create()

    # p_job_b should remain PENDING since all resources are used by
    # p_job_a
    p_job_b.wait_for_state('PENDING')

    # p_job_a should be preempted and go back to PENDING
    p_job_a.wait_for_state(goal_state='PENDING')

    # np_job_a should keep RUNNING
    np_job_a.wait_for_state('RUNNING')

    def all_tasks_running_in(job):
        """Build a poll condition: all 6 tasks of `job` are RUNNING."""
        def all_tasks_running():
            running = [
                t for t in job.get_tasks().values()
                if t.state == task.RUNNING
            ]
            return len(running) == 6
        return all_tasks_running

    # None of np_job_a's tasks may have been preempted. Polling goes through
    # p_job_b (as before) so its retry configuration keeps applying.
    p_job_b.wait_for_condition(all_tasks_running_in(np_job_a))

    # p_job_b should start running. Previously this step inspected
    # np_job_a's tasks, so it never actually waited for p_job_b and the
    # pool B allocation check below could race with task placement.
    p_job_b.wait_for_condition(all_tasks_running_in(p_job_b))

    # pool A allocation is equal to reservation
    pool_info = np_job_a.pool.pool_info()
    assert get_reservation('cpu', pool_info) == \
        get_allocation('cpu', pool_info)

    # pool B allocation is equal to reservation
    pool_info = p_job_b.pool.pool_info()
    assert get_reservation('cpu', pool_info) == \
        get_allocation('cpu', pool_info)

    # wait for p_job_b to finish
    p_job_b.wait_for_state('SUCCEEDED')

    # make sure p_job_a starts running
    p_job_a.wait_for_state('RUNNING')

    kill_jobs([p_job_a, np_job_a, p_job_b])
def test__create_2_stateless_jobs_with_task_to_task_anti_affinity_between_jobs():  # noqa
    """Check that two single-instance stateless jobs land on different hosts.

    Each job is labelled `job.name=peloton_stateless_job<i>` and receives an
    AND constraint made of two label constraints: one on its own label and
    one on the label of the other job.
    """
    label_key = "job.name"

    def _label_rule(label_value):
        # The numeric values (type/kind/condition/requirement) are passed
        # through exactly as the test always used them; their symbolic names
        # live in the protobuf definitions, which are not visible here.
        return task_pb2.Constraint(
            type=1,
            labelConstraint=task_pb2.LabelConstraint(
                kind=1,
                condition=2,
                requirement=0,
                label=peloton_pb2.Label(key=label_key, value=label_value),
            ),
        )

    jobs = []
    for i in range(2):
        own_label = "peloton_stateless_job%s" % i
        other_label = "peloton_stateless_job%s" % ((i + 1) % 2)

        config = IntegrationTestConfig(max_retry_attempts=100)
        options = [
            with_labels({label_key: own_label}),
            with_job_name('TestPelotonDockerJob_Stateless' + repr(i)),
            with_instance_count(1),
        ]
        job = Job(
            job_file='test_stateless_job.yaml',
            config=config,
            options=options,
        )

        rules = [
            _label_rule(own_label),    # tasks of my own job
            _label_rule(other_label),  # avoid tasks of the other job
        ]
        combined = task_pb2.Constraint(
            type=2,
            andConstraint=task_pb2.AndConstraint(constraints=rules),
        )
        job.job_config.defaultConfig.constraint.CopyFrom(combined)
        jobs.append(job)

    # Submit the jobs one after the other, pausing a second after each.
    for job in jobs:
        job.create()
        time.sleep(1)

    # Gather the hosts on which the tasks of both jobs were placed.
    hosts = set()
    for job in jobs:
        job.wait_for_state(goal_state='RUNNING')
        hosts.update(
            t.get_info().runtime.host for t in job.get_tasks().values()
        )

    # Clean up before asserting so the jobs are killed even when the check
    # below fails.
    kill_jobs(jobs)

    # The two tasks must be running on 2 different hosts.
    assert len(hosts) == 2