def test__stop_start_partial_tests_with_multiple_ranges(test_job):
    """Stop then restart instances 0 and 1 via two overlapping instance
    ranges, then stop the whole job and verify it reaches KILLED."""
    test_job.create()
    test_job.wait_for_state(goal_state='RUNNING')

    # Build ranges [0, 1) and [1, 2); 'from' is a reserved keyword in
    # Python, so the protobuf field must be set via setattr.
    first_range = task_pb2.InstanceRange(to=1)
    setattr(first_range, 'from', 0)
    second_range = task_pb2.InstanceRange(to=2)
    setattr(second_range, 'from', 1)

    def both_instances_killed():
        return all(
            test_job.get_task(i).state_str == 'KILLED' for i in (0, 1))

    test_job.stop(ranges=[first_range, second_range])
    test_job.wait_for_condition(both_instances_killed)

    def both_instances_running():
        return all(
            test_job.get_task(i).state_str == 'RUNNING' for i in (0, 1))

    test_job.start(ranges=[first_range, second_range])
    test_job.wait_for_condition(both_instances_running)

    test_job.stop()
    test_job.wait_for_state(goal_state='KILLED')
def test__stop_start_partial_tests_with_multiple_ranges(long_running_job):
    """Stop then restart instances 0 and 1 of a long-running job using two
    overlapping instance ranges, then stop the job and wait for KILLED."""
    long_running_job.create()
    long_running_job.wait_for_state(goal_state="RUNNING")

    # Ranges [0, 1) and [1, 2); 'from' is a reserved keyword, hence setattr.
    first_range = task_pb2.InstanceRange(to=1)
    setattr(first_range, "from", 0)
    second_range = task_pb2.InstanceRange(to=2)
    setattr(second_range, "from", 1)

    def both_instances_killed():
        return all(
            long_running_job.get_task(i).state_str == "KILLED"
            for i in (0, 1))

    long_running_job.stop(ranges=[first_range, second_range])
    long_running_job.wait_for_condition(both_instances_killed)

    def both_instances_running():
        return all(
            long_running_job.get_task(i).state_str == "RUNNING"
            for i in (0, 1))

    long_running_job.start(ranges=[first_range, second_range])
    long_running_job.wait_for_condition(both_instances_running)

    long_running_job.stop()
    long_running_job.wait_for_state(goal_state="KILLED")
def stop_task(self, job_id, instance_id):
    """Stop a single task instance of a job via the task service.

    :param job_id: id of the job
    :param instance_id: instance id of the task to stop
    :type job_id: str
    :type instance_id: int
    :rtype: task.StopResponse
    :raises: re-raises any exception from the Stop RPC after logging it
    """
    # Build a one-instance range [instance_id, instance_id + 1).
    rng = task.InstanceRange(to=instance_id + 1)
    # 'from' is a reserved keyword in Python, so it must be set via setattr.
    setattr(rng, "from", instance_id)
    request = task.StopRequest(
        jobId=peloton.JobID(value=job_id),
        ranges=[rng]
    )
    try:
        print_okblue("Stopping task %d of Job %s" % (instance_id, job_id))
        resp = self.client.task_svc.Stop(
            request,
            metadata=self.client.jobmgr_metadata,
            timeout=default_timeout,
        )
        return resp
    except Exception as e:
        # Log the failure for the CLI user, then propagate to the caller.
        print_fail("Exception calling Stop Tasks :%s" % str(e))
        raise
def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts(
        test_job, mesos_master, jobmgr):
    """Stop then start instance 0 while the mesos master is down and the job
    manager restarts in between; verify the instance reaches the expected
    state once the master is back, then stop the whole job.
    """
    test_job.create()
    test_job.wait_for_state(goal_state='RUNNING')

    # Range [0, 1); 'from' is a reserved keyword, so use setattr.
    # Renamed from `range` to avoid shadowing the builtin.
    instance_range = task_pb2.InstanceRange(to=1)
    setattr(instance_range, 'from', 0)

    def wait_for_instance_to_stop():
        return test_job.get_task(0).state_str == 'KILLED'

    mesos_master.stop()
    test_job.stop(ranges=[instance_range])
    jobmgr.restart()
    mesos_master.start()
    test_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return test_job.get_task(0).state_str == 'RUNNING'

    mesos_master.stop()
    test_job.start(ranges=[instance_range])
    jobmgr.restart()
    mesos_master.start()
    test_job.wait_for_condition(wait_for_instance_to_run)

    mesos_master.stop()
    test_job.stop()
    jobmgr.restart()
    mesos_master.start()
    test_job.wait_for_state(goal_state='KILLED')
def test__update_reduce_instances_stopped_tasks(stateless_job, in_place):
    """Grow a 3-instance job to 5, stop the 2 added instances, then shrink
    back to 3 via an update, asserting the pod count at each step."""
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    assert len(stateless_job.query_pods()) == 3

    # first increase instances
    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="SUCCEEDED")
    assert len(stateless_job.query_pods()) == 5

    # now stop last 2 tasks: range [3, 5); 'from' is a reserved keyword.
    stop_range = task_pb2.InstanceRange(to=5)
    setattr(stop_range, "from", 3)
    stateless_job.stop(ranges=[stop_range])

    # now reduce instance count
    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_REDUCE_INSTANCES_SPEC,
    )
    update.create(in_place=in_place)
    update.wait_for_state(goal_state="SUCCEEDED")
    assert len(stateless_job.query_pods()) == 3
def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts(
        long_running_job, mesos_master, jobmgr):
    """Stop then start instance 0 of a long-running job while the mesos
    master is down and the job manager restarts in between; verify the
    instance reaches the expected state once the master is back, then stop
    the whole job and wait for it to terminate.
    """
    long_running_job.create()
    long_running_job.wait_for_state(goal_state="RUNNING")

    # Range [0, 1); 'from' is a reserved keyword, so use setattr.
    # Renamed from `range` to avoid shadowing the builtin.
    instance_range = task_pb2.InstanceRange(to=1)
    setattr(instance_range, "from", 0)

    def wait_for_instance_to_stop():
        return long_running_job.get_task(0).state_str == "KILLED"

    mesos_master.stop()
    long_running_job.stop(ranges=[instance_range])
    jobmgr.restart()
    mesos_master.start()
    long_running_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return long_running_job.get_task(0).state_str == "RUNNING"

    mesos_master.stop()
    long_running_job.start(ranges=[instance_range])
    jobmgr.restart()
    mesos_master.start()
    long_running_job.wait_for_condition(wait_for_instance_to_run)

    mesos_master.stop()
    long_running_job.stop()
    jobmgr.restart()
    mesos_master.start()
    long_running_job.wait_for_terminated()
def test__stop_start_tasks_when_mesos_master_down_kills_tasks_when_started(
        test_job, mesos_master):
    """Stop then start instance 0 while the mesos master is down; verify the
    instance reaches the expected state once the master is started again,
    then stop the whole job and wait for it to terminate.
    """
    test_job.create()
    test_job.wait_for_state(goal_state="RUNNING")

    # Range [0, 1); 'from' is a reserved keyword, so use setattr.
    # Renamed from `range` to avoid shadowing the builtin.
    instance_range = task_pb2.InstanceRange(to=1)
    setattr(instance_range, "from", 0)

    def wait_for_instance_to_stop():
        return test_job.get_task(0).state_str == "KILLED"

    mesos_master.stop()
    test_job.stop(ranges=[instance_range])
    mesos_master.start()
    test_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return test_job.get_task(0).state_str == "RUNNING"

    mesos_master.stop()
    test_job.start(ranges=[instance_range])
    mesos_master.start()
    test_job.wait_for_condition(wait_for_instance_to_run)

    mesos_master.stop()
    test_job.stop()
    mesos_master.start()
    test_job.wait_for_terminated()
def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts_stateless_job(
        self, failure_tester):
    """
    Start and stop some tasks in a stateless job while mesos master is not
    running and job manager restarts, verify that those tasks start and stop
    as expected
    """
    # Step 1: Start a stateless job.
    stateless_job = failure_tester.stateless_job()
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # Range [0, 1); 'from' is a reserved keyword, so use setattr.
    # Renamed from `range` to avoid shadowing the builtin.
    instance_range = task_pb2.InstanceRange(to=1)
    setattr(instance_range, "from", 0)

    # Step 2: Stop a subset of job instances when mesos master is down.
    assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
    stateless_job.stop(ranges=[instance_range])

    # Step 3: Restart job manager.
    leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert leader
    assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
    failure_tester.reset_client()
    stateless_job.client = failure_tester.client

    # Step 4: Start mesos master and wait for the instances to be stopped.
    assert 0 != failure_tester.fw.start(failure_tester.mesos_master)

    def wait_for_instance_to_stop():
        return stateless_job.get_task(0).state_str == "KILLED"

    stateless_job.wait_for_condition(wait_for_instance_to_stop)

    # Step 5: Start the same subset of instances when mesos master is down.
    assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
    stateless_job.start(ranges=[instance_range])

    # Step 6: Restart job manager.
    leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert leader
    assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader)
    failure_tester.reset_client()
    stateless_job.client = failure_tester.client

    # Step 7: Start mesos master and wait for the instances to transit to RUNNING.
    assert 0 != failure_tester.fw.start(failure_tester.mesos_master)

    def wait_for_instance_to_run():
        return stateless_job.get_task(0).state_str == "RUNNING"

    stateless_job.wait_for_condition(wait_for_instance_to_run)
def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts_batch_job(
        self, failure_tester):
    """
    Start and stop some tasks in a job while mesos master is not running
    and job manager restarts, verify that those tasks start and stop as
    expected
    """
    # Step 1: start the job.
    # NOTE(review): unlike the sibling stateless test, this does not wait
    # for the job to reach RUNNING before stopping instances — presumably
    # intentional; confirm before changing.
    long_running_job = failure_tester.job(job_file="long_running_job.yaml")
    long_running_job.create()

    # Range [0, 1); 'from' is a reserved keyword, so use setattr.
    # Renamed from `range` to avoid shadowing the builtin.
    instance_range = task_pb2.InstanceRange(to=1)
    setattr(instance_range, "from", 0)

    # Step 2: stop some tasks in the job while mesos master is not running,
    # restart the jobmgr leader, then bring the master back.
    assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
    long_running_job.stop(ranges=[instance_range])
    leader1 = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert leader1
    assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader1)
    failure_tester.reset_client()
    long_running_job.client = failure_tester.client
    assert 0 != failure_tester.fw.start(failure_tester.mesos_master)

    def wait_for_instance_to_stop():
        return long_running_job.get_task(0).state_str == "KILLED"

    long_running_job.wait_for_condition(wait_for_instance_to_stop)

    # Step 3: start the same tasks that were stopped while mesos master is
    # not running, restart the jobmgr leader, then bring the master back.
    assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
    long_running_job.start(ranges=[instance_range])
    leader2 = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert leader2
    assert 0 != failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    failure_tester.wait_for_leader_change(failure_tester.jobmgr, leader2)
    failure_tester.reset_client()
    long_running_job.client = failure_tester.client
    assert 0 != failure_tester.fw.start(failure_tester.mesos_master)

    def wait_for_instance_to_run():
        return long_running_job.get_task(0).state_str == "RUNNING"

    long_running_job.wait_for_condition(wait_for_instance_to_run)
def test__stop_start_tasks_when_mesos_master_down_and_jobmgr_restarts(
        stateless_job, mesos_master, jobmgr):
    """
    1. Create stateless job.
    2. Wait for job state RUNNING.
    3. Stop a subset of job instances when mesos master is down.
    4. Restart job manager.
    5. Start mesos master and wait for the instances to be stopped.
    6. Start the same subset of instances when mesos master is down.
    7. Restart job manager.
    8. Start mesos master and wait for the instances to transit to RUNNING.
    9. Stop the job when mesos master is down.
    10. Restart job manager.
    11. Start mesos master and wait for the job to terminate
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # Range [0, 1); 'from' is a reserved keyword, so use setattr.
    # Renamed from `range` to avoid shadowing the builtin.
    instance_range = task_pb2.InstanceRange(to=1)
    setattr(instance_range, "from", 0)

    def wait_for_instance_to_stop():
        return stateless_job.get_task(0).state_str == "KILLED"

    mesos_master.stop()
    stateless_job.stop(ranges=[instance_range])
    jobmgr.restart()
    mesos_master.start()
    stateless_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return stateless_job.get_task(0).state_str == "RUNNING"

    mesos_master.stop()
    stateless_job.start(ranges=[instance_range])
    jobmgr.restart()
    mesos_master.start()
    stateless_job.wait_for_condition(wait_for_instance_to_run)

    mesos_master.stop()
    stateless_job.stop()
    jobmgr.restart()
    mesos_master.start()
    stateless_job.wait_for_terminated()
def test__stop_start_pod_on_sla_violated_job(stateless_job):
    """
    1. Create a stateless job(instance_count=5) with host-limit-1 constraint
       and MaximumUnavailableInstances=1. Since there are only 3 UP hosts, 2
       of the instances will not get placed (hence unavailable).
    2. Kill one of the running instances (say i). Instance should get killed.
    3. Start instance i. Instance i should transit to PENDING (due to host
       limit 1 constraint, instance won't get placed).
    """
    job_spec_dump = load_test_config('test_stateless_job_spec_sla.yaml')
    json_format.ParseDict(job_spec_dump, stateless_job.job_spec)
    stateless_job.job_spec.instance_count = 5
    stateless_job.create()
    stateless_job.wait_for_all_pods_running(num_pods=3)

    # Pick any instance that actually got placed and is running.
    test_instance = None
    for i in range(stateless_job.job_spec.instance_count):
        if stateless_job.get_pod_status(i).state == pod_pb2.POD_STATE_RUNNING:
            test_instance = i
            break
    # Was `assert not test_instance == None` plus a bare py2-only debug
    # `print` statement; use the identity check (0 is a valid instance id).
    assert test_instance is not None

    # Single-instance range; 'from' is a reserved keyword, so use setattr.
    instance_range = task_pb2.InstanceRange(to=test_instance + 1)
    setattr(instance_range, "from", test_instance)

    stateless_job.stop(ranges=[instance_range])

    def instance_killed():
        return stateless_job.get_pod_status(
            test_instance).state == pod_pb2.POD_STATE_KILLED

    stateless_job.wait_for_condition(instance_killed)

    stateless_job.start(ranges=[instance_range])

    def instance_pending():
        return stateless_job.get_pod_status(
            test_instance).state == pod_pb2.POD_STATE_PENDING

    stateless_job.wait_for_condition(instance_pending)
def test__stop_start_partial_tests_with_single_range(stateless_job):
    """Stop then start instance 0 via a single instance range, then stop the
    whole job and verify it reaches KILLED.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    # Range [0, 1); 'from' is a reserved keyword, so use setattr.
    # Renamed from `range` to avoid shadowing the builtin.
    instance_range = task_pb2.InstanceRange(to=1)
    setattr(instance_range, "from", 0)

    def wait_for_instance_to_stop():
        return stateless_job.get_task(0).state_str == "KILLED"

    stateless_job.stop(ranges=[instance_range])
    stateless_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return stateless_job.get_task(0).state_str == "RUNNING"

    stateless_job.start(ranges=[instance_range])
    stateless_job.wait_for_condition(wait_for_instance_to_run)

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")
def test__stop_start_partial_tests_with_single_range(test_job):
    """Stop then start instance 0 via a single instance range, then stop the
    whole job and verify it reaches KILLED.
    """
    test_job.create()
    test_job.wait_for_state(goal_state='RUNNING')

    # Range [0, 1); 'from' is a reserved keyword, so use setattr.
    # Renamed from `range` to avoid shadowing the builtin.
    instance_range = task_pb2.InstanceRange(to=1)
    setattr(instance_range, 'from', 0)

    def wait_for_instance_to_stop():
        return test_job.get_task(0).state_str == 'KILLED'

    test_job.stop(ranges=[instance_range])
    test_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return test_job.get_task(0).state_str == 'RUNNING'

    test_job.start(ranges=[instance_range])
    test_job.wait_for_condition(wait_for_instance_to_run)

    test_job.stop()
    test_job.wait_for_state(goal_state='KILLED')
def test__stop_start_tasks_when_mesos_master_down_kills_tasks_when_started(
        self, failure_tester):
    """
    Start Mesos master after creating a long running job and stopping the
    job and verify that the job is still running
    1. Create stateless job.
    2. Wait for job state RUNNING.
    3. Stop a subset of job instances when mesos master is down.
    4. Start mesos master and wait for the instances to be stopped.
    5. Start the same subset of instances when mesos master is down.
    6. Start mesos master and wait for the instances to transit to RUNNING.
    7. Stop the job when mesos master is down.
    8. Start mesos master and wait for the job to terminate
    """
    long_running_job = failure_tester.job(job_file="long_running_job.yaml")
    long_running_job.create()
    long_running_job.wait_for_state(goal_state="RUNNING")

    # Range [0, 1); 'from' is a reserved keyword, so use setattr.
    # Renamed from `range` to avoid shadowing the builtin.
    instance_range = task_pb2.InstanceRange(to=1)
    setattr(instance_range, "from", 0)

    def wait_for_instance_to_stop():
        return long_running_job.get_task(0).state_str == "KILLED"

    assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
    long_running_job.stop(ranges=[instance_range])
    assert 0 != failure_tester.fw.start(failure_tester.mesos_master)
    long_running_job.wait_for_condition(wait_for_instance_to_stop)

    def wait_for_instance_to_run():
        return long_running_job.get_task(0).state_str == "RUNNING"

    assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
    long_running_job.start(ranges=[instance_range])
    assert 0 != failure_tester.fw.start(failure_tester.mesos_master)
    long_running_job.wait_for_condition(wait_for_instance_to_run)

    assert 0 != failure_tester.fw.stop(failure_tester.mesos_master)
    long_running_job.stop()
    assert 0 != failure_tester.fw.start(failure_tester.mesos_master)
    long_running_job.wait_for_terminated()
def _get_range(self):
    """Return a one-element list with the InstanceRange covering exactly
    self.instance_id, i.e. [instance_id, instance_id + 1)."""
    inst_range = task.InstanceRange(to=self.instance_id + 1)
    # 'from' a reserved keyword so we have to do this
    setattr(inst_range, 'from', self.instance_id)
    return [inst_range]