def test__restart_restart_jobmgr(self, failure_tester, in_place):
    """
    Restart the job-manager leader right after a stateless job restart is
    requested, then verify that the pods of the job were changed.
    """
    job = failure_tester.stateless_job()
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    pods_before = job.query_pods()

    # Request the restart but do not wait for it: the leader is restarted
    # while the workflow is still outstanding.
    job.restart(in_place=in_place)

    # Capture the current leader so the leadership change can be awaited.
    previous_leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert previous_leader
    restart_result = failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    assert 0 != restart_result
    failure_tester.wait_for_leader_change(
        failure_tester.jobmgr, previous_leader
    )

    # Reset the client after the leader change and point the job at it.
    failure_tester.reset_client()
    job.client = failure_tester.client

    job.wait_for_workflow_state(goal_state="SUCCEEDED")
    job.wait_for_all_pods_running()
    pods_after = job.query_pods()

    assert_pod_id_changed(pods_before, pods_after)
def test__create_update_stopped_job(stateless_job, in_place):
    """
    Update a stopped job, start it, and verify the result.

    The job is created, stopped, updated with a spec that changes the pod
    spec and adds instances (batch size 1), and then started. Once the update
    has SUCCEEDED and the job is RUNNING, the test checks the instance count
    (3 before, 5 after), that pod ids and the instance-0 spec changed, and
    that every pod is in POD_STATE_RUNNING.

    :param stateless_job: job fixture under test
    :param in_place: forwarded to ``update.create``
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    old_pod_infos = stateless_job.query_pods()
    old_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")

    update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
        batch_size=1,
    )
    update.create(in_place=in_place)
    # The job is started explicitly after the update has been created.
    stateless_job.start()
    update.wait_for_state(goal_state="SUCCEEDED")
    stateless_job.wait_for_state(goal_state="RUNNING")

    new_pod_infos = stateless_job.query_pods()
    new_instance_zero_spec = stateless_job.get_pod(0).get_pod_spec()

    assert len(old_pod_infos) == 3
    assert len(new_pod_infos) == 5
    assert_pod_id_changed(old_pod_infos, new_pod_infos)
    assert_pod_spec_changed(old_instance_zero_spec, new_instance_zero_spec)

    # NOTE(review): this loop used to branch on
    # `pod_info.spec.pod_name.value in new_pod_infos`, i.e. a pod name tested
    # against a list of pod-info objects. That membership test can never
    # match, so the POD_STATE_KILLED branch was unreachable and every pod was
    # already being checked for RUNNING. Since the job is started above, that
    # is kept as the explicit expectation; confirm this is the intended
    # contract for pre-existing instances.
    for pod_info in new_pod_infos:
        assert pod_info.status.state == pod_pb2.POD_STATE_RUNNING
def test__create_update_restart_jobmgr(self, failure_tester, in_place):
    """
    Restart the job-manager leader right after a stateless job update is
    created, then verify that the tasks in the job are changed.
    """
    job = failure_tester.stateless_job()
    job.create()
    job.wait_for_state(goal_state="RUNNING")
    pods_before = job.query_pods()
    spec_before = job.get_pod(0).get_pod_spec()

    rolling_update = failure_tester.stateless_update(
        job,
        updated_job_file="test_update_stateless_job_update_and_add_instances_spec.yaml",
        batch_size=1,
    )
    rolling_update.create(in_place=in_place)

    # Capture the current leader so the leadership change can be awaited.
    previous_leader = failure_tester.fw.get_leader_info(failure_tester.jobmgr)
    assert previous_leader
    restart_result = failure_tester.fw.restart(failure_tester.jobmgr, "leader")
    assert 0 != restart_result
    failure_tester.wait_for_leader_change(
        failure_tester.jobmgr, previous_leader
    )

    # Reset the client after the leader change and point the job at it.
    failure_tester.reset_client()
    job.client = failure_tester.client

    rolling_update.wait_for_state(goal_state="SUCCEEDED")
    pods_after = job.query_pods()
    spec_after = job.get_pod(0).get_pod_spec()

    # The test expects 3 instances before the update and 5 after it.
    assert len(pods_before) == 3
    assert len(pods_after) == 5
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
def test__restart_restart_jobmgr(stateless_job, jobmgr, in_place):
    """
    Restart the job manager immediately after requesting a job restart and
    verify that the restart workflow still succeeds and the pods change.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()

    # The job restart is not awaited before the job manager is restarted.
    stateless_job.restart(in_place=in_place)
    jobmgr.restart()

    stateless_job.wait_for_workflow_state(goal_state="SUCCEEDED")
    stateless_job.wait_for_all_pods_running()
    pods_after = stateless_job.query_pods()

    assert_pod_id_changed(pods_before, pods_after)
def test__restart_pods(canary_job):
    """
    Restart the canary job and verify that its pods were replaced.

    The job is not created here; the test only queries, restarts and
    re-queries the job it is given.
    """
    pods_before = canary_job.query_pods()

    # TODO add back batch size (restart(batch_size=1)) after API update in
    # peloton client
    canary_job.restart()
    canary_job.wait_for_workflow_state(goal_state="SUCCEEDED")
    canary_job.wait_for_all_pods_running()

    pods_after = canary_job.query_pods()

    # restart should kill and start already running instances
    assert_pod_id_changed(pods_before, pods_after)
def test__restart_running_job(stateless_job, in_place):
    """
    Restart a running job and verify that its pods were replaced.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()

    stateless_job.restart(in_place=in_place)
    stateless_job.wait_for_workflow_state(goal_state="SUCCEEDED")
    stateless_job.wait_for_all_pods_running()

    pods_after = stateless_job.query_pods()

    # restart should kill and start already running instances
    assert_pod_id_changed(pods_before, pods_after)
def test__start_killed_job(stateless_job):
    """
    Stop a running job, start it again, and verify that the pods changed.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()

    # Bring the job down to KILLED before starting it again.
    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")

    stateless_job.start()
    stateless_job.wait_for_all_pods_running()
    pods_after = stateless_job.query_pods()

    assert_pod_id_changed(pods_before, pods_after)
def test__restart_restart_jobmgr(stateless_job, jobmgr):
    """
    Restart the job manager immediately after requesting a job restart and
    verify that the restart workflow still succeeds and the pods change.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()

    # The job restart is not awaited before the job manager is restarted.
    stateless_job.restart()
    jobmgr.restart()

    stateless_job.wait_for_workflow_state(goal_state="SUCCEEDED")
    stateless_job.wait_for_all_pods_running()
    pods_after = stateless_job.query_pods()

    assert_pod_id_changed(pods_before, pods_after)
def test__create_update(stateless_job, in_place):
    """
    Run an update on a running job and verify that pod ids and the spec of
    instance 0 changed once the update has SUCCEEDED.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    job_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
    )
    job_update.create(in_place=in_place)
    job_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
def test__restart_running_job_with_batch_size(stateless_job):
    """
    Restart a running job with ``batch_size=1`` and verify that its pods
    were replaced.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()

    stateless_job.restart(batch_size=1)
    stateless_job.wait_for_workflow_state(goal_state="SUCCEEDED")
    stateless_job.wait_for_all_pods_running()

    pods_after = stateless_job.query_pods()

    # restart should kill and start already running instances
    assert_pod_id_changed(pods_before, pods_after)
def test__restart_killed_job_with_batch_size(stateless_job):
    """
    Restart a killed job with ``batch_size=1`` and verify that the pods
    changed compared to the pods queried while the job was KILLED.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")

    # The baseline is taken after the job has reached KILLED.
    pods_before = stateless_job.query_pods()

    stateless_job.restart(batch_size=1)
    stateless_job.wait_for_all_pods_running()
    pods_after = stateless_job.query_pods()

    assert_pod_id_changed(pods_before, pods_after)
def test__restart_killed_job():
    """
    Stop a job built from ``test_stateless_job_spec_k8s.yaml``, restart it
    (not in place), and verify that the pods changed.
    """
    k8s_job = StatelessJob(job_file="test_stateless_job_spec_k8s.yaml")
    k8s_job.create()
    k8s_job.wait_for_state(goal_state="RUNNING")

    # The baseline is taken while the job is still RUNNING.
    pods_before = k8s_job.query_pods()

    k8s_job.stop()
    k8s_job.wait_for_state(goal_state="KILLED")

    k8s_job.restart(in_place=False)
    k8s_job.wait_for_all_pods_running()
    pods_after = k8s_job.query_pods()

    assert_pod_id_changed(pods_before, pods_after)
def test__restart_killed_job_with_batch_size(stateless_job, in_place):
    """
    Restart a killed job and verify that the pods changed compared to the
    pods queried while the job was KILLED.

    Despite the name, no batch size is currently passed to ``restart``.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")

    stateless_job.stop()
    stateless_job.wait_for_state(goal_state="KILLED")

    # The baseline is taken after the job has reached KILLED.
    pods_before = stateless_job.query_pods()

    # TODO add back batch size (restart(batch_size=1)) after API update in
    # peloton client
    stateless_job.restart(in_place=in_place)
    stateless_job.wait_for_all_pods_running()
    pods_after = stateless_job.query_pods()

    assert_pod_id_changed(pods_before, pods_after)
def test__create_update_with_batch_size(stateless_job):
    """
    Run an update with ``batch_size=1`` on a running job and verify that pod
    ids and the spec of instance 0 changed once the update has SUCCEEDED.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    batched_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
        batch_size=1,
    )
    batched_update.create()
    batched_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
def test__create_update_update_and_add_instances(stateless_job):
    """
    Run an update that changes the spec and adds instances, then verify the
    instance count (3 before, 5 after), pod ids, and the instance-0 spec.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    job_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
    )
    job_update.create()
    job_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
def test__pause_resume_initialized_update(stateless_job, in_place):
    """
    Pause an update immediately after creating it, resume it, and verify
    that pod ids and the instance-0 spec changed once it has SUCCEEDED.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    paused_update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
    )
    paused_update.create(in_place=in_place)

    # immediately pause the update, so the update may still be INITIALIZED
    paused_update.pause()
    paused_update.wait_for_state(goal_state="PAUSED")

    paused_update.resume()
    paused_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
def test__create_update_update_and_add_instances_with_batch(
    stateless_job, in_place
):
    """
    Run an update with ``batch_size=1`` that changes the spec and adds
    instances, then verify the instance count (3 before, 5 after), pod ids,
    and the instance-0 spec.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    batched_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
        batch_size=1,
    )
    batched_update.create(in_place=in_place)
    batched_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
def test__pause_resume__update(stateless_job, in_place):
    """
    Pause an update shortly after creating it, resume it, and verify that
    pod ids and the instance-0 spec changed once it has SUCCEEDED.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    rolling_update = StatelessUpdate(
        stateless_job,
        batch_size=1,
        updated_job_file=UPDATE_STATELESS_JOB_SPEC,
    )
    rolling_update.create(in_place=in_place)

    # sleep for 1 sec so update can begin to roll forward
    time.sleep(1)

    rolling_update.pause()
    rolling_update.wait_for_state(goal_state="PAUSED")

    rolling_update.resume()
    rolling_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
def test__create_update_update_start_paused(stateless_job):
    """
    Create an update with ``start_paused=True``, resume it from PAUSED, and
    verify the instance count (3 before, 5 after), pod ids, and the
    instance-0 spec.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    paused_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
        start_paused=True,
    )
    paused_update.create()
    # The update is expected to reach PAUSED without an explicit pause call.
    paused_update.wait_for_state(goal_state="PAUSED")

    paused_update.resume()
    paused_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)
def test__create_multiple_consecutive_updates(stateless_job):
    """
    Create two updates back to back, wait only for the second one, and
    verify the instance count (3 before, 5 after), pod ids, and the
    instance-0 spec.
    """
    stateless_job.create()
    stateless_job.wait_for_state(goal_state="RUNNING")
    pods_before = stateless_job.query_pods()
    spec_before = stateless_job.get_pod(0).get_pod_spec()

    # The first update is created but never waited on.
    first_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_ADD_INSTANCES_SPEC,
    )
    first_update.create()

    second_update = StatelessUpdate(
        stateless_job,
        updated_job_file=UPDATE_STATELESS_JOB_UPDATE_AND_ADD_INSTANCES_SPEC,
        batch_size=1,
    )
    second_update.create()
    second_update.wait_for_state(goal_state="SUCCEEDED")

    pods_after = stateless_job.query_pods()
    spec_after = stateless_job.get_pod(0).get_pod_spec()

    assert len(pods_before) == 3
    assert len(pods_after) == 5
    assert_pod_id_changed(pods_before, pods_after)
    assert_pod_spec_changed(spec_before, spec_after)