def test__simple_update_with_no_diff(client):
    """
    Roll a job out once, then submit two follow-up updates that are expected
    to have no impact: each must still record update events, but must not
    report any instance events.
    """
    # Initial rollout: both update events and instance events are recorded.
    started = client.start_job_update(
        get_job_update_request('test_dc_labrat_large_job.yaml'),
        'start job update test/dc/labrat_large_job')
    wait_for_rolled_forward(client, started.key)
    detail = client.get_job_update_details(
        None, api.JobUpdateQuery(key=started.key)).detailsList[0]
    assert len(detail.updateEvents) > 0
    assert len(detail.instanceEvents) > 0

    # Follow-up updates which will yield no impact, as (config, message).
    no_impact_updates = (
        ('test_dc_labrat_large_job_diff_executor.yaml',
         'start job update test/dc/labrat_large_job_diff_executor'),
        ('test_dc_labrat_large_job.yaml',
         'start job update test/dc/labrat_large_job'),
    )
    for config_file, message in no_impact_updates:
        started = client.start_job_update(
            get_job_update_request(config_file), message)
        wait_for_rolled_forward(client, started.key)
        detail = client.get_job_update_details(
            None, api.JobUpdateQuery(key=started.key)).detailsList[0]
        assert len(detail.updateEvents) > 0
        assert detail.instanceEvents is None
def test__get_job_update_details__deleted_job(client):
    """
    Check that JobMgr's private API QueryJobCache (used by
    getJobUpdateDetails) does not crash when the job was deleted: start an
    update, force delete the job, start a second update and query the
    update details by role.
    """
    # First update; the key it returns is not needed afterwards.
    first_req = get_job_update_request("test_dc_labrat_large_job.yaml")
    first_req.settings.updateGroupSize = 10
    start_job_update(
        client, first_req, "start job update test/dc/labrat_large_job")

    # Force delete the job.
    delete_jobs()

    # Second update, issued after the deletion.
    second_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    second_req.settings.updateGroupSize = 10
    job_key = start_job_update(
        client, second_req, "start job update test/dc/labrat_large_job")

    # Exactly one update is expected in the getJobUpdateDetails response.
    role_query = api.JobUpdateQuery(role=job_key.role)
    response = client.get_job_update_details(None, role_query)
    assert len(response.detailsList) == 1
def test__abort_auto_rollback_and_update(client):
    """
    Scenario:
      1. Create a job.
      2. Roll out a bad config and wait until auto-rollback kicks in.
      3. Abort the update while it is rolling back.
      4. Roll out a good config; every instance must converge to it.
    """
    start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )

    # Add some wait time for lucene index to build
    time.sleep(10)

    bad_update = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_bad_config.yaml"),
        "rollout bad config",
    )

    # Abort as soon as the auto-rollback has started.
    wait_for_auto_rolling_back(client, bad_update.key)
    client.abort_job_update(bad_update.key, "abort update")
    wait_for_update_status(
        client,
        bad_update.key,
        {api.JobUpdateStatus.ROLLING_BACK},
        api.JobUpdateStatus.ABORTED,
    )

    new_config = get_job_update_request(
        "test_dc_labrat_large_job_new_config.yaml"
    )
    good_update = client.start_job_update(new_config, "rollout good config")

    # Sleep for a while so that update gets triggered.
    time.sleep(5)
    wait_for_rolled_forward(client, good_update.key)

    running = client.get_tasks_without_configs(
        api.TaskQuery(
            jobKeys={good_update.key.job},
            statuses={api.ScheduleStatus.RUNNING},
        )
    )
    assert len(running.tasks) == 10

    # Every running task carries exactly the metadata entry of the new
    # config and has an ancestor id set.
    for task in running.tasks:
        assert len(task.assignedTask.task.metadata) == 1
        actual = list(task.assignedTask.task.metadata)[0]
        wanted = list(new_config.taskConfig.metadata)[0]
        assert actual.key == wanted.key
        assert actual.value == wanted.value
        assert task.ancestorId
def test__simple_update_tasks_reconcile(client, hostmgr, mesos_master):
    """
    Start an update, then restart host manager and mesos master three times
    in a row, to make sure mesos tasks are reconciled correctly and the
    update still rolls forward with 10 running tasks.
    """
    update = client.start_job_update(
        get_job_update_request('test_dc_labrat_large_job.yaml'),
        'start job update test/dc/labrat_large_job')

    # wait for sometime for jobmgr goal state engine to kick-in
    time.sleep(random.randint(1, 10))

    # Three identical rounds: hostmgr, short random pause, mesos master.
    for _ in range(3):
        hostmgr.restart()
        time.sleep(random.randint(1, 5))
        mesos_master.restart()

    wait_for_rolled_forward(client, update.key)

    running_query = api.TaskQuery(
        jobKeys={update.key.job},
        statuses={api.ScheduleStatus.RUNNING})
    running = client.get_tasks_without_configs(running_query)
    assert len(running.tasks) == 10
def test__simple_update_with_restart_component(client,
                                               jobmgr,
                                               resmgr,
                                               hostmgr,
                                               mesos_master):
    """
    Start an update and, while it is in flight, restart jobmgr, resmgr,
    hostmgr and mesos master one after another with a random pause before
    each restart. The update must still roll forward with 10 running tasks.
    """
    update = client.start_job_update(
        get_job_update_request('test_dc_labrat_large_job.yaml'),
        'start job update test/dc/labrat_large_job')

    # Restart order and the reason for the pause preceding each restart:
    #   jobmgr       - let the jobmgr goal state engine kick in first
    #   resmgr       - let gangs get enqueued; restart clears admission/queues
    #   hostmgr      - let the host lock be acquired; restart clears the
    #                  host `placing` lock
    #   mesos_master - restart jumbles up host manager state
    for component in (jobmgr, resmgr, hostmgr, mesos_master):
        time.sleep(random.randint(1, 10))
        component.restart()

    wait_for_rolled_forward(client, update.key)

    running_query = api.TaskQuery(
        jobKeys={update.key.job},
        statuses={api.ScheduleStatus.RUNNING})
    running = client.get_tasks_without_configs(running_query)
    assert len(running.tasks) == 10
def test__simple_auto_rolled_back(client):
    """
    Create a job, roll out a bad config and verify that, once the update is
    rolled back, the running tasks carry the resources of the previous
    config again.
    """
    start_job_update(client,
                     'test_dc_labrat.yaml',
                     'start job update test/dc/labrat')

    # Add some wait time for lucene index to build
    time.sleep(10)

    bad_update = client.start_job_update(
        get_job_update_request('test_dc_labrat_bad_config.yaml'),
        'rollout bad config')
    wait_for_rolled_back(client, bad_update.key)

    # validate job is rolled back to previous config
    running = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={bad_update.key.job},
                      statuses={api.ScheduleStatus.RUNNING}))
    tasks = running.tasks
    assert len(tasks) == 3

    for task in tasks:
        for resource in task.assignedTask.task.resources:
            # Each resource entry is expected to set exactly one of
            # cpu / ram / disk; check whichever one is positive.
            if resource.numCpus > 0:
                assert resource.numCpus == 0.25
                continue
            if resource.ramMb > 0:
                assert resource.ramMb == 128
                continue
            if resource.diskMb > 0:
                assert resource.diskMb == 128
                continue
            assert False, 'unexpected resource {}'.format(resource)
def test__roll_forward_paused_update_abort(client):
    """
    Start an update, pause it, then abort the paused update and wait for it
    to reach ABORTED.
    """
    status = api.JobUpdateStatus

    started = client.start_job_update(
        get_job_update_request("test_dc_labrat.yaml"), "create job")
    key = started.key

    # Sleep for sometime to let Peloton transition workflow state
    # from INITIALIZED -> ROLLING_FORWARD or let it be as-is
    time.sleep(5)

    client.pause_job_update(key, "pause update")
    wait_for_update_status(
        client,
        key,
        {status.ROLLING_FORWARD},
        status.ROLL_FORWARD_PAUSED,
    )

    client.abort_job_update(key, "abort update")
    wait_for_update_status(
        client,
        key,
        {status.ROLLING_FORWARD, status.ROLL_FORWARD_PAUSED},
        status.ABORTED,
    )
def test__job_create_manual_rollback(client):
    """
    Start a job update and trigger a manual rollback half-way through.
    Update events and instance events must be present both before the
    rollback and after the update is rolled back.
    """
    started = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    job_update_key = started.key
    job_key = started.key.job

    def assert_events_recorded():
        # The first entry of the details list must carry both event kinds.
        details = client.get_job_update_details(
            None, api.JobUpdateQuery(jobKey=job_key))
        assert len(details.detailsList[0].updateEvents) > 0
        assert len(details.detailsList[0].instanceEvents) > 0

    # wait for few instances running
    time.sleep(5)
    assert_events_recorded()

    # rollback update
    client.rollback_job_update(job_update_key)
    wait_for_rolled_back(client, job_update_key)
    assert_events_recorded()
def test__deploy_on_aborted_update(client):
    """
    Deploy an update and abort it half-way, then deploy again. Instances
    that were already created by the aborted update must not be restarted,
    which is checked through the absence of an ancestor id on every task.
    """
    first = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )

    # Few instances start
    time.sleep(5)

    client.abort_job_update(first.key, "abort update")
    wait_for_update_status(
        client,
        first.key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ABORTED,
    )

    # Not all instances were created
    partial = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={first.key.job})
    )
    assert len(partial.tasks) < 10

    # deploy same update, should impact remaining instances
    second = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_executor.yaml"),
        "start job update test/dc/labrat_large_job_diff_executor",
    )
    wait_for_rolled_forward(client, second.key)

    # All instances are created
    complete = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={second.key.job})
    )
    assert len(complete.tasks) == 10

    # No instances should have ancestor id, thereby validating
    # instances created in previous update are not restarted/redeployed
    for created_task in complete.tasks:
        assert created_task.ancestorId is None
def test__simple_update_with_diff(client):
    """
    test simple update use case where second update has config
    change, here all the instances will move to new config.

    Steps:
      1. Roll out the base config; update and instance events are expected.
      2. Roll out a config with changed labels; update and instance events
         are expected and every running task must have an ancestor id.
      3. Roll out the changed-labels config again; update events are
         expected but no instance events.
    """
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList[0].updateEvents) > 0
    assert len(res.detailsList[0].instanceEvents) > 0

    # Do update with labels changed
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    job_key = res.key.job
    wait_for_rolled_forward(client, res.key)
    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList[0].updateEvents) > 0
    assert len(res.detailsList[0].instanceEvents) > 0

    # All running tasks were replaced, so each one must have an ancestor.
    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    for task in res.tasks:
        assert task.ancestorId is not None

    # Do the same update again, which should not touch any instance.
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList[0].updateEvents) > 0
    # test__simple_update_with_no_diff asserts `instanceEvents is None` for
    # the same no-impact situation, so calling len() here could raise
    # TypeError. Accept both None and an empty list.
    assert not res.detailsList[0].instanceEvents
def test__failed_update(client):
    """
    Roll out a bad config and wait for the update to reach FAILED.
    """
    bad_request = get_job_update_request('test_dc_labrat_bad_config.yaml')
    started = client.start_job_update(bad_request, 'rollout bad config')

    wait_for_update_status(
        client,
        started.key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.FAILED)
def test__simple_update_tasks_reconcile(self, failure_tester):
    """
    Start an update, then restart the host manager leader and the mesos
    master three times in a row, to make sure mesos tasks are reconciled
    correctly and the update still rolls forward with 10 running tasks.
    """
    update = failure_tester.aurorabridge_client.start_job_update(
        get_job_update_request('test_dc_labrat_large_job.yaml'),
        'start job update test/dc/labrat_large_job')

    # wait for sometime for jobmgr goal state engine to kick-in
    time.sleep(random.randint(1, 10))

    # Three identical restart rounds.
    for _ in range(3):
        # Restart the current hostmgr leader and wait until it has changed.
        leader = failure_tester.fw.get_leader_info(failure_tester.hostmgr)
        assert leader
        assert 0 != failure_tester.fw.restart(failure_tester.hostmgr, "leader")
        failure_tester.wait_for_leader_change(failure_tester.hostmgr, leader)
        failure_tester.reset_client()

        time.sleep(random.randint(1, 5))

        assert 0 != failure_tester.fw.restart(failure_tester.mesos_master)

    # Sleep to help the cluster to stabilize
    time.sleep(10)

    # The client is read from failure_tester on every use because
    # reset_client() was called in between.
    wait_for_rolled_forward(failure_tester.aurorabridge_client, update.key)

    running = failure_tester.aurorabridge_client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={update.key.job},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(running.tasks) == 10
def test__start_job_update_with_pulse(client):
    """
    Start a pulsed update, check that it is awaiting a pulse right after
    being started, send one pulse and wait for it to reach ROLLED_FORWARD.
    """
    status = api.JobUpdateStatus

    pulsed_request = get_job_update_request('test_dc_labrat_pulsed.yaml')
    started = client.start_job_update(
        pulsed_request, 'start pulsed job update test/dc/labrat')

    current = get_update_status(client, started.key)
    assert current == status.ROLL_FORWARD_AWAITING_PULSE

    client.pulse_job_update(started.key)
    wait_for_update_status(
        client,
        started.key,
        {status.ROLL_FORWARD_AWAITING_PULSE, status.ROLLING_FORWARD},
        status.ROLLED_FORWARD)
def test__simple_update_with_restart_component(self, failure_tester):
    """
    Start an update and, while it is in flight, restart the leaders of
    jobmgr, resmgr and hostmgr followed by the mesos master. The update
    must still roll forward with 10 running tasks.
    """
    update = failure_tester.aurorabridge_client.start_job_update(
        get_job_update_request('test_dc_labrat_large_job.yaml'),
        'start job update test/dc/labrat_large_job')

    # Leader restarts, in order. The random pause before each one is meant:
    #   jobmgr  - to let the jobmgr goal state engine kick in
    #   resmgr  - to let gangs get enqueued (restart clears any admission
    #             and queues)
    #   hostmgr - to let the host lock be acquired (restart clears the host
    #             `placing` lock)
    for component_name in ("jobmgr", "resmgr", "hostmgr"):
        time.sleep(random.randint(1, 10))

        # Look the component up after the pause, on every round.
        component = getattr(failure_tester, component_name)
        leader = failure_tester.fw.get_leader_info(component)
        assert leader
        assert 0 != failure_tester.fw.restart(component, "leader")
        failure_tester.wait_for_leader_change(component, leader)
        failure_tester.reset_client()

    time.sleep(random.randint(1, 10))

    # restart mesos master to jumble up host manager state
    assert 0 != failure_tester.fw.restart(failure_tester.mesos_master)

    # Sleep to help the cluster to stabilize
    time.sleep(10)

    # The client is read from failure_tester on every use because
    # reset_client() was called in between.
    wait_for_rolled_forward(
        failure_tester.aurorabridge_client, update.key, timeout_secs=200)

    running = failure_tester.aurorabridge_client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={update.key.job},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(running.tasks) == 10
def test__override_rolling_forward_update_with_diff(client):
    """
    Start an update and immediately start a second one with a config
    change. The second update must roll forward, and the last event of the
    other reported update must be ABORTED.
    """
    # First update; it is overridden right away, so its result is not kept.
    client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )

    override = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml"),
        "start job update test/dc/labrat_large_job_diff_labels",
    )
    wait_for_rolled_forward(client, override.key)

    job_query = api.JobUpdateQuery(jobKey=override.key.job)
    details = client.get_job_update_details(None, job_query).detailsList
    assert len(details) == 2

    # Previous rolling_forward update is aborted
    last_event = details[1].updateEvents[-1]
    assert last_event.status == api.JobUpdateStatus.ABORTED
def test__get_job_update_details__filter_non_update_workflow(client):
    """
    Check that getJobUpdateDetails filters non-update workflows: run an
    update, a restart issued through the peloton api, and another update,
    then expect exactly two entries in the response.
    """
    first_req = get_job_update_request("test_dc_labrat_large_job.yaml")
    first_req.settings.updateGroupSize = 10

    second_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    second_req.settings.updateGroupSize = 10

    # start a regular update
    job_key = start_job_update(
        client, first_req, "start job update test/dc/labrat_large_job")

    # trigger an unexpected restart through peloton api
    jobs = list_jobs()
    assert len(jobs) == 1
    stateless_job = StatelessJob(job_id=jobs[0].job_id.value)
    stateless_job.restart(batch_size=10)
    # wait for restart
    stateless_job.wait_for_workflow_state(goal_state="SUCCEEDED")

    # start a new update
    start_job_update(
        client, second_req, "start job update test/dc/labrat_large_job")

    # verify getJobUpdateDetails response
    response = client.get_job_update_details(
        None, api.JobUpdateQuery(role=job_key.role))
    details = response.detailsList
    assert len(details) == 2

    # The first entry must list a non-empty initial state whose tasks all
    # carry metadata.
    assert len(details[0].update.instructions.initialState) > 0
    for initial in details[0].update.instructions.initialState:
        assert initial.task.metadata, 'Expect metadata to be present'

    # Every other entry must have an empty initial state.
    for other in details[1:]:
        assert len(other.update.instructions.initialState) == 0
def test__rolling_forward_abort(client):
    """
    Start an update, abort it shortly afterwards and wait for it to reach
    ABORTED.
    """
    started = client.start_job_update(
        get_job_update_request('test_dc_labrat.yaml'),
        'create job')
    key = started.key

    # Sleep for sometime to let Peloton transition workflow state
    # from INITIALIZED -> ROLLING_FORWARD or let it be as-is
    time.sleep(5)

    client.abort_job_update(key, 'abort update')
    wait_for_update_status(
        client,
        key,
        {api.JobUpdateStatus.ROLLING_FORWARD},
        api.JobUpdateStatus.ABORTED)
def test__pulsed_update_abort(client):
    """
    Start a pulsed update, check that it is awaiting a pulse, then abort it
    and wait for it to reach ABORTED.
    """
    awaiting_pulse = api.JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE

    pulsed_request = get_job_update_request('test_dc_labrat_pulsed.yaml')
    started = client.start_job_update(
        pulsed_request, 'start pulsed job update test/dc/labrat')
    assert get_update_status(client, started.key) == awaiting_pulse

    key = started.key
    client.abort_job_update(key, 'abort update')
    wait_for_update_status(
        client,
        key,
        {awaiting_pulse},
        api.JobUpdateStatus.ABORTED)
def test__host_limit_1(client, hostmgr):
    """
    Verify the host limit 1 constraint (each pod on a different host) in
    three situations:
      - right after the job is created,
      - after an update has completed,
      - after an update during which host manager was restarted.
    """

    def assert_three_running_on_distinct_hosts(key):
        # Exactly 3 RUNNING tasks, all satisfying the host limit 1 check.
        running = client.get_tasks_without_configs(
            api.TaskQuery(jobKeys={key},
                          statuses={api.ScheduleStatus.RUNNING}))
        assert len(running.tasks) == 3
        verify_host_limit_1(running.tasks)

    # Create job.
    job_key = start_job_update(
        client, "test_dc_labrat.yaml", "start job update test/dc/labrat")

    # Add some wait time for lucene index to build
    time.sleep(10)

    # The first check queries tasks without a status filter.
    created = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key}))
    assert len(created.tasks) == 3
    verify_host_limit_1(created.tasks)

    # Start a update with host limit 1 constraint
    job_key = start_job_update(
        client, "test_dc_labrat_1.yaml", "start job update test/dc/labrat_1")
    assert_three_running_on_distinct_hosts(job_key)

    # Start an update, and restart hostmanager before update completes.
    in_flight = client.start_job_update(
        get_job_update_request("test_dc_labrat.yaml"),
        "start job update test/dc/labrat",
    )

    # restart host manager
    hostmgr.restart()
    wait_for_rolled_forward(client, in_flight.key)
    assert_three_running_on_distinct_hosts(job_key)
def test__simple_update_events_purge(client, jobmgr, resmgr):
    """
    Start an update, then restart job manager and resource manager three
    times in a row, to make sure mesos task status update events are purged
    correctly and the update still rolls forward with 10 running tasks.
    """
    restart_rounds = 3

    update = client.start_job_update(
        get_job_update_request('test_dc_labrat_large_job.yaml'),
        'start job update test/dc/labrat_large_job')

    # wait for sometime for jobmgr goal state engine to kick-in
    time.sleep(random.randint(1, 10))

    for round_no in range(restart_rounds):
        jobmgr.restart()
        time.sleep(random.randint(1, 5))
        resmgr.restart()
        # Random pause between rounds only; the last round is followed by
        # the fixed sleep below instead.
        if round_no < restart_rounds - 1:
            time.sleep(random.randint(1, 5))

    # Sleep to ensure lucene index converges
    time.sleep(10)

    wait_for_rolled_forward(client, update.key)

    running = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={update.key.job},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(running.tasks) == 10
def test__job_create_fail_manual_rollback(client):
    """
    Start a job update with a bad config, wait for instance 0 to fail and
    then trigger a manual rollback. Events must be recorded before and
    after the rollback, and no task may be running at the end.
    """
    started = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_bad_config.yaml"),
        "start job update test/dc/labrat_large_job (failed)",
    )
    job_update_key = started.key
    job_key = started.key.job

    def assert_events_recorded():
        # The first entry of the details list must carry both event kinds.
        details = client.get_job_update_details(
            None, api.JobUpdateQuery(jobKey=job_key))
        assert len(details.detailsList[0].updateEvents) > 0
        assert len(details.detailsList[0].instanceEvents) > 0

    # wait for the first instance starting
    time.sleep(5)
    wait_for_failed(client, job_key, instances=[0])
    assert_events_recorded()

    # rollback update
    client.rollback_job_update(job_update_key)
    wait_for_rolled_back(client, job_update_key)
    assert_events_recorded()

    # validate no tasks are running
    running = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(running.tasks) == 0
def test__simple_manual_rollback(client):
    """
    Create a job through a first update, start a second update on it and
    manually roll that one back half-way. After each of the three stages
    the events must be sorted, the first/last update statuses must match
    and the task config must carry the expected labels.
    """
    status = api.JobUpdateStatus

    def check_first_detail(current_job_key, last_status, labels):
        # Verifies the first entry of the update details for the job,
        # then the task config labels.
        details = client.get_job_update_details(
            None, api.JobUpdateQuery(jobKey=current_job_key))
        verify_events_sorted(details.detailsList[0].updateEvents)
        verify_events_sorted(details.detailsList[0].instanceEvents)
        verify_first_and_last_job_update_status(
            details.detailsList[0].updateEvents,
            status.ROLLING_FORWARD,
            last_status,
        )
        verify_task_config(client, current_job_key, labels)

    job_key = start_job_update(
        client,
        "test_dc_labrat_large_job.yaml",
        "start job update test/dc/labrat_large_job",
    )
    check_first_detail(
        job_key,
        status.ROLLED_FORWARD,
        {
            "test_key_1": "test_value_1",
            "test_key_2": "test_value_2",
        },
    )

    # Do update on previously created job
    second = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_labels.yaml"),
        "start job update test/dc/labrat_large_job_diff_labels",
    )
    job_update_key = second.key
    job_key = second.key.job

    # wait for few instances running
    time.sleep(5)

    # Still rolling forward; both the old and the new labels are expected.
    check_first_detail(
        job_key,
        status.ROLLING_FORWARD,
        {
            "test_key_1": "test_value_1",
            "test_key_2": "test_value_2",
            "test_key_11": "test_value_11",
            "test_key_22": "test_value_22",
        },
    )

    # rollback update
    client.rollback_job_update(job_update_key)
    wait_for_rolled_back(client, job_update_key)

    # verify events are sorted ascending, and last update event is
    # ROLLED_BACK; the labels are those of the previous task config again
    check_first_detail(
        job_key,
        status.ROLLED_BACK,
        {
            "test_key_1": "test_value_1",
            "test_key_2": "test_value_2",
        },
    )
def test__abort_auto_rollback_with_pinned_instances_and_update(client):
    """
    1. Create a job.
    2. Start a bad update (version 2) targeting subset of instances.
    3. Wait for the auto-rollback to kick-in.
    4. Once auto-rollback kicks in, abort the update.
    5. Start a new good update and wait for all instances to converge to
       that update.
    """
    # Create a job
    created = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, created.key)
    job_key = created.key.job

    running = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(running.tasks) == 10

    # Freshly created tasks: run id "1" and the two original labels.
    original_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    for task in running.tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"

        assert len(task.assignedTask.task.metadata) == 2
        for label in task.assignedTask.task.metadata:
            assert label.key in original_labels, \
                "unexpected metadata %s" % label
            assert label.value == original_labels[label.key]

    # start a bad update with updateOnlyTheseInstances parameter
    update_instances = [0, 2, 3, 7, 9]
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances
    }
    pinned_req.settings.maxFailedInstances = 4

    bad_update = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )

    # wait for auto-rollback to kick-in
    wait_for_auto_rolling_back(client, bad_update.key, timeout_secs=150)

    # abort the update
    client.abort_job_update(bad_update.key, "abort update")
    wait_for_update_status(
        client,
        bad_update.key,
        {api.JobUpdateStatus.ROLLING_BACK},
        api.JobUpdateStatus.ABORTED,
    )

    # start a new good update
    good_update = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_new_config.yaml"),
        "start job update test/dc/labrat_large_job with a good config",
    )
    wait_for_rolled_forward(client, good_update.key)
    job_key = good_update.key.job

    running = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(running.tasks) == 10

    for task in running.tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)

        # Every task must carry only the label of the good config.
        assert len(task.assignedTask.task.metadata) == 1
        for label in task.assignedTask.task.metadata:
            assert label.key == "test_key_12", \
                "unexpected metadata %s" % label
            assert label.value == "test_value_12"

        if task.assignedTask.instanceId in update_instances:
            # only a few of the pinned instances might have rolled back
            assert run_id in ("3", "4")
        else:
            assert run_id == "2"
def test__auto_rollback_with_pinned_instances__remove_instances(client):
    """
    1. Create a job.
    2. Start a bad update on a subset of instances which also lowers the
       requested instance count (to 8).
    3. The instances should rollback to their previous version.
       No instances should be removed.
    """
    original_labels = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }

    req = get_job_update_request("test_dc_labrat_large_job.yaml")
    created = client.start_job_update(
        req, "start job update test/dc/labrat_large_job")
    wait_for_rolled_forward(client, created.key)
    job_key = created.key.job

    running = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(running.tasks) == 10

    # Freshly created tasks: run id "1" and the two original labels.
    for task in running.tasks:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"

        assert len(task.assignedTask.task.metadata) == 2
        for label in task.assignedTask.task.metadata:
            assert label.key in original_labels, \
                "unexpected metadata %s" % label
            assert label.value == original_labels[label.key]

    # start a update with updateOnlyTheseInstances parameter,
    # and reduce instance count
    update_instances = {2, 3, 4, 5}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances
    }
    pinned_req.instanceCount = 8
    pinned_req.settings.maxFailedInstances = 3
    pinned_req.settings.updateGroupSize = 1

    bad_update = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_back(client, bad_update.key)

    running = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    # All 10 instances are still there after the rollback.
    assert len(running.tasks) == 10

    for task in running.tasks:
        assert len(task.assignedTask.task.metadata) == 2
        for label in task.assignedTask.task.metadata:
            assert label.key in original_labels, \
                "unexpected metadata %s" % label
            assert label.value == original_labels[label.key]

        # Only the pinned instances are expected at run id "3".
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        if task.assignedTask.instanceId in update_instances:
            assert run_id == "3"
        else:
            assert run_id == "1"
def test__auto_rollback_with_pinned_instances__stopped_instances(client):
    """
    1. Create a job (v1).
    2. Start update on the first subset of instances (v2).
    3. Start update on second subset of instances (v3).
    4. Stop some instances.
    5. Start a bad update on a subset consisting of at least
       one instance in each of v1, v2, v3 and stopped
    6. The instances should rollback to their respective previous good
       versions. The stopped instances in the bad update should transit
       to running.
    """
    # Labels expected on the tasks of each job version.
    v1_labels = {"test_key_1": "test_value_1", "test_key_2": "test_value_2"}
    v2_labels = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }
    v3_labels = {"test_key_12": "test_value_12"}

    # range() rather than the Python 2-only xrange(), so the test also
    # runs on Python 3.
    all_instances = set(range(10))

    # Create a job
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))
    assert len(res.tasks) == 10
    for t in res.tasks:
        _, _, run_id = t.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        assert len(t.assignedTask.task.metadata) == 2
        for m in t.assignedTask.task.metadata:
            assert m.key in v1_labels, "unexpected metadata %s" % m
            assert m.value == v1_labels[m.key]

    # start a update on first subset of instances
    update_instances_1 = {4, 5, 6, 7}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances_1
    }
    res = client.start_job_update(
        pinned_req,
        "start job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, res.key)

    # Start another update on the second subset of instances
    update_instances_2 = {8, 9}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_new_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances_2
    }
    res = client.start_job_update(
        pinned_req,
        "start another job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, res.key)

    # Stop some instances
    stop_instances = {5, 8}
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing instance 5, 8 for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)

    # Start a bad update
    bad_update_instances = {0, 5, 6, 9}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in bad_update_instances
    }
    pinned_req.settings.maxFailedInstances = 1
    pinned_req.settings.maxPerInstanceFailures = 1
    pinned_req.settings.updateGroupSize = 2
    res = client.start_job_update(
        pinned_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_back(client, res.key)

    res = client.get_tasks_without_configs(
        api.TaskQuery(jobKeys={job_key},
                      statuses={api.ScheduleStatus.RUNNING}))

    # Running = instances never stopped + stopped instances that were part
    # of the bad update (those are expected to be running again).
    expected_running = (
        len(all_instances - stop_instances) +
        len(bad_update_instances.intersection(stop_instances)))
    assert len(res.tasks) == expected_running

    # Stopped instances the bad update did not touch must stay stopped.
    must_stay_stopped = stop_instances - bad_update_instances

    for t in res.tasks:
        instance_id = t.assignedTask.instanceId

        # Pick the labels of the version this instance was last updated to.
        if instance_id in update_instances_1:
            expected_labels = v2_labels
        elif instance_id in update_instances_2:
            expected_labels = v3_labels
        else:
            expected_labels = v1_labels

        assert len(t.assignedTask.task.metadata) == len(expected_labels)
        for m in t.assignedTask.task.metadata:
            assert m.key in expected_labels, "unexpected metadata %s" % m
            assert m.value == expected_labels[m.key]

        if instance_id in must_stay_stopped:
            assert False, "unexpected start of stopped instance"
def test__auto_rollback_with_pinned_instances(client):
    """
    Check auto-rollback of a bad update that is pinned to some instances.

    1. Create a job and wait for its first update to roll forward.
    2. Start a bad update (version 2) restricted to a subset of instances
       through updateOnlyTheseInstances.
    3. Wait for that update to be rolled back. Only the pinned instances
       are expected to get a new run id, and every running task must
       still carry the original metadata.
    """
    original_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }

    def list_running(key):
        # Tasks of the job that are currently in RUNNING status.
        query = api.TaskQuery(
            jobKeys={key}, statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    def check_original_metadata(task, unexpected_msg):
        # Exactly two entries, each one a known original key/value pair.
        entries = task.assignedTask.task.metadata
        assert len(entries) == 2
        for entry in entries:
            assert entry.key in original_metadata, unexpected_msg % entry
            assert entry.value == original_metadata[entry.key]

    first_update = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, first_update.key)
    job_key = first_update.key.job

    running = list_running(job_key)
    assert len(running) == 10
    for task in running:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        check_original_metadata(task, "unexpected metadata %s")

    # start a bad update with updateOnlyTheseInstances parameter
    pinned_ids = [0, 2, 3, 7, 9]
    bad_req = get_job_update_request(
        "test_dc_labrat_large_job_bad_config.yaml")
    bad_req.settings.updateOnlyTheseInstances = {
        api.Range(first=idx, last=idx) for idx in pinned_ids
    }
    bad_req.settings.updateGroupSize = 5
    bad_req.settings.maxFailedInstances = 3

    bad_update = client.start_job_update(
        bad_req,
        "start a bad update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_back(client, bad_update.key)

    # verify that the run-id of only the pinned instances has changed
    for task in list_running(job_key):
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        expected_run_id = (
            "3" if task.assignedTask.instanceId in pinned_ids else "1")
        assert run_id == expected_run_id
        check_original_metadata(
            task, "unexpected metadata %s for unaffected instances")
def test__update_with_pinned_instances__deploy_stopped_instances_mixed(client):
    """
    test pinned instance deployment with mixed version and instance state

    1. start a regular update (version 1) on all instances
    2. stop subset of instances
    3. start a new update (version 2) targeting subset of instances
       (some of stopped instances included), expect targeted instances
       to be either brought up with newer version or updated with new
       version
    4. start regular update (version 1) again on another set of instances
       (some of previously stopped instances included, some of instances
       updated in previous step included), expect only stopped instances
       and instances affected by the previous step to be either brought
       up or updated
    """
    v1_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    v2_metadata = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }
    affected_msg = "unexpected metadata %s for affected instances"
    unaffected_msg = "unexpected metadata %s for unaffected instances"

    def get_running_tasks(key):
        # Tasks of the job that are currently in RUNNING status.
        return client.get_tasks_without_configs(
            api.TaskQuery(
                jobKeys={key},
                statuses={api.ScheduleStatus.RUNNING})).tasks

    def get_run_id(task):
        # The run id is the last of the "-"-separated fields of the task id.
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        return run_id

    def assert_metadata(task, expected, unexpected_msg):
        # Every metadata entry must be one of the `expected` key/value pairs.
        for m in task.assignedTask.task.metadata:
            assert m.key in expected, unexpected_msg % m
            assert m.value == expected[m.key]

    all_instances = set(range(10))

    # T1: start a regular update
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        assert get_run_id(t) == "1"
        assert len(t.assignedTask.task.metadata) == 2
        assert_metadata(t, v1_metadata, "unexpected metadata %s")

    # T2: stop subset of instances
    stop_instances = {2, 8}
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing instance 2, 8 for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)

    still_running = all_instances - stop_instances
    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(still_running)
    for t in tasks:
        assert t.assignedTask.instanceId in still_running

    # T3: start an update with updateOnlyTheseInstances parameter.
    # Only the instances targeted by updateOnlyTheseInstances are expected
    # to be updated; targeted instances that were stopped are started.
    update_instances = {3, 5, 8}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances
    }

    res = client.start_job_update(
        pinned_req,
        "start second job update test/dc/labrat_large_job with pinned instances and label diff",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in update_instances

    # running instances that the second update did not target
    untouched = all_instances - stop_instances - update_instances
    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(
        (all_instances - stop_instances) | update_instances)
    for t in tasks:
        run_id = get_run_id(t)
        instance_id = t.assignedTask.instanceId
        assert len(t.assignedTask.task.metadata) == 2
        if instance_id in update_instances:
            assert run_id == "2"
            assert_metadata(t, v2_metadata, affected_msg)
        elif instance_id in untouched:
            assert run_id == "1"
            assert_metadata(t, v1_metadata, unaffected_msg)
        else:
            assert False, ("unexpected instance id %s: should be stopped" %
                           instance_id)

    # T4: start the regular update again, targeting a subset of instances.
    # Expect an instance to be started / updated iff it has a different
    # config or it is stopped.
    # NOTE(review): this submits the *_diff_executor.yaml request rather
    # than the yaml used at T1; the assertions below treat it as the same
    # (version 1) config - confirm against the yaml files.
    update_2_instances = {2, 3, 8, 9}
    pinned_req_2 = get_job_update_request(
        "test_dc_labrat_large_job_diff_executor.yaml")
    pinned_req_2.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_2_instances
    }

    res = client.start_job_update(
        pinned_req_2, "start third job update test/dc/labrat_large_job")
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    # exclude instances that were previously running and are still on
    # the first update
    expected_event_ids = update_2_instances - (
        all_instances - update_instances - stop_instances)
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in expected_event_ids

    # Expected instances for each corresponding state:
    #
    #   v1s  - instances on original job config (v1) and stopped
    #   v1r1 - instances on original job config (v1), running with run id 1
    #   v1r2 - instances on original job config (v1), running with run id 2
    #   v1r3 - instances on original job config (v1), running with run id 3
    #   v2r2 - instances on updated job config (v2), running with run id 2
    #
    # How did we calculate the instance ids?
    #
    # Let T1, T2, T3, T4 be each of the four operations, which are
    #   T1 - start original update (v1 job config) for all instances (A)
    #   T2 - stop subset of instances (S)
    #   T3 - start new update (v2 job config) on subset of instances (U1)
    #   T4 - start original update again (v1 job config) on subset of
    #        instances (U2)
    #
    # At T1:
    #   v1r1 = A
    #
    # At T2:
    #   v1s = S
    #   v1r1' = v1r1 - S = A - S
    #
    # At T3:
    #   v1s' = v1s - U1 = S - U1
    #   v2r1 = (empty set)
    #   v2r2 = U1
    #   v1r1'' = A - v2r2 - v1s' = A - U1 - (S - U1)
    #
    # At T4:
    #   v1s'' = v1s' - U2 = S - U1 - U2
    #   v1r2 = U2 & v1s' = U2 & (S - U1)
    #   v1r3 = U1 & U2
    #   v2r2' = v2r2 - U2 = U1 - U2
    #   v1r1''' = A - v1s'' - v1r2 - v1r3 - v2r2'
    v1s = stop_instances - update_instances - update_2_instances
    v1r2 = update_2_instances & (stop_instances - update_instances)
    v1r3 = update_instances & update_2_instances
    v2r2 = update_instances - update_2_instances
    v1r1 = all_instances - v1s - v1r2 - v1r3 - v2r2

    assert not v1s, "should not be any instances remain as stopped"
    assert v1r1, "expect instances to be in version 1 run id 1"
    assert v1r2, "expect instances to be in version 1 run id 2"
    assert v1r3, "expect instances to be in version 1 run id 3"
    assert v2r2, "expect instances to be in version 2 run id 2"

    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        run_id = get_run_id(t)
        instance_id = t.assignedTask.instanceId
        assert len(t.assignedTask.task.metadata) == 2
        if instance_id in v1r1:
            # version 1, run 1: never touched after the first update
            assert run_id == "1"
            assert_metadata(t, v1_metadata, unaffected_msg)
        elif instance_id in v1r2:
            # version 1, run 2
            assert run_id == "2"
            assert_metadata(t, v1_metadata, affected_msg)
        elif instance_id in v1r3:
            # version 1, run 3
            assert run_id == "3"
            assert_metadata(t, v1_metadata, affected_msg)
        elif instance_id in v2r2:
            # version 2, run 2
            assert run_id == "2"
            assert_metadata(t, v2_metadata, affected_msg)
        else:
            assert False, ("unexpected instance id %s" % instance_id)
def test__update_with_pinned_instances__deploy_stopped_instances(client):
    """
    test pinned instance deployment with stop / deploy instances:

    1. start a regular update (version 1) on all instances
    2. stop subset of instances
    3. start a new update (version 2) targeting subset of instances
       (stopped instances included), expect stopped instances to be
       brought up with new version and other targeted instances to be
       updated
    4. start regular update (version 1) again on all instances, expect
       only instances affected by previous step to be updated
    """
    v1_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    v2_metadata = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }
    affected_msg = "unexpected metadata %s for affected instances"
    unaffected_msg = "unexpected metadata %s for unaffected instances"

    def get_running_tasks(key):
        # Tasks of the job that are currently in RUNNING status.
        return client.get_tasks_without_configs(
            api.TaskQuery(
                jobKeys={key},
                statuses={api.ScheduleStatus.RUNNING})).tasks

    def get_run_id(task):
        # The run id is the last of the "-"-separated fields of the task id.
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        return run_id

    def assert_metadata(task, expected, unexpected_msg):
        # Every metadata entry must be one of the `expected` key/value pairs.
        for m in task.assignedTask.task.metadata:
            assert m.key in expected, unexpected_msg % m
            assert m.value == expected[m.key]

    all_instances = set(range(10))

    # start a regular update
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        assert get_run_id(t) == "1"
        assert len(t.assignedTask.task.metadata) == 2
        assert_metadata(t, v1_metadata, "unexpected metadata %s")

    # stop subset of instances
    stop_instances = {2, 8}
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing instance 2, 8 for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)

    still_running = all_instances - stop_instances
    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(still_running)
    for t in tasks:
        assert t.assignedTask.instanceId in still_running

    # start an update with updateOnlyTheseInstances parameter
    # expect stopped instances to be started
    update_instances = {2, 3, 5, 8}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=i, last=i) for i in update_instances
    }

    res = client.start_job_update(
        pinned_req,
        "start second job update test/dc/labrat_large_job with pinned instances and label diff",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in update_instances

    # instances the pinned update did not target
    untouched = all_instances - update_instances
    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        run_id = get_run_id(t)
        instance_id = t.assignedTask.instanceId
        assert len(t.assignedTask.task.metadata) == 2
        if instance_id in update_instances:
            assert run_id == "2"
            assert_metadata(t, v2_metadata, affected_msg)
        elif instance_id in untouched:
            assert run_id == "1"
            assert_metadata(t, v1_metadata, unaffected_msg)
        else:
            assert False, ("unexpected instance id %s" % instance_id)

    # start the regular update again same as the first one
    # expect changes only for instances updated by previous update
    # NOTE(review): this submits the *_diff_executor.yaml request rather
    # than the yaml used for the first update; the assertions below treat
    # it as the same (version 1) config - confirm against the yaml files.
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_executor.yaml"),
        "start third job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in update_instances

    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        run_id = get_run_id(t)
        instance_id = t.assignedTask.instanceId
        assert len(t.assignedTask.task.metadata) == 2
        if instance_id in update_instances:
            # updated twice since the first run: v1 -> v2 -> v1
            assert run_id == "3"
            unexpected_msg = affected_msg
        elif instance_id in untouched:
            assert run_id == "1"
            unexpected_msg = unaffected_msg
        else:
            assert False, ("unexpected instance id %s" % instance_id)

        # every instance is expected to be back on the version 1 metadata
        assert_metadata(t, v1_metadata, unexpected_msg)
def test__update_with_pinned_instances__start_stopped_instances_all(client):
    """
    test deployment with stop / start of all instances:

    1. start a regular update (version 1) on all instances
    2. stop all instances
    3. start a second update on all instances without
       updateOnlyTheseInstances (stopped instances included), expect all
       instances to be started again with run id 2 and version 1 metadata
    4. start regular update (version 1) again on all instances, expect no
       change on all instances

    NOTE(review): step 3 submits the *_diff_executor.yaml request, not the
    yaml from step 1; the assertions treat it as the same (version 1)
    config - confirm against the yaml files.
    """
    v1_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    affected_msg = "unexpected metadata %s for affected instances"

    def get_running_tasks(key):
        # Tasks of the job that are currently in RUNNING status.
        return client.get_tasks_without_configs(
            api.TaskQuery(
                jobKeys={key},
                statuses={api.ScheduleStatus.RUNNING})).tasks

    def get_run_id(task):
        # The run id is the last of the "-"-separated fields of the task id.
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        return run_id

    def assert_v1_metadata(task, unexpected_msg):
        # Every metadata entry must be one of the version 1 key/value pairs.
        for m in task.assignedTask.task.metadata:
            assert m.key in v1_metadata, unexpected_msg % m
            assert m.value == v1_metadata[m.key]

    all_instances = set(range(10))

    # start a regular update
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        assert get_run_id(t) == "1"
        assert len(t.assignedTask.task.metadata) == 2
        assert_v1_metadata(t, "unexpected metadata %s")

    # stop all instances
    stop_instances = set(all_instances)
    client.kill_tasks(
        job_key,
        stop_instances,
        "killing all instances for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stop_instances)

    tasks = get_running_tasks(job_key)
    assert len(tasks) == 0

    # start an update without updateOnlyTheseInstances parameter
    # expect all instances to be started
    update_instances = set(all_instances)
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job_diff_executor.yaml"),
        "start second job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert len(res.detailsList[0].instanceEvents) > 0
    restarted = update_instances & stop_instances
    for ie in res.detailsList[0].instanceEvents:
        assert ie.instanceId in restarted

    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        assert get_run_id(t) == "2"
        assert len(t.assignedTask.task.metadata) == 2
        assert_v1_metadata(t, affected_msg)

    # start the regular update again same as the first one
    # expect no change for all instances
    res = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start third job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, res.key)
    job_key = res.key.job

    res = client.get_job_update_details(None, api.JobUpdateQuery(key=res.key))
    assert len(res.detailsList) == 1
    assert res.detailsList[0].instanceEvents is None

    tasks = get_running_tasks(job_key)
    assert len(tasks) == len(all_instances)
    for t in tasks:
        run_id = get_run_id(t)
        instance_id = t.assignedTask.instanceId
        # Every instance was stopped and then restarted by the second
        # update, so each one must still be on run id 2; there is no
        # "never stopped" group left to check against run id 1.
        assert instance_id in stop_instances, (
            "unexpected instance id %s" % instance_id)
        assert run_id == "2"

        assert len(t.assignedTask.task.metadata) == 2
        assert_v1_metadata(t, affected_msg)
def test__update_with_pinned_instances__stopped_instances(client):
    """
    Pinned-instance update while some other instances are stopped:

    1. roll a regular update (version 1) forward on all instances
    2. stop a subset of the instances
    3. start another update (version 2) pinned to a subset that does not
       include the stopped instances; only the pinned instances are
       expected to be updated, and the stopped ones stay stopped
    """
    original_metadata = {
        "test_key_1": "test_value_1",
        "test_key_2": "test_value_2",
    }
    updated_metadata = {
        "test_key_11": "test_value_11",
        "test_key_22": "test_value_22",
    }

    def list_running(key):
        # Tasks of the job that are currently in RUNNING status.
        query = api.TaskQuery(
            jobKeys={key}, statuses={api.ScheduleStatus.RUNNING})
        return client.get_tasks_without_configs(query).tasks

    def check_metadata(task, expected, unexpected_msg):
        # Exactly two entries, each one a key/value pair from `expected`.
        entries = task.assignedTask.task.metadata
        assert len(entries) == 2
        for entry in entries:
            assert entry.key in expected, unexpected_msg % entry
            assert entry.value == expected[entry.key]

    every_id = set(range(10))

    # start a regular update
    first_update = client.start_job_update(
        get_job_update_request("test_dc_labrat_large_job.yaml"),
        "start job update test/dc/labrat_large_job",
    )
    wait_for_rolled_forward(client, first_update.key)
    job_key = first_update.key.job

    running = list_running(job_key)
    assert len(running) == len(every_id)
    for task in running:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        assert run_id == "1"
        check_metadata(task, original_metadata, "unexpected metadata %s")

    # stop subset of instances
    stopped_ids = {1, 6}
    client.kill_tasks(
        job_key,
        stopped_ids,
        "killing instance 1, 6 for job test/dc/labrat_large_job",
    )
    wait_for_killed(client, job_key, stopped_ids)

    surviving_ids = every_id - stopped_ids
    running = list_running(job_key)
    assert len(running) == len(surviving_ids)
    for task in running:
        assert task.assignedTask.instanceId in surviving_ids

    # start a update with updateOnlyTheseInstances parameter
    pinned_ids = {0, 2, 3, 7, 9}
    pinned_req = get_job_update_request(
        "test_dc_labrat_large_job_diff_labels.yaml")
    pinned_req.settings.updateOnlyTheseInstances = {
        api.Range(first=idx, last=idx) for idx in pinned_ids
    }

    pinned_update = client.start_job_update(
        pinned_req,
        "start job update test/dc/labrat_large_job with pinned instances",
    )
    wait_for_rolled_forward(client, pinned_update.key)
    job_key = pinned_update.key.job

    details = client.get_job_update_details(
        None, api.JobUpdateQuery(key=pinned_update.key))
    assert len(details.detailsList) == 1
    assert len(details.detailsList[0].instanceEvents) > 0
    for event in details.detailsList[0].instanceEvents:
        assert event.instanceId in pinned_ids

    running = list_running(job_key)
    assert len(running) == len(surviving_ids)

    # expect instance 0, 2, 3, 7, 9 to be updated to newer version, with run id 2
    # expect instance 1, 6 remain at stopped
    # expect instance 4, 5, 8 remain at original version, with run id 1
    for task in running:
        _, _, run_id = task.assignedTask.taskId.rsplit("-", 2)
        instance_id = task.assignedTask.instanceId
        if instance_id in pinned_ids:
            assert run_id == "2"
            check_metadata(
                task,
                updated_metadata,
                "unexpected metadata %s for affected instances",
            )
        elif instance_id in surviving_ids:
            assert run_id == "1"
            check_metadata(
                task,
                original_metadata,
                "unexpected metadata %s for unaffected instances",
            )
        else:
            assert False, ("unexpected instance id %s: should be stopped" %
                           instance_id)