def test_port_static_to_dynamic_port():
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE))

    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['BROKER_PORT'] = '0'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)
    # all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    for broker_id in range(config.DEFAULT_BROKER_COUNT):
        result = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, config.SERVICE_NAME,
            'broker get {}'.format(broker_id), json=True)
        assert result['port'] != 9092

    result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints broker', json=True)
    assert len(result['address']) == config.DEFAULT_BROKER_COUNT
    assert len(result['dns']) == config.DEFAULT_BROKER_COUNT

    for port in result['address']:
        assert int(port.split(':')[-1]) != 9092
    for port in result['dns']:
        assert int(port.split(':')[-1]) != 9092
def test_custom_zookeeper():
    broker_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE))

    # create a topic against the default zk:
    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic create {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic list', json=True) == [config.DEFAULT_TOPIC_NAME]

    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    # should be using default path when this envvar is empty/unset:
    assert marathon_config['env']['KAFKA_ZOOKEEPER_URI'] == ''

    # use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's automatically cleaned up in uninstall:
    zk_path = 'master.mesos:2181/{}/CUSTOMPATH'.format(ZK_SERVICE_PATH)
    marathon_config['env']['KAFKA_ZOOKEEPER_URI'] = zk_path
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)
    sdk_plan.wait_for_completed_deployment(FOLDERED_SERVICE_NAME)

    # wait for brokers to finish registering
    test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=FOLDERED_SERVICE_NAME)

    zookeeper = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'endpoints zookeeper')
    assert zookeeper.rstrip('\n') == zk_path

    # topic created earlier against default zk should no longer be present:
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'topic list', json=True) == []
def test_kill_hello_node():
    config.check_running()
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-0')
    sdk_cmd.kill_task_with_pattern('hello', 'hello-0-server.hello-world.mesos')
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello-0', hello_ids)

    config.check_running()
def test_pods_restart_graceful_shutdown():
    install_options_helper(30)

    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0")

    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod restart world-0"
    )
    assert rc == 0, "Pod restart failed"
    jsonobj = json.loads(stdout)
    assert len(jsonobj) == 2
    assert jsonobj["pod"] == "world-0"
    assert len(jsonobj["tasks"]) == 1
    assert jsonobj["tasks"][0] == "world-0-server"

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0", world_ids)
    check_healthy()

    # ensure the SIGTERM was sent via the "all clean" message in the world
    # service's signal trap/handler, BUT not the shell command, indicated
    # by "echo".
    _, stdout, _ = sdk_cmd.run_cli("task log --completed --lines=1000 {}".format(world_ids[0]))
    clean_msg = None
    for s in stdout.split("\n"):
        if s.find("echo") < 0 and s.find("all clean") >= 0:
            clean_msg = s

    assert clean_msg is not None
def test_config_update_then_scheduler_died():
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    host = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)
    config.bump_world_cpus()
    sdk_cmd.kill_task_with_pattern('helloworld.scheduler.Main', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_updates_then_all_executors_killed():
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_hostname_unique():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    options = _escape_placement_for_1_9(
        {
            "service": {"yaml": "marathon_constraint"},
            "hello": {"count": get_num_private_agents(), "placement": '[["hostname", "UNIQUE"]]'},
            "world": {"count": get_num_private_agents(), "placement": '[["hostname", "UNIQUE"]]'},
        }
    )

    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        get_num_private_agents() * 2,
        additional_options=options,
    )

    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    old_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0")
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", old_ids)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(
        config.SERVICE_NAME, get_num_private_agents() * 2 - 1, timeout_seconds=10
    )
    sdk_tasks.check_running(config.SERVICE_NAME, get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
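# The ensure_count_per_agent() helper used above is defined elsewhere in this
# suite. A minimal sketch of what it presumably does, grouping the service's
# tasks by agent host and asserting per-agent counts; the task attribute names
# (`host`, `name`) are assumptions based on their use in nearby tests:
def ensure_count_per_agent(hello_count, world_count):
    per_agent = {}
    for task in sdk_tasks.get_service_tasks(config.SERVICE_NAME):
        counts = per_agent.setdefault(task.host, {"hello": 0, "world": 0})
        if task.name.startswith("hello"):
            counts["hello"] += 1
        elif task.name.startswith("world"):
            counts["world"] += 1
    for host, counts in per_agent.items():
        assert counts["hello"] == hello_count, "Unexpected hello count on {}".format(host)
        assert counts["world"] == world_count, "Unexpected world count on {}".format(host)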
def test_custom_zookeeper():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    broker_ids = sdk_tasks.get_task_ids(foldered_name, '{}-'.format(config.DEFAULT_POD_TYPE))

    # create a topic against the default zk:
    test_utils.create_topic(config.DEFAULT_TOPIC_NAME, service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    # should be using default path when this envvar is empty/unset:
    assert marathon_config['env']['KAFKA_ZOOKEEPER_URI'] == ''

    # use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's automatically cleaned up in uninstall:
    zk_path = 'master.mesos:2181/{}/CUSTOMPATH'.format(sdk_utils.get_zk_path(foldered_name))
    marathon_config['env']['KAFKA_ZOOKEEPER_URI'] = zk_path
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_tasks.check_tasks_updated(foldered_name, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    # wait for brokers to finish registering
    test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=foldered_name)

    zookeeper = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'endpoints zookeeper')
    assert zookeeper.rstrip('\n') == zk_path

    # topic created earlier against default zk should no longer be present:
    topic_list_info = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'topic list', json=True)
    test_utils.assert_topic_lists_are_equal_without_automatic_topics([], topic_list_info)
def test_config_update_then_kill_task_in_node():
    # kill 1 of 2 world tasks
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    config.bump_world_cpus()
    sdk_cmd.kill_task_with_pattern('world', 'world-0-server.{}.mesos'.format(config.SERVICE_NAME))
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_pod_restart():
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0")

    # get current agent id:
    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod info hello-0", print_output=False
    )
    assert rc == 0, "Pod info failed"
    old_agent = json.loads(stdout)[0]["info"]["slaveId"]["value"]

    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod restart hello-0"
    )
    assert rc == 0, "Pod restart failed"
    jsonobj = json.loads(stdout)
    assert len(jsonobj) == 2
    assert jsonobj["pod"] == "hello-0"
    assert len(jsonobj["tasks"]) == 1
    assert jsonobj["tasks"][0] == "hello-0-server"

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", hello_ids)
    check_healthy()

    # check agent didn't move:
    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod info hello-0", print_output=False
    )
    assert rc == 0, "Second pod info failed"
    new_agent = json.loads(stdout)[0]["info"]["slaveId"]["value"]
    assert old_agent == new_agent
def test_modify_app_config_rollback():
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    log.info('expiry ms: ' + str(expiry_ms))
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')

    log.info('old config: ')
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(foldered_name, old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config['env'][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_pods_restart_graceful_shutdown():
    options = {
        "world": {
            "kill_grace_period": 30
        }
    }

    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, config.DEFAULT_TASK_COUNT,
                        additional_options=options)

    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world-0')

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart world-0', json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == 'world-0'
    assert len(jsonobj['tasks']) == 1
    assert jsonobj['tasks'][0] == 'world-0-server'

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0', world_ids)
    config.check_running()

    # ensure the SIGTERM was sent via the "all clean" message in the world
    # service's signal trap/handler, BUT not the shell command, indicated
    # by "echo".
    stdout = sdk_cmd.run_cli("task log --completed --lines=1000 {}".format(world_ids[0]))
    clean_msg = None
    for s in stdout.split('\n'):
        if s.find('echo') < 0 and s.find('all clean') >= 0:
            clean_msg = s

    assert clean_msg is not None
def test_secrets_basic():
    # 1) create Secrets
    # 2) install examples/secrets.yml
    # 3) if secret file is not created, tasks will fail
    # 4) wait till deployment finishes
    # 5) do replace operation
    # 6) ensure all tasks are running
    # 7) delete Secrets
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD,
                        additional_options=secret_options)

    hello_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    world_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0-server")

    # ensure that secrets work after replace
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace world-0')

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-server", hello_tasks_0)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0-server', world_tasks_0)

    # tasks will fail if secret files are not created by mesos module
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # clean up and delete secrets
    delete_secrets("{}/".format(config.SERVICE_NAME))
def test_modify_app_config_rollback(): app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS" journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal") data_ids = sdk_tasks.get_task_ids(foldered_name, "data") old_config = sdk_marathon.get_config(foldered_name) marathon_config = sdk_marathon.get_config(foldered_name) log.info("marathon config: ") log.info(marathon_config) expiry_ms = int(marathon_config["env"][app_config_field]) log.info("expiry ms: " + str(expiry_ms)) marathon_config["env"][app_config_field] = str(expiry_ms + 1) sdk_marathon.update_app(marathon_config, timeout=15 * 60) # Wait for journal nodes to be affected by the change sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids) journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal") log.info("old config: ") log.info(old_config) # Put the old config back (rollback) sdk_marathon.update_app(old_config) # Wait for the journal nodes to return to their old configuration sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids) config.check_healthy(service_name=foldered_name) marathon_config = sdk_marathon.get_config(foldered_name) assert int(marathon_config["env"][app_config_field]) == expiry_ms # Data tasks should not have been affected sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def test_no_change():
    broker_ids = tasks.get_task_ids(SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE))
    plan1 = service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))

    config = marathon.get_config(SERVICE_NAME)
    marathon.update_app(SERVICE_NAME, config)

    plan2 = service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))

    assert plan1 == plan2

    # no config change was made, so no tasks should be restarted:
    try:
        tasks.check_tasks_updated(SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE), broker_ids, timeout_seconds=60)
        assert False, "Should not restart tasks now"
    except AssertionError as arg:
        raise arg
    except:
        pass  # expected timeout: no tasks were updated

    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
    assert plan2['status'] == 'COMPLETE'
    assert plan2['phases'][0]['status'] == 'COMPLETE'
    for step in range(DEFAULT_BROKER_COUNT):
        assert plan2['phases'][0]['steps'][step]['status'] == 'COMPLETE'
def test_config_update_then_kill_all_task_in_node():
    # kill both world tasks
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('world', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_updated_placement_constraints_replaced_tasks_do_move():
    some_agent, other_agent, old_ids = setup_constraint_switch()

    # Replace the task, and verify it moves hosts
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello', old_ids)

    assert get_task_host('hello-0-server') == other_agent
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are
    checked to ensure that their tasks have been restarted.

    Any remaining pods are checked to ensure that their tasks are not changed.

    For example, performing a pod replace kafka-0 on a Kafka framework should
    result in ONLY the kafka-0-broker task being restarted. In this case,
    pods_with_updated_tasks is specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0),
    a rolling restart of other nodes is triggered, and
    pods_with_updated_tasks = ["node-0", "node-1", "node-2"]
    (assuming a three-node Cassandra ring)
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_with_updated_tasks = pods_with_updated_tasks if pods_with_updated_tasks else []
    pods_to_update = set(pods_with_updated_tasks + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))

    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
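# Example invocation, following the docstring's Kafka scenario. This is a
# hedged sketch: the package/service names and timeout below are assumptions,
# not values taken from this suite. Replacing a single broker pod should
# restart only that broker's task, so pods_with_updated_tasks stays None.
def example_kafka_broker_replace():
    check_permanent_recovery(
        package_name="kafka",
        service_name="kafka",
        pod_name="kafka-0",
        recovery_timeout_s=25 * 60,
        pods_with_updated_tasks=None,
    )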
def test_service_startup_rapid():
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = 'kafka-0'
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # the following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as follows:
    # '...leader not available...'
    stdout = ''
    retries = 15
    while retries > 0:
        retries -= 1
        stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'topic producer_test test 100')
        if 'records sent' in stdout:
            break

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                              'pod restart {}'.format(task_short_name), json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == task_short_name
    assert jsonobj['tasks'] == ['{}-broker'.format(task_short_name)]

    starting_fallback_time = datetime.datetime.now()

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), [broker_task_id_0])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # extract starting and started lines from log
    starting_time = started_time = None
    retry_seconds_remaining = max_restart_seconds + startup_padding_seconds
    while retry_seconds_remaining > 0.0 and (starting_time is None or started_time is None):
        stdout = sdk_cmd.run_cli("task log --lines=1000 {}".format(broker_task_id_1))
        task_lines = stdout.split('\n')
        for log_line in reversed(task_lines):
            if starting_time is None and ' starting (kafka.server.KafkaServer)' in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and ' started (kafka.server.KafkaServer)' in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            time.sleep(retry_delay_seconds)
            # decrement the budget so the polling loop actually terminates
            retry_seconds_remaining -= retry_delay_seconds

    if started_time is None or starting_time is None:
        with open('/tmp/kafka_startup_stdout', 'w') as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time - starting_time).total_seconds() <= max_restart_seconds
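# The log_line_ts() helper referenced above is defined elsewhere. A hedged
# sketch of what it presumably does, assuming the default Kafka log4j line
# format "[2018-01-02 03:04:05,678] INFO ... (kafka.server.KafkaServer)";
# the exact timestamp format is an assumption:
def log_line_ts(log_line):
    # extract the bracketed timestamp prefix and parse it into a datetime
    ts_text = log_line[log_line.find('[') + 1:log_line.find(']')]
    return datetime.datetime.strptime(ts_text, '%Y-%m-%d %H:%M:%S,%f')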
def upgrade_or_downgrade(package_name, running_task_count):
    task_ids = tasks.get_task_ids(package_name, '')
    marathon.destroy_app(package_name)
    install.install(package_name, running_task_count)
    print('Waiting for upgrade / downgrade deployment to complete')
    spin.time_wait_noisy(lambda: (
        plan.get_deployment_plan(package_name).json()['status'] == 'COMPLETE'))
    print('Checking that all tasks have restarted')
    tasks.check_tasks_updated(package_name, '', task_ids)
def replace_broker_pod(service_name=config.SERVICE_NAME):
    pod_name = '{}-0'.format(config.DEFAULT_POD_TYPE)
    task_name = '{}-{}'.format(pod_name, config.DEFAULT_TASK_NAME)
    broker_0_id = sdk_tasks.get_task_ids(service_name, task_name)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'pod replace {}'.format(pod_name))
    sdk_tasks.check_tasks_updated(service_name, task_name, broker_0_id)
    sdk_tasks.check_running(service_name, config.DEFAULT_BROKER_COUNT)
    # wait till all brokers register
    broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=service_name)
def test_updated_placement_constraints_replaced_tasks_do_move(): some_agent, other_agent, old_ids = setup_constraint_switch() # Replace the task, and verify it moves hosts sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0") sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello", old_ids) sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME) assert get_task_host("hello-0-server") == other_agent
def restart_broker_pods(service_name=config.SERVICE_NAME):
    for i in range(config.DEFAULT_BROKER_COUNT):
        pod_name = '{}-{}'.format(config.DEFAULT_POD_TYPE, i)
        task_name = '{}-{}'.format(pod_name, config.DEFAULT_TASK_NAME)
        broker_id = sdk_tasks.get_task_ids(service_name, task_name)
        restart_info = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, service_name, 'pod restart {}'.format(pod_name), json=True)
        assert len(restart_info) == 2
        assert restart_info['tasks'][0] == task_name
        sdk_tasks.check_tasks_updated(service_name, task_name, broker_id)
        sdk_tasks.check_running(service_name, config.DEFAULT_BROKER_COUNT)
def test_kill_journal_node():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal-0')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    sdk_tasks.kill_task_with_pattern('journalnode', sdk_hosts.system_host(FOLDERED_SERVICE_NAME, 'journal-0-node'))
    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_port_dynamic_to_dynamic_port():
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE))

    sdk_marathon.bump_cpu_count_config(config.SERVICE_NAME, 'BROKER_CPUS')

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)
    # all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)
def _upgrade_or_downgrade(
        package_name,
        to_package_version,
        service_name,
        running_task_count,
        additional_options,
        timeout_seconds,
        wait_for_deployment):
    task_ids = sdk_tasks.get_task_ids(service_name, '')

    if shakedown.dcos_version_less_than("1.10") or shakedown.ee_version() is None:
        log.info('Using marathon upgrade flow to upgrade {} {}'.format(package_name, to_package_version))
        sdk_marathon.destroy_app(service_name)
        sdk_install.install(
            package_name,
            service_name,
            running_task_count,
            additional_options=additional_options,
            package_version=to_package_version,
            timeout_seconds=timeout_seconds,
            wait_for_deployment=wait_for_deployment)
    else:
        log.info('Using CLI upgrade flow to upgrade {} {}'.format(package_name, to_package_version))
        if additional_options:
            with tempfile.NamedTemporaryFile() as opts_f:
                opts_f.write(json.dumps(additional_options).encode('utf-8'))
                opts_f.flush()  # ensure json content is available for the CLI to read below
                sdk_cmd.svc_cli(
                    package_name, service_name,
                    'update start --package-version={} --options={}'.format(to_package_version, opts_f.name))
        else:
            sdk_cmd.svc_cli(
                package_name, service_name,
                'update start --package-version={}'.format(to_package_version))

    if wait_for_deployment:
        log.info('Checking that all tasks have restarted')
        sdk_tasks.check_tasks_updated(service_name, '', task_ids)

        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via FINISHED tasks, without actually completing deployment
        log.info("Waiting for {}/{} to finish deployment plan...".format(package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)

        # given the above wait for plan completion, here we just wait up to 5 minutes
        if shakedown.dcos_version_less_than("1.9"):
            log.info("Skipping `is_suppressed` check for %s/%s as this is only supported starting in version 1.9",
                     package_name, service_name)
        else:
            log.info("Waiting for %s/%s to be suppressed...", package_name, service_name)
            shakedown.wait_for(
                lambda: sdk_api.is_suppressed(service_name),
                noisy=True,
                timeout_seconds=5 * 60)
def test_kill_world_executor():
    world_task = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="world-0")[0]

    sdk_cmd.kill_task_with_pattern(
        "mesos-default-executor",
        "nobody",
        agent_host=world_task.host,
    )

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0", [world_task.id])
    check_healthy()
def test_pod_replace():
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world-0')

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace world-0', json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == 'world-0'
    assert len(jsonobj['tasks']) == 1
    assert jsonobj['tasks'][0] == 'world-0-server'

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0', world_ids)
    config.check_running()
def test_kill_data_node():
    data_task = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=data_task.host)
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", [data_task.id])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
def test_kill_hello_task():
    hello_task = sdk_tasks.get_service_tasks(config.SERVICE_NAME, task_prefix="hello-0")[0]

    sdk_cmd.kill_task_with_pattern(
        "hello-container-path/output",
        "nobody",
        agent_host=hello_task.host,
    )

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", [hello_task.id])
    check_healthy()
def replace_name_node(index):
    config.check_healthy(service_name=config.FOLDERED_SERVICE_NAME)
    name_node_name = 'name-' + str(index)
    name_id = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, name_node_name)
    journal_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'data')

    sdk_cmd.run_cli('hdfs --name={} pod replace {}'.format(config.FOLDERED_SERVICE_NAME, name_node_name))

    config.expect_recovery(service_name=config.FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(config.FOLDERED_SERVICE_NAME, name_node_name, name_id)
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'data', data_ids)
def restart_broker_pods(package_name: str, service_name: str, pod_type: str, broker_count: int) -> None:
    for i in range(broker_count):
        pod_name = "{}-{}".format(pod_type, i)
        task_name = "{}-{}".format(pod_name, "broker")
        broker_id = sdk_tasks.get_task_ids(service_name, task_name)

        rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod restart {}".format(pod_name))
        assert rc == 0, "Pod restart {} failed".format(pod_name)
        restart_info = json.loads(stdout)
        assert len(restart_info) == 2
        assert restart_info["tasks"][0] == task_name

        sdk_tasks.check_tasks_updated(service_name, task_name, broker_id)
        sdk_tasks.check_running(service_name, broker_count)
def test_modify_app_config():
    check_healthy()
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    zkfc_ids = tasks.get_task_ids(PACKAGE_NAME, 'zkfc')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    print('journal ids: ' + str(journal_ids))
    print('name ids: ' + str(name_ids))
    print('zkfc ids: ' + str(zkfc_ids))
    print('data ids: ' + str(data_ids))

    config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(config)
    expiry_ms = int(config['env'][app_config_field])
    config['env'][app_config_field] = str(expiry_ms + 1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    # All tasks should be updated because hdfs-site.xml has changed
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'name', name_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'zkfc', zkfc_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_ids)

    check_healthy()
def test_bump_journal_cpus():
    check_healthy()
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    print('journal ids: ' + str(journal_ids))

    config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(config)
    cpus = float(config['env']['JOURNAL_CPUS'])
    config['env']['JOURNAL_CPUS'] = str(cpus + 0.1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()
def test_kill_all_datanodes():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    for data_pod in config.get_pod_type_instances("data", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart {}".format(data_pod))

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
def test_adding_data_node_only_restarts_masters():
    initial_master_task_ids = sdk_tasks.get_task_ids(foldered_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    initial_coordinator_task_ids = sdk_tasks.get_task_ids(foldered_name, "coordinator")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "describe", parse_json=True
    )

    data_nodes_count = get_in(["data_nodes", "count"], svc_config)

    global current_expected_task_count

    # Increase the data nodes count by 1.
    sdk_service.update_configuration(
        config.PACKAGE_NAME,
        foldered_name,
        {"data_nodes": {"count": data_nodes_count + 1}},
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually afterwards
        # with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    sdk_plan.wait_for_kicked_off_deployment(foldered_name)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    _, new_data_pod_info, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME,
        foldered_name,
        "pod info data-{}".format(data_nodes_count),
        parse_json=True,
    )

    # Get task ID for new data node task.
    new_data_task_id = get_in([0, "info", "taskId", "value"], new_data_pod_info)

    # Should be running 1 task more.
    current_expected_task_count += 1
    sdk_tasks.check_running(foldered_name, current_expected_task_count)

    # Master nodes should restart.
    sdk_tasks.check_tasks_updated(foldered_name, "master", initial_master_task_ids)

    # Data node tasks should be the initial ones plus the new one.
    sdk_tasks.check_tasks_not_updated(
        foldered_name, "data", initial_data_task_ids + [new_data_task_id]
    )

    # Coordinator tasks should not restart.
    sdk_tasks.check_tasks_not_updated(foldered_name, "coordinator", initial_coordinator_task_ids)
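# The get_in() used above presumably comes from the toolz library. A minimal
# equivalent sketch for reference (not the suite's actual import): it walks
# nested dicts/lists by a sequence of keys/indices, returning a default on a miss.
def get_in(keys, coll, default=None):
    for key in keys:
        try:
            coll = coll[key]
        except (KeyError, IndexError, TypeError):
            return default
    return coll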
def test_kill_all_datanodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for data_pod in config.get_pod_type_instances("data", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart {}'.format(data_pod))

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
def test_bump_world_cpus():
    original_world_ids = sdk_tasks.get_task_ids(foldered_name, "world")
    log.info("world ids: " + str(original_world_ids))

    updated_cpus = config.bump_world_cpus(foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "world", original_world_ids)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    all_tasks = sdk_tasks.get_service_tasks(foldered_name, task_prefix="world")
    running_tasks = [t for t in all_tasks if t.state == "TASK_RUNNING"]
    assert len(running_tasks) == config.world_task_count(foldered_name)
    for t in running_tasks:
        assert config.close_enough(t.resources["cpus"], updated_cpus)
def test_bump_journal_cpus():
    check_healthy()
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    print('journal ids: ' + str(journal_ids))

    config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(config)
    cpus = float(config['env']['JOURNAL_CPUS'])
    config['env']['JOURNAL_CPUS'] = str(cpus + 0.1)
    marathon.update_app(PACKAGE_NAME, config)

    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()
def test_kill_all_journalnodes(hdfs_server):
    service_name = hdfs_server["service"]["name"]

    journal_ids = sdk_tasks.get_task_ids(service_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(service_name, 'name')
    data_ids = sdk_tasks.get_task_ids(service_name, 'data')

    for journal_pod in config.get_pod_type_instances("journal", service_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'pod restart {}'.format(journal_pod))

    config.expect_recovery(service_name=service_name)

    sdk_tasks.check_tasks_updated(service_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(service_name, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(service_name, 'data', data_ids)
def test_bump_journal_cpus():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    log.info('journal ids: ' + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(foldered_name, 'JOURNAL_CPUS')

    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    # journal node update should not cause any of the name nodes to crash.
    # if the name nodes crashed, then it implies the journal nodes were updated in parallel,
    # when they should've been updated serially. for journal nodes, the deploy plan is parallel,
    # while the update plan is serial. maybe the deploy plan was mistakenly used?
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
    config.check_healthy(service_name=foldered_name)
def test_pod_replace(): world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0") rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace world-0") assert rc == 0, "Pod replace failed" jsonobj = json.loads(stdout) assert len(jsonobj) == 2 assert jsonobj["pod"] == "world-0" assert len(jsonobj["tasks"]) == 1 assert jsonobj["tasks"][0] == "world-0-server" sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0", world_ids) check_healthy()
def replace_name_node(index):
    check_healthy()
    name_node_name = 'name-' + str(index)
    name_id = tasks.get_task_ids(FOLDERED_SERVICE_NAME, name_node_name)
    journal_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    cmd.run_cli('hdfs --name={} pods replace {}'.format(FOLDERED_SERVICE_NAME, name_node_name))

    check_healthy()
    tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, name_node_name, name_id)
    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_bump_world_cpus():
    check_running()
    world_ids = tasks.get_task_ids(PACKAGE_NAME, 'world')
    sdk_utils.out('world ids: ' + str(world_ids))

    updated_cpus = bump_world_cpus()

    tasks.check_tasks_updated(PACKAGE_NAME, 'world', world_ids)
    check_running()

    all_tasks = shakedown.get_service_tasks(PACKAGE_NAME)
    running_tasks = [t for t in all_tasks if t['name'].startswith('world') and t['state'] == "TASK_RUNNING"]
    assert len(running_tasks) == world_task_count()
    for t in running_tasks:
        assert close_enough(t['resources']['cpus'], updated_cpus)
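# The close_enough() helper used in the cpu-bump tests above and below is
# defined in the shared test utilities. A hedged sketch of what it presumably
# does; the epsilon value here is an assumption: a simple tolerance comparison
# for fractional cpu values, which avoids exact float equality checks.
def close_enough(val0, val1):
    return abs(val0 - val1) < 0.00001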
def test_kill_all_journalnodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart {}'.format(journal_pod))

    config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_port_dynamic_to_dynamic_port():
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    broker_ids = tasks.get_task_ids(SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE))

    config = marathon.get_config(SERVICE_NAME)
    # parse as float (not int): BROKER_CPUS may hold a fractional value like '1.0'
    broker_cpus = float(config['env']['BROKER_CPUS'])
    config['env']['BROKER_CPUS'] = str(broker_cpus + 0.1)
    marathon.update_app(SERVICE_NAME, config)

    tasks.check_tasks_updated(SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE), broker_ids)
    # all tasks are running
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
def test_pod_replace():
    world_ids = sdk_tasks.get_task_ids(config.PACKAGE_NAME, 'world-0')

    # get current agent id (TODO: uncomment if/when agent is guaranteed to change in a replace operation):
    #stdout = sdk_cmd.run_cli('hello-world pod info world-0')
    #old_agent = json.loads(stdout)[0]['info']['slaveId']['value']

    jsonobj = json.loads(sdk_cmd.run_cli('hello-world pod replace world-0'))
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == 'world-0'
    assert len(jsonobj['tasks']) == 1
    assert jsonobj['tasks'][0] == 'world-0-server'

    sdk_tasks.check_tasks_updated(config.PACKAGE_NAME, 'world-0', world_ids)
    config.check_running()
def test_soak_secrets_restart_hello0():
    hello_tasks_old = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "hello-0")
    world_tasks_old = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "world-0")

    # restart pods to retrieve new secret's content
    sdk_cmd.svc_cli(config.PACKAGE_NAME, FRAMEWORK_NAME, 'pod restart hello-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, FRAMEWORK_NAME, 'pod restart world-0')

    # wait for the pod restarts to complete
    sdk_tasks.check_tasks_updated(FRAMEWORK_NAME, "hello-0", hello_tasks_old)
    sdk_tasks.check_tasks_updated(FRAMEWORK_NAME, 'world-0', world_tasks_old)

    # wait till everything is running
    sdk_tasks.check_running(FRAMEWORK_NAME, NUM_HELLO + NUM_WORLD)
def replace_name_node(index):
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=foldered_name)
    name_node_name = 'name-' + str(index)
    name_id = sdk_tasks.get_task_ids(foldered_name, name_node_name)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace {}'.format(name_node_name))

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, name_node_name, name_id)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_permanent_and_transient_namenode_failures_1_0():
    check_healthy()
    name_0_ids = tasks.get_task_ids(PACKAGE_NAME, 'name-0')
    name_1_ids = tasks.get_task_ids(PACKAGE_NAME, 'name-1')
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    cmd.run_cli('hdfs pods replace name-1')
    cmd.run_cli('hdfs pods restart name-0')

    check_healthy()
    tasks.check_tasks_updated(PACKAGE_NAME, 'name-0', name_0_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'name-1', name_1_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_kill_scheduler():
    scheduler_task_prefix = sdk_marathon.get_scheduler_task_prefix(config.SERVICE_NAME)
    scheduler_ids = sdk_tasks.get_task_ids("marathon", scheduler_task_prefix)
    assert len(scheduler_ids) == 1, "Expected to find one scheduler task"

    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld",
        "nobody",
        agent_host=sdk_marathon.get_scheduler_host(config.SERVICE_NAME),
    )

    sdk_tasks.check_tasks_updated("marathon", scheduler_task_prefix, scheduler_ids)
    check_healthy()
def replace_name_node(index):
    check_healthy()
    name_node_name = 'name-' + str(index)
    name_id = tasks.get_task_ids(PACKAGE_NAME, name_node_name)
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    zkfc_ids = tasks.get_task_ids(PACKAGE_NAME, 'zkfc')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    cmd.run_cli('hdfs pods replace ' + name_node_name)

    tasks.check_tasks_updated(PACKAGE_NAME, name_node_name, name_id)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'zkfc', zkfc_ids)
    check_healthy()
def test_kill_all_datanodes():
    check_healthy()
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    zkfc_ids = tasks.get_task_ids(PACKAGE_NAME, 'zkfc')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    for host in shakedown.get_service_ips(PACKAGE_NAME):
        tasks.kill_task_with_pattern('datanode', host)

    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'name', name_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'zkfc', zkfc_ids)
    check_healthy()
def upgrade_or_downgrade(package_name, running_task_count, additional_options, package_version=None):
    task_ids = tasks.get_task_ids(package_name, '')
    marathon.destroy_app(package_name)
    install.install(
        package_name,
        running_task_count,
        additional_options=additional_options,
        package_version=package_version,
        check_suppression=False)
    sdk_utils.out('Waiting for upgrade / downgrade deployment to complete')
    plan.wait_for_completed_deployment(package_name)
    sdk_utils.out('Checking that all tasks have restarted')
    tasks.check_tasks_updated(package_name, '', task_ids)
def test_bump_world_cpus():
    config.check_running(FOLDERED_SERVICE_NAME)
    world_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'world')
    log.info('world ids: ' + str(world_ids))

    updated_cpus = config.bump_world_cpus(FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'world', world_ids)
    config.check_running(FOLDERED_SERVICE_NAME)

    all_tasks = shakedown.get_service_tasks(FOLDERED_SERVICE_NAME)
    running_tasks = [t for t in all_tasks if t['name'].startswith('world') and t['state'] == "TASK_RUNNING"]
    assert len(running_tasks) == config.world_task_count(FOLDERED_SERVICE_NAME)
    for t in running_tasks:
        assert close_enough(t['resources']['cpus'], updated_cpus)
def test_bump_hello_cpus():
    check_running(FOLDERED_SERVICE_NAME)
    hello_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'hello')
    sdk_utils.out('hello ids: ' + str(hello_ids))

    updated_cpus = bump_hello_cpus(FOLDERED_SERVICE_NAME)

    tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'hello', hello_ids)
    check_running(FOLDERED_SERVICE_NAME)

    all_tasks = shakedown.get_service_tasks(FOLDERED_SERVICE_NAME)
    running_tasks = [t for t in all_tasks if t['name'].startswith('hello') and t['state'] == "TASK_RUNNING"]
    assert len(running_tasks) == hello_task_count(FOLDERED_SERVICE_NAME)
    for t in running_tasks:
        assert close_enough(t['resources']['cpus'], updated_cpus)
def _upgrade_or_downgrade(
        package_name,
        to_package_version,
        service_name,
        running_task_count,
        additional_options,
        timeout_seconds,
        wait_for_deployment):
    initial_config = get_config(package_name, service_name)
    task_ids = sdk_tasks.get_task_ids(service_name, '')

    if sdk_utils.dcos_version_less_than("1.10") or shakedown.ee_version() is None:
        log.info('Using marathon upgrade flow to upgrade {} {}'.format(package_name, to_package_version))
        sdk_marathon.destroy_app(service_name)
        sdk_install.install(
            package_name,
            service_name,
            running_task_count,
            additional_options=additional_options,
            package_version=to_package_version,
            timeout_seconds=timeout_seconds,
            wait_for_deployment=wait_for_deployment)
    else:
        log.info('Using CLI upgrade flow to upgrade {} {}'.format(package_name, to_package_version))
        update_service(package_name, service_name, additional_options, to_package_version)
        # we must manually upgrade the package CLI because it's not done automatically in this flow
        # (and why should it? that'd imply the package CLI replacing itself via a call to the main CLI...)
        sdk_cmd.run_cli(
            'package install --yes --cli --package-version={} {}'.format(to_package_version, package_name))

    if wait_for_deployment:
        updated_config = get_config(package_name, service_name)

        if updated_config == initial_config:
            log.info('No config change detected. Tasks should not be restarted')
            sdk_tasks.check_tasks_not_updated(service_name, '', task_ids)
        else:
            log.info('Checking that all tasks have restarted')
            sdk_tasks.check_tasks_updated(service_name, '', task_ids)

        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the expected
        # total task count via ONCE tasks, without actually completing deployment
        log.info("Waiting for package={} service={} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)
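# The update_service() helper invoked above is defined elsewhere in this
# module. A hedged sketch of what it presumably does, mirroring the inline CLI
# update flow shown in the older _upgrade_or_downgrade variant earlier in this
# section (the exact signature is an assumption):
def update_service(package_name, service_name, additional_options, package_version):
    update_cmd = 'update start --package-version={}'.format(package_version)
    if additional_options:
        with tempfile.NamedTemporaryFile() as opts_f:
            opts_f.write(json.dumps(additional_options).encode('utf-8'))
            opts_f.flush()  # ensure json content is on disk before the CLI reads it
            sdk_cmd.svc_cli(package_name, service_name,
                            '{} --options={}'.format(update_cmd, opts_f.name))
    else:
        sdk_cmd.svc_cli(package_name, service_name, update_cmd)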
def test_bump_hello_cpus():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    hello_ids = sdk_tasks.get_task_ids(foldered_name, "hello")
    log.info("hello ids: " + str(hello_ids))

    updated_cpus = config.bump_hello_cpus(foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "hello", hello_ids)
    config.check_running(foldered_name)

    all_tasks = sdk_tasks.get_service_tasks(foldered_name, task_prefix="hello")
    running_tasks = [t for t in all_tasks if t.state == "TASK_RUNNING"]
    assert len(running_tasks) == config.hello_task_count(foldered_name)
    for t in running_tasks:
        assert config.close_enough(t.resources["cpus"], updated_cpus)
def test_kill_essential():
    '''kill the essential task, verify that both tasks are relaunched against a matching executor'''
    verify_shared_executor('hello-0')

    old_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-0')
    assert len(old_ids) == 2

    sdk_cmd.kill_task_with_pattern(
        'shared-volume/essential',  # hardcoded in cmd, see yml
        sdk_hosts.system_host(config.SERVICE_NAME, 'hello-0-essential'))

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello-0', old_ids)  # wait for ids to change...
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)  # ...and for tasks to be up and running

    # the first verify_shared_executor call deleted the files. both should have come back via the relaunch.
    verify_shared_executor('hello-0', delete_files=False)  # leave files as-is for the next test