def test_modify_app_config():
    """This test checks that modifying the app config does not trigger a recovery."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    old_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'name', name_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)

    sdk_plan.wait_for_completed_recovery(foldered_name)
    new_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")
    assert old_recovery_plan == new_recovery_plan
def test_secrets_basic():
    # 1) create secrets
    # 2) install examples/secrets.yml
    # 3) if the secret file is not created, tasks will fail
    # 4) wait until deployment finishes
    # 5) do a replace operation
    # 6) ensure all tasks are running
    # 7) delete secrets
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD,
                        additional_options=secret_options)

    hello_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    world_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0-server")

    # ensure that secrets work after replace
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace world-0')

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-server", hello_tasks_0)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0-server", world_tasks_0)

    # tasks will fail if secret files are not created by the mesos module
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # clean up and delete secrets
    delete_secrets("{}/".format(config.SERVICE_NAME))
def test_increase_decrease_world_nodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)

    original_hello_ids = sdk_tasks.get_task_ids(foldered_name, 'hello')
    original_world_ids = sdk_tasks.get_task_ids(foldered_name, 'world')
    log.info('world ids: ' + str(original_world_ids))

    # add 2 world nodes
    sdk_marathon.bump_task_count_config(foldered_name, 'WORLD_COUNT', 2)

    config.check_running(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'world', original_world_ids)

    # check 2 world tasks added:
    assert 2 + len(original_world_ids) == len(sdk_tasks.get_task_ids(foldered_name, 'world'))

    # subtract 2 world nodes
    sdk_marathon.bump_task_count_config(foldered_name, 'WORLD_COUNT', -2)

    config.check_running(foldered_name)
    # wait for the decommission plan for this subtraction to be complete
    sdk_plan.wait_for_completed_plan(foldered_name, 'decommission')
    # check that the total task count is back to original
    sdk_tasks.check_running(
        foldered_name,
        len(original_hello_ids) + len(original_world_ids),
        allow_more=False)
    # check that original tasks weren't affected/relaunched in the process
    sdk_tasks.check_tasks_not_updated(foldered_name, 'hello', original_hello_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'world', original_world_ids)

    # check that the world tasks are back to their prior state (also without changing task ids)
    assert original_world_ids == sdk_tasks.get_task_ids(foldered_name, 'world')
def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes() -> None:
    initial_master_task_ids = sdk_tasks.get_task_ids(service_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(service_name, "data")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(package_name, service_name, "describe", parse_json=True)

    ingest_nodes_count = get_in(["ingest_nodes", "count"], svc_config)
    coordinator_nodes_count = get_in(["coordinator_nodes", "count"], svc_config)

    global current_expected_task_count

    sdk_service.update_configuration(
        package_name,
        service_name,
        {
            "ingest_nodes": {"count": ingest_nodes_count + 1},
            "coordinator_nodes": {"count": coordinator_nodes_count + 1},
        },
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually
        # afterwards with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    # Should be running 2 more tasks.
    current_expected_task_count += 2
    sdk_tasks.check_running(service_name, current_expected_task_count)
    # Master nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "master", initial_master_task_ids)
    # Data nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "data", initial_data_task_ids)
def test_soak_secrets_update():
    secret_content_alternative = "hello-world-secret-data-alternative"
    test_soak_secrets_framework_alive()

    sdk_cmd.run_cli("package install --cli dcos-enterprise-cli --yes")
    sdk_cmd.run_cli("package install --cli hello-world --yes")
    sdk_cmd.run_cli("security secrets update --value={} secrets/secret1".format(secret_content_alternative))
    sdk_cmd.run_cli("security secrets update --value={} secrets/secret2".format(secret_content_alternative))
    sdk_cmd.run_cli("security secrets update --value={} secrets/secret3".format(secret_content_alternative))
    test_soak_secrets_restart_hello0()

    # get new task ids - only the first pod of each type
    hello_tasks = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "hello-0")
    world_tasks = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "world-0")

    # make sure the world-0 content has changed
    assert secret_content_alternative == task_exec(world_tasks[0], "bash -c 'echo $WORLD_SECRET1_ENV'")
    assert secret_content_alternative == task_exec(world_tasks[0], "cat WORLD_SECRET2_FILE")
    assert secret_content_alternative == task_exec(world_tasks[0], "cat secrets/secret3")

    # make sure the hello-0 content has changed
    assert secret_content_alternative == task_exec(hello_tasks[0], "bash -c 'echo $HELLO_SECRET1_ENV'")
    assert secret_content_alternative == task_exec(hello_tasks[0], "cat HELLO_SECRET1_FILE")
    assert secret_content_alternative == task_exec(hello_tasks[0], "cat HELLO_SECRET2_FILE")

    # revert the secrets back to their previous values
    sdk_cmd.run_cli("security secrets update --value=SECRET1 secrets/secret1")
    sdk_cmd.run_cli("security secrets update --value=SECRET2 secrets/secret2")
    sdk_cmd.run_cli("security secrets update --value=SECRET3 secrets/secret3")
    test_soak_secrets_restart_hello0()
def test_modify_app_config_rollback():
    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"

    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(marathon_config)
    expiry_ms = int(marathon_config["env"][app_config_field])
    log.info("expiry ms: " + str(expiry_ms))
    marathon_config["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")

    log.info("old config: ")
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config["env"][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def test_modify_app_config_rollback():
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    log.info('expiry ms: ' + str(expiry_ms))
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')

    log.info('old config: ')
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(foldered_name, old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config['env'][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_modify_app_config():
    """This test checks that modifying the app config does not trigger a recovery."""
    sdk_plan.wait_for_completed_recovery(foldered_name)
    old_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(marathon_config)
    expiry_ms = int(marathon_config["env"][app_config_field])
    marathon_config["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(marathon_config, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_updated(foldered_name, "name", name_ids)
    sdk_tasks.check_tasks_updated(foldered_name, "data", data_ids)

    sdk_plan.wait_for_completed_recovery(foldered_name)
    new_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")
    assert old_recovery_plan == new_recovery_plan
def test_service_startup_rapid():
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = 'kafka-0'
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # the following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as:
    # '...leader not available...'
    stdout = ''
    retries = 15
    while retries > 0:
        retries -= 1
        stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                                 'topic producer_test test 100')
        if 'records sent' in stdout:
            break

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                              'pod restart {}'.format(task_short_name), json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == task_short_name
    assert jsonobj['tasks'] == ['{}-broker'.format(task_short_name)]

    starting_fallback_time = datetime.datetime.now()
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME,
                                  '{}-'.format(config.DEFAULT_POD_TYPE),
                                  [broker_task_id_0])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)
    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # extract the starting and started lines from the log
    starting_time = started_time = None
    retry_seconds_remaining = max_restart_seconds + startup_padding_seconds
    while retry_seconds_remaining > 0.0 and (starting_time is None or started_time is None):
        stdout = sdk_cmd.run_cli("task log --lines=1000 {}".format(broker_task_id_1))
        task_lines = stdout.split('\n')
        for log_line in reversed(task_lines):
            if starting_time is None and ' starting (kafka.server.KafkaServer)' in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and ' started (kafka.server.KafkaServer)' in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            time.sleep(retry_delay_seconds)
            # the original never decremented this counter, which could loop forever
            # if the log lines never appear; count the delay against the budget
            retry_seconds_remaining -= retry_delay_seconds

    if started_time is None or starting_time is None:
        with open('/tmp/kafka_startup_stdout', 'w') as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time - starting_time).total_seconds() <= max_restart_seconds
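# `log_line_ts` is used above but not defined in this section. A minimal sketch,
# assuming Kafka's default log4j timestamp prefix on each line, e.g.
# "[2018-12-14 10:11:12,345] INFO ...". Both the helper's name and the log
# format are assumptions, not confirmed by this section.
import datetime
import re


def log_line_ts(log_line):
    """Parse the leading '[YYYY-MM-DD HH:MM:SS,mmm]' timestamp, or return None if absent."""
    match = re.match(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3})\]', log_line)
    if match is None:
        return None
    return datetime.datetime.strptime(match.group(1), '%Y-%m-%d %H:%M:%S,%f')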
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are checked to
    ensure that their tasks have been restarted. Any remaining pods are checked to ensure
    that their tasks are not changed.

    For example, performing a pod replace on kafka-0 in a Kafka framework should result in
    ONLY the kafka-0-broker task being restarted. In this case, pods_with_updated_tasks is
    specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0), a rolling
    restart of other nodes is triggered, and pods_with_updated_tasks = ["node-0", "node-1",
    "node-2"] (assuming a three-node Cassandra ring).
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_with_updated_tasks = pods_with_updated_tasks if pods_with_updated_tasks else []
    pods_to_update = set(pods_with_updated_tasks + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        # note: the original looked up `pod_name` here for every pod in the set,
        # which would record the same task ids for each entry; use the loop variable
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
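# Hypothetical usages of check_permanent_recovery, following the two scenarios in
# its docstring; package/service names and the timeout are illustrative, not taken
# from this section.
#
# A Kafka broker replace should restart only that broker:
check_permanent_recovery("kafka", "kafka", "kafka-0", recovery_timeout_s=25 * 60)

# Replacing a Cassandra seed node is expected to roll the other ring members too:
check_permanent_recovery(
    "cassandra",
    "cassandra",
    "node-0",
    recovery_timeout_s=25 * 60,
    pods_with_updated_tasks=["node-0", "node-1", "node-2"],
)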
def test_kill_journal_node():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal-0')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    sdk_tasks.kill_task_with_pattern(
        'journalnode', sdk_hosts.system_host(FOLDERED_SERVICE_NAME, 'journal-0-node'))
    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_bump_journal_cpus():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    log.info('journal ids: ' + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(FOLDERED_SERVICE_NAME, 'JOURNAL_CPUS')

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    # journal node update should not cause any of the name nodes to crash
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
    config.check_healthy(service_name=FOLDERED_SERVICE_NAME)
def test_kill_data_node():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data-0')
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')

    sdk_cmd.kill_task_with_pattern(
        'datanode', sdk_hosts.system_host(foldered_name, 'data-0-node'))
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
def test_kill_data_node():
    data_task = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=data_task.host)
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", [data_task.id])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart {}".format(journal_pod))

    config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def test_changing_discovery_replaces_certificate_sans(hello_world_service):
    """
    Update the service configuration to change the discovery prefix of a task.
    The scheduler should update the task, and new SANs should be generated.
    """
    # note: the original passed config.PACKAGE_NAME here, but get_task_ids takes a
    # service name, as in the other calls below
    original_tasks = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'discovery')
    assert len(original_tasks) == 1, 'Expecting exactly one task ID'

    task_id = original_tasks[0]
    assert task_id

    # Load the end-entity certificate from the PEM encoded file
    end_entity_cert = x509.load_pem_x509_certificate(
        task_exec(task_id, 'cat server.crt').encode('ascii'),
        DEFAULT_BACKEND)

    san_extension = end_entity_cert.extensions.get_extension_for_oid(
        ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
    sans = [
        san.value for san in san_extension.value._general_names._general_names]

    expected_san = (
        '{name}-0.{service_name}.autoip.dcos.thisdcos.directory'.format(
            name=DISCOVERY_TASK_PREFIX,
            service_name=config.SERVICE_NAME)
    )
    assert expected_san in sans

    # Run a task update with the new discovery prefix
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['DISCOVERY_TASK_PREFIX'] = DISCOVERY_TASK_PREFIX + '-new'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'discovery', original_tasks)
    sdk_tasks.check_running(config.SERVICE_NAME, 4)

    new_task_id = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'discovery')[0]
    assert task_id != new_task_id

    new_cert = x509.load_pem_x509_certificate(
        task_exec(new_task_id, 'cat server.crt').encode('ascii'),
        DEFAULT_BACKEND)

    san_extension = new_cert.extensions.get_extension_for_oid(
        ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
    sans = [
        san.value for san in san_extension.value._general_names._general_names]

    expected_san = (
        '{name}-0.{service_name}.autoip.dcos.thisdcos.directory'.format(
            name=DISCOVERY_TASK_PREFIX + '-new',
            service_name=config.SERVICE_NAME)
    )
    assert expected_san in sans
def test_kill_all_journalnodes(hdfs_server):
    service_name = hdfs_server["service"]["name"]

    journal_ids = sdk_tasks.get_task_ids(service_name, "journal")
    name_ids = sdk_tasks.get_task_ids(service_name, "name")
    data_ids = sdk_tasks.get_task_ids(service_name, "data")

    for journal_pod in config.get_pod_type_instances("journal", service_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, "pod restart {}".format(journal_pod))

    config.expect_recovery(service_name=service_name)

    sdk_tasks.check_tasks_updated(service_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(service_name, "name", name_ids)
    sdk_tasks.check_tasks_not_updated(service_name, "data", data_ids)
def test_kill_all_datanodes():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    for data_pod in config.get_pod_type_instances("data", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart {}".format(data_pod))

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
def test_bump_journal_cpus():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    log.info("journal ids: " + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(foldered_name, "JOURNAL_CPUS")

    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    # A journal node update should not cause any of the name nodes to crash.
    # A name node crash here would imply the journal nodes were updated in parallel
    # rather than serially: the journal nodes' deploy plan is parallel while their
    # update plan is serial, so a crash suggests the deploy plan was mistakenly used.
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)

    config.check_healthy(service_name=foldered_name)
def test_kill_all_journalnodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart {}'.format(journal_pod))

    config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_kill_all_datanodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for data_pod in config.get_pod_type_instances("data", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart {}'.format(data_pod))

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for journal_pod in config.get_pod_type_instances("journal", FOLDERED_SERVICE_NAME):
        sdk_cmd.svc_cli(
            config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'pod restart {}'.format(journal_pod))

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_secrets_update():
    # 1) create secrets
    # 2) install examples/secrets.yml
    # 3) update secrets
    # 4) restart tasks
    # 5) verify secret content (updated after restart)
    # 6) delete secrets
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD,
                        additional_options=secret_options)

    # tasks will fail if the secret file is not created
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    sdk_cmd.run_cli("security secrets update --value={} {}/secret1".format(
        secret_content_alternative, config.SERVICE_NAME))
    sdk_cmd.run_cli("security secrets update --value={} {}/secret2".format(
        secret_content_alternative, config.SERVICE_NAME))
    sdk_cmd.run_cli("security secrets update --value={} {}/secret3".format(
        secret_content_alternative, config.SERVICE_NAME))

    # verify with hello-0 and world-0; just check one pod of each type
    hello_tasks_old = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    world_tasks_old = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0-server")

    # restart pods to pick up the new secret content
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart hello-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart world-0')

    # wait for the pod restarts to complete
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-server", hello_tasks_old)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0-server", world_tasks_old)

    # wait until all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # make sure the world-0 content has changed
    assert secret_content_alternative == read_secret("world-0-server", "bash -c 'echo $WORLD_SECRET1_ENV'")
    assert secret_content_alternative == read_secret("world-0-server", "cat WORLD_SECRET2_FILE")
    assert secret_content_alternative == read_secret("world-0-server", "cat {}/secret3".format(config.SERVICE_NAME))

    # make sure the hello-0 content has changed
    assert secret_content_alternative == read_secret("hello-0-server", "bash -c 'echo $HELLO_SECRET1_ENV'")
    assert secret_content_alternative == read_secret("hello-0-server", "cat HELLO_SECRET1_FILE")
    assert secret_content_alternative == read_secret("hello-0-server", "cat HELLO_SECRET2_FILE")

    # clean up and delete secrets
    delete_secrets("{}/".format(config.SERVICE_NAME))
def test_soak_secrets_restart_hello0():
    hello_tasks_old = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "hello-0")
    world_tasks_old = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "world-0")

    # restart pods to pick up the new secret content
    sdk_cmd.svc_cli(config.PACKAGE_NAME, FRAMEWORK_NAME, 'pod restart hello-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, FRAMEWORK_NAME, 'pod restart world-0')

    # wait for the pod restarts to complete
    sdk_tasks.check_tasks_updated(FRAMEWORK_NAME, "hello-0", hello_tasks_old)
    sdk_tasks.check_tasks_updated(FRAMEWORK_NAME, "world-0", world_tasks_old)

    # wait until all tasks are running
    sdk_tasks.check_running(FRAMEWORK_NAME, NUM_HELLO + NUM_WORLD)
def test_kill_all_datanodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for data_pod in config.get_pod_type_instances("data", FOLDERED_SERVICE_NAME):
        sdk_cmd.svc_cli(
            config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'pod restart {}'.format(data_pod))

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
def test_kill_hello_node():
    config.check_running()
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-0')
    sdk_cmd.kill_task_with_pattern('hello', 'hello-0-server.hello-world.mesos')
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello-0', hello_ids)

    config.check_running()
def test_config_updates_then_all_executors_killed():
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('helloworld.executor.Main', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_update_then_scheduler_died():
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    host = sdk_marathon.get_scheduler_host(config.SERVICE_NAME)
    config.bump_world_cpus()
    sdk_cmd.kill_task_with_pattern('helloworld.scheduler.Main', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_pods_restart_graceful_shutdown():
    options = {
        "world": {
            "kill_grace_period": 30
        }
    }

    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, config.DEFAULT_TASK_COUNT,
                        additional_options=options)

    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world-0')

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                              'pod restart world-0', json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == 'world-0'
    assert len(jsonobj['tasks']) == 1
    assert jsonobj['tasks'][0] == 'world-0-server'

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0', world_ids)
    config.check_running()

    # ensure the SIGTERM was sent via the "all clean" message in the world
    # service's signal trap/handler, BUT not the shell command, indicated
    # by "echo".
    stdout = sdk_cmd.run_cli(
        "task log --completed --lines=1000 {}".format(world_ids[0]))
    clean_msg = None
    for s in stdout.split('\n'):
        if s.find('echo') < 0 and s.find('all clean') >= 0:
            clean_msg = s
    assert clean_msg is not None
def test_config_update_then_kill_task_in_node():
    # kill 1 of 2 world tasks
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    config.bump_world_cpus()
    sdk_cmd.kill_task_with_pattern(
        'world', 'world-0-server.{}.mesos'.format(config.SERVICE_NAME))
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_adding_data_node_only_restarts_masters():
    initial_master_task_ids = sdk_tasks.get_task_ids(foldered_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    initial_coordinator_task_ids = sdk_tasks.get_task_ids(foldered_name, "coordinator")

    marathon_config = sdk_marathon.get_config(foldered_name)
    data_nodes = int(marathon_config['env']['DATA_NODE_COUNT'])
    marathon_config['env']['DATA_NODE_COUNT'] = str(data_nodes + 1)
    sdk_marathon.update_app(foldered_name, marathon_config)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    global current_expected_task_count
    current_expected_task_count += 1
    sdk_tasks.check_running(foldered_name, current_expected_task_count)
    sdk_tasks.check_tasks_updated(foldered_name, "master", initial_master_task_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", initial_data_task_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "coordinator", initial_coordinator_task_ids)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_config_update_then_executor_killed():
    world_ids = tasks.get_task_ids(PACKAGE_NAME, 'world')
    bump_world_cpus()
    tasks.kill_task_with_pattern(
        'helloworld.executor.Main',
        'world-0-server.{}.mesos'.format(PACKAGE_NAME))
    tasks.check_tasks_updated(PACKAGE_NAME, 'world', world_ids)
    check_running()
def test_soak_secrets_restart_hello0():
    hello_tasks_old = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "hello-0")
    world_tasks_old = sdk_tasks.get_task_ids(FRAMEWORK_NAME, "world-0")

    # restart pods to pick up the new secret content
    sdk_cmd.run_cli(
        'hello-world --name={} pod restart hello-0'.format(FRAMEWORK_NAME))
    sdk_cmd.run_cli(
        'hello-world --name={} pod restart world-0'.format(FRAMEWORK_NAME))

    # wait for the pod restarts to complete
    sdk_tasks.check_tasks_updated(FRAMEWORK_NAME, "hello-0", hello_tasks_old)
    sdk_tasks.check_tasks_updated(FRAMEWORK_NAME, "world-0", world_tasks_old)

    # wait until all tasks are running
    sdk_tasks.check_running(FRAMEWORK_NAME, NUM_HELLO + NUM_WORLD)
def restart_broker_pods(service_name=SERVICE_NAME):
    for i in range(DEFAULT_BROKER_COUNT):
        broker_id = sdk_tasks.get_task_ids(
            service_name, '{}-{}-{}'.format(DEFAULT_POD_TYPE, i, DEFAULT_TASK_NAME))
        restart_info = service_cli(
            'pod restart {}-{}'.format(DEFAULT_POD_TYPE, i), service_name=service_name)
        sdk_tasks.check_tasks_updated(
            service_name, '{}-{}-{}'.format(DEFAULT_POD_TYPE, i, DEFAULT_TASK_NAME), broker_id)
        sdk_tasks.check_running(service_name, DEFAULT_BROKER_COUNT)
        assert len(restart_info) == 2
        assert restart_info['tasks'][0] == '{}-{}-{}'.format(DEFAULT_POD_TYPE, i, DEFAULT_TASK_NAME)
def test_bump_data_nodes():
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    sdk_utils.out('data ids: ' + str(data_ids))

    marathon.bump_task_count_config(PACKAGE_NAME, 'DATA_COUNT')

    check_healthy(DEFAULT_TASK_COUNT + 1)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_bump_journal_cpus():
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    sdk_utils.out('journal ids: ' + str(journal_ids))

    marathon.bump_cpu_count_config(PACKAGE_NAME, 'JOURNAL_CPUS')

    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()
def test_bump_data_nodes():
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    log.info("data ids: " + str(data_ids))

    sdk_marathon.bump_task_count_config(foldered_name, "DATA_COUNT")

    config.check_healthy(service_name=foldered_name, count=config.DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def test_config_update_then_executor_killed():
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    config.bump_world_cpus()
    sdk_cmd.kill_task_with_pattern(
        'helloworld.executor.Main',
        'world-0-server.{}.mesos'.format(config.SERVICE_NAME))
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_update_then_kill_task_in_node():
    # kill 1 of 2 world tasks
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    config.bump_world_cpus()
    sdk_cmd.kill_task_with_pattern(
        'world',
        'world-0-server.{}.mesos'.format(config.SERVICE_NAME))
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_config_update_then_kill_task_in_node():
    # kill 1 of 2 world tasks
    world_ids = tasks.get_task_ids(PACKAGE_NAME, 'world')
    bump_world_cpus()
    tasks.kill_task_with_pattern(
        'world',
        'world-0-server.{}.mesos'.format(PACKAGE_NAME))
    tasks.check_tasks_updated(PACKAGE_NAME, 'world', world_ids)
    check_running()
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_task_ids = sdk_tasks.get_task_ids(foldered_name, "master")
    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(foldered_name), "elastic.scheduler.Main")
    sdk_tasks.check_tasks_not_updated(foldered_name, "master", initial_task_ids)
def test_pods_replace():
    broker_0_id = tasks.get_task_ids(
        SERVICE_NAME, '{}-0-{}'.format(DEFAULT_POD_TYPE, DEFAULT_TASK_NAME))
    service_cli('pods replace {}-0'.format(DEFAULT_POD_TYPE))
    tasks.check_tasks_updated(
        SERVICE_NAME, '{}-0-{}'.format(DEFAULT_POD_TYPE, DEFAULT_TASK_NAME), broker_0_id)
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
def test_data_node_replace():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace data-0')
    sdk_tasks.check_tasks_updated(foldered_name, 'data-0', data_ids)
def test_kill_hello_node():
    config.check_running()
    hello_ids = sdk_tasks.get_task_ids(config.PACKAGE_NAME, 'hello-0')
    sdk_tasks.kill_task_with_pattern('hello', 'hello-0-server.hello-world.mesos')
    sdk_tasks.check_tasks_updated(config.PACKAGE_NAME, 'hello', hello_ids)

    config.check_running()
def test_kill_scheduler():
    task_ids = sdk_tasks.get_task_ids(foldered_name, "")
    scheduler_task_prefix = sdk_marathon.get_scheduler_task_prefix(foldered_name)
    scheduler_ids = sdk_tasks.get_task_ids("marathon", scheduler_task_prefix)
    assert len(scheduler_ids) == 1, "Expected to find one scheduler task"

    sdk_cmd.kill_task_with_pattern(
        "./hdfs-scheduler/bin/hdfs",
        "nobody",
        agent_host=sdk_marathon.get_scheduler_host(foldered_name),
    )

    # scheduler should be restarted, but service tasks should be left as-is:
    sdk_tasks.check_tasks_updated("marathon", scheduler_task_prefix, scheduler_ids)
    sdk_tasks.wait_for_active_framework(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", task_ids)
    config.check_healthy(service_name=foldered_name)
def test_config_update_then_kill_all_task_in_node():
    # kill both world tasks
    world_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'world')
    hosts = shakedown.get_service_ips(config.SERVICE_NAME)
    config.bump_world_cpus()
    for host in hosts:
        sdk_cmd.kill_task_with_pattern('world', host)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world', world_ids)
    config.check_running()
def test_master_node_replace():
    # Ideally, the pod will get placed on a different agent. This test will verify that the
    # remaining two masters find the replaced master at its new IP address. This requires a
    # reasonably low TTL for Java DNS lookups.
    master_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'master-0')
    cmd.run_cli(
        'elastic --name={} pod replace master-0'.format(FOLDERED_SERVICE_NAME))
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'master-0', master_ids)
def test_config_update_then_kill_all_task_in_node():
    # kill both world tasks
    world_ids = tasks.get_task_ids(PACKAGE_NAME, 'world')
    bump_world_cpus()
    hosts = shakedown.get_service_ips(PACKAGE_NAME)
    for host in hosts:
        tasks.kill_task_with_pattern('world', host)
    tasks.check_tasks_updated(PACKAGE_NAME, 'world', world_ids)
    check_running()
def replace_name_node(index):
    check_healthy()
    name_node_name = 'name-' + str(index)
    name_id = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, name_node_name)
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    cmd.run_cli('hdfs --name={} pod replace {}'.format(FOLDERED_SERVICE_NAME, name_node_name))

    expect_recovery()

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, name_node_name, name_id)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_permanent_and_transient_namenode_failures_0_1():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=foldered_name)
    name_0_ids = sdk_tasks.get_task_ids(foldered_name, 'name-0')
    name_1_ids = sdk_tasks.get_task_ids(foldered_name, 'name-1')
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace name-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart name-1')

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'name-0', name_0_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'name-1', name_1_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_bump_data_nodes():
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')
    log.info('data ids: ' + str(data_ids))

    sdk_marathon.bump_task_count_config(FOLDERED_SERVICE_NAME, 'DATA_COUNT')

    check_healthy(count=DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def _upgrade_or_downgrade(
        package_name,
        to_package_version,
        service_name,
        running_task_count,
        additional_options,
        timeout_seconds,
        wait_for_deployment):

    initial_config = get_config(package_name, service_name)
    task_ids = sdk_tasks.get_task_ids(service_name, '')

    if sdk_utils.dcos_version_less_than("1.10") or shakedown.ee_version() is None:
        log.info('Using marathon upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        sdk_marathon.destroy_app(service_name)
        sdk_install.install(
            package_name,
            service_name,
            running_task_count,
            additional_options=additional_options,
            package_version=to_package_version,
            timeout_seconds=timeout_seconds,
            wait_for_deployment=wait_for_deployment)
    else:
        log.info('Using CLI upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        if additional_options:
            with tempfile.NamedTemporaryFile() as opts_f:
                opts_f.write(json.dumps(additional_options).encode('utf-8'))
                opts_f.flush()  # ensure json content is available for the CLI to read below
                sdk_cmd.svc_cli(
                    package_name, service_name,
                    'update start --package-version={} --options={}'.format(
                        to_package_version, opts_f.name))
        else:
            sdk_cmd.svc_cli(
                package_name, service_name,
                'update start --package-version={}'.format(to_package_version))
        # we must manually upgrade the package CLI because it's not done automatically in this
        # flow (and why should it be? that would imply the package CLI replacing itself via a
        # call to the main CLI...)
        sdk_cmd.run_cli(
            'package install --yes --cli --package-version={} {}'.format(
                to_package_version, package_name))

    if wait_for_deployment:
        updated_config = get_config(package_name, service_name)
        if updated_config == initial_config:
            log.info('No config change detected. Tasks should not be restarted')
            sdk_tasks.check_tasks_not_updated(service_name, '', task_ids)
        else:
            log.info('Checking that all tasks have restarted')
            sdk_tasks.check_tasks_updated(service_name, '', task_ids)

        # this can take a while, default is 15 minutes. for example with HDFS, we can hit the
        # expected total task count via FINISHED tasks, without actually completing deployment
        log.info("Waiting for {}/{} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)
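# A hypothetical call into the helper above; all argument values are illustrative
# rather than taken from this section. A caller would typically wrap this in a
# public upgrade/downgrade entry point.
_upgrade_or_downgrade(
    package_name='hello-world',
    to_package_version='2.0.0-0.42.0',
    service_name='hello-world',
    running_task_count=3,
    additional_options={},
    timeout_seconds=15 * 60,
    wait_for_deployment=True,
)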
def test_upgrade():
    test_version = upgrade.get_pkg_version(PACKAGE_NAME)
    print('Found test version: {}'.format(test_version))

    repositories = json.loads(
        cmd.run_cli('package repo list --json'))['repositories']
    print("Repositories: " + str(repositories))
    if len(repositories) < 2:
        print("There is only one version in the repository. Skipping upgrade test!")
        # note: the original asserted on `repo[0]`, but `repo` is not defined until
        # the loop below; the repository list is what was meant
        assert repositories[0]['name'] == 'Universe'
        return

    test_repo_name, test_repo_url = upgrade.get_test_repo_info()

    for repo in repositories:
        if repo['name'] != 'Universe':
            shakedown.remove_package_repo(repo['name'])

    universe_version = upgrade.get_pkg_version(PACKAGE_NAME)
    print('Found Universe version: {}'.format(universe_version))

    print('Installing Universe version: {}'.format(universe_version))
    install.install(PACKAGE_NAME, DEFAULT_BROKER_COUNT)
    print('Installation complete for Universe version: {}'.format(universe_version))

    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
    broker_ids = tasks.get_task_ids(SERVICE_NAME, 'broker-')

    print('Adding test version to repository with name: {} and url: {}'.format(
        test_repo_name, test_repo_url))
    upgrade.add_repo(test_repo_name, test_repo_url, universe_version, 0, PACKAGE_NAME)

    print('Upgrading to test version: {}'.format(test_version))
    marathon.destroy_app(SERVICE_NAME)

    print('Installing test version: {}'.format(test_version))
    # installation will return with old tasks because they are still running
    install.install(PACKAGE_NAME, DEFAULT_BROKER_COUNT)
    print('Installation complete for test version: {}'.format(test_version))

    # wait until the tasks are restarted
    tasks.check_tasks_updated(SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE), broker_ids)
    print('All tasks are restarted')

    # all tasks are running
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    address = service_cli('endpoints {}'.format(DEFAULT_TASK_NAME))
    assert len(address) == 3
    assert len(address['dns']) == DEFAULT_BROKER_COUNT
    assert len(address['address']) == DEFAULT_BROKER_COUNT
def test_modify_app_config_rollback():
    check_healthy()

    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    zkfc_ids = tasks.get_task_ids(PACKAGE_NAME, 'zkfc')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    print('journal ids: ' + str(journal_ids))
    print('name ids: ' + str(name_ids))
    print('zkfc ids: ' + str(zkfc_ids))
    print('data ids: ' + str(data_ids))

    old_config = marathon.get_config(PACKAGE_NAME)
    config = marathon.get_config(PACKAGE_NAME)
    print('marathon config: ')
    print(config)
    expiry_ms = int(config['env'][app_config_field])
    print('expiry ms: ' + str(expiry_ms))
    config['env'][app_config_field] = str(expiry_ms + 1)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    # Wait for journal nodes to be affected by the change
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')

    print('old config: ')
    print(old_config)
    # Put the old config back (rollback)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=old_config)

    # Wait for the journal nodes to return to their old configuration
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()

    config = marathon.get_config(PACKAGE_NAME)
    assert int(config['env'][app_config_field]) == expiry_ms

    # ZKFC and Data tasks should not have been affected
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'zkfc', zkfc_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_bump_data_nodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')
    log.info('data ids: ' + str(data_ids))

    sdk_marathon.bump_task_count_config(foldered_name, 'DATA_COUNT')

    config.check_healthy(service_name=foldered_name, count=config.DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_permanent_and_transient_namenode_failures_1_0():
    check_healthy()
    name_0_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name-0')
    name_1_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name-1')
    journal_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    cmd.run_cli(
        'hdfs --name={} pods replace name-1'.format(FOLDERED_SERVICE_NAME))
    cmd.run_cli(
        'hdfs --name={} pods restart name-0'.format(FOLDERED_SERVICE_NAME))

    check_healthy()

    tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'name-0', name_0_ids)
    tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'name-1', name_1_ids)
    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def run_openssl_command() -> str:
    command = ' '.join([
        'timeout', openssl_timeout,
        'openssl', 's_client',
        '-cipher', cipher,
        '-connect', endpoint
    ])

    task_id = sdk_tasks.get_task_ids(service_name, task_name)[0]
    _, output = sdk_cmd.task_exec(task_id, command, True)
    return output
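# A hypothetical check on run_openssl_command's output; `cipher` is assumed to be
# the same enclosing-scope variable used above. On a successful handshake,
# openssl s_client reports the negotiated cipher on a "Cipher is ..." line, so a
# test could assert that the requested cipher was actually negotiated.
output = run_openssl_command()
assert 'Cipher is {}'.format(cipher) in output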
def test_ingest_node_replace():
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)
    config.wait_for_expected_nodes_to_exist()

    ingest_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'ingest-0')
    cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace ingest-0')
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'ingest-0', ingest_ids)

    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)
    config.wait_for_expected_nodes_to_exist()
def test_modify_app_config():
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    # the original checked the data tasks against journal_ids; fetch the data task
    # ids so the final check compares like with like
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    config = marathon.get_config(PACKAGE_NAME)
    sdk_utils.out('marathon config: ')
    sdk_utils.out(config)
    expiry_ms = int(config['env'][app_config_field])
    config['env'][app_config_field] = str(expiry_ms + 1)
    marathon.update_app(PACKAGE_NAME, config)

    # All tasks should be updated because hdfs-site.xml has changed
    check_healthy()
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'name', name_ids)
    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_ids)
def test_bump_journal_cpus():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    log.info('journal ids: ' + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(FOLDERED_SERVICE_NAME, 'JOURNAL_CPUS')

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    check_healthy()