def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes():
    initial_master_task_ids = sdk_tasks.get_task_ids(foldered_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "describe", parse_json=True
    )

    ingest_nodes_count = get_in(["ingest_nodes", "count"], svc_config)
    coordinator_nodes_count = get_in(["coordinator_nodes", "count"], svc_config)

    global current_expected_task_count

    sdk_service.update_configuration(
        config.PACKAGE_NAME,
        foldered_name,
        {
            "ingest_nodes": {"count": ingest_nodes_count + 1},
            "coordinator_nodes": {"count": coordinator_nodes_count + 1},
        },
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually
        # afterwards with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    # Should be running 2 more tasks.
    current_expected_task_count += 2
    sdk_tasks.check_running(foldered_name, current_expected_task_count)

    # Master nodes should not restart.
    sdk_tasks.check_tasks_not_updated(foldered_name, "master", initial_master_task_ids)

    # Data nodes should not restart.
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", initial_data_task_ids)
def test_modify_app_config_rollback():
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'

    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    old_config = marathon.get_config(PACKAGE_NAME)
    config = marathon.get_config(PACKAGE_NAME)
    sdk_utils.out('marathon config: ')
    sdk_utils.out(config)
    expiry_ms = int(config['env'][app_config_field])
    sdk_utils.out('expiry ms: ' + str(expiry_ms))

    config['env'][app_config_field] = str(expiry_ms + 1)
    marathon.update_app(PACKAGE_NAME, config)

    # Wait for journal nodes to be affected by the change
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')

    sdk_utils.out('old config: ')
    sdk_utils.out(old_config)
    # Put the old config back (rollback)
    marathon.update_app(PACKAGE_NAME, old_config)

    # Wait for the journal nodes to return to their old configuration
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    check_healthy()

    config = marathon.get_config(PACKAGE_NAME)
    assert int(config['env'][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    initial_task_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, "master")
    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(FOLDERED_SERVICE_NAME), "elastic.scheduler.Main")
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, "master", initial_task_ids)
def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes() -> None:
    initial_master_task_ids = sdk_tasks.get_task_ids(service_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(service_name, "data")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(package_name, service_name, "describe", parse_json=True)

    ingest_nodes_count = get_in(["ingest_nodes", "count"], svc_config)
    coordinator_nodes_count = get_in(["coordinator_nodes", "count"], svc_config)

    global current_expected_task_count

    sdk_service.update_configuration(
        package_name,
        service_name,
        {
            "ingest_nodes": {"count": ingest_nodes_count + 1},
            "coordinator_nodes": {"count": coordinator_nodes_count + 1},
        },
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually
        # afterwards with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    # Should be running 2 more tasks.
    current_expected_task_count += 2
    sdk_tasks.check_running(service_name, current_expected_task_count)

    # Master nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "master", initial_master_task_ids)

    # Data nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "data", initial_data_task_ids)
def test_modify_app_config_rollback():
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    log.info('expiry ms: ' + str(expiry_ms))

    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')

    log.info('old config: ')
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(foldered_name, old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config['env'][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_task_ids = sdk_tasks.get_task_ids(foldered_name, '')
    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(foldered_name), "elastic.scheduler.Main")
    sdk_tasks.check_tasks_not_updated(foldered_name, '', initial_task_ids)
def test_modify_app_config_rollback():
    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"

    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(marathon_config)
    expiry_ms = int(marathon_config["env"][app_config_field])
    log.info("expiry ms: " + str(expiry_ms))

    marathon_config["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")

    log.info("old config: ")
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config["env"][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def test_modify_app_config_rollback():
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    old_config = sdk_marathon.get_config(foldered_name)
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    log.info('expiry ms: ' + str(expiry_ms))

    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # Wait for journal nodes to be affected by the change
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')

    log.info('old config: ')
    log.info(old_config)
    # Put the old config back (rollback)
    sdk_marathon.update_app(foldered_name, old_config)

    # Wait for the journal nodes to return to their old configuration
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    config.check_healthy(service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    assert int(marathon_config['env'][app_config_field]) == expiry_ms

    # Data tasks should not have been affected
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def _upgrade_or_downgrade(
        package_name,
        to_package_version,
        service_name,
        running_task_count,
        additional_options,
        timeout_seconds,
        wait_for_deployment):
    initial_config = get_config(package_name, service_name)
    task_ids = sdk_tasks.get_task_ids(service_name, '')

    if sdk_utils.dcos_version_less_than("1.10") or shakedown.ee_version() is None:
        log.info('Using marathon upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        sdk_marathon.destroy_app(service_name)
        sdk_install.install(
            package_name,
            service_name,
            running_task_count,
            additional_options=additional_options,
            package_version=to_package_version,
            timeout_seconds=timeout_seconds,
            wait_for_deployment=wait_for_deployment)
    else:
        log.info('Using CLI upgrade flow to upgrade {} {}'.format(
            package_name, to_package_version))
        if additional_options:
            with tempfile.NamedTemporaryFile() as opts_f:
                opts_f.write(json.dumps(additional_options).encode('utf-8'))
                opts_f.flush()  # ensure json content is available for the CLI to read below
                sdk_cmd.svc_cli(
                    package_name,
                    service_name,
                    'update start --package-version={} --options={}'.format(
                        to_package_version, opts_f.name))
        else:
            sdk_cmd.svc_cli(
                package_name,
                service_name,
                'update start --package-version={}'.format(to_package_version))
        # We must manually upgrade the package CLI because it's not done automatically in this
        # flow (and why should it? that'd imply the package CLI replacing itself via a call to
        # the main CLI...).
        sdk_cmd.run_cli(
            'package install --yes --cli --package-version={} {}'.format(
                to_package_version, package_name))

    if wait_for_deployment:
        updated_config = get_config(package_name, service_name)
        if updated_config == initial_config:
            log.info('No config change detected. Tasks should not be restarted')
            sdk_tasks.check_tasks_not_updated(service_name, '', task_ids)
        else:
            log.info('Checking that all tasks have restarted')
            sdk_tasks.check_tasks_updated(service_name, '', task_ids)

        # This can take a while; the default is 15 minutes. For example with HDFS, we can hit
        # the expected total task count via ONCE tasks without actually completing deployment.
        log.info("Waiting for package={} service={} to finish deployment plan...".format(
            package_name, service_name))
        sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds)
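# A minimal usage sketch for the helper above (all argument values are hypothetical; real
# tests take them from their module-level config):
_upgrade_or_downgrade(
    package_name='hdfs',                # illustrative package name
    to_package_version='2.0.0-2.6.0',   # hypothetical target version
    service_name='hdfs',
    running_task_count=10,              # task count expected once healthy
    additional_options={'service': {'name': 'hdfs'}},  # hypothetical options dict
    timeout_seconds=25 * 60,
    wait_for_deployment=True,
)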
def test_bump_data_nodes():
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')
    log.info('data ids: ' + str(data_ids))

    sdk_marathon.bump_task_count_config(FOLDERED_SERVICE_NAME, 'DATA_COUNT')

    check_healthy(count=DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_bump_data_nodes():
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    sdk_utils.out('data ids: ' + str(data_ids))

    marathon.bump_task_count_config(PACKAGE_NAME, 'DATA_COUNT')

    check_healthy(DEFAULT_TASK_COUNT + 1)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_bump_data_nodes():
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    log.info("data ids: " + str(data_ids))

    sdk_marathon.bump_task_count_config(foldered_name, "DATA_COUNT")

    config.check_healthy(service_name=foldered_name, count=config.DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are checked to
    ensure that their tasks have been restarted. Any remaining pods are checked to ensure that
    their tasks are not changed.

    For example, performing a pod replace on kafka-0 on a Kafka framework should result in ONLY
    the kafka-0-broker task being restarted. In this case, pods_with_updated_tasks is specified
    as None.

    When performing a pod replace operation on a Cassandra seed node (node-0), a rolling
    restart of other nodes is triggered, and pods_with_updated_tasks = ["node-0", "node-1",
    "node-2"] (assuming a three-node Cassandra ring).
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_with_updated_tasks = pods_with_updated_tasks if pods_with_updated_tasks else []
    pods_to_update = set(pods_with_updated_tasks + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        # Record each pod's own task IDs (not just `pod_name`'s) so the per-pod checks below
        # compare against the right baseline.
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
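# Usage sketches for check_permanent_recovery, following its docstring (package/service names
# and the timeout are illustrative):
# Replacing a Kafka broker should restart only that broker's task.
check_permanent_recovery("kafka", "kafka", "kafka-0", recovery_timeout_s=25 * 60)
# Replacing a Cassandra seed node also triggers a rolling restart of the other nodes
# (assuming a three-node ring).
check_permanent_recovery(
    "cassandra",
    "cassandra",
    "node-0",
    recovery_timeout_s=25 * 60,
    pods_with_updated_tasks=["node-0", "node-1", "node-2"],
)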
def test_bump_data_nodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')
    log.info('data ids: ' + str(data_ids))

    sdk_marathon.bump_task_count_config(foldered_name, 'DATA_COUNT')

    config.check_healthy(service_name=foldered_name, count=config.DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_adding_data_node_only_restarts_masters() -> None:
    initial_master_task_ids = sdk_tasks.get_task_ids(foldered_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    initial_coordinator_task_ids = sdk_tasks.get_task_ids(foldered_name, "coordinator")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "describe", parse_json=True)

    data_nodes_count = get_in(["data_nodes", "count"], svc_config)

    global current_expected_task_count

    # Increase the data nodes count by 1.
    sdk_service.update_configuration(
        config.PACKAGE_NAME,
        foldered_name,
        {"data_nodes": {"count": data_nodes_count + 1}},
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually
        # afterwards with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    sdk_plan.wait_for_kicked_off_deployment(foldered_name)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    _, new_data_pod_info, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME,
        foldered_name,
        "pod info data-{}".format(data_nodes_count),
        parse_json=True,
    )

    # Get task ID for the new data node task.
    new_data_task_id = get_in([0, "info", "taskId", "value"], new_data_pod_info)

    # Should be running 1 more task.
    current_expected_task_count += 1
    sdk_tasks.check_running(foldered_name, current_expected_task_count)

    # Master nodes should restart.
    sdk_tasks.check_tasks_updated(foldered_name, "master", initial_master_task_ids)

    # Data node tasks should be the initial ones plus the new one.
    sdk_tasks.check_tasks_not_updated(
        foldered_name, "data", initial_data_task_ids + [new_data_task_id])

    # Coordinator tasks should not restart.
    sdk_tasks.check_tasks_not_updated(foldered_name, "coordinator", initial_coordinator_task_ids)
def test_kill_journal_node():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal-0')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    sdk_tasks.kill_task_with_pattern(
        'journalnode', sdk_hosts.system_host(FOLDERED_SERVICE_NAME, 'journal-0-node'))
    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_kill_name_node():
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name-0')
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    tasks.kill_task_with_pattern('namenode', 'name-0-node.hdfs.mesos')
    check_healthy()

    tasks.check_tasks_updated(PACKAGE_NAME, 'name', name_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_bump_hello_nodes():
    config.check_running(FOLDERED_SERVICE_NAME)

    hello_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'hello')
    log.info('hello ids: ' + str(hello_ids))

    sdk_marathon.bump_task_count_config(FOLDERED_SERVICE_NAME, 'HELLO_COUNT')

    config.check_running(FOLDERED_SERVICE_NAME)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'hello', hello_ids)
def test_kill_name_node():
    name_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'name-0')
    journal_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'data')

    sdk_tasks.kill_task_with_pattern(
        'namenode', sdk_hosts.system_host(config.FOLDERED_SERVICE_NAME, 'name-0-node'))
    config.expect_recovery(service_name=config.FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(config.FOLDERED_SERVICE_NAME, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_bump_hello_nodes():
    check_running()

    hello_ids = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    sdk_utils.out('hello ids: ' + str(hello_ids))

    marathon.bump_task_count_config(PACKAGE_NAME, 'HELLO_COUNT')

    check_running()
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', hello_ids)
def test_kill_data_node():
    data_task = sdk_tasks.get_service_tasks(foldered_name, "data-0")[0]
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")

    sdk_cmd.kill_task_with_pattern("datanode", "nobody", agent_host=data_task.host)
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", [data_task.id])
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
def test_kill_data_node():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data-0')
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')

    sdk_cmd.kill_task_with_pattern(
        'datanode', sdk_hosts.system_host(foldered_name, 'data-0-node'))
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
def test_bump_data_nodes():
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')
    log.info('data ids: ' + str(data_ids))

    sdk_marathon.bump_task_count_config(FOLDERED_SERVICE_NAME, 'DATA_COUNT')

    config.check_healthy(service_name=FOLDERED_SERVICE_NAME, count=config.DEFAULT_TASK_COUNT + 1)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_kill_all_journalnodes():
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    for host in shakedown.get_service_ips(PACKAGE_NAME):
        tasks.kill_task_with_pattern('journalnode', host)

    check_healthy()

    # name nodes fail and restart, so don't check those
    tasks.check_tasks_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_bump_hello_nodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)

    hello_ids = sdk_tasks.get_task_ids(foldered_name, 'hello')
    log.info('hello ids: ' + str(hello_ids))

    sdk_marathon.bump_task_count_config(foldered_name, 'HELLO_COUNT')

    config.check_running(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'hello', hello_ids)
def test_bump_journal_cpus():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    log.info('journal ids: ' + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(FOLDERED_SERVICE_NAME, 'JOURNAL_CPUS')

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    # journal node update should not cause any of the name nodes to crash
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
    config.check_healthy(service_name=FOLDERED_SERVICE_NAME)
def test_kill_journal_node():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal-0')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    sdk_tasks.kill_task_with_pattern(
        'journalnode', sdk_hosts.system_host(foldered_name, 'journal-0-node'))
    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_bump_journal_cpus():
    journal_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'name')
    log.info('journal ids: ' + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(config.FOLDERED_SERVICE_NAME, 'JOURNAL_CPUS')

    sdk_tasks.check_tasks_updated(config.FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    # journal node update should not cause any of the name nodes to crash
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'name', name_ids)
    config.check_healthy(service_name=config.FOLDERED_SERVICE_NAME)
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for host in shakedown.get_service_ips(FOLDERED_SERVICE_NAME):
        sdk_tasks.kill_task_with_pattern('journalnode', host)

    expect_recovery()

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(config.FOLDERED_SERVICE_NAME, 'data')

    for journal_pod in config.get_pod_type_instances("journal", config.FOLDERED_SERVICE_NAME):
        sdk_cmd.run_cli('hdfs --name={} pod restart {}'.format(
            config.FOLDERED_SERVICE_NAME, journal_pod))

    config.expect_recovery(service_name=config.FOLDERED_SERVICE_NAME)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(config.FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(config.FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart {}".format(journal_pod))

    config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def test_kill_data_node():
    data_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data-0')
    journal_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')

    tasks.kill_task_with_pattern(
        'datanode', hosts.system_host(FOLDERED_SERVICE_NAME, 'data-0-node'))
    check_healthy()

    tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
def test_kill_all_datanodes():
    journal_ids = tasks.get_task_ids(PACKAGE_NAME, 'journal')
    name_ids = tasks.get_task_ids(PACKAGE_NAME, 'name')
    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')

    for host in shakedown.get_service_ips(PACKAGE_NAME):
        tasks.kill_task_with_pattern('datanode', host)

    check_healthy()

    tasks.check_tasks_updated(PACKAGE_NAME, 'data', data_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'journal', journal_ids)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'name', name_ids)
def test_bump_journal_cpus():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    log.info("journal ids: " + str(journal_ids))

    sdk_marathon.bump_cpu_count_config(foldered_name, "JOURNAL_CPUS")

    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    # A journal node update should not cause any of the name nodes to crash. If the name nodes
    # did crash, it would imply the journal nodes were updated in parallel when they should
    # have been updated serially: for journal nodes the deploy plan is parallel while the
    # update plan is serial, so a crash here would suggest the deploy plan was mistakenly used.
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
    config.check_healthy(service_name=foldered_name)
def test_kill_all_journalnodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for journal_pod in config.get_pod_type_instances("journal", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart {}'.format(journal_pod))

    config.expect_recovery(service_name=foldered_name)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_bump_hello_nodes():
    check_running()

    hello_ids = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    print('hello ids: ' + str(hello_ids))

    config = marathon.get_config(PACKAGE_NAME)
    node_count = int(config['env']['HELLO_COUNT']) + 1
    config['env']['HELLO_COUNT'] = str(node_count)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    check_running()
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', hello_ids)
def test_kill_all_journalnodes(hdfs_server):
    service_name = hdfs_server["service"]["name"]

    journal_ids = sdk_tasks.get_task_ids(service_name, "journal")
    name_ids = sdk_tasks.get_task_ids(service_name, "name")
    data_ids = sdk_tasks.get_task_ids(service_name, "data")

    for journal_pod in config.get_pod_type_instances("journal", service_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, "pod restart {}".format(journal_pod))

    config.expect_recovery(service_name=service_name)

    sdk_tasks.check_tasks_updated(service_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(service_name, "name", name_ids)
    sdk_tasks.check_tasks_not_updated(service_name, "data", data_ids)
def test_kill_all_datanodes():
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    for data_pod in config.get_pod_type_instances("data", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart {}".format(data_pod))

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "data", data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "name", name_ids)
def test_bump_data_nodes():
    check_healthy()

    data_ids = tasks.get_task_ids(PACKAGE_NAME, 'data')
    print('data ids: ' + str(data_ids))

    config = marathon.get_config(PACKAGE_NAME)
    node_count = int(config['env']['DATA_COUNT']) + 1
    config['env']['DATA_COUNT'] = str(node_count)
    cmd.request('put', marathon.api_url('apps/' + PACKAGE_NAME), json=config)

    check_healthy(DEFAULT_TASK_COUNT + 1)
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'data', data_ids)
def test_bump_hello_nodes():
    check_running()

    hello_ids = tasks.get_task_ids(PACKAGE_NAME, 'hello')
    print('hello ids: ' + str(hello_ids))

    config = marathon.get_config(PACKAGE_NAME)
    node_count = int(config['env']['HELLO_COUNT']) + 1
    config['env']['HELLO_COUNT'] = str(node_count)
    marathon.update_app(PACKAGE_NAME, config)

    check_running()
    tasks.check_tasks_not_updated(PACKAGE_NAME, 'hello', hello_ids)
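# The get-config/increment/update pattern in the two tests above is presumably what the
# `bump_task_count_config` helper used by the newer tests wraps. A minimal sketch under that
# assumption (the `delta` parameter is hypothetical):
def bump_task_count_config(service_name, key_name, delta=1):
    config = marathon.get_config(service_name)
    # Marathon env values are strings, so parse, bump, and re-serialize.
    config['env'][key_name] = str(int(config['env'][key_name]) + delta)
    marathon.update_app(service_name, config)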
def test_kill_all_namenodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for host in shakedown.get_service_ips(FOLDERED_SERVICE_NAME):
        sdk_tasks.kill_task_with_pattern('namenode', host)

    expect_recovery()

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_kill_all_datanodes():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    for data_pod in config.get_pod_type_instances("data", foldered_name):
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart {}'.format(data_pod))

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'name', name_ids)
def test_state_refresh_disable_cache():
    '''Disables caching via a scheduler envvar'''
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)
    task_ids = sdk_tasks.get_task_ids(foldered_name, '')

    # caching enabled by default:
    stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache')
    assert "Received cmd: refresh" in stdout

    marathon_config = sdk_marathon.get_config(foldered_name)
    marathon_config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once the
    # scheduler is up):
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=120 * 1000,
        retry_on_result=lambda res: not res)
    def check_cache_refresh_fails_409conflict():
        output = sdk_cmd.svc_cli(
            config.PACKAGE_NAME,
            foldered_name,
            'debug state refresh_cache',
            return_stderr_in_stdout=True)
        return "failed: 409 Conflict" in output
    check_cache_refresh_fails_409conflict()

    marathon_config = sdk_marathon.get_config(foldered_name)
    del marathon_config['env']['DISABLE_STATE_CACHE']
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_tasks.check_tasks_not_updated(foldered_name, '', task_ids)
    config.check_running(foldered_name)
    shakedown.deployment_wait()  # ensure marathon thinks the deployment is complete too

    # caching reenabled, refresh_cache should succeed (eventually, once the scheduler is up):
    @retrying.retry(
        wait_fixed=1000,
        stop_max_delay=120 * 1000,
        retry_on_result=lambda res: not res)
    def check_cache_refresh():
        return sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug state refresh_cache')
    stdout = check_cache_refresh()
    assert "Received cmd: refresh" in stdout
def test_kill_all_journalnodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for journal_pod in config.get_pod_type_instances("journal", FOLDERED_SERVICE_NAME):
        sdk_cmd.svc_cli(
            config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'pod restart {}'.format(journal_pod))

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    # name nodes fail and restart, so don't check those
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def test_kill_all_datanodes():
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    name_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    for data_pod in config.get_pod_type_instances("data", FOLDERED_SERVICE_NAME):
        sdk_cmd.svc_cli(
            config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'pod restart {}'.format(data_pod))

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'name', name_ids)
def test_permanent_and_transient_namenode_failures_1_0():
    config.check_healthy(service_name=FOLDERED_SERVICE_NAME)
    name_0_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name-0')
    name_1_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'name-1')
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'pod replace name-1')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'pod restart name-0')

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'name-0', name_0_ids)
    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, 'name-1', name_1_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
def replace_name_node(index):
    config.check_healthy(service_name=FOLDERED_SERVICE_NAME)
    name_node_name = 'name-' + str(index)
    name_id = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, name_node_name)
    journal_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'journal')
    data_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, 'data')

    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'pod replace {}'.format(name_node_name))

    config.expect_recovery(service_name=FOLDERED_SERVICE_NAME)

    sdk_tasks.check_tasks_updated(FOLDERED_SERVICE_NAME, name_node_name, name_id)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, 'data', data_ids)
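# A minimal sketch of how tests presumably drive the helper above, one name node at a time
# (the wrapper test names are hypothetical):
def test_replace_name_node_0():
    replace_name_node(0)


def test_replace_name_node_1():
    replace_name_node(1)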
def test_kill_scheduler():
    task_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "")
    scheduler_task_prefix = sdk_marathon.get_scheduler_task_prefix(config.SERVICE_NAME)
    scheduler_ids = sdk_tasks.get_task_ids("marathon", scheduler_task_prefix)
    assert len(scheduler_ids) == 1, "Expected to find ONLY one scheduler task but found {}".format(
        scheduler_ids)

    sdk_cmd.kill_task_with_pattern(
        "./hello-world-scheduler/bin/helloworld",
        "nobody",
        agent_host=sdk_marathon.get_scheduler_host(config.SERVICE_NAME),
    )

    sdk_tasks.check_tasks_updated("marathon", scheduler_task_prefix, scheduler_ids)
    sdk_tasks.wait_for_active_framework(config.SERVICE_NAME)
    config.check_running()
    sdk_tasks.check_tasks_not_updated(config.SERVICE_NAME, "", task_ids)
def test_permanent_and_transient_namenode_failures_1_0():
    config.check_healthy(service_name=foldered_name)
    name_0_ids = sdk_tasks.get_task_ids(foldered_name, "name-0")
    name_1_ids = sdk_tasks.get_task_ids(foldered_name, "name-1")
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod replace name-1")
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "pod restart name-0")

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, "name-0", name_0_ids)
    sdk_tasks.check_tasks_updated(foldered_name, "name-1", name_1_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", data_ids)
def test_permanent_and_transient_namenode_failures_0_1():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_healthy(service_name=foldered_name)
    name_0_ids = sdk_tasks.get_task_ids(foldered_name, 'name-0')
    name_1_ids = sdk_tasks.get_task_ids(foldered_name, 'name-1')
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace name-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod restart name-1')

    config.expect_recovery(service_name=foldered_name)

    sdk_tasks.check_tasks_updated(foldered_name, 'name-0', name_0_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'name-1', name_1_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'data', data_ids)
def test_adding_data_node_only_restarts_masters():
    initial_master_task_ids = sdk_tasks.get_task_ids(foldered_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(foldered_name, "data")
    initial_coordinator_task_ids = sdk_tasks.get_task_ids(foldered_name, "coordinator")

    marathon_config = sdk_marathon.get_config(foldered_name)
    data_nodes = int(marathon_config['env']['DATA_NODE_COUNT'])
    marathon_config['env']['DATA_NODE_COUNT'] = str(data_nodes + 1)
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_plan.wait_for_completed_deployment(foldered_name)

    global current_expected_task_count
    current_expected_task_count += 1
    sdk_tasks.check_running(foldered_name, current_expected_task_count)
    sdk_tasks.check_tasks_updated(foldered_name, "master", initial_master_task_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "data", initial_data_task_ids)
    sdk_tasks.check_tasks_not_updated(foldered_name, "coordinator", initial_coordinator_task_ids)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_kill_scheduler():
    task_ids = sdk_tasks.get_task_ids(foldered_name, "")
    scheduler_task_prefix = sdk_marathon.get_scheduler_task_prefix(foldered_name)
    scheduler_ids = sdk_tasks.get_task_ids("marathon", scheduler_task_prefix)
    assert len(scheduler_ids) == 1, "Expected to find one scheduler task"

    sdk_cmd.kill_task_with_pattern(
        "./hdfs-scheduler/bin/hdfs",
        "nobody",
        agent_host=sdk_marathon.get_scheduler_host(foldered_name),
    )

    # scheduler should be restarted, but service tasks should be left as-is:
    sdk_tasks.check_tasks_updated("marathon", scheduler_task_prefix, scheduler_ids)
    sdk_tasks.wait_for_active_framework(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, "", task_ids)
    config.check_healthy(service_name=foldered_name)