def check_healthy(service_name, count=DEFAULT_TASK_COUNT, recovery_expected=False):
    sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds=25 * 60)
    if recovery_expected:
        # TODO(elezar): See INFINITY-2109 where we need to better handle recovery health checks
        sdk_plan.wait_for_kicked_off_recovery(service_name, timeout_seconds=25 * 60)
    sdk_plan.wait_for_completed_recovery(service_name, timeout_seconds=25 * 60)
    sdk_tasks.check_running(service_name, count)
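A minimal usage sketch for the check_healthy() helper above (the calling test, the pod name, and the restart command are illustrative assumptions, not part of the listing):

def test_pod_restart_stays_healthy():
    # Restart a pod, which kicks off a recovery plan rather than a new deployment.
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod restart hello-0")
    # recovery_expected=True makes the helper also wait for the recovery plan to be
    # kicked off and to complete before verifying the running task count.
    check_healthy(config.SERVICE_NAME, recovery_expected=True)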
def restart_zookeeper_node(id: int):
    sdk_cmd.svc_cli(config.ZOOKEEPER_PACKAGE_NAME, config.ZOOKEEPER_SERVICE_NAME,
                    "pod restart zookeeper-{}".format(id))
    sdk_plan.wait_for_kicked_off_recovery(config.ZOOKEEPER_SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.ZOOKEEPER_SERVICE_NAME)
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
def test_shutdown_host():
    replace_task = sdk_tasks.get_task_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert replace_task is not None, 'Could not find a node to shut down'

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
def test_auto_replace_on_decommission():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )
    assert len(candidate_tasks) != 0, "Could not find a node to decommission"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after decommission: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.decommission_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == 'node-2-server'][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config['env']['PLACEMENT_CONSTRAINT']
    try:
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(replace_task.host)
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = original_constraint
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
def test_auto_replace_on_drain():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        service_name, re.compile("^(master|data|coordinator)-[0-9]+-node$")
    )
    log.info("Candidate tasks: {}".format(candidate_tasks))
    assert len(candidate_tasks) != 0, "Could not find a node to drain"

    # Pick the host of the first task from the above list
    replace_agent_id = candidate_tasks[0].agent_id
    replace_tasks = [task for task in candidate_tasks if task.agent_id == replace_agent_id]
    log.info(
        "Tasks on agent {} to be replaced after drain: {}".format(replace_agent_id, replace_tasks)
    )
    sdk_agents.drain_agent(replace_agent_id)

    sdk_plan.wait_for_kicked_off_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id

    # Reactivate the drained agent, otherwise uninstall plans will be halted for portworx
    sdk_agents.reactivate_agent(replace_agent_id)
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # start replace and wait for it to finish
    cmd.run_cli('cassandra pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(PACKAGE_NAME)
    sdk_plan.wait_for_completed_recovery(PACKAGE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
def test_node_replace_replaces_node() -> None:
    replace_task = [
        task for task in sdk_tasks.get_summary() if task.name == "node-2-server"
    ][0]
    log.info("avoid host for task {}".format(replace_task))

    replace_pod_name = replace_task.name[:-len("-server")]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config["env"]["PLACEMENT_CONSTRAINT"]
    try:
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = '[["hostname", "UNLIKE", "{}"]]'.format(
            replace_task.host)
        sdk_marathon.update_app(marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        "pod replace {}".format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(
            config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
    finally:
        # revert to prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config["env"]["PLACEMENT_CONSTRAINT"] = original_constraint
        sdk_marathon.update_app(marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
def test_hostname_unique():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    options = _escape_placement_for_1_9({
        "service": {
            "spec_file": "examples/marathon_constraint.yml"
        },
        "hello": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        },
        "world": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        }
    })

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME,
                        config.get_num_private_agents() * 2,
                        additional_options=options)

    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME,
                            config.get_num_private_agents() * 2 - 1,
                            timeout_seconds=10)
    sdk_tasks.check_running(config.SERVICE_NAME, config.get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
def test_hostname_unique():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    options = _escape_placement_for_1_9({
        "service": {
            "yaml": "marathon_constraint"
        },
        "hello": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        },
        "world": {
            "count": config.get_num_private_agents(),
            "placement": "[[\"hostname\", \"UNIQUE\"]]"
        }
    })

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME,
                        config.get_num_private_agents() * 2,
                        additional_options=options)

    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME,
                            config.get_num_private_agents() * 2 - 1,
                            timeout_seconds=10)
    sdk_tasks.check_running(config.SERVICE_NAME, config.get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are checked to
    ensure that their tasks have been restarted.

    Any remaining pods are checked to ensure that their tasks are not changed.

    For example, performing a pod replace kafka-0 on a Kafka framework should result in ONLY
    the kafka-0-broker task being restarted. In this case, pods_with_updated_tasks is
    specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0), a rolling
    restart of other nodes is triggered, and
    pods_with_updated_tasks = ["node-0", "node-1", "node-2"]
    (assuming a three-node Cassandra ring)
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_with_updated_tasks = pods_with_updated_tasks if pods_with_updated_tasks else []
    pods_to_update = set(pods_with_updated_tasks + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        # Track each pod's own task ids so every affected pod can be checked individually.
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
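A hedged sketch of the two scenarios described in the docstring above; the package/service names and the timeout constant are illustrative assumptions, not values from the listing:

# Assumed values for illustration only.
RECOVERY_TIMEOUT_S = 25 * 60

# Kafka: replacing one broker pod should restart only that pod's tasks.
check_permanent_recovery("kafka", "kafka", "kafka-0", RECOVERY_TIMEOUT_S)

# Cassandra: replacing the seed node (node-0) triggers a rolling restart of the other
# nodes in a three-node ring, so those pods are also expected to have updated tasks.
check_permanent_recovery(
    "cassandra",
    "cassandra",
    "node-0",
    RECOVERY_TIMEOUT_S,
    pods_with_updated_tasks=["node-0", "node-1", "node-2"],
)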
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(
        config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
def test_envvar_accross_restarts():
    class ConfigException(Exception):
        pass

    def assert_envvar_has_value(envvar: str, expected_value: str):
        _, stdout, _ = sdk_cmd.service_task_exec(config.SERVICE_NAME, "hello-0-server", "env")
        env = dict(l.strip().split("=", 1) for l in stdout.strip().split('\n'))
        val = env.get(envvar, "absent")
        if val == "absent":
            raise ConfigException("Required envvar not found")
        if val != expected_value:
            log.error("Looking for %s=%s but found: %s", envvar, expected_value, val)
            raise ConfigException("Envvar not set to required value")
        log.info("%s has expected value %s", envvar, expected_value)

    envvar = "CONFIG_SLEEP_DURATION"
    sleep_duration = 9999

    try:
        assert_envvar_has_value(envvar, str(sleep_duration))
    except ConfigException:
        log.debug("%s is set to something other than %d as expected", envvar, sleep_duration)

    sdk_upgrade.update_or_upgrade_or_downgrade(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        to_version=None,
        to_options={
            "service": {
                "name": config.SERVICE_NAME,
                "sleep": sleep_duration,
                "yaml": "sidecar"
            }
        },
        expected_running_tasks=2,
        wait_for_deployment=True,
    )

    log.info("Checking after update")
    assert_envvar_has_value(envvar, str(sleep_duration))

    cmd_list = ["pod", "restart", "hello-0"]
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, " ".join(cmd_list))

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    log.info("Checking after restart")
    assert_envvar_has_value(envvar, str(sleep_duration))
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$")
    )
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [task for task in candidate_tasks if task.host == replace_hostname]
    log.info(
        "Tasks on host {} to be replaced after shutdown: {}".format(replace_hostname, replace_tasks)
    )

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[: -len("-server")] for task in replace_tasks])
    assert len(replace_pods) == len(
        replace_tasks
    ), "Expected one task per pod in tasks to replace: {}".format(replace_tasks)
    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace {}".format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task
            for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info(
            "Checking affected task has moved to a new agent:\n"
            "old={}\nnew={}".format(replaced_task, new_task)
        )
        assert replaced_task.agent_id != new_task.agent_id
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^(hello|world)-[0-9]+-server$"))
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"

    # Pick the host of the first task from the above list, then get ALL tasks which may be located
    # on that host. We'll need to 'pod replace' all of them.
    replace_hostname = candidate_tasks[0].host
    replace_tasks = [
        task for task in candidate_tasks if task.host == replace_hostname
    ]
    log.info("Tasks on host {} to be replaced after shutdown: {}".format(
        replace_hostname, replace_tasks))

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_hostname)
    # Reserved resources on this agent are expected to appear as orphaned in Mesos state.
    # Tell our uninstall validation to ignore orphaned resources coming from this agent.
    sdk_install.ignore_dead_agent(replace_hostname)

    # Get pod name from task name: "hello-0-server" => "hello-0"
    replace_pods = set([task.name[:-len("-server")] for task in replace_tasks])
    assert len(replace_pods) == len(
        replace_tasks
    ), "Expected one task per pod in tasks to replace: {}".format(replace_tasks)
    for pod_name in replace_pods:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        "pod replace {}".format(pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # For each task affected by the shutdown, find the new version of it, and check that it moved.
    # Note that the old version on the dead agent may still be present/'running' as
    # Mesos might not have fully acknowledged the agent's death.
    new_tasks = sdk_tasks.get_summary()
    for replaced_task in replace_tasks:
        new_task = [
            task for task in new_tasks
            if task.name == replaced_task.name and task.id != replaced_task.id
        ][0]
        log.info("Checking affected task has moved to a new agent:\n"
                 "old={}\nnew={}".format(replaced_task, new_task))
        assert replaced_task.agent_id != new_task.agent_id
def test_envvar_accross_restarts():
    class ConfigException(Exception):
        pass

    def assert_envvar_has_value(envvar: str, expected_value: str):
        _, stdout, _ = sdk_cmd.service_task_exec(config.SERVICE_NAME, "hello-0-server", "env")
        env = dict(l.strip().split("=", 1) for l in stdout.strip().split('\n'))
        val = env.get(envvar, "absent")
        if val == "absent":
            raise ConfigException("Required envvar not found")
        if val != expected_value:
            log.error("Looking for %s=%s but found: %s", envvar, expected_value, val)
            raise ConfigException("Envvar not set to required value")
        log.info("%s has expected value %s", envvar, expected_value)

    envvar = "CONFIG_SLEEP_DURATION"
    sleep_duration = 9999

    try:
        assert_envvar_has_value(envvar, str(sleep_duration))
    except ConfigException:
        log.debug("%s is set to something other than %d as expected", envvar, sleep_duration)

    sdk_upgrade.update_or_upgrade_or_downgrade(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        to_version=None,
        to_options={
            "service": {"name": config.SERVICE_NAME, "sleep": sleep_duration, "yaml": "sidecar"}
        },
        expected_running_tasks=2,
        wait_for_deployment=True,
    )

    log.info("Checking after update")
    assert_envvar_has_value(envvar, str(sleep_duration))

    cmd_list = ["pod", "restart", "hello-0"]
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, " ".join(cmd_list))

    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)

    log.info("Checking after restart")
    assert_envvar_has_value(envvar, str(sleep_duration))
def test_node_replace_replaces_node():
    pod_to_replace = 'node-2'
    pod_host = get_pod_host(pod_to_replace)
    log.info('avoid host for pod {}: {}'.format(pod_to_replace, pod_host))

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['PLACEMENT_CONSTRAINT'] = 'hostname:UNLIKE:{}'.format(pod_host)
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # start replace and wait for it to finish
    cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
def test_config_update_across_restart():
    foldered_service_name = config.get_foldered_service_name()

    batch_size_warn_threshold_in_kb = 15
    sdk_upgrade.update_or_upgrade_or_downgrade(
        config.PACKAGE_NAME,
        foldered_service_name,
        to_package_version=None,
        additional_options={
            "service": {"name": foldered_service_name},
            "cassandra": {"batch_size_warn_threshold_in_kb": batch_size_warn_threshold_in_kb},
        },
        expected_running_tasks=config.DEFAULT_TASK_COUNT,
        wait_for_deployment=True,
        timeout_seconds=config.DEFAULT_CASSANDRA_TIMEOUT,
    )

    for _ in range(3):
        cmd_list = ["pod", "restart", "node-0"]
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_service_name, " ".join(cmd_list))

        sdk_plan.wait_for_kicked_off_recovery(foldered_service_name)
        sdk_plan.wait_for_completed_recovery(
            foldered_service_name, timeout_seconds=config.DEFAULT_CASSANDRA_TIMEOUT
        )

        _, stdout, _ = sdk_cmd.service_task_exec(foldered_service_name, "node-0-server", "env")
        envvar = "CASSANDRA_BATCH_SIZE_WARN_THRESHOLD_IN_KB="
        envvar_pos = stdout.find(envvar)
        if envvar_pos < 0:
            raise Exception("Required envvar not found")

        if not stdout[envvar_pos + len(envvar) :].startswith(
            "{}".format(batch_size_warn_threshold_in_kb)
        ):
            found_string = stdout[envvar_pos + len(envvar) : envvar_pos + len(envvar) + 15]
            log.error(
                "Looking for %s%d but found: %s",
                envvar,
                batch_size_warn_threshold_in_kb,
                found_string,
            )
            raise Exception("Envvar not set to required value")
def test_replace_pods_to_legacy_role():
    # Issue pod replace operations till we move the pods to the legacy role.
    replace_pods = ["hello-0", "world-0", "world-1"]

    for pod in replace_pods:
        # start replace and wait for it to finish
        sdk_cmd.svc_cli(config.PACKAGE_NAME, SERVICE_NAME, "pod replace {}".format(pod))
        sdk_plan.wait_for_kicked_off_recovery(
            SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
        sdk_plan.wait_for_completed_recovery(
            SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)

        # Get the current service state to verify roles have applied.
        service_roles = sdk_utils.get_service_roles(SERVICE_NAME)
        current_task_roles = service_roles["task-roles"]
        task_name = "{}-server".format(pod)

        # Ensure we have transitioned over to the legacy role.
        assert current_task_roles[task_name] == LEGACY_ROLE

    # Get refreshed roles after the pod replaces
    service_roles = sdk_utils.get_service_roles(SERVICE_NAME)
    current_task_roles = service_roles["task-roles"]

    # We must have some role!
    assert len(current_task_roles) > 0

    assert LEGACY_ROLE in current_task_roles.values()
    assert ENFORCED_ROLE not in current_task_roles.values()

    # Ensure we're MULTI_ROLE
    assert service_roles["framework-roles"] is not None
    assert service_roles["framework-role"] is None

    assert len(service_roles["framework-roles"]) == 2
    assert LEGACY_ROLE in service_roles["framework-roles"]
    assert ENFORCED_ROLE in service_roles["framework-roles"]
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
):
    """
    Perform a replace operation on a specified pod and check that it is replaced

    All other pods are checked to see if they remain consistent.
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    pod_list = sdk_cmd.svc_cli(package_name, service_name, "pod list", json=True)

    tasks_to_replace = set(sdk_tasks.get_task_ids(service_name, pod_name))
    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list:
        if pod == pod_name:
            continue
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    replace_cmd = ["pod", "replace", pod_name]
    sdk_cmd.svc_cli(package_name, service_name, " ".join(replace_cmd), json=True)

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    sdk_tasks.check_tasks_updated(service_name, pod_name, tasks_to_replace)
    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary() if task.name == 'node-2-server'
    ][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['PLACEMENT_CONSTRAINT'] = '[["hostname", "UNLIKE", "{}"]]'.format(
        replace_task.host)
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # start replace and wait for it to finish
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(
        config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile("^node-[0-9]+-server$"))
    assert len(candidate_tasks) != 0, "Could not find a node to shut down"
    # Cassandra nodes should never share a machine
    assert len(candidate_tasks) == len(
        set([task.host for task in candidate_tasks])
    ), "Expected candidate tasks to all be on different hosts: {}".format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len("-server")]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_agents.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                    "pod replace {}".format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id
    ][0]
    log.info("Checking that the original pod has moved to a new agent:\n"
             "old={}\nnew={}".format(replace_task, new_task))
    assert replace_task.agent_id != new_task.agent_id