def test_mom_with_network_failure_bounce_master():
    """Check that a MoM task keeps its ID across a network partition and a master restart.

    The test launches the app from ``apps.sleep_app()`` on Marathon on Marathon
    (MoM), partitions the MoM agent and the agent running the task, restarts
    the Mesos master while both are cut off, reconnects the agents and finally
    asserts that the first task of the app still has its original ID.
    """
    # get MoM ip
    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        logger.info("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)
    try:
        # wait for a min
        time.sleep(timedelta(minutes=1).total_seconds())

        # bounce master
        shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")
    finally:
        # bring the net up -- done in ``finally`` so that a failure while the
        # agents are cut off does not leave them partitioned afterwards
        reconnect_agent(mom_ip)
        reconnect_agent(task_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    common.wait_for_service_endpoint('marathon-user', timedelta(minutes=10).total_seconds(), path="ping")

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'), timedelta(minutes=10).total_seconds())

        # Retried up to 30 times, one second apart, until the assertion holds.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def assert_mom_ee(version, security_mode='permissive'):
    """Deploy MoM-EE for ``version`` / ``security_mode`` and wait until it answers on ``ping``.

    Runs the prerequisite ``ensure_*`` helpers, loads the matching
    ``mom-ee-<security_mode>-<version>.json`` fixture, swaps in the image
    returned by ``mom_ee_image(version)``, submits the app and waits for the
    deployment and for the service endpoint.

    Raises:
        AssertionError: if the fixture file for the requested combination
            does not exist.
    """
    is_strict = security_mode == 'strict'

    ensure_prerequisites_installed()
    ensure_service_account()
    ensure_permissions()
    ensure_sa_secret(strict=is_strict)
    ensure_docker_config_secret()

    # In strict mode all tasks are started as user `nobody` by default. However we start
    # MoM-EE as 'root' and for that we need to give root marathon ACLs to start
    # tasks as 'root'.
    if is_strict:
        common.add_dcos_marathon_user_acls()

    # Pick the fixture that matches the requested security mode and version.
    fixtures_dir = fixtures.fixtures_dir()
    app_def_file = '{}/mom-ee-{}-{}.json'.format(fixtures_dir, security_mode, version)
    assert os.path.isfile(app_def_file), \
        "Couldn't find appropriate MoM-EE definition: {}".format(app_def_file)

    image = mom_ee_image(version)
    print('Deploying {} definition with {} image'.format(app_def_file, image))

    app_def = get_resource(app_def_file)
    docker_section = app_def['container']['docker']
    docker_section['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image)
    app_id = app_def["id"]

    marathon_client = marathon.create_client()
    marathon_client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    endpoint = mom_ee_endpoint(version, security_mode)
    common.wait_for_service_endpoint(endpoint, path="ping")
def test_mom_when_mom_agent_bounced():
    """Restart the agent MoM runs on and expect the MoM-launched task to keep its ID."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]
    mom_ip = common.ip_of_mom()
    # Pin the app to the host returned by common.ip_other_than_mom().
    common.pin_to_host(app_def, common.ip_other_than_mom())

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        common.deployment_wait(service_id=app_id)
        original_task_id = client.get_tasks(app_id)[0]['id']

        shakedown.restart_agent(mom_ip)

        # Poll once per second, at most 30 times, until the task is visible
        # again with its original ID.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def assert_task_id_unchanged():
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        assert_task_id_unchanged()
def assert_mom_ee(version, security_mode='permissive'):
    """Deploy MoM-EE for ``version`` / ``security_mode`` and wait for its service endpoint.

    Runs the prerequisite ``ensure_*`` helpers, loads the matching
    ``mom-ee-<security_mode>-<version>.json`` fixture, swaps in the image
    returned by ``mom_ee_image(version)``, submits the app and waits for the
    deployment and the endpoint.

    Raises:
        AssertionError: if the fixture file for the requested combination
            does not exist.
    """
    is_strict = security_mode == 'strict'

    ensure_prerequisites_installed()
    ensure_service_account()
    ensure_permissions()
    ensure_secret(strict=is_strict)
    ensure_docker_credentials()

    # Pick the fixture that matches the requested security mode and version.
    fixtures_dir = fixtures.fixtures_dir()
    app_def_file = '{}/mom-ee-{}-{}.json'.format(fixtures_dir, security_mode, version)
    assert os.path.isfile(app_def_file), \
        "Couldn't find appropriate MoM-EE definition: {}".format(app_def_file)

    image = mom_ee_image(version)
    print('Deploying {} definition with {} image'.format(app_def_file, image))

    app_def = get_resource(app_def_file)
    docker_section = app_def['container']['docker']
    docker_section['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image)

    marathon_client = marathon.create_client()
    marathon_client.add_app(app_def)
    shakedown.deployment_wait()

    endpoint = mom_ee_endpoint(version, security_mode)
    shakedown.wait_for_service_endpoint(endpoint)
def test_mom_when_mom_process_killed():
    """Kill the MoM process and expect the task it launched to keep its ID."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]
    # Pin the app to the host returned by common.ip_other_than_mom().
    common.pin_to_host(app_def, common.ip_other_than_mom())

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        original_task_id = client.get_tasks(app_id)[0]['id']

        # Kill 'marathon-assembly' on the MoM host, then wait for the
        # 'marathon-user' task and its ping endpoint before checking.
        mom_host = common.ip_of_mom()
        common.kill_process_on_host(mom_host, 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        common.wait_for_service_endpoint('marathon-user', path="ping")

        # Poll once per second, at most 30 times.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def assert_task_id_unchanged():
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        assert_task_id_unchanged()
def test_mom_when_mom_process_killed():
    """Kill the MoM process and expect the task it launched to keep its ID."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]
    # Pin the app to the host returned by common.ip_other_than_mom().
    common.pin_to_host(app_def, common.ip_other_than_mom())

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        original_task_id = client.get_tasks(app_id)[0]['id']

        # Kill 'marathon-assembly' on the MoM host, then wait for the
        # 'marathon-user' task and its service endpoint before checking.
        mom_host = common.ip_of_mom()
        shakedown.kill_process_on_host(mom_host, 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        # Poll once per second, at most 30 times.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def assert_task_id_unchanged():
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        assert_task_id_unchanged()
def test_mom_with_network_failure_bounce_master():
    """Check that a MoM task keeps its ID across a network partition and a master restart.

    The test launches the app from ``apps.sleep_app()`` on Marathon on Marathon
    (MoM), partitions the MoM agent and the agent running the task, restarts
    the Mesos master while both are cut off, reconnects the agents and finally
    asserts that the first task of the app still has its original ID.
    """
    # get MoM ip
    mom_ip = common.ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        print("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)
    try:
        # wait for a min
        time.sleep(timedelta(minutes=1).total_seconds())

        # bounce master
        shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")
    finally:
        # bring the net up -- done in ``finally`` so that a failure while the
        # agents are cut off does not leave them partitioned afterwards
        reconnect_agent(mom_ip)
        reconnect_agent(task_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    shakedown.wait_for_service_endpoint('marathon-user', timedelta(minutes=10).total_seconds())

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'), timedelta(minutes=10).total_seconds())

        # Retried up to 30 times, one second apart, until the assertion holds.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def delete_all_apps():
    """Force-remove every Marathon app except ``/marathon-user``, which only gets a warning."""
    marathon_client = marathon.create_client()
    for app in marathon_client.get_apps():
        if app['id'] != '/marathon-user':
            # Second positional argument is passed as True, as in the
            # original call (presumably "force" -- confirm against client API).
            marathon_client.remove_app(app['id'], True)
            continue
        print('WARNING: not removing marathon-user, because it is special')
def __marathon_leadership_changed_in_marathon_api(original_leader):
    """Assert that the leader reported by the Marathon API differs from ``original_leader``.

    Callers are expected to retry this: leader election takes time, and nodes
    that have not yet learned about the new leader may answer with HTTP 502.

    Raises:
        AssertionError: if the API still reports ``original_leader``.
    """
    marathon_client = marathon.create_client()
    current_leader = marathon_client.get_leader()
    print('leader according to marathon API: {}'.format(current_leader))
    assert original_leader != current_leader
def stop_all_deployments(noisy=False):
    """Try to stop every running deployment, ignoring individual failures.

    Args:
        noisy: when true, print the exception raised for a deployment that
            could not be stopped; otherwise failures are silent.
    """
    marathon_client = marathon.create_client()
    for running in marathon_client.get_deployments():
        try:
            marathon_client.stop_deployment(running['id'])
        except Exception as error:
            # Best effort: keep going with the remaining deployments.
            if not noisy:
                continue
            print(error)
def cluster_info(mom_name='marathon-user'):
    """Print DC/OS version, agent count and Marathon / MoM versions.

    Args:
        mom_name: service name of the Marathon on Marathon instance to probe.
    """
    dcos_version = shakedown.dcos_version()
    ee_version = shakedown.ee_version()
    print("DC/OS: {}, in {} mode".format(dcos_version, ee_version))

    private_agents = shakedown.get_private_agents()
    print("Agents: {}".format(len(private_agents)))

    root_about = marathon.create_client().get_about()
    print("Marathon version: {}".format(root_about.get("version")))

    if not shakedown.service_available_predicate(mom_name):
        print("Marathon MoM not present")
        return

    with shakedown.marathon_on_marathon(mom_name):
        try:
            mom_about = marathon.create_client().get_about()
            print("Marathon MoM version: {}".format(mom_about.get("version")))
        except Exception:
            # The service was reported available but the query still failed.
            print("Marathon MoM not present")
def clear_pods():
    """Best-effort removal of all pods, followed by a wait for deployments.

    Any exception is caught so that cleanup never fails the caller, but the
    error is printed instead of being dropped silently, so a broken cleanup
    stays visible in the test output.
    """
    try:
        client = marathon.create_client()
        for pod in client.list_pod():
            # Second positional argument is passed as True, as in the
            # original call (presumably "force" -- confirm against client API).
            client.remove_pod(pod["id"], True)
        shakedown.deployment_wait()
    except Exception as e:
        # Deliberately not re-raised: this is cleanup code.
        print('Ignoring error while clearing pods: {}'.format(e))
def deployment_predicate(service_id=None):
    """Return True when no (relevant) deployment is in flight.

    Args:
        service_id: when None, every deployment counts; otherwise only
            deployments listing ``service_id`` in ``affectedApps`` or
            ``affectedPods`` are considered.

    Returns:
        True if the number of considered deployments is zero.
    """
    running = marathon.create_client().get_deployments()
    if service_id is None:
        return len(running) == 0

    matching = 0
    for entry in running:
        if service_id in entry['affectedApps'] or service_id in entry['affectedPods']:
            matching += 1
    return matching == 0
def __marathon_leadership_changed_in_marathon_api(original_leader):
    """Assert that the Marathon API reports a leader other than ``original_leader``.

    Callers are expected to retry this: leader election takes time, and nodes
    that have not yet learned about the new leader may answer with HTTP 502.

    Returns:
        The IP part of the current leader address.

    Raises:
        AssertionError: if the API still reports ``original_leader``.
    """
    # Leader is returned like this 10.0.6.88:8080 - we want just the IP
    leader_address = marathon.create_client().get_leader()
    current_leader = leader_address.split(':', 1)[0]
    print('leader according to marathon API: {}'.format(current_leader))
    assert original_leader != current_leader
    return current_leader
def deployments_for(service_id=None):
    """Return the running deployments, optionally restricted to one service.

    Args:
        service_id: when None, all deployments are returned; otherwise only
            those listing ``service_id`` in ``affectedApps`` or
            ``affectedPods``.
    """
    running = marathon.create_client().get_deployments()
    if service_id is None:
        return running

    matching = []
    for entry in running:
        if service_id in entry['affectedApps'] or service_id in entry['affectedPods']:
            matching.append(entry)
    return matching
def simple_sleep_app(name):
    """Deploy the sleep app on the MoM-EE called ``name`` and report whether a task shows up.

    Returns:
        True if ``shakedown.get_service_task`` returns something other than
        None for the deployed app, False otherwise.
    """
    with shakedown.marathon_on_marathon(name=name):
        mom_client = marathon.create_client()
        sleep_def = apps.sleep_app()
        mom_client.add_app(sleep_def)
        shakedown.deployment_wait()

        task_name = sleep_def["id"].lstrip("/")
        found = shakedown.get_service_task(name, task_name)
        print('MoM-EE tasks: {}'.format(found))
        return found is not None
def remove_mom_ee():
    """Delete the apps of every available MoM-EE flavour, then remove the MoM-EE app itself.

    Versions 1.4 and 1.3 are each probed in 'strict', 'permissive' and
    'disabled' mode; for every endpoint that is available, all its apps are
    deleted. Afterwards ``MOM_EE_NAME`` is removed from the root Marathon.
    """
    for version in ('1.4', '1.3'):
        for security_mode in ('strict', 'permissive', 'disabled'):
            endpoint = mom_ee_endpoint(version, security_mode)
            if not shakedown.service_available_predicate(endpoint):
                continue
            print('Removing {}...'.format(endpoint))
            with shakedown.marathon_on_marathon(name=endpoint):
                shakedown.delete_all_apps()

    root_client = marathon.create_client()
    root_client.remove_app(MOM_EE_NAME)
    shakedown.deployment_wait()
    print('Successfully removed {}'.format(MOM_EE_NAME))
def deployments_for(service_id=None, deployment_id=None):
    """Return running deployments, filtered by deployment ID or by service ID.

    Args:
        service_id: used only when ``deployment_id`` is falsy; keeps the
            deployments listing it in ``affectedApps`` or ``affectedPods``.
        deployment_id: takes precedence; keeps the deployments whose ``id``
            equals it.

    Returns:
        The filtered list, or every deployment when both arguments are falsy.
    """
    running = marathon.create_client().get_deployments()

    if deployment_id:
        by_id = []
        for entry in running:
            if deployment_id == entry["id"]:
                by_id.append(entry)
        return by_id

    if service_id:
        by_service = []
        for entry in running:
            if service_id in entry['affectedApps'] or service_id in entry['affectedPods']:
                by_service.append(entry)
        return by_service

    return running
def test_framework_unavailable_on_mom():
    """Confirm that a framework-like app launched through MoM gets no DC/OS service endpoint.

    The app from ``apps.fake_framework()`` has the elements needed to create a
    service endpoint; waiting for the 'pyfw' endpoint is expected to fail.
    """
    framework_def = apps.fake_framework()
    framework_id = framework_def["id"]

    with shakedown.marathon_on_marathon():
        mom_client = marathon.create_client()
        mom_client.add_app(framework_def)
        common.deployment_wait(service_id=framework_id)

        try:
            common.wait_for_service_endpoint('pyfw', 15)
        except Exception:
            # Expected outcome: the endpoint never becomes available.
            pass
        else:
            assert False, 'MoM shoud NOT create a service endpoint'
def test_framework_unavailable_on_mom():
    """Confirm that a framework-like app launched through MoM gets no DC/OS service endpoint.

    The app from ``apps.fake_framework()`` has the elements needed to create a
    service endpoint; waiting for the 'pyfw' endpoint is expected to fail.
    """
    app_def = apps.fake_framework()

    with shakedown.marathon_on_marathon():
        common.delete_all_apps_wait()
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

        try:
            shakedown.wait_for_service_endpoint('pyfw', 15)
        except Exception:
            # Expected outcome: the endpoint never becomes available.
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still abort the test run.
            pass
        else:
            assert False, 'MoM shoud NOT create a service endpoint'
def test_mom_when_mom_agent_bounced():
    """Restart the agent MoM runs on and expect the MoM-launched task to keep its ID."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]
    mom_ip = common.ip_of_mom()
    # Pin the app to the host returned by common.ip_other_than_mom().
    common.pin_to_host(app_def, common.ip_other_than_mom())

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        original_task_id = client.get_tasks(app_id)[0]['id']

        shakedown.restart_agent(mom_ip)

        # Poll once per second, at most 30 times, until the task is visible
        # again with its original ID.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def assert_task_id_unchanged():
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        assert_task_id_unchanged()
def is_mom_ee_deployed():
    """Return True if an app with ID ``/<MOM_EE_NAME>`` exists in Marathon."""
    expected_id = '/{}'.format(MOM_EE_NAME)
    for deployed in marathon.create_client().get_apps():
        if deployed['id'] == expected_id:
            return True
    return False
def clean_up_marathon(parent_group="/"):
    """Force-remove ``parent_group`` and wait for the resulting deployment to finish.

    Args:
        parent_group: ID of the group to remove; defaults to the root group.
    """
    removal = marathon.create_client().remove_group(parent_group, force=True)
    # Wait on the specific deployment reported by the removal call.
    removal_deployment_id = removal["deploymentId"]
    deployment_wait(deployment_id=removal_deployment_id)
def clean_up_marathon():
    """Force-remove the root group ``/`` and wait for deployments to finish."""
    marathon.create_client().remove_group("/", force=True)
    deployment_wait()
def marathon_version():
    """Return the Marathon version from the client's "about" data as a ``LooseVersion``."""
    about_info = marathon.create_client().get_about()
    # 1.3.9 or 1.4.0-RC8
    version_text = about_info.get("version")
    return LooseVersion(version_text)
def delete_all_groups():
    """Remove every group returned by the Marathon client, one call per group."""
    marathon_client = marathon.create_client()
    for entry in marathon_client.get_groups():
        marathon_client.remove_group(entry["id"])