def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app and restarts the master. It is expected that the service
       endpoint eventually comes back and the task ID stays the same.
    """

    app_def = apps.sleep_app()

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']

    common.systemctl_master('restart')
    common.wait_for_service_endpoint(marathon_service_name, path="ping")

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_def["id"])
        assert len(tasks) == 1, \
            "The number of tasks is {} after master restart, but 1 was expected".format(len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()

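# The `retry_on_exception=common.ignore_exception` pattern above recurs throughout
# this suite. `retrying` calls that predicate with the raised exception and retries
# while it returns True. A minimal sketch of such a helper follows; the name is
# taken from the tests above, but the body is an assumption.
def ignore_exception(exc):
    """Retry predicate: treat every exception (including AssertionError raised by
       the wrapped checks) as retryable until the attempt limit is reached."""
    return True
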
def assert_mom_ee(version, security_mode='permissive'):
    ensure_prerequisites_installed()
    ensure_service_account()
    ensure_permissions()
    ensure_sa_secret(strict=(security_mode == 'strict'))
    ensure_docker_config_secret()

    # In strict mode all tasks are started as user `nobody` by default. However we
    # start MoM-EE as 'root', and for that we need to give the root Marathon ACLs
    # to start tasks as 'root'.
    if security_mode == 'strict':
        common.add_dcos_marathon_user_acls()

    # Deploy MoM-EE in the given security mode
    app_def_file = '{}/mom-ee-{}-{}.json'.format(fixtures.fixtures_dir(), security_mode, version)
    assert os.path.isfile(app_def_file), \
        "Couldn't find appropriate MoM-EE definition: {}".format(app_def_file)

    image = mom_ee_image(version)
    print('Deploying {} definition with {} image'.format(app_def_file, image))

    app_def = get_resource(app_def_file)
    app_def['container']['docker']['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image)
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)
    common.wait_for_service_endpoint(mom_ee_endpoint(version, security_mode), path="ping")

def test_mom_when_mom_process_killed():
    """Launches a task from MoM, then kills the MoM process. The task is expected
       to survive and keep its ID.
    """

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        common.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        common.wait_for_service_endpoint('marathon-user', path="ping")

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()

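# Several tests above launch `apps.sleep_app()` and pin it to a host. The fixture
# presumably resolves to a trivial Marathon app definition along these lines; the
# id, command, and resource values here are illustrative, not the fixture's actual
# contents.
def illustrative_sleep_app():
    import uuid
    return {
        "id": "/sleep-{}".format(uuid.uuid4().hex),
        "cmd": "sleep 1000",  # a long-lived, do-nothing task whose ID tests can track
        "cpus": 0.1,
        "mem": 32,
        "instances": 1
    }
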
def wait_for_marathon_and_cleanup():
    common.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds(), path="ping")
    yield
    common.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds(), path="ping")
    common.clean_up_marathon()

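# `wait_for_marathon_and_cleanup` above reads as a yield-based pytest fixture:
# everything before the `yield` runs as setup, everything after as teardown. A
# minimal sketch of the pattern, assuming the @pytest.fixture decorator simply
# was not captured in this excerpt; names here are illustrative.
import pytest


@pytest.fixture
def illustrative_marathon_fixture():
    common.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds(), path="ping")  # setup
    yield  # the test body runs here
    common.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds(), path="ping")
    common.clean_up_marathon()  # teardown: remove leftover apps


def test_example_fixture_usage(illustrative_marathon_fixture):
    pass  # executes between the fixture's setup and teardown
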
def test_marathon_delete_leader(marathon_service_name):

    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    common.abdicate_marathon_leader()

    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")
    common.assert_marathon_leadership_changed(original_leader)

def wait_for_marathon_user_and_cleanup():
    common.wait_for_service_endpoint('marathon-user', timedelta(minutes=5).total_seconds(), path="ping")
    with marathon_on_marathon() as client:
        yield
        common.wait_for_service_endpoint('marathon-user', timedelta(minutes=5).total_seconds(), path="ping")
        common.clean_up_marathon(client)

def setup_module(module):
    # When the cluster is starting, there may be some delay in:
    # - marathon leader registration with mesos
    # - admin router refreshing its cache (every 30s)
    # We should not start our tests before Marathon is accessible through the service endpoint.
    common.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds(), path="ping")
    common.cluster_info()
    common.clean_up_marathon()

def test_deploy_custom_framework():
    """Launches an app that has the elements necessary to create a service
       endpoint in DC/OS. This test confirms that the endpoint is created by
       the root Marathon.
    """

    client = marathon.create_client()
    app_def = apps.fake_framework()
    app_id = app_def["id"]
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id, max_attempts=300)

    common.wait_for_service_endpoint('pyfw', timedelta(minutes=5).total_seconds())

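# What makes the fake framework reachable under /service/pyfw is, presumably, the
# framework-name label that DC/OS Admin Router keys its service routing on. An
# illustrative fragment of such an app definition; the fixture's actual contents
# are an assumption here.
ILLUSTRATIVE_FAKE_FRAMEWORK_FRAGMENT = {
    "id": "/pyfw",
    "labels": {
        # Admin Router builds /service/<name> routes from this label.
        "DCOS_PACKAGE_FRAMEWORK_NAME": "pyfw"
    }
}
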
def test_mom_with_network_failure_bounce_master():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures
       simulated by knocking out ports.
    """

    # Get the MoM IP
    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        logger.info("Task IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better

    # Take out the network
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # Wait for a minute
    time.sleep(timedelta(minutes=1).total_seconds())

    # Bounce the master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # Bring the network back up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)
    time.sleep(timedelta(minutes=1).total_seconds())

    common.wait_for_service_endpoint('marathon-user', timedelta(minutes=10).total_seconds(), path="ping")

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'), timedelta(minutes=10).total_seconds())

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()

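# `partition_agent`/`reconnect_agent` above are suite helpers. An illustrative
# stand-in, assuming only `shakedown.run_command_on_agent` plus sudo/iptables on
# the node; the real helpers may save and restore full iptables state instead.
def illustrative_partition_agent(hostname):
    # Drop all inbound TCP traffic so the agent disappears from the cluster.
    shakedown.run_command_on_agent(hostname, "sudo iptables -I INPUT -p tcp -j DROP")


def illustrative_reconnect_agent(hostname):
    # Remove the rule inserted above, restoring connectivity.
    shakedown.run_command_on_agent(hostname, "sudo iptables -D INPUT -p tcp -j DROP")
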
def wait_for_marathon_user_and_cleanup():
    print("entering wait_for_marathon_user_and_cleanup fixture")
    common.wait_for_service_endpoint('marathon-user', timedelta(minutes=5).total_seconds(), path="ping")
    with shakedown.marathon_on_marathon():
        yield
        common.wait_for_service_endpoint('marathon-user', timedelta(minutes=5).total_seconds(), path="ping")
        common.clean_up_marathon()
    print("exiting wait_for_marathon_user_and_cleanup fixture")

def test_custom_service_name():
    """Install MoM with a custom service name."""

    cosmos_pm = packagemanager.PackageManager(cosmos.get_cosmos_url())
    cosmos_pm.get_package_version('marathon', None)
    options = {'service': {'name': "test-marathon"}}
    install_package('marathon', options_json=options)
    common.deployment_wait(service_id=options["service"]["name"], max_attempts=300)

    common.wait_for_service_endpoint('test-marathon', timeout_sec=300, path="ping")

def test_marathon_backup_and_restore_leader(marathon_service_name):
    """Backup and restore is tested with a single master: the new leader has to
       be able to read the backup file created by the previous leader, and that
       is easiest to guarantee on a one-master cluster.
    """

    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection.
    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])
    task_id = app['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    params = '?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE /v2/leader{}'.format(params))
    common.abdicate_marathon_leader(params)

    # Wait for the new leader (but the same master server) to be up and ready
    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")
    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])
    assert task_id == app['tasks'][0]['id'], "Task has a different ID after restore"

    # Check that the backup file exists and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}'.format(backup_url)
    assert int(data.rstrip()) > 0, "Backup file is empty"

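# For reference, `common.abdicate_marathon_leader(params)` wraps Marathon's
# leader-abdication endpoint: DELETE /v2/leader with matching backup/restore URIs
# makes the outgoing leader write a backup that the next leader restores from. A
# hedged sketch of the equivalent direct call (host and port are illustrative):
def illustrative_abdicate_with_backup(backup_url):
    import requests
    resp = requests.delete('http://marathon.example.com:8080/v2/leader',
                           params={'backup': backup_url, 'restore': backup_url})
    resp.raise_for_status()
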
def test_framework_unavailable_on_mom():
    """Launches an app that has the elements necessary to create a service
       endpoint in DC/OS. This test confirms that the endpoint is NOT created
       when the app is launched on MoM.
    """

    app_def = apps.fake_framework()
    app_id = app_def["id"]

    with marathon_on_marathon() as client:
        client.add_app(app_def)
        common.deployment_wait(service_id=app_id, client=client)

    try:
        common.wait_for_service_endpoint('pyfw', 15)
    except Exception:
        pass
    else:
        assert False, 'MoM should NOT create a service endpoint'

def test_custom_service_name():
    """Install MoM with a custom service name."""

    cosmos_pm = packagemanager.PackageManager(cosmos.get_cosmos_url())
    cosmos_pm.get_package_version('marathon', None)
    options = {'service': {'name': "test-marathon"}}
    shakedown.install_package('marathon', options_json=options)
    shakedown.deployment_wait()

    assert common.wait_for_service_endpoint('test-marathon', path="ping")

def test_framework_unavailable_on_mom():
    """Launches an app that has the elements necessary to create a service
       endpoint in DC/OS. This test confirms that the endpoint is NOT created
       when the app is launched on MoM.
    """

    app_def = apps.fake_framework()

    with shakedown.marathon_on_marathon():
        common.delete_all_apps_wait()
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

    try:
        common.wait_for_service_endpoint('pyfw', 15)
    except Exception:
        pass
    else:
        assert False, 'MoM should NOT create a service endpoint'

def test_deploy_custom_framework():
    """Launches an app that has the elements necessary to create a service
       endpoint in DC/OS. This test confirms that the endpoint is created by
       the root Marathon.
    """

    client = marathon.create_client()
    client.add_app(apps.fake_framework())
    shakedown.deployment_wait(timeout=timedelta(minutes=5).total_seconds())

    assert common.wait_for_service_endpoint('pyfw', timedelta(minutes=5).total_seconds()), \
        "The framework has not shown up"

def test_marathon_backup_and_check_apps(marathon_service_name):

    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'

    for master_ip in get_all_master_ips():
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    params = '?backup={}'.format(backup_url1)
    common.abdicate_marathon_leader(params)

    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # Wait until the leader has changed
    common.assert_marathon_leadership_changed(original_leader)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception as e:
            if expected_instances != 0:
                raise e
        else:
            if expected_instances == 0:
                assert False, "The application resurrected"
            else:
                assert app['tasksRunning'] == expected_instances, \
                    "The number of running tasks is {}, but {} was expected".format(
                        app["tasksRunning"], expected_instances)

    # Check that the app definition is still there and one instance is still running
    # after the new leader was elected
    check_app_existence(1)

    # Then remove the app
    client.remove_app(app_id)
    common.deployment_wait(service_id=app_id)

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem that doing a backup
    # after an app was deleted led to a state where Marathon could not restart,
    # because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    params = '?backup={}'.format(backup_url2)
    print('DELETE /v2/leader{}'.format(params))
    common.abdicate_marathon_leader(params)

    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # Wait until the leader has changed; if it did, Marathon was able to start again,
    # which is great :-).
    common.assert_marathon_leadership_changed(original_leader)

    # Check that the app definition is still gone and no instance is running
    # after the new leader was elected
    check_app_existence(0)

def test_marathon_delete_leader_and_check_apps(marathon_service_name):

    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader after the app was started successfully
    common.abdicate_marathon_leader()

    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # Wait until the leader has changed
    common.assert_marathon_leadership_changed(original_leader)
    original_leader = marathon_leader_ip()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(app["tasksRunning"], expected_instances)

    # Check that the app definition is still there and one instance is still running
    # after the new leader was elected
    check_app_existence(1)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        client.remove_app(app_id)

    remove_app(app_id)
    common.deployment_wait(service_id=app_id)

    try:
        client.get_app(app_id)
    except Exception:
        pass
    else:
        assert False, "The application resurrected"

    # Abdicate the leader again after the app was removed
    common.abdicate_marathon_leader()

    common.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # Wait until the leader has changed
    common.assert_marathon_leadership_changed(original_leader)

    # Check that the app definition is still gone
    try:
        client.get_app(app_id)
    except Exception:
        pass
    else:
        assert False, "The application resurrected"