def marathon_service_name():
    """Yield the MoM service name while inside ``shakedown.marathon_on_marathon()``.

    Waits up to five minutes for the 'marathon-user' endpoint, enters the
    Marathon-on-Marathon context and yields ``'marathon-user'``. When the
    generator is resumed it waits for the endpoint again and calls
    ``clear_marathon()`` before leaving the context.

    NOTE(review): presumably registered as a pytest fixture; the decorator is
    not visible in this block.
    """
    service = 'marathon-user'
    endpoint_timeout = timedelta(minutes=5).total_seconds()

    shakedown.wait_for_service_endpoint(service, endpoint_timeout)

    with shakedown.marathon_on_marathon():
        yield service

        # Runs once the consumer resumes the generator: make sure the
        # endpoint is reachable again before wiping the Marathon state.
        shakedown.wait_for_service_endpoint(service, endpoint_timeout)
        clear_marathon()
def wait_for_marathon_and_cleanup():
    """Wait for the root Marathon endpoint on both sides of a single ``yield``.

    Before yielding, waits up to five minutes for the 'marathon' service
    endpoint. After being resumed, waits for it again and then calls
    ``common.clean_up_marathon()``. Entry and exit are announced on stdout.

    NOTE(review): presumably used as a pytest fixture; the decorator is not
    part of this block.
    """
    print("entering wait_for_marathon_and_cleanup fixture")
    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint('marathon', endpoint_timeout)

    yield

    shakedown.wait_for_service_endpoint('marathon', endpoint_timeout)
    common.clean_up_marathon()
    print("exiting wait_for_marathon_and_cleanup fixture")
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launch an app, restart the master, and expect the same task to survive.

    After the restart the service endpoint named by ``marathon_service_name``
    must come back, and the app must still have exactly one task whose ID
    equals the one recorded before the restart.
    """
    app_def = apps.sleep_app()
    common.pin_to_host(app_def, common.ip_other_than_mom())

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks_before = client.get_tasks(app_def["id"])
    original_task_id = tasks_before[0]['id']

    common.systemctl_master('restart')
    shakedown.wait_for_service_endpoint(marathon_service_name)

    # Poll once per second, up to 30 attempts, until the task list settles.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks_after = client.get_tasks(app_def["id"])
        task_count = len(tasks_after)
        assert task_count == 1, \
            "The number of tasks is {} after master restart, but 1 was expected".format(task_count)
        assert tasks_after[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(
                original_task_id, tasks_after[0]['id'])

    check_task_recovery()
def marathon_service_name():
    """Ensure MoM is present, then yield its service name from the MoM context.

    Calls ``common.ensure_mom()``, enters ``shakedown.marathon_on_marathon()``
    and yields ``'marathon-user'``. When resumed, it waits for the
    'marathon-user' endpoint and calls ``clear_marathon()`` before leaving the
    context.

    NOTE(review): presumably registered as a pytest fixture; the decorator is
    not visible in this block.
    """
    service = 'marathon-user'
    common.ensure_mom()

    with shakedown.marathon_on_marathon():
        yield service

        # Runs once the consumer resumes the generator.
        shakedown.wait_for_service_endpoint(service)
        clear_marathon()
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, kill the MoM process, and expect the task ID to be unchanged."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]
    common.pin_to_host(app_def, common.ip_other_than_mom())

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        original_task_id = client.get_tasks(app_id)[0]['id']

        # Kill the MoM process on its host, then wait for the 'marathon-user'
        # task and endpoint to be available again.
        shakedown.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        # Poll once per second, up to 30 attempts.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    """Abdicate the Marathon leader twice and check app state survives each election.

    First an app is started and the leader is abdicated; the app must still
    report one running task under the new leader. Then the app is removed and
    the leader is abdicated again; the app must report zero running tasks.

    Each leadership check compares against the leader that was in place right
    before the corresponding abdication, and the retry helpers pause one
    second between attempts instead of retrying back-to-back.
    """
    endpoint_timeout = timedelta(minutes=5).total_seconds()

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # start an app
    app_def = common.app(id=uuid.uuid4().hex)
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30)
    def marathon_leadership_changed(previous_leader):
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert previous_leader != current_leader, "A new Marathon leader has not been elected"

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(
                app['tasksRunning'], expected_instances)

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    # wait until leader changed
    marathon_leadership_changed(original_leader)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    client.remove_app(app_id)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 0, \
        "The number of running tasks is {}, but 0 was expected".format(app['tasksRunning'])

    # Record the current leader so the second check is not compared against
    # the leader from before the first abdication (which may legitimately be
    # re-elected).
    leader_before_second_abdication = shakedown.marathon_leader_ip()

    # abdicate leader after app was removed successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    # wait until leader changed
    marathon_leadership_changed(leader_before_second_abdication)

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)
def test_mom_with_network_failure_bounce_master():
    """Partition the MoM and task agents, restart the Mesos master, and expect the task to survive.

    Network failure is simulated through ``partition_agent`` /
    ``reconnect_agent``. After the network is restored, the task launched from
    MoM must still have its original ID.
    """
    one_minute = timedelta(minutes=1).total_seconds()
    ten_minutes = timedelta(minutes=10).total_seconds()

    # get MoM ip
    mom_ip = common.ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        first_task = client.get_tasks(app_id)[0]
        original_task_id = first_task["id"]
        task_ip = first_task['host']
        print("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(one_minute)

    # bounce master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(one_minute)
    shakedown.wait_for_service_endpoint('marathon-user', ten_minutes)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'), ten_minutes)

        # Poll once per second, up to 30 attempts.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def get_marathon_leader_not_on_master_leader_node():
    """Return a Marathon leader IP, forcing one re-election if it matches the Mesos leader.

    If the Marathon leader and the Mesos master leader report the same IP, the
    Marathon leader is abdicated via ``DELETE v2/leader`` and the value
    returned by ``assert_marathon_leadership_changed`` is used instead.
    """
    marathon_leader = shakedown.marathon_leader_ip()
    master_leader = shakedown.master_leader_ip()
    print('marathon leader: {}'.format(marathon_leader))
    print('mesos leader: {}'.format(master_leader))

    if marathon_leader != master_leader:
        return marathon_leader

    # Both leaders share a node: abdicate and wait for a different leader.
    delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds())
    new_leader = assert_marathon_leadership_changed(marathon_leader)
    print('switched leader to: {}'.format(new_leader))
    return new_leader
def test_marathon_master_partition_leader_change(marathon_service_name):
    """Block the Marathon leader's outbound port 5050 and expect a leader change."""
    mesos_master_port = 5050
    original_leader = common.get_marathon_leader_not_on_master_leader_node()

    # blocking outbound connection to mesos master
    with common.iptable_rules(original_leader):
        common.block_port(original_leader, mesos_master_port, direction='OUTPUT')
        # Wait for a leader change before restoring iptables rules
        common.marathon_leadership_changed(original_leader)

    # Make sure marathon is available
    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)
def get_marathon_leader_not_on_master_leader_node():
    """Return a Marathon leader IP, forcing one re-election if it matches the Mesos leader.

    If the Marathon leader and the Mesos master leader report the same IP, the
    Marathon leader is abdicated via ``DELETE v2/leader`` and the value
    returned by ``assert_marathon_leadership_changed`` is used instead.
    """
    marathon_leader = shakedown.marathon_leader_ip()
    master_leader = shakedown.master_leader_ip()
    print('marathon leader: {}'.format(marathon_leader))
    print('mesos leader: {}'.format(master_leader))

    if marathon_leader != master_leader:
        return marathon_leader

    # Both leaders share a node: abdicate and wait for a different leader.
    delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds())
    new_leader = assert_marathon_leadership_changed(marathon_leader)
    print('switched leader to: {}'.format(new_leader))
    return new_leader
def test_marathon_master_partition_leader_change(marathon_service_name):
    """Block the Marathon leader's outbound port 5050 for 1.5 minutes and expect a new leader.

    The final assertion carries a failure message so a failed election is
    reported clearly rather than as a bare ``AssertionError``.
    """
    original_leader = common.get_marathon_leader_not_on_master_leader_node()

    # blocking outbound connection to mesos master
    with shakedown.iptable_rules(original_leader):
        block_port(original_leader, 5050, direction='OUTPUT')
        # time of the master block
        time.sleep(timedelta(minutes=1.5).total_seconds())

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    current_leader = shakedown.marathon_leader_ip()
    assert original_leader != current_leader, "A new Marathon leader has not been elected"
def test_marathon_delete_leader(marathon_service_name):
    """Abdicate the Marathon leader via ``DELETE v2/leader`` and expect a different leader IP."""
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    common.delete_marathon_path('v2/leader')

    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    # Poll once per second, up to 30 attempts, for a different leader IP.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        leader_now = shakedown.marathon_leader_ip()
        print('leader: {}'.format(leader_now))
        assert original_leader != leader_now

    marathon_leadership_changed()
def test_marathon_master_partition_leader_change(marathon_service_name):
    """Block the Marathon leader's outbound port 5050 for 1.5 minutes and expect a new leader."""
    mesos_master_port = 5050
    block_duration = timedelta(minutes=1.5).total_seconds()
    endpoint_timeout = timedelta(minutes=5).total_seconds()

    original_leader = common.get_marathon_leader_not_on_master_leader_node()

    # blocking outbound connection to mesos master
    with shakedown.iptable_rules(original_leader):
        common.block_port(original_leader, mesos_master_port, direction='OUTPUT')
        # time of the master block
        time.sleep(block_duration)

    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    leader_now = shakedown.marathon_leader_ip()
    assert original_leader != leader_now, "A new Marathon leader has not been elected"
def test_marathon_delete_leader(marathon_service_name):
    """Abdicate the Marathon leader via ``DELETE v2/leader`` and expect a different leader IP."""
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    common.delete_marathon_path('v2/leader')

    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    # Poll once per second, up to 30 attempts, for a different leader IP.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        leader_now = shakedown.marathon_leader_ip()
        print('leader: {}'.format(leader_now))
        assert original_leader != leader_now

    marathon_leadership_changed()
def test_marathon_zk_partition_leader_change(marathon_service_name):
    """Block port 2181 in both directions on the Marathon leader and expect a new leader."""
    zk_port = 2181
    endpoint_timeout = timedelta(minutes=5).total_seconds()

    original_leader = common.get_marathon_leader_not_on_master_leader_node()

    # blocking zk on marathon leader (not master leader)
    with shakedown.iptable_rules(original_leader):
        for direction in ('INPUT', 'OUTPUT'):
            common.block_port(original_leader, zk_port, direction=direction)
        # time of the zk block, the duration must be greater than all the default ZK timeout values in Marathon
        time.sleep(20)

    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    leader_now = shakedown.marathon_leader_ip()
    assert original_leader != leader_now, "A new Marathon leader has not been elected"
def test_marathon_master_partition_leader_change(marathon_service_name):
    """Block the Marathon leader's outbound port 5050 for 60 seconds and expect a leader change."""
    mesos_master_port = 5050
    original_leader = common.get_marathon_leader_not_on_master_leader_node()

    # blocking outbound connection to mesos master
    common.block_iptable_rules_for_seconds(
        original_leader,
        mesos_master_port,
        sleep_seconds=60,
        block_input=False,
        block_output=True)

    common.marathon_leadership_changed(original_leader)

    # Make sure marathon is available
    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)
def test_marathon_zk_partition_leader_change(marathon_service_name):
    """Block port 2181 in both directions on the Marathon leader and expect a new leader."""
    zk_port = 2181
    endpoint_timeout = timedelta(minutes=5).total_seconds()

    original_leader = common.get_marathon_leader_not_on_master_leader_node()

    # blocking zk on marathon leader (not master leader)
    with shakedown.iptable_rules(original_leader):
        for direction in ('INPUT', 'OUTPUT'):
            common.block_port(original_leader, zk_port, direction=direction)
        # time of the zk block, the duration must be greater than all the default ZK timeout values in Marathon
        time.sleep(20)

    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    leader_now = shakedown.marathon_leader_ip()
    assert original_leader != leader_now, "A new Marathon leader has not been elected"
def test_marathon_zk_partition_leader_change(marathon_service_name):
    """Block port 2181 in both directions on the Marathon leader for 5 seconds and expect a new leader.

    The final assertion carries a failure message so a failed election is
    reported clearly rather than as a bare ``AssertionError``.
    """
    original_leader = common.get_marathon_leader_not_on_master_leader_node()

    # blocking zk on marathon leader (not master leader)
    with shakedown.iptable_rules(original_leader):
        block_port(original_leader, 2181, direction='INPUT')
        block_port(original_leader, 2181, direction='OUTPUT')
        # time of the zk block
        # NOTE(review): 5 seconds may be shorter than Marathon's ZK timeouts,
        # in which case no re-election would be triggered; confirm.
        time.sleep(5)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    current_leader = shakedown.marathon_leader_ip()
    assert original_leader != current_leader, "A new Marathon leader has not been elected"
def get_marathon_leader_not_on_master_leader_node():
    """Return a Marathon leader IP, forcing one re-election if it matches the Mesos leader.

    If both leaders report the same IP, the Marathon leader is abdicated via
    ``DELETE v2/leader``; once the 'marathon' endpoint is back, the leader IP
    is read again and asserted to differ from the previous one.
    """
    marathon_leader = shakedown.marathon_leader_ip()
    master_leader = shakedown.master_leader_ip()
    print('marathon: {}'.format(marathon_leader))
    print('leader: {}'.format(master_leader))

    if marathon_leader != master_leader:
        return marathon_leader

    # Both leaders share a node: abdicate and read the leader once more.
    delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds())
    new_leader = shakedown.marathon_leader_ip()
    assert new_leader != marathon_leader, "A new Marathon leader has not been elected"
    print('switched leader to: {}'.format(new_leader))
    return new_leader
def get_marathon_leader_not_on_master_leader_node():
    """Return a Marathon leader IP, forcing one re-election if it matches the Mesos leader.

    If both leaders report the same IP, the Marathon leader is abdicated via
    ``DELETE v2/leader``; once the 'marathon' endpoint is back, the leader IP
    is read again and asserted to differ from the previous one.
    """
    marathon_leader = shakedown.marathon_leader_ip()
    master_leader = shakedown.master_leader_ip()
    print('marathon: {}'.format(marathon_leader))
    print('leader: {}'.format(master_leader))

    if marathon_leader != master_leader:
        return marathon_leader

    # Both leaders share a node: abdicate and read the leader once more.
    delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds())
    new_leader = shakedown.marathon_leader_ip()
    assert new_leader != marathon_leader, "A new Marathon leader has not been elected"
    print('switched leader to: {}'.format(new_leader))
    return new_leader
def test_mom_with_network_failure_bounce_master():
    """Partition the MoM and task agents, restart the Mesos master, and expect the task to survive.

    Network failure is simulated through ``partition_agent`` /
    ``reconnect_agent``. After the network is restored, the task launched from
    MoM must still have its original ID.
    """
    one_minute = timedelta(minutes=1).total_seconds()
    ten_minutes = timedelta(minutes=10).total_seconds()

    # get MoM ip
    mom_ip = common.ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        first_task = client.get_tasks(app_id)[0]
        original_task_id = first_task["id"]
        task_ip = first_task['host']
        print("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(one_minute)

    # bounce master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(one_minute)
    shakedown.wait_for_service_endpoint('marathon-user', ten_minutes)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'), ten_minutes)

        # Poll once per second, up to 30 attempts.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_with_network_failure_bounce_master():
    """Partition the MoM and task agents, restart the Mesos master, and expect the 'sleep' task to survive.

    Network failure is simulated through ``partition_agent`` /
    ``reconnect_agent``. The app definition is loaded from the
    ``large-sleep.json`` fixture.
    """
    one_minute = timedelta(minutes=1).total_seconds()
    ten_minutes = timedelta(minutes=10).total_seconds()

    # get MoM ip
    mom_ip = ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = get_resource("{}/large-sleep.json".format(fixture_dir()))

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", "sleep")
        first_task = client.get_tasks('sleep')[0]
        original_sleep_task_id = first_task["id"]
        task_ip = first_task['host']
        print("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(one_minute)

    # bounce master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(one_minute)
    shakedown.wait_for_service_endpoint('marathon-user', ten_minutes)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", "sleep", ten_minutes)
        current_sleep_task_id = client.get_tasks('sleep')[0]["id"]

    assert current_sleep_task_id == original_sleep_task_id, "Task ID shouldn't change"
def test_mom_with_network_failure_bounce_master():
    """Partition the MoM and task agents, restart the Mesos master, and expect the 'sleep' task to survive.

    Network failure is simulated through ``partition_agent`` /
    ``reconnect_agent``. The app definition is loaded from the
    ``large-sleep.json`` fixture.
    """
    one_minute = timedelta(minutes=1).total_seconds()

    # get MoM ip
    mom_ip = ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = get_resource("{}/large-sleep.json".format(fixture_dir()))

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", "sleep")
        first_task = client.get_tasks('sleep')[0]
        original_sleep_task_id = first_task["id"]
        task_ip = first_task['host']
        print("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(one_minute)

    # bounce master
    shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(one_minute)
    shakedown.wait_for_service_endpoint('marathon-user')
    shakedown.wait_for_task("marathon-user", "sleep")

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", "sleep")
        current_sleep_task_id = client.get_tasks('sleep')[0]["id"]

    assert current_sleep_task_id == original_sleep_task_id, "Task ID shouldn't change"
def test_marathon_backup_and_restore_leader(marathon_service_name):
    """Abdicate the leader with backup and restore, then verify the app and the backup file.

    Backup and restore testing is done with only one master, since the new
    leader has to be able to read the backup file written by the previous one
    and that is easiest to arrange with a single master.
    """
    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection
    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app_before = client.get_app(app_id)
    assert app_before['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app_before["tasksRunning"])
    task_id = app_before['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    # Wait for new leader (but same master server) to be up and ready
    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    app_after = client.get_app(app_id)
    assert app_after['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app_after["tasksRunning"])
    assert task_id == app_after['tasks'][0]['id'], "Task has a different ID after restore"

    # Check if the backup file exists and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = shakedown.run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}'.format(backup_url)
    assert int(data.rstrip()) > 0, "Backup file is empty"
def test_marathon_backup_and_restore_leader(marathon_service_name):
    """Abdicate the leader with backup and restore, then verify the app and the backup file.

    A single '/sleep' app is deployed first; after re-election it must still
    run the same task, and the backup archive on the master must list at
    least one entry.
    """
    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection
    client = marathon.create_client()
    app_def = {
        "id": "/sleep",
        "instances": 1,
        "cpus": 0.01,
        "mem": 32,
        "cmd": "sleep 100000",
    }
    app_id = app_def['id']
    client.add_app(app_def)
    shakedown.deployment_wait()

    app_before = client.get_app(app_id)
    assert app_before['tasksRunning'] == 1
    task_id = app_before['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    # Wait for new leader (but same master server) to be up and ready
    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    app_after = client.get_app(app_id)
    assert app_after['tasksRunning'] == 1
    assert task_id == app_after['tasks'][0]['id'], "Task has a different Id after restore"

    # Check if the backup file exists and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    succeeded, output = shakedown.run_command_on_master(cmd)
    assert succeeded, 'Failed to validate backup file {}'.format(backup_url)
    assert int(output.rstrip()) > 0, "Backup file is empty"
def test_framework_unavailable_on_mom():
    """Launch a framework-style app from MoM and confirm no 'pyfw' service endpoint appears.

    The app has the elements needed to create a service endpoint in DCOS; the
    test passes only if waiting 15 seconds for the 'pyfw' endpoint raises.
    """
    framework_app = apps.fake_framework()

    with shakedown.marathon_on_marathon():
        common.delete_all_apps_wait()
        client = marathon.create_client()
        client.add_app(framework_app)
        shakedown.deployment_wait()

    try:
        shakedown.wait_for_service_endpoint('pyfw', 15)
    except Exception:
        # Expected path: the endpoint never showed up.
        pass
    else:
        assert False, 'MoM shoud NOT create a service endpoint'
def test_deploy_custom_framework():
    """Launch a framework-style app from the root Marathon and expect the 'pyfw' endpoint.

    The app has the elements needed to create a service endpoint in DCOS.
    """
    framework_app = fake_framework_app()

    client = marathon.create_client()
    client.add_app(framework_app)
    shakedown.deployment_wait()

    endpoint_ready = shakedown.wait_for_service_endpoint('pyfw')
    assert endpoint_ready
def test_framework_unavailable_on_mom():
    """Launch a framework-style app from MoM and confirm no 'pyfw' service endpoint appears.

    The app has the elements needed to create a service endpoint in DCOS; the
    test passes only if waiting 15 seconds for the 'pyfw' endpoint raises.
    """
    app_def = apps.fake_framework()

    with shakedown.marathon_on_marathon():
        common.delete_all_apps_wait()
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

    try:
        shakedown.wait_for_service_endpoint('pyfw', 15)
    except Exception:
        # Expected path: the endpoint never showed up. Catching Exception
        # rather than using a bare except keeps KeyboardInterrupt and
        # SystemExit from being swallowed.
        pass
    else:
        assert False, 'MoM shoud NOT create a service endpoint'
def test_custom_service_name():
    """Install MoM with a custom service name and expect its endpoint to come up."""
    service_name = "test-marathon"

    package_manager = packagemanager.PackageManager(cosmos.get_cosmos_url())
    # The result is not used; the lookup itself is kept as in the original.
    # NOTE(review): presumably acts as a check that the package resolves - confirm.
    package_manager.get_package_version('marathon', None)

    options = {'service': {'name': service_name}}
    shakedown.install_package('marathon', options_json=options)
    shakedown.deployment_wait()

    assert shakedown.wait_for_service_endpoint(service_name)
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, kill the MoM process, and expect the task ID to be unchanged.

    The final comparison is asserted; a bare ``==`` expression would discard
    the result and the test could never fail on a changed task ID.
    """
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        # Kill the MoM process on its host, then wait for the 'marathon-user'
        # task and endpoint to be available again.
        shakedown.kill_process_on_host(ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        tasks = client.get_tasks('/agent-failure')
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"
def test_deploy_custom_framework():
    """Launch a framework-style app from the root Marathon and expect the 'pyfw' endpoint.

    The app has the elements needed to create a service endpoint in DCOS.
    """
    framework_app = fake_framework_app()

    client = marathon.create_client()
    client.add_app(framework_app)
    shakedown.deployment_wait()

    endpoint_ready = shakedown.wait_for_service_endpoint('pyfw')
    assert endpoint_ready
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, kill the MoM process, and expect the task ID to be unchanged.

    The final comparison is asserted; a bare ``==`` expression would discard
    the result and the test could never fail on a changed task ID.
    """
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        # Kill the MoM process on its host, then wait for the 'marathon-user'
        # task and endpoint to be available again.
        shakedown.kill_process_on_host(ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        tasks = client.get_tasks('/agent-failure')
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"
def test_marathon_backup_and_restore_leader(marathon_service_name):
    """Abdicate the leader with backup and restore, then verify the app and the backup file.

    A single '/sleep' app is deployed first; after re-election it must still
    run the same task, and the backup archive on the master must list at
    least one entry.
    """
    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection
    client = marathon.create_client()
    app_def = {
        "id": "/sleep",
        "instances": 1,
        "cpus": 0.01,
        "mem": 32,
        "cmd": "sleep 100000",
    }
    app_id = app_def['id']
    client.add_app(app_def)
    shakedown.deployment_wait()

    app_before = client.get_app(app_id)
    assert app_before['tasksRunning'] == 1
    task_id = app_before['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    # Wait for new leader (but same master server) to be up and ready
    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    app_after = client.get_app(app_id)
    assert app_after['tasksRunning'] == 1
    assert task_id == app_after['tasks'][0]['id'], "Task has a different Id after restore"

    # Check if the backup file exists and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = shakedown.run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}'.format(backup_url)
    assert int(data.rstrip()) > 0, "Backup file is empty"
def test_mom_with_network_failure():
    """Partition and reconnect the MoM and task agents and expect the 'sleep' task to survive.

    Network failure is simulated through ``partition_agent`` /
    ``reconnect_agent``, with ``service_delay()`` called after each step. The
    app definition is loaded from the ``large-sleep.json`` fixture.
    """
    # get MoM ip
    mom_ip = ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = get_resource("{}/large-sleep.json".format(fixture_dir()))

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", "sleep")
        first_task = client.get_tasks('sleep')[0]
        original_sleep_task_id = first_task["id"]
        task_ip = first_task['host']

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # pause while the agents are partitioned
    service_delay()

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    service_delay()
    shakedown.wait_for_service_endpoint(PACKAGE_APP_ID)
    shakedown.wait_for_task("marathon-user", "sleep")

    with marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", "sleep")
        current_sleep_task_id = client.get_tasks('sleep')[0]["id"]

    assert current_sleep_task_id == original_sleep_task_id, "Task ID shouldn't change"
def test_deploy_custom_framework():
    """Launch a framework-style app from the root Marathon and expect the 'pyfw' endpoint.

    The app has the elements needed to create a service endpoint in DCOS. Both
    the deployment wait and the endpoint wait are bounded to five minutes.
    """
    five_minutes = timedelta(minutes=5).total_seconds()

    client = marathon.create_client()
    client.add_app(apps.fake_framework())
    shakedown.deployment_wait(timeout=five_minutes)

    endpoint_ready = shakedown.wait_for_service_endpoint('pyfw', five_minutes)
    assert endpoint_ready, "The framework has not showed up"
def test_deploy_custom_framework():
    """Launch a framework-style app from the root Marathon and expect the 'pyfw' endpoint.

    The app has the elements needed to create a service endpoint in DCOS. Both
    the deployment wait and the endpoint wait are bounded to five minutes.
    """
    five_minutes = timedelta(minutes=5).total_seconds()

    client = marathon.create_client()
    client.add_app(apps.fake_framework())
    shakedown.deployment_wait(timeout=five_minutes)

    endpoint_ready = shakedown.wait_for_service_endpoint('pyfw', five_minutes)
    assert endpoint_ready, "The framework has not showed up"
def test_mom_with_master_process_failure():
    """Launch an app from MoM, restart the master, and expect the original task ID.

    The service endpoint is expected to come back after the restart. The
    recovery check is asserted and actually invoked: a helper that is only
    defined, with a bare ``==`` in its body, would verify nothing.
    """
    app_def = app('master-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/master-failure')
        original_task_id = tasks[0]['id']

        systemctl_master()
        shakedown.wait_for_service_endpoint('marathon-user')

        # Poll once per second for up to ten seconds.
        @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
        def check_task_recovery():
            tasks = client.get_tasks('/master-failure')
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_recovery()
def test_custom_service_name():
    """Install MoM with a custom service name and expect its endpoint to come up."""
    service_name = "test-marathon"

    package_manager = packagemanager.PackageManager(get_cosmos_url())
    # The result is not used; the lookup itself is kept as in the original.
    # NOTE(review): presumably acts as a check that the package resolves - confirm.
    package_manager.get_package_version('marathon', None)

    options = {
        'service': {'name': service_name}
    }
    shakedown.install_package('marathon', options_json=options)
    shakedown.deployment_wait()

    assert shakedown.wait_for_service_endpoint(service_name)
def assert_mom_ee(version, security_mode='permissive'):
    """Deploy a MoM-EE app for ``version`` / ``security_mode`` and wait for its endpoint.

    Runs the ``ensure_*`` prerequisites, loads the app definition from
    ``<fixtures_dir>/mom-ee-<security_mode>-<version>.json`` (asserting that
    the file exists), overrides its docker image, deploys it and waits for the
    endpoint given by ``mom_ee_endpoint(version, security_mode)``.

    Args:
        version: MoM-EE version, used in the definition file name and passed
            to ``mom_ee_image`` and ``mom_ee_endpoint``.
        security_mode: Security mode used in the definition file name; the
            secret is ensured in strict mode only when this is ``'strict'``.
    """
    ensure_prerequisites_installed()
    ensure_service_account()
    ensure_permissions()
    ensure_secret(strict=(security_mode == 'strict'))
    ensure_docker_credentials()

    # Deploy MoM-EE in the requested security mode
    app_def_file = '{}/mom-ee-{}-{}.json'.format(fixtures.fixtures_dir(), security_mode, version)
    definition_exists = os.path.isfile(app_def_file)
    assert definition_exists, "Couldn't find appropriate MoM-EE definition: {}".format(app_def_file)

    image = mom_ee_image(version)
    print('Deploying {} definition with {} image'.format(app_def_file, image))

    app_def = get_resource(app_def_file)
    docker_section = app_def['container']['docker']
    docker_section['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    shakedown.wait_for_service_endpoint(mom_ee_endpoint(version, security_mode))
def test_marathon_backup_and_restore_leader(marathon_service_name):
    """Abdicate the leader with backup and restore, then verify the app and the backup file.

    Backup and restore testing is done with only one master, since the new
    leader has to be able to read the backup file written by the previous one
    and that is easiest to arrange with a single master.
    """
    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection
    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app_before = client.get_app(app_id)
    assert app_before['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app_before["tasksRunning"])
    task_id = app_before['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    # Wait for new leader (but same master server) to be up and ready
    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    app_after = client.get_app(app_id)
    assert app_after['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app_after["tasksRunning"])
    assert task_id == app_after['tasks'][0]['id'], "Task has a different ID after restore"

    # Check if the backup file exists and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = shakedown.run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}'.format(backup_url)
    assert int(data.rstrip()) > 0, "Backup file is empty"
def test_framework_unavailable_on_mom():
    """Launch a framework-style app from MoM and confirm no 'pyfw' service endpoint appears.

    If a 'pyfw' service is already available, the 'python-http' app is removed
    from the root Marathon first and the endpoint removal is awaited. The test
    passes only if waiting 15 seconds for the 'pyfw' endpoint raises.

    The failure assertion lives in the ``else`` branch: placed inside the
    ``try`` with a bare ``except``, its AssertionError would be swallowed and
    the test could never fail.
    """
    if shakedown.service_available_predicate('pyfw'):
        client = marathon.create_client()
        client.remove_app('python-http', True)
        shakedown.deployment_wait()
        shakedown.wait_for_service_endpoint_removal('pyfw')

    with shakedown.marathon_on_marathon():
        delete_all_apps_wait()
        client = marathon.create_client()
        client.add_app(common.fake_framework_app())
        shakedown.deployment_wait()

    try:
        shakedown.wait_for_service_endpoint('pyfw', 15)
    except Exception:
        # Expected path: the endpoint never showed up.
        pass
    else:
        assert False, 'MoM shoud NOT create a service endpoint'
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launch an app from Marathon and restart the master.

    It is expected that the service endpoint will come back and that the
    task ID after the restart is the original task ID.

    :param marathon_service_name: name of the Marathon service endpoint to wait for.
    """
    app_def = app('master-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/master-failure')
    original_task_id = tasks[0]['id']

    common.systemctl_master()
    shakedown.wait_for_service_endpoint(marathon_service_name)

    # Retried once per second for up to 10 seconds; any exception (including a
    # failed assertion or an empty task list) triggers another attempt.
    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_task_recovery():
        tasks = client.get_tasks('/master-failure')
        # Previously this comparison was a bare expression (no `assert`), so
        # it never verified anything.
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(
                original_task_id, tasks[0]['id'])

    # Previously the check was defined but never invoked.
    check_task_recovery()
def test_framework_unavailable_on_mom():
    """Verify that a framework launched through MoM gets no DC/OS service endpoint.

    Launches an app that has the elements necessary to create a service
    endpoint in DCOS and confirms that the endpoint is not created when the
    app is launched with MoM.
    """
    # Remove a 'pyfw' service that is already available (app 'python-http'),
    # so that a pre-existing endpoint is not mistaken for one created by MoM.
    if shakedown.service_available_predicate('pyfw'):
        client = marathon.create_client()
        client.remove_app('python-http', True)
        shakedown.deployment_wait()
        shakedown.wait_for_service_endpoint_removal('pyfw')

    with shakedown.marathon_on_marathon():
        delete_all_apps_wait()
        client = marathon.create_client()
        client.add_app(common.fake_framework_app())
        shakedown.deployment_wait()

    # The endpoint must NOT show up within 15 seconds. The failing assertion is
    # kept outside the try block: previously `assert False` was raised inside a
    # bare `except:` and silently swallowed, so the test could never fail.
    # NOTE(review): a timeout may be signalled either by an exception or by a
    # falsy return value depending on the shakedown version - both are treated
    # as "endpoint not available".
    try:
        endpoint_available = shakedown.wait_for_service_endpoint('pyfw', 15)
    except Exception:
        endpoint_available = False

    assert not endpoint_available, 'MoM shoud NOT create a service endpoint'
def ensure_mom():
    """Install the 'marathon' package (MoM) if it is not reported as installed.

    Installation is best effort: a failure is reported on stdout but not
    raised. Afterwards the 'marathon-user' service endpoint is awaited and a
    message is printed if the wait reports failure.
    """
    if is_mom_installed():
        return

    # If there is an active deployment, wait for it. It is possible that MoM
    # is currently in the process of being uninstalled, in which case it will
    # not report as installed, yet install will fail until that deployment is
    # finished.
    shakedown.deployment_wait()

    try:
        shakedown.install_package_and_wait('marathon')
        shakedown.deployment_wait()
    except Exception as e:
        # Deliberately non-fatal, but do not hide the reason: the endpoint
        # check below decides whether MoM is actually usable.
        print('WARNING: Installing marathon failed: {}'.format(e))

    if not shakedown.wait_for_service_endpoint('marathon-user'):
        print('ERROR: Timeout waiting for endpoint')
def ensure_mom():
    """Install the 'marathon' package (MoM) if it is not reported as installed.

    Installation is best effort: a failure is reported on stdout but not
    raised. Afterwards the 'marathon-user' service endpoint is awaited and a
    message is printed if the wait reports failure.
    """
    if is_mom_installed():
        return

    # If there is an active deployment, wait for it. It is possible that MoM
    # is currently in the process of being uninstalled, in which case it will
    # not report as installed, yet install will fail until that deployment is
    # finished.
    shakedown.deployment_wait()

    try:
        shakedown.install_package_and_wait('marathon')
        shakedown.deployment_wait()
    except Exception as e:
        # Deliberately non-fatal, but do not hide the reason: the endpoint
        # check below decides whether MoM is actually usable.
        print('WARNING: Installing marathon failed: {}'.format(e))

    if not shakedown.wait_for_service_endpoint('marathon-user'):
        print('ERROR: Timeout waiting for endpoint')
def setup_function(function):
    """Per-test setup: wait for the 'marathon-user' endpoint, then delete all apps.

    The app deletion runs inside the ``marathon_on_marathon`` context.

    :param function: the test function about to run (unused).
    """
    mom_endpoint = 'marathon-user'
    shakedown.wait_for_service_endpoint(mom_endpoint)

    with marathon_on_marathon():
        delete_all_apps_wait()
def setup_module(module):
    """Module-level setup: make sure MoM is installed and start from a clean state.

    Ensures MoM, waits up to five minutes for the 'marathon-user' endpoint,
    calls ``common.cluster_info()`` and finally clears Marathon inside the
    ``marathon_on_marathon`` context.

    :param module: the test module being set up (unused).
    """
    common.ensure_mom()

    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint('marathon-user', endpoint_timeout)

    common.cluster_info()

    with shakedown.marathon_on_marathon():
        clear_marathon()
def marathon_service_name():
    """Yield the service name 'marathon', waiting for its endpoint before and after.

    After the consumer is done, the endpoint is awaited again (up to five
    minutes) and Marathon is cleared.
    """
    service = 'marathon'

    def wait_until_reachable():
        # Same five-minute budget for the setup and the teardown wait.
        shakedown.wait_for_service_endpoint(service, timedelta(minutes=5).total_seconds())

    wait_until_reachable()
    yield service
    wait_until_reachable()
    clear_marathon()
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    """Abdicate the Marathon leader twice and check the app state after each election.

    First an app is started and must still run one instance after the leader
    changed. Then the app is removed and must stay removed, also after a
    second leader change.

    :param marathon_service_name: name of the Marathon service endpoint to wait for.
    """
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        # Reads `original_leader` from the enclosing scope at call time, so it
        # always compares against the most recently recorded leader.
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        if original_leader == current_leader:
            # Same leader still in charge: request abdication again before retrying.
            common.delete_marathon_path('v2/leader')
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(app["tasksRunning"], expected_instances)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        client.remove_app(app_id)

    def assert_app_is_gone():
        # Fetching a removed app is expected to raise; success means it is back.
        try:
            client.get_app(app_id)
        except Exception:
            return
        raise AssertionError("The application resurrected")

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    remove_app(app_id)
    shakedown.deployment_wait()
    assert_app_is_gone()

    # Record the current leader before abdicating again; otherwise the check
    # below would compare against the very first leader and pass immediately
    # without a second election having happened.
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # abdicate leader after app was removed
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still not there
    assert_app_is_gone()
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    """Abdicate the Marathon leader twice and check the app state after each election.

    First an app is started and must still run one instance after the leader
    changed. Then the app is removed and must stay removed, also after a
    second leader change.

    :param marathon_service_name: name of the Marathon service endpoint to wait for.
    """
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        # Reads `original_leader` from the enclosing scope at call time, so it
        # always compares against the most recently recorded leader.
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        if original_leader == current_leader:
            # Same leader still in charge: request abdication again before retrying.
            common.delete_marathon_path('v2/leader')
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(app["tasksRunning"], expected_instances)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        client.remove_app(app_id)

    def assert_app_is_gone():
        # Fetching a removed app is expected to raise; success means it is back.
        try:
            client.get_app(app_id)
        except Exception:
            return
        raise AssertionError("The application resurrected")

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    remove_app(app_id)
    shakedown.deployment_wait()
    assert_app_is_gone()

    # Record the current leader before abdicating again; otherwise the check
    # below would compare against the very first leader and pass immediately
    # without a second election having happened.
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # abdicate leader after app was removed
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still not there
    assert_app_is_gone()
def test_marathon_backup_and_check_apps(marathon_service_name):
    """Abdicate the Marathon leader with a backup twice, removing the app in between.

    After the first abdication the app must still run one instance. After the
    app was removed, a second abdication with backup must still let Marathon
    come back (MARATHON-7525), and the app must stay removed.

    :param marathon_service_name: name of the Marathon service endpoint to wait for.
    """
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'

    # Remove backup files on every master; the command results are
    # intentionally ignored.
    for master_ip in shakedown.get_all_master_ips():
        shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    def abdicate_leader_with_backup(backup_url):
        # Record the current leader, ask it to abdicate with a backup and wait
        # for the service endpoint; returns the leader that was asked to abdicate.
        leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(leader))
        url = 'v2/leader?backup={}'.format(backup_url)
        print('DELETE {}'.format(url))
        common.delete_marathon_path(url)
        shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())
        return leader

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        # Reads `original_leader` from the enclosing scope at call time.
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception:
            if expected_instances != 0:
                raise
            # The app cannot be fetched and none was expected: fine.
            return
        assert expected_instances != 0, "The application resurrected"
        # Previously this comparison lacked `assert` and therefore never
        # verified the number of running instances.
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(
                app["tasksRunning"], expected_instances)

    # Abdicate the leader with backup
    original_leader = abdicate_leader_with_backup(backup_url1)

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    shakedown.deployment_wait()

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem, that doing a backup after an app was deleted
    # leads to the state that marathon was not able to re-start, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = abdicate_leader_with_backup(backup_url2)

    # wait until leader changed
    # if leader changed, this means that marathon was able to start again, which is great :-).
    marathon_leadership_changed()

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)
def test_marathon_backup_and_check_apps(marathon_service_name):
    """Abdicate the Marathon leader with a backup twice, removing the app in between.

    After the first abdication the app must still run one instance. After the
    app was removed, a second abdication with backup must still let Marathon
    come back (MARATHON-7525), and the app must stay removed.

    :param marathon_service_name: name of the Marathon service endpoint to wait for.
    """
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'

    # Remove backup files on every master; the command results are
    # intentionally ignored.
    for master_ip in shakedown.get_all_master_ips():
        shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    def abdicate_leader_with_backup(backup_url):
        # Record the current leader, ask it to abdicate with a backup and wait
        # for the service endpoint; returns the leader that was asked to abdicate.
        leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(leader))
        url = 'v2/leader?backup={}'.format(backup_url)
        print('DELETE {}'.format(url))
        common.delete_marathon_path(url)
        shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())
        return leader

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        # Reads `original_leader` from the enclosing scope at call time.
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception:
            if expected_instances != 0:
                raise
            # The app cannot be fetched and none was expected: fine.
            return
        assert expected_instances != 0, "The application resurrected"
        # Previously this comparison lacked `assert` and therefore never
        # verified the number of running instances.
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(
                app["tasksRunning"], expected_instances)

    # Abdicate the leader with backup
    original_leader = abdicate_leader_with_backup(backup_url1)

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    shakedown.deployment_wait()

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem, that doing a backup after an app was deleted
    # leads to the state that marathon was not able to re-start, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = abdicate_leader_with_backup(backup_url2)

    # wait until leader changed
    # if leader changed, this means that marathon was able to start again, which is great :-).
    marathon_leadership_changed()

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)
def setup_module(module):
    """Module-level setup: make sure MoM is installed and start from a clean state.

    Ensures MoM, waits up to five minutes for the 'marathon-user' endpoint,
    calls ``common.cluster_info()`` and finally clears Marathon inside the
    ``marathon_on_marathon`` context.

    :param module: the test module being set up (unused).
    """
    common.ensure_mom()

    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint('marathon-user', endpoint_timeout)

    common.cluster_info()

    with shakedown.marathon_on_marathon():
        clear_marathon()
def setup_module(module):
    """Module-level setup: ensure MoM, wait for its endpoint, then call ``cluster_info()``.

    Verifies that the test system requirements are met (number of nodes needed).

    :param module: the test module being set up (unused).
    """
    ensure_mom()

    package_endpoint = PACKAGE_APP_ID
    shakedown.wait_for_service_endpoint(package_endpoint)

    cluster_info()
def test_marathon_backup_and_check_apps(marathon_service_name):
    """Abdicate the Marathon leader with a backup twice, removing the app in between.

    After the first abdication the app must still run one instance. After the
    app was removed, a second abdication with backup must still let Marathon
    come back (MARATHON-7525), and no instance of the app may be running.

    :param marathon_service_name: name of the Marathon service endpoint to wait for.
    """
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'
    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # start an app
    app_def = common.app(id=uuid.uuid4().hex)
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # wait_fixed=1000 spaces the attempts one second apart; without it the 30
    # attempts run back to back and give a new leader almost no time to appear.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        # Reads `original_leader` from the enclosing scope at call time.
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception:
            if expected_instances != 0:
                raise
            # A removed app can no longer be fetched; that counts as zero
            # running instances instead of failing the check.
            return
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(
                app["tasksRunning"], expected_instances)

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url1)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    shakedown.deployment_wait()

    # Previously the removed app was fetched unconditionally here, which
    # fails once the app definition is really gone.
    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem, that doing a backup after an app was deleted
    # leads to the state that marathon was not able to re-start, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url2)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    # if leader changed, this means that marathon was able to start again, which is great :-).
    marathon_leadership_changed()

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)