def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app from Marathon and restarts the master.
    It is expected that the service endpoint will come back and that the
    task_id is the original task_id.
    """

    app_def = app('master-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/master-failure')
    original_task_id = tasks[0]['id']

    common.systemctl_master()
    shakedown.wait_for_service_endpoint(marathon_service_name)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000, retry_on_exception=retry_on_exception)
    def check_task_recovery():
        tasks = client.get_tasks('/master-failure')
        # without `assert`, this comparison was a no-op and the check always passed
        assert tasks[0]['id'] == original_task_id

    check_task_recovery()
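# The test above passes a `retry_on_exception` predicate to `retrying.retry`. A
# minimal sketch of what that helper may look like (an assumption; the real one
# lives in this suite's common module): `retrying` calls the predicate with the
# raised exception and keeps retrying while it returns True.
def retry_on_exception(exc):
    # retry on anything, including the AssertionError raised by a failing check
    return isinstance(exc, Exception)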
def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then cuts off the MoM's access to ZK.
    Verify the task is still good once access is restored.
    """

    app_def = app('zk-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/zk-failure')
    original_task_id = tasks[0]['id']

    with shakedown.iptable_rules(host):
        block_port(host, 2181)
        # duration of the zk block
        time.sleep(10)

    # after access to zk is restored
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        tasks = client.get_tasks('/zk-failure')
        # without `assert`, this comparison was a no-op and the check always passed
        assert tasks[0]['id'] == original_task_id

    check_task_is_back()
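# `block_port` is a helper from this suite's common module; a hedged sketch of
# its likely shape, assuming an iptables DROP rule on the agent (the exact
# invocation is an assumption). The `shakedown.iptable_rules` context manager
# used above saves the rules on entry and restores them on exit.
def block_port(host, port):
    shakedown.run_command_on_agent(
        host, 'sudo iptables -I INPUT -p tcp --dport {} -j DROP'.format(port))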
def test_command_health_check_healthy():
    # Test COMMAND protocol
    with marathon_on_marathon():
        client = marathon.create_client()
        app_def = app()
        assert_app_healthy(client, app_def, command_health_check())
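# A minimal sketch of `command_health_check()` (an assumption; the helper is
# defined in this suite's common module). It returns a Marathon health-check
# definition using the COMMAND protocol; the command and timings shown here are
# illustrative defaults.
def command_health_check(command='true'):
    return {
        'protocol': 'COMMAND',
        'command': {'value': command},
        'gracePeriodSeconds': 30,
        'intervalSeconds': 2,
        'maxConsecutiveFailures': 3
    }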
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # start an app
    app_def = common.app(id=uuid.uuid4().hex)
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1

    # abdicate the leader after the app was started successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader

    # wait until the leader changed
    marathon_leadership_changed()

    @retrying.retry(stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances

    # check that the app definition is still there and one instance is still
    # running after the new leader was elected
    check_app_existence(1)

    client.remove_app(app_id)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 0

    # abdicate the leader again, this time after the app was removed
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until the leader changed
    marathon_leadership_changed()

    # check that the app definition is gone and no instance is running
    # after the new leader was elected
    check_app_existence(0)
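# A hedged sketch of `common.delete_marathon_path` (an assumption about the
# helper's shape): it issues an HTTP DELETE against the Marathon API, here via
# the dcos package's http module.
import dcos.http

def delete_marathon_path(path):
    url = '{}/{}'.format(shakedown.dcos_service_url('marathon').rstrip('/'), path)
    return dcos.http.delete(url)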
def test_good_user():
    """Test changes an app from the non-specified (default) user to another
    good user. This only works on CoreOS or a system with a `core` user.
    """

    app_id = uuid.uuid4().hex
    app_def = app(app_id)
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)
    # if the user is bad, this wait will fail
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert tasks[0]['id'] != app_def['id'], \
        "Good user `core` didn't launch. This only works on CoreOS or a system with a core user."
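# A minimal sketch of the `app()` fixture these tests mutate (an assumption;
# the real helper lives in this suite's common module). It returns a trivial
# sleeper app definition keyed by the given id, which tests then extend with
# fields such as 'user', 'fetch', or 'constraints'.
def app(app_id='app-test'):
    return {
        'id': '/' + str(app_id).lstrip('/'),
        'cmd': 'sleep 1000',
        'cpus': 0.1,
        'mem': 32,
        'instances': 1
    }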
def test_bad_user():
    """Test changes the default user to a bad user and confirms that the task
    will not launch.
    """

    app_id = uuid.uuid4().hex
    app_def = app(app_id)
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_failure_message():
        appl = client.get_app(app_id)
        message = appl['lastTaskFailure']['message']
        error = "Failed to get user information for 'bad'"
        assert error in message

    # the retry-decorated check was defined but never invoked
    check_failure_message()
def test_task_failure_recovers():
    """Tests that if a task is KILLED, it is relaunched and the new task has a
    different taskId.
    """

    app_id = uuid.uuid4().hex
    app_def = app(app_id)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']
    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(stop_max_delay=10000)
    def check_new_task_id():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0]['id']

    # the retry-decorated check was defined but never invoked
    check_new_task_id()
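# Note on the '[s]leep' pattern above: bracketing the first character keeps the
# `ps | grep` pipeline that runs on the agent from matching its own grep
# process, so only the real sleep task is killed. A hedged, by-hand equivalent
# (the exact shell pipeline is an assumption):
def kill_sleep_by_hand(host):
    shakedown.run_command_on_agent(
        host, "ps aux | grep '[s]leep' | awk '{print $2}' | xargs sudo kill -9")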
def test_marathon_when_task_agent_bounced():
    """Launches an app and restarts the node the task is running on."""

    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/agent-failure')
    original_task_id = tasks[0]['id']

    shakedown.restart_agent(host)

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        tasks = client.get_tasks('/agent-failure')
        # without `assert`, this comparison was a no-op and the check always passed
        assert tasks[0]['id'] == original_task_id

    # the retry-decorated check was defined but never invoked
    check_task_is_back()
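# A minimal sketch of `ip_other_than_mom()` (an assumption about the common
# helper): it picks a private agent that is not hosting the MoM task, so the
# agent- and master-failure tests never bounce the node MoM itself runs on.
def ip_other_than_mom():
    mom_ip = ip_of_mom()
    for agent in shakedown.get_private_agents():
        if agent != mom_ip:
            return agent
    return None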
def test_pinned_task_does_not_find_unknown_host():
    """Tests that a task pinned to an unknown host will not launch.
    Within 10 secs it is still deploying and 0 tasks are running.
    """

    app_def = app('pinned')
    # pin to a host that does not exist in the cluster
    pin_to_host(app_def, '10.255.255.254')
    # only 1 can fit on the node
    app_def['cpus'] = 3.5

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        # deploys are within secs
        # assuming that after 10 secs no task meets the criteria
        time.sleep(10)
        tasks = client.get_tasks('/pinned')
        assert len(tasks) == 0
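# A minimal sketch of `pin_to_host` (an assumption about the common helper): it
# pins an app to a single agent with a Marathon hostname constraint, which is
# why the unknown-host test above never places a task.
def pin_to_host(app_def, host):
    app_def['constraints'] = [['hostname', 'LIKE', host]]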
def test_pinned_task_does_not_find_unknown_host():
    """Tests that a task pinned to an unknown host will not launch.
    Within 10 secs it is still deploying and 0 tasks are running.
    """

    app_def = app('pinned')
    # pin to a host that does not exist in the cluster
    pin_to_host(app_def, '10.255.255.254')
    # only 1 can fit on the node
    app_def['cpus'] = 3.5

    client = marathon.create_client()
    client.add_app(app_def)
    # deploys are within secs
    # assuming that after 10 secs no task meets the criteria
    time.sleep(10)
    tasks = client.get_tasks('/pinned')
    assert len(tasks) == 0
def test_mom_when_mom_process_killed():
    """Launches a task from MoM, then kills the MoM process. The task should
    survive and keep its original task id.
    """

    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        tasks = client.get_tasks('/agent-failure')
        # without `assert`, this comparison was a no-op
        assert tasks[0]['id'] == original_task_id
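# A minimal sketch of `ip_of_mom()` (an assumption about the common helper): it
# resolves the agent IP hosting the MoM task via shakedown's service/task lookup.
def ip_of_mom():
    mom_ips = shakedown.get_service_ips('marathon', 'marathon-user')
    return mom_ips.pop()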
def test_bad_uri():
    """Tests Marathon's response to launching a task with a bad URL
    (a URL that isn't fetchable).
    """

    app_id = uuid.uuid4().hex
    app_def = app(app_id)
    fetch = [{"uri": "http://mesosphere.io/missing-artifact"}]
    app_def['fetch'] = fetch

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_failure_message():
        appl = client.get_app(app_id)
        message = appl['lastTaskFailure']['message']
        error = "Failed to fetch all URIs for container"
        assert error in message

    check_failure_message()
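# For context, the `lastTaskFailure` object inspected above looks roughly like
# this in the Marathon app payload (the concrete values here are illustrative
# only, not taken from a real run):
EXAMPLE_LAST_TASK_FAILURE = {
    'appId': '/bad-uri-app',
    'host': '10.0.1.5',
    'message': 'Failed to fetch all URIs for container ...',
    'state': 'TASK_FAILED',
    'taskId': 'bad-uri-app.4a9a2c41-...',
    'timestamp': '2017-01-01T00:00:00.000Z'
}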
def test_pinned_task_recovers_on_host():
    """Tests that a killed pinned task recovers on the pinned node."""

    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/pinned')

        shakedown.kill_process_on_host(host, '[s]leep')
        shakedown.deployment_wait()

        @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
        def check_for_new_task():
            new_tasks = client.get_tasks('/pinned')
            assert tasks[0]['id'] != new_tasks[0]['id']
            assert new_tasks[0]['host'] == host

        # the retry-decorated check was defined but never invoked
        check_for_new_task()
def test_pinned_task_recovers_on_host():
    """Tests that a killed pinned task recovers on the pinned node."""

    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/pinned')

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_for_new_task():
        new_tasks = client.get_tasks('/pinned')
        assert tasks[0]['id'] != new_tasks[0]['id']
        assert new_tasks[0]['host'] == host

    # the retry-decorated check was defined but never invoked
    check_for_new_task()
def test_pinned_task_scales_on_host_only():
    """Tests that scaling a pinned app scales only on the pinned node."""

    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/pinned')
    assert len(tasks) == 1
    assert tasks[0]['host'] == host

    client.scale_app('pinned', 10)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/pinned')
    assert len(tasks) == 10
    for task in tasks:
        assert task['host'] == host
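# For reference, `client.scale_app('pinned', 10)` boils down to updating the
# app's instance count through the Marathon REST API; a hedged sketch of the
# raw call via the dcos http module (endpoint shape per the public Marathon API):
import dcos.http

def scale_app_by_hand(app_id, instances):
    url = '{}/v2/apps/{}'.format(shakedown.dcos_service_url('marathon').rstrip('/'), app_id)
    return dcos.http.put(url, json={'instances': instances})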
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests that an app pinned to a node (and barely fitting on it) will not
    scale past the resources of that node.
    """

    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)
    # only 1 can fit on the node
    app_def['cpus'] = 3.5

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/pinned')

        client.scale_app('pinned', 2)
        # typical deployments finish in under 3 secs
        time.sleep(5)
        deployments = client.get_deployments()
        tasks = client.get_tasks('/pinned')

        # still deploying
        assert len(deployments) == 1
        assert len(tasks) == 1
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests that an app pinned to a node (and barely fitting on it) will not
    scale past the resources of that node.
    """

    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)
    # only 1 can fit on the node
    app_def['cpus'] = 3.5

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/pinned')

    client.scale_app('pinned', 2)
    # typical deployments finish in under 3 secs
    time.sleep(5)
    deployments = client.get_deployments()
    tasks = client.get_tasks('/pinned')

    # still deploying
    assert len(deployments) == 1
    assert len(tasks) == 1
def test_command_health_check_healthy():
    # Test COMMAND protocol
    client = marathon.create_client()
    app_def = app()
    assert_app_healthy(client, app_def, command_health_check())
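# A hedged sketch of `assert_app_healthy` (an assumption about the common
# helper): it attaches the given health check, deploys the app, and asserts
# that all instances come up both running and healthy.
def assert_app_healthy(client, app_def, health_check):
    app_def['healthChecks'] = [health_check]
    app_id = app_def['id']
    instances = app_def['instances']

    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == instances
    assert app['tasksHealthy'] == instances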
def test_marathon_backup_and_check_apps(marathon_service_name):
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'
    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # start an app
    app_def = common.app(id=uuid.uuid4().hex)
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1

    # abdicate the leader with a backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url1)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader

    # wait until the leader changed
    marathon_leadership_changed()

    @retrying.retry(stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances

    # check that the app definition is still there and one instance is still
    # running after the new leader was elected
    check_app_existence(1)

    # then remove the app
    client.remove_app(app_id)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 0

    # Do a second backup. Before MARATHON-7525 taking a backup after an app was
    # deleted left Marathon unable to restart, because the second backup failed
    # constantly.

    # abdicate the leader with a backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url2)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until the leader changed; if it did, Marathon was able to start
    # again, which is great :-).
    marathon_leadership_changed()

    # check that the app definition is gone and no instance is running
    # after the new leader was elected
    check_app_existence(0)
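# A hedged sketch of how one might additionally verify that the `file://`
# backup above actually landed on the leader's disk. It assumes shakedown's
# master-shell helper returns a (status, output) pair; the path mirrors
# backup_dir/backup_file1 above.
def backup_file_exists(backup_path):
    status, output = shakedown.run_command_on_master(
        'test -f {} && echo yes'.format(backup_path))
    return status and output.strip() == 'yes'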