def test_create_pod_with_private_image():
    """Launch a pod that pulls a private Docker image with the Mesos containerizer.

    Docker Hub credentials are read from the ``DOCKER_HUB_USERNAME`` and
    ``DOCKER_HUB_PASSWORD`` environment variables and stored as the secret
    ``pullconfig``. The secret is deleted again when the test finishes,
    whether it passed or failed.
    """
    # The enterprise CLI package is installed on demand before any secret is created.
    if not common.is_enterprise_cli_package_installed():
        common.install_enterprise_cli_package()

    hub_user = os.environ['DOCKER_HUB_USERNAME']
    hub_password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    pull_config = common.create_docker_pull_config_json(hub_user, hub_password)
    secret_value = json.dumps(pull_config)

    pod_definition = pods.private_docker_pod()
    pod_id = pod_definition['id']

    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_pod(pod_definition)
        # Pulling the private image can be slow, hence the five minute timeout.
        common.deployment_wait(
            service_id=pod_id,
            timeout=timedelta(minutes=5).total_seconds())
        created_pod = client.show_pod(pod_id)
        assert created_pod is not None, "The pod has not been created"
    finally:
        common.delete_secret(secret_name)
def test_pod_health_failed_check():
    """Verify that pod tasks are replaced once their health check starts failing.

    A pod with health checks is pinned to an agent other than the one hosting
    MoM and deployed. The host port allocated to the first endpoint of the
    first container is then blocked for seven seconds. After the deployment
    settles again, none of the pod's tasks may still carry one of the two
    initial task IDs.
    """
    pod_def = pods.ports_pod()
    pod_id = pod_def['id']

    host = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_def, host)

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    initial_ids = (tasks[0]['id'], tasks[1]['id'])

    pod = client.list_pod()[0]
    container1 = pod['instances'][0]['containers'][0]
    port = container1['endpoints'][0]['allocatedHostPort']

    common.save_iptables(host)
    try:
        common.block_port(host, port)
        time.sleep(7)
    finally:
        # Always put the saved rules back, even when blocking the port or the
        # wait fails, so the agent is not left partitioned for later tests.
        common.restore_iptables(host)

    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    for task in tasks:
        assert task['id'] not in initial_ids, "One of the tasks has not been restarted"
def test_pod_with_persistent_volume():
    """Deploy a pod with a persistent volume and read a file through both containers.

    Each of the pod's two tasks is queried with ``curl`` from the master for
    the file ``foo`` under its volume's container path; both must answer
    with ``hello\\n``.
    """
    pod_definition = pods.persistent_volume_pod()
    pod_id = pod_definition['id']

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)

    # The IP address is taken from the first status of the first task and
    # is used to reach both containers.
    first_status = tasks[0]['statuses'][0]
    network_info = first_status['container_status']['network_infos'][0]
    host = network_info['ip_addresses'][0]['ip_address']

    port1 = tasks[0]['discovery']['ports']['ports'][0]["number"]
    port2 = tasks[1]['discovery']['ports']['ports'][0]["number"]
    dir1 = tasks[0]['container']['volumes'][0]['container_path']
    dir2 = tasks[1]['container']['volumes'][0]['container_path']
    print(host, port1, port2, dir1, dir2)

    def assert_serves_hello(port, directory):
        """Fetch ``<directory>/foo`` from the given port and expect ``hello\\n``."""
        cmd = "curl {}:{}/{}/foo".format(host, port, directory)
        run, data = shakedown.run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    time.sleep(1)
    assert_serves_hello(port1, dir1)
    assert_serves_hello(port2, dir2)
def test_pod_with_persistent_volume():
    """Deploy a pod with a persistent volume and read a file through both containers.

    Each of the pod's two tasks is queried with ``curl`` from the master for
    the file ``foo`` under its volume's container path. The query is retried
    until it answers with ``hello\\n`` or the retry budget is used up.
    """
    pod_definition = pods.persistent_volume_pod()
    pod_id = pod_definition['id']

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)

    first_task = tasks[0]
    network_info = common.running_status_network_info(first_task['statuses'])
    host = network_info['ip_addresses'][0]['ip_address']

    second_task = tasks[1]
    port1 = first_task['discovery']['ports']['ports'][0]["number"]
    port2 = second_task['discovery']['ports']['ports'][0]["number"]
    path1 = first_task['container']['volumes'][0]['container_path']
    path2 = second_task['container']['volumes'][0]['container_path']

    logger.info(
        'Deployd two containers on {}:{}/{} and {}:{}/{}'.format(
            host, port1, path1, host, port2, path2))

    # Up to 60 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=60, retry_on_exception=common.ignore_exception)
    def check_http_endpoint(port, path):
        """Fetch ``<path>/foo`` from the given port and expect ``hello\\n``."""
        cmd = "curl {}:{}/{}/foo".format(host, port, path)
        run, data = shakedown.run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    for port, path in ((port1, path1), (port2, path2)):
        check_http_endpoint(port, path)
def test_pod_with_container_bridge_network():
    """Create a pod on a "container/bridge" network and reach its HTTP endpoint.

    The ``nginx`` task must be attached to the ``mesos-bridge`` network, its
    container IP must differ from the agent's address, and the endpoint must
    be reachable through the agent address and the task's discovery port.
    """
    pod_definition = pods.container_bridge_pod()
    pod_id = pod_definition['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if shakedown.ee_version() == 'strict':
        pod_definition['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    nginx_task = common.task_by_name(common.get_pod_tasks(pod_id), "nginx")
    network_info = common.running_status_network_info(nginx_task['statuses'])
    assert network_info['name'] == "mesos-bridge", \
        "The network is {}, but mesos-bridge was expected".format(network_info['name'])

    # Port on the host; the agent IP:port is routed to the bridge IP:port.
    host_port = nginx_task['discovery']['ports']['ports'][0]['number']

    # The agent address is not directly available on the task, so it is
    # looked up from the task's slave_id.
    agent_ip = common.agent_hostname_by_id(nginx_task['slave_id'])
    assert agent_ip is not None, "Failed to get the agent IP address"

    container_ip = network_info['ip_addresses'][0]['ip_address']
    assert agent_ip != container_ip, "The container IP address is the same as the agent one"

    common.assert_http_code("http://{}:{}/".format(agent_ip, host_port))
def test_pod_health_failed_check():
    """Verify that pod tasks are replaced once their health check starts failing.

    A pod with health checks is pinned to an agent other than the one hosting
    MoM and deployed. Inbound traffic to the host port of the first
    container's first endpoint is then blocked for seven seconds. After the
    deployment settles again, every task must carry a new task ID.
    """
    pod_definition = pods.ports_pod()
    pod_id = pod_definition['id']

    host = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_definition, host)

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    original_tasks = common.get_pod_tasks(pod_id)
    initial_ids = (original_tasks[0]['id'], original_tasks[1]['id'])

    first_pod = client.list_pod()[0]
    first_container = first_pod['instances'][0]['containers'][0]
    port = first_container['endpoints'][0]['allocatedHostPort']

    common.block_iptable_rules_for_seconds(host, port, 7, block_input=True, block_output=False)
    common.deployment_wait(service_id=pod_id)

    for new_task in common.get_pod_tasks(pod_id):
        new_task_id = new_task['id']
        assert new_task_id not in initial_ids, f"Task {new_task_id} has not been restarted"  # NOQA E999
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM, restart the agent MoM runs on, and expect the task to survive.

    The app is pinned to a different agent than MoM, so restarting MoM's
    agent must not change the app's task ID.
    """
    app_definition = apps.sleep_app()
    app_id = app_definition["id"]

    mom_ip = common.ip_of_mom()
    task_host = common.ip_other_than_mom()
    common.pin_to_host(app_definition, task_host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_definition)
        common.deployment_wait(service_id=app_id)

        original_task_id = client.get_tasks(app_id)[0]['id']

        shakedown.restart_agent(mom_ip)

        # Up to 30 attempts, one second apart.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            """Assert that the app's first task still has the original ID."""
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_mom_when_mom_process_killed():
    """Launch an app from MoM, kill the MoM process, and expect the task to survive.

    After the ``marathon-assembly`` process is killed on MoM's host, the test
    waits for the ``marathon-user`` task and its ``ping`` endpoint to come
    back, then checks that the app's task ID is unchanged.
    """
    app_definition = apps.sleep_app()
    app_id = app_definition["id"]

    task_host = common.ip_other_than_mom()
    common.pin_to_host(app_definition, task_host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_definition)
        common.deployment_wait(service_id=app_id)

        original_task_id = client.get_tasks(app_id)[0]['id']

        common.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        common.wait_for_service_endpoint('marathon-user', path="ping")

        # Up to 30 attempts, one second apart.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            """Assert that the app's first task still has the original ID."""
            current_tasks = client.get_tasks(app_id)
            assert current_tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_pod_secret_env_var(secret_fixture):
    """Expose a secret to a pod as an environment variable and read it back.

    The pod's single container writes ``$SECRET_ENV`` to a file in its
    sandbox and serves the sandbox over HTTP; the file content fetched from
    the master must equal the secret value.

    Args:
        secret_fixture: pair of ``(secret_name, secret_value)``.
    """
    secret_name, secret_value = secret_fixture

    pod_id = '/{}'.format(uuid.uuid4().hex)

    container = {
        "name": "container-1",
        "resources": {
            "cpus": 0.5,
            "mem": 64
        },
        "endpoints": [{
            "name": "http",
            "hostPort": 0,
            "protocol": ["tcp"]
        }],
        "exec": {
            "command": {
                "shell": "echo $SECRET_ENV && "
                         "echo $SECRET_ENV >> $MESOS_SANDBOX/secret-env && "
                         "/opt/mesosphere/bin/python -m http.server $ENDPOINT_HTTP"
            }
        }
    }
    pod_def = {
        "id": pod_id,
        "containers": [container],
        "environment": {
            "SECRET_ENV": {
                "secret": "secret1"
            }
        },
        "networks": [{
            "mode": "host"
        }],
        "secrets": {
            "secret1": {
                "source": secret_name
            }
        }
    }

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    instances = client.show_pod(pod_id)['instances']
    assert len(instances) == 1, 'Failed to start the secret environment variable pod'

    instance = instances[0]
    port = instance['containers'][0]['endpoints'][0]['allocatedHostPort']
    host = instance['networks'][0]['addresses'][0]

    cmd = "curl {}:{}/secret-env".format(host, port)
    status, data = shakedown.run_command_on_master(cmd)

    assert status, "{} did not succeed. status = {}, data = {}".format(cmd, status, data)
    assert data.rstrip() == secret_value, "Got an unexpected secret data"
def test_two_pods_with_shared_volume():
    """Check that one container can read what the other wrote to a shared volume.

    The reading container fails when it cannot read the file, so the pod is
    considered healthy if it still has two tasks four seconds after the
    deployment finished.
    """
    pod_definition = pods.ephemeral_volume_pod()
    pod_id = pod_definition['id']

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    tasks_after_deploy = common.get_pod_tasks(pod_id)
    assert len(tasks_after_deploy) == 2, \
        "The number of tasks is {} after deployment, but 2 was expected".format(len(tasks_after_deploy))

    # Give the reading container time to fail if the file is not readable.
    time.sleep(4)

    tasks_after_sleep = common.get_pod_tasks(pod_id)
    assert len(tasks_after_sleep) == 2, \
        "The number of tasks is {} after sleeping, but 2 was expected".format(len(tasks_after_sleep))
def test_create_pod_with_private_image():
    """Launch a pod that pulls a private Docker image with the Mesos containerizer.

    This test relies on the global `install_enterprise_cli` fixture to
    install the enterprise-cli-package. Docker Hub credentials are read from
    the ``DOCKER_HUB_USERNAME`` and ``DOCKER_HUB_PASSWORD`` environment
    variables and stored as the secret ``pullconfig``, which is deleted again
    when the test finishes.
    """
    hub_user = os.environ['DOCKER_HUB_USERNAME']
    hub_password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    pull_config = common.create_docker_pull_config_json(hub_user, hub_password)
    secret_value = json.dumps(pull_config)

    pod_definition = pods.private_docker_pod()
    pod_id = pod_definition['id']

    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_pod(pod_definition)
        common.deployment_wait(max_attempts=300, service_id=pod_id)
        created_pod = client.show_pod(pod_id)
        assert created_pod is not None, "The pod has not been created"
    finally:
        common.delete_secret(secret_name)
def test_create_and_update_pod():
    """Create a pod, scale it, and check that both versions are recorded.

    The pod starts with one instance and is updated to three. Exactly two
    versions must exist afterwards, and they must differ in instance count.
    """
    pod_definition = pods.simple_pod()
    pod_definition["scaling"]["instances"] = 1
    pod_id = pod_definition['id']

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    pod_definition["scaling"]["instances"] = 3
    client.update_pod(pod_id, pod_definition)
    common.deployment_wait(service_id=pod_id)

    versions = get_pod_versions(pod_id)
    assert len(versions) == 2, \
        "The number of versions is {}, but 2 was expected".format(len(versions))

    first_version = get_pod_version(pod_id, versions[0])
    second_version = get_pod_version(pod_id, versions[1])
    assert first_version["scaling"]["instances"] != second_version["scaling"]["instances"], \
        "Two pod versions have the same number of instances: {}, but they should not".format(
            first_version["scaling"]["instances"])
def test_launch_docker_grace_period(marathon_service_name):
    """Check that 'taskKillGracePeriodSeconds' is honoured for a Docker container.

    Read more details about this test in
    `test_root_marathon.py::test_launch_mesos_root_marathon_grace_period`

    Args:
        marathon_service_name: name of the Marathon service whose tasks are
            looked up.
    """
    app_id = '/launch-docker-grace-period-app'
    task_name = app_id.lstrip('/')

    default_grace_period = 3
    grace_period = 20

    app_definition = apps.docker_http_server(app_id)
    app_definition['container']['docker']['image'] = 'kensipe/python-test'
    app_definition['taskKillGracePeriodSeconds'] = grace_period
    app_definition['cmd'] = 'python test.py'

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    tasks = get_service_task(marathon_service_name, task_name)
    assert tasks is not None

    # Scaling to zero asks Marathon to kill the task.
    client.scale_app(app_id, 0)
    tasks = get_service_task(marathon_service_name, task_name)
    assert tasks is not None

    # The task must still be around once the default grace period has passed...
    time.sleep(default_grace_period + 1)
    tasks = get_service_task(marathon_service_name, task_name)
    assert tasks is not None

    # ...but not after the configured grace period.
    time.sleep(grace_period)
    assert_that(
        lambda: get_service_task(marathon_service_name, task_name),
        eventually(equal_to(None), max_attempts=30))
def test_pod_with_container_network():
    """Create a pod on a "container" network and reach its HTTP endpoint.

    The ``nginx`` task must be attached to the ``dcos`` network and have a
    container IP address, and port 80 on that address must answer HTTP.
    """
    pod_definition = pods.container_net_pod()
    pod_id = pod_definition['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if shakedown.ee_version() == 'strict':
        pod_definition['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    nginx_task = common.task_by_name(common.get_pod_tasks(pod_id), "nginx")
    network_info = common.running_status_network_info(nginx_task['statuses'])
    assert network_info['name'] == "dcos", \
        "The network name is {}, but 'dcos' was expected".format(network_info['name'])

    container_ip = network_info['ip_addresses'][0]['ip_address']
    assert container_ip is not None, "No IP address has been assigned to the pod's container"

    common.assert_http_code("http://{}:80/".format(container_ip))
async def test_event_channel_for_pods(sse_events):
    """Check that the Marathon event channel reports pod lifecycle events.

    Expected in order: ``event_stream_attached``, then ``pod_created_event``
    and ``deployment_step_success`` after the pod is created, then
    ``pod_updated_event`` after it is scaled to three instances.

    Args:
        sse_events: event source handed to ``common.assert_event``.
    """
    await common.assert_event('event_stream_attached', sse_events)

    pod_definition = pods.simple_pod()
    pod_id = pod_definition['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write files.
    if shakedown.ee_version() == 'strict':
        pod_definition['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    for expected_event in ('pod_created_event', 'deployment_step_success'):
        await common.assert_event(expected_event, sse_events)

    pod_definition["scaling"]["instances"] = 3
    client.update_pod(pod_id, pod_definition)
    common.deployment_wait(service_id=pod_id)

    await common.assert_event('pod_updated_event', sse_events)
def assert_mom_ee(version, security_mode='permissive'):
    """Deploy a MoM-EE instance and wait until its ``ping`` endpoint responds.

    Args:
        version: MoM-EE version. It selects the app definition fixture, the
            Docker image tag (via ``mom_ee_image``) and the service endpoint.
        security_mode: security mode used to pick the fixture and the
            endpoint. ``'strict'`` additionally creates a strict service
            account secret and adds the root Marathon user ACLs.

    Raises:
        AssertionError: if no app definition fixture exists for the given
            security mode and version.
    """
    strict = security_mode == 'strict'

    ensure_service_account()
    ensure_permissions()
    ensure_sa_secret(strict=strict)
    ensure_docker_config_secret()

    # In strict mode all tasks are started as user `nobody` by default. However we start
    # MoM-EE as 'root' and for that we need to give root marathon ACLs to start
    # tasks as 'root'.
    if strict:
        common.add_dcos_marathon_user_acls()

    # Deploy MoM-EE using the fixture that matches the requested security mode and version.
    app_def_file = '{}/mom-ee-{}-{}.json'.format(fixtures.fixtures_dir(), security_mode, version)
    assert os.path.isfile(app_def_file), \
        "Couldn't find appropriate MoM-EE definition: {}".format(app_def_file)

    image = mom_ee_image(version)
    logger.info('Deploying {} definition with {} image'.format(app_def_file, image))

    app_def = get_resource(app_def_file)
    app_def['container']['docker']['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image)
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)
    common.wait_for_service_endpoint(mom_ee_endpoint(version, security_mode), path="ping")
def test_task_gets_restarted_due_to_network_split():
    """Verify that a task is replaced after a network partition fails its health check.

    An HTTP server app with a health check is pinned to an agent other than
    the one hosting MoM. Once it is running and healthy, inbound traffic to
    the task's port is blocked for ten seconds. Afterwards the app must again
    report one running, healthy task, and that task must have a new task ID.
    """
    app_def = apps.http_server()
    app_id = app_def["id"]
    app_def['healthChecks'] = [common.health_check()]
    common.pin_to_host(app_def, common.ip_other_than_mom())

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
    assert app['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 1 was expected".format(app['tasksHealthy'])

    tasks = client.get_tasks(app_id)
    task_id = tasks[0]['id']
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # introduce a network partition
    common.block_iptable_rules_for_seconds(host, port, sleep_seconds=10, block_input=True, block_output=False)

    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    tasks = client.get_tasks(app_id)
    new_task_id = tasks[0]['id']
    assert task_id != new_task_id, "The task didn't get killed because of a failed health check"

    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
    # The message used to claim that 0 healthy tasks were expected, which
    # contradicted the condition being checked.
    assert app['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 1 was expected".format(app['tasksHealthy'])

    # network partition should cause a task restart
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_health_message():
        """Assert that a new task is running and healthy (up to 30 attempts, 1s apart)."""
        tasks = client.get_tasks(app_id)
        new_task_id = tasks[0]['id']
        assert task_id != new_task_id, "The task has not been restarted: {}".format(task_id)

        app = client.get_app(app_id)
        assert app['tasksRunning'] == 1, \
            "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
        assert app['tasksHealthy'] == 1, \
            "The number of healthy tasks is {}, but 1 was expected".format(app['tasksHealthy'])

    check_health_message()
def test_vip_mesos_cmd(marathon_service_name):
    """Create an app with a VIP label and check that it is reachable through the VIP.

    Args:
        marathon_service_name: name of the Marathon service; it forms part of
            the ``l4lb.thisdcos.directory`` name used to reach the app.
    """
    app_definition = apps.http_server()
    app_id = app_definition["id"]

    vip_name = app_id.lstrip("/")
    vip_fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)

    vip_port_definition = {
        "port": 0,
        "protocol": "tcp",
        "name": vip_name,
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        }
    }
    app_definition['portDefinitions'] = [vip_port_definition]

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    # Up to 30 attempts, one second apart, plus the one second sleep in each attempt.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        """Assert that the VIP address answers HTTP on port 10000."""
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(vip_fqn, 10000))

    http_output_check()
def test_docker_dns_mapping(marathon_service_name):
    """Check that a running Docker task can be reached through its DNS name.

    A ping to a deliberately wrong name must fail first; a ping to
    ``<app>.<marathon_service_name>.mesos`` must then succeed.

    Args:
        marathon_service_name: name of the Marathon service; it forms part of
            the task's ``.mesos`` DNS name.
    """
    app_definition = apps.docker_http_server(app_id='/docker-dns-mapping-app')
    app_id = app_definition["id"]

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    # Negative control: this name is not expected to resolve.
    bad_cmd = 'ping -c 1 docker-test.marathon-user.mesos-bad'
    bad_status, _ = shakedown.run_command_on_master(bad_cmd)
    assert not bad_status

    # Up to 30 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_dns():
        """Wait for the app's DNS name and assert that it can be pinged."""
        dnsname = '{}.{}.mesos'.format(app_id.lstrip('/'), marathon_service_name)
        ping_cmd = 'ping -c 1 {}'.format(dnsname)
        shakedown.wait_for_dns(dnsname)
        ping_ok, _ = shakedown.run_command_on_master(ping_cmd)
        assert ping_ok, "ping failed for app using DNS lookup: {}".format(dnsname)

    check_dns()
def test_marathon_when_task_agent_bounced():
    """Launch an app, restart the agent it runs on, and expect the same task afterwards."""
    app_definition = apps.sleep_app()
    app_id = app_definition["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_definition, host)

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    original_task_id = client.get_tasks(app_id)[0]['id']

    shakedown.restart_agent(host)

    # Up to 30 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        """Assert that the app's first task still has the original ID."""
        current_tasks = client.get_tasks(app_id)
        assert current_tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, current_tasks[0]['id'])

    check_task_is_back()
def test_marathon_when_disconnected_from_zk():
    """Launch an app, block ZooKeeper traffic for a while, and expect the task to be kept.

    Inbound traffic on port 2181 is blocked for ten seconds on the host the
    app is pinned to; the task ID must be unchanged afterwards.
    """
    app_definition = apps.sleep_app()
    app_id = app_definition["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_definition, host)

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    original_task_id = client.get_tasks(app_id)[0]['id']

    common.block_iptable_rules_for_seconds(host, 2181, sleep_seconds=10, block_input=True, block_output=False)

    # Up to 30 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        """Assert that the app's first task still has the original ID."""
        current_tasks = client.get_tasks(app_id)
        assert current_tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, current_tasks[0]['id'])

    check_task_is_back()
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launch an app, restart the master, and expect the same single task afterwards.

    The service endpoint is expected to come back eventually, and the task
    ID is expected to stay the same.

    Args:
        marathon_service_name: name of the Marathon service whose ``ping``
            endpoint is awaited after the restart.
    """
    app_definition = apps.sleep_app()
    app_id = app_definition["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_definition, host)

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    original_task_id = client.get_tasks(app_id)[0]['id']

    common.systemctl_master('restart')
    common.wait_for_service_endpoint(marathon_service_name, path="ping")

    # Up to 30 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        """Assert that exactly one task exists and that it kept its ID."""
        current_tasks = client.get_tasks(app_id)
        assert len(current_tasks) == 1, \
            "The number of tasks is {} after master restart, but 1 was expected".format(len(current_tasks))
        assert current_tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(
                original_task_id, current_tasks[0]['id'])

    check_task_recovery()
def test_pod_restarts_on_nonzero_exit_code():
    """Verify that a pod is restarted when one of its containers exits non-zero.

    The first container's command is replaced with one that exits with code 2
    after five seconds. After the restart, neither of the two initial task
    IDs may be present any more.
    """
    pod_definition = pods.simple_pod()
    pod_id = pod_definition['id']
    pod_definition["scaling"]["instances"] = 1
    pod_definition['containers'][0]['exec']['command']['shell'] = 'sleep 5; echo -n leaving; exit 2'

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    original_tasks = common.get_pod_tasks(pod_id)
    initial_ids = (original_tasks[0]['id'], original_tasks[1]['id'])

    # 1 sec past the 5 sec sleep in one of the container's command
    time.sleep(6)

    for task in common.get_pod_tasks(pod_id):
        assert task['id'] not in initial_ids, "Got the same task ID"
def test_pinned_task_scales_on_host_only():
    """Check that a pinned app only ever scales on the host it is pinned to.

    The app starts with one task on the pinned host and is scaled to ten;
    all ten tasks must run on that same host.
    """
    app_definition = apps.sleep_app()
    app_id = app_definition["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_definition, host)

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    initial_tasks = client.get_tasks(app_id)
    assert len(initial_tasks) == 1, \
        "The number of tasks is {} after deployment, but 1 was expected".format(len(initial_tasks))
    assert initial_tasks[0]['host'] == host, \
        "The task is on {}, but it is supposed to be on {}".format(initial_tasks[0]['host'], host)

    client.scale_app(app_id, 10)
    common.deployment_wait(service_id=app_id)

    scaled_tasks = client.get_tasks(app_id)
    assert len(scaled_tasks) == 10, \
        "The number of tasks is {} after scale, but 10 was expected".format(len(scaled_tasks))
    for task in scaled_tasks:
        assert task['host'] == host, \
            "The task is on {}, but it is supposed to be on {}".format(task['host'], host)
def test_pinned_task_recovers_on_host():
    """Check that a killed pinned task comes back on the host it was pinned to."""
    app_definition = apps.sleep_app()
    app_id = app_definition["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_definition, host)

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    original_tasks = client.get_tasks(app_id)

    common.kill_process_on_host(host, '[s]leep')
    common.deployment_wait(service_id=app_id)

    # Up to 30 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        """Assert that a task with a new ID is running on the pinned host."""
        new_tasks = client.get_tasks(app_id)
        assert original_tasks[0]['id'] != new_tasks[0]['id'], \
            "The task did not get killed: {}".format(original_tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()
def test_create_pod_with_private_image():
    """Launch a pod that pulls a private Docker image with the Mesos containerizer.

    Docker Hub credentials are read from the ``DOCKER_HUB_USERNAME`` and
    ``DOCKER_HUB_PASSWORD`` environment variables and stored as the secret
    ``pullconfig``. The secret is deleted again when the test finishes,
    whether it passed or failed.
    """
    # The enterprise CLI package is installed on demand before any secret is created.
    if not common.is_enterprise_cli_package_installed():
        common.install_enterprise_cli_package()

    hub_user = os.environ['DOCKER_HUB_USERNAME']
    hub_password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    pull_config = common.create_docker_pull_config_json(hub_user, hub_password)
    secret_value = json.dumps(pull_config)

    pod_definition = pods.private_docker_pod()
    pod_id = pod_definition['id']

    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_pod(pod_definition)
        # Pulling the private image can be slow, hence the five minute timeout.
        common.deployment_wait(
            service_id=pod_id,
            timeout=timedelta(minutes=5).total_seconds())
        created_pod = client.show_pod(pod_id)
        assert created_pod is not None, "The pod has not been created"
    finally:
        common.delete_secret(secret_name)
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Check that a pinned app never spills over onto other hosts when scaled.

    The task is sized so that it barely fits on the pinned node. When the app
    is asked to scale beyond that node's resources, no task may be launched
    on any other node: the deployment stays pending with a single task.
    """
    app_definition = apps.sleep_app()
    app_id = app_definition['id']

    host = common.ip_other_than_mom()
    print('Constraint set to host: {}'.format(host))

    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    agent_cores = common.cpus_on_agent(host)
    app_definition['cpus'] = max(0.6, agent_cores - 0.5)

    common.pin_to_host(app_definition, host)

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    client.scale_app(app_id, 2)
    time.sleep(5)

    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(deployments) == 1, \
        "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, \
        "The number of tasks is {}, but 1 was expected".format(len(tasks))
def test_pod_with_persistent_volume():
    """Deploy a pod with a persistent volume and read a file through both containers.

    Each of the pod's two tasks is queried with ``curl`` from the master for
    the file ``foo`` under its volume's container path. The query is retried
    until it answers with ``hello\\n`` or the retry budget is used up.
    """
    pod_definition = pods.persistent_volume_pod()
    pod_id = pod_definition['id']

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)

    first_task = tasks[0]
    network_info = common.running_status_network_info(first_task['statuses'])
    host = network_info['ip_addresses'][0]['ip_address']

    second_task = tasks[1]
    port1 = first_task['discovery']['ports']['ports'][0]["number"]
    port2 = second_task['discovery']['ports']['ports'][0]["number"]
    path1 = first_task['container']['volumes'][0]['container_path']
    path2 = second_task['container']['volumes'][0]['container_path']
    print(host, port1, port2, path1, path2)

    # Up to 60 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=60, retry_on_exception=common.ignore_exception)
    def check_http_endpoint(port, path):
        """Fetch ``<path>/foo`` from the given port and expect ``hello\\n``."""
        cmd = "curl {}:{}/{}/foo".format(host, port, path)
        run, data = shakedown.run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    for port, path in ((port1, path1), (port2, path2)):
        check_http_endpoint(port, path)
def test_app_update():
    """Deploy an app, update its cpus and instance count, and check the new task count.

    The app starts with one task; after being updated to two instances with
    one CPU each, it must have two tasks.
    """
    app_definition = apps.mesos_app(app_id='/update-app')
    app_id = app_definition["id"]

    client = marathon.create_client()
    client.add_app(app_definition)
    common.deployment_wait(service_id=app_id)

    tasks_before = client.get_tasks(app_id)
    assert len(tasks_before) == 1, \
        "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks_before))

    app_definition['cpus'] = 1
    app_definition['instances'] = 2

    client.update_app(app_id, app_definition)
    common.deployment_wait(service_id=app_id)

    tasks_after = client.get_tasks(app_id)
    assert len(tasks_after) == 2, \
        "The number of tasks is {} after deployment, but 2 was expected".format(len(tasks_after))
def test_app_with_persistent_volume_recovers():
    """Verify that a killed task with a persistent volume recovers with its data.

    When an app task with a persistent volume gets killed, it is expected to
    recover on the node it was launched on and to be attached to the same
    persistent volume. The test reads ``/data/foo`` before the kill
    (``hello\\n``) and again from the recovered task (``hello\\nhello\\n``).
    """
    app_def = apps.persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, \
        "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    task_id = tasks[0]['id']
    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)

    # Up to 30 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task(cmd, target_data):
        """Run ``cmd`` on the master and assert that its output contains ``target_data``."""
        run, data = shakedown.run_command_on_master(cmd)

        assert run, "{} did not succeed".format(cmd)
        assert target_data in data, "'{}' not found in {}".format(target_data, data)

    check_task(cmd, target_data='hello\n')

    shakedown.kill_process_on_host(host, '[h]ttp.server')

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        """Assert that exactly one task with a new ID exists and return the task list."""
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, \
            "The number of tasks is {} after recovery, but 1 was expected".format(len(tasks))

        new_task_id = tasks[0]['id']
        assert task_id != new_task_id, "The task ID has not changed, and is still {}".format(task_id)
        return tasks

    # Use the task list fetched after recovery. The list bound inside the
    # helper is local to it, so the outer ``tasks`` would still describe the
    # task that was killed.
    recovered_tasks = check_task_recovery()

    port = recovered_tasks[0]['ports'][0]
    host = recovered_tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)

    check_task(cmd, target_data='hello\nhello\n')
def test_install_universe_package(package):
    """Install a Universe package and check that it ends up healthy.

    Marathon is responsible for installing packages from the universe; this
    test confirms that the given package reaches a healthy state.

    Args:
        package: name of the Universe package to install.
    """
    install_package_and_wait(package)
    installed = package_installed(package)
    assert installed, 'Package failed to install'

    common.deployment_wait(max_attempts=300)
    assert service_healthy(package)
def test_app_secret_env_var(secret_fixture):
    """Expose a secret to an app as an environment variable and read it back.

    The app writes ``$SECRET_ENV`` to a file in its sandbox and serves the
    sandbox over HTTP; the file content fetched from the master must equal
    the secret value.

    Args:
        secret_fixture: pair of ``(secret_name, secret_value)``.
    """
    secret_name, secret_value = secret_fixture

    app_id = '/app-secret-env-var-{}'.format(uuid.uuid4().hex)
    app_def = {
        "id": app_id,
        "instances": 1,
        "cpus": 0.5,
        "mem": 64,
        "cmd": "echo $SECRET_ENV >> $MESOS_SANDBOX/secret-env && "
               "/opt/mesosphere/bin/python -m http.server $PORT_API",
        "env": {
            "SECRET_ENV": {
                "secret": "secret1"
            }
        },
        "portDefinitions": [{
            "port": 0,
            "protocol": "tcp",
            "name": "api",
            "labels": {}
        }],
        "secrets": {
            "secret1": {
                "source": secret_name
            }
        }
    }

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, 'Failed to start the secret environment variable app'

    only_task = tasks[0]
    port = only_task['ports'][0]
    host = only_task['host']
    cmd = "curl {}:{}/secret-env".format(host, port)

    # Up to 30 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def value_check():
        """Fetch the secret file from the app and compare it with the secret value."""
        status, data = run_command_on_master(cmd)
        assert status, "{} did not succeed".format(cmd)
        assert data.rstrip() == secret_value

    value_check()
def test_create_pod():
    """Launch a simple pod in DC/OS root Marathon and check that it can be shown."""
    pod_definition = pods.simple_pod()
    pod_id = pod_definition['id']

    client = marathon.create_client()
    client.add_pod(pod_definition)
    common.deployment_wait(service_id=pod_id)

    created_pod = client.show_pod(pod_id)
    assert created_pod is not None, "The pod has not been created"
def test_app_with_persistent_volume_recovers():
    """Verify that a killed task with a persistent volume recovers with its data.

    When an app task with a persistent volume gets killed, it is expected to
    recover on the node it was launched on and to be attached to the same
    persistent volume. The test reads ``/data/foo`` before the kill
    (``hello\\n``) and again from the recovered task (``hello\\nhello\\n``).
    """
    app_def = apps.persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    task_id = tasks[0]['id']
    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)

    # Up to 30 attempts, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task(cmd, target_data):
        """Run ``cmd`` on the master and assert that its output contains ``target_data``."""
        run, data = shakedown.run_command_on_master(cmd)

        assert run, "{} did not succeed".format(cmd)
        assert target_data in data, "'{}' not found in {}".format(target_data, data)

    check_task(cmd, target_data='hello\n')

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def kill_task(host, pattern):
        """Kill processes matching ``pattern`` on ``host``; fail if none was killed."""
        pids = common.kill_process_on_host(host, pattern)
        assert len(pids) != 0, "no task got killed on {} for pattern {}".format(host, pattern)

    kill_task(host, '[h]ttp\\.server')

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        """Assert that exactly one task with a new ID exists and return the task list."""
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, "The number of tasks is {} after recovery, but 1 was expected".format(len(tasks))

        new_task_id = tasks[0]['id']
        assert task_id != new_task_id, "The task ID has not changed, and is still {}".format(task_id)
        return tasks

    # Use the task list fetched after recovery. The list bound inside the
    # helper is local to it, so the outer ``tasks`` would still describe the
    # task that was killed.
    recovered_tasks = check_task_recovery()

    port = recovered_tasks[0]['ports'][0]
    host = recovered_tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)

    check_task(cmd, target_data='hello\nhello\n')
def test_multi_instance_pod():
    """Deploy the simple pod scaled to three instances and check the instance count."""
    definition = pods.simple_pod()
    pod_id = definition['id']
    definition["scaling"]["instances"] = 3

    marathon_client = marathon.create_client()
    marathon_client.add_pod(definition)
    common.deployment_wait(service_id=pod_id)

    # The pod status lists one entry per instance.
    instance_count = len(get_pod_status(pod_id)["instances"])
    assert instance_count == 3, \
        "The number of instances is {}, but 3 was expected".format(instance_count)
def test_pod_health_check():
    """Deploy the ports pod and verify every task reports a healthy running status."""
    definition = pods.ports_pod()
    pod_id = definition['id']

    marathon_client = marathon.create_client()
    marathon_client.add_pod(definition)
    common.deployment_wait(service_id=pod_id)

    for pod_task in common.get_pod_tasks(pod_id):
        # Health is read from the task's running status entry.
        running_status = common.running_task_status(pod_task['statuses'])
        is_healthy = running_status['healthy']
        assert is_healthy, "One of the pod's tasks (%s) is unhealthy" % (pod_task['name'])
def test_pin_pod():
    """Tests that a pod can be pinned to a specific host.

    The ports pod is pinned to a host chosen by ``common.ip_other_than_mom()``.
    After deployment the pod must have two tasks and its first instance must
    report that host as ``agentHostname``.
    """
    pod_def = pods.ports_pod()
    pod_id = pod_def['id']

    host = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_def, host)

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    assert len(tasks) == 2, "The number of tasks is {} after deployment, but 2 was expected".format(len(tasks))

    # Look the pod up by ID rather than taking the first entry of the pod
    # listing, so that an unrelated pod on the cluster is never inspected.
    pod = client.show_pod(pod_id)
    assert pod['instances'][0]['agentHostname'] == host, "The pod didn't get pinned to {}".format(host)
def test_pod_multi_port():
    """Deploy the ports pod and verify its two containers were given different host ports."""
    definition = pods.ports_pod()
    pod_id = definition['id']

    marathon_client = marathon.create_client()
    marathon_client.add_pod(definition)
    common.deployment_wait(service_id=pod_id)

    # Compare the first endpoint of each of the first instance's two containers.
    containers = marathon_client.show_pod(pod_id)['instances'][0]['containers']
    first_port = containers[0]['endpoints'][0]['allocatedHostPort']
    second_port = containers[1]['endpoints'][0]['allocatedHostPort']

    assert first_port != second_port, "Containers' ports are equal, but they should be different"
def test_restart_container_with_persistent_volume():
    """Restart a persistent-volume app and verify the volume data survives.

    The file served at ``/data/foo`` must equal ``hello\\n`` after the first
    deployment and ``hello\\nhello\\n`` after the app has been restarted
    (presumably each task start appends one line to the file on the volume --
    confirm against ``apps.persistent_volume_app``).
    """
    app_def = apps.persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    host = tasks[0]['host']
    port = tasks[0]['ports'][0]
    cmd = "curl {}:{}/data/foo".format(host, port)

    # Each helper below is retried up to 30 times, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task(cmd, target_data):
        run, data = shakedown.run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert data == target_data, "'{}' was not equal to {}".format(data, target_data)

    check_task(cmd, target_data='hello\n')

    client.restart_app(app_id)
    common.deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        restarted = client.get_tasks(app_id)
        assert len(restarted) == 1, \
            "The number of tasks is {} after recovery, but 1 was expected".format(len(restarted))
        return restarted

    # Use the task list observed after the restart; the list fetched before
    # it describes the replaced task and may carry an outdated host/port.
    tasks = check_task_recovery()

    host = tasks[0]['host']
    port = tasks[0]['ports'][0]
    cmd = "curl {}:{}/data/foo".format(host, port)

    check_task(cmd, target_data='hello\nhello\n')
def test_remove_pod():
    """Deploy the simple pod, remove it, and verify it can no longer be looked up."""
    definition = pods.simple_pod()
    pod_id = definition['id']

    marathon_client = marathon.create_client()
    marathon_client.add_pod(definition)
    common.deployment_wait(service_id=pod_id)

    marathon_client.remove_pod(pod_id)
    common.deployment_wait(service_id=pod_id)

    try:
        marathon_client.show_pod(pod_id)
    except Exception:
        # A failing lookup is the expected outcome once the pod is gone.
        return

    assert False, "The pod has not been removed"
def test_pod_port_communication():
    """Check that one container of a pod can reach the other over localhost.

    The second container's shell command is replaced by a script that curls
    the first endpoint, exits when curl returns code 7, and otherwise starts
    an HTTP server on its own endpoint. The pod must end up with two tasks.
    """
    definition = pods.ports_pod()
    pod_id = definition['id']

    probe_script = (
        'sleep 2; '
        'curl -m 2 localhost:$ENDPOINT_HTTPENDPOINT; '
        'if [ $? -eq 7 ]; then exit; fi; '
        '/opt/mesosphere/bin/python -m http.server $ENDPOINT_HTTPENDPOINT2'
    )
    definition['containers'][1]['exec']['command']['shell'] = probe_script

    marathon_client = marathon.create_client()
    marathon_client.add_pod(definition)
    common.deployment_wait(service_id=pod_id)

    pod_tasks = common.get_pod_tasks(pod_id)
    assert len(pod_tasks) == 2, \
        "The number of tasks is {} after deployment, but 2 was expected".format(len(pod_tasks))
def test_two_pods_with_shared_volume():
    """Check that one container can read what the other wrote to a shared volume.

    Per the pod's design the reading container fails when it cannot read the
    file, so finding two tasks both right after deployment and again four
    seconds later is taken as success.
    """
    definition = pods.ephemeral_volume_pod()
    pod_id = definition['id']

    marathon_client = marathon.create_client()
    marathon_client.add_pod(definition)
    common.deployment_wait(service_id=pod_id)

    deployed = common.get_pod_tasks(pod_id)
    assert len(deployed) == 2, \
        "The number of tasks is {} after deployment, but 2 was expected".format(len(deployed))

    # Give a failing reader time to drop out before counting again.
    time.sleep(4)

    survivors = common.get_pod_tasks(pod_id)
    assert len(survivors) == 2, \
        "The number of tasks is {} after sleeping, but 2 was expected".format(len(survivors))
def test_scale_down_pod():
    """Deploy the simple pod with three instances, then scale it down to one."""
    definition = pods.simple_pod()
    definition["scaling"]["instances"] = 3
    pod_id = definition['id']

    marathon_client = marathon.create_client()
    marathon_client.add_pod(definition)
    common.deployment_wait(service_id=pod_id)

    instance_count = len(get_pod_status(pod_id)["instances"])
    assert instance_count == 3, \
        "The number of instances is {}, but 3 was expected".format(instance_count)

    # Push the same definition again with a lower instance count.
    definition["scaling"]["instances"] = 1
    marathon_client.update_pod(pod_id, definition)
    common.deployment_wait(service_id=pod_id)

    instance_count = len(get_pod_status(pod_id)["instances"])
    assert instance_count == 1, \
        "The number of instances is {}, but 1 was expected".format(instance_count)
def test_create_and_update_pod():
    """Create a pod, update its instance count, and verify two distinct versions exist."""
    definition = pods.simple_pod()
    definition["scaling"]["instances"] = 1
    pod_id = definition['id']

    marathon_client = marathon.create_client()
    marathon_client.add_pod(definition)
    common.deployment_wait(service_id=pod_id)

    definition["scaling"]["instances"] = 3
    marathon_client.update_pod(pod_id, definition)
    common.deployment_wait(service_id=pod_id)

    # One version for the initial deployment, one for the update.
    known_versions = get_pod_versions(pod_id)
    assert len(known_versions) == 2, \
        "The number of versions is {}, but 2 was expected".format(len(known_versions))

    first_version = get_pod_version(pod_id, known_versions[0])
    second_version = get_pod_version(pod_id, known_versions[1])
    first_count = first_version["scaling"]["instances"]
    second_count = second_version["scaling"]["instances"]
    assert first_count != second_count, \
        "Two pod versions have the same number of instances: {}, but they should not".format(
            first_count)
def test_pod_restarts_on_nonzero_exit_code():
    """Check that a pod is restarted when one of its containers exits non-zero.

    The first container's command is replaced by one that sleeps five seconds
    and exits with code 2. Every task seen after that must carry an ID that
    differs from both task IDs recorded right after deployment.
    """
    definition = pods.simple_pod()
    pod_id = definition['id']
    definition["scaling"]["instances"] = 1
    definition['containers'][0]['exec']['command']['shell'] = 'sleep 5; echo -n leaving; exit 2'

    marathon_client = marathon.create_client()
    marathon_client.add_pod(definition)
    common.deployment_wait(service_id=pod_id)

    before = common.get_pod_tasks(pod_id)
    original_ids = (before[0]['id'], before[1]['id'])

    # Wait one second longer than the five-second sleep in the container command.
    time.sleep(6)

    for current in common.get_pod_tasks(pod_id):
        assert current['id'] not in original_ids, "Got the same task ID"
def test_pod_with_persistent_volume_recovers():
    """Kill the tasks of a persistent-volume pod and verify that it recovers.

    After the kill, both tasks must come back with new IDs, the first task
    must report the same IP address as before, and the file ``foo`` served
    from each task's volume path must contain ``hello\\nhello\\n`` (presumably
    one line is appended per task start -- confirm against
    ``pods.persistent_volume_pod``).
    """
    pod_def = pods.persistent_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    assert len(tasks) == 2, "The number of pod tasks is {}, but is expected to be 2".format(len(tasks))

    # Each helper below is retried up to 30 times, one second apart.
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def wait_for_status_network_info():
        current = common.get_pod_tasks(pod_id)
        # the following command throws exceptions if there are no tasks in TASK_RUNNING state
        common.running_status_network_info(current[0]['statuses'])
        return current

    # Continue with the task list that actually passed the check above; the
    # earlier snapshot may not yet contain a running status.
    tasks = wait_for_status_network_info()
    host = common.running_status_network_info(tasks[0]['statuses'])['ip_addresses'][0]['ip_address']

    task_id1 = tasks[0]['id']
    task_id2 = tasks[1]['id']

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def kill_task(host, pattern):
        pids = common.kill_process_on_host(host, pattern)
        assert len(pids) != 0, "no task got killed on {} for pattern {}".format(host, pattern)

    kill_task(host, '[h]ttp\\.server')

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def wait_for_pod_recovery():
        recovered = common.get_pod_tasks(pod_id)
        assert len(recovered) == 2, \
            "The number of tasks is {} after recovery, but 2 was expected".format(len(recovered))

        old_task_ids = [task_id1, task_id2]
        new_task_id1 = recovered[0]['id']
        new_task_id2 = recovered[1]['id']

        assert new_task_id1 not in old_task_ids, \
            "The task ID has not changed, and is still {}".format(new_task_id1)
        assert new_task_id2 not in old_task_ids, \
            "The task ID has not changed, and is still {}".format(new_task_id2)

    wait_for_pod_recovery()

    # Reuse the list validated by the wait so the host comparison below
    # cannot race with a separate fetch.
    tasks = wait_for_status_network_info()
    assert host == common.running_status_network_info(tasks[0]['statuses'])['ip_addresses'][0]['ip_address'], \
        "the pod has been restarted on another host"

    port1 = tasks[0]['discovery']['ports']['ports'][0]["number"]
    port2 = tasks[1]['discovery']['ports']['ports'][0]["number"]
    path1 = tasks[0]['container']['volumes'][0]['container_path']
    path2 = tasks[1]['container']['volumes'][0]['container_path']
    print(host, port1, port2, path1, path2)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_data(port, path):
        cmd = "curl {}:{}/{}/foo".format(host, port, path)
        run, data = shakedown.run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert 'hello\nhello\n' in data, "'hello\nhello\n' not found in '{}'".format(data)

    check_data(port1, path1)
    check_data(port2, path2)