def test_event_channel():
    """Tests the event channel. Events are verified by streaming them to an
    events.txt file; the fixture ensures the file is removed before and after
    the test. The events checked are connecting, deploying a good task and
    killing a task.
    """
    app_def = apps.mesos_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    master_ip = shakedown.master_ip()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_deployment_message():
        status, stdout = shakedown.run_command(master_ip, 'cat events.txt')
        assert 'event_stream_attached' in stdout, "event_stream_attached event has not been found"
        assert 'deployment_info' in stdout, "deployment_info event has not been found"
        assert 'deployment_step_success' in stdout, "deployment_step_success has not been found"

    check_deployment_message()
    client.remove_app(app_id, True)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_kill_message():
        status, stdout = shakedown.run_command(master_ip, 'cat events.txt')
        assert 'KILLED' in stdout, "KILLED event has not been found"

    check_kill_message()

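# NOTE: test_event_channel above relies on a fixture (not shown in this file)
# that streams Marathon's /v2/events SSE endpoint into events.txt on the
# master and removes the file around the test. The following is only a
# minimal sketch of what such a fixture could look like, assuming a
# function-scoped pytest fixture and the shakedown helpers already used in
# this suite; the exact curl flags and file handling are assumptions, not the
# canonical implementation:
import pytest

@pytest.fixture(scope="function")
def events_to_file():
    shakedown.run_command_on_master('rm -f events.txt')
    # stream events in the background into events.txt (assumed approach)
    shakedown.run_command_on_master(
        'curl --compressed -H "Accept: text/event-stream" '
        '-o events.txt leader.mesos:8080/v2/events &')
    yield
    shakedown.kill_process_on_host(shakedown.master_ip(), '[c]url')
    shakedown.run_command_on_master('rm -f events.txt')
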
def test_incremental_groups_scale():
    """Scale the number of groups."""
    client = marathon.create_client()

    batch_size_for = exponential_decay(start=40, decay=0.01)
    total = 0
    for step in itertools.count(start=0):
        batch_size = batch_size_for(step)
        total += batch_size
        shakedown.echo("Add {} groups totaling {}".format(batch_size, total))

        group_ids = ("/group-{0:0>4}".format(step * batch_size + i) for i in range(batch_size))
        app_ids = ("{}/app-1".format(g) for g in group_ids)
        app_definitions = [app_def(app_id) for app_id in app_ids]

        # There is no app id. We simply PUT /v2/apps to create groups in
        # batches.
        client.update_app('', app_definitions)
        shakedown.deployment_wait(timeout=timedelta(minutes=15).total_seconds())

        shakedown.echo("done.")

def test_incremental_group_nesting():
    """Scale the depth of nested groups. Again we grow fast at the beginning
    and then slow the growth.
    """
    client = marathon.create_client()

    batch_size_for = exponential_decay(start=5, decay=0.1)
    depth = 0
    for step in itertools.count(start=0):
        batch_size = batch_size_for(step)
        depth += batch_size
        shakedown.echo("Create a group with a nesting of {}".format(depth))

        group_ids = ("group-{0:0>3}".format(g) for g in range(depth))
        nested_groups = '/'.join(group_ids)

        # Note: We always deploy into the same nested groups.
        app_id = '/{0}/app-1'.format(nested_groups)
        client.add_app(app_def(app_id))

        shakedown.deployment_wait(timeout=timedelta(minutes=15).total_seconds())

        shakedown.echo("done.")

def test_incremental_apps_per_group_scale():
    """Try to reach the maximum number of apps. We start with batches of apps
    in a group and decay the batch size.
    """
    client = marathon.create_client()

    batch_size_for = exponential_decay(start=500, decay=0.3)
    for step in itertools.count(start=0):
        batch_size = batch_size_for(step)
        shakedown.echo("Add {} apps".format(batch_size))

        group_id = "/batch-{0:0>3}".format(step)
        app_ids = ("app-{0:0>4}".format(i) for i in range(batch_size))
        app_definitions = [app_def(app_id) for app_id in app_ids]
        next_batch = {
            "apps": app_definitions,
            "dependencies": [],
            "id": group_id
        }

        client.create_group(next_batch)
        shakedown.deployment_wait(timeout=timedelta(minutes=15).total_seconds())

        shakedown.echo("done.")

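# The three incremental scale tests above call exponential_decay(start, decay),
# which is not defined in this file. From its usage it must return a function
# mapping a step index to a shrinking batch size. A minimal sketch under that
# assumption (the exact formula is a guess):
import math

def exponential_decay(start, decay):
    def batch_size(step):
        # geometric decay, never dropping below a batch of 1
        return max(1, int(start * math.exp(-decay * step)))
    return batch_size
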
def test_lock():
    '''This test verifies that a second scheduler fails to start up when an
    existing scheduler is running. Without locking, the scheduler would fail
    during registration, but after writing its config to ZK. So in order to
    verify that the scheduler fails immediately, we ensure that the ZK config
    state is unmodified.'''
    marathon_client = dcos.marathon.create_client()

    # Get ZK state from running framework
    zk_path = "dcos-service-{}/ConfigTarget".format(PACKAGE_NAME)
    zk_config_old = shakedown.get_zk_node_data(zk_path)

    # Get marathon app
    app_id = "/{}".format(PACKAGE_NAME)
    app = marathon_client.get_app(app_id)
    old_timestamp = app.get("lastTaskFailure", {}).get("timestamp", None)

    # Scale to 2 instances
    labels = app["labels"]
    labels.pop("MARATHON_SINGLE_INSTANCE_APP")
    marathon_client.update_app(app_id, {"labels": labels})
    shakedown.deployment_wait()
    marathon_client.update_app(app_id, {"instances": 2})

    # Wait for second scheduler to fail
    def fn():
        timestamp = marathon_client.get_app(app_id).get("lastTaskFailure", {}).get("timestamp", None)
        return timestamp != old_timestamp

    spin.time_wait_noisy(lambda: fn())

    # Verify ZK is unchanged
    zk_config_new = shakedown.get_zk_node_data(zk_path)
    assert zk_config_old == zk_config_new

def test_mom_when_mom_process_killed():
    """Launches a task from MoM, then kills the MoM process."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()

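# Many retries in this suite pass retry_on_exception=common.ignore_exception.
# With the retrying library, that callback receives the raised exception and
# returns True when the failure should trigger another attempt. A plausible
# sketch of the helper (the real common.ignore_exception may also log):
def ignore_exception(exc):
    """Retry on any exception."""
    return True
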
def test_pod_file_based_secret(secret_fixture):
    secret_name, secret_value = secret_fixture
    secret_normalized_name = secret_name.replace('/', '')

    pod_id = '/{}'.format(uuid.uuid4().hex)

    pod_def = {
        "id": pod_id,
        "containers": [{
            "name": "container-1",
            "resources": {
                "cpus": 0.1,
                "mem": 64
            },
            "endpoints": [{
                "name": "http",
                "hostPort": 0,
                "protocol": ["tcp"]
            }],
            "exec": {
                "command": {
                    "shell": "cat {} >> {}_file && /opt/mesosphere/bin/python -m http.server $ENDPOINT_HTTP".format(
                        secret_normalized_name, secret_normalized_name),
                }
            },
            "volumeMounts": [{
                "name": "vol",
                "mountPath": secret_name
            }],
        }],
        "networks": [{
            "mode": "host"
        }],
        "volumes": [{
            "name": "vol",
            "secret": "secret1"
        }],
        "secrets": {
            "secret1": {
                "source": secret_name
            }
        }
    }

    client = marathon.create_client()
    client.add_pod(pod_def)
    shakedown.deployment_wait()

    instances = client.show_pod(pod_id)['instances']
    assert len(instances) == 1, 'Failed to start the file based secret pod'

    port = instances[0]['containers'][0]['endpoints'][0]['allocatedHostPort']
    host = instances[0]['networks'][0]['addresses'][0]
    cmd = "curl {}:{}/{}_file".format(host, port, secret_normalized_name)
    status, data = shakedown.run_command_on_master(cmd)

    assert status, "{} did not succeed".format(cmd)
    assert data.rstrip() == secret_value, "Got unexpected secret data"

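# test_pod_file_based_secret and test_pod_secret_env_var consume a
# secret_fixture yielding a (name, value) pair. A sketch of such a fixture,
# built from the common.create_secret/delete_secret helpers used elsewhere in
# this suite; the secret name and value are assumptions:
@pytest.fixture(scope="function")
def secret_fixture():
    secret_name = '/mysecret'
    secret_value = 'super_secret_password'
    common.create_secret(secret_name, secret_value)
    yield secret_name, secret_value
    common.delete_secret(secret_name)
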
def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from the
    MoM and verifies that the task is still good.
    """
    app_def = app('zk-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/zk-failure')
    original_task_id = tasks[0]['id']

    with shakedown.iptable_rules(host):
        block_port(host, 2181)
        # time of the zk block
        time.sleep(10)

    # after access to zk is restored.
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        tasks = client.get_tasks('/zk-failure')
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"

    check_task_is_back()

def test_health_failed_check():
    """Deploys a pod with good health checks, then partitions the network
    and verifies that the tasks come back with new task IDs.
    """
    client = marathon.create_client()

    pod_id = "/pod-{}".format(uuid.uuid4().hex)

    pod_json = _pods_json('pod-ports.json')
    pod_json["id"] = pod_id

    host = ip_other_than_mom()
    pin_pod_to_host(pod_json, host)
    client.add_pod(pod_json)
    shakedown.deployment_wait()

    tasks = get_pod_tasks(pod_id)
    initial_id1 = tasks[0]['id']
    initial_id2 = tasks[1]['id']

    pod = client.list_pod()[0]
    container1 = pod['instances'][0]['containers'][0]
    port = container1['endpoints'][0]['allocatedHostPort']

    save_iptables(host)
    block_port(host, port)
    time.sleep(7)
    restore_iptables(host)

    shakedown.deployment_wait()

    tasks = get_pod_tasks(pod_id)
    for task in tasks:
        assert task['id'] != initial_id1
        assert task['id'] != initial_id2

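# Several pod tests load their definitions via _pods_json(file_name), which is
# not shown here. Presumably it reads a JSON fixture checked in next to the
# tests; a sketch assuming the fixture_dir() helper used elsewhere in this
# suite and an assumed default file name:
import json
import os

def _pods_json(file_name='simple-pods.json'):
    with open(os.path.join(fixture_dir(), file_name)) as f:
        return json.load(f)
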
def test_docker_dns_mapping(marathon_service_name):
    """Tests that a running docker task is accessible from DNS."""
    app_id = uuid.uuid4().hex
    client = marathon.create_client()
    app_json = app_docker(app_id)
    client.add_app(app_json)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']

    bad_cmd = 'ping -c 1 docker-test.marathon-user.mesos-bad'
    status, output = shakedown.run_command_on_master(bad_cmd)
    assert not status

    @retrying.retry(stop_max_attempt_number=30)
    def check_dns():
        cmd = 'ping -c 1 {}.{}.mesos'.format(app_id, marathon_service_name)
        shakedown.wait_for_dns('{}.{}.mesos'.format(app_id, marathon_service_name))
        status, output = shakedown.run_command_on_master(cmd)
        assert status

    check_dns()

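# app_docker(app_id) is referenced above but defined elsewhere. It has to
# return a Dockerized HTTP server app whose image, cmd and port mappings the
# tests then override. A sketch consistent with that usage; the image, cmd
# and resource values are assumptions:
def app_docker(app_id=None):
    if app_id is None:
        app_id = uuid.uuid4().hex
    return {
        "id": app_id,
        "cmd": "python -m http.server 8080",
        "cpus": 0.5,
        "mem": 32.0,
        "container": {
            "type": "DOCKER",
            "docker": {
                "image": "python:3",
                "network": "BRIDGE",
                "portMappings": [
                    {"containerPort": 8080, "hostPort": 0}
                ]
            }
        }
    }
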
def test_launch_docker_graceperiod(marathon_service_name):
    """Tests the 'taskKillGracePeriodSeconds' option in a Marathon
    environment. This is the same test as above, but runs against Docker.
    """
    app_id = uuid.uuid4().hex
    app_def = app_docker(app_id)
    app_def['container']['docker']['image'] = 'kensipe/python-test'

    default_graceperiod = 3
    graceperiod = 20
    app_def['taskKillGracePeriodSeconds'] = graceperiod
    app_def['cmd'] = 'python test.py'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    client.scale_app(app_id, 0)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # task should still be here after the default_graceperiod
    time.sleep(default_graceperiod + 1)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # but not after the set graceperiod
    time.sleep(graceperiod)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is None

def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from
    Marathon. Verifies the task is preserved.
    """
    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']

    common.block_iptable_rules_for_seconds(host, 2181, sleep_seconds=10, block_input=True, block_output=False)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()

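# common.block_iptable_rules_for_seconds (used above) is expected to save the
# host's iptables rules, block the given port, sleep, and restore the original
# rules. A rough sketch built from the save_iptables/block_port/
# restore_iptables helpers that appear elsewhere in this suite; the
# block_input/block_output direction handling is an assumption and is omitted:
import time

def block_iptable_rules_for_seconds(host, port, sleep_seconds=10,
                                    block_input=True, block_output=False):
    save_iptables(host)
    block_port(host, port)  # direction flags not modeled in this sketch
    time.sleep(sleep_seconds)
    restore_iptables(host)
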
def test_install_marathon():
    """Install the Marathon package for DC/OS."""

    # Install
    shakedown.install_package_and_wait(PACKAGE_NAME)
    assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to install'

    end_time = time.time() + WAIT_TIME_IN_SECS
    found = False
    while time.time() < end_time:
        found = shakedown.get_service(PACKAGE_NAME) is not None
        if found and shakedown.service_healthy(SERVICE_NAME):
            break
        time.sleep(1)

    assert found, 'Service did not register with DCOS'
    shakedown.deployment_wait()

    # Uninstall
    uninstall('marathon-user')
    shakedown.deployment_wait()

    # Reinstall
    shakedown.install_package_and_wait(PACKAGE_NAME)
    assert shakedown.package_installed(PACKAGE_NAME), 'Package failed to reinstall'

    # Installing the package a second time should fail.
    try:
        shakedown.install_package(PACKAGE_NAME)
    except Exception:
        pass
    else:
        # Exception was not raised -> exit code was 0
        assert False, "Error: CLI returns 0 when asked to install Marathon"

def test_event_channel():
    """Tests the Marathon event channel specific to pod events."""
    client = marathon.create_client()

    pod_id = "/pod-create"
    pod_json = _pods_json()
    pod_json["id"] = pod_id
    client.add_pod(pod_json)
    shakedown.deployment_wait()

    # look for created
    @retrying.retry(stop_max_delay=10000)
    def check_deployment_message():
        status, stdout = shakedown.run_command_on_master('cat test.txt')
        assert 'event_stream_attached' in stdout
        assert 'pod_created_event' in stdout
        assert 'deployment_step_success' in stdout

    check_deployment_message()

    pod_json["scaling"]["instances"] = 3
    client.update_pod(pod_id, pod_json)
    shakedown.deployment_wait()

    # look for updated
    @retrying.retry(stop_max_delay=10000)
    def check_update_message():
        status, stdout = shakedown.run_command_on_master('cat test.txt')
        assert 'pod_updated_event' in stdout

    check_update_message()

def test_launch_container_with_persistent_volume():
    """Tests launching a task with a persistent volume (PV). The task writes
    to a file in the PV. The app is killed and restarted, and we can still
    read from the PV.
    """
    app_def = persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    client.restart_app(app_id)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\nhello\n', "'{}' was not equal to hello\\nhello\\n".format(data)

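# persistent_volume_app() is not defined in this file. To satisfy the
# assertions above it must return an app that appends 'hello' to a file on
# its persistent volume at every (re)start and serves the volume over HTTP.
# A hedged sketch of such a definition; the sizes, residency settings and
# exact cmd are assumptions:
def persistent_volume_app():
    return {
        "id": "/persistent-volume-app-{}".format(uuid.uuid4().hex),
        "cmd": "echo hello >> data/foo && /opt/mesosphere/bin/python -m http.server $PORT0",
        "cpus": 0.1,
        "mem": 32,
        "instances": 1,
        "residency": {"taskLostBehavior": "WAIT_FOREVER"},
        "container": {
            "type": "MESOS",
            "volumes": [{
                "containerPath": "data",
                "mode": "RW",
                "persistent": {"size": 10}
            }]
        }
    }
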
def test_vip_mesos_cmd(marathon_service_name):
    """Tests the creation of a VIP from a python command NOT in a docker
    container. The test validates the creation of an app with the VIP label
    and the accessibility of the service via the VIP.
    """
    vip_name = 'vip-service'
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)
    app_def = python_http_app()
    app_def['portDefinitions'] = [{
        "port": 0,
        "protocol": "tcp",
        "name": "{}".format(vip_name),
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        }
    }]
    app_def['id'] = vip_name

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(stop_max_attempt_number=30)
    def http_output_check():
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()

def test_launch_mesos_grace_period(marathon_service_name):
    """Tests the 'taskKillGracePeriodSeconds' option using a Mesos container
    in a Marathon environment. Read more details about this test in
    `test_root_marathon.py::test_launch_mesos_root_marathon_grace_period`.
    """
    app_def = apps.mesos_app()

    default_grace_period = 3
    grace_period = 20

    app_def['fetch'] = [{"uri": "https://downloads.mesosphere.com/testing/test.py"}]
    app_def['cmd'] = '/opt/mesosphere/bin/python test.py'
    app_def['taskKillGracePeriodSeconds'] = grace_period
    app_id = app_def['id'].lstrip('/')

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    client.scale_app(app_id, 0)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # tasks should still be here after the default_grace_period
    time.sleep(default_grace_period + 1)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # but not after the set grace_period
    time.sleep(grace_period)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is None

def test_scale_app_in_group():
    """Scales an individual app in a group."""
    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]

    client = marathon.create_client()
    client.create_group(group_def)
    shakedown.deployment_wait()

    group_apps = client.get_group(groups_id)
    apps = group_apps['apps']
    assert len(apps) == 2, "The number of apps is {}, but 2 was expected".format(len(apps))

    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(len(tasks2))

    # scaling just one app in the group
    client.scale_app(app1_id, 2)
    shakedown.deployment_wait()

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after scale, but 1 was expected".format(len(tasks2))

def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests that when a task lands on a pinned node (and barely fits) and
    is asked to scale past the resources of that node, no tasks are launched
    on any other node.
    """
    app_def = apps.sleep_app()
    app_id = app_def['id']

    host = common.ip_other_than_mom()
    print('Constraint set to host: {}'.format(host))
    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    cores = common.cpus_on_agent(host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    client.scale_app(app_id, 2)
    time.sleep(5)

    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(deployments) == 1, "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, "The number of tasks is {}, but 1 was expected".format(len(tasks))

def test_vip_docker_bridge_mode(marathon_service_name):
    """Tests the creation of a VIP from a python command in a docker image
    using bridge mode. The test validates the creation of an app with the
    VIP label and the accessibility of the service via the VIP.
    """
    app_def = apps.docker_http_server()
    vip_name = app_def["id"].lstrip("/")
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)

    app_def['id'] = vip_name
    app_def['container']['docker']['portMappings'] = [{
        "containerPort": 8080,
        "hostPort": 0,
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        },
        "protocol": "tcp",
        "name": "{}".format(vip_name)
    }]

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()

def test_launch_and_scale_group():
    """Launches and scales a group."""
    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]

    client = marathon.create_client()
    client.create_group(group_def)
    shakedown.deployment_wait()

    group_apps = client.get_group(groups_id)
    apps = group_apps['apps']
    assert len(apps) == 2, "The number of apps is {}, but 2 was expected".format(len(apps))

    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(len(tasks2))

    # scale by 2 for the entire group
    client.scale_group(groups_id, 2)
    shakedown.deployment_wait()

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(len(tasks1))
    assert len(tasks2) == 2, "The number of tasks #2 is {} after scale, but 2 was expected".format(len(tasks2))

def test_vip_mesos_cmd(marathon_service_name):
    """Validates the creation of an app with a VIP label and the
    accessibility of the service via the VIP.
    """
    app_def = apps.http_server()
    vip_name = app_def["id"].lstrip("/")
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)

    app_def['portDefinitions'] = [{
        "port": 0,
        "protocol": "tcp",
        "name": "{}".format(vip_name),
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        }
    }]

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()

def test_launch_docker_grace_period(marathon_service_name):
    """Tests the 'taskKillGracePeriodSeconds' option using a Docker container
    in a Marathon environment. Read more details about this test in
    `test_root_marathon.py::test_launch_mesos_root_marathon_grace_period`.
    """
    app_def = apps.docker_http_server()
    app_def['container']['docker']['image'] = 'kensipe/python-test'

    default_grace_period = 3
    grace_period = 20

    app_def['taskKillGracePeriodSeconds'] = grace_period
    app_def['cmd'] = 'python test.py'
    app_id = app_def['id'].lstrip('/')

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    client.scale_app(app_id, 0)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # tasks should still be here after the default_grace_period
    time.sleep(default_grace_period + 1)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is not None

    # but not after the set grace_period
    time.sleep(grace_period)
    tasks = shakedown.get_service_task(marathon_service_name, app_id)
    assert tasks is None

def test_event_channel():
    """Tests the event channel. Events are verified by streaming them to a
    test.txt file; the fixture ensures the file is removed before and after
    the test. The events checked are connecting, deploying a good task and
    killing a task.
    """
    app_def = common.app_mesos()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_deployment_message():
        status, stdout = shakedown.run_command_on_master('cat test.txt')
        assert 'event_stream_attached' in stdout
        assert 'deployment_info' in stdout
        assert 'deployment_step_success' in stdout

    check_deployment_message()

    client.remove_app(app_id, True)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_kill_message():
        status, stdout = shakedown.run_command_on_master('cat test.txt')
        assert 'Killed' in stdout

    check_kill_message()

def test_vip_docker_bridge_mode(marathon_service_name):
    """Tests the creation of a VIP from a python command in a docker image
    using bridge mode. The test validates the creation of an app with the
    VIP label and the accessibility of the service via the VIP.
    """
    vip_name = 'vip-docker-service'
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)
    app_def = app_docker()
    app_def['container']['docker']['portMappings'] = [{
        "containerPort": 8080,
        "hostPort": 0,
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        },
        "protocol": "tcp",
        "name": "{}".format(vip_name)
    }]
    app_def['id'] = vip_name

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    @retrying.retry(stop_max_attempt_number=30)
    def http_output_check():
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()

def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app and restarts the master. It is expected that the
    service endpoint eventually comes back and the task ID stays the same.
    """
    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']

    common.systemctl_master('restart')
    shakedown.wait_for_service_endpoint(marathon_service_name)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_def["id"])
        assert len(tasks) == 1, "The number of tasks is {} after master restart, but 1 was expected".format(len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()

def test_pod_secret_env_var(secret_fixture):
    # Install enterprise-cli since it's needed to create secrets
    if not common.is_enterprise_cli_package_installed():
        common.install_enterprise_cli_package()

    secret_name, secret_value = secret_fixture

    pod_id = '/{}'.format(uuid.uuid4().hex)
    pod_def = {
        "id": pod_id,
        "containers": [{
            "name": "container-1",
            "resources": {
                "cpus": 0.1,
                "mem": 64
            },
            "endpoints": [{
                "name": "http",
                "hostPort": 0,
                "protocol": ["tcp"]
            }],
            "exec": {
                "command": {
                    "shell": "echo $SECRET_ENV && echo $SECRET_ENV >> $MESOS_SANDBOX/secret-env && /opt/mesosphere/bin/python -m http.server $ENDPOINT_HTTP"
                }
            }
        }],
        "environment": {
            "SECRET_ENV": {
                "secret": "secret1"
            }
        },
        "networks": [{
            "mode": "host"
        }],
        "secrets": {
            "secret1": {
                "source": secret_name
            }
        }
    }

    client = marathon.create_client()
    client.add_pod(pod_def)
    shakedown.deployment_wait()

    instances = client.show_pod(pod_id)['instances']
    assert len(instances) == 1, 'Failed to start the secret environment variable pod'

    port = instances[0]['containers'][0]['endpoints'][0]['allocatedHostPort']
    host = instances[0]['networks'][0]['addresses'][0]
    cmd = "curl {}:{}/secret-env".format(host, port)
    status, data = shakedown.run_command_on_master(cmd)

    assert status, "{} did not succeed".format(cmd)
    assert data.rstrip() == secret_value

def clear_pods():
    try:
        client = marathon.create_client()
        pods = client.list_pod()
        for pod in pods:
            client.remove_pod(pod["id"], True)
        shakedown.deployment_wait()
    except Exception:
        pass

def clear_pods():
    # failures while clearing pods should not fail the test
    try:
        client = marathon.create_client()
        pods = client.list_pod()
        for pod in pods:
            client.remove_pod(pod["id"], True)
        shakedown.deployment_wait()
    except Exception:
        pass

def test_install_universe_package(package):
    """Marathon is responsible for installing packages from the universe.
    This test confirms that several packages are installed into a healthy
    state.
    """
    shakedown.install_package_and_wait(package)
    assert shakedown.package_installed(package), 'Package failed to install'

    shakedown.deployment_wait(timeout=timedelta(minutes=5).total_seconds())
    assert shakedown.service_healthy(package)

def test_app_update_rollback():
    """Tests that an updated app can be rolled back to its initial version."""
    app_def = apps.readiness_and_health_app()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    app_def['instances'] = 2
    client.update_app(app_id, app_def)
    shakedown.deployment_wait(app_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2, "The number of tasks is {} after update, but 2 was expected".format(len(tasks))

    # provides a testing delay to rollback in the meantime
    app_def['readinessChecks'][0]['intervalSeconds'] = 30
    app_def['instances'] = 1
    deployment_id = client.update_app(app_id, app_def)
    client.rollback_deployment(deployment_id)
    shakedown.deployment_wait(app_id=app_id)

    # the update to 1 instance is rolled back to 2
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2, "The number of tasks is {} after rollback, but 2 was expected".format(len(tasks))

def test_scale_app_in_group():
    """Tests the scaling of an individual app in a group."""
    with marathon_on_marathon():
        client = marathon.create_client()
        try:
            client.remove_group('/test-group', True)
            shakedown.deployment_wait()
        except Exception:
            pass

        client.create_group(group())
        shakedown.deployment_wait()

        group_apps = client.get_group('/test-group/sleep')
        apps = group_apps['apps']
        assert len(apps) == 2

        tasks1 = client.get_tasks('/test-group/sleep/goodnight')
        tasks2 = client.get_tasks('/test-group/sleep/goodnight2')
        assert len(tasks1) == 1
        assert len(tasks2) == 1

        # scaling just an app in the group
        client.scale_app('/test-group/sleep/goodnight', 2)
        shakedown.deployment_wait()

        tasks1 = client.get_tasks('/test-group/sleep/goodnight')
        tasks2 = client.get_tasks('/test-group/sleep/goodnight2')
        assert len(tasks1) == 2
        assert len(tasks2) == 1

def test_network_pinger(test_type, get_pinger_app, dns_format, marathon_service_name):
    """This test runs a pinger app and a relay app. It retrieves the python
    app from the master via the new http service (which will be moving into
    shakedown). A curl call to the relay invokes a call to the second pinger
    app and returns pong to the relay, then back to curl.

    It tests that one task can communicate over the network with another
    task on the given network, covering both inbound and outbound
    connectivity.

    The test_type param is not used; it is passed so that it is clear which
    parametrized test is running or may be failing.
    """
    client = marathon.create_client()
    pinger_app = get_pinger_app('pinger')
    relay_app = get_pinger_app('relay')
    pinger_dns = dns_format.format('pinger', marathon_service_name)
    relay_dns = dns_format.format('relay', marathon_service_name)

    # test pinger app to master
    shakedown.copy_file_to_master(fixture_dir() + "/pinger.py")

    with shakedown.master_http_service():
        # need to add app with http service in place or it will fail to fetch
        client.add_app(pinger_app)
        client.add_app(relay_app)
        shakedown.deployment_wait()
        shakedown.wait_for_dns(relay_dns)

    relay_url = 'http://{}:7777/relay-ping?url={}:7777'.format(relay_dns, pinger_dns)

    @retrying.retry(stop_max_attempt_number=30)
    def http_output_check():
        status, output = shakedown.run_command_on_master('curl {}'.format(relay_url))
        assert status
        assert 'Pong /pinger' in output
        assert 'Relay from /relay' in output

    http_output_check()

def test_event_channel():
    """Tests the event channel. Events are verified by streaming them to an
    events.txt file; the fixture ensures the file is removed before and after
    the test. The events checked are connecting, deploying a good task and
    killing a task.
    """
    app_def = apps.mesos_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    leader_ip = shakedown.marathon_leader_ip()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_deployment_message():
        status, stdout = shakedown.run_command(leader_ip, 'cat events.exitcode')
        assert str(stdout).strip() == '', "SSE stream disconnected (CURL exit code is {})".format(stdout.strip())
        status, stdout = shakedown.run_command(leader_ip, 'cat events.txt')
        assert 'event_stream_attached' in stdout, "event_stream_attached event has not been found"
        assert 'deployment_info' in stdout, "deployment_info event has not been found"
        assert 'deployment_step_success' in stdout, "deployment_step_success has not been found"

    check_deployment_message()
    client.remove_app(app_id, True)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_kill_message():
        status, stdout = shakedown.run_command(leader_ip, 'cat events.txt')
        assert 'KILLED' in stdout, "KILLED event has not been found"

    check_kill_message()

def test_update_app_rollback():
    """Tests updating an app, then rolling back the update."""
    app_id = uuid.uuid4().hex
    app_def = readiness_and_health_app()
    app_def['id'] = app_id

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    # start with 1
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    app_def['instances'] = 2
    client.update_app(app_id, app_def)
    shakedown.deployment_wait()

    # update works to 2
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2

    # provides a testing delay to rollback from
    app_def['readinessChecks'][0]['intervalSeconds'] = 30
    app_def['instances'] = 1
    deployment_id = client.update_app(app_id, app_def)
    client.rollback_deployment(deployment_id)
    shakedown.deployment_wait()

    # the update to 1 instance is rolled back to 2
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2

def test_lock():
    '''This test verifies that a second scheduler fails to start up when an
    existing scheduler is running. Without locking, the scheduler would fail
    during registration, but after writing its config to ZK. So in order to
    verify that the scheduler fails immediately, we ensure that the ZK config
    state is unmodified.'''
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    marathon_client = dcos.marathon.create_client()

    # Get ZK state from running framework
    zk_path = "dcos-service-{}/ConfigTarget".format(foldered_name)
    zk_config_old = shakedown.get_zk_node_data(zk_path)

    # Get marathon app
    app = marathon_client.get_app(foldered_name)
    old_timestamp = app.get("lastTaskFailure", {}).get("timestamp", None)

    # Scale to 2 instances
    labels = app["labels"]
    original_labels = labels.copy()
    labels.pop("MARATHON_SINGLE_INSTANCE_APP")
    marathon_client.update_app(foldered_name, {"labels": labels})
    shakedown.deployment_wait()
    marathon_client.update_app(foldered_name, {"instances": 2})

    # Wait for second scheduler to fail
    def fn():
        timestamp = marathon_client.get_app(foldered_name).get("lastTaskFailure", {}).get("timestamp", None)
        return timestamp != old_timestamp

    shakedown.wait_for(lambda: fn())

    # Verify ZK is unchanged
    zk_config_new = shakedown.get_zk_node_data(zk_path)
    assert zk_config_old == zk_config_new

    # In order to prevent the second scheduler instance from obtaining a lock,
    # we undo the "scale-up" operation
    marathon_client.update_app(foldered_name, {"labels": original_labels, "instances": 1}, force=True)
    shakedown.deployment_wait()

def test_marathon_backup_and_restore_leader(marathon_service_name):
    """Backup and restore is tested with only one master, since the new
    leader has to be able to read the backup file created by the previous
    leader, and the easiest way to test that is with a single master.
    """
    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection
    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])
    task_id = app['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    # Wait for new leader (but same master server) to be up and ready
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())
    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])
    assert task_id == app['tasks'][0]['id'], "Task has a different ID after restore"

    # Check if the backup file exists and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = shakedown.run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}'.format(backup_url)
    assert int(data.rstrip()) > 0, "Backup file is empty"

def test_scale_group():
    """Tests the scaling of a group."""
    client = marathon.create_client()
    try:
        client.remove_group('/test-group', True)
        shakedown.deployment_wait()
    except Exception:
        pass

    client.create_group(group())
    shakedown.deployment_wait()

    group_apps = client.get_group('/test-group/sleep')
    apps = group_apps['apps']
    assert len(apps) == 2

    tasks1 = client.get_tasks('/test-group/sleep/goodnight')
    tasks2 = client.get_tasks('/test-group/sleep/goodnight2')
    assert len(tasks1) == 1
    assert len(tasks2) == 1

    # scale by 2 for the entire group
    client.scale_group('/test-group/sleep', 2)
    shakedown.deployment_wait()

    tasks1 = client.get_tasks('/test-group/sleep/goodnight')
    tasks2 = client.get_tasks('/test-group/sleep/goodnight2')
    assert len(tasks1) == 2
    assert len(tasks2) == 2

def test_update_app_poor_health():
    """Tests updating an app with an automatic rollback due to poor health."""
    app_id = uuid.uuid4().hex
    app_def = readiness_and_health_app()
    app_def['id'] = app_id

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    # start with 1
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    # provides a testing delay to rollback from
    app_def['healthChecks'][0]['path'] = '/non-existent'
    app_def['instances'] = 2
    deployment_id = client.update_app(app_id, app_def)

    # 2 min wait
    try:
        shakedown.deployment_wait()
    except Exception:
        client.rollback_deployment(deployment_id)
        shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

def test_event_channel_for_pods():
    """Tests the Marathon event channel specific to pod events."""
    pod_def = pods.simple_pod()

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write files.
    if shakedown.ee_version() == 'strict':
        pod_def['user'] = '******'
        common.add_dcos_marathon_root_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_def)
    shakedown.deployment_wait()

    # look for created
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_deployment_message():
        status, stdout = shakedown.run_command_on_master('cat events.txt')
        assert 'event_stream_attached' in stdout, "event_stream_attached event has not been produced"
        assert 'pod_created_event' in stdout, "pod_created_event event has not been produced"
        assert 'deployment_step_success' in stdout, "deployment_step_success event has not been produced"

    check_deployment_message()

    pod_def["scaling"]["instances"] = 3
    client.update_pod(pod_def["id"], pod_def)
    shakedown.deployment_wait()

    # look for updated
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_update_message():
        status, stdout = shakedown.run_command_on_master('cat events.txt')
        assert 'pod_updated_event' in stdout, 'pod_updated_event event has not been produced'

    check_update_message()

def test_unhealthy_app_can_be_rolled_back():
    """Verifies that an updated app gets rolled back due to being unhealthy."""
    app_def = apps.readiness_and_health_app()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    app_def['healthChecks'][0]['path'] = '/non-existent'
    app_def['instances'] = 2
    deployment_id = client.update_app(app_id, app_def)

    try:
        shakedown.deployment_wait()
    except Exception:
        client.rollback_deployment(deployment_id)
        shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after rollback, but 1 was expected".format(len(tasks))

def test_network_pinger(test_type, get_pinger_app, dns_format, marathon_service_name):
    """This test runs a pinger app and a relay app. It retrieves the python
    app from the master via the new http service (which will be moving into
    shakedown). A curl call to the relay invokes a call to the second pinger
    app and returns pong to the relay, then back to curl.

    It tests that one task can communicate over the network with another
    task on the given network, covering both inbound and outbound
    connectivity.

    The test_type param is not used; it is passed so that it is clear which
    parametrized test is running or may be failing.
    """
    pinger_app = get_pinger_app()
    relay_app = get_pinger_app()
    relay_app["id"] = relay_app["id"].replace("pinger", "relay")
    pinger_dns = dns_format.format(pinger_app["id"].lstrip("/"), marathon_service_name)
    relay_dns = dns_format.format(relay_app["id"].lstrip("/"), marathon_service_name)

    # test pinger app to master
    shakedown.copy_file_to_master(os.path.join(scripts.scripts_dir(), "pinger.py"))

    client = marathon.create_client()

    with shakedown.master_http_service():
        # need to add app with http service in place or it will fail to fetch
        client.add_app(pinger_app)
        client.add_app(relay_app)
        shakedown.deployment_wait()
        shakedown.wait_for_dns(relay_dns)

    relay_url = 'http://{}:7777/relay-ping?url={}:7777'.format(relay_dns, pinger_dns)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=300, retry_on_exception=common.ignore_exception)
    def http_output_check():
        status, output = shakedown.run_command_on_master('curl {}'.format(relay_url))
        assert status, "curl {} failed on master with {}".format(relay_url, output)
        assert 'Pong {}'.format(pinger_app["id"]) in output
        assert 'Relay from {}'.format(relay_app["id"]) in output

    http_output_check()

def test_scale_app_in_group():
    """Tests the scaling of an individual app in a group."""
    client = marathon.create_client()
    try:
        client.remove_group('/test-group', True)
        shakedown.deployment_wait()
    except Exception:
        pass

    client.create_group(group())
    shakedown.deployment_wait()

    group_apps = client.get_group('/test-group/sleep')
    apps = group_apps['apps']
    assert len(apps) == 2, "Num of Apps: {} is not 2".format(len(apps))

    tasks1 = client.get_tasks('/test-group/sleep/goodnight')
    tasks2 = client.get_tasks('/test-group/sleep/goodnight2')
    assert len(tasks1) == 1, "Num of tasks 1: {} is not 1 after deployment".format(len(tasks1))
    assert len(tasks2) == 1, "Num of tasks 2: {} is not 1 after deployment".format(len(tasks2))

    # scaling just an app in the group
    client.scale_app('/test-group/sleep/goodnight', 2)
    shakedown.deployment_wait()

    tasks1 = client.get_tasks('/test-group/sleep/goodnight')
    tasks2 = client.get_tasks('/test-group/sleep/goodnight2')
    assert len(tasks1) == 2, "Num of tasks 1: {} is not 2 after scale".format(len(tasks1))
    assert len(tasks2) == 1, "Num of tasks 2: {} is not 1 after scale".format(len(tasks2))

def test_incremental_scale():
    """Scale instances of an app in steps until the first error, e.g. a
    timeout, is reached.
    """
    ensure_mom_version('1.4.0-RC7')
    cluster_info()
    print(available_resources())

    app_def = {
        "id": "cap-app",
        "instances": 1,
        "cmd": "for (( ; ; )); do sleep 100000000; done",
        "cpus": 0.001,
        "mem": 8,
        "disk": 0,
        "backoffFactor": 1.0,
        "backoffSeconds": 0,
    }

    with marathon_on_marathon():
        # shakedown.delete_app_wait('/cap-app')
        client = marathon.create_client()
        client.add_app(app_def)

        for new_size in incremental_steps(linear_step_function(step_size=1000)):
            shakedown.echo("Scaling to {}".format(new_size))
            shakedown.deployment_wait(app_id='cap-app', timeout=timedelta(minutes=10).total_seconds())
            # Scale to the next target size
            client.scale_app('/cap-app', new_size)
            shakedown.deployment_wait(app_id='cap-app', timeout=timedelta(minutes=10).total_seconds())

            shakedown.echo("done.")

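# test_incremental_scale iterates over
# incremental_steps(linear_step_function(step_size=1000)). Neither helper is
# defined here; judging from the usage, linear_step_function returns a
# step -> increment function and incremental_steps yields a growing sequence
# of target instance counts. A minimal sketch under those assumptions:
import itertools

def linear_step_function(step_size=1000):
    def step(n):
        return step_size
    return step

def incremental_steps(step_fn):
    total = 0
    for n in itertools.count(start=1):
        total += step_fn(n)
        yield total
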
def test_pinned_task_scales_on_host_only():
    """Tests that scaling a pinned app scales only on the pinned node."""
    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/pinned')
    assert len(tasks) == 1
    assert tasks[0]['host'] == host

    client.scale_app('pinned', 10)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/pinned')
    assert len(tasks) == 10
    for task in tasks:
        assert task['host'] == host

def test_create_pod_with_private_image():
    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "dockerPullConfig"
    secret_value_json = common.create_docker_pull_config_json(username, password)

    import json
    secret_value = json.dumps(secret_value_json)

    client = marathon.create_client()
    common.create_secret(secret_name, secret_value)

    try:
        pod_def = common.private_docker_pod(secret_name)
        client.add_pod(pod_def)
        shakedown.deployment_wait(timeout=timedelta(minutes=5).total_seconds())
        pod = client.show_pod(pod_def["id"])
        assert pod is not None
    finally:
        common.delete_secret(secret_name)

def test_pin_pod():
    """Tests that a pod can be pinned to a specific host."""
    pod_def = pods.ports_pod()

    host = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_def, host)

    client = marathon.create_client()
    client.add_pod(pod_def)
    shakedown.deployment_wait()

    tasks = common.get_pod_tasks(pod_def["id"])
    assert len(tasks) == 2, "The number of tasks is {} after deployment, but 2 was expected".format(len(tasks))

    pod = client.list_pod()[0]
    assert pod['instances'][0]['agentHostname'] == host, "The pod didn't get pinned to {}".format(host)

async def test_event_channel(sse_events):
    """Tests the event channel. Events are verified by converting the parsed
    events to an iterator and asserting the right order of certain events.
    Unknown events are skipped.
    """
    await common.assert_event('event_stream_attached', sse_events)

    app_def = apps.mesos_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_id)

    await common.assert_event('deployment_info', sse_events)
    await common.assert_event('deployment_step_success', sse_events)

    client.remove_app(app_id, True)
    shakedown.deployment_wait(app_id=app_id)

    await common.assert_event('app_terminated_event', sse_events)

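# The async test above awaits common.assert_event(kind, sse_events), where
# sse_events is an async iterator of parsed SSE events. Presumably the helper
# advances the iterator, skipping unknown events, until the requested event
# type appears or a bound is hit. A hedged sketch; the event field name and
# the bound are assumptions:
async def assert_event(event_kind, sse_events, max_events=100):
    for _ in range(max_events):
        event = await sse_events.__anext__()
        if event.get('eventType') == event_kind:
            return event
    raise AssertionError('{} event not found in the stream'.format(event_kind))
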
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app from Marathon and restarts the master. It is expected
    that the service endpoint comes back and that the task ID stays the same.
    """
    app_def = app('master-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks('/master-failure')
    original_task_id = tasks[0]['id']

    common.systemctl_master()
    shakedown.wait_for_service_endpoint(marathon_service_name)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_task_recovery():
        tasks = client.get_tasks('/master-failure')
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"

    check_task_recovery()

def test_neo4j_universe_package_install(neo_package):
    """Neo4j used to be one of the universe packages tested above, largely
    because there was a bug in marathon for a short period of time which
    surfaced through neo4j. However, neo4j is different enough that we can't
    test it like the other services: it is NOT a framework, so framework
    health checks do not work with neo4j.
    """
    package = neo_package
    shakedown.install_package(package)
    shakedown.deployment_wait(timeout=timedelta(minutes=5).total_seconds(), app_id='neo4j/core')

    assert shakedown.package_installed(package), 'Package failed to install'

    marathon_client = marathon.create_client()
    tasks = marathon_client.get_tasks('neo4j/core')
    for task in tasks:
        assert task['healthCheckResults'][0]['lastSuccess'] is not None, 'Healthcheck was not successful'
        assert task['healthCheckResults'][0]['consecutiveFailures'] == 0, 'Healthcheck had consecutive failures'

def test_docker_dns_mapping(marathon_service_name):
    """Tests that a running Docker task is accessible via DNS."""
    app_def = apps.docker_http_server(app_id='docker-dns-mapping-app')

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_def["id"])

    bad_cmd = 'ping -c 1 docker-test.marathon-user.mesos-bad'
    status, output = shakedown.run_command_on_master(bad_cmd)
    assert not status

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_dns():
        dnsname = '{}.{}.mesos'.format(app_def["id"].lstrip('/'), marathon_service_name)
        cmd = 'ping -c 1 {}'.format(dnsname)
        shakedown.wait_for_dns(dnsname)
        status, output = shakedown.run_command_on_master(cmd)
        assert status, "ping failed for app using DNS lookup: {}".format(dnsname)

    check_dns()

def _retried_install_impl(
        package_name,
        service_name,
        expected_running_tasks,
        options={},
        package_version=None,
        timeout_seconds=TIMEOUT_SECONDS,
        install_cli=True):
    '''Cleaned up version of shakedown's package_install().'''
    package_manager = dcos.packagemanager.PackageManager(dcos.cosmos.get_cosmos_url())
    pkg = package_manager.get_package_version(package_name, package_version)

    if package_version is None:
        # Get the resolved version for logging below
        package_version = 'auto:{}'.format(pkg.version())

    log.info('Installing package={} service={} with options={} version={}'.format(
        package_name, service_name, options, package_version))

    # Trigger package install, but only if it's not already installed.
    # We expect upstream to have confirmed that it wasn't already installed beforehand.
    if sdk_marathon.app_exists(service_name):
        log.info('Marathon app={} exists, skipping package install call'.format(service_name))
    else:
        package_manager.install_app(pkg, options)

    # Install CLI while package starts to install
    if install_cli and pkg.cli_definition():
        log.info('Installing CLI for package={}'.format(package_name))
        dcos.subcommand.install(pkg)

    # Wait for expected tasks to come up
    if expected_running_tasks > 0:
        shakedown.wait_for_service_tasks_running(
            service_name, expected_running_tasks, timeout_seconds)

    # Wait for completed marathon deployment
    app_id = pkg.marathon_json(options).get('id')
    shakedown.deployment_wait(timeout_seconds, app_id)

def test_marathon_when_task_agent_bounced():
    """Launches an app and restarts the node the task is running on."""
    app_def = apps.sleep_app()

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']
    shakedown.restart_agent(host)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()

def test_pin_pod():
    """Tests that we can pin a pod to a host."""
    client = marathon.create_client()

    pod_id = "/pod-{}".format(uuid.uuid4().hex)
    pod_json = _pods_json('pod-ports.json')
    pod_json["id"] = pod_id

    host = ip_other_than_mom()
    pin_pod_to_host(pod_json, host)
    client.add_pod(pod_json)
    shakedown.deployment_wait()

    tasks = get_pod_tasks(pod_id)
    assert len(tasks) == 2, "Num of tasks: {} is not 2 after deployment".format(len(tasks))

    pod = client.list_pod()[0]
    assert pod['instances'][0]['agentHostname'] == host

def test_pod_port_communication():
    """Tests that one container can establish a socket connection to the
    other container in the same pod.
    """
    client = marathon.create_client()

    pod_id = "/pod-{}".format(uuid.uuid4().hex)
    pod_json = _pods_json('pod-ports.json')
    pod_json["id"] = pod_id

    # Sleeps 2, then container 2 checks communication with container 1.
    # If that times out, the task completes, resulting in 1 container running;
    # otherwise it is expected that 2 containers are running.
    pod_json['containers'][1]['exec']['command']['shell'] = 'sleep 2; curl -m 2 localhost:$ENDPOINT_HTTPENDPOINT; if [ $? -eq 7 ]; then exit; fi; /opt/mesosphere/bin/python -m http.server $ENDPOINT_HTTPENDPOINT2'  # NOQA

    client.add_pod(pod_json)
    shakedown.deployment_wait()

    tasks = get_pod_tasks(pod_id)
    assert len(tasks) == 2, "Num of tasks: {} is not 2 after deployment".format(len(tasks))

def test_default_user():
    """Ensures that a task is started as root, which is the default user."""
    # launch unique-sleep
    application_json = get_resource("{}/unique-sleep.json".format(fixture_dir()))
    client = marathon.create_client()
    client.add_app(application_json)
    shakedown.deployment_wait()

    app = client.get_app(application_json['id'])
    assert app['user'] is None

    tasks = client.get_tasks("unique-sleep")
    host = tasks[0]['host']
    assert shakedown.run_command_on_agent(host, "ps aux | grep '[s]leep ' | awk '{if ($1 !=\"root\") exit 1;}'")

    client = marathon.create_client()
    client.remove_app("/unique-sleep")

def test_launch_container_with_persistent_volume():
    """Tests launching a task with a persistent volume (PV). The task writes
    to a file in the PV. The app is killed and restarted, and we can still
    read from the PV.
    """
    app_def = persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    client.restart_app(app_id)
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1

    check_task_recovery()

    # refetch the tasks after recovery; the list from before the restart is stale
    tasks = client.get_tasks(app_id)
    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    run, data = shakedown.run_command_on_master(cmd)

    assert run, "{} did not succeed".format(cmd)
    assert data == 'hello\nhello\n', "'{}' was not equal to hello\\nhello\\n".format(data)

def test_framework_unavailable_on_mom():
    """Launches an app that has the elements necessary to create a service
    endpoint in DCOS. This test confirms that the endpoint is not created
    when the app is launched with MoM.
    """
    if shakedown.service_available_predicate('pyfw'):
        client = marathon.create_client()
        client.remove_app('python-http', True)
        shakedown.deployment_wait()
        shakedown.wait_for_service_endpoint_removal('pyfw')

    with shakedown.marathon_on_marathon():
        delete_all_apps_wait()
        client = marathon.create_client()
        client.add_app(common.fake_framework_app())
        shakedown.deployment_wait()

        # A bare `except` here would also swallow the AssertionError, so
        # record the outcome first and assert afterwards.
        try:
            shakedown.wait_for_service_endpoint('pyfw', 15)
            endpoint_created = True
        except Exception:
            endpoint_created = False

        assert not endpoint_created, 'MoM should NOT create a service endpoint'

def test_incremental_app_scale():
    """Scale the number of apps in steps until the first error, e.g. a
    timeout, is reached. The apps are created in the root group.
    """
    cluster_info()
    print(available_resources())

    client = marathon.create_client()
    client.remove_group('/')

    for step in itertools.count(start=1):
        shakedown.echo("Add new apps")

        app_id = "app-{0:0>4}".format(step)
        client.add_app(app_def(app_id))

        shakedown.deployment_wait(timeout=timedelta(minutes=15).total_seconds())

        shakedown.echo("done.")

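# The scale tests in this suite build apps through an app_def(app_id) helper
# that is not shown here. A sketch of a minimal sleeping app consistent with
# the resource footprint used in test_incremental_scale; all values are
# assumptions:
def app_def(app_id):
    return {
        "id": app_id,
        "instances": 1,
        "cmd": "sleep 100000000",
        "cpus": 0.001,
        "mem": 8,
        "disk": 0,
    }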