Esempio n. 1
0
def test_mom_with_network_failure_bounce_master():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures simulated by knocking out ports."""

    # get MoM ip
    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        logger.info("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # take out the net
    partition_agent(mom_ip)
    partition_agent(task_ip)

    # wait for a min
    time.sleep(timedelta(minutes=1).total_seconds())

    # bounce master
    run_command_on_master("sudo systemctl restart dcos-mesos-master")

    # bring the net up
    reconnect_agent(mom_ip)
    reconnect_agent(task_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    common.wait_for_service_endpoint('marathon-user',
                                     timedelta(minutes=10).total_seconds(),
                                     path="ping")

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'),
                                timedelta(minutes=10).total_seconds())

        @retrying.retry(wait_fixed=1000,
                        stop_max_attempt_number=30,
                        retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0][
                'id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
Esempio n. 2
0
def test_mom_when_mom_process_killed():
    """Launched a task from MoM then killed MoM."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        common.deployment_wait(service_id=app_id)
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        common.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        common.wait_for_service_endpoint('marathon-user', path="ping")

        @retrying.retry(wait_fixed=1000,
                        stop_max_attempt_number=30,
                        retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0][
                'id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
Esempio n. 3
0
def test_create_pod_with_private_image():
    """Deploys a pod with a private Docker image, using Mesos containerizer.
        This method relies on the global `install_enterprise_cli` fixture to install the
        enterprise-cli-package.
    """

    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    secret_value_json = common.create_docker_pull_config_json(
        username, password)
    secret_value = json.dumps(secret_value_json)

    pod_def = pods.private_docker_pod()
    pod_id = pod_def['id']
    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_pod(pod_def)
        deployment_wait(service_id=pod_id, max_attempts=300)
        pod = client.show_pod(pod_id)
        assert pod is not None, "The pod has not been created"
    finally:
        common.delete_secret(secret_name)
Esempio n. 4
0
def test_pod_with_persistent_volume():
    pod_def = pods.persistent_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)

    host = common.running_status_network_info(
        tasks[0]['statuses'])['ip_addresses'][0]['ip_address']
    port1 = tasks[0]['discovery']['ports']['ports'][0]["number"]
    port2 = tasks[1]['discovery']['ports']['ports'][0]["number"]
    path1 = tasks[0]['container']['volumes'][0]['container_path']
    path2 = tasks[1]['container']['volumes'][0]['container_path']
    logger.info('Deployd two containers on {}:{}/{} and {}:{}/{}'.format(
        host, port1, path1, host, port2, path2))

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=60,
                    retry_on_exception=common.ignore_exception)
    def check_http_endpoint(port, path):
        cmd = "curl {}:{}/{}/foo".format(host, port, path)
        run, data = run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert data == 'hello\n', "'{}' was not equal to hello\\n".format(data)

    check_http_endpoint(port1, path1)
    check_http_endpoint(port2, path2)
Esempio n. 5
0
def test_pod_health_failed_check():
    """Deploys a pod with correct health checks, then partitions the network and verifies that
       the tasks get restarted with new task IDs.
    """

    pod_def = pods.ports_pod()
    pod_id = pod_def['id']

    host = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_def, host)

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    initial_id1 = tasks[0]['id']
    initial_id2 = tasks[1]['id']

    pod = client.list_pod()[0]
    container1 = pod['instances'][0]['containers'][0]
    port = container1['endpoints'][0]['allocatedHostPort']

    common.block_iptable_rules_for_seconds(host,
                                           port,
                                           7,
                                           block_input=True,
                                           block_output=False)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    for new_task in tasks:
        new_task_id = new_task['id']
        assert new_task_id != initial_id1, f"Task {new_task_id} has not been restarted"  # NOQA E999
        assert new_task_id != initial_id2, f"Task {new_task_id} has not been restarted"
Esempio n. 6
0
def test_pod_with_container_bridge_network():
    """Tests creation of a pod with a "container/bridge" network, and its HTTP endpoint accessibility."""

    pod_def = pods.container_bridge_pod()
    pod_id = pod_def['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if shakedown.dcos.cluster.ee_version() == 'strict':
        pod_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    task = common.task_by_name(common.get_pod_tasks(pod_id), "nginx")
    network_info = common.running_status_network_info(task['statuses'])
    assert network_info['name'] == "mesos-bridge", \
        "The network is {}, but mesos-bridge was expected".format(network_info['name'])

    # get the port on the host
    port = task['discovery']['ports']['ports'][0]['number']

    # the agent IP:port will be routed to the bridge IP:port
    # test against the agent_ip, however it is hard to get.. translating from
    # slave_id
    agent_ip = common.agent_hostname_by_id(task['slave_id'])
    assert agent_ip is not None, "Failed to get the agent IP address"
    container_ip = network_info['ip_addresses'][0]['ip_address']
    assert agent_ip != container_ip, "The container IP address is the same as the agent one"

    url = "http://{}:{}/".format(agent_ip, port)
    common.assert_http_code(url)
Esempio n. 7
0
def test_pod_with_container_network():
    """Tests creation of a pod with a "container" network, and its HTTP endpoint accessibility."""

    pod_def = pods.container_net_pod()
    pod_id = pod_def['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if shakedown.dcos.cluster.ee_version() == 'strict':
        pod_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    task = common.task_by_name(common.get_pod_tasks(pod_id), "nginx")

    network_info = common.running_status_network_info(task['statuses'])
    assert network_info['name'] == "dcos", \
        "The network name is {}, but 'dcos' was expected".format(network_info['name'])

    container_ip = network_info['ip_addresses'][0]['ip_address']
    assert container_ip is not None, "No IP address has been assigned to the pod's container"

    url = "http://{}:80/".format(container_ip)
    common.assert_http_code(url)
Esempio n. 8
0
def test_app_update_rollback():
    """Tests that an updated app can be rolled back to its initial version."""

    app_def = apps.readiness_and_health_app("app-update-rollback")
    app_id = app_def["id"]

    # First deployment
    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert_that(tasks, has_len(equal_to(1)))

    # Second deployment
    app_def['instances'] = 2
    client.update_app(app_id, app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert_that(tasks, has_len(equal_to(2)))

    # Third deployment with rollback
    # provides a testing delay to rollback in the meantime
    app_def['readinessChecks'][0]['intervalSeconds'] = 30
    app_def['instances'] = 1
    deployment_id = client.update_app(app_id, app_def)
    client.rollback_deployment(deployment_id)
    deployment_wait(service_id=app_id)

    # update to 1 instance is rollback to 2
    tasks = client.get_tasks(app_id)
    assert_that(tasks, has_len(equal_to(2)))
Esempio n. 9
0
def test_two_pods_with_shared_volume():
    """Confirms that 1 container can read data in a volume that was written from the other container.
       The reading container fails if it can't read the file. So if there are 2 tasks after
       4 seconds we are good.
    """

    pod_def = pods.ephemeral_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    assert len(
        tasks
    ) == 2, "The number of tasks is {} after deployment, but 2 was expected".format(
        len(tasks))

    time.sleep(4)

    tasks = common.get_pod_tasks(pod_id)
    assert len(
        tasks
    ) == 2, "The number of tasks is {} after sleeping, but 2 was expected".format(
        len(tasks))
def assert_mom_ee(version, security_mode='permissive'):
    ensure_service_account()
    ensure_permissions()
    ensure_sa_secret(strict=True if security_mode == 'strict' else False)
    ensure_docker_config_secret()

    # In strict mode all tasks are started as user `nobody` by default. However we start
    # MoM-EE as 'root' and for that we need to give root marathon ACLs to start
    # tasks as 'root'.
    if security_mode == 'strict':
        common.add_dcos_marathon_user_acls()

    # Deploy MoM-EE in permissive mode
    app_def_file = '{}/mom-ee-{}-{}.json'.format(fixtures.fixtures_dir(), security_mode, version)
    assert os.path.isfile(app_def_file), "Couldn't find appropriate MoM-EE definition: {}".format(app_def_file)

    image = mom_ee_image(version)
    logger.info('Deploying {} definition with {} image'.format(app_def_file, image))

    app_def = get_resource(app_def_file)
    app_def['container']['docker']['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image)
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)
    shakedown.dcos.service.wait_for_service_endpoint(mom_ee_endpoint(version, security_mode), path="ping")
Esempio n. 11
0
def test_create_pod_with_private_image():
    """Deploys a pod with a private Docker image, using Mesos containerizer.
        This method relies on the global `install_enterprise_cli` fixture to install the
        enterprise-cli-package.
    """

    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    secret_value_json = common.create_docker_pull_config_json(username, password)
    secret_value = json.dumps(secret_value_json)

    pod_def = pods.private_docker_pod()
    pod_id = pod_def['id']
    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_pod(pod_def)
        deployment_wait(service_id=pod_id, max_attempts=300)
        pod = client.show_pod(pod_id)
        assert pod is not None, "The pod has not been created"
    finally:
        common.delete_secret(secret_name)
Esempio n. 12
0
def test_pod_with_persistent_volume():
    pod_def = pods.persistent_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)

    host = common.running_status_network_info(tasks[0]['statuses'])['ip_addresses'][0]['ip_address']

    # Container with the name 'container1' appends its taskId to the file. So we search for the
    # taskId of that container which is not always the tasks[0]
    expected_data = next((t['id'] for t in tasks if t['name'] == 'container1'), None)
    assert expected_data, f"Hasn't found a container with the name 'container1' in the pod {tasks}"

    port1 = tasks[0]['discovery']['ports']['ports'][0]["number"]
    port2 = tasks[1]['discovery']['ports']['ports'][0]["number"]
    path1 = tasks[0]['container']['volumes'][0]['container_path']
    path2 = tasks[1]['container']['volumes'][0]['container_path']
    logger.info('Deployd two containers on {}:{}/{} and {}:{}/{}'.format(host, port1, path1, host, port2, path2))

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=60, retry_on_exception=common.ignore_exception)
    def check_http_endpoint(port, path, expected):
        cmd = "curl {}:{}/{}/foo".format(host, port, path)
        run, data = run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert expected in data, "'{}' was not found in '{}'".format(data, expected)

    check_http_endpoint(port1, path1, expected_data)
    check_http_endpoint(port2, path2, expected_data)
Esempio n. 13
0
def test_pod_health_failed_check():
    """Deploys a pod with correct health checks, then partitions the network and verifies that
       the tasks get restarted with new task IDs.
    """

    pod_def = pods.ports_pod()
    pod_id = pod_def['id']

    host = common.ip_other_than_mom()
    common.pin_pod_to_host(pod_def, host)

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    initial_id1 = tasks[0]['id']
    initial_id2 = tasks[1]['id']

    pod = client.list_pod()[0]
    container1 = pod['instances'][0]['containers'][0]
    port = container1['endpoints'][0]['allocatedHostPort']

    common.block_iptable_rules_for_seconds(host, port, 7, block_input=True, block_output=False)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    for new_task in tasks:
        new_task_id = new_task['id']
        assert new_task_id != initial_id1, f"Task {new_task_id} has not been restarted" # NOQA E999
        assert new_task_id != initial_id2, f"Task {new_task_id} has not been restarted"
Esempio n. 14
0
def test_pod_with_container_bridge_network():
    """Tests creation of a pod with a "container/bridge" network, and its HTTP endpoint accessibility."""

    pod_def = pods.container_bridge_pod()
    pod_id = pod_def['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if shakedown.dcos.cluster.ee_version() == 'strict':
        pod_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    task = common.task_by_name(common.get_pod_tasks(pod_id), "nginx")
    network_info = common.running_status_network_info(task['statuses'])
    assert network_info['name'] == "mesos-bridge", \
        "The network is {}, but mesos-bridge was expected".format(network_info['name'])

    # get the port on the host
    port = task['discovery']['ports']['ports'][0]['number']

    # the agent IP:port will be routed to the bridge IP:port
    # test against the agent_ip, however it is hard to get.. translating from
    # slave_id
    agent_ip = common.agent_hostname_by_id(task['slave_id'])
    assert agent_ip is not None, "Failed to get the agent IP address"
    container_ip = network_info['ip_addresses'][0]['ip_address']
    assert agent_ip != container_ip, "The container IP address is the same as the agent one"

    url = "http://{}:{}/".format(agent_ip, port)
    common.assert_http_code(url)
Esempio n. 15
0
def test_pod_restarts_on_nonzero_exit_code():
    """Verifies that a pod get restarted in case one of its containers exits with a non-zero code.
       As a result, after restart, there should be two new tasks for different IDs.
    """

    pod_def = pods.simple_pod()
    pod_id = pod_def['id']
    pod_def["scaling"]["instances"] = 1
    pod_def['containers'][0]['exec']['command'][
        'shell'] = 'sleep 5; echo -n leaving; exit 2'

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    initial_id1 = tasks[0]['id']
    initial_id2 = tasks[1]['id']

    time.sleep(
        6)  # 1 sec past the 5 sec sleep in one of the container's command
    tasks = common.get_pod_tasks(pod_id)
    for task in tasks:
        assert task['id'] != initial_id1, "Got the same task ID"
        assert task['id'] != initial_id2, "Got the same task ID"
Esempio n. 16
0
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    mom_ip = common.ip_of_mom()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        common.deployment_wait(service_id=app_id)
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        shakedown.restart_agent(mom_ip)

        @retrying.retry(wait_fixed=1000,
                        stop_max_attempt_number=30,
                        retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0][
                'id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
Esempio n. 17
0
def test_marathon_when_task_agent_bounced():
    """Launch an app and restart the node the task is running on."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    tasks = client.get_tasks(app_id)
    original_task_id = tasks[0]['id']
    restart_agent(host)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
Esempio n. 18
0
def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from Marathon.
       Verifies the task is preserved.
    """

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    tasks = client.get_tasks(app_id)
    original_task_id = tasks[0]['id']

    common.block_iptable_rules_for_seconds(host, 2181, sleep_seconds=10, block_input=True, block_output=False)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
Esempio n. 19
0
def test_private_repository_mesos_app():
    """Deploys an app with a private Docker image, using Mesos containerizer.
        It relies on the global `install_enterprise_cli` fixture to install the
        enterprise-cli-package.
    """

    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    secret_value_json = common.create_docker_pull_config_json(username, password)
    secret_value = json.dumps(secret_value_json)

    app_def = apps.private_ucr_docker_app()
    app_id = app_def["id"]

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if is_strict():
        app_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_app(app_def)
        deployment_wait(service_id=app_id)

        common.assert_app_tasks_running(client, app_def)
    finally:
        common.delete_secret(secret_name)
Esempio n. 20
0
def service_healthy(service_name, app_id=None):
    """ Check whether a named service is healthy

        :param service_name: the service name
        :type service_name: str
        :param app_id: app_id to filter
        :type app_id: str

        :return: True if healthy, False otherwise
        :rtype: bool
    """

    marathon_client = marathon.create_client()
    apps = marathon_client.get_apps_for_framework(service_name)

    if apps:
        for app in apps:
            if (app_id
                    is not None) and (app['id'] != "/{}".format(str(app_id))):
                continue

            if (app['tasksHealthy']) and \
               (app['tasksRunning']) and \
               (not app['tasksStaged']) and \
               (not app['tasksUnhealthy']):
                return True

    return False
Esempio n. 21
0
async def test_event_channel_for_pods(sse_events):
    """Tests the Marathon event channel specific to pod events."""

    await common.assert_event('event_stream_attached', sse_events)

    pod_def = pods.simple_pod()
    pod_id = pod_def['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write files.
    if shakedown.dcos.cluster.ee_version() == 'strict':
        pod_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    await common.assert_event('pod_created_event', sse_events)
    await common.assert_event('deployment_step_success', sse_events)

    pod_def["scaling"]["instances"] = 3
    client.update_pod(pod_id, pod_def)
    deployment_wait(service_id=pod_id)

    await common.assert_event('pod_updated_event', sse_events)
Esempio n. 22
0
def test_scale_app_in_group_then_group():
    """First scales an app in a group, then scales the group itself."""

    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]
    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    client = marathon.create_client()
    client.create_group(group_def)

    deployment_wait(service_id=app1_id)

    group_apps = client.get_group(groups_id)
    apps = group_apps['apps']
    assert len(
        apps) == 2, "The number of apps is {}, but 2 was expected".format(
            len(apps))

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(
        tasks1
    ) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(
        len(tasks1))
    assert len(
        tasks2
    ) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(
        len(tasks2))

    # scaling just one app in the group
    client.scale_app(app1_id, 2)
    deployment_wait(service_id=app1_id)

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(
        tasks1
    ) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(
        len(tasks1))
    assert len(
        tasks2
    ) == 1, "The number of tasks #2 is {} after scale, but 1 was expected".format(
        len(tasks2))
    deployment_wait(service_id=app1_id)

    # scaling the group after one app in the group was scaled
    client.scale_group(groups_id, 2)
    deployment_wait(service_id=app1_id)

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(
        tasks1
    ) == 4, "The number of tasks #1 is {} after scale, but 4 was expected".format(
        len(tasks1))
    assert len(
        tasks2
    ) == 2, "The number of tasks #2 is {} after scale, but 2 was expected".format(
        len(tasks2))
Esempio n. 23
0
def test_pinned_task_scales_on_host_only():
    """Tests that a pinned app scales only on the pinned node."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(
        tasks
    ) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(
        len(tasks))
    assert tasks[0]['host'] == host, \
        "The task is on {}, but it is supposed to be on {}".format(tasks[0]['host'], host)

    client.scale_app(app_id, 10)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(
        tasks
    ) == 10, "The number of tasks is {} after scale, but 10 was expected".format(
        len(tasks))
    for task in tasks:
        assert task[
            'host'] == host, "The task is on {}, but it is supposed to be on {}".format(
                task['host'], host)
Esempio n. 24
0
def test_launch_docker_grace_period(marathon_service_name):
    """Tests 'taskKillGracePeriodSeconds' option using a Docker container in a Marathon environment.
       Read more details about this test in `test_root_marathon.py::test_launch_mesos_root_marathon_grace_period`
    """

    app_id = '/launch-docker-grace-period-app'
    app_def = apps.docker_http_server(app_id)
    app_def['container']['docker']['image'] = 'kensipe/python-test'

    default_grace_period = 3
    grace_period = 20
    app_def['taskKillGracePeriodSeconds'] = grace_period
    app_def['cmd'] = 'python test.py'
    task_name = app_id.lstrip('/')

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = get_service_task(marathon_service_name, task_name)
    assert tasks is not None

    client.scale_app(app_id, 0)
    tasks = get_service_task(marathon_service_name, task_name)
    assert tasks is not None

    # tasks should still be here after the default_graceperiod
    time.sleep(default_grace_period + 1)
    tasks = get_service_task(marathon_service_name, task_name)
    assert tasks is not None

    # but not after the set grace_period
    time.sleep(grace_period)
    assert_that(lambda: get_service_task(marathon_service_name, task_name),
                eventually(equal_to(None), max_attempts=30))
Esempio n. 25
0
def test_docker_dns_mapping(marathon_service_name):
    """Tests that a running Docker task is accessible via DNS."""

    app_def = apps.docker_http_server(app_id='/docker-dns-mapping-app')
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    bad_cmd = 'ping -c 1 docker-test.marathon-user.mesos-bad'
    status, output = run_command_on_master(bad_cmd)
    assert not status

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_dns():
        dnsname = '{}.{}.mesos'.format(app_id.lstrip('/'),
                                       marathon_service_name)
        cmd = 'ping -c 1 {}'.format(dnsname)
        wait_for_dns(dnsname)
        status, output = run_command_on_master(cmd)
        assert status, "ping failed for app using DNS lookup: {}".format(
            dnsname)

    check_dns()
def assert_mom_ee(version, security_mode='permissive'):
    ensure_service_account()
    ensure_permissions()
    ensure_sa_secret(strict=True if security_mode == 'strict' else False)
    ensure_docker_config_secret()

    # In strict mode all tasks are started as user `nobody` by default. However we start
    # MoM-EE as 'root' and for that we need to give root marathon ACLs to start
    # tasks as 'root'.
    if security_mode == 'strict':
        common.add_dcos_marathon_user_acls()

    # Deploy MoM-EE in permissive mode
    app_def_file = '{}/mom-ee-{}-{}.json'.format(fixtures.fixtures_dir(),
                                                 security_mode, version)
    assert os.path.isfile(
        app_def_file
    ), "Couldn't find appropriate MoM-EE definition: {}".format(app_def_file)

    image = mom_ee_image(version)
    logger.info('Deploying {} definition with {} image'.format(
        app_def_file, image))

    app_def = get_resource(app_def_file)
    app_def['container']['docker'][
        'image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image)
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)
    common.wait_for_service_endpoint(mom_ee_endpoint(version, security_mode),
                                     path="ping")
Esempio n. 27
0
def test_vip_mesos_cmd(marathon_service_name):
    """Validates the creation of an app with a VIP label and the accessibility of the service via the VIP."""

    app_def = apps.http_server()
    app_id = app_def["id"]

    vip_name = app_id.lstrip("/")
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name,
                                                 marathon_service_name)

    app_def['portDefinitions'] = [{
        "port": 0,
        "protocol": "tcp",
        "name": "{}".format(vip_name),
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        }
    }]

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()
Esempio n. 28
0
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app and restarts the master. It is expected that the service endpoint eventually comes back and
       the task ID stays the same.
    """

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    original_task_id = tasks[0]['id']

    common.systemctl_master('restart')
    shakedown.dcos.service.wait_for_service_endpoint(marathon_service_name,
                                                     path="ping")

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(
            tasks
        ) == 1, "The number of tasks is {} after master restart, but 1 was expected".format(
            len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()
Esempio n. 29
0
def test_app_update():
    """Tests that an app gets successfully updated."""

    app_def = apps.mesos_app(app_id='/update-app')
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(
        tasks
    ) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(
        len(tasks))

    app_def['cpus'] = 1
    app_def['instances'] = 2

    client.update_app(app_id, app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(
        tasks
    ) == 2, "The number of tasks is {} after deployment, but 2 was expected".format(
        len(tasks))
Esempio n. 30
0
def test_app_update_rollback():
    """Tests that an updated app can be rolled back to its initial version."""

    app_def = apps.readiness_and_health_app()
    app_id = app_def["id"]

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    app_def['instances'] = 2
    client.update_app(app_id, app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2, "The number of tasks is {} after update, but 2 was expected".format(len(tasks))

    # provides a testing delay to rollback in the meantime
    app_def['readinessChecks'][0]['intervalSeconds'] = 30
    app_def['instances'] = 1
    deployment_id = client.update_app(app_id, app_def)
    client.rollback_deployment(deployment_id)
    common.deployment_wait(service_id=app_id)

    # update to 1 instance is rollback to 2
    tasks = client.get_tasks(app_id)
    assert len(tasks) == 2, "The number of tasks is {} after rollback, but 2 was expected".format(len(tasks))
Esempio n. 31
0
def test_unhealthy_app_can_be_rolled_back():
    """Verifies that an updated app gets rolled back due to being unhealthy."""

    app_def = apps.readiness_and_health_app()
    app_id = app_def["id"]

    @retrying.retry(
        wait_fixed=1000,
        stop_max_attempt_number=30,
        retry_on_exception=common.ignore_provided_exception(DCOSException)
    )
    def wait_for_deployment():
        common.deployment_wait(service_id=app_id)

    client = marathon.create_client()
    client.add_app(app_def)
    wait_for_deployment()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    app_def['healthChecks'][0]['path'] = '/non-existent'
    app_def['instances'] = 2
    deployment_id = client.update_app(app_id, app_def)

    try:
        wait_for_deployment()
    except Exception:
        client.rollback_deployment(deployment_id)
        wait_for_deployment()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after rollback, but 1 was expected".format(len(tasks))
Esempio n. 32
0
def test_launch_and_scale_group():
    """Launches and scales a group."""

    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]
    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    client = marathon.create_client()
    client.create_group(group_def)

    common.deployment_wait(service_id=app1_id)

    group_apps = client.get_group(groups_id)
    apps = group_apps['apps']
    assert len(apps) == 2, "The number of apps is {}, but 2 was expected".format(len(apps))

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(len(tasks2))

    # scale by 2 for the entire group
    client.scale_group(groups_id, 2)
    common.deployment_wait(service_id=app1_id)

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(len(tasks1))
    assert len(tasks2) == 2, "The number of tasks #2 is {} after scale, but 2 was expected".format(len(tasks2))
Esempio n. 33
0
def test_vip_docker_bridge_mode(marathon_service_name):
    """Tests the creation of a VIP from a python command in a docker image using bridge mode.
       the test validates the creation of an app with the VIP label and the accessability
       of the service via the VIP.
    """

    app_def = apps.docker_http_server(app_id='vip-docker-bridge-mode-app')
    app_id = app_def["id"]

    vip_name = app_id.lstrip("/")
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)

    app_def['id'] = vip_name
    app_def['container']['docker']['portMappings'] = [{
        "containerPort": 8080,
        "hostPort": 0,
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        },
        "protocol": "tcp",
        "name": "{}".format(vip_name)
    }]

    client = marathon.create_client()
    client.add_app(app_def)

    common.deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()
Esempio n. 34
0
def _test_declined_offer(app_def, reason):
    """Used to confirm that offers were declined.   The `processedOffersSummary` and these tests
       in general require 1.4+ marathon with the queue end point.
       The retry is the best possible way to "time" the success of the test.
    """

    app_id = app_def["id"]
    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def verify_declined_offer():
        deployments = client.get_deployments(app_id)
        assert len(deployments) == 1

        offer_summary = client.get_queued_app(app_id)['processedOffersSummary']
        role_summary = declined_offer_by_reason(offer_summary['rejectSummaryLastOffers'], reason)
        last_attempt = declined_offer_by_reason(offer_summary['rejectSummaryLaunchAttempt'], reason)

        assert role_summary['declined'] > 0, "There are no declined offers because of {}".format(reason)
        assert role_summary['processed'] > 0, "There are no processed offers for {}".format(reason)
        assert last_attempt['declined'] > 0, "There are no declined offers because of {}".format(reason)
        assert last_attempt['processed'] > 0, "There are no processed offers for {}".format(reason)

    verify_declined_offer()
Esempio n. 35
0
def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    tasks = client.get_tasks(app_id)

    common.kill_process_on_host(host, '[s]leep')
    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0][
            'id'], "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()
Esempio n. 36
0
def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from Marathon.
       Verifies the task is preserved.
    """

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    tasks = client.get_tasks(app_id)
    original_task_id = tasks[0]['id']

    common.block_iptable_rules_for_seconds(host,
                                           2181,
                                           sleep_seconds=10,
                                           block_input=True,
                                           block_output=False)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
Esempio n. 37
0
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests when a task lands on a pinned node (and barely fits) and it is asked to scale past
       the resources of that node, no tasks will be launched on any other node.
    """

    app_def = apps.sleep_app()
    app_id = app_def['id']

    host = common.ip_other_than_mom()
    logger.info('Constraint set to host: {}'.format(host))
    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    cores = common.cpus_on_agent(host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    client.scale_app(app_id, 2)

    time.sleep(5)
    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(
        deployments
    ) == 1, "The number of deployments is {}, but 1 was expected".format(
        len(deployments))
    assert len(
        tasks) == 1, "The number of tasks is {}, but 1 was expected".format(
            len(tasks))
Esempio n. 38
0
def test_pod_with_container_network():
    """Tests creation of a pod with a "container" network, and its HTTP endpoint accessibility."""

    pod_def = pods.container_net_pod()
    pod_id = pod_def['id']

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if shakedown.dcos.cluster.ee_version() == 'strict':
        pod_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    client = marathon.create_client()
    client.add_pod(pod_def)
    deployment_wait(service_id=pod_id)

    task = common.task_by_name(common.get_pod_tasks(pod_id), "nginx")

    network_info = common.running_status_network_info(task['statuses'])
    assert network_info['name'] == "dcos", \
        "The network name is {}, but 'dcos' was expected".format(network_info['name'])

    container_ip = network_info['ip_addresses'][0]['ip_address']
    assert container_ip is not None, "No IP address has been assigned to the pod's container"

    url = "http://{}:80/".format(container_ip)
    common.assert_http_code(url)
Esempio n. 39
0
def test_marathon_with_master_process_failure(marathon_service_name):
    """Launches an app and restarts the master. It is expected that the service endpoint eventually comes back and
       the task ID stays the same.
    """

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    original_task_id = tasks[0]['id']

    common.systemctl_master('restart')
    shakedown.dcos.service.wait_for_service_endpoint(marathon_service_name, path="ping")

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, "The number of tasks is {} after master restart, but 1 was expected".format(len(tasks))
        assert tasks[0]['id'] == original_task_id, \
            "Task {} has not recovered, it got replaced with another one: {}".format(original_task_id, tasks[0]['id'])

    check_task_recovery()
Esempio n. 40
0
def test_unhealthy_app_can_be_rolled_back():
    """Verifies that an updated app gets rolled back due to being unhealthy."""

    app_def = apps.readiness_and_health_app()
    app_id = app_def["id"]

    @retrying.retry(
        wait_fixed=1000,
        stop_max_attempt_number=30,
        retry_on_exception=common.ignore_provided_exception(DCOSException)
    )
    def wait_for_deployment():
        deployment_wait(service_id=app_id)

    client = marathon.create_client()
    client.add_app(app_def)
    wait_for_deployment()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    app_def['healthChecks'][0]['path'] = '/non-existent'
    app_def['instances'] = 2
    deployment_id = client.update_app(app_id, app_def)

    try:
        wait_for_deployment()
    except Exception:
        client.rollback_deployment(deployment_id)
        wait_for_deployment()

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after rollback, but 1 was expected".format(len(tasks))
Esempio n. 41
0
def test_private_repository_mesos_app():
    """Deploys an app with a private Docker image, using Mesos containerizer."""

    if not common.is_enterprise_cli_package_installed():
        common.install_enterprise_cli_package()

    username = os.environ['DOCKER_HUB_USERNAME']
    password = os.environ['DOCKER_HUB_PASSWORD']

    secret_name = "pullconfig"
    secret_value_json = common.create_docker_pull_config_json(
        username, password)
    secret_value = json.dumps(secret_value_json)

    app_def = apps.private_ucr_docker_app()
    app_id = app_def["id"]

    # In strict mode all tasks are started as user `nobody` by default and `nobody`
    # doesn't have permissions to write to /var/log within the container.
    if is_strict():
        app_def['user'] = '******'
        common.add_dcos_marathon_user_acls()

    common.create_secret(secret_name, secret_value)
    client = marathon.create_client()

    try:
        client.add_app(app_def)
        common.deployment_wait(service_id=app_id)

        common.assert_app_tasks_running(client, app_def)
    finally:
        common.delete_secret(secret_name)
Esempio n. 42
0
def test_scale_app_in_group():
    """Scales an individual app in a group."""

    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]
    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    client = marathon.create_client()
    client.create_group(group_def)

    deployment_wait(service_id=app1_id)

    group_apps = client.get_group(groups_id)
    apps = group_apps['apps']
    assert len(apps) == 2, "The number of apps is {}, but 2 was expected".format(len(apps))

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(len(tasks2))

    # scaling just one app in the group
    client.scale_app(app1_id, 2)
    deployment_wait(service_id=app1_id)

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after scale, but 1 was expected".format(len(tasks2))
Esempio n. 43
0
def test_pinned_task_does_not_scale_to_unpinned_host():
    """Tests when a task lands on a pinned node (and barely fits) and it is asked to scale past
       the resources of that node, no tasks will be launched on any other node.
    """

    app_def = apps.sleep_app()
    app_id = app_def['id']

    host = common.ip_other_than_mom()
    logger.info('Constraint set to host: {}'.format(host))
    # the size of cpus is designed to be greater than 1/2 of a node
    # such that only 1 task can land on the node.
    cores = common.cpus_on_agent(host)
    app_def['cpus'] = max(0.6, cores - 0.5)
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    client.scale_app(app_id, 2)

    time.sleep(5)
    deployments = client.get_deployments(app_id=app_id)
    tasks = client.get_tasks(app_id)

    # still deploying
    assert len(deployments) == 1, "The number of deployments is {}, but 1 was expected".format(len(deployments))
    assert len(tasks) == 1, "The number of tasks is {}, but 1 was expected".format(len(tasks))
Esempio n. 44
0
def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)
    tasks = client.get_tasks(app_id)

    common.kill_process_on_host(host, '[s]leep')
    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0]['id'], "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()
Esempio n. 45
0
def test_launch_docker_grace_period(marathon_service_name):
    """Tests 'taskKillGracePeriodSeconds' option using a Docker container in a Marathon environment.
       Read more details about this test in `test_root_marathon.py::test_launch_mesos_root_marathon_grace_period`
    """

    app_id = '/launch-docker-grace-period-app'
    app_def = apps.docker_http_server(app_id)
    app_def['container']['docker']['image'] = 'kensipe/python-test'

    default_grace_period = 3
    grace_period = 20
    app_def['taskKillGracePeriodSeconds'] = grace_period
    app_def['cmd'] = 'python test.py'
    task_name = app_id.lstrip('/')

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = get_service_task(marathon_service_name, task_name)
    assert tasks is not None

    client.scale_app(app_id, 0)
    tasks = get_service_task(marathon_service_name, task_name)
    assert tasks is not None

    # tasks should still be here after the default_graceperiod
    time.sleep(default_grace_period + 1)
    tasks = get_service_task(marathon_service_name, task_name)
    assert tasks is not None

    # but not after the set grace_period
    time.sleep(grace_period)
    assert_that(lambda: get_service_task(marathon_service_name, task_name),
                eventually(equal_to(None), max_attempts=30))
Esempio n. 46
0
def test_app_update_rollback():
    """Tests that an updated app can be rolled back to its initial version."""

    app_def = apps.readiness_and_health_app("app-update-rollback")
    app_id = app_def["id"]

    # First deployment
    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert_that(tasks, has_len(equal_to(1)))

    # Second deployment
    app_def['instances'] = 2
    client.update_app(app_id, app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert_that(tasks, has_len(equal_to(2)))

    # Third deployment with rollback
    # provides a testing delay to rollback in the meantime
    app_def['readinessChecks'][0]['intervalSeconds'] = 30
    app_def['instances'] = 1
    deployment_id = client.update_app(app_id, app_def)
    client.rollback_deployment(deployment_id)
    deployment_wait(service_id=app_id)

    # update to 1 instance is rollback to 2
    tasks = client.get_tasks(app_id)
    assert_that(tasks, has_len(equal_to(2)))
Esempio n. 47
0
def test_vip_docker_bridge_mode(marathon_service_name):
    """Tests the creation of a VIP from a python command in a docker image using bridge mode.
       the test validates the creation of an app with the VIP label and the accessability
       of the service via the VIP.
    """

    app_def = apps.docker_http_server(app_id='vip-docker-bridge-mode-app')
    app_id = app_def["id"]

    vip_name = app_id.lstrip("/")
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)

    app_def['id'] = vip_name
    app_def['container']['docker']['portMappings'] = [{
        "containerPort": 8080,
        "hostPort": 0,
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        },
        "protocol": "tcp",
        "name": "{}".format(vip_name)
    }]

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()
Esempio n. 48
0
def test_vip_mesos_cmd(marathon_service_name):
    """Validates the creation of an app with a VIP label and the accessibility of the service via the VIP."""

    app_def = apps.http_server()
    app_id = app_def["id"]

    vip_name = app_id.lstrip("/")
    fqn = '{}.{}.l4lb.thisdcos.directory'.format(vip_name, marathon_service_name)

    app_def['portDefinitions'] = [{
        "port": 0,
        "protocol": "tcp",
        "name": "{}".format(vip_name),
        "labels": {
            "VIP_0": "/{}:10000".format(vip_name)
        }
    }]

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def http_output_check():
        time.sleep(1)
        common.assert_http_code('{}:{}'.format(fqn, 10000))

    http_output_check()
Esempio n. 49
0
def _test_declined_offer(app_def, reason):
    """Used to confirm that offers were declined.   The `processedOffersSummary` and these tests
       in general require 1.4+ marathon with the queue end point.
       The retry is the best possible way to "time" the success of the test.
    """

    app_id = app_def["id"]
    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def verify_declined_offer():
        deployments = client.get_deployments(app_id)
        assert len(deployments) == 1

        offer_summary = client.get_queued_app(app_id)['processedOffersSummary']
        role_summary = declined_offer_by_reason(offer_summary['rejectSummaryLastOffers'], reason)
        last_attempt = declined_offer_by_reason(offer_summary['rejectSummaryLaunchAttempt'], reason)

        assert role_summary['declined'] > 0, "There are no declined offers because of {}".format(reason)
        assert role_summary['processed'] > 0, "There are no processed offers for {}".format(reason)
        assert last_attempt['declined'] > 0, "There are no declined offers because of {}".format(reason)
        assert last_attempt['processed'] > 0, "There are no processed offers for {}".format(reason)

    verify_declined_offer()
Esempio n. 50
0
def test_pinned_task_scales_on_host_only():
    """Tests that a pinned app scales only on the pinned node."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))
    assert tasks[0]['host'] == host, \
        "The task is on {}, but it is supposed to be on {}".format(tasks[0]['host'], host)

    client.scale_app(app_id, 10)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 10, "The number of tasks is {} after scale, but 10 was expected".format(len(tasks))
    for task in tasks:
        assert task['host'] == host, "The task is on {}, but it is supposed to be on {}".format(task['host'], host)
Esempio n. 51
0
def test_launch_and_scale_group():
    """Launches and scales a group."""

    group_def = groups.sleep_group()
    groups_id = group_def["groups"][0]["id"]
    app1_id = group_def["groups"][0]["apps"][0]["id"]
    app2_id = group_def["groups"][0]["apps"][1]["id"]

    client = marathon.create_client()
    client.create_group(group_def)

    deployment_wait(service_id=app1_id)

    group_apps = client.get_group(groups_id)
    apps = group_apps['apps']
    assert len(apps) == 2, "The number of apps is {}, but 2 was expected".format(len(apps))

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 1, "The number of tasks #1 is {} after deployment, but 1 was expected".format(len(tasks1))
    assert len(tasks2) == 1, "The number of tasks #2 is {} after deployment, but 1 was expected".format(len(tasks2))

    # scale by 2 for the entire group
    client.scale_group(groups_id, 2)
    deployment_wait(service_id=app1_id)

    tasks1 = client.get_tasks(app1_id)
    tasks2 = client.get_tasks(app2_id)
    assert len(tasks1) == 2, "The number of tasks #1 is {} after scale, but 2 was expected".format(len(tasks1))
    assert len(tasks2) == 2, "The number of tasks #2 is {} after scale, but 2 was expected".format(len(tasks2))
Esempio n. 52
0
def test_external_volume():
    volume_name = "marathon-si-test-vol-{}".format(uuid.uuid4().hex)
    app_def = apps.external_volume_mesos_app()
    app_def["container"]["volumes"][0]["external"]["name"] = volume_name
    app_id = app_def['id']

    # Tested with root marathon since MoM doesn't have
    # --enable_features external_volumes option activated.
    # First deployment should create the volume since it has a unique name
    try:
        print('INFO: Deploying {} with external volume {}'.format(app_id, volume_name))
        client = marathon.create_client()
        client.add_app(app_def)
        deployment_wait(service_id=app_id)

        # Create the app: the volume should be successfully created
        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Scale down to 0
        print('INFO: Scaling {} to 0 instances'.format(app_id))
        client.stop_app(app_id)
        deployment_wait(service_id=app_id)

        # Scale up again: the volume should be successfully reused
        print('INFO: Scaling {} back to 1 instance'.format(app_id))
        client.scale_app(app_id, 1)
        deployment_wait(service_id=app_id)

        common.assert_app_tasks_running(client, app_def)
        common.assert_app_tasks_healthy(client, app_def)

        # Remove the app to be able to remove the volume
        print('INFO: Finally removing {}'.format(app_id))
        client.remove_app(app_id)
        deployment_wait(service_id=app_id)
    except Exception as e:
        print('Fail to test external volumes: {}'.format(e))
        raise e
    finally:
        # Clean up after the test: external volumes are not destroyed by marathon or dcos
        # and have to be cleaned manually.
        cmd = 'sudo /opt/mesosphere/bin/dvdcli remove --volumedriver=rexray --volumename={}'.format(volume_name)
        removed = False
        for agent in get_private_agents():
            status, output = run_command_on_agent(agent, cmd)  # NOQA
            print('DEBUG: Failed to remove external volume with name={} on agent={}: {}'.format(
                volume_name, agent, output))
            if status:
                removed = True
        # Note: Removing the volume might fail sometimes because EC2 takes some time (~10min) to recognize that
        # the volume is not in use anymore hence preventing it's removal. This is a known pitfall: we log the error
        # and the volume should be cleaned up manually later.
        if not removed:
            print('WARNING: Failed to remove external volume with name={}'.format(volume_name))
        else:
            print('DEBUG: External volume with name={} successfully removed'.format(volume_name))
Esempio n. 53
0
def clear_pods():
    try:
        client = marathon.create_client()
        pods = client.list_pod()
        for pod in pods:
            client.remove_pod(pod["id"], True)
            deployment_wait(service_id=pod["id"])
    except Exception:
        pass
Esempio n. 54
0
def __marathon_leadership_changed_in_marathon_api(original_leader):
    """ This method uses Marathon API to figure out that leadership changed.
        We have to retry here because leader election takes time and what might happen is that some nodes might
        not be aware of the new leader being elected resulting in HTTP 502.
    """
    # Leader is returned like this 10.0.6.88:8080 - we want just the IP
    current_leader = marathon.create_client().get_leader().split(':', 1)[0]
    logger.info('Current leader according to marathon API: {}'.format(current_leader))
    assert original_leader != current_leader
    return current_leader
Esempio n. 55
0
def test_https_health_check_healthy(protocol):
    """Tests HTTPS and MESOS_HTTPS health checks using a prepared nginx image that enables
       SSL (using self-signed certificate) and listens on 443.
    """
    # marathon version captured here will work for root and mom
    requires_marathon_version('1.4.2')

    client = marathon.create_client()
    app_def = apps.docker_nginx_ssl()
    assert_app_healthy(client, app_def, common.health_check(protocol=protocol, port_index=1))
Esempio n. 56
0
def test_run_app_with_non_existing_user():
    """Runs an app with a non-existing user, which should be failing."""

    app_def = apps.sleep_app()
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    assert_that(lambda: client.get_app(app_def["id"]), eventually(
        prop(['lastTaskFailure', 'message'], contains_string("No such user 'bad'")), max_attempts=30))
Esempio n. 57
0
def test_run_app_with_non_downloadable_artifact():
    """Runs an app with a non-downloadable artifact."""

    app_def = apps.sleep_app()
    app_def['fetch'] = [{"uri": "http://localhost/missing-artifact"}]

    client = marathon.create_client()
    client.add_app(app_def)

    assert_that(lambda: client.get_app(app_def["id"]), eventually(
        prop(['lastTaskFailure', 'message'], contains_string("Failed to fetch all URIs for container")), max_attempts=30)) # NOQA E501
Esempio n. 58
0
def test_failing_health_check_results_in_unhealthy_app():
    """Tests failed health checks of an app. The health check is meant to never pass."""

    app_def = apps.http_server()
    app_def['healthChecks'] = [common.health_check('/bad-url', 'HTTP', failures=0, timeout=3)]

    client = marathon.create_client()
    client.add_app(app_def)

    assert_that(lambda: client.get_app(app_def["id"]), eventually(
        has_values(tasksRunning=1, tasksHealthy=0, tasksUnhealthy=1), max_attempts=30))
Esempio n. 59
0
def test_app_file_based_secret(secret_fixture):

    secret_name, secret_value = secret_fixture
    secret_container_path = 'mysecretpath'

    app_id = '/app-fbs-{}'.format(uuid.uuid4().hex)
    # In case you're wondering about the `cmd`: secrets are mounted via tmpfs inside
    # the container and are not visible outside, hence the intermediate file
    app_def = {
        "id": app_id,
        "instances": 1,
        "cpus": 0.5,
        "mem": 64,
        "cmd": "cat {} >> {}_file && /opt/mesosphere/bin/python -m http.server $PORT_API".format(
            secret_container_path, secret_container_path),
        "container": {
            "type": "MESOS",
            "volumes": [{
                "containerPath": secret_container_path,
                "secret": "secret1"
            }]
        },
        "portDefinitions": [{
            "port": 0,
            "protocol": "tcp",
            "name": "api",
            "labels": {}
        }],
        "secrets": {
            "secret1": {
                "source": secret_name
            }
        }
    }

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, 'Failed to start the file based secret app'

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    # The secret by default is saved in $MESOS_SANDBOX/.secrets/path/to/secret
    cmd = "curl {}:{}/{}_file".format(host, port, secret_container_path)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def value_check():
        status, data = run_command_on_master(cmd)
        assert status, "{} did not succeed. status = {}, data = {}".format(cmd, status, data)
        assert data.rstrip() == secret_value, "Got an unexpected secret data"

    value_check()
def test_deploy_custom_framework():
    """Launches an app that has necessary elements to create a service endpoint in DCOS.
       This test confirms that the endpoint is created by the root Marathon.
    """

    client = marathon.create_client()
    app_def = apps.fake_framework()
    app_id = app_def["id"]
    client.add_app(app_def)
    deployment_wait(service_id=app_id, max_attempts=300)

    shakedown.dcos.service.wait_for_service_endpoint('pyfw', timedelta(minutes=5).total_seconds())