Example #1
0
def test_fault_domain(dcos_api_session):
    """Verify that the fault-domain region/zone advertised by the Mesos
    leader and every agent agrees with the ``--domain`` flag the master
    was started with.
    """
    config = test_helpers.get_expanded_config()
    if config['fault_domain_enabled'] == 'false':
        pytest.skip('fault domain is not set')

    leader_ip = dcos_api_session.masters[0]
    response = dcos_api_session.get('/state', host=leader_ip, port=5050)
    assert response.status_code == 200
    mesos_state = response.json()

    # The master's --domain flag is the source of truth for region/zone.
    assert 'flags' in mesos_state, 'missing flags in state json'
    assert 'domain' in mesos_state['flags'], 'missing domain in state json flags'
    domain_flag = json.loads(mesos_state['flags']['domain'])
    expected_region, expected_zone = get_region_zone(domain_flag)

    # The elected leader must advertise the same domain it was started with.
    assert 'leader_info' in mesos_state, 'leader_info is missing in state json'
    assert 'domain' in mesos_state['leader_info'], 'domain is missing in state json'
    leader_region, leader_zone = get_region_zone(mesos_state['leader_info']['domain'])

    assert leader_region == expected_region, 'expect region {}. Got {}'.format(expected_region, leader_region)
    assert leader_zone == expected_zone, 'expect zone {}. Got {}'.format(expected_zone, leader_zone)

    for agent in mesos_state['slaves']:
        assert 'domain' in agent, 'missing domain field for agent. {}'.format(agent)
        agent_region, agent_zone = get_region_zone(agent['domain'])

        assert agent_region == expected_region, 'expect region {}. Got {}'.format(expected_region, agent_region)

        # Zones may legitimately differ per agent; only require a non-empty value.
        assert agent_zone, 'agent_zone cannot be empty'
Example #2
0
def test_min_allocatable_resources(reserved_disk):
    """Test that the Mesos master creates offers for just `disk` resources."""
    # mesos-execute is used because frameworks such as Marathon cannot
    # consume disk-only offers.
    config = test_helpers.get_expanded_config()
    if config.get('security') == 'strict':
        pytest.skip('Missing framework authentication for mesos-execute')

    task_name = 'test-min-test_min-allocatable-resources-{}'.format(uuid.uuid4().hex)

    command = [
        '/opt/mesosphere/bin/mesos-execute',
        '--resources=disk:32',
        '--role=' + reserved_disk.role,
        '--command=:',
        '--master=leader.mesos:5050',
        '--name={}'.format(task_name),
        '--env={"LC_ALL":"C"}',
    ]

    output = subprocess.check_output(
        command, stderr=subprocess.STDOUT, universal_newlines=True)

    # If the framework received any status update it launched a task, which
    # in turn means it must have been offered resources.
    assert 'Received status update' in output, output
Example #3
0
def skip_in_downstream():
    """Skip the whole module when running against a downstream build."""
    # A 'security' key in the expanded config marks a downstream configuration.
    if 'security' in get_expanded_config():
        pytest.skip(
            'Skip upstream-specific user management tests',
            allow_module_level=True
        )
Example #4
0
def test_expanded_config():
    """Sanity-check that expected parameters survive config expansion."""
    config = get_expanded_config()
    # Calculated parameters should be present.
    assert 'master_quorum' in config
    # Defined and used parameters should be present.
    for key in (
        'marathon_port',
        'mesos_master_port',
        'mesos_agent_port',
        'exhibitor_port',
        'mesos_dns_port',
        'metronome_port',
    ):
        assert key in config
Example #5
0
def test_dcos_net_cluster_identity(dcos_api_session):
    """Check that the dcos-net Erlang cookie matches the cluster identity."""
    expected_cookie = 'minuteman'  # default
    config = test_helpers.get_expanded_config()
    if config['dcos_net_cluster_identity'] == 'true':
        # When the cluster-identity feature is on, the cookie is the
        # single-quoted cluster id read from disk.
        with open('/var/lib/dcos/cluster-id') as fh:
            expected_cookie = "'{}'".format(fh.readline().rstrip())

    command = ['sudo', '/opt/mesosphere/bin/dcos-net-env', 'eval', 'erlang:get_cookie().']
    actual_cookie = subprocess.check_output(command, stderr=subprocess.STDOUT).decode('utf-8').rstrip()

    assert expected_cookie == actual_cookie, "cluster_id: {}, cookie: {}".format(expected_cookie, actual_cookie)
Example #6
0
def dcos_api_session(dcos_api_session_factory):
    """ Overrides the dcos_api_session fixture to use
    exhibitor settings currently used in the cluster
    """
    factory_args = dcos_api_session_factory.get_args_from_env()

    config = get_expanded_config()
    # Only pass a password through when the Exhibitor admin password is enabled.
    password = None
    if config['exhibitor_admin_password_enabled'] == 'true':
        password = config['exhibitor_admin_password']

    session = dcos_api_session_factory(
        exhibitor_admin_password=password,
        **factory_args)
    session.wait_for_dcos()
    return session
Example #7
0
def test_pkgpanda_api(dcos_api_session):
    """Validate the pkgpanda HTTP API on every master and agent node.

    Checks that each package id listed under /pkgpanda/repository/ and
    /pkgpanda/active/ round-trips through the per-package detail endpoint,
    that the active set is a subset of the repository, and that the active
    packages match active.buildinfo.full.json.
    """
    expanded_config = get_expanded_config()
    if 'advanced' in expanded_config['template_filenames']:
        reason = (
            'Will not work on advanced CF templates, see: '
            'https://jira.mesosphere.com/browse/DCOS_OSS-1375'
        )
        pytest.skip(reason)

    def get_and_validate_package_ids(path, node):
        # List package ids at `path`, then fetch each one individually and
        # check the detail payload is consistent with the id itself.
        r = dcos_api_session.get(path, node=node)
        assert r.status_code == 200
        package_ids = r.json()
        assert isinstance(package_ids, list)
        for package_id in package_ids:
            r = dcos_api_session.get(path + package_id, node=node)
            assert r.status_code == 200
            # Package ids have the form '<name>--<version>'.
            name, version = package_id.split('--')
            assert r.json() == {'id': package_id, 'name': name, 'version': version}
        return package_ids

    active_buildinfo = dcos_api_session.get('/pkgpanda/active.buildinfo.full.json').json()
    active_buildinfo_packages = sorted(
        # Setup packages don't have a buildinfo.
        (package_name, info['package_version'] if info else None)
        for package_name, info in active_buildinfo.items()
    )

    def assert_packages_match_active_buildinfo(package_ids):
        # Compare (name, version) pairs from the API against the buildinfo.
        packages = sorted(map(lambda id_: tuple(id_.split('--')), package_ids))
        assert len(packages) == len(active_buildinfo_packages)
        for package, buildinfo_package in zip(packages, active_buildinfo_packages):
            if buildinfo_package[1] is None:
                # No buildinfo for this package, so we can only compare names.
                assert package[0] == buildinfo_package[0]
            else:
                assert package == buildinfo_package

    for node in dcos_api_session.masters + dcos_api_session.all_slaves:
        package_ids = get_and_validate_package_ids('/pkgpanda/repository/', node)
        active_package_ids = get_and_validate_package_ids('/pkgpanda/active/', node)

        # Everything active must also be present in the repository.
        assert set(active_package_ids) <= set(package_ids)
        assert_packages_match_active_buildinfo(active_package_ids)
Example #8
0
def test_octarine(dcos_api_session, timeout=30):
    """Exercise the octarine proxy in standard and transparent modes,
    resolving the test app both by A record and by SRV record.

    NOTE(review): `timeout` is accepted but never referenced in this body —
    confirm whether it was meant to be forwarded to octarine_runner.
    """
    expanded_config = test_helpers.get_expanded_config()
    if expanded_config.get('security') == 'strict':
        pytest.skip('See: https://jira.mesosphere.com/browse/DCOS-14760')
    # This app binds to port 80. This is only required by the http (not srv)
    # transparent mode test. In transparent mode, we use ".mydcos.directory"
    # to go to localhost, the port attached there is only used to
    # determine which port to send traffic to on localhost. When it
    # reaches the proxy, the port is not used, and a request is made
    # to port 80.
    app, uuid = test_helpers.marathon_test_app(host_port=80)
    # NOTE(review): `uuid` shadows the stdlib module name; harmless here since
    # the module is not used in this function, but a rename would be safer.
    app['acceptedResourceRoles'] = ["slave_public"]
    app['requirePorts'] = True

    with dcos_api_session.marathon.deploy_and_cleanup(app):
        service_points = dcos_api_session.marathon.get_app_service_endpoints(app['id'])
        port_number = service_points[0].port
        # It didn't actually grab port 80 when requirePorts was unset
        assert port_number == app['portDefinitions'][0]["port"]

        app_name = app["id"].strip("/")
        port_name = app['portDefinitions'][0]["name"]
        port_protocol = app['portDefinitions'][0]["protocol"]

        # DNS names for the app: SRV-record form and plain A-record form.
        srv = "_{}._{}._{}.marathon.mesos".format(port_name, app_name, port_protocol)
        addr = "{}.marathon.mesos".format(app_name)
        transparent_suffix = ".mydcos.directory"

        standard_mode = "standard"
        transparent_mode = "transparent"

        # Local bind ports for the transparent-mode proxy instances.
        t_addr_bind = 2508
        t_srv_bind = 2509

        standard_addr = "{}:{}/ping".format(addr, port_number)
        standard_srv = "{}/ping".format(srv)
        transparent_addr = "{}{}:{}/ping".format(addr, transparent_suffix, t_addr_bind)
        transparent_srv = "{}{}:{}/ping".format(srv, transparent_suffix, t_srv_bind)

        # The uuids are different between runs so that they don't have a
        # chance of colliding. They shouldn't anyways, but just to be safe.
        octarine_runner(dcos_api_session, standard_mode, uuid + "1", standard_addr)
        octarine_runner(dcos_api_session, standard_mode, uuid + "2", standard_srv)
        octarine_runner(dcos_api_session, transparent_mode, uuid + "3", transparent_addr, bind_port=t_addr_bind)
        octarine_runner(dcos_api_session, transparent_mode, uuid + "4", transparent_srv, bind_port=t_srv_bind)
Example #9
0
def deploy_test_app_and_check(dcos_api_session, app: dict, test_uuid: str) -> None:
    """This method deploys the test server app and then
    pings its /operating_environment endpoint to retrieve the container
    user running the task.

    In a mesos container, this will be the marathon user
    In a docker container this user comes from the USER setting
    from the app's Dockerfile, which, for the test application
    is the default, root

    Raises:
        Exception: if either test-server endpoint replies with a non-200 status.
    """
    expanded_config = test_helpers.get_expanded_config()
    # NOTE(review): '******' looks like a redacted/scrubbed value in this
    # source — confirm the intended user names before relying on this logic.
    default_os_user = '******' if expanded_config.get('security') == 'strict' else 'root'

    if 'container' in app and app['container']['type'] == 'DOCKER':
        marathon_user = '******'
    else:
        marathon_user = app.get('user', default_os_user)
    with dcos_api_session.marathon.deploy_and_cleanup(app):
        service_points = dcos_api_session.marathon.get_app_service_endpoints(app['id'])
        r = requests.get('http://{}:{}/test_uuid'.format(service_points[0].host, service_points[0].port))
        if r.status_code != 200:
            msg = "Test server replied with non-200 reply: '{0} {1}. "
            msg += "Detailed explanation of the problem: {2}"
            raise Exception(msg.format(r.status_code, r.reason, r.text))

        r_data = r.json()

        # Confirm we reached the instance deployed by this test run.
        assert r_data['test_uuid'] == test_uuid

        r = requests.get('http://{}:{}/operating_environment'.format(
            service_points[0].host,
            service_points[0].port))

        if r.status_code != 200:
            msg = "Test server replied with non-200 reply: '{0} {1}. "
            msg += "Detailed explanation of the problem: {2}"
            raise Exception(msg.format(r.status_code, r.reason, r.text))

        # uid 0 <=> root; any other uid means the task runs unprivileged.
        json_uid = r.json()['uid']
        if marathon_user == 'root':
            assert json_uid == 0, "App running as root should have uid 0."
        else:
            assert json_uid != 0, ("App running as {} should not have uid 0.".format(marathon_user))
def test_mom_installation(dcos_api_session):
    """Test the Cosmos installation of marathon on marathon (MoM)
    """
    config = get_expanded_config()
    if config.get('security') == 'strict':
        pytest.skip('MoM disabled for strict mode')

    # Install MoM via Cosmos and wait for the deployment to finish.
    install_data = dcos_api_session.cosmos.install_package('marathon').json()
    dcos_api_session.marathon.wait_for_deployments_complete()

    # Exactly one package should be listed, and it must be ours.
    packages = dcos_api_session.cosmos.list_packages().json()['packages']
    assert len(packages) == 1 and packages[0]['appId'] == install_data['appId']

    dcos_api_session.cosmos.uninstall_package('marathon', app_id=install_data['appId'])

    # After uninstalling, the package list must be empty again.
    packages = dcos_api_session.cosmos.list_packages().json()['packages']
    assert len(packages) == 0
Example #11
0
def test_if_search_is_working(dcos_api_session):
    """Test if custom set search is working.

    Verifies that a marathon app running on the dcos_api_session can resolve names using
    searching the "search" the dcos_api_session was launched with (if any). It also tests
    that absolute searches still work, and search + things that aren't
    sub-domains fails properly.

    The application being deployed is a simple http server written in python.
    Please check test_server.py for more details.
    """
    # Launch the app and query its /dns_search endpoint.
    app, app_uuid = test_helpers.marathon_test_app()
    with dcos_api_session.marathon.deploy_and_cleanup(app):
        endpoints = dcos_api_session.marathon.get_app_service_endpoints(app['id'])
        url = 'http://{}:{}/dns_search'.format(endpoints[0].host, endpoints[0].port)
        response = requests.get(url)
        if response.status_code != 200:
            msg = "Test server replied with non-200 reply: '{0} {1}. "
            msg += "Detailed explanation of the problem: {2}"
            pytest.fail(msg.format(response.status_code, response.reason, response.text))

        payload = response.json()

        # Make sure we hit the app we expected.
        assert payload['test_uuid'] == app_uuid

        expected_error = {'error': '[Errno -2] Name or service not known'}

        # Check that result matches expectations for this dcos_api_session.
        config = test_helpers.get_expanded_config()
        if config['dns_search']:
            assert payload['search_hit_leader'] in dcos_api_session.masters
            assert payload['always_hit_leader'] in dcos_api_session.masters
            assert payload['always_miss'] == expected_error
        else:  # No dns search, search hit should miss.
            assert payload['search_hit_leader'] == expected_error
            assert payload['always_hit_leader'] in dcos_api_session.masters
            assert payload['always_miss'] == expected_error
Example #12
0
def test_if_search_is_working(dcos_api_session):
    """Test if custom set search is working.

    Verifies that a marathon app running on the dcos_api_session can resolve names using
    searching the "search" the dcos_api_session was launched with (if any). It also tests
    that absolute searches still work, and search + things that aren't
    sub-domains fails properly.

    The application being deployed is a simple http server written in python.
    Please check test_server.py for more details.

    NOTE(review): this function is byte-identical to an earlier copy in this
    file — consider deduplicating.
    """
    # Launch the app
    app_definition, test_uuid = test_helpers.marathon_test_app()
    with dcos_api_session.marathon.deploy_and_cleanup(app_definition):
        service_points = dcos_api_session.marathon.get_app_service_endpoints(app_definition['id'])
        # Get the status
        r = requests.get('http://{}:{}/dns_search'.format(service_points[0].host,
                                                          service_points[0].port))
        if r.status_code != 200:
            msg = "Test server replied with non-200 reply: '{0} {1}. "
            msg += "Detailed explanation of the problem: {2}"
            pytest.fail(msg.format(r.status_code, r.reason, r.text))

        r_data = r.json()

        # Make sure we hit the app we expected
        assert r_data['test_uuid'] == test_uuid

        # Error payload returned by the test server for a failed resolution.
        expected_error = {'error': '[Errno -2] Name or service not known'}

        # Check that result matches expectations for this dcos_api_session
        expanded_config = test_helpers.get_expanded_config()
        if expanded_config['dns_search']:
            assert r_data['search_hit_leader'] in dcos_api_session.masters
            assert r_data['always_hit_leader'] in dcos_api_session.masters
            assert r_data['always_miss'] == expected_error
        else:  # No dns search, search hit should miss.
            assert r_data['search_hit_leader'] == expected_error
            assert r_data['always_hit_leader'] in dcos_api_session.masters
            assert r_data['always_miss'] == expected_error
Example #13
0
def test_task_metrics_metadata(dcos_api_session):
    """Test that task metrics have expected metadata/labels"""
    config = get_expanded_config()
    if config.get('security') == 'strict':
        pytest.skip('MoM disabled for strict mode')
    with deploy_and_cleanup_dcos_package(dcos_api_session, 'marathon',
                                         '1.6.535', 'marathon-user'):
        node = get_task_hostname(dcos_api_session, 'marathon', 'marathon-user')

        @retrying.retry(wait_fixed=STD_INTERVAL,
                        stop_max_delay=METRICS_WAITTIME)
        def check_metrics_metadata():
            prom_text = get_metrics_prom(dcos_api_session, node).text
            for sample in prom_text.splitlines():
                # Skip Prometheus comment/metadata lines.
                if '#' in sample:
                    continue
                if 'task_name="marathon-user"' in sample:
                    assert 'service_name="marathon"' in sample
                    # check for whitelisted label
                    assert 'DCOS_SERVICE_NAME="marathon-user"' in sample

        check_metrics_metadata()
Example #14
0
def test_executor_metrics_metadata(dcos_api_session):
    """Test that executor metrics have expected metadata/labels"""
    config = get_expanded_config()
    if config.get('security') == 'strict':
        pytest.skip('Framework disabled for strict mode')

    with deploy_and_cleanup_dcos_package(dcos_api_session, 'hello-world', '2.2.0-0.42.2', 'hello-world'):
        node = get_task_hostname(dcos_api_session, 'marathon', 'hello-world')

        @retrying.retry(wait_fixed=STD_INTERVAL, stop_max_delay=METRICS_WAITTIME)
        def check_executor_metrics_metadata():
            prom_text = get_metrics_prom(dcos_api_session, node).text
            for sample in prom_text.splitlines():
                # Skip Prometheus comment/metadata lines.
                if '#' in sample:
                    continue
                # ignore metrics from hello-world task started by marathon by checking
                # for absence of 'marathon' string.
                if 'cpus_nr_periods' in sample and 'marathon' not in sample:
                    assert 'service_name="hello-world"' in sample
                    assert 'task_name=""' in sample  # this is an executor, not a task
                    # hello-world executors can be named "hello" or "world"
                    assert ('executor_name="hello"' in sample or 'executor_name="world"' in sample)

        check_executor_metrics_metadata()
Example #15
0
def test_fault_domain(dcos_api_session: DcosApiSession) -> None:
    """Verify that the fault-domain region/zone advertised by the Mesos
    leader and every agent agrees with the master's --domain flag.

    NOTE(review): this is a near-duplicate (type-annotated, reformatted) of
    an earlier test_fault_domain in this file — consider deduplicating.
    """
    expanded_config = test_helpers.get_expanded_config()
    if expanded_config['fault_domain_enabled'] == 'false':
        pytest.skip('fault domain is not set')
    master_ip = dcos_api_session.masters[0]
    r = dcos_api_session.get('/state', host=master_ip, port=5050)
    assert r.status_code == 200
    state = r.json()

    # check flags and get the domain parameters mesos master was started with.
    assert 'flags' in state, 'missing flags in state json'
    assert 'domain' in state['flags'], 'missing domain in state json flags'
    cli_flag = json.loads(state['flags']['domain'])
    expected_region, expected_zone = get_region_zone(cli_flag)

    # check master top level keys
    assert 'leader_info' in state, 'leader_info is missing in state json'
    assert 'domain' in state['leader_info'], 'domain is missing in state json'
    leader_region, leader_zone = get_region_zone(
        state['leader_info']['domain'])

    assert leader_region == expected_region, 'expect region {}. Got {}'.format(
        expected_region, leader_region)
    assert leader_zone == expected_zone, 'expect zone {}. Got {}'.format(
        expected_zone, leader_zone)

    for agent in state['slaves']:
        assert 'domain' in agent, 'missing domain field for agent. {}'.format(
            agent)
        agent_region, agent_zone = get_region_zone(agent['domain'])

        assert agent_region == expected_region, 'expect region {}. Got {}'.format(
            expected_region, agent_region)

        # agent_zone might be different on agents, so we just make sure it's a sane value
        assert agent_zone, 'agent_zone cannot be empty'
Example #16
0
def test_profile_symlink():
    """Assert the DC/OS profile script is symlinked from the correct source."""
    config = get_expanded_config()
    link_path = config['profile_symlink_target']
    expected_source = config['profile_symlink_source']
    # Read the symlink itself (not its target's contents) and compare.
    assert expected_source == os.readlink(link_path)
Example #17
0
def test_move_external_volume_to_new_agent(dcos_api_session):
    """Test that an external volume is successfully attached to a new agent.

    If the dcos_api_session has only one agent, the volume will be detached and
    reattached to the same agent.

    """
    expanded_config = get_expanded_config()
    if not (expanded_config['provider'] == 'aws' or expanded_config['platform'] == 'aws'):
        pytest.skip('Must be run in an AWS environment!')

    if expanded_config.get('security') == 'strict':
        pytest.skip('See: https://jira.mesosphere.com/browse/DCOS_OSS-4922')

    # Write on the first agent, then read back on the last one (these may be
    # the same host in a single-agent cluster).
    hosts = dcos_api_session.slaves[0], dcos_api_session.slaves[-1]
    test_uuid = uuid.uuid4().hex
    test_label = 'integration-test-move-external-volume-{}'.format(test_uuid)
    mesos_volume_path = 'volume'
    docker_volume_path = '/volume'
    # Shared app skeleton: one instance mounting a rexray-provided external volume.
    base_app = {
        'mem': 32,
        'cpus': 0.1,
        'instances': 1,
        'container': {
            'volumes': [{
                'mode': 'RW',
                'external': {
                    'name': test_label,
                    'provider': 'dvdi',
                    'options': {'dvdi/driver': 'rexray'}
                }
            }]
        }
    }

    # Writer: a MESOS container that checks the volume is empty, then writes
    # the test UUID into it.
    write_app = copy.deepcopy(base_app)
    write_app.update({
        'id': '/{}/write'.format(test_label),
        'cmd': (
            # Check that the volume is empty.
            '[ $(ls -A {volume_path}/ | grep -v --line-regexp "lost+found" | wc -l) -eq 0 ] && '
            # Write the test UUID to a file.
            'echo "{test_uuid}" >> {volume_path}/test && '
            'while true; do sleep 1000; done'
        ).format(test_uuid=test_uuid, volume_path=mesos_volume_path),
        'constraints': [['hostname', 'LIKE', hosts[0]]],
    })
    write_app['container']['type'] = 'MESOS'
    write_app['container']['volumes'][0]['containerPath'] = mesos_volume_path
    write_app['container']['volumes'][0]['external']['size'] = 1

    # Reader: a DOCKER container that diffs the file against the expected UUID.
    read_app = copy.deepcopy(base_app)
    read_app.update({
        'id': '/{}/read'.format(test_label),
        'cmd': (
            # Diff the file and the UUID.
            'echo "{test_uuid}" | diff - {volume_path}/test && '
            'while true; do sleep 1000; done'
        ).format(test_uuid=test_uuid, volume_path=docker_volume_path),
        'constraints': [['hostname', 'LIKE', hosts[1]]],
    })
    read_app['container'].update({
        'type': 'DOCKER',
        'docker': {
            'image': 'busybox',
            'network': 'HOST',
        }
    })
    read_app['container']['volumes'][0]['containerPath'] = docker_volume_path

    # Volume operations can take several minutes.
    timeout = 600

    deploy_kwargs = {
        'check_health': False,
        # A volume might fail to attach because EC2. We can tolerate that and retry.
        'ignore_failed_tasks': True,
        'timeout': timeout
    }

    try:
        with dcos_api_session.marathon.deploy_and_cleanup(write_app, **deploy_kwargs):
            logging.info('Successfully wrote to volume')
        with dcos_api_session.marathon.deploy_and_cleanup(read_app, **deploy_kwargs):
            logging.info('Successfully read from volume')
    finally:
        # Always delete the EBS volume, even if deploy/read failed.
        logging.info('Deleting volume: ' + test_label)
        delete_cmd = \
            "/opt/mesosphere/bin/dcos-shell python " \
            "/opt/mesosphere/active/dcos-integration-test/util/delete_ec2_volume.py {}".format(test_label)
        delete_job = {
            'id': 'delete-volume-' + test_uuid,
            'run': {
                'cpus': .1,
                'mem': 128,
                'disk': 0,
                'cmd': delete_cmd}}
        try:
            # We use a metronome job to work around the `aws-deploy` integration tests where the master doesn't have
            # volume permissions so all volume actions need to be performed from the agents.
            dcos_api_session.metronome_one_off(delete_job, timeout=timeout)
        except Exception as ex:
            raise Exception('Failed to clean up volume {}: {}'.format(test_label, ex)) from ex
Example #18
0
def test_move_external_volume_to_new_agent(dcos_api_session):
    """Test that an external volume is successfully attached to a new agent.

    If the dcos_api_session has only one agent, the volume will be detached and
    reattached to the same agent.

    NOTE(review): this is a reformatted duplicate of an earlier copy of this
    test in this file — consider deduplicating.
    """
    expanded_config = get_expanded_config()
    if not (expanded_config['provider'] == 'aws'
            or expanded_config['platform'] == 'aws'):
        pytest.skip('Must be run in an AWS environment!')

    if expanded_config.get('security') == 'strict':
        pytest.skip('See: https://jira.mesosphere.com/browse/DCOS_OSS-4922')

    # Write on the first agent, read back on the last (may be the same host).
    hosts = dcos_api_session.slaves[0], dcos_api_session.slaves[-1]
    test_uuid = uuid.uuid4().hex
    test_label = 'integration-test-move-external-volume-{}'.format(test_uuid)
    mesos_volume_path = 'volume'
    docker_volume_path = '/volume'
    # Shared app skeleton: one instance mounting a rexray-provided external volume.
    base_app = {
        'mem': 32,
        'cpus': 0.1,
        'instances': 1,
        'container': {
            'volumes': [{
                'mode': 'RW',
                'external': {
                    'name': test_label,
                    'provider': 'dvdi',
                    'options': {
                        'dvdi/driver': 'rexray'
                    }
                }
            }]
        }
    }

    # Writer: a MESOS container that checks the volume is empty, then writes
    # the test UUID into it.
    write_app = copy.deepcopy(base_app)
    write_app.update({
        'id':
        '/{}/write'.format(test_label),
        'cmd': (
            # Check that the volume is empty.
            '[ $(ls -A {volume_path}/ | grep -v --line-regexp "lost+found" | wc -l) -eq 0 ] && '
            # Write the test UUID to a file.
            'echo "{test_uuid}" >> {volume_path}/test && '
            'while true; do sleep 1000; done').format(
                test_uuid=test_uuid, volume_path=mesos_volume_path),
        'constraints': [['hostname', 'LIKE', hosts[0]]],
    })
    write_app['container']['type'] = 'MESOS'
    write_app['container']['volumes'][0]['containerPath'] = mesos_volume_path
    write_app['container']['volumes'][0]['external']['size'] = 1

    # Reader: a DOCKER container that diffs the file against the expected UUID.
    read_app = copy.deepcopy(base_app)
    read_app.update({
        'id':
        '/{}/read'.format(test_label),
        'cmd': (
            # Diff the file and the UUID.
            'echo "{test_uuid}" | diff - {volume_path}/test && '
            'while true; do sleep 1000; done').format(
                test_uuid=test_uuid, volume_path=docker_volume_path),
        'constraints': [['hostname', 'LIKE', hosts[1]]],
    })
    read_app['container'].update({
        'type': 'DOCKER',
        'docker': {
            'image': 'busybox',
            'network': 'HOST',
        }
    })
    read_app['container']['volumes'][0]['containerPath'] = docker_volume_path

    # Volume operations can take several minutes.
    timeout = 600

    deploy_kwargs = {
        'check_health': False,
        # A volume might fail to attach because EC2. We can tolerate that and retry.
        'ignore_failed_tasks': True,
        'timeout': timeout
    }

    try:
        with dcos_api_session.marathon.deploy_and_cleanup(
                write_app, **deploy_kwargs):
            logging.info('Successfully wrote to volume')
        with dcos_api_session.marathon.deploy_and_cleanup(
                read_app, **deploy_kwargs):
            logging.info('Successfully read from volume')
    finally:
        # Always delete the EBS volume, even if deploy/read failed.
        logging.info('Deleting volume: ' + test_label)
        delete_cmd = \
            "/opt/mesosphere/bin/dcos-shell python " \
            "/opt/mesosphere/active/dcos-integration-test/util/delete_ec2_volume.py {}".format(test_label)
        delete_job = {
            'id': 'delete-volume-' + test_uuid,
            'run': {
                'cpus': .1,
                'mem': 128,
                'disk': 0,
                'cmd': delete_cmd
            }
        }
        try:
            # We use a metronome job to work around the `aws-deploy` integration tests where the master doesn't have
            # volume permissions so all volume actions need to be performed from the agents.
            dcos_api_session.metronome_one_off(delete_job, timeout=timeout)
        except Exception as ex:
            raise Exception('Failed to clean up volume {}: {}'.format(
                test_label, ex)) from ex
Example #19
0
def test_executor_uses_domain_socket(dcos_api_session: DcosApiSession) -> None:
    """
    This test validates that by default executors connect with the agent over domain sockets.

    The test launches a Marathon app with a health check which validates that
    the `mesos-executor` process of the task is connected to the agent socket.
    We do not validate that any actual data is passed over the socket
    connection. Once the app is healthy the test has succeeded.
    """
    expanded_config = test_helpers.get_expanded_config()
    if expanded_config.get('security') == 'strict':
        pytest.skip('Cannot detect domain sockets with EE strict mode enabled')

    task_id = 'domain-socket-{}'.format(uuid.uuid4())

    # Health-check script (runs inside the task's container): verifies via
    # `ss` that the executor process has socket peers related to the agent's
    # executor socket. This is a runtime string — do not edit casually.
    check = textwrap.dedent('''\
        #!/bin/bash

        set -o nounset -o pipefail
        set -ex

        export PATH=/usr/sbin/:$PATH
        MESOS_EXECUTORS_SOCK="/var/run/mesos/mesos-executors.sock"

        # In the container's PID namespace the containerizer will have pid=1. Since the
        # first process it launches is the executor, the executor has pid=2.
        EXECUTOR_PID=2
        grep -q '^mesos-executor' /proc/$EXECUTOR_PID/cmdline || exit 2

        declare -i socket_connections
        socket_connections=0

        for peer in $(ss -xp | grep "pid=$EXECUTOR_PID" | awk '{print $8}'); do
            # We cannot see the mesos-agent process, but can make sure
            # the executor's socket is related to the agent socket.
            if ss -xp | grep "$peer" | grep -q "$MESOS_EXECUTORS_SOCK"; then
                ((socket_connections+=1))
            fi
        done

        if [ $socket_connections -ne 2 ]; then
            echo "expected 2 socket connections, got $socket_connections"
            exit 1
        fi''')

    # Marathon app definition: a trivially sleeping MESOS container whose
    # COMMAND health check runs the script above.
    app = {
        "id":
        task_id,
        "cpus":
        0.1,
        "mem":
        32,
        "disk":
        32,
        "cmd":
        "sleep 10000",
        "container": {
            "type": "MESOS",
            "volumes": []
        },
        "instances":
        1,
        "healthChecks": [{
            "gracePeriodSeconds": 5,
            "intervalSeconds": 60,
            "maxConsecutiveFailures": 1,
            "timeoutSeconds": 20,
            "delaySeconds": 1,
            "protocol": "COMMAND",
            "command": {
                "value": check
            }
        }],
    }

    with dcos_api_session.marathon.deploy_and_cleanup(app,
                                                      check_health=False,
                                                      timeout=60):
        # Retry collecting the health check result since its availability might be delayed.
        @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
        def assert_app_is_healthy() -> None:
            assert dcos_api_session.marathon.check_app_instances(
                app_id=task_id,
                app_instances=1,
                check_health=True,
                ignore_failed_tasks=False)

        assert_app_is_healthy()
Example #20
0
def test_blkio_stats(dcos_api_session: DcosApiSession) -> None:
    """Verify cgroups blkio throttling statistics are collected for a task.

    Deploys a Marathon app that performs a known number of disk writes via
    `dd`, locates the agent the task landed on via the master state, and
    checks that the agent's /monitor/statistics endpoint reports blkio
    throttling counters at least as large as the writes the app performed.
    """
    expanded_config = test_helpers.get_expanded_config()
    # blkio stats are known-broken on Azure; see the linked ticket.
    if expanded_config['provider'] == 'azure' or expanded_config.get(
            'platform') == 'azure':
        pytest.skip('See: https://jira.mesosphere.com/browse/DCOS-49023')

    # Launch a Marathon application to do some disk writes, and then verify that
    # the cgroups blkio statistics of the application can be correctly retrieved.
    app, test_uuid = test_helpers.marathon_test_app(
        container_type=marathon.Container.MESOS)
    app_id = 'integration-test-{}'.format(test_uuid)

    # The application will generate a 10k file with 10 disk writes.
    #
    # TODO(qianzhang): In some old platforms (CentOS 6 and Ubuntu 14),
    # the first disk write of a blkio cgroup will always be missed in
    # the blkio throttling statistics, so here we run two `dd` commands,
    # the first one which does only one disk write will be missed on
    # those platforms, and the second one will be recorded in the blkio
    # throttling statistics. When we drop the CentOS 6 and Ubuntu 14
    # support in future, we should remove the first `dd` command.
    marker_file = 'marker'
    app['cmd'] = ('dd if=/dev/zero of=file bs=1024 count=1 oflag=dsync && '
                  'dd if=/dev/zero of=file bs=1024 count=10 oflag=dsync && '
                  'echo -n done > {} && sleep 1000').format(marker_file)

    with dcos_api_session.marathon.deploy_and_cleanup(app, check_health=False):
        marathon_framework_id = dcos_api_session.marathon.get(
            '/v2/info').json()['frameworkId']
        app_task = dcos_api_session.marathon.get('/v2/apps/{}/tasks'.format(
            app['id'])).json()['tasks'][0]

        # Wait up to 10 seconds for the marker file to appear which
        # indicates the disk writes via `dd` command are done.
        @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
        def get_marker_file_content() -> Any:
            return dcos_api_session.mesos_sandbox_file(app_task['slaveId'],
                                                       marathon_framework_id,
                                                       app_task['id'],
                                                       marker_file)

        assert get_marker_file_content() == 'done'

        # Fetch the Mesos master state
        master_ip = dcos_api_session.masters[0]
        r = dcos_api_session.get('/state', host=master_ip, port=5050)
        assert r.status_code == 200
        state = r.json()

        # Find the agent_id from master state
        agent_id = None
        for framework in state['frameworks']:
            for task in framework['tasks']:
                if app_id in task['id']:
                    agent_id = task['slave_id']
        assert agent_id is not None, 'Agent ID not found for instance of app_id {}'.format(
            app_id)

        # Find hostname from agent_id
        agent_hostname = None
        for agent in state['slaves']:
            if agent['id'] == agent_id:
                agent_hostname = agent['hostname']
        assert agent_hostname is not None, 'Agent hostname not found for agent_id {}'.format(
            agent_id)
        logging.debug('Located %s on agent %s', app_id, agent_hostname)

        # Fetch the Mesos agent statistics
        r = dcos_api_session.get('/monitor/statistics',
                                 host=agent_hostname,
                                 port=5051)
        assert r.status_code == 200
        stats = r.json()

        total_io_serviced = None
        total_io_service_bytes = None
        for stat in stats:
            # Find the statistic for the Marathon application that we deployed. Since what that
            # Marathon application launched is a Mesos command task (i.e., using Mesos built-in
            # command executor), the executor ID will be same as the task ID, so if we find the
            # `app_id` in an executor ID of a statistic, that must be the statistic entry
            # corresponding to the application that we deployed.
            if app_id in stat['executor_id']:
                # We only care about the blkio throttle statistics but not the blkio cfq statistics,
                # because in the environment where the disk IO scheduler is not `cfq`, all the cfq
                # statistics may be 0.
                throttle_stats = stat['statistics']['blkio_statistics'][
                    'throttling']
                for throttle_stat in throttle_stats:
                    # The entry without a 'device' key is presumably the
                    # aggregate over all devices (hence the `total_*` names)
                    # — TODO confirm against the Mesos blkio stats schema.
                    if 'device' not in throttle_stat:
                        total_io_serviced = throttle_stat['io_serviced'][0][
                            'value']
                        total_io_service_bytes = throttle_stat[
                            'io_service_bytes'][0]['value']

        assert total_io_serviced is not None, (
            'Total blkio throttling IO serviced not found '
            'for app_id {}'.format(app_id))
        assert total_io_service_bytes is not None, (
            'Total blkio throttling IO service bytes '
            'not found for app_id {}'.format(app_id))
        # We expect the statistics retrieved from Mesos agent are equal or greater than what we
        # did with the `dd` command (i.e., 10 and 10240), because:
        #   1. Besides the disk writes done by the `dd` command, the statistics may also include
        #      some disk reads, e.g., to load the necessary executable binary and libraries.
        #   2. In the environment where RAID is enabled, there may be multiple disk writes to
        #      different disks for a single `dd` write.
        assert int(total_io_serviced) >= 10, (
            'Total blkio throttling IO serviced for app_id {} '
            'are less than 10'.format(app_id))
        assert int(total_io_service_bytes) >= 10240, (
            'Total blkio throttling IO service bytes for '
            'app_id {} are less than 10240'.format(app_id))
def skip_in_downstream():
    """Skip the whole module on downstream (EE) clusters.

    The presence of a 'security' key in the expanded configuration marks a
    downstream build, where these upstream-specific user management tests
    do not apply.
    """
    config = get_expanded_config()
    if 'security' not in config:
        return
    pytest.skip('Skip upstream-specific user management tests',
                allow_module_level=True)
Example #22
0
def test_expanded_config():
    """Check that the expanded DC/OS configuration exposes expected keys."""
    config = get_expanded_config()
    # Calculated parameters should be present
    assert 'master_quorum' in config
    # Defined and used parameters should be present
    assert 'marathon_port' in config
Example #23
0
def test_standalone_container_metrics(dcos_api_session):
    """
    An operator should be able to launch a standalone container using the
    LAUNCH_CONTAINER call of the agent operator API. Additionally, if the
    process running within the standalone container emits statsd metrics, they
    should be accessible via the DC/OS metrics API.
    """
    expanded_config = get_expanded_config()
    if expanded_config.get('security') == 'strict':
        reason = (
            'Only resource providers are authorized to launch standalone '
            'containers in strict mode. See DCOS-42325.')
        pytest.skip(reason)
    # Fetch the mesos master state to get an agent ID
    master_ip = dcos_api_session.masters[0]
    r = dcos_api_session.get('/state', host=master_ip, port=5050)
    assert r.status_code == 200
    state = r.json()

    # Find hostname and ID of an agent
    assert len(state['slaves']) > 0, 'No agents found in master state'
    agent_hostname = state['slaves'][0]['hostname']
    agent_id = state['slaves'][0]['id']
    logging.debug('Selected agent %s at %s', agent_id, agent_hostname)

    # Helper: POST a JSON operator-API request to the selected agent's
    # v1 endpoint and return the raw response.
    def _post_agent(json):
        headers = {
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }

        r = dcos_api_session.post('/api/v1',
                                  host=agent_hostname,
                                  port=5051,
                                  headers=headers,
                                  json=json,
                                  data=None,
                                  stream=False)
        return r

    # Prepare container ID data
    container_id = {'value': 'test-standalone-%s' % str(uuid.uuid4())}

    # Launch standalone container. The command for this container executes a
    # binary installed with DC/OS which will emit statsd metrics.
    launch_data = {
        'type': 'LAUNCH_CONTAINER',
        'launch_container': {
            'command': {
                'value':
                './statsd-emitter',
                'uris': [{
                    'value':
                    'https://downloads.mesosphere.com/dcos-metrics/1.11.0/statsd-emitter',
                    'executable': True
                }]
            },
            'container_id':
            container_id,
            'resources': [{
                'name': 'cpus',
                'scalar': {
                    'value': 0.2
                },
                'type': 'SCALAR'
            }, {
                'name': 'mem',
                'scalar': {
                    'value': 64.0
                },
                'type': 'SCALAR'
            }, {
                'name': 'disk',
                'scalar': {
                    'value': 1024.0
                },
                'type': 'SCALAR'
            }],
            'container': {
                'type': 'MESOS'
            }
        }
    }

    # There is a short delay between the container starting and metrics becoming
    # available via the metrics service. Because of this, we wait up to 5
    # minutes for these metrics to appear before throwing an exception.
    # A 204 means "no content yet", so keep retrying on it.
    def _should_retry_metrics_fetch(response):
        return response.status_code == 204

    @retrying.retry(wait_fixed=METRICS_INTERVAL,
                    stop_max_delay=METRICS_WAITTIME,
                    retry_on_result=_should_retry_metrics_fetch,
                    retry_on_exception=lambda x: False)
    def _get_metrics():
        master_response = dcos_api_session.get(
            '/system/v1/agent/%s/metrics/v0/containers/%s/app' %
            (agent_id, container_id['value']),
            host=master_ip)
        return master_response

    r = _post_agent(launch_data)
    assert r.status_code == 200, 'Received unexpected status code when launching standalone container'

    try:
        logging.debug(
            'Successfully created standalone container with container ID %s',
            container_id['value'])

        # Verify that the standalone container's metrics are being collected
        r = _get_metrics()
        assert r.status_code == 200, 'Received unexpected status code when fetching standalone container metrics'

        metrics_response = r.json()

        assert 'datapoints' in metrics_response, 'got {}'.format(
            metrics_response)

        uptime_dp = None
        for dp in metrics_response['datapoints']:
            if dp['name'] == 'statsd_tester.time.uptime':
                uptime_dp = dp
                break

        # If this metric is missing, statsd-emitter's metrics were not received
        assert uptime_dp is not None, 'got {}'.format(metrics_response)

        datapoint_keys = ['name', 'value', 'unit', 'timestamp', 'tags']
        for k in datapoint_keys:
            assert k in uptime_dp, 'got {}'.format(uptime_dp)

        expected_tag_names = {
            'dcos_cluster_id', 'test_tag_key', 'dcos_cluster_name', 'host'
        }
        check_tags(uptime_dp['tags'], expected_tag_names)
        assert uptime_dp['tags'][
            'test_tag_key'] == 'test_tag_value', 'got {}'.format(uptime_dp)
        assert uptime_dp['value'] > 0

        assert 'dimensions' in metrics_response, 'got {}'.format(
            metrics_response)
        assert metrics_response['dimensions']['container_id'] == container_id[
            'value']
    finally:
        # Clean up the standalone container
        kill_data = {
            'type': 'KILL_CONTAINER',
            'kill_container': {
                'container_id': container_id
            }
        }

        _post_agent(kill_data)
Example #24
0
def test_if_marathon_app_can_be_deployed_with_nfs_csi_volume(
        dcos_api_session: DcosApiSession) -> None:
    """Marathon app deployment integration test using an NFS CSI volume.

    This test verifies that a Marathon app can be deployed which attaches to
    an NFS volume provided by the NFS CSI plugin. In order to accomplish this,
    we must first set up an NFS share on one agent.
    """

    # We will run an NFS server on one agent and an app on another agent to
    # verify CSI volume functionality.
    if len(dcos_api_session.slaves) < 2:
        pytest.skip("CSI Volume Tests require a minimum of two agents.")

    expanded_config = test_helpers.get_expanded_config()
    if expanded_config.get('security') == 'strict':
        pytest.skip(
            'Cannot setup NFS server as root user with EE strict mode enabled')

    test_uuid = uuid.uuid4().hex

    # hosts[0] serves the NFS share; hosts[1] runs the app mounting it.
    hosts = dcos_api_session.slaves[0], dcos_api_session.slaves[1]

    # A helper to run a Metronome job as root to clean up the NFS share on an agent.
    # We define this here so that it can be used during error handling.
    def cleanup_nfs() -> None:
        cleanup_command = """
            sudo systemctl stop nfs-server && \
            echo '' | sudo tee /etc/exports && \
            sudo systemctl restart nfs-utils && \
            sudo exportfs -arv && \
            sudo rm -rf /var/lib/dcos-nfs-shares/test-volume-001
        """

        cleanup_job = {
            'description': 'Clean up NFS share',
            'id': 'nfs-share-cleanup-{}'.format(test_uuid),
            'run': {
                'cmd': cleanup_command,
                'cpus': 0.5,
                'mem': 256,
                'disk': 32,
                'user': '******',
                'restart': {
                    'policy': 'ON_FAILURE'
                },
                'placement': {
                    'constraints': [{
                        'attribute': '@hostname',
                        'operator': 'LIKE',
                        'value': hosts[0]
                    }]
                }
            }
        }

        dcos_api_session.metronome_one_off(cleanup_job)

    # Run a Metronome job as root to set up the NFS share on an agent.
    command = """sudo mkdir -p /var/lib/dcos-nfs-shares/test-volume-001 && \
        sudo chown -R nobody: /var/lib/dcos-nfs-shares/test-volume-001 && \
        sudo chmod 777 /var/lib/dcos-nfs-shares/test-volume-001 && \
        echo '/var/lib/dcos-nfs-shares/test-volume-001 *(rw,sync)' | sudo tee /etc/exports && \
        sudo systemctl restart nfs-utils && \
        sudo exportfs -arv && \
        sudo systemctl start nfs-server && \
        sudo systemctl enable nfs-server
    """

    setup_job = {
        'description': 'Set up NFS share',
        'id': 'nfs-share-setup-{}'.format(test_uuid),
        'run': {
            'cmd': command,
            'cpus': 0.5,
            'mem': 256,
            'disk': 32,
            'user': '******',
            'restart': {
                'policy': 'ON_FAILURE'
            },
            'placement': {
                'constraints': [{
                    'attribute': '@hostname',
                    'operator': 'LIKE',
                    'value': hosts[0]
                }]
            }
        }
    }

    dcos_api_session.metronome_one_off(setup_job)

    # Create an app which writes to the NFS volume.
    app = {
        'id':
        'csi-nfs-write-app-{}'.format(test_uuid),
        'instances':
        1,
        'cpus':
        0.5,
        'mem':
        256,
        'cmd':
        'echo some-stuff > test-volume-dir/output && sleep 999999',
        'user':
        '******',
        'container': {
            'type':
            'MESOS',
            'volumes': [{
                'mode': 'rw',
                'containerPath': 'test-volume-dir',
                'external': {
                    'provider': 'csi',
                    'name': 'test-volume-001',
                    'options': {
                        'pluginName': 'nfs.csi.k8s.io',
                        'capability': {
                            'accessType': 'mount',
                            'accessMode': 'MULTI_NODE_MULTI_WRITER',
                            'fsType': 'nfs'
                        },
                        'volumeContext': {
                            'server': hosts[0],
                            'share': '/var/lib/dcos-nfs-shares/test-volume-001'
                        }
                    }
                }
            }]
        },
        'constraints': [['hostname', 'LIKE', hosts[1]]],
        'healthChecks': [{
            'protocol': 'COMMAND',
            'command': {
                'value': 'test `cat test-volume-dir/output` = some-stuff'
            },
            'gracePeriodSeconds': 5,
            'intervalSeconds': 10,
            'timeoutSeconds': 10,
            'maxConsecutiveFailures': 3
        }]
    }

    # The previous `except Exception: raise` clause was a no-op re-raise;
    # try/finally alone guarantees the NFS share is cleaned up whether or
    # not the deployment succeeds, without swallowing or altering errors.
    try:
        with dcos_api_session.marathon.deploy_and_cleanup(app):
            # Trivial app: if it deploys (health check reads the file the
            # app wrote through the volume), there is nothing else to check.
            pass
    finally:
        cleanup_nfs()
Example #25
0
def test_signal_service(dcos_api_session: DcosApiSession) -> None:
    """
    signal-service runs on an hourly timer, this test runs it as a one-off
    and pushes the results to the test_server app for easy retrieval

    When this test fails due to `dcos-checks-poststart-service-unhealthy`,
    consider that the issue may be due to timeouts which are too low.  See
    https://jira.mesosphere.com/browse/DCOS-22458 for more information.
    """
    dcos_version = os.getenv("DCOS_VERSION", "")
    # Overridden to 'enterprise' below if the EE-only extra config exists.
    variant = 'open'

    signal_config_path = Path('/opt/mesosphere/etc/dcos-signal-config.json')
    signal_config = json.loads(signal_config_path.read_text())
    signal_extra_path = Path('/opt/mesosphere/etc/dcos-signal-extra.json')
    try:
        signal_config.update(json.loads(signal_extra_path.read_text()))
        variant = 'enterprise'
    except FileNotFoundError:
        # the file only exists on EE clusters so just skip if it's not there
        pass

    customer_key = signal_config.get('customer_key', '')
    cluster_id = Path('/var/lib/dcos/cluster-id').read_text().strip()

    # sudo is required to read /run/dcos/etc/signal-service/service_account.json
    env = os.environ.copy()
    signal_cmd = ["sudo", "-E", "/opt/mesosphere/bin/dcos-signal", "-test"]
    # universal_newlines means utf-8
    with subprocess.Popen(signal_cmd, stdout=subprocess.PIPE, universal_newlines=True, env=env) as p:
        signal_results = p.stdout.read()  # type: ignore

    r_data = json.loads(signal_results)

    # Collect the dcos-diagnostics output that `dcos-signal` uses to determine
    # whether or not there are failed units.
    resp = dcos_api_session.get('/system/health/v1/report?cache=0')
    # We expect reading the health report to succeed.
    resp.raise_for_status()
    # Parse the response into JSON.
    health_report = resp.json()
    # Reformat the /health json into the expected output format for dcos-signal.
    units_health = {}
    for unit, unit_health in health_report["Units"].items():
        unhealthy = 0
        for node_health in unit_health["Nodes"]:
            for output_unit, output in node_health["Output"].items():
                if unit != output_unit:
                    # This is the output of some unrelated unit, ignore.
                    continue
                if output == "":
                    # This unit is healthy on this node.
                    pass
                else:
                    # This unit is unhealthy on this node.
                    unhealthy += 1
        prefix = "health-unit-{}".format(unit.replace('.', '-'))
        units_health.update({
            "{}-total".format(prefix): len(unit_health["Nodes"]),
            "{}-unhealthy".format(prefix): unhealthy,
        })

    # Expected payload skeleton for each of the three signal "tracks";
    # the shared generic properties are merged in below.
    exp_data = {
        'diagnostics': {
            'event': 'health',
            'anonymousId': cluster_id,
            'properties': units_health,
        },
        'cosmos': {
            'event': 'package_list',
            'anonymousId': cluster_id,
            'properties': {}
        },
        'mesos': {
            'event': 'mesos_track',
            'anonymousId': cluster_id,
            'properties': {}
        }
    }

    # The userId field is only emitted when a customer key is configured.
    if customer_key != '':
        exp_data['diagnostics']['userId'] = customer_key

    dcos_config = get_expanded_config()
    # Generic properties which are the same between all tracks
    generic_properties = {
        'licenseId': '',
        'platform': dcos_config['platform'],
        'provider': dcos_config['provider'],
        'source': 'cluster',
        'clusterId': cluster_id,
        'customerKey': customer_key,
        'environmentVersion': dcos_version,
        'variant': variant
    }

    # Insert the generic property data which is the same between all signal tracks
    exp_data['diagnostics']['properties'].update(generic_properties)   # type: ignore
    exp_data['cosmos']['properties'].update(generic_properties)  # type: ignore
    exp_data['mesos']['properties'].update(generic_properties)  # type: ignore

    # Check the entire hash of diagnostics data
    if r_data['diagnostics'] != exp_data['diagnostics']:
        # The optional second argument to `assert` is an error message that
        # appears to get truncated in the output. As such, we log the output
        # instead.
        log.error("Cluster is unhealthy: {}".format(
            json.dumps(health_report, indent=4, sort_keys=True)))
        assert r_data['diagnostics'] == exp_data['diagnostics']

    # Check a subset of things regarding Mesos that we can logically check for
    framework_names = [x['name'] for x in r_data['mesos']['properties']['frameworks']]
    assert 'marathon' in framework_names
    assert 'metronome' in framework_names

    # There are no packages installed by default on the integration test, ensure the key exists
    assert len(r_data['cosmos']['properties']['package_list']) == 0
Example #26
0
def lb_enabled():
    """Return True when the load balancer is enabled in the cluster config."""
    config = test_helpers.get_expanded_config()
    return config['enable_lb'] == 'true'
Example #27
0
def lb_enabled():
    """Tell whether the 'enable_lb' flag is set in the expanded DC/OS config."""
    return test_helpers.get_expanded_config()['enable_lb'] == 'true'
Example #28
0
def test_signal_service(dcos_api_session):
    """
    signal-service runs on an hourly timer, this test runs it as a one-off
    and pushes the results to the test_server app for easy retrieval

    When this test fails due to `dcos-checks-poststart-service-unhealthy`,
    consider that the issue may be due to check timeouts which are too low.
    """
    # This is due to caching done by dcos-diagnostics / Signal service
    # We're going to remove this soon: https://mesosphere.atlassian.net/browse/DCOS-9050
    dcos_version = os.environ["DCOS_VERSION"]
    with open('/opt/mesosphere/etc/dcos-signal-config.json', 'r') as f:
        signal_config_data = json.load(f)
    customer_key = signal_config_data.get('customer_key', '')
    enabled = signal_config_data.get('enabled', 'false')
    with open('/var/lib/dcos/cluster-id', 'r') as f:
        cluster_id = f.read().strip()

    # Skip entirely when telemetry is disabled in the on-host config.
    if enabled == 'false':
        pytest.skip('Telemetry disabled in /opt/mesosphere/etc/dcos-signal-config.json... skipping test')

    logging.info("Version: " + dcos_version)
    logging.info("Customer Key: " + customer_key)
    logging.info("Cluster ID: " + cluster_id)

    # Run dcos-signal once in test mode and parse its JSON output.
    signal_results = subprocess.check_output(["/opt/mesosphere/bin/dcos-signal", "-test"], universal_newlines=True)
    r_data = json.loads(signal_results)

    resp = dcos_api_session.get('/system/health/v1/report?cache=0')
    # We expect reading the health report to succeed.
    resp.raise_for_status()
    # Parse the response into JSON.
    health_report = resp.json()
    # Reformat the /health json into the expected output format for dcos-signal.
    units_health = {}
    for unit, unit_health in health_report["Units"].items():
        unhealthy = 0
        for node_health in unit_health["Nodes"]:
            for output_unit, output in node_health["Output"].items():
                if unit != output_unit:
                    # This is the output of some unrelated unit, ignore.
                    continue
                if output == "":
                    # This unit is healthy on this node.
                    pass
                else:
                    # This unit is unhealthy on this node.
                    unhealthy += 1
        prefix = "health-unit-{}".format(unit.replace('.', '-'))
        units_health.update({
            "{}-total".format(prefix): len(unit_health["Nodes"]),
            "{}-unhealthy".format(prefix): unhealthy,
        })

    # Expected payload skeleton for each of the three signal "tracks";
    # the shared generic properties are merged in below.
    exp_data = {
        'diagnostics': {
            'event': 'health',
            'anonymousId': cluster_id,
            'properties': units_health,
        },
        'cosmos': {
            'event': 'package_list',
            'anonymousId': cluster_id,
            'properties': {}
        },
        'mesos': {
            'event': 'mesos_track',
            'anonymousId': cluster_id,
            'properties': {}
        }
    }

    expanded_config = get_expanded_config()
    # Generic properties which are the same between all tracks
    generic_properties = {
        'platform': expanded_config['platform'],
        'provider': expanded_config['provider'],
        'source': 'cluster',
        'clusterId': cluster_id,
        'customerKey': customer_key,
        'environmentVersion': dcos_version,
        'variant': 'open'
    }

    # Insert the generic property data which is the same between all signal tracks
    exp_data['diagnostics']['properties'].update(generic_properties)
    exp_data['cosmos']['properties'].update(generic_properties)
    exp_data['mesos']['properties'].update(generic_properties)

    # Check the entire hash of diagnostics data
    if r_data['diagnostics'] != exp_data['diagnostics']:
        # The optional second argument to `assert` is an error message that
        # appears to get truncated in the output. As such, we log the output
        # instead.
        logging.error("Cluster is unhealthy: {}".format(
            json.dumps(health_report, indent=4, sort_keys=True)))
        assert r_data['diagnostics'] == exp_data['diagnostics']

    # Check a subset of things regarding Mesos that we can logically check for
    framework_names = [x['name'] for x in r_data['mesos']['properties']['frameworks']]
    assert 'marathon' in framework_names
    assert 'metronome' in framework_names

    # There are no packages installed by default on the integration test, ensure the key exists
    assert len(r_data['cosmos']['properties']['package_list']) == 0
Example #29
0
def test_signal_service(dcos_api_session):
    """
    signal-service runs on an hourly timer, this test runs it as a one-off
    and pushes the results to the test_server app for easy retrieval

    When this test fails due to `dcos-checks-poststart-service-unhealthy`,
    consider that the issue may be due to check timeouts which are too low.
    """
    # This is due to caching done by dcos-diagnostics / Signal service
    # We're going to remove this soon: https://mesosphere.atlassian.net/browse/DCOS-9050
    dcos_version = os.environ["DCOS_VERSION"]
    with open('/opt/mesosphere/etc/dcos-signal-config.json', 'r') as f:
        signal_config_data = json.load(f)
    customer_key = signal_config_data.get('customer_key', '')
    enabled = signal_config_data.get('enabled', 'false')
    with open('/var/lib/dcos/cluster-id', 'r') as f:
        cluster_id = f.read().strip()

    # Skip entirely when telemetry is disabled in the on-host config.
    if enabled == 'false':
        pytest.skip(
            'Telemetry disabled in /opt/mesosphere/etc/dcos-signal-config.json... skipping test'
        )

    logging.info("Version: " + dcos_version)
    logging.info("Customer Key: " + customer_key)
    logging.info("Cluster ID: " + cluster_id)

    # Run dcos-signal once in test mode and parse its JSON output.
    signal_results = subprocess.check_output(
        ["/opt/mesosphere/bin/dcos-signal", "-test"], universal_newlines=True)
    r_data = json.loads(signal_results)

    resp = dcos_api_session.get('/system/health/v1/report?cache=0')
    # We expect reading the health report to succeed.
    resp.raise_for_status()
    # Parse the response into JSON.
    health_report = resp.json()
    # Reformat the /health json into the expected output format for dcos-signal.
    units_health = {}
    for unit, unit_health in health_report["Units"].items():
        unhealthy = 0
        for node_health in unit_health["Nodes"]:
            for output_unit, output in node_health["Output"].items():
                if unit != output_unit:
                    # This is the output of some unrelated unit, ignore.
                    continue
                if output == "":
                    # This unit is healthy on this node.
                    pass
                else:
                    # This unit is unhealthy on this node.
                    unhealthy += 1
        prefix = "health-unit-{}".format(unit.replace('.', '-'))
        units_health.update({
            "{}-total".format(prefix):
            len(unit_health["Nodes"]),
            "{}-unhealthy".format(prefix):
            unhealthy,
        })

    # Expected payload skeleton for each of the three signal "tracks";
    # the shared generic properties are merged in below.
    exp_data = {
        'diagnostics': {
            'event': 'health',
            'anonymousId': cluster_id,
            'properties': units_health,
        },
        'cosmos': {
            'event': 'package_list',
            'anonymousId': cluster_id,
            'properties': {}
        },
        'mesos': {
            'event': 'mesos_track',
            'anonymousId': cluster_id,
            'properties': {}
        }
    }

    expanded_config = get_expanded_config()
    # Generic properties which are the same between all tracks
    generic_properties = {
        'platform': expanded_config['platform'],
        'provider': expanded_config['provider'],
        'source': 'cluster',
        'clusterId': cluster_id,
        'customerKey': customer_key,
        'environmentVersion': dcos_version,
        'variant': 'open'
    }

    # Insert the generic property data which is the same between all signal tracks
    exp_data['diagnostics']['properties'].update(generic_properties)
    exp_data['cosmos']['properties'].update(generic_properties)
    exp_data['mesos']['properties'].update(generic_properties)

    # Check the entire hash of diagnostics data
    if r_data['diagnostics'] != exp_data['diagnostics']:
        # The optional second argument to `assert` is an error message that
        # appears to get truncated in the output. As such, we log the output
        # instead.
        logging.error("Cluster is unhealthy: {}".format(
            json.dumps(health_report, indent=4, sort_keys=True)))
        assert r_data['diagnostics'] == exp_data['diagnostics']

    # Check a subset of things regarding Mesos that we can logically check for
    framework_names = [
        x['name'] for x in r_data['mesos']['properties']['frameworks']
    ]
    assert 'marathon' in framework_names
    assert 'metronome' in framework_names

    # There are no packages installed by default on the integration test, ensure the key exists
    assert len(r_data['cosmos']['properties']['package_list']) == 0
Example #30
0
def test_dcos_cni_l4lb(dcos_api_session):
    '''
    This tests the `dcos-l4lb` CNI plugin:
        https://github.com/dcos/dcos-cni/tree/master/cmd/l4lb

    The `dcos-l4lb` CNI plugins allows containers running on networks that don't
    necessarily have routes to spartan interfaces and minuteman VIPs to consume DNS
    service from spartan and layer-4 load-balancing services from minuteman by
    injecting spartan and minuteman services into the container's network
    namespace. You can read more about the motivation for this CNI plugin and type
    of problems it solves in this design doc:

    https://docs.google.com/document/d/1xxvkFknC56hF-EcDmZ9tzKsGiZdGKBUPfrPKYs85j1k/edit?usp=sharing

    In order to test `dcos-l4lb` CNI plugin we emulate a virtual network that
    lacks routes for spartan interface and minuteman VIPs. In this test, we
    first install a virtual network called `spartan-net` on one of the agents.
    The `spartan-net` is a CNI network that is a simple BRIDGE network with the
    caveat that it doesn't have any default routes. `spartan-net` has routes
    only for the agent network. In other words it doesn't have any routes
    towards the spartan-interfaces or minuteman VIPs.

    We then run a server (our python ping-pong server) on the DC/OS overlay.
    Finally to test that the `dcos-l4lb` plugin, which is also part of
    `spartan-net` is able to inject the Minuteman and Spartan services into the
    container's netns, we start a client on the `spartan-net` and try to `curl` the
    `ping-pong` server using its VIP. Without the Minuteman and Spartan services
    injected in the container's netns the expectation would be that this `curl`
    would fail, with a successful `curl` execution on the VIP allowing the
    test-case to PASS.
    '''
    if not lb_enabled():
        pytest.skip('Load Balancer disabled')

    expanded_config = test_helpers.get_expanded_config()
    if expanded_config.get('security') == 'strict':
        pytest.skip('Cannot setup CNI config with EE strict mode enabled')

    # CNI configuration of `spartan-net`.
    spartan_net = {
        'cniVersion': '0.2.0',
        'name': 'spartan-net',
        'type': 'dcos-l4lb',
        'delegate': {
            'type': 'mesos-cni-port-mapper',
            'excludeDevices': ['sprt-cni0'],
            'chain': 'spartan-net',
            'delegate': {
                'type': 'bridge',
                'bridge': 'sprt-cni0',
                'ipMasq': True,
                'isGateway': True,
                'ipam': {
                    'type': 'host-local',
                    'subnet': '192.168.250.0/24',
                    'routes': [
                     # Reachability to DC/OS overlay.
                     {'dst': '9.0.0.0/8'},
                     # Reachability to all private address subnet. We need
                     # this reachability since different cloud providers use
                     # different private address spaces to launch tenant
                     # networks.
                     {'dst': '10.0.0.0/8'},
                     {'dst': '172.16.0.0/12'},
                     {'dst': '192.168.0.0/16'}
                    ]
                }
            }
        }
    }

    log.info("spartan-net config:{}".format(json.dumps(spartan_net)))

    # Application to deploy CNI configuration. The app's test UUID is unused
    # since we never query this helper app's test endpoint.
    cni_config_app, _ = test_helpers.marathon_test_app()

    # Override the default test app command with a command to write the CNI
    # configuration.
    #
    # NOTE: We add the sleep at the end of this command so that the task stays
    # alive for the test harness to make sure that the task got deployed.
    # Ideally we should be able to deploy one of tasks using the test harness
    # but that doesn't seem to be the case here.
    cni_config_app['cmd'] = 'echo \'{}\' > /opt/mesosphere/etc/dcos/network/cni/spartan.cni && sleep 10000'.format(
        json.dumps(spartan_net))
    del cni_config_app['healthChecks']

    log.info("App for setting CNI config: {}".format(json.dumps(cni_config_app)))

    try:
        dcos_api_session.marathon.deploy_app(cni_config_app, check_health=False)
    except Exception as ex:
        # BUGFIX: the original message string had no `{}` placeholder, so the
        # app config passed to `format` was silently dropped from the error.
        raise AssertionError("Couldn't install CNI config for `spartan-net`: {}".format(
            json.dumps(cni_config_app))) from ex

    # Get the host on which the `spartan-net` was installed.
    cni_config_app_service = None
    try:
        cni_config_app_service = dcos_api_session.marathon.get_app_service_endpoints(cni_config_app['id'])
    except Exception as ex:
        raise AssertionError("Couldn't retrieve the host on which `spartan-net` was installed.") from ex

    # We only have one instance of `cni_config_app_service`.
    spartan_net_host = cni_config_app_service[0].host

    # Launch the test-app on DC/OS overlay, with a VIP.
    server_vip_port = unused_port()
    server_vip = '/spartanvip:{}'.format(server_vip_port)
    server_vip_addr = 'spartanvip.marathon.l4lb.thisdcos.directory:{}'.format(server_vip_port)

    # Launch the test_server in ip-per-container mode (user network).
    # The test UUID is unused: success is judged by the curl below, not by
    # querying the server's test endpoint.
    server, _ = test_helpers.marathon_test_app(
        container_type=marathon.Container.MESOS,
        healthcheck_protocol=marathon.Healthcheck.MESOS_HTTP,
        network=marathon.Network.USER,
        host_port=9080,
        vip=server_vip)

    # Launch the server on the DC/OS overlay
    log.info("Launching server with VIP:{} on network {}".format(server_vip_addr, server['networks'][0]['name']))

    try:
        dcos_api_session.marathon.deploy_app(server, check_health=False)
    except Exception as ex:
        raise AssertionError(
            "Couldn't launch server on 'dcos':{}".format(server['networks'][0]['name'])) from ex

    # Get the client app on the 'spartan-net' network. It is constrained to
    # the host where the CNI config was written, since that is the only agent
    # that has `spartan-net` installed.
    client_port = 9081
    client, _ = test_helpers.marathon_test_app(
        container_type=marathon.Container.MESOS,
        healthcheck_protocol=marathon.Healthcheck.MESOS_HTTP,
        network=marathon.Network.USER,
        host_port=client_port,
        container_port=client_port,
        vip=server_vip,
        host_constraint=spartan_net_host,
        network_name='spartan-net')

    try:
        dcos_api_session.marathon.deploy_app(client, check_health=False)
    except Exception as ex:
        raise AssertionError("Couldn't launch client on 'spartan-net':{}".format(client)) from ex

    # Change the client command task to do a curl on the server we just deployed.
    cmd = '/opt/mesosphere/bin/curl -s -f -m 5 http://{}/ping'.format(server_vip_addr)

    try:
        response = ensure_routable(cmd, spartan_net_host, client_port)
        log.info("Received a response from {}: {}".format(server_vip_addr, response))
    except Exception as ex:
        raise AssertionError("Unable to query VIP: {}".format(server_vip_addr)) from ex
Example #31
0
def test_dcos_cni_l4lb(dcos_api_session):
    '''
    This tests the `dcos-l4lb` CNI plugin:
        https://github.com/dcos/dcos-cni/tree/master/cmd/l4lb

    The `dcos-l4lb` CNI plugins allows containers running on networks that don't
    necessarily have routes to spartan interfaces and minuteman VIPs to consume DNS
    service from spartan and layer-4 load-balancing services from minuteman by
    injecting spartan and minuteman services into the container's network
    namespace. You can read more about the motivation for this CNI plugin and type
    of problems it solves in this design doc:

    https://docs.google.com/document/d/1xxvkFknC56hF-EcDmZ9tzKsGiZdGKBUPfrPKYs85j1k/edit?usp=sharing

    In order to test `dcos-l4lb` CNI plugin we emulate a virtual network that
    lacks routes for spartan interface and minuteman VIPs. In this test, we
    first install a virtual network called `spartan-net` on one of the agents.
    The `spartan-net` is a CNI network that is a simple BRIDGE network with the
    caveat that it doesn't have any default routes. `spartan-net` has routes
    only for the agent network. In other words it doesn't have any routes
    towards the spartan-interfaces or minuteman VIPs.

    We then run a server (our python ping-pong server) on the DC/OS overlay.
    Finally to test that the `dcos-l4lb` plugin, which is also part of
    `spartan-net` is able to inject the Minuteman and Spartan services into the
    container's netns, we start a client on the `spartan-net` and try to `curl` the
    `ping-pong` server using its VIP. Without the Minuteman and Spartan services
    injected in the container's netns the expectation would be that this `curl`
    would fail, with a successful `curl` execution on the VIP allowing the
    test-case to PASS.
    '''
    if not lb_enabled():
        pytest.skip('Load Balancer disabled')

    expanded_config = test_helpers.get_expanded_config()
    if expanded_config.get('security') == 'strict':
        pytest.skip('Cannot setup CNI config with EE strict mode enabled')

    # Run all the test application on the first agent node
    host = dcos_api_session.slaves[0]

    # CNI configuration of `spartan-net`.
    spartan_net = {
        'cniVersion': '0.2.0',
        'name': 'spartan-net',
        'type': 'dcos-l4lb',
        'delegate': {
            'type': 'mesos-cni-port-mapper',
            'excludeDevices': ['sprt-cni0'],
            'chain': 'spartan-net',
            'delegate': {
                'type': 'bridge',
                'bridge': 'sprt-cni0',
                'ipMasq': True,
                'isGateway': True,
                'ipam': {
                    'type':
                    'host-local',
                    'subnet':
                    '192.168.250.0/24',
                    'routes': [
                        # Reachability to DC/OS overlay.
                        {
                            'dst': '9.0.0.0/8'
                        },
                        # Reachability to all private address subnet. We need
                        # this reachability since different cloud providers use
                        # different private address spaces to launch tenant
                        # networks.
                        {
                            'dst': '10.0.0.0/8'
                        },
                        {
                            'dst': '172.16.0.0/12'
                        },
                        {
                            'dst': '192.168.0.0/16'
                        }
                    ]
                }
            }
        }
    }

    log.info("spartan-net config:{}".format(json.dumps(spartan_net)))

    # Application to deploy CNI configuration.
    cni_config_app = MarathonApp(
        marathon.Container.NONE,
        marathon.Network.HOST,
        host,
        app_name_fmt='/integration-test/cni-l4lb/config-{}')

    # Override the default test app command with a command to write the CNI
    # configuration.
    #
    # NOTE: We add the original command at the end of this command so that the task
    # stays alive for the test harness to make sure that the task got deployed.
    # Ideally we should be able to deploy one of tasks using the test harness
    # but that doesn't seem to be the case here.
    cni_config_app.app['cmd'] = \
        "echo '{}' > {} && {}".format(
            json.dumps(spartan_net),
            '/opt/mesosphere/etc/dcos/network/cni/spartan.cni',
            cni_config_app.app['cmd'])

    log.info("CNI Config application: {}".format(cni_config_app.app))
    # The config app only needs to run once to write the CNI config file, so
    # it is purged unconditionally; the file it wrote stays on the agent.
    try:
        cni_config_app.deploy(dcos_api_session)
        cni_config_app.wait(dcos_api_session)
    finally:
        cni_config_app.purge(dcos_api_session)
    log.info("CNI Config has been deployed on {}".format(host))

    # Get the host on which the `spartan-net` was installed.
    # Launch the test-app on DC/OS overlay, with a VIP.
    server_vip_label = '/spartanvip:10000'
    server_vip_addr = 'spartanvip.marathon.l4lb.thisdcos.directory:10000'

    # Launch the test_server in ip-per-container mode (user network)
    server_app = MarathonApp(
        marathon.Container.MESOS,
        marathon.Network.USER,
        host,
        vip=server_vip_label,
        app_name_fmt='/integration-test/cni-l4lb/server-{}')
    log.info("Server application: {}".format(server_app.app))

    # Get the client app on the 'spartan-net' network.
    client_app = MarathonApp(
        marathon.Container.MESOS,
        marathon.Network.USER,
        host,
        network_name='spartan-net',
        app_name_fmt='/integration-test/cni-l4lb/client-{}')
    log.info("Client application: {}".format(client_app.app))

    try:
        # Launch the test application
        client_app.deploy(dcos_api_session)
        server_app.deploy(dcos_api_session)

        # Wait for the test application
        server_app.wait(dcos_api_session)
        client_app.wait(dcos_api_session)

        # NOTE(mainred): route from the pytest worker node to the client
        # application is not ensured, so it's better to use the IP address of
        # the agent deploying the client app and the mapping port instead
        client_host, client_port = client_app.hostport(dcos_api_session)
        # Check linux kernel version; skip on kernels affected by the RHEL
        # bug referenced below (pytest.skip raises, so `return` is cosmetic).
        uname = ensure_routable('uname -r',
                                client_host,
                                client_port,
                                json_output=False)
        if '3.10.0-862' <= uname < '3.10.0-898':
            return pytest.skip(
                'See https://bugzilla.redhat.com/show_bug.cgi?id=1572983')

        # Change the client command task to do a curl on the server we just deployed.
        cmd = '/opt/mesosphere/bin/curl -s -f -m 5 http://{}/test_uuid'.format(
            server_vip_addr)

        # The curl only succeeds if the l4lb services were injected into the
        # client's netns; the UUID check proves we reached the right server.
        assert ensure_routable(cmd, client_host,
                               client_port)['test_uuid'] == server_app.uuid
    finally:
        # Always clean up both apps, even on assertion failure.
        server_app.purge(dcos_api_session)
        client_app.purge(dcos_api_session)
Example #32
0
def test_dcos_cni_l4lb(dcos_api_session):
    '''
    This tests the `dcos-l4lb` CNI plugin:
        https://github.com/dcos/dcos-cni/tree/master/cmd/l4lb

    The `dcos-l4lb` CNI plugins allows containers running on networks that don't
    necessarily have routes to spartan interfaces and minuteman VIPs to consume DNS
    service from spartan and layer-4 load-balancing services from minuteman by
    injecting spartan and minuteman services into the container's network
    namespace. You can read more about the motivation for this CNI plugin and type
    of problems it solves in this design doc:

    https://docs.google.com/document/d/1xxvkFknC56hF-EcDmZ9tzKsGiZdGKBUPfrPKYs85j1k/edit?usp=sharing

    In order to test `dcos-l4lb` CNI plugin we emulate a virtual network that
    lacks routes for spartan interface and minuteman VIPs. In this test, we
    first install a virtual network called `spartan-net` on one of the agents.
    The `spartan-net` is a CNI network that is a simple BRIDGE network with the
    caveat that it doesn't have any default routes. `spartan-net` has routes
    only for the agent network. In other words it doesn't have any routes
    towards the spartan-interfaces or minuteman VIPs.

    We then run a server (our python ping-pong server) on the DC/OS overlay.
    Finally to test that the `dcos-l4lb` plugin, which is also part of
    `spartan-net` is able to inject the Minuteman and Spartan services into the
    container's netns, we start a client on the `spartan-net` and try to `curl` the
    `ping-pong` server using its VIP. Without the Minuteman and Spartan services
    injected in the container's netns the expectation would be that this `curl`
    would fail, with a successful `curl` execution on the VIP allowing the
    test-case to PASS.
    '''
    if not lb_enabled():
        pytest.skip('Load Balancer disabled')

    expanded_config = test_helpers.get_expanded_config()
    if expanded_config.get('security') == 'strict':
        pytest.skip('Cannot setup CNI config with EE strict mode enabled')

    # CNI configuration of `spartan-net`.
    spartan_net = {
        'cniVersion': '0.2.0',
        'name': 'spartan-net',
        'type': 'dcos-l4lb',
        'delegate': {
            'type': 'mesos-cni-port-mapper',
            'excludeDevices': ['sprt-cni0'],
            'chain': 'spartan-net',
            'delegate': {
                'type': 'bridge',
                'bridge': 'sprt-cni0',
                'ipMasq': True,
                'isGateway': True,
                'ipam': {
                    'type':
                    'host-local',
                    'subnet':
                    '192.168.250.0/24',
                    'routes': [
                        # Reachability to DC/OS overlay.
                        {
                            'dst': '9.0.0.0/8'
                        },
                        # Reachability to all private address subnet. We need
                        # this reachability since different cloud providers use
                        # different private address spaces to launch tenant
                        # networks.
                        {
                            'dst': '10.0.0.0/8'
                        },
                        {
                            'dst': '172.16.0.0/12'
                        },
                        {
                            'dst': '192.168.0.0/16'
                        }
                    ]
                }
            }
        }
    }

    log.info("spartan-net config:{}".format(json.dumps(spartan_net)))

    # Application to deploy CNI configuration. The app's test UUID is unused
    # since we never query this helper app's test endpoint.
    cni_config_app, _ = test_helpers.marathon_test_app()

    # Override the default test app command with a command to write the CNI
    # configuration.
    #
    # NOTE: We add the sleep at the end of this command so that the task stays
    # alive for the test harness to make sure that the task got deployed.
    # Ideally we should be able to deploy one of tasks using the test harness
    # but that doesn't seem to be the case here.
    cni_config_app[
        'cmd'] = 'echo \'{}\' > /opt/mesosphere/etc/dcos/network/cni/spartan.cni && sleep 10000'.format(
            json.dumps(spartan_net))
    del cni_config_app['healthChecks']

    log.info("App for setting CNI config: {}".format(
        json.dumps(cni_config_app)))

    try:
        dcos_api_session.marathon.deploy_app(cni_config_app,
                                             check_health=False)
    except Exception as ex:
        # BUGFIX: the original message string had no `{}` placeholder, so the
        # app config passed to `format` was silently dropped from the error.
        raise AssertionError(
            "Couldn't install CNI config for `spartan-net`: {}".format(
                json.dumps(cni_config_app))) from ex

    # Get the host on which the `spartan-net` was installed.
    cni_config_app_service = None
    try:
        cni_config_app_service = dcos_api_session.marathon.get_app_service_endpoints(
            cni_config_app['id'])
    except Exception as ex:
        raise AssertionError(
            "Couldn't retrieve the host on which `spartan-net` was installed."
        ) from ex

    # We only have one instance of `cni_config_app_service`.
    spartan_net_host = cni_config_app_service[0].host

    # Launch the test-app on DC/OS overlay, with a VIP.
    server_vip_port = unused_port()
    server_vip = '/spartanvip:{}'.format(server_vip_port)
    server_vip_addr = 'spartanvip.marathon.l4lb.thisdcos.directory:{}'.format(
        server_vip_port)

    # Launch the test_server in ip-per-container mode (user network).
    # The test UUID is unused: success is judged by the curl below, not by
    # querying the server's test endpoint.
    server, _ = test_helpers.marathon_test_app(
        container_type=marathon.Container.MESOS,
        healthcheck_protocol=marathon.Healthcheck.MESOS_HTTP,
        network=marathon.Network.USER,
        host_port=9080,
        vip=server_vip)

    # Launch the server on the DC/OS overlay
    log.info("Launching server with VIP:{} on network {}".format(
        server_vip_addr, server['networks'][0]['name']))

    try:
        dcos_api_session.marathon.deploy_app(server, check_health=False)
    except Exception as ex:
        raise AssertionError("Couldn't launch server on 'dcos':{}".format(
            server['networks'][0]['name'])) from ex

    # Get the client app on the 'spartan-net' network. It is constrained to
    # the host where the CNI config was written, since that is the only agent
    # that has `spartan-net` installed.
    client_port = 9081
    client, _ = test_helpers.marathon_test_app(
        container_type=marathon.Container.MESOS,
        healthcheck_protocol=marathon.Healthcheck.MESOS_HTTP,
        network=marathon.Network.USER,
        host_port=client_port,
        container_port=client_port,
        vip=server_vip,
        host_constraint=spartan_net_host,
        network_name='spartan-net')

    try:
        dcos_api_session.marathon.deploy_app(client, check_health=False)
    except Exception as ex:
        raise AssertionError(
            "Couldn't launch client on 'spartan-net':{}".format(
                client)) from ex

    # Change the client command task to do a curl on the server we just deployed.
    cmd = '/opt/mesosphere/bin/curl -s -f -m 5 http://{}/ping'.format(
        server_vip_addr)

    try:
        response = ensure_routable(cmd, spartan_net_host, client_port)
        log.info("Received a response from {}: {}".format(
            server_vip_addr, response))
    except Exception as ex:
        raise AssertionError(
            "Unable to query VIP: {}".format(server_vip_addr)) from ex