def test_mom_with_network_failure_bounce_master():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures simulated by knocking out ports.

    Launches a sleep app through MoM, partitions the agents hosting MoM and
    the app task, restarts the Mesos master while they are partitioned, then
    reconnects the agents and asserts that the app still reports its original
    task ID.
    """

    # get MoM ip
    mom_ip = common.ip_of_mom()
    logger.info("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        logger.info("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # Track which agents were actually partitioned so that a failure anywhere
    # in the outage window still restores their network before the test exits.
    partitioned = []
    try:
        # take out the net
        for agent_ip in (mom_ip, task_ip):
            partition_agent(agent_ip)
            partitioned.append(agent_ip)

        # wait for a min
        time.sleep(timedelta(minutes=1).total_seconds())

        # bounce master
        shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")
    finally:
        # bring the net up
        for agent_ip in partitioned:
            reconnect_agent(agent_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    common.wait_for_service_endpoint('marathon-user',
                                     timedelta(minutes=10).total_seconds(),
                                     path="ping")

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'),
                                timedelta(minutes=10).total_seconds())

        # Retried once per second, up to 30 attempts, because the task list may
        # not be available immediately after the outage.
        @retrying.retry(wait_fixed=1000,
                        stop_max_attempt_number=30,
                        retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0][
                'id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def assert_mom_ee(version, security_mode='permissive'):
    """Deploy the MoM-EE app for `version` / `security_mode` and wait for its ping endpoint.

    Runs the prerequisite setup helpers, loads the matching app definition
    from the fixtures directory, overrides its docker image with the tag
    returned by `mom_ee_image(version)`, submits it to Marathon and waits for
    the deployment and the service endpoint.
    """
    is_strict = security_mode == 'strict'

    ensure_prerequisites_installed()
    ensure_service_account()
    ensure_permissions()
    ensure_sa_secret(strict=is_strict)
    ensure_docker_config_secret()

    # In strict mode all tasks are started as user `nobody` by default. However we start
    # MoM-EE as 'root' and for that we need to give root marathon ACLs to start
    # tasks as 'root'.
    if is_strict:
        common.add_dcos_marathon_user_acls()

    # Locate the MoM-EE definition matching the requested security mode and version
    definition_path = '{}/mom-ee-{}-{}.json'.format(
        fixtures.fixtures_dir(), security_mode, version)
    missing_message = "Couldn't find appropriate MoM-EE definition: {}".format(
        definition_path)
    assert os.path.isfile(definition_path), missing_message

    image_tag = mom_ee_image(version)
    print('Deploying {} definition with {} image'.format(definition_path, image_tag))

    mom_app = get_resource(definition_path)
    docker_section = mom_app['container']['docker']
    docker_section['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image_tag)
    mom_app_id = mom_app["id"]

    marathon.create_client().add_app(mom_app)
    common.deployment_wait(service_id=mom_app_id)
    endpoint = mom_ee_endpoint(version, security_mode)
    common.wait_for_service_endpoint(endpoint, path="ping")
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM, restart the agent MoM runs on, and expect the same task ID."""

    sleep_app = apps.sleep_app()
    sleep_app_id = sleep_app["id"]
    mom_agent_ip = common.ip_of_mom()
    # Pin the app to a host other than MoM's so only MoM's agent is restarted.
    other_host = common.ip_other_than_mom()
    common.pin_to_host(sleep_app, other_host)

    with shakedown.marathon_on_marathon():
        mom_client = marathon.create_client()
        mom_client.add_app(sleep_app)
        common.deployment_wait(service_id=sleep_app_id)
        task_id_before = mom_client.get_tasks(sleep_app_id)[0]['id']

        shakedown.restart_agent(mom_agent_ip)

        # Poll once per second, up to 30 attempts.
        @retrying.retry(wait_fixed=1000,
                        stop_max_attempt_number=30,
                        retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            task_id_now = mom_client.get_tasks(sleep_app_id)[0]['id']
            assert task_id_now == task_id_before, "The task ID has changed"

        check_task_is_back()
def assert_mom_ee(version, security_mode='permissive'):
    """Deploy the MoM-EE app for `version` / `security_mode` and wait for its service endpoint.

    Runs the prerequisite setup helpers, loads the matching app definition
    from the fixtures directory, overrides its docker image with the tag
    returned by `mom_ee_image(version)`, submits it to Marathon and waits for
    the deployment and the service endpoint.
    """
    ensure_prerequisites_installed()
    ensure_service_account()
    ensure_permissions()
    ensure_secret(strict=(security_mode == 'strict'))
    ensure_docker_credentials()

    # Locate the MoM-EE definition matching the requested security mode and version
    definition_path = '{}/mom-ee-{}-{}.json'.format(
        fixtures.fixtures_dir(), security_mode, version)
    missing_message = "Couldn't find appropriate MoM-EE definition: {}".format(
        definition_path)
    assert os.path.isfile(definition_path), missing_message

    image_tag = mom_ee_image(version)
    print('Deploying {} definition with {} image'.format(definition_path, image_tag))

    mom_app = get_resource(definition_path)
    docker_section = mom_app['container']['docker']
    docker_section['image'] = 'mesosphere/marathon-dcos-ee:{}'.format(image_tag)

    marathon.create_client().add_app(mom_app)
    shakedown.deployment_wait()
    endpoint = mom_ee_endpoint(version, security_mode)
    shakedown.wait_for_service_endpoint(endpoint)
Esempio n. 5
0
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, kill the MoM process, and expect the same task ID afterwards."""

    sleep_app = apps.sleep_app()
    sleep_app_id = sleep_app["id"]
    # Pin the app to a host other than the one MoM runs on.
    other_host = common.ip_other_than_mom()
    common.pin_to_host(sleep_app, other_host)

    with shakedown.marathon_on_marathon():
        mom_client = marathon.create_client()
        mom_client.add_app(sleep_app)
        shakedown.deployment_wait()
        task_id_before = mom_client.get_tasks(sleep_app_id)[0]['id']

        mom_host = common.ip_of_mom()
        common.kill_process_on_host(mom_host, 'marathon-assembly')
        # Wait for the 'marathon-user' task under 'marathon', then for its ping endpoint.
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        common.wait_for_service_endpoint('marathon-user', path="ping")

        # Poll once per second, up to 30 attempts.
        @retrying.retry(
            wait_fixed=1000,
            stop_max_attempt_number=30,
            retry_on_exception=common.ignore_exception,
        )
        def check_task_is_back():
            task_id_now = mom_client.get_tasks(sleep_app_id)[0]['id']
            assert task_id_now == task_id_before, "The task ID has changed"

        check_task_is_back()
def test_mom_when_mom_process_killed():
    """Launch a task from MoM, kill the MoM process, and expect the same task ID afterwards."""

    sleep_app = apps.sleep_app()
    sleep_app_id = sleep_app["id"]
    # Pin the app to a host other than the one MoM runs on.
    other_host = common.ip_other_than_mom()
    common.pin_to_host(sleep_app, other_host)

    with shakedown.marathon_on_marathon():
        mom_client = marathon.create_client()
        mom_client.add_app(sleep_app)
        shakedown.deployment_wait()
        task_id_before = mom_client.get_tasks(sleep_app_id)[0]['id']

        mom_host = common.ip_of_mom()
        shakedown.kill_process_on_host(mom_host, 'marathon-assembly')
        # Wait for the 'marathon-user' task under 'marathon', then for its endpoint.
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        # Poll once per second, up to 30 attempts.
        @retrying.retry(
            wait_fixed=1000,
            stop_max_attempt_number=30,
            retry_on_exception=common.ignore_exception,
        )
        def check_task_is_back():
            task_id_now = mom_client.get_tasks(sleep_app_id)[0]['id']
            assert task_id_now == task_id_before, "The task ID has changed"

        check_task_is_back()
def test_mom_with_network_failure_bounce_master():
    """Marathon on Marathon (MoM) tests for DC/OS with network failures simulated by knocking out ports.

    Launches a sleep app through MoM, partitions the agents hosting MoM and
    the app task, restarts the Mesos master while they are partitioned, then
    reconnects the agents and asserts that the app still reports its original
    task ID.
    """

    # get MoM ip
    mom_ip = common.ip_of_mom()
    print("MoM IP: {}".format(mom_ip))

    app_def = apps.sleep_app()
    app_id = app_def["id"]

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'))
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]["id"]
        task_ip = tasks[0]['host']
        print("\nTask IP: " + task_ip)

    # PR for network partitioning in shakedown makes this better
    # Track which agents were actually partitioned so that a failure anywhere
    # in the outage window still restores their network before the test exits.
    partitioned = []
    try:
        # take out the net
        for agent_ip in (mom_ip, task_ip):
            partition_agent(agent_ip)
            partitioned.append(agent_ip)

        # wait for a min
        time.sleep(timedelta(minutes=1).total_seconds())

        # bounce master
        shakedown.run_command_on_master("sudo systemctl restart dcos-mesos-master")
    finally:
        # bring the net up
        for agent_ip in partitioned:
            reconnect_agent(agent_ip)

    time.sleep(timedelta(minutes=1).total_seconds())
    shakedown.wait_for_service_endpoint('marathon-user', timedelta(minutes=10).total_seconds())

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        shakedown.wait_for_task("marathon-user", app_id.lstrip('/'), timedelta(minutes=10).total_seconds())

        # Retried once per second, up to 30 attempts, because the task list may
        # not be available immediately after the outage.
        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
Esempio n. 8
0
def delete_all_apps():
    """Force-remove every Marathon app except `/marathon-user`, which is only warned about."""
    client = marathon.create_client()
    for app in client.get_apps():
        app_id = app['id']
        if app_id != '/marathon-user':
            # Second positional argument True is passed through to remove_app as in the original call.
            client.remove_app(app_id, True)
            continue
        print('WARNING: not removing marathon-user, because it is special')
Esempio n. 9
0
def delete_all_apps():
    """Force-remove every Marathon app except `/marathon-user`, which is only warned about."""
    protected_app_id = '/marathon-user'
    client = marathon.create_client()
    for app in client.get_apps():
        current_id = app['id']
        if current_id == protected_app_id:
            print('WARNING: not removing marathon-user, because it is special')
            continue
        client.remove_app(current_id, True)
Esempio n. 10
0
def __marathon_leadership_changed_in_marathon_api(original_leader):
    """Assert, via the Marathon API, that the leader differs from `original_leader`.

    Leader election takes time and some nodes might not yet be aware of the
    new leader (resulting in HTTP 502), so this check is meant to be retried.
    NOTE(review): no retry is applied inside this function itself; presumably
    the caller or a decorator not visible here provides it - confirm.
    """
    client = marathon.create_client()
    leader_now = client.get_leader()
    print('leader according to marathon API: {}'.format(leader_now))
    assert original_leader != leader_now
Esempio n. 11
0
def stop_all_deployments(noisy=False):
    """Try to stop every running Marathon deployment.

    Failures for individual deployments never propagate; they are printed
    only when `noisy` is true.
    """
    client = marathon.create_client()
    for running in client.get_deployments():
        try:
            client.stop_deployment(running['id'])
        except Exception as error:
            if not noisy:
                continue
            print(error)
Esempio n. 12
0
def cluster_info(mom_name='marathon-user'):
    """Print DC/OS, agent-count and Marathon version details, plus the MoM version if reachable.

    `mom_name` is the service name of the Marathon-on-Marathon instance to
    probe.
    """
    dcos_version = shakedown.dcos_version()
    ee_mode = shakedown.ee_version()
    print("DC/OS: {}, in {} mode".format(dcos_version, ee_mode))

    agent_count = len(shakedown.get_private_agents())
    print("Agents: {}".format(agent_count))

    root_about = marathon.create_client().get_about()
    print("Marathon version: {}".format(root_about.get("version")))

    # Guard clause: nothing more to report when the MoM service is unavailable.
    if not shakedown.service_available_predicate(mom_name):
        print("Marathon MoM not present")
        return

    with shakedown.marathon_on_marathon(mom_name):
        try:
            mom_about = marathon.create_client().get_about()
            print("Marathon MoM version: {}".format(mom_about.get("version")))
        except Exception:
            print("Marathon MoM not present")
Esempio n. 13
0
def clear_pods():
    """Best-effort removal of all Marathon pods, followed by a deployment wait.

    Any failure is reported on stdout and otherwise ignored, so callers can
    use this as cleanup without it raising.
    """
    try:
        client = marathon.create_client()
        for pod in client.list_pod():
            client.remove_pod(pod["id"], True)
        shakedown.deployment_wait()
    except Exception as e:
        # Stay best-effort, but surface the reason instead of hiding it.
        print('WARNING: failed to clear pods: {}'.format(e))
Esempio n. 14
0
def stop_all_deployments(noisy=False):
    """Try to stop every running Marathon deployment.

    Failures for individual deployments never propagate; they are printed
    only when `noisy` is true.
    """
    marathon_client = marathon.create_client()
    for in_flight in marathon_client.get_deployments():
        try:
            marathon_client.stop_deployment(in_flight['id'])
        except Exception as failure:
            if not noisy:
                continue
            print(failure)
Esempio n. 15
0
def cluster_info(mom_name='marathon-user'):
    """Print DC/OS, agent-count and Marathon version details, plus the MoM version if reachable.

    `mom_name` is the service name of the Marathon-on-Marathon instance to
    probe.
    """
    dcos_version = shakedown.dcos_version()
    ee_mode = shakedown.ee_version()
    print("DC/OS: {}, in {} mode".format(dcos_version, ee_mode))

    private_agents = shakedown.get_private_agents()
    print("Agents: {}".format(len(private_agents)))

    root_version = marathon.create_client().get_about().get("version")
    print("Marathon version: {}".format(root_version))

    mom_available = shakedown.service_available_predicate(mom_name)
    if not mom_available:
        print("Marathon MoM not present")
        return

    with shakedown.marathon_on_marathon(mom_name):
        try:
            mom_version = marathon.create_client().get_about().get("version")
            print("Marathon MoM version: {}".format(mom_version))
        except Exception:
            print("Marathon MoM not present")
Esempio n. 16
0
def clear_pods():
    """Best-effort removal of all Marathon pods, followed by a deployment wait.

    Any failure is reported on stdout and otherwise ignored, so callers can
    use this as cleanup without it raising.
    """
    try:
        client = marathon.create_client()
        for pod in client.list_pod():
            client.remove_pod(pod["id"], True)
        shakedown.deployment_wait()
    except Exception as e:
        # Stay best-effort, but surface the reason instead of hiding it.
        print('WARNING: failed to clear pods: {}'.format(e))
Esempio n. 17
0
def deployment_predicate(service_id=None):
    """Return True when no deployment is running.

    With `service_id` given, only deployments listing it in `affectedApps` or
    `affectedPods` are considered; otherwise every deployment counts.
    """
    running = marathon.create_client().get_deployments()
    if service_id is None:
        return len(running) == 0

    def affects_service(deployment):
        return (service_id in deployment['affectedApps']
                or service_id in deployment['affectedPods'])

    # Materialise the full list so every deployment is inspected, as before.
    matching = [deployment for deployment in running if affects_service(deployment)]
    return len(matching) == 0
Esempio n. 18
0
def __marathon_leadership_changed_in_marathon_api(original_leader):
    """Assert via the Marathon API that the leader changed; return the new leader's IP.

    Leader election takes time and some nodes might not yet be aware of the
    new leader (resulting in HTTP 502), so this check is meant to be retried.
    NOTE(review): no retry is applied inside this function itself; presumably
    the caller or a decorator not visible here provides it - confirm.
    """
    # The API reports the leader as "10.0.6.88:8080"; keep only the part before the first colon.
    reported_leader = marathon.create_client().get_leader()
    leader_ip, _, _ = reported_leader.partition(':')
    print('leader according to marathon API: {}'.format(leader_ip))
    assert original_leader != leader_ip
    return leader_ip
Esempio n. 19
0
def deployment_predicate(service_id=None):
    """Return True when no deployment is running.

    With `service_id` given, only deployments listing it in `affectedApps` or
    `affectedPods` are considered; otherwise every deployment counts.
    """
    active_deployments = marathon.create_client().get_deployments()
    if service_id is None:
        return len(active_deployments) == 0

    # Count every match without stopping early, so each deployment is inspected.
    match_count = 0
    for deployment in active_deployments:
        if service_id in deployment['affectedApps'] or service_id in deployment['affectedPods']:
            match_count += 1
    return match_count == 0
Esempio n. 20
0
def __marathon_leadership_changed_in_marathon_api(original_leader):
    """Assert via the Marathon API that the leader changed; return the new leader's IP.

    Leader election takes time and some nodes might not yet be aware of the
    new leader (resulting in HTTP 502), so this check is meant to be retried.
    NOTE(review): no retry is applied inside this function itself; presumably
    the caller or a decorator not visible here provides it - confirm.
    """
    client = marathon.create_client()
    # The API reports the leader as "10.0.6.88:8080"; keep only the part before the first colon.
    host_and_port = client.get_leader()
    new_leader_ip = host_and_port.partition(':')[0]
    print('leader according to marathon API: {}'.format(new_leader_ip))
    assert original_leader != new_leader_ip
    return new_leader_ip
Esempio n. 21
0
def deployments_for(service_id=None):
    """Return the running deployments, optionally limited to those affecting `service_id`.

    A deployment affects `service_id` when the id appears in its
    `affectedApps` or `affectedPods` entry. With `service_id` left as None
    the client's deployment list is returned unfiltered.
    """
    all_deployments = marathon.create_client().get_deployments()
    if service_id is None:
        return all_deployments

    def affects_service(deployment):
        return (service_id in deployment['affectedApps']
                or service_id in deployment['affectedPods'])

    return [candidate for candidate in all_deployments if affects_service(candidate)]
def simple_sleep_app(name):
    """Deploy a simple sleep app in the MoM-EE called `name`.

    Returns True when `shakedown.get_service_task` finds a task for the app
    under that service, False otherwise.
    """
    with shakedown.marathon_on_marathon(name=name):
        mom_client = marathon.create_client()

        sleep_app = apps.sleep_app()
        mom_client.add_app(sleep_app)
        shakedown.deployment_wait()

        # Task lookup uses the app id without its leading slash.
        task_name = sleep_app["id"].lstrip("/")
        found = shakedown.get_service_task(name, task_name)
        print('MoM-EE tasks: {}'.format(found))
        return found is not None
def remove_mom_ee():
    """Delete all apps inside every available MoM-EE variant, then remove the MoM-EE app itself.

    The variants probed are versions 1.4 and 1.3, each in strict, permissive
    and disabled security mode.
    """
    for version in ('1.4', '1.3'):
        for security_mode in ('strict', 'permissive', 'disabled'):
            endpoint = mom_ee_endpoint(version, security_mode)
            if not shakedown.service_available_predicate(endpoint):
                continue
            print('Removing {}...'.format(endpoint))
            with shakedown.marathon_on_marathon(name=endpoint):
                shakedown.delete_all_apps()

    marathon.create_client().remove_app(MOM_EE_NAME)
    shakedown.deployment_wait()
    print('Successfully removed {}'.format(MOM_EE_NAME))
Esempio n. 24
0
def deployments_for(service_id=None, deployment_id=None):
    """Return the running deployments, optionally filtered.

    A truthy `deployment_id` takes precedence and selects deployments with
    that id. Otherwise a truthy `service_id` selects deployments listing it
    in `affectedApps` or `affectedPods`. With neither, the client's
    deployment list is returned unfiltered.
    """
    all_deployments = marathon.create_client().get_deployments()

    if deployment_id:
        return [
            candidate for candidate in all_deployments
            if deployment_id == candidate["id"]
        ]

    if not service_id:
        return all_deployments

    def affects_service(deployment):
        return service_id in deployment['affectedApps'] or service_id in deployment['affectedPods']

    return [candidate for candidate in all_deployments if affects_service(candidate)]
def test_framework_unavailable_on_mom():
    """Launches an app that has elements necessary to create a service endpoint in DCOS.
       This test confirms that the endpoint is not created when launched with MoM.
    """

    framework_app = apps.fake_framework()
    framework_app_id = framework_app["id"]

    with shakedown.marathon_on_marathon():
        marathon.create_client().add_app(framework_app)
        common.deployment_wait(service_id=framework_app_id)

    # The wait is expected to fail; succeeding means the endpoint showed up.
    endpoint_appeared = True
    try:
        common.wait_for_service_endpoint('pyfw', 15)
    except Exception:
        endpoint_appeared = False

    assert not endpoint_appeared, 'MoM shoud NOT create a service endpoint'
def test_framework_unavailable_on_mom():
    """Launches an app that has elements necessary to create a service endpoint in DCOS.
       This test confirms that the endpoint is not created when launched with MoM.
    """

    app_def = apps.fake_framework()

    with shakedown.marathon_on_marathon():
        common.delete_all_apps_wait()
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

    try:
        shakedown.wait_for_service_endpoint('pyfw', 15)
    except Exception:
        # Expected: the wait fails because no endpoint appears. Catching
        # Exception rather than everything lets KeyboardInterrupt and
        # SystemExit still abort the test run.
        pass
    else:
        assert False, 'MoM shoud NOT create a service endpoint'
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM, restart the agent MoM runs on, and expect the same task ID."""

    sleep_app = apps.sleep_app()
    sleep_app_id = sleep_app["id"]
    mom_agent_ip = common.ip_of_mom()
    # Pin the app to a host other than MoM's so only MoM's agent is restarted.
    other_host = common.ip_other_than_mom()
    common.pin_to_host(sleep_app, other_host)

    with shakedown.marathon_on_marathon():
        mom_client = marathon.create_client()
        mom_client.add_app(sleep_app)
        shakedown.deployment_wait()
        task_id_before = mom_client.get_tasks(sleep_app_id)[0]['id']

        shakedown.restart_agent(mom_agent_ip)

        # Poll once per second, up to 30 attempts.
        @retrying.retry(
            wait_fixed=1000,
            stop_max_attempt_number=30,
            retry_on_exception=common.ignore_exception,
        )
        def check_task_is_back():
            task_id_now = mom_client.get_tasks(sleep_app_id)[0]['id']
            assert task_id_now == task_id_before, "The task ID has changed"

        check_task_is_back()
def is_mom_ee_deployed():
    """Return True if an app whose id is '/' + MOM_EE_NAME is listed by Marathon."""
    expected_id = '/{}'.format(MOM_EE_NAME)
    for deployed_app in marathon.create_client().get_apps():
        if deployed_app['id'] == expected_id:
            return True
    return False
Esempio n. 29
0
def clean_up_marathon(parent_group="/"):
    """Force-remove `parent_group` from Marathon and wait for the resulting deployment."""
    removal = marathon.create_client().remove_group(parent_group, force=True)
    # The removal response carries the id of the deployment to wait on.
    deployment_wait(deployment_id=removal["deploymentId"])
Esempio n. 30
0
def clean_up_marathon():
    """Force-remove the root group "/" from Marathon and wait for deployments to finish."""
    marathon.create_client().remove_group("/", force=True)
    deployment_wait()
Esempio n. 31
0
def marathon_version():
    """Return the version reported by Marathon's about info as a LooseVersion."""
    # Version strings look like 1.3.9 or 1.4.0-RC8
    reported = marathon.create_client().get_about().get("version")
    return LooseVersion(reported)
Esempio n. 32
0
def delete_all_groups():
    """Remove every group returned by the Marathon client's `get_groups()`."""
    marathon_client = marathon.create_client()
    for existing_group in marathon_client.get_groups():
        group_id = existing_group["id"]
        marathon_client.remove_group(group_id)
Esempio n. 33
0
def delete_all_groups():
    """Remove every group returned by the Marathon client's `get_groups()`."""
    marathon_client = marathon.create_client()
    for entry in marathon_client.get_groups():
        marathon_client.remove_group(entry["id"])
Esempio n. 34
0
def marathon_version():
    """Return the version reported by Marathon's about info as a LooseVersion."""
    about_info = marathon.create_client().get_about()
    # Version strings look like 1.3.9 or 1.4.0-RC8
    version_text = about_info.get("version")
    return LooseVersion(version_text)