def test_marathon_with_master_process_failure(marathon_service_name):
    """ Launches an app from Marathon and restarts the master.
        It is expected that the service endpoint will come back and that the
        task_id is the original task_id.

        :param marathon_service_name: name of the Marathon service whose
            endpoint is awaited after `common.systemctl_master()` is invoked
            (presumably supplied by a test fixture -- confirm).
    """

    app_def = app('master-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/master-failure')
    original_task_id = tasks[0]['id']
    common.systemctl_master()
    shakedown.wait_for_service_endpoint(marathon_service_name)

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=10000,
                    retry_on_exception=retry_on_exception)
    def check_task_recovery():
        # The comparison has to be asserted: as a bare expression its result
        # is discarded and the check could never fail.
        # NOTE(review): retries only happen if `retry_on_exception` accepts
        # AssertionError -- confirm against its definition.
        tasks = client.get_tasks('/master-failure')
        assert tasks[0]['id'] == original_task_id

    check_task_recovery()
Beispiel #2
0
def test_marathon_when_disconnected_from_zk():
    """ Launch an app from Marathon.  Then knock out access to zk from the MoM.
        Verify the task is still good, i.e. the app still reports the task id
        it had before port 2181 was blocked on the pinned host.
    """
    app_def = app('zk-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/zk-failure')
    original_task_id = tasks[0]['id']

    # NOTE(review): `iptable_rules` presumably restores the host's rules when
    # the block exits, which is what lifts the zk block -- confirm.
    with shakedown.iptable_rules(host):
        block_port(host, 2181)
        #  time of the zk block
        time.sleep(10)

    # after access to zk is restored.
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        # Assert the comparison; a bare `==` expression is evaluated and
        # thrown away, so the test would pass regardless of the task id.
        tasks = client.get_tasks('/zk-failure')
        assert tasks[0]['id'] == original_task_id

    check_task_is_back()
def test_command_health_check_healthy():
    """ Checks, inside the `marathon_on_marathon()` context, that a default
        app configured with a COMMAND protocol health check is healthy.
    """
    with marathon_on_marathon():
        mom_client = marathon.create_client()
        definition = app()
        health_check = command_health_check()

        assert_app_healthy(mom_client, definition, health_check)
Beispiel #4
0
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    """ Abdicates the Marathon leader twice and checks app state each time.

        First an app is started and the leader is abdicated; the app must
        still report one running task under the new leader.  Then the app is
        removed and the leader is abdicated again; no task may be running
        under the next leader.

        :param marathon_service_name: name of the Marathon service whose
            endpoint is awaited after each abdication.
    """

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # start an app
    app_def = common.app(id=uuid.uuid4().hex)
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')

    shakedown.wait_for_service_endpoint(marathon_service_name,
                                        timedelta(minutes=5).total_seconds())

    @retrying.retry(stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        # Reads `original_leader` from the enclosing scope at call time, so
        # it always compares against the most recently recorded leader.
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader

    # wait until leader changed
    marathon_leadership_changed()

    @retrying.retry(stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    client.remove_app(app_id)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 0

    # Record the leader that is about to be abdicated.  Without this refresh
    # the second leadership check would still compare against the very first
    # leader and fail spuriously if that node gets re-elected.
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # abdicate leader after app was removed successfully
    common.delete_marathon_path('v2/leader')

    shakedown.wait_for_service_endpoint(marathon_service_name,
                                        timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    """ Abdicates the Marathon leader twice and checks app state each time.

        First an app is started and the leader is abdicated; the app must
        still report one running task under the new leader.  Then the app is
        removed and the leader is abdicated again; no task may be running
        under the next leader.

        :param marathon_service_name: name of the Marathon service whose
            endpoint is awaited after each abdication.
    """

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # start an app
    app_def = common.app(id=uuid.uuid4().hex)
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(stop_max_attempt_number=30)
    def marathon_leadership_changed():
        # Reads `original_leader` from the enclosing scope at call time, so
        # it always compares against the most recently recorded leader.
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader

    # wait until leader changed
    marathon_leadership_changed()

    @retrying.retry(stop_max_attempt_number=30)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    client.remove_app(app_id)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 0

    # Record the leader that is about to be abdicated.  Without this refresh
    # the second leadership check would still compare against the very first
    # leader and fail spuriously if that node gets re-elected.
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # abdicate leader after app was removed successfully
    common.delete_marathon_path('v2/leader')

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)
Beispiel #6
0
def test_good_user():
    """ Deploys an app whose `user` field is overridden with another, valid
        user and checks that a task was launched for it.  This works on
        coreOS.
    """
    generated_id = uuid.uuid4().hex
    definition = app(generated_id)
    definition['user'] = '******'

    marathon_client = marathon.create_client()
    marathon_client.add_app(definition)

    # A user that cannot be resolved on the agent makes this wait fail.
    shakedown.deployment_wait()

    failure_hint = "Good user `core` didn't launch.  This only works on a coreOS or a system with a core user."
    launched = marathon_client.get_tasks(generated_id)
    assert launched[0]['id'] != definition['id'], failure_hint
def test_bad_user():
    """ Test changes the default user to a bad user and confirms that task will
        not launch, by waiting for the expected `lastTaskFailure` message.
    """
    app_id = uuid.uuid4().hex
    app_def = app(app_id)
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_failure_message():
        # Any exception (missing failure info, assertion) triggers a retry
        # until the 10 second budget is used up.
        appl = client.get_app(app_id)
        message = appl['lastTaskFailure']['message']
        error = "Failed to get user information for 'bad'"
        assert error in message

    # The check was previously only defined, never executed, so the test
    # passed without verifying anything.
    check_failure_message()
def test_bad_user():
    """ Test changes the default user to a bad user and confirms that task will
        not launch, by waiting for the expected `lastTaskFailure` message.
    """
    app_id = uuid.uuid4().hex
    app_def = app(app_id)
    app_def['user'] = '******'

    client = marathon.create_client()
    client.add_app(app_def)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_failure_message():
        # Any exception (missing failure info, assertion) triggers a retry
        # until the 10 second budget is used up.
        appl = client.get_app(app_id)
        message = appl['lastTaskFailure']['message']
        error = "Failed to get user information for 'bad'"
        assert error in message

    # The check was previously only defined, never executed, so the test
    # passed without verifying anything.
    check_failure_message()
def test_task_failure_recovers():
    """ Tests that if a task is KILLED, it will be relaunched and the taskID is different.
    """
    app_id = uuid.uuid4().hex
    app_def = app(app_id)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']
    # The '[s]leep' pattern is passed through unchanged to the kill helper.
    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    # wait_fixed keeps the retry loop from polling Marathon back-to-back.
    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_new_task_id():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0]['id']

    # The check was previously only defined, never executed.
    check_new_task_id()
def test_marathon_when_task_agent_bounced():
    """ Launch an app and restart the node the task is on.
        The task id reported afterwards must equal the one from before the
        agent restart.
    """
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/agent-failure')
    original_task_id = tasks[0]['id']
    shakedown.restart_agent(host)

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        # Assert the comparison; as a bare expression it had no effect.
        tasks = client.get_tasks('/agent-failure')
        assert tasks[0]['id'] == original_task_id

    # The check was previously only defined, never executed.
    check_task_is_back()
def test_task_failure_recovers():
    """ Tests that if a task is KILLED, it will be relaunched and the taskID is different.
    """
    app_id = uuid.uuid4().hex
    app_def = app(app_id)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']
    # The '[s]leep' pattern is passed through unchanged to the kill helper.
    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    # wait_fixed keeps the retry loop from polling Marathon back-to-back.
    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_new_task_id():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0]['id']

    # The check was previously only defined, never executed.
    check_new_task_id()
def test_marathon_when_task_agent_bounced():
    """ Launch an app and restart the node the task is on.
        The task id reported afterwards must equal the one from before the
        agent restart.
    """
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/agent-failure')
    original_task_id = tasks[0]['id']
    shakedown.restart_agent(host)

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        # Assert the comparison; as a bare expression it had no effect.
        tasks = client.get_tasks('/agent-failure')
        assert tasks[0]['id'] == original_task_id

    # The check was previously only defined, never executed.
    check_task_is_back()
def test_pinned_task_does_not_find_unknown_host():
    """ An app pinned to a host address that is not part of the cluster must
        not launch: ten seconds after submission there are still no tasks.
    """
    unknown_host = '10.255.255.254'

    pinned_app = app('pinned')
    # The result is not used below; the call is retained as in the original.
    _other_agent = ip_other_than_mom()
    pin_to_host(pinned_app, unknown_host)
    # only 1 can fit on the node
    pinned_app['cpus'] = 3.5

    with marathon_on_marathon():
        mom_client = marathon.create_client()
        mom_client.add_app(pinned_app)

        # Deployments normally finish within seconds, so after this pause
        # it is assumed that no offer matched the constraint.
        time.sleep(10)

        launched = mom_client.get_tasks('/pinned')
        assert len(launched) == 0
def test_pinned_task_does_not_find_unknown_host():
    """ An app pinned to a host address that is not part of the cluster must
        not launch: ten seconds after submission there are still no tasks.
    """
    unknown_host = '10.255.255.254'

    pinned_app = app('pinned')
    # The result is not used below; the call is retained as in the original.
    _other_agent = ip_other_than_mom()
    pin_to_host(pinned_app, unknown_host)
    # only 1 can fit on the node
    pinned_app['cpus'] = 3.5

    marathon_client = marathon.create_client()
    marathon_client.add_app(pinned_app)

    # Deployments normally finish within seconds, so after this pause it is
    # assumed that no offer matched the constraint.
    time.sleep(10)

    launched = marathon_client.get_tasks('/pinned')
    assert len(launched) == 0
Beispiel #15
0
def test_mom_when_mom_process_killed():
    """ Launched a task from MoM then killed MoM.
        After the 'marathon-user' service endpoint is available again, the
        app must still report the task id it had before the kill.
    """
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)
    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        tasks = client.get_tasks('/agent-failure')
        # Assert the comparison; as a bare expression its result was
        # discarded and the test could not fail here.
        assert tasks[0]['id'] == original_task_id
Beispiel #16
0
def test_bad_uri():
    """ Deploys an app whose fetch list points at an artifact that cannot be
        downloaded and waits for Marathon to report the fetch failure.
    """
    generated_id = uuid.uuid4().hex
    definition = app(generated_id)
    definition['fetch'] = [{"uri": "http://mesosphere.io/missing-artifact"}]

    marathon_client = marathon.create_client()
    marathon_client.add_app(definition)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def expect_fetch_failure():
        # Polled once per second for up to ten seconds.
        failure = marathon_client.get_app(generated_id)['lastTaskFailure']
        assert "Failed to fetch all URIs for container" in failure['message']

    expect_fetch_failure()
def test_pinned_task_recovers_on_host():
    """ Tests that a killed pinned task will recover on the pinned node.
        The replacement task must have a new id and run on the same host.
    """
    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/pinned')

        shakedown.kill_process_on_host(host, '[s]leep')
        shakedown.deployment_wait()

        @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
        def check_for_new_task():
            new_tasks = client.get_tasks('/pinned')
            assert tasks[0]['id'] != new_tasks[0]['id']
            assert new_tasks[0]['host'] == host

        # The check was previously only defined, never executed.
        check_for_new_task()
def test_pinned_task_recovers_on_host():
    """ Tests that a killed pinned task will recover on the pinned node.
        The replacement task must have a new id and run on the same host.
    """

    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/pinned')

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_for_new_task():
        new_tasks = client.get_tasks('/pinned')
        assert tasks[0]['id'] != new_tasks[0]['id']
        assert new_tasks[0]['host'] == host

    # The check was previously only defined, never executed.
    check_for_new_task()
def test_bad_uri():
    """ Deploys an app whose fetch list points at an artifact that cannot be
        downloaded and waits for Marathon to report the fetch failure.
    """
    generated_id = uuid.uuid4().hex
    definition = app(generated_id)
    definition['fetch'] = [{"uri": "http://mesosphere.io/missing-artifact"}]

    marathon_client = marathon.create_client()
    marathon_client.add_app(definition)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def expect_fetch_failure():
        # Polled once per second for up to ten seconds.
        failure = marathon_client.get_app(generated_id)['lastTaskFailure']
        assert "Failed to fetch all URIs for container" in failure['message']

    expect_fetch_failure()
Beispiel #20
0
def test_pinned_task_scales_on_host_only():
    """ Scales a pinned app from one to ten instances and checks that every
        task reports the pinned host.
    """
    pinned_app = app('pinned')
    pinned_host = ip_other_than_mom()
    pin_to_host(pinned_app, pinned_host)

    marathon_client = marathon.create_client()
    marathon_client.add_app(pinned_app)
    shakedown.deployment_wait()

    # Initial deployment: a single task on the pinned host.
    initial = marathon_client.get_tasks('/pinned')
    assert len(initial) == 1
    assert initial[0]['host'] == pinned_host

    marathon_client.scale_app('pinned', 10)
    shakedown.deployment_wait()

    # After scaling: ten tasks, none of them on a different host.
    scaled = marathon_client.get_tasks('/pinned')
    assert len(scaled) == 10
    assert all(task['host'] == pinned_host for task in scaled)
def test_marathon_with_master_process_failure(marathon_service_name):
    """ Launches an app from Marathon and restarts the master.
        It is expected that the service endpoint will come back and that the
        task_id is the original task_id.

        :param marathon_service_name: name of the Marathon service whose
            endpoint is awaited after `common.systemctl_master()` is invoked
            (presumably supplied by a test fixture -- confirm).
    """

    app_def = app('master-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/master-failure')
    original_task_id = tasks[0]['id']
    common.systemctl_master()
    shakedown.wait_for_service_endpoint(marathon_service_name)

    @retrying.retry(wait_fixed=1000, stop_max_delay=10000)
    def check_task_recovery():
        # Assert the comparison; as a bare expression it had no effect.
        tasks = client.get_tasks('/master-failure')
        assert tasks[0]['id'] == original_task_id

    # The check was previously only defined, never executed.
    check_task_recovery()
def test_pinned_task_does_not_scale_to_unpinned_host():
    """ A pinned app sized so that only one instance fits on its node must
        not scale to a second instance: five seconds after the scale request
        the deployment is still pending and one task exists.
    """
    pinned_app = app('pinned')
    pinned_host = ip_other_than_mom()
    pin_to_host(pinned_app, pinned_host)
    # only 1 can fit on the node
    pinned_app['cpus'] = 3.5

    with marathon_on_marathon():
        mom_client = marathon.create_client()
        mom_client.add_app(pinned_app)
        shakedown.deployment_wait()

        # Result unused; the request is kept so the call sequence matches.
        mom_client.get_tasks('/pinned')
        mom_client.scale_app('pinned', 2)

        # typical deployments are sub 3 secs
        time.sleep(5)
        pending = mom_client.get_deployments()
        running = mom_client.get_tasks('/pinned')

        # The scale-up deployment is still in flight with a single task.
        assert len(pending) == 1
        assert len(running) == 1
def test_pinned_task_does_not_scale_to_unpinned_host():
    """ A pinned app sized so that only one instance fits on its node must
        not scale to a second instance: five seconds after the scale request
        the deployment is still pending and one task exists.
    """
    pinned_app = app('pinned')
    pinned_host = ip_other_than_mom()
    pin_to_host(pinned_app, pinned_host)
    # only 1 can fit on the node
    pinned_app['cpus'] = 3.5

    marathon_client = marathon.create_client()
    marathon_client.add_app(pinned_app)
    shakedown.deployment_wait()

    # Result unused; the request is kept so the call sequence matches.
    marathon_client.get_tasks('/pinned')
    marathon_client.scale_app('pinned', 2)

    # typical deployments are sub 3 secs
    time.sleep(5)
    pending = marathon_client.get_deployments()
    running = marathon_client.get_tasks('/pinned')

    # The scale-up deployment is still in flight with a single task.
    assert len(pending) == 1
    assert len(running) == 1
def test_command_health_check_healthy():
    """ Checks that a default app configured with a COMMAND protocol health
        check is healthy.
    """
    marathon_client = marathon.create_client()
    definition = app()
    health_check = command_health_check()

    assert_app_healthy(marathon_client, definition, health_check)
Beispiel #25
0
def test_marathon_backup_and_check_apps(marathon_service_name):
    """ Abdicates the Marathon leader twice, each time requesting a backup,
        and checks app state under the newly elected leader.

        The first abdication happens while an app is running (one task must
        survive), the second after the app was removed (no task may run).

        :param marathon_service_name: name of the Marathon service whose
            endpoint is awaited after each abdication.
    """
    backup_dir = '/tmp'
    first_backup_url = 'file://{}/{}'.format(backup_dir, 'backup1.tar')
    second_backup_url = 'file://{}/{}'.format(backup_dir, 'backup2.tar')
    endpoint_timeout = timedelta(minutes=5).total_seconds()

    previous_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(previous_leader))

    # Start an app and make sure it is running before touching the leader.
    definition = common.app(id=uuid.uuid4().hex)
    app_id = definition['id']

    marathon_client = marathon.create_client()
    marathon_client.add_app(definition)
    shakedown.deployment_wait()

    assert marathon_client.get_app(app_id)['tasksRunning'] == 1

    @retrying.retry(stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def leader_has_changed():
        # `previous_leader` is looked up when this runs, so it reflects the
        # value recorded right before the latest abdication.
        now_leading = shakedown.marathon_leader_ip()
        print('leader: {}'.format(now_leading))
        assert previous_leader != now_leading

    @retrying.retry(stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def expect_running_instances(count):
        assert marathon_client.get_app(app_id)['tasksRunning'] == count

    # First abdication, with backup, while the app is running.
    previous_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(previous_leader))
    abdication_path = 'v2/leader?backup={}'.format(first_backup_url)
    print('DELETE {}'.format(abdication_path))
    common.delete_marathon_path(abdication_path)

    shakedown.wait_for_service_endpoint(marathon_service_name,
                                        endpoint_timeout)

    leader_has_changed()

    # The app definition and its single instance must have survived.
    expect_running_instances(1)

    # Remove the app again.
    marathon_client.remove_app(app_id)
    shakedown.deployment_wait()

    assert marathon_client.get_app(app_id)['tasksRunning'] == 0

    # Second abdication with backup.  Before MARATHON-7525 a backup taken
    # after an app had been deleted failed constantly, which kept Marathon
    # from starting again.
    previous_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(previous_leader))
    abdication_path = 'v2/leader?backup={}'.format(second_backup_url)
    print('DELETE {}'.format(abdication_path))
    common.delete_marathon_path(abdication_path)

    shakedown.wait_for_service_endpoint(marathon_service_name,
                                        endpoint_timeout)

    # A changed leader means Marathon managed to start again.
    leader_has_changed()

    # No instance may be running under the new leader.
    expect_running_instances(0)