def test_marathon_zk_partition_leader_change(marathon_service_name):
    """Check that Marathon leadership moves after port 2181 is blocked on the leader.

    The leader is picked via
    ``common.get_marathon_leader_not_on_master_leader_node()``; iptables rules
    for port 2181 (presumably the ZooKeeper client port, going by the test
    name) are then applied to it with ``sleep_seconds=30``.

    Args:
        marathon_service_name: fixture argument; not used by the test body.
    """
    zk_port = 2181
    leader_before = common.get_marathon_leader_not_on_master_leader_node()

    common.block_iptable_rules_for_seconds(leader_before, zk_port, sleep_seconds=30)

    # The previously recorded leader must no longer be the leader.
    common.assert_marathon_leadership_changed(leader_before)
Example #2
0
def test_marathon_zk_partition_leader_change(marathon_service_name):
    """Block port 2181 on the Marathon leader and expect a new leader afterwards.

    Args:
        marathon_service_name: fixture argument; not used by the test body.
    """
    partition_args = {'sleep_seconds': 30}
    previous_leader = common.get_marathon_leader_not_on_master_leader_node()

    # 2181 is presumably the ZooKeeper client port (the test name says "zk").
    common.block_iptable_rules_for_seconds(previous_leader, 2181, **partition_args)

    common.assert_marathon_leadership_changed(previous_leader)
Example #3
0
def test_marathon_delete_leader(marathon_service_name):
    """Abdicate the current Marathon leader and verify that leadership changes.

    Args:
        marathon_service_name: service whose ``ping`` endpoint is awaited (for
            up to five minutes) after the abdication.
    """
    leader_before = marathon_leader_ip()
    print('leader: {}'.format(leader_before))

    common.abdicate_marathon_leader()

    # Wait for the service to answer on "ping" again before checking the leader.
    ping_timeout = timedelta(minutes=5).total_seconds()
    wait_for_service_endpoint(marathon_service_name, ping_timeout, path="ping")

    common.assert_marathon_leadership_changed(leader_before)
def test_marathon_delete_leader(marathon_service_name):
    """Force a leader abdication and assert that a different leader takes over.

    Args:
        marathon_service_name: service name passed to
            ``wait_for_service_endpoint`` while waiting for ``ping``.
    """
    previous_leader = marathon_leader_ip()
    print('leader: {}'.format(previous_leader))
    common.abdicate_marathon_leader()

    five_minutes = timedelta(minutes=5)
    wait_for_service_endpoint(
        marathon_service_name,
        five_minutes.total_seconds(),
        path="ping",
    )

    common.assert_marathon_leadership_changed(previous_leader)
Example #5
0
def test_marathon_delete_leader(marathon_service_name):
    """Delete the ``v2/leader`` path and verify that Marathon leadership changes.

    Args:
        marathon_service_name: service whose endpoint is awaited (for up to
            five minutes) after the leader path was deleted.
    """
    leader_before = shakedown.marathon_leader_ip()
    print('leader: {}'.format(leader_before))

    common.delete_marathon_path('v2/leader')

    endpoint_timeout = timedelta(minutes=5).total_seconds()
    shakedown.wait_for_service_endpoint(marathon_service_name, endpoint_timeout)

    common.assert_marathon_leadership_changed(leader_before)
Example #6
0
def test_marathon_master_partition_leader_change(marathon_service_name):
    """Cut the leader's outbound connection to the Mesos master and expect a new leader.

    Args:
        marathon_service_name: fixture argument; not used by the test body.
    """
    leader_before = common.get_marathon_leader_not_on_master_leader_node()

    # Only outbound traffic to the Mesos master (port 5050) is blocked.
    # Marathon's Mesos heartbeat interval is 15 seconds and it disconnects
    # after 5 missed heartbeats, so the block has to last longer than
    # 75 seconds; 100 seconds are used here.
    common.block_iptable_rules_for_seconds(
        leader_before,
        5050,
        sleep_seconds=100,
        block_input=False,
        block_output=True,
    )

    common.assert_marathon_leadership_changed(leader_before)
def test_marathon_master_partition_leader_change(marathon_service_name):
    """Verify that leadership changes when the leader cannot reach the Mesos master.

    Args:
        marathon_service_name: fixture argument; not used by the test body.
    """
    mesos_master_port = 5050
    # Marathon tolerates 5 missed Mesos heartbeats at a 15 second interval
    # before it disconnects, so the outbound block must exceed 75 seconds.
    outbound_block = {
        'sleep_seconds': 100,
        'block_input': False,
        'block_output': True,
    }

    previous_leader = common.get_marathon_leader_not_on_master_leader_node()
    common.block_iptable_rules_for_seconds(previous_leader, mesos_master_port, **outbound_block)

    common.assert_marathon_leadership_changed(previous_leader)
Example #8
0
def test_marathon_backup_and_check_apps(marathon_service_name):
    """Abdicate the Marathon leader twice with a backup and check app state each time.

    Steps:
      1. Remove ``/tmp/backup1.tar`` and ``/tmp/backup2.tar`` on every master.
      2. Deploy a sleep app and confirm that one task is running.
      3. Abdicate the leader with ``?backup=<first backup url>``, wait for the
         ``ping`` endpoint, assert that leadership changed and that the app
         still reports one running task.
      4. Remove the app and check that it is gone.
      5. Abdicate again with the second backup URL (regression check for
         MARATHON-7525), assert that leadership changed and that the app is
         still absent.

    Args:
        marathon_service_name: service whose ``ping`` endpoint is awaited after
            each abdication.
    """
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'

    # Start from a clean slate: drop backup files left over from earlier runs.
    for master_ip in get_all_master_ips():
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    params = '?backup={}'.format(backup_url1)
    common.abdicate_marathon_leader(params)

    common.wait_for_service_endpoint(marathon_service_name,
                                     timedelta(minutes=5).total_seconds(),
                                     path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        """Check that the app runs `expected_instances` tasks (0 means: app is absent)."""
        try:
            app = client.get_app(app_id)
        except Exception:
            # A failing lookup is the expected outcome when the app should be gone.
            if expected_instances != 0:
                raise
        else:
            if expected_instances == 0:
                raise AssertionError("The application resurrected")
            # The original statement lacked the `assert` keyword and therefore
            # never verified the task count.
            assert app['tasksRunning'] == expected_instances, \
                "The number of running tasks is {}, but {} was expected".format(
                    app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    common.deployment_wait(service_id=app_id)

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem, that doing a backup after an app was deleted
    # leads to the state that marathon was not able to re-start, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    params = '?backup={}'.format(backup_url2)
    print('DELETE /v2/leader{}'.format(params))
    common.abdicate_marathon_leader(params)

    common.wait_for_service_endpoint(marathon_service_name,
                                     timedelta(minutes=5).total_seconds(),
                                     path="ping")

    # wait until leader changed
    # if leader changed, this means that marathon was able to start again, which is great :-).
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)
Example #9
0
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    """Check that app state survives Marathon leader abdications.

    Steps:
      1. Deploy a sleep app and confirm that one task is running.
      2. Abdicate the leader, wait for ``ping``, assert that leadership changed
         and that the app still reports one running task.
      3. Remove the app and check that it can no longer be fetched.
      4. Abdicate the (new) leader, assert that leadership changed again and
         that the app is still absent.

    Args:
        marathon_service_name: service whose ``ping`` endpoint is awaited after
            each abdication.
    """
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # abdicate leader after app was started successfully
    common.abdicate_marathon_leader()

    common.wait_for_service_endpoint(marathon_service_name,
                                     timedelta(minutes=5).total_seconds(),
                                     path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)
    # Remember the new leader so the second abdication can be verified against it.
    original_leader = marathon_leader_ip()

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        """Check that the app reports `expected_instances` running tasks."""
        app = client.get_app(app_id)
        # A single assert with a message; the former message-less duplicate
        # always fired first and hid the diagnostic text.
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(app["tasksRunning"], expected_instances)

    def assert_app_absent():
        """Fail with "The application resurrected" if the app can still be fetched."""
        try:
            client.get_app(app_id)
        except Exception:
            # Any failure to fetch the app is treated as "app does not exist".
            return
        raise AssertionError("The application resurrected")

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        """Remove the app via the client, under the retry policy configured above."""
        client.remove_app(app_id)

    remove_app(app_id)
    common.deployment_wait(service_id=app_id)

    assert_app_absent()

    # abdicate leader after app was removed successfully
    common.abdicate_marathon_leader()

    common.wait_for_service_endpoint(marathon_service_name,
                                     timedelta(minutes=5).total_seconds(),
                                     path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there
    assert_app_absent()
Example #10
0
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    """Check that app state survives Marathon leader abdications.

    Steps:
      1. Deploy a sleep app and confirm that one task is running.
      2. Abdicate the leader, wait for ``ping``, assert that leadership changed
         and that the app still reports one running task.
      3. Remove the app and check that it can no longer be fetched.
      4. Abdicate the (new) leader, assert that leadership changed again and
         that the app is still absent.

    Args:
        marathon_service_name: service whose ``ping`` endpoint is awaited after
            each abdication.
    """
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # abdicate leader after app was started successfully
    common.abdicate_marathon_leader()

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)
    # Remember the new leader so the second abdication can be verified against it.
    original_leader = marathon_leader_ip()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        """Check that the app reports `expected_instances` running tasks."""
        app = client.get_app(app_id)
        # A single assert with a message; the former message-less duplicate
        # always fired first and hid the diagnostic text.
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(app["tasksRunning"], expected_instances)

    def assert_app_absent():
        """Fail with "The application resurrected" if the app can still be fetched."""
        try:
            client.get_app(app_id)
        except Exception:
            # Any failure to fetch the app is treated as "app does not exist".
            return
        raise AssertionError("The application resurrected")

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        """Remove the app via the client, under the retry policy configured above."""
        client.remove_app(app_id)

    remove_app(app_id)
    deployment_wait(service_id=app_id)

    assert_app_absent()

    # abdicate leader after app was removed successfully
    common.abdicate_marathon_leader()

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there
    assert_app_absent()
Example #11
0
def test_marathon_backup_and_check_apps(marathon_service_name):
    """Abdicate the Marathon leader twice with a backup and check app state each time.

    Steps:
      1. Remove ``/tmp/backup1.tar`` and ``/tmp/backup2.tar`` on every master.
      2. Deploy a sleep app and confirm that one task is running.
      3. Abdicate the leader with ``?backup=<first backup url>``, wait for the
         ``ping`` endpoint, assert that leadership changed and that the app
         still reports one running task.
      4. Remove the app and check that it is gone.
      5. Abdicate again with the second backup URL (regression check for
         MARATHON-7525), assert that leadership changed and that the app is
         still absent.

    Args:
        marathon_service_name: service whose ``ping`` endpoint is awaited after
            each abdication.
    """
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'

    # Start from a clean slate: drop backup files left over from earlier runs.
    for master_ip in get_all_master_ips():
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    deployment_wait(service_id=app_id)

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    params = '?backup={}'.format(backup_url1)
    common.abdicate_marathon_leader(params)

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    common.assert_marathon_leadership_changed(original_leader)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        """Check that the app runs `expected_instances` tasks (0 means: app is absent)."""
        try:
            app = client.get_app(app_id)
        except Exception:
            # A failing lookup is the expected outcome when the app should be gone.
            if expected_instances != 0:
                raise
        else:
            if expected_instances == 0:
                raise AssertionError("The application resurrected")
            # The original statement lacked the `assert` keyword and therefore
            # never verified the task count.
            assert app['tasksRunning'] == expected_instances, \
                "The number of running tasks is {}, but {} was expected".format(
                    app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    deployment_wait(service_id=app_id)

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 we had the problem, that doing a backup after an app was deleted
    # leads to the state that marathon was not able to re-start, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    params = '?backup={}'.format(backup_url2)
    print('DELETE /v2/leader{}'.format(params))
    common.abdicate_marathon_leader(params)

    wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds(), path="ping")

    # wait until leader changed
    # if leader changed, this means that marathon was able to start again, which is great :-).
    common.assert_marathon_leadership_changed(original_leader)

    # check if app definition is still not there and no instance is running after new leader was elected
    check_app_existence(0)