Example #1
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_task_ids = sdk_tasks.get_task_ids(foldered_name, '')
    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(foldered_name),
        "elastic.scheduler.Main")
    sdk_tasks.check_tasks_not_updated(foldered_name, '', initial_task_ids)
def test_task_failure_recovers():
    """Tests that if a task is KILLED, another one will be launched with a different ID."""

    app_def = apps.sleep_app()
    app_def['cmd'] = 'sleep 1000'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_def["id"])

    tasks = client.get_tasks(app_def["id"])
    old_task_id = tasks[0]['id']
    host = tasks[0]['host']

    shakedown.kill_process_on_host(host, '[s]leep 1000')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_new_task_id():
        tasks = client.get_tasks(app_def["id"])
        new_task_id = tasks[0]['id']
        assert old_task_id != new_task_id, "The task ID has not changed: {}".format(
            old_task_id)

    check_new_task_id()
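Note: the '[s]leep 1000' pattern passed to kill_process_on_host is the classic ps-and-grep idiom: the bracketed character class still matches the running 'sleep 1000' command line, but not the grep/pkill invocation that carries the pattern itself. A minimal, self-contained illustration of the idiom (plain subprocess on a Linux-like host; nothing here is part of the Shakedown API):

import subprocess

# Start a long-running target process.
proc = subprocess.Popen(["sleep", "1000"])

# "grep '[s]leep 1000'" matches the sleep process but not the grep command
# itself, because grep's own command line contains literal brackets and so
# does not match the pattern it is searching for.
out = subprocess.run("ps aux | grep '[s]leep 1000'",
                     shell=True, capture_output=True, text=True).stdout
print(out)  # expected: only the sleep process, no grep line

proc.kill()
proc.wait()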
Example #3
def events_to_file():
    leader_ip = shakedown.marathon_leader_ip()
    print("entering events_to_file fixture")
    shakedown.run_command(leader_ip, 'rm events.txt')

    # In strict mode marathon runs in SSL mode on port 8443 and requires authentication
    if shakedown.ee_version() == 'strict':
        shakedown.run_command(
            leader_ip,
            '(curl --compressed -H "Cache-Control: no-cache" -H "Accept: text/event-stream" '
            + '-H "Authorization: token={}" '.format(
                shakedown.dcos_acs_token()) +
            '-o events.txt -k https://marathon.mesos:8443/v2/events; echo $? > events.exitcode) &'
        )

    # Otherwise marathon runs in HTTP mode on port 8080
    else:
        shakedown.run_command(
            leader_ip,
            '(curl --compressed -H "Cache-Control: no-cache" -H "Accept: text/event-stream" '
            '-o events.txt http://marathon.mesos:8080/v2/events; echo $? > events.exitcode) &'
        )

    yield
    shakedown.kill_process_on_host(leader_ip, '[c]url')
    shakedown.run_command(leader_ip, 'rm events.txt')
    shakedown.run_command(leader_ip, 'rm events.exitcode')
    print("exiting events_to_file fixture")
Example #4
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    initial_task_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, "master")
    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(FOLDERED_SERVICE_NAME),
        "elastic.scheduler.Main")
    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, "master",
                                      initial_task_ids)
def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""

    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()
    tasks = client.get_tasks(app_def["id"])

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] != new_tasks[0]['id'], \
            "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()
def test_mom_when_mom_process_killed():
    """Launched a task from MoM then killed MoM."""

    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()
def test_data_survives_crash():
    # Generate SQL Commands
    cmd_drop_database = cockroach_cmd('DROP DATABASE IF EXISTS bank;')
    cmd_create_database = cockroach_cmd('CREATE DATABASE bank;')
    cmd_create_table = cockroach_cmd(
        'CREATE TABLE accounts (id INT PRIMARY KEY, balance INT);', 'bank')
    cmd_insert = cockroach_cmd(
        'INSERT INTO accounts (id, balance) VALUES (1, 1000), (2, 250);',
        'bank')
    cmd_select = cockroach_cmd('SELECT id, balance FROM accounts;', 'bank')

    # Run SQL Commands (except cmd_select)
    cmd.run_cli(cmd_drop_database)
    out_create_database = cmd.run_cli(cmd_create_database)
    out_create_table = cmd.run_cli(cmd_create_table)
    out_insert = cmd.run_cli(cmd_insert)

    # Kill All CockroachDB Nodes (one at a time)
    service_ips = shakedown.get_service_ips(SERVICE_NAME)
    for service_ip in service_ips:
        # Kill CockroachDB node
        shakedown.kill_process_on_host(service_ip, "cockroach start")
        # Wait for the replacement CockroachDB node to run
        tasks.check_running(SERVICE_NAME, DEFAULT_TASK_COUNT, 5 * 60)
        # Wait for a healthy CockroachDB cluster
        shakedown.wait_for(lambda: cockroach_nodes_healthy(),
                           noisy=True,
                           timeout_seconds=5 * 60)

    # Run cmd_select
    out_select = cmd.run_cli(cmd_select)

    # Confirm Output
    assert '2 rows' in out_select
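The loop above is effectively a rolling kill: each CockroachDB node is killed only after the previous replacement is running and the cluster reports healthy again, so a quorum is kept throughout. A hedged, standard-library sketch of that wait-then-continue pattern (kill_node and cluster_is_healthy are placeholder callables, not Shakedown or SDK APIs):

import time

def wait_for(predicate, timeout_seconds=300, interval_seconds=5):
    """Poll predicate() until it returns True or the timeout expires."""
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval_seconds)
    raise TimeoutError("condition not met within {}s".format(timeout_seconds))

def rolling_kill(nodes, kill_node, cluster_is_healthy):
    # Kill one node at a time, waiting for recovery before moving on.
    for node in nodes:
        kill_node(node)
        wait_for(cluster_is_healthy, timeout_seconds=5 * 60)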
Example #9
def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service(JOB_SERVICE_NAME) is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service(JOB_SERVICE_NAME)
        if f is None:
            return False
        else:
            return len([
                x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"
            ]) > 0

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 hdfs:///netcheck hdfs:///outfile",
        app_name=utils.SPARK_APP_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has running executors")

    host = shakedown.get_service(JOB_SERVICE_NAME).dict()["hostname"]
    id = shakedown.get_service(JOB_SERVICE_NAME).dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_APP_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
def test_app_with_persistent_volume_recovers():
    """Tests that when an app task with a persistent volume gets killed,
       it recovers on the node it was launched on, and it gets attached
       to the same persistent-volume."""

    app_def = apps.persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)

    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, \
        "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    task_id = tasks[0]['id']
    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_task(cmd, target_data):
        run, data = shakedown.run_command_on_master(cmd)

        assert run, "{} did not succeed".format(cmd)
        assert target_data in data, "'{}' not found in {}".format(
            target_data, data)

    check_task(cmd, target_data='hello\n')

    shakedown.kill_process_on_host(host, '[h]ttp.server')

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, \
            "The number of tasks is {} after recovery, but 1 was expected".format(len(tasks))

        new_task_id = tasks[0]['id']
        assert task_id != new_task_id, "The task ID has not changed, and is still {}".format(
            task_id)

    check_task_recovery()

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)

    check_task(cmd, target_data='hello\nhello\n')
Example #11
def test_losing_and_regaining_index_health(default_populated_index):
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "green", service_name=foldered_name)
    shakedown.kill_process_on_host(sdk_hosts.system_host(foldered_name, "data-0-node"), "data__.*Elasticsearch")
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "yellow", service_name=foldered_name)
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "green", service_name=foldered_name)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
Example #12
def test_losing_and_regaining_index_health(default_populated_index):
    config.check_elasticsearch_index_health(
        config.DEFAULT_INDEX_NAME, "green", service_name=FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(sdk_hosts.system_host(
        FOLDERED_SERVICE_NAME, "data-0-node"), "data__.*Elasticsearch")
    config.check_elasticsearch_index_health(
        config.DEFAULT_INDEX_NAME, "yellow", service_name=FOLDERED_SERVICE_NAME)
    config.check_elasticsearch_index_health(
        config.DEFAULT_INDEX_NAME, "green", service_name=FOLDERED_SERVICE_NAME)
Example #13
def test_master_reelection():
    initial_master = config.get_elasticsearch_master(
        service_name=FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(sdk_hosts.system_host(
        FOLDERED_SERVICE_NAME, initial_master), "master__.*Elasticsearch")
    config.wait_for_expected_nodes_to_exist(service_name=FOLDERED_SERVICE_NAME)
    new_master = config.get_elasticsearch_master(
        service_name=FOLDERED_SERVICE_NAME)
    assert new_master.startswith("master") and new_master != initial_master
Example #14
def test_master_reelection():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    shakedown.kill_process_on_host(sdk_hosts.system_host(foldered_name, initial_master), "master__.*Elasticsearch")
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)
    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    assert new_master.startswith("master") and new_master != initial_master
Example #15
def events_to_file():
    print("entering events_to_file fixture")
    shakedown.run_command_on_master('rm events.txt')
    shakedown.run_command_on_master(
        'curl --compressed -H "Cache-Control: no-cache" -H "Accept: text/event-stream" '
        '-o events.txt leader.mesos:8080/v2/events &')
    yield
    shakedown.kill_process_on_host(shakedown.master_ip(), '[c]url')
    shakedown.run_command_on_master('rm events.txt')
    print("exiting events_to_file fixture")
Example #16
def test_master_reelection():
    initial_master = config.get_elasticsearch_master(
        service_name=FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(
        sdk_hosts.system_host(FOLDERED_SERVICE_NAME, initial_master),
        "master__.*Elasticsearch")
    config.wait_for_expected_nodes_to_exist(service_name=FOLDERED_SERVICE_NAME)
    new_master = config.get_elasticsearch_master(
        service_name=FOLDERED_SERVICE_NAME)
    assert new_master.startswith("master") and new_master != initial_master
Example #17
def events_to_file():
    print("entering events_to_file fixture")
    shakedown.run_command_on_master('rm events.txt')
    shakedown.run_command_on_master(
        'curl --compressed -H "Cache-Control: no-cache" -H "Accept: text/event-stream" '
        '-o events.txt leader.mesos:8080/v2/events &')
    yield
    shakedown.kill_process_on_host(shakedown.master_ip(), '[c]url')
    shakedown.run_command_on_master('rm events.txt')
    print("exiting events_to_file fixture")
def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service("HdfsWordCount") is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service("HdfsWordCount")
        if f is None:
            return False
        else:
            return len([
                x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"
            ]) > 0

    driver_id = utils.submit_job(
        app_url=SPARK_EXAMPLES,
        app_args="file:///mnt/mesos/sandbox/",
        app_name="/spark",
        args=[
            "--supervise", "--class",
            "org.apache.spark.examples.streaming.HdfsWordCount", "--conf",
            "spark.cores.max=8", "--conf", "spark.executors.cores=4"
        ])
    LOGGER.info("Started supervised driver {}".format(driver_id))
    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has running executors")

    host = shakedown.get_service("HdfsWordCount").dict()["hostname"]
    id = shakedown.get_service("HdfsWordCount").dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-registered")
    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-started")
    out = utils.kill_driver(driver_id, "/spark")
    LOGGER.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
Example #19
def test_master_reelection():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    shakedown.kill_process_on_host(sdk_hosts.system_host(foldered_name, initial_master), "master__.*Elasticsearch")
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)
    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    assert new_master.startswith("master") and new_master != initial_master

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
Example #20
def test_losing_and_regaining_index_health(default_populated_index):
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME,
                                            "green",
                                            service_name=FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(
        sdk_hosts.system_host(FOLDERED_SERVICE_NAME, "data-0-node"),
        "data__.*Elasticsearch")
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME,
                                            "yellow",
                                            service_name=FOLDERED_SERVICE_NAME)
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME,
                                            "green",
                                            service_name=FOLDERED_SERVICE_NAME)
def test_tasks_updated():
    service_ips = shakedown.get_service_ips(SERVICE_NAME)
    old_task_ids = tasks.get_task_ids(SERVICE_NAME, 'cockroach')
    for service_ip in service_ips:
        # Kill CockroachDB node
        shakedown.kill_process_on_host(service_ip, "cockroach start")
        # Wait for the replacement CockroachDB node to run
        tasks.check_running(SERVICE_NAME, DEFAULT_TASK_COUNT, 5 * 60)
        # Wait for a healthy CockroachDB cluster
        shakedown.wait_for(lambda: cockroach_nodes_healthy(),
                           noisy=True,
                           timeout_seconds=5 * 60)
    tasks.check_tasks_updated(SERVICE_NAME, 'cockroach', old_task_ids)
Example #22
def test_supervise():
    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(JOB_SERVICE_NAME)
        if present:
            return svc is not None
        else:
            return svc is None

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"

    job_args = [
        "--supervise", "--class",
        "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8", "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(
            dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))
    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(JOB_SERVICE_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])
    shakedown.kill_process_on_host(hostname=service_info['hostname'],
                                   pattern=driver_regex)

    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has re-started")
    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)
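wait_job_present above relies on retrying's retry_on_result hook: the decorated function is re-invoked, with a fixed wait, for as long as its return value is falsy, which turns a plain boolean check into a bounded poll. A standalone sketch of the same pattern (requires the retrying package; the condition is a stand-in for the shakedown.get_service lookup):

import time
from retrying import retry

start = time.time()

# Re-invoke every second, for at most 10 seconds, while the result is falsy.
@retry(wait_fixed=1000,
       stop_max_delay=10 * 1000,
       retry_on_result=lambda res: not res)
def condition_met():
    # Placeholder condition; in the tests above this is a service lookup.
    return time.time() - start > 3

condition_met()  # blocks, retrying, until the condition returns True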
def test_task_failure_recovers():
    """ Tests that if a task is KILLED, it will be relaunched and the taskID is different.
    """
    app_id = uuid.uuid4().hex
    app_def = app(app_id)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']
    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(stop_max_delay=10000)
    def check_new_task_id():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0]['id']

    check_new_task_id()
def test_mom_when_mom_process_killed():
    """ Launched a task from MoM then killed MoM.
    """
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)
    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        tasks = client.get_tasks('/agent-failure')
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"
def test_pinned_task_recovers_on_host():
    """ Tests that a killed pinned task will recover on the pinned node.
    """
    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/pinned')

        shakedown.kill_process_on_host(host, '[s]leep')
        shakedown.deployment_wait()

        @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
        def check_for_new_task():
            new_tasks = client.get_tasks('/pinned')
            assert tasks[0]['id'] != new_tasks[0]['id']
            assert new_tasks[0]['host'] == host

        check_for_new_task()
def test_pinned_task_recovers_on_host():
    """ Tests that a killed pinned task will recover on the pinned node.
    """

    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/pinned')

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_for_new_task():
        new_tasks = client.get_tasks('/pinned')
        assert tasks[0]['id'] != new_tasks[0]['id']
        assert new_tasks[0]['host'] == host

    check_for_new_task()
def test_task_failure_recovers():
    """Tests that if a task is KILLED, another one will be launched with a different ID."""

    app_def = apps.sleep_app()
    app_def['cmd'] = 'sleep 1000'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    old_task_id = tasks[0]['id']
    host = tasks[0]['host']

    shakedown.kill_process_on_host(host, '[s]leep 1000')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_new_task_id():
        tasks = client.get_tasks(app_def["id"])
        new_task_id = tasks[0]['id']
        assert old_task_id != new_task_id, "The task ID has not changed: {}".format(old_task_id)

    check_new_task_id()
def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""

    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)

    shakedown.deployment_wait()
    tasks = client.get_tasks(app_def["id"])

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] != new_tasks[0]['id'], "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()
def test_pod_with_persistent_volume_recovers():
    pod_def = pods.persistent_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    assert len(tasks) == 2, \
        "The number of pod tasks is {}, but is expected to be 2".format(len(tasks))

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def wait_for_status_network_info():
        tasks = common.get_pod_tasks(pod_id)
        # the following command throws exceptions if there are no tasks in TASK_RUNNING state
        common.running_status_network_info(tasks[0]['statuses'])

    wait_for_status_network_info()
    host = common.running_status_network_info(
        tasks[0]['statuses'])['ip_addresses'][0]['ip_address']

    task_id1 = tasks[0]['id']
    task_id2 = tasks[1]['id']

    shakedown.kill_process_on_host(host, '[h]ttp.server')

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def wait_for_pod_recovery():
        tasks = common.get_pod_tasks(pod_id)
        assert len(tasks) == 2, \
            "The number of tasks is {} after recovery, but 2 was expected".format(len(tasks))

        old_task_ids = [task_id1, task_id2]
        new_task_id1 = tasks[0]['id']
        new_task_id2 = tasks[1]['id']

        assert new_task_id1 not in old_task_ids, \
            "The task ID has not changed, and is still {}".format(new_task_id1)
        assert new_task_id2 not in old_task_ids, \
            "The task ID has not changed, and is still {}".format(new_task_id2)

    wait_for_pod_recovery()
    wait_for_status_network_info()

    tasks = common.get_pod_tasks(pod_id)
    assert host == common.running_status_network_info(tasks[0]['statuses'])['ip_addresses'][0]['ip_address'], \
        "the pod has been restarted on another host"

    port1 = tasks[0]['discovery']['ports']['ports'][0]["number"]
    port2 = tasks[1]['discovery']['ports']['ports'][0]["number"]
    path1 = tasks[0]['container']['volumes'][0]['container_path']
    path2 = tasks[1]['container']['volumes'][0]['container_path']
    print(host, port1, port2, path1, path2)

    @retrying.retry(wait_fixed=1000,
                    stop_max_attempt_number=30,
                    retry_on_exception=common.ignore_exception)
    def check_data(port, path):
        cmd = "curl {}:{}/{}/foo".format(host, port, path)
        run, data = shakedown.run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert 'hello\nhello\n' in data, "'hello\nhello\n' not found in '{}'".format(
            data)

    check_data(port1, path1)
    check_data(port2, path2)
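check_data above verifies the persisted volume content by shelling out to curl on the Mesos master. When the test host can reach the agent directly (not always the case in a DC/OS cluster), the same check can be done with the standard library; a minimal sketch with placeholder host/port/path values:

import urllib.request

def fetch_volume_file(host, port, path, timeout=10):
    """Fetch the file served by the pod's http.server and return its body."""
    url = "http://{}:{}/{}/foo".format(host, port, path)
    with urllib.request.urlopen(url, timeout=timeout) as resp:
        return resp.read().decode()

# Hypothetical usage mirroring check_data above:
# assert 'hello\nhello\n' in fetch_volume_file(host, port1, path1)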
Example #32
def test_structured_streaming_recovery(kerberized_spark, kerberized_kafka):
    kafka_brokers = ','.join(
        sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME,
                        KAFKA_SERVICE_NAME,
                        'endpoints broker',
                        json=True)['dns'])
    LOGGER.info("Kafka brokers: {}".format(kafka_brokers))

    _uri = upload_jaas()
    uris = "spark.mesos.uris={}".format(_uri)

    jar_uri = utils.upload_dcos_test_jar()

    kafka_kerberos_args = get_kerberized_kafka_spark_conf(
        utils.SPARK_SERVICE_NAME)
    LOGGER.info("Spark Kerberos configuration for Kafka:\n{}".format(
        '\n'.join(kafka_kerberos_args)))

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos", "--conf",
        "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s", "--conf",
        "spark.scheduler.minRegisteredResourcesRatio=1.0", "--conf", uris
    ] + kafka_kerberos_args

    # configuring streaming job and HDFS folders
    setup_hdfs_paths()

    # running kafka producer
    message_set_a = ["abc"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args,
                     message_set_a)

    spark_submit_args = [
        "--supervise", "--class", "StructuredStreamingWithCheckpointing",
        "--conf", "spark.cores.max=2", "--conf", "spark.executor.cores=1",
        "--conf", "spark.sql.shuffle.partitions=2", "--conf",
        "spark.executor.memory=2g"
    ] + common_args

    application_args = "{} {} {} {}".format(kafka_brokers, KAFKA_TEST_TOPIC,
                                            HDFS_CHECKPOINT_DIR,
                                            SPARK_SECURITY_PROTOCOL)

    driver_task_id = utils.submit_job(app_url=jar_uri,
                                      app_args=application_args,
                                      service_name=utils.SPARK_SERVICE_NAME,
                                      args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS +
                                            spark_submit_args))

    # Wait until executor is running
    LOGGER.info("Starting supervised driver {}".format(driver_task_id))
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)

    # validating Structured Streaming topic consumption
    expected_output_a = "{}|  {}".format(message_set_a[0], len(message_set_a))
    LOGGER.info(
        "Validating Structured Streaming topic consumption, waiting for output {}"
        .format(expected_output_a))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)

    # killing the driver
    service_info = shakedown.get_service(SPARK_APPLICATION_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(
        service_info['id'])
    shakedown.kill_process_on_host(hostname=service_info['hostname'],
                                   pattern=driver_regex)

    # sending more data to Kafka
    message_set_b = ["def"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC,
                     common_args + kafka_kerberos_args, message_set_b)

    # checkpointing validation
    sdk_tasks.check_running(SPARK_APPLICATION_NAME,
                            expected_task_count=1,
                            timeout_seconds=600)
    LOGGER.info("Streaming job has re-started")

    # validating Structured Streaming resumed topic consumption
    expected_output_b = "{}|  {}".format(message_set_b[0], len(message_set_b))
    LOGGER.info(
        "Validating that consumption resumed from checkpoint, waiting for output '{}' and '{}'"
        .format(expected_output_a, expected_output_b))

    utils.wait_for_running_job_output(driver_task_id, expected_output_a)
    utils.wait_for_running_job_output(driver_task_id, expected_output_b)
Example #33
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    initial_task_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, "master")
    shakedown.kill_process_on_host(sdk_marathon.get_scheduler_host(
        FOLDERED_SERVICE_NAME), "elastic.scheduler.Main")
    sdk_tasks.check_tasks_not_updated(
        FOLDERED_SERVICE_NAME, "master", initial_task_ids)