def test_unchanged_scheduler_restarts_without_restarting_tasks():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_task_ids = sdk_tasks.get_task_ids(foldered_name, '')

    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(foldered_name),
        "elastic.scheduler.Main")

    sdk_tasks.check_tasks_not_updated(foldered_name, '', initial_task_ids)

def test_task_failure_recovers():
    """Tests that if a task is KILLED, another one will be launched with a different ID."""
    app_def = apps.sleep_app()
    app_def['cmd'] = 'sleep 1000'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait(app_id=app_def["id"])

    tasks = client.get_tasks(app_def["id"])
    old_task_id = tasks[0]['id']
    host = tasks[0]['host']

    shakedown.kill_process_on_host(host, '[s]leep 1000')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_new_task_id():
        tasks = client.get_tasks(app_def["id"])
        new_task_id = tasks[0]['id']
        assert old_task_id != new_task_id, "The task ID has not changed: {}".format(old_task_id)

    check_new_task_id()

def events_to_file():
    leader_ip = shakedown.marathon_leader_ip()
    print("entering events_to_file fixture")
    shakedown.run_command(leader_ip, 'rm events.txt')

    # In strict mode Marathon runs in SSL mode on port 8443 and requires authentication
    if shakedown.ee_version() == 'strict':
        shakedown.run_command(
            leader_ip,
            '(curl --compressed -H "Cache-Control: no-cache" -H "Accept: text/event-stream" ' +
            '-H "Authorization: token={}" '.format(shakedown.dcos_acs_token()) +
            '-o events.txt -k https://marathon.mesos:8443/v2/events; echo $? > events.exitcode) &')
    # Otherwise Marathon runs in HTTP mode on port 8080
    else:
        shakedown.run_command(
            leader_ip,
            '(curl --compressed -H "Cache-Control: no-cache" -H "Accept: text/event-stream" '
            '-o events.txt http://marathon.mesos:8080/v2/events; echo $? > events.exitcode) &')

    yield

    shakedown.kill_process_on_host(leader_ip, '[c]url')
    shakedown.run_command(leader_ip, 'rm events.txt')
    shakedown.run_command(leader_ip, 'rm events.exitcode')
    print("exiting events_to_file fixture")

def test_unchanged_scheduler_restarts_without_restarting_tasks():
    initial_task_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, "master")

    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(FOLDERED_SERVICE_NAME),
        "elastic.scheduler.Main")

    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, "master", initial_task_ids)

def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""
    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] != new_tasks[0]['id'], "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()

def test_mom_when_mom_process_killed():
    """Launched a task from MoM then killed MoM."""
    app_def = apps.sleep_app()
    app_id = app_def["id"]
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

        tasks = client.get_tasks(app_id)
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(common.ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks(app_id)
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        check_task_is_back()

def test_data_survives_crash():
    # Generate SQL commands
    cmd_drop_database = cockroach_cmd('DROP DATABASE IF EXISTS bank;')
    cmd_create_database = cockroach_cmd('CREATE DATABASE bank;')
    cmd_create_table = cockroach_cmd(
        'CREATE TABLE accounts (id INT PRIMARY KEY, balance INT);', 'bank')
    cmd_insert = cockroach_cmd(
        'INSERT INTO accounts (id, balance) VALUES (1, 1000), (2, 250);', 'bank')
    cmd_select = cockroach_cmd('SELECT id, balance FROM accounts;', 'bank')

    # Run SQL commands (except cmd_select)
    cmd.run_cli(cmd_drop_database)
    out_create_database = cmd.run_cli(cmd_create_database)
    out_create_table = cmd.run_cli(cmd_create_table)
    out_insert = cmd.run_cli(cmd_insert)

    # Kill all CockroachDB nodes (one at a time)
    service_ips = shakedown.get_service_ips(SERVICE_NAME)
    for service_ip in service_ips:
        shakedown.kill_process_on_host(service_ip, "cockroach start")  # Kill CockroachDB node
        tasks.check_running(SERVICE_NAME, DEFAULT_TASK_COUNT, 5 * 60)  # Wait for new CockroachDB node to run
        shakedown.wait_for(lambda: cockroach_nodes_healthy(),
                           noisy=True,
                           timeout_seconds=5 * 60)  # Wait for healthy CockroachDB cluster

    # Run cmd_select
    out_select = cmd.run_cli(cmd_select)

    # Confirm output
    assert '2 rows' in out_select

def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service(JOB_SERVICE_NAME) is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service(JOB_SERVICE_NAME)
        if f is None:
            return False
        else:
            return len([x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"]) > 0

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"
    job_args = [
        "--supervise",
        "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8",
        "--conf", "spark.executors.cores=4"
    ]

    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 hdfs:///netcheck hdfs:///outfile",
        app_name=utils.SPARK_APP_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has registered")

    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has running executors")

    host = shakedown.get_service(JOB_SERVICE_NAME).dict()["hostname"]
    id = shakedown.get_service(JOB_SERVICE_NAME).dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-registered")

    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    log.info("Job has re-started")

    out = utils.kill_driver(driver_id, utils.SPARK_APP_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"

    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)

def test_app_with_persistent_volume_recovers():
    """Tests that when an app task with a persistent volume gets killed, it recovers on the node
       it was launched on, and it gets attached to the same persistent volume."""
    app_def = apps.persistent_volume_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    common.deployment_wait(service_id=app_id)

    tasks = client.get_tasks(app_id)
    assert len(tasks) == 1, \
        "The number of tasks is {} after deployment, but 1 was expected".format(len(tasks))

    task_id = tasks[0]['id']
    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task(cmd, target_data):
        run, data = shakedown.run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert target_data in data, "'{}' not found in {}".format(target_data, data)

    check_task(cmd, target_data='hello\n')

    shakedown.kill_process_on_host(host, '[h]ttp.server')

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_recovery():
        tasks = client.get_tasks(app_id)
        assert len(tasks) == 1, \
            "The number of tasks is {} after recovery, but 1 was expected".format(len(tasks))
        new_task_id = tasks[0]['id']
        assert task_id != new_task_id, "The task ID has not changed, and is still {}".format(task_id)

    check_task_recovery()

    port = tasks[0]['ports'][0]
    host = tasks[0]['host']
    cmd = "curl {}:{}/data/foo".format(host, port)
    check_task(cmd, target_data='hello\nhello\n')

def test_losing_and_regaining_index_health(default_populated_index):
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "green", service_name=foldered_name)
    shakedown.kill_process_on_host(
        sdk_hosts.system_host(foldered_name, "data-0-node"),
        "data__.*Elasticsearch")
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "yellow", service_name=foldered_name)
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "green", service_name=foldered_name)

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)

def test_losing_and_regaining_index_health(default_populated_index):
    config.check_elasticsearch_index_health(
        config.DEFAULT_INDEX_NAME, "green", service_name=FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(
        sdk_hosts.system_host(FOLDERED_SERVICE_NAME, "data-0-node"),
        "data__.*Elasticsearch")
    config.check_elasticsearch_index_health(
        config.DEFAULT_INDEX_NAME, "yellow", service_name=FOLDERED_SERVICE_NAME)
    config.check_elasticsearch_index_health(
        config.DEFAULT_INDEX_NAME, "green", service_name=FOLDERED_SERVICE_NAME)

def test_master_reelection():
    initial_master = config.get_elasticsearch_master(service_name=FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(
        sdk_hosts.system_host(FOLDERED_SERVICE_NAME, initial_master),
        "master__.*Elasticsearch")
    config.wait_for_expected_nodes_to_exist(service_name=FOLDERED_SERVICE_NAME)
    new_master = config.get_elasticsearch_master(service_name=FOLDERED_SERVICE_NAME)
    assert new_master.startswith("master") and new_master != initial_master

def test_master_reelection():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    shakedown.kill_process_on_host(
        sdk_hosts.system_host(foldered_name, initial_master),
        "master__.*Elasticsearch")

    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)

    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)
    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    assert new_master.startswith("master") and new_master != initial_master

def events_to_file():
    print("entering events_to_file fixture")
    shakedown.run_command_on_master('rm events.txt')
    shakedown.run_command_on_master(
        'curl --compressed -H "Cache-Control: no-cache" -H "Accept: text/event-stream" '
        '-o events.txt leader.mesos:8080/v2/events &')

    yield

    shakedown.kill_process_on_host(shakedown.master_ip(), '[c]url')
    shakedown.run_command_on_master('rm events.txt')
    print("exiting events_to_file fixture")

def test_master_reelection():
    initial_master = config.get_elasticsearch_master(service_name=FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(
        sdk_hosts.system_host(FOLDERED_SERVICE_NAME, initial_master),
        "master__.*Elasticsearch")
    config.wait_for_expected_nodes_to_exist(service_name=FOLDERED_SERVICE_NAME)
    new_master = config.get_elasticsearch_master(service_name=FOLDERED_SERVICE_NAME)
    assert new_master.startswith("master") and new_master != initial_master

def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service("HdfsWordCount") is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service("HdfsWordCount")
        if f is None:
            return False
        else:
            return len([x for x in f.dict()["tasks"] if x["state"] == "TASK_RUNNING"]) > 0

    driver_id = utils.submit_job(
        app_url=SPARK_EXAMPLES,
        app_args="file:///mnt/mesos/sandbox/",
        app_name="/spark",
        args=["--supervise",
              "--class", "org.apache.spark.examples.streaming.HdfsWordCount",
              "--conf", "spark.cores.max=8",
              "--conf", "spark.executors.cores=4"])
    LOGGER.info("Started supervised driver {}".format(driver_id))

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has registered")

    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has running executors")

    host = shakedown.get_service("HdfsWordCount").dict()["hostname"]
    id = shakedown.get_service("HdfsWordCount").dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-registered")

    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-started")

    out = utils.kill_driver(driver_id, "/spark")
    LOGGER.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"

    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)

def test_master_reelection():
    initial_master = config.get_elasticsearch_master(service_name=foldered_name)
    shakedown.kill_process_on_host(
        sdk_hosts.system_host(foldered_name, initial_master),
        "master__.*Elasticsearch")

    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)

    config.wait_for_expected_nodes_to_exist(service_name=foldered_name)
    new_master = config.get_elasticsearch_master(service_name=foldered_name)
    assert new_master.startswith("master") and new_master != initial_master

    sdk_plan.wait_for_completed_deployment(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)

def test_losing_and_regaining_index_health(default_populated_index):
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "green", service_name=FOLDERED_SERVICE_NAME)
    shakedown.kill_process_on_host(
        sdk_hosts.system_host(FOLDERED_SERVICE_NAME, "data-0-node"),
        "data__.*Elasticsearch")
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "yellow", service_name=FOLDERED_SERVICE_NAME)
    config.check_elasticsearch_index_health(config.DEFAULT_INDEX_NAME, "green", service_name=FOLDERED_SERVICE_NAME)

def test_tasks_updated():
    service_ips = shakedown.get_service_ips(SERVICE_NAME)
    old_task_ids = tasks.get_task_ids(SERVICE_NAME, 'cockroach')

    for service_ip in service_ips:
        shakedown.kill_process_on_host(service_ip, "cockroach start")  # Kill CockroachDB node
        tasks.check_running(SERVICE_NAME, DEFAULT_TASK_COUNT, 5 * 60)  # Wait for new CockroachDB node to run
        shakedown.wait_for(lambda: cockroach_nodes_healthy(),
                           noisy=True,
                           timeout_seconds=5 * 60)  # Wait for healthy CockroachDB cluster

    tasks.check_tasks_updated(SERVICE_NAME, 'cockroach', old_task_ids)

def test_supervise():
    @retrying.retry(wait_fixed=1000, stop_max_delay=600 * 1000, retry_on_result=lambda res: not res)
    def wait_job_present(present):
        svc = shakedown.get_service(JOB_SERVICE_NAME)
        if present:
            return svc is not None
        else:
            return svc is None

    JOB_SERVICE_NAME = "RecoverableNetworkWordCount"
    job_args = [
        "--supervise",
        "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8",
        "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    wait_job_present(True)
    log.info("Job has registered")

    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has running executors")

    service_info = shakedown.get_service(JOB_SERVICE_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])
    shakedown.kill_process_on_host(hostname=service_info['hostname'], pattern=driver_regex)

    wait_job_present(True)
    log.info("Job has re-registered")

    sdk_tasks.check_running(JOB_SERVICE_NAME, 1)
    log.info("Job has re-started")

    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"

    wait_job_present(False)

def test_task_failure_recovers():
    """Tests that if a task is KILLED, it will be relaunched and the taskID is different."""
    app_id = uuid.uuid4().hex
    app_def = app(app_id)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_id)
    host = tasks[0]['host']

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(stop_max_delay=10000)
    def check_new_task_id():
        new_tasks = client.get_tasks(app_id)
        assert tasks[0]['id'] != new_tasks[0]['id']

    # Invoke the retried check; without this call the assertion never runs.
    check_new_task_id()

def test_mom_when_mom_process_killed():
    """Launched a task from MoM then killed MoM."""
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()

        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        shakedown.kill_process_on_host(ip_of_mom(), 'marathon-assembly')
        shakedown.wait_for_task('marathon', 'marathon-user', 300)
        shakedown.wait_for_service_endpoint('marathon-user')

        tasks = client.get_tasks('/agent-failure')
        # The original comparison was a no-op; assert so the test actually fails on a change.
        assert tasks[0]['id'] == original_task_id, "The task ID has changed"

def test_pinned_task_recovers_on_host():
    """Tests that a killed pinned task will recover on the pinned node."""
    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/pinned')

        shakedown.kill_process_on_host(host, '[s]leep')
        shakedown.deployment_wait()

        @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
        def check_for_new_task():
            new_tasks = client.get_tasks('/pinned')
            assert tasks[0]['id'] != new_tasks[0]['id']
            assert new_tasks[0]['host'] == host

        # Invoke the retried check; without this call the assertions never run.
        check_for_new_task()

def test_pinned_task_recovers_on_host():
    """Tests that a killed pinned task will recover on the pinned node."""
    app_def = app('pinned')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/pinned')

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_for_new_task():
        new_tasks = client.get_tasks('/pinned')
        assert tasks[0]['id'] != new_tasks[0]['id']
        assert new_tasks[0]['host'] == host

    # Invoke the retried check; without this call the assertions never run.
    check_for_new_task()

def test_task_failure_recovers():
    """Tests that if a task is KILLED, another one will be launched with a different ID."""
    app_def = apps.sleep_app()
    app_def['cmd'] = 'sleep 1000'

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    old_task_id = tasks[0]['id']
    host = tasks[0]['host']

    shakedown.kill_process_on_host(host, '[s]leep 1000')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_new_task_id():
        tasks = client.get_tasks(app_def["id"])
        new_task_id = tasks[0]['id']
        assert old_task_id != new_task_id, "The task ID has not changed: {}".format(old_task_id)

    check_new_task_id()

def test_pinned_task_recovers_on_host():
    """Tests that when a pinned task gets killed, it recovers on the node it was pinned to."""
    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])

    shakedown.kill_process_on_host(host, '[s]leep')
    shakedown.deployment_wait()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_for_new_task():
        new_tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] != new_tasks[0]['id'], "The task did not get killed: {}".format(tasks[0]['id'])
        assert new_tasks[0]['host'] == host, \
            "The task got restarted on {}, but it was supposed to stay on {}".format(new_tasks[0]['host'], host)

    check_for_new_task()

def test_pod_with_persistent_volume_recovers():
    pod_def = pods.persistent_volume_pod()
    pod_id = pod_def['id']

    client = marathon.create_client()
    client.add_pod(pod_def)
    common.deployment_wait(service_id=pod_id)

    tasks = common.get_pod_tasks(pod_id)
    assert len(tasks) == 2, "The number of pod tasks is {}, but is expected to be 2".format(len(tasks))

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def wait_for_status_network_info():
        tasks = common.get_pod_tasks(pod_id)
        # the following command throws exceptions if there are no tasks in TASK_RUNNING state
        common.running_status_network_info(tasks[0]['statuses'])

    wait_for_status_network_info()

    host = common.running_status_network_info(tasks[0]['statuses'])['ip_addresses'][0]['ip_address']
    task_id1 = tasks[0]['id']
    task_id2 = tasks[1]['id']

    shakedown.kill_process_on_host(host, '[h]ttp.server')

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def wait_for_pod_recovery():
        tasks = common.get_pod_tasks(pod_id)
        assert len(tasks) == 2, "The number of tasks is {} after recovery, but 2 was expected".format(len(tasks))

        old_task_ids = [task_id1, task_id2]
        new_task_id1 = tasks[0]['id']
        new_task_id2 = tasks[1]['id']

        assert new_task_id1 not in old_task_ids, \
            "The task ID has not changed, and is still {}".format(new_task_id1)
        assert new_task_id2 not in old_task_ids, \
            "The task ID has not changed, and is still {}".format(new_task_id2)

    wait_for_pod_recovery()
    wait_for_status_network_info()

    tasks = common.get_pod_tasks(pod_id)
    assert host == common.running_status_network_info(tasks[0]['statuses'])['ip_addresses'][0]['ip_address'], \
        "the pod has been restarted on another host"

    port1 = tasks[0]['discovery']['ports']['ports'][0]["number"]
    port2 = tasks[1]['discovery']['ports']['ports'][0]["number"]
    path1 = tasks[0]['container']['volumes'][0]['container_path']
    path2 = tasks[1]['container']['volumes'][0]['container_path']
    print(host, port1, port2, path1, path2)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_data(port, path):
        cmd = "curl {}:{}/{}/foo".format(host, port, path)
        run, data = shakedown.run_command_on_master(cmd)
        assert run, "{} did not succeed".format(cmd)
        assert 'hello\nhello\n' in data, "'hello\nhello\n' not found in '{}'".format(data)

    check_data(port1, path1)
    check_data(port2, path2)

def test_structured_streaming_recovery(kerberized_spark, kerberized_kafka):
    kafka_brokers = ','.join(
        sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME, KAFKA_SERVICE_NAME, 'endpoints broker', json=True)['dns'])
    LOGGER.info("Kafka brokers: {}".format(kafka_brokers))

    _uri = upload_jaas()
    uris = "spark.mesos.uris={}".format(_uri)

    jar_uri = utils.upload_dcos_test_jar()

    kafka_kerberos_args = get_kerberized_kafka_spark_conf(utils.SPARK_SERVICE_NAME)
    LOGGER.info("Spark Kerberos configuration for Kafka:\n{}".format('\n'.join(kafka_kerberos_args)))

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ] + kafka_kerberos_args

    # configuring streaming job and HDFS folders
    setup_hdfs_paths()

    # running kafka producer
    message_set_a = ["abc"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args, message_set_a)

    spark_submit_args = [
        "--supervise",
        "--class", "StructuredStreamingWithCheckpointing",
        "--conf", "spark.cores.max=2",
        "--conf", "spark.executor.cores=1",
        "--conf", "spark.sql.shuffle.partitions=2",
        "--conf", "spark.executor.memory=2g"
    ] + common_args

    application_args = "{} {} {} {}".format(
        kafka_brokers, KAFKA_TEST_TOPIC, HDFS_CHECKPOINT_DIR, SPARK_SECURITY_PROTOCOL)

    driver_task_id = utils.submit_job(
        app_url=jar_uri,
        app_args=application_args,
        service_name=utils.SPARK_SERVICE_NAME,
        args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + spark_submit_args))

    # Wait until executor is running
    LOGGER.info("Starting supervised driver {}".format(driver_task_id))
    sdk_tasks.check_running(SPARK_APPLICATION_NAME, expected_task_count=1, timeout_seconds=600)

    # validating Structured Streaming topic consumption
    expected_output_a = "{}| {}".format(message_set_a[0], len(message_set_a))
    LOGGER.info("Validating Structured Streaming topic consumption, waiting for output {}".format(expected_output_a))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)

    # killing the driver
    service_info = shakedown.get_service(SPARK_APPLICATION_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])
    shakedown.kill_process_on_host(hostname=service_info['hostname'], pattern=driver_regex)

    # sending more data to Kafka
    message_set_b = ["def"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args + kafka_kerberos_args, message_set_b)

    # checkpointing validation
    sdk_tasks.check_running(SPARK_APPLICATION_NAME, expected_task_count=1, timeout_seconds=600)
    LOGGER.info("Streaming job has re-started")

    # validating Structured Streaming resumed topic consumption
    expected_output_b = "{}| {}".format(message_set_b[0], len(message_set_b))
    LOGGER.info("Validating that consumption resumed from checkpoint, waiting for output '{}' and '{}'"
                .format(expected_output_a, expected_output_b))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)
    utils.wait_for_running_job_output(driver_task_id, expected_output_b)

def test_unchanged_scheduler_restarts_without_restarting_tasks():
    initial_task_ids = sdk_tasks.get_task_ids(FOLDERED_SERVICE_NAME, "master")

    shakedown.kill_process_on_host(
        sdk_marathon.get_scheduler_host(FOLDERED_SERVICE_NAME),
        "elastic.scheduler.Main")

    sdk_tasks.check_tasks_not_updated(FOLDERED_SERVICE_NAME, "master", initial_task_ids)