import time
from datetime import timedelta

import retrying
import shakedown

# `apps` and `common` are local helper modules shipped alongside these system
# tests (assumed project layout); `dcos.marathon` provides the Marathon client.
import apps
import common
from dcos import marathon


def test_marathon_when_disconnected_from_zk():
    """Launches an app from Marathon, then knocks out access to ZK from Marathon.
    Verifies that the task is preserved.
    """

    app_def = apps.sleep_app()
    host = common.ip_other_than_mom()
    common.pin_to_host(app_def, host)

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    tasks = client.get_tasks(app_def["id"])
    original_task_id = tasks[0]['id']

    with shakedown.iptable_rules(host):
        common.block_port(host, 2181)
        # duration of the ZK block
        time.sleep(10)

    # after access to ZK is restored, the original task should still be running
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_task_is_back():
        tasks = client.get_tasks(app_def["id"])
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    check_task_is_back()
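# The tests in this module lean on a few small local helpers. The sketches below
# are illustrative assumptions about what those helpers could look like, not the
# actual implementations in apps.py/common.py; they assume shakedown's
# run_command_on_agent() is available for running commands on an agent.

def sleep_app_sketch():
    """Rough stand-in for apps.sleep_app(): a single-instance app that only sleeps."""
    return {
        "id": "/sleep-app",
        "cmd": "sleep 1000000",
        "cpus": 0.1,
        "mem": 32,
        "instances": 1
    }


def block_port_sketch(host, port, direction='INPUT'):
    """Rough stand-in for common.block_port(): drop TCP traffic on `port` via iptables.

    shakedown.iptable_rules() is expected to snapshot and restore the rules around
    the `with` block, so the helper only needs to insert the DROP rule.
    """
    shakedown.run_command_on_agent(
        host, "sudo iptables -I {} -p tcp --dport {} -j DROP".format(direction, port))


def ignore_exception_sketch(exc):
    """Rough stand-in for common.ignore_exception(): tell `retrying` to retry on any exception."""
    return True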
def test_task_gets_restarted_due_to_network_split():
    """Verifies that a task is restarted when its health check fails because of a network partition."""

    app_def = apps.http_server()
    app_def['healthChecks'] = [common.health_check()]
    common.pin_to_host(app_def, common.ip_other_than_mom())

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_def["id"])
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
    assert app['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 1 was expected".format(app['tasksHealthy'])

    tasks = client.get_tasks(app_def["id"])
    task_id = tasks[0]['id']
    host = tasks[0]['host']
    port = tasks[0]['ports'][0]

    # introduce a network partition
    with shakedown.iptable_rules(host):
        common.block_port(host, port)
        time.sleep(10)

    shakedown.deployment_wait()

    app = client.get_app(app_def["id"])
    tasks = client.get_tasks(app_def["id"])
    new_task_id = tasks[0]['id']
    assert task_id != new_task_id, "The task didn't get killed because of a failed health check"

    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
    assert app['tasksHealthy'] == 1, \
        "The number of healthy tasks is {}, but 1 was expected".format(app['tasksHealthy'])

    # the network partition should have caused a task restart
    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_health_message():
        tasks = client.get_tasks(app_def["id"])
        new_task_id = tasks[0]['id']
        assert task_id != new_task_id, "The task has not been restarted: {}".format(task_id)

        app = client.get_app(app_def["id"])
        assert app['tasksRunning'] == 1, \
            "The number of running tasks is {}, but 1 was expected".format(app['tasksRunning'])
        assert app['tasksHealthy'] == 1, \
            "The number of healthy tasks is {}, but 1 was expected".format(app['tasksHealthy'])

    check_health_message()
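# common.health_check() is another local helper; below is a minimal sketch of the
# kind of Marathon HTTP health check it might return. The field names follow the
# Marathon app API; the concrete default values here are assumptions.
def health_check_sketch(path='/', port_index=0, failures=1, timeout=2):
    return {
        'protocol': 'HTTP',
        'path': path,
        'portIndex': port_index,
        'gracePeriodSeconds': 5,
        'intervalSeconds': 1,
        'timeoutSeconds': timeout,
        'maxConsecutiveFailures': failures
    }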
def test_marathon_master_partition_leader_change(marathon_service_name):
    original_leader = common.get_marathon_leader_not_on_master_leader_node()

    # block outbound connections from the Marathon leader to the Mesos master
    with shakedown.iptable_rules(original_leader):
        common.block_port(original_leader, 5050, direction='OUTPUT')
        # duration of the master block
        time.sleep(timedelta(minutes=1.5).total_seconds())

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    current_leader = shakedown.marathon_leader_ip()
    assert original_leader != current_leader, "A new Marathon leader has not been elected"
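# `marathon_service_name` is a pytest fixture provided elsewhere (typically in
# conftest.py). A minimal sketch of what it might look like, assuming the tests
# target the root Marathon instance registered under the service name 'marathon'
# (the name and scope are assumptions; the real fixture is named
# marathon_service_name):
import pytest


@pytest.fixture(scope='function')
def marathon_service_name_sketch():
    return 'marathon'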
def test_marathon_zk_partition_leader_change(marathon_service_name):
    original_leader = common.get_marathon_leader_not_on_master_leader_node()

    # block ZK traffic on the Marathon leader (which is not the master leader node)
    with shakedown.iptable_rules(original_leader):
        common.block_port(original_leader, 2181, direction='INPUT')
        common.block_port(original_leader, 2181, direction='OUTPUT')
        # duration of the ZK block; it must be greater than all the default ZK
        # timeout values in Marathon
        time.sleep(20)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    current_leader = shakedown.marathon_leader_ip()
    assert original_leader != current_leader, "A new Marathon leader has not been elected"
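# shakedown.iptable_rules() is used above as a context manager that is expected
# to restore the agent's firewall state even if an assertion fails inside the
# `with` block, so a broken partition rule cannot leak into later tests. A rough
# sketch of that pattern (an assumption about its behavior, not shakedown's
# actual implementation), again relying on run_command_on_agent():
from contextlib import contextmanager


@contextmanager
def iptable_rules_sketch(host):
    # save the current rules on the agent
    shakedown.run_command_on_agent(host, 'sudo iptables-save > /tmp/iptables.rules')
    try:
        yield
    finally:
        # restore the saved rules, removing any DROP rules added inside the block
        shakedown.run_command_on_agent(host, 'sudo iptables-restore < /tmp/iptables.rules')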