def test_mom_when_mom_agent_bounced():
    """Check that a MoM-launched task keeps its ID after MoM's agent restarts.

    A sleep app is pinned to the host returned by
    ``common.ip_other_than_mom()`` and deployed through the
    Marathon-on-Marathon client. The agent at ``common.ip_of_mom()`` is then
    restarted, and the app's first task ID is polled until it matches the ID
    recorded before the restart.
    """
    sleep_app = apps.sleep_app()
    app_id = sleep_app["id"]
    mom_host = common.ip_of_mom()
    common.pin_to_host(sleep_app, common.ip_other_than_mom())

    with shakedown.marathon_on_marathon():
        mom_client = marathon.create_client()
        mom_client.add_app(sleep_app)
        common.deployment_wait(service_id=app_id)

        # Remember which task was running before the agent is bounced.
        task_id_before = mom_client.get_tasks(app_id)[0]['id']

        shakedown.restart_agent(mom_host)

        # Poll once per second, up to 30 attempts.
        @retrying.retry(
            wait_fixed=1000,
            stop_max_attempt_number=30,
            retry_on_exception=common.ignore_exception,
        )
        def check_task_is_back():
            current_tasks = mom_client.get_tasks(app_id)
            assert current_tasks[0]['id'] == task_id_before, "The task ID has changed"

        check_task_is_back()
def recover_agents(hosts):
    """Restart the Mesos agent on every host in ``hosts``.

    Before any restart, ``get_and_verify_plan`` is called with a predicate
    that matches a plan whose status equals
    ``infinity_commons.PlanState.COMPLETE.value``; ``assert_success=False``
    is passed to that call.

    Args:
        hosts: iterable of host identifiers handed to
            ``shakedown.restart_agent``.
    """
    def plan_is_complete(plan):
        return plan['status'] == infinity_commons.PlanState.COMPLETE.value

    get_and_verify_plan(plan_is_complete, assert_success=False)

    for host in hosts:
        print("Restarting mesos agent on {}".format(host))
        shakedown.restart_agent(host)
def test_marathon_when_task_agent_bounced():
    """Check that a task keeps its ID after the agent it is pinned to restarts.

    A sleep app is pinned to the host returned by
    ``common.ip_other_than_mom()``, deployed, and its first task ID recorded.
    That same host's agent is then restarted and the task ID is polled until
    it matches the recorded one.
    """
    sleep_app = apps.sleep_app()
    app_id = sleep_app["id"]
    agent_host = common.ip_other_than_mom()
    common.pin_to_host(sleep_app, agent_host)

    client = marathon.create_client()
    client.add_app(sleep_app)
    common.deployment_wait(service_id=app_id)

    # Remember which task was running before the agent is bounced.
    task_id_before = client.get_tasks(app_id)[0]['id']

    shakedown.restart_agent(agent_host)

    # Poll once per second, up to 30 attempts.
    @retrying.retry(
        wait_fixed=1000,
        stop_max_attempt_number=30,
        retry_on_exception=common.ignore_exception,
    )
    def check_task_is_back():
        current_id = client.get_tasks(app_id)[0]['id']
        assert current_id == task_id_before, \
            "The task {} got replaced with {}".format(task_id_before, current_id)

    check_task_is_back()
def recover_failed_agents(hosts):
    """Restart the Mesos agent on each of ``hosts`` reported as failed.

    Health is checked via ``check_health`` (with ``assert_success=False``),
    the failed hosts are derived with ``find_failed_hosts``, and each one is
    restarted through ``shakedown.restart_agent``.

    This is a best-effort recovery: any exception raised along the way is
    logged and swallowed rather than propagated to the caller.

    Args:
        hosts: candidate hosts, passed through to ``find_failed_hosts``.
    """
    log.info("Recover failed agents- {}".format(str(hosts)))
    try:
        tasks = check_health(wait_time=ONE_MINUTE, assert_success=False)
        log.info("Failed_tasks- " + str(tasks))
        failed_hosts = find_failed_hosts(hosts, tasks)
        # failed_hosts is a collection (it is iterated below), so it must be
        # stringified before concatenation. Without str() this line raised
        # TypeError, which the handler below swallowed, and no agent was
        # ever restarted.
        log.info("Failed_hosts- " + str(failed_hosts))
        for h in failed_hosts:
            log.info("Restarting mesos agent on {}".format(h))
            shakedown.restart_agent(h)
    except Exception as e:
        # Deliberate best-effort: report the failure but do not abort the caller.
        log.error("error in recover_failed_agents")
        log.error(str(e))
def test_marathon_when_task_agent_bounced():
    """Launch an app and restart the node the task is on.

    The 'agent-failure' app is pinned to the host returned by
    ``ip_other_than_mom()``, deployed, and its first task ID recorded. The
    agent on that host is then restarted and the test asserts that the app's
    first task still has the recorded ID.
    """
    app_def = app('agent-failure')
    host = ip_other_than_mom()
    pin_to_host(app_def, host)
    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()
    tasks = client.get_tasks('/agent-failure')
    original_task_id = tasks[0]['id']

    shakedown.restart_agent(host)

    # NOTE(review): the retry window is only 3 seconds (stop_max_delay=3000);
    # confirm this is long enough for the agent to come back.
    @retrying.retry(wait_fixed=1000, stop_max_delay=3000)
    def check_task_is_back():
        tasks = client.get_tasks('/agent-failure')
        # The comparison must be asserted; as a bare expression its result
        # was discarded and the check could never fail.
        assert tasks[0]['id'] == original_task_id, \
            "The task {} got replaced with {}".format(original_task_id, tasks[0]['id'])

    # The check has to be invoked; previously it was defined but never run.
    check_task_is_back()
def test_marathon_when_task_agent_bounced():
    """Check that a task keeps its ID after the agent it is pinned to restarts.

    A sleep app is pinned to the host returned by
    ``common.ip_other_than_mom()``, deployed, and its first task ID recorded.
    That same host's agent is then restarted and the task ID is polled until
    it matches the recorded one.
    """
    sleep_app = apps.sleep_app()
    agent_host = common.ip_other_than_mom()
    common.pin_to_host(sleep_app, agent_host)

    client = marathon.create_client()
    client.add_app(sleep_app)
    shakedown.deployment_wait()

    # Remember which task was running before the agent is bounced.
    task_id_before = client.get_tasks(sleep_app["id"])[0]['id']

    shakedown.restart_agent(agent_host)

    # Poll once per second, up to 30 attempts.
    @retrying.retry(
        wait_fixed=1000,
        stop_max_attempt_number=30,
        retry_on_exception=common.ignore_exception,
    )
    def check_task_is_back():
        current_id = client.get_tasks(sleep_app["id"])[0]['id']
        assert current_id == task_id_before, \
            "The task {} got replaced with {}".format(task_id_before, current_id)

    check_task_is_back()
def test_mom_when_mom_agent_bounced():
    """Launch an app from MoM and restart the node MoM is on.

    The 'agent-failure' app is pinned to the host returned by
    ``ip_other_than_mom()`` and deployed through the Marathon-on-Marathon
    client. The agent at ``ip_of_mom()`` is then restarted and the test
    asserts that the app's first task still has the ID recorded beforehand.
    """
    app_def = app('agent-failure')
    mom_ip = ip_of_mom()
    host = ip_other_than_mom()
    pin_to_host(app_def, host)

    with shakedown.marathon_on_marathon():
        client = marathon.create_client()
        client.add_app(app_def)
        shakedown.deployment_wait()
        tasks = client.get_tasks('/agent-failure')
        original_task_id = tasks[0]['id']

        shakedown.restart_agent(mom_ip)

        # NOTE(review): the retry window is only 3 seconds
        # (stop_max_delay=3000); confirm this is long enough for the agent
        # to come back.
        @retrying.retry(wait_fixed=1000, stop_max_delay=3000, retry_on_exception=common.ignore_exception)
        def check_task_is_back():
            tasks = client.get_tasks('/agent-failure')
            # The comparison must be asserted; as a bare expression its
            # result was discarded and the check could never fail.
            assert tasks[0]['id'] == original_task_id, "The task ID has changed"

        # The check has to be invoked; previously it was defined but never run.
        check_task_is_back()
def test_mom_when_mom_agent_bounced():
    """Check that a MoM-launched task keeps its ID after MoM's agent restarts.

    A sleep app is pinned to the host returned by
    ``common.ip_other_than_mom()`` and deployed through the
    Marathon-on-Marathon client. The agent at ``common.ip_of_mom()`` is then
    restarted, and the app's first task ID is polled until it matches the ID
    recorded before the restart.
    """
    sleep_app = apps.sleep_app()
    app_id = sleep_app["id"]
    mom_host = common.ip_of_mom()
    common.pin_to_host(sleep_app, common.ip_other_than_mom())

    with shakedown.marathon_on_marathon():
        mom_client = marathon.create_client()
        mom_client.add_app(sleep_app)
        shakedown.deployment_wait()

        # Remember which task was running before the agent is bounced.
        task_id_before = mom_client.get_tasks(app_id)[0]['id']

        shakedown.restart_agent(mom_host)

        # Poll once per second, up to 30 attempts.
        @retrying.retry(
            wait_fixed=1000,
            stop_max_attempt_number=30,
            retry_on_exception=common.ignore_exception,
        )
        def check_task_is_back():
            current_tasks = mom_client.get_tasks(app_id)
            assert current_tasks[0]['id'] == task_id_before, "The task ID has changed"

        check_task_is_back()
def recover_host_from_partitioning(host):
    """Log the recovery, then restart Erlang and the Mesos agent on ``host``.

    Calls ``restart_erlang_on_host(host)`` followed by
    ``shakedown.restart_agent(host)``.

    Args:
        host: the host to recover; passed unchanged to both restart calls.
    """
    # NOTE: a guard on is_dns_healthy_for_node(host) used to sit here but is
    # disabled, so both restarts run unconditionally.
    message = "Restarting erlang and mesos on {}".format(host)
    log.info(message)
    restart_erlang_on_host(host)
    shakedown.restart_agent(host)
def recover_failed_agents(hosts):
    """Restart the Mesos agent on each of ``hosts`` reported as failed.

    Health is checked via ``check_health`` (``wait_time=HEALTH_WAIT_TIME``,
    ``assert_success=False``). Its result and ``hosts`` are passed to
    ``find_failed_hosts``, and every host it yields is restarted.

    Args:
        hosts: candidate hosts, passed through to ``find_failed_hosts``.
    """
    health_result = check_health(wait_time=HEALTH_WAIT_TIME, assert_success=False)
    for failed_host in find_failed_hosts(hosts, health_result):
        print("Restarting mesos agent on {}".format(failed_host))
        shakedown.restart_agent(failed_host)