def test_config_update_then_master_killed():
    master_leader_ip = shakedown.master_leader_ip()
    run_planned_operation(
        lambda: bump_cpu_count_config(-0.1),
        lambda: kill_task_with_pattern('mesos-master', master_leader_ip))
    verify_leader_changed(master_leader_ip)
    check_health()

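# The tests in this file rely on kill_task_with_pattern, which is not defined
# here. A minimal sketch, assuming it kills matching processes over SSH via
# shakedown; the exact pkill invocation and the lack of return-value handling
# are assumptions, not the verbatim helper:
def kill_task_with_pattern(pattern, host=None):
    # `pkill -f` matches `pattern` against the full command line of each process
    command = "sudo pkill -9 -f {}".format(pattern)
    if host is None:
        shakedown.run_command_on_master(command)
    else:
        shakedown.run_command_on_agent(host, command)
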
def test_partition_master_outgoing():
    master_leader_ip = shakedown.master_leader_ip()
    shakedown.partition_master(master_leader_ip, incoming=False, outgoing=True)
    time.sleep(20)
    shakedown.reconnect_master(master_leader_ip)
    check_health()

def test_partition_master_both_ways():
    master_leader_ip = shakedown.master_leader_ip()
    shakedown.partition_master(master_leader_ip)
    time.sleep(20)
    shakedown.reconnect_master(master_leader_ip)
    check_health()

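# check_health is referenced throughout but not defined here. A minimal
# sketch, assuming it polls the standard DC/OS diagnostics API
# (/system/health/v1/units); the cluster URL default and the absence of
# authentication handling are simplifications:
import requests

def check_health(cluster_url='http://leader.mesos'):
    response = requests.get('{}/system/health/v1/units'.format(cluster_url))
    response.raise_for_status()
    # in the diagnostics API, health == 0 means the unit is healthy
    unhealthy = [unit['id'] for unit in response.json()['units'] if unit['health'] != 0]
    assert not unhealthy, 'Unhealthy systemd units: {}'.format(unhealthy)
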
def test_config_update_then_zk_killed():
    master_leader_ip = shakedown.master_leader_ip()
    run_planned_operation(
        bump_cpu_count_config,
        lambda: kill_task_with_pattern('zookeeper', master_leader_ip),
        lambda: verify_leader_changed(master_leader_ip))
    check_health()

def test_cleanup_then_master_killed():
    master_leader_ip = shakedown.master_leader_ip()
    run_planned_operation(
        run_cleanup,
        lambda: kill_task_with_pattern('mesos-master', master_leader_ip))
    verify_leader_changed(master_leader_ip)
    check_health()

def test_repair_then_zk_killed():
    master_leader_ip = shakedown.master_leader_ip()
    run_planned_operation(
        run_repair,
        lambda: kill_task_with_pattern('zookeeper', master_leader_ip),
        lambda: verify_leader_changed(master_leader_ip))
    check_health()

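# run_planned_operation is not defined in this file. A heavily hedged sketch of
# the contract the tests above assume: run an operation that kicks off a deploy
# plan, inject the failure while the plan is in flight, and run an optional
# recovery step. The real helper presumably also verifies plan state before and
# after; that bookkeeping is omitted here:
def run_planned_operation(operation, failure=lambda: None, recovery=lambda: None):
    operation()   # e.g. a config bump that triggers a new deploy plan
    failure()     # inject the fault while the plan is still in progress
    recovery()    # e.g. wait for a new Mesos master leader to be elected
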
def run_command_on_leader(command, username=None, key_path=None, noisy=True):
    """Run a command on the Mesos leader. Important for Multi-Master."""
    return run_command(shakedown.master_leader_ip(), command, username, key_path, noisy)

def get_marathon_leader_not_on_master_leader_node():
    marathon_leader = shakedown.marathon_leader_ip()
    master_leader = shakedown.master_leader_ip()
    print('marathon leader: {}'.format(marathon_leader))
    print('mesos leader: {}'.format(master_leader))
    if marathon_leader == master_leader:
        delete_marathon_path('v2/leader')
        shakedown.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds())
        marathon_leader = assert_marathon_leadership_changed(marathon_leader)
        print('switched leader to: {}'.format(marathon_leader))
    return marathon_leader

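# delete_marathon_path('v2/leader') above asks the current Marathon leader to
# abdicate: DELETE /v2/leader is Marathon's leader-abdication endpoint. A
# minimal sketch, assuming direct HTTP access to Marathon; the base URL is an
# assumption, and a real cluster would typically route through Admin Router
# with authentication:
import requests

def delete_marathon_path(path, marathon_url='http://marathon.mesos:8080'):
    response = requests.delete('{}/{}'.format(marathon_url, path))
    response.raise_for_status()
    return response
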
def test_zk_killed():
    time.sleep(60)
    log.info("Starting {}".format(sys._getframe().f_code.co_name))
    master_leader_ip = shakedown.master_leader_ip()
    log.info("master leader ip: " + master_leader_ip)
    kill_task_with_pattern('zookeeper', master_leader_ip)
    time.sleep(60)
    # _block_on_adminrouter(master_leader_ip)
    verify_leader_changed(master_leader_ip)
    time.sleep(60)
    check_health()

def get_marathon_leader_not_on_master_leader_node():
    marathon_leader = shakedown.marathon_leader_ip()
    master_leader = shakedown.master_leader_ip()
    print('marathon leader: {}'.format(marathon_leader))
    print('mesos leader: {}'.format(master_leader))
    if marathon_leader == master_leader:
        delete_marathon_path('v2/leader')
        shakedown.wait_for_service_endpoint('marathon', timedelta(minutes=5).total_seconds())
        new_leader = shakedown.marathon_leader_ip()
        assert new_leader != marathon_leader, "A new Marathon leader has not been elected"
        marathon_leader = new_leader
        print('switched leader to: {}'.format(marathon_leader))
    return marathon_leader

def test_zk_killed_recovery():
    time.sleep(60)
    log.info("Starting {}".format(sys._getframe().f_code.co_name))
    master_leader_ip = shakedown.master_leader_ip()
    log.info("master leader ip: " + master_leader_ip)
    kill_task_with_pattern('zookeeper', master_leader_ip)
    _block_on_adminrouter(master_leader_ip)
    time.sleep(60)
    log.info("Running a health check")
    check_health()
    print("Sleeping for 120 sec")
    time.sleep(120)

def test_metronome_shutdown_with_no_extra_tasks():
    """Regression test for METRONOME-100.

    When Metronome was restarted, it incorrectly started another task for an
    already-running job run.
    """
    client = metronome.create_client()
    job_id = "metronome-shutdown-{}".format(uuid.uuid4().hex)
    with job(job_no_schedule(job_id)):
        # run a job before we shut down Metronome
        run_id = client.run_job(job_id)["id"]
        common.wait_for_job_started(job_id, run_id)
        common.assert_job_run(client, job_id)

        # Restart the Metronome process. This won't work in a multi-master setup
        # if the Mesos leader is not the same node as the Metronome leader; we
        # can improve this once there is a good way to get the Metronome leader
        # from the system (e.g. an info endpoint).
        metronome_leader = shakedown.master_leader_ip()
        shakedown.run_command_on_agent(metronome_leader, 'sudo systemctl restart dcos-metronome')
        common.wait_for_metronome()

        # verify that no extra job runs were started when Metronome was restarted
        common.assert_wait_for_no_additional_tasks(tasks_count=1, client=client, job_id=job_id)

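# job_no_schedule is not shown here. A minimal sketch, assuming it builds a
# Metronome job definition with no schedule attached, so the job only runs
# when started explicitly; the command and resource numbers are illustrative:
def job_no_schedule(job_id):
    return {
        'id': job_id,
        'description': 'job without a schedule, started manually by the test',
        'run': {
            'cmd': 'sleep 10000',  # long-running so the run survives the Metronome restart
            'cpus': 0.01,
            'mem': 32,
            'disk': 0
        }
    }
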
def test_zk_killed_recovery():
    master_leader_ip = shakedown.master_leader_ip()
    kill_task_with_pattern('zookeeper', master_leader_ip)
    _block_on_adminrouter(master_leader_ip)
    check_health()

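# _block_on_adminrouter is not defined here. A minimal sketch, assuming it
# polls Admin Router on the master until it serves requests again after the
# ZooKeeper kill; the endpoint, timeout, and polling interval are assumptions:
import requests

def _block_on_adminrouter(master_ip, timeout_seconds=300):
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        try:
            response = requests.get('http://{}/'.format(master_ip), timeout=5)
            if response.status_code < 500:
                return
        except requests.RequestException:
            pass
        time.sleep(5)
    raise AssertionError('Admin Router on {} did not recover within {}s'.format(master_ip, timeout_seconds))
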
def test_zk_killed():
    master_leader_ip = shakedown.master_leader_ip()
    kill_task_with_pattern('zookeeper', master_leader_ip)
    verify_leader_changed(master_leader_ip)
    check_health()

def fn():
    try:
        return shakedown.master_leader_ip()
    except DCOSAuthenticationException:
        log.error("Got exception while fetching leader")
        return old_leader_ip

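# fn above looks like the polling helper inside verify_leader_changed: on an
# auth error (common right after a leader kill) it reports the old leader so
# the poll simply retries. A minimal sketch of how the surrounding function
# might use it; the timeout, polling interval, and wrapper shape are
# assumptions:
def verify_leader_changed(old_leader_ip, timeout_seconds=300):
    def fn():
        try:
            return shakedown.master_leader_ip()
        except DCOSAuthenticationException:
            log.error("Got exception while fetching leader")
            return old_leader_ip

    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if fn() != old_leader_ip:
            return
        time.sleep(5)
    raise AssertionError('Mesos master leader did not change from {}'.format(old_leader_ip))
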
def test_master_killed_block_on_admin_router():
    master_leader_ip = shakedown.master_leader_ip()
    kill_task_with_pattern('mesos-master', master_leader_ip)
    verify_leader_changed(master_leader_ip)
    check_health()