def events_to_file():
    leader_ip = shakedown.marathon_leader_ip()
    print("entering events_to_file fixture")
    shakedown.run_command(leader_ip, 'rm events.txt')

    # In strict mode Marathon runs in SSL mode on port 8443 and requires authentication
    if shakedown.ee_version() == 'strict':
        shakedown.run_command(
            leader_ip,
            '(curl --compressed -H "Cache-Control: no-cache" -H "Accept: text/event-stream" ' +
            '-H "Authorization: token={}" '.format(shakedown.dcos_acs_token()) +
            '-o events.txt -k https://marathon.mesos:8443/v2/events; echo $? > events.exitcode) &'
        )
    # Otherwise Marathon runs in HTTP mode on port 8080
    else:
        shakedown.run_command(
            leader_ip,
            '(curl --compressed -H "Cache-Control: no-cache" -H "Accept: text/event-stream" '
            '-o events.txt http://marathon.mesos:8080/v2/events; echo $? > events.exitcode) &'
        )

    yield

    shakedown.kill_process_on_host(leader_ip, '[c]url')
    shakedown.run_command(leader_ip, 'rm events.txt')
    shakedown.run_command(leader_ip, 'rm events.exitcode')
    print("exiting events_to_file fixture")
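# A minimal usage sketch for the generator above, assuming it is registered as
# a pytest fixture; the decorator placement, fixture name, and test body below
# are illustrative assumptions, not part of the original suite.
import pytest


@pytest.fixture
def sse_events():
    yield from events_to_file()


def test_deployment_events_are_streamed(sse_events):
    # While curl is attached to /v2/events on the Marathon leader, perform a
    # deployment here; its events are appended to events.txt on the leader and
    # can then be verified with a helper such as check_deployment_message().
    pass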
def check_deployment_message():
    # leader_ip is expected to be defined in the enclosing scope (e.g. inside a
    # test that resolved the Marathon leader beforehand).
    status, stdout = shakedown.run_command(leader_ip, 'cat events.exitcode')
    assert str(stdout).strip() == '', \
        "SSE stream disconnected (curl exit code is {})".format(stdout.strip())

    status, stdout = shakedown.run_command(leader_ip, 'cat events.txt')
    assert 'event_stream_attached' in stdout, "event_stream_attached event has not been found"
    assert 'deployment_info' in stdout, "deployment_info event has not been found"
    assert 'deployment_step_success' in stdout, "deployment_step_success event has not been found"
def run_command_on_metronome_leader(command, username=None, key_path=None, noisy=True):
    """ Run a command on the Metronome leader """
    return shakedown.run_command(metronome_leader_ip(), command, username, key_path, noisy)
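# Example usage; the command is hypothetical, and run_command returns a
# (status, output) tuple as it does elsewhere in this suite:
status, output = run_command_on_metronome_leader('uptime')
if status:
    print(output)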
def dcos_masters_public_ips():
    """
    Retrieves the public IPs of all masters.

    :return: public IPs of all masters
    """
    @retrying.retry(
        wait_fixed=1000,
        stop_max_attempt_number=240,  # waiting 20 minutes for exhibitor start-up
        retry_on_exception=ignore_provided_exception(DCOSException))
    def all_master_ips():
        return get_all_master_ips()

    # run_command returns a (status, output) tuple; [1] selects the output.
    master_public_ips = [
        shakedown.run_command(private_ip, '/opt/mesosphere/bin/detect_ip_public')[1]
        for private_ip in all_master_ips()]

    return master_public_ips
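# Illustrative call, assuming an attached cluster; the stripping and print
# formatting are assumptions:
for public_ip in dcos_masters_public_ips():
    print('master public ip: {}'.format(public_ip.strip()))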
def gc_frameworks():
    for host in shakedown.get_private_agents():
        shakedown.run_command(host, "sudo rm -rf /var/lib/mesos/slave/slaves/*/frameworks/*")
def test_marathon_backup_and_check_apps(marathon_service_name):

    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'

    for master_ip in shakedown.get_all_master_ips():
        _ = shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        _ = shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, \
        "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url1)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    # Wait until the leader has changed
    marathon_leadership_changed()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception as e:
            if expected_instances != 0:
                raise e
        else:
            if expected_instances == 0:
                assert False, "The application resurrected"
            else:
                assert app['tasksRunning'] == expected_instances, \
                    "The number of running tasks is {}, but {} was expected".format(
                        app["tasksRunning"], expected_instances)

    # Check that the app definition is still there and one instance is still running
    # after the new leader was elected
    check_app_existence(1)

    # Then remove the app
    client.remove_app(app_id)
    shakedown.deployment_wait()

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525, taking a backup after an app was
    # deleted left Marathon unable to restart, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url2)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # Wait until the leader has changed; if it did, Marathon was able to start again.
    marathon_leadership_changed()

    # Check that the app definition is gone and no instance is running
    # after the new leader was elected
    check_app_existence(0)
def check_kill_message():
    # master_ip is expected to be defined in the enclosing scope.
    status, stdout = shakedown.run_command(master_ip, 'cat events.txt')
    assert 'KILLED' in stdout, "KILLED event has not been found"
def check_deployment_message():
    # master_ip is expected to be defined in the enclosing scope.
    status, stdout = shakedown.run_command(master_ip, 'cat events.txt')
    assert 'event_stream_attached' in stdout, "event_stream_attached event has not been found"
    assert 'deployment_info' in stdout, "deployment_info event has not been found"
    assert 'deployment_step_success' in stdout, "deployment_step_success event has not been found"
def check_update_message():
    status, stdout = shakedown.run_command(leader_ip, 'cat events.txt')
    assert 'pod_updated_event' in stdout, 'pod_updated_event has not been produced'
def gc_frameworks():
    '''Reclaims private agent disk space consumed by Mesos but not yet garbage collected'''
    for host in shakedown.get_private_agents():
        shakedown.run_command(host, "sudo rm -rf /var/lib/mesos/slave/slaves/*/frameworks/*")