# Imports assumed by the tests below; module paths follow the Marathon
# system-test suite layout and are an assumption, not taken from this excerpt.
import uuid
from datetime import timedelta

import retrying
import shakedown

from dcos import marathon

import apps
import common


def test_marathon_delete_leader(marathon_service_name):
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    marathon_leadership_changed(original_leader)
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # start an app
    app_def = common.app(id=uuid.uuid4().hex)
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader

    # wait until leader changed
    marathon_leadership_changed()

    @retrying.retry(stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    client.remove_app(app_id)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 0

    # abdicate leader after app was removed successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still gone and no instance is running after new leader was elected
    check_app_existence(0)
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # start an app
    app_def = common.app(id=uuid.uuid4().hex)
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(stop_max_attempt_number=30)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader

    # wait until leader changed
    marathon_leadership_changed()

    @retrying.retry(stop_max_attempt_number=30)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    client.remove_app(app_id)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 0

    # abdicate leader after app was removed successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still gone and no instance is running after new leader was elected
    check_app_existence(0)
def test_marathon_delete_leader(marathon_service_name):
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader

    marathon_leadership_changed()
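# A minimal sketch of the retry predicate passed as retry_on_exception above.
# The retrying library calls the predicate with the raised exception and keeps
# retrying while it returns True. The real common.ignore_exception lives in the
# suite's common module; this illustrates the expected shape, not its source.
def ignore_exception(exc):
    """Retry on any exception raised by the decorated function."""
    return True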
def test_marathon_delete_leader(marathon_service_name):
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    common.delete_marathon_path('v2/leader')
    common.wait_for_marathon_up()

    @retrying.retry(stop_max_attempt_number=30)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader

    marathon_leadership_changed()
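# common.wait_for_marathon_up is not shown in this excerpt. The sibling
# variants of this test wait via shakedown.wait_for_service_endpoint with a
# five-minute timeout, so a plausible (hypothetical) implementation is:
def wait_for_marathon_up(service_name='marathon'):
    shakedown.wait_for_service_endpoint(service_name, timedelta(minutes=5).total_seconds())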
def test_marathon_backup_and_restore_leader(marathon_service_name):
    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection
    client = marathon.create_client()
    app_def = {
        "id": "/sleep",
        "instances": 1,
        "cpus": 0.01,
        "mem": 32,
        "cmd": "sleep 100000"
    }
    app_id = app_def['id']
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1
    task_id = app['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    # Wait for the new leader (but same master server) to be up and ready
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1
    assert task_id == app['tasks'][0]['id'], "Task has a different ID after restore"

    # Check if the backup file exists and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = shakedown.run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}'.format(backup_url)
    assert int(data.rstrip()) > 0, "Backup file is empty"
def test_marathon_backup_and_restore_leader(marathon_service_name):
    """Backup and restore testing is done with only one master, since the new
    master has to be able to read the backup file created by the previous
    master, and the easiest way to test that is with a single master.
    """
    backup_file = 'backup.tar'
    backup_dir = '/tmp'
    backup_url = 'file://{}/{}'.format(backup_dir, backup_file)

    # Deploy a simple test app. It is expected to be there after leader reelection
    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])
    task_id = app['tasks'][0]['id']

    # Abdicate the leader with backup and restore
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}&restore={}'.format(backup_url, backup_url)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    # Wait for the new leader (but same master server) to be up and ready
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])
    assert task_id == app['tasks'][0]['id'], "Task has a different ID after restore"

    # Check if the backup file exists and is valid
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = shakedown.run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}'.format(backup_url)
    assert int(data.rstrip()) > 0, "Backup file is empty"
def test_marathon_delete_leader_and_check_apps(marathon_service_name):
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # abdicate leader after app was started successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        if original_leader == current_leader:
            # the leader has not changed yet; trigger another abdication before the next retry
            common.delete_marathon_path('v2/leader')
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    # wait until leader changed
    marathon_leadership_changed()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances, \
            "The number of running tasks is {}, but {} was expected".format(app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def remove_app(app_id):
        client.remove_app(app_id)

    remove_app(app_id)
    shakedown.deployment_wait()

    try:
        _ = client.get_app(app_id)
    except Exception:
        pass
    else:
        assert False, "The application resurrected"

    # abdicate leader after app was removed successfully
    common.delete_marathon_path('v2/leader')
    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    marathon_leadership_changed()

    # check if app definition is still gone after new leader was elected
    try:
        _ = client.get_app(app_id)
    except Exception:
        pass
    else:
        assert False, "The application resurrected"
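# apps.sleep_app() comes from the suite's fixtures. Judging by the inline app
# definition in the older backup-and-restore variant above, it plausibly
# returns a definition like the sketch below; the unique-id scheme is an
# assumption modeled on the common.app(id=uuid.uuid4().hex) calls elsewhere.
def sleep_app():
    return {
        "id": "/sleep-{}".format(uuid.uuid4().hex),
        "instances": 1,
        "cpus": 0.01,
        "mem": 32,
        "cmd": "sleep 100000"
    }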
def test_marathon_backup_and_check_apps(marathon_service_name):
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'

    for master_ip in shakedown.get_all_master_ips():
        _ = shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file1))
        _ = shakedown.run_command(master_ip, "rm {}/{}".format(backup_dir, backup_file2))

    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    app_def = apps.sleep_app()
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1, "The number of running tasks is {}, but 1 was expected".format(app["tasksRunning"])

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url1)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader, "A new Marathon leader has not been elected"

    # wait until leader changed
    marathon_leadership_changed()

    @retrying.retry(wait_fixed=1000, stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        try:
            app = client.get_app(app_id)
        except Exception:
            if expected_instances != 0:
                raise
        else:
            if expected_instances == 0:
                assert False, "The application resurrected"
            else:
                assert app['tasksRunning'] == expected_instances, \
                    "The number of running tasks is {}, but {} was expected".format(app["tasksRunning"], expected_instances)

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    shakedown.deployment_wait()

    check_app_existence(0)

    # Do a second backup. Before MARATHON-7525 there was a problem where taking a backup after an app
    # was deleted left Marathon unable to restart, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url2)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    # if the leader changed, Marathon was able to start again, which is great :-).
    marathon_leadership_changed()

    # check if app definition is still gone and no instance is running after new leader was elected
    check_app_existence(0)
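# The backup-and-restore test above validates its archive with `tar -tf`; the
# same spot check could be applied to the two archives written by
# test_marathon_backup_and_check_apps. A sketch reusing
# shakedown.run_command_on_master:
def _validate_backup(backup_dir, backup_file):
    cmd = 'tar -tf {}/{} | wc -l'.format(backup_dir, backup_file)
    status, data = shakedown.run_command_on_master(cmd)
    assert status, 'Failed to validate backup file {}/{}'.format(backup_dir, backup_file)
    assert int(data.rstrip()) > 0, "Backup file is empty"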
def marathon_leadership_changed(original_leader):
    current_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(current_leader))
    if original_leader == current_leader:
        # the leader has not changed yet; trigger another abdication before the next retry
        common.delete_marathon_path('v2/leader')
    assert original_leader != current_leader, "A new Marathon leader has not been elected"
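# Example usage of the module-level helper above, mirroring
# test_marathon_delete_leader at the top of this file:
#
#     original_leader = shakedown.marathon_leader_ip()
#     common.delete_marathon_path('v2/leader')
#     shakedown.wait_for_service_endpoint(marathon_service_name,
#                                         timedelta(minutes=5).total_seconds())
#     marathon_leadership_changed(original_leader)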
def test_marathon_backup_and_check_apps(marathon_service_name):
    backup_file1 = 'backup1.tar'
    backup_file2 = 'backup2.tar'
    backup_dir = '/tmp'
    backup_url1 = 'file://{}/{}'.format(backup_dir, backup_file1)
    backup_url2 = 'file://{}/{}'.format(backup_dir, backup_file2)

    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))

    # start an app
    app_def = common.app(id=uuid.uuid4().hex)
    app_id = app_def['id']

    client = marathon.create_client()
    client.add_app(app_def)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 1

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url1)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    @retrying.retry(stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def marathon_leadership_changed():
        current_leader = shakedown.marathon_leader_ip()
        print('leader: {}'.format(current_leader))
        assert original_leader != current_leader

    # wait until leader changed
    marathon_leadership_changed()

    @retrying.retry(stop_max_attempt_number=30, retry_on_exception=common.ignore_exception)
    def check_app_existence(expected_instances):
        app = client.get_app(app_id)
        assert app['tasksRunning'] == expected_instances

    # check if app definition is still there and one instance is still running after new leader was elected
    check_app_existence(1)

    # then remove
    client.remove_app(app_id)
    shakedown.deployment_wait()

    app = client.get_app(app_id)
    assert app['tasksRunning'] == 0

    # Do a second backup. Before MARATHON-7525 there was a problem where taking a backup after an app
    # was deleted left Marathon unable to restart, because the second backup failed constantly.

    # Abdicate the leader with backup
    original_leader = shakedown.marathon_leader_ip()
    print('leader: {}'.format(original_leader))
    url = 'v2/leader?backup={}'.format(backup_url2)
    print('DELETE {}'.format(url))
    common.delete_marathon_path(url)

    shakedown.wait_for_service_endpoint(marathon_service_name, timedelta(minutes=5).total_seconds())

    # wait until leader changed
    # if the leader changed, Marathon was able to start again, which is great :-).
    marathon_leadership_changed()

    # check if app definition is still gone and no instance is running after new leader was elected
    check_app_existence(0)