def test_modify_app_config():
    """This tests checks that the modification of the app config does not trigger a recovery."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    old_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")

    # Fix: the previous value '..._CACHE_SIZE_EXPIRY_MS' conflated two distinct
    # hdfs-site.xml properties (dfs.client.read.shortcircuit.streams.cache.size
    # and ...cache.expiry.ms). Use the expiry key, matching the sibling variants
    # of this test elsewhere in the suite.
    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'

    # Snapshot the task ids for each pod type before pushing the config change.
    journal_ids = sdk_tasks.get_task_ids(foldered_name, 'journal')
    name_ids = sdk_tasks.get_task_ids(foldered_name, 'name')
    data_ids = sdk_tasks.get_task_ids(foldered_name, 'data')

    # Bump the expiry value by 1ms so the scheduler sees a config change.
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    expiry_ms = int(marathon_config['env'][app_config_field])
    marathon_config['env'][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, 'journal', journal_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'name', name_ids)
    sdk_tasks.check_tasks_updated(foldered_name, 'data', data_ids)

    # The rolling config update must not have produced any recovery work.
    sdk_plan.wait_for_completed_recovery(foldered_name)
    new_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")
    assert old_recovery_plan == new_recovery_plan
def test_modify_app_config():
    """Verify that a plain app-config change rolls tasks without triggering recovery."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    old_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'

    # Record the pre-update task ids per pod type so we can later confirm
    # that every pod was replaced by the config rollout.
    pod_types = ('journal', 'name', 'data')
    old_task_ids = {pod: sdk_tasks.get_task_ids(foldered_name, pod) for pod in pod_types}

    # Nudge the expiry setting by one millisecond — enough to count as a change.
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info('marathon config: ')
    log.info(marathon_config)
    current_expiry = int(marathon_config['env'][app_config_field])
    marathon_config['env'][app_config_field] = str(current_expiry + 1)
    sdk_marathon.update_app(foldered_name, marathon_config, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    for pod in pod_types:
        sdk_tasks.check_tasks_updated(foldered_name, pod, old_task_ids[pod])

    # The recovery plan must be identical to the pre-update snapshot.
    sdk_plan.wait_for_completed_recovery(foldered_name)
    new_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")
    assert old_recovery_plan == new_recovery_plan
def test_backup_and_restore_flow():
    """Back Cassandra data up to S3, wipe it, restore it, and verify the round trip."""
    backup_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME', 'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }

    def wait_until_complete(plan_name):
        # Poll until the named plan reports COMPLETE.
        spin.time_wait_noisy(
            lambda: plan.get_plan(PACKAGE_NAME, plan_name).json()['status'] == 'COMPLETE')

    # Write data to Cassandra with a metronome job, then verify it was written.
    launch_and_verify_job(WRITE_DATA_JOB)
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Run backup plan, uploading snapshots and schema to S3.
    plan.start_plan(PACKAGE_NAME, 'backup-s3', parameters=backup_parameters)
    wait_until_complete('backup-s3')

    # Delete all keyspaces and tables, then verify the deletion took effect.
    launch_and_verify_job(DELETE_DATA_JOB)
    launch_and_verify_job(VERIFY_DELETION_JOB)

    # Run restore plan, retrieving snapshots and schema from S3.
    plan.start_plan(PACKAGE_NAME, 'restore-s3', parameters=backup_parameters)
    wait_until_complete('restore-s3')

    # The verify job has now succeeded twice: once pre-delete, once post-restore.
    launch_and_verify_job(VERIFY_DATA_JOB, expected_successes=2)
def test_modify_app_config():
    """This tests checks that the modification of the app config does not trigger a recovery."""
    # Bug fix: foldered_name was referenced below without ever being assigned,
    # which raised NameError on the first statement. Derive it the same way the
    # sibling variants of this test do.
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(foldered_name)
    old_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")

    app_config_field = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"

    # Snapshot task ids per pod type so the rollout can be verified below.
    journal_ids = sdk_tasks.get_task_ids(foldered_name, "journal")
    name_ids = sdk_tasks.get_task_ids(foldered_name, "name")
    data_ids = sdk_tasks.get_task_ids(foldered_name, "data")

    # Bump the expiry value by 1ms so the scheduler sees a config change.
    marathon_config = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(marathon_config)
    expiry_ms = int(marathon_config["env"][app_config_field])
    marathon_config["env"][app_config_field] = str(expiry_ms + 1)
    sdk_marathon.update_app(marathon_config, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    sdk_tasks.check_tasks_updated(foldered_name, "journal", journal_ids)
    sdk_tasks.check_tasks_updated(foldered_name, "name", name_ids)
    sdk_tasks.check_tasks_updated(foldered_name, "data", data_ids)

    # The rolling config update must not have produced any recovery work.
    sdk_plan.wait_for_completed_recovery(foldered_name)
    new_recovery_plan = sdk_plan.get_plan(foldered_name, "recovery")
    assert old_recovery_plan == new_recovery_plan
def run_backup_and_restore(backup_plan, restore_plan, plan_parameters):
    """Exercise a full write → backup → wipe → restore → verify cycle.

    Runs the given backup and restore plans with plan_parameters, and leaves
    the cluster empty afterwards so other backup tests start from scratch.
    """
    def await_plan_complete(plan_name):
        # Poll until the named plan reports COMPLETE.
        spin.time_wait_noisy(
            lambda: plan.get_plan(PACKAGE_NAME, plan_name).json()['status'] == 'COMPLETE')

    # Write data to Cassandra with a metronome job, then verify it was written.
    launch_and_verify_job(WRITE_DATA_JOB)
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Run backup plan, uploading snapshots and schema to S3.
    plan.start_plan(PACKAGE_NAME, backup_plan, parameters=plan_parameters)
    await_plan_complete(backup_plan)

    # Delete all keyspaces and tables, then verify the deletion took effect.
    launch_and_verify_job(DELETE_DATA_JOB)
    launch_and_verify_job(VERIFY_DELETION_JOB)

    # Run restore plan, retrieving snapshots and schema from S3.
    plan.start_plan(PACKAGE_NAME, restore_plan, parameters=plan_parameters)
    await_plan_complete(restore_plan)

    # Verify that the data we wrote and then deleted has been restored.
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Delete data in preparation for any other backup tests.
    launch_and_verify_job(DELETE_DATA_JOB)
def test_toxic_sidecar_doesnt_trigger_recovery():
    # Strategy: start a sidecar plan that can never succeed, bounce the
    # scheduler, then confirm the recovery plan is untouched — a failed ONCE
    # task must never trigger recovery.
    recovery_before = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert recovery_before['status'] == "COMPLETE"
    log.info(recovery_before)

    sdk_plan.start_plan(config.SERVICE_NAME, 'sidecar-toxic')
    wait_for_toxic_sidecar()

    # Restart the scheduler and wait for it to come back up.
    sdk_marathon.restart_app(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # The recovery plan status must be unchanged by the failed ONCE task.
    recovery_after = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert recovery_before['status'] == recovery_after['status']
def test_toxic_sidecar_doesnt_trigger_recovery():
    # A failed ONCE task should never trigger recovery. To prove it:
    # run the never-succeeding toxic sidecar plan, restart the scheduler,
    # and check the recovery plan's status is unchanged afterwards.
    plan_at_start = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert plan_at_start['status'] == "COMPLETE"
    log.info(plan_at_start)

    sdk_plan.start_plan(config.SERVICE_NAME, 'sidecar-toxic')
    wait_for_toxic_sidecar()

    # Bounce the scheduler, then wait for deployment to settle again.
    sdk_marathon.restart_app(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # Recovery status must match the snapshot taken before the toxic run.
    plan_at_end = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert plan_at_start['status'] == plan_at_end['status']
def test_sidecar():
    """Start the sidecar plan and verify its structure and completion."""
    sdk_plan.start_plan(config.SERVICE_NAME, "sidecar")

    deploy = sdk_plan.get_plan(config.SERVICE_NAME, "sidecar")
    log.info(sdk_plan.plan_string("sidecar", deploy))

    # The plan is expected to consist of one deploy phase containing two steps.
    phases = deploy["phases"]
    assert len(phases) == 1
    assert phases[0]["name"] == "sidecar-deploy"
    assert len(phases[0]["steps"]) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, "sidecar")
def test_sidecar():
    """Kick off the sidecar plan, check its shape, and wait for it to finish."""
    sdk_plan.start_plan(config.SERVICE_NAME, 'sidecar')

    started_plan = sdk_plan.get_plan(config.SERVICE_NAME, 'sidecar')
    log.info("sidecar plan: " + str(started_plan))

    # Expect exactly one 'sidecar-deploy' phase made of two steps.
    plan_phases = started_plan['phases']
    assert len(plan_phases) == 1
    first_phase = plan_phases[0]
    assert first_phase['name'] == 'sidecar-deploy'
    assert len(first_phase['steps']) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, 'sidecar')
def run_plan(plan_name, params=None):
    """Start the named plan, sanity-check its structure, and wait for completion."""
    sdk_plan.start_plan(config.SERVICE_NAME, plan_name, params)

    started_plan = sdk_plan.get_plan(config.SERVICE_NAME, plan_name)
    log.info("sidecar plan: " + str(started_plan))

    # One '<plan_name>-deploy' phase with two steps is expected.
    phases = started_plan['phases']
    assert len(phases) == 1
    assert phases[0]['name'] == plan_name + '-deploy'
    assert len(phases[0]['steps']) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, plan_name)
def run_plan(plan_name, params=None):
    """Launch plan_name with optional params, validate its layout, then block until done."""
    sdk_plan.start_plan(config.SERVICE_NAME, plan_name, params)

    current = sdk_plan.get_plan(config.SERVICE_NAME, plan_name)
    log.info(sdk_plan.plan_string(plan_name, current))

    # Structural expectations: a single deploy phase holding two steps.
    deploy_phases = current["phases"]
    assert len(deploy_phases) == 1
    only_phase = deploy_phases[0]
    assert only_phase["name"] == plan_name + "-deploy"
    assert len(only_phase["steps"]) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, plan_name)
def test_toxic_sidecar_doesnt_trigger_recovery():
    # A failed FINISHED task should never trigger recovery. Run the toxic
    # sidecar plan (which can never succeed), restart the scheduler, and
    # confirm that the recovery plan stays empty throughout.
    recovery_plan = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert len(recovery_plan['phases']) == 0
    log.info(recovery_plan)

    sdk_plan.start_plan(config.SERVICE_NAME, 'sidecar-toxic')
    shakedown.wait_for(ToxicSidecarCheck(), timeout_seconds=10 * 60)

    # Bounce the scheduler and wait for it to settle.
    sdk_marathon.restart_app(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # Recovery must still have no phases after the restart.
    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, 'recovery')
    recovery_plan = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert len(recovery_plan['phases']) == 0
def test_sidecar():
    """Run the sidecar plan and verify it has the expected single-phase shape."""
    plan.start_plan(PACKAGE_NAME, 'sidecar')

    started_plan = plan.get_plan(PACKAGE_NAME, 'sidecar')
    sdk_utils.out("sidecar plan: " + str(started_plan))

    # One 'sidecar-deploy' phase with exactly two steps is expected.
    phases = started_plan['phases']
    assert len(phases) == 1
    assert phases[0]['name'] == 'sidecar-deploy'
    assert len(phases[0]['steps']) == 2

    plan.wait_for_completed_plan(PACKAGE_NAME, 'sidecar')
def test_repair_plan_completes():
    """Start the repair plan for testspace1 and wait until it reports COMPLETE."""
    repair_parameters = {'CASSANDRA_KEYSPACE': 'testspace1'}
    plan.start_plan(PACKAGE_NAME, 'repair', parameters=repair_parameters)
    # Poll the plan endpoint until the repair finishes.
    spin.time_wait_noisy(
        lambda: plan.get_plan(PACKAGE_NAME, 'repair').json()['status'] == 'COMPLETE')
def test_toxic_sidecar_doesnt_trigger_recovery():
    # Scenario: a ONCE task that fails must never produce recovery work.
    # Run the never-succeeding toxic sidecar plan, restart the scheduler,
    # and verify the recovery plan has no phases before or after.
    initial_recovery = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert len(initial_recovery['phases']) == 0
    log.info(initial_recovery)

    sdk_plan.start_plan(config.SERVICE_NAME, 'sidecar-toxic')
    wait_for_toxic_sidecar()

    # Restart the scheduler and wait for it to come back up.
    sdk_marathon.restart_app(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # Recovery plan must remain empty after the restart.
    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, 'recovery')
    final_recovery = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert len(final_recovery['phases']) == 0
def run_plan(plan_name, params=None):
    """Start plan_name, assert its single-phase/two-step layout, and wait for it."""
    plan.start_plan(PACKAGE_NAME, plan_name, params)

    fetched = plan.get_plan(PACKAGE_NAME, plan_name)
    sdk_utils.out("sidecar plan: " + str(fetched))

    # The plan should contain one '<plan_name>-deploy' phase with two steps.
    phase_list = fetched['phases']
    assert len(phase_list) == 1
    assert phase_list[0]['name'] == plan_name + '-deploy'
    assert len(phase_list[0]['steps']) == 2

    plan.wait_for_completed_plan(PACKAGE_NAME, plan_name)
def run_plan(plan_name, params=None):
    """Trigger the named plan, check its expected shape, then block until complete."""
    sdk_plan.start_plan(config.SERVICE_NAME, plan_name, params)

    active_plan = sdk_plan.get_plan(config.SERVICE_NAME, plan_name)
    log.info("sidecar plan: " + str(active_plan))

    # Expect a lone deploy phase named '<plan_name>-deploy' with two steps.
    sole_phase = active_plan['phases'][0]
    assert len(active_plan['phases']) == 1
    assert sole_phase['name'] == plan_name + '-deploy'
    assert len(sole_phase['steps']) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, plan_name)
def test_sidecar():
    """Start the sidecar plan, validate its structure, and wait for completion."""
    sdk_plan.start_plan(config.SERVICE_NAME, 'sidecar')

    sidecar_plan = sdk_plan.get_plan(config.SERVICE_NAME, 'sidecar')
    log.info("sidecar plan: " + str(sidecar_plan))

    # Structural checks: one 'sidecar-deploy' phase containing two steps.
    all_phases = sidecar_plan['phases']
    assert len(all_phases) == 1
    assert all_phases[0]['name'] == 'sidecar-deploy'
    assert len(all_phases[0]['steps']) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, 'sidecar')
def test_cassandra_migration():
    """Migrate data from one Cassandra cluster to another via an S3 backup/restore.

    Writes test data to the backup cluster (the older dcos-cassandra-service),
    backs it up to S3, then restores that snapshot into the restore cluster
    (the SDK-based service) and verifies the data arrived.
    """
    # Cluster names come from the CI environment; both must be pre-installed.
    backup_service_name = os.getenv('CASSANDRA_BACKUP_CLUSTER_NAME')
    restore_service_name = os.getenv('CASSANDRA_RESTORE_CLUSTER_NAME')

    # Point the test jobs at the backup cluster's node endpoint.
    env = EnvironmentContext(
        CASSANDRA_NODE_ADDRESS=os.getenv('BACKUP_NODE_ADDRESS', 'node-0.cassandra.mesos'),
        CASSANDRA_NODE_PORT=os.getenv('BACKUP_NODE_PORT', '9042'))

    # Shared S3/backup settings; SNAPSHOT_NAME is randomized per run so
    # concurrent/repeated runs don't collide in the bucket.
    plan_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME', 'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }

    # Write + verify on entry; delete + verify-deletion on exit.
    data_context = DataContext(
        init_jobs=[WRITE_DATA_JOB, VERIFY_DATA_JOB],
        cleanup_jobs=[DELETE_DATA_JOB, VERIFY_DELETION_JOB])

    # Install and run the write/delete data jobs against backup cluster,
    # running dcos-cassandra-service
    with env, JobContext(TEST_JOBS), data_context:
        # Back this cluster up to S3. Note: the backup cluster uses the older
        # dcos-cassandra-service HTTP API rather than a plan parameter set.
        backup_parameters = {
            'backup_name': plan_parameters['SNAPSHOT_NAME'],
            's3_access_key': plan_parameters['AWS_ACCESS_KEY_ID'],
            's3_secret_key': plan_parameters['AWS_SECRET_ACCESS_KEY'],
            'external_location': 's3://{}'.format(plan_parameters['S3_BUCKET_NAME']),
        }
        dcos.http.put('{}v1/backup/start'.format(
            shakedown.dcos_service_url(backup_service_name)),
            json=backup_parameters)
        # Poll the old service's plan endpoint until the backup completes.
        spin.time_wait_noisy(lambda: get_dcos_cassandra_plan(
            backup_service_name).json()['status'] == 'COMPLETE')

    # Re-point the test jobs at the restore (SDK) cluster's node endpoint.
    env = EnvironmentContext(
        CASSANDRA_NODE_ADDRESS=os.getenv('RESTORE_NODE_ADDRESS', 'node-0-server.sdk-cassandra.mesos'),
        CASSANDRA_NODE_PORT=os.getenv('RESTORE_NODE_PORT', '9052'))

    # No init jobs here: data arrives via the restore. VERIFY_DATA_JOB runs as
    # cleanup, confirming the migrated data before the delete jobs wipe it.
    data_context = DataContext(
        cleanup_jobs=[VERIFY_DATA_JOB, DELETE_DATA_JOB, VERIFY_DELETION_JOB])
    with env, JobContext(TEST_JOBS), data_context:
        # Restore the S3 snapshot into the SDK cluster and wait for completion.
        plan.start_plan(restore_service_name, 'restore-s3', parameters=plan_parameters)
        spin.time_wait_noisy(
            lambda: (plan.get_plan(restore_service_name, 'restore-s3').json()[
                'status'] == 'COMPLETE'))
def _dump_plans(item: pytest.Item, service_name: str):
    '''If the test had failed, writes the plan state(s) to log file(s).'''
    # Best-effort only: keep timeouts short so artifact collection can't hang.
    for plan_name in sdk_plan.list_plans(service_name, 5):
        plan = sdk_plan.get_plan(service_name, plan_name, 5)
        # Service names may contain folders; flatten slashes for the filename.
        safe_service = service_name.replace('/', '_')
        out_path = _setup_artifact_path(item, 'plan_{}_{}.json'.format(safe_service, plan_name))
        out_content = json.dumps(plan, indent=2)
        log.info('=> Writing {} ({} bytes)'.format(out_path, len(out_content)))
        with open(out_path, 'w') as f:
            f.write(out_content)
            f.write('\n')  # trailing newline keeps the artifact POSIX-friendly
def check_healthy(count=DEFAULT_TASK_COUNT):
    """Assert the service is healthy: plans complete and `count` tasks running."""
    # get_plan only returns 200 once the plan is complete, so a successful
    # fetch doubles as a completion check for each plan.
    for plan_name in ("deploy", "recovery"):
        plan.get_plan(PACKAGE_NAME, plan_name)
    tasks.check_running(PACKAGE_NAME, count)