Exemple #1
0
def test_modify_app_config():
    """Check that modifying the app config does not trigger a recovery.

    One value in the Marathon app's ``env`` is incremented and the app is
    updated. The test then checks that the journal, name and data tasks
    were updated and that the recovery plan fetched afterwards equals the
    one fetched before the change.
    """
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(service)
    recovery_before = sdk_plan.get_plan(service, "recovery")

    env_key = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_EXPIRY_MS'
    pod_types = ('journal', 'name', 'data')
    # Remember the current task ids so the update can be checked afterwards.
    ids_before = {pod: sdk_tasks.get_task_ids(service, pod) for pod in pod_types}

    app = sdk_marathon.get_config(service)
    log.info('marathon config: ')
    log.info(app)
    # Any change of value is enough; bump the current setting by one.
    app['env'][env_key] = str(int(app['env'][env_key]) + 1)
    sdk_marathon.update_app(service, app, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=service)
    for pod in pod_types:
        sdk_tasks.check_tasks_updated(service, pod, ids_before[pod])

    sdk_plan.wait_for_completed_recovery(service)
    recovery_after = sdk_plan.get_plan(service, "recovery")
    assert recovery_before == recovery_after
def test_modify_app_config():
    """Check that modifying the app config does not trigger a recovery.

    The recovery plan is captured before and after one Marathon ``env``
    value is incremented; the two captures must compare equal. In between,
    the journal, name and data tasks are checked for having been updated.
    """
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(service)
    plan_before_change = sdk_plan.get_plan(service, "recovery")

    setting = 'TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS'
    # Task ids per pod type as they are before the config change.
    original_ids = [
        (pod_type, sdk_tasks.get_task_ids(service, pod_type))
        for pod_type in ('journal', 'name', 'data')
    ]

    app_definition = sdk_marathon.get_config(service)
    log.info('marathon config: ')
    log.info(app_definition)
    current_value = int(app_definition['env'][setting])
    app_definition['env'][setting] = str(current_value + 1)
    sdk_marathon.update_app(service, app_definition, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=service)
    for pod_type, old_ids in original_ids:
        sdk_tasks.check_tasks_updated(service, pod_type, old_ids)

    sdk_plan.wait_for_completed_recovery(service)
    plan_after_change = sdk_plan.get_plan(service, "recovery")
    assert plan_before_change == plan_after_change
def test_backup_and_restore_flow():
    """Write data, back it up to S3, delete it, restore it and verify it.

    The S3 settings are read from the environment (with defaults for the
    bucket name and region) and handed to both the ``backup-s3`` and the
    ``restore-s3`` plan. Each snapshot gets a fresh UUID-based name.
    """
    s3_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME',
                                    'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }

    def run_until_complete(plan_name):
        """Start ``plan_name`` and wait for its status to be COMPLETE."""
        plan.start_plan(PACKAGE_NAME, plan_name, parameters=s3_parameters)
        spin.time_wait_noisy(
            lambda: plan.get_plan(PACKAGE_NAME, plan_name).json()['status'] == 'COMPLETE')

    # Write data to Cassandra with a metronome job, then verify it is there
    launch_and_verify_job(WRITE_DATA_JOB)
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Run backup plan, uploading snapshots and schema to S3
    run_until_complete('backup-s3')

    # Delete all keyspaces and tables with a metronome job, then verify
    # that they are gone
    launch_and_verify_job(DELETE_DATA_JOB)
    launch_and_verify_job(VERIFY_DELETION_JOB)

    # Run restore plan, retrieving snapshots and schema from S3
    run_until_complete('restore-s3')

    # Verify that the data we wrote and then deleted has been restored
    launch_and_verify_job(VERIFY_DATA_JOB, expected_successes=2)
def test_modify_app_config():
    """Check that modifying the app config does not trigger a recovery.

    ``foldered_name`` is not defined in this function; it is read from the
    enclosing scope. One value in the Marathon app's ``env`` is incremented,
    the journal, name and data tasks are checked for having been updated,
    and the recovery plan must equal the one captured before the change.
    """
    sdk_plan.wait_for_completed_recovery(foldered_name)
    recovery_before = sdk_plan.get_plan(foldered_name, "recovery")

    env_key = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"
    pod_types = ("journal", "name", "data")
    # Remember the current task ids so the update can be checked afterwards.
    ids_before = {pod: sdk_tasks.get_task_ids(foldered_name, pod) for pod in pod_types}

    app = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(app)
    # Any change of value is enough; bump the current setting by one.
    app["env"][env_key] = str(int(app["env"][env_key]) + 1)
    sdk_marathon.update_app(app, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    for pod in pod_types:
        sdk_tasks.check_tasks_updated(foldered_name, pod, ids_before[pod])

    sdk_plan.wait_for_completed_recovery(foldered_name)
    recovery_after = sdk_plan.get_plan(foldered_name, "recovery")
    assert recovery_before == recovery_after
Exemple #5
0
def test_modify_app_config():
    """Check that modifying the app config does not trigger a recovery.

    The recovery plan is captured before and after one Marathon ``env``
    value is incremented; the two captures must compare equal. The service
    is addressed through ``foldered_name``, which comes from the enclosing
    scope rather than from this function.
    """
    sdk_plan.wait_for_completed_recovery(foldered_name)
    plan_before_change = sdk_plan.get_plan(foldered_name, "recovery")

    setting = "TASKCFG_ALL_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS"
    # Task ids per pod type as they are before the config change.
    original_ids = [
        (pod_type, sdk_tasks.get_task_ids(foldered_name, pod_type))
        for pod_type in ("journal", "name", "data")
    ]

    app_definition = sdk_marathon.get_config(foldered_name)
    log.info("marathon config: ")
    log.info(app_definition)
    current_value = int(app_definition["env"][setting])
    app_definition["env"][setting] = str(current_value + 1)
    sdk_marathon.update_app(app_definition, timeout=15 * 60)

    # All tasks should be updated because hdfs-site.xml has changed
    config.check_healthy(service_name=foldered_name)
    for pod_type, old_ids in original_ids:
        sdk_tasks.check_tasks_updated(foldered_name, pod_type, old_ids)

    sdk_plan.wait_for_completed_recovery(foldered_name)
    plan_after_change = sdk_plan.get_plan(foldered_name, "recovery")
    assert plan_before_change == plan_after_change
Exemple #6
0
def run_backup_and_restore(backup_plan, restore_plan, plan_parameters):
    """Run a write / backup / delete / restore / verify cycle.

    Args:
        backup_plan: Name of the plan started to back the data up.
        restore_plan: Name of the plan started to restore the data.
        plan_parameters: Parameters passed unchanged to both plans.
    """

    def run_until_complete(plan_name):
        """Start ``plan_name`` and wait for its status to be COMPLETE."""
        plan.start_plan(PACKAGE_NAME, plan_name, parameters=plan_parameters)
        spin.time_wait_noisy(
            lambda: plan.get_plan(PACKAGE_NAME, plan_name).json()['status'] == 'COMPLETE')

    # Write data to Cassandra with a metronome job, then verify it is there
    launch_and_verify_job(WRITE_DATA_JOB)
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Run backup plan, uploading snapshots and schema to S3
    run_until_complete(backup_plan)

    # Delete all keyspaces and tables with a metronome job, then verify
    # that they are gone
    launch_and_verify_job(DELETE_DATA_JOB)
    launch_and_verify_job(VERIFY_DELETION_JOB)

    # Run restore plan, retrieving snapshots and schema from S3
    run_until_complete(restore_plan)

    # Verify that the data we wrote and then deleted has been restored
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Delete data in preparation for any other backup tests
    launch_and_verify_job(DELETE_DATA_JOB)
Exemple #7
0
def test_toxic_sidecar_doesnt_trigger_recovery():
    """Check that the toxic sidecar plan leaves the recovery status alone.

    1. Run the toxic sidecar plan that will never succeed.
    2. Restart the scheduler.
    3. Verify that the recovery plan status has not changed, as a failed
       ONCE task should never trigger recovery.
    """
    service = config.SERVICE_NAME

    recovery_before = sdk_plan.get_plan(service, 'recovery')
    assert recovery_before['status'] == "COMPLETE"
    log.info(recovery_before)
    sdk_plan.start_plan(service, 'sidecar-toxic')
    wait_for_toxic_sidecar()

    # Restart the scheduler and wait for it to come up.
    sdk_marathon.restart_app(service)
    sdk_plan.wait_for_completed_deployment(service)

    # Now, verify that its recovery plan hasn't changed.
    recovery_after = sdk_plan.get_plan(service, 'recovery')
    assert recovery_before['status'] == recovery_after['status']
def test_toxic_sidecar_doesnt_trigger_recovery():
    """Run the never-succeeding sidecar plan and compare recovery status.

    The recovery plan must be COMPLETE up front and must report the same
    status after the toxic sidecar plan has been started and the scheduler
    restarted: a failed ONCE task should never trigger recovery.
    """
    plan_at_start = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    status_at_start = plan_at_start['status']
    assert status_at_start == "COMPLETE"
    log.info(plan_at_start)

    # Kick off the plan that will never succeed.
    sdk_plan.start_plan(config.SERVICE_NAME, 'sidecar-toxic')
    wait_for_toxic_sidecar()

    # Restart the scheduler and wait for it to come up.
    sdk_marathon.restart_app(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # Now, verify that its recovery plan hasn't changed.
    plan_at_end = sdk_plan.get_plan(config.SERVICE_NAME, 'recovery')
    assert status_at_start == plan_at_end['status']
def test_sidecar():
    """Start the sidecar plan, check its shape and wait for completion."""
    service = config.SERVICE_NAME
    sdk_plan.start_plan(service, "sidecar")

    sidecar_plan = sdk_plan.get_plan(service, "sidecar")
    log.info(sdk_plan.plan_string("sidecar", sidecar_plan))

    # Expect exactly one phase, "sidecar-deploy", made up of two steps.
    phases = sidecar_plan["phases"]
    assert len(phases) == 1
    deploy_phase = phases[0]
    assert deploy_phase["name"] == "sidecar-deploy"
    assert len(deploy_phase["steps"]) == 2

    sdk_plan.wait_for_completed_plan(service, "sidecar")
def test_sidecar():
    """Start the sidecar plan, check its shape and wait for completion."""
    service = config.SERVICE_NAME
    sdk_plan.start_plan(service, 'sidecar')

    sidecar_plan = sdk_plan.get_plan(service, 'sidecar')
    log.info("sidecar plan: " + str(sidecar_plan))

    # Expect exactly one phase, 'sidecar-deploy', made up of two steps.
    phases = sidecar_plan['phases']
    assert len(phases) == 1
    deploy_phase = phases[0]
    assert deploy_phase['name'] == 'sidecar-deploy'
    assert len(deploy_phase['steps']) == 2

    sdk_plan.wait_for_completed_plan(service, 'sidecar')
def run_plan(plan_name, params=None):
    """Start a plan, check its initial shape and wait for it to complete.

    Args:
        plan_name: Name of the plan to run. The started plan is expected to
            have a single phase named ``<plan_name>-deploy`` with two steps.
        params: Optional plan parameters, passed on to
            ``sdk_plan.start_plan``.
    """
    sdk_plan.start_plan(config.SERVICE_NAME, plan_name, params)

    started_plan = sdk_plan.get_plan(config.SERVICE_NAME, plan_name)
    # Label the log line with the plan actually being run; the label used to
    # be a hard-coded "sidecar" regardless of plan_name.
    log.info("{} plan: {}".format(plan_name, started_plan))
    assert len(started_plan['phases']) == 1
    assert started_plan['phases'][0]['name'] == plan_name + '-deploy'
    assert len(started_plan['phases'][0]['steps']) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, plan_name)
def run_plan(plan_name, params=None):
    """Start a plan, check its initial shape and wait for it to complete.

    Args:
        plan_name: Name of the plan to run. The started plan is expected to
            have a single phase named ``<plan_name>-deploy`` with two steps.
        params: Optional plan parameters, passed on to
            ``sdk_plan.start_plan``.
    """
    service = config.SERVICE_NAME
    sdk_plan.start_plan(service, plan_name, params)

    running_plan = sdk_plan.get_plan(service, plan_name)
    log.info(sdk_plan.plan_string(plan_name, running_plan))

    phases = running_plan["phases"]
    assert len(phases) == 1
    deploy_phase = phases[0]
    assert deploy_phase["name"] == plan_name + "-deploy"
    assert len(deploy_phase["steps"]) == 2

    sdk_plan.wait_for_completed_plan(service, plan_name)
Exemple #13
0
def test_toxic_sidecar_doesnt_trigger_recovery():
    """Check that the toxic sidecar plan leaves the recovery plan empty.

    1. Run the toxic sidecar plan that will never succeed.
    2. Restart the scheduler.
    3. Verify that the recovery plan is empty, as a failed FINISHED task
       should never trigger recovery.
    """
    service = config.SERVICE_NAME

    plan_before = sdk_plan.get_plan(service, 'recovery')
    assert len(plan_before['phases']) == 0
    log.info(plan_before)
    sdk_plan.start_plan(service, 'sidecar-toxic')
    shakedown.wait_for(ToxicSidecarCheck(), timeout_seconds=10 * 60)

    # Restart the scheduler and wait for it to come up.
    sdk_marathon.restart_app(service)
    sdk_plan.wait_for_completed_deployment(service)

    # Now, verify that its recovery plan is empty.
    sdk_plan.wait_for_completed_plan(service, 'recovery')
    plan_after = sdk_plan.get_plan(service, 'recovery')
    assert len(plan_after['phases']) == 0
def test_sidecar():
    """Start the sidecar plan, check its shape and wait for completion."""
    plan.start_plan(PACKAGE_NAME, 'sidecar')

    sidecar_plan = plan.get_plan(PACKAGE_NAME, 'sidecar')
    sdk_utils.out("sidecar plan: " + str(sidecar_plan))

    # Expect exactly one phase, 'sidecar-deploy', made up of two steps.
    phases = sidecar_plan['phases']
    assert len(phases) == 1
    deploy_phase = phases[0]
    assert deploy_phase['name'] == 'sidecar-deploy'
    assert len(deploy_phase['steps']) == 2

    plan.wait_for_completed_plan(PACKAGE_NAME, 'sidecar')
def test_sidecar():
    """Run the sidecar plan to completion after checking its layout.

    The started plan must consist of a single ``sidecar-deploy`` phase
    that holds two steps.
    """
    plan_name = "sidecar"
    sdk_plan.start_plan(config.SERVICE_NAME, plan_name)

    launched = sdk_plan.get_plan(config.SERVICE_NAME, plan_name)
    log.info(sdk_plan.plan_string("sidecar", launched))

    all_phases = launched["phases"]
    assert len(all_phases) == 1
    only_phase = all_phases[0]
    assert only_phase["name"] == "sidecar-deploy"
    assert len(only_phase["steps"]) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, plan_name)
Exemple #16
0
def test_repair_plan_completes():
    """Start the repair plan for ``testspace1`` and wait for COMPLETE."""
    plan_name = 'repair'
    plan.start_plan(
        PACKAGE_NAME, plan_name, parameters={'CASSANDRA_KEYSPACE': 'testspace1'})

    # Wait on the plan's reported status.
    spin.time_wait_noisy(
        lambda: plan.get_plan(PACKAGE_NAME, plan_name).json()['status'] == 'COMPLETE')
Exemple #17
0
def test_toxic_sidecar_doesnt_trigger_recovery():
    """Check that the toxic sidecar plan leaves the recovery plan empty.

    1. Run the toxic sidecar plan that will never succeed.
    2. Restart the scheduler.
    3. Verify that the recovery plan is empty, as a failed ONCE task
       should never trigger recovery.
    """
    service = config.SERVICE_NAME

    plan_before = sdk_plan.get_plan(service, 'recovery')
    assert len(plan_before['phases']) == 0
    log.info(plan_before)
    sdk_plan.start_plan(service, 'sidecar-toxic')
    wait_for_toxic_sidecar()

    # Restart the scheduler and wait for it to come up.
    sdk_marathon.restart_app(service)
    sdk_plan.wait_for_completed_deployment(service)

    # Now, verify that its recovery plan is empty.
    sdk_plan.wait_for_completed_plan(service, 'recovery')
    plan_after = sdk_plan.get_plan(service, 'recovery')
    assert len(plan_after['phases']) == 0
Exemple #18
0
def run_plan(plan_name, params=None):
    """Start a plan, check its initial shape and wait for it to complete.

    Args:
        plan_name: Name of the plan to run. The started plan is expected to
            have a single phase named ``<plan_name>-deploy`` with two steps.
        params: Optional plan parameters, passed on to ``plan.start_plan``.
    """
    plan.start_plan(PACKAGE_NAME, plan_name, params)

    started_plan = plan.get_plan(PACKAGE_NAME, plan_name)
    # Label the output with the plan actually being run; the label used to be
    # a hard-coded "sidecar" regardless of plan_name.
    sdk_utils.out("{} plan: {}".format(plan_name, started_plan))
    assert len(started_plan['phases']) == 1
    assert started_plan['phases'][0]['name'] == plan_name + '-deploy'
    assert len(started_plan['phases'][0]['steps']) == 2

    plan.wait_for_completed_plan(PACKAGE_NAME, plan_name)
Exemple #19
0
def run_plan(plan_name, params=None):
    """Start a plan, check its initial shape and wait for it to complete.

    Args:
        plan_name: Name of the plan to run. The started plan is expected to
            have a single phase named ``<plan_name>-deploy`` with two steps.
        params: Optional plan parameters, passed on to
            ``sdk_plan.start_plan``.
    """
    sdk_plan.start_plan(config.SERVICE_NAME, plan_name, params)

    started_plan = sdk_plan.get_plan(config.SERVICE_NAME, plan_name)
    # Label the log line with the plan actually being run; the label used to
    # be a hard-coded "sidecar" regardless of plan_name.
    log.info("{} plan: {}".format(plan_name, started_plan))
    assert len(started_plan['phases']) == 1
    assert started_plan['phases'][0]['name'] == plan_name + '-deploy'
    assert len(started_plan['phases'][0]['steps']) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, plan_name)
Exemple #20
0
def run_plan(plan_name, params=None):
    """Run ``plan_name`` to completion after checking its layout.

    The started plan must consist of a single ``<plan_name>-deploy`` phase
    that holds two steps.

    Args:
        plan_name: Name of the plan to start and wait for.
        params: Optional plan parameters, passed on to
            ``sdk_plan.start_plan``.
    """
    sdk_plan.start_plan(config.SERVICE_NAME, plan_name, params)

    launched = sdk_plan.get_plan(config.SERVICE_NAME, plan_name)
    log.info(sdk_plan.plan_string(plan_name, launched))

    all_phases = launched["phases"]
    assert len(all_phases) == 1
    only_phase = all_phases[0]
    assert only_phase["name"] == plan_name + "-deploy"
    assert len(only_phase["steps"]) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, plan_name)
def test_sidecar():
    """Run the sidecar plan to completion after checking its layout.

    The started plan must consist of a single ``sidecar-deploy`` phase
    that holds two steps.
    """
    plan_name = 'sidecar'
    sdk_plan.start_plan(config.SERVICE_NAME, plan_name)

    launched = sdk_plan.get_plan(config.SERVICE_NAME, plan_name)
    log.info("sidecar plan: " + str(launched))

    all_phases = launched['phases']
    assert len(all_phases) == 1
    only_phase = all_phases[0]
    assert only_phase['name'] == 'sidecar-deploy'
    assert len(only_phase['steps']) == 2

    sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, plan_name)
Exemple #22
0
def test_cassandra_migration():
    """Back one cluster up to S3 and restore the snapshot into another.

    The backup cluster name, restore cluster name, node addresses/ports and
    S3 settings are all read from environment variables. The backup is
    started through an HTTP PUT to the backup service's ``v1/backup/start``
    endpoint; the restore is done by running the ``restore-s3`` plan on the
    restore service.
    """
    source_service = os.getenv('CASSANDRA_BACKUP_CLUSTER_NAME')
    target_service = os.getenv('CASSANDRA_RESTORE_CLUSTER_NAME')

    source_env = EnvironmentContext(
        CASSANDRA_NODE_ADDRESS=os.getenv('BACKUP_NODE_ADDRESS',
                                         'node-0.cassandra.mesos'),
        CASSANDRA_NODE_PORT=os.getenv('BACKUP_NODE_PORT', '9042'))
    plan_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME',
                                    'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }

    source_data = DataContext(
        init_jobs=[WRITE_DATA_JOB, VERIFY_DATA_JOB],
        cleanup_jobs=[DELETE_DATA_JOB, VERIFY_DELETION_JOB])
    # Install and run the write/delete data jobs against backup cluster,
    # running dcos-cassandra-service
    with source_env, JobContext(TEST_JOBS), source_data:
        # Back this cluster up to S3; the backup endpoint takes the same
        # values as the plan parameters, under different key names.
        backup_request = {
            'backup_name': plan_parameters['SNAPSHOT_NAME'],
            's3_access_key': plan_parameters['AWS_ACCESS_KEY_ID'],
            's3_secret_key': plan_parameters['AWS_SECRET_ACCESS_KEY'],
            'external_location': 's3://{}'.format(
                plan_parameters['S3_BUCKET_NAME']),
        }
        backup_url = '{}v1/backup/start'.format(
            shakedown.dcos_service_url(source_service))
        dcos.http.put(backup_url, json=backup_request)
        spin.time_wait_noisy(
            lambda: get_dcos_cassandra_plan(source_service).json()['status'] == 'COMPLETE')

    target_env = EnvironmentContext(
        CASSANDRA_NODE_ADDRESS=os.getenv('RESTORE_NODE_ADDRESS',
                                         'node-0-server.sdk-cassandra.mesos'),
        CASSANDRA_NODE_PORT=os.getenv('RESTORE_NODE_PORT', '9052'))

    # No init jobs here: the data is expected to come from the restore.
    target_data = DataContext(
        cleanup_jobs=[VERIFY_DATA_JOB, DELETE_DATA_JOB, VERIFY_DELETION_JOB])
    with target_env, JobContext(TEST_JOBS), target_data:
        plan.start_plan(target_service, 'restore-s3', parameters=plan_parameters)
        spin.time_wait_noisy(
            lambda: plan.get_plan(target_service, 'restore-s3').json()['status'] == 'COMPLETE')
def _dump_plans(item: pytest.Item, service_name: str):
    """Write every plan of ``service_name`` to its own JSON artifact file.

    Args:
        item: The pytest item, passed on to ``_setup_artifact_path`` to
            obtain the output location.
        service_name: Service whose plans are listed and fetched.
    """
    # Use brief timeouts, we just want a best-effort attempt here:
    brief_timeout = 5
    for plan_name in sdk_plan.list_plans(service_name, brief_timeout):
        plan_state = sdk_plan.get_plan(service_name, plan_name, brief_timeout)
        # Include service name in plan filename, but be careful about folders...
        filename = 'plan_{}_{}.json'.format(service_name.replace('/', '_'), plan_name)
        artifact_path = _setup_artifact_path(item, filename)
        serialized = json.dumps(plan_state, indent=2)
        log.info('=> Writing {} ({} bytes)'.format(artifact_path, len(serialized)))
        with open(artifact_path, 'w') as out_file:
            # ... and a trailing newline
            out_file.write(serialized + '\n')
Exemple #24
0
def _dump_plans(item: pytest.Item, service_name: str):
    """Dump the state of each of the service's plans to a JSON file.

    One ``plan_<service>_<plan>.json`` file is written per plan, at the path
    returned by ``_setup_artifact_path`` for ``item``.

    Args:
        item: The pytest item the artifacts belong to.
        service_name: Service whose plans are listed and fetched.
    """
    # Use brief timeouts, we just want a best-effort attempt here:
    timeout_seconds = 5
    names = sdk_plan.list_plans(service_name, timeout_seconds)
    for name in names:
        state = sdk_plan.get_plan(service_name, name, timeout_seconds)
        # Include service name in plan filename, but be careful about folders...
        flat_service_name = service_name.replace('/', '_')
        target = _setup_artifact_path(
            item, 'plan_{}_{}.json'.format(flat_service_name, name))
        payload = json.dumps(state, indent=2)
        log.info('=> Writing {} ({} bytes)'.format(target, len(payload)))
        with open(target, 'w') as handle:
            # The payload is followed by a trailing newline.
            handle.write(payload + '\n')
Exemple #25
0
def check_healthy(count=DEFAULT_TASK_COUNT):
    """Fetch the deploy and recovery plans, then check the running tasks.

    Args:
        count: Task count passed to ``tasks.check_running``.
    """
    # Getting a plan only returns 200 when it is complete,
    # so when getting the plan succeeds, the plan is also complete.
    for plan_name in ("deploy", "recovery"):
        plan.get_plan(PACKAGE_NAME, plan_name)
    tasks.check_running(PACKAGE_NAME, count)