def destroy_app(app_name):
    """Issue a delete request for ``app_name`` and wait for its service to vanish.

    Blocks until ``shakedown.get_service(app_name)`` returns ``None``.
    """
    sdk_cmd.request('delete', api_url_with_param('apps', app_name))

    def service_is_gone():
        # Make sure the scheduler has been destroyed
        return shakedown.get_service(app_name) is None

    sdk_spin.time_wait_noisy(service_is_gone)
def test_joins_overlay_network():
    """Check that the container joined the dcos subnet at 9.0.0.0/24.

    The address check lives in the task itself: it inspects the container IP
    and fails when the address is wrong, which keeps the plan from reaching
    COMPLETE. This test therefore only waits for the deployment plan to
    report COMPLETE.
    """
    def deployment_complete():
        status = plan.get_deployment_plan(PACKAGE_NAME).json()['status']
        return status == 'COMPLETE'

    spin.time_wait_noisy(deployment_complete)
def install(package_name, running_task_count, service_name=None, additional_options=None, package_version=None):
    """Install a package and wait for its tasks and marathon deployment.

    Args:
        package_name: Name of the package to install.
        running_task_count: Number of running tasks to wait for.
        service_name: Service to watch; falls back to ``package_name``.
        additional_options: Extra package options passed to
            ``get_package_options``. ``None`` means no extra options.
        package_version: Version handed to ``shakedown.install_package``.
    """
    # A fresh dict per call: a shared ``{}`` default would leak any mutation
    # made by the helpers below into later calls.
    if additional_options is None:
        additional_options = {}
    if not service_name:
        service_name = package_name
    start = time.time()
    merged_options = get_package_options(additional_options)

    print('Installing {} with options={} version={}'.format(package_name, merged_options, package_version))

    # install_package_and_wait silently waits for all marathon deployments to clear.
    # to give some visibility, install in the following order:
    # 1. install package
    shakedown.install_package(package_name, package_version=package_version, options_json=merged_options)

    # 2. wait for expected tasks to come up
    print("Waiting for expected tasks to come up...")
    sdk_tasks.check_running(service_name, running_task_count)

    # 3. check service health
    marathon_client = dcos.marathon.create_client()

    def deployment_finished():
        # TODO(nickbp): upstream fix to shakedown, which currently checks for ANY deployments rather
        #               than the one we care about
        deploying_apps = set()
        print("Getting deployments")
        deployments = marathon_client.get_deployments()
        print("Found {} deployments".format(len(deployments)))
        for deployment in deployments:
            print("Deployment: {}".format(deployment))
            for app in deployment.get('affectedApps', []):
                print("Adding {}".format(app))
                deploying_apps.add(app)
        print('Checking deployment of {} has ended:\n- Deploying apps: {}'.format(service_name, deploying_apps))
        return '/{}'.format(service_name) not in deploying_apps

    sdk_spin.time_wait_noisy(deployment_finished, timeout_seconds=30)
    print('Install done after {}'.format(sdk_spin.pretty_time(time.time() - start)))
def test_joins_overlay_network():
    """Check that the container joined the dcos subnet at 9.0.0.0/24.

    The task itself validates the container IP address and fails when it is
    incorrect, which prevents the plan from reaching COMPLETE. Waiting for a
    COMPLETE deployment plan is therefore the whole test.
    """
    def plan_is_complete():
        deployment_plan = plan.get_deployment_plan(PACKAGE_NAME).json()
        return deployment_plan['status'] == 'COMPLETE'

    spin.time_wait_noisy(plan_is_complete)
def run_backup_and_restore(backup_plan, restore_plan, plan_parameters):
    """Write data, back it up, delete it, restore it, and verify the result.

    Args:
        backup_plan: Name of the plan started for the backup step.
        restore_plan: Name of the plan started for the restore step.
        plan_parameters: Parameters passed to both plans.
    """
    def start_and_await(plan_name):
        # Start the named plan and block until it reports COMPLETE.
        plan.start_plan(PACKAGE_NAME, plan_name, parameters=plan_parameters)

        def plan_complete():
            return plan.get_plan(PACKAGE_NAME, plan_name).json()['status'] == 'COMPLETE'

        spin.time_wait_noisy(plan_complete)

    # Write data to Cassandra with a metronome job, then confirm it landed
    launch_and_verify_job(WRITE_DATA_JOB)
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Backup: upload snapshots and schema to S3
    start_and_await(backup_plan)

    # Drop all keyspaces and tables with a metronome job, then confirm the deletion
    launch_and_verify_job(DELETE_DATA_JOB)
    launch_and_verify_job(VERIFY_DELETION_JOB)

    # Restore: retrieve snapshots and schema from S3
    start_and_await(restore_plan)

    # The data written and then deleted should be back
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Delete data in preparation for any other backup tests
    launch_and_verify_job(DELETE_DATA_JOB)
def test_lock():
    """Verify that a second scheduler fails to start while one is running.

    Without locking, the second scheduler would fail during registration,
    but only after writing its config to ZK. To confirm it fails immediately,
    the test checks that the ZK config state is unmodified.
    """
    marathon_client = dcos.marathon.create_client()

    def last_failure_timestamp(app_definition):
        # Timestamp of the app's most recent task failure, or None if absent.
        return app_definition.get("lastTaskFailure", {}).get("timestamp", None)

    # Snapshot the ZK state of the running framework
    zk_path = "dcos-service-{}/ConfigTarget".format(PACKAGE_NAME)
    config_before = shakedown.get_zk_node_data(zk_path)

    # Fetch the marathon app and remember its last failure timestamp
    app_id = "/{}".format(PACKAGE_NAME)
    app = marathon_client.get_app(app_id)
    previous_failure = last_failure_timestamp(app)

    # Scale to 2 instances: drop the single-instance label first, then scale
    labels = app["labels"]
    labels.pop("MARATHON_SINGLE_INSTANCE_APP")
    marathon_client.update_app(app_id, {"labels": labels})
    shakedown.deployment_wait()
    marathon_client.update_app(app_id, {"instances": 2})

    # Wait for the second scheduler to fail (a new failure timestamp appears)
    def new_failure_recorded():
        current_failure = last_failure_timestamp(marathon_client.get_app(app_id))
        return current_failure != previous_failure

    spin.time_wait_noisy(new_failure_recorded)

    # ZK must be unchanged
    config_after = shakedown.get_zk_node_data(zk_path)
    assert config_before == config_after
def test_deploy():
    """Check that deployment fails until the required envvars are supplied."""
    wait_time = 30
    # taskcfg.yml will initially fail to deploy because several options are missing in the default
    # marathon.json.mustache. verify that tasks are failing for 30s before continuing.
    print('Checking that tasks are failing to launch for at least {}s'.format(wait_time))

    # we can get brief blips of TASK_RUNNING but they shouldnt last more than 2-3s:
    running_streak = 0

    def observe_task_states():
        # Never returns True: the surrounding wait is expected to time out.
        nonlocal running_streak
        states = [task['state'] for task in shakedown.get_service_tasks(PACKAGE_NAME)]
        print('Task states: {}'.format(states))
        if 'TASK_RUNNING' not in states:
            running_streak = 0
            return False
        running_streak += 1
        assert running_streak <= 3
        return False

    try:
        spin.time_wait_noisy(observe_task_states, timeout_seconds=wait_time)
    except shakedown.TimeoutExpired:
        print('Timeout reached as expected')

    # add the needed envvars in marathon and confirm that the deployment succeeds:
    config = marathon.get_config(PACKAGE_NAME)
    env = config['env']
    env.pop('SLEEP_DURATION')
    env['TASKCFG_ALL_OUTPUT_FILENAME'] = 'output'
    env['TASKCFG_ALL_SLEEP_DURATION'] = '1000'
    marathon.update_app(PACKAGE_NAME, config)

    check_running()
def check_running(service_name, expected_task_count, timeout_seconds=-1):
    """Wait until at least ``expected_task_count`` tasks are TASK_RUNNING.

    Args:
        service_name: Service whose tasks are polled.
        expected_task_count: Minimum number of running tasks required.
        timeout_seconds: Passed through to the wait when positive; otherwise
            the wait is called without an explicit timeout.
    """
    def enough_tasks_running():
        try:
            tasks = shakedown.get_service_tasks(service_name)
        except dcos.errors.DCOSHTTPException:
            sdk_utils.out(
                'Failed to get tasks for service {}'.format(service_name))
            tasks = []
        running_task_names = [
            task['name'] for task in tasks if task['state'] == 'TASK_RUNNING']
        other_tasks = [
            '{}={}'.format(task['name'], task['state'])
            for task in tasks if task['state'] != 'TASK_RUNNING']
        sdk_utils.out(
            'Waiting for {} running tasks, got {} running/{} total:\n- running: {}\n- other: {}'.format(
                expected_task_count,
                len(running_task_names), len(tasks),
                running_task_names, other_tasks))
        return len(running_task_names) >= expected_task_count

    if timeout_seconds <= 0:
        wait_kwargs = {}
    else:
        wait_kwargs = {'timeout_seconds': timeout_seconds}
    sdk_spin.time_wait_noisy(enough_tasks_running, **wait_kwargs)
def check_tasks_updated(service_name, prefix, old_task_ids, timeout_seconds=-1):
    """Wait until no current task id matches one of ``old_task_ids``.

    The wait also requires at least as many current tasks as there were old
    ones.

    Args:
        service_name: Service whose task ids are polled.
        prefix: Task-name prefix passed to ``get_task_ids``.
        old_task_ids: Task ids recorded before the update.
        timeout_seconds: Passed through to the wait when positive; otherwise
            the wait is called without an explicit timeout.
    """
    def all_tasks_replaced():
        try:
            current_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get task ids for service {}'.format(service_name))
            current_ids = []

        print(
            'Waiting for tasks starting with "{}" to be updated:\n- Old tasks: {}\n- Current tasks: {}'
            .format(prefix, old_task_ids, current_ids))
        old_id_still_present = any(
            [task_id in old_task_ids for task_id in current_ids])
        too_few_tasks = len(current_ids) < len(old_task_ids)
        return not (old_id_still_present or too_few_tasks)

    if timeout_seconds <= 0:
        wait_kwargs = {}
    else:
        wait_kwargs = {'timeout_seconds': timeout_seconds}
    sdk_spin.time_wait_noisy(all_tasks_replaced, **wait_kwargs)
def test_lock():
    """Verify that a second scheduler fails to start while one is running.

    Without locking, the scheduler would fail during registration, but after
    writing its config to ZK. To verify that it fails immediately, the test
    ensures that the ZK config state is unmodified.
    """
    marathon_client = dcos.marathon.create_client()

    def failure_timestamp_of(app_definition):
        # Timestamp of the last task failure, or None when none is recorded.
        return app_definition.get("lastTaskFailure", {}).get("timestamp", None)

    # ZK state of the running framework, captured before scaling
    zk_path = "dcos-service-{}/ConfigTarget".format(PACKAGE_NAME)
    zk_before = shakedown.get_zk_node_data(zk_path)

    # Marathon app and its most recent failure timestamp
    app_id = "/{}".format(PACKAGE_NAME)
    app = marathon_client.get_app(app_id)
    baseline_timestamp = failure_timestamp_of(app)

    # Scale to 2 instances after removing the single-instance label
    labels = app["labels"]
    labels.pop("MARATHON_SINGLE_INSTANCE_APP")
    marathon_client.update_app(app_id, {"labels": labels})
    shakedown.deployment_wait()
    marathon_client.update_app(app_id, {"instances": 2})

    # Wait for the second scheduler to fail
    def failure_timestamp_changed():
        latest = failure_timestamp_of(marathon_client.get_app(app_id))
        return latest != baseline_timestamp

    spin.time_wait_noisy(failure_timestamp_changed)

    # Verify ZK is unchanged
    zk_after = shakedown.get_zk_node_data(zk_path)
    assert zk_before == zk_after
def verify_job_succeeded(job_name, run_id):
    """Wait until ``run_id`` is listed among the job's successful finished runs."""
    def run_succeeded():
        raw_history = cmd.run_cli(
            'job history --show-failures --json {}'.format(job_name))
        successful_runs = json.loads(raw_history)['history']['successfulFinishedRuns']
        return run_id in [run['id'] for run in successful_runs]

    # Verify that our most recent run succeeded
    spin.time_wait_noisy(run_succeeded)
def install(
        package_name,
        running_task_count,
        service_name=None,
        additional_options=None,
        package_version=None,
        check_suppression=True):
    """Install a package and wait until it is fully deployed.

    Args:
        package_name: Name of the package to install.
        running_task_count: Number of running tasks to wait for.
        service_name: Service to watch; falls back to ``package_name``.
        additional_options: Extra package options passed to
            ``get_package_options``. ``None`` means no extra options.
        package_version: Version handed to ``shakedown.install_package``.
        check_suppression: When true, also wait for
            ``sdk_api.is_suppressed(service_name)`` to become true.
    """
    # A fresh dict per call: a shared ``{}`` default would leak any mutation
    # made by the helpers below into later calls.
    if additional_options is None:
        additional_options = {}
    if not service_name:
        service_name = package_name
    start = time.time()
    merged_options = get_package_options(additional_options)

    sdk_utils.out('Installing {} with options={} version={}'.format(
        package_name, merged_options, package_version))

    # install_package_and_wait silently waits for all marathon deployments to clear.
    # to give some visibility, install in the following order:
    # 1. install package
    shakedown.install_package(
        package_name,
        package_version=package_version,
        options_json=merged_options)

    # 2. wait for expected tasks to come up
    sdk_utils.out("Waiting for expected tasks to come up...")
    sdk_tasks.check_running(service_name, running_task_count)
    sdk_plan.wait_for_completed_deployment(service_name)

    # 3. check service health
    marathon_client = dcos.marathon.create_client()

    def is_deployment_finished():
        # TODO(nickbp): upstream fix to shakedown, which currently checks for ANY deployments rather
        #               than the one we care about
        deploying_apps = set()
        sdk_utils.out("Getting deployments")
        deployments = marathon_client.get_deployments()
        sdk_utils.out("Found {} deployments".format(len(deployments)))
        for deployment in deployments:
            sdk_utils.out("Deployment: {}".format(deployment))
            for app in deployment.get('affectedApps', []):
                sdk_utils.out("Adding {}".format(app))
                deploying_apps.add(app)
        sdk_utils.out('Checking that deployment of {} has ended:\n- Deploying apps: {}'.format(service_name, deploying_apps))
        return '/{}'.format(service_name) not in deploying_apps

    sdk_utils.out("Waiting for marathon deployment to finish...")
    sdk_spin.time_wait_noisy(is_deployment_finished)

    # 4. Ensure the framework is suppressed.
    #
    # This is only configurable in order to support installs from
    # Universe during the upgrade_downgrade tests, because currently
    # the suppression endpoint isn't supported by all frameworks in
    # Universe.  It can be removed once all frameworks rely on
    # dcos-commons >= 0.13.
    if check_suppression:
        sdk_utils.out("Waiting for framework to be suppressed...")
        sdk_spin.time_wait_noisy(
            lambda: sdk_api.is_suppressed(service_name))

    sdk_utils.out('Install done after {}'.format(sdk_spin.pretty_time(time.time() - start)))
def verify_job_finished(job_name, run_id):
    """Wait until ``run_id`` appears among the job's finished runs.

    A run counts as finished when it is listed under either
    ``successfulFinishedRuns`` or ``failedFinishedRuns`` of the job history.

    Args:
        job_name: Job whose history is polled through ``get_runs``.
        run_id: Id of the run to wait for.
    """
    def run_finished():
        # Fetch the history once per poll so both lists come from the same
        # snapshot and the lookup is not issued twice.
        history = get_runs(job_name)['history']
        return any(
            run_id in [run['id'] for run in history[outcome]]
            for outcome in ('successfulFinishedRuns', 'failedFinishedRuns'))

    spin.time_wait_noisy(run_finished)
def check_running(service_name, expected_task_count, timeout_seconds=-1):
    """Wait until at least ``expected_task_count`` tasks are TASK_RUNNING.

    Args:
        service_name: Service whose tasks are polled.
        expected_task_count: Minimum number of running tasks required.
        timeout_seconds: Passed through to the wait when positive; otherwise
            the wait is called without an explicit timeout.
    """
    def enough_tasks_running():
        try:
            tasks = shakedown.get_service_tasks(service_name)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get tasks for service {}'.format(service_name))
            tasks = []
        running_task_names = [
            task['name'] for task in tasks if task['state'] == 'TASK_RUNNING']
        other_tasks = [
            '{}={}'.format(task['name'], task['state'])
            for task in tasks if task['state'] != 'TASK_RUNNING']
        print('Waiting for {} running tasks, got {} running/{} total:\n- running: {}\n- other: {}'.format(
            expected_task_count,
            len(running_task_names), len(tasks),
            running_task_names, other_tasks))
        return len(running_task_names) >= expected_task_count

    if timeout_seconds <= 0:
        wait_kwargs = {}
    else:
        wait_kwargs = {'timeout_seconds': timeout_seconds}
    sdk_spin.time_wait_noisy(enough_tasks_running, **wait_kwargs)
def test_backup_and_restore_flow():
    """Back up to S3, delete the data, restore it, and verify it is back.

    S3 and AWS settings are read from the environment; the bucket name and
    region fall back to built-in defaults when unset.
    """
    backup_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME', 'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }

    def start_and_await(plan_name):
        # Start the named plan and block until it reports COMPLETE.
        plan.start_plan(PACKAGE_NAME, plan_name, parameters=backup_parameters)

        def plan_complete():
            return plan.get_plan(PACKAGE_NAME, plan_name).json()['status'] == 'COMPLETE'

        spin.time_wait_noisy(plan_complete)

    # Write data to Cassandra with a metronome job, then confirm it landed
    launch_and_verify_job(WRITE_DATA_JOB)
    launch_and_verify_job(VERIFY_DATA_JOB)

    # Backup: upload snapshots and schema to S3
    start_and_await('backup-s3')

    # Drop all keyspaces and tables with a metronome job, then confirm the deletion
    launch_and_verify_job(DELETE_DATA_JOB)
    launch_and_verify_job(VERIFY_DELETION_JOB)

    # Restore: retrieve snapshots and schema from S3
    start_and_await('restore-s3')

    # The verify job has now run twice, so two successes are expected
    launch_and_verify_job(VERIFY_DATA_JOB, expected_successes=2)
def upgrade_or_downgrade(package_name, running_task_count):
    """Reinstall ``package_name`` and confirm that every task was replaced.

    Args:
        package_name: Package (and service) to destroy and reinstall.
        running_task_count: Number of running tasks expected after install.
    """
    # Remember the current task ids so the restart can be detected afterwards.
    previous_task_ids = tasks.get_task_ids(package_name, '')
    marathon.destroy_app(package_name)
    install.install(package_name, running_task_count)

    print('Waiting for upgrade / downgrade deployment to complete')

    def deployment_complete():
        status = plan.get_deployment_plan(package_name).json()['status']
        return status == 'COMPLETE'

    spin.time_wait_noisy(deployment_complete)

    print('Checking that all tasks have restarted')
    tasks.check_tasks_updated(package_name, '', previous_task_ids)
def test_task_dns_prefix_points_to_all_tasks():
    """Check the discovery name of the ``hello-0`` pod's tasks."""
    pod_info_url = shakedown.dcos_service_url(PACKAGE_NAME) + "/v1/pods/{}/info".format("hello-0")
    pod_info = dcos.http.get(pod_info_url).json()

    # Assert that DiscoveryInfo is correctly set on tasks.
    assert(all(task["info"]["discovery"]["name"] == "hello-0" for task in pod_info))

    # NOTE(review): no DNS lookup happens here; the only remaining check is
    # that the deployment plan reaches COMPLETE.
    def deployment_complete():
        status = plan.get_deployment_plan(PACKAGE_NAME).json()['status']
        return status == 'COMPLETE'

    spin.time_wait_noisy(deployment_complete)
def test_repair_plan_completes():
    """Start the ``repair`` plan for ``testspace1`` and wait for COMPLETE."""
    repair_parameters = {'CASSANDRA_KEYSPACE': 'testspace1'}
    plan.start_plan(PACKAGE_NAME, 'repair', parameters=repair_parameters)

    def repair_complete():
        repair_plan = plan.get_plan(PACKAGE_NAME, 'repair').json()
        return repair_plan['status'] == 'COMPLETE'

    spin.time_wait_noisy(repair_complete)
def test_cassandra_migration():
    """Back up one cluster to S3 and restore the snapshot into another.

    Cluster names, node addresses, ports and AWS settings come from the
    environment; addresses, ports, bucket and region have defaults.
    """
    backup_service_name = os.getenv('CASSANDRA_BACKUP_CLUSTER_NAME')
    restore_service_name = os.getenv('CASSANDRA_RESTORE_CLUSTER_NAME')

    backup_env = EnvironmentContext(
        CASSANDRA_NODE_ADDRESS=os.getenv('BACKUP_NODE_ADDRESS', 'node-0.cassandra.mesos'),
        CASSANDRA_NODE_PORT=os.getenv('BACKUP_NODE_PORT', '9042'))
    plan_parameters = {
        'S3_BUCKET_NAME': os.getenv('AWS_BUCKET_NAME', 'infinity-framework-test'),
        'AWS_ACCESS_KEY_ID': os.getenv('AWS_ACCESS_KEY_ID'),
        'AWS_SECRET_ACCESS_KEY': os.getenv('AWS_SECRET_ACCESS_KEY'),
        'AWS_REGION': os.getenv('AWS_REGION', 'us-west-2'),
        'SNAPSHOT_NAME': str(uuid.uuid1()),
        'CASSANDRA_KEYSPACES': '"testspace1 testspace2"',
    }
    backup_data = DataContext(
        init_jobs=[WRITE_DATA_JOB, VERIFY_DATA_JOB],
        cleanup_jobs=[DELETE_DATA_JOB, VERIFY_DELETION_JOB])

    # Install and run the write/delete data jobs against backup cluster,
    # running dcos-cassandra-service
    with backup_env, JobContext(TEST_JOBS), backup_data:
        # Back this cluster up to S3
        backup_parameters = {
            'backup_name': plan_parameters['SNAPSHOT_NAME'],
            's3_access_key': plan_parameters['AWS_ACCESS_KEY_ID'],
            's3_secret_key': plan_parameters['AWS_SECRET_ACCESS_KEY'],
            'external_location': 's3://{}'.format(plan_parameters['S3_BUCKET_NAME']),
        }
        backup_url = '{}v1/backup/start'.format(
            shakedown.dcos_service_url(backup_service_name))
        dcos.http.put(backup_url, json=backup_parameters)

        def backup_complete():
            backup_plan = get_dcos_cassandra_plan(backup_service_name).json()
            return backup_plan['status'] == 'COMPLETE'

        spin.time_wait_noisy(backup_complete)

    restore_env = EnvironmentContext(
        CASSANDRA_NODE_ADDRESS=os.getenv('RESTORE_NODE_ADDRESS', 'node-0-server.sdk-cassandra.mesos'),
        CASSANDRA_NODE_PORT=os.getenv('RESTORE_NODE_PORT', '9052'))
    restore_data = DataContext(
        cleanup_jobs=[VERIFY_DATA_JOB, DELETE_DATA_JOB, VERIFY_DELETION_JOB])

    # Restore the snapshot into the second cluster
    with restore_env, JobContext(TEST_JOBS), restore_data:
        plan.start_plan(restore_service_name, 'restore-s3', parameters=plan_parameters)

        def restore_complete():
            restore_plan = plan.get_plan(restore_service_name, 'restore-s3').json()
            return restore_plan['status'] == 'COMPLETE'

        spin.time_wait_noisy(restore_complete)
def test_sidecar():
    """Start the sidecar plan, check its shape, and wait for COMPLETE."""
    plan.start_sidecar_plan(PACKAGE_NAME)
    sidecar_plan = plan.get_sidecar_plan(PACKAGE_NAME).json()
    sdk_utils.out("sidecar_plan: " + str(sidecar_plan))

    # Expect exactly one phase, named 'sidecar-deploy', holding two steps.
    phases = sidecar_plan['phases']
    assert (len(phases) == 1)
    deploy_phase = phases[0]
    assert (deploy_phase['name'] == 'sidecar-deploy')
    assert (len(deploy_phase['steps']) == 2)

    def sidecar_complete():
        return plan.get_sidecar_plan(PACKAGE_NAME).json()['status'] == 'COMPLETE'

    spin.time_wait_noisy(sidecar_complete)
def test_sidecar():
    """Start the parameterized sidecar plan, check its shape, await COMPLETE."""
    plan.start_sidecar_plan(PACKAGE_NAME, {'PLAN_PARAMETER': 'parameterized'})
    sidecar_plan = plan.get_sidecar_plan(PACKAGE_NAME).json()
    print("sidecar_plan: " + str(sidecar_plan))

    # Expect exactly one phase, named 'sidecar-deploy', holding two steps.
    phases = sidecar_plan['phases']
    assert(len(phases) == 1)
    deploy_phase = phases[0]
    assert(deploy_phase['name'] == 'sidecar-deploy')
    assert(len(deploy_phase['steps']) == 2)

    def sidecar_complete():
        return plan.get_sidecar_plan(PACKAGE_NAME).json()['status'] == 'COMPLETE'

    spin.time_wait_noisy(sidecar_complete)
def test_suppress():
    """Wait until the service's ``suppressed`` state property reads "true"."""
    dcos_url = dcos.config.get_config_val('core.dcos_url')
    suppressed_path = 'service/{}/v1/state/properties/suppressed'.format(PACKAGE_NAME)
    suppressed_url = urllib.parse.urljoin(dcos_url, suppressed_path)

    def is_suppressed():
        reply = dcos.http.get(suppressed_url)
        # Surface HTTP errors instead of comparing an error body.
        reply.raise_for_status()
        return reply.text == "true"

    spin.time_wait_noisy(is_suppressed)
def test_state_properties_get():
    """Check that ``suppressed`` is the only state property and is "true"."""
    def fetch_properties():
        return json.loads(cmd.run_cli('hello-world state properties'))

    # 'suppressed' could be missing if the scheduler recently started, loop for a bit just in case:
    spin.time_wait_noisy(lambda: len(fetch_properties()) > 0, timeout_seconds=30.)

    properties = fetch_properties()
    assert len(properties) == 1
    assert properties[0] == "suppressed"

    suppressed_value = cmd.run_cli('hello-world state property suppressed')
    assert suppressed_value == "true\n"
def launch_and_verify_job(job_name):
    """Run a job and wait until that run is listed as successfully finished."""
    job_name = qualified_job_name(job_name)

    cmd.run_cli('job run {}'.format(job_name))

    # Get the id of the run we just initiated (first entry of the run list)
    runs = json.loads(cmd.run_cli('job show runs {} --json'.format(job_name)))
    run_id = runs[0]['id']

    def run_succeeded():
        raw_history = cmd.run_cli(
            'job history --show-failures --json {}'.format(job_name))
        successful_runs = json.loads(raw_history)['history']['successfulFinishedRuns']
        return run_id in [run['id'] for run in successful_runs]

    # Verify that our most recent run succeeded
    spin.time_wait_noisy(run_succeeded)
def test_state_properties_get():
    """Check that ``suppressed`` is the only state property and is "true"."""
    def fetch_properties():
        return json.loads(cmd.run_cli('hello-world state properties'))

    # 'suppressed' could be missing if the scheduler recently started, loop for a bit just in case:
    spin.time_wait_noisy(lambda: len(fetch_properties()) > 0, timeout_seconds=30)

    properties = fetch_properties()
    assert len(properties) == 1
    assert properties[0] == "suppressed"

    suppressed_value = cmd.run_cli('hello-world state property suppressed')
    assert suppressed_value == "true\n"
def test_state_refresh_disable_cache():
    """Disable and re-enable state caching via a scheduler envvar."""
    app_url_suffix = 'apps/' + PACKAGE_NAME

    check_running()
    task_ids = tasks.get_task_ids(PACKAGE_NAME, '')

    # caching enabled by default:
    refresh_output = cmd.run_cli('hello-world state refresh_cache')
    assert "Received cmd: refresh" in refresh_output

    config = marathon.get_config(PACKAGE_NAME)
    # Value unused; the read still fails loudly if HELLO_CPUS is absent or not numeric.
    cpus = float(config['env']['HELLO_CPUS'])
    config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    cmd.request('put', marathon.api_url(app_url_suffix), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def refresh_rejected_with_conflict():
        try:
            cmd.run_cli('hello-world state refresh_cache')
        except Exception as err:
            if "failed: 409 Conflict" in err.args[0]:
                return True
        return False

    spin.time_wait_noisy(refresh_rejected_with_conflict, timeout_seconds=120.)

    config = marathon.get_config(PACKAGE_NAME)
    cpus = float(config['env']['HELLO_CPUS'])
    del config['env']['DISABLE_STATE_CACHE']
    cmd.request('put', marathon.api_url(app_url_suffix), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def refresh_cache():
        return cmd.run_cli('hello-world state refresh_cache')

    refresh_output = spin.time_wait_return(refresh_cache, timeout_seconds=120.)
    assert "Received cmd: refresh" in refresh_output
def check_tasks_updated(service_name, prefix, old_task_ids):
    """Wait until no current task id matches one of ``old_task_ids``.

    The wait also requires at least as many current tasks as there were old
    ones.

    Args:
        service_name: Service whose task ids are polled.
        prefix: Task-name prefix passed to ``get_task_ids``.
        old_task_ids: Task ids recorded before the update.
    """
    def all_tasks_replaced():
        try:
            current_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get task ids for service {}'.format(service_name))
            current_ids = []

        print('Waiting for tasks starting with "{}" to be updated:\n- Old tasks: {}\n- Current tasks: {}'.format(
            prefix, old_task_ids, current_ids))
        old_id_still_present = any(
            [task_id in old_task_ids for task_id in current_ids])
        too_few_tasks = len(current_ids) < len(old_task_ids)
        return not (old_id_still_present or too_few_tasks)

    sdk_spin.time_wait_noisy(all_tasks_replaced)
def check_tasks_not_updated(service_name, prefix, old_task_ids):
    """Poll for up to 60s to see whether every current task id is an old one.

    A ``shakedown.TimeoutExpired`` from the wait is caught and reported as
    expected rather than raised.

    NOTE(review): the predicate returns True as soon as all current ids are
    old ones, and a timeout is swallowed, so this check does not appear able
    to fail -- confirm the intended semantics against ``time_wait_noisy``.

    Args:
        service_name: Service whose task ids are polled.
        prefix: Task-name prefix passed to ``get_task_ids``.
        old_task_ids: Task ids recorded before the operation under test.
    """
    def tasks_undisturbed():
        try:
            current_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get task ids for service {}'.format(service_name))
            current_ids = []

        print('Checking prior tasks starting with "{}" are undisturbed:\n- Old tasks: {}\n- Current tasks: {}'.format(
            prefix, old_task_ids, current_ids))
        return all(task_id in old_task_ids for task_id in current_ids)

    try:
        sdk_spin.time_wait_noisy(tasks_undisturbed, timeout_seconds=60)
    except shakedown.TimeoutExpired:
        print('Timeout reached as expected')
def install(package_name,
            running_task_count,
            service_name=None,
            additional_options=None,
            package_version=None):
    """Install a package and wait for its tasks and marathon deployment.

    Args:
        package_name: Name of the package to install.
        running_task_count: Number of running tasks to wait for.
        service_name: Service to watch; falls back to ``package_name``.
        additional_options: Extra package options passed to
            ``get_package_options``. ``None`` means no extra options.
        package_version: Version handed to ``shakedown.install_package``.
    """
    # A fresh dict per call: a shared ``{}`` default would leak any mutation
    # made by the helpers below into later calls.
    if additional_options is None:
        additional_options = {}
    if not service_name:
        service_name = package_name
    start = time.time()
    merged_options = get_package_options(additional_options)

    print('Installing {} with options={} version={}'.format(
        package_name, merged_options, package_version))

    # install_package_and_wait silently waits for all marathon deployments to clear.
    # to give some visibility, install in the following order:
    # 1. install package
    shakedown.install_package(package_name,
                              package_version=package_version,
                              options_json=merged_options)

    # 2. wait for expected tasks to come up
    print("Waiting for expected tasks to come up...")
    sdk_tasks.check_running(service_name, running_task_count)

    # 3. check service health
    marathon_client = dcos.marathon.create_client()

    def deployment_finished():
        # TODO(nickbp): upstream fix to shakedown, which currently checks for ANY deployments rather
        #               than the one we care about
        deploying_apps = set()
        print("Getting deployments")
        deployments = marathon_client.get_deployments()
        print("Found {} deployments".format(len(deployments)))
        for deployment in deployments:
            print("Deployment: {}".format(deployment))
            for app in deployment.get('affectedApps', []):
                print("Adding {}".format(app))
                deploying_apps.add(app)
        print('Checking deployment of {} has ended:\n- Deploying apps: {}'.
              format(service_name, deploying_apps))
        return '/{}'.format(service_name) not in deploying_apps

    sdk_spin.time_wait_noisy(deployment_finished, timeout_seconds=30)
    print('Install done after {}'.format(
        sdk_spin.pretty_time(time.time() - start)))
def check_tasks_not_updated(service_name, prefix, old_task_ids):
    """Poll for up to 60s to see whether every current task id is an old one.

    A ``shakedown.TimeoutExpired`` from the wait is caught and reported as
    expected rather than raised.

    NOTE(review): the predicate returns True as soon as all current ids are
    old ones, and a timeout is swallowed, so this check does not appear able
    to fail -- confirm the intended semantics against ``time_wait_noisy``.

    Args:
        service_name: Service whose task ids are polled.
        prefix: Task-name prefix passed to ``get_task_ids``.
        old_task_ids: Task ids recorded before the operation under test.
    """
    def tasks_undisturbed():
        try:
            current_ids = get_task_ids(service_name, prefix)
        except dcos.errors.DCOSHTTPException:
            print('Failed to get task ids for service {}'.format(service_name))
            current_ids = []

        print(
            'Checking prior tasks starting with "{}" are undisturbed:\n- Old tasks: {}\n- Current tasks: {}'
            .format(prefix, old_task_ids, current_ids))
        return all(task_id in old_task_ids for task_id in current_ids)

    try:
        sdk_spin.time_wait_noisy(tasks_undisturbed, timeout_seconds=60)
    except shakedown.TimeoutExpired:
        print('Timeout reached as expected')
def test_state_refresh_disable_cache():
    """Disable and re-enable state caching via a scheduler envvar."""
    app_url_suffix = 'apps/' + PACKAGE_NAME

    check_running()
    task_ids = tasks.get_task_ids(PACKAGE_NAME, '')

    # caching enabled by default:
    refresh_output = cmd.run_cli('hello-world state refresh_cache')
    assert "Received cmd: refresh" in refresh_output

    config = marathon.get_config(PACKAGE_NAME)
    # Value unused; the read still fails loudly if HELLO_CPUS is absent or not numeric.
    cpus = float(config['env']['HELLO_CPUS'])
    config['env']['DISABLE_STATE_CACHE'] = 'any-text-here'
    cmd.request('put', marathon.api_url(app_url_suffix), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching disabled, refresh_cache should fail with a 409 error (eventually, once scheduler is up):
    def refresh_rejected_with_conflict():
        try:
            cmd.run_cli('hello-world state refresh_cache')
        except Exception as err:
            if "failed: 409 Conflict" in err.args[0]:
                return True
        return False

    spin.time_wait_noisy(refresh_rejected_with_conflict, timeout_seconds=120.)

    config = marathon.get_config(PACKAGE_NAME)
    cpus = float(config['env']['HELLO_CPUS'])
    del config['env']['DISABLE_STATE_CACHE']
    cmd.request('put', marathon.api_url(app_url_suffix), json=config)

    tasks.check_tasks_not_updated(PACKAGE_NAME, '', task_ids)
    check_running()

    # caching reenabled, refresh_cache should succeed (eventually, once scheduler is up):
    def refresh_cache():
        return cmd.run_cli('hello-world state refresh_cache')

    refresh_output = spin.time_wait_return(refresh_cache, timeout_seconds=120.)
    assert "Received cmd: refresh" in refresh_output
def test_node_replace_replaces_node():
    """Replace ``node-2`` and verify the replacement with a job."""
    task_listing = cmd.run_cli('task')
    node_2_lines = [
        line for line in task_listing.split('\n')
        if line.startswith('node-2-server')]
    # Second whitespace-separated column of the last matching line.
    node_ip = node_2_lines[-1].split()[1]

    # Update the placement constraints so the new node doesn't end up on the
    # same host
    config = marathon.get_config(PACKAGE_NAME)
    config['env']['PLACEMENT_CONSTRAINT'] = 'hostname:UNLIKE:{}'.format(node_ip)
    marathon.update_app(PACKAGE_NAME, config)

    plan.wait_for_completed_deployment(PACKAGE_NAME)

    # start replace and wait for it to finish
    cmd.run_cli('cassandra pods replace node-2')
    plan.wait_for_completed_recovery(PACKAGE_NAME)

    # Install replace verification job with correct node IP templated
    # (the job checks for that IP's absence in the peers list and also verifies
    # that the expected number of peers is present, meaning that the node was
    # replaced from Cassandra's perspective)
    def replace_verified():
        return try_job(VERIFY_REPLACE_JOB)

    with JobContext([VERIFY_REPLACE_JOB], NODE_IP=node_ip):
        spin.time_wait_noisy(replace_verified)
def check_default_version_available(package_name, prev_version):
    """Wait until the package version differs from ``prev_version``."""
    def version_changed():
        current_version = get_pkg_version(package_name)
        return current_version != prev_version

    sdk_spin.time_wait_noisy(version_changed)
def launch_and_verify_job(job_name, expected_successes=1):
    """Run a job and wait for its history to show the expected success count.

    Args:
        job_name: Unqualified job name; qualified via ``qualified_job_name``.
        expected_successes: Count looked for in the job history output.
    """
    cmd.run_cli('job run {}'.format(qualified_job_name(job_name)))

    def history_shows_expected_successes():
        # Plain substring match against the CLI's history output.
        expected_text = 'Successful runs: {}'.format(expected_successes)
        history_output = cmd.run_cli(
            'job history {}'.format(qualified_job_name(job_name)))
        return expected_text in history_output

    spin.time_wait_noisy(history_shows_expected_successes)
def new_default_version_available(prev_version, package_name):
    """Wait until the version of ``package_name`` differs from ``prev_version``."""
    def version_changed():
        current_version = get_pkg_version(package_name)
        return current_version != prev_version

    spin.time_wait_noisy(version_changed)
def new_default_version_available(prev_version):
    """Wait until ``get_pkg_version()`` returns something other than ``prev_version``."""
    def version_changed():
        current_version = get_pkg_version()
        return current_version != prev_version

    spin.time_wait_noisy(version_changed)
def new_default_version_available(prev_version):
    """Wait until ``get_pkg_version()`` returns something other than ``prev_version``."""
    def version_changed():
        current_version = get_pkg_version()
        return current_version != prev_version

    spin.time_wait_noisy(version_changed)