def test_canary_fourth():
    """After continuing the world-deploy phase, the whole deploy plan completes."""
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy world-deploy')

    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3',
        'world-0', 'world-1', 'world-2', 'world-3']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == expected_tasks

    pl = sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, 'deploy')
    log.info(pl)

    assert pl['status'] == 'COMPLETE'
    assert len(pl['phases']) == 2

    # Every phase and every step within each phase should now be COMPLETE.
    for phase in pl['phases']:
        assert phase['status'] == 'COMPLETE'
        steps = phase['steps']
        assert len(steps) == 4
        for step in steps:
            assert step['status'] == 'COMPLETE'
def check_healthy(service_name, count=DEFAULT_TASK_COUNT, recovery_expected=False):
    """Wait for deployment (and optionally recovery) to finish, then verify task count."""
    timeout = 25 * 60
    sdk_plan.wait_for_completed_deployment(service_name, timeout_seconds=timeout)
    if recovery_expected:
        # TODO(elezar): See INFINITY-2109 where we need to better handle recovery health checks
        sdk_plan.wait_for_kicked_off_recovery(service_name, timeout_seconds=timeout)
        sdk_plan.wait_for_completed_recovery(service_name, timeout_seconds=timeout)
    sdk_tasks.check_running(service_name, count)
def setup_constraint_switch():
    """Install hello pinned to one agent, then repoint its placement at another agent.

    Returns a tuple (original_agent, new_agent, original_hello_task_ids).
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    private_agents = sdk_agents.get_private_agents()
    first_host = private_agents[0]["hostname"]
    second_host = private_agents[1]["hostname"]
    log.info("Agents: %s %s", first_host, second_host)
    assert first_host != second_host

    options = _escape_placement_for_1_9(
        {
            "service": {"yaml": "marathon_constraint"},
            "hello": {
                "count": 1,
                # First, we stick the pod to first_host
                "placement": '[["hostname", "LIKE", "{}"]]'.format(first_host),
            },
            "world": {"count": 0},
        }
    )
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, 1, additional_options=options)
    sdk_tasks.check_running(config.SERVICE_NAME, 1)
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello")

    # Now, stick it to second_host
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config["env"]["HELLO_PLACEMENT"] = '[["hostname", "LIKE", "{}"]]'.format(second_host)
    sdk_marathon.update_app(marathon_config)
    # Wait for the scheduler to be up and settled before advancing.
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    return first_host, second_host, hello_ids
def test_canary_third():
    """Continuing hello-deploy completes the hello phase; the world phase stays gated."""
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "plan continue deploy hello-deploy")

    expected_tasks = ["hello-0", "hello-1", "hello-2", "hello-3", "world-0"]
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod list")
    assert rc == 0, "Pod list failed"
    assert json.loads(stdout) == expected_tasks

    pl = sdk_plan.wait_for_completed_phase(config.SERVICE_NAME, "deploy", "hello-deploy")
    log.info(pl)

    assert pl["status"] == "WAITING"
    assert len(pl["phases"]) == 2

    # hello phase: all four steps deployed.
    hello_phase = pl["phases"][0]
    assert hello_phase["status"] == "COMPLETE"
    hello_steps = hello_phase["steps"]
    assert len(hello_steps) == 4
    for step in hello_steps:
        assert step["status"] == "COMPLETE"

    # world phase: first step done, second waits on a manual 'plan continue'.
    world_phase = pl["phases"][1]
    assert world_phase["status"] == "WAITING"
    world_steps = world_phase["steps"]
    assert len(world_steps) == 4
    for step, expected in zip(world_steps, ["COMPLETE", "WAITING", "PENDING", "PENDING"]):
        assert step["status"] == expected
def test_hostname_unique():
    """With hostname:UNIQUE constraints, one hello and one world pod land on each agent."""
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    agent_count = get_num_private_agents()
    options = _escape_placement_for_1_9(
        {
            "service": {"yaml": "marathon_constraint"},
            "hello": {"count": agent_count, "placement": '[["hostname", "UNIQUE"]]'},
            "world": {"count": agent_count, "placement": '[["hostname", "UNIQUE"]]'},
        }
    )
    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        agent_count * 2,
        additional_options=options,
    )

    # hello deploys first. One "world" task should end up placed with each "hello" task.
    # ensure "hello" task can still be placed with "world" task
    old_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0")
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", old_ids)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    # Short check at one-fewer tasks, then the full complement (sequencing kept from original).
    sdk_tasks.check_running(config.SERVICE_NAME, agent_count * 2 - 1, timeout_seconds=10)
    sdk_tasks.check_running(config.SERVICE_NAME, agent_count * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
def setup_constraint_switch():
    """Install hello pinned to one agent, then update placement to target another.

    Returns a tuple (original_agent, new_agent, original_hello_task_ids).
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    private_agents = shakedown.get_private_agents()
    first_agent = private_agents[0]
    second_agent = private_agents[1]
    log.info('Agents: %s %s', first_agent, second_agent)
    assert first_agent != second_agent

    placement_fmt = '[["hostname", "LIKE", "{}"]]'
    options = _escape_placement_for_1_9({
        "service": {
            "yaml": "marathon_constraint"
        },
        "hello": {
            "count": 1,
            # First, we stick the pod to first_agent
            "placement": placement_fmt.format(first_agent)
        },
        "world": {
            "count": 0
        }
    })
    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, 1, additional_options=options)
    sdk_tasks.check_running(config.SERVICE_NAME, 1)
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello')

    # Now, stick it to second_agent
    app = sdk_marathon.get_config(config.SERVICE_NAME)
    app['env']['HELLO_PLACEMENT'] = placement_fmt.format(second_agent)
    sdk_marathon.update_app(config.SERVICE_NAME, app)
    # Wait for the scheduler to be up and settled before advancing.
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    return first_agent, second_agent, hello_ids
def test_no_change():
    """A no-op marathon config update must not restart any broker tasks or change the plan."""
    broker_ids = tasks.get_task_ids(SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE))
    plan1 = service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))

    # Re-submit the current config unmodified.
    config = marathon.get_config(SERVICE_NAME)
    marathon.update_app(SERVICE_NAME, config)
    plan2 = service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))

    assert plan1 == plan2

    # check_tasks_updated is expected to time out: tasks must NOT have been relaunched.
    try:
        tasks.check_tasks_updated(
            SERVICE_NAME, '{}-'.format(DEFAULT_POD_TYPE), broker_ids, timeout_seconds=60)
        assert False, "Should not restart tasks now"
    except AssertionError:
        # Our own failure above: propagate with the original traceback.
        raise
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.
        pass  # timeout here is the expected outcome

    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    assert plan2['status'] == 'COMPLETE'
    assert plan2['phases'][0]['status'] == 'COMPLETE'
    for step in range(DEFAULT_BROKER_COUNT):
        assert plan2['phases'][0]['steps'][step]['status'] == 'COMPLETE'
def install(package_name, running_task_count, service_name=None, additional_options=None, package_version=None):
    """Install a package, wait for its tasks to come up, and wait for marathon to settle.

    Args:
        package_name: universe package to install.
        running_task_count: number of tasks expected to reach RUNNING.
        service_name: service name, defaults to package_name.
        additional_options: optional dict of extra install options (FIX: was a
            mutable default argument `{}`; now None-sentinel per Python best practice).
        package_version: optional explicit package version.
    """
    if not service_name:
        service_name = package_name
    if additional_options is None:
        additional_options = {}
    start = time.time()

    merged_options = get_package_options(additional_options)

    print('Installing {} with options={} version={}'.format(
        package_name, merged_options, package_version))

    # install_package_and_wait silently waits for all marathon deployments to clear.
    # to give some visibility, install in the following order:
    # 1. install package
    shakedown.install_package(
        package_name, package_version=package_version, options_json=merged_options)

    # 2. wait for expected tasks to come up
    print("Waiting for expected tasks to come up...")
    sdk_tasks.check_running(service_name, running_task_count)

    # 3. check service health
    marathon_client = dcos.marathon.create_client()

    def fn():
        # TODO(nickbp): upstream fix to shakedown, which currently checks for ANY deployments rather
        # than the one we care about
        deploying_apps = set()
        print("Getting deployments")
        deployments = marathon_client.get_deployments()
        print("Found {} deployments".format(len(deployments)))
        for d in deployments:
            print("Deployment: {}".format(d))
            for a in d.get('affectedApps', []):
                print("Adding {}".format(a))
                deploying_apps.add(a)
        print('Checking deployment of {} has ended:\n- Deploying apps: {}'.format(
            service_name, deploying_apps))
        return not '/{}'.format(service_name) in deploying_apps
    # Pass the function directly; wrapping it in a lambda added nothing.
    sdk_spin.time_wait_noisy(fn, timeout_seconds=30)

    print('Install done after {}'.format(sdk_spin.pretty_time(time.time() - start)))
def test_shutdown_host():
    """Permanently shut down an agent and verify its pod is replaced onto a different agent."""
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine
    distinct_hosts = set(task.host for task in candidate_tasks)
    assert len(candidate_tasks) == len(distinct_hosts), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list
    replace_task = candidate_tasks[0]
    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started.
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    replacements = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id]
    new_task = replacements[0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
def test_port_static_to_dynamic_port():
    """Switch BROKER_PORT from static to dynamic (0) and verify no broker keeps 9092."""
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    pod_prefix = '{}-'.format(config.DEFAULT_POD_TYPE)
    broker_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, pod_prefix)

    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['BROKER_PORT'] = '0'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, pod_prefix, broker_ids)
    # all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    for broker_id in range(config.DEFAULT_BROKER_COUNT):
        broker_info = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, config.SERVICE_NAME,
            'broker get {}'.format(broker_id), json=True)
        assert broker_info['port'] != 9092

    endpoint_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints broker', json=True)
    assert len(endpoint_info['address']) == config.DEFAULT_BROKER_COUNT
    assert len(endpoint_info['dns']) == config.DEFAULT_BROKER_COUNT

    for entry in endpoint_info['address']:
        assert int(entry.split(':')[-1]) != 9092
    for entry in endpoint_info['dns']:
        assert int(entry.split(':')[-1]) != 9092
def test_secrets_basic():
    # 1) create Secrets
    # 2) install examples/secrets.yml
    # 3) if secret file is not created, tasks will fail
    # 4) wait till deployment finishes
    # 5) do replace operation
    # 6) ensure all tasks are running
    # 7) delete Secrets
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD,
                        additional_options=secret_options)

    hello_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    # FIX: was "word-0-server" (typo) — that prefix matches no tasks, so the
    # world-0 check_tasks_updated below compared against an empty ID list.
    world_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0-server")

    # ensure that secrets work after replace
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace world-0')

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-server", hello_tasks_0)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0-server', world_tasks_0)

    # tasks will fail if secret files are not created by mesos module
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # clean up and delete secrets
    delete_secrets("{}/".format(config.SERVICE_NAME))
def test_increase_decrease_world_nodes():
    """Add two world pods then remove them; other tasks must remain untouched throughout."""
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    config.check_running(foldered_name)

    hello_ids_before = sdk_tasks.get_task_ids(foldered_name, 'hello')
    world_ids_before = sdk_tasks.get_task_ids(foldered_name, 'world')
    log.info('world ids: ' + str(world_ids_before))

    # add 2 world nodes
    sdk_marathon.bump_task_count_config(foldered_name, 'WORLD_COUNT', 2)
    config.check_running(foldered_name)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'world', world_ids_before)

    # check 2 world tasks added:
    world_ids_after = sdk_tasks.get_task_ids(foldered_name, 'world')
    assert 2 + len(world_ids_before) == len(world_ids_after)

    # subtract 2 world nodes
    sdk_marathon.bump_task_count_config(foldered_name, 'WORLD_COUNT', -2)
    config.check_running(foldered_name)

    # wait for the decommission plan for this subtraction to be complete
    sdk_plan.wait_for_completed_plan(foldered_name, 'decommission')

    # check that the total task count is back to original
    sdk_tasks.check_running(
        foldered_name,
        len(hello_ids_before) + len(world_ids_before),
        allow_more=False)
    # check that original tasks weren't affected/relaunched in the process
    sdk_tasks.check_tasks_not_updated(foldered_name, 'hello', hello_ids_before)
    sdk_tasks.check_tasks_not_updated(foldered_name, 'world', world_ids_before)

    # check that the world tasks are back to their prior state (also without changing task ids)
    assert world_ids_before == sdk_tasks.get_task_ids(foldered_name, 'world')
def test_marathon_rack_not_found():
    """Install with an unsatisfiable rack constraint and verify the deploy plan stalls."""
    def get_plan_or_retry():
        # Returns False (spinner retry) until the scheduler serves the plan.
        try:
            return service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))
        except Exception:
            # FIX: was a bare `except:` which also swallowed SystemExit/KeyboardInterrupt.
            return False

    shakedown.install_package(
        PACKAGE_NAME,
        service_name=SERVICE_NAME,
        options_json=install.get_package_options(
            additional_options={'service': {'placement_constraint': 'rack_id:LIKE:rack-foo-.*'}}
        ),
        wait_for_completion=False)
    try:
        tasks.check_running(PACKAGE_NAME, 1, timeout_seconds=120)
        assert False, "Should have failed to install"
    except AssertionError:
        raise
    except Exception:
        # FIX: was a bare `except:`.
        pass  # expected to fail

    pl = spin.time_wait_return(get_plan_or_retry)

    # check that first node is still (unsuccessfully) looking for a match:
    assert pl['status'] == 'IN_PROGRESS'
    assert pl['phases'][0]['status'] == 'IN_PROGRESS'

    # if so early, it can be PREPARED ?
    assert pl['phases'][0]['steps'][0]['status'] in ('PREPARED', 'PENDING')
    assert pl['phases'][0]['steps'][1]['status'] == 'PENDING'
    assert pl['phases'][0]['steps'][2]['status'] == 'PENDING'

    install.uninstall(SERVICE_NAME, PACKAGE_NAME)
def test_add_ingest_and_coordinator_nodes_does_not_restart_master_or_data_nodes() -> None:
    """Scale ingest and coordinator counts up by one each; master/data tasks must not restart."""
    # Record task IDs before the config change so restarts can be detected afterwards.
    initial_master_task_ids = sdk_tasks.get_task_ids(service_name, "master")
    initial_data_task_ids = sdk_tasks.get_task_ids(service_name, "data")

    # Get service configuration.
    _, svc_config, _ = sdk_cmd.svc_cli(package_name, service_name, "describe", parse_json=True)

    ingest_nodes_count = get_in(["ingest_nodes", "count"], svc_config)
    coordinator_nodes_count = get_in(["coordinator_nodes", "count"], svc_config)

    # Module-level counter shared with sibling tests; updated below after the scale-up.
    global current_expected_task_count

    sdk_service.update_configuration(
        package_name,
        service_name,
        {
            "ingest_nodes": {"count": ingest_nodes_count + 1},
            "coordinator_nodes": {"count": coordinator_nodes_count + 1},
        },
        current_expected_task_count,
        # As of 2018-12-14, sdk_upgrade's `wait_for_deployment` has different behavior than
        # sdk_install's (which is what we wanted here), so don't use it. Check manually afterwards
        # with `sdk_tasks.check_running`.
        wait_for_deployment=False,
    )

    # Should be running 2 tasks more.
    current_expected_task_count += 2

    sdk_tasks.check_running(service_name, current_expected_task_count)

    # Master nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "master", initial_master_task_ids)

    # Data nodes should not restart.
    sdk_tasks.check_tasks_not_updated(service_name, "data", initial_data_task_ids)
def test_canary_first():
    """After the first 'plan continue', only hello-0 deploys; all later steps stay gated."""
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy hello-deploy')

    expected_tasks = ['hello-0']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == expected_tasks

    # do not use service_plan always
    # when here, plan should always return properly
    pl = sdk_plan.wait_for_completed_step(
        config.SERVICE_NAME, 'deploy', 'hello-deploy', 'hello-0:[server]')
    log.info(pl)

    assert pl['status'] == 'WAITING'
    assert len(pl['phases']) == 2

    expected_phases = [
        ('WAITING', ['COMPLETE', 'WAITING', 'PENDING', 'PENDING']),
        ('WAITING', ['WAITING', 'WAITING', 'PENDING', 'PENDING']),
    ]
    for phase, (phase_status, step_statuses) in zip(pl['phases'], expected_phases):
        assert phase['status'] == phase_status
        steps = phase['steps']
        assert len(steps) == 4
        for step, expected in zip(steps, step_statuses):
            assert step['status'] == expected
def test_enable():
    """Enabling the test boolean adds three tasks (3 -> 6) without triggering recovery."""
    svc = config.SERVICE_NAME
    sdk_plan.wait_for_completed_deployment(svc)
    sdk_plan.recovery_plan_is_empty(svc)
    sdk_tasks.check_running(svc, 3, timeout_seconds=30, allow_more=False)
    set_test_boolean("true")
    sdk_plan.wait_for_completed_deployment(svc)
    sdk_tasks.check_running(svc, 6, timeout_seconds=30, allow_more=False)
    sdk_plan.recovery_plan_is_empty(svc)
def test_enable():
    """Enabling the test boolean adds three tasks (3 -> 6) without triggering recovery."""
    svc = config.SERVICE_NAME
    sdk_plan.wait_for_completed_deployment(svc)
    sdk_plan.recovery_plan_is_empty(svc)
    sdk_tasks.check_running(svc, 3)
    set_test_boolean('true')
    sdk_plan.wait_for_completed_deployment(svc)
    sdk_tasks.check_running(svc, 6)
    sdk_plan.recovery_plan_is_empty(svc)
def test_static_port_comes_online():
    """Install with static-port options and verify all brokers reach RUNNING."""
    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        config.DEFAULT_BROKER_COUNT,
        additional_options=STATIC_PORT_OPTIONS_DICT)

    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)
def test_service_startup_rapid():
    """Restart a broker and verify it reaches 'started' within the expected startup window.

    Fixes two defects in the original:
    - the log-polling loop never decremented its time budget, so it could spin forever
      if the start/started markers never appeared;
    - the debug dump file was opened without a context manager.
    """
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = 'kafka-0'
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # the following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as follows:
    # '...leader not available...'
    stdout = ''
    retries = 15
    while retries > 0:
        retries -= 1
        stdout = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, config.SERVICE_NAME, 'topic producer_test test 100')
        if 'records sent' in stdout:
            break

    jsonobj = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME,
        'pod restart {}'.format(task_short_name), json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == task_short_name
    assert jsonobj['tasks'] == ['{}-broker'.format(task_short_name)]

    starting_fallback_time = datetime.datetime.now()

    sdk_tasks.check_tasks_updated(
        config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), [broker_task_id_0])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)
    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # extract starting and started lines from log
    starting_time = started_time = None
    retry_seconds_remaining = max_restart_seconds + startup_padding_seconds
    while retry_seconds_remaining > 0.0 and (starting_time is None or started_time is None):
        stdout = sdk_cmd.run_cli("task log --lines=1000 {}".format(broker_task_id_1))
        task_lines = stdout.split('\n')
        for log_line in reversed(task_lines):
            if starting_time is None and ' starting (kafka.server.KafkaServer)' in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and ' started (kafka.server.KafkaServer)' in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            time.sleep(retry_delay_seconds)
            # FIX: decrement the time budget so the loop cannot poll forever.
            retry_seconds_remaining -= retry_delay_seconds

    if started_time is None or starting_time is None:
        # Dump the last fetched log for post-mortem debugging.
        with open('/tmp/kafka_startup_stdout', 'w') as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time - starting_time).total_seconds() <= max_restart_seconds
def replace_broker_pod(service_name=config.SERVICE_NAME):
    """Replace broker pod 0 and wait for the cluster to return to full strength."""
    pod_name = '{}-0'.format(config.DEFAULT_POD_TYPE)
    task_name = '{}-{}'.format(pod_name, config.DEFAULT_TASK_NAME)
    original_ids = sdk_tasks.get_task_ids(service_name, task_name)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'pod replace {}'.format(pod_name))
    sdk_tasks.check_tasks_updated(service_name, task_name, original_ids)
    sdk_tasks.check_running(service_name, config.DEFAULT_BROKER_COUNT)
    # wait till all brokers register
    broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=service_name)
def restart_broker_pods(service_name=config.SERVICE_NAME):
    """Restart every broker pod in turn, verifying each task is relaunched."""
    for index in range(config.DEFAULT_BROKER_COUNT):
        pod_name = '{}-{}'.format(config.DEFAULT_POD_TYPE, index)
        task_name = '{}-{}'.format(pod_name, config.DEFAULT_TASK_NAME)
        old_ids = sdk_tasks.get_task_ids(service_name, task_name)

        restart_info = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, service_name,
            'pod restart {}'.format(pod_name), json=True)
        assert len(restart_info) == 2
        assert restart_info['tasks'][0] == task_name

        sdk_tasks.check_tasks_updated(service_name, task_name, old_ids)
        sdk_tasks.check_running(service_name, config.DEFAULT_BROKER_COUNT)
def test_uninstall():
    """Setting SDK_UNINSTALL in the scheduler env should tear down all service tasks."""
    config.check_running()

    # add the needed envvar in marathon and confirm that the uninstall "deployment" succeeds:
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['SDK_UNINSTALL'] = 'w00t'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, 0)
def test_uninstall():
    """Setting SDK_UNINSTALL in the scheduler env should tear down all service tasks."""
    config.check_running()

    # add the needed envvar in marathon and confirm that the uninstall "deployment" succeeds:
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config["env"]["SDK_UNINSTALL"] = "w00t"
    sdk_marathon.update_app(marathon_config)

    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, 0, allow_more=False)
def test_port_dynamic_to_dynamic_port():
    """A CPU bump rolls all brokers; every task must be relaunched and come back up."""
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    pod_prefix = '{}-'.format(config.DEFAULT_POD_TYPE)
    old_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, pod_prefix)

    sdk_marathon.bump_cpu_count_config(config.SERVICE_NAME, 'BROKER_CPUS')

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, pod_prefix, old_ids)
    # all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)
def test_overlay_network_deployment_and_endpoints():
    """Verify broker/zookeeper endpoints are advertised and brokers sit on the overlay network."""
    # double check
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    endpoints = sdk_networks.get_and_test_endpoints(
        config.PACKAGE_NAME, config.SERVICE_NAME, "", 2)
    assert "broker" in endpoints, "broker is missing from endpoints {}".format(endpoints)
    assert "zookeeper" in endpoints, "zookeeper missing from endpoints {}".format(endpoints)

    broker_endpoints = sdk_networks.get_and_test_endpoints(
        config.PACKAGE_NAME, config.SERVICE_NAME, "broker", 3)
    sdk_networks.check_endpoints_on_overlay(broker_endpoints)

    zookeeper = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints zookeeper')
    expected_zk = 'master.mesos:2181/{}'.format(sdk_utils.get_zk_path(config.SERVICE_NAME))
    assert zookeeper.rstrip() == expected_zk
def test_bump_node_counts():
    """Increase data/ingest/coordinator node counts by one each and wait for the new tasks."""
    # Run this test last, as it changes the task count
    app = marathon.get_config(PACKAGE_NAME)
    for env_key in ('DATA_NODE_COUNT', 'INGEST_NODE_COUNT', 'COORDINATOR_NODE_COUNT'):
        app['env'][env_key] = str(int(app['env'][env_key]) + 1)
    marathon.update_app(PACKAGE_NAME, app)
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT + 3)
def test_changing_discovery_replaces_certificate_sans(hello_world_service):
    """
    Update service configuration to change discovery prefix of a task.
    Scheduler should update task and new SANs should be generated.
    """
    original_tasks = sdk_tasks.get_task_ids(config.PACKAGE_NAME, 'discovery')
    assert len(original_tasks) == 1, 'Expecting exactly one task ID'

    task_id = original_tasks[0]
    assert task_id

    # Load end-entity certificate from PEM encoded file
    end_entity_cert = x509.load_pem_x509_certificate(
        task_exec(task_id, 'cat server.crt').encode('ascii'), DEFAULT_BACKEND)

    san_extension = end_entity_cert.extensions.get_extension_for_oid(
        ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
    # NOTE(review): reaches into cryptography's private `_general_names` attribute;
    # may break on a library upgrade — confirm against the installed version.
    sans = [
        san.value for san in san_extension.value._general_names._general_names]

    # Original discovery prefix should appear among the certificate SANs.
    expected_san = (
        '{name}-0.{service_name}.autoip.dcos.thisdcos.directory'.format(
            name=DISCOVERY_TASK_PREFIX,
            service_name=config.SERVICE_NAME)
    )
    assert expected_san in sans

    # Run task update with new discovery prefix
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['DISCOVERY_TASK_PREFIX'] = DISCOVERY_TASK_PREFIX + '-new'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'discovery', original_tasks)
    sdk_tasks.check_running(config.SERVICE_NAME, 4)
    new_task_id = sdk_tasks.get_task_ids(config.SERVICE_NAME, "discovery")[0]
    assert task_id != new_task_id

    # Reload the freshly issued certificate from the replaced task.
    new_cert = x509.load_pem_x509_certificate(
        task_exec(new_task_id, 'cat server.crt').encode('ascii'), DEFAULT_BACKEND)

    san_extension = new_cert.extensions.get_extension_for_oid(
        ExtensionOID.SUBJECT_ALTERNATIVE_NAME)
    sans = [
        san.value for san in san_extension.value._general_names._general_names]

    # The SANs must now reflect the '-new' discovery prefix.
    expected_san = (
        '{name}-0.{service_name}.autoip.dcos.thisdcos.directory'.format(
            name=DISCOVERY_TASK_PREFIX + '-new',
            service_name=config.SERVICE_NAME)
    )
    assert expected_san in sans
def test_port_dynamic_to_dynamic_port():
    """Bump BROKER_CPUS and verify all brokers are relaunched and return to RUNNING."""
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    pod_prefix = '{}-'.format(DEFAULT_POD_TYPE)
    old_ids = tasks.get_task_ids(SERVICE_NAME, pod_prefix)

    app = marathon.get_config(SERVICE_NAME)
    cpus = int(app['env']['BROKER_CPUS'])
    app['env']['BROKER_CPUS'] = str(cpus + 0.1)
    marathon.update_app(SERVICE_NAME, app)

    tasks.check_tasks_updated(SERVICE_NAME, pod_prefix, old_ids)
    # all tasks are running
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)
def test_bump_node_counts():
    """Increase data/ingest/coordinator node counts by one each and wait for the new tasks."""
    # Run this test last, as it changes the task count
    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    for env_key in ('DATA_NODE_COUNT', 'INGEST_NODE_COUNT', 'COORDINATOR_NODE_COUNT'):
        marathon_config['env'][env_key] = str(int(marathon_config['env'][env_key]) + 1)
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)
    sdk_tasks.check_running(FOLDERED_SERVICE_NAME, config.DEFAULT_TASK_COUNT + 3)
def test_placement_max_one_per_hostname():
    """Install with a hostname:MAX_PER:1 constraint and verify the deploy plan completes."""
    install.install(
        PACKAGE_NAME,
        DEFAULT_BROKER_COUNT,
        service_name=SERVICE_NAME,
        additional_options={'service': {'placement_constraint': 'hostname:MAX_PER:1'}}
    )
    # double check
    tasks.check_running(SERVICE_NAME, DEFAULT_BROKER_COUNT)

    deploy_plan = service_cli('plan show {}'.format(DEFAULT_PLAN_NAME))
    assert deploy_plan['status'] == 'COMPLETE'

    install.uninstall(SERVICE_NAME, PACKAGE_NAME)
def pre_test_setup():
    """Ensure the expected tasks are running and the expected nodes exist before a test."""
    expected = config.DEFAULT_TASK_COUNT
    sdk_tasks.check_running(config.SERVICE_NAME, expected)
    config.wait_for_expected_nodes_to_exist(task_count=expected)
def setup_function(function):
    """pytest per-test hook: verify the expected task count before each test runs."""
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT)
def check_running(service_name=PACKAGE_NAME):
    """Verify the service is running its configured number of tasks."""
    expected = configured_task_count(service_name)
    tasks.check_running(service_name, expected)
def check_running(count=DEFAULT_TASK_COUNT):
    """Verify the package is running `count` tasks (defaults to DEFAULT_TASK_COUNT)."""
    tasks.check_running(PACKAGE_NAME, count)
def test_increase_cpu():
    """Bump HELLO_CPUS and walk the resulting canary deploy through both hello steps.

    The deploy plan is expected to pause (WAITING) until each manual 'plan continue',
    relaunching hello-0 and then hello-1 while the world phase stays COMPLETE.
    """
    hello_0_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-0-server')
    sdk_marathon.bump_cpu_count_config(config.SERVICE_NAME, 'HELLO_CPUS')

    # The config change should leave the plan WAITING on a manual continue.
    pl = sdk_plan.wait_for_plan_status(config.SERVICE_NAME, 'deploy', 'WAITING')
    log.info(pl)

    assert pl['status'] == 'WAITING'
    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 5
    assert steps[0]['status'] == 'WAITING'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'
    assert steps[4]['status'] == 'PENDING'

    phase = pl['phases'][1]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'

    # check that all prior tasks are still running, no changes yet
    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3', 'hello-4',
        'world-0', 'world-1', 'world-2', 'world-3']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == expected_tasks
    assert hello_0_ids == sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-0-server')

    # First continue: hello-0 should be relaunched with the new CPU setting.
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy hello-deploy')

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello-0-server', hello_0_ids)
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))

    pl = sdk_plan.wait_for_step_status(
        config.SERVICE_NAME, 'deploy', 'hello-deploy', 'hello-0:[server]', 'COMPLETE')
    log.info(pl)

    assert pl['status'] == 'WAITING'
    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 5
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'
    assert steps[4]['status'] == 'PENDING'

    phase = pl['phases'][1]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'

    # Second continue: hello-1 relaunches, after which the whole plan completes.
    hello_1_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, 'hello-1-server')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy hello-deploy')
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello-1-server', hello_1_ids)

    pl = sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    log.info(pl)

    assert pl['status'] == 'COMPLETE'
    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 5
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'
    assert steps[4]['status'] == 'COMPLETE'

    phase = pl['phases'][1]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'
def test_secrets_update():
    """Verify that restarted pods pick up updated DC/OS secret values.

    Flow:
    1) create Secrets
    2) install examples/secrets.yml
    3) update Secrets
    4) restart task
    5) verify Secrets content (updated after restart)
    6) delete Secrets
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD,
                        additional_options=secret_options)

    # tasks will fail if secret file is not created
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # Overwrite all three secrets with the alternative content.
    sdk_cmd.run_cli("security secrets update --value={} {}/secret1".format(
        secret_content_alternative, config.SERVICE_NAME))
    sdk_cmd.run_cli("security secrets update --value={} {}/secret2".format(
        secret_content_alternative, config.SERVICE_NAME))
    sdk_cmd.run_cli("security secrets update --value={} {}/secret3".format(
        secret_content_alternative, config.SERVICE_NAME))

    # Verify with hello-0 and world-0, just check with one of the pods
    hello_tasks_old = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    world_tasks_old = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0-server")

    # restart pods to retrieve new secret's content
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart hello-0')
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod restart world-0')

    # wait pod restart to complete
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-server", hello_tasks_old)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'world-0-server', world_tasks_old)

    # wait till it is running
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # make sure content is changed
    assert secret_content_alternative == read_secret(
        "world-0-server", "bash -c 'echo $WORLD_SECRET1_ENV'")
    assert secret_content_alternative == read_secret("world-0-server", "cat WORLD_SECRET2_FILE")
    assert secret_content_alternative == read_secret(
        "world-0-server", "cat {}/secret3".format(config.SERVICE_NAME))

    # make sure content is changed
    assert secret_content_alternative == read_secret(
        "hello-0-server", "bash -c 'echo $HELLO_SECRET1_ENV'")
    assert secret_content_alternative == read_secret("hello-0-server", "cat HELLO_SECRET1_FILE")
    assert secret_content_alternative == read_secret("hello-0-server", "cat HELLO_SECRET2_FILE")

    # clean up and delete secrets
    delete_secrets("{}/".format(config.SERVICE_NAME))
def check_healthy(count=DEFAULT_TASK_COUNT):
    """Wait for deploy and recovery plans to complete, then verify the task count."""
    # Generous ceiling for slow CI clusters; both waits share the same budget.
    timeout = 25 * 60
    sdk_plan.wait_for_completed_deployment(PACKAGE_NAME, timeout_seconds=timeout)
    sdk_plan.wait_for_completed_recovery(PACKAGE_NAME, timeout_seconds=timeout)
    sdk_tasks.check_running(PACKAGE_NAME, count)
def pre_test_setup():
    """Ensure the foldered service is fully up before a test runs."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_tasks.check_running(service, current_expected_task_count)
    # All expected nodes must also be registered, not merely running as tasks.
    config.wait_for_expected_nodes_to_exist(
        service_name=service, task_count=current_expected_task_count)
def check_healthy(count=DEFAULT_TASK_COUNT):
    """Verify both service plans have completed and the task count matches."""
    for plan_name in ("deploy", "recovery"):
        service_plan_complete(plan_name)
    tasks.check_running(PACKAGE_NAME, count)
def check_running(service_name=SERVICE_NAME):
    """Wait until the service runs as many tasks as its configuration declares."""
    expected_count = configured_task_count(service_name)
    sdk_tasks.check_running(service_name, expected_count)
def test_structured_streaming_recovery(kerberized_spark, kerberized_kafka):
    """Structured Streaming job recovers from a driver kill via HDFS checkpointing.

    Starts a supervised streaming job consuming a Kerberized Kafka topic, kills
    the driver process on its agent, then verifies that the restarted driver
    resumes from the checkpoint and consumes both the pre- and post-kill data.
    """
    kafka_brokers = ','.join(
        sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME, KAFKA_SERVICE_NAME, 'endpoints broker',
                        json=True)['dns'])
    LOGGER.info("Kafka brokers: {}".format(kafka_brokers))

    _uri = upload_jaas()
    uris = "spark.mesos.uris={}".format(_uri)

    jar_uri = utils.upload_dcos_test_jar()

    kafka_kerberos_args = get_kerberized_kafka_spark_conf(utils.SPARK_SERVICE_NAME)
    LOGGER.info("Spark Kerberos configuration for Kafka:\n{}".format(
        '\n'.join(kafka_kerberos_args)))

    # NOTE: kerberos args are folded into common_args once, here.
    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ] + kafka_kerberos_args

    # configuring streaming job and HDFS folders
    setup_hdfs_paths()

    # running kafka producer
    message_set_a = ["abc"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args, message_set_a)

    spark_submit_args = [
        "--supervise",
        "--class", "StructuredStreamingWithCheckpointing",
        "--conf", "spark.cores.max=2",
        "--conf", "spark.executor.cores=1",
        "--conf", "spark.sql.shuffle.partitions=2",
        "--conf", "spark.executor.memory=2g"
    ] + common_args

    application_args = "{} {} {} {}".format(
        kafka_brokers, KAFKA_TEST_TOPIC, HDFS_CHECKPOINT_DIR, SPARK_SECURITY_PROTOCOL)

    driver_task_id = utils.submit_job(app_url=jar_uri,
                                      app_args=application_args,
                                      service_name=utils.SPARK_SERVICE_NAME,
                                      args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + spark_submit_args))

    # Wait until executor is running
    LOGGER.info("Starting supervised driver {}".format(driver_task_id))
    sdk_tasks.check_running(SPARK_APPLICATION_NAME, expected_task_count=1, timeout_seconds=600)

    # validating Structured Streaming topic consumption
    expected_output_a = "{}| {}".format(message_set_a[0], len(message_set_a))
    LOGGER.info(
        "Validating Structured Streaming topic consumption, waiting for output {}"
        .format(expected_output_a))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)

    # killing the driver
    service_info = shakedown.get_service(SPARK_APPLICATION_NAME).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])
    sdk_cmd.kill_task_with_pattern(agent_host=service_info['hostname'], pattern=driver_regex)

    # sending more data to Kafka
    # FIX: common_args already contains kafka_kerberos_args (see above); the
    # previous code appended them a second time, duplicating the conf flags.
    message_set_b = ["def"] * 100
    feed_sample_data(jar_uri, kafka_brokers, KAFKA_TEST_TOPIC, common_args, message_set_b)

    # checkpointing validation
    sdk_tasks.check_running(SPARK_APPLICATION_NAME, expected_task_count=1, timeout_seconds=600)
    LOGGER.info("Streaming job has re-started")

    # validating Structured Streaming resumed topic consumption
    expected_output_b = "{}| {}".format(message_set_b[0], len(message_set_b))
    LOGGER.info(
        "Validating that consumption resumed from checkpoint, waiting for output '{}' and '{}'"
        .format(expected_output_a, expected_output_b))
    utils.wait_for_running_job_output(driver_task_id, expected_output_a)
    utils.wait_for_running_job_output(driver_task_id, expected_output_b)
def setup_function(function):
    """Per-test pytest hook: require a healthy cluster before each test runs."""
    # Fail fast if a prior test left the service degraded.
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT)
    wait_for_expected_nodes_to_exist()
def test_unchanged_scheduler_restarts_without_restarting_tasks():
    """Killing the scheduler must not cause master tasks to be relaunched."""
    ids_before = tasks.get_task_ids(PACKAGE_NAME, "master")

    # Kill the scheduler JVM; Marathon restarts it with identical config.
    shakedown.kill_process_on_host(get_marathon_host(), "scheduler.Main")
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT)

    ids_after = tasks.get_task_ids(PACKAGE_NAME, "master")
    assert ids_before == ids_after
def pre_test_setup():
    """Ensure the service is fully up before a test runs."""
    expected = current_expected_task_count
    sdk_tasks.check_running(foldered_name, expected)
    # Nodes must also be registered in the cluster, not only running as tasks.
    config.wait_for_expected_nodes_to_exist(
        service_name=foldered_name, task_count=expected)
def update_app(service_name, options, expected_task_count):
    """Merge env-var options into the service's Marathon app and wait for redeploy.

    Args:
        service_name: service whose Marathon app config is updated.
        options: dict of environment variables merged into the app's 'env'.
        expected_task_count: task count to wait for after the deployment completes.
    """
    # Renamed from 'config' to avoid shadowing the 'config' module used
    # elsewhere in this file.
    marathon_config = sdk_marathon.get_config(service_name)
    marathon_config['env'].update(options)
    sdk_marathon.update_app(service_name, marathon_config)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_tasks.check_running(service_name, expected_task_count)
def test_increase_count():
    """Bump HELLO_COUNT and verify the new pod only deploys after 'plan continue'.

    The canary plan must pause in WAITING without launching hello-4; once the
    operator continues the hello-deploy phase, hello-4 launches and the plan
    completes.
    """
    sdk_marathon.bump_task_count_config(config.SERVICE_NAME, 'HELLO_COUNT')

    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3',
        'world-0', 'world-1', 'world-2', 'world-3']
    try:
        # The new task must NOT appear yet; this wait is expected to time out.
        sdk_tasks.check_running(
            config.SERVICE_NAME, len(expected_tasks) + 1, timeout_seconds=60)
        assert False, "Should not start task now"
    except AssertionError:
        # Propagate our own failure above.
        raise
    except Exception:
        # FIX: was a bare 'except:', which also swallowed KeyboardInterrupt and
        # SystemExit. Only the check_running timeout is expected here.
        pass  # expected to fail
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == expected_tasks

    # Plan pauses on the new (fifth) hello step.
    pl = sdk_plan.wait_for_plan_status(config.SERVICE_NAME, 'deploy', 'WAITING')
    log.info(pl)
    assert pl['status'] == 'WAITING'

    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 5
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'
    assert steps[4]['status'] == 'WAITING'

    phase = pl['phases'][1]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'

    # Release the paused step: hello-4 should now launch.
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy hello-deploy')

    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3', 'hello-4',
        'world-0', 'world-1', 'world-2', 'world-3']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == expected_tasks

    pl = sdk_plan.wait_for_plan_status(config.SERVICE_NAME, 'deploy', 'COMPLETE')
    log.info(pl)
    assert pl['status'] == 'COMPLETE'

    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 5
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'
    assert steps[4]['status'] == 'COMPLETE'

    phase = pl['phases'][1]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'
def check_healthy(count=DEFAULT_TASK_COUNT):
    """Wait for deploy and recovery to finish on the foldered service, then verify count."""
    timeout = 20 * 60  # shared budget for both plan waits
    plan.wait_for_completed_deployment(FOLDERED_SERVICE_NAME, timeout_seconds=timeout)
    plan.wait_for_completed_recovery(FOLDERED_SERVICE_NAME, timeout_seconds=timeout)
    tasks.check_running(FOLDERED_SERVICE_NAME, count)
def test_pipeline(kerberos_flag, stop_count, jar_uri, keytab_secret,
                  spark_service_name, jaas_uri=None):
    """Run a Kafka producer/consumer Spark pipeline, optionally Kerberized.

    Feeds a text file through a Kafka topic via a KafkaFeeder job, then runs a
    KafkaConsumer job and verifies it reads `stop_count` words.

    Args:
        kerberos_flag: "true" to enable Kerberos, anything else disables it.
        stop_count: number of words the consumer must read before stopping.
        jar_uri: URI of the test application jar.
        keytab_secret: name of the DC/OS secret holding the Kafka keytab.
        spark_service_name: Spark dispatcher service to submit against.
        jaas_uri: optional pre-uploaded JAAS config URI; when None and
            Kerberized, a default JAAS file is uploaded to S3.
    """
    stop_count = str(stop_count)
    # FIX: was 'True if kerberos_flag == "true" else False' (redundant ternary).
    kerberized = kerberos_flag == "true"
    broker_dns = sdk_cmd.svc_cli(KAFKA_PACKAGE_NAME, KAFKA_SERVICE_NAME,
                                 'endpoints broker', json=True)['dns'][0]
    topic = "top1"

    big_file, big_file_url = "file:///mnt/mesos/sandbox/big.txt", "http://norvig.com/big.txt"

    # arguments to the application
    producer_args = " ".join([broker_dns, big_file, topic, kerberos_flag])

    uris = "spark.mesos.uris={}".format(big_file_url)

    if kerberized and jaas_uri is None:
        jaas_path = os.path.join(THIS_DIR, "resources", "spark-kafka-client-jaas.conf")
        s3.upload_file(jaas_path)
        jaas_uri = s3.http_url("spark-kafka-client-jaas.conf")
    # FIX: previously the else-branch unconditionally appended jaas_uri, which
    # produced a bogus ",None" URI when not Kerberized and no jaas_uri given.
    if jaas_uri is not None:
        uris += ",{}".format(jaas_uri)

    common_args = [
        "--conf", "spark.mesos.containerizer=mesos",
        "--conf", "spark.scheduler.maxRegisteredResourcesWaitingTime=2400s",
        "--conf", "spark.scheduler.minRegisteredResourcesRatio=1.0",
        "--conf", uris
    ]

    kerberos_args = [
        "--conf", "spark.mesos.driver.secret.names={}".format(keytab_secret),
        "--conf", "spark.mesos.driver.secret.filenames=kafka-client.keytab",
        "--conf", "spark.mesos.executor.secret.names={}".format(keytab_secret),
        "--conf", "spark.mesos.executor.secret.filenames=kafka-client.keytab",
        "--conf", "spark.mesos.task.labels=DCOS_SPACE:/{}".format(spark_service_name),
        "--conf", "spark.executorEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf", "spark.mesos.driverEnv.KRB5_CONFIG_BASE64={}".format(KAFKA_KRB5),
        "--conf", "spark.driver.extraJavaOptions=-Djava.security.auth.login.config="
                  "/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
        "--conf", "spark.executor.extraJavaOptions="
                  "-Djava.security.auth.login.config=/mnt/mesos/sandbox/spark-kafka-client-jaas.conf",
    ]

    producer_config = [
        "--conf", "spark.cores.max=2",
        "--conf", "spark.executor.cores=2",
        "--class", "KafkaFeeder"
    ] + common_args

    if kerberized:
        producer_config += kerberos_args

    producer_id = utils.submit_job(app_url=jar_uri,
                                   app_args=producer_args,
                                   service_name=spark_service_name,
                                   args=producer_config)

    # Producer must be feeding the Kafka service before starting the consumer.
    sdk_tasks.check_running(KAFKA_SERVICE_NAME, 1, timeout_seconds=600)

    consumer_config = [
        "--conf", "spark.cores.max=4",
        "--class", "KafkaConsumer"
    ] + common_args

    if kerberized:
        consumer_config += kerberos_args

    consumer_args = " ".join([broker_dns, topic, stop_count, kerberos_flag])

    utils.run_tests(app_url=jar_uri,
                    app_args=consumer_args,
                    expected_output="Read {} words".format(stop_count),
                    service_name=spark_service_name,
                    args=consumer_config)

    utils.kill_driver(producer_id, spark_service_name)
def test_profile_mount_volumes():
    """All hello pods must reach RUNNING when using profile mount volumes."""
    expected_count = NUM_HELLO
    sdk_tasks.check_running(config.SERVICE_NAME, expected_count)
def test_secrets_config_update():
    """Verify that a Marathon config update switches pods to new secret values.

    Flow:
    1) install examples/secrets.yml
    2) create new Secrets, delete old Secrets
    2) update configuration with new Secrets
    4) verify secret content (using new Secrets after config update)
    """
    install.uninstall(PACKAGE_NAME)

    create_secrets("{}/".format(PACKAGE_NAME))

    install.install(PACKAGE_NAME, NUM_HELLO + NUM_WORLD, additional_options=secret_options)

    # launch will fail if secrets are not available or not accessible
    plan.wait_for_completed_deployment(PACKAGE_NAME)

    # tasks will fail if secret file is not created
    tasks.check_running(PACKAGE_NAME, NUM_HELLO + NUM_WORLD)

    # Verify secret content, one from each pod type

    # get tasks ids - only first pods
    hello_tasks = tasks.get_task_ids(PACKAGE_NAME, "hello-0")
    world_tasks = tasks.get_task_ids(PACKAGE_NAME, "world-0")

    # make sure it has the default value
    assert secret_content_default == task_exec(
        world_tasks[0], "bash -c 'echo $WORLD_SECRET1_ENV'")
    assert secret_content_default == task_exec(world_tasks[0], "cat WORLD_SECRET2_FILE")
    assert secret_content_default == task_exec(
        world_tasks[0], "cat {}/secret3".format(PACKAGE_NAME))

    # hello tasks has container image
    assert secret_content_default == task_exec(
        hello_tasks[0], "bash -c 'echo $HELLO_SECRET1_ENV'")
    assert secret_content_default == task_exec(hello_tasks[0], "cat HELLO_SECRET1_FILE")
    assert secret_content_default == task_exec(hello_tasks[0], "cat HELLO_SECRET2_FILE")

    # clean up and delete secrets (defaults)
    delete_secrets("{}/".format(PACKAGE_NAME))

    # create new secrets with new content -- New Value
    create_secrets(secret_content_arg=secret_content_alternative)

    # NOTE: local 'config' here is a Marathon app config dict, distinct from
    # the secret paths created above (these point at top-level secret names).
    config = marathon.get_config(PACKAGE_NAME)
    config['env']['HELLO_SECRET1'] = 'secret1'
    config['env']['HELLO_SECRET2'] = 'secret2'
    config['env']['WORLD_SECRET1'] = 'secret1'
    config['env']['WORLD_SECRET2'] = 'secret2'
    config['env']['WORLD_SECRET3'] = 'secret3'

    # config update
    marathon.update_app(PACKAGE_NAME, config)

    # wait till plan is complete - pods are supposed to restart
    plan.wait_for_completed_deployment(PACKAGE_NAME)

    # all tasks are running
    tasks.check_running(PACKAGE_NAME, NUM_HELLO + NUM_WORLD)

    # Verify secret content is changed

    # get task ids - only first pod
    hello_tasks = tasks.get_task_ids(PACKAGE_NAME, "hello-0")
    world_tasks = tasks.get_task_ids(PACKAGE_NAME, "world-0")

    assert secret_content_alternative == task_exec(
        world_tasks[0], "bash -c 'echo $WORLD_SECRET1_ENV'")
    assert secret_content_alternative == task_exec(world_tasks[0], "cat WORLD_SECRET2_FILE")
    assert secret_content_alternative == task_exec(world_tasks[0], "cat secret3")

    assert secret_content_alternative == task_exec(
        hello_tasks[0], "bash -c 'echo $HELLO_SECRET1_ENV'")
    assert secret_content_alternative == task_exec(hello_tasks[0], "cat HELLO_SECRET1_FILE")
    assert secret_content_alternative == task_exec(hello_tasks[0], "cat HELLO_SECRET2_FILE")

    # clean up and delete secrets
    delete_secrets()
def test_secrets_config_update():
    """Verify that a Marathon config update switches pods to new secret values.

    Flow:
    1) install examples/secrets.yml
    2) create new Secrets, delete old Secrets
    2) update configuration with new Secrets
    4) verify secret content (using new Secrets after config update)
    """
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD,
                        additional_options=secret_options)

    # tasks will fail if secret file is not created
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # Verify secret content, one from each pod type

    # make sure it has the default value
    assert secret_content_default == read_secret(
        "world-0-server", "bash -c 'echo $WORLD_SECRET1_ENV'")
    assert secret_content_default == read_secret("world-0-server", "cat WORLD_SECRET2_FILE")
    assert secret_content_default == read_secret(
        "world-0-server", "cat {}/secret3".format(config.SERVICE_NAME))

    # hello tasks has container image
    assert secret_content_default == read_secret(
        "hello-0-server", "bash -c 'echo $HELLO_SECRET1_ENV'")
    assert secret_content_default == read_secret("hello-0-server", "cat HELLO_SECRET1_FILE")
    assert secret_content_default == read_secret("hello-0-server", "cat HELLO_SECRET2_FILE")

    # clean up and delete secrets (defaults)
    delete_secrets("{}/".format(config.SERVICE_NAME))

    # create new secrets with new content -- New Value
    create_secrets(secret_content_arg=secret_content_alternative)

    # Point the service's env vars at the newly-created top-level secret names.
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    marathon_config['env']['HELLO_SECRET1'] = 'secret1'
    marathon_config['env']['HELLO_SECRET2'] = 'secret2'
    marathon_config['env']['WORLD_SECRET1'] = 'secret1'
    marathon_config['env']['WORLD_SECRET2'] = 'secret2'
    marathon_config['env']['WORLD_SECRET3'] = 'secret3'

    # config update
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    # wait till plan is complete - pods are supposed to restart
    sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

    # all tasks are running
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # Verify secret content is changed

    assert secret_content_alternative == read_secret(
        "world-0-server", "bash -c 'echo $WORLD_SECRET1_ENV'")
    assert secret_content_alternative == read_secret("world-0-server", "cat WORLD_SECRET2_FILE")
    assert secret_content_alternative == read_secret("world-0-server", "cat secret3")

    assert secret_content_alternative == read_secret(
        "hello-0-server", "bash -c 'echo $HELLO_SECRET1_ENV'")
    assert secret_content_alternative == read_secret("hello-0-server", "cat HELLO_SECRET1_FILE")
    assert secret_content_alternative == read_secret("hello-0-server", "cat HELLO_SECRET2_FILE")

    # clean up and delete secrets
    delete_secrets()
def test_soak_secrets_framework_alive():
    """Soak check: framework deploy plan is complete and all pods are running."""
    total_tasks = NUM_HELLO + NUM_WORLD
    sdk_plan.wait_for_completed_deployment(FRAMEWORK_NAME)
    sdk_tasks.check_running(FRAMEWORK_NAME, total_tasks)
def _set_xpack(service_name, is_enabled):
    """Toggle X-Pack via the service's Marathon env and wait for the redeploy."""
    app_config = marathon.get_config(service_name)
    app_config['env']['TASKCFG_ALL_XPACK_ENABLED'] = is_enabled
    marathon.update_app(service_name, app_config)

    # The env change triggers a rolling restart; wait for it to settle.
    sdk_plan.wait_for_completed_deployment(service_name)
    tasks.check_running(service_name, DEFAULT_TASK_COUNT)
def test_service_startup_rapid():
    """Restart a broker pod and assert the Kafka server starts within budget.

    Measures the interval between the 'starting' and 'started' KafkaServer log
    lines of the restarted broker and requires it to be at most
    EXPECTED_KAFKA_STARTUP_SECONDS.
    """
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = "kafka-0"
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # the following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as follows:
    # '...leader not available...'
    stdout = ""
    retries = 15
    while retries > 0:
        retries -= 1
        _, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                                       "topic producer_test test 100")
        if "records sent" in stdout:
            break

    _, jsonobj, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        "pod restart {}".format(task_short_name),
        parse_json=True,
    )
    assert len(jsonobj) == 2
    assert jsonobj["pod"] == task_short_name
    assert jsonobj["tasks"] == ["{}-broker".format(task_short_name)]

    # Used as the start-of-restart timestamp if the 'starting' line is missed.
    starting_fallback_time = datetime.datetime.now()

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME,
                                  "{}-".format(config.DEFAULT_POD_TYPE),
                                  [broker_task_id_0])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)
    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # extract starting and started lines from log
    starting_time = started_time = None
    retry_seconds_remaining = max_restart_seconds + startup_padding_seconds
    while retry_seconds_remaining > 0.0 and (starting_time is None or started_time is None):
        stdout = sdk_cmd.run_cli(
            "task log --lines=1000 {}".format(broker_task_id_1))
        task_lines = stdout.split("\n")
        for log_line in reversed(task_lines):
            if starting_time is None and " starting (kafka.server.KafkaServer)" in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and " started (kafka.server.KafkaServer)" in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            # FIX: the remaining budget was never decremented, so this loop
            # could poll forever if the log lines never appeared.
            retry_seconds_remaining -= retry_delay_seconds
            time.sleep(retry_delay_seconds)

    if started_time is None or starting_time is None:
        # Save the last log fetch for post-mortem debugging.
        # FIX: use a context manager instead of open/write/close.
        with open("/tmp/kafka_startup_stdout", "w") as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time - starting_time).total_seconds() <= max_restart_seconds
def pre_test_setup():
    """Ensure the foldered service is healthy before each test."""
    service = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_tasks.check_running(service, config.DEFAULT_TASK_COUNT)
    # Nodes must be registered with the cluster, not merely launched.
    config.wait_for_expected_nodes_to_exist(service_name=service)
def pre_test_setup() -> None:
    """Ensure the service is healthy and all data nodes exist before each test."""
    sdk_tasks.check_running(service_name, current_expected_task_count)
    # Only node tasks count toward cluster membership; exclude non-node tasks.
    node_count = current_expected_task_count - current_non_node_task_count
    config.wait_for_expected_nodes_to_exist(
        service_name=service_name,
        task_count=node_count,
    )
def test_static_port_comes_online(static_port_config):
    """All brokers must come up when configured with a static port."""
    expected_brokers = DEFAULT_BROKER_COUNT
    tasks.check_running(SERVICE_NAME, expected_brokers)
def xpack(is_enabled):
    """Toggle X-Pack via the Marathon env and wait for the rolling redeploy."""
    app_config = marathon.get_config(PACKAGE_NAME)
    app_config['env']['TASKCFG_ALL_XPACK_ENABLED'] = is_enabled
    marathon.update_app(PACKAGE_NAME, app_config)

    # Env change triggers a restart of every task; wait for it to settle.
    sdk_plan.wait_for_completed_deployment(PACKAGE_NAME)
    tasks.check_running(PACKAGE_NAME, DEFAULT_TASK_COUNT)
def test_service_health():
    """Service reports healthy with the expected no-ingest task set running."""
    expected = config.NO_INGEST_TASK_COUNT
    sdk_tasks.check_running(config.SERVICE_NAME, expected)
    config.wait_for_expected_nodes_to_exist(task_count=expected)
    assert shakedown.service_healthy(config.SERVICE_NAME)
def test_supervise(kerberized_spark, hdfs_with_kerberos):
    """A supervised streaming driver is relaunched after its process is killed.

    Submits RecoverableNetworkWordCount with --supervise, kills the driver's
    OS process on its agent, and verifies the framework re-registers and runs
    again before finally tearing the job down.
    """
    job_service_name = "RecoverableNetworkWordCount"

    @retrying.retry(wait_fixed=1000,
                    stop_max_delay=600 * 1000,
                    retry_on_result=lambda res: not res)
    def wait_job_present(present):
        # Poll until the job framework is (or is no longer) registered.
        svc = shakedown.get_service(job_service_name)
        if present:
            return svc is not None
        else:
            return svc is None

    job_args = [
        "--supervise",
        "--class", "org.apache.spark.examples.streaming.RecoverableNetworkWordCount",
        "--conf", "spark.cores.max=8",
        "--conf", "spark.executors.cores=4"
    ]

    data_dir = "hdfs://{}".format(HDFS_DATA_DIR)
    driver_id = utils.submit_job(
        app_url=utils.SPARK_EXAMPLES,
        app_args="10.0.0.1 9090 {dir}/netcheck {dir}/outfile".format(dir=data_dir),
        service_name=utils.SPARK_SERVICE_NAME,
        args=(SPARK_SUBMIT_HDFS_KERBEROS_ARGS + job_args))
    log.info("Started supervised driver {}".format(driver_id))

    wait_job_present(True)
    log.info("Job has registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has running executors")

    # Find the driver's OS process on its agent and kill it.
    service_info = shakedown.get_service(job_service_name).dict()
    driver_regex = "spark.mesos.driver.frameworkId={}".format(service_info['id'])
    status, stdout = shakedown.run_command_on_agent(
        service_info['hostname'],
        "ps aux | grep -v grep | grep '{}'".format(driver_regex),
        username=sdk_cmd.LINUX_USER)
    pids = [p.strip().split()[1] for p in stdout.splitlines()]
    for pid in pids:
        status, stdout = shakedown.run_command_on_agent(
            service_info['hostname'],
            "sudo kill -9 {}".format(pid),
            username=sdk_cmd.LINUX_USER)
        if status:
            print("Killed pid: {}".format(pid))
        else:
            # FIX: message previously read "Unable to killed pid".
            print("Unable to kill pid: {}".format(pid))

    # Supervision should relaunch the driver automatically.
    wait_job_present(True)
    log.info("Job has re-registered")
    sdk_tasks.check_running(job_service_name, 1)
    log.info("Job has re-started")

    out = utils.kill_driver(driver_id, utils.SPARK_SERVICE_NAME)
    log.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"
    wait_job_present(False)