def test_master_node_replace() -> None:
    # Ideally, the pod will get placed on a different agent. This test will verify that the
    # remaining two masters find the replaced master at its new IP address. This requires a
    # reasonably low TTL for Java DNS lookups.
    sdk_cmd.svc_cli(package_name, service_name, "pod replace master-0")
    sdk_plan.wait_for_in_progress_recovery(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)
def client_can_read_and_write(test_id: str,
                              kafka_client: dict,
                              kafka_server: dict,
                              endpoint_name: str,
                              krb5: object = None) -> tuple:
    client_id = kafka_client["id"]

    brokers_list = service_get_brokers(kafka_server, endpoint_name)
    # Materialize the hosts as a list (not a one-shot map object) so they can be
    # both resolved and logged:
    broker_hosts = [b.split(":")[0] for b in brokers_list]
    brokers = ",".join(brokers_list)

    if not sdk_cmd.resolve_hosts(kafka_client["id"], broker_hosts):
        log.error("Failed to resolve brokers: %s", broker_hosts)
        return False, []

    topic_name = kafka_client["env"]["KAFKA_TOPIC"]
    sdk_cmd.svc_cli(kafka_server["package_name"],
                    kafka_server["service"]["name"],
                    "topic create {}".format(topic_name),
                    json=True)
    test_utils.wait_for_topic(kafka_server["package_name"],
                              kafka_server["service"]["name"],
                              topic_name)

    message = str(uuid.uuid4())
    security_options = {"is-tls": endpoint_name == "broker-tls", "kerberos": krb5}

    write_success = write_to_topic(test_id, client_id, topic_name, message, brokers, security_options)
    if write_success:
        MESSAGES.append(message)

    read_messages = read_from_topic(test_id, client_id, topic_name, len(MESSAGES), brokers, security_options)
    read_success = [m in read_messages for m in MESSAGES]

    return write_success, read_success
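# A minimal usage sketch for client_can_read_and_write (not part of the original
# suite): it assumes a `kafka_client` fixture shaped like the dict above, a deployed
# `kafka_server`, and that "broker" is the plaintext endpoint name. The test id
# "plaintext" is an illustrative value only.
def test_client_can_read_and_write_plaintext(kafka_client: dict, kafka_server: dict):
    write_success, read_successes = client_can_read_and_write(
        "plaintext", kafka_client, kafka_server, "broker")
    assert write_success, "Write failed"
    assert all(read_successes), "Expected all previously-written messages to be read back"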
def test_zones_referenced_in_placement_constraints():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
    sdk_install.install(
        config.PACKAGE_NAME,
        foldered_name,
        config.DEFAULT_BROKER_COUNT,
        additional_options={
            "service": {
                "name": foldered_name,
                "placement_constraint": "[[\"@zone\", \"GROUP_BY\"]]"
            }
        })

    test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=foldered_name)

    broker_ids = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'broker list', json=True)

    for broker_id in broker_ids:
        broker_info = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, foldered_name, 'broker get {}'.format(broker_id), json=True)
        assert sdk_fault_domain.is_valid_zone(broker_info.get('rack'))

    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
def test_canary_third():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "plan continue deploy hello-deploy")

    expected_tasks = ["hello-0", "hello-1", "hello-2", "hello-3", "world-0"]
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod list")
    assert rc == 0, "Pod list failed"
    assert json.loads(stdout) == expected_tasks

    pl = sdk_plan.wait_for_completed_phase(config.SERVICE_NAME, "deploy", "hello-deploy")
    log.info(pl)

    assert pl["status"] == "WAITING"
    assert len(pl["phases"]) == 2

    phase = pl["phases"][0]
    assert phase["status"] == "COMPLETE"
    steps = phase["steps"]
    assert len(steps) == 4
    assert steps[0]["status"] == "COMPLETE"
    assert steps[1]["status"] == "COMPLETE"
    assert steps[2]["status"] == "COMPLETE"
    assert steps[3]["status"] == "COMPLETE"

    phase = pl["phases"][1]
    assert phase["status"] == "WAITING"
    steps = phase["steps"]
    assert len(steps) == 4
    assert steps[0]["status"] == "COMPLETE"
    assert steps[1]["status"] == "WAITING"
    assert steps[2]["status"] == "PENDING"
    assert steps[3]["status"] == "PENDING"
def test_topic_offsets_increase_with_writes():
    offset_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic offsets --time="-1" {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    assert len(offset_info) == config.DEFAULT_PARTITION_COUNT

    offsets = {}
    for o in offset_info:
        assert len(o) == config.DEFAULT_REPLICATION_FACTOR
        offsets.update(o)
    assert len(offsets) == config.DEFAULT_PARTITION_COUNT

    num_messages = 10
    write_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic producer_test {} {}'.format(config.DEFAULT_TOPIC_NAME, num_messages), json=True)
    assert len(write_info) == 1
    assert write_info['message'].startswith('Output: {} records sent'.format(num_messages))

    offset_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic offsets --time="-1" {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    assert len(offset_info) == config.DEFAULT_PARTITION_COUNT

    post_write_offsets = {}
    # Loop over 'o' (not 'offsets') so the pre-write dict isn't clobbered:
    for o in offset_info:
        assert len(o) == config.DEFAULT_REPLICATION_FACTOR
        post_write_offsets.update(o)

    assert offsets != post_write_offsets
def test_plan_cli():
    plan_name = "deploy"
    phase_name = "world"
    _check_json_output(foldered_name, "plan list")
    rc, _, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "plan show {}".format(plan_name))
    assert rc == 0
    _check_json_output(foldered_name, "plan show --json {}".format(plan_name))
    _check_json_output(foldered_name, "plan show {} --json".format(plan_name))

    # Trigger a restart so that the plan is in a non-complete state.
    # The 'interrupt' command will fail if the plan is already complete:
    rc, _, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "plan force-restart {}".format(plan_name)
    )
    assert rc == 0
    rc, _, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "plan interrupt {} {}".format(plan_name, phase_name)
    )
    assert rc == 0
    rc, _, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, foldered_name, "plan continue {} {}".format(plan_name, phase_name)
    )
    assert rc == 0

    # Now wait for the plan to finish before continuing to other tests:
    assert sdk_plan.wait_for_completed_plan(foldered_name, plan_name)
def test_shutdown_host():
    candidate_tasks = sdk_tasks.get_tasks_avoiding_scheduler(
        config.SERVICE_NAME, re.compile('^node-[0-9]+-server$'))
    assert len(candidate_tasks) != 0, 'Could not find a node to shut down'
    # Cassandra nodes should never share a machine:
    assert len(candidate_tasks) == len(set([task.host for task in candidate_tasks])), \
        'Expected candidate tasks to all be on different hosts: {}'.format(candidate_tasks)
    # Just pick the first one from the list:
    replace_task = candidate_tasks[0]

    replace_pod_name = replace_task.name[:-len('-server')]

    # Instead of partitioning or reconnecting, we shut down the host permanently:
    sdk_cmd.shutdown_agent(replace_task.host)

    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(replace_pod_name))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)

    # Print another dump of current cluster tasks, now that repair has started:
    sdk_tasks.get_summary()

    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_TASK_COUNT)

    # Find the new version of the task. Note that the old one may still be present/'running' as
    # Mesos might not have acknowledged the agent's death.
    new_task = [
        task for task in sdk_tasks.get_summary()
        if task.name == replace_task.name and task.id != replace_task.id][0]
    log.info('Checking that the original pod has moved to a new agent:\n'
             'old={}\nnew={}'.format(replace_task, new_task))
    assert replace_task.agent != new_task.agent
def test_hostname_unique():
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
    options = _escape_placement_for_1_9(
        {
            "service": {"yaml": "marathon_constraint"},
            "hello": {"count": get_num_private_agents(), "placement": '[["hostname", "UNIQUE"]]'},
            "world": {"count": get_num_private_agents(), "placement": '[["hostname", "UNIQUE"]]'},
        }
    )

    sdk_install.install(
        config.PACKAGE_NAME,
        config.SERVICE_NAME,
        get_num_private_agents() * 2,
        additional_options=options,
    )

    # "hello" deploys first. One "world" task should end up placed with each "hello" task.
    # Ensure that a "hello" task can still be placed with a "world" task:
    old_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0")
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", old_ids)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    sdk_tasks.check_running(
        config.SERVICE_NAME, get_num_private_agents() * 2 - 1, timeout_seconds=10
    )
    sdk_tasks.check_running(config.SERVICE_NAME, get_num_private_agents() * 2)
    ensure_count_per_agent(hello_count=1, world_count=1)
def test_secrets_basic():
    # 1) create secrets
    # 2) install examples/secrets.yml
    # 3) if the secret file is not created, tasks will fail
    # 4) wait until the deployment finishes
    # 5) perform a replace operation
    # 6) ensure all tasks are running
    # 7) delete secrets
    sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)

    create_secrets("{}/".format(config.SERVICE_NAME))

    sdk_install.install(config.PACKAGE_NAME, config.SERVICE_NAME, NUM_HELLO + NUM_WORLD,
                        additional_options=secret_options)

    hello_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0-server")
    world_tasks_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, "world-0-server")

    # Ensure that secrets still work after a replace:
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace world-0")

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0-server", hello_tasks_0)
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "world-0-server", world_tasks_0)

    # Tasks will fail if secret files are not created by the mesos module:
    sdk_tasks.check_running(config.SERVICE_NAME, NUM_HELLO + NUM_WORLD)

    # Clean up and delete secrets:
    delete_secrets("{}/".format(config.SERVICE_NAME))
def test_port_static_to_dynamic_port():
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE))

    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    # Setting BROKER_PORT to 0 requests a dynamically-assigned port:
    marathon_config['env']['BROKER_PORT'] = '0'
    sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)

    # All tasks are running:
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    for broker_id in range(config.DEFAULT_BROKER_COUNT):
        result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                                 'broker get {}'.format(broker_id), json=True)
        assert result['port'] != 9092

    result = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'endpoints broker', json=True)
    assert len(result['address']) == config.DEFAULT_BROKER_COUNT
    assert len(result['dns']) == config.DEFAULT_BROKER_COUNT

    for port in result['address']:
        assert int(port.split(':')[-1]) != 9092

    for port in result['dns']:
        assert int(port.split(':')[-1]) != 9092
def test_endpoints():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    # Check that we can reach the scheduler via admin router, and that returned endpoints are sanitized:
    core_site = etree.fromstring(
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'endpoints core-site.xml'))
    check_properties(core_site, {
        'ha.zookeeper.parent-znode': '/{}/hadoop-ha'.format(sdk_utils.get_zk_path(foldered_name))
    })

    hdfs_site = etree.fromstring(
        sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'endpoints hdfs-site.xml'))
    expect = {
        'dfs.namenode.shared.edits.dir': 'qjournal://{}/hdfs'.format(';'.join([
            sdk_hosts.autoip_host(foldered_name, 'journal-{}-node'.format(i), 8485)
            for i in range(3)])),
    }
    for i in range(2):
        name_node = 'name-{}-node'.format(i)
        expect['dfs.namenode.rpc-address.hdfs.{}'.format(name_node)] = sdk_hosts.autoip_host(
            foldered_name, name_node, 9001)
        expect['dfs.namenode.http-address.hdfs.{}'.format(name_node)] = sdk_hosts.autoip_host(
            foldered_name, name_node, 9002)
    check_properties(hdfs_site, expect)
def test_custom_zookeeper():
    broker_ids = sdk_tasks.get_task_ids(
        FOLDERED_SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE))

    # Create a topic against the default zk:
    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic create {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic list', json=True) == [config.DEFAULT_TOPIC_NAME]

    marathon_config = sdk_marathon.get_config(FOLDERED_SERVICE_NAME)
    # Should be using the default path when this envvar is empty/unset:
    assert marathon_config['env']['KAFKA_ZOOKEEPER_URI'] == ''

    # Use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's
    # automatically cleaned up in uninstall:
    zk_path = 'master.mesos:2181/{}/CUSTOMPATH'.format(ZK_SERVICE_PATH)
    marathon_config['env']['KAFKA_ZOOKEEPER_URI'] = zk_path
    sdk_marathon.update_app(FOLDERED_SERVICE_NAME, marathon_config)

    sdk_tasks.check_tasks_updated(
        FOLDERED_SERVICE_NAME, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)
    sdk_plan.wait_for_completed_deployment(FOLDERED_SERVICE_NAME)

    # Wait for brokers to finish registering:
    test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=FOLDERED_SERVICE_NAME)

    zookeeper = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'endpoints zookeeper')
    assert zookeeper.rstrip('\n') == zk_path

    # The topic created earlier against the default zk should no longer be present:
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'topic list', json=True) == []
def test_config_cli():
    configs = sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'config list', json=True)
    assert len(configs) >= 1  # refrain from breaking this test if earlier tests did a config update

    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
                           'config show {}'.format(configs[0]), print_output=False)  # noisy output
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'config target', json=True)
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'config target_id', json=True)
def test_canary_first():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy hello-deploy')

    expected_tasks = ['hello-0']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == expected_tasks

    # Do not use service_plan always.
    # When here, the plan should always return properly:
    pl = sdk_plan.wait_for_completed_step(
        config.SERVICE_NAME, 'deploy', 'hello-deploy', 'hello-0:[server]')
    log.info(pl)

    assert pl['status'] == 'WAITING'
    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'

    phase = pl['phases'][1]
    assert phase['status'] == 'WAITING'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'WAITING'
    assert steps[1]['status'] == 'WAITING'
    assert steps[2]['status'] == 'PENDING'
    assert steps[3]['status'] == 'PENDING'
def test_node_replace_replaces_node():
    replace_task = [
        task for task in sdk_tasks.get_summary() if task.name == 'node-2-server'][0]
    log.info('avoid host for task {}'.format(replace_task))

    replace_pod_name = replace_task.name[:-len('-server')]

    # Update the placement constraints so the new node doesn't end up on the same host:
    marathon_config = sdk_marathon.get_config(config.SERVICE_NAME)
    original_constraint = marathon_config['env']['PLACEMENT_CONSTRAINT']
    try:
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = \
            '[["hostname", "UNLIKE", "{}"]]'.format(replace_task.host)
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)

        # Start the replace and wait for it to finish:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                        'pod replace {}'.format(replace_pod_name))
        sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
        sdk_plan.wait_for_completed_recovery(
            config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
    finally:
        # Revert to the prior placement setting before proceeding with tests: avoid getting stuck.
        marathon_config['env']['PLACEMENT_CONSTRAINT'] = original_constraint
        sdk_marathon.update_app(config.SERVICE_NAME, marathon_config)

        sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
def test_canary_fourth():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'plan continue deploy world-deploy')

    expected_tasks = [
        'hello-0', 'hello-1', 'hello-2', 'hello-3',
        'world-0', 'world-1', 'world-2', 'world-3']
    sdk_tasks.check_running(config.SERVICE_NAME, len(expected_tasks))
    assert sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, 'pod list', json=True) == expected_tasks

    pl = sdk_plan.wait_for_completed_plan(config.SERVICE_NAME, 'deploy')
    log.info(pl)

    assert pl['status'] == 'COMPLETE'
    assert len(pl['phases']) == 2

    phase = pl['phases'][0]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'

    phase = pl['phases'][1]
    assert phase['status'] == 'COMPLETE'
    steps = phase['steps']
    assert len(steps) == 4
    assert steps[0]['status'] == 'COMPLETE'
    assert steps[1]['status'] == 'COMPLETE'
    assert steps[2]['status'] == 'COMPLETE'
    assert steps[3]['status'] == 'COMPLETE'
def test_node_replace_replaces_seed_node():
    pod_to_replace = 'node-0'

    # Start the replace and wait for it to finish:
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace {}'.format(pod_to_replace))
    sdk_plan.wait_for_kicked_off_recovery(config.SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=RECOVERY_TIMEOUT_SECONDS)
def test_custom_zookeeper():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    broker_ids = sdk_tasks.get_task_ids(foldered_name, '{}-'.format(config.DEFAULT_POD_TYPE))

    # Create a topic against the default zk:
    test_utils.create_topic(config.DEFAULT_TOPIC_NAME, service_name=foldered_name)

    marathon_config = sdk_marathon.get_config(foldered_name)
    # Should be using the default path when this envvar is empty/unset:
    assert marathon_config['env']['KAFKA_ZOOKEEPER_URI'] == ''

    # Use a custom zk path that's WITHIN the 'dcos-service-' path, so that it's
    # automatically cleaned up in uninstall:
    zk_path = 'master.mesos:2181/{}/CUSTOMPATH'.format(sdk_utils.get_zk_path(foldered_name))
    marathon_config['env']['KAFKA_ZOOKEEPER_URI'] = zk_path
    sdk_marathon.update_app(foldered_name, marathon_config)

    sdk_tasks.check_tasks_updated(foldered_name, '{}-'.format(config.DEFAULT_POD_TYPE), broker_ids)
    sdk_plan.wait_for_completed_deployment(foldered_name)

    # Wait for brokers to finish registering:
    test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=foldered_name)

    zookeeper = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'endpoints zookeeper')
    assert zookeeper.rstrip('\n') == zk_path

    # The topic created earlier against the default zk should no longer be present:
    topic_list_info = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'topic list', json=True)
    test_utils.assert_topic_lists_are_equal_without_automatic_topics([], topic_list_info)
def test_zones_not_referenced_in_placement_constraints():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
    sdk_install.install(
        config.PACKAGE_NAME,
        foldered_name,
        config.DEFAULT_BROKER_COUNT,
        additional_options={
            "service": {
                "name": foldered_name
            }
        })

    test_utils.broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=foldered_name)

    broker_ids = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'broker list', json=True)

    for broker_id in broker_ids:
        broker_info = sdk_cmd.svc_cli(
            config.PACKAGE_NAME, foldered_name, 'broker get {}'.format(broker_id), json=True)
        assert broker_info.get('rack') is None

    sdk_install.uninstall(config.PACKAGE_NAME, foldered_name)
def test_pod_restart():
    hello_ids = sdk_tasks.get_task_ids(config.SERVICE_NAME, "hello-0")

    # Get the current agent id:
    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod info hello-0", print_output=False
    )
    assert rc == 0, "Pod info failed"
    old_agent = json.loads(stdout)[0]["info"]["slaveId"]["value"]

    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod restart hello-0"
    )
    assert rc == 0, "Pod restart failed"
    jsonobj = json.loads(stdout)
    assert len(jsonobj) == 2
    assert jsonobj["pod"] == "hello-0"
    assert len(jsonobj["tasks"]) == 1
    assert jsonobj["tasks"][0] == "hello-0-server"

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello-0", hello_ids)
    check_healthy()

    # Check that the agent didn't move:
    rc, stdout, _ = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "pod info hello-0", print_output=False
    )
    assert rc == 0, "Second pod info failed"
    new_agent = json.loads(stdout)[0]["info"]["slaveId"]["value"]
    assert old_agent == new_agent
def check_cache_refresh_fails_409conflict():
    try:
        sdk_cmd.svc_cli(config.PACKAGE_NAME, FOLDERED_SERVICE_NAME, 'state refresh_cache')
    except Exception as e:
        if "failed: 409 Conflict" in e.args[0]:
            return True
    return False
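# A minimal sketch (not from the original suite) of how a predicate like
# check_cache_refresh_fails_409conflict() can be polled until it holds, using the
# 'retrying' library that other tests in this suite already use. The wait and
# timeout values are illustrative assumptions.
import retrying

@retrying.retry(wait_fixed=1000, stop_max_delay=60 * 1000,
                retry_on_result=lambda result: not result)
def wait_for_cache_refresh_conflict():
    # Retries every second, for up to a minute, until the predicate returns True:
    return check_cache_refresh_fails_409conflict()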
def test_topic_partition_count():
    sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic create {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    topic_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, FOLDERED_SERVICE_NAME,
        'topic describe {}'.format(config.DEFAULT_TOPIC_NAME), json=True)
    assert len(topic_info['partitions']) == config.DEFAULT_PARTITION_COUNT
def test_updated_placement_constraints_replaced_tasks_do_move():
    some_agent, other_agent, old_ids = setup_constraint_switch()

    # Replace the task, and verify that it moves hosts:
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, 'pod replace hello-0')
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, 'hello', old_ids)
    assert get_task_host('hello-0-server') == other_agent
def delete_topic(service_name=config.SERVICE_NAME):
    delete_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, service_name,
        'topic delete {}'.format(EPHEMERAL_TOPIC_NAME), json=True)
    assert len(delete_info) == 1
    assert delete_info['message'].startswith(
        'Output: Topic {} is marked for deletion'.format(EPHEMERAL_TOPIC_NAME))

    topic_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, service_name,
        'topic describe {}'.format(EPHEMERAL_TOPIC_NAME), json=True)
    assert len(topic_info) == 1
    assert len(topic_info['partitions']) == config.DEFAULT_PARTITION_COUNT
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: Optional[List[str]] = None,
) -> None:
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are
    checked to ensure that their tasks have been restarted. Any remaining pods are
    checked to ensure that their tasks are not changed.

    For example, performing a pod replace on kafka-0 in a Kafka framework should
    result in ONLY the kafka-0-broker task being restarted. In this case,
    pods_with_updated_tasks is specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0), a
    rolling restart of the other nodes is triggered, and pods_with_updated_tasks
    = ["node-0", "node-1", "node-2"] (assuming a three-node Cassandra ring).
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    rc, stdout, _ = sdk_cmd.svc_cli(package_name, service_name, "pod list")
    assert rc == 0, "Pod list failed"
    pod_list = set(json.loads(stdout))

    pods_with_updated_tasks = pods_with_updated_tasks if pods_with_updated_tasks else []
    pods_to_update = set(pods_with_updated_tasks + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        # Record each pod's own task ids (not just pod_name's):
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    sdk_cmd.svc_cli(package_name, service_name, "pod replace {}".format(pod_name))

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
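# A minimal usage sketch (not part of the original helpers), following the
# docstring's own Kafka and Cassandra examples. The package/service names and the
# 10-minute timeout are illustrative assumptions.
def test_kafka_broker_replace_example():
    check_permanent_recovery("kafka", "kafka", "kafka-0", recovery_timeout_s=10 * 60)

def test_cassandra_seed_replace_example():
    # Seed replacement triggers a rolling restart, so all three pods are expected to update:
    check_permanent_recovery(
        "cassandra", "cassandra", "node-0", recovery_timeout_s=10 * 60,
        pods_with_updated_tasks=["node-0", "node-1", "node-2"])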
def test_authz_acls_required(kafka_client, kafka_server, kerberos):
    client_id = kafka_client["id"]

    sdk_cmd.resolve_hosts(kafka_client["id"], kafka_client["brokers"])

    topic_name = "authz.test"
    sdk_cmd.svc_cli(kafka_server["package_name"], kafka_server["service"]["name"],
                    "topic create {}".format(topic_name), json=True)
    test_utils.wait_for_topic(kafka_server["package_name"],
                              kafka_server["service"]["name"], topic_name)

    message = str(uuid.uuid4())

    log.info("Writing and reading: Writing to the topic, but not super user")
    assert not write_to_topic("authorized", client_id, topic_name, message, kerberos)

    log.info("Writing and reading: Writing to the topic, as super user")
    assert write_to_topic("super", client_id, topic_name, message, kerberos)

    log.info("Writing and reading: Reading from the topic, but not super user")
    assert auth.is_not_authorized(read_from_topic("authorized", client_id, topic_name, 1, kerberos))

    log.info("Writing and reading: Reading from the topic, as super user")
    assert message in read_from_topic("super", client_id, topic_name, 1, kerberos)

    zookeeper_endpoint = sdk_cmd.svc_cli(
        kafka_server["package_name"], kafka_server["service"]["name"],
        "endpoint zookeeper").strip()

    # TODO: If zookeeper has Kerberos enabled, then the environment should be changed
    topics.add_acls("authorized", client_id, topic_name, zookeeper_endpoint, env_str=None)

    # Send a second message, which should now be authorized:
    second_message = str(uuid.uuid4())
    log.info("Writing and reading: Writing to the topic, as authorized user")
    assert write_to_topic("authorized", client_id, topic_name, second_message, kerberos)
    log.info("Writing and reading: Writing to the topic, as super user")
    assert write_to_topic("super", client_id, topic_name, second_message, kerberos)

    log.info("Writing and reading: Reading from the topic, as authorized user")
    topic_output = read_from_topic("authorized", client_id, topic_name, 3, kerberos)
    assert message in topic_output
    assert second_message in topic_output
    log.info("Writing and reading: Reading from the topic, as super user")
    topic_output = read_from_topic("super", client_id, topic_name, 3, kerberos)
    assert message in topic_output
    assert second_message in topic_output

    # Check that the unauthorized client still can neither read nor write the topic:
    log.info("Writing and reading: Writing to the topic, as unauthorized user")
    assert not write_to_topic("unauthorized", client_id, topic_name, second_message, kerberos)
    log.info("Writing and reading: Reading from the topic, as unauthorized user")
    assert auth.is_not_authorized(read_from_topic("unauthorized", client_id, topic_name, 1, kerberos))
def test_service_startup_rapid():
    max_restart_seconds = EXPECTED_KAFKA_STARTUP_SECONDS
    startup_padding_seconds = EXPECTED_DCOS_STARTUP_SECONDS
    retry_delay_seconds = STARTUP_POLL_DELAY_SECONDS

    task_short_name = 'kafka-0'
    broker_task_id_0 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # The following 'dcos kafka topic ....' command has expected output as follows:
    # 'Output: 100 records sent ....'
    # but may fail, i.e. have output such as follows:
    # '...leader not available...'
    stdout = ''
    retries = 15
    while retries > 0:
        retries -= 1
        stdout = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                                 'topic producer_test test 100')
        if 'records sent' in stdout:
            break

    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME,
                              'pod restart {}'.format(task_short_name), json=True)
    assert len(jsonobj) == 2
    assert jsonobj['pod'] == task_short_name
    assert jsonobj['tasks'] == ['{}-broker'.format(task_short_name)]

    starting_fallback_time = datetime.datetime.now()

    sdk_tasks.check_tasks_updated(config.SERVICE_NAME,
                                  '{}-'.format(config.DEFAULT_POD_TYPE), [broker_task_id_0])
    sdk_tasks.check_running(config.SERVICE_NAME, config.DEFAULT_BROKER_COUNT)

    broker_task_id_1 = sdk_tasks.get_task_ids(config.SERVICE_NAME, task_short_name)[0]

    # Extract the 'starting' and 'started' lines from the log:
    starting_time = started_time = None
    retry_seconds_remaining = max_restart_seconds + startup_padding_seconds
    while retry_seconds_remaining > 0.0 and (starting_time is None or started_time is None):
        stdout = sdk_cmd.run_cli("task log --lines=1000 {}".format(broker_task_id_1))
        task_lines = stdout.split('\n')
        for log_line in reversed(task_lines):
            if starting_time is None and ' starting (kafka.server.KafkaServer)' in log_line:
                starting_time = log_line_ts(log_line)
            elif started_time is None and ' started (kafka.server.KafkaServer)' in log_line:
                started_time = log_line_ts(log_line)
        if starting_time is None or started_time is None:
            # Decrement the time budget so this loop cannot spin forever:
            retry_seconds_remaining -= retry_delay_seconds
            time.sleep(retry_delay_seconds)

    if started_time is None or starting_time is None:
        with open('/tmp/kafka_startup_stdout', 'w') as f:
            f.write(stdout)

    if starting_time is None:
        starting_time = starting_fallback_time

    assert starting_time is not None
    assert started_time is not None
    assert started_time >= starting_time
    assert (started_time - starting_time).total_seconds() <= max_restart_seconds
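# log_line_ts() is referenced above but defined elsewhere in the suite. A minimal
# sketch under the assumption that broker log lines carry a log4j-style prefix such
# as '[2018-01-15 12:34:56,789] INFO ... (kafka.server.KafkaServer)'; the exact
# timestamp format is an assumption, not the suite's canonical implementation.
import datetime
import re

def log_line_ts(log_line: str) -> datetime.datetime:
    # Pull the bracketed 'YYYY-MM-DD HH:MM:SS,mmm' prefix off the log line:
    match = re.match(r'\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3})\]', log_line)
    assert match, "No timestamp found in log line: {}".format(log_line)
    return datetime.datetime.strptime(match.group(1), '%Y-%m-%d %H:%M:%S,%f')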
def replace_broker_pod(service_name=config.SERVICE_NAME):
    pod_name = '{}-0'.format(config.DEFAULT_POD_TYPE)
    task_name = '{}-{}'.format(pod_name, config.DEFAULT_TASK_NAME)
    broker_0_id = sdk_tasks.get_task_ids(service_name, task_name)
    sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'pod replace {}'.format(pod_name))
    sdk_tasks.check_tasks_updated(service_name, task_name, broker_0_id)
    sdk_tasks.check_running(service_name, config.DEFAULT_BROKER_COUNT)
    # Wait until all brokers register:
    broker_count_check(config.DEFAULT_BROKER_COUNT, service_name=service_name)
def test_updated_placement_constraints_replaced_tasks_do_move():
    some_agent, other_agent, old_ids = setup_constraint_switch()

    # Replace the task, and verify that it moves hosts:
    sdk_cmd.svc_cli(config.PACKAGE_NAME, config.SERVICE_NAME, "pod replace hello-0")
    sdk_tasks.check_tasks_updated(config.SERVICE_NAME, "hello", old_ids)
    sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME)
    assert get_task_host("hello-0-server") == other_agent
def test_config_cli():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)

    configs = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug config list', json=True)
    assert len(configs) >= 1  # refrain from breaking this test if earlier tests did a config update

    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name,
                           'debug config show {}'.format(configs[0]), print_output=False)  # noisy output
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug config target', json=True)
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'debug config target_id', json=True)
def describe(topic):
    sdk_cmd.svc_cli(package_name, service_name, "topic describe {}".format(topic), json=True)
def test_no_unavailable_partitions_exist():
    partition_info = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, sdk_utils.get_foldered_name(config.SERVICE_NAME),
        'topic unavailable_partitions', json=True)
    # The original snippet stopped here without asserting anything. Assuming the CLI
    # wraps its output in a 'message' field like the other topic commands above, an
    # empty result means no unavailable partitions:
    assert len(partition_info['message']) == 0
def kill_driver(driver_id, service_name=SPARK_SERVICE_NAME):
    return sdk_cmd.svc_cli(SPARK_PACKAGE_NAME, service_name, "kill {}".format(driver_id))
def get_metrics(package_name, service_name, task_name):
    """Return a list of DC/OS metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    task_name -- the name of the task whose agent to run metrics commands from
    """
    tasks = shakedown.get_service_tasks(service_name)
    task_to_check = None  # guard against no match below
    for task in tasks:
        if task['name'] == task_name:
            task_to_check = task
            break

    if task_to_check is None:
        raise Exception("Could not find task")

    agent_id = task_to_check['slave_id']
    executor_id = task_to_check['executor_id']

    pod_name = '-'.join(task_name.split("-")[:2])
    pod_info = sdk_cmd.svc_cli(package_name, service_name,
                               "pod info {}".format(pod_name), json=True)
    task_info = None
    for task in pod_info:
        if task["info"]["name"] == task_name:
            task_info = task
            break

    if not task_info:
        return []

    task_container_id = task_info["status"]["containerStatus"]["containerId"]["value"]

    # Not related to functionality, but consuming this endpoint to verify downstream integrity:
    containers_response = sdk_cmd.cluster_request(
        "GET", "/system/v1/agent/{}/metrics/v0/containers".format(agent_id), retry=False)
    reported_container_ids = json.loads(containers_response.text)

    container_id_reported = False
    for container_id in reported_container_ids:
        if container_id == task_container_id:
            container_id_reported = True
            break
    if not container_id_reported:
        raise ValueError(
            "The metrics /container endpoint returned {}, expecting {} to be returned as well"
            .format(reported_container_ids, task_container_id))

    app_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers/{}/app".format(agent_id, task_container_id),
        retry=False)
    app_json = json.loads(app_response.text)
    if app_json['dimensions']['executor_id'] == executor_id:
        return app_json['datapoints']

    raise Exception("No metrics found")
def test_topic_offsets_increase_with_writes(kafka_server: dict):
    package_name = kafka_server["package_name"]
    service_name = kafka_server["service"]["name"]

    def offset_is_valid(result) -> bool:
        initial = result[0]
        offsets = result[1]
        LOG.info("Checking validity with initial=%s offsets=%s", initial, offsets)
        has_elements = bool(topics.filter_empty_offsets(offsets, additional=initial))
        # The return value of this function triggers the retry:
        return not has_elements

    @retrying.retry(wait_exponential_multiplier=1000,
                    wait_exponential_max=60 * 1000,
                    retry_on_result=offset_is_valid)
    def get_offset_change(topic_name, initial_offsets=[]):
        """Run `dcos kafka topic offsets --time="-1"` until the output no longer
        matches the specified initial output."""
        LOG.info("Getting offsets for %s", topic_name)
        offsets = sdk_cmd.svc_cli(
            package_name, service_name,
            'topic offsets --time="-1" {}'.format(topic_name), json=True)
        LOG.info("offsets=%s", offsets)
        return initial_offsets, offsets

    topic_name = str(uuid.uuid4())
    LOG.info("Creating topic: %s", topic_name)
    test_utils.create_topic(topic_name, service_name)

    _, offset_info = get_offset_change(topic_name)

    # offset_info is a list of (partition index, offset) key-value pairs; sum the
    # integer representations of the offsets:
    initial_offset = sum(
        map(lambda partition: sum(map(int, partition.values())), offset_info))
    LOG.info("Initial offset=%s", initial_offset)

    num_messages = 10
    LOG.info("Sending %s messages", num_messages)
    write_info = sdk_cmd.svc_cli(
        package_name, service_name,
        'topic producer_test {} {}'.format(topic_name, num_messages), json=True)
    assert len(write_info) == 1
    assert write_info['message'].startswith('Output: {} records sent'.format(num_messages))

    _, post_write_offset_info = get_offset_change(topic_name, offset_info)

    post_write_offset = sum(
        map(lambda partition: sum(map(int, partition.values())), post_write_offset_info))
    LOG.info("Post-write offset=%s", post_write_offset)

    assert post_write_offset > initial_offset
def check_cache_refresh_fails_409conflict():
    output = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'state refresh_cache')
    return "failed: 409 Conflict" in output
def test_overlay_network():
    """Verify that the current deploy plan matches the expected plan from the spec."""
    deployment_plan = sdk_plan.wait_for_completed_deployment(config.SERVICE_NAME)
    log.info("deployment_plan: " + str(deployment_plan))

    # Test that the deployment plan is correct:
    assert len(deployment_plan["phases"]) == 5
    assert deployment_plan["phases"][0]["name"] == "hello-overlay-deploy"
    assert deployment_plan["phases"][1]["name"] == "hello-overlay-vip-deploy"
    assert deployment_plan["phases"][2]["name"] == "hello-host-vip-deploy"
    assert deployment_plan["phases"][3]["name"] == "hello-host-deploy"
    assert deployment_plan["phases"][4]["name"] == "getter-deploy"
    assert len(deployment_plan["phases"][0]["steps"]) == 1
    assert len(deployment_plan["phases"][1]["steps"]) == 1
    assert len(deployment_plan["phases"][2]["steps"]) == 1
    assert len(deployment_plan["phases"][3]["steps"]) == 1
    assert len(deployment_plan["phases"][4]["steps"]) == 1

    # Due to DNS resolution flakiness, some of the deployed tasks can fail. If so,
    # we wait for them to redeploy, but if they don't fail we still want to proceed.
    try:
        sdk_plan.wait_for_in_progress_recovery(config.SERVICE_NAME, timeout_seconds=60)
        sdk_plan.wait_for_completed_recovery(config.SERVICE_NAME, timeout_seconds=60)
    except retrying.RetryError:
        pass

    # Test that the tasks are all up, which tests the overlay DNS:
    framework_tasks = [
        task for task in shakedown.get_service_tasks(config.SERVICE_NAME, completed=False)
    ]
    framework_task_names = [t["name"] for t in framework_tasks]

    for expected_task in EXPECTED_TASKS:
        assert expected_task in framework_task_names, \
            "Missing {expected}".format(expected=expected_task)

    for task in framework_tasks:
        name = task["name"]
        if "getter" in name:  # don't check the "getter" tasks because they don't use ports
            continue
        resources = task["resources"]
        if "host" in name:
            assert "ports" in resources.keys(), "Task {} should have port resources".format(name)
        if "overlay" in name:
            assert "ports" not in resources.keys(), \
                "Task {} should NOT have port resources".format(name)

    sdk_networks.check_task_network("hello-overlay-0-server")
    sdk_networks.check_task_network("hello-overlay-vip-0-server")
    sdk_networks.check_task_network("hello-host-0-server", expected_network_name=None)
    sdk_networks.check_task_network("hello-host-vip-0-server", expected_network_name=None)

    endpoints_result = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "endpoints", json=True)
    assert len(endpoints_result) == 2, \
        "Wrong number of endpoints: got {}, should be 2".format(len(endpoints_result))

    overlay_endpoints_result = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "endpoints overlay-vip", json=True)
    assert "address" in overlay_endpoints_result.keys(), \
        "overlay endpoints missing 'address': {}".format(overlay_endpoints_result)
    assert len(overlay_endpoints_result["address"]) == 1
    assert overlay_endpoints_result["address"][0].startswith("9")
    overlay_port = overlay_endpoints_result["address"][0].split(":")[-1]
    assert overlay_port == "4044"
    assert "dns" in overlay_endpoints_result.keys()
    assert len(overlay_endpoints_result["dns"]) == 1
    assert overlay_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "hello-overlay-vip-0-server", 4044)

    host_endpoints_result = sdk_cmd.svc_cli(
        config.PACKAGE_NAME, config.SERVICE_NAME, "endpoints host-vip", json=True)
    assert "address" in host_endpoints_result.keys(), \
        "host endpoints missing 'address': {}".format(host_endpoints_result)
    assert len(host_endpoints_result["address"]) == 1
    assert host_endpoints_result["address"][0].startswith("10")
    host_port = host_endpoints_result["address"][0].split(":")[-1]
    assert host_port == "4044"
    assert "dns" in host_endpoints_result.keys()
    assert len(host_endpoints_result["dns"]) == 1
    assert host_endpoints_result["dns"][0] == sdk_hosts.autoip_host(
        config.SERVICE_NAME, "hello-host-vip-0-server", 4044)
def _check_json_output(svc_name, cmd):
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, svc_name, cmd)
    assert rc == 0, "Command failed: {}".format(cmd)
    # Check that stdout is valid json:
    json.loads(stdout)
def check_cache_refresh():
    rc, stdout, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "debug state refresh_cache")
    assert rc == 0, "Refresh cache failed"
    return stdout
def check_cache_refresh_fails_409conflict():
    rc, stdout, stderr = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "debug state refresh_cache")
    return rc != 0 and stdout == "" and "failed: 409 Conflict" in stderr
def test_help_cli():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "help")
def test_authz_acls_not_required(kafka_client, service_account, setup_principals):
    try:
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
        service_options = {
            "service": {
                "name": config.SERVICE_NAME,
                "service_account": service_account["name"],
                "service_account_secret": service_account["secret"],
                "security": {
                    "transport_encryption": {"enabled": True},
                    "ssl_authentication": {"enabled": True},
                    "authorization": {
                        "enabled": True,
                        "super_users": "User:{}".format("super"),
                        "allow_everyone_if_no_acl_found": True,
                    },
                },
            }
        }
        config.install(
            config.PACKAGE_NAME,
            config.SERVICE_NAME,
            config.DEFAULT_BROKER_COUNT,
            additional_options=service_options,
        )

        kafka_server = {**service_options, **{"package_name": config.PACKAGE_NAME}}

        topic_name = "authz.test"
        sdk_cmd.svc_cli(
            kafka_server["package_name"],
            kafka_server["service"]["name"],
            "topic create {}".format(topic_name),
        )

        kafka_client.connect(kafka_server)

        # Since no ACLs are specified, all users can read and write.
        for user in ["authorized", "unauthorized", "super"]:
            log.info("Checking write / read permissions for user=%s", user)
            write_success, read_successes, _ = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, None
            )
            assert write_success, "Write failed (user={})".format(user)
            assert read_successes, (
                "Read failed (user={}): "
                "MESSAGES={} "
                "read_successes={}".format(user, kafka_client.MESSAGES, read_successes)
            )

        log.info("Writing and reading: Adding acl for authorized user")
        kafka_client.add_acls("authorized", kafka_server, topic_name)

        # After adding ACLs, the authorized user and super user should still have access to the topic.
        for user in ["authorized", "super"]:
            log.info("Checking write / read permissions for user=%s", user)
            write_success, read_successes, _ = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, None
            )
            assert write_success, "Write failed (user={})".format(user)
            assert read_successes, (
                "Read failed (user={}): "
                "MESSAGES={} "
                "read_successes={}".format(user, kafka_client.MESSAGES, read_successes)
            )

        for user in ["unauthorized"]:
            log.info("Checking lack of write / read permissions for user=%s", user)
            write_success, _, read_messages = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, None
            )
            assert not write_success, "Write not expected to succeed (user={})".format(user)
            assert auth.is_not_authorized(read_messages), \
                "Unauthorized expected (user={})".format(user)
    finally:
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
def test_master_node_replace():
    # Ideally, the pod will get placed on a different agent. This test will verify that the
    # remaining two masters find the replaced master at its new IP address. This requires a
    # reasonably low TTL for Java DNS lookups.
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace master-0')
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def test_endpoints_zookeeper_default():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    _, zookeeper, _ = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "endpoints zookeeper")
    assert zookeeper.rstrip("\n") == "master.mesos:2181/{}".format(
        sdk_utils.get_zk_path(foldered_name)
    )
def test_state_cli():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "state framework_id", parse_json=True)[1]
    assert sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, "state properties", parse_json=True)[1]
def get_metrics(package_name, service_name, pod_name, task_name):
    """Return a list of DC/OS metrics datapoints.

    Keyword arguments:
    package_name -- the name of the package the service is using
    service_name -- the name of the service to get metrics for
    pod_name -- the name of the pod containing the task
    task_name -- the name of the task whose agent to run metrics commands from
    """
    # Find the task entry in mesos state:
    tasks = sdk_tasks.get_service_tasks(service_name)
    task_to_check = None  # guard against no match below
    for task in tasks:
        if task.name == task_name:
            task_to_check = task
            break
    if task_to_check is None:
        raise Exception("Task named {} not found in service {}: {}".format(
            task_name, service_name, tasks))

    # Find the task's container id via a recent TaskStatus:
    rc, stdout, _ = sdk_cmd.svc_cli(
        package_name, service_name, "pod info {}".format(pod_name), print_output=False)
    assert rc == 0, "Pod info failed"
    pod_info = json.loads(stdout)
    task_container_id = None
    for task in pod_info:
        if task["info"]["name"] == task_name:
            task_container_id = task["status"]["containerStatus"]["containerId"]["value"]
            break
    if task_container_id is None:
        log.warning("Task named {} not found in pod {}: {}".format(task_name, pod_name, pod_info))
        return []

    # Not related to functionality, but consuming this endpoint to verify metrics integrity:
    containers_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers".format(task_to_check.agent_id),
        retry=False,
    )
    reported_container_ids = json.loads(containers_response.text)

    container_id_reported = False
    for container_id in reported_container_ids:
        if container_id == task_container_id:
            container_id_reported = True
            break
    if not container_id_reported:
        raise ValueError(
            "The metrics /container endpoint returned {} for agent {}, expected {} to be returned as well"
            .format(reported_container_ids, task_to_check.agent_id, task_container_id))

    app_response = sdk_cmd.cluster_request(
        "GET",
        "/system/v1/agent/{}/metrics/v0/containers/{}/app".format(
            task_to_check.agent_id, task_container_id),
        retry=False,
    )
    app_json = json.loads(app_response.text)
    if app_json["dimensions"]["executor_id"] == task_to_check.executor_id:
        return app_json["datapoints"]

    raise Exception("No metrics found for task {} in service {}".format(task_name, service_name))
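# A minimal usage sketch (not part of the original helpers): fetch the datapoints
# for a broker task and log the metric names found. The package/service/pod/task
# names are illustrative assumptions, as is the presence of a "name" key on each
# datapoint.
def test_broker_metrics_example():
    datapoints = get_metrics("kafka", "kafka", "kafka-0", "kafka-0-broker")
    assert len(datapoints) > 0, "Expected at least one datapoint"
    metric_names = set(d["name"] for d in datapoints)
    log.info("Found metrics: %s", metric_names)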
def get_pod_type_instances(pod_type_prefix, service_name=SERVICE_NAME):
    pod_types = sdk_cmd.svc_cli(PACKAGE_NAME, service_name, 'pod list', json=True)
    return [pod_type for pod_type in pod_types if pod_type.startswith(pod_type_prefix)]
def check_permanent_recovery(
    package_name: str,
    service_name: str,
    pod_name: str,
    recovery_timeout_s: int,
    pods_with_updated_tasks: typing.Optional[typing.List[str]] = None,
):
    """
    Perform a replace (permanent recovery) operation on the specified pod.

    The specified pod AND any additional pods in `pods_with_updated_tasks` are
    checked to ensure that their tasks have been restarted. Any remaining pods are
    checked to ensure that their tasks are not changed.

    For example, performing a pod replace on kafka-0 in a Kafka framework should
    result in ONLY the kafka-0-broker task being restarted. In this case,
    pods_with_updated_tasks is specified as None.

    When performing a pod replace operation on a Cassandra seed node (node-0), a
    rolling restart of the other nodes is triggered, and pods_with_updated_tasks
    = ["node-0", "node-1", "node-2"] (assuming a three-node Cassandra ring).
    """
    LOG.info("Testing pod replace operation for %s:%s", service_name, pod_name)

    sdk_plan.wait_for_completed_deployment(service_name)
    sdk_plan.wait_for_completed_recovery(service_name)

    pod_list = set(sdk_cmd.svc_cli(package_name, service_name, "pod list", json=True))

    # Note the parentheses: without them, '+ [pod_name]' binds to the empty list
    # only, silently dropping pod_name whenever pods_with_updated_tasks is non-empty.
    pods_to_update = set((pods_with_updated_tasks or []) + [pod_name])

    tasks_to_replace = {}
    for pod in pods_to_update:
        # Record each pod's own task ids (not just pod_name's):
        tasks_to_replace[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("The following tasks will be replaced: %s", tasks_to_replace)

    tasks_in_other_pods = {}
    for pod in pod_list - pods_to_update:
        tasks_in_other_pods[pod] = set(sdk_tasks.get_task_ids(service_name, pod))
    LOG.info("Tasks in other pods should not be replaced: %s", tasks_in_other_pods)

    replace_cmd = ["pod", "replace", pod_name]
    sdk_cmd.svc_cli(package_name, service_name, " ".join(replace_cmd), json=True)

    sdk_plan.wait_for_kicked_off_recovery(service_name, recovery_timeout_s)
    sdk_plan.wait_for_completed_recovery(service_name, recovery_timeout_s)

    for pod, tasks in tasks_to_replace.items():
        sdk_tasks.check_tasks_updated(service_name, pod, tasks)

    for pod, tasks in tasks_in_other_pods.items():
        sdk_tasks.check_tasks_not_updated(service_name, pod, tasks)
def test_help_cli():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, sdk_utils.get_foldered_name(config.SERVICE_NAME), 'help')
def test_authz_acls_required(kafka_client, kafka_server, kerberos):
    client_id = kafka_client["id"]

    auth.wait_for_brokers(kafka_client["id"], kafka_client["brokers"])

    topic_name = "authz.test"
    sdk_cmd.svc_cli(kafka_server["package_name"], kafka_server["service"]["name"],
                    "topic create {}".format(topic_name), json=True)
    test_utils.wait_for_topic(kafka_server["package_name"],
                              kafka_server["service"]["name"], topic_name)

    message = str(uuid.uuid4())

    log.info("Writing and reading: Writing to the topic, but not super user")
    assert not write_to_topic("authorized", client_id, topic_name, message, kerberos)

    log.info("Writing and reading: Writing to the topic, as super user")
    assert write_to_topic("super", client_id, topic_name, message, kerberos)

    log.info("Writing and reading: Reading from the topic, but not super user")
    assert auth.is_not_authorized(
        read_from_topic("authorized", client_id, topic_name, 1, kerberos))

    log.info("Writing and reading: Reading from the topic, as super user")
    assert message in read_from_topic("super", client_id, topic_name, 1, kerberos)

    zookeeper_endpoint = sdk_cmd.svc_cli(kafka_server["package_name"],
                                         kafka_server["service"]["name"],
                                         "endpoint zookeeper").strip()

    # TODO: If zookeeper has Kerberos enabled, then the environment should be changed
    topics.add_acls("authorized", client_id, topic_name, zookeeper_endpoint, env_str=None)

    # Send a second message, which should now be authorized:
    second_message = str(uuid.uuid4())
    log.info("Writing and reading: Writing to the topic, as authorized user")
    assert write_to_topic("authorized", client_id, topic_name, second_message, kerberos)
    log.info("Writing and reading: Writing to the topic, as super user")
    assert write_to_topic("super", client_id, topic_name, second_message, kerberos)

    log.info("Writing and reading: Reading from the topic, as authorized user")
    topic_output = read_from_topic("authorized", client_id, topic_name, 3, kerberos)
    assert message in topic_output
    assert second_message in topic_output
    log.info("Writing and reading: Reading from the topic, as super user")
    topic_output = read_from_topic("super", client_id, topic_name, 3, kerberos)
    assert message in topic_output
    assert second_message in topic_output

    # Check that the unauthorized client still can neither read nor write the topic:
    log.info("Writing and reading: Writing to the topic, as unauthorized user")
    assert not write_to_topic("unauthorized", client_id, topic_name, second_message, kerberos)
    log.info("Writing and reading: Reading from the topic, as unauthorized user")
    assert auth.is_not_authorized(
        read_from_topic("unauthorized", client_id, topic_name, 1, kerberos))
def restart_zookeeper_node(id: int):
    sdk_cmd.svc_cli(ZK_PACKAGE, ZK_SERVICE_NAME, "pod restart zookeeper-{}".format(id))

    sdk_plan.wait_for_kicked_off_recovery(ZK_SERVICE_NAME)
    sdk_plan.wait_for_completed_recovery(ZK_SERVICE_NAME)
def test_authz_acls_required(kafka_client: client.KafkaClient,
                             kafka_server: dict,
                             kerberos: sdk_auth.KerberosEnvironment):
    topic_name = "authz.test"
    sdk_cmd.svc_cli(kafka_server["package_name"], kafka_server["service"]["name"],
                    "topic create {}".format(topic_name), json=True)

    kafka_client.connect(kafka_server)

    # Since no ACLs are specified, only the super user can read and write.
    for user in ["super"]:
        log.info("Checking write / read permissions for user=%s", user)
        write_success, read_successes, _ = kafka_client.can_write_and_read(
            user, kafka_server, topic_name, kerberos)
        assert write_success, "Write failed (user={})".format(user)
        assert read_successes, ("Read failed (user={}): "
                                "MESSAGES={} "
                                "read_successes={}".format(user, kafka_client.MESSAGES, read_successes))

    for user in ["authorized", "unauthorized"]:
        log.info("Checking lack of write / read permissions for user=%s", user)
        write_success, _, read_messages = kafka_client.can_write_and_read(
            user, kafka_server, topic_name, kerberos)
        assert not write_success, "Write not expected to succeed (user={})".format(user)
        assert auth.is_not_authorized(read_messages), "Unauthorized expected (user={})".format(user)

    log.info("Writing and reading: Adding acl for authorized user")
    kafka_client.add_acls("authorized", kafka_server, topic_name)

    # After adding ACLs, the authorized user and super user should still have access to the topic.
    for user in ["authorized", "super"]:
        log.info("Checking write / read permissions for user=%s", user)
        write_success, read_successes, _ = kafka_client.can_write_and_read(
            user, kafka_server, topic_name, kerberos)
        assert write_success, "Write failed (user={})".format(user)
        assert read_successes, ("Read failed (user={}): "
                                "MESSAGES={} "
                                "read_successes={}".format(user, kafka_client.MESSAGES, read_successes))

    for user in ["unauthorized"]:
        log.info("Checking lack of write / read permissions for user=%s", user)
        write_success, _, read_messages = kafka_client.can_write_and_read(
            user, kafka_server, topic_name, kerberos)
        assert not write_success, "Write not expected to succeed (user={})".format(user)
        assert auth.is_not_authorized(read_messages), "Unauthorized expected (user={})".format(user)
def check_cache_refresh():
    return sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'state refresh_cache')
def test_authz_acls_not_required(kafka_client: client.KafkaClient, zookeeper_server, kerberos):
    try:
        zookeeper_dns = sdk_cmd.svc_cli(zookeeper_server["package_name"],
                                        zookeeper_server["service"]["name"],
                                        "endpoint clientport", json=True)["dns"]

        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
        service_options = {
            "service": {
                "name": config.SERVICE_NAME,
                "security": {
                    "kerberos": {
                        "enabled": True,
                        "enabled_for_zookeeper": True,
                        "kdc": {
                            "hostname": kerberos.get_host(),
                            "port": int(kerberos.get_port())
                        },
                        "realm": kerberos.get_realm(),
                        "keytab_secret": kerberos.get_keytab_path(),
                    },
                    "authorization": {
                        "enabled": True,
                        "super_users": "User:{}".format("super"),
                        "allow_everyone_if_no_acl_found": True
                    }
                }
            },
            "kafka": {
                "kafka_zookeeper_uri": ",".join(zookeeper_dns)
            }
        }
        config.install(config.PACKAGE_NAME, config.SERVICE_NAME,
                       config.DEFAULT_BROKER_COUNT, additional_options=service_options)

        kafka_server = {**service_options, **{"package_name": config.PACKAGE_NAME}}

        topic_name = "authz.test"
        sdk_cmd.svc_cli(kafka_server["package_name"], kafka_server["service"]["name"],
                        "topic create {}".format(topic_name), json=True)

        kafka_client.connect(kafka_server)

        # Clear the ACLs:
        kafka_client.remove_acls("authorized", kafka_server, topic_name)

        # Since no ACLs are specified, all users can read and write.
        for user in ["authorized", "unauthorized", "super"]:
            log.info("Checking write / read permissions for user=%s", user)
            write_success, read_successes, _ = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, kerberos)
            assert write_success, "Write failed (user={})".format(user)
            assert read_successes, ("Read failed (user={}): "
                                    "MESSAGES={} "
                                    "read_successes={}".format(user, kafka_client.MESSAGES, read_successes))

        log.info("Writing and reading: Adding acl for authorized user")
        kafka_client.add_acls("authorized", kafka_server, topic_name)

        # After adding ACLs, the authorized user and super user should still have access to the topic.
        for user in ["authorized", "super"]:
            log.info("Checking write / read permissions for user=%s", user)
            write_success, read_successes, _ = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, kerberos)
            assert write_success, "Write failed (user={})".format(user)
            assert read_successes, ("Read failed (user={}): "
                                    "MESSAGES={} "
                                    "read_successes={}".format(user, kafka_client.MESSAGES, read_successes))

        for user in ["unauthorized"]:
            log.info("Checking lack of write / read permissions for user=%s", user)
            write_success, _, read_messages = kafka_client.can_write_and_read(
                user, kafka_server, topic_name, kerberos)
            assert not write_success, "Write not expected to succeed (user={})".format(user)
            assert auth.is_not_authorized(read_messages), \
                "Unauthorized expected (user={})".format(user)
    finally:
        sdk_install.uninstall(config.PACKAGE_NAME, config.SERVICE_NAME)
def check_for_nonempty_properties():
    jsonobj = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'state properties', json=True)
    return len(jsonobj) > 0
def test_endpoints_zookeeper_default():
    foldered_name = sdk_utils.get_foldered_name(config.SERVICE_NAME)
    zookeeper = sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'endpoints zookeeper')
    assert zookeeper.rstrip('\n') == 'master.mesos:2181/{}'.format(
        sdk_utils.get_zk_path(foldered_name))
def test_coordinator_node_replace():
    sdk_cmd.svc_cli(config.PACKAGE_NAME, foldered_name, 'pod replace coordinator-0')
    sdk_plan.wait_for_in_progress_recovery(foldered_name)
    sdk_plan.wait_for_completed_recovery(foldered_name)
def broker_count_check(count, service_name=config.SERVICE_NAME):
    brokers = sdk_cmd.svc_cli(config.PACKAGE_NAME, service_name, 'broker list', json=True)
    return len(brokers) == count
def get_zookeeper_connect(self) -> str:
    return str(
        sdk_cmd.svc_cli(self._package_name, self._service_name, "endpoint zookeeper")).strip()