def test_scram_sha512(self):
    """
    Check that username/password authentication works for users that
    were created using the scram_sha512 mechanism (as opposed to the
    default scram_sha256).

    Any failure propagates to the test runner so that the test is
    reported as failed, instead of being logged and followed by a long
    sleep that hides the error.
    """
    charles = SaslCredentials("charles", "highEntropyHipster",
                              "SCRAM-SHA-512")
    create_user_and_wait(self.redpanda, self.superuser_admin, charles)

    # Put the new user in the superusers list next to the built-in one.
    self.redpanda.set_cluster_config({
        'superusers':
        [charles.username, self.redpanda.SUPERUSER_CREDENTIALS.username]
    })

    charles_admin = Admin(self.redpanda,
                          auth=(charles.username, charles.password))

    # Hit an endpoint requiring superuser
    charles_admin.get_cluster_config()
def test_move_consumer_offsets_intranode(self):
    """
    Repeatedly move the __consumer_offsets/0 partition between cores on
    the nodes that already host it, while a producer and a consumer run.

    This reproduces certain bugs in the special handling of this topic.
    """
    throughput, records, moves = self._get_scale_params()

    self.start_redpanda(num_nodes=3,
                        extra_rp_conf={"default_topic_replications": 3})

    # Workload topic used by the producer/consumer pair.
    workload_spec = TopicSpec(name="topic",
                              partition_count=3,
                              replication_factor=3)
    self.client().create_topic(workload_spec)
    self.topic = workload_spec.name

    self.start_producer(1, throughput=throughput)
    self.start_consumer(1)
    self.await_startup()

    admin = Admin(self.redpanda)
    offsets_topic, offsets_partition = "__consumer_offsets", 0
    move_timeout = 360

    def other_core(core):
        # Bounce between core 0 and 1
        return (core + 1) % 2

    for _ in range(moves):
        replicas = self._get_assignments(admin, offsets_topic,
                                         offsets_partition)
        for replica in replicas:
            replica['core'] = other_core(replica['core'])
        admin.set_partition_replicas(offsets_topic, offsets_partition,
                                     replicas)
        self._wait_post_move(offsets_topic, offsets_partition, replicas,
                             move_timeout)

    self.run_validation(enable_idempotence=False,
                        consumer_timeout_sec=45,
                        min_records=records)
def test_cross_shard(self):
    """
    Test interaction between the shadow indexing and the partition
    movement: partition 0 of the workload topic is repeatedly moved
    between cores while a producer and a consumer are running.
    """
    throughput, records, moves, partitions = self._get_scale_params()

    self.start_redpanda(num_nodes=3)

    topic_spec = TopicSpec(name="topic",
                           partition_count=partitions,
                           replication_factor=3)
    self.client().create_topic(topic_spec)
    self.topic = topic_spec.name

    self.start_producer(1, throughput=throughput)
    self.start_consumer(1)
    self.await_startup()

    admin = Admin(self.redpanda)
    moved_topic = self.topic
    moved_partition = 0

    for _ in range(moves):
        current = self._get_assignments(admin, moved_topic, moved_partition)
        for entry in current:
            # Bounce between core 0 and 1
            previous_core = entry['core']
            entry['core'] = (previous_core + 1) % 2
        admin.set_partition_replicas(moved_topic, moved_partition, current)
        self._wait_post_move(moved_topic, moved_partition, current, 360)

    self.run_validation(enable_idempotence=False,
                        consumer_timeout_sec=45,
                        min_records=records)
def __init__(self,
             context,
             num_brokers,
             client_type,
             enable_rp=True,
             extra_rp_conf=None,
             enable_pp=False,
             enable_sr=False,
             topics=None,
             num_cores=3):
    """
    Record the service configuration; nothing is started here.

    `extra_rp_conf` falls back to an empty dict and `topics` to an empty
    tuple when a falsy value is given. The log level is read from the
    test context globals, falling back to DEFAULT_LOG_LEVEL.
    """
    super(RedpandaService, self).__init__(context, num_nodes=num_brokers)

    self._context = context
    self._client_type = client_type

    # Feature switches
    self._enable_rp = enable_rp
    self._enable_pp = enable_pp
    self._enable_sr = enable_sr

    self._extra_rp_conf = extra_rp_conf if extra_rp_conf else dict()
    self._log_level = self._context.globals.get(self.LOG_LEVEL_KEY,
                                                self.DEFAULT_LOG_LEVEL)
    self._topics = topics if topics else ()
    self._num_cores = num_cores

    self._admin = Admin(self)
    self._started = []

    # The client is initialized after the service starts.
    self._client = None

    self.config_file_lock = threading.Lock()
def test_self_transfer(self):
    """
    For every partition of every topic in `self.topics`, ask the admin
    API to transfer leadership to the node that is already the leader.
    """
    admin = Admin(self.redpanda)

    # Lazily enumerate (topic, partition) pairs in topic order.
    targets = ((spec, index) for spec in self.topics
               for index in range(spec.partition_count))

    for spec, index in targets:
        details = admin.get_partitions(spec, index)
        current_leader = details['leader_id']
        admin.partition_transfer_leadership("kafka", spec, index,
                                            current_leader)
def test_decommissioning_working_node(self):
    """
    Decommission a randomly chosen broker of a four node cluster while a
    producer and a consumer are running, wait for it to disappear from
    the broker list, then validate the produced/consumed data.

    After the decommission request is sent, the broker list is polled
    through a node that is NOT being decommissioned, so the check does
    not depend on answers from the node that is leaving the cluster.
    """
    self.start_redpanda(num_nodes=4)

    topics = []
    for partition_count in range(1, 5):
        for replication_factor in (3, 3):
            name = f"topic{len(topics)}"
            spec = TopicSpec(name=name,
                             partition_count=partition_count,
                             replication_factor=replication_factor)
            topics.append(spec)

    for spec in topics:
        self.client().create_topic(spec)
        # The last created topic ends up as the workload topic.
        self.topic = spec.name

    self.start_producer(1)
    self.start_consumer(1)
    self.await_startup()

    admin = Admin(self.redpanda)

    brokers = admin.get_brokers()
    to_decommission = random.choice(brokers)
    decommissioned_id = to_decommission['node_id']
    self.logger.info(f"decommissioning node: {to_decommission}")
    admin.decommission_broker(decommissioned_id)

    # A node which isn't being decommissioned, to use when calling into
    # the admin API from this point onwards.
    survivor_node = [
        n for n in self.redpanda.nodes
        if self.redpanda.idx(n) != decommissioned_id
    ][0]
    self.logger.info(
        f"Using survivor node {survivor_node.name} {self.redpanda.idx(survivor_node)}"
    )

    def node_removed():
        current = admin.get_brokers(node=survivor_node)
        return all(b['node_id'] != decommissioned_id for b in current)

    wait_until(node_removed, timeout_sec=120, backoff_sec=2)

    self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
def node_removed():
    """Return True when `node_id` is absent from the broker list."""
    listed = Admin(self.redpanda).get_brokers()
    # Stops at the first broker whose id matches.
    return all(entry['node_id'] != node_id for entry in listed)
def test_id_allocator_leader_isolation(self):
    """
    Isolate id allocator leader.

    This test validates whether the cluster is still available when the
    `kafka_internal/id_allocator` leader has been isolated.
    """
    admin = Admin(self.redpanda)
    self._expect_available()

    allocator_ns = 'kafka_internal'
    allocator_topic = 'id_allocator'

    # Find which node is the leader for id allocator partition
    admin.wait_stable_configuration(namespace=allocator_ns,
                                    topic=allocator_topic,
                                    replication=3)
    initial_leader_id = admin.get_partition_leader(namespace=allocator_ns,
                                                   topic=allocator_topic,
                                                   partition=0)
    leader_node = self.redpanda.get_node(initial_leader_id)
    self.logger.info(
        f"kafka_internal/id_allocator/0 leader: {initial_leader_id}, node: {leader_node.account.hostname}"
    )

    self._expect_available()

    with FailureInjector(self.redpanda) as fi:
        # isolate id_allocator
        isolation = FailureSpec(FailureSpec.FAILURE_ISOLATE,
                                self.redpanda.get_node(initial_leader_id))
        fi.inject_failure(isolation)

        # expect messages to be produced and consumed without a timeout
        connection = self.ping_pong()
        connection.ping_pong(timeout_s=10, retries=10)
        for _ in range(0, 127):
            connection.ping_pong()
def test_moving_not_fully_initialized_partition(self):
    """
    Move partition before first leader is elected.

    A 'vote' failure probe is set on every node before the topic is
    created, the replicas are assigned random cores, and the probes are
    unset before waiting for the move to finish.
    """
    self.start_redpanda(num_nodes=3)

    hb = HoneyBadger()
    # if failure injector is not enabled simply skip this test
    if not hb.is_enabled(self.redpanda.nodes[0]):
        return

    probe_module = 'raftgen_service::failure_probes'
    probe_method = 'vote'

    for node in self.redpanda.nodes:
        hb.set_exception(node, probe_module, probe_method)

    topic, partition = "topic-1", 0
    self.redpanda.create_topic(
        TopicSpec(name=topic, partition_count=1, replication_factor=3))
    admin = Admin(self.redpanda)

    self.logger.info(f"selected topic-partition: {topic}-{partition}")

    # get the partition's replica set, including core assignments. the kafka
    # api doesn't expose core information, so we use the redpanda admin api.
    assignments = self._get_assignments(admin, topic, partition)
    self.logger.info(f"assignments for {topic}-{partition}: {assignments}")

    brokers = admin.get_brokers()
    # replace all node cores in assignment
    for replica in assignments:
        for broker in brokers:
            if broker['node_id'] != replica['node_id']:
                continue
            replica['core'] = random.randint(0, broker["num_cores"] - 1)
    self.logger.info(
        f"new assignments for {topic}-{partition}: {assignments}")

    admin.set_partition_replicas(topic, partition, assignments)

    def status_done():
        info = admin.get_partitions(topic, partition)
        self.logger.info(
            f"current assignments for {topic}-{partition}: {info}")
        return self._equal_assignments(
            info["replicas"], assignments) and info["status"] == "done"

    # unset failures
    for node in self.redpanda.nodes:
        hb.unset_failures(node, probe_module, probe_method)

    # wait until redpanda reports complete
    wait_until(status_done, timeout_sec=30, backoff_sec=1)

    def derived_done():
        derived = self._get_current_partitions(admin, topic, partition)
        self.logger.info(
            f"derived assignments for {topic}-{partition}: {derived}")
        return self._equal_assignments(derived, assignments)

    wait_until(derived_done, timeout_sec=30, backoff_sec=1)
def node_stopped(node_id):
    """
    Return True when the broker list reports `node_id` with an
    `is_alive` value equal to False; return False when it reports
    anything else or does not list the node at all.
    """
    for entry in Admin(self.redpanda).get_brokers():
        self.logger.debug(f"broker: {entry}")
        if entry['node_id'] != node_id:
            continue
        # Equality comparison kept on purpose: only a value equal to
        # False counts as stopped.
        return entry['is_alive'] == False
    return False
def node_removed():
    """
    Return True when `node_id` is absent from the broker list served by
    the first node of the cluster.

    Any ordinary error while querying or reading the list is treated as
    "not removed yet" so that a polling caller simply retries.
    KeyboardInterrupt and SystemExit are deliberately not swallowed.
    """
    admin = Admin(self.redpanda)
    try:
        brokers = admin.get_brokers(node=self.redpanda.nodes[0])
        return all(b['node_id'] != node_id for b in brokers)
    except Exception:
        return False
def test_two_nodes_down(self):
    """
    Validate that when two nodes are down, the cluster becomes
    unavailable, and that when one of those nodes is restored, the
    cluster becomes available again.
    """
    admin = Admin(self.redpanda)
    recovery_timeout = ELECTION_TIMEOUT * 2

    # Find which node is the leader
    initial_leader_id, replicas = self._wait_for_leader()

    self.ping_pong().ping_pong()

    leader_node = self.redpanda.get_node(initial_leader_id)
    other_node_id = (set(replicas) - {initial_leader_id}).pop()
    other_node = self.redpanda.get_node(other_node_id)

    self.logger.info(
        f"Stopping {initial_leader_id} ({leader_node.account.hostname}) and {other_node_id} ({other_node.account.hostname})"
    )
    for victim in (leader_node, other_node):
        self.redpanda.stop_node(victim)

    # 2/3 nodes down, cluster should be unavailable for acks=-1
    self._expect_unavailable()

    # Bring back one node (not the original leader)
    self.redpanda.start_node(self.redpanda.get_node(other_node_id))

    # Every host except the one of the original leader.
    hosts = [
        candidate.account.hostname for candidate in self.redpanda.nodes
        if self.redpanda.idx(candidate) != initial_leader_id
    ]
    admin.wait_stable_configuration("id_allocator",
                                    namespace="kafka_internal",
                                    replication=3,
                                    timeout_s=recovery_timeout,
                                    hosts=hosts)

    # This will be a slow election because priorities have to adjust down
    # (our two live nodes are the lower-priority ones of the three)

    # We have to wait for availability rather than leader state, because
    # leader state may already be reported as the expected leader from
    # stale pre-shutdown metadata.
    def cluster_available():
        return self._is_available() is True

    wait_until(cluster_available,
               timeout_sec=recovery_timeout,
               backoff_sec=0.5,
               err_msg=f"Cluster did not become available!")

    def is_new_leader(candidate):
        return candidate is not None and candidate != initial_leader_id

    new_leader, _ = self._wait_for_leader(is_new_leader,
                                          timeout=recovery_timeout)

    # 1/3 nodes down, cluster should be available
    self._expect_available()
def registered(self, node):
    """
    Check if a newly added node is fully registered with the cluster, such
    that a kafka metadata request to any node in the cluster will include it.

    The admin API of every started node is consulted first (a
    kafka-independent check); afterwards the kafka client's broker
    metadata must contain the node as well.

    Returns False while any started peer is unreachable or does not list
    the node yet, True once all checks pass.
    """
    idx = self.idx(node)
    self.logger.debug(
        f"registered: checking if broker {idx} ({node.name} is registered..."
    )

    # Query all nodes' admin APIs, so that we don't advance during setup until
    # the node is stored in raft0 AND has been replayed on all nodes.  Otherwise
    # a kafka metadata request to the last node to join could return incomplete
    # metadata and cause strange issues within a test.
    admin = Admin(self)
    for peer in self._started:
        try:
            admin_brokers = admin.get_brokers(node=peer)
        except requests.exceptions.RequestException as e:
            # We run during startup, when admin API may not even be listening yet: tolerate
            # API errors but presume that if some APIs are not up yet, then node registration
            # is also not complete.
            self.logger.debug(
                f"registered: peer {peer.name} admin API not yet available ({e})"
            )
            return False

        visible_ids = [entry['node_id'] for entry in admin_brokers]
        if idx not in visible_ids:
            self.logger.info(
                f"registered: node {node.name} not yet found in peer {peer.name}'s broker list ({admin_brokers})"
            )
            return False

        self.logger.debug(
            f"registered: node {node.name} now visible in peer {peer.name}'s broker list ({admin_brokers})"
        )

    broker = PythonLibrdkafka(self).brokers().get(idx, None)
    if broker is None:
        # This should never happen, because we already checked via the admin API
        # that the node of interest had become visible to all peers.
        self.logger.error(
            f"registered: node {node.name} not found in kafka metadata!")
    assert broker is not None

    self.logger.debug(f"registered: found broker info: {broker}")
    return True
def cluster_is_stable():
    """
    Return True when at least three brokers are listed and every one of
    them is alive and carries a 'disk_space' entry.
    """
    listed = Admin(self.redpanda).get_brokers()
    if len(listed) < 3:
        return False

    for entry in listed:
        self.logger.debug(f"broker: {entry}")
        if not entry['is_alive'] or 'disk_space' not in entry:
            return False
    return True
def __init__(self, *args, **kwargs):
    """
    Start from a copy of BOOTSTRAP_CONFIG with the central config feature
    flag switched on, and create the admin and rpk helpers.
    """
    rp_conf = BOOTSTRAP_CONFIG.copy()
    # Enable our feature flag
    rp_conf.update({'enable_central_config': True})

    super(ClusterConfigTest, self).__init__(*args,
                                            extra_rp_conf=rp_conf,
                                            **kwargs)

    self.admin = Admin(self.redpanda)
    self.rpk = RpkTool(self.redpanda)
def __init__(self, *args, **kwargs):
    """
    Start from a copy of BOOTSTRAP_CONFIG, force the 'trace' log level,
    and create the admin and rpk helpers.
    """
    bootstrap_conf = BOOTSTRAP_CONFIG.copy()

    # Force verbose logging for the secret redaction test
    kwargs.update(log_level='trace')

    super(ClusterConfigTest, self).__init__(*args,
                                            extra_rp_conf=bootstrap_conf,
                                            **kwargs)

    self.admin = Admin(self.redpanda)
    self.rpk = RpkTool(self.redpanda)
def create_user_and_wait(redpanda, admin: Admin, creds: SaslCredentials):
    """
    Create a user through `admin` and block until every node lists it.

    The per-node listing goes through `redpanda._admin`, not through the
    `admin` argument. Polling is delegated to `wait_until` with a five
    second timeout.
    """
    admin.create_user(*creds)

    def user_exists_everywhere():
        for node in redpanda.nodes:
            known_users = redpanda._admin.list_users(node=node)
            if creds.username in known_users:
                continue
            redpanda.logger.info(f"{creds.username} not in {known_users}")
            return False
        return True

    # It should only take milliseconds for raft0 write to propagate
    wait_until(user_exists_everywhere, timeout_sec=5, backoff_sec=0.5)
def __init__(self, test_context):
    """
    Configure a three broker cluster with idempotence and transactions
    enabled, very large transaction timeout/abort intervals, and the
    leader balancer disabled; then create the admin helper.
    """
    tx_conf = {
        "enable_idempotence": True,
        "enable_transactions": True,
        "tx_timeout_delay_ms": 10000000,
        "abort_timed_out_transactions_interval_ms": 10000000,
        'enable_leader_balancer': False
    }

    super(TxAdminTest, self).__init__(test_context=test_context,
                                      num_brokers=3,
                                      extra_rp_conf=tx_conf)

    self.admin = Admin(self.redpanda)
def decommission(node_id):
    """
    Request decommissioning of `node_id` and wait (up to 240 seconds)
    until it no longer shows up in the broker list.
    """
    self.logger.info(f"decommissioning node: {node_id}")
    Admin(self.redpanda).decommission_broker(id=node_id)

    def node_removed():
        # A fresh admin client is created for every poll.
        listed = Admin(self.redpanda).get_brokers()
        return not any(entry['node_id'] == node_id for entry in listed)

    wait_until(node_removed, timeout_sec=240, backoff_sec=2)
def test_controller_node_isolation(self):
    """
    Isolate controller node, expect cluster to be available.

    If the isolated controller also leads kafka_internal/id_allocator,
    the test first waits for a different node to take over that
    partition before checking availability.
    """
    def controller_available():
        return self.redpanda.controller() is not None

    admin = Admin(self.redpanda)

    # wait for controller
    wait_until(controller_available,
               timeout_sec=ELECTION_TIMEOUT * 2,
               backoff_sec=1)

    initial_leader_id, replicas = self._wait_for_leader()
    assert initial_leader_id == replicas[0]
    self._expect_available()

    allocator_info = admin.wait_stable_configuration(
        "id_allocator",
        namespace="kafka_internal",
        replication=3,
        timeout_s=ELECTION_TIMEOUT * 2)

    # isolate controller
    with FailureInjector(self.redpanda) as fi:
        # Look the controller up once so that the id we compute and the
        # node we isolate are guaranteed to be the same node.
        controller = self.redpanda.controller()
        # idx() is given the node object itself, exactly as when the
        # `hosts` list is built below; a hostname string is not a node.
        controller_id = self.redpanda.idx(controller)
        fi.inject_failure(
            FailureSpec(FailureSpec.FAILURE_ISOLATE, controller))

        if allocator_info.leader == controller_id:
            # The id allocator lost its leader together with the
            # controller: wait until another node leads it, asking only
            # the nodes that were not isolated.
            hosts = [
                n.account.hostname for n in self.redpanda.nodes
                if self.redpanda.idx(n) != controller_id
            ]
            admin.await_stable_leader(
                "id_allocator",
                namespace="kafka_internal",
                replication=3,
                timeout_s=ELECTION_TIMEOUT * 2,
                hosts=hosts,
                check=lambda node_id: node_id != controller_id)

        # Availability is checked while the controller is still isolated.
        connection = self.ping_pong()
        connection.ping_pong(timeout_s=10, retries=10)
        for _ in range(0, 127):
            connection.ping_pong()
def test_get_config(self):
    """
    Verify that the config GET endpoints return something that contains
    a few expected option names.
    """
    admin = Admin(self.redpanda)

    # Pick an arbitrary config property to verify that the result
    # contained some properties
    assert 'enable_transactions' in admin.get_cluster_config()

    # Some arbitrary property to check syntax of result
    assert 'kafka_api' in admin.get_node_config()
def restart_node(node_id, cleanup=True):
    """
    Stop the node with the given 1-based id, optionally clean it, start
    it again, raise the "cluster" logger to trace, and wait until the
    node shows up in the result of `replicas_per_node()`.
    """
    self.logger.info(f"restarting node: {node_id}")

    target = self.redpanda.nodes[node_id - 1]
    self.redpanda.stop_node(target)
    if cleanup:
        self.redpanda.clean_node(target)
    self.redpanda.start_node(target)

    Admin(self.redpanda).set_log_level("cluster", "trace")

    def has_new_replicas():
        distribution = replicas_per_node()
        self.logger.info(f"replicas per node: {distribution}")
        return node_id in distribution

    wait_until(has_new_replicas, timeout_sec=240, backoff_sec=2)
def start_node(self, node, override_cfg_params=None):
    """
    Start a single instance of redpanda. This function will not return until
    redpanda appears to have started successfully. If redpanda does not
    start within a timeout period the service will fail to start. Thus this
    function also acts as an implicit test that redpanda starts quickly.

    The node is appended to `self._started` once it reports ready.
    """
    required_dirs = (RedpandaService.DATA_DIR,
                     os.path.dirname(RedpandaService.CONFIG_FILE))
    for directory in required_dirs:
        node.account.mkdirs(directory)

    self.write_conf_file(node, override_cfg_params)

    if self.coproc_enabled():
        self.start_wasm_engine(node)

    # Pieces are concatenated verbatim; each carries its own spacing.
    cmd_parts = (
        f"nohup {self.find_binary('redpanda')}",
        f" --redpanda-cfg {RedpandaService.CONFIG_FILE}",
        f" --default-log-level {self._log_level}",
        f" --logger-log-level=exception=debug:archival=debug ",
        f" --kernel-page-cache=true ",
        f" --overprovisioned ",
        f" --smp {self._num_cores} ",
        f" --memory 6G ",
        f" --reserve-memory 0M ",
        f" >> {RedpandaService.STDOUT_STDERR_CAPTURE} 2>&1 &",
    )
    node.account.ssh("".join(cmd_parts))

    def reports_ready():
        return Admin.ready(node).get("status") == "ready"

    # Exceptions raised by the readiness probe are retried.
    wait_until(
        reports_ready,
        timeout_sec=RedpandaService.READY_TIMEOUT_SEC,
        err_msg=f"Redpanda service {node.account.hostname} failed to start",
        retry_on_exc=True)
    self._started.append(node)
def test_follower_isolation(self):
    """
    Isolate a follower and validate that the cluster remains available.

    The isolated node is a replica that is neither the leader returned
    by `_wait_for_leader()` nor the current leader of
    kafka_internal/id_allocator.
    """
    admin = Admin(self.redpanda)

    # Find which node is the leader
    initial_leader_id, replicas = self._wait_for_leader()
    assert initial_leader_id == replicas[0]

    self._expect_available()

    leader_node = self.redpanda.get_node(initial_leader_id)
    self.logger.info(
        f"Initial leader {initial_leader_id} {leader_node.account.hostname}"
    )

    allocator_info = admin.wait_stable_configuration(
        "id_allocator",
        namespace="kafka_internal",
        replication=3,
        timeout_s=ELECTION_TIMEOUT * 2)

    # First replica that leads neither our partition nor the id allocator.
    leaders = (initial_leader_id, allocator_info.leader)
    follower = next((node for node in replicas if node not in leaders),
                    None)
    assert follower is not None, \
        "no replica is free of both leadership roles"

    with FailureInjector(self.redpanda) as fi:
        # isolate one of the followers
        fi.inject_failure(
            FailureSpec(FailureSpec.FAILURE_ISOLATE,
                        self.redpanda.get_node(follower)))

        # expect messages to be produced and consumed without a timeout
        connection = self.ping_pong()
        connection.ping_pong(timeout_s=10, retries=10)
        for _ in range(0, 127):
            connection.ping_pong()
def test_not_a_superuser(self):
    """
    Enabling admin API auth is expected to fail with HTTP 400 while the
    caller is not in the superusers list, and to go through once the
    built-in superuser has been added to that list.
    """
    anonymous_admin = Admin(self.redpanda)

    # Nobody may enable auth unless they are themselves in the superusers list
    self.redpanda.set_cluster_config({'superusers': ['bob']})

    rejected_calls = (
        self.redpanda.set_cluster_config,
        anonymous_admin.patch_cluster_config,
    )
    for enable_auth in rejected_calls:
        with expect_http_error(400):
            enable_auth({'admin_api_require_auth': True})

    # A superuser may enable auth
    allowed_users = ['bob', self.redpanda.SUPERUSER_CREDENTIALS.username]
    self.redpanda.set_cluster_config({'superusers': allowed_users})
    self.redpanda.set_cluster_config({'admin_api_require_auth': True})
def _verify_materialized_assignments(self, topic, partition, assignments):
    """
    Log the assignments currently reported for the partition, then
    delegate to `_wait_post_move` with the expected `assignments`.
    """
    reported = self._get_assignments(Admin(self.redpanda), topic, partition)
    self.logger.info(
        f"materialized assignments for {topic}-{partition}: {reported}")

    self._wait_post_move(topic, partition, assignments)
def test_decommissioning_working_node(self):
    """
    Decommission a randomly chosen broker of a four node cluster while a
    producer and a consumer are running, wait until a surviving node no
    longer lists it, then validate the produced/consumed data.
    """
    self.start_redpanda(num_nodes=4)

    # Eight topics named topic0..topic7: partition counts 1..4, each
    # created twice with replication factor 3.
    shapes = [(partition_count, replication_factor)
              for partition_count in range(1, 5)
              for replication_factor in (3, 3)]
    topics = [
        TopicSpec(name=f"topic{position}",
                  partition_count=partition_count,
                  replication_factor=replication_factor)
        for position, (partition_count,
                       replication_factor) in enumerate(shapes)
    ]

    for spec in topics:
        self.client().create_topic(spec)
        self.topic = spec.name

    self.start_producer(1)
    self.start_consumer(1)
    self.await_startup()

    admin = Admin(self.redpanda)

    to_decommission = random.choice(admin.get_brokers())
    self.logger.info(f"decommissioning node: {to_decommission}")
    admin.decommission_broker(to_decommission['node_id'])

    # A node which isn't being decommed, to use when calling into
    # the admin API from this point onwards.
    survivors = [
        candidate for candidate in self.redpanda.nodes
        if self.redpanda.idx(candidate) != to_decommission['node_id']
    ]
    survivor_node = survivors[0]
    self.logger.info(
        f"Using survivor node {survivor_node.name} {self.redpanda.idx(survivor_node)}"
    )

    def node_removed():
        listed = admin.get_brokers(node=survivor_node)
        return not any(entry['node_id'] == to_decommission['node_id']
                       for entry in listed)

    wait_until(node_removed, timeout_sec=120, backoff_sec=2)

    self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
def test_controller_recovery(self):
    """
    Pick a partition, ask the admin API to transfer its leadership to a
    replica that is not the current leader, and wait until kafka
    metadata reports the new leader.
    """
    kc = KafkaCat(self.redpanda)

    # choose a partition and a target node
    partition = self._get_partition(kc)
    target_node_id = next(replica for replica in partition["replicas"]
                          if replica["id"] != partition["leader"])["id"]
    self.logger.debug(
        f"Transfering leader from {partition['leader']} to {target_node_id}"
    )

    # Look up both brokers in the metadata (used for logging only).
    brokers = kc.metadata()["brokers"]
    source_broker = next(broker for broker in brokers
                         if broker["id"] == partition["leader"])
    target_broker = next(broker for broker in brokers
                         if broker["id"] == target_node_id)
    self.logger.debug(f"Source broker {source_broker}")
    self.logger.debug(f"Target broker {target_broker}")

    # Send the request to any host, they should redirect to
    # the leader of the partition.
    partition_id = partition['partition']

    Admin(self.redpanda).partition_transfer_leadership(
        "kafka", self.topic, partition_id, target_node_id)

    def transfer_complete():
        for _ in range(3):  # just give it a moment
            time.sleep(1)
            candidates = kc.metadata()["topics"][0]["partitions"]
            current = next(entry for entry in candidates
                           if entry["partition"] == partition_id)
            if current["leader"] == target_node_id:
                return True
        return False

    wait_until(transfer_complete,
               timeout_sec=30,
               backoff_sec=5,
               err_msg="Transfer did not complete")
def test_log_level_control(self):
    """
    Change the level of the "admin_api_server" logger through the admin
    API and check, via the log of the first node, which of the resulting
    log messages appear and which do not.
    """
    admin = Admin(self.redpanda)
    node = self.redpanda.nodes[0]

    # This test assumes the default log level while testing is trace
    default_log_level = "trace"

    def set_level_and_await(level, expected_line, timeout_sec,
                            **level_options):
        # Start monitoring before the level changes, then wait for the line.
        with self.redpanda.monitor_log(node) as mon:
            admin.set_log_level("admin_api_server", level, **level_options)
            mon.wait_until(expected_line,
                           timeout_sec=timeout_sec,
                           backoff_sec=1,
                           err_msg="Never saw message")

    # set to warn level. message seen at trace
    set_level_and_await(
        "warn",
        f"Set log level for {{admin_api_server}}: {default_log_level} -> warn",
        5)

    # set to debug. log level at warn, so shouldn't see it
    try:
        set_level_and_await(
            "debug", "Set log level for {admin_api_server}: warn -> debug",
            10)
    except ducktape.errors.TimeoutError:
        pass
    else:
        assert False, "Should not have seen message"

    # should now see it again
    set_level_and_await(
        "info", "Set log level for {admin_api_server}: debug -> info", 5)

    # A level set with an expiry is expected to be reported as expiring.
    set_level_and_await(
        "debug",
        f"Expiring log level for {{admin_api_server}} to {default_log_level}",
        10,
        expires=5)
def decommissioned():
    """
    Return True when `node_id` is already draining or when the
    decommission request answers with HTTP 200; return False on retry,
    connection, or HTTP errors raised by `requests`.
    """
    tolerated = (requests.exceptions.RetryError,
                 requests.exceptions.ConnectionError,
                 requests.exceptions.HTTPError)
    try:
        admin = Admin(self.redpanda)

        # if broker is already draining, it is success
        already_draining = any(
            entry['node_id'] == node_id
            and entry['membership_status'] == 'draining'
            for entry in admin.get_brokers())
        if already_draining:
            return True

        response = admin.decommission_broker(id=node_id)
        return response.status_code == 200
    except tolerated:
        return False