def _failure_injector_loop(self): while self.enable_failures: f_injector = FailureInjector(self.redpanda) f_injector.inject_failure(self._next_failure()) delay = self.failure_delay_provier() self.redpanda.logger.info( f"waiting {delay} seconds before next failure") time.sleep(delay)
def test_follower_isolation(self):
    """
    Isolate one follower of our partition (the second entry of the replica
    list) and call ``self._ping_pong()`` 128 times, expecting none of the
    calls to fail.
    """
    # Find which node is the leader; it must be the first replica.
    leader_id, replica_ids = self._wait_for_leader()
    assert leader_id == replica_ids[0]

    self._expect_available()

    leader = self.redpanda.get_node(leader_id)
    self.logger.info(f"Initial leader {leader_id} {leader.account.hostname}")

    with FailureInjector(self.redpanda) as fi:
        # isolate one of the followers
        isolation = FailureSpec(FailureSpec.FAILURE_ISOLATE,
                                self.redpanda.get_node(replica_ids[1]))
        fi.inject_failure(isolation)

        # expect messages to be produced and consumed without a timeout
        # NOTE(review): nesting reconstructed from a flattened source --
        # confirm the loop belongs inside the `with` block.
        for _ in range(128):
            self._ping_pong()
def test_id_allocator_leader_isolation(self):
    """
    Isolate id allocator leader.

    This test validates whether the cluster is still available when the
    `kafka_internal/id_allocator` leader has been isolated.
    """
    admin = Admin(self.redpanda)

    self._expect_available()

    # Wait for a stable configuration of the id allocator partition, then
    # look up which node leads partition 0.
    admin.wait_stable_configuration(namespace='kafka_internal',
                                    topic='id_allocator',
                                    replication=3)
    allocator_leader_id = admin.get_partition_leader(
        namespace='kafka_internal', topic='id_allocator', partition=0)
    allocator_leader = self.redpanda.get_node(allocator_leader_id)
    self.logger.info(
        f"kafka_internal/id_allocator/0 leader: {allocator_leader_id}, node: {allocator_leader.account.hostname}"
    )

    self._expect_available()

    with FailureInjector(self.redpanda) as fi:
        # isolate id_allocator
        isolation = FailureSpec(FailureSpec.FAILURE_ISOLATE,
                                self.redpanda.get_node(allocator_leader_id))
        fi.inject_failure(isolation)

        # expect messages to be produced and consumed without a timeout
        # NOTE(review): nesting reconstructed from a flattened source --
        # confirm these checks belong inside the `with` block.
        connection = self.ping_pong()
        # The first call passes an explicit timeout and retry count; the
        # remaining 127 calls use the defaults.
        connection.ping_pong(timeout_s=10, retries=10)
        for _ in range(127):
            connection.ping_pong()
class ProcessKill(DisruptiveAction):
    """Disruptive action that kills the redpanda process on a target node.

    The action marks itself as reversible: ``do_reverse_action`` starts
    redpanda again on the node affected by the most recent ``do_action``.
    """

    # Timeout, in seconds, for a redpanda pid to show up after a restart.
    PROCESS_START_WAIT_SEC = 20
    # Pause, in seconds, between successive pid checks during that wait.
    PROCESS_START_WAIT_BACKOFF = 2

    def __init__(self, redpanda: RedpandaService, config: ActionConfig,
                 admin: Admin):
        """Forward all arguments to the base class and create the injector."""
        super().__init__(redpanda, config, admin)
        self.failure_injector = FailureInjector(self.redpanda)
        self.is_reversible = True

    def max_affected_nodes_reached(self):
        """Return True once ``config.max_affected_nodes`` nodes are affected."""
        return len(self.affected_nodes) >= self.config.max_affected_nodes

    def do_action(self):
        """Kill redpanda on the node returned by ``self.target_node()``.

        Returns:
            The affected node, or ``None`` when no target node is available.
        """
        node = self.target_node()
        if not node:
            # Logger.warn is a deprecated alias; use Logger.warning.
            self.redpanda.logger.warning('no usable node')
            return None

        self.redpanda.logger.info(
            f'executing action on {node.account.hostname}')
        self.failure_injector.inject_failure(
            FailureSpec(FailureSpec.FAILURE_KILL, node))
        self.affected_nodes.add(node)
        self.last_affected_node = node

        # Update started_nodes so storage validations are run
        # on the correct set of nodes later.
        self.redpanda.remove_from_started_nodes(node)
        return node

    def do_reverse_action(self):
        """Restart redpanda on the last affected node and clear that record.

        Returns:
            The node that was restarted.
        """
        node = self.last_affected_node
        self._start_rp(node=node)
        self.affected_nodes.remove(node)
        self.redpanda.add_to_started_nodes(node)
        self.last_affected_node = None
        return node

    def _start_rp(self, node):
        """Start redpanda on ``node`` and wait until it reports a pid.

        ``wait_until`` is given ``PROCESS_START_WAIT_SEC`` as its timeout and
        the message below as ``err_msg`` for the case where no pid appears.
        """
        self.failure_injector._start(node)
        wait_until(
            lambda: self.redpanda.redpanda_pid(node),
            timeout_sec=self.PROCESS_START_WAIT_SEC,
            backoff_sec=self.PROCESS_START_WAIT_BACKOFF,
            err_msg=
            f'Failed to start redpanda process on {node.account.hostname}')
def failure_injector_loop():
    """Inject randomly chosen failures while ``enable_failures`` is truthy.

    A suspend failure may hit any node and lasts 1-10 (randomly drawn);
    every other failure type is limited to the nodes listed in
    ``self.active_nodes``. After each injection the loop sleeps for a
    random 20-45 seconds.
    """
    injector = FailureInjector(self.redpanda)
    while enable_failures:
        failure_type = random.choice(FailureSpec.FAILURE_TYPES)

        if failure_type == FailureSpec.FAILURE_SUSPEND:
            # allow suspending any node
            duration = random.randint(1, 10)
            target = random.choice(self.redpanda.nodes)
        else:
            # kill/terminate only active nodes (not to influence the test outcome)
            duration = 0
            node_id = random.choice(list(self.active_nodes))
            # Entries of active_nodes are used as 1-based positions in nodes.
            target = self.redpanda.nodes[node_id - 1]

        spec = FailureSpec(node=target, type=failure_type, length=duration)
        injector.inject_failure(spec)

        pause_s = random.randint(20, 45)
        self.redpanda.logger.info(
            f"waiting {pause_s} seconds before next failure")
        time.sleep(pause_s)
def test_controller_node_isolation(self):
    """
    Isolate controller node, expect cluster to be available
    """
    admin = Admin(self.redpanda)

    # wait for controller
    wait_until(lambda: self.redpanda.controller() is not None,
               timeout_sec=ELECTION_TIMEOUT * 2,
               backoff_sec=1)

    leader_id, replica_ids = self._wait_for_leader()
    assert leader_id == replica_ids[0]
    self._expect_available()

    allocator_info = admin.wait_stable_configuration(
        "id_allocator",
        namespace="kafka_internal",
        replication=3,
        timeout_s=ELECTION_TIMEOUT * 2)

    # isolate controller
    with FailureInjector(self.redpanda) as fi:
        controller_host = self.redpanda.controller().account.hostname
        controller_id = self.redpanda.idx(controller_host)
        fi.inject_failure(
            FailureSpec(FailureSpec.FAILURE_ISOLATE,
                        self.redpanda.controller()))

        if allocator_info.leader == controller_id:
            # The id allocator leader was the isolated controller: query the
            # remaining hosts until a leader other than the controller is
            # reported.
            remaining_hosts = [
                n.account.hostname for n in self.redpanda.nodes
                if self.redpanda.idx(n) != controller_id
            ]
            admin.await_stable_leader(
                "id_allocator",
                namespace="kafka_internal",
                replication=3,
                timeout_s=ELECTION_TIMEOUT * 2,
                hosts=remaining_hosts,
                check=lambda node_id: node_id != controller_id)

    # NOTE(review): nesting reconstructed from a flattened source -- confirm
    # these checks belong after (not inside) the `with` block.
    connection = self.ping_pong()
    # The first call passes an explicit timeout and retry count; the
    # remaining 127 calls use the defaults.
    connection.ping_pong(timeout_s=10, retries=10)
    for _ in range(127):
        connection.ping_pong()
def failure_injector_loop():
    """Inject randomly chosen failures while ``enable_failures`` is truthy.

    A suspend failure may hit any node, with a length drawn between 1 and
    ``NodeOperationFuzzyTest.max_suspend_duration_seconds``; every other
    failure type is limited to the nodes listed in ``self.active_nodes``.
    The sleep between injections is drawn between
    ``min_inter_failure_time`` and ``max_inter_failure_time``.
    """
    injector = FailureInjector(self.redpanda)
    while enable_failures:
        failure_type = random.choice(FailureSpec.FAILURE_TYPES)

        if failure_type == FailureSpec.FAILURE_SUSPEND:
            # allow suspending any node
            duration = random.randint(
                1, NodeOperationFuzzyTest.max_suspend_duration_seconds)
            target = random.choice(self.redpanda.nodes)
        else:
            # kill/terminate only active nodes (not to influence the test outcome)
            duration = 0
            node_id = random.choice(list(self.active_nodes))
            target = self.redpanda.get_node(node_id)

        spec = FailureSpec(node=target, type=failure_type, length=duration)
        injector.inject_failure(spec)

        pause_s = random.randint(
            NodeOperationFuzzyTest.min_inter_failure_time,
            NodeOperationFuzzyTest.max_inter_failure_time)
        self.redpanda.logger.info(
            f"waiting {pause_s} seconds before next failure")
        time.sleep(pause_s)
def failure_injector_loop():
    """Inject randomly chosen failures while ``failures`` is truthy.

    The target is drawn at random from ``self.redpanda.nodes`` and redrawn
    for as long as its index is in ``suppressed``. Only suspend failures get
    a non-zero length, drawn between 1 and
    ``ConsumerOffsetsMigrationTest.max_suspend_duration_sec``.
    """
    injector = FailureInjector(self.redpanda)
    while failures:
        failure_type = random.choice(FailureSpec.FAILURE_TYPES)

        # Keep drawing until the chosen node is not suppressed.
        while True:
            target = random.choice(self.redpanda.nodes)
            if self.redpanda.idx(target) not in suppressed:
                break

        if failure_type == FailureSpec.FAILURE_SUSPEND:
            duration = random.randint(
                1, ConsumerOffsetsMigrationTest.max_suspend_duration_sec)
        else:
            duration = 0

        spec = FailureSpec(node=target, type=failure_type, length=duration)
        injector.inject_failure(spec)

        pause_s = random.randint(
            ConsumerOffsetsMigrationTest.min_inter_failure_time_sec,
            ConsumerOffsetsMigrationTest.max_inter_failure_time_sec)
        self.redpanda.logger.info(
            f"waiting {pause_s} seconds before next failure")
        time.sleep(pause_s)
def test_follower_isolation(self):
    """
    Isolate a follower of our partition and validate that the cluster
    remains available afterwards.

    The isolated node is the first replica that is neither the partition
    leader nor the reported `kafka_internal/id_allocator` leader.
    """
    admin = Admin(self.redpanda)

    # Find which node is the leader
    initial_leader_id, replicas = self._wait_for_leader()
    assert initial_leader_id == replicas[0]

    self._expect_available()

    leader_node = self.redpanda.get_node(initial_leader_id)
    self.logger.info(
        f"Initial leader {initial_leader_id} {leader_node.account.hostname}"
    )

    allocator_info = admin.wait_stable_configuration(
        "id_allocator",
        namespace="kafka_internal",
        replication=3,
        timeout_s=ELECTION_TIMEOUT * 2)

    # Pick the first replica that leads neither our partition nor the
    # id allocator partition.
    follower = next(
        (replica for replica in replicas
         if replica != initial_leader_id
         and replica != allocator_info.leader),
        None)
    assert follower is not None, "no replica is eligible for isolation"

    with FailureInjector(self.redpanda) as fi:
        # isolate one of the followers
        fi.inject_failure(
            FailureSpec(FailureSpec.FAILURE_ISOLATE,
                        self.redpanda.get_node(follower)))

        # expect messages to be produced and consumed without a timeout
        # NOTE(review): nesting reconstructed from a flattened source --
        # confirm these checks belong inside the `with` block.
        connection = self.ping_pong()
        connection.ping_pong(timeout_s=10, retries=10)
        for _ in range(127):
            connection.ping_pong()
def test_controller_node_isolation(self):
    """
    Isolate controller node, expect cluster to be available
    """
    # wait for controller
    wait_until(lambda: self.redpanda.controller() is not None,
               timeout_sec=10,
               backoff_sec=1)

    # isolate controller
    with FailureInjector(self.redpanda) as fi:
        isolation = FailureSpec(FailureSpec.FAILURE_ISOLATE,
                                self.redpanda.controller())
        fi.inject_failure(isolation)

    # NOTE(review): nesting reconstructed from a flattened source -- confirm
    # this loop belongs after (not inside) the `with` block.
    for _ in range(128):
        self._ping_pong()
def test_metadata_request_does_not_contain_failed_node(self, failure, node):
    """
    Check if broker list returned from metadata request does not contain
    node which is not alive

    `failure` selects the disruption: "isolate" injects an isolation
    failure, anything else stops the node. `node` selects the target:
    'controller' picks the controller, anything else a non-controller node.
    """
    # validate initial conditions
    # NOTE(review): the predicate returns `self.controller_present` without
    # calling it; if that is a method rather than a property, the value is
    # always truthy and this wait returns immediately -- confirm.
    wait_until(lambda: self.controller_present, 10, 1)

    rpk = RpkTool(self.redpanda)
    brokers = rpk.cluster_info()
    assert len(brokers) == 3

    expected_ids = sorted(self.redpanda.idx(n) for n in self.redpanda.nodes)
    reported_ids = sorted(b.id for b in brokers)
    assert expected_ids == reported_ids

    def pick_target():
        if node == 'controller':
            return self.redpanda.controller()

        # Start from the first node and redraw at random while the
        # candidate is the controller.
        candidate = self.redpanda.nodes[0]
        while candidate == self.redpanda.controller():
            candidate = random.choice(self.redpanda.nodes)
        return candidate

    target = pick_target()
    node_id = self.redpanda.idx(target)
    self.redpanda.logger.info(
        f"Injecting failure on node {target.account.hostname} with id: {node_id}"
    )

    with FailureInjector(self.redpanda) as fi:
        if failure == "isolate":
            fi.inject_failure(
                FailureSpec(FailureSpec.FAILURE_ISOLATE, target))
        else:
            self.redpanda.stop_node(target)

        # NOTE(review): nesting reconstructed from a flattened source --
        # confirm the wait below belongs inside the `with` block.
        rpk = RpkTool(self.redpanda)

        def contains_only_alive_nodes():
            current = rpk.cluster_info()
            current_ids = [b.id for b in current]
            return len(current) == 2 and node_id not in current_ids

        wait_until(contains_only_alive_nodes, 30, 1)
def inject_failure(self, spec):
    """Inject ``spec`` through a new ``FailureInjector`` for ``self.redpanda``."""
    FailureInjector(self.redpanda).inject_failure(spec)
def __init__(self, redpanda: RedpandaService, config: ActionConfig,
             admin: Admin):
    """Initialise the base action, then set up this action's own state.

    All three arguments are forwarded unchanged to the base-class
    constructor.
    """
    super(ProcessKill, self).__init__(redpanda, config, admin)

    # This action advertises that it can be reversed.
    self.is_reversible = True
    # Injector bound to the service instance stored on self.
    self.failure_injector = FailureInjector(self.redpanda)