class AvailabilityTests(EndToEndFinjectorTest):
    """Availability tests for a three-broker cluster with injected failures."""

    def validate_records(self):
        """Run the producer/consumer validation with scale-dependent limits.

        The CI and release scales require more records and get longer
        producer/consumer timeouts than the default values.
        """
        min_records = 40000
        producer_timeout_sec = 60
        consumer_timeout_sec = 60

        if self.scale.ci or self.scale.release:
            min_records = 100000
            producer_timeout_sec = 180
            consumer_timeout_sec = 180

        self.run_validation(min_records=min_records,
                            enable_idempotence=False,
                            producer_timeout_sec=producer_timeout_sec,
                            consumer_timeout_sec=consumer_timeout_sec)

    def _start_cluster_with_workload(self):
        """Start three brokers, create ``test-topic`` and start the workload.

        Sets ``self.redpanda`` and ``self.topic``, then starts one producer
        and one consumer and waits for them via ``await_startup``.
        """
        self.redpanda = RedpandaService(
            self.test_context,
            3,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 1,
                "default_topic_replications": 3,
            })
        self.redpanda.start()

        spec = TopicSpec(name="test-topic",
                         partition_count=6,
                         replication_factor=3)
        self.redpanda.create_topic(spec)
        self.topic = spec.name

        self.start_producer(1, throughput=10000)
        self.start_consumer(1)
        self.await_startup()

    @cluster(num_nodes=5)
    def test_availability_when_one_node_failed(self):
        """Validate records while the failure injector runs with its defaults."""
        self._start_cluster_with_workload()
        # start failure injector with default parameters
        self.start_finjector()
        self.validate_records()

    @cluster(num_nodes=5)
    def test_recovery_after_catastrophic_failure(self):
        """Validate records after failures were injected on two of three nodes."""
        self._start_cluster_with_workload()

        # inject permanent random failure (no length given) on one of the
        # first two nodes; the slice must be [0:2], as [0:1] holds a single
        # node and would make the random choice always pick node 0
        f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                             random.choice(self.redpanda.nodes[0:2]))
        self.inject_failure(f_spec)

        # inject transient failure on other node
        f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                             self.redpanda.nodes[2],
                             length=2.0 if self.scale.local else 15.0)
        self.inject_failure(f_spec)

        self.validate_records()
class NodeOperationFuzzyTest(EndToEndTest):
    """Runs a random sequence of node and topic operations against a cluster."""

    def generate_random_workload(self, count, skip_nodes):
        """Build a random list of node and topic operations.

        Each of the ``count`` iterations appends one node operation
        (``(ADD, node_id)`` or ``(DECOMMISSION, node_id)``) followed by one
        topic operation (``(ADD_TOPIC, name, replication_factor, 3)`` or
        ``(DELETE_TOPIC, name)``).

        Node 1 and every node in ``skip_nodes`` are never decommissioned, and
        at most two nodes are decommissioned at any point in the sequence.
        A topic is only deleted after an earlier operation in the same
        sequence added it.

        :param count: number of iterations (two operations per iteration)
        :param skip_nodes: collection of node ids excluded from decommission
        :return: list of operation tuples in execution order
        """
        op_types = [ADD, DECOMMISSION]
        tp_op_types = [ADD_TOPIC, DELETE_TOPIC]
        # current state
        active_nodes = [1, 2, 3, 4, 5]
        decommissioned_nodes = []
        operations = []
        topics = []

        def eligible_active_nodes():
            return [
                n for n in active_nodes if n != 1 and n not in skip_nodes
            ]

        def decommission(node_id):
            active_nodes.remove(node_id)
            decommissioned_nodes.append(node_id)

        def add(node_id):
            active_nodes.append(node_id)
            decommissioned_nodes.remove(node_id)

        for _ in range(count):
            # node operation: forced when none or two nodes are already
            # decommissioned, random otherwise
            if len(decommissioned_nodes) == 2:
                op = ADD
            elif not decommissioned_nodes:
                op = DECOMMISSION
            else:
                op = random.choice(op_types)

            if op == DECOMMISSION:
                node_id = random.choice(eligible_active_nodes())
                operations.append((DECOMMISSION, node_id))
                decommission(node_id)
            elif op == ADD:
                node_id = random.choice(decommissioned_nodes)
                operations.append((ADD, node_id))
                add(node_id)

            # topic operation
            op = random.choice(tp_op_types) if topics else ADD_TOPIC

            if op == ADD_TOPIC:
                name = f"test-topic-{random.randint(0,2000)}-{round(time.time()*1000000)}"
                # remember the topic so that a later iteration may delete it;
                # without this the DELETE_TOPIC branch is never reached
                topics.append(name)
                operations.append(
                    (ADD_TOPIC, name, random.choice(ALLOWED_REPLICATION), 3))
            else:
                name = random.choice(topics)
                # a topic can be deleted only once
                topics.remove(name)
                operations.append((DELETE_TOPIC, name))

        return operations

    def _create_random_topics(self, count):
        """Create ``count`` topics named ``topic-<i>`` and return their specs.

        Partition count (1 to 10) and replication factor (one of
        ``ALLOWED_REPLICATION``) are chosen randomly for every topic.
        """
        max_partitions = 10

        topics = []
        for i in range(count):
            spec = TopicSpec(
                name=f"topic-{i}",
                partition_count=random.randint(1, max_partitions),
                replication_factor=random.choice(ALLOWED_REPLICATION))
            topics.append(spec)

        for spec in topics:
            self.redpanda.create_topic(spec)

        return topics

    @cluster(num_nodes=7)
    @parametrize(enable_failures=True)
    @parametrize(enable_failures=False)
    def test_node_opeartions(self, enable_failures):
        """Execute a random workload of node and topic operations.

        Adding nodes to the cluster should result in partition reallocations
        to new nodes. When ``enable_failures`` is set, a background thread
        injects random failures while the workload runs. The producer/consumer
        validation runs after the workload finished.
        """
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        # ids of nodes that are currently not decommissioned
        self.active_nodes = {1, 2, 3, 4, 5}
        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()
        NODE_OP_TIMEOUT = 360

        def failure_injector_loop():
            f_injector = FailureInjector(self.redpanda)
            # enable_failures is rebound to False by the enclosing function
            # once the workload is done, which ends this loop
            while enable_failures:
                f_type = random.choice(FailureSpec.FAILURE_TYPES)
                length = 0
                # allow suspending any node
                if f_type == FailureSpec.FAILURE_SUSPEND:
                    length = random.randint(1, 10)
                    node = random.choice(self.redpanda.nodes)
                else:
                    # kill/terminate only active nodes (not to influence the
                    # test outcome)
                    idx = random.choice(list(self.active_nodes)) - 1
                    node = self.redpanda.nodes[idx]

                f_injector.inject_failure(
                    FailureSpec(node=node, type=f_type, length=length))

                delay = random.randint(20, 45)
                self.redpanda.logger.info(
                    f"waiting {delay} seconds before next failure")
                time.sleep(delay)

        if enable_failures:
            finjector_thread = threading.Thread(target=failure_injector_loop,
                                                args=())
            finjector_thread.daemon = True
            finjector_thread.start()

        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")

            def decommissioned():
                try:
                    admin = Admin(self.redpanda)
                    # if broker is already draining, it is success
                    for b in admin.get_brokers():
                        if (b['node_id'] == node_id
                                and b['membership_status'] == 'draining'):
                            return True

                    r = admin.decommission_broker(id=node_id)
                    return r.status_code == 200
                except (requests.exceptions.RetryError,
                        requests.exceptions.ConnectionError,
                        requests.exceptions.HTTPError):
                    # treated as "not yet"; wait_until retries
                    return False

            wait_until(decommissioned,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

            def node_removed():
                admin = Admin(self.redpanda)
                try:
                    brokers = admin.get_brokers(node=self.redpanda.nodes[0])
                    return all(b['node_id'] != node_id for b in brokers)
                except Exception:
                    # any request/parsing error means "not yet"; unlike a
                    # bare except this lets KeyboardInterrupt/SystemExit out
                    return False

            wait_until(node_removed,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            # maps replica id -> number of partition replicas reported for it
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        replica_id = r['id']
                        node_replicas[replica_id] = node_replicas.get(
                            replica_id, 0) + 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            # node ids are 1-based, the nodes list is 0-based
            node = self.redpanda.nodes[node_id - 1]
            self.redpanda.stop_node(node)
            if cleanup:
                self.redpanda.clean_node(node, preserve_logs=True)
            self.redpanda.start_node(node)

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas,
                       timeout_sec=NODE_OP_TIMEOUT,
                       backoff_sec=2)

        def is_topic_present(name):
            kcl = KCL(self.redpanda)
            lines = kcl.list_topics().splitlines()
            self.redpanda.logger.debug(
                f"checking if topic {name} is present in {lines}")
            return any(line.startswith(name) for line in lines)

        def create_topic(spec):
            # a failed request is only logged: the listing below decides
            # whether the topic exists
            try:
                self.redpanda.create_topic(spec)
            except Exception as e:
                self.redpanda.logger.warning(
                    f"error creating topic {spec.name} - {e}")
            try:
                return is_topic_present(spec.name)
            except Exception as e:
                self.redpanda.logger.warning(
                    f"error while listing topics - {e}")
                return False

        def delete_topic(name):
            # a failed request is only logged: the listing below decides
            # whether the topic is gone
            try:
                self.redpanda.delete_topic(name)
            except Exception as e:
                self.redpanda.logger.warning(
                    f"error deleting topic {name} - {e}")
            try:
                return not is_topic_present(name)
            except Exception as e:
                self.redpanda.logger.warning(
                    f"error while listing topics - {e}")
                return False

        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        try:
            for op in work:
                op_type = op[0]
                self.logger.info(f"executing - {op}")
                if op_type == ADD:
                    node_id = op[1]
                    self.active_nodes.add(node_id)
                    restart_node(node_id)
                elif op_type == DECOMMISSION:
                    node_id = op[1]
                    self.active_nodes.remove(node_id)
                    decommission(node_id)
                elif op_type == ADD_TOPIC:
                    spec = TopicSpec(name=op[1],
                                     replication_factor=op[2],
                                     partition_count=op[3])
                    wait_until(lambda: create_topic(spec),
                               timeout_sec=180,
                               backoff_sec=2)
                elif op_type == DELETE_TOPIC:
                    wait_until(lambda: delete_topic(op[1]),
                               timeout_sec=180,
                               backoff_sec=2)
        finally:
            # stop the failure injector loop, also when an operation failed
            enable_failures = False

        self.run_validation(enable_idempotence=False,
                            producer_timeout_sec=60,
                            consumer_timeout_sec=180)
class NodeOperationFuzzyTest(EndToEndTest):
    """Runs a random sequence of node and topic operations against a cluster."""

    def generate_random_workload(self, count, skip_nodes):
        """Build a random list of node and topic operations.

        Each of the ``count`` iterations appends one node operation
        (``(ADD, node_id)`` or ``(DECOMMISSION, node_id)``) followed by one
        topic operation (``(ADD_TOPIC, name, replication_factor, 3)`` or
        ``(DELETE_TOPIC, name)``).

        Node 1 and every node in ``skip_nodes`` are never decommissioned, and
        at most two nodes are decommissioned at any point in the sequence.
        A topic is only deleted after an earlier operation in the same
        sequence added it.

        :param count: number of iterations (two operations per iteration)
        :param skip_nodes: collection of node ids excluded from decommission
        :return: list of operation tuples in execution order
        """
        op_types = [ADD, DECOMMISSION]
        tp_op_types = [ADD_TOPIC, DELETE_TOPIC]
        # current state
        active_nodes = [1, 2, 3, 4, 5]
        decommissioned_nodes = []
        operations = []
        topics = []

        def eligible_active_nodes():
            return [
                n for n in active_nodes if n != 1 and n not in skip_nodes
            ]

        def decommission(node_id):
            active_nodes.remove(node_id)
            decommissioned_nodes.append(node_id)

        def add(node_id):
            active_nodes.append(node_id)
            decommissioned_nodes.remove(node_id)

        for _ in range(count):
            # node operation: forced when none or two nodes are already
            # decommissioned, random otherwise
            if len(decommissioned_nodes) == 2:
                op = ADD
            elif not decommissioned_nodes:
                op = DECOMMISSION
            else:
                op = random.choice(op_types)

            if op == DECOMMISSION:
                node_id = random.choice(eligible_active_nodes())
                operations.append((DECOMMISSION, node_id))
                decommission(node_id)
            elif op == ADD:
                node_id = random.choice(decommissioned_nodes)
                operations.append((ADD, node_id))
                add(node_id)

            # topic operation
            op = random.choice(tp_op_types) if topics else ADD_TOPIC

            if op == ADD_TOPIC:
                name = f"test-topic-{random.randint(0,2000)}-{time.time()*1000.0}"
                # remember the topic so that a later iteration may delete it;
                # without this the DELETE_TOPIC branch is never reached
                topics.append(name)
                operations.append(
                    (ADD_TOPIC, name, random.choice(ALLOWED_REPLICATION), 3))
            else:
                name = random.choice(topics)
                # a topic can be deleted only once
                topics.remove(name)
                operations.append((DELETE_TOPIC, name))

        return operations

    def _create_random_topics(self, count):
        """Create ``count`` topics named ``topic-<i>`` and return their specs.

        Partition count (1 to 10) and replication factor (one of
        ``ALLOWED_REPLICATION``) are chosen randomly for every topic.
        """
        max_partitions = 10

        topics = []
        for i in range(count):
            spec = TopicSpec(
                name=f"topic-{i}",
                partition_count=random.randint(1, max_partitions),
                replication_factor=random.choice(ALLOWED_REPLICATION))
            topics.append(spec)

        for spec in topics:
            self.redpanda.create_topic(spec)

        return topics

    @cluster(num_nodes=7)
    def test_node_opeartions(self):
        """Execute a random workload of node and topic operations.

        Adding nodes to the cluster should result in partition reallocations
        to new nodes. The producer/consumer validation runs after the whole
        workload was executed.
        """
        # allocate 5 nodes for the cluster
        self.redpanda = RedpandaService(
            self.test_context,
            5,
            KafkaCliTools,
            extra_rp_conf={
                "enable_auto_rebalance_on_node_add": True,
                "group_topic_partitions": 3,
                "default_topic_replications": 3,
            })
        # start all allocated nodes
        self.redpanda.start()
        # create some topics
        topics = self._create_random_topics(10)
        self.redpanda.logger.info(f"using topics: {topics}")
        # select one of the topics to use in consumer/producer
        self.topic = random.choice(topics).name

        self.start_producer(1, throughput=100)
        self.start_consumer(1)
        self.await_startup()

        def decommission(node_id):
            self.logger.info(f"decommissioning node: {node_id}")
            admin = Admin(self.redpanda)
            admin.decommission_broker(id=node_id)

            def node_removed():
                admin = Admin(self.redpanda)
                brokers = admin.get_brokers()
                return all(b['node_id'] != node_id for b in brokers)

            wait_until(node_removed, timeout_sec=240, backoff_sec=2)

        kafkacat = KafkaCat(self.redpanda)

        def replicas_per_node():
            # maps replica id -> number of partition replicas reported for it
            node_replicas = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic in md['topics']:
                for p in topic['partitions']:
                    for r in p['replicas']:
                        replica_id = r['id']
                        node_replicas[replica_id] = node_replicas.get(
                            replica_id, 0) + 1

            return node_replicas

        def restart_node(node_id, cleanup=True):
            self.logger.info(f"restarting node: {node_id}")
            # node ids are 1-based, the nodes list is 0-based
            node = self.redpanda.nodes[node_id - 1]
            self.redpanda.stop_node(node)
            if cleanup:
                self.redpanda.clean_node(node)
            self.redpanda.start_node(node)
            # NOTE(review): presumably repeated here because a restarted node
            # loses the runtime log level - confirm
            admin = Admin(self.redpanda)
            admin.set_log_level("cluster", "trace")

            def has_new_replicas():
                per_node = replicas_per_node()
                self.logger.info(f"replicas per node: {per_node}")
                return node_id in per_node

            wait_until(has_new_replicas, timeout_sec=240, backoff_sec=2)

        admin = Admin(self.redpanda)
        admin.set_log_level("cluster", "trace")
        work = self.generate_random_workload(10, skip_nodes=set())
        self.redpanda.logger.info(f"node operations to execute: {work}")
        for op in work:
            op_type = op[0]
            self.logger.info(f"executing - {op}")
            if op_type == ADD:
                restart_node(op[1])
            elif op_type == DECOMMISSION:
                decommission(op[1])
            elif op_type == ADD_TOPIC:
                spec = TopicSpec(name=op[1],
                                 replication_factor=op[2],
                                 partition_count=op[3])
                self.redpanda.create_topic(spec)
            elif op_type == DELETE_TOPIC:
                self.redpanda.delete_topic(op[1])

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=180)
class FetchAfterDeleteTest(Test):
    """Checks consuming with a group whose committed offset was removed."""

    def __init__(self, test_context):
        super().__init__(test_context)
        self.scale = Scale(test_context)

    @cluster(num_nodes=3)
    @parametrize(transactions_enabled=True)
    @parametrize(transactions_enabled=False)
    def test_fetch_after_committed_offset_was_removed(self,
                                                      transactions_enabled):
        """
        Consume with a group, shrink the topic's retention so that the
        consumed offsets are removed, then check that the same group
        continues from the partition's new start offset.
        """
        segment_size = 1048576
        broker_conf = {
            "enable_transactions": transactions_enabled,
            "enable_idempotence": transactions_enabled,
            "log_compaction_interval_ms": 5000,
            "log_segment_size": segment_size,
            "enable_leader_balancer": False,
        }
        self.redpanda = RedpandaService(self.test_context,
                                        3,
                                        KafkaCliTools,
                                        extra_rp_conf=broker_conf)
        self.redpanda.start()

        topic = TopicSpec(partition_count=1,
                          replication_factor=3,
                          cleanup_policy=TopicSpec.CLEANUP_DELETE)
        self.redpanda.create_topic(topic)
        self.topic = topic.name

        kafka_tools = KafkaCliTools(self.redpanda)

        # keep producing until partition 0 has 10 segments
        produce_until_segments(
            self.redpanda,
            topic=self.topic,
            partition_idx=0,
            count=10,
        )
        consumer_group = 'test'
        rpk = RpkTool(self.redpanda)

        def consume(n=1):
            # the rpk output is cut at every closing brace; each piece that
            # still holds an opening brace gets the brace restored and is
            # parsed as one JSON record
            raw = rpk.consume(self.topic, group=consumer_group, n=n)
            return [
                json.loads(piece + "}") for piece in raw.split('}')
                if "{" in piece
            ]

        # consume from the beginning and remember the last offset that was read
        offset = consume(10)[-1]['offset']

        # limit the retained bytes to two segments
        kafka_tools.alter_topic_config(
            self.topic, {
                TopicSpec.PROPERTY_RETENTION_BYTES: 2 * segment_size,
            })

        wait_for_segments_removal(self.redpanda,
                                  self.topic,
                                  partition_idx=0,
                                  count=5)

        p = list(rpk.describe_topic(self.topic))[0]
        assert p.start_offset > offset

        # the offset consumed previously is below the partition start now, so
        # the next record of the group has to be the first one still available
        out = consume(1)
        assert out[0]['offset'] == p.start_offset
class ScalingUpTest(EndToEndTest):
    """
    Starting additional nodes should make the cluster move partition
    replicas onto them.
    """

    @cluster(num_nodes=5)
    def test_adding_nodes_to_cluster(self):
        """Start one broker, add two more one by one, wait for rebalancing."""
        self.redpanda = RedpandaService(self.test_context, 3, KafkaCliTools)
        # only the first node is started initially
        self.redpanda.start(nodes=[self.redpanda.nodes[0]])

        # topic0..topic3 with 1..4 partitions, a single replica each
        partition_counts = range(1, 5)
        topics = [
            TopicSpec(name=f"topic{idx}",
                      partition_count=partitions,
                      replication_factor=1)
            for idx, partitions in enumerate(partition_counts)
        ]
        total_replicas = sum(partition_counts)

        for spec in topics:
            self.redpanda.create_topic(spec)
        # the producer/consumer use the topic that was created last
        self.topic = topics[-1].name

        self.start_producer(1)
        self.start_consumer(1)
        self.await_startup()

        # add second node
        self.redpanda.start_node(self.redpanda.nodes[1])
        kafkacat = KafkaCat(self.redpanda)

        def _replicas_per_node():
            # maps replica id -> number of partition replicas reported for it
            counts = {}
            md = kafkacat.metadata()
            self.redpanda.logger.info(f"metadata: {md}")
            for topic_md in md['topics']:
                for partition_md in topic_md['partitions']:
                    for replica in partition_md['replicas']:
                        replica_id = replica['id']
                        counts[replica_id] = counts.get(replica_id, 0) + 1

            return counts

        def partitions_rebalanced():
            per_node = _replicas_per_node()
            self.redpanda.logger.info(f"replicas per node: {per_node}")
            # every started node hosts replicas, no replica is missing or
            # duplicated, and each hosting node has more than one replica
            return (len(per_node) >= len(self.redpanda.started_nodes())
                    and sum(per_node.values()) == total_replicas
                    and all(count > 1 for count in per_node.values()))

        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)

        # add third node
        self.redpanda.start_node(self.redpanda.nodes[2])
        wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)

        self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
class MetricsReporterTest(Test):
    """Checks the reports the cluster sends to the configured metrics URL."""

    def __init__(self, test_ctx, *args, **kwargs):
        self._ctx = test_ctx
        super().__init__(test_context=test_ctx)

    @cluster(num_nodes=4)
    def test_redpanda_metrics_reporting(self):
        """
        Point the metrics reporter at a local HTTP server, create topics and
        verify the content of the reports received by that server.
        """
        # setup http server
        http = HttpServer(self._ctx)
        http.start()

        # short intervals so that reports arrive quickly
        self.redpanda = RedpandaService(
            self.test_context,
            3,
            KafkaCliTools,
            extra_rp_conf={
                "health_monitor_tick_interval": 1000,
                "metrics_reporter_tick_interval": 2000,
                "metrics_reporter_report_interval": 1000,
                "enable_metrics_reporter": True,
                "metrics_reporter_url": f"{http.url}/metrics",
            })
        self.redpanda.start()

        # create topics with a random number of partitions each
        total_topics = 5
        total_partitions = 0
        for _ in range(total_topics):
            partitions = random.randint(1, 8)
            total_partitions += partitions
            self.redpanda.create_topic(
                [TopicSpec(partition_count=partitions, replication_factor=3)])

        self.redpanda.logger.info(
            f"created {total_topics} topics with {total_partitions} partitions"
        )

        def _state_up_to_date():
            # the newest report has to reflect all of the created topics
            if not http.requests:
                return False
            newest = json.loads(http.requests[-1]['body'])
            return newest['topic_count'] == total_topics

        wait_until(_state_up_to_date, 20, backoff_sec=1)
        http.stop()

        metadata = [json.loads(r['body']) for r in http.requests]
        for m in metadata:
            self.redpanda.logger.info(m)

        # cluster uuid and create timestamp should stay the same across requests
        for field in ('cluster_uuid', 'cluster_created_ts'):
            assert all(m[field] == metadata[0][field] for m in metadata)

        # get the last report
        last = metadata[-1]
        assert last['topic_count'] == total_topics
        assert last['partition_count'] == total_partitions

        nodes_meta = last['nodes']
        assert len(nodes_meta) == 3

        # every node entry has to carry these fields
        for key in ('node_id', 'cpu_count', 'version', 'uptime_ms',
                    'is_alive', 'disks'):
            assert all(key in n for n in nodes_meta)