def test_availability_when_one_node_failed(self):
    self.redpanda = RedpandaService(
        self.test_context,
        3,
        extra_rp_conf={
            "enable_auto_rebalance_on_node_add": True,
            "group_topic_partitions": 1,
            "default_topic_replications": 3,
        })
    self.redpanda.start()

    spec = TopicSpec(name="test-topic",
                     partition_count=6,
                     replication_factor=3)
    DefaultClient(self.redpanda).create_topic(spec)
    self.topic = spec.name

    self.start_producer(1, throughput=10000)
    self.start_consumer(1)
    self.await_startup()

    # start failure injector with default parameters
    self.start_finjector()

    self.validate_records()
def test_adding_nodes_to_cluster(self):
    self.redpanda = RedpandaService(
        self.test_context, 3, extra_rp_conf={"group_topic_partitions": 1})
    # start single node cluster
    self.redpanda.start(nodes=[self.redpanda.nodes[0]])

    # create some topics
    topics = []
    # include __consumer_offsets topic replica
    total_replicas = 1
    for partition_count in range(1, 5):
        name = f"topic{len(topics)}"
        spec = TopicSpec(name=name,
                         partition_count=partition_count,
                         replication_factor=1)
        total_replicas += partition_count
        topics.append(spec)

    for spec in topics:
        DefaultClient(self.redpanda).create_topic(spec)
        self.topic = spec.name

    self.start_producer(1)
    self.start_consumer(1)
    self.await_startup()

    # add second node
    self.redpanda.start_node(self.redpanda.nodes[1])
    kafkacat = KafkaCat(self.redpanda)

    def _replicas_per_node():
        node_replicas = {}
        md = kafkacat.metadata()
        self.redpanda.logger.info(f"metadata: {md}")
        for topic in md['topics']:
            for p in topic['partitions']:
                for r in p['replicas']:
                    id = r['id']
                    if id not in node_replicas:
                        node_replicas[id] = 0
                    node_replicas[id] += 1
        return node_replicas

    def partitions_rebalanced():
        per_node = _replicas_per_node()
        self.redpanda.logger.info(f"replicas per node: {per_node}")
        if len(per_node) < len(self.redpanda.started_nodes()):
            return False

        replicas = sum(per_node.values())
        if replicas != total_replicas:
            return False

        return all(p[1] > 1 for p in per_node.items())

    wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)

    # add third node
    self.redpanda.start_node(self.redpanda.nodes[2])
    wait_until(partitions_rebalanced, timeout_sec=30, backoff_sec=1)

    self.run_validation(enable_idempotence=False, consumer_timeout_sec=45)
def test_recovery_after_multiple_restarts(self):
    self.start_redpanda(3, extra_rp_conf=self._extra_rp_conf)
    spec = TopicSpec(partition_count=60, replication_factor=3)

    DefaultClient(self.redpanda).create_topic(spec)
    self.topic = spec.name

    rpk = RpkTool(self.redpanda)
    rpk.alter_topic_config(spec.name, 'redpanda.remote.write', 'true')
    rpk.alter_topic_config(spec.name, 'redpanda.remote.read', 'true')

    self.start_producer(1, throughput=100)
    self.start_consumer(1)
    self.await_startup()

    def no_under_replicated_partitions():
        metric_sample = self.redpanda.metrics_sample("under_replicated")
        for s in metric_sample.samples:
            if s.value > 0:
                return False
        return True

    # restart all the nodes and wait for recovery
    for i in range(0, 10):
        for n in self.redpanda.nodes:
            self.redpanda.signal_redpanda(n)
            self.redpanda.start_node(n)
        wait_until(no_under_replicated_partitions, 30, 2)

    self.run_validation(enable_idempotence=False,
                        producer_timeout_sec=60,
                        consumer_timeout_sec=180)
def delete_topic(name):
    try:
        DefaultClient(self.redpanda).delete_topic(name)
    except Exception as e:
        self.redpanda.logger.warn(f"error deleting topic {name} - {e}")
    try:
        return not is_topic_present(name)
    except Exception as e:
        self.redpanda.logger.warn(f"error while listing topics - {e}")
        return False
def create_topic(spec):
    try:
        DefaultClient(self.redpanda).create_topic(spec)
    except Exception as e:
        self.redpanda.logger.warn(
            f"error creating topic {spec.name} - {e}")
    try:
        return is_topic_present(spec.name)
    except Exception as e:
        self.redpanda.logger.warn(f"error while listing topics - {e}")
        return False
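# Note: both delete_topic() and create_topic() above call an
# is_topic_present() helper that is not included in this excerpt. A minimal
# sketch of such a helper (an assumption, not the original implementation)
# could reuse KCL's topic listing, as other tests shown here do:
def is_topic_present(name):
    kcl = KCL(self.redpanda)
    # the topic is considered present when it shows up in the cluster's
    # topic listing
    return name in set(kcl.list_topics())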
def start_redpanda(self, num_nodes=1, extra_rp_conf=None):
    if extra_rp_conf is not None:
        # merge both configurations; the extra_rp_conf passed in as a
        # parameter takes precedence
        self._extra_rp_conf = {**self._extra_rp_conf, **extra_rp_conf}

    assert self.redpanda is None
    self.redpanda = RedpandaService(self.test_context,
                                    num_nodes,
                                    extra_rp_conf=self._extra_rp_conf)
    self.redpanda.start()
    self._client = DefaultClient(self.redpanda)
def __init__(self,
             test_context,
             num_brokers=3,
             extra_rp_conf=dict(),
             enable_pp=False,
             enable_sr=False,
             num_cores=3):
    super(RedpandaTest, self).__init__(test_context)
    self.scale = Scale(test_context)
    self.redpanda = RedpandaService(test_context,
                                    num_brokers,
                                    extra_rp_conf=extra_rp_conf,
                                    enable_pp=enable_pp,
                                    enable_sr=enable_sr,
                                    num_cores=num_cores)
    self._client = DefaultClient(self.redpanda)
def _create_random_topics(self, count):
    max_partitions = 10

    topics = []
    for i in range(0, count):
        name = f"topic-{i}"
        spec = TopicSpec(
            name=name,
            partition_count=random.randint(1, max_partitions),
            replication_factor=random.choice(ALLOWED_REPLICATION))
        topics.append(spec)

    for spec in topics:
        DefaultClient(self.redpanda).create_topic(spec)

    return topics
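# ALLOWED_REPLICATION used above is defined outside this excerpt; a plausible
# module-level definition (an assumption for illustration only) is a small
# list of odd replication factors to choose from, e.g.:
ALLOWED_REPLICATION = [1, 3]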
def test_recovery_after_multiple_restarts(self):
    # If a debug build has to do a restart across a significant
    # number of partitions, it gets slow. Use fewer partitions
    # on debug builds.
    partition_count = 10 if self.debug_mode else 60

    si_settings = SISettings(cloud_storage_reconciliation_interval_ms=500,
                             cloud_storage_max_connections=5,
                             log_segment_size=self.log_segment_size)
    self.s3_bucket_name = si_settings.cloud_storage_bucket
    self.start_redpanda(3,
                        extra_rp_conf=self._extra_rp_conf,
                        si_settings=si_settings)

    spec = TopicSpec(partition_count=partition_count, replication_factor=3)
    DefaultClient(self.redpanda).create_topic(spec)
    self.topic = spec.name

    rpk = RpkTool(self.redpanda)
    rpk.alter_topic_config(spec.name, 'redpanda.remote.write', 'true')
    rpk.alter_topic_config(spec.name, 'redpanda.remote.read', 'true')

    self.start_producer(1, throughput=100)
    self.start_consumer(1)
    self.await_startup()

    def no_under_replicated_partitions():
        metric_sample = self.redpanda.metrics_sample("under_replicated")
        for s in metric_sample.samples:
            if s.value > 0:
                return False
        return True

    # restart all the nodes and wait for recovery
    for i in range(0, 10):
        for n in self.redpanda.nodes:
            self.redpanda.signal_redpanda(n)
            self.redpanda.start_node(n)
        wait_until(no_under_replicated_partitions, 30, 2)

    self.run_validation(enable_idempotence=False,
                        producer_timeout_sec=60,
                        consumer_timeout_sec=180)
def __init__(self,
             test_context,
             num_brokers=None,
             extra_rp_conf=dict(),
             enable_pp=False,
             enable_sr=False,
             si_settings=None,
             **kwargs):
    """
    Any trailing keyword arguments are passed through to the
    RedpandaService constructor.
    """
    super(RedpandaTest, self).__init__(test_context)
    self.scale = Scale(test_context)
    self.si_settings = si_settings

    if num_brokers is None:
        # Default to a 3 node cluster if sufficient nodes are available, else
        # a single node cluster. This is just a default: tests are welcome
        # to override constructor to pass an explicit size. This logic makes
        # it convenient to mix 3 node and 1 node cases in the same class, by
        # just modifying the @cluster node count per test.
        if test_context.cluster.available().size() >= 3:
            num_brokers = 3
        else:
            num_brokers = 1

    if self.si_settings:
        self.si_settings.load_context(self.logger, test_context)

    self.redpanda = RedpandaService(test_context,
                                    num_brokers,
                                    extra_rp_conf=extra_rp_conf,
                                    enable_pp=enable_pp,
                                    enable_sr=enable_sr,
                                    si_settings=self.si_settings,
                                    **kwargs)
    self._client = DefaultClient(self.redpanda)
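# Usage sketch (a hypothetical test class, not taken from the original code):
# because num_brokers defaults to None, a single RedpandaTest subclass can mix
# 3-node and 1-node cases simply by varying the @cluster node budget per test.
class ExampleRedpandaTest(RedpandaTest):
    def __init__(self, test_context):
        super(ExampleRedpandaTest, self).__init__(
            test_context, extra_rp_conf={"group_topic_partitions": 1})

    @cluster(num_nodes=3)
    def test_three_broker_case(self):
        # three nodes are available, so a 3 broker cluster is started
        pass

    @cluster(num_nodes=1)
    def test_single_broker_case(self):
        # only one node is available, so a single broker cluster is started
        pass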
def test_recovery_after_catastrophic_failure(self):
    self.redpanda = RedpandaService(
        self.test_context,
        3,
        extra_rp_conf={
            "enable_auto_rebalance_on_node_add": True,
            "group_topic_partitions": 1,
            "default_topic_replications": 3,
        })
    self.redpanda.start()

    spec = TopicSpec(name="test-topic",
                     partition_count=6,
                     replication_factor=3)
    DefaultClient(self.redpanda).create_topic(spec)
    self.topic = spec.name

    self.start_producer(1, throughput=10000)
    self.start_consumer(1)
    self.await_startup()

    # inject permanent random failure
    f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                         random.choice(self.redpanda.nodes[0:1]))
    self.inject_failure(f_spec)

    # inject transient failure on other node
    f_spec = FailureSpec(random.choice(FailureSpec.FAILURE_TYPES),
                         self.redpanda.nodes[2],
                         length=2.0 if self.scale.local else 15.0)
    self.inject_failure(f_spec)

    self.validate_records()
def test_cluster_is_available_during_upgrade_without_group_topic(self):
    '''
    Validates that the cluster is available and healthy during an upgrade
    when the `kafka_internal::group` topic is not present
    '''
    # set redpanda logical version to a value without __consumer_offsets support
    self.redpanda = RedpandaService(
        self.test_context,
        5,
        extra_rp_conf={
            "group_topic_partitions": 16,
            "default_topic_replications": 3,
        },
        environment={"__REDPANDA_LOGICAL_VERSION": 1})
    self.redpanda.start()
    self._client = DefaultClient(self.redpanda)

    spec = TopicSpec(partition_count=6, replication_factor=3)
    self.client().create_topic(spec)
    self.topic = spec.name

    def cluster_is_stable():
        admin = Admin(self.redpanda)
        brokers = admin.get_brokers()
        if len(brokers) < 3:
            return False

        for b in brokers:
            self.logger.debug(f"broker: {b}")
            if not (b['is_alive'] and 'disk_space' in b):
                return False

        return True

    def node_stopped(node_id):
        admin = Admin(self.redpanda)
        brokers = admin.get_brokers()

        for b in brokers:
            self.logger.debug(f"broker: {b}")
            if b['node_id'] == node_id:
                return b['is_alive'] == False

        return False

    kcl = KCL(self.redpanda)

    # check that consumer offsets topic is not present
    topics = set(kcl.list_topics())
    assert "__consumer_offsets" not in topics

    # enable consumer offsets support
    self.redpanda.set_environment({"__REDPANDA_LOGICAL_VERSION": 2})

    def get_raft0_follower():
        ctrl = self.redpanda.controller
        node = random.choice(self.redpanda.nodes)
        while self.redpanda.idx(node) == self.redpanda.idx(ctrl):
            node = random.choice(self.redpanda.nodes)

        return node

    # restart a node that is not the controller
    n = get_raft0_follower()
    self.logger.info(f"restarting node {n.account.hostname}")
    self.redpanda.stop_node(n, timeout=60)
    # wait until the stopped node is reported as not alive
    wait_until(lambda: node_stopped(self.redpanda.idx(n)),
               90,
               backoff_sec=2)
    self.redpanda.start_node(n)
    wait_until(cluster_is_stable, 90, backoff_sec=2)
def test_migrating_consume_offsets(self, failures, cpus):
    '''
    Validates correctness while executing consumer offsets migration
    '''
    # set redpanda logical version to a value without __consumer_offsets support
    self.redpanda = RedpandaService(
        self.test_context,
        5,
        resource_settings=ResourceSettings(num_cpus=cpus),
        extra_rp_conf={
            "group_topic_partitions": 16,
            "default_topic_replications": 3,
        },
        environment={"__REDPANDA_LOGICAL_VERSION": 1})
    self.redpanda.start()
    self._client = DefaultClient(self.redpanda)

    # set of failure suppressed nodes - required to make restarts deterministic
    suppressed = set()

    def failure_injector_loop():
        f_injector = FailureInjector(self.redpanda)
        while failures:
            f_type = random.choice(FailureSpec.FAILURE_TYPES)
            length = 0
            node = random.choice(self.redpanda.nodes)
            while self.redpanda.idx(node) in suppressed:
                node = random.choice(self.redpanda.nodes)

            # allow suspending any node
            if f_type == FailureSpec.FAILURE_SUSPEND:
                length = random.randint(
                    1,
                    ConsumerOffsetsMigrationTest.max_suspend_duration_sec)

            f_injector.inject_failure(
                FailureSpec(node=node, type=f_type, length=length))

            delay = random.randint(
                ConsumerOffsetsMigrationTest.min_inter_failure_time_sec,
                ConsumerOffsetsMigrationTest.max_inter_failure_time_sec)
            self.redpanda.logger.info(
                f"waiting {delay} seconds before next failure")
            time.sleep(delay)

    if failures:
        finjector_thread = threading.Thread(target=failure_injector_loop,
                                            args=())
        finjector_thread.daemon = True
        finjector_thread.start()

    spec = TopicSpec(partition_count=6, replication_factor=3)
    self.client().create_topic(spec)
    self.topic = spec.name

    self.start_producer(1, throughput=5000)
    self.start_consumer(1)
    self.await_startup()

    def cluster_is_stable():
        admin = Admin(self.redpanda)
        brokers = admin.get_brokers()
        if len(brokers) < 3:
            return False

        for b in brokers:
            self.logger.debug(f"broker: {b}")
            if not (b['is_alive'] and 'disk_space' in b):
                return False

        return True

    kcl = KCL(self.redpanda)

    def _group_present():
        return len(kcl.list_groups().splitlines()) > 1

    # make sure that the group is there
    wait_until(_group_present, 10, 1)

    # check that consumer offsets topic is not present
    topics = set(kcl.list_topics())
    assert "__consumer_offsets" not in topics

    # enable consumer offsets support
    self.redpanda.set_environment({"__REDPANDA_LOGICAL_VERSION": 2})
    for n in self.redpanda.nodes:
        id = self.redpanda.idx(n)
        suppressed.add(id)
        self.redpanda.restart_nodes(n, stop_timeout=60)
        suppressed.remove(id)
        # wait for the cluster to stabilize before restarting the next node
        wait_until(cluster_is_stable, 90, backoff_sec=2)

    def _consumer_offsets_present():
        try:
            partitions = list(
                self.client().describe_topic("__consumer_offsets"))
            return len(partitions) > 0
        except:
            return False

    wait_until(_consumer_offsets_present, timeout_sec=90, backoff_sec=3)

    self.run_validation(min_records=100000,
                        producer_timeout_sec=300,
                        consumer_timeout_sec=180)
def test_node_recovery(self, recovery_type):
    self.start_redpanda(num_nodes=3)
    kafka_tools = KafkaCliTools(self.redpanda)
    kafka_cat = KafkaCat(self.redpanda)

    # create topics
    topics = []
    for _ in range(0, 6):
        topics.append(TopicSpec(partition_count=random.randint(1, 10)))

    DefaultClient(self.redpanda).create_topic(topics)

    # choose one topic to run the main workload
    self.topic = random.choice(topics).name

    self.start_producer(1)
    self.start_consumer(2)
    self.await_startup()

    # choose another topic and populate it with data
    prepopulated_topic = random.choice(topics)
    while self.topic == prepopulated_topic.name:
        prepopulated_topic = random.choice(topics)

    # populate the topic with data
    kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

    def list_offsets():
        offsets = {}
        for p in range(0, prepopulated_topic.partition_count):
            offsets[p] = kafka_cat.list_offsets(prepopulated_topic.name, p)
        return offsets

    # store offsets
    offsets = list_offsets()
    self.redpanda.logger.info(f"Topic offsets: {offsets}")

    # stop one of the nodes and remove its data
    stopped = random.choice(self.redpanda.nodes)

    # prepare seed servers list
    seeds = map(lambda n: {
        "address": n.account.hostname,
        "port": 33145
    }, self.redpanda.nodes)
    seeds = list(
        filter(lambda n: n['address'] != stopped.account.hostname, seeds))

    self.redpanda.stop_node(stopped)
    if recovery_type == FullNodeRecoveryTest.FULL_RECOVERY:
        self.redpanda.clean_node(stopped, preserve_logs=True)

    # produce some more data to make sure that the stopped node is behind
    kafka_tools.produce(prepopulated_topic.name, 20000, 1024)

    # start the node with the same node id and a non-empty seed server list;
    # give the node more time to start as it has to recover
    self.redpanda.start_node(stopped,
                             override_cfg_params={'seed_servers': seeds},
                             timeout=90)

    def all_topics_recovered():
        metric = self.redpanda.metrics_sample("under_replicated_replicas",
                                              self.redpanda.nodes)
        under_replicated = filter(lambda s: s.value == 1, metric.samples)
        under_replicated = list(
            map(
                lambda s: (s.labels['namespace'], s.labels['topic'], s.
                           labels['partition']), under_replicated))
        self.redpanda.logger.info(
            f"under replicated partitions: {list(under_replicated)}")
        return len(under_replicated) == 0

    # wait for the prepopulated topic to recover
    wait_until(all_topics_recovered, 60, 1)

    self.run_validation(min_records=20000,
                        enable_idempotence=False,
                        producer_timeout_sec=60,
                        consumer_timeout_sec=180)

    # validate prepopulated topic offsets
    assert offsets == list_offsets()
class TestMirrorMakerService(EndToEndTest):
    kafka_source = "kafka"
    redpanda_source = "redpanda"

    def __init__(self, test_context):
        super(TestMirrorMakerService, self).__init__(test_context)

        self.topic = TopicSpec(replication_factor=3)
        # create a single zookeeper node for Kafka
        self.zk = ZookeeperService(self.test_context,
                                   num_nodes=1,
                                   version=V_3_0_0)
        self.source_broker = None

    def setUp(self):
        self.zk.start()

    def tearDown(self):
        # ducktape handles service teardown automatically, but it is hard
        # to tell what went wrong if one of the services hangs. Do it
        # explicitly here with some logging, to enable debugging issues
        # like https://github.com/redpanda-data/redpanda/issues/4270
        if self.source_broker is not None:
            self.logger.info(
                f"Stopping source broker ({self.source_broker.__class__.__name__})..."
            )
            self.source_broker.stop()
            self.logger.info(
                f"Awaiting source broker ({self.source_broker.__class__.__name__})..."
            )

        self.logger.info("Stopping zookeeper...")
        self.zk.stop()
        self.logger.info("Awaiting zookeeper...")

    def start_brokers(self, source_type=kafka_source):
        if source_type == TestMirrorMakerService.redpanda_source:
            self.source_broker = RedpandaService(self.test_context,
                                                 num_brokers=3)
        else:
            self.source_broker = KafkaServiceAdapter(
                self.test_context,
                KafkaService(self.test_context,
                             num_nodes=3,
                             zk=self.zk,
                             version=V_3_0_0))

        self.redpanda = RedpandaService(self.test_context, num_brokers=3)

        self.source_broker.start()
        self.redpanda.start()

        self.source_client = DefaultClient(self.source_broker)

        self.topic.partition_count = 1000 if self.redpanda.dedicated_nodes else 1
        self.source_client.create_topic(self.topic)

    def start_workload(self):
        self.consumer = VerifiableConsumer(
            self.test_context,
            num_nodes=1,
            redpanda=self.redpanda,
            topic=self.topic.name,
            group_id='consumer_test_group',
            on_record_consumed=self.on_record_consumed)
        self.consumer.start()

        self.producer = VerifiableProducer(
            self.test_context,
            num_nodes=1,
            redpanda=self.source_broker,
            topic=self.topic.name,
            throughput=1000,
            message_validator=is_int_with_prefix)
        self.producer.start()

    def wait_for_n_messages(self, n_messages=100):
        """Wait for a minimum number of messages to be successfully produced."""
        wait_until(
            lambda: self.producer.num_acked > n_messages,
            timeout_sec=10,
            err_msg="Producer failed to produce %d messages in a reasonable amount of time."
            % n_messages)

    @cluster(num_nodes=10)
    @parametrize(source_type=kafka_source)
    @parametrize(source_type=redpanda_source)
    def test_simple_end_to_end(self, source_type):
        # start brokers
        self.start_brokers(source_type=source_type)

        # start mirror maker
        self.mirror_maker = MirrorMaker2(self.test_context,
                                         num_nodes=1,
                                         source_cluster=self.source_broker,
                                         target_cluster=self.redpanda)
        topics = []
        for i in range(0, 10):
            topics.append(
                TopicSpec(partition_count=random.randint(1, 10),
                          retention_bytes=random.randint(
                              100000000, 300000000),
                          retention_ms=random.randint(1 * 3600000,
                                                      10 * 3600000)))
        self.source_client.create_topic(topics)

        self.mirror_maker.start()

        # start source producer & target consumer
        self.start_workload()
        self.run_validation(consumer_timeout_sec=120)
        self.mirror_maker.stop()

        target_client = DefaultClient(self.redpanda)
        for t in topics:
            desc = target_client.describe_topic(t.name)
            self.logger.debug(f'source topic: {t}, target topic: {desc}')
            assert len(desc.partitions) == t.partition_count

    @cluster(num_nodes=9)
    @parametrize(source_type=kafka_source)
    @parametrize(source_type=redpanda_source)
    def test_consumer_group_mirroring(self, source_type):
        # start redpanda
        self.start_brokers(source_type=source_type)
        consumer_group = "test-group-1"

        # start mirror maker
        self.mirror_maker = MirrorMaker2(
            self.test_context,
            num_nodes=1,
            source_cluster=self.source_broker,
            target_cluster=self.redpanda,
            consumer_group_pattern=consumer_group,
            log_level="TRACE")
        self.mirror_maker.start()

        msg_size = 512
        msg_cnt = 1000000 if self.redpanda.dedicated_nodes else 100

        # produce some messages to the source cluster
        producer = RpkProducer(self.test_context,
                               self.source_broker,
                               self.topic.name,
                               msg_size,
                               msg_cnt,
                               acks=-1)
        producer.start()
        producer.wait()
        producer.free()

        # consume some messages from the source cluster
        consumer = RpkConsumer(self.test_context,
                               self.source_broker,
                               self.topic.name,
                               ignore_errors=False,
                               retries=3,
                               group=consumer_group,
                               save_msgs=False,
                               num_msgs=int(msg_cnt / 5))
        consumer.start()
        consumer.wait()
        consumer.stop()
        source_messages = consumer.messages
        self.logger.info(f"source message count: {len(source_messages)}")
        consumer.free()

        src_rpk = RpkTool(self.source_broker)
        source_group = src_rpk.group_describe(consumer_group)
        target_rpk = RpkTool(self.redpanda)

        def target_group_equal():
            try:
                target_group = target_rpk.group_describe(consumer_group)
            except RpkException as e:
                # e.g. COORDINATOR_NOT_AVAILABLE
                self.logger.info(
                    f"Error describing target cluster group: {e}")
                return False

            self.logger.info(
                f"source {source_group}, target_group: {target_group}")
            return target_group.partitions == source_group.partitions and target_group.name == source_group.name

        # wait for consumer group sync
        timeout = 600 if self.redpanda.dedicated_nodes else 60
        wait_until(target_group_equal, timeout_sec=timeout, backoff_sec=5)

        self.mirror_maker.stop()