def test_null(self):
    """
    The null case where we are never exceeding the limit, but
    are repeatedly creating+destroying connections.
    """
    self.redpanda.set_cluster_config({"kafka_connections_max": 6})

    metrics = [
        MetricCheck(self.logger, self.redpanda, n, REJECTED_METRIC, {}, sum)
        for n in self.redpanda.nodes
    ]

    producer = RpkProducer(self.test_context,
                           self.redpanda,
                           self.topic,
                           msg_size=16384,
                           msg_count=1,
                           quiet=True,
                           produce_timeout=5)
    for n in range(0, 100):
        producer.start()
        producer.wait()

    assert all([
        m.evaluate([(REJECTED_METRIC, lambda a, b: b == a)])
        for m in metrics
    ])
def test_node_resize(self):
    # Create a topic and write some data to make sure the cluster
    # is all up & initialized, and that subsequent checks are happening
    # with some partitions actually assigned to shards.
    self._client.create_topic(
        TopicSpec(name="test", partition_count=10, replication_factor=3))
    producer = RpkProducer(context=self.test_context,
                           redpanda=self.redpanda,
                           topic="test",
                           msg_size=4096,
                           msg_count=1000,
                           acks=-1)
    producer.start()
    producer.wait()

    # Choose one node from the cluster to exercise checks on.
    target_node = self.redpanda.nodes[0]

    # Attempt to decrease CPU count relative to initial: redpanda should fail to start
    self._restart_with_num_cpus(node=target_node,
                                num_cpus=self.INITIAL_NUM_CPUS - 1,
                                expect_fail=True)

    # Increase CPU count: redpanda should accept this
    self._restart_with_num_cpus(node=target_node,
                                num_cpus=self.INITIAL_NUM_CPUS + 1,
                                expect_fail=False)

    # Now decrease back to the original core count: this should fail, because we
    # previously increased, so the original core count is now below the high water mark
    self._restart_with_num_cpus(node=target_node,
                                num_cpus=self.INITIAL_NUM_CPUS,
                                expect_fail=True)
def _produce(self, topic, msg_cnt):
    # Wait until every partition of the topic has a leader before producing.
    wait_until(lambda: self._all_have_leaders(topic), 20, backoff_sec=2)

    # Produce msg_cnt 16 KiB messages with acks=-1, then release the producer node.
    producer = RpkProducer(self.test_context,
                           self.redpanda,
                           topic,
                           16384,
                           msg_cnt,
                           acks=-1)
    producer.start()
    producer.wait()
    producer.free()
def test_exceed_broker_limit(self):
    self.redpanda.set_cluster_config({"kafka_connections_max": 6})

    metrics = [
        MetricCheck(self.logger, self.redpanda, n, REJECTED_METRIC, {}, sum)
        for n in self.redpanda.nodes
    ]

    # I happen to know that an `rpk topic consume` occupies three
    # connections.  So after opening two consumers, I should find
    # that a producer cannot get in.
    consumers = [
        RpkConsumer(self.test_context, self.redpanda, self.topic),
        RpkConsumer(self.test_context, self.redpanda, self.topic),
    ]

    for c in consumers:
        c.start()

    producer = RpkProducer(self.test_context,
                           self.redpanda,
                           self.topic,
                           msg_size=16384,
                           msg_count=1,
                           produce_timeout=5)
    producer.start()

    try:
        producer.wait()
    except Exception:
        # This is a non-specific exception because ducktape re-raises in wait()
        # as a bare Exception
        pass
    else:
        raise RuntimeError("Producer should have failed")

    for c in consumers:
        c.stop()
        c.wait()

    assert any([
        m.evaluate([(REJECTED_METRIC, lambda a, b: b > a)])
        for m in metrics
    ])
def test_sarama_consumergroup(self):
    count = 10 if self.scale.local else 5000

    sarama_example = SaramaExamples.SaramaConsumerGroup(
        self.redpanda, self.topic, count)
    example = ExampleRunner(self._ctx,
                            sarama_example,
                            timeout_sec=self._timeout)
    producer = RpkProducer(self._ctx,
                           self.redpanda,
                           self.topic,
                           4,
                           count,
                           acks=-1,
                           printable=True)

    def until_partitions():
        storage = self.redpanda.storage()
        return len(list(storage.partitions("kafka", self.topic))) == 3

    # Must wait for the partitions to materialize or else
    # kaf may try to produce during leadership election.
    # This results in a skipped record since kaf doesn't auto-retry.
    wait_until(until_partitions,
               timeout_sec=30,
               backoff_sec=2,
               err_msg="Expected partition did not materialize")

    # Run the producer and wait for the worker
    # threads to finish producing
    producer.start()
    producer.wait()

    # Start the example
    example.start()

    # Wait until the example is OK to terminate
    wait_until(example.condition_met,
               timeout_sec=self._timeout,
               backoff_sec=1)
def transfer_all_leaders():
    # For each partition, move leadership to a randomly chosen non-leader
    # replica and wait for the transfer to complete.
    partitions = rpk.describe_topic(topic)
    for p in partitions:
        replicas = set(p.replicas)
        replicas.remove(p.leader)
        target = random.choice(list(replicas))
        admin.partition_transfer_leadership("kafka", topic, p.id, target)
        wait_until(lambda: wait_for_leader(p.id, target),
                   timeout_sec=30,
                   backoff_sec=1)

msg_cnt = 100
producer = RpkProducer(self.test_context,
                       self.redpanda,
                       topic,
                       16384,
                       msg_cnt,
                       acks=-1)
producer.start()
producer.wait()
producer.free()
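# The fragment above relies on a `wait_for_leader` helper that is defined
# outside this excerpt. A minimal sketch of one way to implement it against
# the same Admin API used here, assuming (not confirmed by this excerpt)
# that the partition response exposes a 'leader_id' field:
def wait_for_leader(partition_id, expected_leader_id):
    info = admin.get_partitions(topic, partition_id)
    return info.get("leader_id") == expected_leader_id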
class ConsumerGroupTest(RedpandaTest):
    def __init__(self, test_ctx, *args, **kwargs):
        self._ctx = test_ctx
        self.producer = None
        super(ConsumerGroupTest, self).__init__(
            test_ctx,
            num_brokers=3,
            *args,
            # disable leader balancer to make sure that the group will not be
            # reloaded because of leadership changes
            extra_rp_conf={"enable_leader_balancer": False},
            **kwargs)

    @staticmethod
    def make_consumer_properties(base_properties, instance_id=None):
        properties = {}
        properties.update(base_properties)
        if instance_id:
            properties['group.instance.id'] = instance_id
        return properties

    def create_consumer(self,
                        topic,
                        group,
                        instance_id=None,
                        consumer_properties={}):
        return KafkaCliConsumer(
            self.test_context,
            self.redpanda,
            topic=topic,
            group=group,
            from_beginning=True,
            formatter_properties={
                'print.value': 'false',
                'print.key': 'false',
                'print.partition': 'true',
                'print.offset': 'true',
            },
            consumer_properties=ConsumerGroupTest.make_consumer_properties(
                consumer_properties, instance_id))

    def create_consumers(self,
                         consumer_count,
                         topic,
                         group,
                         static_members,
                         consumer_properties={}):
        consumers = []
        for i in range(0, consumer_count):
            instance_id = f"panda-consumer-{i}" if static_members else None
            consumers.append(
                self.create_consumer(topic,
                                     group=group,
                                     instance_id=instance_id,
                                     consumer_properties=consumer_properties))
        for c in consumers:
            c.start()
        return consumers

    @staticmethod
    def consumed_at_least(consumers, count):
        return all([len(c._messages) > count for c in consumers])

    def validate_group_state(self, group, expected_state, static_members):
        rpk = RpkTool(self.redpanda)
        # validate group state
        rpk_group = rpk.group_describe(group)

        assert rpk_group.members == 2
        assert rpk_group.state == expected_state

        for p in rpk_group.partitions:
            if static_members:
                assert 'panda-consumer' in p.instance_id
            else:
                assert p.instance_id is None

    def setup_producer(self, p_cnt):
        # create topic
        self.topic_spec = TopicSpec(partition_count=p_cnt,
                                    replication_factor=3)
        self.client().create_topic(specs=self.topic_spec)
        # produce some messages to the topic
        self.producer = RpkProducer(self._ctx, self.redpanda,
                                    self.topic_spec.name, 128, 5000, -1)
        self.producer.start()

    @cluster(num_nodes=6)
    @parametrize(static_members=True)
    @parametrize(static_members=False)
    def test_basic_group_join(self, static_members):
        """
        Test validating that consumers are able to join the group and consume the topic
        """
        self.setup_producer(20)
        group = 'test-gr-1'
        # use 2 consumers
        consumers = self.create_consumers(2,
                                          self.topic_spec.name,
                                          group,
                                          static_members=static_members)

        # wait for some messages
        wait_until(lambda: ConsumerGroupTest.consumed_at_least(consumers, 50),
                   30, 2)
        self.validate_group_state(group,
                                  expected_state="Stable",
                                  static_members=static_members)

        self.producer.wait()
        self.producer.free()

        for c in consumers:
            c.stop()
            c.wait()
            c.free()

    @cluster(num_nodes=6)
    def test_mixed_consumers_join(self):
        """
        Test validating that dynamic and static consumers may exist in the same group
        """
        self.setup_producer(20)
        group = 'test-gr-1'
        consumers = []
        consumers.append(
            self.create_consumer(self.topic_spec.name, group,
                                 "panda-instance"))
        consumers.append(
            self.create_consumer(self.topic_spec.name, group, None))
        for c in consumers:
            c.start()

        # wait for some messages
        wait_until(lambda: ConsumerGroupTest.consumed_at_least(consumers, 50),
                   30, 2)

        rpk = RpkTool(self.redpanda)
        # validate group state
        rpk_group = rpk.group_describe(group)

        assert rpk_group.members == 2
        assert rpk_group.state == "Stable"

        static_members = set()
        dynamic_members = set()

        for p in rpk_group.partitions:
            if p.instance_id:
                static_members.add(p.client_id)
            else:
                dynamic_members.add(p.client_id)

        assert len(static_members) == 1
        assert len(dynamic_members) == 1

        self.producer.wait()
        self.producer.free()

        for c in consumers:
            c.stop()
            c.wait()
            c.free()

    def wait_for_members(self, group, members_count):
        rpk = RpkTool(self.redpanda)

        def group_stable():
            rpk_group = rpk.group_describe(group)
            return rpk_group.members == members_count and rpk_group.state == "Stable"

        return wait_until(group_stable, 30, 2)

    @cluster(num_nodes=6)
    @parametrize(static_members=True)
    @parametrize(static_members=False)
    def test_consumer_rejoin(self, static_members):
        """
        Test validating that a re-joining static member will not cause a rebalance
        """
        self.setup_producer(20)
        group = 'test-gr-1'

        consumers = self.create_consumers(
            2,
            self.topic_spec.name,
            group,
            static_members=static_members,
            consumer_properties={"session.timeout.ms": 40000})

        # wait for some messages
        wait_until(lambda: ConsumerGroupTest.consumed_at_least(consumers, 50),
                   30, 2)
        rpk = RpkTool(self.redpanda)
        # at this point we have 2 consumers in a stable group
        self.validate_group_state(group,
                                  expected_state="Stable",
                                  static_members=static_members)

        # stop one of the consumers
        consumers[0].stop()
        consumers[0].wait()

        rpk_group = rpk.group_describe(group)
        if static_members:
            # with static members the group should still be in the stable state
            assert rpk_group.state == "Stable"
            assert rpk_group.members == 2
        else:
            # a dynamic consumer requests a group leave when it is shut down gracefully
            self.wait_for_members(group, 1)

        # start the consumer again
        consumers[0].start()
        consumers[0].wait_for_started()  # wait for the consumer to start
        if static_members:
            # with static members the group should be stable immediately as the
            # consumer is rejoining with the same instance id
            self.validate_group_state(group,
                                      expected_state="Stable",
                                      static_members=static_members)
        else:
            # group should get back to its original 2 members state
            self.wait_for_members(group, 2)

        self.producer.wait()
        self.producer.free()

        for c in consumers:
            c.stop()
            c.wait()
            c.free()

    @cluster(num_nodes=6)
    @parametrize(static_members=True)
    @parametrize(static_members=False)
    def test_consumer_is_removed_when_timedout(self, static_members):
        """
        Test validating that a consumer is evicted if it fails to deliver
        heartbeats to the broker
        """
        self.setup_producer(20)
        group = 'test-gr-1'
        # use a short session timeout to make the test finish faster
        consumers = self.create_consumers(
            2,
            self.topic_spec.name,
            group,
            static_members=static_members,
            consumer_properties={"session.timeout.ms": 6000})

        # wait for some messages
        wait_until(lambda: ConsumerGroupTest.consumed_at_least(consumers, 50),
                   30, 2)
        rpk = RpkTool(self.redpanda)
        # at this point we have 2 consumers in a stable group
        self.validate_group_state(group,
                                  expected_state="Stable",
                                  static_members=static_members)

        # stop one of the consumers
        consumers[0].stop()

        # wait for rebalance
        self.wait_for_members(group, 1)

        # start the consumer again
        consumers[0].start()

        # group should get back to its original 2 members state
        self.wait_for_members(group, 2)
        self.validate_group_state(group,
                                  expected_state="Stable",
                                  static_members=static_members)

        self.producer.wait()
        self.producer.free()

        for c in consumers:
            c.stop()
            c.wait()
            c.free()
def test_leader_transfers_recovery(self, acks):
    """
    Validate that leadership transfers complete successfully
    under acks=1 writes that prompt the leader to frequently
    activate recovery_stm.

    When acks=1, this is a reproducer for
    https://github.com/vectorizedio/redpanda/issues/2580

    When acks=-1, this is a reproducer for
    https://github.com/vectorizedio/redpanda/issues/2606
    """
    leader_node_id, replicas = self._wait_for_leader()

    if acks == -1:
        producer = RpkProducer(self._ctx,
                               self.redpanda,
                               self.topic,
                               16384,
                               sys.maxsize,
                               acks=acks)
    else:
        # To reproduce the acks=1 issue, we need an intermittent producer that
        # waits long enough between messages to let recovery_stm go to sleep
        # waiting for follower_state_change.
        #
        # KafProducer is intermittent because it starts a fresh process for
        # each message, whereas RpkProducer writes a continuous stream.
        #
        # TODO: create a test traffic generator that has inter-message
        # delay as an explicit parameter, rather than relying on implementation
        # details of the producer helpers.
        producer = KafProducer(self._ctx, self.redpanda, self.topic)

    producer.start()

    # Pass leadership around in a ring
    self.logger.info(f"Initial leader of {self.topic} is {leader_node_id}")

    transfer_count = 50

    # FIXME: with a transfer count >100, we tend to see
    # reactor stalls and corresponding nondeterministic behaviour/failures.
    # This appears unrelated to the functionality under test, something else
    # is tripping up the cluster when we have so many leadership transfers.
    # https://github.com/vectorizedio/redpanda/issues/2623
    admin = Admin(self.redpanda)

    initial_leader_id = leader_node_id
    for n in range(0, transfer_count):
        target_idx = (initial_leader_id + n) % len(self.redpanda.nodes)
        target_node_id = target_idx + 1

        self.logger.info(f"Starting transfer to {target_node_id}")
        admin.partition_transfer_leadership("kafka", self.topic, 0,
                                            target_node_id)

        self._wait_for_leader(
            lambda l: l is not None and l == target_node_id,
            timeout=ELECTION_TIMEOUT * 2)
        self.logger.info(f"Completed transfer to {target_node_id}")

    self.logger.info(f"Completed {transfer_count} transfers successfully")

    # Explicit stop of producer so that we see any errors
    producer.stop()
    producer.wait()
    producer.free()
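# The TODO above asks for a traffic generator with an explicit inter-message
# delay. A minimal sketch of one way that could look; it is NOT part of the
# redpanda test helpers, and the class name, rpk flags and process-per-record
# approach below are assumptions for illustration only.
import subprocess
import time


class DelayedProducer:
    """Hypothetical helper: sends one small record per rpk invocation and
    sleeps between sends, so the leader's recovery_stm has a chance to go
    idle between messages (the behaviour the acks=1 reproducer relies on)."""
    def __init__(self, brokers, topic, inter_message_delay_s=1.0):
        self._brokers = brokers
        self._topic = topic
        self._delay = inter_message_delay_s
        self._stopped = False

    def produce(self, msg_count):
        for i in range(msg_count):
            if self._stopped:
                break
            # One short-lived `rpk topic produce` process per record,
            # mimicking KafProducer's process-per-message behaviour.
            subprocess.run(
                ["rpk", "topic", "produce", self._topic, "--brokers",
                 self._brokers],
                input=f"msg-{i}\n".encode(),
                check=True)
            time.sleep(self._delay)

    def stop(self):
        self._stopped = True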
def test_consumer_group_mirroring(self, source_type):
    # start redpanda
    self.start_brokers(source_type=source_type)
    consumer_group = "test-group-1"
    # start mirror maker
    self.mirror_maker = MirrorMaker2(self.test_context,
                                     num_nodes=1,
                                     source_cluster=self.source_broker,
                                     target_cluster=self.redpanda,
                                     consumer_group_pattern=consumer_group,
                                     log_level="TRACE")
    self.mirror_maker.start()

    msg_size = 512
    msg_cnt = 1000000 if self.redpanda.dedicated_nodes else 100

    # produce some messages to source redpanda
    producer = RpkProducer(self.test_context,
                           self.source_broker,
                           self.topic.name,
                           msg_size,
                           msg_cnt,
                           acks=-1)
    producer.start()
    producer.wait()
    producer.free()

    # consume some messages from source redpanda
    consumer = RpkConsumer(self.test_context,
                           self.source_broker,
                           self.topic.name,
                           ignore_errors=False,
                           retries=3,
                           group=consumer_group,
                           save_msgs=False,
                           num_msgs=int(msg_cnt / 5))
    consumer.start()
    consumer.wait()
    consumer.stop()
    source_messages = consumer.messages
    self.logger.info(f"source message count: {len(source_messages)}")
    consumer.free()

    src_rpk = RpkTool(self.source_broker)
    source_group = src_rpk.group_describe(consumer_group)
    target_rpk = RpkTool(self.redpanda)

    def target_group_equal():
        try:
            target_group = target_rpk.group_describe(consumer_group)
        except RpkException as e:
            # e.g. COORDINATOR_NOT_AVAILABLE
            self.logger.info(f"Error describing target cluster group: {e}")
            return False
        self.logger.info(
            f"source {source_group}, target_group: {target_group}")
        return target_group.partitions == source_group.partitions and target_group.name == source_group.name

    # wait for consumer group sync
    timeout = 600 if self.redpanda.dedicated_nodes else 60
    wait_until(target_group_equal, timeout_sec=timeout, backoff_sec=5)

    self.mirror_maker.stop()
def test_overlapping_changes(self):
    """
    Check that while a movement is in flight, rules about
    overlapping operations are properly enforced.
    """
    self.start_redpanda(num_nodes=4)
    node_ids = {1, 2, 3, 4}

    # Create topic with enough data that inter-node movement
    # will take a while.
    name = "movetest"
    spec = TopicSpec(name=name, partition_count=1, replication_factor=3)
    self.client().create_topic(spec)

    # Wait for the partition to have a leader (`rpk produce` errors
    # out if it tries to write data before this)
    def partition_ready():
        return KafkaCat(self.redpanda).get_partition_leader(
            name, 0)[0] is not None

    wait_until(partition_ready, timeout_sec=10, backoff_sec=0.5)

    # Write a substantial amount of data to the topic
    msg_size = 512 * 1024
    write_bytes = 512 * 1024 * 1024
    producer = RpkProducer(self._ctx,
                           self.redpanda,
                           name,
                           msg_size=msg_size,
                           msg_count=int(write_bytes / msg_size))
    t1 = time.time()
    producer.start()

    # This is an absurdly low expected throughput, but necessarily
    # so to run reliably on current test runners, which share an EBS
    # backend among many parallel tests.  10MB/s has been empirically
    # shown to be too high an expectation.
    expect_bps = 1 * 1024 * 1024
    expect_runtime = write_bytes / expect_bps
    producer.wait(timeout_sec=expect_runtime)

    self.logger.info(
        f"Write complete {write_bytes} in {time.time() - t1} seconds")

    # - Admin API redirects writes but not reads.  Because we want synchronous
    #   status after submitting operations, send all operations to the controller
    #   leader.  This is not necessary for operations to work, just to simplify
    #   this test by letting it see synchronous status updates.
    # - Because we will later verify that a 503 is sent in response to
    #   a move request to an in_progress topic, set retry_codes=[] to
    #   disable default retries on 503.
    admin_node = self.redpanda.controller()
    admin = Admin(self.redpanda, default_node=admin_node, retry_codes=[])

    # Start an inter-node move, which should take some time
    # to complete because of recovery network traffic
    assignments = self._get_assignments(admin, name, 0)
    new_node = list(node_ids - set([a['node_id'] for a in assignments]))[0]
    self.logger.info(f"old assignments: {assignments}")
    old_assignments = assignments
    assignments = assignments[1:] + [{'node_id': new_node, 'core': 0}]
    self.logger.info(f"new assignments: {assignments}")
    r = admin.set_partition_replicas(name, 0, assignments)
    r.raise_for_status()
    assert admin.get_partitions(name, 0)['status'] == "in_progress"

    # Another move should fail
    assert admin.get_partitions(name, 0)['status'] == "in_progress"
    try:
        r = admin.set_partition_replicas(name, 0, old_assignments)
    except requests.exceptions.HTTPError as e:
        assert e.response.status_code == 503
    else:
        raise RuntimeError(f"Expected 503 but got {r.status_code}")

    # An update to partition properties should succeed
    # (issue https://github.com/vectorizedio/redpanda/issues/2300)
    rpk = RpkTool(self.redpanda)
    assert admin.get_partitions(name, 0)['status'] == "in_progress"
    rpk.alter_topic_config(name, "retention.ms", "3600000")

    # A deletion should succeed
    assert name in rpk.list_topics()
    assert admin.get_partitions(name, 0)['status'] == "in_progress"
    rpk.delete_topic(name)
    assert name not in rpk.list_topics()