Example #1
    def test_null(self):
        """
        The null case: we never exceed the connection limit, but we
        repeatedly create and destroy connections.
        """
        self.redpanda.set_cluster_config({"kafka_connections_max": 6})

        metrics = [
            MetricCheck(self.logger, self.redpanda, n, REJECTED_METRIC, {},
                        sum) for n in self.redpanda.nodes
        ]

        producer = RpkProducer(self.test_context,
                               self.redpanda,
                               self.topic,
                               msg_size=16384,
                               msg_count=1,
                               quiet=True,
                               produce_timeout=5)
        for _ in range(100):
            producer.start()
            producer.wait()

        assert all([
            m.evaluate([(REJECTED_METRIC, lambda a, b: b == a)])
            for m in metrics
        ])
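For reference, a minimal sketch (an assumption, not taken from the source) of the comparator contract these examples appear to rely on: `MetricCheck` samples each metric at construction time, and `evaluate` calls each comparator with the captured value first and the current value second.

    # Hedged sketch: how the (metric, comparator) pairs above appear to be
    # interpreted by MetricCheck.evaluate().
    def unchanged(old, new):
        return new == old  # null case: the rejected-connection count is flat

    def increased(old, new):
        return new > old  # used when rejections are expected (see Example #4)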
Example #2
    def test_node_resize(self):
        # Create a topic and write some data to make sure the cluster
        # is all up & initialized, and that subsequent checks are happening
        # with some partitions actually assigned to shards.
        self._client.create_topic(
            TopicSpec(name="test", partition_count=10, replication_factor=3))
        producer = RpkProducer(context=self.test_context,
                               redpanda=self.redpanda,
                               topic="test",
                               msg_size=4096,
                               msg_count=1000,
                               acks=-1)
        producer.start()
        producer.wait()

        # Choose one node from the cluster to exercise checks on.
        target_node = self.redpanda.nodes[0]

        # Attempt to decrease CPU count relative to initial: redpanda should fail to start
        self._restart_with_num_cpus(node=target_node,
                                    num_cpus=self.INITIAL_NUM_CPUS - 1,
                                    expect_fail=True)

        # Increase CPU count: redpanda should accept this
        self._restart_with_num_cpus(node=target_node,
                                    num_cpus=self.INITIAL_NUM_CPUS + 1,
                                    expect_fail=False)

        # Now decrease back to the original core count: this should fail, because we
        # previously increased, so the original core count is now below the high-water mark
        self._restart_with_num_cpus(node=target_node,
                                    num_cpus=self.INITIAL_NUM_CPUS,
                                    expect_fail=True)
Example #3
    def _produce(self, topic, msg_cnt):
        wait_until(lambda: self._all_have_leaders(topic), 20, backoff_sec=2)

        producer = RpkProducer(self.test_context,
                               self.redpanda,
                               topic,
                               16384,
                               msg_cnt,
                               acks=-1)
        producer.start()
        producer.wait()
        producer.free()
Example #4
    def test_exceed_broker_limit(self):
        self.redpanda.set_cluster_config({"kafka_connections_max": 6})

        metrics = [
            MetricCheck(self.logger, self.redpanda, n, REJECTED_METRIC, {},
                        sum) for n in self.redpanda.nodes
        ]

        # I happen to know that an `rpk topic consume` occupies three
        # connections.  So after opening two consumers, I should find
        # that a producer cannot get in.
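        # Accounting under that assumption: 2 consumers * 3 connections = 6,
        # which already equals kafka_connections_max, so the producer's extra
        # connection should be rejected.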
        consumers = [
            RpkConsumer(self.test_context, self.redpanda, self.topic),
            RpkConsumer(self.test_context, self.redpanda, self.topic),
        ]

        for c in consumers:
            c.start()

        producer = RpkProducer(self.test_context,
                               self.redpanda,
                               self.topic,
                               msg_size=16384,
                               msg_count=1,
                               produce_timeout=5)
        producer.start()
        try:
            producer.wait()
        except Exception:
            # This is a non-specific exception because ducktape re-raises in wait()
            # as a bare Exception
            pass
        else:
            raise RuntimeError("Producer should have failed")

        for c in consumers:
            c.stop()
            c.wait()

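        # `any`, not `all`: only the broker that received the rejected
        # connection bumps its counter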
        assert any([
            m.evaluate([(REJECTED_METRIC, lambda a, b: b > a)])
            for m in metrics
        ])
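The start/wait/expect-failure dance above recurs in these tests; it could be wrapped in a small helper. A minimal sketch, assuming nothing beyond the standard library (the helper name is hypothetical):

    import contextlib

    @contextlib.contextmanager
    def expect_failure(what):
        # Hypothetical helper: succeed only if the wrapped block raises.
        try:
            yield
        except Exception:
            # ducktape re-raises background errors as a bare Exception
            pass
        else:
            raise RuntimeError(f"{what} should have failed")

    # usage: with expect_failure("Producer"): producer.wait()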
Example #5
    def test_sarama_consumergroup(self):
        count = 10 if self.scale.local else 5000

        sarama_example = SaramaExamples.SaramaConsumerGroup(
            self.redpanda, self.topic, count)
        example = ExampleRunner(self._ctx,
                                sarama_example,
                                timeout_sec=self._timeout)
        producer = RpkProducer(self._ctx,
                               self.redpanda,
                               self.topic,
                               4,
                               count,
                               acks=-1,
                               printable=True)

        def until_partitions():
            storage = self.redpanda.storage()
            return len(list(storage.partitions("kafka", self.topic))) == 3

        # Must wait for the partitions to materialize or else kaf
        # may try to produce during leadership election.
        # This results in a skipped record since kaf doesn't auto-retry.
        wait_until(until_partitions,
                   timeout_sec=30,
                   backoff_sec=2,
                   err_msg="Expected partition did not materialize")

        # Run the producer and wait for the worker
        # threads to finish producing
        producer.start()
        producer.wait()

        # Start the example
        example.start()

        # Wait until the example is OK to terminate
        wait_until(example.condition_met,
                   timeout_sec=self._timeout,
                   backoff_sec=1)
Example #6
        def transfer_all_leaders():
            partitions = rpk.describe_topic(topic)
            for p in partitions:
                replicas = set(p.replicas)
                replicas.remove(p.leader)
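                # pick a random non-leader replica as the transfer target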
                target = random.choice(list(replicas))
                admin.partition_transfer_leadership("kafka", topic, p.id,
                                                    target)
                wait_until(lambda: wait_for_leader(p.id, target),
                           timeout_sec=30,
                           backoff_sec=1)
            msg_cnt = 100
            producer = RpkProducer(self.test_context,
                                   self.redpanda,
                                   topic,
                                   16384,
                                   msg_cnt,
                                   acks=-1)

            producer.start()
            producer.wait()
            producer.free()
Example #7
class ConsumerGroupTest(RedpandaTest):
    def __init__(self, test_ctx, *args, **kwargs):
        self._ctx = test_ctx
        self.producer = None
        super(ConsumerGroupTest, self).__init__(
            test_ctx,
            num_brokers=3,
            *args,
            # disable the leader balancer to make sure that the group will not be reloaded because of leadership changes
            extra_rp_conf={"enable_leader_balancer": False},
            **kwargs)

    @staticmethod
    def make_consumer_properties(base_properties, instance_id=None):
        properties = {}
        properties.update(base_properties)
        if instance_id:
            properties['group.instance.id'] = instance_id
        return properties

    def create_consumer(self,
                        topic,
                        group,
                        instance_id=None,
                        consumer_properties={}):
        return KafkaCliConsumer(
            self.test_context,
            self.redpanda,
            topic=topic,
            group=group,
            from_beginning=True,
            formatter_properties={
                'print.value': 'false',
                'print.key': 'false',
                'print.partition': 'true',
                'print.offset': 'true',
            },
            consumer_properties=ConsumerGroupTest.make_consumer_properties(
                consumer_properties, instance_id))

    def create_consumers(self,
                         consumer_count,
                         topic,
                         group,
                         static_members,
                         consumer_properties={}):

        consumers = []
        for i in range(0, consumer_count):
            instance_id = f"panda-consumer-{i}" if static_members else None
            consumers.append(
                self.create_consumer(topic,
                                     group=group,
                                     instance_id=instance_id,
                                     consumer_properties=consumer_properties))

        for c in consumers:
            c.start()
        return consumers

    @staticmethod
    def consumed_at_least(consumers, count):
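        # note: strictly greater-than, so each consumer must have received
        # more than `count` messages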
        return all([len(c._messages) > count for c in consumers])

    def validate_group_state(self, group, expected_state, static_members):
        rpk = RpkTool(self.redpanda)
        # validate group state
        rpk_group = rpk.group_describe(group)

        assert rpk_group.members == 2
        assert rpk_group.state == expected_state

        for p in rpk_group.partitions:
            if static_members:
                assert 'panda-consumer' in p.instance_id
            else:
                assert p.instance_id is None

    def setup_producer(self, p_cnt):
        # create topic
        self.topic_spec = TopicSpec(partition_count=p_cnt,
                                    replication_factor=3)
        self.client().create_topic(specs=self.topic_spec)
        # produce some messages to the topic
        self.producer = RpkProducer(self._ctx, self.redpanda,
                                    self.topic_spec.name, 128, 5000, -1)
        self.producer.start()

    @cluster(num_nodes=6)
    @parametrize(static_members=True)
    @parametrize(static_members=False)
    def test_basic_group_join(self, static_members):
        """
        Test validating that consumers are able to join the group and consume from the topic
        """

        self.setup_producer(20)
        group = 'test-gr-1'
        # use 2 consumers
        consumers = self.create_consumers(2,
                                          self.topic_spec.name,
                                          group,
                                          static_members=static_members)

        # wait for some messages
        wait_until(lambda: ConsumerGroupTest.consumed_at_least(consumers, 50),
                   30, 2)
        self.validate_group_state(group,
                                  expected_state="Stable",
                                  static_members=static_members)

        self.producer.wait()
        self.producer.free()

        for c in consumers:
            c.stop()
            c.wait()
            c.free()

    @cluster(num_nodes=6)
    def test_mixed_consumers_join(self):
        """
        Test validating that dynamic and static consumers may coexist in the same group
        """
        self.setup_producer(20)
        group = 'test-gr-1'
        consumers = []
        consumers.append(
            self.create_consumer(self.topic_spec.name, group,
                                 "panda-instance"))
        consumers.append(
            self.create_consumer(self.topic_spec.name, group, None))

        for c in consumers:
            c.start()

        # wait for some messages
        wait_until(lambda: ConsumerGroupTest.consumed_at_least(consumers, 50),
                   30, 2)

        rpk = RpkTool(self.redpanda)
        # validate group state
        rpk_group = rpk.group_describe(group)

        assert rpk_group.members == 2
        assert rpk_group.state == "Stable"

        static_members = set()
        dynamic_members = set()

        for p in rpk_group.partitions:
            if p.instance_id:
                static_members.add(p.client_id)
            else:
                dynamic_members.add(p.client_id)

        assert len(static_members) == 1
        assert len(dynamic_members) == 1

        self.producer.wait()
        self.producer.free()

        for c in consumers:
            c.stop()
            c.wait()
            c.free()

    def wait_for_members(self, group, members_count):
        rpk = RpkTool(self.redpanda)

        def group_stable():
            rpk_group = rpk.group_describe(group)
            return rpk_group.members == members_count and rpk_group.state == "Stable"

        return wait_until(group_stable, 30, 2)

    @cluster(num_nodes=6)
    @parametrize(static_members=True)
    @parametrize(static_members=False)
    def test_consumer_rejoin(self, static_members):
        """
        Test validating that a re-joining static member will not cause a rebalance
        """
        self.setup_producer(20)
        group = 'test-gr-1'

        consumers = self.create_consumers(
            2,
            self.topic_spec.name,
            group,
            static_members=static_members,
            consumer_properties={"session.timeout.ms": 40000})

        # wait for some messages
        wait_until(lambda: ConsumerGroupTest.consumed_at_least(consumers, 50),
                   30, 2)
        rpk = RpkTool(self.redpanda)
        # at this point we have 2 consumers in stable group
        self.validate_group_state(group,
                                  expected_state="Stable",
                                  static_members=static_members)

        # stop one of the consumers
        consumers[0].stop()
        consumers[0].wait()

        rpk_group = rpk.group_describe(group)
        if static_members:
            # with static members the group should still be in the Stable state
            assert rpk_group.state == "Stable"
            assert rpk_group.members == 2
        else:
            # a dynamic consumer requests to leave the group when it is shut
            # down gracefully, so the group shrinks to one member
            self.wait_for_members(group, 1)

        # start the consumer again and wait for it to come up
        consumers[0].start()
        consumers[0].wait_for_started()
        if static_members:
            # with static members the group should be stable immediately, as
            # the consumer rejoins with the same instance id
            self.validate_group_state(group,
                                      expected_state="Stable",
                                      static_members=static_members)
        else:
            # the group should get back to its original 2-member state
            self.wait_for_members(group, 2)

        self.producer.wait()
        self.producer.free()

        for c in consumers:
            c.stop()
            c.wait()
            c.free()

    @cluster(num_nodes=6)
    @parametrize(static_members=True)
    @parametrize(static_members=False)
    def test_consumer_is_removed_when_timedout(self, static_members):
        """
        Test validating that a consumer is evicted if it fails to deliver heartbeats to the broker
        """
        self.setup_producer(20)
        group = 'test-gr-1'
        # using short session timeout to make the test finish faster
        consumers = self.create_consumers(
            2,
            self.topic_spec.name,
            group,
            static_members=static_members,
            consumer_properties={"session.timeout.ms": 6000})

        # wait for some messages
        wait_until(lambda: ConsumerGroupTest.consumed_at_least(consumers, 50),
                   30, 2)
        rpk = RpkTool(self.redpanda)
        # at this point we have 2 consumers in stable group
        self.validate_group_state(group,
                                  expected_state="Stable",
                                  static_members=static_members)

        # stop one of the consumers
        consumers[0].stop()

        # wait for rebalance
        self.wait_for_members(group, 1)

        # start the consumer again
        consumers[0].start()

        # the group should get back to its original 2-member state
        self.wait_for_members(group, 2)
        self.validate_group_state(group,
                                  expected_state="Stable",
                                  static_members=static_members)

        self.producer.wait()
        self.producer.free()

        for c in consumers:
            c.stop()
            c.wait()
            c.free()
Example #8
    def test_leader_transfers_recovery(self, acks):
        """
        Validate that leadership transfers complete successfully
        under acks=1 writes that prompt the leader to frequently
        activate recovery_stm.

        When acks=1, this is a reproducer for
        https://github.com/vectorizedio/redpanda/issues/2580

        When acks=-1, this is a reproducer for
        https://github.com/vectorizedio/redpanda/issues/2606
        """

        leader_node_id, replicas = self._wait_for_leader()

        if acks == -1:
            producer = RpkProducer(self._ctx,
                                   self.redpanda,
                                   self.topic,
                                   16384,
                                   sys.maxsize,
                                   acks=acks)
        else:
            # To reproduce the acks=1 issue, we need an intermittent producer
            # that waits long enough between messages to let recovery_stm go
            # to sleep waiting for follower_state_change.

            # KafProducer is intermittent because it starts a fresh process for
            # each message, whereas RpkProducer writes a continuous stream.
            # TODO: create a test traffic generator that has inter-message
            # delay as an explicit parameter, rather than relying on implementation
            # details of the producer helpers.
            producer = KafProducer(self._ctx, self.redpanda, self.topic)

        producer.start()

        # Pass leadership around in a ring
        self.logger.info(f"Initial leader of {self.topic} is {leader_node_id}")

        transfer_count = 50

        # FIXME: with a transfer count >100, we tend to see
        # reactor stalls and corresponding nondeterministic behaviour/failures.
        # This appears unrelated to the functionality under test, something else
        # is tripping up the cluster when we have so many leadership transfers.
        # https://github.com/vectorizedio/redpanda/issues/2623

        admin = Admin(self.redpanda)

        initial_leader_id = leader_node_id
        for n in range(0, transfer_count):
            target_idx = (initial_leader_id + n) % len(self.redpanda.nodes)
            target_node_id = target_idx + 1
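            # node IDs are 1-based while the node list is 0-indexed, hence
            # the +1 when mapping an index to a node ID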

            self.logger.info(f"Starting transfer to {target_node_id}")
            admin.partition_transfer_leadership("kafka", self.topic, 0,
                                                target_node_id)

            self._wait_for_leader(
                lambda l: l is not None and l == target_node_id,
                timeout=ELECTION_TIMEOUT * 2)
            self.logger.info(f"Completed transfer to {target_node_id}")

        self.logger.info(f"Completed {transfer_count} transfers successfully")

        # Stop the producer explicitly so that we surface any errors
        producer.stop()
        producer.wait()
        producer.free()
Example #9
    def test_consumer_group_mirroring(self, source_type):
        # start the source and target clusters
        self.start_brokers(source_type=source_type)
        consumer_group = "test-group-1"
        # start mirror maker
        self.mirror_maker = MirrorMaker2(self.test_context,
                                         num_nodes=1,
                                         source_cluster=self.source_broker,
                                         target_cluster=self.redpanda,
                                         consumer_group_pattern=consumer_group,
                                         log_level="TRACE")
        self.mirror_maker.start()

        msg_size = 512
        msg_cnt = 1000000 if self.redpanda.dedicated_nodes else 100

        # produce some messages to the source broker
        producer = RpkProducer(self.test_context,
                               self.source_broker,
                               self.topic.name,
                               msg_size,
                               msg_cnt,
                               acks=-1)

        producer.start()
        producer.wait()
        producer.free()

        # consume a fifth of the messages from the source broker
        consumer = RpkConsumer(self.test_context,
                               self.source_broker,
                               self.topic.name,
                               ignore_errors=False,
                               retries=3,
                               group=consumer_group,
                               save_msgs=False,
                               num_msgs=int(msg_cnt / 5))

        consumer.start()
        consumer.wait()
        consumer.stop()
        source_messages = consumer.messages
        self.logger.info(f"source message count: {len(source_messages)}")
        consumer.free()

        src_rpk = RpkTool(self.source_broker)
        source_group = src_rpk.group_describe(consumer_group)
        target_rpk = RpkTool(self.redpanda)

        def target_group_equal():
            try:
                target_group = target_rpk.group_describe(consumer_group)
            except RpkException as e:
                # e.g. COORDINATOR_NOT_AVAILABLE
                self.logger.info(f"Error describing target cluster group: {e}")
                return False

            self.logger.info(
                f"source {source_group}, target_group: {target_group}")
            return target_group.partitions == source_group.partitions and target_group.name == source_group.name

        # wait for consumer group sync
        timeout = 600 if self.redpanda.dedicated_nodes else 60
        wait_until(target_group_equal, timeout_sec=timeout, backoff_sec=5)

        self.mirror_maker.stop()
Example #10
    def test_overlapping_changes(self):
        """
        Check that while a movement is in flight, rules about
        overlapping operations are properly enforced.
        """

        self.start_redpanda(num_nodes=4)
        node_ids = {1, 2, 3, 4}

        # Create topic with enough data that inter-node movement
        # will take a while.
        name = f"movetest"
        spec = TopicSpec(name=name, partition_count=1, replication_factor=3)
        self.client().create_topic(spec)

        # Wait for the partition to have a leader (`rpk produce` errors
        # out if it tries to write data before this)
        def partition_ready():
            return KafkaCat(self.redpanda).get_partition_leader(
                name, 0)[0] is not None

        wait_until(partition_ready, timeout_sec=10, backoff_sec=0.5)

        # Write a substantial amount of data to the topic
        msg_size = 512 * 1024
        write_bytes = 512 * 1024 * 1024
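        # i.e. 1024 messages of 512 KiB each, 512 MiB in total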
        producer = RpkProducer(self._ctx,
                               self.redpanda,
                               name,
                               msg_size=msg_size,
                               msg_count=int(write_bytes / msg_size))
        t1 = time.time()
        producer.start()

        # This is an absurdly low expected throughput, but necessarily
        # so to run reliably on current test runners, which share an EBS
        # backend among many parallel tests.  10MB/s has been empirically
        # shown to be too high an expectation.
        expect_bps = 1 * 1024 * 1024
        expect_runtime = write_bytes / expect_bps
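        # e.g. 512 MiB at 1 MiB/s gives a generous ~512 s timeout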
        producer.wait(timeout_sec=expect_runtime)

        self.logger.info(
            f"Write complete {write_bytes} in {time.time() - t1} seconds")

        # - Admin API redirects writes but not reads.  Because we want synchronous
        #   status after submitting operations, send all operations to the controller
        #   leader.  This is not necessary for operations to work, just to simplify
        #   this test by letting it see synchronous status updates.
        # - Because we will later verify that a 503 is sent in response to
        #   a move request to an in_progress topic, set retry_codes=[] to
        #   disable default retries on 503.
        admin_node = self.redpanda.controller()
        admin = Admin(self.redpanda, default_node=admin_node, retry_codes=[])

        # Start an inter-node move, which should take some time
        # to complete because of recovery network traffic
        assignments = self._get_assignments(admin, name, 0)
        new_node = list(node_ids - set([a['node_id'] for a in assignments]))[0]
        self.logger.info(f"old assignments: {assignments}")
        old_assignments = assignments
        assignments = assignments[1:] + [{'node_id': new_node, 'core': 0}]
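        # i.e. drop the first replica and append the new node, forcing
        # inter-node data movement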
        self.logger.info(f"new assignments: {assignments}")
        r = admin.set_partition_replicas(name, 0, assignments)
        r.raise_for_status()
        assert admin.get_partitions(name, 0)['status'] == "in_progress"

        # Another move should fail
        assert admin.get_partitions(name, 0)['status'] == "in_progress"
        try:
            r = admin.set_partition_replicas(name, 0, old_assignments)
        except requests.exceptions.HTTPError as e:
            assert e.response.status_code == 503
        else:
            raise RuntimeError(f"Expected 503 but got {r.status_code}")

        # An update to partition properties should succeed
        # (issue https://github.com/vectorizedio/redpanda/issues/2300)
        rpk = RpkTool(self.redpanda)
        assert admin.get_partitions(name, 0)['status'] == "in_progress"
        rpk.alter_topic_config(name, "retention.ms", "3600000")

        # A deletion should succeed
        assert name in rpk.list_topics()
        assert admin.get_partitions(name, 0)['status'] == "in_progress"
        rpk.delete_topic(name)
        assert name not in rpk.list_topics()