Example #1
 def test_self_transfer(self):
     admin = Admin(self.redpanda)
     for topic in self.topics:
         for partition in range(topic.partition_count):
             leader = admin.get_partitions(topic, partition)['leader_id']
             admin.partition_transfer_leadership("kafka", topic, partition,
                                                 leader)
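Example #1 transfers leadership of every partition back to its current leader, so each call is a self-transfer that is expected to succeed. Below is a minimal sketch (a hypothetical helper, not part of the original test) of how the same `Admin.get_partitions` lookup plus ducktape's `wait_until` could confirm the leader is unchanged afterwards:

from ducktape.utils.util import wait_until

def assert_leader_unchanged(admin, topic, partition, expected_leader, timeout_sec=30):
    # Poll the admin API until it reports the expected leader, or fail the wait.
    def leader_is_expected():
        return admin.get_partitions(topic, partition)['leader_id'] == expected_leader

    wait_until(leader_is_expected,
               timeout_sec=timeout_sec,
               backoff_sec=1,
               err_msg=f"Leader of {topic}/{partition} is no longer {expected_leader}")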
Example #2
    def test_controller_recovery(self):
        kc = KafkaCat(self.redpanda)

        # choose a partition and a target node
        partition = self._get_partition(kc)
        target_node_id = next(
            filter(lambda r: r["id"] != partition["leader"],
                   partition["replicas"]))["id"]
        self.logger.debug(
            f"Transfering leader from {partition['leader']} to {target_node_id}"
        )

        # build the transfer url
        meta = kc.metadata()
        brokers = meta["brokers"]
        source_broker = next(
            filter(lambda b: b["id"] == partition["leader"], brokers))
        target_broker = next(
            filter(lambda b: b["id"] == target_node_id, brokers))
        self.logger.debug(f"Source broker {source_broker}")
        self.logger.debug(f"Target broker {target_broker}")

        # Send the request to any host, they should redirect to
        # the leader of the partition.
        partition_id = partition['partition']

        admin = Admin(self.redpanda)
        admin.partition_transfer_leadership("kafka", self.topic, partition_id,
                                            target_node_id)

        def transfer_complete():
            for _ in range(3):  # just give it a moment
                time.sleep(1)
                meta = kc.metadata()
                partition = next(
                    filter(lambda p: p["partition"] == partition_id,
                           meta["topics"][0]["partitions"]))
                if partition["leader"] == target_node_id:
                    return True
            return False

        wait_until(lambda: transfer_complete(),
                   timeout_sec=30,
                   backoff_sec=5,
                   err_msg="Transfer did not complete")
Example #3
    def _transfer_leadership(self, admin: Admin, namespace: str, topic: str,
                             target_node_id: int) -> None:

        last_log_msg = ""  # avoid spamming log

        def leader_predicate(l: Optional[int]) -> bool:
            nonlocal last_log_msg, target_node_id
            if not l:
                return False
            if l != target_node_id:  # type: ignore
                log_msg = f'Still waiting for leader {target_node_id}, got {l}'
                if log_msg != last_log_msg:  # type: ignore # "unbound"
                    self.logger.info(log_msg)
                    last_log_msg = log_msg
                return False
            return True

        retry_once = True
        while True:
            self.logger.info(f"Starting transfer to {target_node_id}")
            admin.partition_transfer_leadership("kafka", topic, 0,
                                                target_node_id)
            try:
                self._wait_for_leader(leader_predicate,
                                      timeout=ELECTION_TIMEOUT * 2)
            except ducktape.errors.TimeoutError as e:
                if retry_once:
                    self.logger.info(
                        f'Failed to get desired leader, retrying once.')
                    retry_once = False
                    continue
                else:
                    raise e
            break  # no exception -> success, we can return now

        self.logger.info(f"Completed transfer to {target_node_id}")

    def test_leader_transfers_recovery(self, acks):
        """
        Validate that leadership transfers complete successfully
        under acks=1 writes that prompt the leader to frequently
        activate recovery_stm.

        When acks=1, this is a reproducer for
        https://github.com/vectorizedio/redpanda/issues/2580

        When acks=-1, this is a reproducer for
        https://github.com/vectorizedio/redpanda/issues/2606
        """

        leader_node_id, replicas = self._wait_for_leader()

        if acks == -1:
            producer = RpkProducer(self._ctx,
                                   self.redpanda,
                                   self.topic,
                                   16384,
                                   sys.maxsize,
                                   acks=acks)
        else:
            # To reproduce acks=1 issue, we need an intermittent producer that
            # waits long enough between messages to let recovery_stm go to sleep
            # waiting for follower_state_change

            # KafProducer is intermittent because it starts a fresh process for
            # each message, whereas RpkProducer writes a continuous stream.
            # TODO: create a test traffic generator that has inter-message
            # delay as an explicit parameter, rather than relying on implementation
            # details of the producer helpers.
            producer = KafProducer(self._ctx, self.redpanda, self.topic)

        producer.start()

        # Pass leadership around in a ring
        self.logger.info(f"Initial leader of {self.topic} is {leader_node_id}")

        transfer_count = 50

        # FIXME: with a transfer count >100, we tend to see
        # reactor stalls and corresponding nondeterministic behaviour/failures.
        # This appears unrelated to the functionality under test, something else
        # is tripping up the cluster when we have so many leadership transfers.
        # https://github.com/vectorizedio/redpanda/issues/2623

        admin = Admin(self.redpanda)

        initial_leader_id = leader_node_id
        for n in range(0, transfer_count):
            target_idx = (initial_leader_id + n) % len(self.redpanda.nodes)
            target_node_id = target_idx + 1

            self.logger.info(f"Starting transfer to {target_node_id}")
            admin.partition_transfer_leadership("kafka", self.topic, 0,
                                                target_node_id)

            self._wait_for_leader(
                lambda l: l is not None and l == target_node_id,
                timeout=ELECTION_TIMEOUT * 2)
            self.logger.info(f"Completed transfer to {target_node_id}")

        self.logger.info(f"Completed {transfer_count} transfers successfully")

        # Explicit stop of producer so that we see any errors
        producer.stop()
        producer.wait()
        producer.free()
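The ring in Example #3 turns a 0-based index into the 1-based node ids that `partition_transfer_leadership` receives, so leadership cycles through every node. A quick illustration of that arithmetic, assuming a 3-node cluster with an initial leader id of 2 (values chosen only for the example):

# Illustrative only: stand-ins for len(self.redpanda.nodes) and the first leader id.
node_count = 3
initial_leader_id = 2

targets = [((initial_leader_id + n) % node_count) + 1 for n in range(6)]
print(targets)  # [3, 1, 2, 3, 1, 2] -- leadership walks the nodes in a ring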