Code Example #1
    def test_pause_state_persistent(self):
        """
        Verify that paused state is preserved after a cluster restart.
        """

        self.setup_services()
        self.cc.set_configs(lambda node: self.render(
            "connect-distributed.properties", node=node))
        self.cc.start()

        self.source = VerifiableSource(self.cc)
        self.source.start()

        wait_until(
            lambda: self.is_running(self.source),
            timeout_sec=30,
            err_msg="Failed to see connector transition to the RUNNING state")

        self.cc.pause_connector(self.source.name)

        self.cc.restart()

        # we should still be paused after restarting
        for node in self.cc.nodes:
            wait_until(
                lambda: self.is_paused(self.source, node),
                timeout_sec=30,
                err_msg="Failed to see connector startup in PAUSED state")
Code Example #2
    def test_pause_and_resume_sink(self, connect_protocol):
        """
        Verify that sink connectors stop consuming records when paused and begin again after
        being resumed.
        """

        self.CONNECT_PROTOCOL = connect_protocol
        self.setup_services()
        self.cc.set_configs(lambda node: self.render(
            "connect-distributed.properties", node=node))
        self.cc.start()

        # use the verifiable source to produce a steady stream of messages
        self.source = VerifiableSource(self.cc, topic=self.TOPIC)
        self.source.start()

        wait_until(
            lambda: len(self.source.committed_messages()) > 0,
            timeout_sec=30,
            err_msg=
            "Timeout expired waiting for source task to produce a message")

        self.sink = VerifiableSink(self.cc, topics=[self.TOPIC])
        self.sink.start()

        wait_until(
            lambda: self.is_running(self.sink),
            timeout_sec=30,
            err_msg="Failed to see connector transition to the RUNNING state")

        self.cc.pause_connector(self.sink.name)

        # wait until all nodes report the paused transition
        for node in self.cc.nodes:
            wait_until(
                lambda: self.is_paused(self.sink, node),
                timeout_sec=30,
                err_msg="Failed to see connector transition to the PAUSED state"
            )

        # verify that we do not consume new messages while paused
        num_messages = len(self.sink.received_messages())
        time.sleep(10)
        assert num_messages == len(self.sink.received_messages()), \
            "Paused sink connector should not consume any messages"

        self.cc.resume_connector(self.sink.name)

        for node in self.cc.nodes:
            wait_until(
                lambda: self.is_running(self.sink, node),
                timeout_sec=30,
                err_msg=
                "Failed to see connector transition to the RUNNING state")

        # after resuming, we should see records consumed again
        wait_until(
            lambda: len(self.sink.received_messages()) > num_messages,
            timeout_sec=30,
            err_msg="Failed to consume messages after resuming sink connector")
Code Example #3
    def test_pause_state_persistent(self, connect_protocol):
        """
        Verify that paused state is preserved after a cluster restart.
        """

        self.CONNECT_PROTOCOL = connect_protocol
        self.setup_services()
        self.cc.set_configs(lambda node: self.render(
            "connect-distributed.properties", node=node))
        self.cc.start()

        self.source = VerifiableSource(self.cc, topic=self.TOPIC)
        self.source.start()

        wait_until(
            lambda: self.is_running(self.source),
            timeout_sec=30,
            err_msg="Failed to see connector transition to the RUNNING state")

        self.cc.pause_connector(self.source.name)

        self.cc.restart()

        if connect_protocol == 'compatible':
            timeout_sec = 120
        else:
            timeout_sec = 30

        # we should still be paused after restarting
        for node in self.cc.nodes:
            wait_until(
                lambda: self.is_paused(self.source, node),
                timeout_sec=timeout_sec,
                err_msg="Failed to see connector startup in PAUSED state")
Code Example #4
    def test_pause_and_resume_source(self, connect_protocol):
        """
        Verify that source connectors stop producing records when paused and begin again after
        being resumed.
        """

        self.CONNECT_PROTOCOL = connect_protocol
        self.setup_services()
        self.cc.set_configs(lambda node: self.render(
            "connect-distributed.properties", node=node))
        self.cc.start()

        self.source = VerifiableSource(self.cc, topic=self.TOPIC)
        self.source.start()

        wait_until(
            lambda: self.is_running(self.source),
            timeout_sec=30,
            err_msg="Failed to see connector transition to the RUNNING state")

        self.cc.pause_connector(self.source.name)

        # wait until all nodes report the paused transition
        for node in self.cc.nodes:
            wait_until(
                lambda: self.is_paused(self.source, node),
                timeout_sec=30,
                err_msg="Failed to see connector transition to the PAUSED state"
            )

        # verify that we do not produce new messages while paused
        num_messages = len(self.source.sent_messages())
        time.sleep(10)
        assert num_messages == len(self.source.sent_messages()), \
            "Paused source connector should not produce any messages"

        self.cc.resume_connector(self.source.name)

        for node in self.cc.nodes:
            wait_until(
                lambda: self.is_running(self.source, node),
                timeout_sec=30,
                err_msg=
                "Failed to see connector transition to the RUNNING state")

        # after resuming, we should see records produced again
        wait_until(
            lambda: len(self.source.sent_messages()) > num_messages,
            timeout_sec=30,
            err_msg="Failed to produce messages after resuming source connector"
        )
Code Example #5
    def test_bounce(self, clean):
        """
        Validates that source and sink tasks that run continuously and produce a predictable sequence of messages
        run correctly and deliver messages exactly once when Kafka Connect workers undergo clean rolling bounces,
        and at least once when the bounces are hard.
        """
        num_tasks = 3

        self.setup_services()
        self.cc.set_configs(lambda node: self.render(
            "connect-distributed.properties", node=node))
        self.cc.start()

        self.source = VerifiableSource(self.cc,
                                       tasks=num_tasks,
                                       throughput=100)
        self.source.start()
        self.sink = VerifiableSink(self.cc, tasks=num_tasks)
        self.sink.start()

        for _ in range(3):
            for node in self.cc.nodes:
                started = time.time()
                self.logger.info("%s bouncing Kafka Connect on %s",
                                 clean and "Clean" or "Hard",
                                 str(node.account))
                self.cc.stop_node(node, clean_shutdown=clean)
                with node.account.monitor_log(self.cc.LOG_FILE) as monitor:
                    self.cc.start_node(node)
                    monitor.wait_until(
                        "Starting connectors and tasks using config offset",
                        timeout_sec=90,
                        err_msg=
                        "Kafka Connect worker didn't successfully join group and start work"
                    )
                self.logger.info(
                    "Bounced Kafka Connect on %s and rejoined in %f seconds",
                    node.account,
                    time.time() - started)

                # Give additional time for the consumer groups to recover. Even if it is not a hard bounce, there are
                # some cases where a restart can cause a rebalance to take the full length of the session timeout
                # (e.g. if the client shuts down before it has received the memberId from its initial JoinGroup).
                # If we don't give enough time for the group to stabilize, the next bounce may cause consumers to
                # be shut down before they have any time to process data and we can end up with zero data making it
                # through the test.
                time.sleep(15)

        self.source.stop()
        self.sink.stop()
        self.cc.stop()

        # Validate at least once delivery of everything that was reported as written since we should have flushed and
        # cleanly exited. Currently this only tests at least once delivery because the sink task may not have consumed
        # all the messages generated by the source task. This needs to be done per-task since seqnos are not unique across
        # tasks.
        success = True
        errors = []
        allow_dups = not clean
        src_messages = self.source.messages()
        sink_messages = self.sink.messages()
        for task in range(num_tasks):
            # Validate source messages
            src_seqnos = [
                msg['seqno'] for msg in src_messages if msg['task'] == task
            ]
            # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once because clean
            # bouncing should commit on rebalance.
            src_seqno_max = max(src_seqnos)
            self.logger.debug("Max source seqno: %d", src_seqno_max)
            src_seqno_counts = Counter(src_seqnos)
            missing_src_seqnos = sorted(
                set(range(src_seqno_max)).difference(set(src_seqnos)))
            duplicate_src_seqnos = sorted([
                seqno for seqno, count in src_seqno_counts.items()
                if count > 1
            ])

            if missing_src_seqnos:
                self.logger.error("Missing source sequence numbers for task " +
                                  str(task))
                errors.append(
                    "Found missing source sequence numbers for task %d: %s" %
                    (task, missing_src_seqnos))
                success = False
            if not allow_dups and duplicate_src_seqnos:
                self.logger.error(
                    "Duplicate source sequence numbers for task " + str(task))
                errors.append(
                    "Found duplicate source sequence numbers for task %d: %s" %
                    (task, duplicate_src_seqnos))
                success = False

            # Validate sink messages
            sink_seqnos = [
                msg['seqno'] for msg in sink_messages
                if msg['task'] == task and 'flushed' in msg
            ]
            # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once because
            # clean bouncing should commit on rebalance.
            sink_seqno_max = max(sink_seqnos)
            self.logger.debug("Max sink seqno: %d", sink_seqno_max)
            sink_seqno_counts = Counter(sink_seqnos)
            missing_sink_seqnos = sorted(
                set(range(sink_seqno_max)).difference(set(sink_seqnos)))
            duplicate_sink_seqnos = sorted([
                seqno for seqno, count in sink_seqno_counts.items()
                if count > 1
            ])

            if missing_sink_seqnos:
                self.logger.error("Missing sink sequence numbers for task " +
                                  str(task))
                errors.append(
                    "Found missing sink sequence numbers for task %d: %s" %
                    (task, missing_sink_seqnos))
                success = False
            if not allow_dups and duplicate_sink_seqnos:
                self.logger.error("Duplicate sink sequence numbers for task " +
                                  str(task))
                errors.append(
                    "Found duplicate sink sequence numbers for task %d: %s" %
                    (task, duplicate_sink_seqnos))
                success = False

            # Validate source and sink match
            if sink_seqno_max > src_seqno_max:
                self.logger.error(
                    "Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d",
                    task, sink_seqno_max, src_seqno_max)
                errors.append(
                    "Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d"
                    % (task, sink_seqno_max, src_seqno_max))
                success = False
            if src_seqno_max < 1000 or sink_seqno_max < 1000:
                errors.append(
                    "Not enough messages were processed: source:%d sink:%d" %
                    (src_seqno_max, sink_seqno_max))
                success = False

        if not success:
            self.mark_for_collect(self.cc)
            # Also collect the data in the topic to aid in debugging
            consumer_validator = ConsoleConsumer(self.test_context,
                                                 1,
                                                 self.kafka,
                                                 self.source.topic,
                                                 consumer_timeout_ms=1000,
                                                 print_key=True)
            consumer_validator.run()
            self.mark_for_collect(consumer_validator, "consumer_stdout")

        assert success, "Found validation errors:\n" + "\n  ".join(errors)