def test_pause_and_resume_sink(self, connect_protocol):
    """
    Verify that sink connectors stop consuming records when paused and begin again after
    being resumed.
    """
    self.CONNECT_PROTOCOL = connect_protocol
    self.setup_services()
    self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
    self.cc.start()

    # use the verifiable source to produce a steady stream of messages
    self.source = VerifiableSource(self.cc, topic=self.TOPIC)
    self.source.start()
    wait_until(lambda: len(self.source.committed_messages()) > 0, timeout_sec=30,
               err_msg="Timeout expired waiting for source task to produce a message")

    self.sink = VerifiableSink(self.cc, topics=[self.TOPIC])
    self.sink.start()
    wait_until(lambda: self.is_running(self.sink), timeout_sec=30,
               err_msg="Failed to see connector transition to the RUNNING state")

    self.cc.pause_connector(self.sink.name)

    # wait until all nodes report the paused transition
    for node in self.cc.nodes:
        wait_until(lambda: self.is_paused(self.sink, node), timeout_sec=30,
                   err_msg="Failed to see connector transition to the PAUSED state")

    # verify that we do not consume new messages while paused
    num_messages = len(self.sink.received_messages())
    time.sleep(10)
    assert num_messages == len(self.sink.received_messages()), \
        "Paused sink connector should not consume any messages"

    self.cc.resume_connector(self.sink.name)

    for node in self.cc.nodes:
        wait_until(lambda: self.is_running(self.sink, node), timeout_sec=30,
                   err_msg="Failed to see connector transition to the RUNNING state")

    # after resuming, we should see records consumed again
    wait_until(lambda: len(self.sink.received_messages()) > num_messages, timeout_sec=30,
               err_msg="Failed to consume messages after resuming sink connector")
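# Note: the connect_protocol argument implies these tests are parametrized. With ducktape this
# is typically done with a matrix decorator on each test method, e.g. (the exact protocol list
# here is an assumption, not confirmed by this section):
#
#     from ducktape.mark import matrix
#
#     @matrix(connect_protocol=['sessioned', 'compatible', 'eager'])
#     def test_pause_and_resume_sink(self, connect_protocol):
#         ...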
def test_pause_state_persistent(self, connect_protocol):
    """
    Verify that paused state is preserved after a cluster restart.
    """
    self.CONNECT_PROTOCOL = connect_protocol
    self.setup_services()
    self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
    self.cc.start()

    self.source = VerifiableSource(self.cc, topic=self.TOPIC)
    self.source.start()
    wait_until(lambda: self.is_running(self.source), timeout_sec=30,
               err_msg="Failed to see connector transition to the RUNNING state")

    self.cc.pause_connector(self.source.name)

    self.cc.restart()

    # allow extra time for workers running the 'compatible' protocol to rejoin after the restart
    if connect_protocol == 'compatible':
        timeout_sec = 120
    else:
        timeout_sec = 30

    # we should still be paused after restarting
    for node in self.cc.nodes:
        wait_until(lambda: self.is_paused(self.source, node), timeout_sec=timeout_sec,
                   err_msg="Failed to see connector startup in PAUSED state")
def test_pause_and_resume_source(self, connect_protocol):
    """
    Verify that source connectors stop producing records when paused and begin again after
    being resumed.
    """
    self.CONNECT_PROTOCOL = connect_protocol
    self.setup_services()
    self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
    self.cc.start()

    self.source = VerifiableSource(self.cc, topic=self.TOPIC)
    self.source.start()
    wait_until(lambda: self.is_running(self.source), timeout_sec=30,
               err_msg="Failed to see connector transition to the RUNNING state")

    self.cc.pause_connector(self.source.name)

    # wait until all nodes report the paused transition
    for node in self.cc.nodes:
        wait_until(lambda: self.is_paused(self.source, node), timeout_sec=30,
                   err_msg="Failed to see connector transition to the PAUSED state")

    # verify that we do not produce new messages while paused
    num_messages = len(self.source.sent_messages())
    time.sleep(10)
    assert num_messages == len(self.source.sent_messages()), \
        "Paused source connector should not produce any messages"

    self.cc.resume_connector(self.source.name)

    for node in self.cc.nodes:
        wait_until(lambda: self.is_running(self.source, node), timeout_sec=30,
                   err_msg="Failed to see connector transition to the RUNNING state")

    # after resuming, we should see records produced again
    wait_until(lambda: len(self.source.sent_messages()) > num_messages, timeout_sec=30,
               err_msg="Failed to produce messages after resuming source connector")
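# The checks above rely on is_running/is_paused helpers. If they are not already defined on
# this class, a minimal sketch might look like the following. It assumes the Connect REST API
# is reachable via self.cc.get_connector_status(name, node) and that ConnectRestError is raised
# while a worker is still coming up; both names are assumptions, not confirmed by this section.
def _connector_state(self, connector, state, node=None):
    # Treat REST errors as "state not yet visible" so callers can poll with wait_until.
    try:
        status = self.cc.get_connector_status(connector.name, node)
    except ConnectRestError:
        return False
    # The connector itself and every one of its tasks must report the expected state.
    return status is not None and status['connector']['state'] == state and \
        all(task['state'] == state for task in status['tasks'])

def is_running(self, connector, node=None):
    return self._connector_state(connector, 'RUNNING', node)

def is_paused(self, connector, node=None):
    return self._connector_state(connector, 'PAUSED', node)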
def test_bounce(self, clean):
    """
    Validates that source and sink tasks that run continuously and produce a predictable sequence of messages
    run correctly and deliver messages exactly once when Kafka Connect workers undergo rolling bounces,
    both clean and hard.
    """
    num_tasks = 3

    self.setup_services()
    self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
    self.cc.start()

    self.source = VerifiableSource(self.cc, tasks=num_tasks, throughput=100)
    self.source.start()
    self.sink = VerifiableSink(self.cc, tasks=num_tasks)
    self.sink.start()

    for _ in range(3):
        for node in self.cc.nodes:
            started = time.time()
            self.logger.info("%s bouncing Kafka Connect on %s", "Clean" if clean else "Hard", str(node.account))
            self.cc.stop_node(node, clean_shutdown=clean)
            with node.account.monitor_log(self.cc.LOG_FILE) as monitor:
                self.cc.start_node(node)
                monitor.wait_until("Starting connectors and tasks using config offset", timeout_sec=90,
                                   err_msg="Kafka Connect worker didn't successfully join group and start work")
            self.logger.info("Bounced Kafka Connect on %s and rejoined in %f seconds",
                             node.account, time.time() - started)

            # Give additional time for the consumer groups to recover. Even if it is not a hard bounce, there are
            # some cases where a restart can cause a rebalance to take the full length of the session timeout
            # (e.g. if the client shuts down before it has received the memberId from its initial JoinGroup).
            # If we don't give enough time for the group to stabilize, the next bounce may cause consumers to
            # be shut down before they have any time to process data and we can end up with zero data making it
            # through the test.
            time.sleep(15)

    self.source.stop()
    self.sink.stop()
    self.cc.stop()

    # Validate at-least-once delivery of everything that was reported as written, since we should have flushed
    # and cleanly exited. Currently this only tests at-least-once delivery because the sink task may not have
    # consumed all the messages generated by the source task. This needs to be done per-task since seqnos are
    # not unique across tasks.
    success = True
    errors = []
    allow_dups = not clean
    src_messages = self.source.messages()
    sink_messages = self.sink.messages()
    for task in range(num_tasks):
        # Validate source messages
        src_seqnos = [msg['seqno'] for msg in src_messages if msg['task'] == task]
        # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once
        # because clean bouncing should commit on rebalance.
        src_seqno_max = max(src_seqnos)
        self.logger.debug("Max source seqno: %d", src_seqno_max)
        src_seqno_counts = Counter(src_seqnos)
        missing_src_seqnos = sorted(set(range(src_seqno_max)).difference(set(src_seqnos)))
        duplicate_src_seqnos = sorted([seqno for seqno, count in src_seqno_counts.items() if count > 1])

        if missing_src_seqnos:
            self.logger.error("Missing source sequence numbers for task " + str(task))
            errors.append("Found missing source sequence numbers for task %d: %s" % (task, missing_src_seqnos))
            success = False
        if not allow_dups and duplicate_src_seqnos:
            self.logger.error("Duplicate source sequence numbers for task " + str(task))
            errors.append("Found duplicate source sequence numbers for task %d: %s" % (task, duplicate_src_seqnos))
            success = False

        # Validate sink messages
        sink_seqnos = [msg['seqno'] for msg in sink_messages if msg['task'] == task and 'flushed' in msg]
        # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once
        # because clean bouncing should commit on rebalance.
        sink_seqno_max = max(sink_seqnos)
        self.logger.debug("Max sink seqno: %d", sink_seqno_max)
        sink_seqno_counts = Counter(sink_seqnos)
        missing_sink_seqnos = sorted(set(range(sink_seqno_max)).difference(set(sink_seqnos)))
        duplicate_sink_seqnos = sorted([seqno for seqno, count in sink_seqno_counts.items() if count > 1])

        if missing_sink_seqnos:
            self.logger.error("Missing sink sequence numbers for task " + str(task))
            errors.append("Found missing sink sequence numbers for task %d: %s" % (task, missing_sink_seqnos))
            success = False
        if not allow_dups and duplicate_sink_seqnos:
            self.logger.error("Duplicate sink sequence numbers for task " + str(task))
            errors.append("Found duplicate sink sequence numbers for task %d: %s" % (task, duplicate_sink_seqnos))
            success = False

        # Validate source and sink match: the sink should never see a seqno the source did not produce
        if sink_seqno_max > src_seqno_max:
            self.logger.error("Found sink sequence number greater than any generated source sequence number for task %d: %d > %d",
                              task, sink_seqno_max, src_seqno_max)
            errors.append("Found sink sequence number greater than any generated source sequence number for task %d: %d > %d"
                          % (task, sink_seqno_max, src_seqno_max))
            success = False

        if src_seqno_max < 1000 or sink_seqno_max < 1000:
            errors.append("Not enough messages were processed: source:%d sink:%d" % (src_seqno_max, sink_seqno_max))
            success = False

    if not success:
        self.mark_for_collect(self.cc)
        # Also collect the data in the topic to aid in debugging
        consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, self.source.topic,
                                             consumer_timeout_ms=1000, print_key=True)
        consumer_validator.run()
        self.mark_for_collect(consumer_validator, "consumer_stdout")

    assert success, "Found validation errors:\n" + "\n  ".join(errors)
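# The source and sink validation loops in test_bounce are nearly identical. A hypothetical
# helper like the sketch below could collapse them; '_validate_seqnos' and its parameters are
# illustrative names, not part of the original test.
def _validate_seqnos(self, kind, seqnos, allow_dups, task, errors):
    # Every seqno up to the largest observed one should appear exactly once (duplicates are
    # tolerated only for hard bounces, where allow_dups is True).
    seqno_max = max(seqnos)
    counts = Counter(seqnos)
    missing = sorted(set(range(seqno_max)).difference(set(seqnos)))
    duplicates = sorted(seqno for seqno, count in counts.items() if count > 1)
    if missing:
        self.logger.error("Missing %s sequence numbers for task %d", kind, task)
        errors.append("Found missing %s sequence numbers for task %d: %s" % (kind, task, missing))
    if not allow_dups and duplicates:
        self.logger.error("Duplicate %s sequence numbers for task %d", kind, task)
        errors.append("Found duplicate %s sequence numbers for task %d: %s" % (kind, task, duplicates))
    # Return the max seqno so callers can compare source vs. sink progress, plus a pass/fail flag.
    return seqno_max, not (missing or (not allow_dups and duplicates))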