def test_quota(self, quota_type, override_quota=True, producer_num=1, consumer_num=1,
               old_broker_throttling_behavior=False, old_client_throttling_behavior=False):
    """
    Produce a batch of records, consume them back, and validate the run via self.validate().

    :param quota_type: forwarded to QuotaConfig
    :param override_quota: forwarded to QuotaConfig
    :param producer_num: second positional argument of ProducerPerformanceService
                         (presumably the number of producer nodes -- confirm against the service)
    :param consumer_num: second positional argument of ConsoleConsumer
                         (presumably the number of consumer nodes -- confirm against the service)
    :param old_broker_throttling_behavior: when True, the brokers are switched to LATEST_1_1 before being started
    :param old_client_throttling_behavior: when True, producer and consumer run LATEST_1_1 instead of DEV_BRANCH
    """
    # Old (pre-2.0) throttling behavior for broker throttles before sending a response to the client.
    if old_broker_throttling_behavior:
        self.kafka.set_version(LATEST_1_1)
    # The brokers are started here (after the optional version switch), in both cases.
    self.kafka.start()

    self.quota_config = QuotaConfig(quota_type, override_quota, self.kafka)
    producer_client_id = self.quota_config.client_id
    consumer_client_id = self.quota_config.client_id

    # Old (pre-2.0) throttling behavior for client does not throttle upon receiving a response with a
    # non-zero throttle time.
    if old_client_throttling_behavior:
        client_version = LATEST_1_1
    else:
        client_version = DEV_BRANCH

    # Produce all messages
    producer = ProducerPerformanceService(
        self.test_context, producer_num, self.kafka,
        topic=self.topic, num_records=self.num_records, record_size=self.record_size, throughput=-1,
        client_id=producer_client_id, version=client_version)

    producer.run()

    # Consume all messages
    consumer = ConsoleConsumer(
        self.test_context, consumer_num, self.kafka, self.topic,
        consumer_timeout_ms=60000, client_id=consumer_client_id,
        jmx_object_names=['kafka.consumer:type=consumer-fetch-manager-metrics,client-id=%s' % consumer_client_id],
        jmx_attributes=['bytes-consumed-rate'], version=client_version)
    consumer.run()

    # items() rather than the Python 2-only iteritems(), so the check runs on Python 2 and Python 3 alike.
    for idx, messages in consumer.messages_consumed.items():
        assert len(messages) > 0, "consumer %d didn't consume any message before timeout" % idx

    success, msg = self.validate(self.kafka, producer, consumer)
    assert success, msg
def test_transformations(self, connect_protocol):
    """
    Run a file source connector with a HoistField -> InsertField transformation chain and check
    that every record read back from the topic has the expected schema, content and timestamp.

    :param connect_protocol: stored on self.CONNECT_PROTOCOL before the services are set up
    """
    self.CONNECT_PROTOCOL = connect_protocol
    self.setup_services(timestamp_type='CreateTime')
    self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
    self.cc.start()

    timestamp_field = 'the_timestamp'

    NamedConnector = namedtuple('Connector', ['name'])
    source_connector = NamedConnector(name='file-src')

    connector_config = {
        'name': source_connector.name,
        'connector.class': 'org.apache.kafka.connect.file.FileStreamSourceConnector',
        'tasks.max': 1,
        'file': self.INPUT_FILE,
        'topic': self.TOPIC,
        'transforms': 'hoistToStruct,insertTimestampField',
        'transforms.hoistToStruct.type': 'org.apache.kafka.connect.transforms.HoistField$Value',
        'transforms.hoistToStruct.field': 'content',
        'transforms.insertTimestampField.type': 'org.apache.kafka.connect.transforms.InsertField$Value',
        'transforms.insertTimestampField.timestamp.field': timestamp_field,
    }
    self.cc.create_connector(connector_config)

    wait_until(lambda: self.connector_is_running(source_connector),
               timeout_sec=30,
               err_msg='Failed to see connector transition to the RUNNING state')

    # Append the first batch of input lines to the source file on every Connect node.
    append_cmd = "echo -e -n " + repr(self.FIRST_INPUTS) + " >> " + self.INPUT_FILE
    for node in self.cc.nodes:
        node.account.ssh(append_cmd)

    consumer = ConsoleConsumer(self.test_context, 1, self.kafka, self.TOPIC,
                               consumer_timeout_ms=15000, print_timestamp=True)
    consumer.run()

    records = consumer.messages_consumed[1]
    assert len(records) == len(self.FIRST_INPUT_LIST)

    content_field = {'field': 'content', 'type': 'string', 'optional': False}
    inserted_ts_field = {
        'field': timestamp_field,
        'name': 'org.apache.kafka.connect.data.Timestamp',
        'type': 'int64',
        'version': 1,
        'optional': True,
    }
    expected_schema = {
        'type': 'struct',
        'fields': [content_field, inserted_ts_field],
        'optional': False,
    }

    # Each consumed line is "<CreateTime:millis>\t<json value>" because print_timestamp is enabled.
    ts_prefix = 'CreateTime:'
    for record in records:
        ts_info, value = record.split('\t')
        assert ts_info.startswith(ts_prefix)
        record_ts = int(ts_info[len(ts_prefix):])
        parsed = json.loads(value)
        assert parsed['schema'] == expected_schema
        payload = parsed['payload']
        assert payload['content'] in self.FIRST_INPUT_LIST
        assert payload[timestamp_field] == record_ts
def test_transformations(self):
    """
    Run a file source connector with a HoistField -> InsertField transformation chain and check
    that every record read back from the topic has the expected schema, content and timestamp.
    """
    self.setup_services(timestamp_type='CreateTime')
    self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
    self.cc.start()

    timestamp_field = 'the_timestamp'

    NamedConnector = namedtuple('Connector', ['name'])
    source_connector = NamedConnector(name='file-src')

    connector_config = {
        'name': source_connector.name,
        'connector.class': 'org.apache.kafka.connect.file.FileStreamSourceConnector',
        'tasks.max': 1,
        'file': self.INPUT_FILE,
        'topic': self.TOPIC,
        'transforms': 'hoistToStruct,insertTimestampField',
        'transforms.hoistToStruct.type': 'org.apache.kafka.connect.transforms.HoistField$Value',
        'transforms.hoistToStruct.field': 'content',
        'transforms.insertTimestampField.type': 'org.apache.kafka.connect.transforms.InsertField$Value',
        'transforms.insertTimestampField.timestamp.field': timestamp_field,
    }
    self.cc.create_connector(connector_config)

    wait_until(lambda: self.connector_is_running(source_connector),
               timeout_sec=30,
               err_msg='Failed to see connector transition to the RUNNING state')

    # Append the first batch of input lines to the source file on every Connect node.
    append_cmd = "echo -e -n " + repr(self.FIRST_INPUTS) + " >> " + self.INPUT_FILE
    for node in self.cc.nodes:
        node.account.ssh(append_cmd)

    consumer = ConsoleConsumer(self.test_context, 1, self.kafka, self.TOPIC,
                               consumer_timeout_ms=15000, print_timestamp=True)
    consumer.run()

    records = consumer.messages_consumed[1]
    assert len(records) == len(self.FIRST_INPUT_LIST)

    content_field = {'field': 'content', 'type': 'string', 'optional': False}
    inserted_ts_field = {
        'field': timestamp_field,
        'name': 'org.apache.kafka.connect.data.Timestamp',
        'type': 'int64',
        'version': 1,
        'optional': True,
    }
    expected_schema = {
        'type': 'struct',
        'fields': [content_field, inserted_ts_field],
        'optional': False,
    }

    # Each consumed line is "<CreateTime:millis>\t<json value>" because print_timestamp is enabled.
    ts_prefix = 'CreateTime:'
    for record in records:
        ts_info, value = record.split('\t')
        assert ts_info.startswith(ts_prefix)
        record_ts = int(ts_info[len(ts_prefix):])
        parsed = json.loads(value)
        assert parsed['schema'] == expected_schema
        payload = parsed['payload']
        assert payload['content'] in self.FIRST_INPUT_LIST
        assert payload[timestamp_field] == record_ts
def test_quota(self, quota_type, override_quota=True, producer_num=1, consumer_num=1):
    """
    Produce a batch of records, consume them back, and validate the run via self.validate().

    :param quota_type: forwarded to QuotaConfig
    :param override_quota: forwarded to QuotaConfig
    :param producer_num: second positional argument of ProducerPerformanceService
                         (presumably the number of producer nodes -- confirm against the service)
    :param consumer_num: second positional argument of ConsoleConsumer
                         (presumably the number of consumer nodes -- confirm against the service)
    """
    self.quota_config = QuotaConfig(quota_type, override_quota, self.kafka)
    # Producer and consumer deliberately share the client id taken from the quota config.
    producer_client_id = self.quota_config.client_id
    consumer_client_id = self.quota_config.client_id

    # Produce all messages
    producer = ProducerPerformanceService(
        self.test_context, producer_num, self.kafka,
        topic=self.topic, num_records=self.num_records, record_size=self.record_size, throughput=-1,
        client_id=producer_client_id,
        jmx_object_names=['kafka.producer:type=producer-metrics,client-id=%s' % producer_client_id],
        jmx_attributes=['outgoing-byte-rate'])

    producer.run()

    # Consume all messages
    consumer = ConsoleConsumer(
        self.test_context, consumer_num, self.kafka, self.topic,
        new_consumer=True,
        consumer_timeout_ms=60000, client_id=consumer_client_id,
        jmx_object_names=['kafka.consumer:type=consumer-fetch-manager-metrics,client-id=%s' % consumer_client_id],
        jmx_attributes=['bytes-consumed-rate'])
    consumer.run()

    # items() rather than the Python 2-only iteritems(), so the check runs on Python 2 and Python 3 alike.
    for idx, messages in consumer.messages_consumed.items():
        assert len(messages) > 0, "consumer %d didn't consume any message before timeout" % idx

    success, msg = self.validate(self.kafka, producer, consumer)
    assert success, msg
def test_quota(self, producer_id='default_id', producer_num=1, consumer_id='default_id', consumer_num=1):
    """
    Produce a batch of records, consume them back, and validate the run via self.validate().

    :param producer_id: client id given to the producer; also used in its JMX object name
    :param producer_num: second positional argument of ProducerPerformanceService
                         (presumably the number of producer nodes -- confirm against the service)
    :param consumer_id: client id given to the consumer; also used in its JMX object name
    :param consumer_num: second positional argument of ConsoleConsumer
                         (presumably the number of consumer nodes -- confirm against the service)
    """
    # Produce all messages
    producer = ProducerPerformanceService(
        self.test_context, producer_num, self.kafka, security_protocol=self.security_protocol,
        topic=self.topic, num_records=self.num_records, record_size=self.record_size, throughput=-1,
        client_id=producer_id,
        jmx_object_names=['kafka.producer:type=producer-metrics,client-id=%s' % producer_id],
        jmx_attributes=['outgoing-byte-rate'])

    producer.run()

    # Consume all messages
    consumer = ConsoleConsumer(
        self.test_context, consumer_num, self.kafka, self.topic,
        security_protocol=self.security_protocol, new_consumer=False,
        consumer_timeout_ms=60000, client_id=consumer_id,
        jmx_object_names=['kafka.consumer:type=ConsumerTopicMetrics,name=BytesPerSec,clientId=%s' % consumer_id],
        jmx_attributes=['OneMinuteRate'])
    consumer.run()

    # items() rather than the Python 2-only iteritems(), so the check runs on Python 2 and Python 3 alike.
    for idx, messages in consumer.messages_consumed.items():
        assert len(messages) > 0, "consumer %d didn't consume any message before timeout" % idx

    success, msg = self.validate(self.kafka, producer, consumer)
    assert success, msg
def test_quota(self, producer_id='default_id', producer_num=1, consumer_id='default_id', consumer_num=1):
    """
    Produce a batch of records, consume them back, and validate the run via self.validate().

    :param producer_id: client id given to the producer; also used in its JMX object name
    :param producer_num: second positional argument of ProducerPerformanceService
                         (presumably the number of producer nodes -- confirm against the service)
    :param consumer_id: client id given to the consumer; also used in its JMX object name
    :param consumer_num: second positional argument of ConsoleConsumer
                         (presumably the number of consumer nodes -- confirm against the service)
    """
    # Produce all messages
    producer = ProducerPerformanceService(
        self.test_context, producer_num, self.kafka,
        topic=self.topic, num_records=self.num_records, record_size=self.record_size, throughput=-1,
        client_id=producer_id,
        jmx_object_names=['kafka.producer:type=producer-metrics,client-id=%s' % producer_id],
        jmx_attributes=['outgoing-byte-rate'])

    producer.run()

    # Consume all messages
    consumer = ConsoleConsumer(
        self.test_context, consumer_num, self.kafka, self.topic,
        new_consumer=False,
        consumer_timeout_ms=60000, client_id=consumer_id,
        jmx_object_names=['kafka.consumer:type=ConsumerTopicMetrics,name=BytesPerSec,clientId=%s' % consumer_id],
        jmx_attributes=['OneMinuteRate'])
    consumer.run()

    # items() rather than the Python 2-only iteritems(), so the check runs on Python 2 and Python 3 alike.
    for idx, messages in consumer.messages_consumed.items():
        assert len(messages) > 0, "consumer %d didn't consume any message before timeout" % idx

    success, msg = self.validate(self.kafka, producer, consumer)
    assert success, msg
def test_quota(self, quota_type, override_quota=True, producer_num=1, consumer_num=1,
               old_broker_throttling_behavior=False, old_client_throttling_behavior=False):
    """
    Produce a batch of records, consume them back, and validate the run via self.validate().

    :param quota_type: forwarded to QuotaConfig
    :param override_quota: forwarded to QuotaConfig
    :param producer_num: second positional argument of ProducerPerformanceService
                         (presumably the number of producer nodes -- confirm against the service)
    :param consumer_num: second positional argument of ConsoleConsumer
                         (presumably the number of consumer nodes -- confirm against the service)
    :param old_broker_throttling_behavior: when True, the brokers are switched to LATEST_1_1 before being started
    :param old_client_throttling_behavior: when True, producer and consumer run LATEST_1_1 instead of DEV_BRANCH
    """
    # Old (pre-2.0) throttling behavior for broker throttles before sending a response to the client.
    if old_broker_throttling_behavior:
        self.kafka.set_version(LATEST_1_1)
    # The brokers are started here (after the optional version switch), in both cases.
    self.kafka.start()

    self.quota_config = QuotaConfig(quota_type, override_quota, self.kafka)
    producer_client_id = self.quota_config.client_id
    consumer_client_id = self.quota_config.client_id

    # Old (pre-2.0) throttling behavior for client does not throttle upon receiving a response with a
    # non-zero throttle time.
    if old_client_throttling_behavior:
        client_version = LATEST_1_1
    else:
        client_version = DEV_BRANCH

    # Produce all messages
    producer = ProducerPerformanceService(
        self.test_context, producer_num, self.kafka,
        topic=self.topic, num_records=self.num_records, record_size=self.record_size, throughput=-1,
        client_id=producer_client_id, version=client_version)

    producer.run()

    # Consume all messages
    consumer = ConsoleConsumer(
        self.test_context, consumer_num, self.kafka, self.topic,
        consumer_timeout_ms=60000, client_id=consumer_client_id,
        jmx_object_names=['kafka.consumer:type=consumer-fetch-manager-metrics,client-id=%s' % consumer_client_id],
        jmx_attributes=['bytes-consumed-rate'], version=client_version)
    consumer.run()

    # items() rather than the Python 2-only iteritems(), so the check runs on Python 2 and Python 3 alike.
    for idx, messages in consumer.messages_consumed.items():
        assert len(messages) > 0, "consumer %d didn't consume any message before timeout" % idx

    success, msg = self.validate(self.kafka, producer, consumer)
    assert success, msg
def test_quota(self, quota_type, override_quota=True, producer_num=1, consumer_num=1):
    """
    Produce a batch of records, consume them back, and validate the run via self.validate().

    :param quota_type: forwarded to QuotaConfig
    :param override_quota: forwarded to QuotaConfig
    :param producer_num: second positional argument of ProducerPerformanceService
                         (presumably the number of producer nodes -- confirm against the service)
    :param consumer_num: second positional argument of ConsoleConsumer
                         (presumably the number of consumer nodes -- confirm against the service)
    """
    self.quota_config = QuotaConfig(quota_type, override_quota, self.kafka)
    # Producer and consumer deliberately share the client id taken from the quota config.
    producer_client_id = self.quota_config.client_id
    consumer_client_id = self.quota_config.client_id

    # Produce all messages
    producer = ProducerPerformanceService(
        self.test_context, producer_num, self.kafka,
        topic=self.topic, num_records=self.num_records, record_size=self.record_size, throughput=-1,
        client_id=producer_client_id)

    producer.run()

    # Consume all messages
    consumer = ConsoleConsumer(
        self.test_context, consumer_num, self.kafka, self.topic,
        consumer_timeout_ms=60000, client_id=consumer_client_id,
        jmx_object_names=['kafka.consumer:type=consumer-fetch-manager-metrics,client-id=%s' % consumer_client_id],
        jmx_attributes=['bytes-consumed-rate'])
    consumer.run()

    # items() rather than the Python 2-only iteritems(), so the check runs on Python 2 and Python 3 alike.
    for idx, messages in consumer.messages_consumed.items():
        assert len(messages) > 0, "consumer %d didn't consume any message before timeout" % idx

    success, msg = self.validate(self.kafka, producer, consumer)
    assert success, msg
def test_bounce(self, clean):
    """
    Validates that source and sink tasks that run continuously and produce a predictable sequence of messages
    run correctly and deliver messages exactly once when Kafka Connect workers undergo clean rolling bounces.
    For hard bounces (clean=False) duplicate sequence numbers are tolerated.

    :param clean: passed as clean_shutdown when stopping each worker; duplicates are only rejected when True
    """
    num_tasks = 3

    self.setup_services()
    self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
    self.cc.start()

    self.source = VerifiableSource(self.cc, tasks=num_tasks, throughput=100)
    self.source.start()
    self.sink = VerifiableSink(self.cc, tasks=num_tasks)
    self.sink.start()

    for _ in range(3):
        for node in self.cc.nodes:
            started = time.time()
            self.logger.info("%s bouncing Kafka Connect on %s", "Clean" if clean else "Hard", str(node.account))
            self.cc.stop_node(node, clean_shutdown=clean)
            with node.account.monitor_log(self.cc.LOG_FILE) as monitor:
                self.cc.start_node(node)
                monitor.wait_until("Starting connectors and tasks using config offset", timeout_sec=90,
                                   err_msg="Kafka Connect worker didn't successfully join group and start work")
            self.logger.info("Bounced Kafka Connect on %s and rejoined in %f seconds",
                             node.account, time.time() - started)

            # Give additional time for the consumer groups to recover. Even if it is not a hard bounce, there are
            # some cases where a restart can cause a rebalance to take the full length of the session timeout
            # (e.g. if the client shuts down before it has received the memberId from its initial JoinGroup).
            # If we don't give enough time for the group to stabilize, the next bounce may cause consumers to
            # be shut down before they have any time to process data and we can end up with zero data making it
            # through the test.
            time.sleep(15)

    self.source.stop()
    self.sink.stop()
    self.cc.stop()

    # Validate at least once delivery of everything that was reported as written since we should have flushed and
    # cleanly exited. Currently this only tests at least once delivery because the sink task may not have consumed
    # all the messages generated by the source task. This needs to be done per-task since seqnos are not unique
    # across tasks.
    errors = []
    allow_dups = not clean
    src_messages = self.source.messages()
    sink_messages = self.sink.messages()

    def check_seqnos(label, task, seqnos):
        """Record missing/duplicate seqno errors for one task and return the largest seqno seen (0 if none)."""
        # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once because
        # clean bouncing should commit on rebalance.
        # An empty list means the task reported nothing; fall back to 0 so this is reported below as
        # "Not enough messages were processed" instead of max() raising ValueError.
        seqno_max = max(seqnos) if seqnos else 0
        self.logger.debug("Max %s seqno: %d", label, seqno_max)
        missing = sorted(set(range(seqno_max)).difference(seqnos))
        duplicates = sorted(seqno for seqno, count in Counter(seqnos).items() if count > 1)

        if missing:
            self.logger.error("Missing %s sequence numbers for task %s", label, task)
            errors.append("Found missing %s sequence numbers for task %d: %s" % (label, task, missing))
        if not allow_dups and duplicates:
            self.logger.error("Duplicate %s sequence numbers for task %s", label, task)
            errors.append("Found duplicate %s sequence numbers for task %d: %s" % (label, task, duplicates))
        return seqno_max

    for task in range(num_tasks):
        # Validate source messages
        src_seqno_max = check_seqnos(
            "source", task,
            [msg['seqno'] for msg in src_messages if msg['task'] == task])

        # Validate sink messages (only those carrying a 'flushed' key are counted)
        sink_seqno_max = check_seqnos(
            "sink", task,
            [msg['seqno'] for msg in sink_messages if msg['task'] == task and 'flushed' in msg])

        # Validate source and sink match
        if sink_seqno_max > src_seqno_max:
            self.logger.error("Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d", task, sink_seqno_max, src_seqno_max)
            errors.append("Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d" % (task, sink_seqno_max, src_seqno_max))
        if src_seqno_max < 1000 or sink_seqno_max < 1000:
            errors.append("Not enough messages were processed: source:%d sink:%d" % (src_seqno_max, sink_seqno_max))

    # Every failure path appends to errors, so the list alone determines success.
    success = not errors
    if not success:
        self.mark_for_collect(self.cc)
        # Also collect the data in the topic to aid in debugging
        consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, self.source.topic,
                                             consumer_timeout_ms=1000, print_key=True)
        consumer_validator.run()
        self.mark_for_collect(consumer_validator, "consumer_stdout")

    assert success, "Found validation errors:\n" + "\n ".join(errors)
class ConnectStandaloneFileTest(Test):
    """
    Simple test of Kafka Connect that produces data from a file in one
    standalone process and consumes it on another, validating the output is
    identical to the input.
    """

    FILE_SOURCE_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSourceConnector'
    FILE_SINK_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSinkConnector'

    INPUT_FILE = "/mnt/connect.input"
    OUTPUT_FILE = "/mnt/connect.output"

    OFFSETS_FILE = "/mnt/connect.offsets"

    # The connector topic is a config-provider placeholder pointing into the external configs file;
    # TOPIC_TEST is the literal topic name used when reading the data back.
    TOPIC = "${file:%s:topic.external}" % ConnectServiceBase.EXTERNAL_CONFIGS_FILE
    TOPIC_TEST = "test"

    FIRST_INPUT_LIST = ["foo", "bar", "baz"]
    FIRST_INPUT = "\n".join(FIRST_INPUT_LIST) + "\n"
    SECOND_INPUT_LIST = ["razz", "ma", "tazz"]
    SECOND_INPUT = "\n".join(SECOND_INPUT_LIST) + "\n"

    SCHEMA = { "type": "string", "optional": False }

    def __init__(self, test_context):
        """Record the cluster sizing/topic layout and create (but do not start) the ZooKeeper service."""
        super(ConnectStandaloneFileTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.topics = {
            'test' : { 'partitions': 1, 'replication-factor': 1 }
        }

        self.zk = ZookeeperService(test_context, self.num_zk)

    @cluster(num_nodes=5)
    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=True)
    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=False)
    @parametrize(converter="org.apache.kafka.connect.storage.StringConverter", schemas=None)
    @parametrize(security_protocol=SecurityConfig.PLAINTEXT)
    @cluster(num_nodes=6)
    @parametrize(security_protocol=SecurityConfig.SASL_SSL)
    def test_file_source_and_sink(self, converter="org.apache.kafka.connect.json.JsonConverter", schemas=True, security_protocol='PLAINTEXT'):
        """
        Validates basic end-to-end functionality of Connect standalone using the file source and sink converters. Includes
        parameterizations to test different converters (which also test per-connector converter overrides), schema/schemaless
        modes, and security support.

        :param converter: fully qualified converter class name; must not be None
        :param schemas: stored on self.schemas; when truthy the expected records are schema/payload envelopes
        :param security_protocol: used for both client and inter-broker traffic of the KafkaService
        """
        assert converter is not None, "converter type must be set"
        # Template parameters. Note that we don't set key/value.converter. These default to JsonConverter and we validate
        # converter overrides via the connector configuration.
        if converter != "org.apache.kafka.connect.json.JsonConverter":
            self.override_key_converter = converter
            self.override_value_converter = converter
        self.schemas = schemas

        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk,
                                  security_protocol=security_protocol, interbroker_security_protocol=security_protocol,
                                  topics=self.topics)

        self.source = ConnectStandaloneService(self.test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(self.test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])
        self.consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, self.TOPIC_TEST,
                                                  consumer_timeout_ms=10000)

        self.zk.start()
        self.kafka.start()

        self.source.set_configs(lambda node: self.render("connect-standalone.properties", node=node), [self.render("connect-file-source.properties")])
        self.sink.set_configs(lambda node: self.render("connect-standalone.properties", node=node), [self.render("connect-file-sink.properties")])

        self.source.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))
        self.sink.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT), timeout_sec=60, err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT), timeout_sec=60, err_msg="Sink output file never converged to the same state as the input file")

        # Validate the format of the data in the Kafka topic
        self.consumer_validator.run()
        expected = json.dumps([line if not self.schemas else { "schema": self.SCHEMA, "payload": line } for line in self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST])
        # JSON-converted records are parsed before comparison; anything else is compared as plain strings.
        decoder = (json.loads if converter.endswith("JsonConverter") else str)
        actual = json.dumps([decoder(x) for x in self.consumer_validator.messages_consumed[1]])
        assert expected == actual, "Expected %s but saw %s in Kafka" % (expected, actual)

    def validate_output(self, value):
        """
        Return True when the md5 of the sink's output file equals the md5 of `value`.

        :param value: expected file content, as text or bytes
        :return: False (rather than raising) when the remote md5sum command fails
        """
        try:
            output_hash = list(self.sink.node.account.ssh_capture("md5sum " + self.OUTPUT_FILE))[0].strip().split()[0]
            # hashlib only accepts bytes on Python 3; on Python 2 a str already is bytes and is passed through.
            expected_bytes = value if isinstance(value, bytes) else value.encode('utf-8')
            return output_hash == hashlib.md5(expected_bytes).hexdigest()
        except RemoteCommandError:
            return False

    @cluster(num_nodes=5)
    @parametrize(error_tolerance=ErrorTolerance.ALL)
    @parametrize(error_tolerance=ErrorTolerance.NONE)
    def test_skip_and_log_to_dlq(self, error_tolerance):
        """
        Feed alternating well-formed and malformed JSON lines through a string-converting source and a
        JSON-converting sink, then check the sink output and the "my-connector-errors" topic.

        :param error_tolerance: ErrorTolerance.ALL expects all good records in the sink output and every bad
                                record in the dead letter queue; ErrorTolerance.NONE expects the output wait to
                                time out and only the first bad record in the dead letter queue
        """
        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk, topics=self.topics)

        # set config props
        self.override_error_tolerance_props = error_tolerance
        self.enable_deadletterqueue = True

        successful_records = []
        faulty_records = []
        records = []
        for i in range(0, 1000):
            if i % 2 == 0:
                records.append('{"some_key":' + str(i) + '}')
                successful_records.append('{some_key=' + str(i) + '}')
            else:
                # badly formatted json records (missing a quote after the key)
                records.append('{"some_key:' + str(i) + '}')
                faulty_records.append('{"some_key:' + str(i) + '}')

        records = "\n".join(records) + "\n"
        successful_records = "\n".join(successful_records) + "\n"
        if error_tolerance == ErrorTolerance.ALL:
            faulty_records = ",".join(faulty_records)
        else:
            faulty_records = faulty_records[0]

        self.source = ConnectStandaloneService(self.test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(self.test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])

        self.zk.start()
        self.kafka.start()

        # NOTE: statement order matters below. The connector properties are rendered immediately as
        # set_configs() arguments, so the override_* attributes must be set right before each call
        # (presumably the templates read them -- confirm against the .properties templates).
        self.override_key_converter = "org.apache.kafka.connect.storage.StringConverter"
        self.override_value_converter = "org.apache.kafka.connect.storage.StringConverter"
        self.source.set_configs(lambda node: self.render("connect-standalone.properties", node=node), [self.render("connect-file-source.properties")])

        self.override_key_converter = "org.apache.kafka.connect.json.JsonConverter"
        self.override_value_converter = "org.apache.kafka.connect.json.JsonConverter"
        self.override_key_converter_schemas_enable = False
        self.override_value_converter_schemas_enable = False
        self.sink.set_configs(lambda node: self.render("connect-standalone.properties", node=node), [self.render("connect-file-sink.properties")])

        self.source.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))
        self.sink.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(records) + " >> " + self.INPUT_FILE)

        if error_tolerance == ErrorTolerance.NONE:
            try:
                wait_until(lambda: self.validate_output(successful_records), timeout_sec=15,
                           err_msg="Clean records added to input file were not seen in the output file in a reasonable amount of time.")
                raise Exception("Expected to not find any results in this file.")
            except TimeoutError:
                self.logger.info("Caught expected exception")
        else:
            wait_until(lambda: self.validate_output(successful_records), timeout_sec=15,
                       err_msg="Clean records added to input file were not seen in the output file in a reasonable amount of time.")

        if self.enable_deadletterqueue:
            self.logger.info("Reading records from deadletterqueue")
            consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, "my-connector-errors",
                                                 consumer_timeout_ms=10000)
            consumer_validator.run()
            actual = ",".join(consumer_validator.messages_consumed[1])
            assert faulty_records == actual, "Expected %s but saw %s in dead letter queue" % (faulty_records, actual)
def test_skip_and_log_to_dlq(self, error_tolerance):
    """
    Feed alternating well-formed and malformed JSON lines through a string-converting source and a
    JSON-converting sink, then check the sink output and the "my-connector-errors" topic.

    :param error_tolerance: ErrorTolerance.ALL expects all good records in the sink output and every bad
                            record in the dead letter queue; ErrorTolerance.NONE expects the output wait to
                            time out and only the first bad record in the dead letter queue
    """
    self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk, topics=self.topics)

    # set config props
    self.override_error_tolerance_props = error_tolerance
    self.enable_deadletterqueue = True

    input_lines = []
    good_output_lines = []
    bad_lines = []
    for i in range(0, 1000):
        number = str(i)
        if i % 2 == 0:
            input_lines.append('{"some_key":' + number + '}')
            good_output_lines.append('{some_key=' + number + '}')
        else:
            # badly formatted json records (missing a quote after the key)
            malformed = '{"some_key:' + number + '}'
            input_lines.append(malformed)
            bad_lines.append(malformed)

    records = "\n".join(input_lines) + "\n"
    successful_records = "\n".join(good_output_lines) + "\n"
    # With full tolerance every bad record is expected in the DLQ, otherwise only the first one.
    faulty_records = ",".join(bad_lines) if error_tolerance == ErrorTolerance.ALL else bad_lines[0]

    self.source = ConnectStandaloneService(self.test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
    self.sink = ConnectStandaloneService(self.test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])

    self.zk.start()
    self.kafka.start()

    def render_worker_props(node):
        return self.render("connect-standalone.properties", node=node)

    def render_external_props(node):
        return self.render("connect-file-external.properties", node=node)

    # Source side: string converters. The connector properties are rendered right here, while these
    # override values are in effect.
    self.override_key_converter = "org.apache.kafka.connect.storage.StringConverter"
    self.override_value_converter = "org.apache.kafka.connect.storage.StringConverter"
    source_connector_props = [self.render("connect-file-source.properties")]
    self.source.set_configs(render_worker_props, source_connector_props)

    # Sink side: schemaless JSON converters, again rendered immediately.
    self.override_key_converter = "org.apache.kafka.connect.json.JsonConverter"
    self.override_value_converter = "org.apache.kafka.connect.json.JsonConverter"
    self.override_key_converter_schemas_enable = False
    self.override_value_converter_schemas_enable = False
    sink_connector_props = [self.render("connect-file-sink.properties")]
    self.sink.set_configs(render_worker_props, sink_connector_props)

    self.source.set_external_configs(render_external_props)
    self.sink.set_external_configs(render_external_props)

    self.source.start()
    self.sink.start()

    # Generating data on the source node should generate new records and create new output on the sink node
    self.source.node.account.ssh("echo -e -n " + repr(records) + " >> " + self.INPUT_FILE)

    def output_matches():
        return self.validate_output(successful_records)

    timeout_message = "Clean records added to input file were not seen in the output file in a reasonable amount of time."
    if error_tolerance == ErrorTolerance.NONE:
        # Without tolerance the expected output must never appear, so the wait has to time out.
        try:
            wait_until(output_matches, timeout_sec=15, err_msg=timeout_message)
        except TimeoutError:
            self.logger.info("Caught expected exception")
        else:
            raise Exception("Expected to not find any results in this file.")
    else:
        wait_until(output_matches, timeout_sec=15, err_msg=timeout_message)

    if self.enable_deadletterqueue:
        self.logger.info("Reading records from deadletterqueue")
        dlq_consumer = ConsoleConsumer(self.test_context, 1, self.kafka, "my-connector-errors",
                                       consumer_timeout_ms=10000)
        dlq_consumer.run()
        actual = ",".join(dlq_consumer.messages_consumed[1])
        assert faulty_records == actual, "Expected %s but saw %s in dead letter queue" % (faulty_records, actual)
def test_bounce(self, clean):
    """
    Validates that source and sink tasks that run continuously and produce a predictable sequence of messages
    run correctly and deliver messages exactly once when Kafka Connect workers undergo clean rolling bounces.
    For hard bounces (clean=False) duplicate sequence numbers are tolerated.

    :param clean: passed as clean_shutdown when stopping each worker; duplicates are only rejected when True
    """
    num_tasks = 3

    self.cc.set_configs(lambda node: self.render("connect-distributed.properties", node=node))
    self.cc.start()

    self.source = VerifiableSource(self.cc, tasks=num_tasks)
    self.source.start()
    self.sink = VerifiableSink(self.cc, tasks=num_tasks)
    self.sink.start()

    for _ in range(3):
        for node in self.cc.nodes:
            started = time.time()
            self.logger.info("%s bouncing Kafka Connect on %s", "Clean" if clean else "Hard", str(node.account))
            self.cc.stop_node(node, clean_shutdown=clean)
            with node.account.monitor_log(self.cc.LOG_FILE) as monitor:
                self.cc.start_node(node)
                monitor.wait_until("Starting connectors and tasks using config offset", timeout_sec=90,
                                   err_msg="Kafka Connect worker didn't successfully join group and start work")
            self.logger.info("Bounced Kafka Connect on %s and rejoined in %f seconds",
                             node.account, time.time() - started)

            # If this is a hard bounce, give additional time for the consumer groups to recover. If we don't give
            # some time here, the next bounce may cause consumers to be shut down before they have any time to process
            # data and we can end up with zero data making it through the test.
            if not clean:
                time.sleep(15)

    self.source.stop()
    self.sink.stop()
    self.cc.stop()

    # Validate at least once delivery of everything that was reported as written since we should have flushed and
    # cleanly exited. Currently this only tests at least once delivery because the sink task may not have consumed
    # all the messages generated by the source task. This needs to be done per-task since seqnos are not unique
    # across tasks.
    errors = []
    allow_dups = not clean
    src_messages = self.source.messages()
    sink_messages = self.sink.messages()

    def check_seqnos(label, task, seqnos):
        """Record missing/duplicate seqno errors for one task and return the largest seqno seen (0 if none)."""
        # Every seqno up to the largest one we ever saw should appear. Each seqno should only appear once because
        # clean bouncing should commit on rebalance.
        # An empty list means the task reported nothing; fall back to 0 so this is reported below as
        # "Not enough messages were processed" instead of max() raising ValueError.
        seqno_max = max(seqnos) if seqnos else 0
        self.logger.debug("Max %s seqno: %d", label, seqno_max)
        missing = sorted(set(range(seqno_max)).difference(seqnos))
        duplicates = sorted(seqno for seqno, count in Counter(seqnos).items() if count > 1)

        if missing:
            self.logger.error("Missing %s sequence numbers for task %s", label, task)
            errors.append("Found missing %s sequence numbers for task %d: %s" % (label, task, missing))
        if not allow_dups and duplicates:
            self.logger.error("Duplicate %s sequence numbers for task %s", label, task)
            errors.append("Found duplicate %s sequence numbers for task %d: %s" % (label, task, duplicates))
        return seqno_max

    for task in range(num_tasks):
        # Validate source messages
        src_seqno_max = check_seqnos(
            "source", task,
            [msg['seqno'] for msg in src_messages if msg['task'] == task])

        # Validate sink messages (only those carrying a 'flushed' key are counted)
        sink_seqno_max = check_seqnos(
            "sink", task,
            [msg['seqno'] for msg in sink_messages if msg['task'] == task and 'flushed' in msg])

        # Validate source and sink match
        if sink_seqno_max > src_seqno_max:
            self.logger.error("Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d", task, sink_seqno_max, src_seqno_max)
            errors.append("Found sink sequence number greater than any generated sink sequence number for task %d: %d > %d" % (task, sink_seqno_max, src_seqno_max))
        if src_seqno_max < 1000 or sink_seqno_max < 1000:
            errors.append("Not enough messages were processed: source:%d sink:%d" % (src_seqno_max, sink_seqno_max))

    # Every failure path appends to errors, so the list alone determines success.
    success = not errors
    if not success:
        self.mark_for_collect(self.cc)
        # Also collect the data in the topic to aid in debugging
        consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, self.source.topic,
                                             consumer_timeout_ms=1000, print_key=True)
        consumer_validator.run()
        self.mark_for_collect(consumer_validator, "consumer_stdout")

    assert success, "Found validation errors:\n" + "\n ".join(errors)
class ConnectStandaloneFileTest(Test):
    """
    Simple test of Kafka Connect that produces data from a file in one
    standalone process and consumes it on another, validating the output is
    identical to the input.
    """

    FILE_SOURCE_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSourceConnector'
    FILE_SINK_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSinkConnector'

    INPUT_FILE = "/mnt/connect.input"
    OUTPUT_FILE = "/mnt/connect.output"

    OFFSETS_FILE = "/mnt/connect.offsets"

    # Placeholder string built around EXTERNAL_CONFIGS_FILE.
    # NOTE(review): presumably resolved by Connect from the external configs file - confirm.
    TOPIC = "${file:" + EXTERNAL_CONFIGS_FILE + ":topic.external}"
    # Literal topic name used by the validating console consumer.
    TOPIC_TEST = "test"

    FIRST_INPUT_LIST = ["foo", "bar", "baz"]
    FIRST_INPUT = "\n".join(FIRST_INPUT_LIST) + "\n"
    SECOND_INPUT_LIST = ["razz", "ma", "tazz"]
    SECOND_INPUT = "\n".join(SECOND_INPUT_LIST) + "\n"

    SCHEMA = {"type": "string", "optional": False}

    def __init__(self, test_context):
        """Record cluster sizing and topic layout and create the ZooKeeper service."""
        super(ConnectStandaloneFileTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.topics = {
            'test': {'partitions': 1, 'replication-factor': 1}
        }

        self.zk = ZookeeperService(test_context, self.num_zk)

    @cluster(num_nodes=5)
    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=True)
    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=False)
    @parametrize(converter="org.apache.kafka.connect.storage.StringConverter", schemas=None)
    @parametrize(security_protocol=SecurityConfig.PLAINTEXT)
    @cluster(num_nodes=6)
    @parametrize(security_protocol=SecurityConfig.SASL_SSL)
    def test_file_source_and_sink(self, converter="org.apache.kafka.connect.json.JsonConverter", schemas=True, security_protocol='PLAINTEXT'):
        """
        Validates basic end-to-end functionality of Connect standalone using the file source and sink converters. Includes
        parameterizations to test different converters (which also test per-connector converter overrides), schema/schemaless
        modes, and security support.
        """
        assert converter is not None, "converter type must be set"
        # Template parameters. Note that we don't set key/value.converter. These default to JsonConverter and we validate
        # converter overrides via the connector configuration.
        if converter != "org.apache.kafka.connect.json.JsonConverter":
            self.override_key_converter = converter
            self.override_value_converter = converter
        self.schemas = schemas

        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk,
                                  security_protocol=security_protocol, interbroker_security_protocol=security_protocol,
                                  topics=self.topics)

        self.source = ConnectStandaloneService(self.test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(self.test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])
        self.consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, self.TOPIC_TEST,
                                                  consumer_timeout_ms=10000)

        self.zk.start()
        self.kafka.start()

        self.source.set_configs(lambda node: self.render("connect-standalone.properties", node=node),
                                [self.render("connect-file-source.properties")])
        self.sink.set_configs(lambda node: self.render("connect-standalone.properties", node=node),
                              [self.render("connect-file-sink.properties")])

        self.source.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))
        self.sink.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT), timeout_sec=60,
                   err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT), timeout_sec=60,
                   err_msg="Sink output file never converged to the same state as the input file")

        # Validate the format of the data in the Kafka topic
        self.consumer_validator.run()
        expected = json.dumps([line if not self.schemas else {"schema": self.SCHEMA, "payload": line}
                               for line in self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST])
        decoder = (json.loads if converter.endswith("JsonConverter") else str)
        actual = json.dumps([decoder(x) for x in self.consumer_validator.messages_consumed[1]])
        assert expected == actual, "Expected %s but saw %s in Kafka" % (expected, actual)

    def validate_output(self, value):
        """
        Return True when the md5 of the sink's OUTPUT_FILE equals the md5 of `value`.

        Returns False when the remote md5sum command raises RemoteCommandError.
        """
        # hashlib only accepts bytes on Python 3; on Python 2 a plain str already is bytes
        # and is passed through untouched.
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        expected_hash = hashlib.md5(value).hexdigest()
        try:
            output_hash = list(self.sink.node.account.ssh_capture("md5sum " + self.OUTPUT_FILE))[0].strip().split()[0]
        except RemoteCommandError:
            return False
        return output_hash == expected_hash

    @cluster(num_nodes=5)
    @parametrize(error_tolerance=ErrorTolerance.ALL)
    @parametrize(error_tolerance=ErrorTolerance.NONE)
    def test_skip_and_log_to_dlq(self, error_tolerance):
        """
        Feed alternating well-formed and malformed JSON lines through the file source and
        sink, then compare the sink output and the "my-connector-errors" topic against the
        expectation for the given error tolerance.
        """
        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk, topics=self.topics)

        # set config props
        self.override_error_tolerance_props = error_tolerance
        self.enable_deadletterqueue = True

        successful_records = []
        faulty_records = []
        records = []
        for i in range(0, 1000):
            if i % 2 == 0:
                records.append('{"some_key":' + str(i) + '}')
                successful_records.append('{some_key=' + str(i) + '}')
            else:
                # badly formatted json records (missing a quote after the key)
                records.append('{"some_key:' + str(i) + '}')
                faulty_records.append('{"some_key:' + str(i) + '}')

        records = "\n".join(records) + "\n"
        successful_records = "\n".join(successful_records) + "\n"
        if error_tolerance == ErrorTolerance.ALL:
            faulty_records = ",".join(faulty_records)
        else:
            # With no tolerance only the first faulty record is expected in the error topic.
            faulty_records = faulty_records[0]

        self.source = ConnectStandaloneService(self.test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(self.test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])

        self.zk.start()
        self.kafka.start()

        # The connector property files are rendered immediately, so the overrides must be
        # set to the source's values before its render and to the sink's values before its render.
        self.override_key_converter = "org.apache.kafka.connect.storage.StringConverter"
        self.override_value_converter = "org.apache.kafka.connect.storage.StringConverter"
        self.source.set_configs(lambda node: self.render("connect-standalone.properties", node=node),
                                [self.render("connect-file-source.properties")])

        self.override_key_converter = "org.apache.kafka.connect.json.JsonConverter"
        self.override_value_converter = "org.apache.kafka.connect.json.JsonConverter"
        self.override_key_converter_schemas_enable = False
        self.override_value_converter_schemas_enable = False
        self.sink.set_configs(lambda node: self.render("connect-standalone.properties", node=node),
                              [self.render("connect-file-sink.properties")])

        self.source.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))
        self.sink.set_external_configs(lambda node: self.render("connect-file-external.properties", node=node))

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(records) + " >> " + self.INPUT_FILE)

        if error_tolerance == ErrorTolerance.NONE:
            try:
                wait_until(lambda: self.validate_output(successful_records), timeout_sec=15,
                           err_msg="Clean records added to input file were not seen in the output file in a reasonable amount of time.")
            except TimeoutError:
                self.logger.info("Caught expected exception")
            else:
                # Reaching this point means the full clean output appeared, which must not happen here.
                raise Exception("Expected to not find any results in this file.")
        else:
            wait_until(lambda: self.validate_output(successful_records), timeout_sec=15,
                       err_msg="Clean records added to input file were not seen in the output file in a reasonable amount of time.")

        if self.enable_deadletterqueue:
            self.logger.info("Reading records from deadletterqueue")
            consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, "my-connector-errors",
                                                 consumer_timeout_ms=10000)
            consumer_validator.run()
            actual = ",".join(consumer_validator.messages_consumed[1])
            assert faulty_records == actual, "Expected %s but saw %s in dead letter queue" % (faulty_records, actual)
def test_skip_and_log_to_dlq(self, error_tolerance):
    """
    Push 1000 lines (even indices well-formed JSON, odd indices malformed) through the
    file source and sink, then check the sink output and the "my-connector-errors"
    topic against what the given error tolerance should produce.
    """
    self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk, topics=self.topics)

    # set config props
    self.override_error_tolerance_props = error_tolerance
    self.enable_deadletterqueue = True

    input_lines = []
    clean_lines = []
    broken_lines = []
    for index in range(0, 1000):
        number = str(index)
        if index % 2:
            # badly formatted json records (missing a quote after the key)
            broken = '{"some_key:' + number + '}'
            input_lines.append(broken)
            broken_lines.append(broken)
        else:
            input_lines.append('{"some_key":' + number + '}')
            clean_lines.append('{some_key=' + number + '}')

    input_text = "\n".join(input_lines) + "\n"
    expected_output = "\n".join(clean_lines) + "\n"
    # Everything faulty is expected with full tolerance, otherwise only the first faulty record.
    expected_dlq = ",".join(broken_lines) if error_tolerance == ErrorTolerance.ALL else broken_lines[0]

    self.source = ConnectStandaloneService(self.test_context, self.kafka,
                                           [self.INPUT_FILE, self.OFFSETS_FILE])
    self.sink = ConnectStandaloneService(self.test_context, self.kafka,
                                         [self.OUTPUT_FILE, self.OFFSETS_FILE])

    self.zk.start()
    self.kafka.start()

    def render_worker(node):
        return self.render("connect-standalone.properties", node=node)

    def render_external(node):
        return self.render("connect-file-external.properties", node=node)

    # Source connector properties are rendered while the String converter overrides are set.
    string_converter = "org.apache.kafka.connect.storage.StringConverter"
    self.override_key_converter = string_converter
    self.override_value_converter = string_converter
    source_connector_props = self.render("connect-file-source.properties")
    self.source.set_configs(render_worker, [source_connector_props])

    # Sink connector properties are rendered with schemaless JSON converter overrides.
    json_converter = "org.apache.kafka.connect.json.JsonConverter"
    self.override_key_converter = json_converter
    self.override_value_converter = json_converter
    self.override_key_converter_schemas_enable = False
    self.override_value_converter_schemas_enable = False
    sink_connector_props = self.render("connect-file-sink.properties")
    self.sink.set_configs(render_worker, [sink_connector_props])

    self.source.set_external_configs(render_external)
    self.sink.set_external_configs(render_external)

    self.source.start()
    self.sink.start()

    # Generating data on the source node should generate new records and create new output on the sink node
    self.source.node.account.ssh("echo -e -n " + repr(input_text) + " >> " + self.INPUT_FILE)

    def output_matches():
        return self.validate_output(expected_output)

    not_seen_msg = "Clean records added to input file were not seen in the output file in a reasonable amount of time."
    if error_tolerance != ErrorTolerance.NONE:
        wait_until(output_matches, timeout_sec=15, err_msg=not_seen_msg)
    else:
        try:
            wait_until(output_matches, timeout_sec=15, err_msg=not_seen_msg)
        except TimeoutError:
            self.logger.info("Caught expected exception")
        else:
            raise Exception("Expected to not find any results in this file.")

    if self.enable_deadletterqueue:
        self.logger.info("Reading records from deadletterqueue")
        dlq_consumer = ConsoleConsumer(self.test_context, 1, self.kafka, "my-connector-errors",
                                       consumer_timeout_ms=10000)
        dlq_consumer.run()
        actual = ",".join(dlq_consumer.messages_consumed[1])
        assert expected_dlq == actual, "Expected %s but saw %s in dead letter queue" % (expected_dlq, actual)
class ConnectStandaloneFileTest(Test):
    """
    Simple test of Kafka Connect that produces data from a file in one
    standalone process and consumes it on another, validating the output is
    identical to the input.
    """

    FILE_SOURCE_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSourceConnector'
    FILE_SINK_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSinkConnector'

    INPUT_FILE = "/mnt/connect.input"
    OUTPUT_FILE = "/mnt/connect.output"

    OFFSETS_FILE = "/mnt/connect.offsets"

    TOPIC = "test"

    FIRST_INPUT_LIST = ["foo", "bar", "baz"]
    FIRST_INPUT = "\n".join(FIRST_INPUT_LIST) + "\n"
    SECOND_INPUT_LIST = ["razz", "ma", "tazz"]
    SECOND_INPUT = "\n".join(SECOND_INPUT_LIST) + "\n"

    SCHEMA = {"type": "string", "optional": False}

    def __init__(self, test_context):
        """Record cluster sizing and topic layout and create the ZooKeeper service."""
        super(ConnectStandaloneFileTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.topics = {
            'test': {'partitions': 1, 'replication-factor': 1}
        }

        self.zk = ZookeeperService(test_context, self.num_zk)

    @cluster(num_nodes=5)
    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=True)
    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=False)
    @parametrize(converter="org.apache.kafka.connect.storage.StringConverter", schemas=None)
    @parametrize(security_protocol=SecurityConfig.PLAINTEXT)
    @cluster(num_nodes=6)
    @parametrize(security_protocol=SecurityConfig.SASL_SSL)
    def test_file_source_and_sink(self, converter="org.apache.kafka.connect.json.JsonConverter", schemas=True, security_protocol='PLAINTEXT'):
        """
        Validates basic end-to-end functionality of Connect standalone using the file source and sink converters. Includes
        parameterizations to test different converters (which also test per-connector converter overrides), schema/schemaless
        modes, and security support.
        """
        assert converter is not None, "converter type must be set"
        # Template parameters. Note that we don't set key/value.converter. These default to JsonConverter and we validate
        # converter overrides via the connector configuration.
        if converter != "org.apache.kafka.connect.json.JsonConverter":
            self.override_key_converter = converter
            self.override_value_converter = converter
        self.schemas = schemas

        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk,
                                  security_protocol=security_protocol, interbroker_security_protocol=security_protocol,
                                  topics=self.topics)

        self.source = ConnectStandaloneService(self.test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(self.test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])
        self.consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, self.TOPIC,
                                                  consumer_timeout_ms=1000)

        self.zk.start()
        self.kafka.start()

        self.source.set_configs(lambda node: self.render("connect-standalone.properties", node=node),
                                [self.render("connect-file-source.properties")])
        self.sink.set_configs(lambda node: self.render("connect-standalone.properties", node=node),
                              [self.render("connect-file-sink.properties")])

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT), timeout_sec=60,
                   err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT), timeout_sec=60,
                   err_msg="Sink output file never converged to the same state as the input file")

        # Validate the format of the data in the Kafka topic
        self.consumer_validator.run()
        expected = json.dumps([line if not self.schemas else {"schema": self.SCHEMA, "payload": line}
                               for line in self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST])
        decoder = (json.loads if converter.endswith("JsonConverter") else str)
        actual = json.dumps([decoder(x) for x in self.consumer_validator.messages_consumed[1]])
        assert expected == actual, "Expected %s but saw %s in Kafka" % (expected, actual)

    def validate_output(self, value):
        """
        Return True when the md5 of the sink's OUTPUT_FILE equals the md5 of `value`.

        Returns False when the remote md5sum command raises RemoteCommandError.
        """
        # hashlib only accepts bytes on Python 3; on Python 2 a plain str already is bytes
        # and is passed through untouched.
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        expected_hash = hashlib.md5(value).hexdigest()
        try:
            output_hash = list(self.sink.node.account.ssh_capture("md5sum " + self.OUTPUT_FILE))[0].strip().split()[0]
        except RemoteCommandError:
            return False
        return output_hash == expected_hash
class ClientCompatibilityTest(Test):
    """Checks how TRUNK clients behave against a cluster of LATEST_0_8_2 brokers."""

    def __init__(self, test_context):
        super(ClientCompatibilityTest, self).__init__(test_context=test_context)

    def setUp(self):
        """Start ZooKeeper and a three-broker LATEST_0_8_2 cluster with one replicated topic."""
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context, num_nodes=3, zk=self.zk, version=LATEST_0_8_2, topics={self.topic: {
                                                                    "partitions": 3,
                                                                    "replication-factor": 3,
                                                                    'configs': {"min.insync.replicas": 2}}})
        self.zk.start()
        self.kafka.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1

    def test_producer_back_compatibility(self):
        """Run 0.9.X java producer against 0.8.X brokers.
        This test documents the fact that java producer v0.9.0.0 and later won't run against 0.8.X brokers:
        the broker responds to a V1 produce request with a V0 produce response;
        the client then tries to parse this V0 produce response as a V1 produce response,
        resulting in a BufferUnderflowException
        """
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic, max_messages=100,
            throughput=self.producer_throughput, version=TRUNK)

        node = self.producer.nodes[0]
        try:
            self.producer.start()
            self.producer.wait()
        except Exception:
            # Expected
            pass
        else:
            # Raised from the else clause so the handler above cannot swallow it.
            raise Exception("0.9.X java producer should not run successfully against 0.8.X broker")
        finally:
            self.producer.kill_node(node, clean_shutdown=False)

        self.logger.info("Grepping producer log for expected error type")
        # Raw string: same pattern bytes as before, without invalid "\." escape sequences.
        node.account.ssh("egrep -m 1 %s %s" % (r'"org\.apache\.kafka\.common\.protocol\.types\.SchemaException.*throttle_time_ms.*: java\.nio\.BufferUnderflowException"', self.producer.LOG_FILE), allow_fail=False)

    def test_consumer_back_compatibility(self):
        """Run a TRUNK consumer and a LATEST_0_8_2 consumer against the LATEST_0_8_2 cluster.

        The LATEST_0_8_2 consumer is expected to consume every produced message, while the TRUNK consumer is
        expected to consume none and to log a buffer underflow. This error is the same as when an 0.9.X producer
        is run against an 0.8.X broker: the broker responds to a V1 fetch request with a V0 fetch response;
        the client then tries to parse this V0 fetch response as a V1 fetch response, resulting in a
        BufferUnderflowException
        """
        num_messages = 10
        self.producer = VerifiableProducer(
            self.test_context, self.num_producers, self.kafka, self.topic, max_messages=num_messages,
            throughput=self.producer_throughput, version=LATEST_0_8_2)

        self.consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic, group_id="consumer-09X",
            consumer_timeout_ms=10000, message_validator=is_int, version=TRUNK)

        self.old_consumer = ConsoleConsumer(
            self.test_context, self.num_consumers, self.kafka, self.topic, group_id="consumer-08X",
            consumer_timeout_ms=10000, message_validator=is_int, version=LATEST_0_8_2)

        self.producer.run()
        self.consumer.run()
        self.old_consumer.run()

        consumed = len(self.consumer.messages_consumed[1])
        old_consumed = len(self.old_consumer.messages_consumed[1])
        assert old_consumed == num_messages, "Expected 0.8.X scala consumer to consume %d, but only got %d" % (num_messages, old_consumed)
        assert consumed == 0, "Expected 0.9.X scala consumer to fail to consume any messages, but got %d" % consumed

        self.logger.info("Grepping consumer log for expected error type")
        node = self.consumer.nodes[0]
        node.account.ssh("egrep -m 1 %s %s" % (r'"java\.nio\.BufferUnderflowException"', self.consumer.LOG_FILE), allow_fail=False)
class ClientCompatibilityTest(Test):
    """Checks how TRUNK clients behave against a cluster of LATEST_0_8_2 brokers."""

    def __init__(self, test_context):
        super(ClientCompatibilityTest, self).__init__(test_context=test_context)

    def setUp(self):
        """Start ZooKeeper and a three-broker LATEST_0_8_2 cluster with one replicated topic."""
        self.topic = "test_topic"
        self.zk = ZookeeperService(self.test_context, num_nodes=1)
        self.kafka = KafkaService(self.test_context,
                                  num_nodes=3,
                                  zk=self.zk,
                                  version=LATEST_0_8_2,
                                  topics={
                                      self.topic: {
                                          "partitions": 3,
                                          "replication-factor": 3,
                                          'configs': {
                                              "min.insync.replicas": 2
                                          }
                                      }
                                  })
        self.zk.start()
        self.kafka.start()

        # Producer and consumer
        self.producer_throughput = 10000
        self.num_producers = 1
        self.num_consumers = 1

    def test_producer_back_compatibility(self):
        """Run 0.9.X java producer against 0.8.X brokers.
        This test documents the fact that java producer v0.9.0.0 and later won't run against 0.8.X brokers:
        the broker responds to a V1 produce request with a V0 produce response;
        the client then tries to parse this V0 produce response as a V1 produce response,
        resulting in a BufferUnderflowException
        """
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           max_messages=100,
                                           throughput=self.producer_throughput,
                                           version=TRUNK)

        node = self.producer.nodes[0]
        try:
            self.producer.start()
            self.producer.wait()
        except Exception:
            # Expected
            pass
        else:
            # Raised from the else clause so the handler above cannot swallow it.
            raise Exception(
                "0.9.X java producer should not run successfully against 0.8.X broker"
            )
        finally:
            self.producer.kill_node(node, clean_shutdown=False)

        self.logger.info("Grepping producer log for expected error type")
        # Raw string: same pattern bytes as before, without invalid "\." escape sequences.
        node.account.ssh("egrep -m 1 %s %s" % (
            r'"org\.apache\.kafka\.common\.protocol\.types\.SchemaException.*throttle_time_ms.*: java\.nio\.BufferUnderflowException"',
            self.producer.LOG_FILE),
                         allow_fail=False)

    def test_consumer_back_compatibility(self):
        """Run a TRUNK consumer and a LATEST_0_8_2 consumer against the LATEST_0_8_2 cluster.

        The LATEST_0_8_2 consumer is expected to consume every produced message, while the TRUNK consumer is
        expected to consume none and to log a buffer underflow. This error is the same as when an 0.9.X producer
        is run against an 0.8.X broker: the broker responds to a V1 fetch request with a V0 fetch response;
        the client then tries to parse this V0 fetch response as a V1 fetch response, resulting in a
        BufferUnderflowException
        """
        num_messages = 10
        self.producer = VerifiableProducer(self.test_context,
                                           self.num_producers,
                                           self.kafka,
                                           self.topic,
                                           max_messages=num_messages,
                                           throughput=self.producer_throughput,
                                           version=LATEST_0_8_2)

        self.consumer = ConsoleConsumer(self.test_context,
                                        self.num_consumers,
                                        self.kafka,
                                        self.topic,
                                        group_id="consumer-09X",
                                        consumer_timeout_ms=10000,
                                        message_validator=is_int,
                                        version=TRUNK)

        self.old_consumer = ConsoleConsumer(self.test_context,
                                            self.num_consumers,
                                            self.kafka,
                                            self.topic,
                                            group_id="consumer-08X",
                                            consumer_timeout_ms=10000,
                                            message_validator=is_int,
                                            version=LATEST_0_8_2)

        self.producer.run()
        self.consumer.run()
        self.old_consumer.run()

        consumed = len(self.consumer.messages_consumed[1])
        old_consumed = len(self.old_consumer.messages_consumed[1])
        assert old_consumed == num_messages, "Expected 0.8.X scala consumer to consume %d, but only got %d" % (
            num_messages, old_consumed)
        assert consumed == 0, "Expected 0.9.X scala consumer to fail to consume any messages, but got %d" % consumed

        self.logger.info("Grepping consumer log for expected error type")
        node = self.consumer.nodes[0]
        node.account.ssh("egrep -m 1 %s %s" %
                         (r'"java\.nio\.BufferUnderflowException"',
                          self.consumer.LOG_FILE),
                         allow_fail=False)
class ConnectStandaloneFileTest(Test):
    """
    Simple test of Kafka Connect that produces data from a file in one
    standalone process and consumes it on another, validating the output is
    identical to the input.
    """

    FILE_SOURCE_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSourceConnector'
    FILE_SINK_CONNECTOR = 'org.apache.kafka.connect.file.FileStreamSinkConnector'

    INPUT_FILE = "/mnt/connect.input"
    OUTPUT_FILE = "/mnt/connect.output"

    OFFSETS_FILE = "/mnt/connect.offsets"

    TOPIC = "test"

    FIRST_INPUT_LIST = ["foo", "bar", "baz"]
    FIRST_INPUT = "\n".join(FIRST_INPUT_LIST) + "\n"
    SECOND_INPUT_LIST = ["razz", "ma", "tazz"]
    SECOND_INPUT = "\n".join(SECOND_INPUT_LIST) + "\n"

    SCHEMA = { "type": "string", "optional": False }

    def __init__(self, test_context):
        """Record cluster sizing and topic layout and create the ZooKeeper service."""
        super(ConnectStandaloneFileTest, self).__init__(test_context)
        self.num_zk = 1
        self.num_brokers = 1
        self.topics = {
            'test' : { 'partitions': 1, 'replication-factor': 1 }
        }

        self.zk = ZookeeperService(test_context, self.num_zk)

    @cluster(num_nodes=5)
    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=True)
    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=False)
    @parametrize(converter="org.apache.kafka.connect.storage.StringConverter", schemas=None)
    @parametrize(security_protocol=SecurityConfig.PLAINTEXT)
    @cluster(num_nodes=6)
    @parametrize(security_protocol=SecurityConfig.SASL_SSL)
    def test_file_source_and_sink(self, converter="org.apache.kafka.connect.json.JsonConverter", schemas=True, security_protocol='PLAINTEXT'):
        """
        Validates basic end-to-end functionality of Connect standalone using the file source and sink converters. Includes
        parameterizations to test different converters (which also test per-connector converter overrides), schema/schemaless
        modes, and security support.
        """
        assert converter is not None, "converter type must be set"
        # Template parameters. Note that we don't set key/value.converter. These default to JsonConverter and we validate
        # converter overrides via the connector configuration.
        if converter != "org.apache.kafka.connect.json.JsonConverter":
            self.override_key_converter = converter
            self.override_value_converter = converter
        self.schemas = schemas

        self.kafka = KafkaService(self.test_context, self.num_brokers, self.zk,
                                  security_protocol=security_protocol, interbroker_security_protocol=security_protocol,
                                  topics=self.topics)

        self.source = ConnectStandaloneService(self.test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(self.test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])
        self.consumer_validator = ConsoleConsumer(self.test_context, 1, self.kafka, self.TOPIC,
                                                  consumer_timeout_ms=10000)

        self.zk.start()
        self.kafka.start()

        self.source.set_configs(lambda node: self.render("connect-standalone.properties", node=node), [self.render("connect-file-source.properties")])
        self.sink.set_configs(lambda node: self.render("connect-standalone.properties", node=node), [self.render("connect-file-sink.properties")])

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT), timeout_sec=60, err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT), timeout_sec=60, err_msg="Sink output file never converged to the same state as the input file")

        # Validate the format of the data in the Kafka topic
        self.consumer_validator.run()
        expected = json.dumps([line if not self.schemas else { "schema": self.SCHEMA, "payload": line } for line in self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST])
        decoder = (json.loads if converter.endswith("JsonConverter") else str)
        actual = json.dumps([decoder(x) for x in self.consumer_validator.messages_consumed[1]])
        assert expected == actual, "Expected %s but saw %s in Kafka" % (expected, actual)

    def validate_output(self, value):
        """
        Return True when the md5 of the sink's OUTPUT_FILE equals the md5 of `value`.

        Returns False when the remote md5sum command raises RemoteCommandError.
        """
        # hashlib only accepts bytes on Python 3; on Python 2 a plain str already is bytes
        # and is passed through untouched.
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        expected_hash = hashlib.md5(value).hexdigest()
        try:
            output_hash = list(self.sink.node.account.ssh_capture("md5sum " + self.OUTPUT_FILE))[0].strip().split()[0]
        except RemoteCommandError:
            return False
        return output_hash == expected_hash
class CopycatStandaloneFileTest(KafkaTest):
    """
    Simple test of Copycat that produces data from a file in one Copycat
    standalone process and consumes it on another, validating the output is
    identical to the input.
    """

    INPUT_FILE = "/mnt/copycat.input"
    OUTPUT_FILE = "/mnt/copycat.output"

    OFFSETS_FILE = "/mnt/copycat.offsets"

    TOPIC = "test"

    FIRST_INPUT_LIST = ["foo", "bar", "baz"]
    FIRST_INPUT = "\n".join(FIRST_INPUT_LIST) + "\n"
    SECOND_INPUT_LIST = ["razz", "ma", "tazz"]
    SECOND_INPUT = "\n".join(SECOND_INPUT_LIST) + "\n"

    SCHEMA = {"type": "string", "optional": False}

    def __init__(self, test_context):
        """Create the source and sink Copycat services and the validating console consumer."""
        super(CopycatStandaloneFileTest, self).__init__(
            test_context,
            num_zk=1,
            num_brokers=1,
            topics={'test': {
                'partitions': 1,
                'replication-factor': 1
            }})

        self.source = CopycatStandaloneService(
            test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = CopycatStandaloneService(
            test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])
        self.consumer_validator = ConsoleConsumer(test_context,
                                                  1,
                                                  self.kafka,
                                                  self.TOPIC,
                                                  consumer_timeout_ms=1000)

    @parametrize(converter="org.apache.kafka.copycat.json.JsonConverter", schemas=True)
    @parametrize(converter="org.apache.kafka.copycat.json.JsonConverter", schemas=False)
    @parametrize(converter="org.apache.kafka.copycat.storage.StringConverter", schemas=None)
    def test_file_source_and_sink(self, converter="org.apache.kafka.copycat.json.JsonConverter", schemas=True):
        """
        Write lines to the source's input file, wait for the sink's output file to match
        (before and after restarting both services), then check the records in the topic.
        """
        assert converter is not None, "converter type must be set"
        # Template parameters
        self.key_converter = converter
        self.value_converter = converter
        self.schemas = schemas

        self.source.set_configs(
            self.render("copycat-standalone.properties"),
            [self.render("copycat-file-source.properties")])
        self.sink.set_configs(self.render("copycat-standalone.properties"),
                              [self.render("copycat-file-sink.properties")])

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT),
            timeout_sec=60,
            err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.")

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT),
            timeout_sec=60,
            err_msg="Sink output file never converged to the same state as the input file")

        # Validate the format of the data in the Kafka topic
        self.consumer_validator.run()
        expected = json.dumps([
            line if not self.schemas else {
                "schema": self.SCHEMA,
                "payload": line
            } for line in self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST
        ])
        decoder = (json.loads if converter.endswith("JsonConverter") else str)
        actual = json.dumps(
            [decoder(x) for x in self.consumer_validator.messages_consumed[1]])
        assert expected == actual, "Expected %s but saw %s in Kafka" % (
            expected, actual)

    def validate_output(self, value):
        """
        Return True when the md5 of the sink's OUTPUT_FILE equals the md5 of `value`.

        Returns False when the remote md5sum command raises subprocess.CalledProcessError.
        """
        # hashlib only accepts bytes on Python 3; on Python 2 a plain str already is bytes
        # and is passed through untouched.
        if not isinstance(value, bytes):
            value = value.encode('utf-8')
        expected_hash = hashlib.md5(value).hexdigest()
        try:
            output_hash = list(
                self.sink.node.account.ssh_capture(
                    "md5sum " + self.OUTPUT_FILE))[0].strip().split()[0]
        except subprocess.CalledProcessError:
            return False
        return output_hash == expected_hash
class ConnectStandaloneFileTest(KafkaTest):
    """
    Simple test of Kafka Connect that produces data from a file in one standalone process and consumes it on
    another, validating the output is identical to the input.
    """

    # Paths used on the worker nodes by the file source/sink and for offset storage.
    INPUT_FILE = "/mnt/connect.input"
    OUTPUT_FILE = "/mnt/connect.output"

    OFFSETS_FILE = "/mnt/connect.offsets"

    TOPIC = "test"

    # Two batches of input lines: the first is written before the services are
    # restarted, the second afterwards.
    FIRST_INPUT_LIST = ["foo", "bar", "baz"]
    FIRST_INPUT = "\n".join(FIRST_INPUT_LIST) + "\n"
    SECOND_INPUT_LIST = ["razz", "ma", "tazz"]
    SECOND_INPUT = "\n".join(SECOND_INPUT_LIST) + "\n"

    # Schema expected in each record envelope when schemas are enabled.
    SCHEMA = {"type": "string", "optional": False}

    def __init__(self, test_context):
        """Set up one ZooKeeper node, one broker, the source and sink services and a validating consumer."""
        super(ConnectStandaloneFileTest, self).__init__(
            test_context, num_zk=1, num_brokers=1,
            topics={"test": {"partitions": 1, "replication-factor": 1}}
        )

        self.source = ConnectStandaloneService(test_context, self.kafka, [self.INPUT_FILE, self.OFFSETS_FILE])
        self.sink = ConnectStandaloneService(test_context, self.kafka, [self.OUTPUT_FILE, self.OFFSETS_FILE])
        self.consumer_validator = ConsoleConsumer(test_context, 1, self.kafka, self.TOPIC, consumer_timeout_ms=1000)

    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=True)
    @parametrize(converter="org.apache.kafka.connect.json.JsonConverter", schemas=False)
    @parametrize(converter="org.apache.kafka.connect.storage.StringConverter", schemas=None)
    def test_file_source_and_sink(self, converter="org.apache.kafka.connect.json.JsonConverter", schemas=True):
        """
        Pipe lines from the input file through Kafka into the output file and verify both ends.

        :param converter: converter class name, exposed to the rendered config templates as both the key and
            the value converter.
        :param schemas: template parameter; when truthy, each record in the topic is expected to be wrapped in
            a ``{"schema": ..., "payload": ...}`` envelope, otherwise the bare line is expected.
        """
        assert converter is not None, "converter type must be set"
        # Template parameters
        self.key_converter = converter
        self.value_converter = converter
        self.schemas = schemas

        # The worker properties are passed as a callable taking the node, so
        # they are rendered per node; the connector properties are rendered once here.
        self.source.set_configs(
            lambda node: self.render("connect-standalone.properties", node=node),
            [self.render("connect-file-source.properties")],
        )
        self.sink.set_configs(
            lambda node: self.render("connect-standalone.properties", node=node),
            [self.render("connect-file-sink.properties")],
        )

        self.source.start()
        self.sink.start()

        # Generating data on the source node should generate new records and create new output on the sink node
        self.source.node.account.ssh("echo -e -n " + repr(self.FIRST_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT),
            timeout_sec=60,
            err_msg="Data added to input file was not seen in the output file in a reasonable amount of time.",
        )

        # Restarting both should result in them picking up where they left off,
        # only processing new data.
        self.source.restart()
        self.sink.restart()

        self.source.node.account.ssh("echo -e -n " + repr(self.SECOND_INPUT) + " >> " + self.INPUT_FILE)
        wait_until(
            lambda: self.validate_output(self.FIRST_INPUT + self.SECOND_INPUT),
            timeout_sec=60,
            err_msg="Sink output file never converged to the same state as the input file",
        )

        # Validate the format of the data in the Kafka topic
        self.consumer_validator.run()
        all_lines = self.FIRST_INPUT_LIST + self.SECOND_INPUT_LIST
        # sort_keys makes the comparison depend on record content only, not on
        # the key order the converter happened to serialize with.
        expected = json.dumps(
            [line if not self.schemas else {"schema": self.SCHEMA, "payload": line} for line in all_lines],
            sort_keys=True,
        )
        decoder = json.loads if converter.endswith("JsonConverter") else str
        actual = json.dumps(
            [decoder(x) for x in self.consumer_validator.messages_consumed[1]],
            sort_keys=True,
        )
        assert expected == actual, "Expected %s but saw %s in Kafka" % (expected, actual)

    def validate_output(self, value):
        """
        Check whether the sink's output file currently has the same MD5 digest as ``value``.

        :param value: the exact content the output file is expected to hold.
        :return: True when the digests match; False when they differ, when the remote ``md5sum`` command
            raises ``subprocess.CalledProcessError``, or when it produced no usable output.
        """
        try:
            captured = list(self.sink.node.account.ssh_capture("md5sum " + self.OUTPUT_FILE))
        except subprocess.CalledProcessError:
            return False

        # This runs as a wait_until predicate, so blank output is reported as
        # "not ready yet" instead of aborting the poll with an IndexError.
        fields = captured[0].split() if captured else []
        if not fields:
            return False

        # hashlib only accepts bytes; encode text so this also works where str is unicode.
        payload = value if isinstance(value, bytes) else value.encode("utf-8")
        return fields[0] == hashlib.md5(payload).hexdigest()