def test_kafka_flush_on_big_message(kafka_cluster):
    """Check that no rows are lost when every Kafka message holds a big batch.

    Produces ``kafka_messages`` messages to the ``flush`` topic, each made of
    ``batch_messages`` concatenated JSON rows, reads them through a Kafka
    engine table (``kafka_max_block_size = 10``) into ``test.view`` and
    asserts that the final row count equals
    ``kafka_messages * batch_messages``.

    :param kafka_cluster: Fixture argument; not referenced in the body.
    """
    # Imported locally so the bounded polling below does not rely on a
    # module-level import being present.
    import time

    # Create batches of messages of size ~100Kb
    kafka_messages = 1000
    batch_messages = 1000
    expected_rows = kafka_messages * batch_messages
    messages = [
        json.dumps({'key': i, 'value': 'x' * 100}) * batch_messages
        for i in range(kafka_messages)
    ]
    kafka_produce('flush', messages)

    # Adjacent literals keep the statement text identical to the original
    # single-string query while staying readable.
    instance.query(
        " DROP TABLE IF EXISTS test.view;"
        " DROP TABLE IF EXISTS test.consumer;"
        " CREATE TABLE test.kafka (key UInt64, value String)"
        " ENGINE = Kafka"
        " SETTINGS kafka_broker_list = 'kafka1:19092',"
        " kafka_topic_list = 'flush',"
        " kafka_group_name = 'flush',"
        " kafka_format = 'JSONEachRow',"
        " kafka_max_block_size = 10;"
        " CREATE TABLE test.view (key UInt64, value String)"
        " ENGINE = MergeTree"
        " ORDER BY key;"
        " CREATE MATERIALIZED VIEW test.consumer TO test.view AS"
        " SELECT * FROM test.kafka; "
    )

    # Wait until the 'flush' consumer group reports a committed offset equal
    # to the number of produced messages.
    client = KafkaAdminClient(bootstrap_servers="localhost:9092")
    try:
        received = False
        while not received:
            try:
                offsets = client.list_consumer_group_offsets('flush')
            except kafka.errors.GroupCoordinatorNotAvailableError:
                # The group coordinator is not available yet; ask again.
                continue
            received = any(
                topic.topic == 'flush' and offset.offset == kafka_messages
                for topic, offset in offsets.items()
            )
    finally:
        # Release the admin connection even if the lookup fails.
        client.close()

    # Poll a bounded number of times: an unbounded loop would hang forever on
    # lost rows and make the assertion below unreachable in the failing case.
    for _ in range(20):
        result = instance.query('SELECT count() FROM test.view')
        if int(result) == expected_rows:
            break
        time.sleep(1)

    # Clean up before asserting so the tables are dropped on failure too.
    instance.query(
        " DROP TABLE test.consumer;"
        " DROP TABLE test.view; "
    )

    assert int(result) == expected_rows, \
        'ClickHouse lost some messages: {}'.format(result)
def describe_group(bootstrap_server, consumer_group_name,
                   group_coordinator_id=1001):
    """Print a consumer group's name and return its committed offsets.

    :param bootstrap_server: Broker address(es) handed to ``KafkaAdminClient``.
    :param consumer_group_name: Consumer group whose offsets are looked up.
    :param group_coordinator_id: Node id of the broker queried as the group's
        coordinator. Defaults to ``1001``, the value that used to be
        hard-coded here.
    :return: Dict mapping partition number to committed offset. Entries are
        keyed by partition only, so a group that consumes several topics has
        same-numbered partitions overwrite each other.
    """
    kafka_admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_server)
    try:
        # A single request is enough: the answer depends on the coordinator
        # id, not on which broker of the cluster is being enumerated.
        group_offsets = kafka_admin_client.list_consumer_group_offsets(
            group_id=consumer_group_name,
            group_coordinator_id=group_coordinator_id,
        )
    finally:
        kafka_admin_client.close()

    # Attribute access instead of tuple unpacking, so extra fields on the
    # offset tuples do not break the loop.
    consumer_offset = {
        topic_partition.partition: offset_meta.offset
        for topic_partition, offset_meta in group_offsets.items()
    }

    print(consumer_group_name)
    return consumer_offset
def test_kafka_flush_on_big_message(kafka_cluster):
    """Check that big batched Kafka messages reach ``test.view`` completely.

    Each of the ``kafka_messages`` messages sent to the ``flush`` topic is
    ``batch_messages`` JSON rows glued together; the test passes when the
    row count in ``test.view`` equals the product of the two.
    """
    # Create batches of messages of size ~100Kb
    kafka_messages = 10000
    batch_messages = 1000
    messages = []
    for i in range(kafka_messages):
        row = json.dumps({'key': i, 'value': 'x' * 100})
        messages.append(row * batch_messages)
    kafka_produce('flush', messages)

    # Same statement text as one long literal, split for readability.
    setup_sql = (
        " DROP TABLE IF EXISTS test.view;"
        " DROP TABLE IF EXISTS test.consumer;"
        " CREATE TABLE test.kafka (key UInt64, value String)"
        " ENGINE = Kafka"
        " SETTINGS kafka_broker_list = 'kafka1:19092',"
        " kafka_topic_list = 'flush',"
        " kafka_group_name = 'flush',"
        " kafka_format = 'JSONEachRow',"
        " kafka_max_block_size = 10;"
        " CREATE TABLE test.view (key UInt64, value String)"
        " ENGINE = MergeTree"
        " ORDER BY key;"
        " CREATE MATERIALIZED VIEW test.consumer TO test.view AS"
        " SELECT * FROM test.kafka; "
    )
    instance.query(setup_sql)

    # Block until the 'flush' group has committed an offset equal to the
    # number of produced messages.
    admin = KafkaAdminClient(bootstrap_servers="localhost:9092")
    while True:
        try:
            committed = admin.list_consumer_group_offsets('flush')
        except kafka.errors.GroupCoordinatorNotAvailableError:
            # Coordinator not available yet; retry straight away.
            continue
        caught_up = any(
            partition.topic == 'flush' and meta.offset == kafka_messages
            for partition, meta in committed.items()
        )
        if caught_up:
            break

    # Give the materialized view up to 20 one-second attempts to catch up.
    attempts_left = 20
    while attempts_left > 0:
        attempts_left -= 1
        time.sleep(1)
        result = instance.query('SELECT count() FROM test.view')
        if int(result) == kafka_messages * batch_messages:
            break

    assert int(result) == kafka_messages * batch_messages, \
        'ClickHouse lost some messages: {}'.format(result)
def collect_topic_information(bootstrap_servers, old_consumer_group):
    """Gets a list of current topics being subscribed to by this consumer
    group that we may need to remove with the migration.

    Using the `list_consumer_group_offsets()` function since
    `describe_consumer_groups()` doesn't return proper data.

    :param bootstrap_servers: The Kafka brokers in the cluster to connect to.
    :param old_consumer_group: The consumer group we are migrating from.
    :return: List of distinct topic names, in the order they are first seen
        in the offsets result.
    """
    admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers)
    try:
        results = admin_client.list_consumer_group_offsets(old_consumer_group)
    finally:
        # Always release the connection, even when the lookup raises.
        admin_client.close()

    # dict.fromkeys de-duplicates while keeping first-seen order, replacing
    # the repeated "not in list" scan.
    return list(dict.fromkeys(
        topic_partition.topic for topic_partition in results
    ))
def collect_old_consumer_group_offsets(bootstrap_servers, old_consumer_group,
                                       removed_topics):
    """
    Connects to the brokers specified to gather current offset information of
    the consumer group we're migrating from, and writes it to ``OUTPUT_FILE``
    as one ``topic,partition,offset`` line per partition.

    :param bootstrap_servers: The Kafka brokers in the cluster to connect to.
    :param old_consumer_group: The consumer group we are migrating from.
    :param removed_topics: Topics whose partitions must be left out of the
        output. ``None`` or an empty collection means nothing is skipped.
    """
    admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers)
    try:
        results = admin_client.list_consumer_group_offsets(old_consumer_group)
    finally:
        # Always release the connection, even when the lookup raises.
        admin_client.close()

    # Tolerate None so callers without removed topics need no special case.
    skipped_topics = removed_topics or ()
    delimiter = ','

    with open(OUTPUT_FILE, 'w') as f:
        f.writelines(
            delimiter.join((
                str(topic_partition.topic),
                str(topic_partition.partition),
                str(offset_meta.offset),
            )) + '\n'
            for topic_partition, offset_meta in results.items()
            if topic_partition.topic not in skipped_topics
        )
class KafkaUtils(object):
    """Bundle a Kafka producer, consumer and admin client for one topic/group.

    Every helper works on the ``topic`` and ``group_id`` supplied at
    construction time.
    """

    def __init__(self, bootstrap_servers: list, topic: str, group_id: str):
        """Create the Kafka clients and remember topic and group.

        :param bootstrap_servers: Brokers handed to every Kafka client.
        :param topic: Topic used for producing, consuming and offset lookups.
        :param group_id: Consumer group whose offsets are inspected.
        """
        self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers,
                                      api_version=(5, 5, 1),
                                      request_timeout_ms=1000)
        self.consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers)
        self.admin_client = KafkaAdminClient(
            bootstrap_servers=bootstrap_servers)
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        self.group_id = group_id

    def has_consumer_group(self) -> bool:
        """Return True if the admin client lists a group named ``group_id``."""
        # Each listed entry is indexable; element 0 is compared to the id.
        return any(group[0] == self.group_id
                   for group in self.admin_client.list_consumer_groups())

    def wait_until_consumer_group(self):
        """Wait for the consumer group via ``do_until_true_with_timeout``."""
        # NOTE(review): retry/timeout policy lives in the helper, which is
        # defined outside this class.
        do_until_true_with_timeout(self.has_consumer_group)

    def consume_messages_and_close(self):
        """Read the topic from the earliest offset with a temporary consumer.

        The consumer joins ``group_id`` with auto-commit enabled and stops
        iterating once no message arrives for 5 seconds
        (``consumer_timeout_ms=5000``).
        """
        tmp_consumer = KafkaConsumer(self.topic,
                                     bootstrap_servers=self.bootstrap_servers,
                                     auto_offset_reset='earliest',
                                     group_id=self.group_id,
                                     consumer_timeout_ms=5000,
                                     enable_auto_commit=True)
        try:
            for msg in tmp_consumer:
                log.info(f"Found message [ {msg.value} ]")
        finally:
            # Close even if iteration raises, so the consumer is not leaked.
            tmp_consumer.close()

    def ensure_topic_created(self):
        """Create the topic, ignoring the error if it already exists."""
        try:
            # NOTE(review): positional args are presumably
            # (name, num_partitions, replication_factor) - confirm.
            self.admin_client.create_topics([NewTopic(self.topic, 2, 1)])
        except TopicAlreadyExistsError:
            pass

    def _produce_record_sync(self, key: str, value: str):
        """Send one record and block until it is acknowledged and flushed.

        :param key: Record key; encoded to bytes before sending.
        :param value: Record value; encoded to bytes before sending.
        :raises KafkaError: Re-raised after logging when sending fails.
        """
        future = self.producer.send(self.topic, str.encode(value),
                                    str.encode(key))
        try:
            future.get(5)
            self.producer.flush(5)
        except KafkaError as e:
            # Use the same logger as the rest of the class; the bare raise
            # keeps the original traceback.
            log.warning("Could not produce Kafka record!" + str(e))
            raise

    def produce_element_with_delay(self, delay_ms: int):
        """Produce a record with a fresh key and ``delay_ms`` as its value."""
        key = uuid()
        log.info(
            f"Producing element with key [ {key} ] and delay [ {delay_ms} ]")
        self._produce_record_sync(key, str(delay_ms))

    def _get_topic_partitions(self) -> list[TopicPartition]:
        """Return a ``TopicPartition`` for every partition of the topic."""
        return [
            TopicPartition(self.topic, partition)
            for partition in self.consumer.partitions_for_topic(self.topic)
        ]

    def get_latest_offsets(self) -> dict[int, int]:
        """Return the end offset of each topic partition, keyed by partition."""
        end_offsets = self.consumer.end_offsets(self._get_topic_partitions())
        return convert_to_ordered_dict({
            topic_partition.partition: offset
            for topic_partition, offset in end_offsets.items()
        })

    def get_latest_offset_for_partition(self, partition: int) -> int:
        """Return the end offset of ``partition``, or -1 if it is unknown."""
        return self.get_latest_offsets().get(partition, -1)

    def get_offsets(self) -> dict[int, int]:
        """Return the group's committed offsets, keyed by partition.

        The result is keyed by partition number only and is not filtered by
        topic.
        """
        group_offsets = self.admin_client.list_consumer_group_offsets(
            self.group_id)
        return convert_to_ordered_dict({
            topic_partition.partition: offset_meta.offset
            for topic_partition, offset_meta in group_offsets.items()
        })

    def get_offset_difference(self) -> OffsetDifference:
        """Pair the committed offsets with the latest offsets."""
        return OffsetDifference(self.get_offsets(), self.get_latest_offsets())

    def wait_for_offset_catchup(self, timeout_seconds: int = 60):
        """Poll once per second until the group is up to date.

        :param timeout_seconds: How long to keep polling.
        :raises Exception: With message ``"Timed out!"`` when the deadline
            passes without a successful check.
        """
        # Monotonic clock: the deadline must not move with wall-clock jumps.
        deadline = time.monotonic() + timeout_seconds
        while time.monotonic() < deadline:
            try:
                self.assert_group_up_to_date()
                return
            except Exception as e:
                # Best-effort retry: log the failed check and try again.
                log.info(e)
                time.sleep(1)
        raise Exception("Timed out!")

    def assert_group_up_to_date(self):
        """Assert that the offset difference reports being up to date."""
        assert self.get_offset_difference().is_up_to_date()

    def ensure_not_up_to_date_for_n_seconds(self, seconds: int):
        """Check every 2 seconds that the group has NOT caught up.

        :param seconds: How long to keep checking.
        :raises Exception: With message ``"Offsets are up to date!"`` as soon
            as a check finds the group up to date.
        """
        # Monotonic clock: the deadline must not move with wall-clock jumps.
        deadline = time.monotonic() + seconds
        while time.monotonic() < deadline:
            offset_difference = self.get_offset_difference()
            log.info("Offset difference: " + str(offset_difference))
            if offset_difference.is_up_to_date():
                raise Exception("Offsets are up to date!")
            time.sleep(2)