import datetime
import json
import uuid

from kafka import KafkaConsumer, KafkaProducer, TopicPartition

# SaveDataResult, MsgPartitionInfo, encode_MsgPartitionInfo,
# encode_SaveDataResult and tmpbootstrap_servers are defined elsewhere;
# illustrative sketches of the helpers follow the function below.


def getMsgData(topic, group, result, maxsize):
    try:
        saveResult = SaveDataResult()
        saveResult.guid = str(uuid.uuid4())
        saveResult.CreateDate = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        msgInfos = []
        result.guid = saveResult.guid
        result.topic_messages = []

        consumer = KafkaConsumer(
            bootstrap_servers=tmpbootstrap_servers,
            enable_auto_commit=False,
            group_id=group,
        )

        # Get all partitions of the topic
        par = consumer.partitions_for_topic(topic)

        now_count = 0
        for p in par:
            tp = TopicPartition(topic, p)
            consumer.assign([tp])
            print(tp)
            info = MsgPartitionInfo()

            # Get the committed offset; default to 0 if none exists or the call fails
            print('start to get committed offset.....')
            committed = 0
            try:
                committed = consumer.committed(tp) or 0
            except Exception as e_commit:
                print(str(e_commit))

            # Move the consumer to the end to get the last offset
            consumer.seek_to_end(tp)
            last_offset = consumer.position(tp)

            # Move the consumer to the beginning to get the first offset
            consumer.seek_to_beginning(tp)
            now_offset = consumer.position(tp)

            # Start from the committed offset, clamped to the first available one
            from_offset = max(committed, now_offset)

            info.partition_ID = tp.partition
            info.get_last_offset = last_offset
            msgInfos.append(info)

            print("[%s] partition(%s) -> now:%s, last:%s, committed:%s" %
                  (tp.topic, tp.partition, now_offset, last_offset, committed))

            # Read messages from from_offset up to last_offset, maxsize in total
            while (from_offset < last_offset) and (now_count < maxsize):
                consumer.seek(tp, from_offset)
                polldata = consumer.poll(100)
                from_offset += 1
                now_count += 1
                print('now_count=' + str(now_count))
                if tp in polldata and polldata[tp]:
                    result.topic_messages.append(polldata[tp][0].value)

        saveResult.MsgInfo = json.dumps(
            msgInfos,
            default=encode_MsgPartitionInfo,
            ensure_ascii=False,
        )
        print(saveResult.MsgInfo)
        consumer.close()
        saveResult.message = "Success"
        saveResult.Code = 200

        producer = KafkaProducer(bootstrap_servers=tmpbootstrap_servers)
        producer.send(
            topic + "_log",
            json.dumps(saveResult, default=encode_SaveDataResult).encode('utf-8'),
        )
        producer.flush()
    except Exception as e:
        # Broad catch so a failure is reported instead of crashing the caller
        print(str(e))
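# A minimal sketch of the containers and json.dumps default-encoders that
# getMsgData assumes. Names and fields are inferred from the usage above,
# not taken from the real module.

class SaveDataResult(object):
    def __init__(self):
        self.guid = ''
        self.CreateDate = ''
        self.MsgInfo = ''
        self.message = ''
        self.Code = 0


class MsgPartitionInfo(object):
    def __init__(self):
        self.partition_ID = -1
        self.get_last_offset = -1


def encode_MsgPartitionInfo(obj):
    # json.dumps calls the default hook for objects it cannot serialize;
    # dumping __dict__ works for plain attribute containers like these.
    return obj.__dict__


def encode_SaveDataResult(obj):
    return obj.__dict__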
import logging
from collections import defaultdict

import six
from kafka import KafkaConsumer, TopicPartition

# CONSUMER_OFFSET_TOPIC, InvalidMessageException, KafkaToolClient,
# get_offset_topic_partition_count, get_group_partition,
# get_topics_watermarks, read_short_string and relative_unpack come from
# the surrounding package.


# First variant: tracks group -> set(topics); written for Python 2
# (consumer.next(), dict.iteritems()).
class KafkaGroupReader:

    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self.kafka_groups = defaultdict(set)
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(
            self.active_partitions.values())
        # Drop partitions that have no messages to consume
        self.active_partitions = {
            p: tp for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks and
            self.watermarks[tp.partition].highmark > 0 and
            self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(self.active_partitions.values())
        self.log.info("Consuming from %s", self.active_partitions)

        while not self.finished():
            try:
                message = self.consumer.next()
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        return {
            group: topics
            for group, topics in self.kafka_groups.items()
            if topics
        }

    def remove_partition_from_consumer(self, partition):
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(self.active_partitions.values())
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        key = bytearray(message.key)
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            # This is not an offset commit message
            raise InvalidMessageException()
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = bytearray(message.value)
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                # Unrecognized message value
                raise InvalidMessageException()
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return str(group), str(topic), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset and (group not in self.kafka_groups or
                       topic not in self.kafka_groups[group]):
            self.kafka_groups[group].add(topic)
            self.log.info("Added group %s topic %s to list of groups", group, topic)
        elif not offset and group in self.kafka_groups and \
                topic in self.kafka_groups[group]:
            # No offset means topic deletion
            self.kafka_groups[group].discard(topic)
            self.log.info("Removed group %s topic %s from list of groups", group, topic)

    def get_current_watermarks(self, partitions=None):
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client, [CONSUMER_OFFSET_TOPIC],
        )
        partitions_set = set(tp.partition for tp in partitions) if partitions else None
        return {
            part: offset
            for part, offset in offsets[CONSUMER_OFFSET_TOPIC].iteritems()
            if offset.highmark > offset.lowmark and
            (partitions is None or part in partitions_set)
        }

    def finished(self):
        return self._finished
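# For reference, parse_consumer_offset_message above decodes __consumer_offsets
# records via the package's relative_unpack/read_short_string helpers. A
# self-contained sketch of the same key layout (schema int16, group string,
# topic string, partition int32) using only the standard library; names here
# are illustrative, not part of the real module:

import struct


def _read_short_string(buf, pos):
    # A Kafka "string": big-endian int16 length prefix, then that many bytes
    (length,) = struct.unpack_from('>h', buf, pos)
    pos += 2
    return bytes(buf[pos:pos + length]), pos + length


def parse_offset_commit_key(key):
    (schema,) = struct.unpack_from('>h', key, 0)
    if schema not in (0, 1):
        raise ValueError('not an offset commit key')
    group, pos = _read_short_string(key, 2)
    topic, pos = _read_short_string(key, pos)
    (partition,) = struct.unpack_from('>l', key, pos)
    return group, topic, partition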
# Second variant: tracks group -> topic -> partition -> offset and prunes
# topics whose partition offsets are all zero.
class KafkaGroupReader:

    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition)[group_id]

    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )
        # Fetch metadata, as partitions_for_topic only returns locally cached metadata
        # See https://github.com/dpkp/kafka-python/issues/1742
        self.consumer.topics()

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(
            list(self.active_partitions.values()))
        # Drop partitions that have no messages to consume
        self.active_partitions = {
            p: tp for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks and
            self.watermarks[tp.partition].highmark > 0 and
            self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)

        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()

        return {
            group: topics.keys()
            for group, topics in six.iteritems(self._kafka_groups)
            if topics
        }

    def _remove_unsubscribed_topics(self):
        for group, topics in list(six.iteritems(self._kafka_groups)):
            for topic, partitions in list(six.iteritems(topics)):
                # If offsets for all partitions are 0, consider the topic unsubscribed
                if not any(partitions.values()):
                    del self._kafka_groups[group][topic]
                    self.log.info(
                        "Removed group {group} topic {topic} from list of groups".format(
                            group=group, topic=topic))

    def remove_partition_from_consumer(self, partition):
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        key = message.key
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            # This is not an offset commit message
            raise InvalidMessageException()
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                # Unrecognized message value
                raise InvalidMessageException()
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated offset for group {group} topic {topic} in list of groups".format(
                    group=group,
                    topic=topic,
                ),
            )
        # TODO: check if we can ever find an offset commit message whose value is None
        elif offset is None and group in self._kafka_groups and \
                topic in self._kafka_groups[group]:
            # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info(
                "Removed group {group} topic {topic} from list of groups".format(
                    group=group, topic=topic))

    def get_current_watermarks(self, partitions=None):
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client, [CONSUMER_OFFSET_TOPIC],
        )
        partitions_set = set(tp.partition for tp in partitions) if partitions else None
        return {
            part: offset
            for part, offset in six.iteritems(offsets[CONSUMER_OFFSET_TOPIC])
            if offset.highmark > offset.lowmark and
            (partitions is None or part in partitions_set)
        }

    def finished(self):
        return self._finished
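# Shape of the state built by process_consumer_offset_message above, with
# illustrative values (group -> topic -> partition -> last committed offset):
#
#   self._kafka_groups = {
#       'payment-processor': {
#           'payments': {0: 4201, 1: 4189},
#       },
#   }
#
# read_groups then reports only groups with at least one topic left after
# _remove_unsubscribed_topics prunes topics whose offsets are all zero.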
# Third variant: identical to the second except read_group falls back to an
# empty list when the group is absent from the result.
class KafkaGroupReader:

    def __init__(self, kafka_config):
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )
        # Fetch metadata, as partitions_for_topic only returns locally cached metadata
        # See https://github.com/dpkp/kafka-python/issues/1742
        self.consumer.topics()

        if partition is not None:
            self.active_partitions = {
                partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
            }
        else:
            self.active_partitions = {
                p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                for p in self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC)
            }
        self.watermarks = self.get_current_watermarks(
            list(self.active_partitions.values()))
        # Drop partitions that have no messages to consume
        self.active_partitions = {
            p: tp for p, tp in self.active_partitions.items()
            if tp.partition in self.watermarks and
            self.watermarks[tp.partition].highmark > 0 and
            self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
        }
        # Cannot consume if there are no active partitions
        if not self.active_partitions:
            return {}

        self.consumer.assign(list(self.active_partitions.values()))
        self.log.info("Consuming from %s", self.active_partitions)

        message_iterator = iter(self.consumer)

        while not self.finished():
            try:
                message = next(message_iterator)
            except StopIteration:
                continue
            # Stop when reaching the last message written to the
            # __consumer_offsets topic when KafkaGroupReader first started
            if message.offset >= self.watermarks[message.partition].highmark - 1:
                self.remove_partition_from_consumer(message.partition)
            self.process_consumer_offset_message(message)

        self._remove_unsubscribed_topics()

        return {
            group: topics.keys()
            for group, topics in six.iteritems(self._kafka_groups)
            if topics
        }

    def _remove_unsubscribed_topics(self):
        for group, topics in list(six.iteritems(self._kafka_groups)):
            for topic, partitions in list(six.iteritems(topics)):
                # If offsets for all partitions are 0, consider the topic unsubscribed
                if not any(partitions.values()):
                    del self._kafka_groups[group][topic]
                    self.log.info(
                        "Removed group {group} topic {topic} from list of groups".format(
                            group=group, topic=topic))

    def remove_partition_from_consumer(self, partition):
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        key = message.key
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            # This is not an offset commit message
            raise InvalidMessageException()
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                # Unrecognized message value
                raise InvalidMessageException()
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated offset for group {group} topic {topic} in list of groups".format(
                    group=group,
                    topic=topic,
                ),
            )
        # TODO: check if we can ever find an offset commit message whose value is None
        elif offset is None and group in self._kafka_groups and \
                topic in self._kafka_groups[group]:
            # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info(
                "Removed group {group} topic {topic} from list of groups".format(
                    group=group, topic=topic))

    def get_current_watermarks(self, partitions=None):
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client, [CONSUMER_OFFSET_TOPIC],
        )
        partitions_set = set(tp.partition for tp in partitions) if partitions else None
        return {
            part: offset
            for part, offset in six.iteritems(offsets[CONSUMER_OFFSET_TOPIC])
            if offset.highmark > offset.lowmark and
            (partitions is None or part in partitions_set)
        }

    def finished(self):
        return self._finished
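# read_group relies on get_group_partition to locate the __consumer_offsets
# partition that holds a group's commits. Kafka brokers place a group with
# abs(group.hashCode()) % partition_count using Java's String.hashCode. A
# sketch of such a helper (an assumption for illustration, not necessarily
# the module's actual implementation):

def java_string_hashcode(s):
    # Java's String.hashCode: h = 31 * h + ord(ch), in signed 32-bit arithmetic
    h = 0
    for ch in s:
        h = (31 * h + ord(ch)) & 0xFFFFFFFF
    return h - 0x100000000 if h >= 0x80000000 else h


def get_group_partition(group, partition_count):
    # Kafka's Utils.abs maps Integer.MIN_VALUE to 0 rather than overflowing
    code = java_string_hashcode(group)
    if code == -0x80000000:
        code = 0
    return abs(code) % partition_count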