def test_offsets_for_times():
    """Check how ``offsets_for_times`` fails for a consumer with no brokers.

    The consumer is created without ``bootstrap.servers`` and with very short
    socket/session timeouts. The lookup may either return normally or raise a
    ``KafkaException``; when it raises, the error code must be one of the
    accepted codes listed in the assertion below.
    """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    # Query broker for timestamps for partition
    try:
        test_topic_partition = TopicPartition("test", 0, 100)
        c.offsets_for_times([test_topic_partition], timeout=0.1)
    except KafkaException as e:
        # e.args is a tuple: it has to be indexed, not called. Calling it
        # (the previous ``e.args([0])``) raised TypeError while building the
        # failure message and hid the real KafkaError.
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])
    finally:
        # Close the consumer even when the assertion above fails.
        c.close()
def test_offsets_for_times():
    """Check how ``offsets_for_times`` fails for a consumer with no brokers.

    The consumer is created without ``bootstrap.servers`` and with very short
    socket/session timeouts. The lookup may either return normally or raise a
    ``KafkaException``; when it raises, the error code must be one of the
    accepted codes listed in the assertion below.
    """
    c = Consumer({
        'group.id': 'test',
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100
    })

    # Query broker for timestamps for partition
    try:
        test_topic_partition = TopicPartition("test", 0, 100)
        c.offsets_for_times([test_topic_partition], timeout=0.1)
    except KafkaException as e:
        # e.args is a tuple: it has to be indexed, not called. Calling it
        # (the previous ``e.args([0])``) raised TypeError while building the
        # failure message and hid the real KafkaError.
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE),\
            str(e.args[0])
    finally:
        # Close the consumer even when the assertion above fails.
        c.close()
class KafkaQueryConsumer:
    """
    Thin facade over the Kafka library ``Consumer``.

    It exposes only the metadata-query and single-message polling calls used
    by this code, so that unit tests can substitute a fake object with the
    same methods.
    """

    def __init__(self, broker: str):
        """Create the underlying consumer connected to ``broker``."""
        # "enable.auto.commit" is False: there is no need to report our
        # position to the broker. After a crash the process is simply
        # restarted and looks for the last run_start message again.
        #
        # "queued.min.messages" is 1: the partition is consumed backwards one
        # message at a time, so pre-fetching several messages in the forward
        # direction on every backwards step would be wasted work.
        settings = {
            "bootstrap.servers": broker,
            "group.id": "consumer_group_name",
            "auto.offset.reset": "latest",
            "enable.auto.commit": False,
            "queued.min.messages": 1
        }
        self._consumer = Consumer(**settings)

    def get_topic_partitions(self, topic: str, offset: int = -1):
        """
        Return one TopicPartition per partition of ``topic``, each carrying
        the given ``offset``
        """
        topic_metadata = self._consumer.list_topics(topic).topics[topic]
        result = []
        for _, partition_metadata in topic_metadata.partitions.items():
            result.append(
                TopicPartition(topic, partition_metadata.id, offset=offset))
        return result

    def seek(self, partition: TopicPartition):
        """
        Move the consumer to the offset stored in ``partition``
        """
        self._consumer.seek(partition)

    def poll(self, timeout=2.):
        """
        Ask Kafka for a single message, waiting at most ``timeout``
        """
        message = self._consumer.poll(timeout=timeout)
        return message

    def get_watermark_offsets(self,
                              partition: TopicPartition) -> Tuple[int, int]:
        """
        Return the watermark offsets of ``partition`` as reported by the
        underlying consumer with ``cached=False``
        """
        watermarks = self._consumer.get_watermark_offsets(partition,
                                                          cached=False)
        return watermarks

    def assign(self, partitions: List[TopicPartition]):
        """
        Assign the given partitions to the underlying consumer
        """
        self._consumer.assign(partitions)

    def offsets_for_times(self, partitions: List[TopicPartition]):
        """
        Forward a timestamp-to-offset lookup to the underlying consumer and
        return its result
        """
        resolved = self._consumer.offsets_for_times(partitions)
        return resolved
def test_calling_store_offsets_after_close_throws_erro():
    """
    store_offsets and offsets_for_times on a closed consumer must both raise
    RuntimeError matching 'Consumer closed'
    """
    settings = {'group.id': 'test',
                'enable.auto.commit': True,
                'enable.auto.offset.store': False,
                'socket.timeout.ms': 50,
                'session.timeout.ms': 100}
    c = Consumer(settings)

    # Go through a subscribe/unsubscribe cycle before closing.
    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.store_offsets(offsets=[TopicPartition("test", 0, 42)])
    assert ex.match('Consumer closed')

    with pytest.raises(RuntimeError) as ex:
        c.offsets_for_times([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')
def test_calling_store_offsets_after_close_throws_erro():
    """
    store_offsets and offsets_for_times on a closed consumer must both raise
    RuntimeError whose message is exactly 'Consumer closed'
    """
    settings = {'group.id': 'test',
                'enable.auto.commit': True,
                'enable.auto.offset.store': False,
                'socket.timeout.ms': 50,
                'session.timeout.ms': 100}
    c = Consumer(settings)

    # Go through a subscribe/unsubscribe cycle before closing.
    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    # Each entry is evaluated lazily so the calls happen in the same order as
    # two separate ``with`` blocks would perform them.
    calls_on_closed_consumer = (
        lambda: c.store_offsets(offsets=[TopicPartition("test", 0, 42)]),
        lambda: c.offsets_for_times([TopicPartition("test", 0)]),
    )
    for call in calls_on_closed_consumer:
        with pytest.raises(RuntimeError) as ex:
            call()
        assert 'Consumer closed' == str(ex.value)
class KafkaConsumer:
    """Small helper around a Kafka ``Consumer`` for topic inspection."""

    def __init__(self, conf, group_id='kafka-rest-service'):
        """Build the consumer from a copy of ``conf`` with ``group.id`` set."""
        settings = dict(conf)  # copy, so the caller's mapping is untouched
        settings['group.id'] = group_id
        self.consumer = Consumer(settings)

    # @cached(cache=TTLCache(maxsize=1024, ttl=60))
    def get_topic_partition_count(self, topic_name):
        """Return the number of partitions of ``topic_name``.

        Returns 0 when the metadata lookup has no (truthy) entry for the
        topic.
        """
        cluster_metadata = self.consumer.list_topics(topic_name)
        topic_metadata = cluster_metadata.topics.get(topic_name, None)
        return len(topic_metadata.partitions) if topic_metadata else 0

    # @cached(cache=TTLCache(maxsize=1024, ttl=60))
    def get_topic_offsets(self, topic_name):
        """Report, per partition, the high watermark and an activity label.

        Every partition starts with the label '1 month'. For each look-back
        window, from '1 week' down to '1 minute', the label is overwritten
        when ``offsets_for_times`` returns an offset greater than -1 for the
        window's start timestamp (milliseconds).

        Returns a dict with ``status`` "SUCCESS" and an ``offsets`` mapping of
        partition -> [high watermark, label], or a dict with ``status``
        "ERROR" when the topic has no partitions.
        """
        pcount = self.get_topic_partition_count(topic_name)
        if pcount == 0:
            return dict(error=f"Requested topic {topic_name} not found",
                        status="ERROR",
                        report=None)

        part_status_map = {}
        for p in range(pcount):
            _low, high = self.consumer.get_watermark_offsets(
                TopicPartition(topic_name, p))
            part_status_map[p] = [high, '1 month']

        # Widest window first, so narrower windows overwrite wider ones.
        windows = (
            (60 * 24 * 7, '1 week'),
            (60 * 24, '1 day'),
            (60, '1 hour'),
            (10, '10 minutes'),
            (1, '1 minute'),
        )
        for minutes, label in windows:
            window_start = datetime.now() - timedelta(minutes=minutes)
            window_start_ms = int(window_start.timestamp()) * 1000
            lookups = [
                TopicPartition(topic_name, p, window_start_ms)
                for p in range(pcount)
            ]
            for found in self.consumer.offsets_for_times(lookups):
                if found.offset > -1:
                    part_status_map[found.partition][-1] = label

        offsets = {k: list(v) for k, v in part_status_map.items()}
        return dict(error=None,
                    status="SUCCESS",
                    topic=topic_name,
                    offsets=offsets)
def get_consumer(self, topic):
    '''
    Build a Kafka consumer for partition 0 of ``topic`` and assign it at the
    offset that ``offsets_for_times`` returns for the start timestamp
    (``dt2ts(self.start) * 1000``).
    '''
    settings = {
        'bootstrap.servers': self.config['DEFAULT']['KafkaServer'],
        'group.id': 'mygroup',
        'client.id': 'client-1',
        'enable.auto.commit': True,
        'session.timeout.ms': 6000,
        'max.poll.interval.ms': 60000000,
        'default.topic.config': {'auto.offset.reset': 'smallest'}
    }
    consumer = Consumer(settings)

    # Translate the start timestamp into a concrete offset, then assign the
    # partition at that offset.
    start_timestamp = dt2ts(self.start) * 1000
    requested = [TopicPartition(topic, 0, start_timestamp)]
    resolved = consumer.offsets_for_times(requested)
    consumer.assign(resolved)

    return consumer
def get_consumer(self, topics):
    '''
    Create a consumer and position it at the start timestamp.

    For every topic in ``topics`` partition 0 is assigned at the offset that
    ``offsets_for_times`` returns for ``dt2ts(self.start) * 1000``. Any
    exception during creation or assignment is logged and the whole
    procedure is retried.

    Returns the assigned consumer, or None once all attempts (try numbers
    0 through 1000) have failed.
    '''
    willtry = 0
    while True:
        if willtry > 1000:
            logging.error("failed to create consumer: no try left")
            return None

        consumer = None
        try:
            consumer = Consumer({
                'bootstrap.servers': self.config["DEFAULT"]["KafkaServer"],
                'group.id': 'mygroup',
                'client.id': 'client-1',
                'enable.auto.commit': True,
                'session.timeout.ms': 6000,
                'max.poll.interval.ms': 6000000,
                'default.topic.config': {
                    'auto.offset.reset': 'smallest'
                },
            })
            topicPartitions = [
                TopicPartition(topic, 0, dt2ts(self.start) * 1000)
                for topic in topics
            ]
            offsetsTimestamp = consumer.offsets_for_times(topicPartitions)
            consumer.assign(offsetsTimestamp)
            logging.debug("[get_consumer] successfully create customer")
            return consumer
        except Exception as e:
            logging.error(
                f"failed to create consumer: {e}, try {willtry}/1000")
            if consumer is not None:
                # The consumer was created but could not be positioned.
                # Close it explicitly so the retry loop does not leave a
                # trail of half-initialised consumers behind.
                try:
                    consumer.close()
                except Exception as close_error:
                    logging.warning(
                        f"failed to close consumer: {close_error}")
            willtry += 1
def get_consumer(self):
    """Create a consumer assigned to the scheduler topic at the start time.

    The topic name is ``<self.topic_header>_<SchedulerTopic from config>``.
    Partition ``self.partition`` is assigned at the offset that
    ``offsets_for_times`` returns for ``dt2ts(self.start) * 1000``.

    Returns the assigned consumer, or None when creating or positioning it
    fails (the error is logged).
    """
    # Build the topic name before entering the try block: the error handler
    # logs it, and it used to be unbound whenever Consumer() itself failed,
    # which replaced the real error with an UnboundLocalError.
    topic = f"{self.topic_header}_{self.config['BGPScheduler']['SchedulerTopic']}"

    consumer = None
    try:
        consumer = Consumer({
            'bootstrap.servers': self.config['DEFAULT']['KafkaServer'],
            'group.id': 'mygroup',
            'client.id': 'client-1',
            'enable.auto.commit': True,
            'session.timeout.ms': 6000,
            'default.topic.config': {'auto.offset.reset': 'smallest'},
        })
        topicPartitions = [
            TopicPartition(topic, self.partition, dt2ts(self.start) * 1000)
        ]
        offsetsTimestamp = consumer.offsets_for_times(topicPartitions)
        consumer.assign(offsetsTimestamp)
        return consumer
    except Exception as e:
        logging.error(f"[{topic}] {e}")
        if consumer is not None:
            # Do not leave a half-initialised consumer open.
            try:
                consumer.close()
            except Exception as close_error:
                logging.warning(f"[{topic}] {close_error}")
        return None
class TimeOrderedGeneratorWithTimeout(GeneratorInterface):
    """
    A general generator which can read multiple topics and merge their
    messages in time order. A message must be emitted at
    (arrival_system_time + latency_ms). In batch mode (until reaching the
    first EOP on each stream) the generator will not discard any messages.
    """

    def __init__(self,
                 broker,
                 groupid,
                 topics_infos: List[TopicInfo],
                 latency_ms,
                 commit_interval_sec=None,
                 group_by_time=False,
                 begin_timestamp=None,
                 begin_flag=None,
                 end_timestamp=None,
                 end_flag=None,
                 heartbeat_interval_ms=-1):
        """
        :param broker: Broker to connect to.
        :param groupid: Group id of the consumer.
        :param topics_infos: [TopicInfo()] - list of TopicInfo objects.
        :param latency_ms: (integer >=0) Latency to wait before serving a
            message. After this messages with lower or equal timestamps will
            be discarded.
        :param commit_interval_sec: How many seconds to wait between commits.
            None (the default) never commits with the given group id.
        :param group_by_time: Group messages with the same timestamp. This
            will yield a list of messages.
        :param begin_timestamp: Timestamp of the kafka messages where the
            generator will start.
        :param begin_flag: BEGINNING, CONTINUE, LIVE - CONTINUE will continue
            from the last committed offset. If there was no committed offset
            will start from the end of the stream.
        :param end_timestamp: Timestamp where to end the reading.
        :param end_flag: NEVER, END_OF_PARTITION
        :param heartbeat_interval_ms: -1 does not produce heartbeat. After
            every interval will produce a HeartBeat typed message with the
            timestamp.
        :raises Exception: when both a timestamp and a flag are given for the
            same end of the range, when the begin timestamp is not smaller
            than the end timestamp, when LIVE is combined with
            END_OF_PARTITION, or when a flag is not a known enum value.
        """
        if begin_timestamp is not None and begin_flag is not None:
            raise Exception(
                'You can not set the begin timestamp and a flag in the same time.'
            )
        if end_timestamp is not None and end_flag is not None:
            raise Exception(
                'You can not set the end timestamp and a flag in the same time.'
            )
        if begin_timestamp is not None and end_timestamp is not None and begin_timestamp >= end_timestamp:
            raise Exception(
                'The begin timestamp is larger then the end timestamp.')
        if begin_flag is not None and end_flag is not None and \
                begin_flag == BeginFlag.LIVE and end_flag == EndFlag.END_OF_PARTITION:
            raise Exception(
                'You can not start in live and process until the end of the streams.'
            )
        if end_flag is not None and not (end_flag == EndFlag.END_OF_PARTITION
                                         or end_flag == EndFlag.NEVER):
            raise Exception(
                'Unknow end flag: {} . Please use the given enum to use proper end flag.'
                .format(end_flag))
        self.end_ts = end_timestamp
        self.end_flag = end_flag
        self.commit_interval_sec = commit_interval_sec
        self.latency_ms = latency_ms
        self.group_by_time = group_by_time
        self.consumer = Consumer({
            'bootstrap.servers': broker,
            'group.id': groupid,
            'enable.auto.commit': False,
            'auto.offset.reset': 'latest',
            'enable.partition.eof': True,
            'fetch.wait.max.ms': 50
        })
        self.tps = []
        # One Topic queue and one commit-state record per topic name.
        self.queues = {}
        self.messages_to_be_committed = {}
        self.begin_timestamp = begin_timestamp
        for ti in topics_infos:
            topic_name = ti.topic
            self.messages_to_be_committed[topic_name] = {
                'last_msg': None,
                'committed': True
            }
            # Decide where reading starts for this topic partition.
            if begin_timestamp is not None:
                self.tps.extend(
                    self.consumer.offsets_for_times([
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=begin_timestamp)
                    ]))
            elif begin_flag is not None:
                if begin_flag == BeginFlag.BEGINNING:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_BEGINNING))
                elif begin_flag == BeginFlag.CONTINUE:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_STORED))
                elif begin_flag == BeginFlag.LIVE:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_END))
                else:
                    raise Exception(
                        'Unknown begin flag. Please use the enum to provide proper begin flag.'
                    )
            else:
                self.tps.append(
                    TopicPartition(topic_name,
                                   partition=ti.partition,
                                   offset=OFFSET_END))
            end_offset = None
            if end_flag is not None and end_flag == EndFlag.END_OF_PARTITION:
                # Read the high watermark of the partition that is actually
                # consumed (ti.partition). This used to be hard-coded to
                # partition 0, which gave the wrong end offset for any other
                # partition.
                end_offset = self.consumer.get_watermark_offsets(
                    TopicPartition(topic_name, ti.partition))[1] - 1
            # A negative end offset means there is nothing to read up to, so
            # no queue is created for that topic.
            if end_offset is None or end_offset >= 0:
                self.queues[topic_name] = Topic(topic_name,
                                                self.consumer,
                                                end_offset=end_offset,
                                                partition=ti.partition,
                                                drop=ti.drop)
        self.consumer.assign(self.tps)
        self.last_commit = time.time()
        self.running = True
        self.heartbeat_interval_ms = heartbeat_interval_ms
        self.next_hb = None

    def stopGenerator(self):
        """Ask the generator loops to stop at their next check."""
        self.running = False

    def _serve_messages(self, message_to_serve):
        """Yield the given messages and commit when the interval elapsed.

        In group-by-time mode the whole list is yielded once; otherwise the
        messages are yielded one by one. The last served message per topic is
        remembered so that it can be committed later.
        """
        if self.commit_interval_sec is not None and self.group_by_time:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False

        # serve messages
        if self.group_by_time:
            yield message_to_serve
        else:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False
                yield msg
                if not self.running:
                    break

        # commit messages when they were delivered
        current_time = time.time()
        if self.commit_interval_sec is not None and (
                current_time - self.last_commit) > self.commit_interval_sec:
            for k in self.messages_to_be_committed.keys():
                if not self.messages_to_be_committed[k]['committed']:
                    self.consumer.commit(
                        self.messages_to_be_committed[k]['last_msg'])
                    self.messages_to_be_committed[k]['committed'] = True
            self.last_commit = current_time

    def _serve_heartbeat(self, current_timestamp_ms):
        """Yield every pending HeartBeat up to ``current_timestamp_ms``.

        The first heartbeat is anchored at the begin timestamp when one was
        given, otherwise at ``current_timestamp_ms``.
        """
        if self.next_hb is None:
            if self.begin_timestamp is not None:
                self.next_hb = self.begin_timestamp
            else:
                self.next_hb = current_timestamp_ms
        while self.next_hb <= current_timestamp_ms:
            yield HeartBeat(self.next_hb)
            self.next_hb += self.heartbeat_interval_ms

    def _can_serve(self):
        """Return the smallest queued event timestamp if it may be served.

        Returns None when every queue is empty, when some queue reports that
        the timestamp cannot be emitted yet, or when no head message carrying
        that timestamp is older than the latency deadline.
        """
        min_ets = min([
            q.queue[0].message.timestamp()[1] for q in self.queues.values()
            if len(q.queue) > 0
        ],
                      default=-1)
        if min_ets == -1:
            return None
        deadline = getSystemTimestamp() - self.latency_ms
        if all([q.can_be_emitted(min_ets) for q in self.queues.values()]) and \
                any([q.queue[0].ts < deadline for q in self.queues.values()
                     if len(q.queue) > 0 and q.queue[0].message.timestamp()[1] == min_ets]):
            return min_ets
        else:
            return None

    def getMessages(self):
        """Generate messages (and heartbeats) merged in timestamp order.

        Polls the consumer, feeds the per-topic queues and serves whatever
        ``_can_serve`` allows. When every queue is stopped, the remaining
        queued messages are flushed in timestamp order and the generator
        ends. The consumer is closed when the main loop is left.
        """
        while self.running:
            if all([v.stopped for v in self.queues.values()]):
                # Every topic is finished: flush what is left, then exit.
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.queue)
                message_to_serve = [m.message for m in message_to_serve]
                message_to_serve.sort(key=lambda x: x.timestamp()[1])
                while len(message_to_serve) > 0:
                    ts = message_to_serve[0].timestamp()[1]
                    serve_it = []
                    while len(message_to_serve) > 0 and message_to_serve[
                            0].timestamp()[1] == ts:
                        serve_it.append(message_to_serve.pop(0))
                    if not self.heartbeat_interval_ms == -1:
                        yield from self._serve_heartbeat(ts)
                    yield from self._serve_messages(serve_it)
                logging.debug('Exiting from generator.')
                break
            msg = self.consumer.poll(0.001)
            if msg is not None:
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        if msg.topic() in self.queues:
                            self.queues[msg.topic()].first_eop_reached = True
                            self.queues[msg.topic()].end_of_partition = True
                    else:
                        logging.error('Unhandle error: {}'.format(msg.error()))
                        break
                else:
                    self.queues[msg.topic()].end_of_partition = False
                    if self.end_ts is not None and msg.timestamp(
                    )[1] > self.end_ts:
                        self.queues[msg.topic()].stop_topic()
                    else:
                        self.queues[msg.topic()].add_message(msg)
            while self.running:
                event_ts_to_serve = self._can_serve()
                if event_ts_to_serve is None:
                    # Nothing to serve right now; in never-ending mode keep
                    # heartbeats flowing while a partition is at its end.
                    if self.end_flag == EndFlag.NEVER and self.heartbeat_interval_ms != -1 \
                            and any([q.end_of_partition for q in self.queues.values()]):
                        if self.next_hb is None:
                            self.next_hb = getSystemTimestamp(
                            ) - self.latency_ms
                        yield from self._serve_heartbeat(getSystemTimestamp() -
                                                         self.latency_ms)
                    break
                if self.heartbeat_interval_ms != -1:
                    yield from self._serve_heartbeat(event_ts_to_serve)
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.get_messages(event_ts_to_serve))
                yield from self._serve_messages(message_to_serve)
                if self.end_ts is not None and self.end_ts <= event_ts_to_serve:
                    self.running = False
        self.consumer.close()
class Reader():
    """Consumes traceroute data from Kafka"""

    def __init__(self, start, end, timetrack_converter,
                 msm_ids=(5001, 5004, 5005),
                 probe_ids=(1, 2, 3, 4, 5, 6, 7, 8),
                 chunk_size=900, config=None):
        """Store the time window and read the topic name from ``config``.

        ``start`` and ``end`` are converted to millisecond timestamps with
        ``calendar.timegm``. ``config`` must provide
        ``get('io', 'kafka_topic')``. ``msm_ids`` and ``probe_ids`` are
        stored as fresh lists (or None when None is passed), so instances
        never share a default list.
        """
        self.msm_ids = None if msm_ids is None else list(msm_ids)
        self.probe_ids = None if probe_ids is None else list(probe_ids)
        self.start = int(calendar.timegm(start.timetuple()))*1000
        self.end = int(calendar.timegm(end.timetuple()))*1000
        self.chunk_size = chunk_size
        self.params = []
        self.timetrack_converter = timetrack_converter
        self.consumer = None
        self.config = config
        self.topic = self.config.get('io', 'kafka_topic')

        self.partition_total = 0
        self.partition_paused = 0

    def __enter__(self):
        """Setup kafka consumer"""
        self.consumer = Consumer({
            'bootstrap.servers': 'kafka1:9092, kafka2:9092, kafka3:9092',
            'group.id': 'ihr_raclette_'+str(self.start),
            'auto.offset.reset': 'earliest',
            'max.poll.interval.ms': 1800*1000,
        })

        # Set offsets according to start time
        topic_info = self.consumer.list_topics(self.topic)
        partitions = [
            TopicPartition(self.topic, partition_id, self.start)
            for partition_id in topic_info.topics[self.topic].partitions.keys()
        ]
        offsets = self.consumer.offsets_for_times(partitions)

        # Remove partitions with no message at or after the start time.
        # Offset 0 is a valid position (the first message of a partition),
        # so only negative offsets are discarded.
        offsets = [part for part in offsets if part.offset >= 0]
        self.partition_total = len(offsets)
        self.partition_paused = 0
        self.consumer.assign(offsets)

        return self

    def __exit__(self, type, value, traceback):
        """Close the consumer when leaving the ``with`` block."""
        self.consumer.close()
        logging.info("closed the consumer")

    def read(self):
        """Yield converted traceroutes whose timestamp is in [start, end).

        A partition is paused once a message at or after ``end`` is seen on
        it; reading stops when the paused count reaches the number of
        assigned partitions.
        """
        logging.info("Start consuming data")

        if self.consumer is not None and self.partition_total == 0:
            # Nothing was assigned in __enter__, so poll() would return None
            # forever; stop instead of spinning.
            logging.warning("No partition to consume, nothing to read")
            return

        while True:
            msg = self.consumer.poll(1.0)
            if msg is None:
                continue

            if msg.error():
                logging.error("Consumer error: {}".format(msg.error()))
                continue

            # Filter with start and end times
            ts = msg.timestamp()

            if ts[0] == confluent_kafka.TIMESTAMP_CREATE_TIME and ts[1] < self.start:
                continue

            if ts[0] == confluent_kafka.TIMESTAMP_CREATE_TIME and ts[1] >= self.end:
                self.consumer.pause([TopicPartition(self.topic, msg.partition())])
                self.partition_paused += 1
                if self.partition_paused < self.partition_total:
                    continue
                else:
                    break

            traceroute = msgpack.unpackb(msg.value(), raw=False)

            # needed? the consumer is not filtering the msm or probe ids
            # if (self.probe_ids is not None and traceroute['prb_id'] not in self.probe_ids) or \
            #         (self.msm_ids is not None and traceroute['msm_id'] not in self.msm_ids):
            #     pass

            yield self.timetrack_converter.traceroute2timetrack(traceroute)
class OffsetTranslator(): """Translates consumer group offsets as part of a migration to a new cluster. Given a consumer group, source and destination cluster, it will find the topics involved in the consumer group and the committed offsets. For there it uses OffsetsForTimes() to find the offset for a message with an equal or greater time in the destination cluster and compares a hash of the message value to confirm if the offset relates to the same message. If not, it advances the timestamp by one millisecond and finds the next offset - this becomes the range of offsets it will traverse over to find a matching hash. If there were no more recent timestamps on the topic partition, it will call getWatermarkOffsets() to get the last offset and traverse accordingly. If the number of messages to traverse is stupidly large (currently set at 500) it throws a warning. There is every possibility that the message simply doesn't exist, in which case it will throw an exception. """ def __init__(self, src_bootstrap_server, src_group_id, src_topic, dest_bootstrap_server, dest_group_id): self._admin = AdminClient({"bootstrap.servers": src_bootstrap_server}) # For reading offsets/messages in the source cluster self._consumer = Consumer({ "bootstrap.servers": src_bootstrap_server, "group.id": src_group_id, "enable.auto.commit": "false" }) # For reading offsets/messages in the destination cluster self._dest_consumer = Consumer({ "bootstrap.servers": dest_bootstrap_server, "group.id": dest_group_id, "enable.auto.commit": "false" }) # Handy instance variables self._src_group_id = src_group_id self._src_topic = src_topic self._src_bootstrap_servers = src_bootstrap_server self._dest_group_id = dest_group_id self._dest_bootstrap_servers = dest_bootstrap_server self._metadata = defaultdict(dict) self.logger = logging.getLogger('translator') self.logger.info("Offset Translator object instantiated.") self.logger.info( f" Source bootstrap servers: {self._src_bootstrap_servers}") 
self.logger.info( f" Destination bootstrap servers: {self._src_bootstrap_servers}") self.logger.info(f" Consumer group: {self._src_group_id}") def metadataKeyFromTPO(self, tpo): """Return a string key from TopicPartition object for use in metadata hash """ return f"{tpo.topic}::{tpo.partition}" def buildMetadataMap(self, tpos): """Use TopicPartition data to build internal metadata hash for comparing offsets, timestamps etc between source and destination clusters. """ self.logger.info(f"Building metadata map...") for tpo in tpos: key = self.metadataKeyFromTPO(tpo) self._metadata[key] = { "src_offset": tpo.offset, "src_timestamp": 0, "src_hash": None, "src_tpo": tpo, "src_message": None, "dest_offset": None, "dest_timestamp": None, "dest_hash": None, "dest_tpo": None, "dest_message": None } self.logger.info(f"Built metadata for {len(tpos)} TPOs") return self._metadata def getTPOs(self, topics): """Use the AdminAPI to return a list of TopicParition objects for a list of topics """ self.logger.info( f"Getting TPOs for {len(topics)} topics via admin API...") tpos = [] for t in topics: for p in self._admin.list_topics(t).topics[t].partitions: tpos.append(TopicPartition(t, p)) self.logger.info(f"Found {len(tpos)} TPOs for {len(topics)} topics.") return tpos def updateMetadata(self, metadata): """Takes output of inspectTPOMessages() and updates metadata. We don't do this automatically within inspectTPOMessagse, as we may want to use inspectTPOMessages on the destination cluster and compare to the source, so updating the object's metadata would render that useless. """ self.logger.info("Updating metadata...") for key in metadata.keys(): for inner_key in metadata[key]: self._metadata[key][inner_key] = metadata[key][inner_key] # Grab the first key and check if it relates to src_ or dest_ data.. 
sample = metadata[next(iter(metadata.keys()))] if 'src_offset' in sample.keys(): cluster = "source" elif 'dest_offset' in sample.keys(): cluster = "destination" else: raise Exception( "Metadata doesn't clearly indicate which cluster it is from.. no src_offset or dest_offset key present..." ) self.logger.info( f"{len(metadata)} updates to metadata from {cluster} cluster.") return self._metadata def inspectTPOMessages(self, tpos, cluster="source"): """ Given a list of TopicPartition objects, for each partition read the message at the required offset and extract the timestamp, hash the message value """ self.logger.info(f"Inspecting {len(tpos)} TPOs in {cluster} cluster.") # Default to the source cluster consumer; we will also use this # to inspect destination cluster messages if cluster == "source": consumer = self._consumer elif cluster == "destination": consumer = self._dest_consumer else: raise Exception( "cluster argument to inspectTPOMessages must be one of 'source' or 'destination'" ) circuit_breaker_retry_count = 0 metadata = defaultdict(dict) # This seems a slow way to just read one message at a time from a partition, but I'm not aware # of a better way of reading a single message for each partition when there may be further messages # on the partition. for tpo in tpos: # If the tpo.offset is < 0, then the consumer hasn't read anything # from the topic partition, so skip it. if tpo.offset < 0: continue consumer.assign([tpo]) while True: # Poll for data on this specific TopicPartition m = consumer.poll(1) if m is None: circuit_breaker_retry_count += 1 if circuit_breaker_retry_count > 10: print( "Too many iterations polling for data and getting nothing." 
) break else: continue elif m.error() is None: # We'll build a local copy of metadata md = {} if cluster == "source": md['src_offset'] = m.offset() md['src_timestamp'] = m.timestamp()[1] md['src_hash'] = self.sha256Object(m.value()) md['src_tpo'] = tpo md['src_message'] = m elif cluster == "destination": md['dest_offset'] = m.offset() md['dest_timestamp'] = m.timestamp()[1] md['dest_hash'] = self.sha256Object(m.value()) md['dest_tpo'] = tpo md['dest_message'] = m key = self.metadataKeyFromTPO(tpo) metadata[key] = md circruit_breaker_retry_count = 0 # Break the while loop, we've got our data for this topic/partition break else: raise Exception( f"Error reading offset {tpo.offset} from {tpo.topic}/{tpo.partition}: {m.error()}" ) self.logger.info(f"Returning metadata for {len(metadata)} TPOs") return metadata def sha256Object(self, obj): """Return the sha256 digest for a supplied object""" return hashlib.sha256(bytes(obj)).hexdigest() def getTPOsByTime(self, metadata=None): """ Build a list of TopicPartitions using message timestamps instead of offsets """ if metadata is None: metadata = self._metadata self.logger.info( f"Getting offsets from timestamps for {len(metadata)} metadata entries.." 
) tpos_by_time = list() for key in metadata.keys(): md = self._metadata[key] if md['src_timestamp'] > 0: tpo = md['src_tpo'] tpos_by_time.append( TopicPartition(tpo.topic, tpo.partition, md['src_timestamp'])) # This returns the earliest offset for a given timestamp tpos = self._dest_consumer.offsets_for_times(tpos_by_time) # Check for errors for t in [t for t in tpos if t.error is not None]: raise Exception( f"Error getting offset from timestamp: Topic {t.topic}, Partition {t.partition}, Offset {t.offset}: Error {t.error}" ) self.logger.info( f"Returning {len(tpos)} offsets from destination cluster.") return tpos def findMatchingMessages(self): """Iterate over metadata and find matching source/destination messages and separate into matched / unmatched buckets, returning a tuple """ self.logger.info( "Searching for destination messages that match via message hash..." ) # Iterate over the source cluster metadata and compare to destination cluster translated_offsets = list() unmatched_offsets = list() for key in self._metadata.keys(): metadata = self._metadata[key] src_tpo = metadata['src_tpo'] dest_message = metadata['dest_message'] dest_timestamp = metadata['dest_timestamp'] dest_tpo = metadata['dest_tpo'] self.logger.info( f" Working with TopicPartition({src_tpo.topic},{src_tpo.partition},{src_tpo.offset}) @ {metadata['src_timestamp']}" ) # We found the destination cluster message by offsets_for_times and compared hashes # If they match, then the destination offset if metadata['src_hash'] == metadata['dest_hash']: self.logger.info( f" FOUND: TopicPartition({dest_tpo.topic},{dest_tpo.partition},{dest_tpo.offset}) @ {dest_timestamp} in destination cluster" ) self._metadata[key]['matched'] = True translated_offsets.append(dest_tpo) else: self.logger.info( f" NOT FOUND: TopicPartition({dest_tpo.topic},{dest_tpo.partition},{dest_tpo.offset}) @ {dest_timestamp} does not have same hash." 
) self.logger.info( f" will traverse messages and attempt to find a match.") self._metadata[key]['matched'] = False unmatched_offsets.append(metadata) self.logger.info( f"Found {len(translated_offsets)} matching offsets and {len(unmatched_offsets)} that don't match." ) return (translated_offsets, unmatched_offsets) def findOffsetRangeToScan(self, md): """Using a metadata record as a base, identify how many records (maximum) to scan through to find a match We are here because we didn't find a match for source cluster timestamp, which means it is either not there, or multiple messages were produced during that millisecond and our offsets_for_times() call provided the lowest offset for that millisecond. We will add 1 ms to the timestamp and get the offset (if possible) and then iterate over each message and compare hashes to determine what the exact offset should be. """ self.logger.info( "Find the start/end offsets to iterate over to find a match based on message value hash." ) timestamp_end = md['src_timestamp'] + 1 # add one millisecond tpo = md['dest_tpo'] starting_offset = md['dest_offset'] end_offset = self._dest_consumer.offsets_for_times( [TopicPartition(tpo.topic, tpo.partition, timestamp_end)]) self.logger.info( f"Shifting timestamp by 1ms, from {md['src_timestamp']} to {timestamp_end}" ) self.logger.info( f" yields an offset of {end_offset[0]}") target_offset = -1 if end_offset[0].offset == -1: # There are no more recent timestamps for the topic/partition # Set the ending offset at the end of partition low, high = self._dest_consumer.get_watermark_offsets( TopicPartition(tpo.topic, tpo.partition)) target_offset = high self.logger.info( f"Reading to end of the partition... {target_offset}") if target_offset - tpo.offset > 500: self.logger.warning( f" Note: that involves reading and hashing {target_offset - tpo.offet} messages.. might take some time." 
) else: # There was a more recent timestamped message, so we'll use that as our target offset target_offset = end_offset[0].offset self.logger.info( f"Starting offset for scan is {starting_offset} (inclusive)") self.logger.info( f"Ending offset for scan is {target_offset} (exclusive)") return (starting_offset, target_offset) def compareOffsets(self): """For the list of tpos in the source cluster, look them up in the destination and compare value hashes; if they match all good; if not, iterate over records until a match is found (where duration is one millisecond, based on the assumption that multiple messages have been produced during the same millisecond) """ self.logger.info( "Comparing offsets between source and destination cluster...") pp = pprint.PrettyPrinter(indent=4) pp.pprint(self._metadata) # Check that we have destination cluster offsets and hashes before proceeding - if not, we # have incomplete data and should explode into a ball of flames to the sound of a distorted # sitar being played backwards. counter = 0 for k in self._metadata.keys(): if self._metadata[k]['dest_hash'] is None or \ self._metadata[k]['dest_offset'] is None or \ self._metadata[k]['src_hash'] is None: counter += 1 if counter > 0: raise Exception( f"{counter} out of {len(self._metadata)} topic partitions have insufficient data. Exiting." ) translated_offsets, unmatched_offsets = self.findMatchingMessages() self.logger.info("Working on unmatched offsets...") messages_found = 0 for md in unmatched_offsets: tpo = md['dest_tpo'] (starting_offset, target_offset) = self.findOffsetRangeToScan(md) for offset in range(starting_offset, target_offset): self.logger.info( f"Inspecting destination cluster message at offset {offset}..." ) results = self.inspectTPOMessages( [TopicPartition(tpo.topic, tpo.partition, offset)], cluster="destination") if len(results) == 0: raise Exception( "Didn't get any metadata from call to inspectTPOMessages(). 
This implies we read data from the source cluster, but couldn't inspect any messages in the destination cluster. Stopping." ) elif len(results) > 1: raise Exception( f"Expecting only one result from call to inspectTPOMessages, but got {len(results)}. Stopping" ) else: # Get the (only) key from the dict key = next(iter(results)) dest_hash = results[key]['dest_hash'] dest_tpo = results[key]['dest_tpo'] dest_message = results[key]['dest_message'] if dest_hash == md['src_hash']: self.logger.info(" FOUND matching record: ") self.logger.info( f" source hash was {md['src_hash']}, and" ) self.logger.info( f" dest_hash is {dest_hash}" ) self.logger.info( f". destination {dest_tpo}" ) self._metadata[key]['matched'] = True # Update our metadata to accurately reflect the correct destination message self._metadata[key][ 'dest_offset'] = dest_message.offset() self._metadata[key]['dest_hash'] = dest_hash self._metadata[key][ 'dest_timestamp'] = dest_message.timestamp()[1] self._metadata[key]['dest_tpo'] = dest_tpo self._metadata[key]['dest_message'] = dest_message translated_offsets.append(dest_tpo) messages_found += 1 # Found it so stop iterating break self.logger.info( f"Found {messages_found} out of {len(unmatched_offsets)} unmatched objects." 
) # Sort the offset map by partition number, which may have become out of # order if we needed to read and hash messages to find a hash match return sorted(translated_offsets, key=lambda k: k.partition) def getMetadata(self): """Return our offset metadata object""" return self._metadata def getMessage(self, consumer, tpo): """Read a message at a tpo, return it""" consumer.assign([tpo]) res = consumer.consume(num_messages=1, timeout=3) if len(res) == 1: return res[0] else: return None def commitTranslatedOffsets(self, tpos): """Given a list of TopicPartition objects, set the consumer group offsets""" self.logger.info("Committing offsets for supplied TPOs...") # Our offsets have been the last message consumed; need to set all offsets to +1 # so that they represent the next message to consume. for t in tpos: t.offset += 1 self.logger.info( " TPO offsets are incremented by one so that next message consumed is correct." ) errored_commits = list() retries = 3 while retries > 0: self.logger.info( f" Calling commit() for {len(tpos)} topic/partitions to destination cluster." 
) committed = self._dest_consumer.commit(offsets=tpos, asynchronous=False) for t in [t for t in committed if t.error is not None]: errored_commits.append(t) if len(errored_commits) > 0: self.logger.warning(f" Errors commiting offsets:") for t in errored_commits: self.logger.info( f" Partition({t.partition}), Offset({t.offset}): {t.error}" ) self.logger.info(f" Trying again in 2 seconds...") time.sleep(2) tpos = errored_commits errored_commits = list() retries -= 1 else: self.logger.info( "Offsets committed successfully to destination cluster") errored_commits.clear() break if len(errored_commits) > 0: self.logger.warning("Still had errors after 3 tries:") for t in errored_commits: self.logger.info( f" Partition({t.partition}), Offset({t.offset}): {t.error}" ) self.logger.info("Returning with a job not finished!!") return committed def printMetadata(self, metadata=None): if metadata is None: metadata = self._metadata #print("================================================================================") #print("================================================================================") #print("================================================================================") #pp = pprint.PrettyPrinter(indent=4) #pp.pprint(metadata) #print("================================================================================") #print("================================================================================") #print("================================================================================") topic = None for key in metadata.keys(): md = metadata[key] tpo = md['src_tpo'] if tpo.topic != topic: topic = tpo.topic self.logger.info(f"topic: {tpo.topic}:") src_offset = md['src_offset'] src_timestamp = md['src_timestamp'] src_hash = md['src_hash'] # We might be passed a metadata object that doesn't set dest_* fields if 'dest_tpo' in md: if md['dest_tpo'] is not None: dest_offset = md['dest_tpo'].offset else: dest_offset = '' else: dest_offset = '' if 
'dest_message' in md: if md['dest_message'] is not None: dest_timestamp = md['dest_message'].timestamp()[1] else: dest_timestamp = '' else: dest_timestamp = '' if 'dest_hash' in md: dest_hash = md['dest_hash'] else: dest_hash = '' self.logger.info(f" p[{tpo.partition:1}]") self.logger.info( f" source last message offset ({src_offset:1}), timestamp({src_timestamp:12}), hash({src_hash})" ) self.logger.info( f" destination last message offset ({dest_offset:1}), timestamp({dest_timestamp:12}), hash({dest_hash})" ) #if 'src_message' in md and md['src_message'] is not None: # pp.pprint(str(md['src_message'].value(),'utf-8')) #if 'dest_message' in md and md['dest_message'] is not None: # pp.pprint(str(md['dest_message'].value(),'utf-8')) #print("<<<<<< DONE") def getConsumerGroupOffsets(self, topics): """Return the latest offset for the consumer group defined at object initialisation time. Moves offset by -1 so that we can re-read the last message consumed. """ self.logger.info( f"Getting consumer group offsets for {len(topics)} topics...") tpos = self.getTPOs(topics) tpos = self._consumer.committed(tpos) self.logger.info( " Decrementing offsets so that we can inspect the last message consumed (for hashing, timestamps, etc)" ) # Wind back one offset so that we can re-read the messages for t in tpos: t.offset -= 1 self.logger.info(f"Found offsets for {len(tpos)} topic partitions.") return tpos def allOffsetsMatched(self): """Test that all metadata has a matched == True value """ self.logger.info( "Checking that all metadata records were matched in the destination cluster..." ) for md in self._metadata: if self._metadata[md]['matched'] == False: self.logger.info("Unmatched metadata records found.") return False self.logger.info("All metadata was matched.") return True def findTopicsForConsumerGroup(self, cg=None): """Given a consumer group name, Find the topics associated with the consumer group. 
We use the shell because the confluent_kafka_python package doesn't yet provide this, see: https://github.com/confluentinc/confluent-kafka-python/issues/223 """ self.logger.info( f"Finding topics associated with {self._src_group_id}...") # Test that we have a kafka-consumer-groups handy... if subprocess.run(['which', 'kafka-consumer-groups']).returncode == 1: raise OSError("No 'kafka-consumer-groups' command found in $PATH") if cg is None: cg = self._src_group_id cmd = f"kafka-consumer-groups --bootstrap-server {self._src_bootstrap_servers} --describe --group {cg} 2>/dev/null| grep {cg} | grep -v 'Error: Consumer group '| awk '{{print $2}}' | sort -u" self.logger.info(f"Running {cmd}") res = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE) cg_topics = list() for topic in str(res.stdout, 'utf-8').split('\n'): if topic != '': cg_topics.append(topic) print(f">>>>>>{cg_topics}>>>>>") if len(cg_topics) == 0: raise Exception( f"No topics found for consumer group {cg}. Nothing to do. Stopping." ) # If we were configured to run for just one topic in a CG; then return just that topic, # but only if it exists in the CG if self._src_topic is not None: if self._src_topic in cg_topics: self.logger.info( "Overriding topic list from CG tool with supplied topic.") cg_topics = [self._src_topic] else: raise Exception( f"{self._src_topic} is not associated with {cg}. Stopping." ) self.logger.info(f"Returning {cg_topics}...") return (cg_topics)