def _new_offset_probe_consumer(broker):
    """Create a short-lived consumer in a fresh, randomly named consumer group."""
    return Consumer({
        'bootstrap.servers': broker,
        'group.id': str(uuid.uuid4()),
        'auto.offset.reset': 'earliest',
        'enable.auto.commit': True,
        'auto.commit.interval.ms': 1000,
        'api.version.request': True,
        'max.poll.interval.ms': 60000,
    })


def _assigned_partitions_at_high_watermark(consumer):
    """Return the consumer's assigned partitions with `offset` set to the high watermark."""
    partitions = consumer.assignment()
    for partition in partitions:
        _, high = consumer.get_watermark_offsets(partition)
        partition.offset = high
    return partitions


def get_partitions_with_offsets(broker):
    """
    Snapshot the current high-watermark offsets of the input and output topics.

    Two temporary consumers (each in its own random group) subscribe to the input
    topics ('read', 'update', 'transfer') and to the output topic ('responses').
    Each consumer is asked for up to 100 messages with a 5 second timeout before its
    assignment is read; the consumed messages themselves are discarded.
    NOTE(review): the consume calls presumably exist to let the group assignment
    happen before `assignment()` is queried - confirm.

    Both consumers are always closed before returning, including on the early-return
    and error paths.

    :param broker: Value used for 'bootstrap.servers'.
    :return: {'input': [TopicPartition], 'output': [TopicPartition]} where every
             partition's offset is its high watermark, or {} (after printing
             "returned empty") when no input message arrived within the timeout.
    """
    input_consumer = _new_offset_probe_consumer(broker)
    try:
        output_consumer = _new_offset_probe_consumer(broker)
        try:
            input_consumer.subscribe(['read', 'update', 'transfer'])
            output_consumer.subscribe(['responses'])

            if not input_consumer.consume(timeout=5, num_messages=100):
                print("returned empty")
                return {}

            # Keep the original ordering: input watermarks are read before the
            # output consumer is polled.
            input_partitions = _assigned_partitions_at_high_watermark(input_consumer)

            output_consumer.consume(timeout=5, num_messages=100)
            output_partitions = _assigned_partitions_at_high_watermark(output_consumer)

            return {'input': input_partitions, 'output': output_partitions}
        finally:
            output_consumer.close()
    finally:
        input_consumer.close()
class TimeOrderedGeneratorWithTimeout(GeneratorInterface):
    """
    A general generator which can read multiple topics and merge their messages in time order.
    A message must be emitted at (arrival_system_time + latency_ms).
    In batch mode (until reaching the first EOP on each stream) the generator will not discard any messages.

    Per-topic buffering is delegated to `Topic` objects stored in `self.queues`, keyed by
    topic name (so one queue per topic name).
    """

    def __init__(self, broker, groupid, topics_infos: List[TopicInfo], latency_ms,
                 commit_interval_sec=None, group_by_time=False, begin_timestamp=None,
                 begin_flag=None, end_timestamp=None, end_flag=None,
                 heartbeat_interval_ms=-1):
        """
        :param broker: Broker to connect to.
        :param groupid: Group id of the consumer.
        :param topics_infos: [TopicInfo()] - list of TopicInfo objects.
        :param latency_ms: (integer >=0) Latency to wait before serving a message.
                            After this messages with lower or equal timestamps will be discarded.
        :param commit_interval_sec: How many seconds to wait between commits.
                            None (the default) does not commit with the given group id.
        :param group_by_time: Group messages with the same timestamp. This will yield a list of messages.
        :param begin_timestamp: Timestamp of the kafka messages where the generator will start.
        :param begin_flag: BEGINNING, CONTINUE, CONTINUE_OR_BEGINNING, LIVE - CONTINUE will continue
                            from the last committed offset. If there was no committed offset will start
                            from the end of the stream; CONTINUE_OR_BEGINNING sets 'auto.offset.reset'
                            to 'earliest' instead. When neither begin_timestamp nor begin_flag is given
                            the partitions are assigned at OFFSET_END.
        :param end_timestamp: Timestamp where to end the reading.
        :param end_flag: NEVER, END_OF_PARTITION
        :param heartbeat_interval_ms: -1 does not produce heartbeat. After every interval will produce
                            a HeartBeat typed message with the timestamp.
        :raises Exception: On contradictory or unknown begin/end arguments.
        """
        if begin_timestamp is not None and begin_flag is not None:
            raise Exception(
                'You can not set the begin timestamp and a flag in the same time.'
            )
        if end_timestamp is not None and end_flag is not None:
            raise Exception(
                'You can not set the end timestamp and a flag in the same time.'
            )
        if begin_timestamp is not None and end_timestamp is not None \
                and begin_timestamp >= end_timestamp:
            raise Exception(
                'The begin timestamp is larger then the end timestamp.')
        if begin_flag is not None and end_flag is not None \
                and begin_flag == BeginFlag.LIVE \
                and end_flag == EndFlag.END_OF_PARTITION:
            raise Exception(
                'You can not start in live and process until the end of the streams.'
            )
        if end_flag is not None \
                and end_flag not in (EndFlag.END_OF_PARTITION, EndFlag.NEVER):
            raise Exception(
                'Unknow end flag: {} . Please use the given enum to use proper end flag.'
                .format(end_flag))

        self.end_ts = end_timestamp
        self.end_flag = end_flag
        self.commit_interval_sec = commit_interval_sec
        self.latency_ms = latency_ms
        self.group_by_time = group_by_time
        self.max_poll_interval_ms = 5 * 60 * 1000
        self.consumer = Consumer({
            'bootstrap.servers': broker,
            'group.id': groupid,
            'enable.auto.commit': False,
            'auto.offset.reset':
                'earliest' if begin_flag == BeginFlag.CONTINUE_OR_BEGINNING else 'latest',
            'fetch.wait.max.ms': 20,
            'max.poll.interval.ms': self.max_poll_interval_ms,
            'enable.partition.eof': True,
        })
        self.last_poll = None

        self.tps = []
        self.queues = {}
        self.messages_to_be_committed = {}
        self.begin_timestamp = begin_timestamp

        for ti in topics_infos:
            topic_name = ti.topic
            self.messages_to_be_committed[topic_name] = {
                'last_msg': None,
                'committed': True,
            }

            if begin_timestamp is not None:
                # Let the broker translate the start timestamp into an offset.
                self.tps.extend(
                    self.consumer.offsets_for_times([
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=begin_timestamp)
                    ]))
            else:
                self.tps.append(
                    TopicPartition(topic_name,
                                   partition=ti.partition,
                                   offset=self._offset_for_begin_flag(begin_flag)))

            end_offset = None
            if end_flag is not None and end_flag == EndFlag.END_OF_PARTITION:
                # Snapshot the last existing offset of the partition that is actually
                # consumed (previously partition 0 was queried regardless of
                # ti.partition).
                end_offset = self.consumer.get_watermark_offsets(
                    TopicPartition(topic_name, ti.partition))[1] - 1
            # A negative end offset means the partition was empty at snapshot time:
            # no queue is created for it.
            if end_offset is None or end_offset >= 0:
                self.queues[topic_name] = Topic(topic_name,
                                                self.consumer,
                                                end_offset=end_offset,
                                                partition=ti.partition,
                                                drop=ti.drop)

        self.consumer.assign(self.tps)
        self.last_commit = time.time()
        self.running = True
        self.heartbeat_interval_ms = heartbeat_interval_ms
        self.next_hb = None

    @staticmethod
    def _offset_for_begin_flag(begin_flag):
        """Map a begin flag (or None) to the logical start offset used for assignment."""
        if begin_flag is None:
            return OFFSET_END
        if begin_flag == BeginFlag.BEGINNING:
            return OFFSET_BEGINNING
        if begin_flag in (BeginFlag.CONTINUE, BeginFlag.CONTINUE_OR_BEGINNING):
            return OFFSET_STORED
        if begin_flag == BeginFlag.LIVE:
            return OFFSET_END
        raise Exception(
            'Unknown begin flag. Please use the enum to provide proper begin flag.'
        )

    def stopGenerator(self):
        """Ask the generator to stop; `getMessages` checks this flag between steps."""
        self.running = False

    def _mark_delivered(self, msg):
        """Remember `msg` as the latest delivered, not yet committed message of its topic."""
        pending = self.messages_to_be_committed[msg.topic()]
        pending['last_msg'] = msg
        pending['committed'] = False

    def _serve_messages(self, message_to_serve):
        """
        Yield the given messages and commit delivered offsets when the interval elapsed.

        With `group_by_time` the whole list is yielded as one item, otherwise the
        messages are yielded one by one (stopping early once the generator was stopped).

        :param message_to_serve: List of Kafka messages to hand to the caller.
        """
        if self.commit_interval_sec is not None and self.group_by_time:
            for msg in message_to_serve:
                self._mark_delivered(msg)

        # serve messages
        if self.group_by_time:
            yield message_to_serve
        else:
            for msg in message_to_serve:
                self._mark_delivered(msg)
                yield msg
                if not self.running:
                    break

        # commit messages when they were delivered
        current_time = time.time()
        if self.commit_interval_sec is not None \
                and (current_time - self.last_commit) > self.commit_interval_sec:
            for pending in self.messages_to_be_committed.values():
                if not pending['committed']:
                    self.consumer.commit(pending['last_msg'])
                    pending['committed'] = True
            self.last_commit = current_time

    def _serve_heartbeat(self, current_timestamp_ms):
        """
        Yield every due HeartBeat up to and including `current_timestamp_ms`.

        The first heartbeat is anchored at `begin_timestamp` when it was given,
        otherwise at the first timestamp passed in; subsequent ones follow every
        `heartbeat_interval_ms`.
        """
        if self.next_hb is None:
            if self.begin_timestamp is not None:
                self.next_hb = self.begin_timestamp
            else:
                self.next_hb = current_timestamp_ms
        while self.next_hb <= current_timestamp_ms:
            yield HeartBeat(self.next_hb)
            self.next_hb += self.heartbeat_interval_ms

    def _earliest_queued_event_ts(self, default):
        """Return the smallest head-of-queue message timestamp, or `default` if all queues are empty."""
        return min([
            q.queue[0].message.timestamp()[1]
            for q in self.queues.values()
            if len(q.queue) > 0
        ], default=default)

    def _heartbeat_horizon(self):
        """Return the latest timestamp a heartbeat may currently be served for while idle."""
        return min(getSystemTimestamp() - self.latency_ms,
                   self._earliest_queued_event_ts(default=sys.maxsize))

    def _can_serve(self):
        """
        Return the event timestamp that may be served now, or None if nothing is ready.

        The earliest queued timestamp is servable when every queue reports
        `can_be_emitted` for it and at least one head message carrying that timestamp
        has a `ts` older than (now - latency_ms).
        """
        min_ets = self._earliest_queued_event_ts(default=-1)
        if min_ets == -1:
            return None
        deadline = getSystemTimestamp() - self.latency_ms
        # Lists (not generators) are kept on purpose so that can_be_emitted is
        # evaluated for every queue, exactly as before.
        if all([q.can_be_emitted(min_ets) for q in self.queues.values()]) and \
                any([q.queue[0].ts < deadline
                     for q in self.queues.values()
                     if len(q.queue) > 0
                     and q.queue[0].message.timestamp()[1] == min_ets]):
            return min_ets
        return None

    def getMessages(self):
        """
        Generate messages (and HeartBeats, when enabled) merged in event-time order.

        Polls the consumer, buffers messages in the per-topic queues and serves every
        timestamp that `_can_serve` releases. When all queues are stopped the remaining
        buffered messages are flushed in timestamp order and the generator exits.
        The consumer is closed when the generator runs to completion.
        """
        while self.running:
            if all([v.stopped for v in self.queues.values()]):
                # Every topic is finished: flush what is still buffered and exit.
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.queue)
                message_to_serve = [m.message for m in message_to_serve]
                message_to_serve.sort(key=lambda x: x.timestamp()[1])
                while len(message_to_serve) > 0:
                    ts = message_to_serve[0].timestamp()[1]
                    serve_it = []
                    while len(message_to_serve) > 0 \
                            and message_to_serve[0].timestamp()[1] == ts:
                        serve_it.append(message_to_serve.pop(0))
                    if not self.heartbeat_interval_ms == -1:
                        yield from self._serve_heartbeat(ts)
                    yield from self._serve_messages(serve_it)
                logging.info('Exiting from generator.')
                break

            self.last_poll = getSystemTimestamp()
            msg = self.consumer.poll(0.001)
            if msg is not None:
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        topic_queue = self.queues.get(msg.topic())
                        if topic_queue is not None:
                            topic_queue.first_eop_reached = True
                            topic_queue.end_of_partition = True
                    else:
                        logging.error('Unhandle error: {}'.format(msg.error()))
                        break
                else:
                    topic_queue = self.queues.get(msg.topic())
                    # Topics that were empty at construction time under
                    # END_OF_PARTITION have no queue although their partition is
                    # assigned; ignore late messages instead of raising KeyError,
                    # mirroring the guard of the EOF branch above.
                    if topic_queue is not None:
                        topic_queue.end_of_partition = False
                        if self.end_ts is not None \
                                and msg.timestamp()[1] > self.end_ts:
                            topic_queue.stop_topic()
                        else:
                            topic_queue.add_message(msg)

            while self.running:
                event_ts_to_serve = self._can_serve()
                # Go back to polling when nothing is ready, or when less than 30 s
                # remain before max.poll.interval.ms would be exceeded.
                if event_ts_to_serve is None or \
                        self.max_poll_interval_ms - (getSystemTimestamp() - self.last_poll) < 30000:
                    if self.end_flag == EndFlag.NEVER and self.heartbeat_interval_ms != -1 \
                            and any([q.end_of_partition for q in self.queues.values()]):
                        if self.next_hb is None:
                            self.next_hb = self._heartbeat_horizon()
                        if self.next_hb < self._heartbeat_horizon():
                            yield from self._serve_heartbeat(self.next_hb)
                    break

                if self.heartbeat_interval_ms != -1:
                    yield from self._serve_heartbeat(event_ts_to_serve)
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.get_messages(event_ts_to_serve))
                yield from self._serve_messages(message_to_serve)
                if self.end_ts is not None and self.end_ts <= event_ts_to_serve:
                    self.running = False

        self.consumer.close()
def exec_benchmark(duration_s, fps, kafka_loc, output_topic, silent):
    """
    Measures throughput at the output Kafka topic, by checking the growth in all partitions.

    The high watermark of every assigned partition is sampled `fps` times per second
    for `duration_s` seconds; each sample is stored as (messages per second, sample
    time in ms), both in total and per partition.

    :param duration_s: Length of the measurement in seconds.
    :param fps: Number of samples per second (must be non-zero).
    :param kafka_loc: Value used for 'bootstrap.servers'.
    :param output_topic: Topic whose partitions are observed.
    :param silent: When equal to the string "silent", only the integer mean of the
                   positive total-throughput samples is printed (0 if there is none);
                   otherwise a tab separated table with one row per sample is printed.
    """
    import time  # function-scope import: only needed for the idle sleep between samples

    c = Consumer({
        'bootstrap.servers': kafka_loc,
        'group.id': 'benchmark-' + str(uuid.uuid4()),
        'auto.offset.reset': 'latest',
        'max.poll.interval.ms': 86400000,
        'isolation.level': 'read_committed',
    })

    throughput_measured = []
    throughput_measured_per_partition = {}
    try:
        # === Get topic partitions
        topic_partitions = None

        def store_topic_partition(consumer, partitions):
            nonlocal topic_partitions
            topic_partitions = partitions

        c.subscribe([output_topic], on_assign=store_topic_partition)
        while topic_partitions is None:
            c.consume(timeout=0.5)

        # Baseline: current high watermark of every partition.
        last_values = {}
        for p in topic_partitions:
            _, high = c.get_watermark_offsets(p)
            throughput_measured_per_partition[p.partition] = []
            last_values[p.partition] = high

        ms_per_update = 1000 / fps
        start_time = current_milli_time()
        end_time = start_time + duration_s * 1000
        last_time = start_time
        current_time = start_time
        last_write_time = start_time
        lag = 0.0
        while current_time < end_time:
            current_time = current_milli_time()
            lag += current_time - last_time
            last_time = current_time

            while lag >= ms_per_update:
                sample_time = current_milli_time()
                time_delta = (sample_time - last_write_time) / 1000
                if time_delta > 0:
                    total_new = 0
                    for p in topic_partitions:
                        _, high = c.get_watermark_offsets(p)
                        delta = high - last_values[p.partition]
                        total_new += delta
                        throughput_measured_per_partition[p.partition].append(
                            (delta / time_delta, sample_time))
                        last_values[p.partition] = high
                    throughput_measured.append(
                        (total_new / time_delta, sample_time))
                    last_write_time = sample_time
                lag -= ms_per_update

            # Sleep until the next sample is due (never past the end of the run)
            # instead of spinning, so the benchmark does not burn a CPU core.
            idle_ms = min(ms_per_update - lag, end_time - current_time)
            if idle_ms > 0:
                time.sleep(idle_ms / 1000)
    finally:
        # Always release the consumer, also on errors or interruption.
        c.close()

    if silent != "silent":
        # Print column names
        # TIME THROUGHPUT PART-0 ... PART-N
        partition_ids = sorted(throughput_measured_per_partition)
        header = ["TIME", "THROUGHPUT"]
        header.extend("PART-{}".format(pid) for pid in partition_ids)
        print("\t".join(header))
        for row, (total, sample_time) in enumerate(throughput_measured):
            cells = [str(sample_time), str(int(total))]
            cells.extend(
                str(int(throughput_measured_per_partition[pid][row][0]))
                for pid in partition_ids)
            print("\t".join(cells))
    else:
        positive_samples = [x[0] for x in throughput_measured if x[0] > 0.0]
        # statistics.mean raises on empty data; an idle topic is reported as 0.
        print(int(statistics.mean(positive_samples)) if positive_samples else 0)