def employeeportal():
    tp = TopicPartition('crashed-devices', 0)
    consumer = KafkaConsumer(
        'crashed-devices',
        bootstrap_servers=[
            'ec2-52-203-135-135.compute-1.amazonaws.com:9092',
            'ec2-52-70-111-222.compute-1.amazonaws.com:9092',
            'ec2-34-193-78-218.compute-1.amazonaws.com:9092'
        ],
        enable_auto_commit=True,
        group_id='my-group',
        auto_offset_reset='earliest',
        value_deserializer=lambda x: loads(x.decode('utf-8')))
    lastOffset = consumer.beginning_offsets([tp])[tp]
    latitudes = []
    longitudes = []
    i = 0
    for message in consumer:
        i += 1
        msg = message.value
        latitudes.append(msg['latitude'])
        longitudes.append(msg['longitude'])
        print(latitudes, longitudes)
        if i == 1:
            print("GOT HERE")
            consumer.commit()
            break
    consumer.close()
    return render_template("employeeportal.html",
                           APIkey='AIzaSyD9e3Rdo8fGQq6hzaXkdsdQzv9Hy0rTolE',
                           latitudes=latitudes,
                           longitudes=longitudes)
def create_consumer(args, policy):
    """
    Refer to the Python package kafka-python, a high-level message consumer for Kafka brokers.
    The consumer iterator returns consumer records, which expose basic message
    attributes: topic, partition, offset, key, and value.
    :param args: Input arguments
    :param policy: Object to store Network Policy for processing
    :return: KafkaConsumer object, messages from the message bus for processing
    """
    consumer = KafkaConsumer(args.get('topic'),
                             api_version=API_VERSION,
                             bootstrap_servers=args.get('broker'),
                             client_id=CLIENT_ID,                      # name passed to servers for identification
                             auto_offset_reset=args.get('start_at'),   # consume earliest or latest available msgs
                             enable_auto_commit=AUTOCOMMIT,            # autocommit offsets?
                             consumer_timeout_ms=args.get('timeout'),  # StopIteration if no message after 'n' milliseconds
                             security_protocol=SSL,
                             ssl_context=create_ssl_context(args))

    # Returned values are of type set
    msg = ["All the topics available: {}".format(consumer.topics()),
           "Subscription: {}".format(consumer.subscription()),
           "Partitions for topic: {}".format(consumer.partitions_for_topic(args.get('topic'))),
           "TopicPartitions: {}".format(consumer.assignment())]
    policy.add_fact('consumer_debug', msg)

    # Offsets are of type int
    policy.add_fact('beginning_offsets', str(consumer.beginning_offsets(consumer.assignment())))
    policy.add_fact('end_offsets', str(consumer.end_offsets(consumer.assignment())))
    policy.start_at_offset = args.get('start_at_offset')
    policy.add_fact('start_at_offset', policy.start_at_offset)
    return consumer
def main():
    consumer = KafkaConsumer('topic_test_cluster',
                             bootstrap_servers=['master:9092'])

    print(consumer.partitions_for_topic('topic_test_cluster'))
    print(consumer.topics())
    print(consumer.subscription())
    print(consumer.assignment())
    print(consumer.beginning_offsets(consumer.assignment()))

    # Read from partition 2, starting at offset 5
    consumer.seek(TopicPartition(topic='topic_test_cluster', partition=2), 5)

    for msg in consumer:
        print('%s:%d:%d: key=%s value=%s' % (msg.topic, msg.partition,
                                             msg.offset, msg.key, msg.value))
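# A caveat on the seek() call above, hedged: kafka-python assigns partitions to a
# subscribe-mode consumer only during a poll, so seeking immediately after
# construction can fail with an "Unassigned partition" assertion. A minimal
# sketch of the safe ordering, reusing the broker/topic names from the snippet:
from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer('topic_test_cluster', bootstrap_servers=['master:9092'])
consumer.poll(timeout_ms=1000)  # triggers the group join and partition assignment
tp = TopicPartition('topic_test_cluster', 2)
if tp in consumer.assignment():
    consumer.seek(tp, 5)  # the partition is assigned, so seek is now valid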
def __init__(self,
             broker: str,
             topic: str,
             partition: int = -1,
             start: Union[int, datetime, PartitionOffset] = PartitionOffset.END,
             stop: Union[int, datetime, PartitionOffset] = PartitionOffset.NEVER):
    consumer = KafkaConsumer(bootstrap_servers=broker,
                             fetch_max_bytes=52428800 * 6,
                             consumer_timeout_ms=100)
    existing_topics = consumer.topics()
    self.current_msg = None
    self.current_offset_limits = HighLowOffset(-1, -1)
    if topic not in existing_topics:
        raise RuntimeError(f"Topic \"{topic}\" does not exist.")
    existing_partitions = consumer.partitions_for_topic(topic)
    if partition == -1:
        partition = existing_partitions.pop()
    elif partition not in existing_partitions:
        raise RuntimeError(f"Partition {partition} for topic \"{topic}\" does not exist.")
    topic_partition = TopicPartition(topic, partition)
    consumer.assign([topic_partition, ])
    if start == PartitionOffset.BEGINNING:
        consumer.seek_to_beginning()
    elif start == PartitionOffset.END or start == PartitionOffset.NEVER:
        consumer.seek_to_end()
    elif type(start) is int:
        first_offset = consumer.beginning_offsets([topic_partition, ])
        if first_offset[topic_partition] > start:
            consumer.seek_to_beginning()
        else:
            consumer.seek(partition=topic_partition, offset=start)
    elif type(start) is datetime:
        found_offsets = consumer.offsets_for_times({topic_partition: int(start.timestamp() * 1000)})
        consumer.seek(partition=topic_partition, offset=found_offsets[topic_partition].offset)
    self.to_thread = Queue()
    self.from_thread = Queue(maxsize=100)
    self.thread = Thread(target=thread_function, daemon=True,
                         kwargs={"consumer": consumer,
                                 "stop": stop,
                                 "in_queue": self.to_thread,
                                 "out_queue": self.from_thread,
                                 "topic_partition": topic_partition})
    self.thread.start()
def consume(self):
    consumer = KafkaConsumer(self.topic, bootstrap_servers=self.bootstrap_servers)
    print(consumer.partitions_for_topic(self.topic))          # partition info for the topic
    print(consumer.topics())                                  # list of topics
    print(consumer.subscription())                            # topics this consumer subscribes to
    print(consumer.assignment())                              # topic/partition assignment of this consumer
    print(consumer.beginning_offsets(consumer.assignment()))  # earliest offsets this consumer can read
    consumer.seek(TopicPartition(topic=self.topic, partition=0), 1)  # reset the offset: consume from offset 1
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                             message.offset, message.key, message.value))
def consume(self):
    consumer = KafkaConsumer(self.topic, bootstrap_servers=self.bootstrap_servers)
    print(consumer.partitions_for_topic(self.topic))
    print(consumer.topics())
    print(consumer.subscription())
    print(consumer.assignment())
    print(consumer.beginning_offsets(consumer.assignment()))
    consumer.seek(TopicPartition(topic=self.topic, partition=0), 1)
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                             message.offset, message.key, message.value))
class GetEffectiveOffset:

    def __init__(self, broker_list, group_name, topic):
        self.topic = topic
        self.consumer = KafkaConsumer(group_id=group_name, bootstrap_servers=broker_list)

    def get_offset(self):
        partitions_structs = []
        for partition_id in self.consumer.partitions_for_topic(self.topic):
            partitions_structs.append(TopicPartition(self.topic, partition_id))
        beginning_offset = self.consumer.beginning_offsets(partitions_structs)
        end_offset = self.consumer.end_offsets(partitions_structs)
        for partition, offset in beginning_offset.items():
            print('{0} => beginning offset = {1}; end offset = {2}'.format(
                partition, offset, end_offset[partition]))
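# The beginning/end pair above extends naturally to a consumer-lag report:
# compare each partition's committed offset with its end offset. A minimal
# sketch, assuming a group that has already committed offsets (broker, group,
# and topic names below are placeholders, not from the snippet):
from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer(group_id='my-group', bootstrap_servers=['localhost:9092'])
topic = 'my-topic'
partitions = [TopicPartition(topic, p) for p in consumer.partitions_for_topic(topic)]
end_offsets = consumer.end_offsets(partitions)
for tp in partitions:
    committed = consumer.committed(tp)  # None if this group never committed the partition
    lag = end_offsets[tp] - (committed if committed is not None else 0)
    print('{} => lag = {}'.format(tp, lag))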
def offset_manage_manually_consume():
    """
    Manually set the consumer offset.
    :return:
    """
    consumer = KafkaConsumer(TOPIC, bootstrap_servers=BOOTSTRAP_SERVERS)
    print(consumer.partitions_for_topic(TOPIC))               # partition info for the topic
    print(consumer.topics())                                  # topics available on the Kafka server
    print(consumer.subscription())                            # topics this consumer subscribes to
    print(consumer.assignment())                              # topic/partition assignment of this consumer
    print(consumer.beginning_offsets(consumer.assignment()))  # earliest offsets this consumer can read
    consumer.seek(TopicPartition(topic=u'%s' % TOPIC, partition=0), 235000)  # reset the offset: consume from offset 235000
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                             message.offset, message.key, message.value))
def desc_topic(args):
    consumer = KafkaConsumer(bootstrap_servers=[args.broker])
    topics = consumer.topics()
    if args.topic not in topics:
        consumer.close()
        print(f'Topic "{args.topic}" not in cluster.')
    else:
        partitions = consumer.partitions_for_topic(args.topic)
        tp_list = []
        for p in partitions:
            tp = TopicPartition(args.topic, p)
            tp_list.append(tp)
        beginning_offsets = consumer.beginning_offsets(tp_list)
        end_offsets = consumer.end_offsets(tp_list)
        print(f'Topic: {args.topic}')
        print(f'Partition: {partitions}')
        print(f'Beginning Offsets: {list(beginning_offsets.values())}')
        print(f'End Offsets: {list(end_offsets.values())}')
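# Because both dicts returned above are keyed by the same TopicPartition
# objects, the number of retained messages per partition falls out as end
# minus beginning. A short sketch reusing the names from the snippet above:
counts = {tp: end_offsets[tp] - beginning_offsets[tp] for tp in tp_list}
print(f'Messages retained per partition: {counts}')
print(f'Total: {sum(counts.values())}')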
def cusumer():
    start = time.time()
    n = 0
    _consumer = KafkaConsumer('4.1.1.1.python-test',
                              group_id='test1',
                              bootstrap_servers='192.168.18.134:9092',
                              consumer_timeout_ms=1000)
    print(_consumer.partitions_for_topic('4.1.1.1.python-test'))
    # TopicPartition('4.1.1.1.python-test', '0')
    # a = namedtuple("_TopicPartition", ["_4.1.1.1.python-test", "_0"])
    offset = _consumer.committed(
        TopicPartition(topic='4.1.1.1.python-test', partition=0))
    # _consumer.seek_to_beginning()
    # _consumer.seek_to_beginning(TopicPartition(topic='4.1.1.1.python-test', partition=0))
    # _consumer.assign([TopicPartition(topic='4.1.1.1.python-test', partition=0)])
    print(_consumer.assignment())
    print(_consumer.subscription())
    # beginning_offsets() expects a collection of TopicPartitions, not a bare one
    print(_consumer.beginning_offsets(
        [TopicPartition(topic='4.1.1.1.python-test', partition=0)]))
    # _consumer.seek(TopicPartition(topic='4.1.1.1.python-test', partition=0), offset - 1)
    # print(_consumer.position(TopicPartition(topic='4.1.1.1.python-test', partition=0)))
    # _consumer.commit()
    return  # early return: the throughput loop below is never reached
    while 1:
        try:
            for message in _consumer:
                # yield message
                print(message.value)
                n = n + 1
                stop = time.time()
                if stop - start > 1:
                    print(n / (stop - start))
                    start = time.time()
                    n = 0
                # print('time out')
        except KafkaTimeoutError as e:
            print(e)
        except KafkaError as e:
            print(e)
        finally:
            pass
def info(topic):
    print('brokers: {}'.format(','.join(bootstrap_servers)))
    consumerclient = KafkaConsumer(bootstrap_servers=bootstrap_servers)
    partitions = consumerclient.partitions_for_topic(topic)
    print('topic: {}'.format(topic))
    print('partitions: {}'.format(','.join(
        str(partition) for partition in partitions)))
    partitioninstances = []
    for partition in partitions:
        partitioninstance = TopicPartition(topic, int(partition))
        partitioninstances.append(partitioninstance)
    beginningoffsets = consumerclient.beginning_offsets(partitioninstances)
    endoffsets = consumerclient.end_offsets(partitioninstances)
    for pi in partitioninstances:
        msg = 'partition: {}, beginning_offset: {}, end_offset: {}'
        msg = msg.format(pi.partition, beginningoffsets[pi], endoffsets[pi])
        print(msg)
def doTest(self):
    print(self.className + " - " + pyUtils.getCurrentRunningFunctionName() + "------------------")
    _topicName = "pro_bilog"
    local_url = ['host:port']
    real_url = ['ip:port']
    _consumer = KafkaConsumer(_topicName,
                              bootstrap_servers=local_url,
                              group_id='test',
                              request_timeout_ms=3000,
                              session_timeout_ms=5000)
    # partition info for the topic
    print(_consumer.partitions_for_topic(_topicName))
    # list of topics
    print(_consumer.topics())
    # topics this consumer subscribes to
    print(_consumer.subscription())
    # topic/partition assignment of this consumer
    print(_consumer.assignment())
    # earliest offsets this consumer can read
    print(_consumer.beginning_offsets(_consumer.assignment()))
def thread_function(consumer: KafkaConsumer, stop: Union[datetime, int], in_queue: Queue,
                    out_queue: Queue, topic_partition):
    known_sources: Dict[bytes, DataSource] = {}
    start_time = datetime.now(tz=timezone.utc)
    update_timer = datetime.now(tz=timezone.utc)
    while True:
        messages_ctr = 0
        for kafka_msg in consumer:
            new_msg = Message(kafka_msg)
            if type(stop) is int and new_msg.offset > stop:
                pass
            elif type(stop) is datetime and new_msg.timestamp is not None and new_msg.timestamp > stop:
                pass
            elif type(stop) is datetime and new_msg.timestamp is None and new_msg.kafka_timestamp > stop:
                pass
            else:
                if new_msg.source_hash not in known_sources:
                    known_sources[new_msg.source_hash] = DataSource(new_msg.source_name,
                                                                    new_msg.message_type,
                                                                    start_time)
                known_sources[new_msg.source_hash].process_message(new_msg)
                messages_ctr += 1
                if messages_ctr == CHECK_FOR_MSG_INTERVAL:
                    break
        if not in_queue.empty():
            new_msg = in_queue.get()
            if new_msg == "exit":
                break
        now = datetime.now(tz=timezone.utc)
        if now - update_timer > UPDATE_STATUS_INTERVAL:
            update_timer = now
            try:
                out_queue.put(copy(known_sources), block=False)
                low_offset = consumer.beginning_offsets([topic_partition, ])[topic_partition]
                high_offset = consumer.end_offsets([topic_partition, ])[topic_partition]
                out_queue.put(HighLowOffset(low_offset, high_offset))
            except Full:
                pass  # Do nothing
    consumer.close(True)
consumer = KafkaConsumer(
    kafka_topic_name,
    group_id=kafka_consumer_group_id,
    client_id=kafka_client_id,
    bootstrap_servers=kafka_brokers,
    request_timeout_ms=6001,
    session_timeout_ms=6000,
    heartbeat_interval_ms=2000,
    auto_offset_reset="earliest",
    enable_auto_commit=False
)
app_logger.get.info("Consumer init successful")

kafka_partitions: Set[int] = consumer.partitions_for_topic(kafka_topic_name)
kafka_topic_partitions: List[TopicPartition] = [TopicPartition(kafka_topic_name, p)
                                                for p in kafka_partitions]
kafka_min_offsets = consumer.beginning_offsets(kafka_topic_partitions)
kafka_max_offsets = consumer.end_offsets(kafka_topic_partitions)
app_logger.get.info(f"Cluster info: brokers - {kafka_brokers} partitions - {kafka_topic_partitions} "
                    f"min. offset - {kafka_min_offsets} "
                    f"max. offsets - {kafka_max_offsets}")

# Print connection statistics
print("\nCluster statistics:")
print(f"\tBrokers:\t{kafka_brokers}")
print(f"\tTopic:\t{kafka_topic_name}")
print(f"\tPartitions:\t{kafka_partitions}")
print(f"\tConsumer group id:\t{kafka_consumer_group_id}")
print(f"\nStart reading from topic \"{kafka_topic_name}\"...")
counter: int = 0
class Consumer:
    __flight_messages = dict()
    __consumers_count = 0

    def __init__(self, *args, **kwargs):
        self.topics = args
        self.consumer_id = kwargs.pop('consumer_id', Consumer.__consumers_count + 1)
        self.manager_id = kwargs.pop('manager_id', '')
        self.__consumer = KafkaConsumer(*args, **kwargs)
        self.__enable_polling = True
        self.__is_active = True
        self.__poll_delay = MIN_POLL_DELAY
        self.__group_id = kwargs.get('group_id', None)
        self.__name = 'Consumer_{}'.format(Consumer.get_consumer_count())
        self.processed_images = 0
        Consumer.increment_consumer_count()
        print('Topics listened by the consumers:', self.topics)
        if self.__group_id and self.__group_id not in Consumer.__flight_messages:
            Consumer.__flight_messages.update({self.__group_id: 0})

    @staticmethod
    def get_messages_in_flight():
        return Consumer.__flight_messages

    @staticmethod
    def get_consumer_count():
        return Consumer.__consumers_count

    @staticmethod
    def update_consumer_count(count):
        Consumer.__consumers_count = count

    @staticmethod
    def increment_consumer_count():
        Consumer.__consumers_count += 1

    def is_active(self):
        return self.__is_active

    def get_current_subscriptions(self):
        return self.__consumer.subscription()

    def get_initial_offset(self, partitions):
        return self.__consumer.beginning_offsets(partitions)

    def get_current_position(self, partition):
        return self.__consumer.position(partition)

    def get_end_offset(self, partitions):
        return self.__consumer.end_offsets(partitions)

    def subscribe_topics(self, *topics):
        self.__consumer.subscribe(topics=topics)
        print('subscribed to the topics', topics)

    def consume_messages(self, process_fn):
        assert process_fn and callable(process_fn), \
            'process_fn is mandatory and must be callable'
        for message in self.__consumer:
            process_fn(message)
        self.close_consumer()

    def set_alive(self, is_alive):
        redis_cli.set_multi_value(
            HEALTHCHECK_HASHKEY,
            '{}:{}'.format(self.manager_id, self.consumer_id),
            1 if is_alive else 0)

    def log_consumer_meta(self):
        consumer_meta = dict(messages_count=self.processed_images,
                             topics=self.topics,
                             group_id=self.__group_id,
                             name=self.__name)
        redis_cli.set_multi_value(
            META_INFO_HASHKEY,
            '{}:{}'.format(self.manager_id, self.consumer_id),
            json.dumps(consumer_meta))

    def __poll(self, timeout_ms=0, max_records=MAX_RECORDS_PER_POLL):
        while self.__enable_polling:
            print('{} Listening to messages...'.format(self.__name))
            try:
                message = self.__consumer.poll(timeout_ms=timeout_ms, max_records=max_records)
                if message:
                    Consumer.__flight_messages[self.__group_id] = 0
                    self.__poll_delay = MIN_POLL_DELAY
                    max_offset_position = self.get_end_offset(message.keys())
                    for partitions in message:
                        try:
                            records = message.get(partitions)
                            Consumer.__flight_messages[self.__group_id] += \
                                (max_offset_position.get(partitions)
                                 - self.get_current_position(partitions))
                            yield records
                        except Exception as exc:
                            print(exc)
                    self.processed_images += len(records)
                    self.log_consumer_meta()
                else:
                    delay = self.__poll_delay * 2
                    if delay <= MAX_POLL_DELAY:
                        self.__poll_delay = delay
                    else:
                        self.__poll_delay = MAX_POLL_DELAY
                    self.set_alive(False)
                    self.log_consumer_meta()
                    sleep(self.__poll_delay)
            except AssertionError as assertion_exc:
                self.stop_polling()
                print(assertion_exc)
        print('returning None')
        self.set_alive(False)
        return None

    def poll_topics(self, process_fn, timeout_ms=0, max_records=MAX_RECORDS_PER_POLL):
        assert process_fn and callable(process_fn), \
            'process_fn is mandatory and must be callable'
        print('polling has begun')
        for records in self.__poll(timeout_ms=timeout_ms, max_records=max_records):
            if records is None:
                break
            process_fn(records)
        print('exit from consuming messages')

    def stop_polling(self):
        print('stopping polling and closing the consumer')
        self.__enable_polling = False
        self.close_consumer()

    def close_consumer(self):
        self.__consumer.close()
        self.__is_active = False
class KafkaCli(object):
    """ kafka cli """

    CMD_HELP_LINES = {
        "list": "list <optional: match pattern, regex format>",
        "partition": "partition <required: topic>"
    }

    CMD_OPTIONS = [
        "list",
        "partition",
    ]

    def __init__(self, server_addr):
        self.server_addr = server_addr
        self.consumer = None
        self.producer = None
        self.cmd_proc_funcs = {}
        self.prompt_line = _color("kafka> ", "cyan")
        # reg
        self.reg_all_cmds()

    def connect(self):
        """ connect to kafka """
        try:
            self.consumer = KafkaConsumer(bootstrap_servers=self.server_addr)
            return True, "Success"
        except Exception as e:
            return False, "connect to {} failed, {}".format(self.server_addr, e)

    def cmd_completer(self, text, state):
        """ cmd completer """
        # on first trigger, build possible matches
        if state == 0:
            # cache matches (entries that start with entered text)
            if text:
                self.matches = [s for s in self.CMD_OPTIONS
                                if s and s.startswith(text)]
            else:
                # no text entered, all matches possible
                self.matches = self.CMD_OPTIONS[:]
        # return match indexed by state
        try:
            return self.matches[state]
        except IndexError:
            return None

    def prepare_auto_complete(self):
        """ prepare auto complete """
        # cmd complete
        readline.set_completer(self.cmd_completer)
        readline.parse_and_bind('tab: complete')

    def reg_cmd_process(self, cmd_starts, func):
        """ register cmd process function """
        self.cmd_proc_funcs[cmd_starts] = func

    def reg_all_cmds(self):
        """ reg all cmds """
        # help
        self.reg_cmd_process("help", self.print_help)
        # list
        self.reg_cmd_process("list", self.list_topics)
        # partition offsets
        self.reg_cmd_process("partition", self.get_partitions)

    def dispatch_cmd(self, cmd_line):
        """ dispatch """
        matches = [s for s in self.cmd_proc_funcs if s and cmd_line.startswith(s)]
        # get the first
        if matches:
            match_cmd = matches[0]
            self.cmd_proc_funcs[match_cmd](cmd_line)
        else:
            self.print_help()

    def print_help(self, cmd=None, cmd_line=None):
        """ help """
        print("Usage: ")
        if not cmd or cmd not in self.CMD_HELP_LINES:
            for cmd in self.CMD_HELP_LINES:
                print("{}".format(self.CMD_HELP_LINES[cmd]))
            print("")
        else:
            print("{}\n".format(self.CMD_HELP_LINES[cmd]))

    def print_sep_line(self):
        """ print separator line """
        print(_color("+{}+".format("-" * 50), 'magenta'))

    def list_topics(self, cmd_line):
        """ list topics """
        line_info = re.split(r"\s+", cmd_line)
        match_pattern = None
        if len(line_info) > 1:
            match_pattern = re.compile(line_info[1])
        topics = self.consumer.topics()
        if topics:
            cnt = 0
            print(_color("+{}+".format("-" * 50), 'magenta'))
            for topic in topics:
                if match_pattern:
                    m = match_pattern.match(topic)
                    if m:
                        print(topic)
                        cnt += 1
                else:
                    print(topic)
                    cnt += 1
            print(_color("+{}+".format("-" * 50), 'magenta'))
            print(_color("\nGet {} result(s)\n".format(cnt), 'yellow'))
        else:
            print(_color("\nGet 0 result(s)\n", 'yellow'))

    def get_partitions(self, cmd_line):
        """ get topic partitions """
        line_info = re.split(r"\s+", cmd_line)
        if len(line_info) < 2:
            self.print_help(cmd="partition")
        else:
            topics = line_info[1:]
            offsets = {}
            for topic in topics:
                partition_ids = self.consumer.partitions_for_topic(topic)
                if not partition_ids:
                    continue
                offsets[topic] = {}
                topic_partitions = \
                    [TopicPartition(topic, p_id) for p_id in partition_ids]
                # begin offsets
                begin_offsets = self.consumer.beginning_offsets(topic_partitions)
                # end offsets
                end_offsets = self.consumer.end_offsets(topic_partitions)
                for tp in topic_partitions:
                    p_id = tp.partition
                    offsets[topic][p_id] = {}
                    offsets[topic][p_id]["begin"] = begin_offsets[tp]
                    offsets[topic][p_id]["end"] = end_offsets[tp]
            # print result
            for topic in topics:
                if topic in offsets:
                    self.print_sep_line()
                    print("{}".format(topic))
                    for p_id in offsets[topic]:
                        print("{} {}:{}".format(_color(p_id, 'yellow'),
                                                offsets[topic][p_id]["begin"],
                                                offsets[topic][p_id]["end"]))
                else:
                    self.print_sep_line()
                    print("Get no partitions for topic {}".format(topic))
            self.print_sep_line()
            print("")

    def run(self):
        """ run cli """
        if not self.consumer:
            ok, msg = self.connect()
            if not ok:
                print(msg)
                return False
        # cmd complete
        self.prepare_auto_complete()
        # loop
        while True:
            line = input(self.prompt_line)
            line = line.strip()
            if not line:
                continue
            try:
                self.dispatch_cmd(line)
            except Exception as e:
                print("Exception occurred, {}".format(e))
import time
import pandas as pd
import json
from kafka import KafkaConsumer, TopicPartition

datalist = []
i = 0

# Consumer with a manually set offset
consumer = KafkaConsumer('phone-game-userinfo',
                         bootstrap_servers=['172.23.11.150:9092'])

print(consumer.partitions_for_topic("phone-game-userinfo"))  # partition info for the topic
print(consumer.topics())                                     # list of topics
print(consumer.subscription())                               # topics this consumer subscribes to
print(consumer.assignment())                                 # topic/partition assignment of this consumer
print(consumer.beginning_offsets(consumer.assignment()))     # earliest offsets this consumer can read

consumer.seek(TopicPartition(topic=u'phone-game-userinfo', partition=0), 202025)  # reset the offset: consume from offset 202025

print(consumer.end_offsets(consumer.assignment()))  # Get the last offset for the given partitions
print(consumer.end_offsets([TopicPartition(topic='phone-game-userinfo', partition=0)]))  # equivalent to the line above

t = '2018-05-10'
timeArray = time.strptime(t, '%Y-%m-%d')
timeStamp = int(time.mktime(timeArray)) * 1000  # offsets_for_times() expects epoch milliseconds
print(consumer.offsets_for_times({TopicPartition(topic='phone-game-userinfo', partition=0): timeStamp}))

for message in consumer:
    print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset,
                                         message.key, message.value.decode('utf-8')))
    # print(message.value.decode('utf-8'))
    # print(message.offset)
    # data = message.value.split(',')
    # print(data)
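# To start consuming from that date rather than just printing the lookup, the
# OffsetAndTimestamp returned by offsets_for_times() (None when no message is
# that recent) feeds straight into seek(). A minimal sketch reusing the names
# from the snippet above:
tp = TopicPartition('phone-game-userinfo', 0)
found = consumer.offsets_for_times({tp: timeStamp})[tp]
if found is not None:
    consumer.seek(tp, found.offset)  # earliest offset with timestamp >= timeStamp
else:
    consumer.seek_to_end(tp)         # no message at or after that time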
def __init__(self,
             broker: str,
             topic: str,
             partition: int = -1,
             start: Tuple[Union[int, datetime, PartitionOffset], Optional[int]] = PartitionOffset.END,
             stop: Union[int, datetime, PartitionOffset] = PartitionOffset.NEVER):
    self.to_thread = Queue()
    self.from_thread = Queue(maxsize=100)
    consumer = KafkaConsumer(bootstrap_servers=broker,
                             fetch_max_bytes=52428800 * 6,
                             consumer_timeout_ms=100)
    existing_topics = consumer.topics()
    self.current_msg = None
    self.current_offset_limits = HighLowOffset(-1, -1)
    if topic not in existing_topics:
        raise RuntimeError(f"Topic \"{topic}\" does not exist.")
    existing_partitions = consumer.partitions_for_topic(topic)
    if partition == -1:
        partition = existing_partitions.pop()
    elif partition not in existing_partitions:
        raise RuntimeError(f"Partition {partition} for topic \"{topic}\" does not exist.")
    topic_partition = TopicPartition(topic, partition)
    consumer.assign([topic_partition, ])
    first_offset = consumer.beginning_offsets([topic_partition])[topic_partition]
    last_offset = consumer.end_offsets([topic_partition])[topic_partition]
    origin_offset = None
    offset_to_offset = start[1]
    if start[0] == PartitionOffset.BEGINNING:
        origin_offset = first_offset
    elif start[0] == PartitionOffset.END or start[0] == PartitionOffset.NEVER:
        origin_offset = last_offset
    elif type(start[0]) is int:
        if first_offset > start[0]:
            origin_offset = first_offset
        elif last_offset < start[0]:
            origin_offset = last_offset
        else:
            origin_offset = start[0]
    elif type(start[0]) is datetime:
        found_offsets = consumer.offsets_for_times({topic_partition: int(start[0].timestamp() * 1000)})
        if found_offsets[topic_partition] is None:
            origin_offset = last_offset
        else:
            origin_offset = found_offsets[topic_partition].offset
    else:
        raise RuntimeError("Unknown start offset configured.")
    if offset_to_offset is not None:
        origin_offset += offset_to_offset
        if origin_offset < first_offset:
            origin_offset = first_offset
        elif origin_offset > last_offset:
            origin_offset = last_offset
    consumer.seek(partition=topic_partition, offset=origin_offset)
    self.thread = Thread(target=thread_function, daemon=True,
                         kwargs={"consumer": consumer,
                                 "stop": stop,
                                 "in_queue": self.to_thread,
                                 "out_queue": self.from_thread,
                                 "topic_partition": topic_partition})
    self.thread.start()
class Consumer(KafkaPython):

    def __init__(self, bootstrap_servers=None, **kwargs):
        super().__init__(servers=bootstrap_servers)
        consumer_config.update(kwargs)
        self.engine = KafkaConsumer(bootstrap_servers=self._bootstrap_servers,
                                    client_id=self._client_id,
                                    **consumer_config)
        self.DbClient = Mongo()
        self.group_id = consumer_config.get('group_id', None)
        self.tps = []
        self._partition_mode = None
        # three offset storage modes: 1 db, 2 kafka, 3 both
        self.offset_store_mode = store_config.get('offset')

    def get_user_topics(self):
        return self.engine.topics()

    # Consumption mode 1: manually assign partitions to this consumer
    '''
    topics example value:
    {'name': 'test_topic', 'num_partitions': 3, 'replication_factor': 3,
     'replica_assignments': {}, 'topic_configs': {}}
    '''
    def assign_partition(self, topics: list):
        if Consumer().get_user_topics().intersection(
                {item['topic'] for i, item in enumerate(topics)}):
            for v in topics:
                tp = kafka.TopicPartition(topic=str(v['topic']),
                                          partition=int(v['partition']))
                self.tps.append(tp)
            self.engine.assign(self.tps)
        else:
            raise Exception('topics contains unknown topic(s): %s' % topics)
        self._partition_mode = '1'
        return self

    # Consumption mode 2: the consumer subscribes to topics
    def sub_partition(self, topic: list):
        self.tps = topic
        self._partition_mode = '2'
        return self

    # Start consuming
    def topic_consumer(self, **kwargs):
        if self._partition_mode == '1':
            for tp in self.tps:
                data = self.find_or_create(
                    topic=tp.topic,
                    partition=tp.partition,
                    group_id=self.group_id,
                )
                if data:
                    self.engine.seek(tp, int(data.get('current_offset', 0)) + 1)
                else:
                    self.engine.seek(tp, self.engine.beginning_offsets([tp])[tp])
            return self.engine
        elif self._partition_mode == '2':
            self.engine.subscribe(self.tps,
                                  pattern=kwargs.get('pattern', None),
                                  listener=kwargs.get('listener', None))
        else:
            raise Exception('you have to choose the partition mode')

    # Roll the current offset back to a given offset, or to the last committed one.
    # Used when the consumer finished consuming and handed the data off, but the
    # downstream processing failed.
    def rollback_offset(self, topic, partition, group_id, offset=None):
        if offset is None:
            committed_offset = self.engine.committed(
                kafka.TopicPartition(topic=topic, partition=partition))
            if committed_offset is None:
                raise Exception(
                    'topic:%s,partition:%s,group_id:%s has no commit record yet, '
                    'you should provide an offset' % (topic, partition, group_id))
            offset = committed_offset
        self.commit_offset(group_id=group_id,
                           topic=topic,
                           partition=partition,
                           offset=offset)

    # Commit this consumer's offset information
    def commit_offset(self, group_id, topic, partition, offset):
        if self.group_id is None:
            raise Exception('you must enter a group_id')
        tp = kafka.TopicPartition(topic=str(topic), partition=int(partition))
        # commit the offset to both kafka and the database
        if self.offset_store_mode == 'both':
            self.engine.commit(offsets={tp: (kafka.OffsetAndMetadata(offset, None))})
            self.DbClient.commit_offset(topic=topic,
                                        group_id=group_id,
                                        partition=partition,
                                        offset=offset)
        # commit to the kafka server only
        elif self.offset_store_mode == 'kafka':
            self.engine.commit(offsets={tp: (kafka.OffsetAndMetadata(offset, None))})
        # commit to the database only
        else:
            self.DbClient.commit_offset(topic=topic,
                                        group_id=group_id,
                                        partition=partition,
                                        offset=offset)

    def find_or_create(self, **kwargs):
        client = self.DbClient
        data = client.get_offset(**kwargs)
        if data is None:
            client.create_offset(**kwargs)
            return False
        else:
            return data
# Create a consumer
consumer = KafkaConsumer(
    # consumer_timeout_ms=10000,  # how long the iterator waits for new messages; default is unlimited
    enable_auto_commit=False,     # auto commit is the default
    # auto_commit_interval_ms=5000,
    group_id="g_2",               # with a group_id, consumption resumes from the last committed offset
    bootstrap_servers="172.31.32.39:9092",
    client_id='11'                # within one group, only one client consumes a given partition at a time
)

# Manually assign a topic/partition to this consumer
# (mutually exclusive with passing topics to the constructor)
consumer.assign([TopicPartition("test_yang", 0)])
# consumer.seek(TopicPartition(topic='test33333', partition=0), 170)  # manually set the fetch offset of a
#     TopicPartition (use together with assign); this does not change the committed offset

print(consumer.assignment())  # TopicPartitions assigned to this consumer
print(consumer.beginning_offsets(
    {TopicPartition(topic='test33333', partition=0)}))  # first offset of the given partition
print(consumer.end_offsets(
    {TopicPartition(topic='test33333', partition=0)}))  # last offset of the given partition
print(consumer.topics())  # list of topics
print(consumer.partitions_for_topic('test33333'))  # partition info for the given topic
print(consumer.committed(
    TopicPartition(topic='test33333', partition=0)))  # offset committed by the current group

# Manually commit offsets; offsets format: {TopicPartition: OffsetAndMetadata(offset_num, None)}
# consumer.commit()  # synchronously commit the current offset (defaults to consumed offset + 1)
consumer.commit(
    offsets={TopicPartition('topic_yang', 0): OffsetAndMetadata(0, None)})  # synchronous commit
# re = consumer.commit_async(offsets=None, callback=None)
# print(re.succeeded())
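# Hand-rolled commits are easy to get off by one: the Kafka convention is to
# commit the offset of the *next* message to read. A minimal sketch of
# committing after processing, assuming the assign-mode consumer above;
# handle() is a hypothetical processing function, not part of the original:
from kafka import OffsetAndMetadata, TopicPartition

tp = TopicPartition("test_yang", 0)
for message in consumer:
    handle(message)  # hypothetical: process the record before committing
    # commit message.offset + 1: where consumption should resume after a restart
    consumer.commit({tp: OffsetAndMetadata(message.offset + 1, None)})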
import logging

from kafka import KafkaConsumer

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)16s - %(levelname)8s - %(message)s')

consumer = KafkaConsumer(
    'topic_lijihua',
    group_id='group_lijihua',
    bootstrap_servers=['111.229.87.152:9092'],
    consumer_timeout_ms=1000  # stop iterating if no message arrives within 1000 ms
)

try:
    for msg in consumer:
        logging.info('[record received]: {}'.format(msg))
        logging.info('[message value]: {}'.format(msg.value))
        logging.info('[subscribed topics]: {}'.format(consumer.subscription()))
        logging.info('[topic/partition assignment]: {}'.format(consumer.assignment()))
        logging.info('[beginning offsets]: {}'.format(
            consumer.beginning_offsets(consumer.assignment())))
except Exception as err:
    logging.info(err)
def poll(self):
    # consumer = KafkaConsumer(self.messageQueue, auto_offset_reset='earliest',
    #                          bootstrap_servers=['localhost:9092'], api_version=(0, 10),
    #                          consumer_timeout_ms=1000)
    # consumer = KafkaConsumer(self.messageQueue, auto_offset_reset='earliest',
    #                          bootstrap_servers=['bay15:9092'], consumer_timeout_ms=self.consumerTimeoutMS)
    consumer = KafkaConsumer(auto_offset_reset='earliest',
                             bootstrap_servers=['server8:9092'],
                             consumer_timeout_ms=self.consumerTimeoutMS)
    A = [TopicPartition(self.messageQueue, self.parititonToMonitor)]
    consumer.assign(A)
    partitions = consumer.assignment()

    if self.verbose:
        idx = 0
        for msg in consumer:
            idx += 1
            print("\t idx: %d len(msg): %d msg: --%s-- " % (idx, len(msg.value), msg.value))
            break

    # consumer.config['group_id'] = self.actionName
    partitions = consumer.assignment()
    print("partitions: %s " % (str(partitions)))
    print(" Before while loop -- Offsets: begin: %s end: %s "
          % (consumer.beginning_offsets(partitions), consumer.end_offsets(partitions)))

    lastShippedOffset = list(consumer.beginning_offsets(partitions).values())[0]
    curEndOffset = list(consumer.end_offsets(partitions).values())[0]
    if self.launchType == "realtime":
        if curEndOffset > 3 * self.batchSize:
            lastShippedOffset = curEndOffset - 3 * self.batchSize  # Can get the cold start out of our way.
        else:
            lastShippedOffset = curEndOffset - 1  # Assuming the wrapper is run before the producerRTA script
    elif self.launchType == "fixedInterval":
        lastShippedOffset = 0
        lastShippedOffset = list(consumer.beginning_offsets(partitions).values())[0]
    print("\t Last shipped offset: %d " % (lastShippedOffset))
    # sys.exit()
    consumer.seek(TopicPartition(self.messageQueue, self.parititonToMonitor), lastShippedOffset + 1)

    # lastShippedOffset = 0
    idx = 0
    toSendJson = []
    while True:
        idx += 1
        curEndOffset = list(consumer.end_offsets(partitions).values())[0]
        if (curEndOffset - lastShippedOffset) >= self.batchSize:
            if self.issuesNumInvocations % 5 == 0:
                print("\t Yaay! found something to ship yo, cos curEndOffset: %d lastShippedOffset: %d "
                      % (curEndOffset, lastShippedOffset))
            getRecords = consumer.poll(timeout_ms=self.consumerTimeoutMS, max_records=self.batchSize)
            # for key, value in getRecords.items():  # should change this to an if condition.
            if len(getRecords) > 0:
                # print("\t getRecords: %s len(getRecords): %d " % (getRecords, len(getRecords)))
                key = list(getRecords.keys())[0]
                value = list(getRecords.values())[0]
                # print("\t key: --%s-- len(value): %d " % (key, len(value)))
                # consumer.commit(offsets=self.getOffsetList(value))
                print("\t len(toSendJson): %d value[0].offset: %d " % (len(toSendJson), value[0].offset))
                for curMsg in value:
                    curDict = {}
                    curDict["topic"] = str(curMsg.topic)
                    curDict["partition"] = str(curMsg.partition)
                    curDict["offset"] = str(curMsg.offset)
                    curDict["key"] = str(curMsg.key)
                    curDict["value"] = str(curMsg.value)
                    toSendJson.append(curDict)
                print("\t len(toSendJson): %d curMsg.offset: %d topic: %s partition: %d "
                      % (len(toSendJson), curMsg.offset, curMsg.topic, curMsg.partition))

                # time wsk action invoke --result pathHello --param name World -i
                # response = requests.post(self.triggerURL, json=payload, auth=self.authHandler,
                #                          timeout=10.0, verify=check_ssl)
                startingOffset = lastShippedOffset
                if len(toSendJson) >= self.batchSize:
                    # print("\t url --> %s " % (self.actionUrl))
                    allResponses = []
                    curRespStart = 0
                    remainingLen = len(toSendJson)
                    while remainingLen > 0:
                        payload = {}
                        if remainingLen < (1.5 * self.batchSize):
                            curDispatchSize = remainingLen
                        else:
                            curDispatchSize = self.batchSize
                            curDispatchSize = remainingLen
                        payload["params"] = toSendJson[curRespStart:curDispatchSize - 1]
                        for curReq in range(self.numReqsPerIter):
                            if self.issuesNumInvocations % self.printOffset == 0:
                                print("\t #reqs-issued: %d curReq: %s " % (self.issuesNumInvocations, curReq))
                            response = requests.post(self.actionUrl, auth=self.authHandler,
                                                     json=payload, timeout=10.0, verify=False)
                            allResponses.append([response.status_code, curDispatchSize, response,
                                                 datetime.datetime.now()])
                            time.sleep(self.btwLaunchSleep)
                        if self.issuesNumInvocations < 2:
                            time.sleep(7.5)
                        else:
                            time.sleep(self.btwLaunchSleep)
                        curRespStart += curDispatchSize
                        remainingLen -= curDispatchSize
                        # print("\t curDispatchSize: %d remainingLen: %d " % (curDispatchSize, remainingLen))

                    curRespStart = 0
                    for batchIdx, curRespSet in enumerate(allResponses):
                        # print("\t curRespSet: %s " % (curRespSet))
                        curRespCode = curRespSet[0]
                        curDispatchSize = curRespSet[1]
                        curResp = curRespSet[2]
                        issuedTS = curRespSet[3]
                        # print("\t response status_code: %s curDispatchSize: %s " % (curRespCode, curDispatchSize))
                        if curRespCode in range(200, 300):
                            response_json = curResp.json()
                            if 'activationId' in response_json and response_json['activationId'] is not None:
                                if self.issuesNumInvocations % self.printOffset == 0:
                                    print("[{}] Fired trigger with activationID {}".format(
                                        self.actionName, response_json['activationId']))
                                self.allActivationsInfo.append([response_json['activationId'], issuedTS])
                                # if self.issuesNumInvocations < 5:
                                #     time.sleep(7.5)
                                # else:
                                #     time.sleep(self.btwLaunchSleep)
                            else:
                                print("[{}] Successfully fired trigger".format(self.actionName))
                                # print("\t Response json: --%s-- " % (str(response_json)))
                        if batchIdx % self.numReqsPerIter == 0:
                            self.issuesNumInvocations += 1
                            idxOffset = curRespStart + curDispatchSize - 1
                            # if self.verbose:
                            print("\t idxOffset: %d toSendJson[idxOffset]\t[topic]: %s\t [offset]: %s "
                                  % (idxOffset, str(toSendJson[idxOffset]["topic"]),
                                     str(toSendJson[idxOffset]["offset"])))
                            # Assuming it's safe to seek up to the point we have successfully processed.
                            # If not all actions are successful, we might end up rereading from the queue.
                            # This is fault tolerant, but not a performant design.
                            lastShippedOffset = int(toSendJson[idxOffset]["offset"])
                            curRespStart += curDispatchSize
                            if self.issuesNumInvocations % self.printOffset == 0:
                                print("\t idxOffset: %d curRespStart: %d curDispatchSize: %d "
                                      % (idxOffset, curRespStart, curDispatchSize))
                    # while seeking, keep the +1: we want the record after the one already processed.
                    consumer.seek(TopicPartition(curMsg.topic, curMsg.partition), lastShippedOffset + 1)
                    self.sendNumMessages += (lastShippedOffset - startingOffset)
                    # lastShippedOffset is not adjusted, so no +1 is needed for counting.
                    # lastShippedOffset = curMsg.offset
                    if self.verbose:
                        print("\t Done with committing.. lastShippedOffset: %d self.issuesNumInvocations: %d "
                              "self.sendNumMessages: %d "
                              % (lastShippedOffset, self.issuesNumInvocations, self.sendNumMessages))
                    toSendJson = []  # All read records are processed; clear the buffer.
                else:
                    print("\t idx: %d curEndOffset: %d lastShippedOffset: %d "
                          % (idx, curEndOffset, lastShippedOffset))
                    time.sleep(self.pollingPeriod)
            else:
                print("\t len(getRecords): %d " % (len(getRecords)))
        else:
            print("\t idx: %d curEndOffset: %d lastShippedOffset: %d self.sendNumMessages: %d "
                  % (idx, curEndOffset, lastShippedOffset, self.sendNumMessages))
            time.sleep(self.pollingPeriod)
        if (self.issuesNumInvocations >= self.maxNumInvocations) or (self.sendNumMessages >= self.maxMessages):
            break

    print("\t End, self.issuesNumInvocations: %d idx: %d self.sendNumMessages: %d"
          % (self.issuesNumInvocations, idx, self.sendNumMessages))
    self.publishActivationInfo()
    if consumer is not None:
        consumer.close()
class KafkaCache(NicosCacheReader):
    _consumer = None
    _topic = ""

    def __init__(self, **kwargs):
        brokers = kwargs['brokers']
        topic = kwargs['topics']
        if not isinstance(brokers, list):
            brokers = [brokers]
        if not isinstance(topic, str):
            raise TypeError('topic must be a string')
        self._connect(brokers)
        self._assign(topic)
        self._initial_db()

    def _connect(self, brokers):
        self._consumer = KafkaConsumer(bootstrap_servers=brokers,
                                       auto_offset_reset='earliest')

    def _assign(self, topic):
        consumer = self._consumer
        alltopics = consumer.topics()
        if topic not in alltopics:
            raise ValueError('topic: %s is not present' % topic)
        partitions = consumer.partitions_for_topic(topic)
        consumer.assign(
            [TopicPartition(topic, partition) for partition in partitions])
        self._topic = topic

    def _initial_db(self):
        consumer = self._consumer
        assignment = consumer.assignment()
        end = consumer.end_offsets(list(assignment))
        for partition in assignment:
            while consumer.position(partition) < end[partition]:
                message = next(consumer)
                key = message.key.decode().split('/')
                if self._message_is_interesting(key):
                    self._update_db(key)

    def _log(self):
        consumer = self._consumer
        assignment = consumer.assignment()
        beginning = self._consumer.beginning_offsets(list(assignment))
        end = self._consumer.end_offsets(list(assignment))
        print('beginning: %r\tend: %r' % (beginning, end))
        for partition in assignment:
            print('> partition %r: offset: %d' % (partition, consumer.position(partition)))

    def disconnect(self):
        self._consumer.unsubscribe()
        self._consumer.close()

    def run(self):
        consumer = self._consumer
        while not self._stop:
            message = next(consumer)
            key = message.key.decode().split('/')
            if self._message_is_interesting(key):
                self._update_db(key)
class PythonKafkaReader(KafkaReader):

    def __init__(self, kafka_hosts):
        self.config = {
            "bootstrap_servers": kafka_hosts,
            "client_id": "KsnapClient",
            "max_poll_interval_ms": 10000,
            "auto_offset_reset": "earliest",
            "enable_auto_commit": False,
        }
        self.consumer = KafkaConsumer(**self.config)
        self.topics: List[str] = []

    @staticmethod
    def _check_reach_offsets(msg: ConsumerRecord, offset_dict):
        if (msg.topic, msg.partition) not in offset_dict:
            return True
        return offset_dict[(msg.topic, msg.partition)] <= msg.offset

    def list_topics(self) -> Set[str]:
        return self.consumer.topics()

    def subscribe(self, topics: List[str]):
        # TODO: consider having add_topics as methods
        self.topics = topics
        self.consumer.subscribe(topics)

    def _get_latest_offsets(self) -> Dict[Tuple[str, int], int]:
        tps: List[TopicPartition] = []
        for t in self.consumer.topics():
            if t not in self.topics:
                continue
            partitions = self.consumer.partitions_for_topic(t)
            for p in partitions:
                tps.append(TopicPartition(t, p))
        d = {}
        low_offset_dict = self.consumer.beginning_offsets(tps)
        high_offset_dict = self.consumer.end_offsets(tps)
        for tp in tps:
            low = low_offset_dict.get(tp)
            high = high_offset_dict.get(tp)
            if high is None:
                logger.debug(tp)
                continue
            if low == high:
                logger.info(f'No messages in topic: {tp.topic} '
                            f'partition: {tp.partition}')
                continue
            # high watermark is latest offset + 1
            d[(tp.topic, tp.partition)] = high - 1
            logger.debug(f'Latest offset for topic: {tp.topic} '
                         f'partition: {tp.partition}: {high - 1}')
        return d

    def read(self, timeout: int = 0) -> Dict[Tuple[str, int], List[Message]]:
        msg_count = 0
        offset_dict = self._get_latest_offsets()
        done_partitions: Set[Tuple[str, int]] = set()
        msg_dict: Dict[Tuple[str, int], List[Any]] = defaultdict(list)
        try:
            start_time = datetime.now()
            while True:
                # break if timeout is reached
                if PythonKafkaReader._check_timeout(timeout, start_time):
                    logger.info(f'Reached timeout: {timeout}s for reading messages.')
                    break
                # break if all partitions are marked as done
                if len(done_partitions) == len(offset_dict):
                    logger.info('Done consuming from '
                                f'{len(done_partitions)} partitions.')
                    break
                msg: ConsumerRecord = next(self.consumer)
                if msg is None:
                    continue
                # skip if the partition is already marked as done
                if (msg.topic, msg.partition) in done_partitions:
                    continue
                # pause a partition once its messages reach the required offset
                if PythonKafkaReader._check_reach_offsets(msg, offset_dict):
                    logger.info(f'Done consuming from topic: '
                                f'{msg.topic} partition: '
                                f'{msg.partition}')
                    self.consumer.pause(TopicPartition(msg.topic, msg.partition))
                    done_partitions.add((msg.topic, msg.partition))
                message = Message(msg.offset, msg.key, msg.value, msg.timestamp, msg.headers)
                msg_dict[(msg.topic, msg.partition)].append(message)
                msg_count += 1
                if not msg_count % 100000:
                    logger.debug(f"So far read {msg_count} messages from kafka")
        except KeyboardInterrupt:
            logger.info("%% Aborted by user\n")
        finally:
            self.close()
        logger.info("Done with reading")
        PythonKafkaReader.generate_consumer_report(offset_dict, msg_dict, done_partitions)
        return msg_dict

    def close(self):
        self.consumer.close(autocommit=False)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from kafka import KafkaConsumer

servers = ['192.168.5.110:9092']
consumer = KafkaConsumer('test', bootstrap_servers=servers)

print(consumer.partitions_for_topic('test'))
print(consumer.topics())
print(consumer.subscription())
print(consumer.assignment())
print(consumer.beginning_offsets(consumer.assignment()))

for msg in consumer:
    print(msg.value)

consumer.close()
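# A caveat worth noting about the snippet above: in kafka-python, a consumer
# that subscribes via the constructor only gets partitions assigned during a
# poll, so consumer.assignment() is empty right after construction and
# beginning_offsets(consumer.assignment()) returns {}. A minimal sketch of a
# workaround, reusing the snippet's broker and topic names:
from kafka import KafkaConsumer

consumer = KafkaConsumer('test', bootstrap_servers=['192.168.5.110:9092'])
consumer.poll(timeout_ms=1000)  # join the group and receive an assignment
print(consumer.assignment())    # now non-empty
print(consumer.beginning_offsets(consumer.assignment()))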
class Consumer:
    """
    Wraps the kafka-python KafkaConsumer with our own methods.
    """

    def __init__(self, group_id: str = None):
        self.group_id = group_id

    def __enter__(self):
        self.cfg = Config().cfg
        self.consumer = KafkaConsumer(
            bootstrap_servers=self.cfg["serList"],
            # api_version=self.cfg["apiVersion"],
            api_version_auto_timeout_ms=self.cfg["autoVersionTimeout"],
            security_protocol=self.cfg["protocol"],
            sasl_mechanism=self.cfg["mechanism"],
            sasl_kerberos_service_name=self.cfg["kerverosSerName"],
            group_id=self.group_id,
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.consumer.close()

    def assign(self, partitions: list):
        """
        Manually assign a list of topic partitions to this consumer.
        :param partitions: partitions to assign, as [(topic, partition)]
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        try:
            result = self.consumer.assign(_partitions)
        except IllegalStateError:
            log.tag_error(KafkaInfo.KafkaConsumer,
                          "Manually consumer TopicPartitions error, "
                          "Topic Consumer is being in used")
            raise ActionError(KafkaErr.ConsumerInUsed)
        return result

    def assignment(self):
        """
        Get the topic partitions assigned to this consumer:
        with manual assign(), returns the manual assignment directly;
        with subscribe(), returns None (before any subscription) or the
        set of topic partitions.
        :return:
        """
        return self.consumer.assignment()

    def beginning_offsets(self, partitions: list):
        """
        Get the first offset of the given partitions; does not change the
        current partition offsets.
        :param partitions: topic partitions, as [(topic, partition)]
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        try:
            result = self.consumer.beginning_offsets(_partitions)
        except (UnsupportedVersionError, KafkaTimeoutError) as e:
            if e.__class__ == UnsupportedVersionError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "API VERSION ERROR, DO NOT SUPPORT")
                raise ActionError(KafkaErr.NotSupport)
            else:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Get beginning offset failed, Time out")
                raise ActionError(KafkaErr.GetOffsetFailed)
        return result

    def end_offsets(self, partitions: list):
        """
        Get the end offset of the given partitions.
        :param partitions: topic partitions, as [(topic, partition)]
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        try:
            result = self.consumer.end_offsets(_partitions)
        except (UnsupportedVersionError, KafkaTimeoutError) as e:
            if e.__class__ == UnsupportedVersionError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "API VERSION ERROR, DO NOT SUPPORT")
                raise ActionError(KafkaErr.NotSupport)
            else:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Get end offset failed, Time out")
                raise ActionError(KafkaErr.GetOffsetFailed)
        return result

    def offsets_for_time(self, partitions_time: list, timestamp: int = -1):
        """
        Find the earliest offset in each partition at or after the given time.
        :param partitions_time: list of (topic, partition) if timestamp > 0,
                                list of (topic, partition, timestamp) if timestamp == -1
        :param timestamp: the lookup time; -1 means each partition carries its own timestamp
        :return:
        """
        if timestamp > 0:
            _partitions = {
                TopicPartition(_tuple[0], _tuple[1]): timestamp
                for _tuple in partitions_time
            }
        else:
            _partitions = {
                TopicPartition(_tuple[0], _tuple[1]): _tuple[2]
                for _tuple in partitions_time
            }
        try:
            result = self.consumer.offsets_for_times(_partitions)
        except (UnsupportedVersionError, ValueError, KafkaTimeoutError) as e:
            if e.__class__ == UnsupportedVersionError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "API VERSION ERROR, DO NOT SUPPORT")
                raise ActionError(KafkaErr.NotSupport)
            if e.__class__ == ValueError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Value Error: Target Timestamp is negative")
            else:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Get offset by timestamp failed, Time out")
            raise ActionError(KafkaErr.GetOffsetFailed)
        return result

    def highwater(self, topic: str, partition: int):
        """
        The highwater offset is the offset that will be assigned to the next
        message produced to the partition (usually compared against the
        reported position to compute lag).
        :param topic:
        :param partition:
        :return:
        """
        result = self.consumer.highwater(TopicPartition(topic, partition))
        return result

    def commit(self, partition_offset: tuple, async_commit: bool = False):
        """
        Commit offsets to Kafka, blocking until success or error.
        Requires a non-None group id.
        :param partition_offset: (topic, partition, offset)
        :param async_commit: choose async commit
        :return:
        """
        topic = partition_offset[0]
        partition = partition_offset[1]
        _offset = partition_offset[2]
        offset = {
            TopicPartition(topic, partition): OffsetAndMetadata(_offset, None)
        }
        if not async_commit:
            self.consumer.commit(offset)
        else:
            self.consumer.commit_async(offset).add_errback(self.commit_err,
                                                           topic=topic,
                                                           partition=partition,
                                                           offset=_offset)

    def committed(self, topic: str, partition: int):
        """
        Get the last committed offset of the given topic partition
        (pairs with commit).
        :param topic:
        :param partition:
        :return:
        """
        _partition = TopicPartition(topic, partition)
        result = self.consumer.committed(_partition)
        return result

    def metrics(self):
        """
        Get the consumer's performance metrics (including per-broker stats).
        :return:
        """
        performance = self.consumer.metrics()
        return performance

    def partition_for_topic(self, topic_name: str):
        """
        Look up partition metadata for the given topic.
        :param topic_name:
        :return:
        """
        result = self.consumer.partitions_for_topic(topic_name)
        return result

    def available_partitions_for_topic(self, topic_name: str):
        """
        Look up the available partitions of the given topic.
        :param topic_name:
        :return:
        """
        result = self.consumer.available_partitions_for_topic(topic_name)
        return result

    def pause(self, partitions: list):
        """
        Suspend fetching from the given partitions (if the request fails,
        some topic partitions may already be paused).
        :param partitions: TopicPartitions to pause, for example:
                           [(topic1, partition1), (topic2, partition2)]
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        try:
            self.consumer.pause(*_partitions)
        except Exception:
            log.tag_error(KafkaInfo.KafkaConsumer,
                          "Pause TopicPartition error, TopicPartition not exist")
            raise ActionError(KafkaErr.TopicPartitionNotExist)

    def get_paused(self):
        """
        Get the partitions currently suspended with pause().
        :return:
        """
        return self.consumer.paused()

    def resume(self, partitions: list):
        """
        Resume partitions suspended with pause().
        :param partitions:
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        self.consumer.resume(*_partitions)

    def seek(self, partition: tuple, offset: int):
        """
        Change the offset of a TopicPartition, typically before poll().
        :param partition: the TopicPartition, as (topic, partition)
        :param offset: the new offset, >= 0
        :return:
        """
        _partition = TopicPartition(partition[0], partition[1])
        self.consumer.seek(_partition, offset)

    def seek_many(self, partitions: list = None, is_begin: bool = True):
        """
        Seek in bulk.
        :param partitions: TopicPartitions, as [(topic, partition), ...];
                           when None, defaults to the assigned partitions
        :param is_begin: True seeks to the first available offset,
                         False seeks to the end offset
        :return:
        """
        if partitions is not None:
            _partitions = [
                TopicPartition(_par[0], _par[1]) for _par in partitions
            ]
        else:
            _partitions = []
        if is_begin:
            self.consumer.seek_to_beginning(*_partitions)
        else:
            self.consumer.seek_to_end(*_partitions)

    def poll(self, timeout_ms=0, max_records=1000):
        """
        Fetch records from the assigned partitions. Resumes from the previous
        offset automatically; use seek() to set the position manually.
        Partitions suspended with pause() return no records.
        :param timeout_ms:
        :param max_records:
        :return:
        """
        result = self.consumer.poll(timeout_ms, max_records)
        return result

    def position(self, partition: tuple):
        """
        Get the offset of the next record for the given partition.
        :param partition:
        :return:
        """
        _partition = TopicPartition(partition[0], partition[1])
        result = self.consumer.position(_partition)
        return result

    def subscribe(self, topic: list, pattern: str = None):
        """
        Subscribe to a list of topics.
        :param topic: list of topics
        :param pattern:
        :return:
        """
        try:
            self.consumer.subscribe(topic, pattern)
        except (IllegalStateError, AssertionError, TypeError) as e:
            if e.__class__ == IllegalStateError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Subscribe topic error, %s" % str(e))
            log.tag_error(KafkaInfo.KafkaConsumer,
                          "Subscribe topic error, Parameter Error")
            raise ActionError(KafkaErr.ParameterError)

    def unsubscribe(self):
        """
        Unsubscribe from all topics and clear the partition assignment.
        :return:
        """
        self.consumer.unsubscribe()

    def subscription(self):
        """
        Get the current subscription.
        :return:
        """
        result = self.consumer.subscription()
        return result

    def get_topics(self):
        """
        Get the topics visible to the user.
        :return:
        """
        result = self.consumer.topics()
        return result

    @staticmethod
    def commit_err(topic: str, partition: int, offset: int):
        """
        Callback for a failed consumer offset commit.
        :param topic:
        :param partition:
        :param offset:
        :return:
        """
        log.tag_error(KafkaInfo.KafkaConsumer,
                      "Kafka Consumer commit offset failed, "
                      "{TopicPartition(%s, %s): %s}" % (topic, partition, offset))
        raise ActionError(KafkaErr.CommitOffsetFailed)

    @staticmethod
    def get_topic_partition(topic: str, partition: int):
        """
        Build a TopicPartition, so callers don't have to handle
        TopicPartition themselves.
        :param topic:
        :param partition:
        :return:
        """
        return TopicPartition(topic, partition)
# Module-level dependencies (bytes_serializer, ProgressPercentage and
# logger are helpers defined elsewhere in the original module):
import json
import time
from math import ceil
from os import makedirs, path, remove
from time import sleep
from uuid import uuid4

import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from kafka import KafkaConsumer, KafkaProducer
from kafka.structs import OffsetAndMetadata, TopicPartition


class KafkaClient(object):

    def __init__(self, bootstrap_servers, topic, group_id=None):
        if group_id is not None:
            self.group_id = group_id
            self.allow_hotreload = True
        else:
            self.group_id = 'kafka_topic_dumper_{}'.format(uuid4())
            self.allow_hotreload = False
        self.bootstrap_servers = bootstrap_servers.split(",")
        self.topic = topic
        self.consumer = None
        self.producer = None
        self.timeout_in_sec = 60
        self.dump_state_topic = 'kafka-topic-dumper'
        self.s3_path = 'kafka-topic-dumper-data/'
        self.s3_client = None

    def _get_consumer(self):
        if self.consumer is not None:
            return
        try:
            logger.info('Starting consumer')
            self.consumer = KafkaConsumer(
                bootstrap_servers=self.bootstrap_servers,
                group_id=self.group_id,
                enable_auto_commit=True)
        except Exception as err:
            msg = 'Cannot create KafkaConsumer instance. Reason=<{}>'
            logger.exception(msg.format(err))
            raise err

    def _get_s3_client(self):
        if self.s3_client is None:
            self.s3_client = boto3.client('s3')
        return self.s3_client

    def _get_producer(self):
        if self.producer is not None:
            return
        try:
            logger.info('Starting producer')
            self.producer = KafkaProducer(
                bootstrap_servers=self.bootstrap_servers,
                key_serializer=bytes_serializer,
                value_serializer=bytes_serializer)
        except Exception as err:
            msg = 'Cannot create KafkaProducer instance. Reason=<{}>'
            logger.exception(msg.format(err))
            raise err

    def open(self):
        self._get_consumer()
        self._get_producer()

    def _close_consumer(self):
        logger.info("Closing consumer")
        self.consumer.close()
        self.consumer = None

    def _close_producer(self):
        logger.info("Closing producer")
        self.producer.flush()
        logger.debug('Statistics {}'.format(self.producer.metrics()))
        self.producer.close()
        self.producer = None

    def close(self):
        self._close_consumer()
        self._close_producer()

    def _get_partitions(self, topic):
        partitions = self.consumer.partitions_for_topic(topic) or []
        # Bounded retry: partition metadata may not be available until a
        # subscription round-trip has completed.
        count = 0
        while not partitions and count < 500000:
            self.consumer.subscribe(topic)
            partitions = self.consumer.partitions_for_topic(topic) or []
            count += 1  # bug fix: the counter was never incremented
            sleep(0.1)
        msg = "Got the following partitions=<{}> for topic=<{}>"
        logger.info(msg.format(partitions, topic))
        topic_partitions = list(
            map(lambda p: TopicPartition(topic, p), partitions))
        msg = "Got the following topic partitions=<{}>"
        logger.info(msg.format(topic_partitions))
        return topic_partitions

    def _get_offsets(self, topic=None):
        if topic is None:
            topic = self.topic
        topic_partitions = self._get_partitions(topic=topic)
        beginning_offsets = (
            self.consumer.beginning_offsets(topic_partitions) or {})
        msg = "Got the following beginning offsets=<{}>"
        logger.info(msg.format(beginning_offsets))
        committed_offsets = {}
        msg = "Partition=<{}> has the current offset=<{}> for <{}>"
        for tp in topic_partitions:
            offset = self.consumer.committed(tp)
            committed_offsets[tp] = offset
            logger.debug(msg.format(tp, offset, self.group_id))
        end_offsets = self.consumer.end_offsets(topic_partitions) or {}
        msg = "Got the following end offsets=<{}>"
        logger.info(msg.format(end_offsets))
        return beginning_offsets, committed_offsets, end_offsets

    def _calculate_offsets(self, beginning_offsets, end_offsets,
                           num_messages_to_consume):
        # Spread the requested message count evenly across partitions, but
        # never start before a partition's beginning offset.
        perfect_displacement = ceil(num_messages_to_consume /
                                    max(len(beginning_offsets), 1))
        offsets = {}
        num_messages_available = 0
        for tp, offset in beginning_offsets.items():
            offsets[tp] = max(beginning_offsets[tp],
                              end_offsets[tp] - perfect_displacement)
            num_messages_available += end_offsets[tp] - offsets[tp]
        return offsets, num_messages_available

    def _set_offsets(self, offsets):
        offset_and_metadata = {
            tp: OffsetAndMetadata(offset, b'')
            for tp, offset in offsets.items()
        }
        msg = "Generated the following offsets=<{}>"
        logger.debug(msg.format(offset_and_metadata))
        self.consumer.commit(offset_and_metadata)

    def _get_messages(self, num_messages_to_consume):
        messages = []
        while len(messages) < num_messages_to_consume:
            record = next(self.consumer)
            line = (record.key, record.value)
            messages.append(line)
        self.consumer.commit()
        return messages

    def _write_messages_to_file(self, messages, local_path):
        df = pd.DataFrame(messages)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, local_path, compression='gzip')

    def _send_dump_file(self, local_path, bucket_name, dump_id):
        file_name = path.basename(local_path)
        s3_path = path.join(self.s3_path, dump_id, file_name)
        logger.info('Sending file <{}> to s3'.format(file_name))
        s3_client = self._get_s3_client()
        s3_client.upload_file(local_path, bucket_name, s3_path,
                              ExtraArgs={'ACL': 'private'},
                              Callback=ProgressPercentage(local_path))
        logger.debug('Deleting file <{}>'.format(file_name))
        remove(local_path)

    def _get_transformer_class(self, transformer_id):
        # transformer_id is "module:ClassName"; import the module and
        # return an instance of the class.
        [module_name, class_name] = transformer_id.split(":")
        module = __import__(module_name, globals(), locals(),
                            [class_name], 0)
        cl = getattr(module, class_name)
        return cl()

    def get_messages(self, num_messages_to_consume,
                     max_package_size_in_msgs, local_dir, bucket_name,
                     dry_run, dump_id):
        # set offsets
        msg = ('Will ask kafka for <{}> messages ' +
               'and save it in files with <{}> messages')
        logger.debug(
            msg.format(num_messages_to_consume, max_package_size_in_msgs))
        beginning_offsets, committed_offsets, end_offsets = \
            self._get_offsets()
        offsets, num_messages_available = self._calculate_offsets(
            beginning_offsets=beginning_offsets,
            end_offsets=end_offsets,
            num_messages_to_consume=num_messages_to_consume)
        self._set_offsets(offsets)
        # get messages
        self.consumer.subscribe(topics=[self.topic])
        msg = 'Trying to dump <{}> messages'
        logger.info(msg.format(num_messages_available))
        remaining_messages = num_messages_available
        num_dumped_messages = 0
        dump_dir = path.join(local_dir, dump_id)
        makedirs(dump_dir, exist_ok=True)
        logger.debug('Dump directory <{}> created'.format(dump_dir))
        while remaining_messages > 0:
            batch_size = min(remaining_messages, max_package_size_in_msgs)
            logger.debug('Fetching batch with size=<{}>'.format(batch_size))
            file_name = '{}-{:015d}.parquet'.format(dump_id,
                                                    num_dumped_messages)
            local_path = path.join(local_dir, dump_id, file_name)
            messages = self._get_messages(num_messages_to_consume=batch_size)
            self._write_messages_to_file(messages=messages,
                                         local_path=local_path)
            if not dry_run:
                self._send_dump_file(local_path=local_path,
                                     bucket_name=bucket_name,
                                     dump_id=dump_id)
            remaining_messages -= batch_size
            num_dumped_messages += batch_size
        logger.info('Dump done!')

    def find_latest_dump_id(self, bucket_name):
        paginator = self._get_s3_client().get_paginator('list_objects_v2')
        prefix = self.s3_path.rstrip('/') + '/'
        response_iterator = paginator.paginate(Bucket=bucket_name,
                                               Prefix=prefix,
                                               Delimiter='/')

        def strip(r):
            return r['Prefix'][len(prefix):].rstrip('/')

        prefixes = []
        for response in response_iterator:
            prefixes.extend(map(strip, response['CommonPrefixes']))
        dump_id = max(prefixes)
        logger.debug('Prefix chosen was <{}>'.format(dump_id))
        return dump_id

    def _get_file_names(self, bucket_name, dump_id):
        paginator = self._get_s3_client().get_paginator('list_objects_v2')
        dump_path = path.join(self.s3_path, dump_id) + '/'
        response_iterator = paginator.paginate(Bucket=bucket_name,
                                               Prefix=dump_path)
        file_names = []
        for response in response_iterator:
            if response['KeyCount'] > 0:
                file_names.extend(
                    (f['Key'], f['Size']) for f in response['Contents'])
        file_names.sort()
        if not file_names:
            msg = 'Cannot find files for dump id <{}>'
            logger.error(msg.format(dump_id))
            raise Exception('EmptyS3Response')
        return file_names

    def _gen_state(self, dump_id, transformer_id):
        _, _, end_offsets = self._get_offsets()
        if not end_offsets:
            msg = 'Cannot find offsets for topic <{}>'
            raise Exception(msg.format(self.topic))
        state_offsets = {}
        for partition, offset in end_offsets.items():
            state_offsets[partition.partition] = offset
        state = {
            'dump_id': dump_id,
            'topic_name': self.topic,
            'offsets': state_offsets,
            'dump_date': int(time.time()),
            'transformer_id': transformer_id
        }
        return state

    def _save_state(self, state):
        future = self.producer.send(topic=self.dump_state_topic,
                                    key=self.topic,
                                    value=json.dumps(state))
        future.get(timeout=self.timeout_in_sec)
        logger.info('State saved')

    def _get_last_state_message(self):
        beginning_offsets, _, end_offsets = (self._get_offsets(
            topic=self.dump_state_topic))
        if beginning_offsets:
            offsets, num_messages_available = self._calculate_offsets(
                beginning_offsets=beginning_offsets,
                end_offsets=end_offsets,
                num_messages_to_consume=1)
            self._set_offsets(offsets)
            self.consumer.subscribe(self.dump_state_topic)
            messages = [
                json.loads(m.decode())
                for k, m in self._get_messages(num_messages_available)
            ]
            if messages:
                last_state_message = max(messages,
                                         key=lambda m: m['dump_date'])
                return last_state_message
        return None

    def _get_state(self, dump_id, transformer_id):
        if self.allow_hotreload:
            state_message = self._get_last_state_message()
            if state_message and \
                    state_message['topic_name'] == self.topic and \
                    state_message['dump_id'] == dump_id and \
                    'transformer_id' in state_message and \
                    state_message['transformer_id'] == transformer_id:
                return state_message['offsets']
        return None

    def _reset_offsets(self, dump_offsets):
        logger.info('Messages already uploaded. Just resetting offsets')
        partitions = self._get_partitions(self.topic)
        offsets = {}
        for partition in partitions:
            offsets[partition] = dump_offsets[str(partition.partition)]
        logger.debug('Will reset offsets to <{}>'.format(offsets))
        self._set_offsets(offsets)

    def _load_dump(self, bucket_name, dump_id, download_dir, files,
                   transformer_instance):
        s3_client = self._get_s3_client()
        transformer_id = transformer_instance.get_id()
        state = self._gen_state(dump_id, transformer_id)
        current_file_number = 0
        msg = "Loading messages from file {}/{} to kafka"
        for file_name, file_size in files:
            current_file_number += 1
            tmp_name = '{}.tmp'.format(path.basename(file_name))
            file_path = path.join(download_dir, tmp_name)
            s3_client.download_file(Bucket=bucket_name,
                                    Filename=file_path,
                                    Key=file_name,
                                    Callback=ProgressPercentage(
                                        tmp_name, file_size))
            logger.info(msg.format(current_file_number, len(files)))
            try:
                table = pq.read_table(file_path)
                df = table.to_pandas()
                for raw_row in df.itertuples():
                    for row in transformer_instance.transform(raw_row):
                        self.producer.send(self.topic, key=row[1],
                                           value=row[2])
                logger.debug('File <{}> reloaded to kafka'.format(file_path))
                self.producer.flush(self.timeout_in_sec)
            finally:
                remove(file_path)
        self._save_state(state)

    def reload_kafka_server(self, bucket_name, local_dir, dump_id,
                            transformer_class):
        transformer_instance = self._get_transformer_class(transformer_class)
        msg = 'Using class=<{}> to transform events before production'
        logger.info(msg.format(type(transformer_instance)))
        transformer_id = transformer_instance.get_id()
        dump_offsets = self._get_state(dump_id, transformer_id)
        if dump_offsets:
            self._reset_offsets(dump_offsets=dump_offsets)
        else:
            files = self._get_file_names(bucket_name=bucket_name,
                                         dump_id=dump_id)
            self._load_dump(bucket_name=bucket_name,
                            dump_id=dump_id,
                            download_dir=local_dir,
                            files=files,
                            transformer_instance=transformer_instance)
        logger.info('Reload done!')

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
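# ====== usage sketch (not part of the original module) ======
# A minimal, hedged example of driving the KafkaClient dumper above; the
# import path, broker address, bucket name, and dump id are placeholder
# assumptions, not part of the original source.
#
# from kafka_topic_dumper import KafkaClient  # hypothetical import path
#
# with KafkaClient(bootstrap_servers='localhost:9092',
#                  topic='my-topic', group_id='dumper-group') as client:
#     # Dump roughly the last 10000 messages into parquet files of at
#     # most 1000 messages each, uploading each file to S3.
#     client.get_messages(num_messages_to_consume=10000,
#                         max_package_size_in_msgs=1000,
#                         local_dir='/tmp',
#                         bucket_name='my-dump-bucket',
#                         dry_run=False,
#                         dump_id='20200101-full')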
# Depends on the hosting framework's GenericTool module and a module-level
# __TYPE__ constant; kafka-python provides the client classes.
import threading

from kafka import KafkaConsumer, KafkaProducer
from kafka.errors import KafkaError


class Kafka(GenericTool.Tool):

    def __init__(self, controllerIp, controllerPort, toolName, toolDesc,
                 defaultTool, supportProxy=0, proxyIp=None, proxyPort=None,
                 sslSupport=True):
        """
        Kafka agent

        @param controllerIp: controller ip/host
        @type controllerIp: string

        @param controllerPort: controller port
        @type controllerPort: integer

        @param toolName: agent name
        @type toolName: string

        @param toolDesc: agent description
        @type toolDesc: string

        @param defaultTool: True if the agent is started by the server,
        False otherwise
        @type defaultTool: boolean
        """
        GenericTool.Tool.__init__(self, controllerIp, controllerPort,
                                  toolName, toolDesc, defaultTool,
                                  supportProxy=supportProxy,
                                  proxyIp=proxyIp, proxyPort=proxyPort,
                                  sslSupport=sslSupport)
        self.__type__ = __TYPE__
        self.__mutex__ = threading.RLock()

    def getType(self):
        """
        Returns agent type

        @return: agent type
        @rtype: string
        """
        return self.__type__

    def onCleanup(self):
        """
        Cleanup all
        In this function, you can stop your program
        """
        pass

    def initAfterRegistration(self):
        """
        Called on successful registration
        In this function, you can start your program automatically.
        """
        self.onToolLogWarningCalled("Starting Kafka agent")
        self.onToolLogWarningCalled("Kafka agent started")
        self.onPluginStarted()

    def pluginStarting(self):
        """
        Function to reimplement
        """
        pass

    def onPluginStarted(self):
        """
        Function to reimplement
        """
        pass

    def pluginStopped(self):
        """
        Function to reimplement
        """
        pass

    def onResetAgentCalled(self):
        """
        Function to reimplement
        """
        pass

    def onToolLogWarningCalled(self, msg):
        """
        Logs warning on main application

        @param msg: warning message
        @type msg: string
        """
        pass

    def onToolLogErrorCalled(self, msg):
        """
        Logs error on main application

        @param msg: error message
        @type msg: string
        """
        pass

    def onToolLogSuccessCalled(self, msg):
        """
        Logs success on main application

        @param msg: success message
        @type msg: string
        """
        pass

    def onAgentAlive(self, client, tid, request):
        """
        Called on keepalive received from test server
        {'task-id': 'xx', 'from': 'tester', 'destination-agent': 'xxxxx',
        'source-adapter': 'xx', 'script-name': 'xxxx', 'script_id': 'xxx',
        'data': 'xxx', 'event': 'agent-init', 'test-id': 'xxx'}

        @param client: server address ip/port
        @type client: tuple

        @param tid: transaction id
        @type tid: integer

        @param request: request received from the server
        @type request: dict
        """
        pass

    def onAgentInit(self, client, tid, request):
        """
        Called on init received from test server
        {'task-id': 'xx', 'from': 'tester', 'destination-agent': 'xxxxx',
        'source-adapter': 'xx', 'script-name': 'xxxx', 'script_id': 'xxx',
        'data': 'xxx', 'event': 'agent-init', 'test-id': 'xxx'}

        @param client: server address ip/port
        @type client: tuple

        @param tid: transaction id
        @type tid: integer

        @param request: request received from the server
        @type request: dict
        """
        self.onToolLogWarningCalled(msg="init called: %s" % request['data'])
        self.sendNotify(request=request, data="notify sent")

    def onAgentReset(self, client, tid, request):
        """
        Called on reset received from test server
        {'task-id': 'xx', 'from': 'tester', 'destination-agent': 'xxxxx',
        'source-adapter': 'xx', 'script-name': 'xxxx', 'script_id': 'xxx',
        'data': 'xxx', 'event': 'agent-init', 'test-id': 'xxx'}
        or
        {'event': 'agent-reset', 'source-adapter': '1', 'script_id': '7_3_0'}

        @param client: server address ip/port
        @type client: tuple

        @param tid: transaction id
        @type tid: integer

        @param request: request received from the server
        @type request: dict
        """
        if 'data' in request:
            self.onToolLogWarningCalled(msg="reset called: %s" % request['data'])
        else:
            self.onToolLogWarningCalled(msg="reset called")

    def onAgentNotify(self, client, tid, request):
        """
        Called on notify received from test server and dispatch it
        {'task-id': 'xx', 'from': 'tester', 'destination-agent': 'xxxxx',
        'source-adapter': 'xx', 'script-name': 'xxxx', 'script_id': 'xxx',
        'data': 'xxx', 'event': 'agent-init', 'test-id': 'xxx'}

        @param client: server address ip/port
        @type client: tuple

        @param tid: transaction id
        @type tid: integer

        @param request: request received from the server
        @type request: dict
        """
        self.__mutex__.acquire()
        self.onToolLogWarningCalled(msg="notify received: %s" % request['data'])
        if request['uuid'] in self.context():
            if request['source-adapter'] in self.context()[request['uuid']]:
                self.execAction(request)
            else:
                self.error("Adapter context does not exist TestUuid=%s AdapterId=%s" % (
                    request['uuid'], request['source-adapter']))
        else:
            self.error("Test context does not exist TestUuid=%s" % request['uuid'])
        self.__mutex__.release()

    def execAction(self, request):
        """
        Execute action
        """
        self.onToolLogWarningCalled(
            "<< Starting Command=%s TestId=%s AdapterId=%s" % (
                request['data']['cmd'], request['script_id'],
                request['source-adapter']))
        try:
            cmd = request['data']['cmd']
            data = request['data']

            # producer commands
            if cmd == 'producer_connect':
                kargs = data['kargs']
                try:
                    self.producer = KafkaProducer(
                        bootstrap_servers=data['bootstrap_servers'], **kargs)
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'connected'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'producer_send':
                kargs = data['kargs']
                try:
                    future = self.producer.send(data['topic'], **kargs)
                    record_metadata = future.get(timeout=data['timeout'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': record_metadata})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'producer_flush':
                try:
                    self.producer.flush(data['timeout'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'flushed'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'producer_partitions_for':
                try:
                    partitions = self.producer.partitions_for(data['topic'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': partitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'producer_close':
                try:
                    self.producer.close(int(data['timeout']))
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'closed'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            # consumer commands
            elif cmd == 'consumer_connect':
                kargs = data['kargs']
                try:
                    if not data['topics']:
                        self.consumer = KafkaConsumer(
                            bootstrap_servers=data['bootstrap_servers'], **kargs)
                    else:
                        self.consumer = KafkaConsumer(
                            data['topics'][0],
                            bootstrap_servers=data['bootstrap_servers'], **kargs)
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'connected'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_consume':
                try:
                    for msg in self.consumer:
                        self.sendNotify(request=request,
                                        data={"cmd": cmd, 'result': msg})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_close':
                try:
                    self.consumer.close(data['autocommit'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'closed'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_assign':
                try:
                    self.consumer.assign(data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'assigned'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_assignment':
                try:
                    topicpartitions = self.consumer.assignment()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd,
                                          'topicpartitions': topicpartitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_beginning_offsets':
                try:
                    offsets = self.consumer.beginning_offsets(data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offsets': offsets})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_commit':
                try:
                    self.consumer.commit(data['offsets'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'committed'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_commit_async':
                try:
                    future = self.consumer.commit_async(
                        offsets=data['offsets'], callback=data['callback'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'future': future})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_committed':
                try:
                    offsets = self.consumer.committed(data['topicpartition'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offsets': offsets})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_end_offsets':
                try:
                    partitions = self.consumer.end_offsets(data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'partitions': partitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_highwater':
                try:
                    offset = self.consumer.highwater(data['partition'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offset': offset})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_offsets_for_times':
                try:
                    offsets = self.consumer.offsets_for_times(data['timestamps'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offsets': offsets})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_partitions_for_topic':
                try:
                    partitions = self.consumer.partitions_for_topic(data['topic'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'partitions': partitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_pause':
                try:
                    # bug fix: pause() takes *partitions, so the list must
                    # be unpacked (as done below for seek_to_beginning).
                    self.consumer.pause(*data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_paused':
                try:
                    partitions = self.consumer.paused()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'partitions': partitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_poll':
                try:
                    records = self.consumer.poll(
                        timeout_ms=data['timeout_ms'],
                        max_records=data['max_records'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'records': records})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_position':
                try:
                    offset = self.consumer.position(data['topicpartition'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offset': offset})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_resume':
                try:
                    # bug fix: resume() also takes *partitions.
                    self.consumer.resume(*data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_seek':
                try:
                    self.consumer.seek(data['partition'], data['offset'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_seek_to_beginning':
                try:
                    self.consumer.seek_to_beginning(*data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_seek_to_end':
                try:
                    self.consumer.seek_to_end(*data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_subscribe':
                try:
                    self.consumer.subscribe(topics=data['topics'],
                                            pattern=data['pattern'],
                                            listener=data['listener'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_subscription':
                try:
                    topics = self.consumer.subscription()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'topics': topics})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_topics':
                try:
                    topics = self.consumer.topics()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'topics': topics})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_unsubscribe':
                try:
                    self.consumer.unsubscribe()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            # unknown command
            else:
                raise Exception('cmd not supported: %s' % request['data']['cmd'])

        except Exception as e:
            self.error('unable to run command: %s' % str(e))
            self.sendError(request, data="unable to run command")

        self.onToolLogWarningCalled(
            "<< Terminated Command=%s TestId=%s AdapterId=%s" % (
                request['data']['cmd'], request['script_id'],
                request['source-adapter']))
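# ====== request payload shape (reconstructed sketch) ======
# The notify requests dispatched by execAction above carry the command in
# request['data']. The shapes below are reconstructed from the handlers,
# not taken from the framework's documentation; all values are
# placeholders.
#
# producer_connect_request = {
#     'uuid': 'test-uuid', 'source-adapter': '1', 'script_id': '7_3_0',
#     'data': {'cmd': 'producer_connect',
#              'bootstrap_servers': ['127.0.0.1:9092'],
#              'kargs': {'acks': 1}},  # extra KafkaProducer kwargs
# }
# producer_send_request = {
#     'uuid': 'test-uuid', 'source-adapter': '1', 'script_id': '7_3_0',
#     'data': {'cmd': 'producer_send', 'topic': 'test',
#              'timeout': 10,  # seconds passed to future.get()
#              'kargs': {'key': b'k', 'value': b'v'}},
# }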
import sys

from kafka import KafkaConsumer
from kafka.structs import TopicPartition


class KafkaC:
    """
    Consumer module: consume messages from a topic under different group ids.
    """

    def __init__(self, bootstrap_servers, topic, group, action=None,
                 offset=None, enable_auto_commit=True):
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        self.group_id = group
        self.action = action
        self.offset = offset
        self.enable_auto_commit = enable_auto_commit
        self.auto_commit_interval_ms = 1000
        self.consumer = KafkaConsumer(
            # No topic here: assign() below requires a consumer that was
            # not subscribed at construction time.
            # auto_offset_reset defaults to 'latest', which makes a brand-new
            # consumer group skip everything already in the topic; 'earliest'
            # starts from the oldest unconsumed messages instead.
            auto_offset_reset='earliest',
            group_id=self.group_id,
            bootstrap_servers=self.bootstrap_servers,
            enable_auto_commit=self.enable_auto_commit,
            auto_commit_interval_ms=self.auto_commit_interval_ms)

    # List all topics.
    def get_all_topics(self):
        return self.consumer.topics()

    # Earliest offsets, per partition.
    def get_beginning_offsets(self):
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        return _ps, self.consumer.beginning_offsets(_ps)

    # Latest offsets: one past the newest message, since Kafka offsets
    # start at 0.
    def get_end_offsets(self):
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        return _ps, self.consumer.end_offsets(_ps)

    # Offset this consumer would start consuming from.
    def get_last_position(self, partition=None):
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        self.consumer.assign(_ps)
        return self.consumer.position(partition=_ps[0])

    # Look up offsets by timestamp (epoch milliseconds). Messages must
    # carry a timestamp field, i.e. Kafka >= 0.10.0.
    def get_offset_by_timestamp(self, timestamp):
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        return _ps, self.consumer.offsets_for_times({_ps[0]: timestamp})

    def consume_data(self, offset=None):
        """
        :param offset: int >= 0; start consuming from this offset
            (inclusive). Defaults to the consumer's current position.
        :return: generator of messages
        """
        # Fetch all partitions of the topic and assign them to this
        # consumer. To use assign(), the KafkaConsumer must not have been
        # given a topic at construction time.
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        if offset is None:
            offset = self.get_last_position()
        self.consumer.assign(_ps)
        for p in self.consumer.partitions_for_topic(self.topic):
            # The offset can also be set for a single partition only.
            self.consumer.seek(TopicPartition(self.topic, p), offset)
        for message in self.consumer:
            yield message

    def consume_data_stop(self):
        """
        Consume up to the latest offset, then stop automatically.

        self.action:
            None    consume from the group's normal `CURRENT-OFFSET`
            begin   start from the beginning of the topic
            end     start from the newest data, skipping anything
                    unconsumed -- use with care
            custom  start from self.offset (inclusive); if it is larger
                    than the topic's end offset, start from the newest data
        :return: generator of messages
        """
        # Fetch all partitions of the topic and assign them to this
        # consumer (again, assign() requires no topic at construction).
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        self.consumer.assign(_ps)
        if self.action is None:
            pass
        elif self.action == 'begin':
            self.consumer.seek_to_beginning()
        elif self.action == 'end':
            self.consumer.seek_to_end()
        elif self.action == 'custom':
            for p in self.consumer.partitions_for_topic(self.topic):
                # The offset can also be set for a single partition only.
                self.consumer.seek(TopicPartition(self.topic, p), self.offset)
        else:
            print('action value is not supported! Please input "begin|end|custom"')
            sys.exit(1)
        # Bug fix: the original loop never terminated. Snapshot the end
        # offsets and stop once every partition has been drained to them.
        end_offsets = self.consumer.end_offsets(_ps)
        while not all(self.consumer.position(p) >= end_offsets[p]
                      for p in _ps):
            yield next(self.consumer)

    def commit_consumer(self):
        # self.consumer.commit()
        self.consumer.commit_async()  # asynchronous commit

    def close_consumer(self):
        self.consumer.close(autocommit=self.enable_auto_commit)
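# ====== KafkaC usage sketch ======
# A short, hedged example of the consumer wrapper above; broker address,
# topic, and group name are placeholders, and a reachable broker is
# assumed.
#
# kc = KafkaC(bootstrap_servers=['127.0.0.1:9092'], topic='test',
#             group='my-group')
# _ps, begin = kc.get_beginning_offsets()
# _ps, end = kc.get_end_offsets()
# print(begin, end)
# for message in kc.consume_data():  # blocks; interrupt to stop
#     print(message.offset, message.value)
#     kc.commit_consumer()
# kc.close_consumer()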
# KAFKA_SERVER_IP is a module-level default defined elsewhere.
from kafka import KafkaConsumer
from kafka.structs import TopicPartition


class kafka_consumer():

    def __init__(self, kafka_server=KAFKA_SERVER_IP):
        self.kafka_servers = kafka_server  # consumer endpoint of the Kafka servers
        self.consumer = None
        self.topic = None

    # Set up the consumer. When a group is used, only one consumer
    # instance per group reads any given message.
    def set_consumer(self, topic='device', group_id=None,
                     auto_offset_reset='latest'):
        self.topic = topic
        if group_id:
            self.consumer = KafkaConsumer(topic,
                                          group_id=group_id,
                                          auto_offset_reset=auto_offset_reset,
                                          bootstrap_servers=self.kafka_servers)
        else:
            self.consumer = KafkaConsumer(topic,
                                          auto_offset_reset=auto_offset_reset,
                                          bootstrap_servers=self.kafka_servers)

    # callback is invoked for every message; this call blocks.
    def read_data(self, callback):
        if self.consumer:
            for message in self.consumer:
                callback(message)
                # print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

    # Information about the current consumer.
    def get_consumer_info(self):
        consumer_info = {}
        consumer_info['partitions_for_topic'] = \
            self.consumer.partitions_for_topic(self.topic)  # partition info for the topic
        consumer_info['topic'] = self.consumer.topics()  # list of topics
        consumer_info['subscription'] = \
            self.consumer.subscription()  # topics this consumer subscribes to
        consumer_info['assignment'] = \
            self.consumer.assignment()  # this consumer's topic/partition assignment
        consumer_info['beginning_offsets'] = self.consumer.beginning_offsets(
            self.consumer.assignment())  # earliest offsets this consumer can consume
        return consumer_info

    # Reset the offset: consume from the offset-th position onwards.
    def set_offset(self, partition=0, offset=0):
        self.consumer.seek(
            TopicPartition(topic=self.topic, partition=partition), offset)
        # Return the current position of this partition.
        return self.consumer.position(
            TopicPartition(topic=self.topic, partition=partition))

    # Pull messages manually.
    def pull_data(self, callback):
        msg = self.consumer.poll(timeout_ms=5)  # fetch messages from kafka
        callback(msg)

    # ====== read current data ======
    # When a group is used, only one consumer instance per group reads any
    # given message. callback is invoked per message; this call blocks.
    def read_data_now(self, callback, topic='device', group_id=None,
                      auto_offset_reset='latest'):
        if group_id:
            consumer = KafkaConsumer(topic,
                                     group_id=group_id,
                                     auto_offset_reset=auto_offset_reset,
                                     bootstrap_servers=self.kafka_servers)
        else:
            consumer = KafkaConsumer(topic,
                                     auto_offset_reset=auto_offset_reset,
                                     bootstrap_servers=self.kafka_servers)
        for message in consumer:
            callback(message)
            # print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))


# ====== pause and resume ======
# from kafka import KafkaConsumer
# from kafka.structs import TopicPartition
# import time
#
# consumer = KafkaConsumer(bootstrap_servers=['127.0.0.1:9092'])
# consumer.subscribe(topics=('test',))
# consumer.topics()
# consumer.pause(TopicPartition(topic=u'test', partition=0))  # after pause(), the consumer reads nothing until resume() is called
# num = 0
# while True:
#     print(num)
#     print(consumer.paused())  # partitions currently paused
#     msg = consumer.poll(timeout_ms=5)
#     print(msg)
#     time.sleep(2)
#     num = num + 1
#     if num == 10:
#         print("resume...")
#         consumer.resume(TopicPartition(topic='test', partition=0))
#         print("resume......")

# ====== consumer groups ======
# from kafka import KafkaConsumer
# # When a group is used, only one consumer instance per group reads any given message.
# consumer = KafkaConsumer('test', group_id='my-group', bootstrap_servers=['127.0.0.1:9092'])
# for message in consumer:
#     print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

# ====== read the earliest or latest messages in the queue ======
# from kafka import KafkaConsumer
# consumer = KafkaConsumer('test', auto_offset_reset='earliest', bootstrap_servers=['127.0.0.1:9092'])
# for message in consumer:
#     print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

# ====== read messages from a given position ======
# from kafka import KafkaConsumer
# from kafka.structs import TopicPartition
#
# consumer = KafkaConsumer('test', bootstrap_servers=['127.0.0.1:9092'])
#
# print(consumer.partitions_for_topic("test"))  # partition info for the test topic
# print(consumer.topics())  # list of topics
# print(consumer.subscription())  # topics this consumer subscribes to
# print(consumer.assignment())  # this consumer's topic/partition assignment
# print(consumer.beginning_offsets(consumer.assignment()))  # earliest consumable offsets
# consumer.seek(TopicPartition(topic='test', partition=0), 5)  # reset the offset: consume from offset 5
# for message in consumer:
#     print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

# ====== subscribe to multiple topics ======
# from kafka import KafkaConsumer
# from kafka.structs import TopicPartition
#
# consumer = KafkaConsumer(bootstrap_servers=['127.0.0.1:9092'])
# consumer.subscribe(topics=('test', 'test0'))  # topics to consume
# print(consumer.topics())
# print(consumer.position(TopicPartition(topic='test', partition=0)))  # current position for this topic
# for message in consumer:
#     print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

# ====== consumer (manual polling) ======
# from kafka import KafkaConsumer
# import time
#
# consumer = KafkaConsumer(bootstrap_servers=['127.0.0.1:9092'])
# consumer.subscribe(topics=('test', 'test0'))
# while True:
#     msg = consumer.poll(timeout_ms=5)  # fetch messages from kafka
#     print(msg)
#     time.sleep(2)
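# ====== seek by timestamp (additional sketch) ======
# offsets_for_times() takes epoch *milliseconds* and requires broker and
# message format >= 0.10; topic name and broker address are placeholders.
# from kafka import KafkaConsumer
# from kafka.structs import TopicPartition
# import time
#
# consumer = KafkaConsumer(bootstrap_servers=['127.0.0.1:9092'])
# tp = TopicPartition(topic='test', partition=0)
# consumer.assign([tp])
# ts_ms = int((time.time() - 300) * 1000)  # five minutes ago
# found = consumer.offsets_for_times({tp: ts_ms})
# if found[tp] is not None:  # None when no message is at/after ts_ms
#     consumer.seek(tp, found[tp].offset)
#     for message in consumer:
#         print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))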