def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert 'Consumer closed' == str(ex.value)

    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert 'Consumer closed' == str(ex.value)
class KafkaQueryConsumer: """ Wraps Kafka library consumer methods which query the broker for metadata and poll for single messages. It is a thin wrapper but allows a fake to be used in unit tests. """ def __init__(self, broker: str): # Set "enable.auto.commit" to False, as we do not need to report to the # kafka broker where we got to (it usually does this in case of a # crash, but we simply restart the process and go and find the last # run_start message. # # Set "queued.min.messages" to 1 as we will consume backwards through # the partition one message at a time; we do not want to retrieve # multiple messages in the forward direction each time we step # backwards by 1 offset conf = { "bootstrap.servers": broker, "group.id": "consumer_group_name", "auto.offset.reset": "latest", "enable.auto.commit": False, "queued.min.messages": 1 } self._consumer = Consumer(**conf) def get_topic_partitions(self, topic: str, offset: int = -1): metadata = self._consumer.list_topics(topic) return [ TopicPartition(topic, partition[1].id, offset=offset) for partition in metadata.topics[topic].partitions.items() ] def seek(self, partition: TopicPartition): """ Set offset in partition, the consumer will seek to that offset """ self._consumer.seek(partition) def poll(self, timeout=2.): """ Poll for a message from Kafka """ return self._consumer.poll(timeout=timeout) def get_watermark_offsets(self, partition: TopicPartition) -> Tuple[int, int]: """ Get the offset of the first and last available message in the given partition """ return self._consumer.get_watermark_offsets(partition, cached=False) def assign(self, partitions: List[TopicPartition]): self._consumer.assign(partitions) def offsets_for_times(self, partitions: List[TopicPartition]): return self._consumer.offsets_for_times(partitions)
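The comments above describe stepping backwards through a partition one message at a time to find the last run_start message; a minimal hedged sketch of that pattern using the wrapper (function name, broker and topic are placeholders) could look like:

def last_message_per_partition(broker, topic, poll_timeout=2.0):
    """Hedged sketch: fetch the newest message from each partition of a topic
    by seeking to (high watermark - 1) and polling once."""
    consumer = KafkaQueryConsumer(broker)
    partitions = consumer.get_topic_partitions(topic)
    consumer.assign(partitions)
    last_messages = {}
    for partition in partitions:
        low, high = consumer.get_watermark_offsets(partition)
        if high <= low:
            continue  # empty partition, nothing to read
        partition.offset = high - 1  # offset of the newest message
        consumer.seek(partition)
        msg = consumer.poll(timeout=poll_timeout)
        if msg is not None and not msg.error():
            last_messages[partition.partition] = msg.value()
    return last_messages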
def morning_notice():
    # One topic per stock, with partitions used as follows: partition 0 holds the
    # snapshot fetched from futu, partition 1 the futu real-time quotes, partition 2
    # the real-time candlestick (K-line) data, partition 3 the real-time intraday
    # (time-sharing) data, partition 4 the real-time tick-by-tick trades, partition 5
    # the real-time order book, partition 6 the real-time broker queue, and
    # partitions 7-9 are currently unused.
    consumer = Consumer({
        'bootstrap.servers': 'kafka01',
        'group.id': 'test',
        'enable.auto.commit': False,
        'default.topic.config': {
            'auto.offset.reset': 'largest'
        }
    })
    (rise_ratio_list_smallest, rise_ratio_list_largest) = consumer.get_watermark_offsets(
        TopicPartition('test', 0))
    (volume_list_smallest, volume_list_largest) = consumer.get_watermark_offsets(
        TopicPartition('test', 1))
    try:
        consumer.assign(
            [TopicPartition('test', 0, rise_ratio_list_largest - 1)])
        consumer.seek(TopicPartition('test', 0, rise_ratio_list_largest - 1))
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        latest_rise_ratio = consumer.poll(3.0)
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        print(latest_rise_ratio)

        consumer.assign([TopicPartition('test', 1, volume_list_largest - 1)])
        consumer.seek(TopicPartition('test', 1, volume_list_largest - 1))
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        latest_volume = consumer.poll(3.0).value()
        print(consumer.position([TopicPartition('test', 0)]))
        print(consumer.position([TopicPartition('test', 1)]))
        print(latest_volume)
    finally:
        consumer.close()
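morning_notice() reads the newest record from partitions 0 and 1 by assigning at the high watermark minus one; a small hedged helper capturing that pattern (the name is illustrative, and the offset is handed to assign() so no separate seek() is needed):

from confluent_kafka import TopicPartition

def read_latest(consumer, topic, partition, timeout=3.0):
    """Return the newest message in a partition, or None if it is empty.

    A hedged sketch: the consumer is assumed to be constructed already, and
    assigning directly at (high watermark - 1) replaces the assign + seek pair.
    """
    low, high = consumer.get_watermark_offsets(TopicPartition(topic, partition))
    if high <= low:
        return None  # nothing has been produced to this partition yet
    consumer.assign([TopicPartition(topic, partition, high - 1)])
    msg = consumer.poll(timeout)
    if msg is None or msg.error():
        return None
    return msg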
class KafkaConsumer:
    def __init__(self, logger, cfg, influxdb_client, email_notification):
        """Class constructor

        Args:
            logger (TimedRotatingLogger): logger
            cfg (dict): dictionary of parameters
            influxdb_client (InfluxBDProducer): object used for logging to InfluxDB
            email_notification (EmailNotification): object used for sending email notifications
        """
        self.logger = logger
        self.cfg = cfg
        self.influxdb_client = influxdb_client
        self.email_notification = email_notification
        self.consumer = Consumer(self.cfg['kafka_broker']['consumer_config'])
        self.consumer.subscribe(self.cfg['kafka_broker']['consumer_topic'])

    def receive_message(self):
        """Read and return one message from the queue with all of its attributes

        Returns:
            (message): a message from the queue with all of its attributes
        """
        try:
            msg = self.consumer.poll(self.cfg['kafka_broker']['poll_timeout'])
            if msg and not msg.error():
                return msg
            elif msg and msg.error().code() == KafkaError._PARTITION_EOF:
                self.logger.warning(
                    "{0} {1} reached end at offset {2}\n".format(
                        msg.topic(), msg.partition(), msg.offset()))
            elif msg and msg.error():
                raise KafkaException(msg.error())
            elif not msg:
                self.logger.warning('No more messages, end of the queue')
            else:
                self.logger.warning('Something new (unexpected turn)')
        except KafkaError as kf:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.logger.error("KafkaError\n{0}\n{1}".format(
                kf, traceback.extract_tb(exc_traceback)))
            self.influxdb_client.write_error(module="KAFKA_CONSUMER")
            self.email_notification.send_error_notification()
            self.consumer.close()
            sys.exit(1)
        except KafkaException as ke:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.logger.error("KafkaException\n{0}\n{1}".format(
                ke, traceback.extract_tb(exc_traceback)))
            self.influxdb_client.write_error(module="KAFKA_CONSUMER")
            self.email_notification.send_error_notification()
            self.consumer.close()
            sys.exit(1)

    def seek_message(self, partition, offset):
        """Reset the offset in a specific partition when message processing fails,
        so that the message can be processed again

        Args:
            partition (int): partition in which the offset should be reset
            offset (int): offset to which the position should be reset
        """
        try:
            topic_partition = TopicPartition(
                self.cfg['kafka_broker']['consumer_topic'][0], partition, offset)
            self.consumer.seek(topic_partition)
        except KafkaError as kf:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.logger.error("KafkaError\n{0}\n{1}".format(
                kf, traceback.extract_tb(exc_traceback)))
            self.influxdb_client.write_error(module="KAFKA_CONSUMER")
            self.email_notification.send_error_notification()
            self.consumer.close()
            sys.exit(1)
        except KafkaException as ke:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            self.logger.error("KafkaException\n{0}\n{1}".format(
                ke, traceback.extract_tb(exc_traceback)))
            self.influxdb_client.write_error(module="KAFKA_CONSUMER")
            self.email_notification.send_error_notification()
            self.consumer.close()
            sys.exit(1)
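A hedged sketch of how this wrapper might be driven: when processing fails, the offset is wound back to the failed message so the next poll redelivers it. The process callback is an assumption, not part of the class above.

def consume_and_retry(kafka_consumer, process):
    """Poll messages forever; re-seek to a message's own offset if processing
    fails so that it is delivered again on the next poll."""
    while True:
        msg = kafka_consumer.receive_message()
        if msg is None:
            continue
        try:
            process(msg.value())
        except Exception:
            kafka_consumer.seek_message(msg.partition(), msg.offset())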
class CRecommender(BatchProc): def __init__(self, batch_size, proc_id, proc_num): super(CRecommender, self).__init__("recommender", 4) self.firelinker = { # 当前函数为下一个函数提供的数据 "result": "Usurvive" # 初始化为“成功” } self.abyss = { # 异常、缺陷信息统计器 "Usurvive": 0, "URdead": 0, "UPoison": 0, "UPlay": 0 } self.batch_size = batch_size self.proc_id = proc_id self.proc_num = proc_num self.latest_articles = {} self.earliest_user_log = "" self.kafka_consumer = None self.user_history = {} # 用户ID作为key,用户历史列表作为value self.user_recommend = {} self.topic_vectors = np.load(topic_vector_path) def init_kafka_consumer(self, items): for item in items: self.user_history[item["Fuser_id"]] = [] self.kafka_consumer = Consumer({ 'bootstrap.servers': 'localhost', 'group.id': 'mygroup', 'default.topic.config': { 'auto.offset.reset': 'largest' } }) self.kafka_consumer.subscribe(list(self.user_history.keys())) self.kafka_consumer.poll(timeout=1.0) def __reset_monitor(self): self.firelinker = { # 当前函数为下一个函数提供的数据 "result": "Usurvive" # 初始化为“成功” } self.abyss = { # 异常、缺陷信息统计器 "Usurvive": 0, "URdead": 0, "UPoison": 0, "UPlay": 0 } self.kafka_consumer.close() self.kafka_consumer = None self.user_history = {} self.user_recommend = {} def __bonfire(self): result = self.firelinker["result"] self.abyss[result] += 1 self.abyss["UPlay"] += 1 self.firelinker = {"result": "Usurvive"} def __failsafe(self): self._db.rollback() self.firelinker["result"] = "URdead" self.logger.log_info(traceback.format_exc()) # 将每次处理结果讯息打印日志 def gen_single_report(self, item): content_id = item['Fauto_id'] origin_url = item['Fh5_url'] result = self.firelinker["result"] self.logger.log_info("item with id %s,origin url %s, result:%s" % (content_id, origin_url, result)) # 将每批处理结果讯息打印日志 def gen_batch_report(self): UPlay = self.abyss["UPlay"] UPoison = self.abyss["UPoison"] URdead = self.abyss["URdead"] Usurvive = self.abyss["Usurvive"] if UPlay != 0: self.logger.log_info("You played %s times, survive %s times, \ poisoned %s times, died %s times.\n \ survival rate: %s, poison rate: %s, death rate: %s."\ %(UPlay, Usurvive, UPoison, URdead, \ Usurvive/(UPlay), UPoison/UPlay, URdead/UPlay)) else: self.logger.log_info( "You processed zero content, please check your Sql") def load_users_log(self, items): for item in items: try: user_number = item["Fuser_id"] self.kafka_consumer.seek(TopicPartition( str(user_number), 0, 0)) # 分区为0,OFFSET为0 while True: article_id_set = set() msg = self.kafka_consumer.poll(timeout=1.0) if msg is None: break elif msg.error(): self.logger.log_info(msg.error()) break else: # 加入过滤的逻辑 history_dict = json.loads(msg.value().decode()) action_date = datetime.strptime( history_dict["action_date"], "%Y-%m-%d %H:%M:%S") if action_date > self.earliest_user_log: article_id = history_dict["article_id"] if article_id not in article_id_set: # 后面两个None分别是article_cluster_id, article_vector history_tuple = (action_date, article_id) self.user_history[user_number].append( history_tuple) article_id_set.add(article_id) else: break except Exception as e: self.logger.log_error(str(e)) def load_articles_hidden(self): # 注意,这里可能会抛出文章隐藏信息找不到的异常 article_id_set = set() for user_log in self.user_history.values(): article_id_set.update([log_item[1] for log_item in user_log ]) # log_item[1]指向是文章ID if len(article_id_set) > 1: id_range_str = format(tuple(article_id_set)) where = "Fauto_id in %s" % (id_range_str) elif len(article_id_set) == 1: id_str = list(article_id_set)[0] where = "Fauto_id = %s" % (id_str) else: return field_list = ["Fauto_id", "Fcluster_id", "Farticle_vector"] 
self._db.set_db_table("db_hiddens", "t_job_documents_hidden") DB_res = self._db.query(field_list, where) article_info_tmp = {} for res in DB_res: article_id = res["Fauto_id"] article_cluster_id = res["Fcluster_id"] article_vector = res["Farticle_vector"] article_info_tmp[article_id] = (article_cluster_id, article_vector) for user_id, history_tuple_list in self.user_history.items(): for idx in range(len(history_tuple_list)): article_time = history_tuple_list[idx][0] article_id = history_tuple_list[idx][1] article_id = int(article_id) article_cluster_id = article_info_tmp[article_id][0] article_vector = article_info_tmp[article_id][1] new_history_tuple = (article_time, article_id, article_cluster_id, article_vector) self.user_history[user_id][idx] = new_history_tuple def classify_user(self, history): cluster_dict = {} self.firelinker["favored_clusters"] = [] self.firelinker["favored_articles"] = [ ] # 形如[(1, 20180516),(2,20190618)] for action in history: action_date = action[0] article_id = action[1] article_cluster_id = action[2] article_vector = np.array( [float(elem) for elem in json.loads(action[3])]) self.firelinker["favored_articles"].append( (article_id, article_cluster_id, article_vector, action_date)) map_reduce(cluster_dict, article_cluster_id) for key, value in cluster_dict.items(): if value >= FAVORED_THRESH: self.firelinker["favored_clusters"].append(key) def gen_simm_article(self): raw_result = {} for article_info in self.firelinker["favored_articles"]: article_id = article_info[0] article_cluster_id = article_info[1] article_vector = article_info[2] action_date = article_info[3] article_comp_list = self.latest_articles.get( article_cluster_id, []) article_candidates = [] for article_comp in article_comp_list: article_comp_id = article_comp[0] article_comp_time = article_comp[1] article_comp_vector = article_comp[2] if article_id != article_comp_id: dot_product = np.dot(article_vector, article_comp_vector) norm_product = LA.norm(article_vector) * LA.norm( article_comp_vector) simm = (dot_product / norm_product) if simm > SIMM_THRESH: article_candidates.append( (article_comp_id, article_comp_time)) raw_result[article_id] = article_candidates self.firelinker["simm_articles"] = raw_result def gen_cluster_article(self): raw_result = {} for cluster_id in self.firelinker["favored_clusters"]: raw_result[cluster_id] = [ (article_info[0], article_info[1]) for article_info in self.latest_articles[cluster_id] ] self.firelinker["cluster_articles"] = raw_result def gen_other_article(self): raw_result = {} for cluster_id in self.latest_articles.keys(): article_info_list = self.latest_articles[cluster_id] raw_result[cluster_id] = [(article_info[0], article_info[1]) for article_info in article_info_list ] # 每个类取10篇文章 self.firelinker["other_articles"] = raw_result def gen_random_article(self): simm_articles = self.firelinker["simm_articles"] cluster_articles = self.firelinker["cluster_articles"] other_articles = self.firelinker["other_articles"] other_articles_new = {} article_id_set = set([]) for article_id, article_list in simm_articles.items(): simm_articles[article_id], article_id_set = intercept_article_list( article_list, 3, article_id_set) for cluster_id, article_list in cluster_articles.items(): cluster_articles[ cluster_id], article_id_set = intercept_article_list( article_list, 4, article_id_set) if len(other_articles.keys()) > 10: cluster_id_list = random.sample(other_articles.keys(), 10) else: cluster_id_list = list(other_articles.keys()) for cluster_id in cluster_id_list: 
other_articles_new[ cluster_id], article_id_set = intercept_article_list( other_articles[cluster_id], 2, article_id_set) self.firelinker["other_articles"] = other_articles_new def gen_result(self, user_id): temp_dict = {} simm_articles = self.firelinker["simm_articles"] cluster_articles = self.firelinker["cluster_articles"] other_articles = self.firelinker["other_articles"] article_id_list = [] for key, value in simm_articles.items(): article_id_list.append(key) for article in value: article_id_list.append(article[0]) for value in cluster_articles.values(): for article in value: article_id_list.append(article[0]) for value in other_articles.values(): for article in value: article_id_list.append(article[0]) article_id_list = list(set(article_id_list)) self.user_recommend[user_id] = ','.join( [str(article_id) for article_id in article_id_list]) # 用于演示,发布时记删 in_string = format(tuple(article_id_list)) self._db.set_db_table("db_documents", "t_job_documents") where = "Fauto_id in %s" % (in_string) field_list = ["Fauto_id", "Ftitle", "Fh5_url", "Fsummary"] DB_res = self._db.query(field_list, where) self._db.commit() for res in DB_res: temp_dict[res["Fauto_id"]] = res["Ftitle"] show_dict = {} for docid, value in simm_articles.items(): new_key = temp_dict[docid] print("--------------------------------") print("用户投递了职位:%s" % (new_key)) for article in value: new_value = temp_dict[article[0]] print("用户被推荐了职位:%s" % (new_value)) for cluster_id, value in cluster_articles.items(): print("--------------------------------") print("根据用户对类%s的喜爱,我们推荐了下列职位" % (cluster_id)) for article in value: new_value = temp_dict[article[0]] print("%s" % (new_value)) ''' print("--------------------------------") print("给用户推荐的其他领域的文章") for value in other_articles.values(): for article in value: new_value = temp_dict[article[0]] print("%s"%(new_value)) ''' def process_user(self, user_id, history): try: self.classify_user(history) self.gen_simm_article() self.gen_cluster_article() self.gen_other_article() self.gen_random_article() self.gen_result(user_id) except Exception as e: self.__failsafe() finally: # self.gen_single_report(item) self.__bonfire() def update_user_recmmend(self): self._db.set_db_table('db_users', 't_user_recommends') field_list = ['Fuser_id', 'Frec_articles', 'Fmodify_time'] data_list = [] for user_id, recommends in self.user_recommend.items(): modify_time = time_now() element = str((user_id, recommends, modify_time)) data_list.append(element) self._db.update_batch(field_list, data_list) self._db.commit() def run(self, items): try: self.init_kafka_consumer(items) self.load_users_log(items) self.load_articles_hidden() for user_id, history in self.user_history.items(): self.process_user(user_id, history) self.update_user_recmmend() except Exception as e: self.logger.log_error(traceback.format_exc()) finally: self.__reset_monitor() # 读取最近的文章 def prepare_articles(self): now = datetime.now() latest = now + timedelta(days=32 - now.day) time_cover = timedelta(days=ARTICLE_COVER) earliest = latest - time_cover self._db.set_db_table("db_hiddens", "t_job_documents_hidden") where = "Fcreate_time > '%s' and Fcreate_time < '%s' and Frec_state=1"\ %(earliest, latest) field_list = [ "Farticle_vector", "Fauto_id", "Fcluster_id", "Fcreate_time" ] DB_res = self._db.query(field_list, where) for item in DB_res: cluster_id = item["Fcluster_id"] article_id = item["Fauto_id"] article_time = item["Fcreate_time"] article_vector = np.array( [float(elem) for elem in json.loads(item["Farticle_vector"])]) if cluster_id in 
self.latest_articles.keys(): self.latest_articles[cluster_id].append( (article_id, article_time, article_vector)) else: self.latest_articles[cluster_id] = [(article_id, article_time, article_vector)] def prepare_user_cover(self): now = datetime.now() time_cover = timedelta(days=USER_HISTORY_COVER) self.earliest_user_log = now - time_cover def main(self): # 初始化各种 self.init_db() self.init_log() self.prepare_articles() self.prepare_user_cover() step = self.batch_size * self.proc_num offset = self.proc_id * self.batch_size while (True): where = "Fauto_id between %s and %s" % (offset + 1, offset + self.batch_size) self.logger.log_info('process_id:%s, sql condition:%s' % (self.proc_id, where)) field_list = ['Fuser_id'] self._db.set_db_table("db_users", "t_user_recommends") items = self._db.query(field_list, where) self._db.commit() if not items: break self.run(items) offset += step #break time.sleep(5) self.close()
# API keys held in a non-committed file
import credentials

from confluent_kafka import Consumer, KafkaError, TopicPartition, OFFSET_END
from pushbullet import Pushbullet

# Consume the ATM_POSSIBLE_FRAUD_ENRICHED topic
settings = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'python_pushbullet',
    'default.topic.config': {
        'auto.offset.reset': 'largest'
    }
}
c = Consumer(settings)

# Start at the current end of partition 1. seek() only works on a partition that
# is already assigned, so the logical OFFSET_END is passed to assign() instead of
# calling seek() before any assignment exists.
tp = TopicPartition('ATM_POSSIBLE_FRAUD_ENRICHED', 1, OFFSET_END)
print('Assigning %s' % tp)
c.assign([tp])

# Connect to pushbullet service
pb = Pushbullet(credentials.login['pushbullet_api_token'])

# Poll for messages; and extract JSON and call pushbullet for any messages
while True:
    msg = c.poll()
    if msg is None:
        continue
    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
            print(msg.error())
            break
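The loop above is cut off before the step its comment promises; a hedged sketch of the JSON extraction and notification, assuming the payload is UTF-8 JSON and using pushbullet.py's push_note(title, body) call (the note title is made up):

import json

def notify(pb, msg):
    # Hedged sketch: decode the enriched fraud event and forward it as a note.
    event = json.loads(msg.value().decode('utf-8'))
    pb.push_note('Possible ATM fraud', json.dumps(event, indent=2))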
class KafkaConsumer(Consumer[TPayload]): """ The behavior of this consumer differs slightly from the Confluent consumer during rebalancing operations. Whenever a partition is assigned to this consumer, offsets are *always* automatically reset to the committed offset for that partition (or if no offsets have been committed for that partition, the offset is reset in accordance with the ``auto.offset.reset`` configuration value.) This causes partitions that are maintained across a rebalance to have the same offset management behavior as a partition that is moved from one consumer to another. To prevent uncommitted messages from being consumed multiple times, ``commit`` should be called in the partition revocation callback. The behavior of ``auto.offset.reset`` also differs slightly from the Confluent consumer as well: offsets are only reset during initial assignment or subsequent rebalancing operations. Any other circumstances that would otherwise lead to preemptive offset reset (e.g. the consumer tries to read a message that is before the earliest offset, or the consumer attempts to read a message that is after the latest offset) will cause an exception to be thrown, rather than resetting the offset, as this could lead to chunks messages being replayed or skipped, depending on the circumstances. This also means that if the committed offset is no longer available (such as when reading older messages from the log and those messages expire, or reading newer messages from the log and the leader crashes and partition ownership fails over to an out-of-date replica), the consumer will fail-stop rather than reset to the value of ``auto.offset.reset``. """ # Set of logical offsets that do not correspond to actual log positions. # These offsets should be considered an implementation detail of the Kafka # consumer and not used publically. # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25 LOGICAL_OFFSETS = frozenset( [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID]) def __init__( self, configuration: Mapping[str, Any], codec: Codec[KafkaPayload, TPayload], *, commit_retry_policy: Optional[RetryPolicy] = None, ) -> None: if commit_retry_policy is None: commit_retry_policy = NoRetryPolicy() auto_offset_reset = configuration.get("auto.offset.reset", "largest") if auto_offset_reset in {"smallest", "earliest", "beginning"}: self.__resolve_partition_starting_offset = ( self.__resolve_partition_offset_earliest) elif auto_offset_reset in {"largest", "latest", "end"}: self.__resolve_partition_starting_offset = ( self.__resolve_partition_offset_latest) elif auto_offset_reset == "error": self.__resolve_partition_starting_offset = ( self.__resolve_partition_offset_error) else: raise ValueError( "invalid value for 'auto.offset.reset' configuration") if (as_kafka_configuration_bool( configuration.get("enable.auto.commit", "true")) is not False): raise ValueError( "invalid value for 'enable.auto.commit' configuration") if (as_kafka_configuration_bool( configuration.get("enable.auto.offset.store", "true")) is not False): raise ValueError( "invalid value for 'enable.auto.offset.store' configuration") # NOTE: Offsets are explicitly managed as part of the assignment # callback, so preemptively resetting offsets is not enabled. 
self.__consumer = ConfluentConsumer({ **configuration, "auto.offset.reset": "error" }) self.__codec = codec self.__offsets: MutableMapping[Partition, int] = {} self.__staged_offsets: MutableMapping[Partition, int] = {} self.__paused: Set[Partition] = set() self.__commit_retry_policy = commit_retry_policy self.__state = KafkaConsumerState.CONSUMING def __resolve_partition_offset_earliest( self, partition: ConfluentTopicPartition) -> ConfluentTopicPartition: low, high = self.__consumer.get_watermark_offsets(partition) return ConfluentTopicPartition(partition.topic, partition.partition, low) def __resolve_partition_offset_latest( self, partition: ConfluentTopicPartition) -> ConfluentTopicPartition: low, high = self.__consumer.get_watermark_offsets(partition) return ConfluentTopicPartition(partition.topic, partition.partition, high) def __resolve_partition_offset_error( self, partition: ConfluentTopicPartition) -> ConfluentTopicPartition: raise ConsumerError("unable to resolve partition offsets") def subscribe( self, topics: Sequence[Topic], on_assign: Optional[Callable[[Mapping[Partition, int]], None]] = None, on_revoke: Optional[Callable[[Sequence[Partition]], None]] = None, ) -> None: """ Subscribe to topics. This replaces a previous subscription. This method does not block. The subscription may not be fulfilled immediately: instead, the ``on_assign`` and ``on_revoke`` callbacks are called when the subscription state changes with the updated assignment for this consumer. If provided, the ``on_assign`` callback is called with a mapping of partitions to their offsets (at this point, the working offset and the committed offset are the same for each partition) on each subscription change. Similarly, the ``on_revoke`` callback (if provided) is called with a sequence of partitions that are being removed from this consumer's assignment. (This callback does not include the offsets, as the working offset and committed offset may differ, in some cases by substantial margin.) Raises an ``InvalidState`` exception if called on a closed consumer. """ if self.__state is not KafkaConsumerState.CONSUMING: raise InvalidState(self.__state) def assignment_callback( consumer: ConfluentConsumer, partitions: Sequence[ConfluentTopicPartition]) -> None: self.__state = KafkaConsumerState.ASSIGNING try: assignment: MutableSequence[ConfluentTopicPartition] = [] for partition in self.__consumer.committed(partitions): if partition.offset >= 0: assignment.append(partition) elif partition.offset == OFFSET_INVALID: assignment.append( self.__resolve_partition_starting_offset( partition)) else: raise ValueError("received unexpected offset") offsets: MutableMapping[Partition, int] = { Partition(Topic(i.topic), i.partition): i.offset for i in assignment } self.__seek(offsets) # Ensure that all partitions are resumed on assignment to avoid # carrying over state from a previous assignment. 
self.__consumer.resume([ ConfluentTopicPartition(partition.topic.name, partition.index, offset) for partition, offset in offsets.items() ]) for partition in offsets: self.__paused.discard(partition) except Exception: self.__state = KafkaConsumerState.ERROR raise try: if on_assign is not None: on_assign(offsets) finally: self.__state = KafkaConsumerState.CONSUMING def revocation_callback( consumer: ConfluentConsumer, partitions: Sequence[ConfluentTopicPartition]) -> None: self.__state = KafkaConsumerState.REVOKING partitions = [ Partition(Topic(i.topic), i.partition) for i in partitions ] try: if on_revoke is not None: on_revoke(partitions) finally: for partition in partitions: # Staged offsets are deleted during partition revocation to # prevent later committing offsets for partitions that are # no longer owned by this consumer. if partition in self.__staged_offsets: logger.warning( "Dropping staged offset for revoked partition (%r)!", partition, ) del self.__staged_offsets[partition] try: self.__offsets.pop(partition) except KeyError: # If there was an error during assignment, this # partition may have never been added to the offsets # mapping. logger.warning( "failed to delete offset for unknown partition: %r", partition, ) self.__paused.discard(partition) self.__state = KafkaConsumerState.CONSUMING self.__consumer.subscribe( [topic.name for topic in topics], on_assign=assignment_callback, on_revoke=revocation_callback, ) def unsubscribe(self) -> None: """ Unsubscribe from topics. Raises an ``InvalidState`` exception if called on a closed consumer. """ if self.__state is not KafkaConsumerState.CONSUMING: raise InvalidState(self.__state) self.__consumer.unsubscribe() def poll(self, timeout: Optional[float] = None) -> Optional[Message[TPayload]]: """ Return the next message available to be consumed, if one is available. If no message is available, this method will block up to the ``timeout`` value before returning ``None``. A timeout of ``0.0`` represents "do not block", while a timeout of ``None`` represents "block until a message is available (or forever)". Calling this method may also invoke subscription state change callbacks. This method may also raise an ``EndOfPartition`` error (a subtype of ``ConsumerError``) when the consumer has reached the end of a partition that it is subscribed to and no additional messages are available. The ``partition`` attribute of the raised exception specifies the end which partition has been reached. (Since this consumer is multiplexing a set of partitions, this exception does not mean that *all* of the partitions that the consumer is subscribed to do not have any messages, just that it has reached the end of one of them. This also does not mean that additional messages won't be available in future poll calls.) Not every backend implementation supports this feature or is configured to raise in this scenario. Raises an ``InvalidState`` exception if called on a closed consumer. Raises a ``TransportError`` for various other consumption-related errors. 
""" if self.__state is not KafkaConsumerState.CONSUMING: raise InvalidState(self.__state) message: Optional[ConfluentMessage] = self.__consumer.poll( *[timeout] if timeout is not None else []) if message is None: return None error: Optional[KafkaError] = message.error() if error is not None: code = error.code() if code == KafkaError._PARTITION_EOF: raise EndOfPartition( Partition(Topic(message.topic()), message.partition()), message.offset(), ) elif code == KafkaError._TRANSPORT: raise TransportError(str(error)) else: raise ConsumerError(str(error)) headers: Optional[Headers] = message.headers() result = Message( Partition(Topic(message.topic()), message.partition()), message.offset(), self.__codec.decode( KafkaPayload( message.key(), message.value(), headers if headers is not None else [], )), datetime.utcfromtimestamp(message.timestamp()[1] / 1000.0), ) self.__offsets[result.partition] = result.get_next_offset() return result def tell(self) -> Mapping[Partition, int]: """ Return the read offsets for all assigned partitions. Raises an ``InvalidState`` if called on a closed consumer. """ if self.__state in { KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR }: raise InvalidState(self.__state) return self.__offsets def __validate_offsets(self, offsets: Mapping[Partition, int]) -> None: invalid_offsets: Mapping[Partition, int] = { partition: offset for partition, offset in offsets.items() if offset < 0 } if invalid_offsets: raise ConsumerError(f"invalid offsets: {invalid_offsets!r}") def __seek(self, offsets: Mapping[Partition, int]) -> None: self.__validate_offsets(offsets) if self.__state is KafkaConsumerState.ASSIGNING: # Calling ``seek`` on the Confluent consumer from an assignment # callback will throw an "Erroneous state" error. Instead, # partition offsets have to be initialized by calling ``assign``. self.__consumer.assign([ ConfluentTopicPartition(partition.topic.name, partition.index, offset) for partition, offset in offsets.items() ]) else: for partition, offset in offsets.items(): self.__consumer.seek( ConfluentTopicPartition(partition.topic.name, partition.index, offset)) self.__offsets.update(offsets) def seek(self, offsets: Mapping[Partition, int]) -> None: """ Change the read offsets for the provided partitions. Raises an ``InvalidState`` if called on a closed consumer. """ if self.__state in { KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR }: raise InvalidState(self.__state) if offsets.keys() - self.__offsets.keys(): raise ConsumerError("cannot seek on unassigned partitions") self.__seek(offsets) def pause(self, partitions: Sequence[Partition]) -> None: """ Pause the consumption of messages for the provided partitions. Raises an ``InvalidState`` if called on a closed consumer. """ if self.__state in { KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR }: raise InvalidState(self.__state) if set(partitions) - self.__offsets.keys(): raise ConsumerError("cannot pause unassigned partitions") self.__consumer.pause([ ConfluentTopicPartition(partition.topic.name, partition.index) for partition in partitions ]) self.__paused.update(partitions) # XXX: Seeking to a specific partition offset and immediately pausing # that partition causes the seek to be ignored for some reason. self.seek({ partition: offset for partition, offset in self.__offsets.items() if partition in partitions }) def resume(self, partitions: Sequence[Partition]) -> None: """ Resume the consumption of messages for the provided partitions. Raises an ``InvalidState`` if called on a closed consumer. 
""" if self.__state in { KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR }: raise InvalidState(self.__state) if set(partitions) - self.__offsets.keys(): raise ConsumerError("cannot resume unassigned partitions") self.__consumer.resume([ ConfluentTopicPartition(partition.topic.name, partition.index) for partition in partitions ]) for partition in partitions: self.__paused.discard(partition) def paused(self) -> Sequence[Partition]: if self.__state in { KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR }: raise InvalidState(self.__state) return [*self.__paused] def stage_offsets(self, offsets: Mapping[Partition, int]) -> None: if self.__state in { KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR }: raise InvalidState(self.__state) if offsets.keys() - self.__offsets.keys(): raise ConsumerError( "cannot stage offsets for unassigned partitions") self.__validate_offsets(offsets) # TODO: Maybe log a warning if these offsets exceed the current # offsets, since that's probably a side effect of an incorrect usage # pattern? self.__staged_offsets.update(offsets) def __commit(self) -> Mapping[Partition, int]: if self.__state in { KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR }: raise InvalidState(self.__state) result: Optional[Sequence[ConfluentTopicPartition]] if self.__staged_offsets: result = self.__consumer.commit( offsets=[ ConfluentTopicPartition(partition.topic.name, partition.index, offset) for partition, offset in self.__staged_offsets.items() ], asynchronous=False, ) else: result = [] assert result is not None # synchronous commit should return result immediately self.__staged_offsets.clear() offsets: MutableMapping[Partition, int] = {} for value in result: # The Confluent Kafka Consumer will include logical offsets in the # sequence of ``Partition`` objects returned by ``commit``. These # are an implementation detail of the Kafka Consumer, so we don't # expose them here. # NOTE: These should no longer be seen now that we are forcing # offsets to be set as part of the assignment callback. if value.offset in self.LOGICAL_OFFSETS: continue assert value.offset >= 0, "expected non-negative offset" offsets[Partition(Topic(value.topic), value.partition)] = value.offset return offsets def commit_offsets(self) -> Mapping[Partition, int]: """ Commit staged offsets for all partitions that this consumer is assigned to. The return value of this method is a mapping of partitions with their committed offsets as values. Raises an ``InvalidState`` if called on a closed consumer. """ return self.__commit_retry_policy.call(self.__commit) def close(self, timeout: Optional[float] = None) -> None: """ Close the consumer. This stops consuming messages, *may* commit staged offsets (depending on the configuration), and ends its subscription. Raises a ``InvalidState`` if the consumer is unable to be closed before the timeout is reached. """ try: self.__consumer.close() except RuntimeError: pass self.__state = KafkaConsumerState.CLOSED @property def closed(self) -> bool: return self.__state is KafkaConsumerState.CLOSED
consumer.assign([TopicPartition('test', 4)])

# Reset the offset
consumer.assign([TopicPartition('test', 4, 2)])

# Get the smallest and largest offsets of a partition
consumer.get_watermark_offsets(TopicPartition('test', 4))  # (0, 19)

# With a new group.id you must consume at least one message first, otherwise the
# offset reset below has no effect; without consuming, the offset reported both
# before and after the reset stays at -1001.

# Get the current offset position
consumer.position([TopicPartition('test', 3)])

# Reset the offset to an arbitrary position. The committed offset determines where
# the *next* connection starts (tracked per consumer group) and has no effect on the
# current connection; the current connection's position is what position() reports.
# After resetting the committed offset, close and reconnect for it to take effect.
# position() governs the offset of the current connection and is changed with seek().
consumer.seek(TopicPartition('test', 3, 1))
consumer.commit(offsets=[TopicPartition('test', 3, 7)])

# Check the position that was reset
msg = consumer.committed([TopicPartition('test', 3)])
print(msg)

# offset: either an absolute offset (>=0) or a logical offset:
# OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID
while True:
    msg = consumer.poll(3.0)
    if msg is None:
        continue
    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
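Putting these notes together, a hedged sketch that reads the last N messages of one partition by assigning at the high watermark minus N (function name, arguments and timeout are illustrative):

from confluent_kafka import TopicPartition

def read_last_n(consumer, topic, partition, n, timeout=3.0):
    """Hedged sketch: assign at max(low, high - n) and poll forward."""
    low, high = consumer.get_watermark_offsets(TopicPartition(topic, partition))
    start = max(low, high - n)
    consumer.assign([TopicPartition(topic, partition, start)])
    messages = []
    while len(messages) < high - start:
        msg = consumer.poll(timeout)
        if msg is None or msg.error():
            break
        messages.append(msg)
    return messages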
def main(options): if options.hostname: hostname = options.hostname else: hostname = gethostname() group_id = f'python_search@{hostname}' print(f'group_id = {group_id}') c = Consumer({ 'bootstrap.servers': options.bootstrap_servers, 'group.id': group_id, }) tp = TopicPartition(options.topic, 0, 0) #OFFSET_BEGINNING) c.assign([tp]) c.seek(tp) printed = 0 print('begin') while True: msg = c.poll() if msg is None: continue if msg.error(): if msg.error().code() == KafkaError._PARTITION_EOF: break else: print(msg.error()) return offset = msg.offset() message_string = _bytes2string(msg.value()) try: message = _json2dict_deserialize(message_string) except json.decoder.JSONDecodeError: if options.show_warnings: print('-> offset {0} : deserialize error'.format(offset)) else: ts_type, ts_ms_value = msg.timestamp() if ts_type != TIMESTAMP_NOT_AVAILABLE and ts_ms_value: ts_value = int(ts_ms_value / 1000) recieved = datetime.fromtimestamp(ts_value) else: recieved = None try: criteria = eval(options.filter) except (KeyError, TypeError) as e: criteria = False if options.show_warnings: print(f'-> offset {offset} : {e}') if criteria == True: if recieved: print( f'-> offset {offset}, recieved {recieved:%d-%m-%Y %H:%M:%S}' ) else: print(f'-> offset {offset}') _json_pprint(message) # pprint(message) printed += 1 if options.number and printed >= options.number: break print('{0}'.format(offset), end='\r') c.close() print('end ')
def proc_replicate(topic, src_partition, end_offset, part_map, rerun=False, commit=False): """ part_map list[list[]] """ src = Consumer({ 'bootstrap.servers': SRC_BOOTSTRAP_SERVERS, 'group.id': SRC_GROUP_ID, 'enable.auto.commit': commit }) logger.info( f"Starting process consumer topic: {topic} src_partition:{src_partition}" ) if rerun: logger.info( f"Resetting source partition {src_partition} to beginning...") tp = TopicPartition(topic, src_partition, confluent_kafka.OFFSET_BEGINNING) src.assign([tp]) src.seek(tp) logger.info( f"Reset of source partition {src_partition} offset to {src.position([tp])} complete." ) else: tp = TopicPartition(topic, src_partition) src.assign([tp]) trg = Producer({ 'bootstrap.servers': TRG_BOOTSTRAP_SERVERS, 'group.id': TRG_GROUP_ID }) trg_part_ndx = 0 trg_part_ndx_max = len( part_map[src_partition]) - 1 # ex: a length of 2 has 1 as the max msg_count = 0 t0 = _t1 = time.time() ending_offset = end_offset cd = 30.0 while True: st = time.time() msg = src.poll(1.0) if msg is None: if time.time() - st >= cd: logger.info( f"timeout after {cd} secs for topic: {topic} src_partition:{src_partition}, ending" ) break continue if msg.error(): logger.error( f"Consumer error topic: {topic} src_partition:{src_partition}: {msg.error()} exiting" ) sys.exit(1) msg_count += 1 trg.produce(topic, value=msg.value(), partition=part_map[src_partition][trg_part_ndx]) if commit: src.commit() # 300 secs, we must do this if we want to ensure not to lose messages # learned of during testing; w/o it, messages do get produced but many, many # will not show up in the target cluster trg.flush(300) trg_part_ndx += 1 if trg_part_ndx > trg_part_ndx_max: trg_part_ndx = 0 # Print status/stats if msg_count % LOG_INTERVAL == 0: _t1 = outputstat(t0, _t1, LOG_INTERVAL, src_partition, msg.offset, msg_count, ending_offset) logger.info( f"process consumer, source partition {src_partition} replication complete ========================" ) _t1 = outputstat(t0, _t1, LOG_INTERVAL, src_partition, -1, msg_count, ending_offset)
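A hedged sketch of a driver that fans proc_replicate out across source partitions; the multiprocessing wrapper and the end-offset lookup are assumptions and not part of the original:

from multiprocessing import Process

def replicate_topic(topic, part_map, rerun=False, commit=False):
    """Hedged driver sketch: run proc_replicate once per source partition.

    part_map is indexed by source partition, each entry listing the target
    partitions (as in the docstring above). End offsets are looked up with a
    short-lived probe consumer.
    """
    probe = Consumer({'bootstrap.servers': SRC_BOOTSTRAP_SERVERS,
                      'group.id': SRC_GROUP_ID})
    workers = []
    for src_partition in range(len(part_map)):
        _, end_offset = probe.get_watermark_offsets(
            TopicPartition(topic, src_partition))
        worker = Process(target=proc_replicate,
                         args=(topic, src_partition, end_offset, part_map),
                         kwargs={'rerun': rerun, 'commit': commit})
        worker.start()
        workers.append(worker)
    probe.close()
    for worker in workers:
        worker.join()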
"HK.00326", "HK.00883", "HK.06098", "HK.02869", "HK.01060", "HK.00728", "HK.00721", "HK.00700", "HK.01468", "HK.03993", "HK.02238", "HK.01066", "HK.00139", "HK.02007", "HK.00554", "HK.06878" ] # stock_list = [ "HK.00788" ] for stock in stock_list: # 存储 ticker 和 order_book 数据到 csv 文件 # 存储 ticker 和 order_book 数据到 influxdb 数据库 # ticker 在 partition 4 # ticker_sample_1 = '{"code":"HK.00386","time":"2018-09-03 15:59:50","price":7.67,"volume":4000,"turnover":30680.0,"ticker_direction":"BUY","sequence":6596904796962160642,"type":"AUTO_MATCH"}' # ticker_sample_2 = '{"code":"HK.00386","time":"2018-09-03 15:59:50","price":7.67,"volume":2000,"turnover":15340.0,"ticker_direction":"BUY","sequence":6596904796962160644,"type":"AUTO_MATCH"}' # ticker_sample_3 = '{"code":"HK.00386","time":"2018-09-03 15:59:51","price":7.67,"volume":2000,"turnover":15340.0,"ticker_direction":"BUY","sequence":6596904801257127938,"type":"AUTO_MATCH"}' consumer.assign([TopicPartition(stock, 4, 0)]) consumer.seek(TopicPartition(stock, 4, 0)) ticker_pd = pd.DataFrame(columns=[ 'code', 'time', 'price', 'volume', 'turnover', 'ticker_direction', 'sequence', 'type' ]) while True: msg = consumer.poll(3.0) if msg is None: continue if msg.error(): if msg.error().code() == KafkaError._PARTITION_EOF: break else: print(msg.error()) break # print('Received message: {}'.format(msg.value().decode('utf-8')))
from confluent_kafka import Consumer, TopicPartition, OFFSET_BEGINNING

conf = {
    'bootstrap.servers': "localhost:9092",
    'group.id': 'my-new-group',
    'auto.offset.reset': 'earliest',
}

# consumer1 = Consumer(conf)
consumer = Consumer(conf)
topic = 'first_topic'

# creating a topic partition with topic - 'first_topic' and partition - 2
topicPartition = TopicPartition(topic=topic, partition=2)
print(topicPartition)

consumer.assign([topicPartition])
topicPartition.offset = OFFSET_BEGINNING
consumer.seek(topicPartition)

while True:
    message = consumer.poll(timeout=1.0)
    if message is None:
        continue
    if message.error():
        print(message.error())
        continue
    print(message.value())
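Calling seek() immediately after assign() can fail with the same 'Erroneous state' error that the basic API test elsewhere in this collection asserts on; an equivalent, hedged alternative is to hand the starting offset to assign() directly:

from confluent_kafka import Consumer, TopicPartition, OFFSET_BEGINNING

consumer = Consumer({
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'my-new-group',
    'auto.offset.reset': 'earliest',
})
# Passing the logical offset in the assignment avoids seeking before the
# partition is fully assigned.
consumer.assign([TopicPartition('first_topic', 2, OFFSET_BEGINNING)])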
def test_basic_api(): """ Basic API tests, these wont really do anything since there is no broker configured. """ try: kc = Consumer() except TypeError as e: assert str(e) == "expected configuration dict" def dummy_commit_cb(err, partitions): pass kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100', 'session.timeout.ms': 1000, # Avoid close() blocking too long 'on_commit': dummy_commit_cb}) kc.subscribe(["test"]) kc.unsubscribe() def dummy_assign_revoke(consumer, partitions): pass kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke) kc.unsubscribe() msg = kc.poll(timeout=0.001) if msg is None: print('OK: poll() timeout') elif msg.error(): print('OK: consumer error: %s' % msg.error().str()) else: print('OK: consumed message') if msg is not None: assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1) msglist = kc.consume(num_messages=10, timeout=0.001) assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist) with pytest.raises(ValueError) as ex: kc.consume(-100) assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value) with pytest.raises(ValueError) as ex: kc.consume(1000001) assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value) partitions = list(map(lambda part: TopicPartition("test", part), range(0, 100, 3))) kc.assign(partitions) with pytest.raises(KafkaException) as ex: kc.seek(TopicPartition("test", 0, 123)) assert 'Erroneous state' in str(ex.value) # Verify assignment assignment = kc.assignment() assert partitions == assignment # Pause partitions kc.pause(partitions) # Resume partitions kc.resume(partitions) # Get cached watermarks, should all be invalid. lo, hi = kc.get_watermark_offsets(partitions[0], cached=True) assert lo == -1001 and hi == -1001 assert lo == OFFSET_INVALID and hi == OFFSET_INVALID # Query broker for watermarks, should raise an exception. try: lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5, cached=False) except KafkaException as e: assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._WAIT_COORD, KafkaError.LEADER_NOT_AVAILABLE),\ str(e.args([0])) kc.unassign() kc.commit(asynchronous=True) try: kc.commit(asynchronous=False) except KafkaException as e: assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET) # Get current position, should all be invalid. kc.position(partitions) assert len([p for p in partitions if p.offset == OFFSET_INVALID]) == len(partitions) try: kc.committed(partitions, timeout=0.001) except KafkaException as e: assert e.args[0].code() == KafkaError._TIMED_OUT try: kc.list_topics(timeout=0.2) except KafkaException as e: assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT) try: kc.list_topics(topic="hi", timeout=0.1) except KafkaException as e: assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._TRANSPORT) kc.close()
class Kafka_Confluent(object): Type = "Confluent-Kafka Wrapper Class" def __init__(self, kafka_client_config): print("=" * 50) print("Printing Kafka_Confluent kwargs...") import pprint pp = pprint.PrettyPrinter(indent=4) pp.pprint(kafka_client_config) print("=" * 50) self.broker = kafka_client_config["broker"] self.producer_params = kafka_client_config["producer_params"] self.consumer_1_params = kafka_client_config["consumer_1_params"] self.consumer_2_params = kafka_client_config["consumer_2_params"] self.producer_topic = kafka_client_config.get('producer_topic') self.consumer_1_topic = kafka_client_config.get('consumer_1_topic') self.consumer_2_topic = kafka_client_config.get('consumer_2_topic') self.producer = None self.consumer_1 = None self.consumer_2 = None # Create Producer if (self.producer_topic): self.producer_params['bootstrap.servers'] = kafka_client_config[ "broker"] self.producer = KafkaProducer(self.producer_params) print("Producer created successfully...") # Create Consumer 1 if (self.consumer_1_topic): self.consumer_1_params['bootstrap.servers'] = kafka_client_config[ "broker"] self.consumer_1 = KafkaConsumer(self.consumer_1_params) self.consumer_1.subscribe([self.consumer_1_topic]) self.consumer_1.poll(timeout=0.01) print("Consumer 1 created successfully...") # Create Consumer 2 if (self.consumer_2_topic): self.consumer_2_params['bootstrap.servers'] = kafka_client_config[ "broker"] self.consumer_2 = KafkaConsumer(self.consumer_2_params) self.consumer_2.subscribe([self.consumer_2_topic]) self.consumer_2.poll(timeout=0.01) print("Consumer 1 created successfully...") # TODO : Print Complete config def produce(self, output, source_data): value = dict_to_kafka(output, source_data) print("=" * 50) print("Producing Message") print("self.producer_topic", self.producer_topic) print("message size, ", str(len(value))) print("=" * 50) self.producer.produce(self.producer_topic, value) self.producer.poll(0) return (True) def consume1(self): print("=" * 50) print("Consuming Message") print("self.consumer_1_topic", self.consumer_1_topic) print("=" * 50) message_kafka = self.consumer_1.consume(num_messages=1)[0] message_dict = kafka_to_dict(message_kafka) return (message_dict) def consume2(self, block=True): print("=" * 50) print("Consuming Message") print("self.consumer_2_topic", self.consumer_2_topic) print("=" * 50) if (block): message_kafka = self.consumer_2.consume(num_messages=1)[0] else: message_kafka = self.consumer_2.poll(timeout=0.01) if (message_kafka): message_dict = kafka_to_dict(message_kafka) else: message_dict = None return (message_dict) def sync_consumers(self): m1 = self.consumer_1.consume(num_messages=1)[0] m2 = self.consumer_2.consume(num_messages=1)[0] m1_dict, m2_dict = kafka_to_dict(m1), kafka_to_dict(m2) try: assert (m2_dict["_id"] == m1_dict["source_id"]) except AssertionError: logger.info("Consumers not synced. 
Syncing now...") kafka_source_id = m1_dict[ "_kafka_source_id"] #"{id}:{topic}:{partition}:{offset}" consumer_2_topic_name = kafka_source_id.split(":")[-3] # 3rd last consumer_2_partition = int( kafka_source_id.split(":")[-2]) # 3rd last consumer_2_offset = int(kafka_source_id.split(":")[-1]) consumer_2_topic_partition = TopicPartition( topic=consumer_2_topic_name, partition=consumer_2_partition, offset=consumer_2_offset) # Sync Consumer 2 self.consumer_2.seek(consumer_2_topic_partition) m2 = self.consumer_2.consume(num_messages=1)[0] m2_dict = kafka_to_dict(m2) try: assert (m2_dict["_id"] == m1_dict["source_id"]) return (m1_dict, m2_dict) except AssertionError: logger.info("Consumers not synced. Unknown error.") sys.exit(0)
class KafkaConsumer(Consumer[TopicPartition, int, bytes]): """ The behavior of this consumer differs slightly from the Confluent consumer during rebalancing operations. Whenever a partition is assigned to this consumer, offsets are *always* automatically reset to the committed offset for that partition (or if no offsets have been committed for that partition, the offset is reset in accordance with the ``auto.offset.reset`` configuration value.) This causes partitions that are maintained across a rebalance to have the same offset management behavior as a partition that is moved from one consumer to another. To prevent uncommitted messages from being consumed multiple times, ``commit`` should be called in the partition revocation callback. The behavior of ``auto.offset.reset`` also differs slightly from the Confluent consumer as well: offsets are only reset during initial assignment or subsequent rebalancing operations. Any other circumstances that would otherwise lead to preemptive offset reset (e.g. the consumer tries to read a message that is before the earliest offset, or the consumer attempts to read a message that is after the latest offset) will cause an exception to be thrown, rather than resetting the offset, as this could lead to chunks messages being replayed or skipped, depending on the circumstances. This also means that if the committed offset is no longer available (such as when reading older messages from the log and those messages expire, or reading newer messages from the log and the leader crashes and partition ownership fails over to an out-of-date replica), the consumer will fail-stop rather than reset to the value of ``auto.offset.reset``. """ # Set of logical offsets that do not correspond to actual log positions. # These offsets should be considered an implementation detail of the Kafka # consumer and not used publically. # https://github.com/confluentinc/confluent-kafka-python/blob/443177e1c83d9b66ce30f5eb8775e062453a738b/tests/test_enums.py#L22-L25 LOGICAL_OFFSETS = frozenset( [OFFSET_BEGINNING, OFFSET_END, OFFSET_STORED, OFFSET_INVALID]) def __init__(self, configuration: Mapping[str, Any]) -> None: auto_offset_reset = configuration.get("auto.offset.reset", "largest") if auto_offset_reset in {"smallest", "earliest", "beginning"}: self.__resolve_partition_starting_offset = ( self.__resolve_partition_offset_earliest) elif auto_offset_reset in {"largest", "latest", "end"}: self.__resolve_partition_starting_offset = ( self.__resolve_partition_offset_latest) elif auto_offset_reset == "error": self.__resolve_partition_starting_offset = ( self.__resolve_partition_offset_error) else: raise ValueError( "invalid value for 'auto.offset.reset' configuration") # NOTE: Offsets are explicitly managed as part of the assignment # callback, so preemptively resetting offsets is not enabled. 
        self.__consumer = ConfluentConsumer({
            **configuration,
            "auto.offset.reset": "error",
        })

        self.__offsets: MutableMapping[TopicPartition, int] = {}

        self.__state = KafkaConsumerState.CONSUMING

    def __resolve_partition_offset_earliest(
            self, partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition, low)

    def __resolve_partition_offset_latest(
            self, partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        low, high = self.__consumer.get_watermark_offsets(partition)
        return ConfluentTopicPartition(partition.topic, partition.partition, high)

    def __resolve_partition_offset_error(
            self, partition: ConfluentTopicPartition) -> ConfluentTopicPartition:
        raise ConsumerError("unable to resolve partition offsets")

    def subscribe(
        self,
        topics: Sequence[str],
        on_assign: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
        on_revoke: Optional[Callable[[Sequence[TopicPartition]], None]] = None,
    ) -> None:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        def assignment_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.ASSIGNING

            try:
                assignment: MutableSequence[ConfluentTopicPartition] = []

                for partition in self.__consumer.committed(partitions):
                    if partition.offset >= 0:
                        assignment.append(partition)
                    elif partition.offset == OFFSET_INVALID:
                        assignment.append(
                            self.__resolve_partition_starting_offset(partition))
                    else:
                        raise ValueError("received unexpected offset")

                offsets: MutableMapping[TopicPartition, int] = {
                    TopicPartition(i.topic, i.partition): i.offset
                    for i in assignment
                }
                self.__seek(offsets)
            except Exception:
                self.__state = KafkaConsumerState.ERROR
                raise

            try:
                if on_assign is not None:
                    on_assign(list(offsets.keys()))
            finally:
                self.__state = KafkaConsumerState.CONSUMING

        def revocation_callback(
                consumer: ConfluentConsumer,
                partitions: Sequence[ConfluentTopicPartition]) -> None:
            self.__state = KafkaConsumerState.REVOKING

            streams = [TopicPartition(i.topic, i.partition) for i in partitions]

            try:
                if on_revoke is not None:
                    on_revoke(streams)
            finally:
                for stream in streams:
                    try:
                        self.__offsets.pop(stream)
                    except KeyError:
                        # If there was an error during assignment, this stream
                        # may have never been added to the offsets mapping.
                        logger.warning(
                            "failed to delete offset for unknown stream: %r",
                            stream)

                self.__state = KafkaConsumerState.CONSUMING

        self.__consumer.subscribe(topics,
                                  on_assign=assignment_callback,
                                  on_revoke=revocation_callback)

    def unsubscribe(self) -> None:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        self.__consumer.unsubscribe()

    def poll(self, timeout: Optional[float] = None) -> Optional[KafkaMessage]:
        if self.__state is not KafkaConsumerState.CONSUMING:
            raise InvalidState(self.__state)

        message: Optional[ConfluentMessage] = self.__consumer.poll(
            *[timeout] if timeout is not None else [])
        if message is None:
            return None

        error: Optional[KafkaError] = message.error()
        if error is not None:
            code = error.code()
            if code == KafkaError._PARTITION_EOF:
                raise EndOfStream(
                    TopicPartition(message.topic(), message.partition()),
                    message.offset(),
                )
            elif code == KafkaError._TRANSPORT:
                raise TransportError(str(error))
            else:
                raise ConsumerError(str(error))

        result = KafkaMessage(
            TopicPartition(message.topic(), message.partition()),
            message.offset(),
            message.value(),
        )

        self.__offsets[result.stream] = result.get_next_offset()

        return result

    def tell(self) -> Mapping[TopicPartition, int]:
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        return self.__offsets

    def __seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        if self.__state is KafkaConsumerState.ASSIGNING:
            # Calling ``seek`` on the Confluent consumer from an assignment
            # callback will throw an "Erroneous state" error. Instead,
            # partition offsets have to be initialized by calling ``assign``.
            self.__consumer.assign([
                ConfluentTopicPartition(stream.topic, stream.partition, offset)
                for stream, offset in offsets.items()
            ])
        else:
            for stream, offset in offsets.items():
                self.__consumer.seek(
                    ConfluentTopicPartition(stream.topic, stream.partition,
                                            offset))

        self.__offsets.update(offsets)

    def seek(self, offsets: Mapping[TopicPartition, int]) -> None:
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        if offsets.keys() - self.__offsets.keys():
            raise ConsumerError("cannot seek on unassigned streams")

        self.__seek(offsets)

    def commit(self) -> Mapping[TopicPartition, int]:
        if self.__state in {KafkaConsumerState.CLOSED, KafkaConsumerState.ERROR}:
            raise InvalidState(self.__state)

        result: Optional[Sequence[ConfluentTopicPartition]] = None

        retries_remaining = 3
        while result is None:
            try:
                result = self.__consumer.commit(asynchronous=False)
                assert result is not None
            except KafkaException as e:
                if not e.args[0].code() in (
                        KafkaError.REQUEST_TIMED_OUT,
                        KafkaError.NOT_COORDINATOR_FOR_GROUP,
                        KafkaError._WAIT_COORD,
                ):
                    raise

                if not retries_remaining:
                    raise

                logger.warning(
                    "Commit failed: %s (%d retries remaining)",
                    str(e),
                    retries_remaining,
                )
                retries_remaining -= 1
                time.sleep(1)

        offsets: MutableMapping[TopicPartition, int] = {}

        for value in result:
            # The Confluent Kafka Consumer will include logical offsets in the
            # sequence of ``TopicPartition`` objects returned by ``commit``.
            # These are an implementation detail of the Kafka Consumer, so we
            # don't expose them here.
            # NOTE: These should no longer be seen now that we are forcing
            # offsets to be set as part of the assignment callback.
            if value.offset in self.LOGICAL_OFFSETS:
                continue

            assert value.offset >= 0, "expected non-negative offset"
            offsets[TopicPartition(value.topic, value.partition)] = value.offset

        return offsets

    def close(self, timeout: Optional[float] = None) -> None:
        try:
            self.__consumer.close()
        except RuntimeError:
            pass

        self.__state = KafkaConsumerState.CLOSED
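A minimal usage sketch of the state-checked wrapper above, assuming the surrounding module names the class KafkaConsumer and exposes the TopicPartition/ConsumerError types shown earlier; the "events" topic and the process() handler are illustrative placeholders, not part of the original module.

# Hypothetical usage sketch; topic name and process() are assumptions.
def consume_forever(consumer: "KafkaConsumer") -> None:
    consumer.subscribe(["events"])
    try:
        while True:
            message = consumer.poll(1.0)
            if message is None:
                continue                 # poll timed out, nothing to do
            process(message)             # hypothetical message handler
            consumer.commit()            # synchronous commit with retries
    finally:
        consumer.close()

The commit() call relies on the wrapper's built-in retry loop for transient coordinator errors, so the caller does not need its own retry handling.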
def test_basic_api():
    """ Basic API tests; these won't really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(map(lambda part: TopicPartition("test", part),
                          range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == -1001 and hi == -1001
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE), \
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    kc.close()
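The same try/except pattern for asserting librdkafka error codes recurs throughout the test above. A small helper such as the hedged sketch below could make that intent explicit; the name expect_kafka_error is an assumption and is not part of the test suite.

# Hedged sketch: a context manager for the recurring "this call may fail
# with one of these KafkaError codes" checks. Hypothetical helper name.
from contextlib import contextmanager

@contextmanager
def expect_kafka_error(*codes):
    try:
        yield
    except KafkaException as e:
        # Only the listed broker/transport error codes are tolerated.
        assert e.args[0].code() in codes, str(e.args[0])

# Example: equivalent to the explicit try/except around committed().
# with expect_kafka_error(KafkaError._TIMED_OUT):
#     kc.committed(partitions, timeout=0.001)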
class TestConsumer(TestBaseEP):
    def __init__(self, tc_drv, cfg_sink):
        logger = logging.getLogger()

        if not cfg_sink:
            raise ValueError("'cfg_sink' is a required parameter")
        if "type" not in cfg_sink:
            raise RuntimeError("'type' NOT found in 'sink' dict")

        self.tc_drv = tc_drv
        self.tc_id = tc_drv.get_id()
        self.type = cfg_sink["type"]
        self.poll_count = 10
        self.cons = None

        if self.type == "None":
            return

        if self.type == "Kafka":
            if "kafka" not in cfg_sink:
                raise RuntimeError("'kafka' NOT found in 'sink' dict")
            super(TestConsumer, self).__init__(tc_drv, cfg_sink)

            if "group" not in cfg_sink["kafka"]:
                raise RuntimeError("'group' NOT found in 'kafka' dict")
            if not isinstance(cfg_sink["kafka"]["group"], str):
                raise TypeError("'group' must be of type 'str'")
            self.group = cfg_sink["kafka"]["group"]

            if "timeout" in cfg_sink["kafka"]:
                if not isinstance(cfg_sink["kafka"]["timeout"], int):
                    raise TypeError("'timeout' must be of type 'int'")
                self.poll_count = cfg_sink["kafka"]["timeout"]

            self.tc_drv.set_exp_type(self.type)
        elif self.type == "CFKafka":
            if "cfkafka" not in cfg_sink:
                raise RuntimeError("'cfkafka' NOT found in 'sink' dict")
            super(TestConsumer, self).__init__(tc_drv, cfg_sink)

            if "group" not in cfg_sink["cfkafka"]:
                raise RuntimeError("'group' NOT found in 'cfkafka' dict")
            if not isinstance(cfg_sink["cfkafka"]["group"], str):
                raise TypeError("'group' must be of type 'str'")
            self.group = cfg_sink["cfkafka"]["group"]

            if "timeout" in cfg_sink["cfkafka"]:
                if not isinstance(cfg_sink["cfkafka"]["timeout"], int):
                    raise TypeError("'timeout' must be of type 'int'")
                self.poll_count = cfg_sink["cfkafka"]["timeout"]

            self.tc_drv.set_exp_type(self.type)
        else:
            raise RuntimeError("Unsupported 'type'='%s' in 'sink' dict" %
                               (cfg_sink["type"]))

        if not self.cons:
            self.connect()

    def __del__(self):
        logger = logging.getLogger()
        if self.cons:
            #self.cons.unsubscribe()
            self.cons.close()

    def connect(self):
        logger = logging.getLogger()

        if self.type == "None":
            return

        if self.type == "Kafka":
            logger.debug("brokers: {}, group: {}, topic: {}".format(
                self.brokers, self.group, self.topic))
            self.cons = Consumer({
                'bootstrap.servers': self.brokers,
                'group.id': self.group,
                'default.topic.config': {
                    'auto.offset.reset': 'smallest',
                }
            })
            self.cons.subscribe([self.topic])
        elif self.type == "CFKafka":
            logger.debug(
                "brokers: {}, schema_reg: {}, group: {}, topic: {}".format(
                    self.brokers, self.schema_reg, self.group, self.topic))
            self.cons = avro.AvroConsumer({
                'bootstrap.servers': self.brokers,
                'schema.registry.url': self.schema_reg,
                'group.id': self.group,
                'default.topic.config': {
                    'auto.offset.reset': 'smallest',
                }
            })
            self.cons.subscribe([self.topic])

    def __reset_pos(self):
        logger = logging.getLogger()

        if self.type == "None":
            return

        parts = [TopicPartition(self.topic, 0)]
        (start, end) = self.cons.get_watermark_offsets(parts[0])
        logger.debug("Currently at {}/{} offset <{}, {}>".format(
            parts[0].topic, parts[0].partition, start, end))
        if end > 0:
            parts[0].offset = end - 1
            self.cons.seek(parts[0])

    def drain(self):
        logger = logging.getLogger()

        if self.type == "None":
            return

        poll_count = 60
        logger.warning("topic: {}, will timeout in {} secs".format(
            self.topic, poll_count))

        poll_num = 0
        while True:
            try:
                poll_num += 1
                msg = self.cons.poll(timeout=1.0)
            except SerializerError as exc:
                continue

            if msg is None:
                if poll_num >= poll_count:
                    break
            elif msg.error():
                break

    def rx_one(self):
        logger = logging.getLogger()

        if self.type == "None":
            return None

        logger.warning("will timeout in {} secs".format(self.poll_count))

        #logger.debug("going to consume/poll")
        #msgs = self.cons.consume(num_messages=1, timeout=5.0)
        #if not msgs:
        #    raise RuntimeError("No msg received, timed-out!")

        poll_num = 0
        while True:
            try:
                poll_num += 1
                msg = self.cons.poll(timeout=1.0)
            except SerializerError as exc:
                raise RuntimeError(
                    "Message deserialization failed: {}".format(exc))

            if msg is None:
                #parts = self.cons.position(parts)
                #logger.debug("Currently at {}/{} offset {}".format(
                #    parts[0].topic, parts[0].partition, parts[0].offset))
                if poll_num < self.poll_count:
                    continue
                else:
                    raise RuntimeError(
                        "No msg received via {}, timed-out!".format(self.topic))
            elif not msg.error():
                break
            elif msg.error().code() == KafkaError._PARTITION_EOF:
                #raise RuntimeError("End of partition reached {}/{}".format(
                #    msg.topic(), msg.partition()))
                #self.__reset_pos()
                #break
                continue
            else:
                raise RuntimeError(msg.error().str())

        test_out = msg.value()
        logger.debug("RX'ed '{}' : '{}'".format(type(test_out), test_out))
        if self.type == "CFKafka":
            self.tc_drv.store_rx_one(test_out)
        self.cons.commit()
        return test_out