def commit_offsets(self, consumer_id: str, offsets: List[TopicPartition]):
    """Synchronously commit ``offsets`` for the consumer group ``consumer_id``.

    A short-lived consumer is created with ``consumer_id`` as its
    ``group.id`` on top of the settings returned by
    ``Config.get_instance().create_confluent_config()``.

    :param consumer_id: Consumer group id the offsets are committed for.
    :param offsets: Topic partitions (with offsets) to commit.
    """
    config = Config.get_instance()
    consumer = Consumer({
        "group.id": consumer_id,
        **config.create_confluent_config()
    })
    try:
        consumer.commit(offsets=offsets, asynchronous=False)
    finally:
        # Release the client even when the commit fails.
        consumer.close()
def kafka_consume_expected(topic,
                           group='0',
                           timeout=1.0,
                           mfilter=lambda x: True,
                           validator=lambda x: None,
                           after_subscribe=lambda: None):
    """Consume ``topic`` until a poll times out and return the accepted messages.

    :param topic: Topic to subscribe to.
    :param group: Consumer group id.
    :param timeout: Timeout passed to each ``consumer.poll`` call; the first
        poll that returns ``None`` ends consumption.
    :param mfilter: Predicate called with each error-free message; only
        messages for which it returns a truthy value are validated and kept.
    :param validator: Called with every kept message before it is stored;
        any exception it raises propagates to the caller.
    :param after_subscribe: Hook called once after subscribing (and after
        the rebalance wait), before polling starts.
    :return: List of kept messages; empty when the topic metadata reports
        an error.
    """
    consumer = Consumer({
        'bootstrap.servers': KAFK,
        'group.id': group,
        'auto.offset.reset': 'earliest'  # earliest _committed_ offset
    })
    msgs = []
    # try/finally so the consumer is closed on the early return and when the
    # validator (or any Kafka call) raises.
    try:
        topics = consumer.list_topics(topic)  # promises to create topic
        logging.debug("Topic state: %s", topics.topics)
        if topics.topics[topic].error is not None:
            logging.warning("Error subscribing to topic: %s", topics.topics)
            return msgs

        consumer.subscribe([topic])
        time.sleep(5)  # for kafka to rebalance consumer groups
        after_subscribe()

        logging.debug("Waiting for messages...")
        while True:
            msg = consumer.poll(timeout)
            if msg is None:
                break
            logging.info("Seen message: %r %r", msg.key(), msg.value())
            if msg.error():
                logging.warning("Consumer error: {}".format(msg.error()))
                continue
            if mfilter(msg):
                validator(msg)
                msgs.append(msg)

        consumer.commit()
    finally:
        consumer.close()
    return msgs
def reset_offsets_from_partitions(client: AdminClient, brokers: str,
                                  app_name: str, input_topic: str):
    """Commit offset 0 for every partition of ``input_topic`` for group ``app_name``.

    :param client: Admin client handed to ``get_topic`` to look up the topic.
    :param brokers: Value used for ``bootstrap.servers``.
    :param app_name: Consumer group id whose offsets are reset.
    :param input_topic: Topic whose partitions are reset.
    :raises FaustAppCleanException: If the synchronous commit does not
        return a list.
    """
    topic_description = get_topic(client, input_topic)
    partitions = [
        TopicPartition(input_topic, partition_metadata.id, 0)
        for partition_metadata in topic_description.partitions.values()
    ]
    consumer = Consumer({
        'bootstrap.servers': brokers,
        'group.id': app_name,
        'session.timeout.ms': 6000
    })
    try:
        response = consumer.commit(offsets=partitions, asynchronous=False)
    finally:
        # The consumer only exists for this one commit; always release it.
        consumer.close()
    if not isinstance(response, list):
        raise FaustAppCleanException("Error while cleaning the Faust app!")
class TimeOrderedGeneratorWithTimeout(GeneratorInterface):
    """
    A general generator which can read multiple topics and merge their messages in time order.
    A message must be emitted at (arrival_system_time + latency_ms).
    In batch mode (until reaching the first EOP on each stream) the generator will not discard any messages.
    """

    def __init__(self,
                 broker,
                 groupid,
                 topics_infos: List[TopicInfo],
                 latency_ms,
                 commit_interval_sec=None,
                 group_by_time=False,
                 begin_timestamp=None,
                 begin_flag=None,
                 end_timestamp=None,
                 end_flag=None,
                 heartbeat_interval_ms=-1):
        """
        :param broker: Broker to connect to.
        :param groupid: Group id of the consumer.
        :param topics_infos: [TopicInfo()] - list of TopicInfo objects.
        :param latency_ms: (integer >=0) Latency to wait before serving a message.
                            After this messages with lower or equal timestamps will be discarded.
        :param commit_interval_sec: How many seconds to wait between commits.
                            None (the default) disables committing with the given group id.
        :param group_by_time: Group messages with the same timestamp. This will yield a list of messages.
        :param begin_timestamp: Timestamp of the kafka messages where the generator will start.
        :param begin_flag: BEGINNING, CONTINUE, CONTINUE_OR_BEGINNING, LIVE - CONTINUE will continue
                            from the last committed offset. If there was no committed offset it will
                            start from the end of the stream ('auto.offset.reset' is 'latest');
                            CONTINUE_OR_BEGINNING sets 'auto.offset.reset' to 'earliest' instead.
        :param end_timestamp: Timestamp where to end the reading.
        :param end_flag: NEVER, END_OF_PARTITION
        :param heartbeat_interval_ms: -1 does not produce heartbeat. After every interval will produce
                            a HeartBeat typed message with the timestamp.
        :raises Exception: On contradictory or unknown begin/end arguments.
        """
        if begin_timestamp is not None and begin_flag is not None:
            raise Exception(
                'You can not set the begin timestamp and a flag in the same time.'
            )
        if end_timestamp is not None and end_flag is not None:
            raise Exception(
                'You can not set the end timestamp and a flag in the same time.'
            )
        if begin_timestamp is not None and end_timestamp is not None and begin_timestamp >= end_timestamp:
            raise Exception(
                'The begin timestamp is larger then the end timestamp.')
        if begin_flag is not None and end_flag is not None and \
                begin_flag == BeginFlag.LIVE and end_flag == EndFlag.END_OF_PARTITION:
            raise Exception(
                'You can not start in live and process until the end of the streams.'
            )
        if end_flag is not None and not (end_flag == EndFlag.END_OF_PARTITION
                                         or end_flag == EndFlag.NEVER):
            raise Exception(
                'Unknow end flag: {} . Please use the given enum to use proper end flag.'
                .format(end_flag))
        self.end_ts = end_timestamp
        self.end_flag = end_flag
        self.commit_interval_sec = commit_interval_sec
        self.latency_ms = latency_ms
        self.group_by_time = group_by_time
        self.max_poll_interval_ms = 5 * 60 * 1000
        self.consumer = Consumer({
            'bootstrap.servers': broker,
            'group.id': groupid,
            'enable.auto.commit': False,
            'auto.offset.reset':
                'earliest'
                if begin_flag == BeginFlag.CONTINUE_OR_BEGINNING else 'latest',
            'fetch.wait.max.ms': 20,
            'max.poll.interval.ms': self.max_poll_interval_ms,
            'enable.partition.eof': True
        })
        self.last_poll = None
        self.tps = []
        self.queues = {}
        # Per topic: the last message handed out and whether it was committed.
        self.messages_to_be_committed = {}
        self.begin_timestamp = begin_timestamp
        for ti in topics_infos:
            topic_name = ti.topic
            self.messages_to_be_committed[topic_name] = {
                'last_msg': None,
                'committed': True
            }
            if begin_timestamp is not None:
                self.tps.extend(
                    self.consumer.offsets_for_times([
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=begin_timestamp)
                    ]))
            elif begin_flag is not None:
                if begin_flag == BeginFlag.BEGINNING:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_BEGINNING))
                elif begin_flag in (BeginFlag.CONTINUE,
                                    BeginFlag.CONTINUE_OR_BEGINNING):
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_STORED))
                elif begin_flag == BeginFlag.LIVE:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_END))
                else:
                    raise Exception(
                        'Unknown begin flag. Please use the enum to provide proper begin flag.'
                    )
            else:
                self.tps.append(
                    TopicPartition(topic_name,
                                   partition=ti.partition,
                                   offset=OFFSET_END))
            end_offset = None
            if end_flag is not None and end_flag == EndFlag.END_OF_PARTITION:
                # Use the partition that is actually assigned and read; the
                # high watermark of partition 0 is wrong for any other one.
                end_offset = self.consumer.get_watermark_offsets(
                    TopicPartition(topic_name, ti.partition))[1] - 1
            # A negative end offset means the partition is empty: no queue is
            # created for it.
            if end_offset is None or end_offset >= 0:
                self.queues[topic_name] = Topic(topic_name,
                                                self.consumer,
                                                end_offset=end_offset,
                                                partition=ti.partition,
                                                drop=ti.drop)
        self.consumer.assign(self.tps)
        self.last_commit = time.time()
        self.running = True
        self.heartbeat_interval_ms = heartbeat_interval_ms
        self.next_hb = None

    def stopGenerator(self):
        """Ask the generator loops to stop at their next ``running`` check."""
        self.running = False

    def _serve_messages(self, message_to_serve):
        """Yield ``message_to_serve`` and commit delivered offsets when due.

        With ``group_by_time`` the whole list is yielded once; otherwise the
        messages are yielded one by one until ``running`` becomes False.
        After serving, if ``commit_interval_sec`` is set and more than that
        many seconds passed since the last commit, the last served message of
        every topic with uncommitted progress is committed.
        """
        if self.commit_interval_sec is not None and self.group_by_time:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False

        # serve messages
        if self.group_by_time:
            yield message_to_serve
        else:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False
                yield msg
                if not self.running:
                    break

        # commit messages when they were delivered
        current_time = time.time()
        if self.commit_interval_sec is not None and (
                current_time - self.last_commit) > self.commit_interval_sec:
            for state in self.messages_to_be_committed.values():
                if not state['committed']:
                    self.consumer.commit(state['last_msg'])
                    state['committed'] = True
            self.last_commit = current_time

    def _serve_heartbeat(self, current_timestamp_ms):
        """Yield every pending HeartBeat up to and including ``current_timestamp_ms``.

        The first heartbeat is anchored at ``begin_timestamp`` when one was
        given, otherwise at ``current_timestamp_ms``; following ones are
        ``heartbeat_interval_ms`` apart.
        """
        if self.next_hb is None:
            if self.begin_timestamp is not None:
                self.next_hb = self.begin_timestamp
            else:
                self.next_hb = current_timestamp_ms
        while self.next_hb <= current_timestamp_ms:
            yield HeartBeat(self.next_hb)
            self.next_hb += self.heartbeat_interval_ms

    def _can_serve(self):
        """Return the smallest queued message timestamp if it may be served, else None.

        It may be served when every queue accepts it (``can_be_emitted``) and
        at least one queue head carrying that timestamp has a ``ts`` older
        than ``now - latency_ms``.
        """
        min_ets = min([
            q.queue[0].message.timestamp()[1]
            for q in self.queues.values()
            if len(q.queue) > 0
        ],
                      default=-1)
        if min_ets == -1:
            # Nothing is queued at all.
            return None
        deadline = getSystemTimestamp() - self.latency_ms
        if all([q.can_be_emitted(min_ets) for q in self.queues.values()]) and \
                any([q.queue[0].ts < deadline for q in self.queues.values()
                     if len(q.queue) > 0 and q.queue[0].message.timestamp()[1] == min_ets]):
            return min_ets
        else:
            return None

    def getMessages(self):
        """Generate messages (and optional heartbeats) of all topics in time order.

        Polls the consumer, queues incoming messages per topic and serves
        whatever ``_can_serve`` allows. Once every queue is stopped the
        remaining queued messages are flushed in timestamp order and the
        generator ends. The consumer is closed when the loop ends.
        """
        while self.running:
            if all([v.stopped for v in self.queues.values()]):
                # Every topic is finished: flush what is still queued.
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.queue)
                message_to_serve = [m.message for m in message_to_serve]
                message_to_serve.sort(key=lambda x: x.timestamp()[1])
                while len(message_to_serve) > 0:
                    ts = message_to_serve[0].timestamp()[1]
                    serve_it = []
                    while len(message_to_serve) > 0 and message_to_serve[
                            0].timestamp()[1] == ts:
                        serve_it.append(message_to_serve.pop(0))
                    if not self.heartbeat_interval_ms == -1:
                        yield from self._serve_heartbeat(ts)
                    yield from self._serve_messages(serve_it)
                logging.info('Exiting from generator.')
                break
            self.last_poll = getSystemTimestamp()
            msg = self.consumer.poll(0.001)
            if msg is not None:
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        if msg.topic() in self.queues:
                            self.queues[msg.topic()].first_eop_reached = True
                            self.queues[msg.topic()].end_of_partition = True
                    else:
                        logging.error('Unhandle error: {}'.format(msg.error()))
                        break
                else:
                    self.queues[msg.topic()].end_of_partition = False
                    if self.end_ts is not None and msg.timestamp(
                    )[1] > self.end_ts:
                        self.queues[msg.topic()].stop_topic()
                    else:
                        self.queues[msg.topic()].add_message(msg)
            while self.running:
                event_ts_to_serve = self._can_serve()
                # Go back to polling when nothing can be served, or when less
                # than 30 s remain before max.poll.interval.ms would elapse.
                if event_ts_to_serve is None or \
                        self.max_poll_interval_ms - (getSystemTimestamp() - self.last_poll) < 30000:
                    if self.end_flag == EndFlag.NEVER and self.heartbeat_interval_ms != -1 \
                            and any([q.end_of_partition for q in self.queues.values()]):
                        if self.next_hb is None:
                            self.next_hb = min(
                                getSystemTimestamp() - self.latency_ms,
                                min([
                                    q.queue[0].message.timestamp()[1]
                                    for q in self.queues.values()
                                    if len(q.queue) > 0
                                ],
                                    default=sys.maxsize))
                        if self.next_hb < min(
                                getSystemTimestamp() - self.latency_ms,
                                min([
                                    q.queue[0].message.timestamp()[1]
                                    for q in self.queues.values()
                                    if len(q.queue) > 0
                                ],
                                    default=sys.maxsize)):
                            yield from self._serve_heartbeat(self.next_hb)
                    break
                if self.heartbeat_interval_ms != -1:
                    yield from self._serve_heartbeat(event_ts_to_serve)
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.get_messages(event_ts_to_serve))
                yield from self._serve_messages(message_to_serve)
                if self.end_ts is not None and self.end_ts <= event_ts_to_serve:
                    self.running = False
        self.consumer.close()
class AsyncWorker(object):
    """
    Fetches from Kafka topics and processes them.

    :param consumer_topic: Name of the Kafka topic for consume.
    :type consumer_topic: str
    :param service: Service function which is executed every time when job is processed.
        Service must get as argument str or dict type object.
    :type service: callable
    :param consumer_conf: config for Kafka consumer.
    :type consumer_conf: dict
    :param failed_topic: Kafka topic for produce unprocessed messages from consumer_topic.
    :type failed_topic: str
    :param producer_conf: config for Kafka producer for producing unprocessed messages.
    :type producer_conf: dict
    """

    def __init__(self, consumer_topic: str, service: Callable,
                 consumer_conf: dict, failed_topic: str, producer_conf: dict):
        self._consumer_topic = consumer_topic
        self._consumer = Consumer(consumer_conf)
        self._service = service
        # use naming like <project name>_<version>_<consumer_topic><retry/failed>
        self._failed_topic = failed_topic
        self._producer = AsyncProducer(producer_conf)

    def __repr__(self):
        """Return the string representation of the worker.

        :return: String representation of the worker.
        :rtype: str
        """
        return 'Worker(Consumer={}, consume_topic={})'.format(
            self._consumer, self._consumer_topic)

    def __del__(self):  # pragma: no cover
        # Best-effort cleanup: the consumer may already be closed (or never
        # created), so failures are deliberately ignored here.
        # noinspection PyBroadException
        try:
            self._consumer.close()
        except Exception:
            pass

    async def _exec_service(self, message_value):
        """Run the service with ``message_value``, awaiting it when it is a coroutine function."""
        if iscoroutinefunction(self._service):
            res = await self._service(message_value)
        else:
            res = self._service(message_value)
        return res

    async def _process_message(self, msg: Message):
        """
        De-serialize message and execute service.

        When the service raises, the raw message value is produced to the
        failed topic together with the error text.

        :param msg: Kafka message.
        :type msg: confluent_kafka.Message
        """
        # topic/partition/offset are methods on the message and must be
        # called; formatting the bound methods logs useless reprs.
        LOGGER.info(
            'Processing Message(topic={}, partition={}, offset={}) ...'.format(
                msg.topic(), msg.partition(), msg.offset()))
        service_repr = get_call_repr(self._service)
        LOGGER.info('Executing job {}'.format(service_repr))
        try:
            message_value = _decode_msg_value(msg.value())
            res = await self._exec_service(message_value)
        except KeyboardInterrupt:
            LOGGER.error('Job was interrupted: {}'.format(msg.offset()))
        except Exception as err:
            LOGGER.exception('Job {} raised an exception: {}'.format(
                msg.offset(), err))
            await self._producer.produce(topic=self._failed_topic,
                                         value=msg.value(),
                                         error=str(err))
        else:
            LOGGER.info('Job {} returned: {}'.format(msg.offset(), res))

    @property
    def consumer_topic(self):
        """Return the name of the Kafka topic.

        :return: Name of the Kafka topic.
        :rtype: str
        """
        return self._consumer_topic

    @property
    def consumer(self):
        """Return the Kafka consumer instance.

        :return: The consumer built from ``consumer_conf`` in ``__init__``.
        """
        return self._consumer

    @property
    def service(self):
        """Return the service function.

        :return: Callback function, or None if not set.
        :rtype: callable | None
        """
        return self._service

    async def start(self,
                    max_messages: int = math.inf,
                    commit_offsets: bool = True) -> int:
        """Start processing Kafka messages and executing jobs.

        Messages are fetched in batches on the default executor and each one
        is processed in its own task. Messages carrying a consumer error are
        logged and skipped. Before returning, outstanding processing tasks
        are awaited and the consumer is closed.

        :param max_messages: Maximum number of Kafka messages to process
            before stopping. If not set, worker runs until interrupted.
        :type max_messages: int
        :param commit_offsets: If set to True, consumer offsets are committed
            every time a message is processed (default: True).
        :type commit_offsets: bool
        :return: Total number of messages processed.
        :rtype: int
        """
        LOGGER.info('Starting {} ...'.format(self))

        self._consumer.unsubscribe()
        self._consumer.subscribe([self.consumer_topic])
        LOGGER.info(" Try get messages from position: {}".format(
            self._consumer.position(self._consumer.assignment())))

        loop = asyncio.get_event_loop()
        # The event loop keeps only weak references to tasks; hold strong
        # ones so a job cannot be garbage-collected while it is running.
        pending = set()
        messages_processed = 0
        while messages_processed < max_messages:
            # Never fetch more than is still allowed by max_messages.
            batch_size = int(min(10, max_messages - messages_processed))
            # awaiting place for processing messages in other coroutines
            messages = await loop.run_in_executor(
                None, partial(self._consumer.consume, batch_size, 2.0))
            LOGGER.debug(" Try get messages from position: {}".format(
                self._consumer.position(self._consumer.assignment())))
            if not messages:
                LOGGER.debug("Messages not found")
                continue
            for msg in messages:
                if msg.error():
                    # An error event carries no job payload; do not run it.
                    LOGGER.error("Consumer error: {}".format(msg.error()))
                    continue
                LOGGER.info("Get message with offset {}".format(msg.offset()))
                task = asyncio.create_task(self._process_message(msg))
                pending.add(task)
                task.add_done_callback(pending.discard)

                if commit_offsets:
                    self._consumer.commit()

                messages_processed += 1

        if pending:
            # Let in-flight jobs finish before tearing the worker down.
            await asyncio.gather(*pending, return_exceptions=True)
        self._consumer.close()
        return messages_processed
class AioConsumer:
    """Asyncio wrapper that polls a Kafka consumer and feeds JSON messages to a handler."""

    def __init__(self,
                 config,
                 topics: list,
                 group_id: str,
                 handler,
                 max_retry=-1,
                 consumer_no=0,
                 timeout=1,
                 loop=None,
                 exe=None):
        """
        Create the underlying consumer and subscribe it to ``topics``.

        :param config: kafka consumer config (deep-copied, not modified)
        :param topics: topics to subscribe to
        :param group_id: value used as ``group.id``
        :param handler: callable or coroutine function called with each decoded message
        :param max_retry: number of retries after a failed consumption. -1: no retry
            (stored only; no code in this class reads it)
        :param consumer_no: consumer number, used as suffix of ``name``
        :param timeout: poll timeout
        :param loop: event loop to use; defaults to ``asyncio.get_event_loop()``
        :param exe: executor for the blocking consumer calls (None: loop default)
        """
        self.loop = loop or asyncio.get_event_loop()
        assert config is not None, 'init kafka consumer error, config is None'
        _config = copy.deepcopy(config)
        _config['group.id'] = group_id
        _config['on_commit'] = self.commit_completed
        self.handler = handler
        self.consumer = Consumer(_config)
        self.consumer.subscribe(topics)
        # Not every callable has __name__ (e.g. functools.partial objects).
        handler_name = getattr(self.handler, '__name__',
                               type(self.handler).__name__)
        self.redis_retry_key = f'{"_".join(topics)}_{handler_name}'
        self.name = f'{self.redis_retry_key}_{consumer_no}'
        self.max_retry = max_retry
        self.exe = exe
        self.timeout = timeout
        # 'INIT' -> 'RUNNING' -> 'STOP'
        self.status = 'INIT'

    @staticmethod
    def commit_completed(err, partitions):
        """Commit callback: log the error, or the committed partition offsets."""
        if err:
            logger.info(str(err))
        else:
            logger.info("Committed partition offsets: " + str(partitions))

    async def poll(self):
        """Poll the consumer once in the executor and return the result."""
        return await self.loop.run_in_executor(self.exe, self.consumer.poll,
                                               self.timeout)

    async def _get_message_from_kafka(self):
        """Return the value of the next message, or None when the poll was empty.

        :raises KafkaException: When the polled message carries an error.
        """
        poll_message = await self.poll()
        if not poll_message:
            return None
        elif poll_message.error():
            raise KafkaException(poll_message.error())
        else:
            return poll_message.value()

    async def run(self):
        """Consume messages until ``stop()`` is called, then close the consumer.

        Each message value is decoded as JSON and passed to the handler; a
        synchronous commit follows every successful handler call. Handler
        failures and undecodable payloads are logged and skipped. The
        consumer is closed even when polling raises.
        """
        # Nothing else in this class moves the status out of 'INIT'; without
        # this transition the loop below would never execute. A stop()
        # issued before run() is still honoured.
        if self.status == 'INIT':
            self.status = 'RUNNING'
        try:
            while self.status == 'RUNNING':
                str_message = await self._get_message_from_kafka()
                try:
                    message = json.loads(str_message or '{}')
                except ValueError as e:
                    # A malformed payload must not take the consumer down.
                    logger.warning(
                        f'{str(self)} invalid JSON payload: {e.args}. msg: {str_message}'
                    )
                    continue
                if not message:
                    await asyncio.sleep(1)
                    continue
                try:
                    if asyncio.iscoroutinefunction(self.handler):
                        await self.handler(message)
                    else:
                        self.handler(message)
                    await self.commit()
                except Exception as e:
                    logger.warning(
                        f'{str(self)} handler error: {e.args}. msg: {str_message}'
                    )
        finally:
            await self.close()

    async def commit(self):
        """Synchronously commit the current offsets in the executor."""

        def _commit():
            self.consumer.commit(asynchronous=False)

        await self.loop.run_in_executor(self.exe, _commit)

    async def close(self):
        """Commit one last time, then close the consumer.

        A failing final commit is logged and does not prevent the consumer
        from being closed.
        """
        self.status = 'STOP'
        try:
            await self.commit()
        except KafkaException as e:
            logger.warning(f'{self.name} final commit failed: {e}')
        finally:
            await self.loop.run_in_executor(self.exe, self.consumer.close)
        logger.info(f'{self.name} closed')

    def stop(self):
        """Request the ``run`` loop to end after the current iteration."""
        self.status = 'STOP'