Example #1
0
 def commit_offsets(self, consumer_id: str, offsets: List[TopicPartition]):
     """Synchronously commit ``offsets`` on behalf of a consumer group.

     A short-lived consumer is created with ``consumer_id`` as its
     ``group.id`` (the project configuration is merged afterwards, so it
     takes precedence on any key it also defines), the offsets are
     committed synchronously, and the consumer is always closed.

     :param consumer_id: Value used for the ``group.id`` setting.
     :param offsets: Topic partitions (with offsets) to commit.
     """
     config = Config.get_instance()
     consumer = Consumer({
         "group.id": consumer_id,
         **config.create_confluent_config()
     })
     try:
         consumer.commit(offsets=offsets, asynchronous=False)
     finally:
         # Release the client even when the commit raises; otherwise the
         # consumer (and its group membership) is leaked.
         consumer.close()
Example #2
0
def kafka_consume_expected(topic,
                           group='0',
                           timeout=1.0,
                           mfilter=lambda x: True,
                           validator=lambda x: None,
                           after_subscribe=lambda: None):
    """Consume ``topic`` until a poll times out and return the accepted messages.

    :param topic: Name of the topic to subscribe to.
    :param group: Consumer ``group.id``.
    :param timeout: Poll timeout in seconds; the first poll that returns
        nothing ends the consumption loop.
    :param mfilter: Predicate called with each non-error message; only
        messages for which it returns a truthy value are validated and kept.
    :param validator: Callable invoked with each kept message before it is
        appended to the result.
    :param after_subscribe: Callable invoked once after subscribing (and
        after the rebalance wait), before polling starts.
    :return: List of kept messages; empty when the topic metadata reports
        an error.
    """
    consumer = Consumer({
        'bootstrap.servers': KAFK,
        'group.id': group,
        'auto.offset.reset': 'earliest'  # earliest _committed_ offset
    })
    msgs = []
    try:
        topics = consumer.list_topics(topic)  # promises to create topic
        logging.debug("Topic state: %s", topics.topics)
        if topics.topics[topic].error is not None:
            logging.warning("Error subscribing to topic: %s", topics.topics)
            return msgs
        consumer.subscribe([topic])
        time.sleep(5)  # for kafka to rebalance consumer groups

        after_subscribe()

        logging.debug("Waiting for messages...")
        while True:
            msg = consumer.poll(timeout)

            if msg is None:
                break

            logging.info("Seen message: %r %r", msg.key(), msg.value())

            if msg.error():
                logging.warning("Consumer error: %s", msg.error())
                continue

            if mfilter(msg):
                validator(msg)
                msgs.append(msg)

        consumer.commit()
    finally:
        # Always release the consumer: the early return above and any
        # exception from the callbacks previously leaked it.
        consumer.close()

    return msgs
Example #3
0
def reset_offsets_from_partitions(client: AdminClient, brokers: str,
                                  app_name: str, input_topic: str):
    """Commit offset 0 for every partition of ``input_topic`` for a group.

    :param client: Admin client passed to ``get_topic`` to look up the topic.
    :param brokers: Value for the consumer's ``bootstrap.servers`` setting.
    :param app_name: Consumer ``group.id`` whose offsets are reset.
    :param input_topic: Name of the topic whose partitions are reset.
    :raises FaustAppCleanException: If the synchronous commit does not
        return a list.
    """
    topic_description = get_topic(client, input_topic)
    partitions = [
        TopicPartition(input_topic, partition_metadata.id, 0)
        for partition_metadata in topic_description.partitions.values()
    ]
    consumer = Consumer({
        'bootstrap.servers': brokers,
        'group.id': app_name,
        'session.timeout.ms': 6000
    })
    try:
        response = consumer.commit(offsets=partitions, asynchronous=False)
    finally:
        # The consumer was previously never closed, leaking the client.
        consumer.close()
    if not isinstance(response, list):
        raise FaustAppCleanException("Error while cleaning the Faust app!")
Example #4
0
class TimeOrderedGeneratorWithTimeout(GeneratorInterface):
    """
    A general generator which can read multiple topics and merge their messages in time order.
    A message must be emitted at (arrival_system_time + latency_ms).
    In batch mode (until reaching the first EOP on each stream) the generator will not discard any messages.
    """
    def __init__(self,
                 broker,
                 groupid,
                 topics_infos: List[TopicInfo],
                 latency_ms,
                 commit_interval_sec=None,
                 group_by_time=False,
                 begin_timestamp=None,
                 begin_flag=None,
                 end_timestamp=None,
                 end_flag=None,
                 heartbeat_interval_ms=-1):
        """
        :param broker: Broker to connect to.
        :param groupid: Group id of the consumer.
        :param topics_infos: [TopicInfo()] - list of TopicInfo objects.
        :param latency_ms: (integer >=0) Latency to wait before serving a message.
                            After this messages with lower or equal timestamps will be discarded.
        :param commit_interval_sec: How many seconds to wait between commits. None (the default) disables
                            committing with the given group id.
        :param group_by_time: Group messages with the same timestamp. This will yield a list of messages.
        :param begin_timestamp: Timestamp of the kafka messages where the generator will start.
                            Mutually exclusive with begin_flag.
        :param begin_flag: BEGINNING, CONTINUE, CONTINUE_OR_BEGINNING, LIVE - CONTINUE and
                            CONTINUE_OR_BEGINNING start from the stored offset; the consumer's
                            'auto.offset.reset' is 'earliest' only for CONTINUE_OR_BEGINNING and
                            'latest' otherwise. When neither begin_timestamp nor begin_flag is given
                            the partitions are assigned at the end of the stream.
        :param end_timestamp: Timestamp where to end the reading. Mutually exclusive with end_flag.
        :param end_flag: NEVER, END_OF_PARTITION
        :param heartbeat_interval_ms: -1 does not produce heartbeat. After every interval will produce a HeartBeat typed
                                        message with the timestamp.
        :raises Exception: If the begin/end arguments are contradictory or a flag is unknown.
        """
        if begin_timestamp is not None and begin_flag is not None:
            raise Exception(
                'You can not set the begin timestamp and a flag in the same time.'
            )
        if end_timestamp is not None and end_flag is not None:
            raise Exception(
                'You can not set the end timestamp and a flag in the same time.'
            )
        if begin_timestamp is not None and end_timestamp is not None and begin_timestamp >= end_timestamp:
            raise Exception(
                'The begin timestamp is larger then the end timestamp.')
        if begin_flag is not None and end_flag is not None and \
                begin_flag == BeginFlag.LIVE and end_flag == EndFlag.END_OF_PARTITION:
            raise Exception(
                'You can not start in live and process until the end of the streams.'
            )
        if end_flag is not None and not (end_flag == EndFlag.END_OF_PARTITION
                                         or end_flag == EndFlag.NEVER):
            raise Exception(
                'Unknow end flag: {} . Please use the given enum to use proper end flag.'
                .format(end_flag))
        self.end_ts = end_timestamp
        self.end_flag = end_flag
        self.commit_interval_sec = commit_interval_sec
        self.latency_ms = latency_ms
        self.group_by_time = group_by_time
        self.max_poll_interval_ms = 5 * 60 * 1000
        self.consumer = Consumer({
            'bootstrap.servers': broker,
            'group.id': groupid,
            # Offsets are committed explicitly in _serve_messages.
            'enable.auto.commit': False,
            'auto.offset.reset': (
                'earliest'
                if begin_flag == BeginFlag.CONTINUE_OR_BEGINNING else 'latest'),
            'fetch.wait.max.ms': 20,
            'max.poll.interval.ms': self.max_poll_interval_ms,
            # getMessages relies on the partition-EOF events.
            'enable.partition.eof': True
        })
        self.last_poll = None

        self.tps = []
        self.queues = {}
        self.messages_to_be_committed = {}
        self.begin_timestamp = begin_timestamp
        for ti in topics_infos:
            topic_name = ti.topic
            self.messages_to_be_committed[topic_name] = {
                'last_msg': None,
                'committed': True
            }
            if begin_timestamp is not None:
                # The timestamp is passed in the offset field and resolved
                # to a real offset by offsets_for_times.
                self.tps.extend(
                    self.consumer.offsets_for_times([
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=begin_timestamp)
                    ]))
            elif begin_flag is not None:
                if begin_flag == BeginFlag.BEGINNING:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_BEGINNING))
                elif begin_flag in (BeginFlag.CONTINUE,
                                    BeginFlag.CONTINUE_OR_BEGINNING):
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_STORED))
                elif begin_flag == BeginFlag.LIVE:
                    self.tps.append(
                        TopicPartition(topic_name,
                                       partition=ti.partition,
                                       offset=OFFSET_END))
                else:
                    raise Exception(
                        'Unknown begin flag. Please use the enum to provide proper begin flag.'
                    )
            else:
                self.tps.append(
                    TopicPartition(topic_name,
                                   partition=ti.partition,
                                   offset=OFFSET_END))
            end_offset = None
            if end_flag is not None and end_flag == EndFlag.END_OF_PARTITION:
                # Query the high watermark of the partition that is actually
                # consumed (previously partition 0 was hard-coded here).
                end_offset = self.consumer.get_watermark_offsets(
                    TopicPartition(topic_name, ti.partition))[1] - 1
            # A negative end offset means the high watermark is 0; no queue
            # is created for such a topic.
            if end_offset is None or end_offset >= 0:
                self.queues[topic_name] = Topic(topic_name,
                                                self.consumer,
                                                end_offset=end_offset,
                                                partition=ti.partition,
                                                drop=ti.drop)
        self.consumer.assign(self.tps)
        self.last_commit = time.time()
        self.running = True
        self.heartbeat_interval_ms = heartbeat_interval_ms
        self.next_hb = None

    def stopGenerator(self):
        """Ask getMessages to stop; the flag is checked between served messages."""
        self.running = False

    def _serve_messages(self, message_to_serve):
        """
        Yield the given messages and commit the served offsets when due.

        :param message_to_serve: List of Kafka messages to hand to the caller.
        :return: Generator yielding the whole list at once when group_by_time is set,
                 otherwise the messages one by one.
        """
        # In grouped mode the whole list is yielded at once, so the last
        # message per topic is recorded up front (only when committing is on).
        if self.commit_interval_sec is not None and self.group_by_time:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False

        # serve messages
        if self.group_by_time:
            yield message_to_serve
        else:
            for msg in message_to_serve:
                self.messages_to_be_committed[msg.topic()]['last_msg'] = msg
                self.messages_to_be_committed[msg.topic()]['committed'] = False
                yield msg
                if not self.running:
                    break

        # commit messages when they were delivered
        current_time = time.time()
        if self.commit_interval_sec is not None and (
                current_time - self.last_commit) > self.commit_interval_sec:
            for k in self.messages_to_be_committed.keys():
                if not self.messages_to_be_committed[k]['committed']:
                    self.consumer.commit(
                        self.messages_to_be_committed[k]['last_msg'])
                    self.messages_to_be_committed[k]['committed'] = True
            self.last_commit = current_time

    def _serve_heartbeat(self, current_timestamp_ms):
        """
        Yield HeartBeat objects for every heartbeat slot up to the given timestamp.

        The first slot is begin_timestamp when one was given, otherwise the
        timestamp of the first call; afterwards slots advance by heartbeat_interval_ms.

        :param current_timestamp_ms: Timestamp up to which (inclusive) heartbeats are emitted.
        """
        if self.next_hb is None:
            if self.begin_timestamp is not None:
                self.next_hb = self.begin_timestamp
            else:
                self.next_hb = current_timestamp_ms
        while self.next_hb <= current_timestamp_ms:
            yield HeartBeat(self.next_hb)
            self.next_hb += self.heartbeat_interval_ms

    def _can_serve(self):
        """
        Decide whether the oldest buffered timestamp may be served now.

        :return: The smallest head-of-queue message timestamp when every queue reports
                 can_be_emitted for it and at least one head message carrying that timestamp
                 has a ``ts`` older than (system time - latency_ms); otherwise None
                 (also when all queues are empty).
        """
        min_ets = min([
            q.queue[0].message.timestamp()[1]
            for q in self.queues.values() if len(q.queue) > 0
        ],
                      default=-1)
        if min_ets == -1:
            return None
        deadline = getSystemTimestamp() - self.latency_ms
        if all([q.can_be_emitted(min_ets) for q in self.queues.values()]) and \
                any([q.queue[0].ts < deadline for q in self.queues.values()
                     if len(q.queue) > 0 and q.queue[0].message.timestamp()[1] == min_ets]):
            return min_ets
        else:
            return None

    def getMessages(self):
        """
        Poll the consumer and yield messages (and optional heartbeats) in timestamp order.

        The generator runs until stopGenerator is called, every queue is stopped, the end
        timestamp is reached, or an unhandled consumer error occurs; the consumer is closed
        when the loop ends normally.

        :return: Generator of messages (lists of messages when group_by_time is set) and,
                 when heartbeat_interval_ms is not -1, HeartBeat objects.
        """
        while self.running:
            # All topics finished: flush whatever is still buffered, in
            # timestamp order, then leave the loop.
            if all([v.stopped for v in self.queues.values()]):
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.queue)
                message_to_serve = [m.message for m in message_to_serve]
                message_to_serve.sort(key=lambda x: x.timestamp()[1])
                while len(message_to_serve) > 0:
                    ts = message_to_serve[0].timestamp()[1]
                    serve_it = []
                    while len(message_to_serve) > 0 and message_to_serve[
                            0].timestamp()[1] == ts:
                        serve_it.append(message_to_serve.pop(0))
                    if not self.heartbeat_interval_ms == -1:
                        yield from self._serve_heartbeat(ts)
                    yield from self._serve_messages(serve_it)
                logging.info('Exiting from generator.')
                break
            self.last_poll = getSystemTimestamp()
            msg = self.consumer.poll(0.001)
            if msg is not None:
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        if msg.topic() in self.queues:
                            self.queues[msg.topic()].first_eop_reached = True
                            self.queues[msg.topic()].end_of_partition = True
                    else:
                        logging.error('Unhandle error: {}'.format(msg.error()))
                        break
                else:
                    self.queues[msg.topic()].end_of_partition = False
                    if self.end_ts is not None and msg.timestamp(
                    )[1] > self.end_ts:
                        self.queues[msg.topic()].stop_topic()
                    else:
                        self.queues[msg.topic()].add_message(msg)
            # Serve everything that is currently servable before polling again.
            while self.running:
                event_ts_to_serve = self._can_serve()
                # Go back to polling when nothing is servable or when less
                # than 30 s remain of max.poll.interval.ms since the last poll.
                if event_ts_to_serve is None or \
                        self.max_poll_interval_ms - (getSystemTimestamp() - self.last_poll) < 30000:
                    if self.end_flag == EndFlag.NEVER and self.heartbeat_interval_ms != -1 \
                            and any([q.end_of_partition for q in self.queues.values()]):
                        if self.next_hb is None:
                            self.next_hb = min(
                                getSystemTimestamp() - self.latency_ms,
                                min([
                                    q.queue[0].message.timestamp()[1]
                                    for q in self.queues.values()
                                    if len(q.queue) > 0
                                ],
                                    default=sys.maxsize))
                        if self.next_hb < min(
                                getSystemTimestamp() - self.latency_ms,
                                min([
                                    q.queue[0].message.timestamp()[1]
                                    for q in self.queues.values()
                                    if len(q.queue) > 0
                                ],
                                    default=sys.maxsize)):
                            yield from self._serve_heartbeat(self.next_hb)
                    break
                if self.heartbeat_interval_ms != -1:
                    yield from self._serve_heartbeat(event_ts_to_serve)
                message_to_serve = []
                for q in self.queues.values():
                    message_to_serve.extend(q.get_messages(event_ts_to_serve))
                yield from self._serve_messages(message_to_serve)
                if self.end_ts is not None and self.end_ts <= event_ts_to_serve:
                    self.running = False
        self.consumer.close()
Example #5
0
class AsyncWorker(object):
    """
    Fetches from Kafka topics and processes them.

    :param consumer_topic: Name of the Kafka topic for consume.
    :type consumer_topic: str
    :param service: Service function which is executed every time when job is processed.
    Service must get as argument str or dict type object.
    :type service: callable
    :param consumer_conf: config for Kafka consumer.
    :type consumer_conf: dict
    :param failed_topic: Kafka topic for produce unprocessed messages from consumer_topic.
    :type failed_topic: str
    :param producer_conf: config for Kafka producer for producing unprocessed messages.
    :type producer_conf: dict
    """
    def __init__(self, consumer_topic: str, service: Callable,
                 consumer_conf: dict, failed_topic: str, producer_conf: dict):

        self._consumer_topic = consumer_topic
        self._consumer = Consumer(consumer_conf)
        self._service = service
        self._failed_topic = failed_topic  # use naming like <project name>_<version>_<consumer_topic><retry/failed>
        self._producer = AsyncProducer(producer_conf)

    def __repr__(self):
        """Return the string representation of the worker.
        :return: String representation of the worker.
        :rtype: str
        """

        return 'Worker(Consumer={}, consume_topic={})'.format(
            self._consumer, self._consumer_topic)

    def __del__(self):  # pragma: no cover
        # Best-effort cleanup: the consumer may already be closed (start()
        # closes it) or the instance may be only partially initialised.
        # noinspection PyBroadException
        try:
            self._consumer.close()
        except Exception:
            pass

    async def _exec_service(self, message_value):
        """Run the service with the decoded value, awaiting it when it is a coroutine function.
        :param message_value: Decoded message value passed to the service.
        :return: Whatever the service returns.
        """
        if iscoroutinefunction(self._service):
            res = await self._service(message_value)
        else:
            res = self._service(message_value)
        return res

    async def _process_message(self, msg: Message):
        """
        De-serialize message and execute service.

        When the service (or the decoding) raises, the raw message value is
        produced to the failed topic together with the error text.
        :param msg: Kafka message.
        :type msg: confluent_kafka.Message`
        """
        # topic/partition/offset are accessor methods and must be called;
        # formatting the bare attributes logs bound-method reprs.
        LOGGER.info(
            'Processing Message(topic={}, partition={}, offset={}) ...'.format(
                msg.topic(), msg.partition(), msg.offset()))
        service_repr = get_call_repr(self._service)
        LOGGER.info('Executing job {}'.format(service_repr))
        try:
            message_value = _decode_msg_value(msg.value())
            res = await self._exec_service(message_value)

        except KeyboardInterrupt:
            LOGGER.error('Job was interrupted: {}'.format(msg.offset()))

        except Exception as err:
            LOGGER.exception('Job {} raised an exception: {}'.format(
                msg.offset(), err))

            await self._producer.produce(topic=self._failed_topic,
                                         value=msg.value(),
                                         error=str(err))
        else:
            LOGGER.info('Job {} returned: {}'.format(msg.offset(), res))

    @property
    def consumer_topic(self):
        """Return the name of the Kafka topic.
        :return: Name of the Kafka topic.
        :rtype: str
        """
        return self._consumer_topic

    @property
    def consumer(self):
        """Return the Kafka consumer instance.
        :return: Kafka consumer instance created in the constructor.
        :rtype: Consumer
        """
        return self._consumer

    @property
    def service(self):
        """Return the service function.
        :return: Callback function, or None if not set.
        :rtype: callable | None
        """
        return self._service

    async def start(self,
                    max_messages: int = math.inf,
                    commit_offsets: bool = True) -> int:
        """Start processing Kafka messages and executing jobs.
        :param max_messages: Maximum number of Kafka messages to process before stopping. If not set, worker runs until
        interrupted.

        :type max_messages: int
        :param commit_offsets: If set to True, consumer offsets are committed after every fetched batch has been
        dispatched (default: True).
        :type commit_offsets: bool
        :return: Total number of messages dispatched for processing (error events are not counted).
        :rtype: int
        """
        LOGGER.info('Starting {} ...'.format(self))

        self._consumer.unsubscribe()
        self._consumer.subscribe([self.consumer_topic])
        LOGGER.info(" Try get messages from position: {}".format(
            self._consumer.position(self._consumer.assignment())))

        batch_limit = 10      # upper bound of messages fetched per consume()
        poll_timeout = 2.0    # seconds to wait for a batch
        loop = asyncio.get_running_loop()
        # Strong references to in-flight tasks: the event loop itself only
        # keeps weak references, so unreferenced tasks may be collected.
        in_flight = set()
        messages_processed = 0
        try:
            while messages_processed < max_messages:
                # Never fetch more than what is still allowed by max_messages.
                batch_size = max(
                    1,
                    int(min(batch_limit, max_messages - messages_processed)))
                # awaiting place for processing messages in other coroutines
                messages = await loop.run_in_executor(
                    None,
                    partial(self._consumer.consume, batch_size, poll_timeout))
                LOGGER.debug(" Try get messages from position: {}".format(
                    self._consumer.position(self._consumer.assignment())))
                if not messages:
                    LOGGER.debug("Messages not found")
                    continue
                for msg in messages:
                    if msg.error():
                        # Error events carry no payload for the service.
                        LOGGER.error("Consumer error: {}".format(msg.error()))
                        continue
                    LOGGER.info("Get message with offset {}".format(
                        msg.offset()))
                    task = asyncio.create_task(self._process_message(msg))
                    in_flight.add(task)
                    task.add_done_callback(in_flight.discard)
                    messages_processed += 1
                if commit_offsets:
                    self._consumer.commit()

            # Let the already dispatched jobs finish before shutting down.
            if in_flight:
                results = await asyncio.gather(*list(in_flight),
                                               return_exceptions=True)
                for result in results:
                    if isinstance(result, Exception):
                        LOGGER.error(
                            'Message processing task failed: {}'.format(
                                result))
        finally:
            self._consumer.close()
        return messages_processed
Example #6
0
class AioConsumer:
    """Asyncio wrapper that polls a Kafka consumer in an executor and feeds a handler."""

    def __init__(self, config,
                 topics: list,
                 group_id: str,
                 handler,
                 max_retry=-1,
                 consumer_no=0,
                 timeout=1,
                 loop=None, exe=None):
        """
        consumer = new AioConsumer(...)
        :param config: kafka consumer config (deep-copied; the caller's dict is not modified)
        :param topics: list of topic names to subscribe to
        :param group_id: value set as the consumer's 'group.id'
        :param handler: function or coroutine function called with each decoded message
        :param max_retry: number of retries after a failed consumption; -1: do not retry
                          (stored only; no retry logic is implemented in this class)
        :param consumer_no: consumer number, used as a suffix of the consumer name
        :param timeout: poll timeout
        :param loop: event loop to use; defaults to asyncio.get_event_loop()
        :param exe: executor for the blocking consumer calls (None uses the loop's default)
        """
        self.loop = loop or asyncio.get_event_loop()
        assert config is not None, 'init kafka consumer error, config is None'
        _config = copy.deepcopy(config)
        _config['group.id'] = group_id
        _config['on_commit'] = self.commit_completed
        self.handler = handler
        self.consumer = Consumer(_config)
        self.consumer.subscribe(topics)
        self.redis_retry_key = f'{"_".join(topics)}_{self.handler.__name__}'
        self.name = f'{self.redis_retry_key}_{consumer_no}'
        self.max_retry = max_retry
        self.exe = exe
        self.timeout = timeout
        # 'INIT' -> 'RUNNING' -> 'STOP'
        self.status = 'INIT'

    @staticmethod
    def commit_completed(err, partitions):
        """Commit callback registered as 'on_commit': log the error or the committed partitions."""
        if err:
            logger.info(str(err))
        else:
            logger.info("Committed partition offsets: " + str(partitions))

    async def poll(self):
        """Poll the consumer once in the executor, waiting at most ``self.timeout``."""
        return await self.loop.run_in_executor(self.exe, self.consumer.poll, self.timeout)

    async def _get_message_from_kafka(self):
        """
        Poll once and return the raw message value.

        :return: The message value, or None when the poll returned nothing.
        :raises KafkaException: When the polled message carries an error.
        """
        poll_message = await self.poll()
        if not poll_message:
            return None
        elif poll_message.error():
            raise KafkaException(poll_message.error())
        else:
            return poll_message.value()

    async def run(self):
        """
        Consume messages and pass them to the handler until stop() is called.

        Each message value is decoded as JSON; an empty result (no message, or an empty
        JSON value) makes the loop sleep for one second. After a successful handler call
        the offsets are committed; handler exceptions are logged and the loop continues.
        The consumer is closed when the loop ends, including when polling or decoding raises.
        """
        # Perform the INIT -> RUNNING transition here; without it the loop
        # below never executes for a freshly constructed consumer. A stop()
        # issued before run() is still honoured.
        if self.status == 'INIT':
            self.status = 'RUNNING'
        try:
            while self.status == 'RUNNING':
                str_message = await self._get_message_from_kafka()
                message = json.loads(str_message or '{}')
                if not message:
                    await asyncio.sleep(1)
                    continue
                try:
                    if asyncio.iscoroutinefunction(self.handler):
                        await self.handler(message)
                    else:
                        self.handler(message)
                    await self.commit()
                except Exception as e:
                    logger.warning(f'{str(self)} handler error: {e.args}. msg: {str_message}')
        finally:
            # Release the consumer even when polling or decoding raised.
            await self.close()

    async def commit(self):
        """Synchronously commit the current offsets, executed in the executor."""
        def _commit():
            self.consumer.commit(asynchronous=False)
        await self.loop.run_in_executor(self.exe, _commit)

    async def close(self):
        """Commit one last time and close the consumer; the close happens even if the commit fails."""
        try:
            await self.commit()
        finally:
            await self.loop.run_in_executor(self.exe, self.consumer.close)
        logger.info(f'{self.name} closed')

    def stop(self):
        """Request the run() loop to finish after the current iteration."""
        self.status = 'STOP'