Ejemplo n.º 1
0
class KafkaGroupReader:
    """Discover consumer groups by replaying the consumer-offsets topic.

    The reader assigns itself the partitions of ``CONSUMER_OFFSET_TOPIC``,
    consumes every message up to the high watermark captured when the read
    started, and records the offsets it sees per group/topic/partition.
    """

    def __init__(self, kafka_config):
        """Store the cluster configuration and create empty reader state.

        :param kafka_config: cluster configuration; only its ``broker_list``
            attribute is read directly by this class.
        """
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        # group -> topic -> partition -> last offset seen for that partition
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        # partition id -> TopicPartition still being consumed
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        """Return the topics the given group has offsets for.

        Only the offsets-topic partition computed by ``get_group_partition``
        for ``group_id`` is read.

        :param group_id: name of the consumer group to look up.
        :returns: the group's topics, or an empty list when no offsets were
            found for the group (previously this raised ``KeyError``).
        """
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        """Replay the offsets topic and return the groups found in it.

        :param partition: a single offsets-topic partition to read, or
            ``None`` to read every partition of the topic.
        :returns: dict mapping group name to the keys view of its topics;
            groups left without topics are omitted. An empty dict is returned
            when none of the selected partitions holds any message.
        """
        # A previous call leaves the flag set; without this reset the read
        # loop below would be skipped and stale results returned.
        self._finished = False
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )
        try:
            # Fetch metadata as partitions_for_topic only returns locally
            # cached metadata.
            # See https://github.com/dpkp/kafka-python/issues/1742
            self.consumer.topics()

            if partition is not None:
                self.active_partitions = {
                    partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
                }
            else:
                # partitions_for_topic returns None for an unknown topic.
                known = self.consumer.partitions_for_topic(
                    CONSUMER_OFFSET_TOPIC) or ()
                self.active_partitions = {
                    p: TopicPartition(CONSUMER_OFFSET_TOPIC, p) for p in known
                }
            self.watermarks = self.get_current_watermarks(
                list(self.active_partitions.values()))
            # Keep only the partitions that actually contain messages.
            self.active_partitions = {
                p: tp
                for p, tp in self.active_partitions.items()
                if tp.partition in self.watermarks
                and self.watermarks[tp.partition].highmark > 0
                and self.watermarks[tp.partition].highmark >
                self.watermarks[tp.partition].lowmark
            }
            # Cannot consume if there are no active partitions
            if not self.active_partitions:
                return {}

            self.consumer.assign(list(self.active_partitions.values()))
            self.log.info("Consuming from %s", self.active_partitions)

            message_iterator = iter(self.consumer)

            # NOTE(review): a consumer timeout (StopIteration) just retries,
            # so this loop only ends once every active partition has
            # delivered the message at highmark - 1 -- confirm whether an
            # overall deadline is needed.
            while not self.finished():
                try:
                    message = next(message_iterator)
                except StopIteration:
                    continue
                # Stop when reaching the last message written to the
                # __consumer_offsets topic when KafkaGroupReader first started
                if message.offset >= self.watermarks[
                        message.partition].highmark - 1:
                    self.remove_partition_from_consumer(message.partition)
                self.process_consumer_offset_message(message)

            self._remove_unsubscribed_topics()

            return {
                group: topics.keys()
                for group, topics in self._kafka_groups.items() if topics
            }
        finally:
            # Release the consumer's connections on every exit path.
            self.consumer.close()

    def _remove_unsubscribed_topics(self):
        """Drop topics whose recorded offsets are all zero."""
        for group, topics in list(self._kafka_groups.items()):
            for topic, partitions in list(topics.items()):
                # If offsets for all partitions are 0, consider the topic as
                # unsubscribed
                if not any(partitions.values()):
                    del self._kafka_groups[group][topic]
                    self.log.info(
                        "Removed group {group} topic {topic} from list of groups"
                        .format(group=group, topic=topic))

    def remove_partition_from_consumer(self, partition):
        """Stop consuming ``partition``; mark the read finished if it was last.

        :param partition: partition id to remove from ``active_partitions``.
        :raises KeyError: if the partition is not currently active.
        """
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [(p, self.consumer.position(p))
                     for p in self.active_partitions.values()]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        """Decode one offsets-topic message.

        The key is read as: schema version (``>h``), group, topic (short
        strings), partition (``>l``). A non-empty value is read as: schema
        version (``>h``) followed by the offset (``>q``).

        :param message: consumed record exposing ``key`` and ``value`` bytes.
        :returns: ``(group, topic, partition, offset)``; ``offset`` is
            ``None`` when the message has no value.
        :raises InvalidMessageException: if the key or value schema version
            is not 0 or 1.
        """
        key = message.key
        ((key_schema, ), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            # This is not an offset commit message
            raise InvalidMessageException()
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition, ), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema, ), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset, ), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        """Apply one offsets-topic message to the in-memory group map.

        Messages that cannot be parsed are ignored. A message carrying an
        offset records it; a message without one removes the topic from the
        group if it was known.
        """
        try:
            group, topic, partition, offset = \
                self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated group {group} topic {topic} and updated offset in list of groups"
                .format(
                    group=group,
                    topic=topic,
                ), )
        # TODO: check if we can ever find an offset commit message with message.value is None
        elif group in self._kafka_groups and \
                topic in self._kafka_groups[group]:  # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info(
                "Removed group {group} topic {topic} from list of groups".
                format(group=group, topic=topic))

    def get_current_watermarks(self, partitions=None):
        """Return watermarks of the non-empty offsets-topic partitions.

        :param partitions: iterable of ``TopicPartition`` to restrict the
            result to, or ``None`` for every partition. An empty iterable
            yields an empty result.
        :returns: dict mapping partition id to its watermark object, limited
            to partitions whose ``highmark`` exceeds ``lowmark``.
        """
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        # Distinguish "no filter" (None) from "filter to nothing" (empty);
        # the latter used to end up evaluating ``part in None``.
        wanted = None if partitions is None else {
            tp.partition for tp in partitions
        }
        return {
            part: offset
            for part, offset in offsets[CONSUMER_OFFSET_TOPIC].items()
            if offset.highmark > offset.lowmark and (
                wanted is None or part in wanted)
        }

    def finished(self):
        """Return True once every active partition has been fully read."""
        return self._finished
Ejemplo n.º 2
0
_GROUP_ID = 'my_group'

# Original note on enable_auto_commit: when true, the consumer syncs its
# offset to ZooKeeper after consuming messages, so that if the consumer fails
# a new consumer can fetch the latest offset from ZooKeeper.
consumer = KafkaConsumer(
    bootstrap_servers=_BROKERS,
    group_id='ddd',
    auto_offset_reset='smallest',  # alternative: 'largest'
    enable_auto_commit=False,
)

# Manually assign partition 0 of the topic instead of subscribing, e.g.
#   consumer.subscribe(topics=[_TOPIC_NAME])
#   consumer.subscribe(pattern='^awesome.*')
tp = TopicPartition(_TOPIC_NAME, 0)
consumer.assign([tp])
print(consumer.committed(tp))
print(consumer.topics())

# Start reading at offset 50 of the assigned partition.
consumer.seek(tp, 50)

# Print and remember the offsets of the first five messages; messages after
# that are still consumed but ignored, so the loop itself never exits.
a = []
for m in consumer:
    if len(a) >= 5:
        continue
    print(m.offset)
    a.append(m.offset)

ProduceRequestPayload = namedtuple(
    "ProduceRequestPayload",
    ["topic", "partition", "messages"],
)
Ejemplo n.º 3
0
class IBUSStreamingDownsamplingConsumer:
    """Forward records from a Kafka topic to a single TCP client.

    A TCP server accepts one client at a time. While a client is connected,
    batches polled from Kafka are written to it, with a pause of ``interval``
    seconds between polls. A batch is committed only after it has been
    written completely; otherwise the consumer is rewound so the batch is
    polled again.
    """

    LOG_FORMAT = "{} UTC_TS\t{}"

    def __init__(self, kafkaHost, kafkaPort, tcpHost, tcpPort, group_id, topic,
                 logTopic, interval):
        """Create the Kafka consumer/producer and remember the settings.

        :param kafkaHost: Kafka bootstrap host.
        :param kafkaPort: Kafka bootstrap port.
        :param tcpHost: host the TCP server listens on.
        :param tcpPort: port the TCP server listens on.
        :param group_id: consumer group id used for the data topic.
        :param topic: topic whose records are forwarded.
        :param logTopic: topic that receives this object's log lines.
        :param interval: pause between polls; converted with ``int()`` and
            passed to ``asyncio.sleep``.
        """
        self.kafkaHost = kafkaHost
        self.kafkaPort = kafkaPort
        self.tcpHost = tcpHost
        self.tcpPort = tcpPort
        self.group_id = group_id
        self.topic = topic
        self.logTopic = logTopic
        self.interval = int(interval)
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)],
            group_id=group_id,
            enable_auto_commit=False)
        self.producer = KafkaProducer(
            bootstrap_servers=["{}:{}".format(kafkaHost, kafkaPort)])
        # Writer of the currently connected client; None when nobody is.
        self.tcpWriter = None

    def getTopicPartitions(self):
        """Return a TopicPartition for every known partition of the topic.

        :returns: list of ``TopicPartition``; empty when the consumer reports
            no partitions for the topic.
        """
        # This ensures the local cache is updated with information about
        # partitions, offsets etc.
        self.consumer.topics()
        pids = self.consumer.partitions_for_topic(self.topic)
        # partitions_for_topic returns None for an unknown topic.
        return [TopicPartition(self.topic, pid) for pid in (pids or ())]

    def getTopicPartitionsCommittedPositions(self):
        """Return ``(TopicPartition, committed offset)`` for each partition."""
        return [(tp, self.consumer.committed(tp))
                for tp in self.getTopicPartitions()]

    async def tcp_server_handler(self, reader, writer):
        """Serve one TCP client; refuse it if another one is connected.

        Starts ``poll_from_Kafka`` as a task for the accepted client and then
        reads from the client until it closes the connection, at which point
        the task is cancelled and the writer closed.
        """
        addr = str(writer.get_extra_info("socket").getpeername())
        if self.tcpWriter is not None:
            self.log("refused " + addr)
            writer.write(b"Connection limit reached; connection refused.")
            writer.close()
            return
        self.log("accepted " + addr)
        self.tcpWriter = writer
        t1 = asyncio.create_task(self.poll_from_Kafka(writer))
        try:
            # Incoming data is discarded; reading is only used to notice
            # that the peer has closed the connection (empty read).
            while True:
                data = await reader.read(1)
                if not data:
                    break
        except ConnectionError:
            # A connection broken while the batched data is being sent
            # eventually surfaces on the reader side as well (broken pipe or
            # reset by peer); treat it like a normal disconnect.
            pass
        finally:
            t1.cancel()
            self.log("closed " + addr)
            writer.close()
            self.tcpWriter = None

    async def poll_from_Kafka(self, writer):
        """Poll Kafka and write each record value to ``writer`` until failure.

        After a fully written batch the offsets are committed and the
        coroutine sleeps for ``self.interval`` seconds. If the connection
        breaks, or the task is cancelled, while a batch is being written, the
        consumer is rewound to the start of that batch and nothing is
        committed.
        """
        while True:
            # NOTE(review): poll() is a blocking call made from a coroutine.
            polled = self.consumer.poll(timeout_ms=1000)
            try:
                for records in polled.values():
                    for record in records:
                        writer.write(record.value)
                        await writer.drain()
            except ConnectionError:
                # Not raised reliably: a single write/drain on a broken
                # connection may succeed; the error tends to appear when
                # writing and draining repeatedly.
                print("Last batch not fully sent, not commited.")
                self._rewind_batch(polled)
                break
            except asyncio.CancelledError:
                # The client went away in the middle of a batch.
                self._rewind_batch(polled)
                raise
            else:
                self.consumer.commit()
            await asyncio.sleep(self.interval)

    def _rewind_batch(self, polled):
        """Seek each polled partition back to the first offset of its batch.

        Using the batch's own offsets avoids depending on committed offsets,
        which are None for partitions without a commit.
        """
        for tp, records in polled.items():
            if records:
                self.consumer.seek(tp, records[0].offset)

    def log(self, msg):
        """Send a timestamped log line to the log topic."""
        line = self.LOG_FORMAT.format(datetime.now().timestamp(), msg)
        self.producer.send(self.logTopic, line.encode())

    def cleanup(self):
        """Log the shutdown, then close the consumer and flush/close the producer."""
        self.log("shutdown")
        self.consumer.close()
        self.producer.flush()
        self.producer.close()

    def run(self):
        """Log startup and run the TCP server until the event loop stops."""
        self.log("running")
        asyncio.run(self._async_run())

    async def _async_run(self):
        """Start the TCP server and serve connections forever."""
        tcpServer = await asyncio.start_server(self.tcp_server_handler,
                                               self.tcpHost, self.tcpPort)
        await tcpServer.serve_forever()
Ejemplo n.º 4
0
class KafkaGroupReader:
    """Read consumer-group information out of the consumer-offsets topic.

    Messages of ``CONSUMER_OFFSET_TOPIC`` are consumed up to the high
    watermark observed at the start of the read, and the offsets found are
    kept per group, topic and partition.
    """

    def __init__(self, kafka_config):
        """Keep the cluster configuration and initialise empty state.

        :param kafka_config: cluster configuration; this class reads its
            ``broker_list`` attribute.
        """
        self.log = logging.getLogger(__name__)
        self.kafka_config = kafka_config
        # group -> topic -> partition -> last offset seen
        self._kafka_groups = defaultdict(lambda: defaultdict(dict))
        # partition id -> TopicPartition that still has messages to read
        self.active_partitions = {}
        self._finished = False

    def read_group(self, group_id):
        """Return the topics recorded for ``group_id``.

        Reads only the offsets-topic partition that ``get_group_partition``
        selects for the group.

        :param group_id: consumer group name.
        :returns: the group's topics, or ``[]`` if nothing was found.
        """
        partition_count = get_offset_topic_partition_count(self.kafka_config)
        partition = get_group_partition(group_id, partition_count)
        return self.read_groups(partition).get(group_id, [])

    def read_groups(self, partition=None):
        """Consume the offsets topic and return every group found.

        :param partition: offsets-topic partition to read; ``None`` reads all
            partitions of the topic.
        :returns: dict of group name to the keys view of its topics, without
            groups that have no topics left; ``{}`` when no selected
            partition contains messages.
        """
        # The flag is still True after an earlier read; reset it so that
        # calling this method again actually consumes instead of returning
        # stale state.
        self._finished = False
        self.consumer = KafkaConsumer(
            group_id='offset_monitoring_consumer',
            bootstrap_servers=self.kafka_config.broker_list,
            auto_offset_reset='earliest',
            enable_auto_commit=False,
            consumer_timeout_ms=30000,
            fetch_max_wait_ms=2000,
            max_partition_fetch_bytes=10 * 1024 * 1024,  # 10MB
        )
        try:
            # Fetch metadata as partitions_for_topic only returns locally cached metadata
            # See https://github.com/dpkp/kafka-python/issues/1742
            self.consumer.topics()

            if partition is not None:
                self.active_partitions = {
                    partition: TopicPartition(CONSUMER_OFFSET_TOPIC, partition),
                }
            else:
                # An unknown topic makes partitions_for_topic return None.
                partition_ids = self.consumer.partitions_for_topic(CONSUMER_OFFSET_TOPIC) or ()
                self.active_partitions = {
                    p: TopicPartition(CONSUMER_OFFSET_TOPIC, p)
                    for p in partition_ids
                }
            self.watermarks = self.get_current_watermarks(list(self.active_partitions.values()))
            # Active partitions are the non-empty ones. Remove the empty ones.
            self.active_partitions = {
                p: tp for p, tp in self.active_partitions.items()
                if tp.partition in self.watermarks and
                self.watermarks[tp.partition].highmark > 0 and
                self.watermarks[tp.partition].highmark > self.watermarks[tp.partition].lowmark
            }
            # Cannot consume if there are no active partitions
            if not self.active_partitions:
                return {}

            self.consumer.assign(list(self.active_partitions.values()))
            self.log.info("Consuming from %s", self.active_partitions)

            message_iterator = iter(self.consumer)

            # NOTE(review): StopIteration (consumer timeout) only retries, so
            # the loop ends solely when each active partition has delivered
            # its message at highmark - 1 -- confirm this always happens.
            while not self.finished():
                try:
                    message = next(message_iterator)
                except StopIteration:
                    continue
                # Stop when reaching the last message written to the
                # __consumer_offsets topic when KafkaGroupReader first started
                if message.offset >= self.watermarks[message.partition].highmark - 1:
                    self.remove_partition_from_consumer(message.partition)
                self.process_consumer_offset_message(message)

            self._remove_unsubscribed_topics()

            return {
                group: topics.keys()
                for group, topics in self._kafka_groups.items()
                if topics
            }
        finally:
            # Always release the consumer, including on the early return and
            # on errors.
            self.consumer.close()

    def _remove_unsubscribed_topics(self):
        """Remove topics for which every recorded partition offset is 0."""
        for group, topics in list(self._kafka_groups.items()):
            for topic, partitions in list(topics.items()):
                # If offsets for all partitions are 0, consider the topic as unsubscribed
                if not any(partitions.values()):
                    del self._kafka_groups[group][topic]
                    self.log.info("Removed group {group} topic {topic} from list of groups".format(group=group, topic=topic))

    def remove_partition_from_consumer(self, partition):
        """Drop a fully read partition; finish the read if none remain.

        :param partition: id of the partition to stop consuming.
        :raises KeyError: if ``partition`` is not in ``active_partitions``.
        """
        deleted = self.active_partitions.pop(partition)
        # Terminate if there are no more partitions to consume
        if not self.active_partitions:
            self.log.info("Completed reading from all partitions")
            self._finished = True
            return
        # Reassign the remaining partitions to the consumer while saving the
        # position
        positions = [
            (p, self.consumer.position(p))
            for p in self.active_partitions.values()
        ]
        self.consumer.assign(list(self.active_partitions.values()))
        for topic_partition, position in positions:
            self.consumer.seek(topic_partition, position)
        self.log.info(
            "Completed reading from %s. Remaining partitions: %s",
            deleted,
            self.active_partitions,
        )

    def parse_consumer_offset_message(self, message):
        """Decode a message of the offsets topic.

        Key fields, in order: schema version (``>h``), group and topic (short
        strings), partition (``>l``). When the value is non-empty its fields
        are: schema version (``>h``) and offset (``>q``).

        :param message: record with ``key`` and ``value`` byte attributes.
        :returns: ``(group, topic, partition, offset)`` with ``offset`` set to
            ``None`` for a message without value.
        :raises InvalidMessageException: for a key or value schema version
            other than 0 or 1.
        """
        key = message.key
        ((key_schema,), cur) = relative_unpack(b'>h', key, 0)
        if key_schema not in [0, 1]:
            raise InvalidMessageException()   # This is not an offset commit message
        (group, cur) = read_short_string(key, cur)
        (topic, cur) = read_short_string(key, cur)
        ((partition,), cur) = relative_unpack(b'>l', key, cur)
        if message.value:
            value = message.value
            ((value_schema,), cur) = relative_unpack(b'>h', value, 0)
            if value_schema not in [0, 1]:
                raise InvalidMessageException()  # Unrecognized message value
            ((offset,), cur) = relative_unpack(b'>q', value, cur)
        else:
            offset = None  # Offset was deleted
        return group.decode(), topic.decode(), partition, offset

    def process_consumer_offset_message(self, message):
        """Update the group map from one offsets-topic message.

        Unparseable messages are skipped. An offset is stored under its
        group/topic/partition; a message without offset deletes the topic
        from the group when it is present.
        """
        try:
            group, topic, partition, offset = self.parse_consumer_offset_message(message)
        except InvalidMessageException:
            return

        if offset is not None:
            self._kafka_groups[group][topic][partition] = offset
            self.log.info(
                "Updated group {group} topic {topic} and updated offset in list of groups".format(
                    group=group,
                    topic=topic,
                ),
            )
        # TODO: check if we can ever find an offset commit message with message.value is None
        elif group in self._kafka_groups and \
                topic in self._kafka_groups[group]:  # No offset means topic deletion
            del self._kafka_groups[group][topic]
            self.log.info("Removed group {group} topic {topic} from list of groups".format(group=group, topic=topic))

    def get_current_watermarks(self, partitions=None):
        """Return the watermarks of non-empty offsets-topic partitions.

        :param partitions: iterable of ``TopicPartition`` limiting the result,
            or ``None`` for no limit. An empty iterable gives ``{}``.
        :returns: dict of partition id to watermark object for partitions
            with ``highmark > lowmark``.
        """
        client = KafkaToolClient(self.kafka_config.broker_list)
        client.load_metadata_for_topics(CONSUMER_OFFSET_TOPIC)
        offsets = get_topics_watermarks(
            client,
            [CONSUMER_OFFSET_TOPIC],
        )
        # None means "no filter"; an empty iterable must filter everything
        # out rather than fall through to ``part in None``.
        partitions_set = None if partitions is None else {tp.partition for tp in partitions}
        return {part: offset for part, offset
                in offsets[CONSUMER_OFFSET_TOPIC].items()
                if offset.highmark > offset.lowmark and
                (partitions_set is None or part in partitions_set)}

    def finished(self):
        """Return True when all active partitions have been read to the end."""
        return self._finished