def employeeportal():
    tp = TopicPartition('crashed-devices', 0)
    consumer = KafkaConsumer(
        'crashed-devices',
        bootstrap_servers=[
            'ec2-52-203-135-135.compute-1.amazonaws.com:9092',
            'ec2-52-70-111-222.compute-1.amazonaws.com:9092',
            'ec2-34-193-78-218.compute-1.amazonaws.com:9092'
        ],
        enable_auto_commit=True,
        group_id='my-group',
        auto_offset_reset='earliest',
        value_deserializer=lambda x: loads(x.decode('utf-8')))
    lastOffset = consumer.beginning_offsets([tp])[tp]
    latitudes = []
    longitudes = []
    i = 0
    for message in consumer:
        i += 1
        msg = message.value
        latitudes.append(msg['latitude'])
        longitudes.append(msg['longitude'])
        print(latitudes, longitudes)
        if i == 1:
            print("GOT HERE")
            consumer.commit()
            break
    consumer.close()
    return render_template("employeeportal.html",
                           APIkey='AIzaSyD9e3Rdo8fGQq6hzaXkdsdQzv9Hy0rTolE',
                           latitudes=latitudes,
                           longitudes=longitudes)
def create_consumer(args, policy):
    """
    Refer to the Python package kafka-python, a high-level message consumer for Kafka brokers.
    The consumer iterator returns consumer records, which expose basic message
    attributes: topic, partition, offset, key, and value.
    :param args: Input arguments
    :param policy: Object to store Network Policy for processing
    :return: KafkaConsumer object, messages from the message bus for processing
    """
    consumer = KafkaConsumer(args.get('topic'),
                             api_version=API_VERSION,
                             bootstrap_servers=args.get('broker'),
                             client_id=CLIENT_ID,                      # name passed to servers for identification
                             auto_offset_reset=args.get('start_at'),   # consume earliest or latest available msgs
                             enable_auto_commit=AUTOCOMMIT,            # autocommit offsets?
                             consumer_timeout_ms=args.get('timeout'),  # StopIteration if no message after 'n' milliseconds
                             security_protocol=SSL,
                             ssl_context=create_ssl_context(args))

    # Returned values are of type set
    msg = ["All the topics available: {}".format(consumer.topics()),
           "Subscription: {}".format(consumer.subscription()),
           "Partitions for topic: {}".format(consumer.partitions_for_topic(args.get('topic'))),
           "TopicPartitions: {}".format(consumer.assignment())]
    policy.add_fact('consumer_debug', msg)

    # Offsets are of type int
    policy.add_fact('beginning_offsets', str(consumer.beginning_offsets(consumer.assignment())))
    policy.add_fact('end_offsets', str(consumer.end_offsets(consumer.assignment())))
    policy.start_at_offset = args.get('start_at_offset')
    policy.add_fact('start_at_offset', policy.start_at_offset)
    return consumer
def main():
    consumer = KafkaConsumer('topic_test_cluster',
                             bootstrap_servers=['master:9092'])

    print(consumer.partitions_for_topic('topic_test_cluster'))
    print(consumer.topics())
    print(consumer.subscription())
    print(consumer.assignment())
    print(consumer.beginning_offsets(consumer.assignment()))

    # Read from partition 2, starting at offset 5
    consumer.seek(TopicPartition(topic='topic_test_cluster', partition=2), 5)

    for msg in consumer:
        print('%s:%d:%d: key=%s value=%s' % (msg.topic, msg.partition,
                                             msg.offset, msg.key, msg.value))
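# A caveat on the seek() call above, hedged: kafka-python assigns partitions to a
# subscribe-mode consumer only during a poll, so seeking immediately after
# construction can fail with an "Unassigned partition" assertion. A minimal
# sketch of the safe ordering, reusing the broker/topic names from the snippet:
from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer('topic_test_cluster', bootstrap_servers=['master:9092'])
consumer.poll(timeout_ms=1000)  # triggers the group join and partition assignment
tp = TopicPartition('topic_test_cluster', 2)
if tp in consumer.assignment():
    consumer.seek(tp, 5)  # the partition is assigned, so seek is now valid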
def __init__(self,
             broker: str,
             topic: str,
             partition: int = -1,
             start: Union[int, datetime, PartitionOffset] = PartitionOffset.END,
             stop: Union[int, datetime, PartitionOffset] = PartitionOffset.NEVER):
    consumer = KafkaConsumer(bootstrap_servers=broker,
                             fetch_max_bytes=52428800 * 6,
                             consumer_timeout_ms=100)
    existing_topics = consumer.topics()
    self.current_msg = None
    self.current_offset_limits = HighLowOffset(-1, -1)
    if topic not in existing_topics:
        raise RuntimeError(f"Topic \"{topic}\" does not exist.")
    existing_partitions = consumer.partitions_for_topic(topic)
    if partition == -1:
        partition = existing_partitions.pop()
    elif partition not in existing_partitions:
        raise RuntimeError(f"Partition {partition} for topic \"{topic}\" does not exist.")
    topic_partition = TopicPartition(topic, partition)
    consumer.assign([topic_partition, ])
    if start == PartitionOffset.BEGINNING:
        consumer.seek_to_beginning()
    elif start == PartitionOffset.END or start == PartitionOffset.NEVER:
        consumer.seek_to_end()
    elif type(start) is int:
        first_offset = consumer.beginning_offsets([topic_partition, ])
        if first_offset[topic_partition] > start:
            consumer.seek_to_beginning()
        else:
            consumer.seek(partition=topic_partition, offset=start)
    elif type(start) is datetime:
        found_offsets = consumer.offsets_for_times({topic_partition: int(start.timestamp() * 1000)})
        consumer.seek(partition=topic_partition, offset=found_offsets[topic_partition].offset)
    self.to_thread = Queue()
    self.from_thread = Queue(maxsize=100)
    self.thread = Thread(target=thread_function, daemon=True,
                         kwargs={"consumer": consumer,
                                 "stop": stop,
                                 "in_queue": self.to_thread,
                                 "out_queue": self.from_thread,
                                 "topic_partition": topic_partition})
    self.thread.start()
def consume(self):
    consumer = KafkaConsumer(self.topic, bootstrap_servers=self.bootstrap_servers)
    print(consumer.partitions_for_topic(self.topic))          # partition info for the topic
    print(consumer.topics())                                  # list of topics
    print(consumer.subscription())                            # topics this consumer subscribes to
    print(consumer.assignment())                              # topic/partition assignment of this consumer
    print(consumer.beginning_offsets(consumer.assignment()))  # earliest offsets this consumer can read
    consumer.seek(TopicPartition(topic=self.topic, partition=0), 1)  # reset the offset: consume from offset 1
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                             message.offset, message.key, message.value))
def consume(self):
    consumer = KafkaConsumer(self.topic, bootstrap_servers=self.bootstrap_servers)
    print(consumer.partitions_for_topic(self.topic))
    print(consumer.topics())
    print(consumer.subscription())
    print(consumer.assignment())
    print(consumer.beginning_offsets(consumer.assignment()))
    consumer.seek(TopicPartition(topic=self.topic, partition=0), 1)
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                             message.offset, message.key, message.value))
class GetEffectiveOffset:

    def __init__(self, broker_list, group_name, topic):
        self.topic = topic
        self.consumer = KafkaConsumer(group_id=group_name, bootstrap_servers=broker_list)

    def get_offset(self):
        partitions_structs = []
        for partition_id in self.consumer.partitions_for_topic(self.topic):
            partitions_structs.append(TopicPartition(self.topic, partition_id))
        beginning_offset = self.consumer.beginning_offsets(partitions_structs)
        end_offset = self.consumer.end_offsets(partitions_structs)
        for partition, offset in beginning_offset.items():
            print('{0} => beginning offset = {1}; end offset = {2}'.format(
                partition, offset, end_offset[partition]))
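# The beginning/end pair above extends naturally to a consumer-lag report:
# compare each partition's committed offset with its end offset. A minimal
# sketch, assuming a group that has already committed offsets (broker, group,
# and topic names below are placeholders, not from the snippet):
from kafka import KafkaConsumer, TopicPartition

consumer = KafkaConsumer(group_id='my-group', bootstrap_servers=['localhost:9092'])
topic = 'my-topic'
partitions = [TopicPartition(topic, p) for p in consumer.partitions_for_topic(topic)]
end_offsets = consumer.end_offsets(partitions)
for tp in partitions:
    committed = consumer.committed(tp)  # None if this group never committed the partition
    lag = end_offsets[tp] - (committed if committed is not None else 0)
    print('{} => lag = {}'.format(tp, lag))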
def offset_manage_manually_consume():
    """
    Manually set the consumer offset.
    :return:
    """
    consumer = KafkaConsumer(TOPIC, bootstrap_servers=BOOTSTRAP_SERVERS)
    print(consumer.partitions_for_topic(TOPIC))               # partition info for the topic
    print(consumer.topics())                                  # topics available on the Kafka server
    print(consumer.subscription())                            # topics this consumer subscribes to
    print(consumer.assignment())                              # topic/partition assignment of this consumer
    print(consumer.beginning_offsets(consumer.assignment()))  # earliest offsets this consumer can read
    consumer.seek(TopicPartition(topic=u'%s' % TOPIC, partition=0), 235000)  # reset the offset: consume from offset 235000
    for message in consumer:
        print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition,
                                             message.offset, message.key, message.value))
def desc_topic(args):
    consumer = KafkaConsumer(bootstrap_servers=[args.broker])
    topics = consumer.topics()
    if args.topic not in topics:
        consumer.close()
        print(f'Topic "{args.topic}" not in cluster.')
    else:
        partitions = consumer.partitions_for_topic(args.topic)
        tp_list = []
        for p in partitions:
            tp = TopicPartition(args.topic, p)
            tp_list.append(tp)
        beginning_offsets = consumer.beginning_offsets(tp_list)
        end_offsets = consumer.end_offsets(tp_list)
        print(f'Topic: {args.topic}')
        print(f'Partition: {partitions}')
        print(f'Beginning Offsets: {list(beginning_offsets.values())}')
        print(f'End Offsets: {list(end_offsets.values())}')
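# Because both dicts returned above are keyed by the same TopicPartition
# objects, the number of retained messages per partition falls out as end
# minus beginning. A short sketch reusing the names from the snippet above:
counts = {tp: end_offsets[tp] - beginning_offsets[tp] for tp in tp_list}
print(f'Messages retained per partition: {counts}')
print(f'Total: {sum(counts.values())}')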
def cusumer():
    start = time.time()
    n = 0
    _consumer = KafkaConsumer('4.1.1.1.python-test',
                              group_id='test1',
                              bootstrap_servers='192.168.18.134:9092',
                              consumer_timeout_ms=1000)
    print(_consumer.partitions_for_topic('4.1.1.1.python-test'))
    # TopicPartition('4.1.1.1.python-test', '0')
    # a = namedtuple("_TopicPartition", ["_4.1.1.1.python-test", "_0"])
    offset = _consumer.committed(
        TopicPartition(topic='4.1.1.1.python-test', partition=0))
    # _consumer.seek_to_beginning()
    # _consumer.seek_to_beginning(TopicPartition(topic='4.1.1.1.python-test', partition=0))
    # _consumer.assign([TopicPartition(topic='4.1.1.1.python-test', partition=0)])
    print(_consumer.assignment())
    print(_consumer.subscription())
    # beginning_offsets() expects a collection of TopicPartitions, not a bare one
    print(_consumer.beginning_offsets(
        [TopicPartition(topic='4.1.1.1.python-test', partition=0)]))
    # _consumer.seek(TopicPartition(topic='4.1.1.1.python-test', partition=0), offset - 1)
    # print(_consumer.position(TopicPartition(topic='4.1.1.1.python-test', partition=0)))
    # _consumer.commit()
    return  # early return: the throughput loop below is never reached
    while 1:
        try:
            for message in _consumer:
                # yield message
                print(message.value)
                n = n + 1
                stop = time.time()
                if stop - start > 1:
                    print(n / (stop - start))
                    start = time.time()
                    n = 0
                # print('time out')
        except KafkaTimeoutError as e:
            print(e)
        except KafkaError as e:
            print(e)
        finally:
            pass
def info(topic):
    print('brokers: {}'.format(','.join(bootstrap_servers)))
    consumerclient = KafkaConsumer(bootstrap_servers=bootstrap_servers)
    partitions = consumerclient.partitions_for_topic(topic)
    print('topic: {}'.format(topic))
    print('partitions: {}'.format(','.join(
        str(partition) for partition in partitions)))
    partitioninstances = []
    for partition in partitions:
        partitioninstance = TopicPartition(topic, int(partition))
        partitioninstances.append(partitioninstance)
    beginningoffsets = consumerclient.beginning_offsets(partitioninstances)
    endoffsets = consumerclient.end_offsets(partitioninstances)
    for pi in partitioninstances:
        msg = 'partition: {}, beginning_offset: {}, end_offset: {}'
        msg = msg.format(pi.partition, beginningoffsets[pi], endoffsets[pi])
        print(msg)
def doTest(self):
    print(self.className + " - " + pyUtils.getCurrentRunningFunctionName() + "------------------")
    _topicName = "pro_bilog"
    local_url = ['host:port']
    real_url = ['ip:port']
    _consumer = KafkaConsumer(_topicName,
                              bootstrap_servers=local_url,
                              group_id='test',
                              request_timeout_ms=3000,
                              session_timeout_ms=5000)
    # partition info for the topic
    print(_consumer.partitions_for_topic(_topicName))
    # list of topics
    print(_consumer.topics())
    # topics this consumer subscribes to
    print(_consumer.subscription())
    # topic/partition assignment of this consumer
    print(_consumer.assignment())
    # earliest offsets this consumer can read
    print(_consumer.beginning_offsets(_consumer.assignment()))
def thread_function(consumer: KafkaConsumer, stop: Union[datetime, int], in_queue: Queue,
                    out_queue: Queue, topic_partition):
    known_sources: Dict[bytes, DataSource] = {}
    start_time = datetime.now(tz=timezone.utc)
    update_timer = datetime.now(tz=timezone.utc)
    while True:
        messages_ctr = 0
        for kafka_msg in consumer:
            new_msg = Message(kafka_msg)
            if type(stop) is int and new_msg.offset > stop:
                pass
            elif type(stop) is datetime and new_msg.timestamp is not None and new_msg.timestamp > stop:
                pass
            elif type(stop) is datetime and new_msg.timestamp is None and new_msg.kafka_timestamp > stop:
                pass
            else:
                if new_msg.source_hash not in known_sources:
                    known_sources[new_msg.source_hash] = DataSource(new_msg.source_name,
                                                                    new_msg.message_type,
                                                                    start_time)
                known_sources[new_msg.source_hash].process_message(new_msg)
                messages_ctr += 1
                if messages_ctr == CHECK_FOR_MSG_INTERVAL:
                    break
        if not in_queue.empty():
            new_msg = in_queue.get()
            if new_msg == "exit":
                break
        now = datetime.now(tz=timezone.utc)
        if now - update_timer > UPDATE_STATUS_INTERVAL:
            update_timer = now
            try:
                out_queue.put(copy(known_sources), block=False)
                low_offset = consumer.beginning_offsets([topic_partition, ])[topic_partition]
                high_offset = consumer.end_offsets([topic_partition, ])[topic_partition]
                out_queue.put(HighLowOffset(low_offset, high_offset))
            except Full:
                pass  # Do nothing
    consumer.close(True)
consumer = KafkaConsumer(
    kafka_topic_name,
    group_id=kafka_consumer_group_id,
    client_id=kafka_client_id,
    bootstrap_servers=kafka_brokers,
    request_timeout_ms=6001,
    session_timeout_ms=6000,
    heartbeat_interval_ms=2000,
    auto_offset_reset="earliest",
    enable_auto_commit=False
)
app_logger.get.info("Consumer init successful")

kafka_partitions: Set[int] = consumer.partitions_for_topic(kafka_topic_name)
kafka_topic_partitions: List[TopicPartition] = [TopicPartition(kafka_topic_name, p)
                                                for p in kafka_partitions]
kafka_min_offsets = consumer.beginning_offsets(kafka_topic_partitions)
kafka_max_offsets = consumer.end_offsets(kafka_topic_partitions)
app_logger.get.info(f"Cluster info: brokers - {kafka_brokers} partitions - {kafka_topic_partitions} "
                    f"min. offset - {kafka_min_offsets} "
                    f"max. offsets - {kafka_max_offsets}")

# Print connection statistics
print("\nCluster statistics:")
print(f"\tBrokers:\t{kafka_brokers}")
print(f"\tTopic:\t{kafka_topic_name}")
print(f"\tPartitions:\t{kafka_partitions}")
print(f"\tConsumer group id:\t{kafka_consumer_group_id}")
print(f"\nStart reading from topic \"{kafka_topic_name}\"...")
counter: int = 0
class Consumer:
    __flight_messages = dict()
    __consumers_count = 0

    def __init__(self, *args, **kwargs):
        self.topics = args
        self.consumer_id = kwargs.pop('consumer_id', Consumer.__consumers_count + 1)
        self.manager_id = kwargs.pop('manager_id', '')
        self.__consumer = KafkaConsumer(*args, **kwargs)
        self.__enable_polling = True
        self.__is_active = True
        self.__poll_delay = MIN_POLL_DELAY
        self.__group_id = kwargs.get('group_id', None)
        self.__name = 'Consumer_{}'.format(Consumer.get_consumer_count())
        self.processed_images = 0
        Consumer.increment_consumer_count()
        print('Topics listened by the consumers:', self.topics)
        if self.__group_id and self.__group_id not in Consumer.__flight_messages:
            Consumer.__flight_messages.update({self.__group_id: 0})

    @staticmethod
    def get_messages_in_flight():
        return Consumer.__flight_messages

    @staticmethod
    def get_consumer_count():
        return Consumer.__consumers_count

    @staticmethod
    def update_consumer_count(count):
        Consumer.__consumers_count = count

    @staticmethod
    def increment_consumer_count():
        Consumer.__consumers_count += 1

    def is_active(self):
        return self.__is_active

    def get_current_subscriptions(self):
        return self.__consumer.subscription()

    def get_initial_offset(self, partitions):
        return self.__consumer.beginning_offsets(partitions)

    def get_current_position(self, partition):
        return self.__consumer.position(partition)

    def get_end_offset(self, partitions):
        return self.__consumer.end_offsets(partitions)

    def subscribe_topics(self, *topics):
        self.__consumer.subscribe(topics=topics)
        print('subscribed to the topics', topics)

    def consume_messages(self, process_fn):
        assert process_fn and callable(process_fn), \
            'process_fn is mandatory and must be callable'
        for message in self.__consumer:
            process_fn(message)
        self.close_consumer()

    def set_alive(self, is_alive):
        redis_cli.set_multi_value(
            HEALTHCHECK_HASHKEY,
            '{}:{}'.format(self.manager_id, self.consumer_id),
            1 if is_alive else 0)

    def log_consumer_meta(self):
        consumer_meta = dict(messages_count=self.processed_images,
                             topics=self.topics,
                             group_id=self.__group_id,
                             name=self.__name)
        redis_cli.set_multi_value(
            META_INFO_HASHKEY,
            '{}:{}'.format(self.manager_id, self.consumer_id),
            json.dumps(consumer_meta))

    def __poll(self, timeout_ms=0, max_records=MAX_RECORDS_PER_POLL):
        while self.__enable_polling:
            print('{} Listening to messages...'.format(self.__name))
            try:
                message = self.__consumer.poll(timeout_ms=timeout_ms, max_records=max_records)
                if message:
                    Consumer.__flight_messages[self.__group_id] = 0
                    self.__poll_delay = MIN_POLL_DELAY
                    max_offset_position = self.get_end_offset(message.keys())
                    for partitions in message:
                        try:
                            records = message.get(partitions)
                            Consumer.__flight_messages[self.__group_id] += \
                                (max_offset_position.get(partitions)
                                 - self.get_current_position(partitions))
                            yield records
                        except Exception as exc:
                            print(exc)
                    self.processed_images += len(records)
                    self.log_consumer_meta()
                else:
                    delay = self.__poll_delay * 2
                    if delay <= MAX_POLL_DELAY:
                        self.__poll_delay = delay
                    else:
                        self.__poll_delay = MAX_POLL_DELAY
                    self.set_alive(False)
                    self.log_consumer_meta()
                    sleep(self.__poll_delay)
            except AssertionError as assertion_exc:
                self.stop_polling()
                print(assertion_exc)
        print('returning None')
        self.set_alive(False)
        return None

    def poll_topics(self, process_fn, timeout_ms=0, max_records=MAX_RECORDS_PER_POLL):
        assert process_fn and callable(process_fn), \
            'process_fn is mandatory and must be callable'
        print('polling has begun')
        for records in self.__poll(timeout_ms=timeout_ms, max_records=max_records):
            if records is None:
                break
            process_fn(records)
        print('exit from consuming messages')

    def stop_polling(self):
        print('stopping polling and closing the consumer')
        self.__enable_polling = False
        self.close_consumer()

    def close_consumer(self):
        self.__consumer.close()
        self.__is_active = False
class KafkaCli(object):
    """ kafka cli """

    CMD_HELP_LINES = {
        "list": "list <optional: match pattern, regex format>",
        "partition": "partition <required: topic>"
    }

    CMD_OPTIONS = [
        "list",
        "partition",
    ]

    def __init__(self, server_addr):
        self.server_addr = server_addr
        self.consumer = None
        self.producer = None
        self.cmd_proc_funcs = {}
        self.prompt_line = _color("kafka> ", "cyan")
        # reg
        self.reg_all_cmds()

    def connect(self):
        """ connect to kafka """
        try:
            self.consumer = KafkaConsumer(bootstrap_servers=self.server_addr)
            return True, "Success"
        except Exception as e:
            return False, "connect to {} failed, {}".format(self.server_addr, e)

    def cmd_completer(self, text, state):
        """ cmd completer """
        # on first trigger, build possible matches
        if state == 0:
            # cache matches (entries that start with entered text)
            if text:
                self.matches = [s for s in self.CMD_OPTIONS
                                if s and s.startswith(text)]
            else:
                # no text entered, all matches possible
                self.matches = self.CMD_OPTIONS[:]
        # return match indexed by state
        try:
            return self.matches[state]
        except IndexError:
            return None

    def prepare_auto_complete(self):
        """ prepare auto complete """
        # cmd complete
        readline.set_completer(self.cmd_completer)
        readline.parse_and_bind('tab: complete')

    def reg_cmd_process(self, cmd_starts, func):
        """ register cmd process function """
        self.cmd_proc_funcs[cmd_starts] = func

    def reg_all_cmds(self):
        """ reg all cmds """
        # help
        self.reg_cmd_process("help", self.print_help)
        # list
        self.reg_cmd_process("list", self.list_topics)
        # partition offsets
        self.reg_cmd_process("partition", self.get_partitions)

    def dispatch_cmd(self, cmd_line):
        """ dispatch """
        matches = [s for s in self.cmd_proc_funcs if s and cmd_line.startswith(s)]
        # get the first
        if matches:
            match_cmd = matches[0]
            self.cmd_proc_funcs[match_cmd](cmd_line)
        else:
            self.print_help()

    def print_help(self, cmd=None, cmd_line=None):
        """ help """
        print("Usage: ")
        if not cmd or cmd not in self.CMD_HELP_LINES:
            for cmd in self.CMD_HELP_LINES:
                print("{}".format(self.CMD_HELP_LINES[cmd]))
            print("")
        else:
            print("{}\n".format(self.CMD_HELP_LINES[cmd]))

    def print_sep_line(self):
        """ print separator line """
        print(_color("+{}+".format("-" * 50), 'magenta'))

    def list_topics(self, cmd_line):
        """ list topics """
        line_info = re.split(r"\s+", cmd_line)
        match_pattern = None
        if len(line_info) > 1:
            match_pattern = re.compile(line_info[1])
        topics = self.consumer.topics()
        if topics:
            cnt = 0
            print(_color("+{}+".format("-" * 50), 'magenta'))
            for topic in topics:
                if match_pattern:
                    m = match_pattern.match(topic)
                    if m:
                        print(topic)
                        cnt += 1
                else:
                    print(topic)
                    cnt += 1
            print(_color("+{}+".format("-" * 50), 'magenta'))
            print(_color("\nGet {} result(s)\n".format(cnt), 'yellow'))
        else:
            print(_color("\nGet 0 result(s)\n", 'yellow'))

    def get_partitions(self, cmd_line):
        """ get topic partitions """
        line_info = re.split(r"\s+", cmd_line)
        if len(line_info) < 2:
            self.print_help(cmd="partition")
        else:
            topics = line_info[1:]
            offsets = {}
            for topic in topics:
                partition_ids = self.consumer.partitions_for_topic(topic)
                if not partition_ids:
                    continue
                offsets[topic] = {}
                topic_partitions = \
                    [TopicPartition(topic, p_id) for p_id in partition_ids]
                # begin offsets
                begin_offsets = self.consumer.beginning_offsets(topic_partitions)
                # end offsets
                end_offsets = self.consumer.end_offsets(topic_partitions)
                for tp in topic_partitions:
                    p_id = tp.partition
                    offsets[topic][p_id] = {}
                    offsets[topic][p_id]["begin"] = begin_offsets[tp]
                    offsets[topic][p_id]["end"] = end_offsets[tp]
            # print result
            for topic in topics:
                if topic in offsets:
                    self.print_sep_line()
                    print("{}".format(topic))
                    for p_id in offsets[topic]:
                        print("{} {}:{}".format(_color(p_id, 'yellow'),
                                                offsets[topic][p_id]["begin"],
                                                offsets[topic][p_id]["end"]))
                else:
                    self.print_sep_line()
                    print("Get no partitions for topic {}".format(topic))
            self.print_sep_line()
            print("")

    def run(self):
        """ run cli """
        if not self.consumer:
            ok, msg = self.connect()
            if not ok:
                print(msg)
                return False
        # cmd complete
        self.prepare_auto_complete()
        # loop
        while True:
            line = input(self.prompt_line)
            line = line.strip()
            if not line:
                continue
            try:
                self.dispatch_cmd(line)
            except Exception as e:
                print("Exception occurred, {}".format(e))
import time
import pandas as pd
import json
from kafka import KafkaConsumer, TopicPartition

datalist = []
i = 0

# Consumer with a manually set offset
consumer = KafkaConsumer('phone-game-userinfo',
                         bootstrap_servers=['172.23.11.150:9092'])

print(consumer.partitions_for_topic("phone-game-userinfo"))  # partition info for the topic
print(consumer.topics())                                     # list of topics
print(consumer.subscription())                               # topics this consumer subscribes to
print(consumer.assignment())                                 # topic/partition assignment of this consumer
print(consumer.beginning_offsets(consumer.assignment()))     # earliest offsets this consumer can read

consumer.seek(TopicPartition(topic=u'phone-game-userinfo', partition=0), 202025)  # reset the offset: consume from offset 202025

print(consumer.end_offsets(consumer.assignment()))  # Get the last offset for the given partitions
print(consumer.end_offsets([TopicPartition(topic='phone-game-userinfo', partition=0)]))  # equivalent to the line above

t = '2018-05-10'
timeArray = time.strptime(t, '%Y-%m-%d')
timeStamp = int(time.mktime(timeArray)) * 1000  # offsets_for_times() expects epoch milliseconds
print(consumer.offsets_for_times({TopicPartition(topic='phone-game-userinfo', partition=0): timeStamp}))

for message in consumer:
    print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset,
                                         message.key, message.value.decode('utf-8')))
    # print(message.value.decode('utf-8'))
    # print(message.offset)
    # data = message.value.split(',')
    # print(data)
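# To start consuming from that date rather than just printing the lookup, the
# OffsetAndTimestamp returned by offsets_for_times() (None when no message is
# that recent) feeds straight into seek(). A minimal sketch reusing the names
# from the snippet above:
tp = TopicPartition('phone-game-userinfo', 0)
found = consumer.offsets_for_times({tp: timeStamp})[tp]
if found is not None:
    consumer.seek(tp, found.offset)  # earliest offset with timestamp >= timeStamp
else:
    consumer.seek_to_end(tp)         # no message at or after that time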
def __init__(self,
             broker: str,
             topic: str,
             partition: int = -1,
             start: Tuple[Union[int, datetime, PartitionOffset], Optional[int]] = PartitionOffset.END,
             stop: Union[int, datetime, PartitionOffset] = PartitionOffset.NEVER):
    self.to_thread = Queue()
    self.from_thread = Queue(maxsize=100)
    consumer = KafkaConsumer(bootstrap_servers=broker,
                             fetch_max_bytes=52428800 * 6,
                             consumer_timeout_ms=100)
    existing_topics = consumer.topics()
    self.current_msg = None
    self.current_offset_limits = HighLowOffset(-1, -1)
    if topic not in existing_topics:
        raise RuntimeError(f"Topic \"{topic}\" does not exist.")
    existing_partitions = consumer.partitions_for_topic(topic)
    if partition == -1:
        partition = existing_partitions.pop()
    elif partition not in existing_partitions:
        raise RuntimeError(f"Partition {partition} for topic \"{topic}\" does not exist.")
    topic_partition = TopicPartition(topic, partition)
    consumer.assign([topic_partition, ])
    first_offset = consumer.beginning_offsets([topic_partition])[topic_partition]
    last_offset = consumer.end_offsets([topic_partition])[topic_partition]
    origin_offset = None
    offset_to_offset = start[1]
    if start[0] == PartitionOffset.BEGINNING:
        origin_offset = first_offset
    elif start[0] == PartitionOffset.END or start[0] == PartitionOffset.NEVER:
        origin_offset = last_offset
    elif type(start[0]) is int:
        if first_offset > start[0]:
            origin_offset = first_offset
        elif last_offset < start[0]:
            origin_offset = last_offset
        else:
            origin_offset = start[0]
    elif type(start[0]) is datetime:
        found_offsets = consumer.offsets_for_times({topic_partition: int(start[0].timestamp() * 1000)})
        if found_offsets[topic_partition] is None:
            origin_offset = last_offset
        else:
            origin_offset = found_offsets[topic_partition].offset
    else:
        raise RuntimeError("Unknown start offset configured.")
    if offset_to_offset is not None:
        origin_offset += offset_to_offset
        if origin_offset < first_offset:
            origin_offset = first_offset
        elif origin_offset > last_offset:
            origin_offset = last_offset
    consumer.seek(partition=topic_partition, offset=origin_offset)
    self.thread = Thread(target=thread_function, daemon=True,
                         kwargs={"consumer": consumer,
                                 "stop": stop,
                                 "in_queue": self.to_thread,
                                 "out_queue": self.from_thread,
                                 "topic_partition": topic_partition})
    self.thread.start()
class Consumer(KafkaPython):

    def __init__(self, bootstrap_servers=None, **kwargs):
        super().__init__(servers=bootstrap_servers)
        consumer_config.update(kwargs)
        self.engine = KafkaConsumer(bootstrap_servers=self._bootstrap_servers,
                                    client_id=self._client_id,
                                    **consumer_config)
        self.DbClient = Mongo()
        self.group_id = consumer_config.get('group_id', None)
        self.tps = []
        self._partition_mode = None
        # three offset storage modes: 1 db, 2 kafka, 3 both
        self.offset_store_mode = store_config.get('offset')

    def get_user_topics(self):
        return self.engine.topics()

    # Consumption mode 1: manually assign partitions to this consumer
    '''
    topics example value:
    {'name': 'test_topic', 'num_partitions': 3, 'replication_factor': 3,
     'replica_assignments': {}, 'topic_configs': {}}
    '''
    def assign_partition(self, topics: list):
        if Consumer().get_user_topics().intersection(
                {item['topic'] for i, item in enumerate(topics)}):
            for v in topics:
                tp = kafka.TopicPartition(topic=str(v['topic']),
                                          partition=int(v['partition']))
                self.tps.append(tp)
            self.engine.assign(self.tps)
        else:
            raise Exception('topics contains unknown topic(s): %s' % topics)
        self._partition_mode = '1'
        return self

    # Consumption mode 2: the consumer subscribes to topics
    def sub_partition(self, topic: list):
        self.tps = topic
        self._partition_mode = '2'
        return self

    # Start consuming
    def topic_consumer(self, **kwargs):
        if self._partition_mode == '1':
            for tp in self.tps:
                data = self.find_or_create(
                    topic=tp.topic,
                    partition=tp.partition,
                    group_id=self.group_id,
                )
                if data:
                    self.engine.seek(tp, int(data.get('current_offset', 0)) + 1)
                else:
                    self.engine.seek(tp, self.engine.beginning_offsets([tp])[tp])
            return self.engine
        elif self._partition_mode == '2':
            self.engine.subscribe(self.tps,
                                  pattern=kwargs.get('pattern', None),
                                  listener=kwargs.get('listener', None))
        else:
            raise Exception('you have to choose the partition mode')

    # Roll the current offset back to a given offset, or to the last committed one.
    # Used when the consumer finished consuming and handed the data off, but the
    # downstream processing failed.
    def rollback_offset(self, topic, partition, group_id, offset=None):
        if offset is None:
            committed_offset = self.engine.committed(
                kafka.TopicPartition(topic=topic, partition=partition))
            if committed_offset is None:
                raise Exception(
                    'topic:%s,partition:%s,group_id:%s has no commit record yet, '
                    'you should provide an offset' % (topic, partition, group_id))
            offset = committed_offset
        self.commit_offset(group_id=group_id,
                           topic=topic,
                           partition=partition,
                           offset=offset)

    # Commit this consumer's offset information
    def commit_offset(self, group_id, topic, partition, offset):
        if self.group_id is None:
            raise Exception('you must enter a group_id')
        tp = kafka.TopicPartition(topic=str(topic), partition=int(partition))
        # commit the offset to both kafka and the database
        if self.offset_store_mode == 'both':
            self.engine.commit(offsets={tp: (kafka.OffsetAndMetadata(offset, None))})
            self.DbClient.commit_offset(topic=topic,
                                        group_id=group_id,
                                        partition=partition,
                                        offset=offset)
        # commit to the kafka server only
        elif self.offset_store_mode == 'kafka':
            self.engine.commit(offsets={tp: (kafka.OffsetAndMetadata(offset, None))})
        # commit to the database only
        else:
            self.DbClient.commit_offset(topic=topic,
                                        group_id=group_id,
                                        partition=partition,
                                        offset=offset)

    def find_or_create(self, **kwargs):
        client = self.DbClient
        data = client.get_offset(**kwargs)
        if data is None:
            client.create_offset(**kwargs)
            return False
        else:
            return data
# Create a consumer
consumer = KafkaConsumer(
    # consumer_timeout_ms=10000,  # how long the iterator waits for new messages; default is unlimited
    enable_auto_commit=False,     # auto commit is the default
    # auto_commit_interval_ms=5000,
    group_id="g_2",               # with a group_id, consumption resumes from the last committed offset
    bootstrap_servers="172.31.32.39:9092",
    client_id='11'                # within one group, only one client consumes a given partition at a time
)

# Manually assign a topic/partition to this consumer
# (mutually exclusive with passing topics to the constructor)
consumer.assign([TopicPartition("test_yang", 0)])
# consumer.seek(TopicPartition(topic='test33333', partition=0), 170)  # manually set the fetch offset of a
#     TopicPartition (use together with assign); this does not change the committed offset

print(consumer.assignment())  # TopicPartitions assigned to this consumer
print(consumer.beginning_offsets(
    {TopicPartition(topic='test33333', partition=0)}))  # first offset of the given partition
print(consumer.end_offsets(
    {TopicPartition(topic='test33333', partition=0)}))  # last offset of the given partition
print(consumer.topics())  # list of topics
print(consumer.partitions_for_topic('test33333'))  # partition info for the given topic
print(consumer.committed(
    TopicPartition(topic='test33333', partition=0)))  # offset committed by the current group

# Manually commit offsets; offsets format: {TopicPartition: OffsetAndMetadata(offset_num, None)}
# consumer.commit()  # synchronously commit the current offset (defaults to consumed offset + 1)
consumer.commit(
    offsets={TopicPartition('topic_yang', 0): OffsetAndMetadata(0, None)})  # synchronous commit
# re = consumer.commit_async(offsets=None, callback=None)
# print(re.succeeded())
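# Hand-rolled commits are easy to get off by one: the Kafka convention is to
# commit the offset of the *next* message to read. A minimal sketch of
# committing after processing, assuming the assign-mode consumer above;
# handle() is a hypothetical processing function, not part of the original:
from kafka import OffsetAndMetadata, TopicPartition

tp = TopicPartition("test_yang", 0)
for message in consumer:
    handle(message)  # hypothetical: process the record before committing
    # commit message.offset + 1: where consumption should resume after a restart
    consumer.commit({tp: OffsetAndMetadata(message.offset + 1, None)})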
import logging

from kafka import KafkaConsumer

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)16s - %(levelname)8s - %(message)s')

consumer = KafkaConsumer(
    'topic_lijihua',
    group_id='group_lijihua',
    bootstrap_servers=['111.229.87.152:9092'],
    consumer_timeout_ms=1000  # stop iterating if no message arrives within 1000 ms
)

try:
    for msg in consumer:
        logging.info('[record received]: {}'.format(msg))
        logging.info('[message value]: {}'.format(msg.value))
        logging.info('[subscribed topics]: {}'.format(consumer.subscription()))
        logging.info('[topic/partition assignment]: {}'.format(consumer.assignment()))
        logging.info('[beginning offsets]: {}'.format(
            consumer.beginning_offsets(consumer.assignment())))
except Exception as err:
    logging.info(err)
def poll(self):
    # consumer = KafkaConsumer(self.messageQueue, auto_offset_reset='earliest',
    #                          bootstrap_servers=['localhost:9092'], api_version=(0, 10),
    #                          consumer_timeout_ms=1000)
    # consumer = KafkaConsumer(self.messageQueue, auto_offset_reset='earliest',
    #                          bootstrap_servers=['bay15:9092'], consumer_timeout_ms=self.consumerTimeoutMS)
    consumer = KafkaConsumer(auto_offset_reset='earliest',
                             bootstrap_servers=['server8:9092'],
                             consumer_timeout_ms=self.consumerTimeoutMS)
    A = [TopicPartition(self.messageQueue, self.parititonToMonitor)]
    consumer.assign(A)
    partitions = consumer.assignment()

    if self.verbose:
        idx = 0
        for msg in consumer:
            idx += 1
            print("\t idx: %d len(msg): %d msg: --%s-- " % (idx, len(msg.value), msg.value))
            break

    # consumer.config['group_id'] = self.actionName
    partitions = consumer.assignment()
    print("partitions: %s " % (str(partitions)))
    print(" Before while loop -- Offsets: begin: %s end: %s "
          % (consumer.beginning_offsets(partitions), consumer.end_offsets(partitions)))

    lastShippedOffset = list(consumer.beginning_offsets(partitions).values())[0]
    curEndOffset = list(consumer.end_offsets(partitions).values())[0]
    if self.launchType == "realtime":
        if curEndOffset > 3 * self.batchSize:
            lastShippedOffset = curEndOffset - 3 * self.batchSize  # Can get the cold start out of our way.
        else:
            lastShippedOffset = curEndOffset - 1  # Assuming the wrapper is run before the producerRTA script
    elif self.launchType == "fixedInterval":
        lastShippedOffset = 0
        lastShippedOffset = list(consumer.beginning_offsets(partitions).values())[0]
    print("\t Last shipped offset: %d " % (lastShippedOffset))
    # sys.exit()
    consumer.seek(TopicPartition(self.messageQueue, self.parititonToMonitor), lastShippedOffset + 1)

    # lastShippedOffset = 0
    idx = 0
    toSendJson = []
    while True:
        idx += 1
        curEndOffset = list(consumer.end_offsets(partitions).values())[0]
        if (curEndOffset - lastShippedOffset) >= self.batchSize:
            if self.issuesNumInvocations % 5 == 0:
                print("\t Yaay! found something to ship yo, cos curEndOffset: %d lastShippedOffset: %d "
                      % (curEndOffset, lastShippedOffset))
            getRecords = consumer.poll(timeout_ms=self.consumerTimeoutMS, max_records=self.batchSize)
            # for key, value in getRecords.items():  # should change this to an if condition.
            if len(getRecords) > 0:
                # print("\t getRecords: %s len(getRecords): %d " % (getRecords, len(getRecords)))
                key = list(getRecords.keys())[0]
                value = list(getRecords.values())[0]
                # print("\t key: --%s-- len(value): %d " % (key, len(value)))
                # consumer.commit(offsets=self.getOffsetList(value))
                print("\t len(toSendJson): %d value[0].offset: %d " % (len(toSendJson), value[0].offset))
                for curMsg in value:
                    curDict = {}
                    curDict["topic"] = str(curMsg.topic)
                    curDict["partition"] = str(curMsg.partition)
                    curDict["offset"] = str(curMsg.offset)
                    curDict["key"] = str(curMsg.key)
                    curDict["value"] = str(curMsg.value)
                    toSendJson.append(curDict)
                print("\t len(toSendJson): %d curMsg.offset: %d topic: %s partition: %d "
                      % (len(toSendJson), curMsg.offset, curMsg.topic, curMsg.partition))

                # time wsk action invoke --result pathHello --param name World -i
                # response = requests.post(self.triggerURL, json=payload, auth=self.authHandler,
                #                          timeout=10.0, verify=check_ssl)
                startingOffset = lastShippedOffset
                if len(toSendJson) >= self.batchSize:
                    # print("\t url --> %s " % (self.actionUrl))
                    allResponses = []
                    curRespStart = 0
                    remainingLen = len(toSendJson)
                    while remainingLen > 0:
                        payload = {}
                        if remainingLen < (1.5 * self.batchSize):
                            curDispatchSize = remainingLen
                        else:
                            curDispatchSize = self.batchSize
                            curDispatchSize = remainingLen
                        payload["params"] = toSendJson[curRespStart:curDispatchSize - 1]
                        for curReq in range(self.numReqsPerIter):
                            if self.issuesNumInvocations % self.printOffset == 0:
                                print("\t #reqs-issued: %d curReq: %s " % (self.issuesNumInvocations, curReq))
                            response = requests.post(self.actionUrl, auth=self.authHandler,
                                                     json=payload, timeout=10.0, verify=False)
                            allResponses.append([response.status_code, curDispatchSize, response,
                                                 datetime.datetime.now()])
                            time.sleep(self.btwLaunchSleep)
                        if self.issuesNumInvocations < 2:
                            time.sleep(7.5)
                        else:
                            time.sleep(self.btwLaunchSleep)
                        curRespStart += curDispatchSize
                        remainingLen -= curDispatchSize
                        # print("\t curDispatchSize: %d remainingLen: %d " % (curDispatchSize, remainingLen))

                    curRespStart = 0
                    for batchIdx, curRespSet in enumerate(allResponses):
                        # print("\t curRespSet: %s " % (curRespSet))
                        curRespCode = curRespSet[0]
                        curDispatchSize = curRespSet[1]
                        curResp = curRespSet[2]
                        issuedTS = curRespSet[3]
                        # print("\t response status_code: %s curDispatchSize: %s " % (curRespCode, curDispatchSize))
                        if curRespCode in range(200, 300):
                            response_json = curResp.json()
                            if 'activationId' in response_json and response_json['activationId'] is not None:
                                if self.issuesNumInvocations % self.printOffset == 0:
                                    print("[{}] Fired trigger with activationID {}".format(
                                        self.actionName, response_json['activationId']))
                                self.allActivationsInfo.append([response_json['activationId'], issuedTS])
                                # if self.issuesNumInvocations < 5:
                                #     time.sleep(7.5)
                                # else:
                                #     time.sleep(self.btwLaunchSleep)
                            else:
                                print("[{}] Successfully fired trigger".format(self.actionName))
                                # print("\t Response json: --%s-- " % (str(response_json)))
                        if batchIdx % self.numReqsPerIter == 0:
                            self.issuesNumInvocations += 1
                            idxOffset = curRespStart + curDispatchSize - 1
                            # if self.verbose:
                            print("\t idxOffset: %d toSendJson[idxOffset]\t[topic]: %s\t [offset]: %s "
                                  % (idxOffset, str(toSendJson[idxOffset]["topic"]),
                                     str(toSendJson[idxOffset]["offset"])))
                            # Assuming it's safe to seek up to the point we have successfully processed.
                            # If not all actions are successful, we might end up rereading from the queue.
                            # This is fault tolerant, but not a performant design.
                            lastShippedOffset = int(toSendJson[idxOffset]["offset"])
                            curRespStart += curDispatchSize
                            if self.issuesNumInvocations % self.printOffset == 0:
                                print("\t idxOffset: %d curRespStart: %d curDispatchSize: %d "
                                      % (idxOffset, curRespStart, curDispatchSize))
                    # while seeking, keep the +1: we want the record after the one already processed.
                    consumer.seek(TopicPartition(curMsg.topic, curMsg.partition), lastShippedOffset + 1)
                    self.sendNumMessages += (lastShippedOffset - startingOffset)
                    # lastShippedOffset is not adjusted, so no +1 is needed for counting.
                    # lastShippedOffset = curMsg.offset
                    if self.verbose:
                        print("\t Done with committing.. lastShippedOffset: %d self.issuesNumInvocations: %d "
                              "self.sendNumMessages: %d "
                              % (lastShippedOffset, self.issuesNumInvocations, self.sendNumMessages))
                    toSendJson = []  # All read records are processed; clear the buffer.
                else:
                    print("\t idx: %d curEndOffset: %d lastShippedOffset: %d "
                          % (idx, curEndOffset, lastShippedOffset))
                    time.sleep(self.pollingPeriod)
            else:
                print("\t len(getRecords): %d " % (len(getRecords)))
        else:
            print("\t idx: %d curEndOffset: %d lastShippedOffset: %d self.sendNumMessages: %d "
                  % (idx, curEndOffset, lastShippedOffset, self.sendNumMessages))
            time.sleep(self.pollingPeriod)
        if (self.issuesNumInvocations >= self.maxNumInvocations) or (self.sendNumMessages >= self.maxMessages):
            break

    print("\t End, self.issuesNumInvocations: %d idx: %d self.sendNumMessages: %d"
          % (self.issuesNumInvocations, idx, self.sendNumMessages))
    self.publishActivationInfo()
    if consumer is not None:
        consumer.close()
class KafkaCache(NicosCacheReader):
    _consumer = None
    _topic = ""

    def __init__(self, **kwargs):
        brokers = kwargs['brokers']
        topic = kwargs['topics']
        if not isinstance(brokers, list):
            brokers = [brokers]
        if not isinstance(topic, str):
            raise TypeError('topic must be a string')
        self._connect(brokers)
        self._assign(topic)
        self._initial_db()

    def _connect(self, brokers):
        self._consumer = KafkaConsumer(bootstrap_servers=brokers,
                                       auto_offset_reset='earliest')

    def _assign(self, topic):
        consumer = self._consumer
        alltopics = consumer.topics()
        if topic not in alltopics:
            raise ValueError('topic: %s is not present' % topic)
        partitions = consumer.partitions_for_topic(topic)
        consumer.assign(
            [TopicPartition(topic, partition) for partition in partitions])
        self._topic = topic

    def _initial_db(self):
        consumer = self._consumer
        assignment = consumer.assignment()
        end = consumer.end_offsets(list(assignment))
        for partition in assignment:
            while consumer.position(partition) < end[partition]:
                message = next(consumer)
                key = message.key.decode().split('/')
                if self._message_is_interesting(key):
                    self._update_db(key)

    def _log(self):
        consumer = self._consumer
        assignment = consumer.assignment()
        beginning = self._consumer.beginning_offsets(list(assignment))
        end = self._consumer.end_offsets(list(assignment))
        print('beginning: %r\tend: %r' % (beginning, end))
        for partition in assignment:
            print('> partition %r: offset: %d' % (partition, consumer.position(partition)))

    def disconnect(self):
        self._consumer.unsubscribe()
        self._consumer.close()

    def run(self):
        consumer = self._consumer
        while not self._stop:
            message = next(consumer)
            key = message.key.decode().split('/')
            if self._message_is_interesting(key):
                self._update_db(key)
class PythonKafkaReader(KafkaReader):

    def __init__(self, kafka_hosts):
        self.config = {
            "bootstrap_servers": kafka_hosts,
            "client_id": "KsnapClient",
            "max_poll_interval_ms": 10000,
            "auto_offset_reset": "earliest",
            "enable_auto_commit": False,
        }
        self.consumer = KafkaConsumer(**self.config)
        self.topics: List[str] = []

    @staticmethod
    def _check_reach_offsets(msg: ConsumerRecord, offset_dict):
        if (msg.topic, msg.partition) not in offset_dict:
            return True
        return offset_dict[(msg.topic, msg.partition)] <= msg.offset

    def list_topics(self) -> Set[str]:
        return self.consumer.topics()

    def subscribe(self, topics: List[str]):
        # TODO: consider having add_topics as methods
        self.topics = topics
        self.consumer.subscribe(topics)

    def _get_latest_offsets(self) -> Dict[Tuple[str, int], int]:
        tps: List[TopicPartition] = []
        for t in self.consumer.topics():
            if t not in self.topics:
                continue
            partitions = self.consumer.partitions_for_topic(t)
            for p in partitions:
                tps.append(TopicPartition(t, p))
        d = {}
        low_offset_dict = self.consumer.beginning_offsets(tps)
        high_offset_dict = self.consumer.end_offsets(tps)
        for tp in tps:
            low = low_offset_dict.get(tp)
            high = high_offset_dict.get(tp)
            if high is None:
                logger.debug(tp)
                continue
            if low == high:
                logger.info(f'No messages in topic: {tp.topic} '
                            f'partition: {tp.partition}')
                continue
            # high watermark is latest offset + 1
            d[(tp.topic, tp.partition)] = high - 1
            logger.debug(f'Latest offset for topic: {tp.topic} '
                         f'partition: {tp.partition}: {high - 1}')
        return d

    def read(self, timeout: int = 0) -> Dict[Tuple[str, int], List[Message]]:
        msg_count = 0
        offset_dict = self._get_latest_offsets()
        done_partitions: Set[Tuple[str, int]] = set()
        msg_dict: Dict[Tuple[str, int], List[Any]] = defaultdict(list)
        try:
            start_time = datetime.now()
            while True:
                # break if timeout is reached
                if PythonKafkaReader._check_timeout(timeout, start_time):
                    logger.info(f'Reached timeout: {timeout}s for reading messages.')
                    break
                # break if all partitions are marked as done
                if len(done_partitions) == len(offset_dict):
                    logger.info('Done consuming from '
                                f'{len(done_partitions)} partitions.')
                    break
                msg: ConsumerRecord = next(self.consumer)
                if msg is None:
                    continue
                # skip if the partition is already marked as done
                if (msg.topic, msg.partition) in done_partitions:
                    continue
                # pause a partition once its messages reach the required offset
                if PythonKafkaReader._check_reach_offsets(msg, offset_dict):
                    logger.info(f'Done consuming from topic: '
                                f'{msg.topic} partition: '
                                f'{msg.partition}')
                    self.consumer.pause(TopicPartition(msg.topic, msg.partition))
                    done_partitions.add((msg.topic, msg.partition))
                message = Message(msg.offset, msg.key, msg.value, msg.timestamp, msg.headers)
                msg_dict[(msg.topic, msg.partition)].append(message)
                msg_count += 1
                if not msg_count % 100000:
                    logger.debug(f"So far read {msg_count} messages from kafka")
        except KeyboardInterrupt:
            logger.info("%% Aborted by user\n")
        finally:
            self.close()
        logger.info("Done with reading")
        PythonKafkaReader.generate_consumer_report(offset_dict, msg_dict, done_partitions)
        return msg_dict

    def close(self):
        self.consumer.close(autocommit=False)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from kafka import KafkaConsumer

servers = ['192.168.5.110:9092']
consumer = KafkaConsumer('test', bootstrap_servers=servers)

print(consumer.partitions_for_topic('test'))
print(consumer.topics())
print(consumer.subscription())
print(consumer.assignment())
print(consumer.beginning_offsets(consumer.assignment()))

for msg in consumer:
    print(msg.value)

consumer.close()
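# A caveat worth noting about the snippet above: in kafka-python, a consumer
# that subscribes via the constructor only gets partitions assigned during a
# poll, so consumer.assignment() is empty right after construction and
# beginning_offsets(consumer.assignment()) returns {}. A minimal sketch of a
# workaround, reusing the snippet's broker and topic names:
from kafka import KafkaConsumer

consumer = KafkaConsumer('test', bootstrap_servers=['192.168.5.110:9092'])
consumer.poll(timeout_ms=1000)  # join the group and receive an assignment
print(consumer.assignment())    # now non-empty
print(consumer.beginning_offsets(consumer.assignment()))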
class Consumer:
    """
    Wraps the kafka-python KafkaConsumer with our own methods.
    """

    def __init__(self, group_id: str = None):
        self.group_id = group_id

    def __enter__(self):
        self.cfg = Config().cfg
        self.consumer = KafkaConsumer(
            bootstrap_servers=self.cfg["serList"],
            # api_version=self.cfg["apiVersion"],
            api_version_auto_timeout_ms=self.cfg["autoVersionTimeout"],
            security_protocol=self.cfg["protocol"],
            sasl_mechanism=self.cfg["mechanism"],
            sasl_kerberos_service_name=self.cfg["kerverosSerName"],
            group_id=self.group_id,
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.consumer.close()

    def assign(self, partitions: list):
        """
        Manually assign a list of topic partitions to this consumer.
        :param partitions: partitions to assign, as [(topic, partition)]
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        try:
            result = self.consumer.assign(_partitions)
        except IllegalStateError:
            log.tag_error(KafkaInfo.KafkaConsumer,
                          "Manually consumer TopicPartitions error, "
                          "Topic Consumer is being in used")
            raise ActionError(KafkaErr.ConsumerInUsed)
        return result

    def assignment(self):
        """
        Get the topic partitions assigned to this consumer:
        with manual assign(), returns the manual assignment directly;
        with subscribe(), returns None (before any subscription) or the
        set of topic partitions.
        :return:
        """
        return self.consumer.assignment()

    def beginning_offsets(self, partitions: list):
        """
        Get the first offset of the given partitions; does not change the
        current partition offsets.
        :param partitions: topic partitions, as [(topic, partition)]
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        try:
            result = self.consumer.beginning_offsets(_partitions)
        except (UnsupportedVersionError, KafkaTimeoutError) as e:
            if e.__class__ == UnsupportedVersionError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "API VERSION ERROR, DO NOT SUPPORT")
                raise ActionError(KafkaErr.NotSupport)
            else:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Get beginning offset failed, Time out")
                raise ActionError(KafkaErr.GetOffsetFailed)
        return result

    def end_offsets(self, partitions: list):
        """
        Get the end offset of the given partitions.
        :param partitions: topic partitions, as [(topic, partition)]
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        try:
            result = self.consumer.end_offsets(_partitions)
        except (UnsupportedVersionError, KafkaTimeoutError) as e:
            if e.__class__ == UnsupportedVersionError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "API VERSION ERROR, DO NOT SUPPORT")
                raise ActionError(KafkaErr.NotSupport)
            else:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Get end offset failed, Time out")
                raise ActionError(KafkaErr.GetOffsetFailed)
        return result

    def offsets_for_time(self, partitions_time: list, timestamp: int = -1):
        """
        Find the earliest offset in each partition at or after the given time.
        :param partitions_time: list of (topic, partition) if timestamp > 0,
                                list of (topic, partition, timestamp) if timestamp == -1
        :param timestamp: the lookup time; -1 means each partition carries its own timestamp
        :return:
        """
        if timestamp > 0:
            _partitions = {
                TopicPartition(_tuple[0], _tuple[1]): timestamp
                for _tuple in partitions_time
            }
        else:
            _partitions = {
                TopicPartition(_tuple[0], _tuple[1]): _tuple[2]
                for _tuple in partitions_time
            }
        try:
            result = self.consumer.offsets_for_times(_partitions)
        except (UnsupportedVersionError, ValueError, KafkaTimeoutError) as e:
            if e.__class__ == UnsupportedVersionError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "API VERSION ERROR, DO NOT SUPPORT")
                raise ActionError(KafkaErr.NotSupport)
            if e.__class__ == ValueError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Value Error: Target Timestamp is negative")
            else:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Get offset by timestamp failed, Time out")
            raise ActionError(KafkaErr.GetOffsetFailed)
        return result

    def highwater(self, topic: str, partition: int):
        """
        The highwater offset is the offset that will be assigned to the next
        message produced to the partition (usually compared against the
        reported position to compute lag).
        :param topic:
        :param partition:
        :return:
        """
        result = self.consumer.highwater(TopicPartition(topic, partition))
        return result

    def commit(self, partition_offset: tuple, async_commit: bool = False):
        """
        Commit offsets to Kafka, blocking until success or error.
        Requires a non-None group id.
        :param partition_offset: (topic, partition, offset)
        :param async_commit: choose async commit
        :return:
        """
        topic = partition_offset[0]
        partition = partition_offset[1]
        _offset = partition_offset[2]
        offset = {
            TopicPartition(topic, partition): OffsetAndMetadata(_offset, None)
        }
        if not async_commit:
            self.consumer.commit(offset)
        else:
            self.consumer.commit_async(offset).add_errback(self.commit_err,
                                                           topic=topic,
                                                           partition=partition,
                                                           offset=_offset)

    def committed(self, topic: str, partition: int):
        """
        Get the last committed offset of the given topic partition
        (pairs with commit).
        :param topic:
        :param partition:
        :return:
        """
        _partition = TopicPartition(topic, partition)
        result = self.consumer.committed(_partition)
        return result

    def metrics(self):
        """
        Get the consumer's performance metrics (including per-broker stats).
        :return:
        """
        performance = self.consumer.metrics()
        return performance

    def partition_for_topic(self, topic_name: str):
        """
        Look up partition metadata for the given topic.
        :param topic_name:
        :return:
        """
        result = self.consumer.partitions_for_topic(topic_name)
        return result

    def available_partitions_for_topic(self, topic_name: str):
        """
        Look up the available partitions of the given topic.
        :param topic_name:
        :return:
        """
        result = self.consumer.available_partitions_for_topic(topic_name)
        return result

    def pause(self, partitions: list):
        """
        Suspend fetching from the given partitions (if the request fails,
        some topic partitions may already be paused).
        :param partitions: TopicPartitions to pause, for example:
                           [(topic1, partition1), (topic2, partition2)]
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        try:
            self.consumer.pause(*_partitions)
        except Exception:
            log.tag_error(KafkaInfo.KafkaConsumer,
                          "Pause TopicPartition error, TopicPartition not exist")
            raise ActionError(KafkaErr.TopicPartitionNotExist)

    def get_paused(self):
        """
        Get the partitions currently suspended with pause().
        :return:
        """
        return self.consumer.paused()

    def resume(self, partitions: list):
        """
        Resume partitions suspended with pause().
        :param partitions:
        :return:
        """
        _partitions = [TopicPartition(_par[0], _par[1]) for _par in partitions]
        self.consumer.resume(*_partitions)

    def seek(self, partition: tuple, offset: int):
        """
        Change the offset of a TopicPartition, typically before poll().
        :param partition: the TopicPartition, as (topic, partition)
        :param offset: the new offset, >= 0
        :return:
        """
        _partition = TopicPartition(partition[0], partition[1])
        self.consumer.seek(_partition, offset)

    def seek_many(self, partitions: list = None, is_begin: bool = True):
        """
        Seek in bulk.
        :param partitions: TopicPartitions, as [(topic, partition), ...];
                           when None, defaults to the assigned partitions
        :param is_begin: True seeks to the first available offset,
                         False seeks to the end offset
        :return:
        """
        if partitions is not None:
            _partitions = [
                TopicPartition(_par[0], _par[1]) for _par in partitions
            ]
        else:
            _partitions = []
        if is_begin:
            self.consumer.seek_to_beginning(*_partitions)
        else:
            self.consumer.seek_to_end(*_partitions)

    def poll(self, timeout_ms=0, max_records=1000):
        """
        Fetch records from the assigned partitions. Resumes from the previous
        offset automatically; use seek() to set the position manually.
        Partitions suspended with pause() return no records.
        :param timeout_ms:
        :param max_records:
        :return:
        """
        result = self.consumer.poll(timeout_ms, max_records)
        return result

    def position(self, partition: tuple):
        """
        Get the offset of the next record for the given partition.
        :param partition:
        :return:
        """
        _partition = TopicPartition(partition[0], partition[1])
        result = self.consumer.position(_partition)
        return result

    def subscribe(self, topic: list, pattern: str = None):
        """
        Subscribe to a list of topics.
        :param topic: list of topics
        :param pattern:
        :return:
        """
        try:
            self.consumer.subscribe(topic, pattern)
        except (IllegalStateError, AssertionError, TypeError) as e:
            if e.__class__ == IllegalStateError:
                log.tag_error(KafkaInfo.KafkaConsumer,
                              "Subscribe topic error, %s" % str(e))
            log.tag_error(KafkaInfo.KafkaConsumer,
                          "Subscribe topic error, Parameter Error")
            raise ActionError(KafkaErr.ParameterError)

    def unsubscribe(self):
        """
        Unsubscribe from all topics and clear the partition assignment.
        :return:
        """
        self.consumer.unsubscribe()

    def subscription(self):
        """
        Get the current subscription.
        :return:
        """
        result = self.consumer.subscription()
        return result

    def get_topics(self):
        """
        Get the topics visible to the user.
        :return:
        """
        result = self.consumer.topics()
        return result

    @staticmethod
    def commit_err(topic: str, partition: int, offset: int):
        """
        Callback for a failed consumer offset commit.
        :param topic:
        :param partition:
        :param offset:
        :return:
        """
        log.tag_error(KafkaInfo.KafkaConsumer,
                      "Kafka Consumer commit offset failed, "
                      "{TopicPartition(%s, %s): %s}" % (topic, partition, offset))
        raise ActionError(KafkaErr.CommitOffsetFailed)

    @staticmethod
    def get_topic_partition(topic: str, partition: int):
        """
        Build a TopicPartition, so callers don't have to handle
        TopicPartition themselves.
        :param topic:
        :param partition:
        :return:
        """
        return TopicPartition(topic, partition)
# Module-level dependencies (bytes_serializer, ProgressPercentage and
# logger are helpers defined elsewhere in the original module):
import json
import time
from math import ceil
from os import makedirs, path, remove
from time import sleep
from uuid import uuid4

import boto3
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from kafka import KafkaConsumer, KafkaProducer
from kafka.structs import OffsetAndMetadata, TopicPartition


class KafkaClient(object):

    def __init__(self, bootstrap_servers, topic, group_id=None):
        if group_id is not None:
            self.group_id = group_id
            self.allow_hotreload = True
        else:
            self.group_id = 'kafka_topic_dumper_{}'.format(uuid4())
            self.allow_hotreload = False
        self.bootstrap_servers = bootstrap_servers.split(",")
        self.topic = topic
        self.consumer = None
        self.producer = None
        self.timeout_in_sec = 60
        self.dump_state_topic = 'kafka-topic-dumper'
        self.s3_path = 'kafka-topic-dumper-data/'
        self.s3_client = None

    def _get_consumer(self):
        if self.consumer is not None:
            return
        try:
            logger.info('Starting consumer')
            self.consumer = KafkaConsumer(
                bootstrap_servers=self.bootstrap_servers,
                group_id=self.group_id,
                enable_auto_commit=True)
        except Exception as err:
            msg = 'Cannot create KafkaConsumer instance. Reason=<{}>'
            logger.exception(msg.format(err))
            raise err

    def _get_s3_client(self):
        if self.s3_client is None:
            self.s3_client = boto3.client('s3')
        return self.s3_client

    def _get_producer(self):
        if self.producer is not None:
            return
        try:
            logger.info('Starting producer')
            self.producer = KafkaProducer(
                bootstrap_servers=self.bootstrap_servers,
                key_serializer=bytes_serializer,
                value_serializer=bytes_serializer)
        except Exception as err:
            msg = 'Cannot create KafkaProducer instance. Reason=<{}>'
            logger.exception(msg.format(err))
            raise err

    def open(self):
        self._get_consumer()
        self._get_producer()

    def _close_consumer(self):
        logger.info("Closing consumer")
        self.consumer.close()
        self.consumer = None

    def _close_producer(self):
        logger.info("Closing producer")
        self.producer.flush()
        logger.debug('Statistics {}'.format(self.producer.metrics()))
        self.producer.close()
        self.producer = None

    def close(self):
        self._close_consumer()
        self._close_producer()

    def _get_partitions(self, topic):
        partitions = self.consumer.partitions_for_topic(topic) or []
        # Bounded retry: partition metadata may not be available until a
        # subscription round-trip has completed.
        count = 0
        while not partitions and count < 500000:
            self.consumer.subscribe(topic)
            partitions = self.consumer.partitions_for_topic(topic) or []
            count += 1  # bug fix: the counter was never incremented
            sleep(0.1)
        msg = "Got the following partitions=<{}> for topic=<{}>"
        logger.info(msg.format(partitions, topic))
        topic_partitions = list(
            map(lambda p: TopicPartition(topic, p), partitions))
        msg = "Got the following topic partitions=<{}>"
        logger.info(msg.format(topic_partitions))
        return topic_partitions

    def _get_offsets(self, topic=None):
        if topic is None:
            topic = self.topic
        topic_partitions = self._get_partitions(topic=topic)
        beginning_offsets = (
            self.consumer.beginning_offsets(topic_partitions) or {})
        msg = "Got the following beginning offsets=<{}>"
        logger.info(msg.format(beginning_offsets))
        committed_offsets = {}
        msg = "Partition=<{}> has the current offset=<{}> for <{}>"
        for tp in topic_partitions:
            offset = self.consumer.committed(tp)
            committed_offsets[tp] = offset
            logger.debug(msg.format(tp, offset, self.group_id))
        end_offsets = self.consumer.end_offsets(topic_partitions) or {}
        msg = "Got the following end offsets=<{}>"
        logger.info(msg.format(end_offsets))
        return beginning_offsets, committed_offsets, end_offsets

    def _calculate_offsets(self, beginning_offsets, end_offsets,
                           num_messages_to_consume):
        # Spread the requested message count evenly across partitions, but
        # never start before a partition's beginning offset.
        perfect_displacement = ceil(num_messages_to_consume /
                                    max(len(beginning_offsets), 1))
        offsets = {}
        num_messages_available = 0
        for tp, offset in beginning_offsets.items():
            offsets[tp] = max(beginning_offsets[tp],
                              end_offsets[tp] - perfect_displacement)
            num_messages_available += end_offsets[tp] - offsets[tp]
        return offsets, num_messages_available

    def _set_offsets(self, offsets):
        offset_and_metadata = {
            tp: OffsetAndMetadata(offset, b'')
            for tp, offset in offsets.items()
        }
        msg = "Generated the following offsets=<{}>"
        logger.debug(msg.format(offset_and_metadata))
        self.consumer.commit(offset_and_metadata)

    def _get_messages(self, num_messages_to_consume):
        messages = []
        while len(messages) < num_messages_to_consume:
            record = next(self.consumer)
            line = (record.key, record.value)
            messages.append(line)
        self.consumer.commit()
        return messages

    def _write_messages_to_file(self, messages, local_path):
        df = pd.DataFrame(messages)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, local_path, compression='gzip')

    def _send_dump_file(self, local_path, bucket_name, dump_id):
        file_name = path.basename(local_path)
        s3_path = path.join(self.s3_path, dump_id, file_name)
        logger.info('Sending file <{}> to s3'.format(file_name))
        s3_client = self._get_s3_client()
        s3_client.upload_file(local_path, bucket_name, s3_path,
                              ExtraArgs={'ACL': 'private'},
                              Callback=ProgressPercentage(local_path))
        logger.debug('Deleting file <{}>'.format(file_name))
        remove(local_path)

    def _get_transformer_class(self, transformer_id):
        # transformer_id is "module:ClassName"; import the module and
        # return an instance of the class.
        [module_name, class_name] = transformer_id.split(":")
        module = __import__(module_name, globals(), locals(),
                            [class_name], 0)
        cl = getattr(module, class_name)
        return cl()

    def get_messages(self, num_messages_to_consume,
                     max_package_size_in_msgs, local_dir, bucket_name,
                     dry_run, dump_id):
        # set offsets
        msg = ('Will ask kafka for <{}> messages ' +
               'and save it in files with <{}> messages')
        logger.debug(
            msg.format(num_messages_to_consume, max_package_size_in_msgs))
        beginning_offsets, committed_offsets, end_offsets = \
            self._get_offsets()
        offsets, num_messages_available = self._calculate_offsets(
            beginning_offsets=beginning_offsets,
            end_offsets=end_offsets,
            num_messages_to_consume=num_messages_to_consume)
        self._set_offsets(offsets)
        # get messages
        self.consumer.subscribe(topics=[self.topic])
        msg = 'Trying to dump <{}> messages'
        logger.info(msg.format(num_messages_available))
        remaining_messages = num_messages_available
        num_dumped_messages = 0
        dump_dir = path.join(local_dir, dump_id)
        makedirs(dump_dir, exist_ok=True)
        logger.debug('Dump directory <{}> created'.format(dump_dir))
        while remaining_messages > 0:
            batch_size = min(remaining_messages, max_package_size_in_msgs)
            logger.debug('Fetching batch with size=<{}>'.format(batch_size))
            file_name = '{}-{:015d}.parquet'.format(dump_id,
                                                    num_dumped_messages)
            local_path = path.join(local_dir, dump_id, file_name)
            messages = self._get_messages(num_messages_to_consume=batch_size)
            self._write_messages_to_file(messages=messages,
                                         local_path=local_path)
            if not dry_run:
                self._send_dump_file(local_path=local_path,
                                     bucket_name=bucket_name,
                                     dump_id=dump_id)
            remaining_messages -= batch_size
            num_dumped_messages += batch_size
        logger.info('Dump done!')

    def find_latest_dump_id(self, bucket_name):
        paginator = self._get_s3_client().get_paginator('list_objects_v2')
        prefix = self.s3_path.rstrip('/') + '/'
        response_iterator = paginator.paginate(Bucket=bucket_name,
                                               Prefix=prefix,
                                               Delimiter='/')

        def strip(r):
            return r['Prefix'][len(prefix):].rstrip('/')

        prefixes = []
        for response in response_iterator:
            prefixes.extend(map(strip, response['CommonPrefixes']))
        dump_id = max(prefixes)
        logger.debug('Prefix chosen was <{}>'.format(dump_id))
        return dump_id

    def _get_file_names(self, bucket_name, dump_id):
        paginator = self._get_s3_client().get_paginator('list_objects_v2')
        dump_path = path.join(self.s3_path, dump_id) + '/'
        response_iterator = paginator.paginate(Bucket=bucket_name,
                                               Prefix=dump_path)
        file_names = []
        for response in response_iterator:
            if response['KeyCount'] > 0:
                file_names.extend(
                    (f['Key'], f['Size']) for f in response['Contents'])
        file_names.sort()
        if not file_names:
            msg = 'Cannot find files for dump id <{}>'
            logger.error(msg.format(dump_id))
            raise Exception('EmptyS3Response')
        return file_names

    def _gen_state(self, dump_id, transformer_id):
        _, _, end_offsets = self._get_offsets()
        if not end_offsets:
            msg = 'Cannot find offsets for topic <{}>'
            raise Exception(msg.format(self.topic))
        state_offsets = {}
        for partition, offset in end_offsets.items():
            state_offsets[partition.partition] = offset
        state = {
            'dump_id': dump_id,
            'topic_name': self.topic,
            'offsets': state_offsets,
            'dump_date': int(time.time()),
            'transformer_id': transformer_id
        }
        return state

    def _save_state(self, state):
        future = self.producer.send(topic=self.dump_state_topic,
                                    key=self.topic,
                                    value=json.dumps(state))
        future.get(timeout=self.timeout_in_sec)
        logger.info('State saved')

    def _get_last_state_message(self):
        beginning_offsets, _, end_offsets = (self._get_offsets(
            topic=self.dump_state_topic))
        if beginning_offsets:
            offsets, num_messages_available = self._calculate_offsets(
                beginning_offsets=beginning_offsets,
                end_offsets=end_offsets,
                num_messages_to_consume=1)
            self._set_offsets(offsets)
            self.consumer.subscribe(self.dump_state_topic)
            messages = [
                json.loads(m.decode())
                for k, m in self._get_messages(num_messages_available)
            ]
            if messages:
                last_state_message = max(messages,
                                         key=lambda m: m['dump_date'])
                return last_state_message
        return None

    def _get_state(self, dump_id, transformer_id):
        if self.allow_hotreload:
            state_message = self._get_last_state_message()
            if state_message and \
                    state_message['topic_name'] == self.topic and \
                    state_message['dump_id'] == dump_id and \
                    'transformer_id' in state_message and \
                    state_message['transformer_id'] == transformer_id:
                return state_message['offsets']
        return None

    def _reset_offsets(self, dump_offsets):
        logger.info('Messages already uploaded. Just resetting offsets')
        partitions = self._get_partitions(self.topic)
        offsets = {}
        for partition in partitions:
            offsets[partition] = dump_offsets[str(partition.partition)]
        logger.debug('Will reset offsets to <{}>'.format(offsets))
        self._set_offsets(offsets)

    def _load_dump(self, bucket_name, dump_id, download_dir, files,
                   transformer_instance):
        s3_client = self._get_s3_client()
        transformer_id = transformer_instance.get_id()
        state = self._gen_state(dump_id, transformer_id)
        current_file_number = 0
        msg = "Loading messages from file {}/{} to kafka"
        for file_name, file_size in files:
            current_file_number += 1
            tmp_name = '{}.tmp'.format(path.basename(file_name))
            file_path = path.join(download_dir, tmp_name)
            s3_client.download_file(Bucket=bucket_name,
                                    Filename=file_path,
                                    Key=file_name,
                                    Callback=ProgressPercentage(
                                        tmp_name, file_size))
            logger.info(msg.format(current_file_number, len(files)))
            try:
                table = pq.read_table(file_path)
                df = table.to_pandas()
                for raw_row in df.itertuples():
                    for row in transformer_instance.transform(raw_row):
                        self.producer.send(self.topic, key=row[1],
                                           value=row[2])
                logger.debug('File <{}> reloaded to kafka'.format(file_path))
                self.producer.flush(self.timeout_in_sec)
            finally:
                remove(file_path)
        self._save_state(state)

    def reload_kafka_server(self, bucket_name, local_dir, dump_id,
                            transformer_class):
        transformer_instance = self._get_transformer_class(transformer_class)
        msg = 'Using class=<{}> to transform events before production'
        logger.info(msg.format(type(transformer_instance)))
        transformer_id = transformer_instance.get_id()
        dump_offsets = self._get_state(dump_id, transformer_id)
        if dump_offsets:
            self._reset_offsets(dump_offsets=dump_offsets)
        else:
            files = self._get_file_names(bucket_name=bucket_name,
                                         dump_id=dump_id)
            self._load_dump(bucket_name=bucket_name,
                            dump_id=dump_id,
                            download_dir=local_dir,
                            files=files,
                            transformer_instance=transformer_instance)
        logger.info('Reload done!')

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
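# ====== usage sketch (not part of the original module) ======
# A minimal, hedged example of driving the KafkaClient dumper above; the
# import path, broker address, bucket name, and dump id are placeholder
# assumptions, not part of the original source.
#
# from kafka_topic_dumper import KafkaClient  # hypothetical import path
#
# with KafkaClient(bootstrap_servers='localhost:9092',
#                  topic='my-topic', group_id='dumper-group') as client:
#     # Dump roughly the last 10000 messages into parquet files of at
#     # most 1000 messages each, uploading each file to S3.
#     client.get_messages(num_messages_to_consume=10000,
#                         max_package_size_in_msgs=1000,
#                         local_dir='/tmp',
#                         bucket_name='my-dump-bucket',
#                         dry_run=False,
#                         dump_id='20200101-full')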
# Depends on the hosting framework's GenericTool module and a module-level
# __TYPE__ constant; kafka-python provides the client classes.
import threading

from kafka import KafkaConsumer, KafkaProducer
from kafka.errors import KafkaError


class Kafka(GenericTool.Tool):

    def __init__(self, controllerIp, controllerPort, toolName, toolDesc,
                 defaultTool, supportProxy=0, proxyIp=None, proxyPort=None,
                 sslSupport=True):
        """
        Kafka agent

        @param controllerIp: controller ip/host
        @type controllerIp: string

        @param controllerPort: controller port
        @type controllerPort: integer

        @param toolName: agent name
        @type toolName: string

        @param toolDesc: agent description
        @type toolDesc: string

        @param defaultTool: True if the agent is started by the server,
        False otherwise
        @type defaultTool: boolean
        """
        GenericTool.Tool.__init__(self, controllerIp, controllerPort,
                                  toolName, toolDesc, defaultTool,
                                  supportProxy=supportProxy,
                                  proxyIp=proxyIp, proxyPort=proxyPort,
                                  sslSupport=sslSupport)
        self.__type__ = __TYPE__
        self.__mutex__ = threading.RLock()

    def getType(self):
        """
        Returns agent type

        @return: agent type
        @rtype: string
        """
        return self.__type__

    def onCleanup(self):
        """
        Cleanup all
        In this function, you can stop your program
        """
        pass

    def initAfterRegistration(self):
        """
        Called on successful registration
        In this function, you can start your program automatically.
        """
        self.onToolLogWarningCalled("Starting Kafka agent")
        self.onToolLogWarningCalled("Kafka agent started")
        self.onPluginStarted()

    def pluginStarting(self):
        """
        Function to reimplement
        """
        pass

    def onPluginStarted(self):
        """
        Function to reimplement
        """
        pass

    def pluginStopped(self):
        """
        Function to reimplement
        """
        pass

    def onResetAgentCalled(self):
        """
        Function to reimplement
        """
        pass

    def onToolLogWarningCalled(self, msg):
        """
        Logs warning on main application

        @param msg: warning message
        @type msg: string
        """
        pass

    def onToolLogErrorCalled(self, msg):
        """
        Logs error on main application

        @param msg: error message
        @type msg: string
        """
        pass

    def onToolLogSuccessCalled(self, msg):
        """
        Logs success on main application

        @param msg: success message
        @type msg: string
        """
        pass

    def onAgentAlive(self, client, tid, request):
        """
        Called on keepalive received from test server
        {'task-id': 'xx', 'from': 'tester', 'destination-agent': 'xxxxx',
        'source-adapter': 'xx', 'script-name': 'xxxx', 'script_id': 'xxx',
        'data': 'xxx', 'event': 'agent-init', 'test-id': 'xxx'}

        @param client: server address ip/port
        @type client: tuple

        @param tid: transaction id
        @type tid: integer

        @param request: request received from the server
        @type request: dict
        """
        pass

    def onAgentInit(self, client, tid, request):
        """
        Called on init received from test server
        {'task-id': 'xx', 'from': 'tester', 'destination-agent': 'xxxxx',
        'source-adapter': 'xx', 'script-name': 'xxxx', 'script_id': 'xxx',
        'data': 'xxx', 'event': 'agent-init', 'test-id': 'xxx'}

        @param client: server address ip/port
        @type client: tuple

        @param tid: transaction id
        @type tid: integer

        @param request: request received from the server
        @type request: dict
        """
        self.onToolLogWarningCalled(msg="init called: %s" % request['data'])
        self.sendNotify(request=request, data="notify sent")

    def onAgentReset(self, client, tid, request):
        """
        Called on reset received from test server
        {'task-id': 'xx', 'from': 'tester', 'destination-agent': 'xxxxx',
        'source-adapter': 'xx', 'script-name': 'xxxx', 'script_id': 'xxx',
        'data': 'xxx', 'event': 'agent-init', 'test-id': 'xxx'}
        or
        {'event': 'agent-reset', 'source-adapter': '1', 'script_id': '7_3_0'}

        @param client: server address ip/port
        @type client: tuple

        @param tid: transaction id
        @type tid: integer

        @param request: request received from the server
        @type request: dict
        """
        if 'data' in request:
            self.onToolLogWarningCalled(msg="reset called: %s" % request['data'])
        else:
            self.onToolLogWarningCalled(msg="reset called")

    def onAgentNotify(self, client, tid, request):
        """
        Called on notify received from test server and dispatch it
        {'task-id': 'xx', 'from': 'tester', 'destination-agent': 'xxxxx',
        'source-adapter': 'xx', 'script-name': 'xxxx', 'script_id': 'xxx',
        'data': 'xxx', 'event': 'agent-init', 'test-id': 'xxx'}

        @param client: server address ip/port
        @type client: tuple

        @param tid: transaction id
        @type tid: integer

        @param request: request received from the server
        @type request: dict
        """
        self.__mutex__.acquire()
        self.onToolLogWarningCalled(msg="notify received: %s" % request['data'])
        if request['uuid'] in self.context():
            if request['source-adapter'] in self.context()[request['uuid']]:
                self.execAction(request)
            else:
                self.error("Adapter context does not exist TestUuid=%s AdapterId=%s" % (
                    request['uuid'], request['source-adapter']))
        else:
            self.error("Test context does not exist TestUuid=%s" % request['uuid'])
        self.__mutex__.release()

    def execAction(self, request):
        """
        Execute action
        """
        self.onToolLogWarningCalled(
            "<< Starting Command=%s TestId=%s AdapterId=%s" % (
                request['data']['cmd'], request['script_id'],
                request['source-adapter']))
        try:
            cmd = request['data']['cmd']
            data = request['data']

            # producer commands
            if cmd == 'producer_connect':
                kargs = data['kargs']
                try:
                    self.producer = KafkaProducer(
                        bootstrap_servers=data['bootstrap_servers'], **kargs)
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'connected'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'producer_send':
                kargs = data['kargs']
                try:
                    future = self.producer.send(data['topic'], **kargs)
                    record_metadata = future.get(timeout=data['timeout'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': record_metadata})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'producer_flush':
                try:
                    self.producer.flush(data['timeout'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'flushed'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'producer_partitions_for':
                try:
                    partitions = self.producer.partitions_for(data['topic'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': partitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'producer_close':
                try:
                    self.producer.close(int(data['timeout']))
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'closed'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            # consumer commands
            elif cmd == 'consumer_connect':
                kargs = data['kargs']
                try:
                    if not data['topics']:
                        self.consumer = KafkaConsumer(
                            bootstrap_servers=data['bootstrap_servers'], **kargs)
                    else:
                        self.consumer = KafkaConsumer(
                            data['topics'][0],
                            bootstrap_servers=data['bootstrap_servers'], **kargs)
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'connected'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_consume':
                try:
                    for msg in self.consumer:
                        self.sendNotify(request=request,
                                        data={"cmd": cmd, 'result': msg})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_close':
                try:
                    self.consumer.close(data['autocommit'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'closed'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_assign':
                try:
                    self.consumer.assign(data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'assigned'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_assignment':
                try:
                    topicpartitions = self.consumer.assignment()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd,
                                          'topicpartitions': topicpartitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_beginning_offsets':
                try:
                    offsets = self.consumer.beginning_offsets(data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offsets': offsets})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_commit':
                try:
                    self.consumer.commit(data['offsets'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'committed'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_commit_async':
                try:
                    future = self.consumer.commit_async(
                        offsets=data['offsets'], callback=data['callback'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'future': future})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_committed':
                try:
                    offsets = self.consumer.committed(data['topicpartition'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offsets': offsets})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_end_offsets':
                try:
                    partitions = self.consumer.end_offsets(data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'partitions': partitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_highwater':
                try:
                    offset = self.consumer.highwater(data['partition'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offset': offset})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_offsets_for_times':
                try:
                    offsets = self.consumer.offsets_for_times(data['timestamps'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offsets': offsets})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_partitions_for_topic':
                try:
                    partitions = self.consumer.partitions_for_topic(data['topic'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'partitions': partitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_pause':
                try:
                    # bug fix: pause() takes *partitions, so the list must
                    # be unpacked (as done below for seek_to_beginning).
                    self.consumer.pause(*data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_paused':
                try:
                    partitions = self.consumer.paused()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'partitions': partitions})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_poll':
                try:
                    records = self.consumer.poll(
                        timeout_ms=data['timeout_ms'],
                        max_records=data['max_records'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'records': records})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_position':
                try:
                    offset = self.consumer.position(data['topicpartition'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'offset': offset})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_resume':
                try:
                    # bug fix: resume() also takes *partitions.
                    self.consumer.resume(*data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_seek':
                try:
                    self.consumer.seek(data['partition'], data['offset'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_seek_to_beginning':
                try:
                    self.consumer.seek_to_beginning(*data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_seek_to_end':
                try:
                    self.consumer.seek_to_end(*data['partitions'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_subscribe':
                try:
                    self.consumer.subscribe(topics=data['topics'],
                                            pattern=data['pattern'],
                                            listener=data['listener'])
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_subscription':
                try:
                    topics = self.consumer.subscription()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'topics': topics})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_topics':
                try:
                    topics = self.consumer.topics()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'topics': topics})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            elif cmd == 'consumer_unsubscribe':
                try:
                    self.consumer.unsubscribe()
                    self.sendNotify(request=request,
                                    data={"cmd": cmd, 'result': 'success'})
                except KafkaError as e:
                    self.sendError(request, data={"cmd": cmd, "err-msg": str(e)})

            # unknown command
            else:
                raise Exception('cmd not supported: %s' % request['data']['cmd'])

        except Exception as e:
            self.error('unable to run command: %s' % str(e))
            self.sendError(request, data="unable to run command")

        self.onToolLogWarningCalled(
            "<< Terminated Command=%s TestId=%s AdapterId=%s" % (
                request['data']['cmd'], request['script_id'],
                request['source-adapter']))
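# ====== request payload shape (reconstructed sketch) ======
# The notify requests dispatched by execAction above carry the command in
# request['data']. The shapes below are reconstructed from the handlers,
# not taken from the framework's documentation; all values are
# placeholders.
#
# producer_connect_request = {
#     'uuid': 'test-uuid', 'source-adapter': '1', 'script_id': '7_3_0',
#     'data': {'cmd': 'producer_connect',
#              'bootstrap_servers': ['127.0.0.1:9092'],
#              'kargs': {'acks': 1}},  # extra KafkaProducer kwargs
# }
# producer_send_request = {
#     'uuid': 'test-uuid', 'source-adapter': '1', 'script_id': '7_3_0',
#     'data': {'cmd': 'producer_send', 'topic': 'test',
#              'timeout': 10,  # seconds passed to future.get()
#              'kargs': {'key': b'k', 'value': b'v'}},
# }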
import sys

from kafka import KafkaConsumer
from kafka.structs import TopicPartition


class KafkaC:
    """
    Consumer module: consume messages from a topic under different group ids.
    """

    def __init__(self, bootstrap_servers, topic, group, action=None,
                 offset=None, enable_auto_commit=True):
        self.bootstrap_servers = bootstrap_servers
        self.topic = topic
        self.group_id = group
        self.action = action
        self.offset = offset
        self.enable_auto_commit = enable_auto_commit
        self.auto_commit_interval_ms = 1000
        self.consumer = KafkaConsumer(
            # No topic here: assign() below requires a consumer that was
            # not subscribed at construction time.
            # auto_offset_reset defaults to 'latest', which makes a brand-new
            # consumer group skip everything already in the topic; 'earliest'
            # starts from the oldest unconsumed messages instead.
            auto_offset_reset='earliest',
            group_id=self.group_id,
            bootstrap_servers=self.bootstrap_servers,
            enable_auto_commit=self.enable_auto_commit,
            auto_commit_interval_ms=self.auto_commit_interval_ms)

    # List all topics.
    def get_all_topics(self):
        return self.consumer.topics()

    # Earliest offsets, per partition.
    def get_beginning_offsets(self):
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        return _ps, self.consumer.beginning_offsets(_ps)

    # Latest offsets: one past the newest message, since Kafka offsets
    # start at 0.
    def get_end_offsets(self):
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        return _ps, self.consumer.end_offsets(_ps)

    # Offset this consumer would start consuming from.
    def get_last_position(self, partition=None):
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        self.consumer.assign(_ps)
        return self.consumer.position(partition=_ps[0])

    # Look up offsets by timestamp (epoch milliseconds). Messages must
    # carry a timestamp field, i.e. Kafka >= 0.10.0.
    def get_offset_by_timestamp(self, timestamp):
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        return _ps, self.consumer.offsets_for_times({_ps[0]: timestamp})

    def consume_data(self, offset=None):
        """
        :param offset: int >= 0; start consuming from this offset
            (inclusive). Defaults to the consumer's current position.
        :return: generator of messages
        """
        # Fetch all partitions of the topic and assign them to this
        # consumer. To use assign(), the KafkaConsumer must not have been
        # given a topic at construction time.
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        if offset is None:
            offset = self.get_last_position()
        self.consumer.assign(_ps)
        for p in self.consumer.partitions_for_topic(self.topic):
            # The offset can also be set for a single partition only.
            self.consumer.seek(TopicPartition(self.topic, p), offset)
        for message in self.consumer:
            yield message

    def consume_data_stop(self):
        """
        Consume up to the latest offset, then stop automatically.

        self.action:
            None    consume from the group's normal `CURRENT-OFFSET`
            begin   start from the beginning of the topic
            end     start from the newest data, skipping anything
                    unconsumed -- use with care
            custom  start from self.offset (inclusive); if it is larger
                    than the topic's end offset, start from the newest data
        :return: generator of messages
        """
        # Fetch all partitions of the topic and assign them to this
        # consumer (again, assign() requires no topic at construction).
        _ps = [TopicPartition(self.topic, p)
               for p in self.consumer.partitions_for_topic(self.topic)]
        self.consumer.assign(_ps)
        if self.action is None:
            pass
        elif self.action == 'begin':
            self.consumer.seek_to_beginning()
        elif self.action == 'end':
            self.consumer.seek_to_end()
        elif self.action == 'custom':
            for p in self.consumer.partitions_for_topic(self.topic):
                # The offset can also be set for a single partition only.
                self.consumer.seek(TopicPartition(self.topic, p), self.offset)
        else:
            print('action value is not supported! Please input "begin|end|custom"')
            sys.exit(1)
        # Bug fix: the original loop never terminated. Snapshot the end
        # offsets and stop once every partition has been drained to them.
        end_offsets = self.consumer.end_offsets(_ps)
        while not all(self.consumer.position(p) >= end_offsets[p]
                      for p in _ps):
            yield next(self.consumer)

    def commit_consumer(self):
        # self.consumer.commit()
        self.consumer.commit_async()  # asynchronous commit

    def close_consumer(self):
        self.consumer.close(autocommit=self.enable_auto_commit)
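# ====== KafkaC usage sketch ======
# A short, hedged example of the consumer wrapper above; broker address,
# topic, and group name are placeholders, and a reachable broker is
# assumed.
#
# kc = KafkaC(bootstrap_servers=['127.0.0.1:9092'], topic='test',
#             group='my-group')
# _ps, begin = kc.get_beginning_offsets()
# _ps, end = kc.get_end_offsets()
# print(begin, end)
# for message in kc.consume_data():  # blocks; interrupt to stop
#     print(message.offset, message.value)
#     kc.commit_consumer()
# kc.close_consumer()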
# KAFKA_SERVER_IP is a module-level default defined elsewhere.
from kafka import KafkaConsumer
from kafka.structs import TopicPartition


class kafka_consumer():

    def __init__(self, kafka_server=KAFKA_SERVER_IP):
        self.kafka_servers = kafka_server  # consumer endpoint of the Kafka servers
        self.consumer = None
        self.topic = None

    # Set up the consumer. When a group is used, only one consumer
    # instance per group reads any given message.
    def set_consumer(self, topic='device', group_id=None,
                     auto_offset_reset='latest'):
        self.topic = topic
        if group_id:
            self.consumer = KafkaConsumer(topic,
                                          group_id=group_id,
                                          auto_offset_reset=auto_offset_reset,
                                          bootstrap_servers=self.kafka_servers)
        else:
            self.consumer = KafkaConsumer(topic,
                                          auto_offset_reset=auto_offset_reset,
                                          bootstrap_servers=self.kafka_servers)

    # callback is invoked for every message; this call blocks.
    def read_data(self, callback):
        if self.consumer:
            for message in self.consumer:
                callback(message)
                # print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

    # Information about the current consumer.
    def get_consumer_info(self):
        consumer_info = {}
        consumer_info['partitions_for_topic'] = \
            self.consumer.partitions_for_topic(self.topic)  # partition info for the topic
        consumer_info['topic'] = self.consumer.topics()  # list of topics
        consumer_info['subscription'] = \
            self.consumer.subscription()  # topics this consumer subscribes to
        consumer_info['assignment'] = \
            self.consumer.assignment()  # this consumer's topic/partition assignment
        consumer_info['beginning_offsets'] = self.consumer.beginning_offsets(
            self.consumer.assignment())  # earliest offsets this consumer can consume
        return consumer_info

    # Reset the offset: consume from the offset-th position onwards.
    def set_offset(self, partition=0, offset=0):
        self.consumer.seek(
            TopicPartition(topic=self.topic, partition=partition), offset)
        # Return the current position of this partition.
        return self.consumer.position(
            TopicPartition(topic=self.topic, partition=partition))

    # Pull messages manually.
    def pull_data(self, callback):
        msg = self.consumer.poll(timeout_ms=5)  # fetch messages from kafka
        callback(msg)

    # ====== read current data ======
    # When a group is used, only one consumer instance per group reads any
    # given message. callback is invoked per message; this call blocks.
    def read_data_now(self, callback, topic='device', group_id=None,
                      auto_offset_reset='latest'):
        if group_id:
            consumer = KafkaConsumer(topic,
                                     group_id=group_id,
                                     auto_offset_reset=auto_offset_reset,
                                     bootstrap_servers=self.kafka_servers)
        else:
            consumer = KafkaConsumer(topic,
                                     auto_offset_reset=auto_offset_reset,
                                     bootstrap_servers=self.kafka_servers)
        for message in consumer:
            callback(message)
            # print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))


# ====== pause and resume ======
# from kafka import KafkaConsumer
# from kafka.structs import TopicPartition
# import time
#
# consumer = KafkaConsumer(bootstrap_servers=['127.0.0.1:9092'])
# consumer.subscribe(topics=('test',))
# consumer.topics()
# consumer.pause(TopicPartition(topic=u'test', partition=0))  # after pause(), the consumer reads nothing until resume() is called
# num = 0
# while True:
#     print(num)
#     print(consumer.paused())  # partitions currently paused
#     msg = consumer.poll(timeout_ms=5)
#     print(msg)
#     time.sleep(2)
#     num = num + 1
#     if num == 10:
#         print("resume...")
#         consumer.resume(TopicPartition(topic='test', partition=0))
#         print("resume......")

# ====== consumer groups ======
# from kafka import KafkaConsumer
# # When a group is used, only one consumer instance per group reads any given message.
# consumer = KafkaConsumer('test', group_id='my-group', bootstrap_servers=['127.0.0.1:9092'])
# for message in consumer:
#     print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

# ====== read the earliest or latest messages in the queue ======
# from kafka import KafkaConsumer
# consumer = KafkaConsumer('test', auto_offset_reset='earliest', bootstrap_servers=['127.0.0.1:9092'])
# for message in consumer:
#     print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

# ====== read messages from a given position ======
# from kafka import KafkaConsumer
# from kafka.structs import TopicPartition
#
# consumer = KafkaConsumer('test', bootstrap_servers=['127.0.0.1:9092'])
#
# print(consumer.partitions_for_topic("test"))  # partition info for the test topic
# print(consumer.topics())  # list of topics
# print(consumer.subscription())  # topics this consumer subscribes to
# print(consumer.assignment())  # this consumer's topic/partition assignment
# print(consumer.beginning_offsets(consumer.assignment()))  # earliest consumable offsets
# consumer.seek(TopicPartition(topic='test', partition=0), 5)  # reset the offset: consume from offset 5
# for message in consumer:
#     print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

# ====== subscribe to multiple topics ======
# from kafka import KafkaConsumer
# from kafka.structs import TopicPartition
#
# consumer = KafkaConsumer(bootstrap_servers=['127.0.0.1:9092'])
# consumer.subscribe(topics=('test', 'test0'))  # topics to consume
# print(consumer.topics())
# print(consumer.position(TopicPartition(topic='test', partition=0)))  # current position for this topic
# for message in consumer:
#     print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))

# ====== consumer (manual polling) ======
# from kafka import KafkaConsumer
# import time
#
# consumer = KafkaConsumer(bootstrap_servers=['127.0.0.1:9092'])
# consumer.subscribe(topics=('test', 'test0'))
# while True:
#     msg = consumer.poll(timeout_ms=5)  # fetch messages from kafka
#     print(msg)
#     time.sleep(2)
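# ====== seek by timestamp (additional sketch) ======
# offsets_for_times() takes epoch *milliseconds* and requires broker and
# message format >= 0.10; topic name and broker address are placeholders.
# from kafka import KafkaConsumer
# from kafka.structs import TopicPartition
# import time
#
# consumer = KafkaConsumer(bootstrap_servers=['127.0.0.1:9092'])
# tp = TopicPartition(topic='test', partition=0)
# consumer.assign([tp])
# ts_ms = int((time.time() - 300) * 1000)  # five minutes ago
# found = consumer.offsets_for_times({tp: ts_ms})
# if found[tp] is not None:  # None when no message is at/after ts_ms
#     consumer.seek(tp, found[tp].offset)
#     for message in consumer:
#         print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, message.offset, message.key, message.value))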