Example 1
def commit_offsets_in_kafka(broker, group_name, group_dict):
    cons = KafkaConsumer(bootstrap_servers=broker, group_id=group_name)
    for topic_name, topic_dict in group_dict.items():
        for partition, offset in topic_dict.items():
            logging.info(
                "Committing {} {} to topic {} and partition number {}".format(
                    group_name, offset, topic_name, partition))
            tp = TopicPartition(topic_name, int(partition))
            cons.assign([tp])
            cons.seek(tp, int(offset))
            # commit it
            cons.commit()
            time.sleep(8)
    cons.close()
    time.sleep(1)
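A possible call, as a sketch only (broker address, group name, and offsets below are placeholders): the nested dict mirrors the loops above, mapping each topic name to a partition -> offset dict.

offsets = {"my-topic": {"0": 42, "1": 17}}
commit_offsets_in_kafka("localhost:9092", "my-group", offsets)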
Example 2
 def __init__(self, broker: str, topic: str, partition: int = -1, start: Union[int, datetime, PartitionOffset] = PartitionOffset.END, stop: Union[int, datetime, PartitionOffset] = PartitionOffset.NEVER):
     consumer = KafkaConsumer(bootstrap_servers=broker, fetch_max_bytes=52428800 * 6, consumer_timeout_ms=100)
     existing_topics = consumer.topics()
     self.current_msg = None
     self.current_offset_limits = HighLowOffset(-1, -1)
     if topic not in existing_topics:
         raise RuntimeError(f"Topic \"{topic}\" does not exist.")
     existing_partitions = consumer.partitions_for_topic(topic)
     if partition == -1:
         partition = existing_partitions.pop()
     elif partition not in existing_partitions:
         raise RuntimeError(f"Partition {partition} for topic \"{topic}\" does not exist.")
     topic_partition = TopicPartition(topic, partition)
     consumer.assign([topic_partition, ])
     if start == PartitionOffset.BEGINNING:
         consumer.seek_to_beginning()
     elif start == PartitionOffset.END or start == PartitionOffset.NEVER:
         consumer.seek_to_end()
     elif type(start) is int:
         first_offset = consumer.beginning_offsets([topic_partition, ])
         if first_offset[topic_partition] > start:
             consumer.seek_to_beginning()
         else:
             consumer.seek(partition=topic_partition, offset=start)
     elif type(start) is datetime:
         found_offsets = consumer.offsets_for_times({topic_partition: int(start.timestamp() * 1000)})
         consumer.seek(partition=topic_partition, offset=found_offsets[topic_partition].offset)
     self.to_thread = Queue()
     self.from_thread = Queue(maxsize=100)
     self.thread = Thread(target=thread_function, daemon=True, kwargs={"consumer": consumer, "stop": stop, "in_queue": self.to_thread, "out_queue": self.from_thread, "topic_partition": topic_partition})
     self.thread.start()
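The thread_function worker referenced above is not included in this snippet; a minimal sketch consistent with the keyword arguments it receives (an assumption, not the original implementation) could be:

def thread_function(consumer, stop, in_queue, out_queue, topic_partition):
    # Forward consumed messages to the owner until a "stop" request arrives.
    # Handling of the stop offset/datetime and of topic_partition is omitted here.
    while in_queue.empty():
        for msg in consumer:      # the iterator returns after consumer_timeout_ms of silence
            out_queue.put(msg)
    consumer.close()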
Example 3
def worker(kafka_settings, mongo_settings, partition):
    #Join a consumer group for dynamic partition assignment and offset commits
    consumer = KafkaConsumer(
        auto_offset_reset='earliest'
    )  #kafka_settings["topic"], auto_offset_reset='earliest')#, group_id="consumer_group1")#kafka_settings["topic"], auto_offset_reset='earliest')#group_id='tweet_reader'
    consumer.assign([TopicPartition(kafka_settings["topic"], partition)])
    print("Connected to Kafka")

    #Create connection to Mongo
    client = MongoClient(
        mongo_settings["ip"], mongo_settings["port"]
    )  #client = MongoClient('mongodb://localhost:27017/') also works
    #Connect to a database
    db = client[mongo_settings[
        "db"]]  #Can use db = client["test-db"] to select dbs that don't use attribute style access
    #Connect to the collection you want
    collection = db[mongo_settings[
        "collection"]]  #Dictionary style access works here too: collection = db['test-collection']
    first = True
    #print("Connected to Mongo")
    #Iterate through messages
    for msg in consumer:
        #print("%s:%d:%d: key=%s value=%s") % (msg.topic, msg.partition, msg.offset, msg.key, str(bytes_to_dict(msg.value)))
        #print("Value:", bytes_to_dict(msg.value))
        #Convert the kafka message back into a dict
        tweet_in = bytes_to_dict(msg.value)

        #Check if there is a place field
        if ("place" in tweet_in) or ("location" in tweet_in):
            #Dict that will be pushed into Mongo
            tweet_out = {}
            #Start processing the tweet
            #Keep pertinent information that doesn't need processing
            tweet_out["track_id"] = tweet_in["track_id"]
            tweet_out["tweet_id"] = tweet_in["tweet_id"]
            tweet_out["created_at"] = tweet_in["created_at"]
            #tweet_out["created_at_int"] = int(tweet_to_utc(tweet_in["created_at"])) #tweet_in["created_at"]
            #Take in the coordinate box and average it to a single set of coordinates

            tweet_out["location"] = location_strip(tweet_in)
            #Analyze sentiment
            #tweet_out["sentiment"] = sentiment_analyze(tweet_in)

            #Insert/Update the tweet into Mongo
            post_id = collection.update({"_id": tweet_out["tweet_id"]},
                                        tweet_out,
                                        upsert=True)
            if first:
                print(tweet_in)
                print("----------Processed----------------")
                print("Wrote:", tweet_out)
                print("Type:", type(tweet_out))
                first = False
        '''
		for j in tweet:
			f_o.write("--" + str(j) + ":" + str(tweet[j]) + "\n")
		f_o.write("\n--------------------------------------------------------\n")
		'''
    client.close()
Example 4
 def init_consumer():
     c = KafkaConsumer(bootstrap_servers=brokers,
                       group_id=group,
                       value_deserializer=msgpack.unpackb,
                       enable_auto_commit=False)
     t_p = TopicPartition(topic, partition)
     c.assign([t_p])
     return c, t_p
def func(topic, partition):
  i=0
  consumer = KafkaConsumer(bootstrap_servers='104.154.53.184:6667', group_id='grp-5327', auto_offset_reset='earliest', 
  consumer_timeout_ms = 10000)
  consumer.assign([TopicPartition(topic, partition)])
  for msg in consumer:
    i=i+1
  print(i)
 def start(self):
     
     kafka_brokers = '{0}:{1}'.format(self._server,self._port)
     consumer =  KC(bootstrap_servers=[kafka_brokers],group_id=self._topic)
     partition = [TopicPartition(self._topic,int(self._id))]
     consumer.assign(partitions=partition)
     consumer.poll()
     return consumer
Example 7
    def start(self):

        kafka_brokers = '{0}:{1}'.format(self._server, self._port)
        consumer = KC(bootstrap_servers=[kafka_brokers], group_id=self._topic)
        partition = [TopicPartition(self._topic, int(self._id))]
        consumer.assign(partitions=partition)
        consumer.poll()
        return consumer
def create_kafka_consumer(topics, server, consumer_timeout=12000):
    consumer = KafkaConsumer(bootstrap_servers=server,
                             consumer_timeout_ms=consumer_timeout)
    tp = []
    for topic in topics:
        tp.append(TopicPartition(topic, 0))
    consumer.assign(tp)
    return consumer
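A possible call (topic names and broker address are placeholders); note that only partition 0 of each topic is assigned:

consumer = create_kafka_consumer(["topic-a", "topic-b"], "localhost:9092")
for msg in consumer:          # stops after consumer_timeout_ms without new messages
    print(msg.topic, msg.offset, msg.value)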
class KafkaHandler:
    def __init__(self, kafka_endpoint: str, socketio: SocketIO):
        self.kafka_endpoint = kafka_endpoint  # stored for use in export_csv_for_topic
        self.consumer = KafkaConsumer(bootstrap_servers=kafka_endpoint)
        self.socketio = socketio
        self.dumps = {}
        end_offset = {}

        for topic in topics:
            self.dumps[topic] = collections.deque(maxlen=100)
            current_partition = TopicPartition(topic, 0)
            self.consumer.assign([current_partition])
            self.consumer.seek_to_end()
            end_offset[topic] = self.consumer.position(current_partition)

        topic_partitions = [TopicPartition(topic, 0) for topic in topics]
        self.consumer.assign(topic_partitions)
        for topic in topics:
            self.consumer.seek(TopicPartition(topic, 0), max(0, end_offset[topic] - 100))

        self.thread = threading.Thread(target=self.run)
        self.thread.daemon = True  # Daemonize thread
        self.thread.start()  # Start the execution

    def run(self):
        for msg in self.consumer:
            try:
                msg_json = json.loads(msg.value.decode('utf-8'))
                if 'http_code' in msg_json and msg_json['http_code'] != 200:
                    continue

                output = {
                    "topic": msg.topic,
                    "timestamp": msg.timestamp,
                    "value": msg_json
                }
                output_json = json.dumps(output)
                self.dumps[str(msg.topic)].append(output)

                self.socketio.emit(str(msg.topic), output_json, namespace='/')
            except Exception as e:
                print('error emit msg', e)

        self.consumer.close()

    def on_connect(self):
        if self.dumps:
            for msg_topic in self.dumps:
                messages = list(self.dumps[msg_topic])
                emit(msg_topic, messages, namespace='/')

    def status(self):
        status_dict = {}
        for topic in self.dumps:
            status_dict[topic] = {
                'messages': len(self.dumps[topic]),
                'last_message': self.dumps[topic][-1] if self.dumps[topic] else ''
            }
        return json.dumps(status_dict)
    def export_csv_for_topic(self, topic):
        auth_header = request.headers.get('Authorization')
        merchant_token = auth_header.split(' ')[-1] if auth_header else None
        merchant_id = calculate_id(merchant_token) if merchant_token else None

        if topic not in topics:
            return json.dumps({'error': 'unknown topic'})

        try:
            consumer = KafkaConsumer(consumer_timeout_ms=1000, bootstrap_servers=self.kafka_endpoint)
            topic_partition = TopicPartition(topic, 0)
            consumer.assign([topic_partition])

            consumer.seek_to_beginning()
            start_offset = consumer.position(topic_partition)

            consumer.seek_to_end()
            end_offset = consumer.position(topic_partition)

            msgs = []
            '''
            Assumption: message offsets are continuous.
            Start and end can be anywhere; end - start needs to match the number of messages.
            TODO: when deletion of some individual messages is possible and used, refactor!
            '''
            max_messages = 10 ** 5
            offset = max(start_offset, end_offset - max_messages)
            consumer.seek(topic_partition, offset)
            for msg in consumer:
                '''
                Don't handle steadily incoming new messages
                only iterate to last messages when requested
                '''
                if offset >= end_offset:
                    break
                offset += 1
                try:
                    msg_json = json.loads(msg.value.decode('utf-8'))
                    # filtering on messages that can be filtered on merchant_id
                    if 'merchant_id' not in msg_json or msg_json['merchant_id'] == merchant_id:
                        msgs.append(msg_json)
                except ValueError as e:
                    print('ValueError', e, 'in message:\n', msg.value)
            consumer.close()

            if topic == 'marketSituation':
                df = market_situation_shaper(msgs)
            else:
                df = pd.DataFrame(msgs)

            filename = topic + '_' + str(int(time.time()))
            filepath = 'data/' + filename + '.csv'
            df.to_csv(filepath, index=False)
            response = {'url': filepath}
        except Exception as e:
            response = {'error': 'failed with: ' + str(e)}

        return json.dumps(response)
Example 11
def main():
    """
    Main predictor function
    """

    args = init_parser()
    config = init_config(args)
    logger = get_logger(f'hawkes-{config["partition"]}',
                        broker_list=config["bootstrap_servers"],
                        debug=True)
    consumer = KafkaConsumer(bootstrap_servers=config["bootstrap_servers"])
    consumer.assign(
        [TopicPartition(config["consumer_topic"], config["partition"])])
    producer = KafkaProducer(
        bootstrap_servers=config["bootstrap_servers"],
        value_serializer=lambda v: json.dumps(v).encode("utf-8"),
        key_serializer=lambda v: json.dumps(v).encode("utf-8"))

    alpha = config["alpha"]
    mu = config["mu"]

    for message in consumer:
        mess = message.value.decode().replace("'", '"').replace('(', '[').replace(')', ']')

        mess = eval(mess)

        cascade = np.array(mess["tweets"])
        tweet_id = mess["cid"]
        text = mess["msg"]
        T_obs = mess["T_obs"]
        p, beta = 0.02, 1 / 3600
        t = cascade[-1, 0]
        LL = loglikelihood((p, beta), cascade, t)
        LL_MLE, MLE = compute_MLE(cascade, t, alpha, mu)
        p_est, beta_est = MLE
        N, G1, n_star = prediction([p_est, beta_est], cascade, alpha, mu, t)

        messfinal = {
            "type": "parameters",
            "cid": tweet_id,
            "msg": text,
            "n_obs": len(cascade),
            "n_supp": N,
            "params": list(MLE),
            "G1": G1,
            "n_star": n_star
        }

        producer.send(config["producer_topic"],
                      key=T_obs,
                      value=messfinal,
                      partition=config["partition"])

        logger.info(
            "Predicted params p = {: .3f} and beta = {: .3f} for tweet {} at time {} on partition: {}"
            .format(p_est, beta_est, tweet_id, T_obs, config["partition"]))
Example 12
def get_kafka_consumer(group_id, host, port, topic):
    """Return consumer for Kafka topic"""
    consumer = KafkaConsumer(group_id=group_id,
                             auto_offset_reset="earliest",
                             bootstrap_servers=[f"{host}:{port}"],
                             value_deserializer=lambda x: x.decode('utf-8'))
    partition = TopicPartition(topic, 0)
    consumer.assign([partition])
    return consumer, partition
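A usage sketch with placeholder connection details; the returned TopicPartition can be used for explicit seeks:

consumer, partition = get_kafka_consumer("my-group", "localhost", 9092, "my-topic")
consumer.seek_to_beginning(partition)     # optional: re-read the partition from the start
for msg in consumer:
    print(msg.offset, msg.value)          # value is already decoded to str by the deserializer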
Example 13
class PubSubInterface:
    EVTYPE = "PubSubEvent"

    def __init__(self, topic_list):
        self.logger = logging.getLogger('Mercury.PubSubInterface')
        self.producer = None
        self.consumer_thread = None
        self.topiclist = []
        self.topiclock = threading.Lock()
        for topic in topic_list:
            self._add_topic(topic)
        self.msglist = []
        self.msglock = threading.Lock()
        self.evhandler = eventhandler.EventHandler()

    def configure(self, config):
        self.psconfig = config['PubSub']

    def _add_topic(self, new_topic):
        self.topiclock.acquire()
        self.topiclist.append(TopicPartition(new_topic, 0))
        self.topiclock.release()

    def connect(self):
        self.logger.info("Connecting to Kafka pubsub")
        self.consumer_thread = threading.Thread(target=self.run_consumer)
        self.consumer_thread.daemon = True
        self.consumer_thread.start()
        self.producer = KafkaProducer(
            bootstrap_servers=[self.psconfig['bootstrap_server']])

    def _add_msg(self, msg):
        self.msglock.acquire()
        self.msglist.append(msg)
        self.msglock.release()

    def run_consumer(self):
        self.consumer = KafkaConsumer(
            bootstrap_servers=[self.psconfig['bootstrap_server']])
        self.consumer.assign(self.topiclist)
        for msg in self.consumer:
            self.logger.debug("Received message from pubsub!")
            self._add_msg(msg)
            ev = eventhandler.MercuryEvent(PubSubInterface.EVTYPE)
            self.evhandler.fire(ev)

    def get_msg(self):
        msg = None
        self.msglock.acquire()
        if len(self.msglist):
            msg = self.msglist.pop(0)
        self.msglock.release()
        return msg

    def send_msg(self, topic, msg):
        self.logger.debug("psubi send_msg")
        self.producer.send(topic, msg.encode())
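A hedged usage sketch; the config keys follow configure() and connect() above, while the topic names and broker address are placeholders:

psi = PubSubInterface(["topic-a", "topic-b"])
psi.configure({"PubSub": {"bootstrap_server": "localhost:9092"}})
psi.connect()                      # starts the consumer thread and the producer
psi.send_msg("topic-a", "hello")   # strings are encoded before being produced
msg = psi.get_msg()                # None until the consumer thread has queued something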
Example 14
def consumer_message3():
    consumer = KafkaConsumer(bootstrap_servers=servers,
                             consumer_timeout_ms=1000,
                             group_id="kafka-group-id",
                             enable_auto_commit=False)
    consumer.assign([TopicPartition('kafka-topic', 0)])
    for msg in consumer:
        print(msg)
        consumer.commit()
Example 15
class KConsumer:

    def __init__(self, settings):
        self.kafka_host = settings.BROKER['HOST']
        self.kafka_port = settings.BROKER['PORT']

        self.kafka_topics = settings.BROKER['TOPICS']
        self.kafka_groups = settings.BROKER['GROUPS']
        # self.kafka_topics = settings.kafka_topics
        # self.kafka_groups = settings.kafka_groups
        self.kprocessor = KProcessor(settings)

        self.consumer = KafkaConsumer(bootstrap_servers=[self.kafka_host + ':' + self.kafka_port])

    def startThread(self):
        executor = ThreadPoolExecutor(max_workers=1)
        future = executor.submit(self.start)
        print(future.result())

    def start_old(self):
        # TODO here only the first topic is selected
        partition = TopicPartition(self.kafka_topics[0], 0)

        self.consumer.assign([partition])
        self.consumer.seek_to_beginning(partition)

        for msg in self.consumer:
            self.kprocessor.process(msg[6])

    def start(self):
        partitions = [TopicPartition(x, 0) for x in self.kafka_topics]
        self.consumer.assign(partitions)
        # TODO do that instead of the below loop
        # self.consumer.seek_to_beginning(partitions)

        for partition in partitions:
            self.consumer.seek_to_beginning(partition)

        while True:
            msg = next(self.consumer)
            try:
                print('good format :', msg.value)
                json_string = msg.value.decode("utf-8")
                self.kprocessor.process(json_string)
            except:
                print('event string not in the good format :', msg.value)
            # use KProducer here

    def close(self):
        self.consumer.close()

    def add_topics(self, topics):
        self.kafka_topics.append(topics)

    def remove_topics(self, topics):
        self.kafka_topics.remove(topics)
Example 16
class AKConsumer(Thread):
    def __init__(self):
        self.stopConsuming = False
        self.brokerconf = None
        self.userconf = None
        self.callback = None
        self.topic_partitions = None

    def get_name(self):
        return self.userconf['topics'][0]

    def get_partition(self):
        return self.userconf['partition'][0]

    def configure(self, brokerconf, userconf):
        self.brokerconf = brokerconf
        self.userconf = userconf
        self.consumer = KafkaConsumer(**self.brokerconf)
        Thread.__init__(self)
        self.topic_partitions = [TopicPartition(self.userconf['topics'][0], self.userconf['partition'][0])]
        self.consumer.assign(self.topic_partitions)
        partitions = self.consumer.partitions_for_topic(self.userconf['topics'][0])
        if partitions and self.userconf['partition'][0] in partitions:
            self._user_wants_old_messages(self.topic_partitions[0])

    def _user_wants_old_messages(self, tp):
        current_offset = self.consumer.position(tp)
        user_shift_offset = self.userconf.get('resendnumber', 0)
        if user_shift_offset > 0:
            user_shift_offset -= 1
        final_offset = current_offset - user_shift_offset
        if final_offset > 0:
            self.consumer.seek(tp, final_offset)
            print(' User selected to go from ', current_offset, ' to offset ', final_offset)

    def subscribe(self,callback):
        self.callback = callback

    def run(self):
        self.stopConsuming = False
        print('Consuming thread from ', self.userconf['topics'], ' in partition ', self.userconf['partition'], ' in.')
        while not self.stopConsuming:
            partitions = self.consumer.poll(300,1)
            for p in partitions:
                for response in partitions[p]:
                    self.consumer.commit()
                    self._receive(response)
        print('Consuming thread from ', self.userconf['topics'], ' in partition ', self.userconf['partition'], ' out.')

    def _receive(self, msg):
        self.callback(msg)

    def stop(self):
        self.stopConsuming = True
        self.join()
        self.consumer.close()
        print('Consumer to ', self.userconf['topics'], ' in partition ', self.userconf['partition'], ' stopped.')
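One way to drive this class, assuming brokerconf holds keyword arguments accepted by KafkaConsumer (all values below are placeholders):

c = AKConsumer()
c.configure(brokerconf={"bootstrap_servers": "localhost:9092", "group_id": "my-group"},
            userconf={"topics": ["my-topic"], "partition": [0], "resendnumber": 5})
c.subscribe(lambda msg: print(msg.value))   # callback invoked for every polled record
c.start()                                   # runs the poll/commit loop in the thread
# ... later ...
c.stop()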
Example 17
def get_current_offsets():
    cons = KafkaConsumer(bootstrap_servers=bootstrap_servers)
    tps = [
        TopicPartition(topic_src, x)
        for x in sorted(cons.partitions_for_topic(topic_src))
    ]
    cons.assign(tps)
    ret = [cons.position(tp) for tp in tps]
    cons.close(autocommit=False)
    return ret
Example 18
 def _init_consumer(self):
     consumer = KafkaConsumer(client_id=self.task_name(),
                              bootstrap_servers=self._brokers,
                              request_timeout_ms=1000,
                              enable_auto_commit=False,
                              auto_offset_reset="latest")
     consumer.assign(self._topic_partitions)
     for tps in self._topic_partitions:
         consumer.seek_to_beginning(tps)
     return consumer
Example 19
class Consumer(BaseStreamConsumer):
    """
    Used in DB and SW worker. SW consumes per partition.
    """
    def __init__(self, location, enable_ssl, cert_path, topic, group,
                 partition_id):
        self._location = location
        self._group = group
        self._topic = topic
        kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {}
        self._consumer = KafkaConsumer(
            bootstrap_servers=self._location,
            group_id=self._group,
            max_partition_fetch_bytes=10485760,
            consumer_timeout_ms=100,
            client_id="%s-%s" %
            (self._topic,
             str(partition_id) if partition_id is not None else "all"),
            request_timeout_ms=120 * 1000,
            heartbeat_interval_ms=10000,
            **kwargs)

        # explicitly causing consumer to bootstrap the cluster metadata
        self._consumer.topics()

        if partition_id is not None:
            self._partitions = [TopicPartition(self._topic, partition_id)]
            self._consumer.assign(self._partitions)
        else:
            self._partitions = [
                TopicPartition(self._topic, pid)
                for pid in self._consumer.partitions_for_topic(self._topic)
            ]
            self._consumer.subscribe(topics=[self._topic])

    def get_messages(self, timeout=0.1, count=1):
        result = []
        while count > 0:
            try:
                m = next(self._consumer)
                result.append(m.value)
                count -= 1
            except StopIteration:
                break
        return result

    def get_offset(self, partition_id):
        for tp in self._partitions:
            if tp.partition == partition_id:
                return self._consumer.position(tp)
        raise KeyError("Can't find partition %d" % partition_id)

    def close(self):
        self._consumer.commit()
        self._consumer.close()
Example 20
    def run(self):
        """ 抽取指定kafka集群中的Topic Logsize数据
            将抽出的数据输出给transfer进行处理
        """

        brokers = base.config["collector"]["clusters"][self.cluster]["brokers"]
        consumer = KafkaConsumer(bootstrap_servers=brokers,
                                 enable_auto_commit=False,
                                 group_id="kafka_extract")
        consumer.assign([
            TopicPartition(self.topic, int(partition))
            for partition in self.stopPosition
        ])
        finish = {}

        for partition, stopLogsize in self.stopPosition.items():
            tp = TopicPartition(self.topic, int(partition))
            finish[partition] = False

            try:
                startLogsize = self.startPosition[partition]
                consumer.seek(tp, startLogsize)
                self.progress[partition] = [startLogsize, stopLogsize]

            except KeyError:
                consumer.seek_to_beginning(tp)
                self.progress[partition] = [0, stopLogsize]

        if self.startPosition == self.stopPosition:
            return

        with DataTransfer(output=self.output,
                          cluster=self.cluster,
                          topic=self.topic,
                          diskPath=self.diskPath,
                          avroSchema=self.avroSchema,
                          targetBrokers=self.targetBrokers,
                          targetTopic=self.targetTopic,
                          compressType=self.compressType) as dt:
            for message in consumer:
                partition = str(message.partition)
                offset = message.offset + 1

                if offset <= self.stopPosition[partition]:
                    dt.transfer(message)
                    self.progress[partition][0] = offset
                    self.get_progress()

                if offset >= self.stopPosition[partition]:
                    finish[partition] = True

                    if False not in finish.values():
                        sys.stdout.write("\n" * len(self.stopPosition))
                        sys.stdout.flush()
                        break
 def __init__(self, branch_id):
     consumer = KafkaConsumer(
         bootstrap_servers=['localhost:9092'],
         value_deserializer=lambda m: loads(m.decode('ascii'), ))
     topic_customer = TopicPartition('bank-customer-test', branch_id)
     #topic_transaction = TopicPartition('bank-customer-new', branch_id)
     partitions = list()
     partitions.append(topic_customer)
     consumer.assign(partitions)
     self.consumer = consumer
     self.branch_id = branch_id
Example 22
def get_consumer(topic, offset=-1):
    # Check for offset, otherwise return consumer with group_id
    if offset == -1:
        consumer = KafkaConsumer(topic,
                                 group_id='MovieLog1',
                                 consumer_timeout_ms=KAFKA_TIMEOUT)
    else:
        consumer = KafkaConsumer(consumer_timeout_ms=KAFKA_TIMEOUT)
        consumer.assign([TopicPartition(topic, offset)])
        consumer.seek_to_beginning(TopicPartition(topic, offset))
    return consumer
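A usage sketch with a placeholder topic; note that the offset argument is passed to TopicPartition as the partition index, so offset=0 here means partition 0 read from the beginning:

consumer = get_consumer("movielog")              # group-managed consumer, committed offsets
replayer = get_consumer("movielog", offset=0)    # manually assigned, rewound to the start
for msg in replayer:                             # ends after KAFKA_TIMEOUT of inactivity
    print(msg.value)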
Example 23
    def read_from_offset(self, offset=0, lang='json', schema=None):

        '''

        Kafka read message

        Read json and avro messages from consumer

        '''
        log.debug("[KafkaDriver][read_from_offset] lang: " + str(lang))
        log.debug("[KafkaDriver][read_from_offset] offset: " + str(offset))

        def outputJSON(obj):

            '''

            Default JSON serializer.

            '''

            if isinstance(obj, datetime.datetime):
                return int(obj.strftime("%s%f")[:-3])
            return obj


        ret = None
        log.debug("[KafkaDriver][read_from_offset] read start: " + str(self.server))
        consumer = KafkaConsumer(bootstrap_servers=self.server + ':9092',
                                 auto_offset_reset='earliest',
                                 consumer_timeout_ms=1000)

        partition = TopicPartition(self.topic, 0)
        consumer.assign([partition])
        consumer.seek_to_end(partition)
        start = int(offset)
        consumer.seek(partition, start)

        for msg in consumer:
            if (lang == 'avro'):
                #message = AvroDecoder.decode(schema, msg.value)
                schema_registry = CachedSchemaRegistryClient(url='http://' + self.schema_registry + ':8081')
                self._serializer = MessageSerializer(schema_registry)
                message = self._serializer.decode_message(msg.value)
                message = json.dumps(message, indent=4, sort_keys=True, default=outputJSON)
                #log.debug("[KafkaDriver][read_from_offset] avro message: " + str(message))
                ret = message
            else:
                message = msg.value
                #log.debug("[KafkaDriver][read_from_offset] other message: " + str(message))
                ret = msg.value
            log.debug("[KafkaDriver][read_from_offset] msg: " + str(message) + " msg.offset: " + str(msg.offset))
        consumer.close()
        log.debug("[KafkaDriver][read_from_offset] read end")
        return ret
Example 24
def cam(cam_num):
    """
    This is the heart of our video display. Notice we set the mimetype to 
    multipart/x-mixed-replace. This tells Flask to replace any old images with 
    new values streaming through the pipeline.
    """
    consumer = KafkaConsumer(bootstrap_servers=['localhost:9092'])

    consumer.assign([TopicPartition(topic=topic, partition=int(cam_num))])

    return Response(get_video_stream(consumer),
                    mimetype='multipart/x-mixed-replace; boundary=frame')
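get_video_stream is not shown above; a minimal sketch of such a generator, assuming every Kafka message value is an already-encoded JPEG frame, might look like:

def get_video_stream(consumer):
    # Emit each message value as one part of the multipart/x-mixed-replace stream;
    # the boundary string must match the one declared in the Response mimetype.
    for msg in consumer:
        yield (b'--frame\r\n'
               b'Content-Type: image/jpeg\r\n\r\n' + msg.value + b'\r\n')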
Example 25
    def consume(self):
        """setup consumer"""
        consumer = KafkaConsumer(
            bootstrap_servers=[self.kafka_host],
            enable_auto_commit=False,
            value_deserializer=lambda x: json.loads(x.decode('utf-8')))
        partition = TopicPartition(self.kafka_topic, 0)
        consumer.assign([partition])
        consumer.seek_to_end()
        last_offset = consumer.position(partition)
        print(last_offset)
        consumer.seek(partition=partition,
                      offset=last_offset - self.offset_decrement)

        for message in consumer:
            # message value and key are raw bytes -- decode if necessary!
            # e.g., for unicode: `message.value.decode('utf-8')`
            print("%s:%d:%d: key=%s value=" %
                  (message.topic, message.partition, message.offset,
                   message.key))
            val = message.value
            if "cmd" in val:
                cmd = val["cmd"]
                print(cmd)
                if cmd == "FileWriter_new":
                    self.previous_command = cmd
                    if "file_attributes" in val:
                        if "file_name" in val["file_attributes"]:
                            self.attrib = val["file_attributes"]
                            print(self.attrib["file_name"])
                elif cmd == "FileWriter_stop":
                    if self.previous_command == "FileWriter_new":
                        self.previous_command = cmd
                        time.sleep(5)
                        bot = ScicatBot()
                        bot.login()
                        room_alias = "#" + self.proposal_id + ":ess"
                        room_id = bot.get_room_id(room_alias)
                        filename = self.attrib["file_name"]
                        bot.post(room_id, filename)
                        image_name = "im.png"
                        try:
                            with h5py.File(filename,
                                           "r",
                                           libver="latest",
                                           swmr=True) as file:
                                pass
                                # print(file["/entry/title"])
                            bot.upload_image(image_name)
                            bot.post_image(room_id)
                        except OSError as err:
                            print("OS error: {0}".format(err))
                            print("Error reading hdf5 file")
def _test_produce_and_consume_kafka_message(bootstrap_server: str):
    topic = 'test-topics'
    producer = KafkaProducer(bootstrap_servers=[bootstrap_server])
    producer.send(topic, b'producer message')
    producer.flush()
    producer.close()

    consumer = KafkaConsumer(bootstrap_servers=[bootstrap_server])
    tp = TopicPartition(topic, 0)
    consumer.assign([tp])
    consumer.seek_to_beginning()
    assert next(consumer).value.decode("utf-8") == 'producer message'
Example 27
def get_consumer_kafkaConsumer():
    consumer = KafkaConsumer(
        group_id='my-group1',
        bootstrap_servers=[
            'ip-172-31-15-110.us-west-2.compute.internal:6667',
            'ip-172-31-15-237.us-west-2.compute.internal:6667',
            'ip-172-31-5-184.us-west-2.compute.internal:6667'
        ])
    consumer.assign([topic])
    position = consumer.position(topic)
    consumer.seek_to_end(topic)
    return consumer
Example 28
def Kafka_Consumer():
    try:
        consumer = KafkaConsumer(group_id="black",
                                 bootstrap_servers=config.BOOTSTRAP_SERVERS,
                                 consumer_timeout_ms=1000)
        consumer.assign([TopicPartition(topic=KAFKA_TOPIC, partition=0)])
        # consumer.subscribe(topics=['my_topic', 'topic_1'])  # subscribe to multiple topics
        for msg in consumer:
            print("%s:%d:%d: key=%s value=%s" %
                  (msg.topic, msg.partition, msg.offset, msg.key, msg.value))
    except KafkaError as e:
        print(e)
Example 29
def create_consumer_with_partition(partition_number):
    partition = TopicPartition("final", partition_number)

    consumer = KafkaConsumer(
        bootstrap_servers=KAFKA_BROKER,
        auto_offset_reset="earliest",
        enable_auto_commit=True,
        group_id="my-group"
        # value_deserializer=lambda m: json.loads(m.decode("utf-8"))
    )
    consumer.assign([partition])
    return consumer
Example 30
def create_consumer(topic):
    consumer = KafkaConsumer(bootstrap_servers="localhost:9092", value_deserializer=lambda x: json.loads(x.decode("utf-8")))
    # Manually assign partitions
    # https://github.com/dpkp/kafka-python/issues/601#issuecomment-331419097
    assignments = []
    partitions = consumer.partitions_for_topic(topic)
    for p in partitions:
        print(f"topic {topic} - partition {p}")
        assignments.append(TopicPartition(topic, p))
    consumer.assign(assignments)

    return consumer
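A usage sketch with a placeholder topic name; assignment() confirms that every partition of the topic was assigned manually:

consumer = create_consumer("my-topic")
print(consumer.assignment())                     # all partitions of my-topic
for msg in consumer:
    print(msg.partition, msg.offset, msg.value)  # value is already JSON-decoded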
Example 31
def consumer():
    # Kafka topic to read data from
    kafka_topic = "test"
    # Kafka broker nodes
    bootstrap_servers = ["localhost:9092"]
    # group name used for Kafka's dynamic partition assignment
    # group_id = "test_group"
    # function used to deserialize the data
    # value_deserializer = lambda v: json.dumps(v)
    # minimum amount of data returned per Kafka fetch
    # fetch_min_bytes = 1
    # typical usage: specify the topic up front
    # consumer = KafkaConsumer(kafka_topic, bootstrap_servers=bootstrap_servers)
    # number of records to buffer before writing
    cache_data = 10

    # assign the topic afterwards instead
    consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers)

    # TODO: read the offset from redis/mysql
    kafka_offset_key = "kafka:offset"
    kafka_offset = redis.get(kafka_offset_key)
    tp = TopicPartition(kafka_topic, 0)
    consumer.assign([tp])
    consumer.seek_to_end(tp)
    lastOffset = consumer.position(tp)

    # to consume Kafka from the latest messages, call the assignment method
    # consumer.assignment()

    # start consuming from the oldest data
    # consumer.seek_to_beginning(tp)

    consumer.seek(tp, int(kafka_offset))
    if int(kafka_offset) < int(lastOffset):
        data_list = []
        for msg in consumer:
            logger.info("message topic: %s" % msg.topic)
            logger.info("message partition: %s" % msg.partition)
            logger.info("message offset: %s" % msg.offset)
            data_list.append(parseData(msg.value))

            if len(data_list) > cache_data:
                toHbase(data_list)
                data_list = []
            if msg.offset == lastOffset - 1:
                if len(data_list) > 0:
                    toHbase(data_list)
                redis.set(kafka_offset_key, lastOffset)
                break
    else:
        logger.info("no new data")
Example 32
    def run(self):

        consumer = KafkaConsumer(
            bootstrap_servers=self.server, 
            auto_offset_reset='earliest',
            group_id=self.groupid)

        if consumer.partitions_for_topic(self.topic) is None:
            print("El tópico %s no existe!" % self.topic)
            sys.exit(2)

        if self.partition is None:
            partitions = [TopicPartition(self.topic, partition) 
                           for partition in consumer.partitions_for_topic(self.topic)]
        else:
            partitions = [TopicPartition(self.topic, int(self.partition))]

        consumer.assign(partitions)

        if self.offset is None:
            if self.inicio:
                for partition in partitions:
                    consumer.seek_to_beginning(partition)
        else:
            for partition in partitions:
                consumer.seek(partition, int(self.offset))

        while not self.stop_event.is_set():
            try:
                for message in consumer:
                    logging.info(message)

                    try:
                        valor = json.loads(message.value)
                        if self.words:
                            valor = valor['words']
                            
                    except (ValueError):
                        valor = message.value.decode('utf-8')

                    print ("Recibiendo Mensaje (%s/%d/%d) %s" % (message.topic, 
                                              message.partition,
                                              message.offset, 
                                              #message.key,
                                              valor))

                    if self.stop_event.is_set():
                        break
            except IndexError:
                pass

        consumer.close()      
Example 33
def consume_from_beginning(host='192.168.11.137:9092', topic='first_topic'):
    consumer = KafkaConsumer(group_id='1',
                             bootstrap_servers=host,
                             auto_offset_reset='earliest',
                             enable_auto_commit=True,
                             auto_commit_interval_ms=3000)

    tp = TopicPartition(topic, 0)
    consumer.assign([tp])
    consumer.poll()
    consumer.seek_to_beginning(tp)
    for msg in consumer:
        print(msg)
Example 34
def poll(topic, offset=0, partition=0, hostname=None, port_num=None, max_timeout=100):
    hostname, port_num = insure_host_port(hostname, port_num)
    server = hostname+':'+str(port_num)
    topic_partition = TopicPartition(topic, partition)

    consumer = KafkaConsumer(bootstrap_servers=server, group_id=None)
    consumer.assign([topic_partition])
    consumer.seek(topic_partition, offset)
    msgs = list(consumer.poll(max_timeout).values())
    consumer.close()
    if len(msgs) > 0:
        return msgs[0]
    else:
        return {}
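A possible call with placeholder connection details; the function returns the record list for the single assigned partition, or an empty dict when nothing arrives within max_timeout:

records = poll("my-topic", offset=5, hostname="localhost", port_num=9092)
for record in records:
    print(record.offset, record.value)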
Example 35
class Consumer(BaseStreamConsumer):
    """
    Used in DB and SW worker. SW consumes per partition.
    """
    def __init__(self, location, enable_ssl, cert_path, topic, group, partition_id):
        self._location = location
        self._group = group
        self._topic = topic
        kwargs = _prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {}
        self._consumer = KafkaConsumer(
            bootstrap_servers=self._location,
            group_id=self._group,
            max_partition_fetch_bytes=10485760,
            consumer_timeout_ms=100,
            client_id="%s-%s" % (self._topic, str(partition_id) if partition_id is not None else "all"),
            request_timeout_ms=120 * 1000,
            heartbeat_interval_ms=10000,
            **kwargs
        )

        # explicitly causing consumer to bootstrap the cluster metadata
        self._consumer.topics()

        if partition_id is not None:
            self._partitions = [TopicPartition(self._topic, partition_id)]
            self._consumer.assign(self._partitions)
        else:
            self._partitions = [TopicPartition(self._topic, pid) for pid in self._consumer.partitions_for_topic(self._topic)]
            self._consumer.subscribe(topics=[self._topic])

    def get_messages(self, timeout=0.1, count=1):
        result = []
        while count > 0:
            try:
                m = next(self._consumer)
                result.append(m.value)
                count -= 1
            except StopIteration:
                break
        return result

    def get_offset(self, partition_id):
        for tp in self._partitions:
            if tp.partition == partition_id:
                return self._consumer.position(tp)
        raise KeyError("Can't find partition %d" % partition_id)

    def close(self):
        self._consumer.commit()
        self._consumer.close()
  def run(self):
    global useavro, useextra, schema_id, sslEnable
    print("start Consumer")

    if useavro:
      topic = "avro.log.localtest"
    else:
      topic = "raw.log.localtest"

    print("on topic %s" % topic)

    if sslEnable:
      print("setting up SSL to PROTOCOL_TLSv1")
      ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
      ctx.load_cert_chain(certfile="../ca-cert", keyfile="../ca-key", password="******")
      consumer = KafkaConsumer(bootstrap_servers=["ip6-localhost:9093"],security_protocol="SASL_SSL",ssl_context=ctx,\
    sasl_mechanism="PLAIN",sasl_plain_username="******",sasl_plain_password="******", group_id="test")
    else:
      consumer = KafkaConsumer(bootstrap_servers=["ip6-localhost:9092"])

    consumer.assign([TopicPartition(topic, 0)])

    ## Skip the consumer to the head of the log - this is a personal choice
    ## It means we are losing messages when the py consumer was off
    ## Not a problem for testing purposes
    #consumer.seek(0,2)

    for message in consumer:
      print('-'*60)
      try:
        consume_message(message)           
      except:
          print('error')
          print('-'*60)
          traceback.print_exc(file=sys.stdout)
          print('-'*60)
Example 37
def kafka_consumer_test():
    topic_name = 'topic_test'
    bootstrap_servers = ['localhost:9092']

    # consumer = KafkaConsumer(topic_name, bootstrap_servers=bootstrap_servers, group_id='test_group', auto_offset_reset='earliest')
    consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers, group_id='test_group', auto_offset_reset='earliest')
        # enable_auto_commit=True (the default) is required to resume from the last committed position; the broker then stores this group_id's offset
        # auto_offset_reset='earliest': the default is 'latest', and it only takes effect when the stored offset is invalid
    
    partition_set = consumer.partitions_for_topic(topic_name)
    partitions = [ TopicPartition(topic_name, partition_idx) for partition_idx in partition_set ]
    consumer.assign(partitions)
    topic_partition_set = consumer.assignment()

    #consumer.seek_to_beginning()    # set the offset to the first value kept by the cluster (not necessarily 0); with no argument it applies to every partition of the consumer
    #consumer.seek_to_end()    # set the offset to the first not-yet-consumed value; with no argument it applies to every partition of the consumer
    for topic_partition in topic_partition_set:
        offset = consumer.position(topic_partition)
        print "partition: %d, offset: %d" % (topic_partition.partition, offset)
        #consumer.seek(topicTopicPartition, offset)     # 尽量不要手动设置这个值


    for msg in consumer:
        print ("topic:%s, partition:%d, offset:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value.decode("utf-8")))
from kafka import KafkaConsumer, TopicPartition

topic_name = "test"
consumer = KafkaConsumer(bootstrap_servers=['u1401.ambari.apache.org:6667', 'u1402.ambari.apache.org:6667', 'u1403.ambari.apache.org:6667'])
partitions = [TopicPartition(topic_name, partition) for partition in consumer.partitions_for_topic(topic_name) if partition < 5]
consumer.assign(partitions)
consumer.seek_to_beginning()
for message in consumer:
    print(message)
import sys, os, re
import json

from kafka import KafkaConsumer, TopicPartition
consumer = KafkaConsumer()
consumer.assign([TopicPartition('flight_delay_classification_request', 0)])
consumer.seek_to_beginning()

for message in consumer:
  message_bytes = message.value
  message_string = message_bytes.decode()
  message_object = json.loads(message_string)
  print(message_object)
Example 40
class KafkaChangeFeed(ChangeFeed):
    """
    Kafka-based implementation of a ChangeFeed
    """
    sequence_format = 'json'

    def __init__(self, topics, client_id, strict=False, num_processes=1, process_num=0):
        """
        Create a change feed listener for a list of kafka topics, a client ID, and partition.

        See http://kafka.apache.org/documentation.html#introduction for a description of what these are.
        """
        self._topics = topics
        self._client_id = client_id
        self._processed_topic_offsets = {}
        self.strict = strict
        self.num_processes = num_processes
        self.process_num = process_num
        self._consumer = None

    def __str__(self):
        return 'KafkaChangeFeed: topics: {}, client: {}'.format(self._topics, self._client_id)

    @property
    def topics(self):
        return self._topics

    def _get_single_topic_or_fail(self):
        if len(self._topics) != 1:
            raise ValueError("This function requires a single topic but found {}!".format(self._topics))
        return self._topics[0]

    def iter_changes(self, since, forever):
        """
        Since must be a dictionary of topic partition offsets.
        """
        timeout = float('inf') if forever else MIN_TIMEOUT
        start_from_latest = since is None
        reset = 'largest' if start_from_latest else 'smallest'
        self._init_consumer(timeout, auto_offset_reset=reset)

        since = self._filter_offsets(since)
        # a special value of since=None will start from the end of the change stream
        if since is not None and (not isinstance(since, dict) or not since):
            raise ValueError("'since' must be None or a topic offset dictionary")

        if not start_from_latest:
            if self.strict:
                validate_offsets(since)

            checkpoint_topics = {tp[0] for tp in since}
            extra_topics = checkpoint_topics - set(self._topics)
            if extra_topics:
                raise ValueError("'since' contains extra topics: {}".format(list(extra_topics)))

            self._processed_topic_offsets = copy(since)

            # Tell the consumer to start from offsets that were passed in
            for topic_partition, offset in since.items():
                self.consumer.seek(TopicPartition(topic_partition[0], topic_partition[1]), int(offset))

        try:
            for message in self.consumer:
                self._processed_topic_offsets[(message.topic, message.partition)] = message.offset
                yield change_from_kafka_message(message)
        except StopIteration:
            assert not forever, 'Kafka pillow should not timeout when waiting forever!'
            # no need to do anything since this is just telling us we've reached the end of the feed

    def get_current_checkpoint_offsets(self):
        # the way kafka works, the checkpoint should increment by 1 because
        # querying the feed is inclusive of the value passed in.
        latest_offsets = self.get_latest_offsets()
        ret = {}
        for topic_partition, sequence in self.get_processed_offsets().items():
            if sequence == latest_offsets[topic_partition]:
                # this topic and partition is totally up to date and if we add 1
                # then kafka will give us an offset out of range error.
                # not adding 1 to the partition means that we may process this
                # change again later, but that should be OK
                sequence = latest_offsets[topic_partition]
            else:
                sequence += 1
            ret[topic_partition] = sequence
        return self._filter_offsets(ret)

    def get_processed_offsets(self):
        return copy(self._processed_topic_offsets)

    def get_latest_offsets(self):
        return self.consumer.end_offsets(self.consumer.assignment())

    def get_latest_offsets_json(self):
        return json.loads(kafka_seq_to_str(self.get_latest_offsets()))

    def get_latest_offsets_as_checkpoint_value(self):
        return self.get_latest_offsets()

    @property
    def consumer(self):
        if self._consumer is None:
            return self._init_consumer()
        return self._consumer

    def _init_consumer(self, timeout=MIN_TIMEOUT, auto_offset_reset='smallest'):
        """Allow re-initing the consumer if necessary
        """
        config = {
            'client_id': self._client_id,
            'bootstrap_servers': settings.KAFKA_BROKERS,
            'consumer_timeout_ms': timeout,
            'auto_offset_reset': auto_offset_reset,
            'enable_auto_commit': False,
            'api_version': settings.KAFKA_API_VERSION,
        }
        self._consumer = KafkaConsumer(**config)

        topic_partitions = []
        for topic in self.topics:
            for partition in self._consumer.partitions_for_topic(topic):
                topic_partitions.append(TopicPartition(topic, partition))

        self._consumer.assign(self._filter_partitions(topic_partitions))
        return self._consumer

    def _filter_offsets(self, offsets):
        if offsets is None:
            return offsets

        return {
            tp: offsets[tp]
            for tp in self.consumer.assignment()
            if tp in offsets
        }

    def _filter_partitions(self, topic_partitions):
        topic_partitions.sort()

        return [
            topic_partitions[num::self.num_processes]
            for num in range(self.num_processes)
        ][self.process_num]
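A hedged usage sketch; the topic names, offsets, and the process() handler are placeholders, and since maps (topic, partition) tuples to offsets as iter_changes expects:

feed = KafkaChangeFeed(topics=['topic-a', 'topic-b'], client_id='example-client')
since = {('topic-a', 0): 1024, ('topic-b', 0): 512}
for change in feed.iter_changes(since=since, forever=False):
    process(change)                               # hypothetical change handler
print(feed.get_current_checkpoint_offsets())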
Example 41
import sys, os, re
import json

from kafka import KafkaConsumer, TopicPartition
consumer = KafkaConsumer()
consumer.assign([TopicPartition('test', 0)])
consumer.seek_to_beginning()

for message in consumer:
  message_bytes = message.value
  message_string = message_bytes.decode()
  message_object = json.loads(message_string)
  print(message_object)

Example 42
class Consumer(BaseStreamConsumer):
    """
    Used in DB and SW worker. SW consumes per partition.
    """
    def __init__(self, location, topic, group, partition_id):
        self._location = location
        self._group = group
        self._topic = topic
        self._consumer = KafkaConsumer(
            bootstrap_servers=self._location,
            group_id=self._group,
            max_partition_fetch_bytes=10485760,
            consumer_timeout_ms=100,
            client_id="%s-%s" % (self._topic, str(partition_id) if partition_id is not None else "all"),
            request_timeout_ms=120 * 1000,
        )

        if partition_id is not None:
            self._partition_ids = [TopicPartition(self._topic, partition_id)]
            self._consumer.assign(self._partition_ids)
        else:
            self._partition_ids = [TopicPartition(self._topic, pid) for pid in self._consumer.partitions_for_topic(self._topic)]
            self._consumer.subscribe(topics=[self._topic])
            if self._consumer._use_consumer_group():
                self._consumer._coordinator.ensure_coordinator_known()
                self._consumer._coordinator.ensure_active_group()

        self._consumer._update_fetch_positions(self._partition_ids)
        self._start_looping_call()

    def _start_looping_call(self, interval=60):
        def errback(failure):
            logger.exception(failure.value)
            if failure.frames:
                logger.critical(str("").join(format_tb(failure.getTracebackObject())))
            self._poll_task.start(interval).addErrback(errback)

        self._poll_task = LoopingCall(self._poll_client)
        self._poll_task.start(interval).addErrback(errback)

    def _poll_client(self):
        self._consumer._client.poll()

    def get_messages(self, timeout=0.1, count=1):
        result = []
        while count > 0:
            try:
                m = next(self._consumer)
                result.append(m.value)
                count -= 1
            except StopIteration:
                break
        return result

    def get_offset(self, partition_id):
        for tp in self._partition_ids:
            if tp.partition == partition_id:
                return self._consumer.position(tp)
        raise KeyError("Can't find partition %d" % partition_id)

    def close(self):
        self._poll_task.stop()
        self._consumer.commit()
        # getting kafka client event loop running some more and execute commit
        tries = 3
        while tries:
            self.get_messages()
            sleep(2.0)
            tries -= 1
        self._consumer.close()
Example 43
class CheckKafka(PubSubNagiosPlugin):

    def __init__(self):
        # Python 2.x
        super(CheckKafka, self).__init__()
        # Python 3.x
        # super().__init__()
        self.name = 'Kafka'
        self.default_port = 9092
        self.producer = None
        self.consumer = None
        self.topic = None
        self.client_id = 'Hari Sekhon ' + os.path.basename(get_topfile()) + ' ' + __version__
        self.group_id = self.client_id + ' ' + str(os.getpid()) + ' ' + random_alnum(10)
        self.acks = '1'
        self.retries = 0
        self.partition = None
        self.topic_partition = None
        self.brokers = None
        self.timeout_ms = None
        self.start_offset = None

    def add_options(self):
        # super(CheckKafka, self).add_options()
        # TODO: (host_envs, default_host) = getenvs2('HOST', default_host, name)
        # TODO: env support for Kafka brokers
        self.add_opt('-H', '--host', \
                     '-B', '--brokers', \
                     dest='brokers', metavar='broker_list', default='localhost:9092',
                     help='Kafka Broker seed list in form host[:port],host2[:port2]... (default: localhost:9092)')
        self.add_opt('-T', '--topic', help='Kafka Topic')
        self.add_opt('-p', '--partition', type=int, help='Kafka Partition (default: 0)', default=0)
        self.add_opt('-a', '--acks', default=1, choices=['1', 'all'],
                     help='Acks to require from Kafka. Valid options are \'1\' for Kafka ' +
                     'partition leader, or \'all\' for all In-Sync Replicas (may block causing ' +
                     'timeout if replicas aren\'t available, default: 1)')
        self.add_opt('-s', '--sleep', metavar='secs',
                     help='Sleep in seconds between producing and consuming from given topic (default: 0.5)')
        self.add_opt('--list-topics', action='store_true', help='List Kafka topics from broker(s) and exit')
        self.add_opt('--list-partitions', action='store_true',
                     help='List Kafka topic paritions from broker(s) and exit')
        self.add_thresholds(default_warning=1, default_critical=2)

    def run(self):
        try:
            super(CheckKafka, self).run()
        #except KafkaError as _:
            #raise CriticalError(_)
        except KafkaError:
            err = self.exception_msg()
            if 'NoBrokersAvailable' in err:
                err += ' ({0})'.format(self.brokers)
            raise CriticalError(err)

    @staticmethod
    def exception_msg():
        return traceback.format_exc().split('\n')[-2]

    def get_topics(self):
        self.consumer = KafkaConsumer(
            bootstrap_servers=self.brokers,
            client_id=self.client_id,
            request_timeout_ms=self.timeout_ms
            )
        return self.consumer.topics()

    def print_topics(self):
        print('Kafka Topics:\n')
        for topic in self.get_topics():
            print(topic)

    def get_topic_partitions(self, topic):
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=self.brokers,
            client_id=self.client_id,
            request_timeout_ms=self.timeout_ms
            )
        if topic not in self.get_topics():
            raise CriticalError("topic '{0}' does not exist on Kafka broker".format(topic))
        partitions = self.consumer.partitions_for_topic(topic)
        assert isSet(partitions)
        return partitions

    def print_topic_partitions(self, topic):
        print('Kafka topic \'{0}\' partitions:\n'.format(topic))
        #for partition in self.get_topic_partitions(topic):
        #    print(partition)
        print(list(self.get_topic_partitions(topic)))
        print()

    def process_args(self):
        self.brokers = self.get_opt('brokers')
        # TODO: add broker list validation back in
        # validate_hostport(self.brokers)
        log_option('brokers', self.brokers)
        self.timeout_ms = max((self.timeout * 1000 - 1000) / 2, 1000)

        try:
            list_topics = self.get_opt('list_topics')
            list_partitions = self.get_opt('list_partitions')
            if list_topics:
                self.print_topics()
                sys.exit(ERRORS['UNKNOWN'])
            self.topic = self.get_opt('topic')
        except KafkaError:
            raise CriticalError(self.exception_msg())

        if self.topic:
            validate_chars(self.topic, 'topic', 'A-Za-z-')
        elif list_topics or list_partitions:
            pass
        else:
            self.usage('--topic not specified')

        try:
            if list_partitions:
                if self.topic:
                    self.print_topic_partitions(self.topic)
                else:
                    for topic in self.get_topics():
                        self.print_topic_partitions(topic)
                sys.exit(ERRORS['UNKNOWN'])
        except KafkaError:
            raise CriticalError(self.exception_msg())

        self.partition = self.get_opt('partition')
        # technically optional, will hash to a random partition, but need to know which partition to get offset
        # if self.partition is not None:
        validate_int(self.partition, "partition", 0, 10000)
        self.topic_partition = TopicPartition(self.topic, self.partition)
        self.acks = self.get_opt('acks')
        try:
            self.acks = int(self.acks)
        except ValueError:
            pass
        log_option('acks', self.acks)
        self.validate_thresholds()

    def subscribe(self):
        self.consumer = KafkaConsumer(
            #self.topic,
            bootstrap_servers=self.brokers,
            # client_id=self.client_id,
            # group_id=self.group_id,
            request_timeout_ms=self.timeout_ms
            )
            #key_serializer
            #value_serializer
        # this is only a guess as Kafka doesn't expose its API version
        #log.debug('kafka api version: %s', self.consumer.config['api_version'])
        log.debug('partition assignments: {0}'.format(self.consumer.assignment()))

        # log.debug('subscribing to topic \'{0}\' partition \'{1}\''.format(self.topic, self.partition))
        # self.consumer.subscribe(TopicPartition(self.topic, self.partition))
        # log.debug('partition assignments: {0}'.format(self.consumer.assignment()))

        log.debug('assigning partition {0} to consumer'.format(self.partition))
        # self.consumer.assign([self.partition])
        self.consumer.assign([self.topic_partition])
        log.debug('partition assignments: {0}'.format(self.consumer.assignment()))

        log.debug('getting current offset')
        # see also highwater, committed, seek_to_end
        self.start_offset = self.consumer.position(self.topic_partition)
        if self.start_offset is None:
            # don't reset to zero here - I've seen a scenario where None is returned and every message
            # gets re-read; it's safer to fail instead
            # log.warn('consumer position returned None, resetting to zero')
            # self.start_offset = 0
            raise UnknownError('Kafka Consumer reported current starting offset = {0}'.format(self.start_offset))
        log.debug('recorded starting offset \'{0}\''.format(self.start_offset))
        # self.consumer.pause()

    def publish(self):
        log.debug('creating producer')
        self.producer = KafkaProducer(
            bootstrap_servers=self.brokers,
            client_id=self.client_id,
            acks=self.acks,
            batch_size=0,
            max_block_ms=self.timeout_ms,
            request_timeout_ms=self.timeout_ms
            )
            #key_serializer
            #value_serializer
        log.debug('producer.send()')
        self.producer.send(
            self.topic,
            key=self.key,
            partition=self.partition,
            value=self.publish_message
            )
        log.debug('producer.flush()')
        self.producer.flush()

    def consume(self):
        self.consumer.assign([self.topic_partition])
        log.debug('consumer.seek({0})'.format(self.start_offset))
        self.consumer.seek(self.topic_partition, self.start_offset)
        # self.consumer.resume()
        log.debug('consumer.poll(timeout_ms={0})'.format(self.timeout_ms))
        obj = self.consumer.poll(timeout_ms=self.timeout_ms)
        log.debug('msg object returned: %s', obj)
        msg = None
        try:
            for consumer_record in obj[self.topic_partition]:
                if consumer_record.key == self.key:
                    msg = consumer_record.value
                    break
        except KeyError:
            raise UnknownError('TopicPartition key was not found in response')
        if msg is None:
            raise UnknownError("failed to find matching consumer record with key '{0}'".format(self.key))
        return msg
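
Below is a minimal, self-contained sketch (not part of the plugin above) of the same round-trip pattern CheckKafka implements: record the current offset with position(), produce a uniquely keyed message, seek back to the recorded offset and poll until the key comes back. The broker address, topic name and the 0.5s settle time are assumptions for illustration only.

import time
import uuid
from kafka import KafkaConsumer, KafkaProducer, TopicPartition

broker = 'localhost:9092'   # assumed broker seed list
topic = 'test'              # assumed topic name
tp = TopicPartition(topic, 0)
key = uuid.uuid4().hex.encode()   # unique key so we only match our own message

consumer = KafkaConsumer(bootstrap_servers=broker)
consumer.assign([tp])
start_offset = consumer.position(tp)        # remember where the partition ends right now

producer = KafkaProducer(bootstrap_servers=broker, acks=1, batch_size=0)
producer.send(topic, key=key, value=b'canary', partition=0)
producer.flush()

time.sleep(0.5)                             # give the broker a moment, as the plugin's --sleep does
consumer.seek(tp, start_offset)             # rewind to the recorded offset
records = consumer.poll(timeout_ms=5000)
found = any(r.key == key for r in records.get(tp, []))
print('round trip OK' if found else 'message not found')
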
Esempio n. 44
0
class ClusterZookeeper(object):
    def __init__(self, zookeeper_hosts, kafka_hosts):
        self.groups_dict = {}
        self.topics_dict = {}
        self.brokers_list = []
        self.consumer = KafkaConsumer(bootstrap_servers=kafka_hosts.split(','))
        self.zk = KazooClient(hosts=zookeeper_hosts)
        self.zk.add_listener(self.keep_start)
        self.zk.start()
        if self.zk.exists('/consumers') is None or self.zk.exists('/brokers') is None:
            raise ValueError(zookeeper_hosts + ' is not a ZooKeeper ensemble for a Kafka cluster')
        ChildrenWatch(self.zk, '/consumers', self.groups_watch)
        ChildrenWatch(self.zk, '/brokers/topics', self.topics_watch)
        ChildrenWatch(self.zk, '/brokers/ids', self.brokers_watch)
        t = threading.Thread(target=self.latest, name=kafka_hosts)
        t.daemon = True
        t.start()

    # keep the ZooKeeper connection alive
    def keep_start(self, client_status):
        if client_status != 'CONNECTED':
            try:
                self.zk.start()
            except Exception:
                # reconnect attempt failed; nothing more to do here
                pass

    # watch the /consumers node
    def groups_watch(self, children):
        for group in [group for group in self.groups_dict.keys() if group not in children]:
            self.groups_dict.pop(group)
        for group in [group for group in children if group not in self.groups_dict.keys()]:
            owners_p = '/consumers/' + group + '/owners'
            if self.zk.exists(owners_p) is None:
                continue
            g_o_t = GroupOwnersTopic()
            self.groups_dict[group] = g_o_t
            ChildrenWatch(self.zk, owners_p, g_o_t.g_topic_watch)

    # watch the topic nodes
    def topics_watch(self, children):
        for topic in [topic for topic in self.topics_dict.keys() if topic not in children]:
            self.topics_dict.pop(topic)
        for topic in [topic for topic in children if topic not in self.topics_dict.keys()]:
            t_v = TopicValue()
            self.topics_dict[topic] = t_v
            DataWatch(self.zk, '/brokers/topics/' + topic, t_v.topic_watch)
            t_v.topic_partition = [TopicPartition(topic, p) for p in self.consumer.partitions_for_topic(topic)]

    # watch the broker nodes
    def brokers_watch(self, children):
        self.brokers_list = children

    def close_zk(self):
        try:
            self.zk.remove_listener(self.keep_start)
            self.zk.stop()
            self.zk.close()
        except Exception:
            # ignore errors while tearing down the ZooKeeper client
            pass

    def latest(self):
        while True:
            # time.sleep(0.1)
            time.sleep(0.001)
            for k, v in self.topics_dict.items():
                try:
                    partitions = v.topic_partition
                    self.consumer.assign(partitions)
                    self.consumer.seek_to_end(*partitions)
                    log_offset = sum(self.consumer.position(p) for p in partitions)
                    now_timestamp = int(time.mktime(time.localtime()))
                    if 'timestamp' in v.__dict__ and v.timestamp is not None:
                        v.speed = (log_offset - v.off_set) / (now_timestamp - v.timestamp)
                    v.timestamp = now_timestamp
                    v.off_set = log_offset
                except Exception:
                    # partition metadata may not be available yet; skip this topic for this cycle
                    pass
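
A minimal sketch of the per-topic rate calculation that latest() performs: assign all partitions of a topic, seek to the log end, sum the end offsets, and derive a messages-per-second rate from two successive samples. The broker address, topic name and sampling interval are assumptions for illustration only.

import time
from kafka import KafkaConsumer, TopicPartition

broker = 'localhost:9092'   # assumed broker
topic = 'test'              # assumed topic

consumer = KafkaConsumer(bootstrap_servers=broker)
partitions = [TopicPartition(topic, p) for p in consumer.partitions_for_topic(topic)]
consumer.assign(partitions)

def total_end_offset():
    # seek every assigned partition to its log end and sum the resulting positions
    consumer.seek_to_end(*partitions)
    return sum(consumer.position(p) for p in partitions)

previous, t0 = total_end_offset(), time.time()
time.sleep(5)               # sampling interval (the class above samples continuously)
current, t1 = total_end_offset(), time.time()
print('produce rate: {:.1f} msg/s'.format((current - previous) / (t1 - t0)))
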