Example #1
def output_kafka(graph_db, registry,
                 kafka_url=None):
    ldict = {"step": MODULEFILE + "/" + inspect.stack()[0][3],
             "hostname": platform.node().split(".")[0]}
    l = logging.LoggerAdapter(common.fetch_lg(), ldict)
    kafka_topic = "cs"
    if kafka_url is None:
        kafka_url = registry.get_config("kafka_url",
                                        "localhost:9092")
    else:
        l.info("Updating registry with kafka_url: {}".format(kafka_url))
        registry.put_config("kafka_url",
                            kafka_url)
    (nodes, rels) = out.output_json(graph_db, None, None, as_list=True)
    l.info("Connecting to kafka_url {}".format(kafka_url))
    kafka = KafkaClient(kafka_url)
    # To send messages asynchronously
    producer = SimpleProducer(kafka)
    l.info("Sending nodes to kafka {}/{}".format(kafka_url, kafka_topic))
    for n in nodes:
        producer.send_messages(kafka_topic, n)
    l.info("Sending rels to kafka {}/{}".format(kafka_url, kafka_topic))
    for n in rels:
        producer.send_messages(kafka_topic, n)
    kafka.close()
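Note: SimpleProducer and this style of KafkaClient belong to old kafka-python releases and are no longer available. A minimal sketch of the same publish loop with the current KafkaProducer API (assuming kafka-python >= 1.0 and that the serialized nodes/rels are strings or bytes):

from kafka import KafkaProducer

def output_kafka_modern(nodes, rels, kafka_url="localhost:9092", kafka_topic="cs"):
    producer = KafkaProducer(bootstrap_servers=kafka_url)
    try:
        for msg in list(nodes) + list(rels):
            if isinstance(msg, str):
                msg = msg.encode("utf-8")  # KafkaProducer expects bytes by default
            producer.send(kafka_topic, msg)  # asynchronous; batched internally
        producer.flush()  # block until every queued message is delivered
    finally:
        producer.close()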
Example #2
class CrawlScheduler(object):
    def __init__(self):
        self.kafka = KafkaClient(hosts=KAFKA_SERVER)
        self.consumer = SimpleConsumer(
            self.kafka, KAFKA_CONSUMER_GROUP, KAFKA_TOPIC,
            auto_commit=True,
            max_buffer_size=1024*1024)

        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)
    def shutdown(self):
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception, ex:
                logging.warn('submission failed (%s), retrying after 30s',
                             ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
Example #3
class CrawlScheduler(object):
    def __init__(self):
        self.kafka = KafkaClient(hosts=KAFKA_SERVER)
        self.consumer = SimpleConsumer(self.kafka,
                                       KAFKA_CONSUMER_GROUP,
                                       KAFKA_TOPIC,
                                       auto_commit=True,
                                       max_buffer_size=1024 * 1024)

        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)

    def shutdown(self):
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception, ex:
                logging.warn('submission failed (%s), retrying after 30s', ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
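The submit() retry loop above (retry forever, logging each failure and sleeping 30 seconds) is a reusable pattern on its own. A minimal standalone sketch, with the hypothetical helper name retry_forever:

import itertools
import logging
import time

def retry_forever(action, delay=30.0):
    # Call action() until it succeeds; log and sleep between failed attempts.
    for attempt in itertools.count():
        try:
            result = action()
            if attempt > 0:
                logging.info('retry succeeded after %d failed attempts', attempt)
            return result
        except Exception as ex:
            logging.warning('attempt failed (%s), retrying after %.0fs', ex, delay)
            time.sleep(delay)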
Example #4
class KafkaBusClient(BasicBusClient):
    def __init__(self, client_type, config):
        super(KafkaBusClient, self).__init__(client_type, config)
        self.address = str(self.address)
        self.topic = str(self.topic)
        self.client = KafkaClient("%s:%d" % (self.address, self.port))
        if config.has_key('async'):
            self.async = config['async']
        else:
            self.async = True

        if self.client_type == 'producer':
            self.producer = SimpleProducer(self.client, async=self.async)
        else:
            self.consumer_group = str(self.consumer_group)
            if not config.has_key('consumer_procs'):
                self.consumer_procs = multiprocessing.cpu_count()
                #print "Using %d processes" %(self.consumer_procs)

            self.consumer = SimpleConsumer(self.client, self.consumer_group,
                                           self.topic)
            #num_procs=self.consumer_procs)

    def close(self):
        self.client.close()
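Note: async became a reserved keyword in Python 3.7 and SimpleProducer/SimpleConsumer were removed from later kafka-python releases, so this class only runs on Python 2. A rough Python 3 sketch of the same producer/consumer switch with the current API (the config keys used here mirror the example and are assumptions):

from kafka import KafkaConsumer, KafkaProducer

class KafkaBusClientModern:
    def __init__(self, client_type, config):
        address = "%s:%d" % (config["address"], config["port"])
        self.topic = config["topic"]
        if client_type == "producer":
            self.client = KafkaProducer(bootstrap_servers=address)
        else:
            self.client = KafkaConsumer(self.topic,
                                        bootstrap_servers=address,
                                        group_id=config["consumer_group"])

    def close(self):
        self.client.close()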
Example #5
class KafkaBusClient(BasicBusClient):
	def __init__(self, client_type, config):
		super(KafkaBusClient, self).__init__(client_type, config)
		self.address = str(self.address)
		self.topic = str(self.topic)
		self.client = KafkaClient("%s:%d" %(self.address, self.port))
		if config.has_key('async'):
			self.async = config['async']
		else:
			self.async = True

		if self.client_type == 'producer':
			self.producer = SimpleProducer(self.client, async=self.async)
		else:
			self.consumer_group = str(self.consumer_group)
			if not config.has_key('consumer_procs'):
				self.consumer_procs = multiprocessing.cpu_count()
				#print "Using %d processes" %(self.consumer_procs)
				
			self.consumer = SimpleConsumer(self.client, 
							self.consumer_group, self.topic)
							#num_procs=self.consumer_procs)

	def close(self):
		self.client.close()
Example #6
class CrawlScheduler(object):
    def __init__(self):
        if False:
            self.kafka = KafkaClient(*KAFKA_SERVER)
            self.consumer = SimpleConsumer(self.kafka, "crawl", "wiki-links",
                                           driver_type=KAFKA_THREAD_DRIVER,
                                           auto_commit=False)
        else:
            self.kafka = None
            self.consumer = ZSimpleConsumer(ZKHOSTS, "crawl", "wiki-links",
                                            driver_type=KAFKA_THREAD_DRIVER,
                                            manage_offsets=True,
                                            auto_commit=False)

        self.submitter = HeadquarterSubmitter(HQ_BASE_URL, HQ_JOB)

        self.stats = dict(fetched=0, scheduled=0, discarded=0)
    def shutdown(self):
        if self.kafka:
            self.kafka.close()

    def submit(self, curls):
        logging.info('submitting %d curls to HQ', len(curls))
        for n in itertools.count():
            try:
                self.submitter.put(curls)
                if n > 0:
                    logging.info('submission retry succeeded')
                break
            except Exception, ex:
                logging.warn('submission failed (%s), retrying after 30s',
                             ex)
                time.sleep(30.0)
        self.consumer.commit()
        self.stats['scheduled'] += len(curls)
Example #7
def train(numIters):
    global users
    try:
        mcl = pm.MongoClient('10.137.168.196:27017')
        userColl = mcl.DataSet['PMLUsers']
        users = {
            user['userId']: user['partitionId']
            for user in userColl.find()
        }
        mcl.close()
        kafka = KafkaClient('mozo.cloudapp.net:9092', timeout=None)
        producer = UserProducer(kafka,
                                kafkaTopic,
                                users,
                                async=False,
                                req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                                ack_timeout=200)
        for userId, partitionId in users.iteritems():
            if userId in UoI.keys():
                for i in range(numIters):
                    #print "iteration " + str(i)
                    encodedMessage = simplejson.dumps({
                        'turtleId': turtleId,
                        'userId': userId,
                        'operation': 'train_one'
                    })
                    print i, producer.send(userId, encodedMessage)
    finally:
        producer.stop()
        kafka.close()
Example #8
def test_kafka_queue():
    kafka = KafkaClient("kafka01", 9092)
    q = KafkaQueue(kafka, "queue", [0])
    q.put("first")
    q.put("second")
    assert q.get() == "first"
    assert q.get() == "second"
    q.close()
    kafka.close()
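KafkaQueue comes from very early kafka-python; a comparable round-trip test with the current producer/consumer API could be sketched as follows (assuming an otherwise empty "queue" topic on kafka01:9092):

from kafka import KafkaConsumer, KafkaProducer

def test_kafka_roundtrip():
    producer = KafkaProducer(bootstrap_servers="kafka01:9092")
    consumer = KafkaConsumer("queue",
                             bootstrap_servers="kafka01:9092",
                             auto_offset_reset="earliest",
                             consumer_timeout_ms=10000)  # give up after 10s of silence
    producer.send("queue", b"first")
    producer.send("queue", b"second")
    producer.flush()
    values = [msg.value for msg in consumer]
    assert values[:2] == [b"first", b"second"]
    producer.close()
    consumer.close()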
Example #9
def main():
    kafka = KafkaClient("localhost:9092")

    producer = SimpleProducer(kafka)
    consumer = SimpleConsumer(kafka, "my-group", "activity.stream", max_buffer_size=None)

    producer.send_messages("activity.stream", "some message test")
    for message in consumer:
        print(message)

    kafka.close()
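For comparison, a minimal sketch of the same produce-then-consume round trip with the current kafka-python API (broker address and topic kept from the example; consumer_timeout_ms added so the loop can end):

from kafka import KafkaConsumer, KafkaProducer

def main():
    producer = KafkaProducer(bootstrap_servers="localhost:9092")
    consumer = KafkaConsumer("activity.stream",
                             bootstrap_servers="localhost:9092",
                             group_id="my-group",
                             auto_offset_reset="earliest",
                             consumer_timeout_ms=10000)  # stop iterating after 10s without messages

    producer.send("activity.stream", b"some message test")
    producer.flush()

    for message in consumer:
        print(message.value)

    producer.close()
    consumer.close()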
Example #10
def checkTopicExists(topic_name):
    try:
        kafkaClient = KafkaClient(bootstrap_servers=HOSTPORT)
        metadata = kafkaClient.poll()
        server_topics = list(x[1] for x in metadata[0].topics)
        kafkaClient.close()
        return topic_name in server_topics
    except IndexError:
        return False
    except KafkaUnavailableError:
        logging.error("Kafka is not available")
        return False
Example #11
def add_data():
    global users
    try:
        mcl = pm.MongoClient('10.137.168.196:27017')
        kafka = KafkaClient('mozo.cloudapp.net:9092', timeout=None)
        producer = UserProducer(kafka,
                                kafkaTopic,
                                users,
                                parts,
                                async=False,
                                req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                                ack_timeout=200)
        coll = mcl.DataSet['PMLExpression']
        ii = 0  # max is 151413 (number of doc in PMLExpression)
        for ent in coll.find({'userId': {
                '$in': UoI.keys()
        }}, {
                '_id': True,
                'userId': True
        },
                             timeout=False):

            ii += 1
            entity = str(ent['_id'])
            userId = ent['userId']
            if (stop_add_data(userId)):
                continue
            UoI[userId] += 1
            encodedMessage = simplejson.dumps({
                'turtleId': turtleId,
                'userId': userId,
                'entity': entity,
                'operation': 'add_data'
            })
            print producer.send(userId, encodedMessage)

        for userId, partitionId in users.iteritems():
            encodedMessage = simplejson.dumps({
                'turtleId': turtleId,
                'userId': userId,
                'operation': 'save_one'
            })
            print producer.send(userId, encodedMessage)
        userColl = mcl.DataSet['PMLUsers']
        if users:
            userColl.insert([{
                'userId': userId,
                'partitionId': partitionId
            } for userId, partitionId in users.iteritems()])
    finally:
        producer.stop()
        mcl.close()
        kafka.close()
Example #12
        def run(self):
            client = None
            consumer = None
            try:
                prev = None
                # print("Starting Kafka Client")
                # print("Kafka topic: {}").format(self.topic)
                print get_kafka_hosts()
                client = KafkaClient(hosts=get_kafka_hosts())
                consumer = SimpleConsumer(client=client,
                                          group=self.groupName.encode(
                                              'ascii', 'ignore'),
                                          topic=self.topic,
                                          iter_timeout=5)
                consumer.seek(0, 1)
                print '[Kafka Consumer] START'
                print 'Topic: {}'.format(self.topic)
                print 'Listening incoming message...'
                print '========================================================='
                # print("Listening kafka message...")

                while self.stopCpu is False:
                    for message in consumer.get_messages(count=5, block=False):
                        if self.stopCpu is True:
                            # print("Kafka Consumer Listening Stopped")
                            break

                        if message:
                            offset = message.offset
                            value = message.message.value
                            print 'msg: {0}, offset: {1}'.format(value, offset)

                            if len(value) > 0:
                                # chartdata = []
                                # j_val = json.loads(value)
                                # j_val['offset'] = offset
                                # chartdata.append(j_val)
                                # print("destination => ws"+str(self.pid))
                                # self.parentOj.emit("ws"+str(self.type), chartdata)
                                # self.parentOj.emit(self.topic, value)
                                self.parentOj.emit("ws" + str(self.pid), value)

                print '[Kafka Consumer] STOP'
                print 'Topic: {}'.format(self.topic)
                print 'Stop listening...'
                print '========================================================'
                # print("Listening kafka Stopped")
                consumer.stop()
                client.close()
            except Exception as e:
                print '[Kafka Consumer] ERROR: {}'.format(e)
                if consumer is not None:
                    consumer.stop()
                if client is not None:
                    client.close()
Example #13
def send_msg(msgs):
    cli = KafkaClient("localhost:9092")
    producer = SimpleProducer(cli)
    if isinstance(msgs, list):
        content = [(json.dumps(msg) if isinstance(msg, dict) else msg) for msg in msgs]
    else:
        content = [msgs]
    try:
        resp = producer.send_messages("tp_test1", *content)
        print resp
    except Exception:
        print traceback.format_exc()
    finally:
        cli.close()
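With the current KafkaProducer, the JSON encoding in send_msg can be delegated to a value_serializer. A minimal sketch (broker address and topic name kept from the example):

import json

from kafka import KafkaProducer

def send_msg_modern(msgs, bootstrap_servers="localhost:9092"):
    producer = KafkaProducer(
        bootstrap_servers=bootstrap_servers,
        # Serialize every payload to UTF-8 JSON before sending it.
        value_serializer=lambda m: json.dumps(m).encode("utf-8"))
    try:
        for msg in (msgs if isinstance(msgs, list) else [msgs]):
            producer.send("tp_test1", msg)
        producer.flush()
    finally:
        producer.close()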
Example #14
def offsetCommit():
    global users
    checkUserPartitionMapping()
    kafkaClient = KafkaClient(kafkaHost, timeout=None)
    producer = KeyedProducer(kafkaClient, async=False,
                      req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                      ack_timeout=200)
    for partition in partitions:
        encodedMessage = simplejson.dumps({'turtleName':turtleName,
                                           'user':'',
                                           'operation':'offsetCommit'})
        print producer.send(kafkaTopic, partition, encodedMessage)
    producer.stop(1)
    kafkaClient.close()
Example #15
def mockTradingdesk(sleeptime = 0.1):

    ISOTIMEFORMAT='%Y-%m-%d %X'
    global count
    count = 1
    while True:
        print count
        Client = KafkaClient("172.20.0.51:9092")
        data = json.dumps({"time_stamp":time.strftime(ISOTIMEFORMAT, time.localtime()),"click_id":"yf_td_test_topic","campaign_id":"3","offer_id":"4","ref_site":"5","site":"6","click_time":"7","cost_per_click":"8","payout":"9","real_ip":"10","proxy_ip":"11","device_id":"12","os_id":"13","carrier_id":"14","mobile_brand_id":"15","screen_h":"16","screen_w":"17","screen_id":"18","city_id":"19","brand_id":"20","model_id":"21","country_id":"22","state_id":"23","conversion_time":"24","event":"25","sub1":"26","sub2":"27","sub3":"28","sub4":"29","sub5":"30","sub6":"31","sub7":"32","sub8":"33","click":"34","lp_click":"35","conversion":"36","sub_campaign_id":"37"})
        producer = SimpleProducer(Client,async=False,req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE)
        producer.send_messages("td_test_topic1", data)
        count += 1
        Client.close()
        time.sleep(sleeptime)
    thread.exit_thread()
Example #16
class KafkaClientPlugin(cherrypy.process.plugins.SimplePlugin):
    def start(self):
        self.client = KafkaClient(config.KAFKA_HOST)
        self.producer = SimpleProducer(self.client)
        self.writer = WriterProcess(config.KAFKA_HOST)
        self.writer.start()
        self.bus.subscribe("dbwrite", self.dbwrite)

    def stop(self):
        self.writer.terminate()
        self.client.close()

    def dbwrite(self, key, value):
        message = IWAMessage(key, value)
        self.producer.send_messages(config.KAFKA_TOPIC, message.dumps())
        cherrypy.log("Queued: %s => %s" % (message.key, message.value))
Example #17
def main():
    kafka = KafkaClient('172.16.252.61', 9092)
    print kafka
    topic = 'test_kafka_4'
    number = 1100
    count = 0
    while (True):
        number += 1
        raw_input('next--->>')
        produce_example(kafka, topic, number)
        count += 1
        if count > 100: break
        #break
    #produce_example(kafka, topic, number)
    #produce_gz_example(kafka)
    #consume_example(kafka, topic)
    kafka.close()
Example #18
def main():
    kafka = KafkaClient("localhost:9092")
    producer = SimpleProducer(kafka)
    try:
        time.sleep(5)
        topic = 'test'
        for i in range(5):
            time.sleep(1)
            msg = 'This is a message sent from the kafka producer: ' \
                  + str(datetime.now().time()) + ' -- '\
                  + str(datetime.now().strftime("%A, %d %B %Y %I:%M%p"))
            print_response(producer.send_messages(topic, msg))
    except LeaderNotAvailableError:
        # https://github.com/mumrah/kafka-python/issues/249
        time.sleep(1)
        print_response(producer.send_messages(topic, msg))

    kafka.close()
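The LeaderNotAvailableError workaround above predates the producer's built-in retry support; with the current KafkaProducer the same behavior can be requested through the retries and retry_backoff_ms settings. A minimal sketch:

from datetime import datetime

from kafka import KafkaProducer

def main():
    producer = KafkaProducer(bootstrap_servers="localhost:9092",
                             retries=5,               # retry transient errors such as leader elections
                             retry_backoff_ms=1000)   # wait 1s between attempts
    for i in range(5):
        msg = "This is a message sent from the kafka producer: %s" % datetime.now().isoformat()
        producer.send("test", msg.encode("utf-8")).get(timeout=10)  # .get() surfaces delivery errors
    producer.close()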
Example #19
class KafkaProducer:
    def __init__(self, conn_pool, topic):
        self.conn_pool = conn_pool
        self.topic = topic
        self.kafka = KafkaClient(self.conn_pool)
        self.producer = SimpleProducer(self.kafka, async=True)

    def send(self, message):
        self.producer.send_messages(self.topic, message)

    def sendBulk(self, messages):
        self.producer.send_messages(self.topic, *messages)

    def close(self):
        self.producer.stop()
        self.kafka.close()
        self.kafka = None
        self.producer = None
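The same thin wrapper, sketched against the current KafkaProducer, which is already asynchronous and batches internally, so there is no async flag to pass:

from kafka import KafkaProducer

class ModernKafkaProducer:
    def __init__(self, bootstrap_servers, topic):
        self.topic = topic
        self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

    def send(self, message):
        self.producer.send(self.topic, message)

    def sendBulk(self, messages):
        for message in messages:
            self.producer.send(self.topic, message)

    def close(self):
        self.producer.flush()   # deliver anything still buffered
        self.producer.close()
        self.producer = None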
Example #20
def main():
    kafka = KafkaClient("localhost:9092")
    producer = SimpleProducer(kafka)
    try:
        time.sleep(5)
        topic = 'test'
        for i in range(5):
            time.sleep(1)
            msg = 'This is a message sent from the kafka producer: ' \
                  + str(datetime.now().time()) + ' -- '\
                  + str(datetime.now().strftime("%A, %d %B %Y %I:%M%p"))
            print_response(producer.send_messages(topic, msg))
    except LeaderNotAvailableError:
        # https://github.com/mumrah/kafka-python/issues/249
        time.sleep(1)
        print_response(producer.send_messages(topic, msg))
 
    kafka.close()
Example #21
def train(numIters):
    global users
    checkUserPartitionMapping()
    kafka = KafkaClient(kafkaHost, timeout=None)
    producer = UserProducer(kafka, kafkaTopic, users, partitions, async=False,
                      req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                      ack_timeout=200)
    for i in range(numIters):
        for user, partitionId in users.iteritems():
            if user == ''  or user == 'monk':
                continue
            encodedMessage = simplejson.dumps({'turtleName':turtleName,
                                               'user':user,
                                               'operation':'train'})
            print i, producer.send(user, encodedMessage)
    
    producer.stop(1)
    kafka.close()
Example #22
def main():
    kafka = KafkaClient("172.16.252.61", 9092)
    print kafka
    topic = "test_kafka_4"
    number = 1100
    count = 0
    while True:
        number += 1
        raw_input("next--->>")
        produce_example(kafka, topic, number)
        count += 1
        if count > 100:
            break
        # break
    # produce_example(kafka, topic, number)
    # produce_gz_example(kafka)
    # consume_example(kafka, topic)
    kafka.close()
Example #23
 def _determine_kafka_version(cls, init_config, instance):
     """Return the Kafka cluster version as a tuple."""
     kafka_version = instance.get('kafka_client_api_version')
     if isinstance(kafka_version, str):
         kafka_version = tuple(map(int, kafka_version.split(".")))
     if kafka_version is None:  # if unspecified by the user, we have to probe the cluster
         kafka_connect_str = instance.get(
             'kafka_connect_str')  # TODO call validation method
         kafka_client = KafkaClient(
             bootstrap_servers=kafka_connect_str,
             client_id='dd-agent',
             request_timeout_ms=init_config.get(
                 'kafka_timeout', DEFAULT_KAFKA_TIMEOUT) * 1000,
             # if `kafka_client_api_version` is not set, then kafka-python automatically probes the cluster for
             # broker version during the bootstrapping process. Note that this returns the first version found, so in
             # a mixed-version cluster this will be a non-deterministic result.
             api_version=kafka_version,
             # While we check for SASL/SSL params, if not present they will default to the kafka-python values for
             # plaintext connections
             security_protocol=instance.get('security_protocol',
                                            'PLAINTEXT'),
             sasl_mechanism=instance.get('sasl_mechanism'),
             sasl_plain_username=instance.get('sasl_plain_username'),
             sasl_plain_password=instance.get('sasl_plain_password'),
             sasl_kerberos_service_name=instance.get(
                 'sasl_kerberos_service_name', 'kafka'),
             sasl_kerberos_domain_name=instance.get(
                 'sasl_kerberos_domain_name'),
             ssl_cafile=instance.get('ssl_cafile'),
             ssl_check_hostname=instance.get('ssl_check_hostname', True),
             ssl_certfile=instance.get('ssl_certfile'),
             ssl_keyfile=instance.get('ssl_keyfile'),
             ssl_crlfile=instance.get('ssl_crlfile'),
             ssl_password=instance.get('ssl_password'),
         )
         # version probing happens automatically as part of KafkaClient's __init__()
         kafka_version = kafka_client.config['api_version']
         # Currently, this client is only used for probing, so we need to close it to avoid stale connections on
         # older Kafka brokers. We can't re-use in new code path because KafkaAdminClient doesn't currently support
         # passing in an existing client.
         # TODO this could be re-used by the legacy version of the check to make maintenance easier... ie, we don't
         # have multiple sections of code instantiating clients
         kafka_client.close()
     return kafka_version
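Stripped of the instance configuration, the probing trick used above reduces to: build a kafka.KafkaClient with api_version=None and read back the version it negotiated during bootstrap. A minimal sketch, assuming a plaintext broker:

from kafka import KafkaClient

def probe_broker_version(bootstrap_servers):
    # api_version=None asks kafka-python to probe the broker during bootstrap.
    client = KafkaClient(bootstrap_servers=bootstrap_servers,
                         client_id="version-probe",
                         api_version=None)
    try:
        return client.config["api_version"]  # e.g. (2, 3, 0)
    finally:
        client.close()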
Example #24
class KafkaProducer:

    def __init__(self,conn_pool,topic):
        self.conn_pool = conn_pool
        self.topic = topic
        self.kafka = KafkaClient(self.conn_pool)
        self.producer = SimpleProducer(self.kafka, async=True)

    def send(self,message):
        self.producer.send_messages(self.topic, message)

    def sendBulk(self,messages):
        self.producer.send_messages(self.topic, *messages)

    def close(self):
        self.producer.stop()
        self.kafka.close()
        self.kafka = None
        self.producer = None
Example #25
def set_mantis_parameter(para, value):
    global users
    checkUserPartitionMapping()
    kafka = KafkaClient(kafkaHost, timeout=None)
    producer = UserProducer(kafka, kafkaTopic, users, partitions, async=False,
                      req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                      ack_timeout=200)
    for user, partitionId in users.iteritems():
#        if not partitionId == 4:
#            continue
        encodedMessage = simplejson.dumps({'turtleName':turtleName,
                                           'user':user,
                                           'operation':'set_mantis_parameter',
                                           'para':para,
                                           'value':value})
        print producer.send(user, encodedMessage)
    
    producer.stop(1)
    kafka.close()
Example #26
class Producer:
	'Simple kafka producer'

	def __init__(self, servers, topic):
		self.client = KafkaClient(servers)
		self.producer = SimpleProducer(self.client)
		self.topic = topic

	def current_time(self):
		return int(round(time.time() * 1000))

	def produce(self, mesnum):
		msg = json.dumps({'time': self.current_time(), 'data' : 'Hello - %s' % mesnum})
		self.producer.send_messages(self.topic, msg)

	def stop(self):
		if self.producer != None:
			self.producer.stop()
		if self.client != None:
			self.client.close()
Example #27
def reset_all_data():
    global users
    checkUserPartitionMapping()
    kafka = KafkaClient(kafkaHost, timeout=None)
    producer = UserProducer(kafka, kafkaTopic, users, partitions, async=False,
                      req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                      ack_timeout=200)

    for user, partitionId in users.iteritems():            
        encodedMessage = simplejson.dumps({'turtleName':turtleName,
                                           'user':user,
                                           'operation':'reset_all_data'})
        print producer.send(user, encodedMessage)
    
    users['monk'] = 8
    encodedMessage = simplejson.dumps({'turtleName':turtleName,
                                       'user':'******',
                                       'operation':'reset_all_data'})
    print producer.send('monk', encodedMessage)
    producer.stop(1)
    kafka.close()    
Example #28
def producerMain(args):
    kafka = KafkaClient(args[0])

    # Send messages synchronously in batches. This will collect
    # messages in batch and send them to Kafka after 20 messages are
    # collected or every 60 seconds
    # Notes:
    # * If the producer dies before the messages are sent, there will be losses
    # * Call producer.stop() to send the messages and cleanup
    producer = SimpleProducer(
        kafka
    )  #, batch_send=True, batch_send_every_n=20, batch_send_every_t=60)

    # To wait for acknowledgements
    # ACK_AFTER_LOCAL_WRITE : server will wait till the data is written to
    #                         a local log before sending response
    # ACK_AFTER_CLUSTER_COMMIT : server will block until the message is committed
    #                            by all in sync replicas before sending a response
    #producer = SimpleProducer(kafka, async=False,
    #                      req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
    #                      ack_timeout=2000)

    seenFiles = False
    for path in args[1:]:
        files = getFilesToWorkOn(path)
        for filename in files:
            print "File: " + filename
            f = open(filename, "r")
            if filename.endswith(".csv"):
                sendFromCsv(producer, f)
                seenFiles = True
            elif filename.endswith(".xml"):
                sendFromXml(producer, f)
                seenFiles = True

    #if seenFiles:
    #    producer.stop() # send the remaining batched messages and cleanup
    kafka.close()

    return
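The batch_send options mentioned in the comments belong to the old SimpleProducer; on the current KafkaProducer, batching is controlled with batch_size and linger_ms instead. A minimal sketch of a producer configured to batch in roughly the same way:

from kafka import KafkaProducer

def make_batching_producer(bootstrap_servers):
    # Buffer up to 16 KB per partition, or 60 seconds, whichever comes first.
    return KafkaProducer(bootstrap_servers=bootstrap_servers,
                         batch_size=16384,
                         linger_ms=60000,
                         acks=1)  # ack once the leader has written to its local log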
Example #29
def train(numIters):
    global users
    try:
        mcl = pm.MongoClient('10.137.168.196:27017')
        userColl = mcl.DataSet['PMLUsers']
        users = {user['userId']:user['partitionId'] for user in userColl.find()}
        mcl.close()
        kafka = KafkaClient('mozo.cloudapp.net:9092', timeout=None)
        producer = UserProducer(kafka, kafkaTopic, users, async=False,
                          req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                          ack_timeout=200)
        for userId, partitionId in users.iteritems():   
            if userId in UoI.keys():    
                for i in range(numIters):                         
                    #print "iteration " + str(i)
                    encodedMessage = simplejson.dumps({'turtleId':turtleId,
                                                   'userId':userId,
                                                   'operation':'train_one'})
                    print i, producer.send(userId, encodedMessage)
    finally:
        producer.stop()
        kafka.close()
Example #30
def producerMain(args):
    kafka = KafkaClient(args[0])

    # Send messages synchronously in batches. This will collect
    # messages in batch and send them to Kafka after 20 messages are
    # collected or every 60 seconds
    # Notes:
    # * If the producer dies before the messages are sent, there will be losses
    # * Call producer.stop() to send the messages and cleanup
    producer = SimpleProducer(kafka) #, batch_send=True, batch_send_every_n=20, batch_send_every_t=60)

    # To wait for acknowledgements
    # ACK_AFTER_LOCAL_WRITE : server will wait till the data is written to
    #                         a local log before sending response
    # ACK_AFTER_CLUSTER_COMMIT : server will block until the message is committed
    #                            by all in sync replicas before sending a response
    #producer = SimpleProducer(kafka, async=False,
    #                      req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE,
    #                      ack_timeout=2000)

    seenFiles = False
    for path in args[1:]:
        files = getFilesToWorkOn(path)
        for filename in files:
            print "File: " + filename
            f = open(filename, "r")
            if filename.endswith(".csv"):
                sendFromCsv(producer, f)
                seenFiles = True
            elif filename.endswith(".xml"):
                sendFromXml(producer, f)
                seenFiles = True

    #if seenFiles:
    #    producer.stop() # send the remaining batched messages and cleanup
    kafka.close()

    return
Example #31
def add_data():
    global users
    try:
        mcl = pm.MongoClient('10.137.168.196:27017')
        kafka = KafkaClient('mozo.cloudapp.net:9092', timeout=None)
        producer = UserProducer(kafka, kafkaTopic, users, parts, async=False,
                          req_acks=UserProducer.ACK_AFTER_LOCAL_WRITE,
                          ack_timeout=200)
        coll = mcl.DataSet['PMLExpression']
        ii = 0      # max is 151413 (number of doc in PMLExpression)
        for ent in coll.find({'userId': {'$in': UoI.keys()}}, {'_id':True, 'userId':True}, timeout=False):
            
            ii += 1
            entity = str(ent['_id'])
            userId = ent['userId']
            if (stop_add_data(userId)):
                continue
            UoI[userId] += 1
            encodedMessage = simplejson.dumps({'turtleId':turtleId,
                                               'userId':userId,
                                               'entity':entity,
                                               'operation':'add_data'})
            print producer.send(userId, encodedMessage)
            
        for userId, partitionId in users.iteritems():
            encodedMessage = simplejson.dumps({'turtleId':turtleId,
                                               'userId':userId,
                                               'operation':'save_one'})
            print producer.send(userId, encodedMessage)
        userColl = mcl.DataSet['PMLUsers']
        if users:
            userColl.insert([{'userId':userId, 'partitionId':partitionId} for userId, partitionId in users.iteritems()])
    finally:
        producer.stop()
        mcl.close()
        kafka.close()
Example #32
    def check(self, instance):
        consumer_groups = self.read_config(instance, 'consumer_groups',
                                           cast=self._validate_consumer_groups)
        kafka_host_ports = self.read_config(instance, 'kafka_connect_str')
        full_output = self.read_config(instance, 'full_output', cast=bool)
        dimensions = self.read_config(instance, 'dimensions', cast=dict, optional=True)
        new_dimensions = {'component': 'kafka', 'service': 'kafka'}
        if dimensions is not None:
            new_dimensions.update(dimensions.copy())

        try:
            # Connect to Kafka
            kafka_conn = KafkaClient(kafka_host_ports)

            # Query Kafka for consumer offsets
            consumer_offsets = {}
            topics = defaultdict(set)
            for consumer_group, topic_partitions in consumer_groups.iteritems():
                for topic, partitions in topic_partitions.iteritems():
                    consumer = SimpleConsumer(kafka_conn, consumer_group, topic)
                    # Remember the topic partitions that we've see so that we can
                    # look up their broker offsets later
                    topics[topic].update(set(partitions))
                    for partition in partitions:
                        consumer_offsets[(consumer_group, topic, partition)] = consumer.offsets[partition]
                    consumer.stop()

            # Query Kafka for the broker offsets, done in a separate loop so only one query is done
            # per topic even if multiple consumer groups watch the same topic
            broker_offsets = {}
            for topic, partitions in topics.items():
                offset_responses = kafka_conn.send_offset_request([
                    OffsetRequest(topic, p, -1, 1) for p in partitions])

                for resp in offset_responses:
                    broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0]
        finally:
            try:
                kafka_conn.close()
            except Exception:
                self.log.exception('Error cleaning up Kafka connection')

        # Report the broker data
        if full_output:
            for (topic, partition), broker_offset in broker_offsets.items():
                broker_dimensions = new_dimensions.copy()
                broker_dimensions.update({'topic': topic, 'partition': partition})
                self.gauge('kafka.broker_offset',
                           broker_offset,
                           dimensions=broker_dimensions)

        # Report the consumer data
        for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items():
            # Get the broker offset
            broker_offset = broker_offsets.get((topic, partition))
            # Report the consumer offset and lag
            consumer_dimensions = new_dimensions.copy()
            consumer_dimensions['topic'] = topic
            consumer_dimensions['partition'] = partition
            consumer_dimensions['consumer_group'] = consumer_group
            if full_output:
                self.gauge('kafka.consumer_offset',
                           consumer_offset,
                           dimensions=consumer_dimensions)
            self.gauge('kafka.consumer_lag',
                       broker_offset - consumer_offset,
                       dimensions=consumer_dimensions)
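With current kafka-python, the same broker-offset versus consumer-offset comparison can be done through KafkaConsumer.end_offsets() and committed(). A minimal sketch (group and topic names are placeholders):

from kafka import KafkaConsumer, TopicPartition

def consumer_lag(bootstrap_servers, group, topic):
    consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers, group_id=group)
    try:
        partitions = [TopicPartition(topic, p)
                      for p in consumer.partitions_for_topic(topic) or []]
        end_offsets = consumer.end_offsets(partitions)  # latest offsets on the brokers
        lag = {}
        for tp in partitions:
            committed = consumer.committed(tp) or 0  # None if the group never committed
            lag[tp.partition] = end_offsets[tp] - committed
        return lag
    finally:
        consumer.close()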
Example #33
def main():
    """kafkadump: Kafka topic dump utility for debugging.

    Usage:
        kafkadump list --host=<host>
        kafkadump dump <topic> --host=<host> [--consumer=<consumer>]

    Examples:

        List all the topics on your local Kafka instance:

            python kafkadump.py list --host=<kafkahost>:9092

        Dump the contents of a single topic starting from offset 0:

            python kafkadump.py dump test.crawled_firehose --host=<kafkahost>:9092

        Use CTRL+C (SIGINT, KeyboardInterrupt) to stop it from polling Kafka.
        It will end by printing the total records serviced and the raw output
        of the most recent record.

    Options:
        -h --host <host>            Kafka host name where Kafka cluster will be resolved
        -c --consumer <consumer>    Consumer group ID to use for reading messages
    """
    args = docopt(main.__doc__)
    host = args["--host"]

    logging.basicConfig()

    print "=> Connecting to {0}...".format(host)
    kafka = KafkaClient(host)
    print "=> Connected."

    if args["list"]:
        for topic in kafka.topic_partitions.keys():
            print topic
        return 0
    elif args["dump"]:
        topic = args["<topic>"]
        consumer_id = args["--consumer"] or "default"
        consumer = SimpleConsumer(kafka, consumer_id, topic,
                            buffer_size=1024*100,      # 100kb
                            fetch_size_bytes=1024*100, # 100kb
                            max_buffer_size=None       # eliminate big message errors
                            )
        consumer.seek(0, 0)
        num_records = 0
        total_bytes = 0
        item = None
        while True:
            try:
                message = consumer.get_message()
                if message is None:
                    time.sleep(1)
                    continue
                val = message.message.value
                item = json.loads(val)
                body_bytes = len(item)
                print item
                num_records = num_records + 1
                total_bytes = total_bytes + body_bytes
            except:
                traceback.print_exc()
                break
        total_mbs = float(total_bytes) / (1024*1024)
        print
        if item is not None:
            print json.dumps(item, indent=4)
        if num_records == 0:
            num_records = 1
        print num_records, "records", total_mbs, "megabytes", (float(total_bytes) / num_records / 1024), "kb per msg"
        kafka.close()
        return 0
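The dump branch above can be reproduced with the current KafkaConsumer by starting from the earliest offset; a minimal sketch (host, topic and consumer group are passed in, as in the docopt interface):

import json

from kafka import KafkaConsumer

def dump_topic(host, topic, consumer_group="default"):
    consumer = KafkaConsumer(topic,
                             bootstrap_servers=host,
                             group_id=consumer_group,
                             auto_offset_reset="earliest",  # start at offset 0 for a fresh group
                             consumer_timeout_ms=5000)      # stop once the topic is drained
    num_records = 0
    for message in consumer:
        print(json.loads(message.value))
        num_records += 1
    consumer.close()
    return num_records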
Example #34
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(self,
                 zk_hosts,
                 group,
                 topic,
                 nodes,
                 zk_handler=None,
                 logger=None,
                 identifier=None,
                 **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.
        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError(
                "Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']"
            )
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:

                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning(
                            "Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info(
                            'Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(
                            lambda greenlet: self.zkp.join_group)

                if not self.nodes:
                    self.logger.info(
                        'Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk,
                    self.group,
                    self.topic,
                    self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger,
                    identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk,
                    self.group,
                    self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger,
                    identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn(
                    'Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info(
                    'Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client,
                                       self.group,
                                       self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s",
                         self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
                self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages is fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self,
                     block=True,
                     timeout=0.1,
                     get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
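Note: the hand-rolled ZooKeeper rebalancing above targets Kafka 0.8.1, where the broker-side group protocol did not exist yet. On later brokers, partition assignment and offset commits are handled by the consumer-group protocol, so the current kafka-python equivalent is simply a group consumer. A minimal sketch (handle is a hypothetical processing callback):

from kafka import KafkaConsumer

def run_group_consumer(bootstrap_servers, group, topic, handle):
    consumer = KafkaConsumer(topic,
                             bootstrap_servers=bootstrap_servers,
                             group_id=group,
                             enable_auto_commit=False)  # commit explicitly, as ZKConsumer does
    try:
        for message in consumer:
            handle(message)
            consumer.commit()  # broker-side offset commit for the group
    finally:
        consumer.close()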
Example #35
class KafkaManager:
    """
    A class used to interact with Kafka and Zookeeper
    and easily retrieve useful information
    """

    MAX_RETRY = 10
    MAX_POLL_RETRIES = 3
    MAX_ZK_RETRIES = 5
    TOPIC_RESOURCE_ID = 2
    DEFAULT_TIMEOUT = 15000
    SUCCESS_CODE = 0
    ZK_REASSIGN_NODE = '/admin/reassign_partitions'
    ZK_TOPIC_PARTITION_NODE = '/brokers/topics/'
    ZK_TOPIC_CONFIGURATION_NODE = '/config/topics/'

    # Not used yet.
    ZK_TOPIC_DELETION_NODE = '/admin/delete_topics/'

    def __init__(self, module, **configs):
        self.module = module
        self.zk_client = None
        self.client = KafkaClient(**configs)

    def init_zk_client(self, **configs):
        """
        Zookeeper client initialization
        """
        self.zk_client = KazooClient(**configs)
        self.zk_client.start()

    def close_zk_client(self):
        """
        Closes Zookeeper client
        """
        self.zk_client.stop()

    def close(self):
        """
        Closes Kafka client
        """
        self.client.close()

    def refresh(self):
        """
        Refresh topics state
        """
        fut = self.client.cluster.request_update()
        self.client.poll(future=fut)
        if not fut.succeeded():
            self.close()
            self.module.fail_json(
                msg='Error while updating topic state from Kafka server: %s.'
                % fut.exception
            )

    def create_topic(self, name, partitions, replica_factor,
                     replica_assignment=[], config_entries=[],
                     timeout=None):
        """
        Creates a topic
        Usable for Kafka version >= 0.10.1
        """
        if timeout is None:
            timeout = self.DEFAULT_TIMEOUT
        request = CreateTopicsRequest_v0(
            create_topic_requests=[(
                name, partitions, replica_factor, replica_assignment,
                config_entries
            )],
            timeout=timeout
        )
        response = self.send_request_and_get_response(request)

        for topic, error_code in response.topic_error_codes:
            if error_code != self.SUCCESS_CODE:
                self.close()
                self.module.fail_json(
                    msg='Error while creating topic %s. '
                    'Error key is %s, %s.' % (
                        topic, kafka.errors.for_code(error_code).message,
                        kafka.errors.for_code(error_code).description
                    )
                )

    def delete_topic(self, name, timeout=None):
        """
        Deletes a topic
        Usable for Kafka version >= 0.10.1
        Need to know which broker is controller for topic
        """
        if timeout is None:
            timeout = self.DEFAULT_TIMEOUT
        request = DeleteTopicsRequest_v0(topics=[name], timeout=timeout)
        response = self.send_request_and_get_response(request)

        for topic, error_code in response.topic_error_codes:
            if error_code != self.SUCCESS_CODE:
                self.close()
                self.module.fail_json(
                    msg='Error while deleting topic %s. '
                    'Error key is: %s, %s. '
                    'Is option \'delete.topic.enable\' set to true on '
                    ' your Kafka server?' % (
                        topic, kafka.errors.for_code(error_code).message,
                        kafka.errors.for_code(error_code).description
                    )
                )

    @staticmethod
    def _convert_create_acls_resource_request_v0(acl_resource):
        if acl_resource.operation == ACLOperation.ANY:
            raise IllegalArgumentError("operation must not be ANY")
        if acl_resource.permission_type == ACLPermissionType.ANY:
            raise IllegalArgumentError("permission_type must not be ANY")

        return (
            acl_resource.resource_type,
            acl_resource.name,
            acl_resource.principal,
            acl_resource.host,
            acl_resource.operation,
            acl_resource.permission_type
        )

    @staticmethod
    def _convert_create_acls_resource_request_v1(acl_resource):
        if acl_resource.operation == ACLOperation.ANY:
            raise IllegalArgumentError("operation must not be ANY")
        if acl_resource.permission_type == ACLPermissionType.ANY:
            raise IllegalArgumentError("permission_type must not be ANY")

        return (
            acl_resource.resource_type,
            acl_resource.name,
            acl_resource.pattern_type,
            acl_resource.principal,
            acl_resource.host,
            acl_resource.operation,
            acl_resource.permission_type
        )

    @staticmethod
    def _convert_delete_acls_resource_request_v0(acl_resource):
        return (
            acl_resource.resource_type,
            acl_resource.name,
            acl_resource.principal,
            acl_resource.host,
            acl_resource.operation,
            acl_resource.permission_type
        )

    def describe_acls(self, acl_resource, api_version):
        """Describe a set of ACLs
        """

        if api_version < parse_version('2.0.0'):
            request = DescribeAclsRequest_v0(
                resource_type=acl_resource.resource_type,
                resource_name=acl_resource.name,
                principal=acl_resource.principal,
                host=acl_resource.host,
                operation=acl_resource.operation,
                permission_type=acl_resource.permission_type
            )
        else:
            request = DescribeAclsRequest_v1(
                resource_type=acl_resource.resource_type,
                resource_name=acl_resource.name,
                resource_pattern_type_filter=acl_resource.pattern_type,
                principal=acl_resource.principal,
                host=acl_resource.host,
                operation=acl_resource.operation,
                permission_type=acl_resource.permission_type
            )

        response = self.send_request_and_get_response(request)

        if response.error_code != self.SUCCESS_CODE:
            self.close()
            self.module.fail_json(
                msg='Error while describing ACL %s. '
                    'Error %s: %s.' % (
                        acl_resource, response.error_code,
                        response.error_message
                    )
            )

        return response.resources

    def create_acls(self, acl_resources, api_version):
        """Create a set of ACLs"""

        if api_version < parse_version('2.0.0'):
            request = CreateAclsRequest_v0(
                creations=[self._convert_create_acls_resource_request_v0(
                    acl_resource) for acl_resource in acl_resources]
            )
        else:
            request = CreateAclsRequest_v1(
                creations=[self._convert_create_acls_resource_request_v1(
                    acl_resource) for acl_resource in acl_resources]
            )
        response = self.send_request_and_get_response(request)

        for error_code, error_message in response.creation_responses:
            if error_code != self.SUCCESS_CODE:
                self.close()
                self.module.fail_json(
                    msg='Error while creating ACL %s. '
                    'Error %s: %s.' % (
                        acl_resources, error_code, error_message
                    )
                )

    def delete_acls(self, acl_resources):
        """Delete a set of ACLSs"""

        request = DeleteAclsRequest_v0(
            filters=[self._convert_delete_acls_resource_request_v0(
                acl_resource) for acl_resource in acl_resources]
        )

        response = self.send_request_and_get_response(request)

        for error_code, error_message, _ in response.filter_responses:
            if error_code != self.SUCCESS_CODE:
                self.close()
                self.module.fail_json(
                    msg='Error while deleting ACL %s. '
                    'Error %s: %s.' % (
                        acl_resources, error_code, error_message
                    )
                )

    def send_request_and_get_response(self, request):
        """
        Sends a Kafka protocol request and returns
        the associated response
        """
        try:
            node_id = self.get_controller()

        except UndefinedController:
            self.module.fail_json(
                msg='Cannot determine a controller for your current Kafka '
                'server. Is your Kafka server running and available on '
                '\'%s\' with security protocol \'%s\'?' % (
                    self.client.config['bootstrap_servers'],
                    self.client.config['security_protocol']
                )
            )

        except Exception as e:
            self.module.fail_json(
                msg='Cannot determine a controller for your current Kafka '
                'server. Is your Kafka server running and available on '
                '\'%s\' with security protocol \'%s\'? Are you using the '
                'library versions from given \'requirements.txt\'? '
                'Exception was: %s' % (
                    self.client.config['bootstrap_servers'],
                    self.client.config['security_protocol'],
                    e
                )
            )

        if self.connection_check(node_id):
            future = self.client.send(node_id, request)
            self.client.poll(future=future)
            if future.succeeded():
                return future.value
            else:
                self.close()
                self.module.fail_json(
                    msg='Error while sending request %s to Kafka server: %s.'
                    % (request, future.exception)
                )
        else:
            self.close()
            self.module.fail_json(
                msg='Connection is not ready, please check your client '
                'and server configurations.'
            )

    def get_controller(self):
        """
        Returns the current controller
        """
        if self.client.cluster.controller is not None:
            node_id, _host, _port, _rack = self.client.cluster.controller
            return node_id
        else:
            raise UndefinedController(
                'Can\'t get a controller for this cluster.'
            )

    def get_controller_id_for_topic(self, topic_name):
        """
        Returns current controller for topic
        """
        request = MetadataRequest_v1(topics=[topic_name])
        response = self.send_request_and_get_response(request)
        return response.controller_id

    def get_config_for_topic(self, topic_name, config_names):
        """
        Returns responses with configuration
        Usable with Kafka version >= 0.11.0
        """
        request = DescribeConfigsRequest_v0(
            resources=[(self.TOPIC_RESOURCE_ID, topic_name, config_names)]
        )
        return self.send_request_and_get_response(request)

    def get_responses_from_client(self, connection_sleep=1):
        """
        Obtains a response from the server using poll().
        It may take some time to get the response, so we retry a few times.
        """
        retries = 0
        if self.get_awaiting_request() > 0:
            while retries < self.MAX_POLL_RETRIES:
                resp = self.client.poll()
                if resp:
                    return resp
                time.sleep(connection_sleep)
                retries += 1
            self.close()
            self.module.fail_json(
                msg='Error while getting responses: no response to the request '
                'was obtained, please check your client and server '
                'configurations.'
            )
        else:
            self.close()
            self.module.fail_json(
                msg='No pending request, please check your client and server '
                'configurations.'
            )

    def get_topics(self):
        """
        Returns the topics list
        """
        return self.client.cluster.topics()

    def get_total_partitions_for_topic(self, topic):
        """
        Returns the number of partitions for topic
        """
        return len(self.client.cluster.partitions_for_topic(topic))

    def get_partitions_for_topic(self, topic):
        """
        Returns all partitions for topic, with information
        TODO do not use private property anymore
        """
        return self.client.cluster._partitions[topic]

    def get_total_brokers(self):
        """
        Returns number of brokers available
        """
        return len(self.client.cluster.brokers())

    def get_brokers(self):
        """
        Returns all brokers
        """
        return self.client.cluster.brokers()

    def get_api_version(self):
        """
        Returns Kafka server version
        """
        major, minor, patch = self.client.config['api_version']
        return '%s.%s.%s' % (major, minor, patch)

    def get_awaiting_request(self):
        """
        Returns the number of requests currently in the queue
        """
        return self.client.in_flight_request_count()

    def connection_check(self, node_id, connection_sleep=0.1):
        """
        Checks that the connection with the broker is OK and that it is
        possible to send requests.
        Since the _maybe_connect() function used in ready() is 'async', we
        need to call it manually several times to establish the connection.
        """
        retries = 0
        if not self.client.ready(node_id):
            while retries < self.MAX_RETRY:
                if self.client.ready(node_id):
                    return True
                time.sleep(connection_sleep)
                retries += 1
            return False
        return True

    def is_topic_configuration_need_update(self, topic_name, topic_conf):
        """
        Checks whether topic's options need to be updated or not.
        Since the DescribeConfigsRequest does not give all current
        configuration entries for a topic, we need to use Zookeeper.
        Requires zk connection.
        """
        current_config, _zk_stats = self.zk_client.get(
            self.ZK_TOPIC_CONFIGURATION_NODE + topic_name
        )
        current_config = json.loads(current_config)['config']

        if len(topic_conf) != len(current_config.keys()):
            return True
        else:
            for conf_name, conf_value in topic_conf:
                if (
                        conf_name not in current_config.keys() or
                        str(conf_value) != str(current_config[conf_name])
                ):
                    return True

        return False

    def is_topic_partitions_need_update(self, topic_name, partitions):
        """
        Checks whether topic's partitions need to be updated or not.
        """
        total_partitions = self.get_total_partitions_for_topic(topic_name)
        need_update = False

        if partitions != total_partitions:
            if partitions > total_partitions:
                # increasing partition number
                need_update = True
            else:
                # decreasing partition number, which is not possible
                self.close()
                self.module.fail_json(
                    msg='Can\'t update \'%s\' topic partitions from %s to %s: '
                    'only an increase is possible.' % (
                        topic_name, total_partitions, partitions
                    )
                )

        return need_update

    def is_topic_replication_need_update(self, topic_name, replica_factor):
        """
        Checks whether a topic replica needs to be updated or not.
        """
        need_update = False
        for _id, part in self.get_partitions_for_topic(topic_name).items():
            _topic, _partition, _leader, replicas, _isr, _error = part
            if len(replicas) != replica_factor:
                need_update = True

        return need_update

    def update_topic_partitions(self, topic_name, partitions):
        """
        Updates the topic partitions.
        Usable for Kafka version >= 1.0.0.
        Must be sent to the current controller of the Kafka cluster.
        The request must specify the total number of partitions and the
        broker assignment for each new partition, without forgetting the
        replicas.
        See the NewPartitions class for explanations:
        apache/kafka/clients/admin/NewPartitions.java#L53
        """
        brokers = []
        for node_id, _, _, _ in self.get_brokers():
            brokers.append(int(node_id))
        brokers_iterator = itertools.cycle(brokers)
        topic, _, _, replicas, _, _ = (
            self.get_partitions_for_topic(topic_name)[0]
        )
        total_replica = len(replicas)
        old_partition = self.get_total_partitions_for_topic(topic_name)
        assignments = []
        for _new_partition in range(partitions - old_partition):
            assignment = []
            for _replica in range(total_replica):
                assignment.append(next(brokers_iterator))
            assignments.append(assignment)

        request = CreatePartitionsRequest_v0(
            topic_partitions=[(topic_name, (partitions, assignments))],
            timeout=self.DEFAULT_TIMEOUT,
            validate_only=False
        )
        response = self.send_request_and_get_response(request)
        for topic, error_code, _error_message in response.topic_errors:
            if error_code != self.SUCCESS_CODE:
                self.close()
                self.module.fail_json(
                    msg='Error while updating topic \'%s\' partitions. '
                    'Error key is %s, %s. Request was %s.' % (
                        topic, kafka.errors.for_code(error_code).message,
                        kafka.errors.for_code(error_code).description,
                        str(request)
                    )
                )
        self.refresh()

    def update_topic_configuration(self, topic_name, topic_conf):
        """
        Updates the topic configuration.
        Usable for Kafka version >= 0.11.0.
        Must be sent to the current controller of the Kafka cluster.
        """
        request = AlterConfigsRequest_v0(
            resources=[(self.TOPIC_RESOURCE_ID, topic_name, topic_conf)],
            validate_only=False
        )
        response = self.send_request_and_get_response(request)

        for error_code, _, _, resource_name in response.resources:
            if error_code != self.SUCCESS_CODE:
                self.close()
                self.module.fail_json(
                    msg='Error while updating topic \'%s\' configuration. '
                    'Error key is %s, %s' % (
                        resource_name,
                        kafka.errors.for_code(error_code).message,
                        kafka.errors.for_code(error_code).description
                    )
                )
        self.refresh()

    def get_assignment_for_replica_factor_update(self, topic_name,
                                                 replica_factor):
        """
        Generates a json assignment based on replica_factor given to update
        replicas for a topic.
        Uses all brokers available and distributes them as replicas using
        a round robin method.
        """
        all_replicas = []
        assign = {'partitions': [], 'version': 1}

        if replica_factor > self.get_total_brokers():
            self.close()
            self.close_zk_client()
            self.module.fail_json(
                msg='Error while updating topic \'%s\' replication factor: '
                'replication factor \'%s\' is greater than the number of '
                'available brokers \'%s\'' % (
                    topic_name,
                    replica_factor,
                    self.get_total_brokers()
                )
            )
        else:
            for node_id, _, _, _ in self.get_brokers():
                all_replicas.append(node_id)
            brokers_iterator = itertools.cycle(all_replicas)
            for _, part in self.get_partitions_for_topic(topic_name).items():
                _, partition, _, _, _, _ = part
                assign_tmp = {
                    'topic': topic_name,
                    'partition': partition,
                    'replicas': []
                }
                for _i in range(replica_factor):
                    assign_tmp['replicas'].append(next(brokers_iterator))
                assign['partitions'].append(assign_tmp)

            return bytes(str(json.dumps(assign)).encode('ascii'))

    def get_assignment_for_partition_update(self, topic_name, partitions):
        """
        Generates a json assignment based on number of partitions given to
        update partitions for a topic.
        Uses all brokers available and distributes them among partitions
        using a round robin method.
        """
        all_brokers = []
        assign = {'partitions': {}, 'version': 1}

        _, _, _, replicas, _, _ = self.get_partitions_for_topic(topic_name)[0]
        total_replica = len(replicas)

        for node_id, _host, _port, _rack in self.get_brokers():
            all_brokers.append(node_id)
        brokers_iterator = itertools.cycle(all_brokers)

        for i in range(partitions):
            assign_tmp = []
            for _j in range(total_replica):
                assign_tmp.append(next(brokers_iterator))
            assign['partitions'][str(i)] = assign_tmp

        return bytes(str(json.dumps(assign)).encode('ascii'))

    def wait_for_znode_assignment(self, zk_sleep_time, zk_max_retries):
        """
        Wait for the reassignment znode to be consumed by Kafka.

        Raises `ReassignPartitionsTimeout` if `zk_max_retries` is reached.
        """
        retries = 0
        while (
                self.zk_client.exists(self.ZK_REASSIGN_NODE) and
                retries < zk_max_retries
        ):
            retries += 1
            time.sleep(zk_sleep_time)

        if retries >= zk_max_retries:
            raise ReassignPartitionsTimeout(
                'The znode %s is still present after %s tries, giving up. '
                'Consider increasing your `zookeeper_max_retries` and/or '
                '`zookeeper_sleep_time` parameters and check your cluster.',
                self.ZK_REASSIGN_NODE,
                retries
            )

    def update_admin_assignment(self, json_assignment, zk_sleep_time,
                                zk_max_retries):
        """
        Updates the topic replica factor using a json assignment.
        Cf core/src/main/scala/kafka/admin/ReassignPartitionsCommand.scala#L580

        1 - Send AlterReplicaLogDirsRequest to allow the broker to create a
            replica in the right log dir later if the replica has not been
            created yet.

        2 - Create the reassignment znode so that the controller will send
            LeaderAndIsrRequest to create the replica in the broker:
            def path = "/admin/reassign_partitions" ->
            zk.create("/admin/reassign_partitions", b"a value")
            case class ReplicaAssignment(
              @BeanProperty @JsonProperty("topic") topic: String,
              @BeanProperty @JsonProperty("partition") partition: Int,
              @BeanProperty @JsonProperty("replicas") replicas: java.util.List[Int])

        3 - Send AlterReplicaLogDirsRequest again to make sure the broker will
            start to move the replica to the specified log directory.
            It may take some time for the controller to create the replica in
            the broker. Retry if the replica has not been created.

        It may be possible that the node '/admin/reassign_partitions' is
        already there for another topic. That's why we need to check for its
        existence and wait for its consumption if it is already present.
        Requires zk connection.
        """

        try:
            self.wait_for_znode_assignment(zk_sleep_time, zk_max_retries)
            self.zk_client.create(self.ZK_REASSIGN_NODE, json_assignment)
            self.wait_for_znode_assignment(zk_sleep_time, zk_max_retries)

        except ReassignPartitionsTimeout as e:
            self.close()
            self.close_zk_client()
            self.module.fail_json(
                msg=str(e)
            )

        self.refresh()

    def update_topic_assignment(self, json_assignment, zknode):
        """
        Updates the topic partition assignment using a json assignment.
        Used when Kafka version < 1.0.0.
        Requires zk connection.
        """
        if not self.zk_client.exists(zknode):
            self.close()
            self.close_zk_client()
            self.module.fail_json(
                msg='Error while updating assignment: zk node %s missing. '
                'Is the topic name correct?' % (zknode)
            )
        self.zk_client.set(zknode, json_assignment)
        self.refresh()
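
A minimal usage sketch of the manager above, assuming it is constructed like the older variant further down this page (an Ansible module object plus kafka-python client kwargs) and that AclResource is a simple holder for the attributes read by describe_acls()/create_acls(); the class name, the constructor arguments and AclResource are all assumptions, since those parts are not shown in this excerpt.

# Hypothetical wiring: the KafkaManager name, 'module', the connection kwargs and
# AclResource are assumptions; describe_acls(), create_acls(), get_api_version()
# and close() come from the class above.
manager = KafkaManager(module, bootstrap_servers='localhost:9092')
api_version = parse_version(manager.get_api_version())

acl = AclResource(
    resource_type=2,       # topic
    name='my-topic',
    pattern_type=3,        # literal
    principal='User:alice',
    host='*',
    operation=3,           # read
    permission_type=3      # allow
)

if not manager.describe_acls(acl, api_version):
    manager.create_acls([acl], api_version)
manager.close()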
Ejemplo n.º 36
0
class DocManager(object):
    """ Connects to a kafka instance, generates producers for a given
    database and collection
    """
    def __init__(self, url, auto_commit=True, unique_key='_id'):
        """Connect to kafka instance
        """
        url_info = url.split(":")
        if len(url_info) < 2:
            raise SystemError

        self.server = KafkaClient(url_info[0], int(url_info[1]))
        self.producer_dict = {}
        self.auto_commit = auto_commit

    def generate_producer(self, doc):
        """ Generates a producer for a given database and collection
        """
        database, coll = doc['ns'].split('.', 1)
        topic = (('%s-%s') % (database, coll))
        if topic not in self.producer_dict:
            try:
                self.producer_dict[topic] = SimpleProducer(
                    self.server,
                    str(topic),
                    async=True,
                    req_acks=SimpleProducer.ACK_AFTER_LOCAL_WRITE)
            except Exception:
                # Producer creation failed; mark the topic as unavailable.
                self.producer_dict[topic] = None
        return self.producer_dict[topic]

    def stop(self):
        """ Stops the instance
        """
        self.auto_commit = False
        self.server.close()

    def upsert(self, doc):
        """ Sends the document to kafka
        """
        producer = self.generate_producer(doc)
        if producer:
            producer.send_messages(str(doc))
        else:
            raise SystemError

    def remove(self, doc):
        """ Not revelant in this context
        """
        pass

    def search(self, start_ts, end_ts):
        """ Not revelant in this context
        """
        pass

    def commit(self):
        """ Not revelant in this context
        """
        pass

    def run_auto_commit(self):
        """ Not revelant in this context
        """
        pass

    def get_last_doc(self):
        """ This is probably possible but unsure of implementation.
            Hesitant to implement this since it might be system
            specific.
        """
        pass
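
A brief usage sketch, assuming a broker on localhost:9092 and a mongo-connector style oplog entry carrying an 'ns' field (both assumptions, not part of the example above):

# Hypothetical usage of the DocManager above.
dm = DocManager('localhost:9092')
dm.upsert({'ns': 'mydb.mycoll', '_id': 1, 'value': 'hello'})
dm.stop()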
Ejemplo n.º 37
0
def main():
    kafka = KafkaClient("localhost", 9092)
    produce_example(kafka)
    produce_gz_example(kafka)
    consume_example(kafka)
    kafka.close()
Ejemplo n.º 38
0
kafka = KafkaClient("localhost", 9092)

consumer = SimpleConsumer(kafka, "test", "test")
for message in consumer:
    print(message)

# Invoking consumer.commit() does not work with the message below.
# Each run of this script just dumps all the messages.
# consumer.commit()
error_message = '''
WARNING:kafka:Could not send request [<binary request payload elided>]
to server <KafkaConnection host=dima-i8 port=9092>: Kafka @ dima-i8:9092 went away
Traceback (most recent call last):
  File "log_saver.py", line 14, in <module>
    consumer.commit()
  File "/home/dima/.local/lib/python2.7/site-packages/kafka/consumer.py", line 144, in commit
    resps = self.client.send_offset_commit_request(self.group, reqs)
  File "/home/dima/.local/lib/python2.7/site-packages/kafka/client.py", line 318, in send_offset_commit_request
    resps = self._send_broker_aware_request(payloads, encoder, decoder)
  File "/home/dima/.local/lib/python2.7/site-packages/kafka/client.py", line 184, in _send_broker_aware_request
    raise FailedPayloadsException(failed_payloads)
kafka.common.FailedPayloadsException: [OffsetCommitRequest(topic='test', partition=0, offset=7, metadata=None), OffsetCommitRequest(topic='test', partition=1, offset=1, metadata=None)]
'''

kafka.close()
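
One possible way to make the commit sturdier, sketched under the assumption that reloading broker metadata and retrying is acceptable for this script (this helper is not part of the original code):

import time
from kafka.common import FailedPayloadsException

def commit_with_retry(client, consumer, retries=3, delay=5.0):
    # Retry offset commits that fail with FailedPayloadsException,
    # reloading broker metadata between attempts.
    for _ in range(retries):
        try:
            consumer.commit()
            return True
        except FailedPayloadsException:
            client.load_metadata_for_topics()
            time.sleep(delay)
    return False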
Ejemplo n.º 39
0
class KafkaManager(object):
    """
    Easier access to Kafka information
    """

    TOPIC_RESOURCE_ID = 2
    MAX_POLL_RETRIES = 3
    MAX_RETRY = 10
    SUCCESS_CODE = 0

    def __init__(self, **configs):
        self.client = KafkaClient(**configs)

    def close(self):
        """
        Closes the client. Must be called once
        the client is not used anymore.
        """
        self.client.close()

    def get_controller(self):
        """
        Return the current controller for cluster.
        """
        node_id, _host, _port, _rack = self.client.cluster.controller
        return node_id

    def get_topics(self):
        """
        Returns the topics list
        """
        return self.client.cluster.topics()

    def get_total_partitions_for_topic(self, topic):
        """
        Returns the number of partitions for topic
        """
        return len(self.client.cluster.partitions_for_topic(topic))

    def get_partitions_metadata_for_topic(self, topic):
        """
        Returns set of partition for topic
        """
        return self.client.cluster._partitions[topic]

    def get_config_for_topic(self, topic_name, config_name):
        """
        Returns value for config_name topic option
        """
        request = DescribeConfigsRequestV0(resources=[(self.TOPIC_RESOURCE_ID,
                                                       topic_name,
                                                       [config_name])])
        responses = self.send_request_and_get_response(request)
        for resp in responses:
            for err_code, err_message, _, _, config_entries in resp.resources:
                if err_code != self.SUCCESS_CODE:
                    raise Exception(err_message)
                for _, value, _, _, _ in config_entries:
                    return value

    def describe_acls(self, acl_resource):
        """Describe a set of ACLs
        """

        request = DescribeAclsRequest_v0(
            resource_type=acl_resource['resource_type'],
            resource_name=acl_resource['name'],
            principal=acl_resource['principal'],
            host=acl_resource['host'],
            operation=acl_resource['operation'],
            permission_type=acl_resource['permission_type'])

        responses = self.send_request_and_get_response(request)

        for resp in responses:
            if resp.error_code != self.SUCCESS_CODE:
                raise Exception(resp.error_message)
            else:
                return resp.resources

        return None

    def get_awaiting_request(self):
        """
        Returns the number of requests currently in the queue
        """
        return self.client.in_flight_request_count()

    def get_responses_from_client(self, connection_sleep=1):
        """
        Obtains a response from the server using poll().
        It may take some time to get the response, so we retry a few times.
        """
        retries = 0
        if self.get_awaiting_request() > 0:
            while retries < self.MAX_POLL_RETRIES:
                resp = self.client.poll()
                if resp:
                    return resp
                time.sleep(connection_sleep)
                retries += 1
        return None

    def connection_check(self, node_id, connection_sleep=1):
        """
        Checks that the connection with the broker is OK and that it is
        possible to send requests.
        Since the _maybe_connect() function used in ready() is 'async',
        we need to call it manually several times to establish the connection.
        """
        retries = 0
        if not self.client.ready(node_id):
            while retries < self.MAX_RETRY:
                if self.client.ready(node_id):
                    return True
                time.sleep(connection_sleep)
                retries += 1
            return False
        return True

    def send_request_and_get_response(self, request):
        """
        Send a request and get the associated response.
        """
        try:
            node_id = self.get_controller()
        except Exception:
            raise
        if self.connection_check(node_id):
            update = self.client.send(node_id, request)
            if update.is_done and update.failed():
                self.close()
            return self.get_responses_from_client()
        else:
            return None
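
A minimal usage sketch, assuming a local broker and an existing topic (both assumptions; DescribeConfigs requires Kafka >= 0.11):

# Hypothetical usage of the KafkaManager above.
manager = KafkaManager(bootstrap_servers='localhost:9092')
retention = manager.get_config_for_topic('my-topic', 'retention.ms')
print(retention)
manager.close()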
Ejemplo n.º 40
0
import threading, sys, json, time
from kafka.client import KafkaClient
from kafka.producer import SimpleProducer
from datetime import datetime

if __name__ == "__main__":
	servers = sys.argv[1]
	topic = sys.argv[2]
	mesnum = int(sys.argv[3])
	interval = float(sys.argv[4])

	client = KafkaClient(servers)
	producer = SimpleProducer(client)

	current_time = lambda: int(round(time.time() * 1000))

	try:
		while mesnum > 0:
			msg = json.dumps({'time': current_time(), 'data' : 'Hello - %s' % mesnum})
			producer.send_messages(topic, msg)
			time.sleep(interval)
			mesnum -= 1
	except KeyboardInterrupt:
		print "Interrupted!"
	finally:
		producer.stop()
		client.close()
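
Invocation sketch for the script above (the file name is hypothetical); the positional arguments map to the broker list, topic, message count and send interval read from sys.argv:

# python simple_time_producer.py localhost:9092 test 100 0.5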
Ejemplo n.º 41
0
class KafkaManager:

    MAX_RETRY = 10
    MAX_POLL_RETRIES = 3
    MAX_ZK_RETRIES = 5
    TOPIC_RESOURCE_ID = 2
    DEFAULT_TIMEOUT = 15000
    SUCCESS_CODE = 0
    ZK_REASSIGN_NODE = '/admin/reassign_partitions'
    ZK_TOPIC_PARTITION_NODE = '/brokers/topics/'
    ZK_TOPIC_CONFIGURATION_NODE = '/config/topics/'

    # Not used yet.
    ZK_TOPIC_DELETION_NODE = '/admin/delete_topics/'

    def __init__(self, module, **configs):
        self.module = module
        self.client = KafkaClient(**configs)

    def init_zk_client(self, zookeeper, zookeeper_auth=[]):
        self.zk_client = KazooClient(hosts=zookeeper,
                                     auth_data=zookeeper_auth,
                                     read_only=True)
        self.zk_client.start()

    def close_zk_client(self):
        self.zk_client.stop()

    def close(self):
        self.client.close()

    # Creates a topic
    # Usable for Kafka version >= 0.10.1
    def create_topic(self,
                     name,
                     partitions,
                     replica_factor,
                     replica_assignment=[],
                     config_entries=[],
                     timeout=None):
        if timeout is None:
            timeout = self.DEFAULT_TIMEOUT
        request = CreateTopicsRequest_v0(create_topic_requests=[
            (name, partitions, replica_factor, replica_assignment,
             config_entries)
        ],
                                         timeout=timeout)
        resp = self.send_request_and_get_response(request)
        for request in resp:
            for topic, error_code in request.topic_error_codes:
                if (error_code != self.SUCCESS_CODE):
                    self.close()
                    self.module.fail_json(
                        msg=
                        'Error while creating topic %s. Error key is %s, %s.' %
                        (topic, kafka.errors.for_code(error_code).message,
                         kafka.errors.for_code(error_code).description))

    # Deletes a topic
    # Usable for Kafka version >= 0.10.1
    # Need to know which broker is controller for topic
    def delete_topic(self, name, timeout=None):
        if timeout is None:
            timeout = self.DEFAULT_TIMEOUT
        request = DeleteTopicsRequest_v0(topics=[name], timeout=timeout)
        resp = self.send_request_and_get_response(request)
        for request in resp:
            for topic, error_code in request.topic_error_codes:
                if (error_code != self.SUCCESS_CODE):
                    self.close()
                    self.module.fail_json(
                        msg=
                        'Error while deleting topic %s. Error key is: %s, %s. Is option \'delete.topic.enable\' set to true on your Kafka server?'
                        % (topic, kafka.errors.for_code(error_code).message,
                           kafka.errors.for_code(error_code).description))

    def send_request_and_get_response(self, request):
        try:
            node_id = self.get_controller()
        except Exception:
            self.module.fail_json(
                msg=
                'Cannot determine a controller for your current Kafka server. Is your Kafka server running and available on \'%s\' with security protocol \'%s\'?'
                % (self.client.config['bootstrap_servers'],
                   self.client.config['security_protocol']))
        if self.connection_check(node_id):
            update = self.client.send(node_id, request)
            if update.is_done and update.failed():
                self.close()
                self.module.fail_json(
                    msg='Error while sending request %s to Kafka server: %s.' %
                    (request, update.exception))
            return self.get_responses_from_client()
        else:
            self.close()
            self.module.fail_json(
                msg=
                'Connection is not ready, please check your client and server configurations.'
            )

    def get_controller(self):
        nodeId, host, port, rack = self.client.cluster.controller
        return nodeId

    def get_controller_id_for_topic(self, topic_name):
        request = MetadataRequest_v1(topics=[topic_name])
        resp = self.send_request_and_get_response(request)
        for request in resp:
            controller_id = request.controller_id
        return controller_id

    # Returns responses with configuration
    # Usable with Kafka version >= 0.11.0
    def get_config_for_topic(self, topic_name, config_names):
        request = DescribeConfigsRequest_v0(resources=[(self.TOPIC_RESOURCE_ID,
                                                        topic_name,
                                                        config_names)])
        return self.send_request_and_get_response(request)

    # Obtains a response from the server using poll()
    # It may take some time to get the response, so we retry a few times
    def get_responses_from_client(self, connection_sleep=1):
        retries = 0
        if self.get_awaiting_request() > 0:
            while retries < self.MAX_POLL_RETRIES:
                resp = self.client.poll()
                if len(resp) > 0:
                    return resp
                time.sleep(connection_sleep)
                retries += 1
            self.close()
            self.module.fail_json(
                msg=
                'Error while getting responses : no response to request was obtained, please check your client and server configurations.'
            )
        else:
            self.close()
            self.module.fail_json(
                msg=
                'No pending request, please check your client and server configurations.'
            )

    # Returns the topics list
    def get_topics(self):
        return self.client.cluster.topics()

    # Returns the number of partitions for topic
    def get_total_partitions_for_topic(self, topic):
        return len(self.client.cluster.partitions_for_topic(topic))

    # Returns all partitions for topic, with information
    # TODO do not use private property anymore
    def get_partitions_for_topic(self, topic):
        return self.client.cluster._partitions[topic]

    # Returns number of brokers available
    def get_total_brokers(self):
        return len(self.client.cluster.brokers())

    # Returns all brokers
    def get_brokers(self):
        return self.client.cluster.brokers()

    # Returns Kafka server version
    def get_api_version(self):
        major, minor, patch = self.client.config['api_version']
        return '%s.%s.%s' % (major, minor, patch)

    # Returns the number of requests currently in the queue
    def get_awaiting_request(self):
        return self.client.in_flight_request_count()

    # Checks that the connection with the broker is OK and that it is possible to send requests
    # Since the _maybe_connect() function used in ready() is 'async', we need to call it manually several times to establish the connection
    def connection_check(self, node_id, connection_sleep=1):
        retries = 0
        if not self.client.ready(node_id):
            while retries < self.MAX_RETRY:
                if self.client.ready(node_id):
                    return True
                time.sleep(connection_sleep)
                retries += 1
            return False
        else:
            return True

    # Checks whether topic's options need to be updated or not.
    # Since the DescribeConfigsRequest does not give all current configuration entries for a topic, we need to use Zookeeper.
    # Requires zk connection.
    def is_topic_configuration_need_update(self, topic_name, topic_conf):
        current_config, zk_stats = self.zk_client.get(
            self.ZK_TOPIC_CONFIGURATION_NODE + topic_name)
        current_config = json.loads(current_config)['config']

        if len(topic_conf) != len(current_config.keys()):
            return True
        else:
            for conf_name, conf_value in topic_conf:
                if conf_name not in current_config.keys(
                ) or str(conf_value) != str(current_config[conf_name]):
                    return True

        return False

    # Checks whether topic's partitions need to be updated or not.
    def is_topic_partitions_need_update(self, topic_name, partitions):
        total_partitions = self.get_total_partitions_for_topic(topic_name)
        need_update = False

        if partitions != total_partitions:
            if partitions > total_partitions:
                # increasing partition number
                need_update = True
            else:
                # decreasing partition number, which is not possible
                self.close()
                self.module.fail_json(
                    msg=
                    'Can\'t update \'%s\' topic partitions from %s to %s: only an increase is possible.'
                    % (topic_name, total_partitions, partitions))

        return need_update

    # Checks whether a topic replica needs to be updated or not.
    def is_topic_replication_need_update(self, topic_name, replica_factor):
        need_update = False
        for id, part in self.get_partitions_for_topic(topic_name).items():
            topic, partition, leader, replicas, isr, error = part
            if len(replicas) != replica_factor:
                need_update = True

        return need_update

    # Updates the topic partitions
    # Usable for Kafka version >= 1.0.0
    # Must be sent to the current controller of the Kafka cluster.
    # The request must specify the total number of partitions and the broker assignment for each new partition, without forgetting the replicas.
    # See NewPartitions class for explanations https://github.com/apache/kafka/blob/a553764c8b1611cafd318022d0fc4a34c861f6ba/clients/src/main/java/org/apache/kafka/clients/admin/NewPartitions.java#L53
    def update_topic_partitions(self, topic_name, partitions):
        brokers = []
        for nodeId, host, port, rack in self.get_brokers():
            brokers.append(int(nodeId))
        brokers_iterator = itertools.cycle(brokers)
        topic, partition_id, leader, replicas, isr, error = self.get_partitions_for_topic(
            topic_name)[0]
        total_replica = len(replicas)
        old_partition = self.get_total_partitions_for_topic(topic_name)
        assignments = []
        for new_partition in range(partitions - old_partition):
            assignment = []
            for replica in range(total_replica):
                assignment.append(next(brokers_iterator))
            assignments.append(assignment)

        request = CreatePartitionsRequest_v0(topic_partitions=[
            (topic_name, (partitions, assignments))
        ],
                                             timeout=self.DEFAULT_TIMEOUT,
                                             validate_only=False)
        resp = self.send_request_and_get_response(request)
        for request in resp:
            for topic, error_code, error_message in request.topic_errors:
                if (error_code != self.SUCCESS_CODE):
                    self.close()
                    self.module.fail_json(
                        msg=
                        'Error while updating topic \'%s\' partitions. Error key is %s, %s. Request was %s.'
                        % (topic, kafka.errors.for_code(error_code).message,
                           kafka.errors.for_code(error_code).description,
                           str(request)))

    # Updates the topic configuration
    # Usable for Kafka version >= 0.11.0
    # Must be sent to the current controller of the Kafka cluster.
    def update_topic_configuration(self, topic_name, topic_conf):
        request = AlterConfigsRequest_v0(resources=[(self.TOPIC_RESOURCE_ID,
                                                     topic_name, topic_conf)],
                                         validate_only=False)
        resp = self.send_request_and_get_response(request)
        for request in resp:
            for error_code, error_message, resource_type, resource_name in request.resources:
                if (error_code != self.SUCCESS_CODE):
                    self.close()
                    self.module.fail_json(
                        msg=
                        'Error while updating topic \'%s\' configuration. Error key is %s, %s'
                        % (resource_name,
                           kafka.errors.for_code(error_code).message,
                           kafka.errors.for_code(error_code).description))

    # Generates a json assignment based on replica_factor given to update replicas for a topic.
    # Uses all brokers available and distributes them as replicas using a round robin method.
    def get_assignment_for_replica_factor_update(self, topic_name,
                                                 replica_factor):
        all_replicas = []
        assign = {'partitions': [], 'version': 1}

        if replica_factor > self.get_total_brokers():
            self.close()
            self.close_zk_client()
            self.module.fail_json(
                msg=
                'Error while updating topic \'%s\' replication factor: replication factor \'%s\' is greater than the number of available brokers \'%s\''
                % (topic_name, replica_factor, self.get_total_brokers()))
        else:
            for nodeId, host, port, rack in self.get_brokers():
                all_replicas.append(nodeId)
            brokers_iterator = itertools.cycle(all_replicas)
            for id, part in self.get_partitions_for_topic(topic_name).items():
                topic, partition, leader, replicas, isr, error = part
                assign_tmp = {
                    'topic': topic_name,
                    'partition': partition,
                    'replicas': []
                }
                for i in range(replica_factor):
                    assign_tmp['replicas'].append(next(brokers_iterator))
                assign['partitions'].append(assign_tmp)

            return bytes(str(json.dumps(assign)))

    # Generates a json assignment based on number of partitions given to update partitions for a topic.
    # Uses all brokers available and distributes them among partitions using a round robin method.
    def get_assignment_for_partition_update(self, topic_name, partitions):
        all_brokers = []
        assign = {'partitions': {}, 'version': 1}

        topic, partition_id, leader, replicas, isr, error = self.get_partitions_for_topic(
            topic_name)[0]
        total_replica = len(replicas)

        for nodeId, host, port, rack in self.get_brokers():
            all_brokers.append(nodeId)
        brokers_iterator = itertools.cycle(all_brokers)

        for i in range(partitions):
            assign_tmp = []
            for j in range(total_replica):
                assign_tmp.append(next(brokers_iterator))
            assign['partitions'][str(i)] = assign_tmp

        return bytes(str(json.dumps(assign)))

    # Updates the topic replica factor using a json assignment
    # Cf https://github.com/apache/kafka/blob/98296f852f334067553e541d6ecdfa624f0eb689/core/src/main/scala/kafka/admin/ReassignPartitionsCommand.scala#L580
    # 1 - Send AlterReplicaLogDirsRequest to allow broker to create replica in the right log dir later if the replica has not been created yet.
    #
    # 2 - Create reassignment znode so that controller will send LeaderAndIsrRequest to create replica in the broker
    #     def path = "/admin/reassign_partitions" -> zk.create("/admin/reassign_partitions", b"a value")
    # case class ReplicaAssignment(@BeanProperty @JsonProperty("topic") topic: String,
    #@BeanProperty @JsonProperty("partition") partition: Int,
    #@BeanProperty @JsonProperty("replicas") replicas: java.util.List[Int])
    # 3 - Send AlterReplicaLogDirsRequest again to make sure broker will start to move replica to the specified log directory.
    #     It may take some time for controller to create replica in the broker. Retry if the replica has not been created.
    # It may be possible that the node '/admin/reassign_partitions' is already there for another topic. That's why we need to check for its existence and wait for its consumption if it is already present.
    # Requires zk connection.
    def update_admin_assignment(self, json_assignment, zk_sleep_time=5):
        retries = 0
        while self.zk_client.exists(
                self.ZK_REASSIGN_NODE) and retries < self.MAX_ZK_RETRIES:
            retries += 1
            time.sleep(zk_sleep_time)
        if retries >= self.MAX_ZK_RETRIES:
            self.close()
            self.close_zk_client()
            self.module.fail_json(
                msg=
                'Error while updating assignment: zk node %s is already there after %s retries and not yet consumed, giving up.'
                % (self.ZK_REASSIGN_NODE, self.MAX_ZK_RETRIES))
        self.zk_client.create(self.ZK_REASSIGN_NODE, json_assignment)

    # Updates the topic partition assignment using a json assignment
    # Used when Kafka version < 1.0.0
    # Requires zk connection.
    def update_topic_assignment(self, json_assignment, zknode):
        if not self.zk_client.exists(zknode):
            self.close()
            self.close_zk_client()
            self.module.fail_json(
                msg=
                'Error while updating assignment: zk node %s missing. Is the topic name correct?'
                % (zknode))
        self.zk_client.set(zknode, json_assignment)
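
For reference, the two JSON assignments generated above have the following shapes (reconstructed from the code; topic name and broker ids are illustrative):

# Shape produced by get_assignment_for_replica_factor_update() and written to
# the /admin/reassign_partitions znode:
replica_assignment = {
    'version': 1,
    'partitions': [
        {'topic': 'my-topic', 'partition': 0, 'replicas': [1, 2]},
        {'topic': 'my-topic', 'partition': 1, 'replicas': [2, 3]},
    ],
}

# Shape produced by get_assignment_for_partition_update() and written to the
# /brokers/topics/<topic> znode:
partition_assignment = {
    'version': 1,
    'partitions': {'0': [1, 2], '1': [2, 3]},
}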
Ejemplo n.º 42
0
import traceback

# 'cli' is assumed to be a KafkaClient created earlier in the original script.
consumer = SimpleConsumer(cli, 'test', 'tp_test1', auto_commit_every_n=10)

try:
    no_msg_times = 0
    while 1:
        is_over = False
        messages = consumer.get_messages(count=5, timeout=3)
        if messages:
            for m in messages:
                #print m
                msg_value = m.message.value
                print msg_value
                if msg_value == 'over':
                    is_over = True
        else:
            print "no msg!"
            no_msg_times += 1

        if is_over:
            print "The show is over! bye..."
            break
        if no_msg_times >= 5:
            print "no more msg"
            break
except Exception:
    print traceback.format_exc()

finally:
    cli.close()
    print "kafka connection closed!"
Ejemplo n.º 43
0
class KafkaConsumer(object):
    """A simpler kafka consumer"""
    DEFAULT_CONFIG = deepcopy(DEFAULT_CONSUMER_CONFIG)

    def __init__(self, *topics, **configs):
        self.configure(**configs)
        self.set_topic_partitions(*topics)

    def configure(self, **configs):
        """Configure the consumer instance

        Configuration settings can be passed to constructor,
        otherwise defaults will be used:

        Keyword Arguments:
            bootstrap_servers (list): List of initial broker nodes the consumer
                should contact to bootstrap initial cluster metadata.  This does
                not have to be the full node list.  It just needs to have at
                least one broker that will respond to a Metadata API Request.
            client_id (str): a unique name for this client.  Defaults to
                'kafka.consumer.kafka'.
            group_id (str): the name of the consumer group to join,
                Offsets are fetched / committed to this group name.
            fetch_message_max_bytes (int, optional): Maximum bytes for each
                topic/partition fetch request.  Defaults to 1024*1024.
            fetch_min_bytes (int, optional): Minimum amount of data the server
                should return for a fetch request, otherwise wait up to
                fetch_wait_max_ms for more data to accumulate.  Defaults to 1.
            fetch_wait_max_ms (int, optional): Maximum time for the server to
                block waiting for fetch_min_bytes messages to accumulate.
                Defaults to 100.
            refresh_leader_backoff_ms (int, optional): Milliseconds to backoff
                when refreshing metadata on errors (subject to random jitter).
                Defaults to 200.
            socket_timeout_ms (int, optional): TCP socket timeout in
                milliseconds.  Defaults to 30*1000.
            auto_offset_reset (str, optional): A policy for resetting offsets on
                OffsetOutOfRange errors. 'smallest' will move to the oldest
                available message, 'largest' will move to the most recent.  Any
                other value will raise an exception.  Defaults to 'largest'.
            deserializer_class (callable, optional):  Any callable that takes a
                raw message value and returns a deserialized value.  Defaults to
                 lambda msg: msg.
            auto_commit_enable (bool, optional): Enabling auto-commit will cause
                the KafkaConsumer to periodically commit offsets without an
                explicit call to commit().  Defaults to False.
            auto_commit_interval_ms (int, optional):  If auto_commit_enabled,
                the milliseconds between automatic offset commits.  Defaults to
                60 * 1000.
            auto_commit_interval_messages (int, optional): If
                auto_commit_enabled, a number of messages consumed between
                automatic offset commits.  Defaults to None (disabled).
            consumer_timeout_ms (int, optional): number of millisecond to throw
                a timeout exception to the consumer if no message is available
                for consumption.  Defaults to -1 (don't throw an exception).

        Configuration parameters are described in more detail at
        http://kafka.apache.org/documentation.html#highlevelconsumerapi
        """
        configs = self._deprecate_configs(**configs)
        self._config = {}
        for key in self.DEFAULT_CONFIG:
            self._config[key] = configs.pop(key, self.DEFAULT_CONFIG[key])

        if configs:
            raise KafkaConfigurationError('Unknown configuration key(s): ' +
                                          str(list(configs.keys())))

        if self._config['auto_commit_enable']:
            if not self._config['group_id']:
                raise KafkaConfigurationError(
                    'KafkaConsumer configured to auto-commit '
                    'without required consumer group (group_id)')

        # Check auto-commit configuration
        if self._config['auto_commit_enable']:
            logger.info("Configuring consumer to auto-commit offsets")
            self._reset_auto_commit()

        if not self._config['bootstrap_servers']:
            raise KafkaConfigurationError(
                'bootstrap_servers required to configure KafkaConsumer')

        self._client = KafkaClient(self._config['bootstrap_servers'],
                                   client_id=self._config['client_id'],
                                   timeout=(self._config['socket_timeout_ms'] /
                                            1000.0))

    def set_topic_partitions(self, *topics):
        """
        Set the topic/partitions to consume
        Optionally specify offsets to start from

        Accepts types:

        * str (utf-8): topic name (will consume all available partitions)
        * tuple: (topic, partition)
        * dict:
            - { topic: partition }
            - { topic: [partition list] }
            - { topic: (partition tuple,) }

        Optionally, offsets can be specified directly:

        * tuple: (topic, partition, offset)
        * dict:  { (topic, partition): offset, ... }

        Example:

        .. code:: python

            kafka = KafkaConsumer()

            # Consume topic1-all; topic2-partition2; topic3-partition0
            kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})

            # Consume topic1-0 starting at offset 12, and topic2-1 at offset 45
            # using tuples --
            kafka.set_topic_partitions(("topic1", 0, 12), ("topic2", 1, 45))

            # using dict --
            kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })

        """
        self._topics = []
        self._client.load_metadata_for_topics()

        # Setup offsets
        self._offsets = OffsetsStruct(fetch=dict(),
                                      commit=dict(),
                                      highwater=dict(),
                                      task_done=dict())

        # Handle different topic types
        for arg in topics:

            # Topic name str -- all partitions
            if isinstance(arg, (six.string_types, six.binary_type)):
                topic = kafka_bytestring(arg)

                for partition in self._client.get_partition_ids_for_topic(
                        topic):
                    self._consume_topic_partition(topic, partition)

            # (topic, partition [, offset]) tuple
            elif isinstance(arg, tuple):
                topic = kafka_bytestring(arg[0])
                partition = arg[1]
                self._consume_topic_partition(topic, partition)
                if len(arg) == 3:
                    offset = arg[2]
                    self._offsets.fetch[(topic, partition)] = offset

            # { topic: partitions, ... } dict
            elif isinstance(arg, dict):
                for key, value in six.iteritems(arg):

                    # key can be string (a topic)
                    if isinstance(key, (six.string_types, six.binary_type)):
                        topic = kafka_bytestring(key)

                        # topic: partition
                        if isinstance(value, int):
                            self._consume_topic_partition(topic, value)

                        # topic: [ partition1, partition2, ... ]
                        elif isinstance(value, (list, tuple)):
                            for partition in value:
                                self._consume_topic_partition(topic, partition)
                        else:
                            raise KafkaConfigurationError(
                                'Unknown topic type '
                                '(dict key must be int or list/tuple of ints)')

                    # (topic, partition): offset
                    elif isinstance(key, tuple):
                        topic = kafka_bytestring(key[0])
                        partition = key[1]
                        self._consume_topic_partition(topic, partition)
                        self._offsets.fetch[(topic, partition)] = value

            else:
                raise KafkaConfigurationError('Unknown topic type (%s)' %
                                              type(arg))

        # If we have a consumer group, try to fetch stored offsets
        if self._config['group_id']:
            self._get_commit_offsets()

        # Update missing fetch/commit offsets
        for topic_partition in self._topics:

            # Commit offsets default is None
            if topic_partition not in self._offsets.commit:
                self._offsets.commit[topic_partition] = None

            # Skip if we already have a fetch offset from user args
            if topic_partition not in self._offsets.fetch:

                # Fetch offsets default is (1) commit
                if self._offsets.commit[topic_partition] is not None:
                    self._offsets.fetch[
                        topic_partition] = self._offsets.commit[
                            topic_partition]

                # or (2) auto reset
                else:
                    self._offsets.fetch[
                        topic_partition] = self._reset_partition_offset(
                            topic_partition)

        # highwater marks (received from server on fetch response)
        # and task_done (set locally by user)
        # should always get initialized to None
        self._reset_highwater_offsets()
        self._reset_task_done_offsets()

        # Reset message iterator in case we were in the middle of one
        self._reset_message_iterator()

    def close(self):
        """Close this consumer's underlying client."""
        self._client.close()

    def next(self):
        """Return the next available message

        Blocks indefinitely unless consumer_timeout_ms > 0

        Returns:
            a single KafkaMessage from the message iterator

        Raises:
            ConsumerTimeout after consumer_timeout_ms and no message

        Note:
            This is also the method called internally during iteration

        """
        self._set_consumer_timeout_start()
        while True:

            try:
                return six.next(self._get_message_iterator())

            # Handle batch completion
            except StopIteration:
                self._reset_message_iterator()

            self._check_consumer_timeout()

    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_max_wait_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages')

        fetches = [
            FetchRequest(topic, partition,
                         self._offsets.fetch[(topic, partition)], max_bytes)
            for (topic, partition) in self._topics
        ]

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False)

        for resp in responses:

            if isinstance(resp, FailedPayloadsError):
                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = kafka_bytestring(resp.topic)
            partition = resp.partition
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning(
                    'OffsetOutOfRange: topic %s, partition %d, '
                    'offset %d (Highwatermark: %d)', topic, partition,
                    self._offsets.fetch[(topic, partition)],
                    resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[(topic, partition)] = (
                    self._reset_partition_offset((topic, partition)))
                continue

            except NotLeaderForPartitionError:
                logger.warning(
                    "NotLeaderForPartitionError for %s - %d. "
                    "Metadata may be out of date", topic, partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d", topic,
                               partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[(topic, partition)] = resp.highwaterMark

            # Yield each message.
            # kafka-python could raise an exception during iteration that we
            # do not catch here -- the caller will need to handle it.
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                val = self._config['deserializer_class'](message.value)
                msg = KafkaMessage(topic, partition, offset, message.key, val)

                # in some cases the server will return earlier messages
                # than we requested. skip them per kafka spec
                if offset < self._offsets.fetch[(topic, partition)]:
                    logger.debug(
                        'message offset less than fetched offset '
                        'skipping: %s', msg)
                    continue
                # Only increment fetch offset
                # if we safely got the message and deserialized
                self._offsets.fetch[(topic, partition)] = offset + 1

                # Then yield to user
                yield msg

    def get_partition_offsets(self, topic, partition, request_time_ms,
                              max_num_offsets):
        """Request available fetch offsets for a single topic/partition

        Keyword Arguments:
            topic (str): topic for offset request
            partition (int): partition for offset request
            request_time_ms (int): Used to ask for all messages before a
                certain time (ms). There are two special values.
                Specify -1 to receive the latest offset (i.e. the offset of the
                next coming message) and -2 to receive the earliest available
                offset. Note that because offsets are pulled in descending
                order, asking for the earliest offset will always return a
                single element.
            max_num_offsets (int): Maximum offsets to include in the OffsetResponse

        Returns:
            a list of offsets in the OffsetResponse submitted for the provided
            topic / partition. See:
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI
        """
        reqs = [
            OffsetRequest(topic, partition, request_time_ms, max_num_offsets)
        ]

        (resp, ) = self._client.send_offset_request(reqs)

        check_error(resp)

        # Sanity check -- probably unnecessary
        assert resp.topic == topic
        assert resp.partition == partition

        return resp.offsets
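
    # A rough usage sketch (hypothetical topic/partition): -1 asks for the
    # latest offset (i.e. the offset of the next message to be written),
    # -2 for the earliest available offset.
    #
    #     (latest,) = consumer.get_partition_offsets(b'my_topic', 0, -1,
    #                                                max_num_offsets=1)
    #     (earliest,) = consumer.get_partition_offsets(b'my_topic', 0, -2,
    #                                                  max_num_offsets=1)
    #     backlog = latest - earliest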

    def offsets(self, group=None):
        """Get internal consumer offset values

        Keyword Arguments:
            group: Either "fetch", "commit", "task_done", or "highwater".
                If no group specified, returns all groups.

        Returns:
            A copy of internal offsets struct
        """
        if not group:
            return {
                'fetch': self.offsets('fetch'),
                'commit': self.offsets('commit'),
                'task_done': self.offsets('task_done'),
                'highwater': self.offsets('highwater')
            }
        else:
            return dict(deepcopy(getattr(self._offsets, group)))

    def task_done(self, message):
        """Mark a fetched message as consumed.

        Offsets for messages marked as "task_done" will be stored back
        to the kafka cluster for this consumer group on commit()

        Arguments:
            message (KafkaMessage): the message to mark as complete

        Returns:
            True, unless the topic-partition for this message has not
            been configured for the consumer. In normal operation, this
            should not happen. But see github issue 364.
        """
        topic_partition = (message.topic, message.partition)
        if topic_partition not in self._topics:
            logger.warning(
                'Unrecognized topic/partition in task_done message: '
                '{0}:{1}'.format(*topic_partition))
            return False

        offset = message.offset

        # Warn on non-contiguous offsets
        prev_done = self._offsets.task_done[topic_partition]
        if prev_done is not None and offset != (prev_done + 1):
            logger.warning(
                'Marking task_done on a non-contiguous offset: %d != %d + 1',
                offset, prev_done)

        # Warn on smaller offsets than previous commit
        # "commit" offsets are actually the offset of the next message to fetch.
        prev_commit = self._offsets.commit[topic_partition]
        if prev_commit is not None and ((offset + 1) <= prev_commit):
            logger.warning(
                'Marking task_done on a previously committed offset?: %d (+1) <= %d',
                offset, prev_commit)

        self._offsets.task_done[topic_partition] = offset

        # Check for auto-commit
        if self._does_auto_commit_messages():
            self._incr_auto_commit_message_count()

        if self._should_auto_commit():
            self.commit()

        return True
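
    # A rough processing-loop sketch (handle() is a user-supplied placeholder):
    # mark each message as done after it has been processed, then store the
    # offsets with commit() -- or let the auto-commit checks above do it.
    #
    #     for msg in consumer:
    #         handle(msg)
    #         consumer.task_done(msg)
    #     consumer.commit()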

    def commit(self):
        """Store consumed message offsets (marked via task_done())
        to kafka cluster for this consumer_group.

        Returns:
            True on success, or False if no offsets were found for commit

        Note:
            this functionality requires server version >=0.8.1.1
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
        """
        if not self._config['group_id']:
            logger.warning('Cannot commit without a group_id!')
            raise KafkaConfigurationError(
                'Attempted to commit offsets '
                'without a configured consumer group (group_id)')

        # API supports storing metadata with each commit
        # but for now it is unused
        metadata = b''

        offsets = self._offsets.task_done
        commits = []
        for topic_partition, task_done_offset in six.iteritems(offsets):

            # Skip if None
            if task_done_offset is None:
                continue

            # Commit offsets as the next offset to fetch
            # which is consistent with the Java Client
            # task_done is marked by messages consumed,
            # so add one to mark the next message for fetching
            commit_offset = (task_done_offset + 1)

            # Skip if no change from previous committed
            if commit_offset == self._offsets.commit[topic_partition]:
                continue

            commits.append(
                OffsetCommitRequest(topic_partition[0], topic_partition[1],
                                    commit_offset, metadata))

        if commits:
            logger.info('committing consumer offsets to group %s',
                        self._config['group_id'])
            resps = self._client.send_offset_commit_request(
                kafka_bytestring(self._config['group_id']),
                commits,
                fail_on_error=False)

            for r in resps:
                check_error(r)
                topic_partition = (r.topic, r.partition)
                task_done = self._offsets.task_done[topic_partition]
                self._offsets.commit[topic_partition] = (task_done + 1)

            if self._config['auto_commit_enable']:
                self._reset_auto_commit()

            return True

        else:
            logger.info('No new offsets found to commit in group %s',
                        self._config['group_id'])
            return False

    #
    # Topic/partition management private methods
    #

    def _consume_topic_partition(self, topic, partition):
        topic = kafka_bytestring(topic)
        if not isinstance(partition, int):
            raise KafkaConfigurationError('Unknown partition type (%s) '
                                          '-- expected int' % type(partition))

        if topic not in self._client.topic_partitions:
            raise UnknownTopicOrPartitionError(
                "Topic %s not found in broker metadata" % topic)
        if partition not in self._client.get_partition_ids_for_topic(topic):
            raise UnknownTopicOrPartitionError(
                "Partition %d not found in Topic %s "
                "in broker metadata" % (partition, topic))
        logger.info("Configuring consumer to fetch topic '%s', partition %d",
                    topic, partition)
        self._topics.append((topic, partition))

    def _refresh_metadata_on_error(self):
        refresh_ms = self._config['refresh_leader_backoff_ms']
        jitter_pct = 0.20
        sleep_ms = random.randint(int((1.0 - 0.5 * jitter_pct) * refresh_ms),
                                  int((1.0 + 0.5 * jitter_pct) * refresh_ms))
        while True:
            logger.info("Sleeping for refresh_leader_backoff_ms: %d", sleep_ms)
            time.sleep(sleep_ms / 1000.0)
            try:
                self._client.load_metadata_for_topics()
            except KafkaUnavailableError:
                logger.warning(
                    "Unable to refresh topic metadata... cluster unavailable")
                self._check_consumer_timeout()
            else:
                logger.info("Topic metadata refreshed")
                return

    #
    # Offset-management private methods
    #

    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            (resp, ) = self._client.send_offset_fetch_request(
                kafka_bytestring(self._config['group_id']),
                [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                fail_on_error=False)
            try:
                check_error(resp)
            # The API spec says the server won't set an error here,
            # but 0.8.1.1 actually does...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            if resp.offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = resp.offset

    def _reset_highwater_offsets(self):
        for topic_partition in self._topics:
            self._offsets.highwater[topic_partition] = None

    def _reset_task_done_offsets(self):
        for topic_partition in self._topics:
            self._offsets.task_done[topic_partition] = None

    def _reset_partition_offset(self, topic_partition):
        (topic, partition) = topic_partition
        LATEST = -1
        EARLIEST = -2

        request_time_ms = None
        if self._config['auto_offset_reset'] == 'largest':
            request_time_ms = LATEST
        elif self._config['auto_offset_reset'] == 'smallest':
            request_time_ms = EARLIEST
        else:

            # Raise a reasonable exception type if the user calls
            # outside of an exception context
            if sys.exc_info() == (None, None, None):
                raise OffsetOutOfRangeError(
                    'Cannot reset partition offsets without a '
                    'valid auto_offset_reset setting '
                    '(largest|smallest)')

            # Otherwise we should re-raise the upstream exception
            # b/c it typically includes additional data about
            # the request that triggered it, and we do not want to drop that
            raise  # pylint: disable-msg=E0704

        (offset, ) = self.get_partition_offsets(topic,
                                                partition,
                                                request_time_ms,
                                                max_num_offsets=1)
        return offset

    #
    # Consumer Timeout private methods
    #

    def _set_consumer_timeout_start(self):
        self._consumer_timeout = False
        if self._config['consumer_timeout_ms'] >= 0:
            self._consumer_timeout = time.time() + (
                self._config['consumer_timeout_ms'] / 1000.0)

    def _check_consumer_timeout(self):
        if self._consumer_timeout and time.time() > self._consumer_timeout:
            raise ConsumerTimeout('Consumer timed out after %d ms' %
                                  self._config['consumer_timeout_ms'])

    #
    # Autocommit private methods
    #

    def _should_auto_commit(self):
        if self._does_auto_commit_ms():
            if time.time() >= self._next_commit_time:
                return True

        if self._does_auto_commit_messages():
            if self._uncommitted_message_count >= self._config[
                    'auto_commit_interval_messages']:
                return True

        return False

    def _reset_auto_commit(self):
        self._uncommitted_message_count = 0
        self._next_commit_time = None
        if self._does_auto_commit_ms():
            self._next_commit_time = time.time() + (
                self._config['auto_commit_interval_ms'] / 1000.0)

    def _incr_auto_commit_message_count(self, n=1):
        self._uncommitted_message_count += n

    def _does_auto_commit_ms(self):
        if not self._config['auto_commit_enable']:
            return False

        conf = self._config['auto_commit_interval_ms']
        if conf is not None and conf > 0:
            return True
        return False

    def _does_auto_commit_messages(self):
        if not self._config['auto_commit_enable']:
            return False

        conf = self._config['auto_commit_interval_messages']
        if conf is not None and conf > 0:
            return True
        return False
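
    # A configuration sketch (placeholder values) for the two auto-commit
    # triggers checked above: by elapsed time and/or by message count.
    #
    #     consumer = KafkaConsumer('my_topic',
    #                              group_id='my_group',
    #                              bootstrap_servers=['localhost:9092'],
    #                              auto_commit_enable=True,
    #                              auto_commit_interval_ms=60000,
    #                              auto_commit_interval_messages=1000)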

    #
    # Message iterator private methods
    #

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def _get_message_iterator(self):
        # Fetch a new batch if needed
        if self._msg_iter is None:
            self._msg_iter = self.fetch_messages()

        return self._msg_iter

    def _reset_message_iterator(self):
        self._msg_iter = None

    #
    # python private methods
    #

    def __repr__(self):
        return '<{0} topics=({1})>'.format(
            self.__class__.__name__, '|'.join([
                "%s-%d" % topic_partition for topic_partition in self._topics
            ]))

    #
    # other private methods
    #

    def _deprecate_configs(self, **configs):
        for old, new in six.iteritems(DEPRECATED_CONFIG_KEYS):
            if old in configs:
                logger.warning(
                    'Deprecated Kafka Consumer configuration: %s. '
                    'Please use %s instead.', old, new)
                old_value = configs.pop(old)
                if new not in configs:
                    configs[new] = old_value
        return configs
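
A minimal, self-contained usage sketch for the consumer above, assuming this is the old kafka-python KafkaConsumer (from kafka import KafkaConsumer); the broker address, topic, and group names are placeholders.

from kafka import KafkaConsumer

# Placeholder broker, topic, and group names -- adjust for your cluster.
consumer = KafkaConsumer('my_topic',
                         group_id='my_group',
                         bootstrap_servers=['localhost:9092'],
                         auto_commit_enable=False)

for i, msg in enumerate(consumer):
    print('%s:%d:%d key=%s value=%s' % (
        msg.topic, msg.partition, msg.offset, msg.key, msg.value))
    consumer.task_done(msg)
    if i >= 99:
        break              # stop after 100 messages for this sketch

# Store the offsets of everything marked task_done():
consumer.commit()
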
Example No. 44
class BusAdapter(object):
    '''
    The BusAdapter class is intended to be imported to bus modules.
    Instances of this class provide the software bus illusion over
    Kafka. 
    
    Public methods are:
        
        * publish()
        * waitForMessage()
        * subscribeToTopic()
        * unsubscribeFromTopic()
        * addTopicListener()
        * removeTopicListener()
        * mySubscriptions()
        * returnError()
        * close()
    
    A minimal consumer module looks like this:
    
    ::

        # A callback function:
        def printMessage(topicName, msgText, msgOffset):
            print('Msg[%s]: %s' % (topicName, msgText))

        bus = BusAdapter()
        # Subscribe to a topic, passing the callback function:
        bus.subscribeToTopic('exampleTopic', printMessage)

        while True:
            # do anything you like
            time.sleep(10)
    
    A corresponding minimal producer module would be like this:
    
    ::
    
        bus = BusAdapter()
        while True:
            # Read one line from console:
            msgText = raw_input("Type a message to send: ('Q' to end.): ")
            if msgText == 'Q':
                break
            else:
                bus.publish(msgText, 'exampleTopic')    
    
    For better structured, but equivalent examples, see :py:class:`Example Producer <kafka_bus_python.example_producer.BusModuleProducer>`
    and :py:class:`Example Consumer <kafka_bus_python.example_consumer.BusModuleConsumer>`.
    
    Clients of this class may install multiple listeners
    for any given topic. The publish() method may be used asynchronously,
    just to send a message to subscribing modules on the bus, or
    synchronously like a remote procedure call.
        
    The BusAdapter wraps payloads into a JSON structure
    as follows: 
    
    ::
    
        'id'     : <RFC 4122 UUID Version 4>   # e.g. 'b0f4259e-3d01-44bd-9eb3-25981c2dc643'
        'type'   : {req | resp}
        'status' : {OK | ERROR}
        'time'   : <ISO 8601>                  # e.g. '2015-05-31T17:13:41.957350'
        'content': <text>
    
    It is the responsibility of listener functions to 
    strip this header away, if desired. For an example
    see echo_service.EchoServer's echoRequestDelivery()
    method.
    
    '''
    
    _LEGAL_MSG_TYPES = ['req', 'resp']
    _LEGAL_STATUS    = ['OK', 'ERROR']
    
    _DEFAULT_KAFKA_LISTEN_PORT = 9092
    _KAFKA_SERVERS = [('localhost', _DEFAULT_KAFKA_LISTEN_PORT),
                     ('mono.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT),
                     ('datastage.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT),
                     ]

#     _KAFKA_SERVERS = [('mono.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT),
#                      ('localhost', _DEFAULT_KAFKA_LISTEN_PORT),
#                      ('datastage.stanford.edu', _DEFAULT_KAFKA_LISTEN_PORT),
#                      ]

       
    # Remember whether logging has been initialized (class var!):
    _loggingInitialized = False
    _logger = None

    def __init__(self, 
                 kafkaHost=None, 
                 kafkaPort=None,
                 loggingLevel=logging.DEBUG,
                 logFile=None,
                 kafkaGroupId='school_bus'
                 ):
        '''
        Initialize communications with Kafka.

        :param kafkaHost: hostname or ip address of host where Kafka server runs.
            If None, then BusAdapter._KAFKA_SERVERS are tried in turn.
        :type kafkaHost: {string | None}
        :param kafkaPort: port at which Kafka expects clients to come in.
            if None, then BusAdapter._DEFAULT_KAFKA_LISTEN_PORT is used.
        :type kafkaPort: {int | None}
        :param loggingLevel: detail of logging
        :type loggingLevel: {logging.DEBUG | logging.INFO | logging.ERROR}  
        :param logFile: file to which log is written; console, if None
        :type logFile: {string | None}
        :param kafkaGroupId: name under which message offset management is
            stored [by Kafka in zookeeper]. Different groups of bus modules
            will have different sets of message offsets recorded. You can 
            leave this default.
        :type kafkaGroupId: string
        '''

        if kafkaPort is None:
            kafkaPort = BusAdapter._DEFAULT_KAFKA_LISTEN_PORT
        self.port = kafkaPort
        self.kafkaGroupId = kafkaGroupId
        
        self._setupLogging(loggingLevel, logFile)

        for hostPortTuple in BusAdapter._KAFKA_SERVERS:
            self.logDebug('Contacting Kafka server at %s:%s...' % hostPortTuple)
            try:
                self.kafkaClient = KafkaClient("%s:%s" % hostPortTuple)
            except KafkaUnavailableError:
                # Have we just contacted the last of the available
                # servers?
                if hostPortTuple == BusAdapter._KAFKA_SERVERS[-1]:
                    raise KafkaUnavailableError("No Kafka server found running at any of %s." % str(BusAdapter._KAFKA_SERVERS))
                else:
                    continue
            self.logDebug('Successfully contacted Kafka server at %s:%s...' % hostPortTuple)
            # If succeeded, init the 'bootstrap_servers' array
            # referenced in topic_waiter.py:
            self.bootstrapServers = ['%s:%s' % hostPortTuple]
            # Don't try any other servers:
            break
                
        self.producer    = SimpleProducer(self.kafkaClient)

        # Create a function that has the first method-arg
        # 'self' already built in. That new function is then
        # called with just the remaining positional/keyword parms.
        # In this case: see method :func:`addTopicListener`.
        
        # This way we can by default pass :func:`_deliverResult` to a
        # _TopicWaiter instance, and thereby cause it to invoke our
        # _deliverResult() *method* (which takes the hidden 'self').
        # Yet other callers to subscribeToTopic() can specify 
        # a *function* which only takes the non-self parameters 
        # specified in method :func:`addTopicListener`. 
        
        self.resultCallback    = partial(self._deliverResult)
        
        # A function that will be called when the result to
        # a synchronous call arrives:
        self.syncResultWaiter  = partial(self._awaitSynchronousReturn)
        
        # Dict mapping topic names to thread objects that listen
        # to the respective topic. Used by subscribeToTopic() and
        # unsubscribeFromTopic():
        self.listenerThreads = {}
        
        # Dict mapping topic names to event objects that provide
        # communication between the topic's thread and the main
        # thread. Used in awaitMessage():
        self.topicEvents = {}
        
        # Dict used for synchronous calls: the dict maps
        # msg UUIDs to the results of a call. Set in 
        # _awaitSynchronousReturn(), and emptied in publish()
        self.resDict = {}

# --------------------------  Public Methods ---------------------
     
    def publish(self, busMessage, topicName=None, sync=False, msgId=None, msgType='req', timeout=None, auth=None):
        '''
        Publish either a string or a BusMessage object. If busMessage
        is a string, then the caller is responsible for ensuring that
        the string is UTF-8, and a topic name must be provided.
        
        If busMessage is a BusMessage object, then that object contains
        all the required information. In this case, parameter topicName
        overrides a topic name that might be stored in the BusMessage.
        
        Messages are wrapped in a JSON structure that provides
        'id', 'type', 'time', and 'content' fields. The 'content' field
        will contain the message payload.
        
        Two ways of using this method: asynchronously, and synchronously.
        In asynchronous invocation the passed-in message is published, and
        this method returns immediately. For this type of invocation just
        provide argument busMessage, and possibly topicName, if busMessage
        is a string. 
        
        Synchronous invocation is just like a remote procedure call.
        In synchronous invocation the passed-in message is published, and 
        this method will wait for a return message that carries the same
        message ID, and is of message type 'resp'. This method then
        returns the **content** of the returned message; the surrounding
        wrapper (time/msgId/msgType...) is stripped.  
        
        :param busMessage: string or BusMessage to publish
        :type busMessage: {string | BusMessage}
        :param topicName: name of topic to publish to. If None, then 
            parameter must be a BusMessage object that contains an
            associated topic name.
        :type topicName: {string | None}
        :param sync: if True, call will not return till answer received,
            or timeout (if given) has expired.
        :type sync: boolean
        :param msgId: if this publish() call is a response to a prior request,
            the request message's ID must be the id of the response. In that
            case the caller can use this parameter to provide the ID. If
            None, a new message ID is generated.
        :type msgId: string
        :param msgType: value for the message type field of the outgoing message.
            Usually this is 'req', but when calling publish() to return a result
            to a prior request, then set this argument to 'resp'. 
        :param timeout: timeout after which synchronous call should time out.
            If sync is False, the timeout parameter is ignored.
        :type timeout: float
        :param auth: reserved for later authentication mechanism.
        :type auth: not yet known
        :return: value is only defined for synchronous invocation.
        :rtype: string
        :raises ValueError: if targeted topic name is not provided in a msg object,
            or explicitly in the topicName parameter.
        :raises ValueError: if illegal message type is passed in.
        :raises BadInformation: if Kafka does not recognize the provided topic
            **and** Kafka is not configured to create topics on the fly.
        :raises SyncCallTimedOut: if no response is received to a synchronous call
            within the provided timeout period.
        :raises SyncCallRuntimeError: if a message received in response to a 
            synchronous call cannot be parsed.
        '''

        if not isinstance(busMessage, BusMessage):
            # We were passed a raw string to send. The topic name
            # to publish to must be given explicitly:
            if topicName is None:
                raise ValueError('Attempt to publish a string without specifying a topic name.')
            msg = busMessage
        else:
            # the busMessage parm is a BusMessage instance:
            # If topicName was given, it overrides any topic name
            # associated with the BusMessage; else:
            if topicName is None:
                # Grab topic name from the BusMessage:
                topicName = busMessage.topicName()
                # If the BusMessage did not include a topic name: error
                if topicName is None:
                    raise ValueError('Attempt to publish a BusMessage instance that does not hold a topic name: %s' % str(busMessage))
            # Get the serialized, UTF-8 encoded message from the BusMessage:
            msg = busMessage.content()
            
        # Now msg contains the msg text.
        try:
            self.kafkaClient.ensure_topic_exists(topicName, timeout=5)
        except KafkaTimeoutError:
            raise BadInformation("Topic '%s' is not a recognized topic." % topicName)
        
        # Create a JSON struct:
        if msgId is None:
            msgUuid = str(uuid.uuid4())
        else:
            msgUuid = msgId
        # Sanity check on message type:
        if msgType not in BusAdapter._LEGAL_MSG_TYPES:
            raise ValueError('Legal message types are %s' % str(BusAdapter._LEGAL_MSG_TYPES))
        
        msgDict = dict(zip(['id', 'type', 'time', 'content'],
                           [msgUuid, msgType, datetime.now().isoformat(), msg]))

        # If synchronous operation requested, wait for response:
        if sync:
            
            # Before publishing the request, must prepare for 
            # a function that will be invoked with the result.
            
            # Use instance vars for communication with the result 
            # delivery thread.
            # Use of these instance vars means that publish
            # isn't re-entrant. Fine for now:

            # For the result delivery method to know which msg id
            # we are waiting for:            
            self.uuidToWaitFor   = msgUuid
            
            # For the result delivery method to know which topic
            # we are waiting for:
            self.topicToWaitFor  = topicName

            # For the result delivery method to put a string
            # if an error occurs while processing the result
            # bus message:

            self.syncResultError = None
            
            # Create event that will wake us when result
            # arrived and has been placed in self.resDict:

            # threading.Event() takes no arguments; the timeout is applied in wait() below:
            self.resultArrivedEvent = threading.Event()

            # If not subscribed to the topic to which this synchronous
            # call is being published, then subscribe to it temporarily:

            wasSubscribed = topicName in self.mySubscriptions()
            if not wasSubscribed:
                self.subscribeToTopic(topicName, self.syncResultWaiter)
            else:
                self.addTopicListener(topicName, self.syncResultWaiter)
            
            # Finally: post the request...
            self.producer.send_messages(topicName, json.dumps(msgDict))
            
            # ... and wait for the answer message to invoke
            # self._awaitSynchronousReturn():
            resBeforeTimeout = self.resultArrivedEvent.wait(timeout)
            
            # Result arrived, and was placed into
            # self.resDict under the msgUuid. Remove the listener
            # that waited for the result:
            
            self.removeTopicListener(topicName, self.syncResultWaiter)
            
            # If we weren't subscribed to this topic, then
            # restore that condition:

            if not wasSubscribed:
                self.unsubscribeFromTopic(topicName)
            
            # If the 'call' timed out, raise exception:
            if not resBeforeTimeout:
                raise SyncCallTimedOut('Synchronous call on topic %s timed out' % topicName)
            
            # A result arrived from the call:
            res = self.resDict.get(msgUuid, None)
            
            # No longer need the result to be saved:
            try:
                del self.resDict[msgUuid]
            except KeyError:
                pass
            
            # Check whether awaitSynchronousReturn() placed an
            # error message into self.syncResultError:

            if self.syncResultError is not None:
                raise(SyncCallRuntimeError(self.syncResultError)) 
            
            return res
        
        else:
            # Not a synchronous call; just publish the request:
            self.producer.send_messages(topicName, json.dumps(msgDict))
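
    # A rough usage sketch (illustrative only; 'exampleTopic' is a placeholder):
    #
    #     bus = BusAdapter()
    #
    #     # Asynchronous: fire-and-forget publish
    #     bus.publish('hello', 'exampleTopic')
    #
    #     # Synchronous: behaves like a remote procedure call. Returns the
    #     # 'content' field of the matching 'resp' message, or raises
    #     # SyncCallTimedOut if nothing arrives within 5 seconds.
    #     answer = bus.publish('hello', 'exampleTopic', sync=True, timeout=5.0)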
       


    def subscribeToTopic(self, topicName, deliveryCallback=None, kafkaLiveCheckTimeout=30):
        '''
        Fork a new thread that keeps waiting for any messages
        on the topic of the given name. Stop listening for the topic
        by calling unsubscribeFromTopic(). 
        
        For convenience, a deliveryCallback function may be passed,
        saving a subsequent call to addTopicListener(). See addTopicListener()
        for details.
        
        If deliveryCallback is absent or None, then method _deliverResult()
        in this class will be used. That method is intended to be a 
        placeholder with no side effects.
        
        It is a no-op to call this method multiple times for the
        same topic.
                 
        :param topicName: official name of topic to listen for.
        :type topicName: string
        :param deliveryCallback: a function that takes two args: a topic
            name, and a topic content string.
        :type deliveryCallback: function
        :param kafkaLiveCheckTimeout: timeout in (fractional) seconds to
            wait when checking for a live Kafka server being available.
        :type kafkaLiveCheckTimeout: float
        :raises KafkaServerNotFound: when no Kafka server responds
        '''
        
        if deliveryCallback is None:
            deliveryCallback = self.resultCallback
            
        if type(deliveryCallback) != types.FunctionType and type(deliveryCallback) != functools.partial:
            raise ValueError("Parameter deliveryCallback must be a function, was of type %s" % type(deliveryCallback))

        try:
            # Does a thread for this msg already exist?
            self.listenerThreads[topicName]
            # Yep (b/c we didn't bomb out). Nothing to do:
            return
        
        except KeyError:
            # No thread exists for this topic. 
            
            # Create an event object that the thread will set()
            # whenever a msg arrives, even if no listeners exist:
            event = threading.Event()
            self.topicEvents[topicName] = event
            
            # Create the thread that will listen to Kafka;
            # raises KafkaServerNotFound if necessary:
            waitThread = _TopicWaiter(topicName, 
                                     self, 
                                     self.kafkaGroupId, 
                                     deliveryCallback=deliveryCallback, 
                                     eventObj=event,
                                     kafkaLiveCheckTimeout=kafkaLiveCheckTimeout)

            # Remember that this thread listens to the given topic:
            self.listenerThreads[topicName] = waitThread
            
            waitThread.start()
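
    # A rough usage sketch (illustrative only): install a listener and then
    # block until traffic arrives on the topic.
    #
    #     def onMsg(topicName, rawResult, msgOffset):
    #         print('[%s@%d] %s' % (topicName, msgOffset, rawResult))
    #
    #     bus = BusAdapter()
    #     bus.subscribeToTopic('exampleTopic', onMsg)
    #     if bus.waitForMessage('exampleTopic', timeout=30):
    #         print('At least one message arrived within 30 seconds.')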

    def unsubscribeFromTopic(self, topicName):
        '''
        Unsubscribes from topic. Stops the topic's thread,
        and removes it from bookkeeping so that the Thread object
        will be garbage collected. Same for the Event object
        used by the thread to signal message arrival.
        
        Calling this method for a topic that is already
        unsubscribed is a no-op.
        
        :param topicName: name of topic to unsubscribe from
        :type topicName: string
        '''

        # Delete our record of the Event object used by the thread to
        # indicate message arrivals:
        try:
            del self.topicEvents[topicName]
        except KeyError:
            pass

        try:
            # Does a thread for this msg even exist?
            existingWaitThread = self.listenerThreads[topicName]

            # Yep, it exists. Stop it and remove it from
            # our bookkeeping
            existingWaitThread.stop()
            del self.listenerThreads[topicName]
            
        except KeyError:
            # No thread exists for this topic at all, so all done:
            return
    
    def addTopicListener(self, topicName, deliveryCallback):
        '''
        Add a listener function for a topic for which a
        subscription already exists. Parameter deliverCallback
        must be a function accepting parameters: topicName, rawResult, msgOffset
        It is an error to call the method without first
        having subscribed to the topic.
        
        :param topicName: name of topic to add
        :type topicName: String
        :param deliveryCallback: function to call when message to this topic arrives
        :type deliveryCallback: function(topicName, rawResult, msgOffset)
        :raises NameError: if caller has not previously subscribed to topicName.

        '''
        
        if type(deliveryCallback) != types.FunctionType and type(deliveryCallback) != functools.partial:
            raise ValueError("Parameter deliveryCallback must be a function, was of type %s" % type(deliveryCallback))
        try:
            # Does a thread for this msg already exist?
            existingWaitThread = self.listenerThreads[topicName]
            
            # Yep (b/c we didn't bomb out). Check whether the 
            # given deliveryCallback is already among the listeners 
            # added earlier:
            try:
                existingWaitThread.listeners().index(deliveryCallback)
                # Both, a thread and this callback already exist, do nothing:
                return
            except ValueError:
                pass
            # Thread exists for this topic, but an additional
            # callback is being registered:
            existingWaitThread.addListener(deliveryCallback)
            return
        except KeyError:
            # No thread exists for this topic, so no deliveryCallback
            # can be added:
            raise NameError("Attempt to add topic listener %s for topic '%s' without first subscribing to '%s'" %
                            (str(deliveryCallback), topicName, topicName))
        
    
    def removeTopicListener(self, topicName, deliveryCallback):
        '''
        Remove a topic listener function from a topic. It is
        a no-op to call this method with a topic that has not
        been subscribed to, or with a deliveryCallback function that
        was never added to the topic.
        
        :param topicName:
        :type topicName:
        :param deliveryCallback:
        :type deliveryCallback:
        '''
        
        try:
            # Does a thread for this msg even exist?
            existingWaitThread = self.listenerThreads[topicName]

            # Yep, exists (we didn't bomb). Now check whether the 
            # given deliveryCallback was actually added to the listeners 
            # earlier:

            existingListeners = existingWaitThread.listeners()
            try:
                existingListeners.index(deliveryCallback)
                # The listener to be removed does exist:
                existingWaitThread.removeListener(deliveryCallback)
                return 
            except ValueError:
                # This listener isn't registered, so all done:
                return
            
        except KeyError:
            # No listener thread exists for this topic at all, so all done:
            return


    def waitForMessage(self, topicName, timeout=None):
        '''
        Block till a message on the given topic arrives. It is
        an error to call this method on a topic to which the
        caller has not previously subscribed.
        
        :param topicName:
        :type topicName:
        :param timeout: seconds (or fractions of second) to wait.
        :type timeout: float
        :returns: True if a message arrived in time, else returns False
        :rtype: boolean
        :raises NameError: on attempt to wait for a topic for which no subscription exists.
        '''
        
        try:
            event = self.topicEvents[topicName]
            return(event.wait(timeout))
        except KeyError:
            raise NameError("Attempt to wait for messages on topic %s, which was never subscribed to." % topicName)
 
    def mySubscriptions(self):
        '''
        Return a list of topic names to which this bus adapter is subscribed.
        
        :return: List of topics to which caller is subscribed
        :rtype: [String]
        '''
        return self.topicEvents.keys()
        
    def returnError(self, req_key, topicName, errMsg):
        '''
        Convenience method for handling an incoming message.
        Publishes a return message that is marked as an error.
        
        :param req_key: key of the incoming message; it will be used in the return message as well.
        :type req_key: String
        :param topicName: name of topic to use in the return message
        :type topicName: String
        :param errMsg: error message to include in the return message
        :type errMsg: String
        '''
        
        errMsg = {'resp_key'    : req_key,
                  'type'        : 'resp',
                  'status'      : 'ERROR',
                  'time'        : datetime.now().isoformat(),
                  'content'     : errMsg
                 }
        errMsgJSON = _JSONEncoderBusExtended.makeJSON(errMsg)
        self.publish(errMsgJSON, topicName)
      
    def close(self):
        '''
        Cleanup. All threads are stopped. Kafka
        connection is closed.
        '''
        for thread in self.listenerThreads.values():
            thread.stop()
        self.listenerThreads.clear()
        self.topicEvents.clear()
        
        self.kafkaClient.close()

# --------------------------  Private Methods ---------------------


    def _deliverResult(self, topicName, rawResult, msgOffset):
        '''
        Simple default message delivery callback. Just prints 
        topic name and content. Override in subclass to get 
        more interesting behavior. Remember, though: you will likely need
        the functools.partial trick to create a function for your
        overriding method that already has 'self' curried out.
        We may be able to simplify that, because the listening threads
        do save the BusAdapter objects that created them.
        
        :param topicName: name of topic the msg came from
        :type topicName: string
        :param rawResult: the string from the wire; not yet de-serialized
        :type rawResult: string
        :param msgOffset: the Kafka queue offset of the message
        :type msgOffset: int 
        '''
        print('Msg at offset %d: %s' % (msgOffset,rawResult))
        

    def _awaitSynchronousReturn(self, topicName, rawResult, msgOffset):
        '''
        A callback for _TopicWaiter. Invoked from a different thread!!
        This callback is installed by publish() when a synchronous
        bus 'call' is executed. The main thread, i.e. publish() will
        have delivered the request to the bus, and initialized the 
        following instance variables for us:

          * self.uuidToWaitFor: the message id an incoming result must have
          * self.syncResultError: a place for this method to place an error message if necessary
          * self.resultArrivedEvent: a threading.Event() obj which this method will set() when it's done.
        
        :param topicName: name of topic on which a message arrived
        :type topicName: string
        :param rawResult: message payload; a JSON string
        :type rawResult: string
        :param msgOffset: offset in Kafka system
        :type msgOffset: int
        '''
        
        # If this incoming message is the wrong topic,
        # ignore; this should never happen, b/c this method
        # is only installed as a listener while we are waiting on
        # a synchronous call:

        if topicName != self.topicToWaitFor:
            return
        
        # Turn msg JSON into a dict:
        try:
            thisResDict = json.loads(rawResult)
        except ValueError:
            self.syncResultError = 'Bad JSON while waiting for sync response: %s' % rawResult
            # Tell main thread that answer to synchronous
            # call arrived, and was processed:
            self.resultArrivedEvent.set()
            return
        
        # Is this a response msg, and is it the one
        # we are waiting for?
        thisUuid    = thisResDict.get('id', None)
        thisMsgType = thisResDict.get('type', None)
        thisContent = thisResDict.get('content', None)
        
        if thisUuid    == self.uuidToWaitFor and \
           thisMsgType == 'resp':
            # All good; store just the msg content field
            # in a result dict that's shared with the main
            # thread:
            self.resDict[thisUuid] = thisContent
        
            # Tell main thread that answer to synchronous
            # call arrived, and was processed:
            self.resultArrivedEvent.set()
        else:
            # Not the msg we are waiting for:
            return
    
    
    def _setupLogging(self, loggingLevel, logFile):
        if BusAdapter._loggingInitialized:
            # Remove previous file or console handlers,
            # else we get logging output doubled:
            BusAdapter._logger.handlers = []
            
        # Set up logging:
        # A _logger named SchoolBusLog:
        BusAdapter._logger = logging.getLogger('SchoolBusLog')
        BusAdapter._logger.setLevel(loggingLevel)
        
        # A msg formatter that shows datetime, _logger name, 
        # the log level of the message, and the msg.
        # The datefmt=None causes ISO8601 to be used:
        
        formatter = logging.Formatter(fmt='%(asctime)s-%(name)s-%(levelname)s-%(module)s: %(message)s',datefmt=None)
        
        # Create file handler if requested:
        if logFile is not None:
            handler = logging.FileHandler(logFile)
        else:
            # Create console handler:
            handler = logging.StreamHandler()
        handler.setFormatter(formatter)
        handler.setLevel(loggingLevel)
#         # create formatter and add it to the handlers
#         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
#         fh.setFormatter(formatter)
#         ch.setFormatter(formatter)
        # Add the handler to the _logger
        BusAdapter._logger.addHandler(handler)
        #**********************
        #BusAdapter._logger.info("Info for you")
        #BusAdapter._logger.warn("Warning for you")
        #BusAdapter._logger.debug("Debug for you")
        #**********************
        
        BusAdapter._loggingInitialized = True


    def logWarn(self, msg):
        '''
        Locally log a warning message using the Python logging facility.
        The _logger name is 'SchoolBusLog'. Change format or _logger
        name by modifying _setupLogging().
        
        :param msg: message to log
        :type msg: String
        '''
        BusAdapter._logger.warn(msg)

    def logInfo(self, msg):
        '''
        Locally log an info message using the Python logging facility.
        The _logger name is 'SchoolBusLog'. Change format or _logger
        name by modifying _setupLogging().
        
        :param msg: message to log
        :type msg: String
        '''
        BusAdapter._logger.info(msg)
     
    def logError(self, msg):
        '''
        Locally log an error message using the Python logging facility.
        The _logger name is 'SchoolBusLog'. Change format or _logger
        name by modifying _setupLogging().
        
        :param msg: message to log
        :type msg: String
        '''
        
        BusAdapter._logger.error(msg)

    def logDebug(self, msg):
        '''
        Locally log a debug message using the Python logging facility.
        The _logger name is 'SchoolBusLog'. Change format or _logger
        name by modifying _setupLogging().
        
        :param msg: message to log
        :type msg: String
        '''
        BusAdapter._logger.debug(msg)
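
A request/response sketch built on BusAdapter, shown only to illustrate the synchronous publish path; the import path is hypothetical, 'echoTopic' is a placeholder, and in practice the responder and caller would live in separate modules.

import json

from kafka_bus_python.kafka_bus import BusAdapter   # hypothetical import path

bus = BusAdapter()

def echoRequestDelivery(topicName, rawResult, msgOffset):
    # Strip the JSON wrapper and answer with the same message id,
    # marking the reply as a response:
    wrapper = json.loads(rawResult)
    if wrapper.get('type') == 'req':
        bus.publish(wrapper.get('content', ''),
                    topicName=topicName,
                    msgId=wrapper.get('id'),
                    msgType='resp')

# Responder side:
bus.subscribeToTopic('echoTopic', echoRequestDelivery)

# Caller side: a synchronous round trip that blocks until the 'resp'
# message with the matching id arrives (or SyncCallTimedOut is raised).
reply = bus.publish('ping', 'echoTopic', sync=True, timeout=10.0)
print('Echo service replied: %s' % reply)
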
Example No. 45
class ZKConsumer(object):

    zk_timeout = 30
    jitter_seconds = 30
    broker_prefix = '/brokers/ids'

    def __init__(
            self,
            zk_hosts,
            group,
            topic,
            nodes,
            zk_handler=None,
            logger=None,
            identifier=None,
            **consumer_kwargs):
        """Creates a Consumer that tracks state in ZooKeeper,
        rebalancing partition ownership as registered consumers change.
        NOTE: this class is intended for version 0.8.1 of Kafka, where offsets
              are managed by Kafka but there is no rebalancing in the protocol.
        """
        if logger is None:
            logger = logging.getLogger('kafka.consumer.ZKConsumer')
        self.logger = logger
        self.identifier = identifier

        if KafkaClient is None:
            raise RuntimeError("Kafka support requires cs.eyrie to be installed with the Kafka extra: install_requires= ['cs.eyrie[Kafka]']")
        self.zk_handler = zk_handler
        self.zk_hosts = zk_hosts
        self.broker_hosts = []

        self.group = group
        self.topic = topic

        self.zk = None
        self.nodes = nodes
        self.client = None
        self.consumer = None
        self.consumer_kwargs = consumer_kwargs

        # This will kick off a cascading sequence to initialize ourselves:
        # 1. Connect to ZK and pull list of Kafka brokers
        # 2. Register ourselves as a consumer in ZK
        # 3. Rebalance partitions across all connected consumers
        self.init_zk()

    def zk_session_watch(self, state):
        self.logger.debug('ZK transitioned to: %s', state)
        if state == KazooState.SUSPENDED:
            if self.consumer is not None:
                self.logger.info('Stopping Kafka consumer')
                self.consumer.stop()
                self.consumer = None
            # Lost connection to ZK; we can't call any methods that would
            # try to contact it (i.e., we can't do self.zkp.finish() )
            self.zkp = None
        elif state == KazooState.CONNECTED:
            self.logger.info('Restarting ZK partitioner')
            self.zk.handler.spawn(self.init_zkp)

    def _zkp_wait(self):
        handler = self.zk.handler
        while 1:
            if self.zkp.failed:
                self.logger.warning("Lost or unable to acquire partition")
                self.stop()
            elif self.zkp.release:
                self.zkp.release_set()
            elif self.zkp.acquired:
                def group_change_proxy(event):
                    self.logger.warn('Connected consumers changed')
                    if self.zkp is None:
                        self.logger.info('Restarting ZK partitioner')
                        handler.spawn(self.init_zkp)
                    elif self.zkp is not None and self.zkp.failed:
                        self.logger.warning("Lost or unable to acquire partition")
                        self.stop()
                    else:
                        self.logger.info('Scheduling ZK partitioner set release')
                        rel_greenlet = handler.spawn(self.zkp.release_set)
                        self.logger.info('Scheduling group re-join')
                        rel_greenlet.link_value(lambda greenlet: self.zkp.join_group())
                if not self.nodes:
                    self.logger.info('Partitioner acquired; setting child watch')
                    result = self.zk.get_children_async(self.zkp._group_path)
                    result.rawlink(group_change_proxy)
                # Break out of while loop to begin consuming events
                break
            elif self.zkp.allocating:
                self.zkp.wait_for_acquire()

    def init_zkp(self):
        if not hasattr(self, 'zkp') or self.zkp is None:
            if self.nodes:
                self.zkp = StaticZKPartitioner(
                    self.zk, self.group, self.topic, self.nodes,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)
            else:
                self.zkp = ZKPartitioner(
                    self.zk, self.group, self.topic,
                    time_boundary=self.jitter_seconds,
                    partitions_changed_cb=self.init_consumer,
                    logger=self.logger, identifier=self.identifier)

        self._zkp_wait()

    def init_zk(self):
        # TODO: switch to async
        # 1. implement kazoo.interfaces.IHandler in terms of Tornado's IOLoop
        self.zk = KazooClient(hosts=self.zk_hosts, handler=self.zk_handler)
        self.zk.start()
        self.zk.add_listener(self.zk_session_watch)

        @self.zk.ChildrenWatch(self.broker_prefix)
        def broker_change_proxy(broker_ids):
            self.onBrokerChange(broker_ids)

        self.init_zkp()

    def onBrokerChange(self, broker_ids):
        self.broker_hosts = []
        for b_id in broker_ids:
            b_json, zstat = self.zk.get('/'.join([self.broker_prefix, b_id]))
            b_data = json.loads(b_json)
            self.broker_hosts.append('{}:{}'.format(b_data['host'],
                                                    b_data['port']))

        my_partitions = []
        if self.consumer is not None:
            self.logger.warn('Brokers changed, stopping Kafka consumer.')
            my_partitions = self.consumer.offsets.keys()
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.warn('Brokers changed, stopping Kafka client.')
            self.client.close()
            self.client = None

        if my_partitions:
            msg = 'Brokers changed, queuing restart of Kafka client / consumer.'
            self.logger.warn(msg)
            self.zk.handler.spawn(self.init_consumer, my_partitions)

    def init_consumer(self, my_partitions):
        if self.consumer is None:
            self.logger.warn('Starting Kafka client')
            self.client = KafkaClient(self.broker_hosts,
                                      client_id=self.zkp._identifier)
        else:
            if self.consumer is None or \
               sorted(my_partitions) != sorted(self.consumer.offsets.keys()):
                self.logger.warn('Partitions changed, restarting Kafka consumer.')
                self.consumer.stop()
            else:
                self.logger.info('Partitions unchanged, not restarting Kafka consumer.')
                return

        self.consumer = SimpleConsumer(self.client, self.group, self.topic,
                                       partitions=my_partitions,
                                       **self.consumer_kwargs)
        self.consumer.provide_partition_info()
        self.logger.info("Consumer connected to Kafka: %s", self.consumer.offsets)

    def stop(self):
        if self.consumer is not None:
            self.logger.info('Stopping Kafka consumer')
            self.consumer.stop()
            self.consumer = None
        if self.client is not None:
            self.logger.info('Stopping Kafka client')
            self.client.close()
            self.client = None
        if self.zk is not None:
            self.logger.info('Stopping ZooKeeper client')
            if self.zkp is not None and not self.zkp.failed:
                self.zkp.finish()
                self.zk.stop()
            self.zkp = None
            self.zk = None

    def commit(self, partitions=None):
        """
        Commit offsets for this consumer

        partitions: list of partitions to commit, default is to commit
                    all of them
        """
        if self.consumer is None:
            return
        self.logger.debug('Begin committing offsets for partitions: %s',
                          partitions if partitions else 'All')
        self.consumer.commit(partitions)
        self.logger.debug('End committing offsets for partitions: %s',
                          partitions if partitions else 'All')

    def pending(self, partitions=None):
        """
        Gets the pending message count

        partitions: list of partitions to check for, default is to check all
        """
        return self.consumer.pending(partitions)

    def provide_partition_info(self):
        """
        Indicates that partition info must be returned by the consumer
        """
        self.consumer.provide_partition_info()

    def seek(self, offset, whence):
        """
        Alter the current offset in the consumer, similar to fseek

        offset: how much to modify the offset
        whence: where to modify it from
                0 is relative to the earliest available offset (head)
                1 is relative to the current offset
                2 is relative to the latest known offset (tail)
        """
        self.consumer.seek(offset, whence)
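
    # A rough usage sketch (illustrative only) of the whence semantics:
    #
    #     consumer.seek(0, 0)      # jump to the earliest available offset
    #     consumer.seek(-10, 2)    # rewind to 10 messages before the tail
    #     consumer.seek(5, 1)      # skip 5 messages ahead of the current offset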

    def get_messages(self, count=1, block=True, timeout=0.1):
        """
        Fetch the specified number of messages

        count: Indicates the maximum number of messages to be fetched
        block: If True, the API will block till some messages are fetched.
        timeout: If block is True, the function will block for the specified
                 time (in seconds) until count messages are fetched. If None,
                 it will block forever.
        """
        if self.consumer is None:
            return []
        else:
            try:
                messages = self.consumer.get_messages(count, block, timeout)
                if not messages and self.zkp.failed:
                    raise FailedPayloadsError
                return messages
            except FailedPayloadsError as err:
                msg = 'Failed to retrieve payload, restarting consumer'
                self.logger.exception(msg)
                raise err

    def get_message(self, block=True, timeout=0.1, get_partition_info=None):
        return self.consumer.get_message(block, timeout, get_partition_info)

    def _get_message(self, block=True, timeout=0.1, get_partition_info=None,
                     update_offset=True):
        return self.consumer._get_message(block, timeout, get_partition_info,
                                          update_offset)

    def __iter__(self):
        for msg in self.consumer:
            yield msg
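
# A minimal, hypothetical usage sketch for a consumer wrapper like the one
# above. The broker address, group and topic names are placeholders; the point
# is to show the delegated SimpleConsumer calls (provide_partition_info, seek,
# get_messages, commit) working together in a polling loop.
from kafka import KafkaClient, SimpleConsumer

client = KafkaClient('localhost:9092')              # assumed broker address
consumer = SimpleConsumer(client, 'my-group', 'my-topic')
consumer.provide_partition_info()                   # get_messages() now yields (partition, message)
consumer.seek(0, 2)                                 # whence=2: start from the latest offset

for _ in range(10):                                 # poll a few batches for the sake of the sketch
    batch = consumer.get_messages(count=100, block=True, timeout=0.1)
    for partition, message in batch:
        print partition, message.offset, message.message.value
    if batch:
        consumer.commit()                           # commit after each processed batch

consumer.stop()
client.close()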
Ejemplo n.º 46
0
    def check(self, instance):
        """
        Check offset in kafka for consumer_groups,topics and partitions.


        Alt 1;
        You can ether specify consumer_groups, topics and partitions in
        config file like

        consumer_groups:
            my_consumer:
              my_topic: [0, 1, 4, 12]

        Alt 2;
        Ask zookeeper for the current configuration and use that, it will
        do this if no consumer_groups is specifyed in configuration.

        """

        zk_connect_str = self.read_config(instance, 'zk_connect_str')
        kafka_host_ports = self.read_config(instance, 'kafka_connect_str')

        # Construct the Zookeeper path pattern
        zk_prefix = instance.get('zk_prefix', '')
        # Connect to Zookeeper
        zk_conn = KazooClient(zk_connect_str)
        zk_conn.start()

        try:
            if 'consumer_groups' in instance:
                # Alt 1: only check the given consumer groups, topics and partitions.
                consumer_groups = self.read_config(
                    instance,
                    'consumer_groups',
                    cast=self._validate_consumer_groups)

                (consumer_offsets, topics) = \
                    self._get_offsets_based_on_config(zk_conn, zk_prefix, consumer_groups)
            else:
                # Alt 2: none given, ask ZooKeeper for the full set.
                (consumer_offsets, topics) = \
                    self._get_offsets_from_zk(zk_conn, zk_prefix)

        finally:
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception:
                self.log.exception('Error cleaning up Zookeeper connection')

        # Connect to Kafka
        kafka_conn = KafkaClient(kafka_host_ports)

        try:
            # Query Kafka for the broker offsets
            broker_offsets = {}
            for topic, partitions in topics.items():
                offset_responses = kafka_conn.send_offset_request(
                    [OffsetRequest(topic, p, -1, 1) for p in partitions])

                for resp in offset_responses:
                    broker_offsets[(resp.topic,
                                    resp.partition)] = resp.offsets[0]
        finally:
            try:
                kafka_conn.close()
            except Exception:
                self.log.exception('Error cleaning up Kafka connection')

        # Report the broker data
        for (topic, partition), broker_offset in broker_offsets.items():
            broker_tags = ['topic:%s' % topic, 'partition:%s' % partition]
            self.gauge('kafka.broker_offset', broker_offset, tags=broker_tags)

        # Report the consumer
        for (consumer_group, topic,
             partition), consumer_offset in consumer_offsets.items():

            # Get the broker offset
            broker_offset = broker_offsets.get((topic, partition))

            # Report the consumer offset and lag
            tags = [
                'topic:%s' % topic,
                'partition:%s' % partition,
                'consumer_group:%s' % consumer_group
            ]
            self.gauge('kafka.consumer_offset', consumer_offset, tags=tags)
            self.gauge('kafka.consumer_lag',
                       broker_offset - consumer_offset,
                       tags=tags)
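
# Hypothetical Agent-style instance for the check above. The key names come
# from the read_config()/instance.get() calls in the code; the hosts, group,
# topic and partition numbers are made up. With 'consumer_groups' present the
# check takes the "Alt 1" path; omit it to fall back to the set discovered
# from ZooKeeper ("Alt 2").
instance = {
    'zk_connect_str': 'localhost:2181',
    'kafka_connect_str': 'localhost:9092',
    'zk_prefix': '',
    'consumer_groups': {
        'my_consumer': {
            'my_topic': [0, 1, 4, 12],
        },
    },
}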
Ejemplo n.º 47
0
class KfkClient(object):

    def __init__(self, ip):
        self.client = KafkaClient(ip, 9092)
        self.fd = None
        self.topic = None
        self.partition = None
        self.offset = None

    def send(self, topic, partition, data):
        message = self.client.create_message(data)
        request = ProduceRequest(topic, partition, [message])
        self.client.send_message_set(request)

    def _check_offset(self, topic, partition):
        if (self.topic != topic or self.partition != partition):
            self.topic = topic
            self.partition = partition
            self._get_new_offset()

    def receive(self, topic, partition):
        self._check_offset(topic, partition)

        while True:
            request = FetchRequest(topic, partition, self.offset, 2048)
            debug(request)
            try:
                (messages, nextRequest) = self.client.get_message_set(request)
            except Exception:
                self._check_offset(topic, partition)
                continue

            if len(messages) > 0:
                self.offset = nextRequest.offset
                self._write_offset()
                return messages
            else:
                time.sleep(1)

    def get_line(self, topic, partition):
        while True:
            messages = self.receive(topic, partition)
            for message in messages:
                yield message.payload

    def close(self):
        if self.fd is not None:
            self.fd.close()
        self.client.close()


    def _get_new_offset(self):
        file_name = "%s-%s.offset" % (self.topic, self.partition)

        if self.fd is not None:
            self.fd.close()

        try:
            self.fd = open(file_name, 'r+')
            file_offset = self.fd.readline()
        except IOError:
            self.fd = open(file_name, 'w+')
            file_offset = -1

        self.fd.seek(0,0)
        self.fd.truncate()

        try:
            file_offset = int(file_offset)
        except (TypeError, ValueError):
            file_offset = 0

        minoffsetreq = OffsetRequest(self.topic, self.partition, -2, 1)
        results = self.client.get_offsets(minoffsetreq)
        minoffset = results[0]

        maxoffsetreq = OffsetRequest(self.topic, self.partition, -1, 1)
        results = self.client.get_offsets(maxoffsetreq)
        maxoffset = results[0]


        if file_offset == -1:
            self.offset = minoffset
        elif file_offset >= minoffset and file_offset <= maxoffset:
            self.offset = file_offset
        else:
            self.offset = maxoffset

        debug("file%d min%d max%d using%d" % (file_offset, minoffset, maxoffset, self.offset))
        self._write_offset()


    def _write_offset(self):
        self.fd.seek(0,0)
        self.fd.write("%d" % self.offset)
Ejemplo n.º 48
0

        # Check whether the batch size has been reached; if so, submit
        if num == slice_number:
            num = 0
            print datetime.datetime.now(), 'log_time:', log_time, 'handled to %d' % msg.offset
            # Submit note click counts
            url_handler.discovery_click_handler(discovery_click_data)
            discovery_click_data = {}

            # Submit category list click counts
            url_handler.discovery_list_handler(list_oid_click_data)
            list_oid_click_data = {}

            # Submit event page click counts
            url_handler.event_page_click_handler(event_page_click_data)
            event_page_click_data = {}

            # Submit event page WeChat share counts
            url_handler.event_page_weixin_share_handler(event_page_fpid_data)
            event_page_fpid_data = {}

            print datetime.datetime.now(), 'handled successfully'
            print

kafka.close()
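
# The fragment above flushes its accumulated counters every `slice_number`
# messages. A self-contained sketch of that accumulate-and-flush pattern,
# with a placeholder flush() standing in for the url_handler.*_handler calls:
def flush(counters):
    print 'submitting', counters                    # placeholder for the real submit


slice_number = 1000                                 # assumed batch size
counters, num = {}, 0
for key in ('note', 'list', 'note', 'event'):       # stands in for the consumed messages
    counters[key] = counters.get(key, 0) + 1
    num += 1
    if num == slice_number:                         # batch boundary reached: submit and reset
        flush(counters)
        counters, num = {}, 0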




Ejemplo n.º 49
0
class Partitioner(object):
    """Partitioner is used to handle distributed a set of
    topics/partitions among a group of consumers.

    :param topics: kafka topics
    :type topics: list
    :param acquire: function to be called when a set of partitions
                    has been acquired. It should usually allocate the consumers.
    :param release: function to be called when the acquired
                    partitions have to be released. It should usually stop the consumers.

    """
    def __init__(self, config, topics, acquire, release):
        self.log = logging.getLogger(self.__class__.__name__)
        self.config = config
        # Clients
        self.kazoo_client = None
        self.kafka_client = None
        self.topics = topics
        self.acquired_partitions = defaultdict(list)
        self.partitions_set = set()
        # User callbacks
        self.acquire = acquire
        self.release = release
        # We guarantee that the user defined release function call follows
        # always the acquire. release function will never be called twice in a
        # row. Initialize to true because no partitions have been acquired at
        # startup.
        self.released_flag = True
        # Kafka metadata refresh
        self.force_partitions_refresh = True
        self.last_partitions_refresh = 0
        # Kazoo partitioner
        self._partitioner = None
        # Map Kazoo partitioner state to actions
        self.actions = {
            PartitionState.ALLOCATING: self._allocating,
            PartitionState.ACQUIRED: self._acquire,
            PartitionState.RELEASE: self._release,
            PartitionState.FAILURE: self._fail
        }

        self.kazoo_retry = None
        self.zk_group_path = build_zk_group_path(
            self.config.group_path,
            self.topics,
        ) if self.config.use_group_sha else self.config.group_path

    def start(self):
        """Create a new group and wait until the partitions have been
        acquired. This function should never be called twice.

        :raises: PartitionerError upon partitioner failures

        .. note: This is a blocking operation.
        """
        self.kazoo_retry = KazooRetry(**KAZOO_RETRY_DEFAULTS)
        self.kazoo_client = KazooClient(
            self.config.zookeeper,
            connection_retry=self.kazoo_retry,
        )
        self.kafka_client = KafkaClient(self.config.broker_list)

        self.log.debug("Starting a new group for topics %s", self.topics)
        self.released_flag = True
        self._refresh()

    def __enter__(self):
        self.start()

    def __exit__(self, exc_type, exc_value, traceback):
        self.stop()

    def stop(self):
        """Leave the group and release the partitions."""
        self.log.debug("Stopping group for topics %s", self.topics)
        self.release_and_finish()
        self._close_connections()

    def refresh(self):
        """Rebalance upon group changes, such as when a consumer
        joins/leaves the group, the partitions for a topics change, or the
        partitioner itself fails (connection to zookeeper lost).
        This method should be called periodically to make sure that the
        group is in sync.

        :raises: PartitionerError upon partitioner failures
        """
        self.log.debug("Refresh group for topics %s", self.topics)
        self._refresh()

    def _refresh(self):
        while True:
            partitioner = self._get_partitioner()
            self._handle_group(partitioner)
            if self.acquired_partitions:
                break

    def need_partitions_refresh(self):
        return (self.force_partitions_refresh or self.last_partitions_refresh <
                time.time() - PARTITIONS_REFRESH_TIMEOUT)

    def _get_partitioner(self):
        """Get an instance of the partitioner. When the partitions set changes
         we need to destroy the partitioner and create another one.
        If the partitioner does not exist yet, create a new partitioner.
        If the partitions set changed, destroy the partitioner and create a new
        partitioner. Different consumer will eventually use
        the same partitions set.

        :param partitions: the partitions set to use for partitioner.
        :type partitions: set
        """
        if self.need_partitions_refresh() or not self._partitioner:
            try:
                partitions = self.get_partitions_set()
            except Exception:
                self.log.exception("Failed to get partitions set from Kafka."
                                   "Releasing the group.")
                self.release_and_finish()
                raise PartitionerError(
                    "Failed to get partitions set from Kafka", )
            self.force_partitions_refresh = False
            self.last_partitions_refresh = time.time()
            if partitions != self.partitions_set:
                # If partitions changed we release the consumers, destroy the
                # partitioner and disconnect from zookeeper.
                self.log.info(
                    "Partitions set changed. New partitions: %s. "
                    "Old partitions %s. Rebalancing...",
                    [p for p in partitions if p not in self.partitions_set],
                    [p for p in self.partitions_set if p not in partitions])
                # We need to destroy the existing partitioner before creating
                # a new one.
                self.release_and_finish()
                self._partitioner = self._create_partitioner(partitions)
                self.partitions_set = partitions
        return self._partitioner

    def _create_partitioner(self, partitions):
        """Connect to zookeeper and create a partitioner"""
        if self.kazoo_client.state != KazooState.CONNECTED:
            try:
                self.kazoo_client.start()
            except Exception:
                self.log.exception("Impossible to connect to zookeeper")
                self.release_and_finish()
                raise PartitionerError("Zookeeper connection failure")

        self.log.debug(
            "Creating partitioner for group %s, topic %s,"
            " partitions set %s", self.config.group_id, self.topics,
            partitions)
        return self.kazoo_client.SetPartitioner(
            path=self.zk_group_path,
            set=partitions,
            time_boundary=self.config.partitioner_cooldown,
        )

    def release_and_finish(self):
        """Release consumers and terminate the partitioner"""
        if self._partitioner:
            self._release(self._partitioner)
            self._partitioner.finish()
        self._partitioner = None

    def _close_connections(self):
        self.kafka_client.close()
        self.partitions_set = set()
        self.last_partitions_refresh = 0
        self.kazoo_client.stop()
        self.kazoo_client.close()
        self.kazoo_retry = None

    def _handle_group(self, partitioner):
        """Handle group status changes, for example when a new
        consumer joins or leaves the group.
        """
        if partitioner:
            try:
                self.actions[partitioner.state](partitioner)
            except KeyError:
                self.log.exception("Unexpected partitioner state.")
                self.release_and_finish()
                raise PartitionerError("Invalid partitioner state %s" %
                                       partitioner.state)

    def _allocating(self, partitioner):
        """Usually we don't want to do anything but waiting in
        allocating state.
        """
        partitioner.wait_for_acquire()

    def _acquire(self, partitioner):
        """Acquire kafka topics-[partitions] and start the
        consumers for them.
        """
        acquired_partitions = self._get_acquired_partitions(partitioner)
        if acquired_partitions != self.acquired_partitions:
            # TODO: Decrease logging level
            self.log.info(
                "Total number of acquired partitions = %s"
                "It was %s before. Added partitions %s. Removed partitions %s",
                len(acquired_partitions),
                len(self.acquired_partitions),
                [
                    p for p in acquired_partitions
                    if p not in self.acquired_partitions
                ],
                [
                    p for p in self.acquired_partitions
                    if p not in acquired_partitions
                ],
            )
            self.acquired_partitions = acquired_partitions
            try:
                self.acquire(copy.deepcopy(self.acquired_partitions))
                self.released_flag = False
            except Exception:
                self.log.exception("Acquire action failed.")
                trace = traceback.format_exc()
                self.release_and_finish()
                raise PartitionerError(
                    "Acquire action failed."
                    "Acquire error: {trace}".format(trace=trace))

    def _release(self, partitioner):
        """Release the consumers and acquired partitions.
        This function is executed either at termination time or
        whenever there is a group change.
        """
        self.log.debug("Releasing partitions")
        try:
            if not self.released_flag:
                self.release(self.acquired_partitions)
                self.released_flag = True
        except Exception:
            trace = traceback.format_exc()
            self.log.exception("Release action failed.")
            raise PartitionerError(
                "Release action failed."
                "Release error: {trace}".format(trace=trace), )
        partitioner.release_set()
        self.acquired_partitions.clear()
        self.force_partitions_refresh = True

    def _fail(self, partitioner):
        """Handle zookeeper failures.
        Executed when the consumer group is not able to recover
        the connection. In this case, we cowardly stop
        the running consumers.
        """
        self.log.error("Lost or unable to acquire partitions")
        self.release_and_finish()
        raise PartitionerZookeeperError(
            "Internal partitioner error. "
            "Lost connection to zookeeper: {cluster}".format(
                cluster=self.config.zookeeper, ))

    def _get_acquired_partitions(self, partitioner):
        """Retrieve acquired partitions from a partitioner.

        :returns: acquired topic and partitions
        :rtype: dict {<topic>: <[partitions]>}
        """
        acquired_partitions = defaultdict(list)
        for partition in partitioner:
            topic, partition_id = partition.rsplit('-', 1)
            acquired_partitions[topic].append(int(partition_id))
        return acquired_partitions

    def get_partitions_set(self):
        """ Load partitions metadata from kafka and create
        a set containing "<topic>-<partition_id>"

        :returns: partitions for user topics
        :rtype: set
        :raises PartitionerError: if no partitions have been found
        """
        topic_partitions = get_kafka_topics(self.kafka_client)
        partitions = []
        missing_topics = set()
        for topic in self.topics:
            kafka_topic = kafka_bytestring(topic)
            if kafka_topic not in topic_partitions:
                missing_topics.add(topic)
            else:
                partitions += [
                    "{0}-{1}".format(topic, p)
                    for p in topic_partitions[kafka_topic]
                ]
        if missing_topics:
            self.log.info("Missing topics: %s", missing_topics)
        if not partitions:
            self.release_and_finish()
            raise PartitionerError(
                "No partitions found for topics: {topics}".format(
                    topics=self.topics))
        return set(partitions)
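
# A minimal, hypothetical driver for the Partitioner above. The config object
# and its attributes are stand-ins inferred from the attribute accesses in the
# class (zookeeper, broker_list, group_id, group_path, use_group_sha,
# partitioner_cooldown); hosts, group and topic names are placeholders.
import time


class _Config(object):
    zookeeper = 'localhost:2181'
    broker_list = 'localhost:9092'
    group_id = 'my-group'
    group_path = '/consumer-groups/my-group'
    use_group_sha = False
    partitioner_cooldown = 30


def on_acquire(acquired_partitions):
    print 'acquired', dict(acquired_partitions)     # {topic: [partition ids]}


def on_release(acquired_partitions):
    print 'released', dict(acquired_partitions)


partitioner = Partitioner(_Config(), ['my_topic'],
                          acquire=on_acquire, release=on_release)
partitioner.start()                                 # blocks until partitions are acquired
try:
    for _ in range(10):                             # periodically re-sync with the group
        time.sleep(1)
        partitioner.refresh()
finally:
    partitioner.stop()                              # release partitions, close ZK/Kafka clients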
Ejemplo n.º 50
0
    def check(self, instance):
        consumer_groups = self.read_config(instance, 'consumer_groups',
                                           cast=self._validate_consumer_groups)
        zk_connect_str = self.read_config(instance, 'zk_connect_str')
        kafka_host_ports = self.read_config(instance, 'kafka_connect_str',
                                            cast=self._parse_connect_str)

        # Construct the Zookeeper path pattern
        zk_prefix = instance.get('zk_prefix', '')
        zk_path_tmpl = zk_prefix + '/consumers/%s/offsets/%s/%s'

        # Connect to Zookeeper
        zk_conn = KazooClient(zk_connect_str)
        zk_conn.start()

        try:
            # Query Zookeeper for consumer offsets
            consumer_offsets = {}
            topics = defaultdict(set)
            for consumer_group, topic_partitions in consumer_groups.iteritems():
                for topic, partitions in topic_partitions.iteritems():
                    # Remember the topic partitions that we've seen so that we can
                    # look up their broker offsets later
                    topics[topic].update(set(partitions))
                    for partition in partitions:
                        zk_path = zk_path_tmpl % (consumer_group, topic, partition)
                        try:
                            consumer_offset = int(zk_conn.get(zk_path)[0])
                            key = (consumer_group, topic, partition)
                            consumer_offsets[key] = consumer_offset
                        except NoNodeError:
                            self.log.warn('No zookeeper node at %s' % zk_path)
                        except Exception:
                            self.log.exception('Could not read consumer offset from %s' % zk_path)
        finally:
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception:
                self.log.exception('Error cleaning up Zookeeper connection')

        # Connect to Kafka
        kafka_host, kafka_port = random.choice(kafka_host_ports)
        kafka_conn = KafkaClient(kafka_host, kafka_port)

        try:
            # Query Kafka for the broker offsets
            broker_offsets = {}
            for topic, partitions in topics.items():
                offset_responses = kafka_conn.send_offset_request([
                    OffsetRequest(topic, p, -1, 1) for p in partitions])

                for resp in offset_responses:
                    broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0]
        finally:
            try:
                kafka_conn.close()
            except Exception:
                self.log.exception('Error cleaning up Kafka connection')

        # Report the broker data
        for (topic, partition), broker_offset in broker_offsets.items():
            broker_tags = ['topic:%s' % topic, 'partition:%s' % partition]
            self.gauge('kafka.broker_offset', broker_offset, tags=broker_tags)

        # Report the consumer
        for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items():

            # Get the broker offset
            broker_offset = broker_offsets.get((topic, partition))

            # Report the consumer offset and lag
            tags = ['topic:%s' % topic, 'partition:%s' % partition,
                    'consumer_group:%s' % consumer_group]
            self.gauge('kafka.consumer_offset', consumer_offset, tags=tags)
            self.gauge('kafka.consumer_lag', broker_offset - consumer_offset,
                       tags=tags)
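
# Standalone sketch of the two lookups the check combines into
# kafka.consumer_lag: the broker tail offset (OffsetRequest with time=-1)
# minus the committed consumer offset stored in ZooKeeper. Hosts, group,
# topic and partition are placeholders.
from kafka import KafkaClient
from kafka.common import OffsetRequest
from kazoo.client import KazooClient

topic, partition, group = 'my_topic', 0, 'my_consumer'

kafka_conn = KafkaClient('localhost:9092')
(resp,) = kafka_conn.send_offset_request([OffsetRequest(topic, partition, -1, 1)])
broker_offset = resp.offsets[0]                     # latest offset on the broker
kafka_conn.close()

zk_conn = KazooClient('localhost:2181')
zk_conn.start()
zk_path = '/consumers/%s/offsets/%s/%s' % (group, topic, partition)
consumer_offset = int(zk_conn.get(zk_path)[0])      # raises NoNodeError if the group never committed
zk_conn.stop()
zk_conn.close()

print 'consumer lag:', broker_offset - consumer_offset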
Ejemplo n.º 51
0
    def check(self, instance):
        consumer_groups = self.read_config(instance, 'consumer_groups',
                                           cast=self._validate_consumer_groups)
        zk_connect_str = self.read_config(instance, 'zk_connect_str')
        kafka_host_ports = self.read_config(instance, 'kafka_connect_str')

        # Construct the Zookeeper path pattern
        zk_prefix = instance.get('zk_prefix', '')
        zk_path_tmpl = zk_prefix + '/consumers/%s/offsets/%s/%s'

        # Connect to Zookeeper
        zk_conn = KazooClient(zk_connect_str, timeout=self.zk_timeout)
        zk_conn.start()

        try:
            # Query Zookeeper for consumer offsets
            consumer_offsets = {}
            topics = defaultdict(set)
            for consumer_group, topic_partitions in consumer_groups.iteritems():
                for topic, partitions in topic_partitions.iteritems():
                    # Remember the topic partitions that we've seen so that we can
                    # look up their broker offsets later
                    topics[topic].update(set(partitions))
                    for partition in partitions:
                        zk_path = zk_path_tmpl % (consumer_group, topic, partition)
                        try:
                            consumer_offset = int(zk_conn.get(zk_path)[0])
                            key = (consumer_group, topic, partition)
                            consumer_offsets[key] = consumer_offset
                        except NoNodeError:
                            self.log.warn('No zookeeper node at %s' % zk_path)
                        except Exception:
                            self.log.exception('Could not read consumer offset from %s' % zk_path)
        finally:
            try:
                zk_conn.stop()
                zk_conn.close()
            except Exception:
                self.log.exception('Error cleaning up Zookeeper connection')

        # Connect to Kafka
        kafka_conn = KafkaClient(kafka_host_ports, timeout=self.kafka_timeout)

        try:
            # Query Kafka for the broker offsets
            broker_offsets = {}
            for topic, partitions in topics.items():
                offset_responses = kafka_conn.send_offset_request([
                    OffsetRequest(topic, p, -1, 1) for p in partitions])

                for resp in offset_responses:
                    broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0]
        finally:
            try:
                kafka_conn.close()
            except Exception:
                self.log.exception('Error cleaning up Kafka connection')

        # Report the broker data
        for (topic, partition), broker_offset in broker_offsets.items():
            broker_tags = ['topic:%s' % topic, 'partition:%s' % partition]
            self.gauge('kafka.broker_offset', broker_offset, tags=broker_tags)

        # Report the consumer
        for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items():

            # Get the broker offset
            broker_offset = broker_offsets.get((topic, partition))

            # Report the consumer offset and lag
            tags = ['topic:%s' % topic, 'partition:%s' % partition,
                    'consumer_group:%s' % consumer_group]
            self.gauge('kafka.consumer_offset', consumer_offset, tags=tags)
            self.gauge('kafka.consumer_lag', broker_offset - consumer_offset,
                       tags=tags)
Ejemplo n.º 52
0
class KafkaConsumer(object):
    """A simpler kafka consumer"""
    DEFAULT_CONFIG = deepcopy(DEFAULT_CONSUMER_CONFIG)

    def __init__(self, *topics, **configs):
        self.configure(**configs)
        self.set_topic_partitions(*topics)

    def configure(self, **configs):
        """Configure the consumer instance

        Configuration settings can be passed to constructor,
        otherwise defaults will be used:

        Keyword Arguments:
            bootstrap_servers (list): List of initial broker nodes the consumer
                should contact to bootstrap initial cluster metadata.  This does
                not have to be the full node list.  It just needs to have at
                least one broker that will respond to a Metadata API Request.
            client_id (str): a unique name for this client.  Defaults to
                'kafka.consumer.kafka'.
            group_id (str): the name of the consumer group to join,
                Offsets are fetched / committed to this group name.
            fetch_message_max_bytes (int, optional): Maximum bytes for each
                topic/partition fetch request.  Defaults to 1024*1024.
            fetch_min_bytes (int, optional): Minimum amount of data the server
                should return for a fetch request, otherwise wait up to
                fetch_wait_max_ms for more data to accumulate.  Defaults to 1.
            fetch_wait_max_ms (int, optional): Maximum time for the server to
                block waiting for fetch_min_bytes messages to accumulate.
                Defaults to 100.
            refresh_leader_backoff_ms (int, optional): Milliseconds to backoff
                when refreshing metadata on errors (subject to random jitter).
                Defaults to 200.
            socket_timeout_ms (int, optional): TCP socket timeout in
                milliseconds.  Defaults to 30*1000.
            auto_offset_reset (str, optional): A policy for resetting offsets on
                OffsetOutOfRange errors. 'smallest' will move to the oldest
                available message, 'largest' will move to the most recent.  Any
                other value will raise an exception.  Defaults to 'largest'.
            deserializer_class (callable, optional):  Any callable that takes a
                raw message value and returns a deserialized value.  Defaults to
                 lambda msg: msg.
            auto_commit_enable (bool, optional): Enabling auto-commit will cause
                the KafkaConsumer to periodically commit offsets without an
                explicit call to commit().  Defaults to False.
            auto_commit_interval_ms (int, optional):  If auto_commit_enabled,
                the milliseconds between automatic offset commits.  Defaults to
                60 * 1000.
            auto_commit_interval_messages (int, optional): If
                auto_commit_enabled, a number of messages consumed between
                automatic offset commits.  Defaults to None (disabled).
            consumer_timeout_ms (int, optional): number of milliseconds after
                which to raise a timeout exception if no message is available
                for consumption.  Defaults to -1 (don't raise an exception).

        Configuration parameters are described in more detail at
        http://kafka.apache.org/documentation.html#highlevelconsumerapi
        """
        configs = self._deprecate_configs(**configs)
        self._config = {}
        for key in self.DEFAULT_CONFIG:
            self._config[key] = configs.pop(key, self.DEFAULT_CONFIG[key])

        if configs:
            raise KafkaConfigurationError('Unknown configuration key(s): ' +
                                          str(list(configs.keys())))

        if self._config['auto_commit_enable']:
            if not self._config['group_id']:
                raise KafkaConfigurationError(
                    'KafkaConsumer configured to auto-commit '
                    'without required consumer group (group_id)'
                )

        # Check auto-commit configuration
        if self._config['auto_commit_enable']:
            logger.info("Configuring consumer to auto-commit offsets")
            self._reset_auto_commit()

        if not self._config['bootstrap_servers']:
            raise KafkaConfigurationError(
                'bootstrap_servers required to configure KafkaConsumer'
            )

        self._client = KafkaClient(
            self._config['bootstrap_servers'],
            client_id=self._config['client_id'],
            timeout=(self._config['socket_timeout_ms'] / 1000.0)
        )

    def set_topic_partitions(self, *topics):
        """
        Set the topic/partitions to consume
        Optionally specify offsets to start from

        Accepts types:

        * str (utf-8): topic name (will consume all available partitions)
        * tuple: (topic, partition)
        * dict:
            - { topic: partition }
            - { topic: [partition list] }
            - { topic: (partition tuple,) }

        Optionally, offsets can be specified directly:

        * tuple: (topic, partition, offset)
        * dict:  { (topic, partition): offset, ... }

        Example:

        .. code:: python

            kafka = KafkaConsumer()

            # Consume topic1-all; topic2-partition2; topic3-partition0
            kafka.set_topic_partitions("topic1", ("topic2", 2), {"topic3": 0})

            # Consume topic1-0 starting at offset 12, and topic2-1 at offset 45
            # using tuples --
            kafka.set_topic_partitions(("topic1", 0, 12), ("topic2", 1, 45))

            # using dict --
            kafka.set_topic_partitions({ ("topic1", 0): 12, ("topic2", 1): 45 })

        """
        self._topics = []
        self._client.load_metadata_for_topics()

        # Setup offsets
        self._offsets = OffsetsStruct(fetch=dict(),
                                      commit=dict(),
                                      highwater=dict(),
                                      task_done=dict())

        # Handle different topic types
        for arg in topics:

            # Topic name str -- all partitions
            if isinstance(arg, (six.string_types, six.binary_type)):
                topic = kafka_bytestring(arg)

                for partition in self._client.get_partition_ids_for_topic(topic):
                    self._consume_topic_partition(topic, partition)

            # (topic, partition [, offset]) tuple
            elif isinstance(arg, tuple):
                topic = kafka_bytestring(arg[0])
                partition = arg[1]
                self._consume_topic_partition(topic, partition)
                if len(arg) == 3:
                    offset = arg[2]
                    self._offsets.fetch[(topic, partition)] = offset

            # { topic: partitions, ... } dict
            elif isinstance(arg, dict):
                for key, value in six.iteritems(arg):

                    # key can be string (a topic)
                    if isinstance(key, (six.string_types, six.binary_type)):
                        topic = kafka_bytestring(key)

                        # topic: partition
                        if isinstance(value, int):
                            self._consume_topic_partition(topic, value)

                        # topic: [ partition1, partition2, ... ]
                        elif isinstance(value, (list, tuple)):
                            for partition in value:
                                self._consume_topic_partition(topic, partition)
                        else:
                            raise KafkaConfigurationError(
                                'Unknown topic type '
                                '(dict key must be int or list/tuple of ints)'
                            )

                    # (topic, partition): offset
                    elif isinstance(key, tuple):
                        topic = kafka_bytestring(key[0])
                        partition = key[1]
                        self._consume_topic_partition(topic, partition)
                        self._offsets.fetch[(topic, partition)] = value

            else:
                raise KafkaConfigurationError('Unknown topic type (%s)' % type(arg))

        # If we have a consumer group, try to fetch stored offsets
        if self._config['group_id']:
            self._get_commit_offsets()

        # Update missing fetch/commit offsets
        for topic_partition in self._topics:

            # Commit offsets default is None
            if topic_partition not in self._offsets.commit:
                self._offsets.commit[topic_partition] = None

            # Skip if we already have a fetch offset from user args
            if topic_partition not in self._offsets.fetch:

                # Fetch offsets default is (1) commit
                if self._offsets.commit[topic_partition] is not None:
                    self._offsets.fetch[topic_partition] = self._offsets.commit[topic_partition]

                # or (2) auto reset
                else:
                    self._offsets.fetch[topic_partition] = self._reset_partition_offset(topic_partition)

        # highwater marks (received from server on fetch response)
        # and task_done (set locally by user)
        # should always get initialized to None
        self._reset_highwater_offsets()
        self._reset_task_done_offsets()

        # Reset message iterator in case we were in the middle of one
        self._reset_message_iterator()

    def close(self):
        """Close this consumer's underlying client."""
        self._client.close()

    def next(self):
        """Return the next available message

        Blocks indefinitely unless consumer_timeout_ms > 0

        Returns:
            a single KafkaMessage from the message iterator

        Raises:
            ConsumerTimeout after consumer_timeout_ms and no message

        Note:
            This is also the method called internally during iteration

        """
        self._set_consumer_timeout_start()
        while True:

            try:
                return six.next(self._get_message_iterator())

            # Handle batch completion
            except StopIteration:
                self._reset_message_iterator()

            self._check_consumer_timeout()

    def fetch_messages(self):
        """Sends FetchRequests for all topic/partitions set for consumption

        Returns:
            Generator that yields KafkaMessage structs
            after deserializing with the configured `deserializer_class`

        Note:
            Refreshes metadata on errors, and resets fetch offset on
            OffsetOutOfRange, per the configured `auto_offset_reset` policy

        See Also:
            Key KafkaConsumer configuration parameters:
            * `fetch_message_max_bytes`
            * `fetch_max_wait_ms`
            * `fetch_min_bytes`
            * `deserializer_class`
            * `auto_offset_reset`

        """

        max_bytes = self._config['fetch_message_max_bytes']
        max_wait_time = self._config['fetch_wait_max_ms']
        min_bytes = self._config['fetch_min_bytes']

        if not self._topics:
            raise KafkaConfigurationError('No topics or partitions configured')

        if not self._offsets.fetch:
            raise KafkaConfigurationError(
                'No fetch offsets found when calling fetch_messages'
            )

        fetches = [FetchRequest(topic, partition,
                                self._offsets.fetch[(topic, partition)],
                                max_bytes)
                   for (topic, partition) in self._topics]

        # send_fetch_request will batch topic/partition requests by leader
        responses = self._client.send_fetch_request(
            fetches,
            max_wait_time=max_wait_time,
            min_bytes=min_bytes,
            fail_on_error=False
        )

        for resp in responses:

            if isinstance(resp, FailedPayloadsError):
                logger.warning('FailedPayloadsError attempting to fetch data')
                self._refresh_metadata_on_error()
                continue

            topic = kafka_bytestring(resp.topic)
            partition = resp.partition
            try:
                check_error(resp)
            except OffsetOutOfRangeError:
                logger.warning('OffsetOutOfRange: topic %s, partition %d, '
                               'offset %d (Highwatermark: %d)',
                               topic, partition,
                               self._offsets.fetch[(topic, partition)],
                               resp.highwaterMark)
                # Reset offset
                self._offsets.fetch[(topic, partition)] = (
                    self._reset_partition_offset((topic, partition))
                )
                continue

            except NotLeaderForPartitionError:
                logger.warning("NotLeaderForPartitionError for %s - %d. "
                               "Metadata may be out of date",
                               topic, partition)
                self._refresh_metadata_on_error()
                continue

            except RequestTimedOutError:
                logger.warning("RequestTimedOutError for %s - %d",
                               topic, partition)
                continue

            # Track server highwater mark
            self._offsets.highwater[(topic, partition)] = resp.highwaterMark

            # Yield each message
            # Kafka-python could raise an exception during iteration
            # we are not catching -- user will need to address
            for (offset, message) in resp.messages:
                # deserializer_class could raise an exception here
                val = self._config['deserializer_class'](message.value)
                msg = KafkaMessage(topic, partition, offset, message.key, val)

                # in some cases the server will return earlier messages
                # than we requested. skip them per kafka spec
                if offset < self._offsets.fetch[(topic, partition)]:
                    logger.debug('message offset less than fetched offset '
                                 'skipping: %s', msg)
                    continue
                # Only increment fetch offset
                # if we safely got the message and deserialized
                self._offsets.fetch[(topic, partition)] = offset + 1

                # Then yield to user
                yield msg

    def get_partition_offsets(self, topic, partition, request_time_ms, max_num_offsets):
        """Request available fetch offsets for a single topic/partition

        Keyword Arguments:
            topic (str): topic for offset request
            partition (int): partition for offset request
            request_time_ms (int): Used to ask for all messages before a
                certain time (ms). There are two special values.
                Specify -1 to receive the latest offset (i.e. the offset of the
                next coming message) and -2 to receive the earliest available
                offset. Note that because offsets are pulled in descending
                order, asking for the earliest offset will always return you a
                single element.
            max_num_offsets (int): Maximum offsets to include in the OffsetResponse

        Returns:
            a list of offsets in the OffsetResponse submitted for the provided
            topic / partition. See:
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetAPI
        """
        reqs = [OffsetRequest(topic, partition, request_time_ms, max_num_offsets)]

        (resp,) = self._client.send_offset_request(reqs)

        check_error(resp)

        # Just for sanity..
        # probably unnecessary
        assert resp.topic == topic
        assert resp.partition == partition

        return resp.offsets

    def offsets(self, group=None):
        """Get internal consumer offset values

        Keyword Arguments:
            group: Either "fetch", "commit", "task_done", or "highwater".
                If no group specified, returns all groups.

        Returns:
            A copy of internal offsets struct
        """
        if not group:
            return {
                'fetch': self.offsets('fetch'),
                'commit': self.offsets('commit'),
                'task_done': self.offsets('task_done'),
                'highwater': self.offsets('highwater')
            }
        else:
            return dict(deepcopy(getattr(self._offsets, group)))

    def task_done(self, message):
        """Mark a fetched message as consumed.

        Offsets for messages marked as "task_done" will be stored back
        to the kafka cluster for this consumer group on commit()

        Arguments:
            message (KafkaMessage): the message to mark as complete

        Returns:
            True, unless the topic-partition for this message has not
            been configured for the consumer. In normal operation, this
            should not happen. But see github issue 364.
        """
        topic_partition = (message.topic, message.partition)
        if topic_partition not in self._topics:
            logger.warning('Unrecognized topic/partition in task_done message: '
                           '{0}:{1}'.format(*topic_partition))
            return False

        offset = message.offset

        # Warn on non-contiguous offsets
        prev_done = self._offsets.task_done[topic_partition]
        if prev_done is not None and offset != (prev_done + 1):
            logger.warning('Marking task_done on a non-contiguous offset: %d != %d + 1',
                           offset, prev_done)

        # Warn on smaller offsets than previous commit
        # "commit" offsets are actually the offset of the next message to fetch.
        prev_commit = self._offsets.commit[topic_partition]
        if prev_commit is not None and ((offset + 1) <= prev_commit):
            logger.warning('Marking task_done on a previously committed offset?: %d (+1) <= %d',
                           offset, prev_commit)

        self._offsets.task_done[topic_partition] = offset

        # Check for auto-commit
        if self._does_auto_commit_messages():
            self._incr_auto_commit_message_count()

        if self._should_auto_commit():
            self.commit()

        return True

    def commit(self):
        """Store consumed message offsets (marked via task_done())
        to kafka cluster for this consumer_group.

        Returns:
            True on success, or False if no offsets were found for commit

        Note:
            this functionality requires server version >=0.8.1.1
            https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-OffsetCommit/FetchAPI
        """
        if not self._config['group_id']:
            logger.warning('Cannot commit without a group_id!')
            raise KafkaConfigurationError(
                'Attempted to commit offsets '
                'without a configured consumer group (group_id)'
            )

        # API supports storing metadata with each commit
        # but for now it is unused
        metadata = b''

        offsets = self._offsets.task_done
        commits = []
        for topic_partition, task_done_offset in six.iteritems(offsets):

            # Skip if None
            if task_done_offset is None:
                continue

            # Commit offsets as the next offset to fetch
            # which is consistent with the Java Client
            # task_done is marked by messages consumed,
            # so add one to mark the next message for fetching
            commit_offset = (task_done_offset + 1)

            # Skip if no change from previous committed
            if commit_offset == self._offsets.commit[topic_partition]:
                continue

            commits.append(
                OffsetCommitRequest(topic_partition[0], topic_partition[1],
                                    commit_offset, metadata)
            )

        if commits:
            logger.info('committing consumer offsets to group %s', self._config['group_id'])
            resps = self._client.send_offset_commit_request(
                kafka_bytestring(self._config['group_id']), commits,
                fail_on_error=False
            )

            for r in resps:
                check_error(r)
                topic_partition = (r.topic, r.partition)
                task_done = self._offsets.task_done[topic_partition]
                self._offsets.commit[topic_partition] = (task_done + 1)

            if self._config['auto_commit_enable']:
                self._reset_auto_commit()

            return True

        else:
            logger.info('No new offsets found to commit in group %s', self._config['group_id'])
            return False

    #
    # Topic/partition management private methods
    #

    def _consume_topic_partition(self, topic, partition):
        topic = kafka_bytestring(topic)
        if not isinstance(partition, int):
            raise KafkaConfigurationError('Unknown partition type (%s) '
                                          '-- expected int' % type(partition))

        if topic not in self._client.topic_partitions:
            raise UnknownTopicOrPartitionError("Topic %s not found in broker metadata" % topic)
        if partition not in self._client.get_partition_ids_for_topic(topic):
            raise UnknownTopicOrPartitionError("Partition %d not found in Topic %s "
                                               "in broker metadata" % (partition, topic))
        logger.info("Configuring consumer to fetch topic '%s', partition %d", topic, partition)
        self._topics.append((topic, partition))

    def _refresh_metadata_on_error(self):
        refresh_ms = self._config['refresh_leader_backoff_ms']
        jitter_pct = 0.20
        sleep_ms = random.randint(
            int((1.0 - 0.5 * jitter_pct) * refresh_ms),
            int((1.0 + 0.5 * jitter_pct) * refresh_ms)
        )
        while True:
            logger.info("Sleeping for refresh_leader_backoff_ms: %d", sleep_ms)
            time.sleep(sleep_ms / 1000.0)
            try:
                self._client.load_metadata_for_topics()
            except KafkaUnavailableError:
                logger.warning("Unable to refresh topic metadata... cluster unavailable")
                self._check_consumer_timeout()
            else:
                logger.info("Topic metadata refreshed")
                return

    #
    # Offset-management private methods
    #

    def _get_commit_offsets(self):
        logger.info("Consumer fetching stored offsets")
        for topic_partition in self._topics:
            (resp,) = self._client.send_offset_fetch_request(
                kafka_bytestring(self._config['group_id']),
                [OffsetFetchRequest(topic_partition[0], topic_partition[1])],
                fail_on_error=False)
            try:
                check_error(resp)
            # API spec says the server won't set an error here
            # but 0.8.1.1 does actually...
            except UnknownTopicOrPartitionError:
                pass

            # -1 offset signals no commit is currently stored
            if resp.offset == -1:
                self._offsets.commit[topic_partition] = None

            # Otherwise we committed the stored offset
            # and need to fetch the next one
            else:
                self._offsets.commit[topic_partition] = resp.offset

    def _reset_highwater_offsets(self):
        for topic_partition in self._topics:
            self._offsets.highwater[topic_partition] = None

    def _reset_task_done_offsets(self):
        for topic_partition in self._topics:
            self._offsets.task_done[topic_partition] = None

    def _reset_partition_offset(self, topic_partition):
        (topic, partition) = topic_partition
        LATEST = -1
        EARLIEST = -2

        request_time_ms = None
        if self._config['auto_offset_reset'] == 'largest':
            request_time_ms = LATEST
        elif self._config['auto_offset_reset'] == 'smallest':
            request_time_ms = EARLIEST
        else:

            # Let's raise a reasonable exception type if the user calls
            # this outside of an exception context
            if sys.exc_info() == (None, None, None):
                raise OffsetOutOfRangeError('Cannot reset partition offsets without a '
                                            'valid auto_offset_reset setting '
                                            '(largest|smallest)')

            # Otherwise we should re-raise the upstream exception
            # b/c it typically includes additional data about
            # the request that triggered it, and we do not want to drop that
            raise # pylint: disable-msg=E0704

        (offset, ) = self.get_partition_offsets(topic, partition,
                                                request_time_ms, max_num_offsets=1)
        return offset

    #
    # Consumer Timeout private methods
    #

    def _set_consumer_timeout_start(self):
        self._consumer_timeout = False
        if self._config['consumer_timeout_ms'] >= 0:
            self._consumer_timeout = time.time() + (self._config['consumer_timeout_ms'] / 1000.0)

    def _check_consumer_timeout(self):
        if self._consumer_timeout and time.time() > self._consumer_timeout:
            raise ConsumerTimeout('Consumer timed out after %d ms' % self._config['consumer_timeout_ms'])

    #
    # Autocommit private methods
    #

    def _should_auto_commit(self):
        if self._does_auto_commit_ms():
            if time.time() >= self._next_commit_time:
                return True

        if self._does_auto_commit_messages():
            if self._uncommitted_message_count >= self._config['auto_commit_interval_messages']:
                return True

        return False

    def _reset_auto_commit(self):
        self._uncommitted_message_count = 0
        self._next_commit_time = None
        if self._does_auto_commit_ms():
            self._next_commit_time = time.time() + (self._config['auto_commit_interval_ms'] / 1000.0)

    def _incr_auto_commit_message_count(self, n=1):
        self._uncommitted_message_count += n

    def _does_auto_commit_ms(self):
        if not self._config['auto_commit_enable']:
            return False

        conf = self._config['auto_commit_interval_ms']
        if conf is not None and conf > 0:
            return True
        return False

    def _does_auto_commit_messages(self):
        if not self._config['auto_commit_enable']:
            return False

        conf = self._config['auto_commit_interval_messages']
        if conf is not None and conf > 0:
            return True
        return False

    #
    # Message iterator private methods
    #

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def _get_message_iterator(self):
        # Fetch a new batch if needed
        if self._msg_iter is None:
            self._msg_iter = self.fetch_messages()

        return self._msg_iter

    def _reset_message_iterator(self):
        self._msg_iter = None

    #
    # python private methods
    #

    def __repr__(self):
        return '<{0} topics=({1})>'.format(
            self.__class__.__name__,
            '|'.join(["%s-%d" % topic_partition
                      for topic_partition in self._topics])
        )

    #
    # other private methods
    #

    def _deprecate_configs(self, **configs):
        for old, new in six.iteritems(DEPRECATED_CONFIG_KEYS):
            if old in configs:
                logger.warning('Deprecated Kafka Consumer configuration: %s. '
                               'Please use %s instead.', old, new)
                old_value = configs.pop(old)
                if new not in configs:
                    configs[new] = old_value
        return configs
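
# Hypothetical end-to-end run of the KafkaConsumer above (the older
# kafka-python consumer API). Broker address, group and topic are
# placeholders; ConsumerTimeout is imported from the old kafka.common module.
from kafka.common import ConsumerTimeout

consumer = KafkaConsumer('my_topic',
                         group_id='my_group',
                         bootstrap_servers=['localhost:9092'],
                         auto_offset_reset='smallest',
                         consumer_timeout_ms=5000)
try:
    for msg in consumer:                            # iterates via next()/fetch_messages()
        print msg.topic, msg.partition, msg.offset, msg.value
        consumer.task_done(msg)                     # mark consumed; commit() stores task_done + 1
except ConsumerTimeout:
    pass                                            # no message arrived within consumer_timeout_ms
finally:
    consumer.commit()                               # requires broker >= 0.8.1.1
    consumer.close()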
Ejemplo n.º 53
0
class KafkaHelper(object):
    def __init__(self):
        self.client = None
        self.producer = None
        self.consumer = None
        self.consumer_fetch_timeout = None
        self.consumer_fetch_size = None

    def __enter__(self):
        self.get_client()
        return self

    def __exit__(self, exctype, excvalue, traceback):
        self.close_client()

    @retry(BrokerResponseError, tries=5, delay=3, backoff=2)
    def get_client(self):
        if not self.client:
            self.client = KafkaClient(settings.KAFKA['host'])
        return self.client

    def get_producer(self):
        """
        :return: SimpleProducer
        """
        if not self.producer:
            self.get_client()
            self.producer = SimpleProducer(self.client)
        return self.producer

    def get_multiprocess_consumer(
            self,
            consumer_group,
            topic,
            fetch_size=settings.KAFKA['message_fetch_batch'],
            fetch_timeout=settings.KAFKA['message_fetch_timeout'],
            auto_commit_every_n=settings.KAFKA['auto_commit_msg_count'],
            **kw):
        """
        Return MultiProcessConsumer which consumes partitions for a topic in
        parallel using multiple processes

        Arguments:
            consumer_group: a name for this consumer, used for offset storage and must be unique
            topic: the topic to consume

        Keyword Arguments:
            fetch_size: Indicates the maximum number of messages to be fetched
            fetch_timeout: The function will block for the specified
                time (in seconds) until count messages are fetched
            auto_commit_every_n: How many messages to consume
                before a commit
        """
        if not self.consumer:
            self.consumer_fetch_size = fetch_size
            self.consumer_fetch_timeout = fetch_timeout

            self.get_client()
            partitions = len(self.get_partitions(topic))
            self.consumer = MultiProcessConsumer(
                self.client,
                consumer_group,
                topic,
                num_procs=partitions,
                partitions_per_proc=1,
                auto_commit_every_n=auto_commit_every_n,
                **kw)
        return self.consumer

    def get_consumer(
            self,
            consumer_group,
            topic,
            fetch_size=settings.KAFKA['message_fetch_batch'],
            fetch_timeout=settings.KAFKA['message_fetch_timeout'],
            auto_commit_every_n=settings.KAFKA['auto_commit_msg_count'],
            **kw):
        if not self.consumer:
            self.consumer_fetch_size = fetch_size
            self.consumer_fetch_timeout = fetch_timeout

            self.get_client()
            self.consumer = SimpleConsumer(
                self.client,
                consumer_group,
                topic,
                auto_commit_every_n=auto_commit_every_n,
                auto_offset_reset='smallest',
                **kw)
        return self.consumer

    def close_client(self):
        if self.client:
            self.client.close()

    def send_message(self, topic, msgs, logger=None):
        producer = self.get_producer()  # lazily create the producer if needed
        content = [(json.dumps(msg) if isinstance(msg, dict) else msg)
                   for msg in msgs]
        try:
            resp = producer.send_messages(topic, *content)
            return resp
        except Exception:
            if logger:
                logger.error(
                    'An error has occurred in KafkaHelper.send_message(), please check errors: %s',
                    traceback.format_exc())
            raise

    def receive_messages(self):
        messages = self.consumer.get_messages(
            count=self.consumer_fetch_size,
            timeout=self.consumer_fetch_timeout)
        return messages

    def current_offset(self, topic, partition):
        offsets, = self.client.send_offset_request(
            [OffsetRequest(kafka_bytestring(topic), partition, -1, 1)])
        return offsets.offsets[0]

    def consumer_offset(self, consumer_name, topic, partition):
        offsets, = self.client.send_offset_fetch_request(
            consumer_name,
            [OffsetRequest(kafka_bytestring(topic), partition, -1, 1)])
        return offsets[2]

    def get_total_lags(self, consumer_name, topic):
        lags = []
        partitions = self.get_partitions(topic)
        for p in partitions:
            committed = self.consumer_offset(consumer_name, topic, p)
            latest = self.current_offset(topic, p)
            lags.append(latest - committed)
        return sum(lags)

    def get_partitions(self, topic):
        return self.client.get_partition_ids_for_topic(topic)
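A minimal usage sketch for KafkaHelper (the topic and consumer-group names are made up, and it assumes settings.KAFKA is configured as the class expects):
def kafka_helper_round_trip():
    # Produce: send_message() JSON-encodes dicts before publishing.
    with KafkaHelper() as helper:
        helper.get_producer()
        helper.send_message('example-topic', [{'id': 1}, {'id': 2}])

    # Consume: get_consumer() also records fetch size/timeout, which
    # receive_messages() reuses via SimpleConsumer.get_messages().
    with KafkaHelper() as helper:
        helper.get_consumer('example-group', 'example-topic')
        for msg in helper.receive_messages():
            print(msg.message.value)  # OffsetAndMessage -> Message -> value

        # Total lag = sum over partitions of (latest offset - committed offset).
        print(helper.get_total_lags('example-group', 'example-topic'))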
class DocManager(DocManagerBase):
    """ Connects to a kafka instance, generates producers for a given
    database and collection
    """

    def __init__(self, url, auto_commit=True, unique_key='_id',
                 chunk_size=constants.DEFAULT_MAX_BULK, **kwargs):
        """Connect to kafka instance
        """
        url_info = url.split(":")
        if len(url_info) < 2:
            raise SystemError("invalid kafka url, expected host:port: %s" % url)

        self.server = KafkaClient(url)
        self.producer_dict = {}
        self.auto_commit = auto_commit
        print url

    def generate_producer(self, namespace):
        """ Generates a producer for a given database and collection
        """
        database, coll = namespace.split('.', 1)
        topic = (('%s-%s') % (database, coll))
        if topic not in self.producer_dict:
            try:
                self.producer_dict[topic] = SimpleProducer(
                    self.server,
                    async=True)
            except Exception:
                self.producer_dict[topic] = None
        return self.producer_dict[topic]

    def stop(self):
        """ Stops the instance
        """
        print "stop"
        self.auto_commit = False
        self.server.close()

    def update(self, document_id, update_spec, namespace, timestamp):
        """Apply updates given in update_spec to the document whose id
        matches that of doc.
        """
        pass

    def remove(self, document_id, namespace, timestamp):
        pass

    def upsert(self, doc, namespace, timestamp):
        """ Sends the document to kafka
        """
        print "upsert"
        if 'isInTangoDir' in doc:
            import json
            # remove field "_id" since ObjectId("") can't be serialized
            del doc['_id']
            doc_str = json.dumps(doc)
            print doc_str
            database, coll = namespace.split('.', 1)
            topic = (('%s-%s') % (database, coll))
            producer = self.generate_producer(namespace)
            if producer:
                producer.send_messages(topic, str(doc_str))
            else:
                raise SystemError

    def search(self, start_ts, end_ts):
        """ Not relevant in this context
        """
        print "search"
        pass

    def commit(self):
        """ Not relevant in this context
        """
        print "commit"
        pass

    def run_auto_commit(self):
        """ Not relevant in this context
        """
        print 'run_auto_commit'
        pass

    def get_last_doc(self):
        """ This is probably possible but unsure of implementation.
            Hesitant to implement this since it might be system
            specific.
        """
        print 'get_last_doc'
        pass
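A minimal usage sketch for the DocManager above (the URL, namespace and document are hypothetical; upsert() only forwards documents carrying the 'isInTangoDir' flag):
dm = DocManager("localhost:9092")
doc = {'_id': 'someObjectId', 'name': 'example.txt', 'isInTangoDir': True}
# namespace "<db>.<collection>" maps to the kafka topic "<db>-<collection>",
# so this message ends up on the "testdb-files" topic.
dm.upsert(doc, 'testdb.files', timestamp=None)
dm.stop()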
Ejemplo n.º 55
0
kserver = ["192.168.80.109:9092", "192.168.80.108:9092", "192.168.80.107:9092"]
client = KafkaClient(bootstrap_servers=kserver)
topic = "spark"
num_partitions = 3
timeout_ms = 10000
configs = {}
if topic not in client.cluster.topics(
        exclude_internal_topics=True):  # the topic does not exist yet

    request = admin.CreateTopicsRequest_v0(
        create_topic_requests=[(
            topic,
            num_partitions,
            -1,  # replication unset.
            [],  # Partition assignment.
            [(key, value) for key, value in configs.items()],  # Configs
        )],
        timeout=timeout_ms)

    future = client.send(2, request)  # node 2 is the controller; creation fails when sent to the other nodes.
    client.poll(timeout_ms=timeout_ms, future=future)  # wait here for the response

    result = future.value
    # error_code = result.topic_error_codes[0][1]
    print("CREATE TOPIC RESPONSE: ",
          result)  # 0 success, 41 NOT_CONTROLLER, 36 ALREADY_EXISTS
    client.close()
else:  # the topic already exists
    print("Topic already exists!")