def verify_consumer_performance():
    """ Verify Consumer performance """

    conf = {'bootstrap.servers': bootstrap_servers,
            'group.id': uuid.uuid1(),
            'session.timeout.ms': 6000,
            'error_cb': error_cb,
            'auto.offset.reset': 'earliest'}

    c = confluent_kafka.Consumer(**conf)

    def my_on_assign(consumer, partitions):
        print('on_assign:', len(partitions), 'partitions:')
        for p in partitions:
            print(' %s [%d] @ %d' % (p.topic, p.partition, p.offset))
        consumer.assign(partitions)

    def my_on_revoke(consumer, partitions):
        print('on_revoke:', len(partitions), 'partitions:')
        for p in partitions:
            print(' %s [%d] @ %d' % (p.topic, p.partition, p.offset))
        consumer.unassign()

    c.subscribe([topic], on_assign=my_on_assign, on_revoke=my_on_revoke)

    max_msgcnt = 1000000
    bytecnt = 0
    msgcnt = 0

    print('Will now consume %d messages' % max_msgcnt)

    if with_progress:
        bar = Bar('Consuming', max=max_msgcnt,
                  suffix='%(index)d/%(max)d [%(eta_td)s]')
    else:
        bar = None

    while True:
        # Consume until EOF or error
        msg = c.poll(timeout=20.0)
        if msg is None:
            raise Exception('Stalled at %d/%d message, no new messages for 20s' %
                            (msgcnt, max_msgcnt))

        if msg.error():
            if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                # Reached EOF for a partition, ignore.
                continue
            else:
                raise confluent_kafka.KafkaException(msg.error())

        bytecnt += len(msg)
        msgcnt += 1

        if bar is not None and (msgcnt % 10000) == 0:
            bar.next(n=10000)

        if msgcnt == 1:
            t_first_msg = time.time()
        if msgcnt >= max_msgcnt:
            break

    if bar is not None:
        bar.finish()

    if msgcnt > 0:
        t_spent = time.time() - t_first_msg
        print('%d messages (%.2fMb) consumed in %.3fs: %d msgs/s, %.2f Mb/s' %
              (msgcnt, bytecnt / (1024*1024), t_spent, msgcnt / t_spent,
               (bytecnt / t_spent) / (1024*1024)))

    print('closing consumer')
    c.close()
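# verify_consumer_performance() assumes module-level test-harness globals:
# bootstrap_servers, topic, with_progress, Bar (from the `progress` package),
# and error_cb. A minimal sketch of an error_cb compatible with the config
# above (the body is illustrative, not the original harness's):
def error_cb(err):
    # Invoked by the client for global errors such as broker transport
    # failures; err is a confluent_kafka.KafkaError.
    print('error_cb: %s' % err)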
def verify_batch_consumer_performance():
    """ Verify batch Consumer performance """

    conf = {'bootstrap.servers': bootstrap_servers,
            'group.id': uuid.uuid1(),
            'session.timeout.ms': 6000,
            'error_cb': error_cb,
            'auto.offset.reset': 'earliest'}

    c = confluent_kafka.Consumer(conf)

    def my_on_assign(consumer, partitions):
        print('on_assign:', len(partitions), 'partitions:')
        for p in partitions:
            print(' %s [%d] @ %d' % (p.topic, p.partition, p.offset))
        consumer.assign(partitions)

    def my_on_revoke(consumer, partitions):
        print('on_revoke:', len(partitions), 'partitions:')
        for p in partitions:
            print(' %s [%d] @ %d' % (p.topic, p.partition, p.offset))
        consumer.unassign()

    c.subscribe([topic], on_assign=my_on_assign, on_revoke=my_on_revoke)

    max_msgcnt = 1000000
    bytecnt = 0
    msgcnt = 0
    batch_size = 1000

    print('Will now consume %d messages' % max_msgcnt)

    if with_progress:
        bar = Bar('Consuming', max=max_msgcnt,
                  suffix='%(index)d/%(max)d [%(eta_td)s]')
    else:
        bar = None

    while msgcnt < max_msgcnt:
        # Consume until we hit max_msgcnt
        msglist = c.consume(num_messages=batch_size, timeout=20.0)

        for msg in msglist:
            if msg.error():
                raise confluent_kafka.KafkaException(msg.error())

            bytecnt += len(msg)
            msgcnt += 1

            if bar is not None and (msgcnt % 10000) == 0:
                bar.next(n=10000)

            if msgcnt == 1:
                t_first_msg = time.time()

    if bar is not None:
        bar.finish()

    if msgcnt > 0:
        t_spent = time.time() - t_first_msg
        print('%d messages (%.2fMb) consumed in %.3fs: %d msgs/s, %.2f Mb/s' %
              (msgcnt, bytecnt / (1024*1024), t_spent, msgcnt / t_spent,
               (bytecnt / t_spent) / (1024*1024)))

    print('closing consumer')
    c.close()
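# Note on the batch variant above: unlike poll(), which returns at most one
# Message per call, consume(num_messages=..., timeout=...) returns a list of
# up to num_messages Messages (possibly empty on timeout), so errors are
# checked per message rather than per call.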
def verify_stats_cb():
    """ Verify stats_cb """

    def stats_cb(stats_json_str):
        global good_stats_cb_result
        stats_json = json.loads(stats_json_str)
        if topic in stats_json['topics']:
            app_offset = stats_json['topics'][topic]['partitions']['0'][
                'app_offset']
            if app_offset > 0:
                print("# app_offset stats for topic %s partition 0: %d" %
                      (topic, app_offset))
                good_stats_cb_result = True

    conf = {'bootstrap.servers': bootstrap_servers,
            'group.id': uuid.uuid1(),
            'session.timeout.ms': 6000,
            'error_cb': error_cb,
            'stats_cb': stats_cb,
            'statistics.interval.ms': 200,
            'default.topic.config': {
                'auto.offset.reset': 'earliest'
            }}

    c = confluent_kafka.Consumer(**conf)
    c.subscribe([topic])

    max_msgcnt = 1000000
    bytecnt = 0
    msgcnt = 0

    print('Will now consume %d messages' % max_msgcnt)

    if with_progress:
        bar = Bar('Consuming', max=max_msgcnt,
                  suffix='%(index)d/%(max)d [%(eta_td)s]')
    else:
        bar = None

    while not good_stats_cb_result:
        # Consume until EOF or error
        msg = c.poll(timeout=20.0)
        if msg is None:
            raise Exception('Stalled at %d/%d message, no new messages for 20s' %
                            (msgcnt, max_msgcnt))

        if msg.error():
            if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                # Reached EOF for a partition, ignore.
                continue
            else:
                raise confluent_kafka.KafkaException(msg.error())

        bytecnt += len(msg)
        msgcnt += 1

        if bar is not None and (msgcnt % 10000) == 0:
            bar.next(n=10000)

        if msgcnt == 1:
            t_first_msg = time.time()
        if msgcnt >= max_msgcnt:
            break

    if bar is not None:
        bar.finish()

    if msgcnt > 0:
        t_spent = time.time() - t_first_msg
        print('%d messages (%.2fMb) consumed in %.3fs: %d msgs/s, %.2f Mb/s' %
              (msgcnt, bytecnt / (1024 * 1024), t_spent, msgcnt / t_spent,
               (bytecnt / t_spent) / (1024 * 1024)))

    print('closing consumer')
    c.close()
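# verify_stats_cb() reads and sets a module-level flag via `global`; the
# surrounding harness is assumed to initialize it before the test runs:
good_stats_cb_result = False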
def run(
    self,
    bootstrap_servers: str = None,
    group_id: str = None,
    topics: List[str] = None,
    request_timeout: float = 1.0,
    auto_offset_reset: str = "earliest",
    message_consume_limit: int = None,
    kafka_configs: dict = None,
    **kwargs,
) -> List[bytes]:
    """
    Run method for this Task. Invoked by calling this Task after
    initialization within a Flow context, or by using `Task.bind`.

    Args:
        - bootstrap_servers (str, required): comma-separated host and port
            pairs that are the addresses of Kafka brokers
        - group_id (str, required): name of the consumer group this consumer
            will belong to
        - topics (List[str], required): list of topic names to consume
            messages from
        - request_timeout (float, optional): maximum time to block waiting
            for a message, event or callback
        - auto_offset_reset (str, optional): configurable offset reset policy
        - message_consume_limit (int, optional): max number of messages to
            consume before closing the consumer
        - kafka_configs (dict, optional): a dict of Kafka client configuration
            properties used to construct the consumer
        - **kwargs (Any, optional): additional keyword arguments to pass to
            the standard Task init method

    Returns:
        - List of consumed messages
    """
    consumer = confluent_kafka.Consumer({
        "bootstrap.servers": bootstrap_servers,
        "group.id": group_id,
        "auto.offset.reset": auto_offset_reset,
        **(kafka_configs or {}),  # tolerate kafka_configs=None
    })
    consumer.subscribe(topics)

    messages = []
    message_consume_count = 0
    running = True

    try:
        while running:
            message = consumer.poll(timeout=request_timeout)
            if message is not None:
                if message.error():
                    if (message.error().code()
                            == confluent_kafka.KafkaError._PARTITION_EOF):
                        # End of partition event, exit consumer
                        self.logger.warning(
                            f"{message.topic()} [{message.partition()}] "
                            f"reached end at offset {message.offset()}")
                        running = False
                    else:
                        raise confluent_kafka.KafkaException(message.error())
                else:
                    messages.append(message.value())
                    message_consume_count += 1

                    if message_consume_limit:
                        if message_consume_count >= message_consume_limit:
                            break
            else:
                self.logger.info(
                    f"No messages found for topic {topics}; closing consumer..."
                )
                break
    finally:
        consumer.close()

    return messages
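# A minimal usage sketch, assuming run() above lives on a Prefect-style Task
# subclass (the class name KafkaBatchConsume here is illustrative). Note that
# _PARTITION_EOF events are only reported when enable.partition.eof is set:
#
#   consume = KafkaBatchConsume()
#   msgs = consume.run(
#       bootstrap_servers="localhost:9092",
#       group_id="my-group",
#       topics=["my-topic"],
#       message_consume_limit=100,
#       kafka_configs={"enable.partition.eof": True},
#   )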
def main(bootstrap_servers, host, port, user, password):
    consumerConfiguration = {
        'bootstrap.servers': bootstrap_servers,
        'group.id': "elasticsearch",
        'session.timeout.ms': 30000,
        'auto.offset.reset': 'earliest'
    }
    consumer = confluent_kafka.Consumer(consumerConfiguration)
    consumer.subscribe(["scan", "ct", "tags"])

    # Elasticsearch configuration
    es = Elasticsearch([{'host': host, 'port': port}],
                       http_auth=(user, password),
                       timeout=60)

    actions = []

    try:
        while True:
            msg = consumer.poll(timeout=1.0)
            if msg is None:
                # No message received yet
                continue
            if msg.error():
                raise confluent_kafka.KafkaException(msg.error())

            topic = msg.topic()
            if topic == "scan":
                message = json.loads(msg.value())
                data = message['data']
                date = message['date']
                sha1 = message['sha1']
                raw = deep_get(
                    data,
                    'data.tls.result.handshake_log.server_certificates.certificate.raw',
                    "")
                tls_version = deep_get(
                    data,
                    'data.tls.result.handshake_log.server_hello.version.value',
                    "")
                tls_cipher_suite = deep_get(
                    data,
                    'data.tls.result.handshake_log.server_hello.cipher_suite.hex',
                    "")
                actions.append({
                    "_index": "certificates",
                    "_id": sha1,
                    "date": date,
                    "sha1": sha1,
                    "raw": raw,
                    "scan": True,
                })

                ip = deep_get(data, 'ip', "")
                md5 = deep_get(
                    data,
                    'data.tls.result.handshake_log.server_certificates.certificate.parsed.fingerprint_md5',
                    "")
                sha256 = deep_get(
                    data,
                    'data.tls.result.handshake_log.server_certificates.certificate.parsed.fingerprint_sha256',
                    "")
                actions.append({
                    "_index": "hosts_{date}".format(date=date),
                    "ip": ip,
                    "date": date,
                    "md5": md5,
                    "sha1": sha1,
                    "sha256": sha256,
                    "tls_version": tls_version,
                    "tls_cipher_suite": tls_cipher_suite,
                })
            elif topic == "ct":
                message = json.loads(msg.value())
                data = message['data']
                date = message['date']
                sha1 = message['sha1']
                try:
                    issuer_common_name = data["chain"][0]["subject"]["CN"]
                except (KeyError, IndexError):
                    issuer_common_name = ""
                subject_common_name = deep_get(data, 'leaf_cert.subject.CN', "")
                raw = deep_get(data, 'leaf_cert.as_der', "")
                actions.append({
                    "_index": "certificates",
                    "_id": sha1,
                    "date": date,
                    "sha1": sha1,
                    "raw": raw,
                    "ct": True,
                })
            elif topic == "tags":
                message = json.loads(msg.value())
                date = message['date']
                sha1 = message['sha1']
                tag = message['tag']
                comment = message['comment']
                actions.append({
                    "_index": "tags",
                    "date": date,
                    "sha1": sha1,
                    "tag": tag,
                    "comment": comment,
                })

            # Flush accumulated documents to Elasticsearch in bulk.
            if len(actions) > 1000:
                bulk(es, iter(actions))
                actions = []
    except KeyboardInterrupt:
        sys.stderr.write('Aborted by user\n')
    finally:
        consumer.close()
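# main() above assumes module-level imports (json, sys, confluent_kafka,
# Elasticsearch and bulk from the elasticsearch package) plus a deep_get
# helper for dotted-path lookups. A minimal deep_get sketch compatible with
# the calls above (illustrative, not the original implementation):
from functools import reduce


def deep_get(dictionary, keys, default=None):
    # Walk a dotted path like 'a.b.c' through nested dicts,
    # returning `default` as soon as any key is missing.
    return reduce(
        lambda d, key: d.get(key, default) if isinstance(d, dict) else default,
        keys.split('.'),
        dictionary)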
# Create consumer.
# This consumer will not join the group, but the group.id is required by
# committed() to know which group to get offsets for.
consumer = confluent_kafka.Consumer({'bootstrap.servers': brokers,
                                     'group.id': group})

print("%-50s %9s %9s" % ("Topic [Partition]", "Committed", "Lag"))
print("=" * 72)

for topic in sys.argv[3:]:
    # Get the topic's partitions
    metadata = consumer.list_topics(topic, timeout=10)
    if metadata.topics[topic].error is not None:
        raise confluent_kafka.KafkaException(metadata.topics[topic].error)

    # Construct TopicPartition list of partitions to query
    partitions = [confluent_kafka.TopicPartition(topic, p)
                  for p in metadata.topics[topic].partitions]

    # Query committed offsets for this group and the given partitions
    committed = consumer.committed(partitions, timeout=10)

    for partition in committed:
        # Get the partition's low and high watermark offsets.
        (lo, hi) = consumer.get_watermark_offsets(partition, timeout=10,
                                                  cached=False)

        if partition.offset == confluent_kafka.OFFSET_INVALID:
            offset = "-"
        else:
            offset = "%d" % (partition.offset)

        # Compute consumer lag from the committed offset and the high
        # watermark, and print one row per partition.
        if hi < 0:
            lag = "no hwmark"  # Topic has never been written to
        elif partition.offset < 0:
            # No committed offset: lag is the full message span
            lag = "%d" % (hi - lo)
        else:
            lag = "%d" % (hi - partition.offset)

        print("%-50s %9s %9s" %
              ("{} [{}]".format(partition.topic, partition.partition),
               offset, lag))

consumer.close()
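# Usage sketch for the lag reporter above: `brokers` and `group` are assumed
# to come from the command line (sys.argv[3:] implies argv[1] and argv[2]),
# so an invocation would look like (script name illustrative):
#
#   python list_offsets.py <brokers> <group> <topic1> [<topic2> ...]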
def ack(err, msg):
    if err:
        self._loop.call_soon_threadsafe(
            result.set_exception, confluent_kafka.KafkaException(err))
    else:
        self._loop.call_soon_threadsafe(result.set_result, msg)
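# ack() above closes over `self._loop` and a per-call asyncio Future named
# `result`. A minimal sketch of the surrounding producer wrapper this
# implies: an asyncio-friendly producer that bridges confluent_kafka's
# thread-based delivery callbacks back onto the event loop. Class and
# attribute names (AIOProducer, _poll_loop) are illustrative:
import asyncio
import threading

import confluent_kafka


class AIOProducer:
    def __init__(self, conf, loop=None):
        self._loop = loop or asyncio.get_event_loop()
        self._producer = confluent_kafka.Producer(conf)
        self._cancelled = False
        # Delivery callbacks fire on this poll thread, never on the event
        # loop thread, hence call_soon_threadsafe() in ack().
        self._poll_thread = threading.Thread(target=self._poll_loop)
        self._poll_thread.start()

    def _poll_loop(self):
        # Serve delivery callbacks until close() is called.
        while not self._cancelled:
            self._producer.poll(0.1)

    def close(self):
        self._cancelled = True
        self._poll_thread.join()

    def produce(self, topic, value):
        # Return a Future that resolves to the delivered Message,
        # or raises KafkaException on delivery failure.
        result = self._loop.create_future()

        def ack(err, msg):
            if err:
                self._loop.call_soon_threadsafe(
                    result.set_exception, confluent_kafka.KafkaException(err))
            else:
                self._loop.call_soon_threadsafe(result.set_result, msg)

        self._producer.produce(topic, value, on_delivery=ack)
        return result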
conf = {
    # ... (snippet begins mid-config; the preceding keys are not shown)
    'ssl.certificate.location': args.client_pem,
    'ssl.key.location': args.client_key,
}

c = confluent_kafka.Consumer(**conf)
c.subscribe([topic])

numOfRecords = 10

try:
    while numOfRecords > 0:
        msg = c.poll(timeout=1.0)
        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == confluent_kafka.KafkaError._PARTITION_EOF:
                sys.stderr.write('%s [%d] reached end at offset %d\n' %
                                 (msg.topic(), msg.partition(), msg.offset()))
            else:
                raise confluent_kafka.KafkaException(msg.error())
        else:
            sys.stderr.write('partition: %d, offset: %d, message: %s\n' %
                             (msg.partition(), msg.offset(), msg.value()))
            numOfRecords = numOfRecords - 1
except KeyboardInterrupt:
    sys.stderr.write('Aborted by user\n')

# Close down consumer to commit final offsets.
c.close()
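# For reference, a typical complete SSL consumer config for librdkafka-based
# clients looks like the sketch below; values are illustrative, and the keys
# actually elided from the snippet above are not known:
example_conf = {
    'bootstrap.servers': 'broker:9093',
    'group.id': 'example-group',
    'security.protocol': 'SSL',
    'ssl.ca.location': '/path/to/ca.pem',               # CA used to verify the broker
    'ssl.certificate.location': '/path/to/client.pem',  # client certificate
    'ssl.key.location': '/path/to/client.key',          # client private key
}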