def consumer(kafka_server: str, test_config: Dict, kafka_consumer_group: str) -> "Iterator[Consumer]":
    """Yield a connected Kafka consumer subscribed to all configured topics.

    Generator fixture: yields the consumer, then commits offsets and closes
    it on teardown.

    FIX: the return annotation was ``Consumer``, but this function is a
    generator (it ``yield``s), so it actually returns an iterator of
    ``Consumer``.
    """
    consumer = Consumer({
        "bootstrap.servers": kafka_server,
        "auto.offset.reset": "earliest",
        "enable.auto.commit": True,
        "group.id": kafka_consumer_group,
    })
    prefix = test_config["prefix"]
    # Subscribe to both the plain and the "_privileged" topic for every
    # configured object type.
    kafka_topics = [
        f"{prefix}.{object_type}"
        for object_type in test_config["object_types"]
    ] + [
        f"{prefix}_privileged.{object_type}"
        for object_type in test_config["privileged_object_types"]
    ]
    consumer.subscribe(kafka_topics)
    yield consumer
    # Explicitly perform the commit operation on the consumer before closing it
    # to avoid possible hang since confluent-kafka v1.6.0
    consumer.commit()
    consumer.close()
class Kafka(Consumer):
    """Kafka-backed consumer: polls one topic, deserializes each payload and
    hands it to ``process`` (inherited from the ``Consumer`` base), with
    optional requeue support."""

    def __init__(self, publisher, downloader, engine, incoming_topic,
                 group_id, bootstrap_servers, requeuer=None, **kwargs):
        super().__init__(publisher, downloader, engine)
        config = kwargs.copy()
        config["group.id"] = group_id
        config["bootstrap.servers"] = ",".join(bootstrap_servers)
        # Static group membership: default the instance id to the pod/host name.
        config["group.instance.id"] = kwargs.get("group.instance.id", os.environ.get("HOSTNAME"))
        self.auto_commit = kwargs.get("enable.auto.commit", True)
        self.consumer = ConfluentConsumer(config)
        self.consumer.subscribe([incoming_topic])
        log.info("subscribing to %s: %s", incoming_topic, self.consumer)
        # BUG FIX: the attribute was stored as `self.requerer` but used below
        # as `self.requeuer`, so every Requeue raised AttributeError. Store
        # under the correct name and keep the old one as an alias in case
        # external code reads it.
        self.requeuer = requeuer
        self.requerer = requeuer

    def deserialize(self, bytes_):
        """Decode a raw message payload; subclasses must implement."""
        raise NotImplementedError()

    def handles(self, input_msg):
        """Filter hook: return True if this consumer should process the message."""
        return True

    def run(self):
        """Poll forever, processing every non-empty, non-error message."""
        while True:
            msg = self.consumer.poll(1)
            if msg is None:
                continue
            err = msg.error()
            if err is not None:
                # With manual commits, commit error events too so the group
                # offset still advances past them.
                if not self.auto_commit:
                    self.consumer.commit(msg)
                log.exception(err)
                continue
            val = msg.value()
            if val is not None:
                try:
                    payload = self.deserialize(val)
                    if self.handles(payload):
                        self.process(payload)
                except Requeue as req:
                    if not self.requeuer:
                        raise Exception(
                            "Requeue request with no requerer configured.")
                    self.requeuer.requeue(val, req)
                except Exception as ex:
                    log.exception(ex)
                finally:
                    if not self.auto_commit:
                        self.consumer.commit(msg)
class KSubscriber(Subscriber, threading.Thread):
    """Daemon thread that consumes a Kafka topic and prints each message."""

    def __init__(self, arg_parser: ArgsParser, killer: Killer):
        threading.Thread.__init__(self, name='kafka subscriber thread', daemon=True)
        self.killer = killer
        self.topic = arg_parser.topic
        config = {
            'bootstrap.servers': ','.join(arg_parser.brokers),
            'client.id': CLIENT_ID,
            'group.id': GROUP_ID,
            'auto.offset.reset': 'earliest'
        }
        if arg_parser.auth:
            config.update({
                'security.protocol': 'SASL_PLAINTEXT',
                'sasl.mechanism': 'PLAIN',
                'sasl.username': arg_parser.username,
                'sasl.password': arg_parser.password
            })
        self.kafka_subscriber = Consumer(config)

    def subscribe(self, topic):
        """Consume `topic` until the killer fires, committing each message."""
        def on_assign(consumer, partitions):
            log.info('subscribed')

        try:
            self.kafka_subscriber.subscribe([topic], on_assign=on_assign)
            while True:
                message = self.kafka_subscriber.poll(timeout=1.0)
                if self.killer.killed:
                    break
                if message is None:
                    continue
                if message.error():
                    log.error('read message error')
                    # FIX: previously fell through and committed/printed the
                    # error event as if it were a real message.
                    continue
                # commit message
                self.kafka_subscriber.commit(asynchronous=False)
                log.info('received message from topic {t}'.format(t=message.topic()))
                print(message.value().decode('utf-8'))
        except Exception:
            # FIX: was a bare `except:`, which also swallows SystemExit /
            # KeyboardInterrupt.
            log.error('error subscribing to brokers')
        self.close()

    def run(self):
        log.info('start kafka subscriber inside thread {tn}'.format(tn=self.name))
        self.subscribe(self.topic)

    def close(self):
        if self.kafka_subscriber is not None:
            self.kafka_subscriber.close()
def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})
    c.subscribe(["test"])
    c.unsubscribe()
    c.close()
    # Every public Consumer method must now raise "Consumer closed".
    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert ex.match('Consumer closed')
    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert ex.match('Consumer closed')
def _consume(group_id, topic, n, max_messages):
    """Consume up to `max_messages` from `topic` in batches of `n`,
    committing synchronously after each processed batch.

    Returns when `max_messages` have been seen or a partition EOF is reached.
    """
    config = {
        "bootstrap.servers": "localhost:9094",
        "group.id": group_id,
        "auto.offset.reset": "beginning",
        "enable.partition.eof": "true",
        "enable.auto.commit": "false",
    }
    consumer = Consumer(config)
    try:
        consumer.subscribe(topics=[topic])
        messages = 0
        while True:
            if messages == max_messages:
                return
            batch = consumer.consume(num_messages=n, timeout=5)
            if not batch:
                continue
            for m in batch:
                if m.error():
                    if m.error().code() == KafkaError._PARTITION_EOF:
                        return
                    # FIX: was a redundant `elif m.error():` — inside this
                    # branch the error is already known to be set.
                    raise KafkaException(m.error())
                messages += 1
                if messages == max_messages:
                    break
            consumer.commit(asynchronous=False)
    finally:
        # FIX: the consumer was leaked on every exit path (no close()).
        consumer.close()
def test_any_method_after_close_throws_exception():
    """ Calling any consumer method after close should throw a RuntimeError """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})
    c.subscribe(["test"])
    c.unsubscribe()
    c.close()
    # Every public Consumer method must now raise exactly "Consumer closed".
    with pytest.raises(RuntimeError) as ex:
        c.subscribe(['test'])
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.unsubscribe()
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.poll()
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.consume()
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.assign([TopicPartition('test', 0)])
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.unassign()
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.assignment()
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.commit()
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.committed([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.position([TopicPartition("test", 0)])
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        c.seek([TopicPartition("test", 0, 0)])
    assert 'Consumer closed' == str(ex.value)
    with pytest.raises(RuntimeError) as ex:
        lo, hi = c.get_watermark_offsets(TopicPartition("test", 0))
    assert 'Consumer closed' == str(ex.value)
def run_consumer(queue, msg_handler):
    # Consume messages from `queue` forever, passing each non-error message
    # to `msg_handler` and committing after it has been handled.
    consumer = Consumer({
        'bootstrap.servers': os.environ.get("KAFKA", "localhost:9092"),
        'group.id': 'manager',
        'auto.offset.reset': 'earliest'  # earliest _committed_ offset
    })
    # Block until the topic is visible in cluster metadata.
    _wait_for_topic_to_exist(consumer, queue)
    logging.info("Subscribing to topic: %s", queue)
    consumer.subscribe([queue])
    while True:
        logging.debug("Waiting for messages in %r...", queue)
        # NOTE(review): poll() with no timeout blocks indefinitely, so the
        # `msg is None` / "Poll timed out" branch below appears unreachable —
        # confirm whether a finite timeout was intended.
        msg = consumer.poll()
        if msg is None:
            logging.warning("Poll timed out")
            break
        logging.info("Consuming Kafka message: %r", msg.key())
        if msg.error():
            logging.warning("Consumer error: {}".format(msg.error()))
            continue
        msg_handler(msg)
        # Commit only after the handler succeeded (error path skips commit
        # via `continue` above).
        consumer.commit()
def run_communication_consumer(communication_handler):
    """Consume communication messages forever, handing each payload to
    `communication_handler` and committing after it was handled."""
    conf = {'bootstrap.servers': config['kafka']['servers'],
            'group.id': "communication",
            'auto.offset.reset': 'earliest',
            'enable.auto.commit': 'false'}
    consumer = Consumer(conf)
    print('[+] Listening for communication messages')
    try:
        consumer_topics = [config['kafka']['communication-topic']]
        consumer.subscribe(consumer_topics)
        while True:
            msg = consumer.poll(timeout=1.0)
            if msg is None:
                continue
            if msg.error():
                raise KafkaException(msg.error())
            print('[+] Communication message received')
            print(msg.value())
            communication_handler.handle_run_communication(msg.value())
            # FIX: the offset was committed *before* the handler ran; with
            # auto-commit disabled the evident intent is at-least-once
            # delivery, so commit only after successful handling.
            consumer.commit(asynchronous=False)
    finally:
        consumer.close()
class FinalCallReport:
    """
    If CRM system send two reports per call CallReport class will handle
    the first one. FinalCallReport will handle the final report.
    """

    def __init__(self, group=None, consumer_topic=None, producer_topic=None):
        self.producer = Producer({
            "bootstrap.servers": "",
            "security.protocol": "SASL_SSL",
            "sasl.mechanisms": "PLAIN",
            "sasl.username": "",
            "sasl.password": ""
        })
        self._consumer = Consumer({
            "bootstrap.servers": "",
            "security.protocol": "SASL_SSL",
            "sasl.mechanisms": "PLAIN",
            "sasl.username": "",
            "sasl.password": "",
            'group.id': group,
            'enable.auto.commit': False,
            'auto.offset.reset': 'earliest'
        })
        self._consumer.subscribe([consumer_topic])
        self._producer_topic = producer_topic

    @abstractmethod
    def get_call_report(self):
        """ Implementation for stream case """
        while True:
            msg = self._consumer.poll(0.1)
            if msg is None:
                continue
            elif not msg.error():
                # Received message
                # NOTE(review): the offset is committed *before* the report is
                # forwarded, so a failure in send_call_report loses the
                # message — confirm at-most-once is intended here.
                self._consumer.commit()
                self.send_call_report(msg.value())
            elif msg.error().code() == KafkaError._PARTITION_EOF:
                logging.info('End of partition reached {}/{}'.format(
                    msg.topic(), msg.partition()))
            else:
                logging.error('Error occurred: {}'.format(msg.error().str()))

    def send_call_report(self, report):
        """
        Depends on dialer API options
        :param report:
        """
        # FIX: the original wrapped produce() in `while True`, re-sending the
        # same report forever; produce once, then poll(0) to serve the
        # delivery callback.
        self.producer.produce(self._producer_topic, report,
                              # FIX: `KafkaUtils.self.delivery_report` raised
                              # AttributeError — presumably the class-level
                              # helper `KafkaUtils.delivery_report` was meant
                              # (TODO confirm against KafkaUtils).
                              callback=KafkaUtils.delivery_report)
        self.producer.poll(0)

    def terminate(self):
        """Flush any outstanding produced messages before shutdown."""
        self.producer.flush()
class PushRecommendations:
    """Consumes OptimalQ recommendation messages and forwards non-empty lead
    lists to the dialer via the subclass-provided `push_recommendations`."""

    def __init__(self, group, recommedations_topic):
        # Shared SASL/SSL connection settings for both clients.
        sasl_settings = {
            "bootstrap.servers": "",
            "security.protocol": "SASL_SSL",
            "sasl.mechanisms": "PLAIN",
            "sasl.username": "",
            "sasl.password": ""
        }
        self._producer = Producer(dict(sasl_settings))
        self._consumer = Consumer({
            **sasl_settings,
            'group.id': group,
            'enable.auto.commit': True,
            'auto.offset.reset': 'earliest'
        })
        self._consumer.subscribe([recommedations_topic])

    def start(self):
        """
        Get messages from push_recommendations_topic.
        If we get recommendations then call self.push_recommendations()
        :return:
        """
        while True:
            msg = self._consumer.poll(0.1)
            if msg is None:
                continue
            err = msg.error()
            if err:
                if err.code() == KafkaError._PARTITION_EOF:
                    logging.info('End of partition reached {}/{}'.format(
                        msg.topic(), msg.partition()))
                else:
                    logging.error('Error occurred: {}'.format(err.str()))
                continue
            # Received message: decode and forward when leads are present.
            payload = json.loads(msg.value())
            if len(payload['response']['leads']) == 0:
                logging.error('Got 0 optimal leads from OptimalQ')
                continue
            self.push_recommendations(payload)
            self._consumer.commit()

    @abstractmethod
    def push_recommendations(self, recommendations):
        """
        Get OptimalQ recommendations and push them to the dialer,
        depends on dialer api options.
        :param recommendations:
        """
        pass

    def terminate(self):
        """Flush any outstanding produced messages before shutdown."""
        self._producer.flush()
def worker():
    """Consume `topic`, holding each message until it is at least
    `delay_timedelta` old (issuing keep-alive commits while waiting), then
    process it and store its offset."""
    global consumers
    consumer = Consumer({'bootstrap.servers': bootstrap_servers,
                         'group.id': consumer_group,
                         'client.id': client_id,
                         'default.topic.config': {'auto.offset.reset': 'earliest'},
                         'enable.auto.offset.store': False,
                         'session.timeout.ms': session_timeout_ms})
    consumers.append(consumer)
    consumer.subscribe([topic])
    while True:
        msg = consumer.poll(0)
        thread_name = threading.current_thread().name
        # FIX: was `if msg == None or not msg:` — use an identity check;
        # Message objects are always truthy, so `not msg` was dead code.
        if msg is None:
            continue
        if not msg.error():
            msg_timestamp = datetime.fromtimestamp(msg.timestamp()[1] / 1000.0)
            keep_alive_counter = 0
            now = datetime.now()
            # loop/sleep to delay the message
            while now < msg_timestamp + delay_timedelta:
                keep_alive_counter = keep_alive_counter + 1
                msg_timestamp_with_delta = msg_timestamp + delay_timedelta
                diff1 = msg_timestamp_with_delta - now
                diff_seconds = diff1.total_seconds()
                if keep_alive_counter <= 1:
                    logging.info("[%s] %s | received message on partition=%d, delaying for %fs" % (
                        thread_name, now.isoformat(), msg.partition(), diff_seconds))
                # sleep for {min_sleep_seconds}s...{kafka_keep_alive_seconds}s
                sleep_seconds = min(kafka_keep_alive_seconds, max(min_sleep_seconds, diff_seconds))
                # use as 'keep alive' feedback for low (no) traffic periods... to avoid connections
                # getting dropped by brokers - resulting in a group rebalance
                logging.debug(
                    "[%s] %s | kafka keep alive commit partition=%d" % (thread_name, now.isoformat(), msg.partition()))
                consumer.commit(
                    offsets=[TopicPartition(topic=msg.topic(), partition=msg.partition(), offset=OFFSET_STORED)])
                # go to sleep
                logging.debug("[%s] %s | going to sleep for %fs / lag: %fs" % (
                    thread_name, now.isoformat(), sleep_seconds, diff_seconds))
                sleep(sleep_seconds)
                now = datetime.now()
            process(thread_name, msg)
            consumer.store_offsets(msg)
        elif msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
            logging.error("kafka consumer error: %s" % msg.error())
def test_basic_api():
    """ Basic API tests, these won't really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    partitions = list(map(lambda p: TopicPartition("test", p), range(0, 100, 3)))
    kc.assign(partitions)
    kc.unassign()

    # FIX: `async` became a reserved keyword in Python 3.7, so
    # `kc.commit(async=True)` is a SyntaxError; confluent-kafka >= 1.0
    # renamed the parameter to `asynchronous`.
    kc.commit(asynchronous=True)
    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == -1001]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    kc.close()
def set_consumer_offsets(self, offsets: List[Offset]):
    """Commit the given offsets, grouped per consumer group.

    A short-lived Consumer is created for each group (commit requires group
    membership configuration) and is always closed afterwards.
    """
    grouped_offsets = ConfluentAdminClient.group_offsets_by_consumer_group(
        offsets)
    for consumer_group, group_offsets in grouped_offsets.items():
        consumer = Consumer({**self.config, 'group.id': consumer_group})
        try:
            tps = [TopicPartition(o.topic, o.partition, o.value)
                   for o in group_offsets]
            logger.info(f'Set {len(tps)} offsets for consumer '
                        f'group: {consumer_group}')
            consumer.commit(offsets=tps, asynchronous=False)
        finally:
            # FIX: the per-group consumer was never closed (resource leak).
            consumer.close()
class Kafka():
    """Yaml-configured Kafka consumer wrapper.

    NOTE(review): reconstructed formatting — the control flow around
    `consume()` (early return, tail-recursion) looks suspect; see the inline
    notes before changing behavior.
    """

    def __init__(self, topic_name, group_id, auto_offset_reset):
        # Broker address comes from the yaml config file at config_file_path.
        with open(config_file_path) as kafka_conf:
            self.conf = yaml.load(kafka_conf, Loader=yaml.FullLoader)
        self.group_id = group_id
        self.topic_name = topic_name
        self.auto_offset_reset = auto_offset_reset
        # Loop flag read by consume(); cleared by stop_consume().
        self.running_consumer = True
        self.c = Consumer({
            'bootstrap.servers': self.conf['bootstrap_servers'],
            'group.id': self.group_id,
            'auto.offset.reset': self.auto_offset_reset
        })
        self.c.subscribe([self.topic_name])
        print(self.c.list_topics())

    def consume(self):
        # self.batch_size = batch_size
        while self.running_consumer:
            # NOTE(review): `a` is reset to 0 on every loop iteration, so the
            # `a % 10 == 0` stop condition below can never trigger after a
            # single message — presumably `a` was meant to persist across
            # iterations; confirm intent.
            a = 0
            msg = self.c.poll(1.0)
            if msg is None:
                # empty = Log("Empty")
                # empty.write("Empty message!","kafka")
                print("empty message!")
                # Substitute a placeholder payload when polling times out.
                msg = "empty".encode('utf-8')
                # if a%10 == 0:
                #     break
                # if msg.error():
                #     err = Log("Error")
                #     err.write(msg.error(),"kafka")
                # print(msg.value().decode('utf-8'))
            else:
                a += 1
                msg = msg.value().decode('utf-8')
                print("message is : {}".format(msg))  # .decode('utf-8')))
                self.c.commit()
                if a % 10 == 0:
                    self.running_consumer = False
                # return msg
            # self.c.close()
            # NOTE(review): returns on the first iteration, so the `while`
            # above never loops and the recursive self.consume() below is
            # only reached when running_consumer is already False.
            return msg
        self.consume()

    def stop_consume(self):
        # NOTE(review): clears the flag, waits, then calls consume() again —
        # which re-enters with running_consumer False; confirm intent.
        self.running_consumer = False
        time.sleep(10)
        self.consume()
def consume_loop():
    """Consume from TOPICS forever, logging message metadata and manually
    committing every MIN_COMMIT_COUNT processed messages."""
    # Consumer configuration
    # See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
    conf = {
        'bootstrap.servers': KAFKA,
        'group.id': GROUP,
        'session.timeout.ms': SESSION_TIMEOUT,
        'auto.offset.reset': AUTO_OFFSET_RESET,
        'on_commit': commit_completed
    }

    # Create Consumer instance. Logs will be emitted when poll() is called
    c = Consumer(conf, logger=logger)

    def print_assignment(_, partitions):
        logger.info('Assignment: {}'.format(partitions))

    # Subscribe to topics
    c.subscribe(TOPICS, on_assign=print_assignment)

    # Read messages from Kafka, print to stdout
    try:
        msg_count = 0
        while True:
            msg = c.poll(timeout=TIMEOUT)
            if msg is None:
                continue

            if msg.error():
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    sys.stderr.write(
                        '%% %s [%d] reached end at offset %d\n' %
                        (msg.topic(), msg.partition(), msg.offset()))
                else:
                    # FIX: was `elif msg.error():`, which is always true
                    # inside this branch — a plain else states the intent.
                    raise KafkaException(msg.error())
            else:
                # Proper message
                sys.stderr.write('%% %s [%d] at offset %d with key %s:\n' %
                                 (msg.topic(), msg.partition(), msg.offset(),
                                  str(msg.key())))
                msg_process(msg)
                msg_count += 1
                # Manually commit every MIN_COMMIT_COUNT messages
                if msg_count % MIN_COMMIT_COUNT == 0:
                    c.commit()
    except KeyboardInterrupt:
        sys.stderr.write('%% Aborted by user\n')
    finally:
        # Close down consumer to commit final offsets.
        c.close()
def repl():
    """Consume build/user/decoration events and apply each to Redis,
    re-subscribing on handler failure."""
    c = Consumer(settings)
    c.subscribe(topics)
    # Dispatch table: topic name -> handler function.
    handlers = {
        u'add_build': add_build,
        u'delete_build': delete_build,
        u'add_user': add_user,
        u'delete_user': delete_user,
        u'add_build_component': add_build_component,
        u'remove_build_component': remove_build_component,
        u'add_decoration': add_decoration,
        u'remove_decoration': remove_decoration,
        u'remove_all_decorations': remove_all_decorations,
    }
    try:
        while True:
            if not red.ping():
                time.sleep(1)
                continue
            msg = c.poll(0.1)
            # No message present
            if msg is None:
                continue
            # Found message
            elif not msg.error():
                handler = handlers.get(msg.topic())
                # FIX: `result` was unbound (NameError) when the topic matched
                # none of the original if/elif branches; treat an unknown
                # topic as a failure.
                result = handler(msg.value()) if handler else False
                if result:
                    pprint('Success ' + msg.value())
                    c.commit()
                else:
                    c.unsubscribe()
                    c.subscribe(topics)
                    print('Error Occurred Adding to Redis')
            elif msg.error().code() == KafkaError._PARTITION_EOF:
                print('End of partition reached {0}/{1}'.format(
                    msg.topic(), msg.partition()))
            else:
                print('Error occurred: {0}'.format(msg.error().str()))
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        c.close()
class Consume:
    """Kafka consumer that persists each message to a database and yields the
    decoded payloads, committing offsets every `min_commit_count` messages."""

    def __init__(self, topics, min_commit_count, persist_object):
        conf = {
            'bootstrap.servers': "localhost:9092",
            'group.id': "bike_theft",
            'default.topic.config': {
                'auto.offset.reset': 'smallest'
            },
            # Called by librdkafka whenever an offset commit completes.
            'on_commit': self.commit_completed
        }
        self.consumer = Consumer(conf)
        self.topics = topics
        self.min_commit_count = min_commit_count
        # Object providing insert(msg); used by db_persist below.
        self.db = persist_object

    @staticmethod
    def commit_completed(err, partitions):
        # Commit callback: report failure or the committed partition offsets.
        if err:
            print(str(err))
        else:
            print("Committed partition offsets: " + str(partitions))

    def db_persist(self, msg):
        # Insert the decoded message; returns whatever db.insert returns
        # (presumably the inserted row/document id — confirm against db).
        return self.db.insert(msg)

    def consume_loop(self):
        """Generator: consume forever, persist each message, yield its
        decoded payload. Commits synchronously every min_commit_count
        messages; closes the consumer when the generator is closed."""
        try:
            self.consumer.subscribe(self.topics)
            msg_count = 0
            while True:
                msg = self.consumer.poll(timeout=1.0)
                print(msg)
                if msg is None:
                    continue
                if msg.error():
                    if msg.error().code() == KafkaError._PARTITION_EOF:
                        sys.stderr.write(
                            '%% %s [%d] reached end at offset %d\n' %
                            (msg.topic(), msg.partition(), msg.offset()))
                    elif msg.error():
                        raise KafkaException(msg.error())
                else:
                    decoded_msg = msg.value().decode("utf-8")
                    inserted_id = self.db_persist(decoded_msg)
                    msg_count += 1
                    # Synchronous batch commit to bound re-delivery.
                    if msg_count % self.min_commit_count == 0:
                        self.consumer.commit(asynchronous=False)
                    yield decoded_msg
        finally:
            self.consumer.close()
def main(args):
    """Consume `args.topic`, printing each message and optionally decoding it
    with a marshmallow UserSchema; commits after each processed message."""

    def _on_assign(consumer, partitions):
        """
        If force-beginning is True, force Kafka to read all stored messages
        :param consumer:
        :param partitions:
        :return:
        """
        print(partitions)
        if args.force_beginning:
            for p in partitions:
                p.offset = OFFSET_BEGINNING
        consumer.assign(partitions)

    conf = read_config()
    kafka_config = gen_kafka_config(conf)
    kafka_config['auto.offset.reset'] = args.offset
    kafka_config['group.id'] = args.consumer_id
    if args.debug:
        print("Kafka configuration:")
        print(json.dumps(kafka_config, indent=4))
    consumer = Consumer(kafka_config)
    consumer.subscribe([args.topic], on_assign=_on_assign)
    schema = None
    if args.schema in ['marshmallow', 'marshmallow-extended']:
        schema = UserSchema()
    while True:
        message = consumer.poll(1)
        if message is None:
            continue
        # FIX: error events were previously treated as payloads, which would
        # break on .decode(); report and skip them instead.
        if message.error():
            print("Consumer error: ", message.error())
            continue
        print("Raw message: ", message.value().decode('UTF-8'))
        if args.schema == 'marshmallow':
            user = schema.loads(message.value().decode('UTF-8')).data
            print(user)
        elif args.schema == 'marshmallow-extended':
            # Extended envelope: {"schema": {"name", "version"}, "data": ...}
            buffer = json.loads(message.value().decode('UTF-8'))
            schema_name = buffer['schema']['name']
            schema_version = buffer['schema']['version']
            print("Schema name: ", schema_name, " version: ", schema_version)
            if schema_name == 'UserSchema':
                user = schema.load(buffer['data']).data
                print(user)
        consumer.commit()
def main():
    """Consume 'test-topic' forever, printing each message and manually
    committing its offset."""
    consumer = Consumer({
        'bootstrap.servers': 'localhost:9092',
        'group.id': 'HuMan-1',
        # FIX: the key was misspelled 'enable-autocommit', which librdkafka
        # rejects as an unknown property; the evident intent (manual
        # commit below) is to disable auto-commit.
        'enable.auto.commit': False,
        'default.topic.config': {
            'auto.offset.reset': 'earliest'
        }
    })
    consumer.subscribe(['test-topic'])
    while True:
        message = consumer.poll(1.0)
        if message is None:
            continue
        # FIX: error events were previously printed/committed as payloads.
        if message.error():
            print(f"Consumer error: {message.error()}")
            continue
        print(f"Message {message.value()}")
        consumer.commit()
class Kafka(Consumer):
    """Consumer that reads a single Kafka topic, deserializes each payload
    and hands it to ``process`` (inherited from the ``Consumer`` base)."""

    def __init__(self, publisher, downloader, engine, incoming_topic,
                 group_id, bootstrap_servers, **kwargs):
        super().__init__(publisher, downloader, engine)
        config = kwargs.copy()
        config["group.id"] = group_id
        config["bootstrap.servers"] = ",".join(bootstrap_servers)
        # NOTE(review): config may carry credentials (e.g. sasl.password) —
        # confirm it is safe to log in full here.
        log.info("config", extra={"config": config})
        self.auto_commit = kwargs.get("enable.auto.commit", True)
        self.consumer = ConfluentConsumer(config)
        self.consumer.subscribe([incoming_topic])
        log.info("subscribing to %s: %s", incoming_topic, self.consumer)

    def deserialize(self, bytes_):
        """Decode a raw message payload; subclasses must implement."""
        raise NotImplementedError()

    def handles(self, input_msg):
        """Filter hook: return True if this consumer should process the message."""
        return True

    def run(self):
        """Poll forever, processing every non-empty, non-error message."""
        while True:
            msg = self.consumer.poll(1)
            if msg is None:
                continue
            err = msg.error()
            if err is not None:
                # FIX (resolves the TODO): with manual commits, commit error
                # events too so the group offset still advances past them —
                # matching the sibling requeue-capable Kafka consumer class.
                if not self.auto_commit:
                    self.consumer.commit(msg)
                log.exception(err)
                continue
            val = msg.value()
            if val is not None:
                try:
                    payload = self.deserialize(val)
                    if self.handles(payload):
                        self.process(payload)
                except Exception as ex:
                    log.exception(ex)
                finally:
                    if not self.auto_commit:
                        self.consumer.commit(msg)
def consume_record(lines: list):
    """Consume weather-observation messages into `lines`, flushing (and
    committing) once the buffer exceeds 1000 entries."""
    consumer = Consumer(consumer_config)
    consumer.subscribe([
        "observations.weather.multivariate",
        "observations.weather.municipality"
    ])
    while True:
        try:
            message = consumer.poll(1)
        except Exception as e:
            print(f"Exception while trying to poll messages - {e}")
            exit(-1)
        else:
            if message is None:
                continue
            # FIX: error events were previously buffered as data.
            if message.error():
                print(f"Consumer error - {message.error()}")
                continue
            to_buffer(lines, message)
            # Commit only after a successful flush of the buffer.
            if len(lines) > 1000 and flush_buffer(lines):
                consumer.commit()
class KafkaStream:
    """Async wrapper around a blocking confluent-kafka consumer: poll() runs
    in an executor; each decoded JSON message is passed to `handler`."""

    def __init__(self, topic):
        # Connect the Kafka consumer.
        try:
            topics = [topic]
            kafka_config = KAFKA_CONSUMER_DEFAULT_CONFIG
            self.consumer = Consumer(kafka_config)
            self.consumer.subscribe(topics)
        except Exception as e:
            logger.error(f"fail to init kafka consumer.[{topic}][{e}]")
            sys.exit(f"fail to init kafka consumer.[{topic}]")

    async def handler(self, data):
        """Override point: process one decoded message; return truthy to
        commit its offset."""
        pass

    async def read_stream(self):
        # Listen to kafka.
        try:
            while True:
                loop = asyncio.get_event_loop()
                # poll() blocks, so run it in the default executor.
                message = await loop.run_in_executor(None, self.consumer.poll)
                if message is None:
                    continue
                if message.error():
                    logger.exception(message.error())
                    raise KafkaException(message.error())
                data = json.loads(message.value().decode("utf-8"))
                try:
                    finish = await self.handler(data)
                except Exception as e:
                    logger.info(f"handler解析出现异常[{e}]")
                    # FIX: time.sleep() would block the whole event loop
                    # inside a coroutine; use asyncio.sleep instead.
                    await asyncio.sleep(1)
                    continue
                if finish:
                    # FIX: the keyword was misspelled `asynchronus`, which
                    # raises TypeError at runtime. Async commit: no need to
                    # wait for the commit callback to fire.
                    self.consumer.commit(asynchronous=True)
        except Exception as e:
            logger.error(f"kafka error.[{e}]")
            return
def consume_msg(self):
    """Consume 'merto_mart' messages, turn each comma-separated payload into
    a one-row DataFrame (plus today's date) and push it to BigQuery,
    committing the offset after each successful push."""
    c = Consumer({'bootstrap.servers': "bootstrap_server1,server2......",
                  'group.id': "foo",
                  "session.timeout.ms": 6000,
                  # Consumer starts consuming either ealiest offset or latest offset.
                  'auto.offset.reset': 'latest'})
    c.subscribe(['merto_mart'])
    try:
        while True:
            msg = c.poll(1.0)
            if msg is None:
                continue
            if msg.error():
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    continue
                else:
                    print(msg.error())
                    break
            print('Received message: {}, message offset: {}, topicname: {}'.format(
                msg.value().decode('utf-8'), msg.offset(), msg.topic()))
            fields = list()
            for i in str(msg.value().decode('utf-8')).split(","):
                print(i)
                fields.append(i)
            # FIX: DataFrame.append() was removed in pandas 2.0 — build the
            # one-row frame directly instead of appending a Series.
            df = pd.DataFrame([fields], columns=["name", "surname", "age"])
            df["date"] = self.today
            # FIX: the original tested df.shape[1] (column count), which is
            # never 0 here; the intent is to skip empty payloads, i.e. test
            # the row count.
            if df.shape[0] != 0:
                self.biqquery_manager.push_to_bq(df, schema, 'kafka_test')
                print("pushed succesfully")
            c.commit()
    except KeyboardInterrupt:
        sys.stderr.write('%% Aborted by user\n')
    finally:
        c.close()
def test_on_commit():
    """ Verify that on_commit is only called once per commit() (issue #71) """

    class CommitState(object):
        def __init__(self, topic, partition):
            self.topic = topic
            self.partition = partition
            # Flips to False after the first callback; a second callback
            # for the same commit would trip the assertion below.
            self.once = True

    def _on_commit(state, err, parts):
        print('on_commit: err %s, partitions %s' % (err, parts))
        assert state.once is True
        assert err == KafkaError._NO_OFFSET
        assert len(parts) == 1
        part = parts[0]
        assert part.topic == state.topic
        assert part.partition == state.partition
        state.once = False

    state = CommitState('test', 2)

    consumer = Consumer({
        'group.id': 'x',
        'enable.auto.commit': False,
        'socket.timeout.ms': 50,
        'session.timeout.ms': 100,
        'on_commit': lambda err, ps: _on_commit(state, err, ps)
    })

    consumer.assign([TopicPartition(state.topic, state.partition)])

    for _ in range(1, 3):
        consumer.poll(0.1)
        if state.once:
            # Try commit once
            try:
                consumer.commit(asynchronous=False)
            except KafkaException as e:
                print('commit failed with %s (expected)' % e)
                assert e.args[0].code() == KafkaError._NO_OFFSET

    consumer.close()
async def consume_events(topic, group, brokers, callback, schema=None, registry=None, delay=0.01, **kwargs):
    """
    Connect to the Kafka endpoint and start consuming
    messages from the given `topic`.
    The given callback is applied on each message.
    """
    global consumer
    if topic in consumers:
        raise RuntimeError("A consumer already exists for topic: %s" % topic)
    # NOTE(review): `serializer` is only bound when this condition holds, yet
    # it is used below whenever `registry` is truthy — confirm the globals
    # registry_serializer/registry_client cover the other case.
    if (not registry_serializer or not registry_client) and registry:
        r_client, serializer = create_registry_client(registry)
    consumer = Consumer({'bootstrap.servers': brokers, 'group.id': group,
                         'default.topic.config': {'auto.offset.reset': 'largest'}})
    consumer.subscribe([topic])
    consumers[topic] = consumer
    try:
        while True:
            message = consumer.poll(1)
            if message:
                if not message.error():
                    if registry:
                        message = serializer.decode_message(message.value())
                    else:
                        message = message.value()
                    await callback(message)
                    consumer.commit()
            else:
                # Yield to the event loop while the topic is idle.
                await asyncio.sleep(delay)
    except KafkaException:
        # Deliberate best-effort: Kafka errors end the loop silently.
        pass
    finally:
        # FIX: close() previously lived in the `else` clause of the try,
        # which is unreachable (the loop only exits by raising), so the
        # consumer was never closed.
        consumer.close()
        consumers.pop(topic, None)
def test_on_commit():
    """ Verify that on_commit is only called once per commit() (issue #71) """

    class CommitState(object):
        def __init__(self, topic, partition):
            self.topic = topic
            self.partition = partition
            self.once = True

    def verify_commit(tracker, err, partition_list):
        # The callback must fire exactly once per commit().
        print('on_commit: err %s, partitions %s' % (err, partition_list))
        assert tracker.once is True
        assert err == KafkaError._NO_OFFSET
        assert len(partition_list) == 1
        first = partition_list[0]
        assert first.topic == tracker.topic
        assert first.partition == tracker.partition
        tracker.once = False

    tracker = CommitState('test', 2)

    conf = {'group.id': 'x',
            'enable.auto.commit': False,
            'socket.timeout.ms': 50,
            'session.timeout.ms': 100,
            'on_commit': lambda err, ps: verify_commit(tracker, err, ps)}
    c = Consumer(conf)

    c.assign([TopicPartition(tracker.topic, tracker.partition)])

    for _attempt in (1, 2):
        c.poll(0.1)
        if not tracker.once:
            continue
        # Try commit once
        try:
            c.commit(asynchronous=False)
        except KafkaException as e:
            print('commit failed with %s (expected)' % e)
            assert e.args[0].code() == KafkaError._NO_OFFSET

    c.close()
def start_consumer():
    """Poll alert messages forever, ingest each one, and commit its offset
    (manual commits: enable.auto.commit is 'false')."""
    logger.info('Starting consumer', extra={'tags': {
        'group_id': GROUP_ID
    }})
    consumer = Consumer({
        'bootstrap.servers': f'{PRODUCER_HOST}:{PRODUCER_PORT}',
        'group.id': GROUP_ID,
        'auto.offset.reset': 'earliest',
        'queued.max.messages.kbytes': 100000,
        'enable.auto.commit': 'false',
        'on_commit': on_commit
    })
    current_date = update_topic_list(consumer)
    while True:
        # NOTE(review): this wall-clock modulo check can fire on several
        # consecutive iterations within the same aligned second, or be
        # skipped entirely if no iteration lands in it — a "next refresh at"
        # timestamp would be more reliable; confirm intent.
        if int(time.time()) % 300 == 0:
            current_date = update_topic_list(consumer, current_topic_date=current_date)
        msg = consumer.poll(1)
        if msg is None:
            continue
        if msg.error():
            logger.error('Consumer error: {}'.format(msg.error()))
            continue
        process_start_time = datetime.now()
        # Payload is forwarded base64-encoded to the ingest step.
        alert = base64.b64encode(msg.value()).decode('utf-8')
        logger.info('Received alert from stream')
        success, candid = do_ingest(alert)
        logger.info('Finished processing message from {topic} with offset {offset}'.format(
            topic=msg.topic() + '-' + str(msg.partition()),
            offset=msg.offset()),
            extra={'tags': {
                'candid': candid,
                'success': success,
                'record_processing_time': (datetime.now() - process_start_time).total_seconds(),
                'processing_latency': datetime.now().timestamp() - msg.timestamp()[1]/1000
            }}
        )
        # Commit the offset of the message just processed.
        consumer.commit(msg)
    # NOTE(review): unreachable — the loop above never breaks, so the
    # consumer is never closed.
    consumer.close()
class Listener:
    """Listens for PolyEncoders training-completed events and hot-swaps the
    serving ApplicationService accordingly."""

    def __init__(self, kafka_server, topic):
        self.topic = topic
        self.kafka_server = kafka_server
        self.consumer = Consumer({
            'bootstrap.servers': kafka_server,
            'group.id': "poly_encoders_server",
            'enable.auto.commit': False,
            'auto.offset.reset': 'earliest',
            # Refresh metadata frequently so new partitions/topics are seen.
            'metadata.max.age.ms': 10000
        })
        self.consumer.subscribe([topic])

    def listen(self):
        """Consume forever; for each batch, build the model from the newest
        event and commit up to it."""
        logger.info(f"Listening on topic: {self.topic}")
        while True:
            msgs = self.consumer.consume(500, timeout=1)
            if msgs is None or len(msgs) == 0:
                continue
            # Only the last message of the batch is handled — presumably
            # intentional (older model-update events are superseded by the
            # newest one); confirm earlier messages can safely be skipped.
            msg = msgs[-1]
            if msg.error():
                logger.error("Consumer error: {}".format(msg.error()))
                continue
            else:
                try:
                    value = msg.value().decode('utf-8')
                    event = PolyEncodersTrainingCompletedEvent(
                        **json.loads(value))
                    # Swap in a freshly-loaded service for the new model.
                    application_services['latest'] = ApplicationService(
                        model_dir=event.model_dir,
                        poly_m=event.poly_m,
                        max_query_len=event.max_query_len,
                        max_candidate_len=event.max_candidate_len,
                        random_seed=event.random_seed)
                    logger.debug('Received message: {}'.format(value))
                    # Committing the last message also commits everything
                    # before it in the same partition.
                    self.consumer.commit(message=msgs[-1])
                except Exception as e:
                    logger.exception(e)
                    logger.error(msg.value())
                    raise e
def repl():
    """Consume armor records from `topic` and insert each into Cassandra,
    re-subscribing whenever an insert fails."""
    conn = Consumer(settings)
    conn.subscribe([topic])
    db.connect()
    try:
        while True:
            # Make sure Cassandra is reachable before pulling more work.
            if not db.ping():
                db.connect()
                continue
            msg = conn.poll(0.1)
            if msg is None:
                # Nothing available this cycle; skip the sleep below.
                continue
            error = msg.error()
            if not error:
                # Try to insert the record.
                if insertArmor(msg.value()):
                    pprint('Added Successfully ' + msg.value())
                    conn.commit()
                else:
                    # Re-subscribe so the uncommitted message is re-delivered.
                    conn.unsubscribe()
                    conn.subscribe([topic])
                    print('Error Occurred Adding to Cassandra')
            elif error.code() == KafkaError._PARTITION_EOF:
                print('End of partition reached {0}/{1}'.format(
                    msg.topic(), msg.partition()))
            else:
                print('Error occurred: {0}'.format(error.str()))
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        conn.close()
class QuerySubscriptionConsumer(object):
    """
    A Kafka consumer that processes query subscription update messages. Each message has
    a related subscription id and the latest values related to the subscribed query.
    These values are passed along to a callback associated with the subscription.
    """

    # Maps a results topic to the snuba dataset that subscription queries run against.
    topic_to_dataset = {
        settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS: QueryDatasets.EVENTS
    }

    def __init__(self, group_id, topic=None,
                 commit_batch_size=100, initial_offset_reset="earliest"):
        # :param group_id: Kafka consumer group id.
        # :param topic: results topic; defaults to the events subscription topic.
        # :param commit_batch_size: commit offsets after this many handled messages.
        # :param initial_offset_reset: `auto.offset.reset` value for a fresh group.
        self.group_id = group_id
        if not topic:
            topic = settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS
        self.topic = topic
        cluster_name = settings.KAFKA_TOPICS[topic]["cluster"]
        self.bootstrap_servers = settings.KAFKA_CLUSTERS[cluster_name][
            "bootstrap.servers"]
        self.commit_batch_size = commit_batch_size
        self.initial_offset_reset = initial_offset_reset
        # partition number -> next offset to commit; populated as messages complete.
        self.offsets = {}
        self.consumer = None

    def run(self):
        """Poll the results topic forever, handling each message and committing
        offsets manually every `commit_batch_size` messages.  Exits (and flushes
        offsets via `shutdown`) on KeyboardInterrupt."""
        logger.debug("Starting snuba query subscriber")
        self.offsets.clear()
        # Auto commit/offset-store are disabled: offsets are tracked in
        # `self.offsets` and committed explicitly so a crash never commits
        # past the last fully-handled message.
        conf = {
            "bootstrap.servers": self.bootstrap_servers,
            "group.id": self.group_id,
            "session.timeout.ms": 6000,
            "auto.offset.reset": self.initial_offset_reset,
            "enable.auto.commit": "false",
            "enable.auto.offset.store": "false",
            "enable.partition.eof": "false",
            "default.topic.config": {
                "auto.offset.reset": self.initial_offset_reset
            },
        }

        def on_revoke(consumer, partitions):
            # Flush pending offsets before a rebalance takes partitions away.
            self.commit_offsets()

        self.consumer = Consumer(conf)
        self.consumer.subscribe([self.topic], on_revoke=on_revoke)
        try:
            i = 0
            while True:
                message = self.consumer.poll(0.1)
                if message is None:
                    continue
                error = message.error()
                if error is not None:
                    # Any consumer-level error aborts the loop.
                    raise KafkaException(error)
                i = i + 1
                with sentry_sdk.start_span(
                        Span(
                            op="handle_message",
                            transaction=
                            "query_subscription_consumer_process_message",
                            sampled=True,
                        )):
                    self.handle_message(message)
                # Track latest completed message here, for use in `shutdown` handler.
                # (offset + 1 because a committed offset is the NEXT message to read)
                self.offsets[message.partition()] = message.offset() + 1
                if i % self.commit_batch_size == 0:
                    logger.debug("Committing offsets")
                    self.commit_offsets()
        except KeyboardInterrupt:
            pass
        self.shutdown()

    def commit_offsets(self):
        """Commit every tracked offset synchronously, then clear the tracker."""
        if self.offsets and self.consumer:
            to_commit = [
                TopicPartition(self.topic, partition, offset)
                for partition, offset in self.offsets.items()
            ]
            self.consumer.commit(offsets=to_commit)
            self.offsets.clear()

    def shutdown(self):
        """Flush outstanding offsets and close the consumer cleanly."""
        logger.debug("Committing offsets and closing consumer")
        self.commit_offsets()
        self.consumer.close()

    def handle_message(self, message):
        """
        Parses the value from Kafka, and if valid passes the payload to the
        callback defined by the subscription. If the subscription has been
        removed, or no longer has a valid callback then just log
        metrics/errors and continue.
        :param message: confluent_kafka Message
        :return: None
        """
        with sentry_sdk.push_scope() as scope:
            try:
                contents = self.parse_message_value(message.value())
            except InvalidMessageError:
                # If the message is in an invalid format, just log the error
                # and continue
                logger.exception(
                    "Subscription update could not be parsed",
                    extra={
                        "offset": message.offset(),
                        "partition": message.partition(),
                        "value": message.value(),
                    },
                )
                return
            scope.set_tag("query_subscription_id", contents["subscription_id"])
            try:
                subscription = QuerySubscription.objects.get_from_cache(
                    subscription_id=contents["subscription_id"])
            except QuerySubscription.DoesNotExist:
                metrics.incr(
                    "snuba_query_subscriber.subscription_doesnt_exist")
                logger.error(
                    "Received subscription update, but subscription does not exist",
                    extra={
                        "offset": message.offset(),
                        "partition": message.partition(),
                        "value": message.value(),
                    },
                )
                # The subscription is gone on our side; best-effort removal of
                # the orphaned subscription from snuba so it stops producing.
                try:
                    _delete_from_snuba(self.topic_to_dataset[message.topic()],
                                       contents["subscription_id"])
                except Exception:
                    logger.exception(
                        "Failed to delete unused subscription from snuba.")
                return
            if subscription.type not in subscriber_registry:
                metrics.incr(
                    "snuba_query_subscriber.subscription_type_not_registered")
                logger.error(
                    "Received subscription update, but no subscription handler registered",
                    extra={
                        "offset": message.offset(),
                        "partition": message.partition(),
                        "value": message.value(),
                    },
                )
                return
            logger.info(
                "query-subscription-consumer.handle_message",
                extra={
                    "timestamp": contents["timestamp"],
                    "query_subscription_id": contents["subscription_id"],
                    "contents": contents,
                    "offset": message.offset(),
                    "partition": message.partition(),
                    "value": message.value(),
                },
            )
            callback = subscriber_registry[subscription.type]
            with sentry_sdk.start_span(
                    op="process_message") as span, metrics.timer(
                        "snuba_query_subscriber.callback.duration",
                        instance=subscription.type):
                span.set_data("payload", contents)
                callback(contents, subscription)

    def parse_message_value(self, value):
        """
        Parses the value received via the Kafka consumer and verifies that it
        matches the expected schema.
        :param value: A json formatted string
        :return: A dict with the parsed message
        """
        wrapper = loads(value)
        # Validate the outer wrapper first; it carries the payload version.
        try:
            jsonschema.validate(wrapper, SUBSCRIPTION_WRAPPER_SCHEMA)
        except jsonschema.ValidationError:
            metrics.incr("snuba_query_subscriber.message_wrapper_invalid")
            raise InvalidSchemaError("Message wrapper does not match schema")
        schema_version = wrapper["version"]
        if schema_version not in SUBSCRIPTION_PAYLOAD_VERSIONS:
            metrics.incr(
                "snuba_query_subscriber.message_wrapper_invalid_version")
            raise InvalidMessageError(
                "Version specified in wrapper has no schema")
        payload = wrapper["payload"]
        # Then validate the payload against its version-specific schema.
        try:
            jsonschema.validate(payload,
                                SUBSCRIPTION_PAYLOAD_VERSIONS[schema_version])
        except jsonschema.ValidationError:
            metrics.incr("snuba_query_subscriber.message_payload_invalid")
            raise InvalidSchemaError("Message payload does not match schema")
        # Timestamps arrive as strings; normalize to an aware UTC datetime.
        payload["timestamp"] = parse_date(
            payload["timestamp"]).replace(tzinfo=pytz.utc)
        return payload
class QuerySubscriptionConsumer:
    """
    A Kafka consumer that processes query subscription update messages. Each message has
    a related subscription id and the latest values related to the subscribed query.
    These values are passed along to a callback associated with the subscription.
    """

    # Maps each results topic to the snuba dataset its subscriptions query.
    topic_to_dataset = {
        settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS:
        QueryDatasets.EVENTS,
        settings.KAFKA_TRANSACTIONS_SUBSCRIPTIONS_RESULTS:
        QueryDatasets.TRANSACTIONS,
    }

    def __init__(
        self,
        group_id,
        topic=None,
        commit_batch_size=100,
        initial_offset_reset="earliest",
        force_offset_reset=None,
    ):
        # :param group_id: Kafka consumer group id.
        # :param topic: results topic; defaults to the events subscription topic.
        # :param commit_batch_size: commit offsets after this many handled messages.
        # :param initial_offset_reset: `auto.offset.reset` for a fresh group.
        # :param force_offset_reset: optional "earliest"/"latest"-style name; when
        #     set, assigned partitions are rewound/fast-forwarded on every assign.
        self.group_id = group_id
        if not topic:
            topic = settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS
        self.topic = topic
        cluster_name = settings.KAFKA_TOPICS[topic]["cluster"]
        self.commit_batch_size = commit_batch_size
        self.initial_offset_reset = initial_offset_reset
        # partition number -> next offset to commit (or None when unknown).
        self.offsets = {}
        self.consumer = None
        # Auto commit/offset-store disabled: offsets tracked in `self.offsets`
        # and committed explicitly so a crash never commits past handled work.
        self.cluster_options = kafka_config.get_kafka_consumer_cluster_options(
            cluster_name,
            {
                "group.id": self.group_id,
                "session.timeout.ms": 6000,
                "auto.offset.reset": self.initial_offset_reset,
                "enable.auto.commit": "false",
                "enable.auto.offset.store": "false",
                "enable.partition.eof": "false",
                "default.topic.config": {
                    "auto.offset.reset": self.initial_offset_reset
                },
            },
        )
        self.admin_cluster_options = kafka_config.get_kafka_admin_cluster_options(
            cluster_name, {"allow.auto.create.topics": "true"})
        self.resolve_partition_force_offset = self.offset_reset_name_to_func(
            force_offset_reset)

    def offset_reset_name_to_func(self, offset_reset):
        """Map an offset-reset alias to the matching resolver, or None."""
        if offset_reset in {"smallest", "earliest", "beginning"}:
            return self.resolve_partition_offset_earliest
        elif offset_reset in {"largest", "latest", "end"}:
            return self.resolve_partition_offset_latest

    def resolve_partition_offset_earliest(self, partition):
        # Rewind the partition to the low watermark reported by the broker.
        low, high = self.consumer.get_watermark_offsets(partition)
        return TopicPartition(partition.topic, partition.partition, low)

    def resolve_partition_offset_latest(self, partition):
        # Fast-forward the partition to the high watermark reported by the broker.
        low, high = self.consumer.get_watermark_offsets(partition)
        return TopicPartition(partition.topic, partition.partition, high)

    def run(self):
        """Poll the results topic forever, handling each message and committing
        offsets every `commit_batch_size` messages; exits on KeyboardInterrupt."""
        logger.debug("Starting snuba query subscriber")
        self.offsets.clear()

        def on_assign(consumer, partitions):
            # Seed the local offset tracker from the assignment, optionally
            # forcing each partition to a broker watermark first.
            updated_partitions = []
            for partition in partitions:
                if self.resolve_partition_force_offset:
                    partition = self.resolve_partition_force_offset(partition)
                    updated_partitions.append(partition)
                if partition.offset == OFFSET_INVALID:
                    updated_offset = None
                else:
                    updated_offset = partition.offset
                self.offsets[partition.partition] = updated_offset
            if updated_partitions:
                self.consumer.assign(updated_partitions)
            logger.info(
                "query-subscription-consumer.on_assign",
                extra={
                    "offsets": six.text_type(self.offsets),
                    "partitions": six.text_type(partitions),
                },
            )

        def on_revoke(consumer, partitions):
            # Flush and forget offsets for partitions being taken away.
            partition_numbers = [
                partition.partition for partition in partitions
            ]
            self.commit_offsets(partition_numbers)
            for partition_number in partition_numbers:
                self.offsets.pop(partition_number, None)
            logger.info(
                "query-subscription-consumer.on_revoke",
                extra={
                    "offsets": six.text_type(self.offsets),
                    "partitions": six.text_type(partitions),
                },
            )

        self.consumer = Consumer(self.cluster_options)
        if settings.KAFKA_CONSUMER_AUTO_CREATE_TOPICS:
            # This is required for confluent-kafka>=1.5.0, otherwise the topics will
            # not be automatically created.
            admin_client = AdminClient(self.admin_cluster_options)
            wait_for_topics(admin_client, [self.topic])
        self.consumer.subscribe([self.topic],
                                on_assign=on_assign,
                                on_revoke=on_revoke)
        try:
            i = 0
            while True:
                message = self.consumer.poll(0.1)
                if message is None:
                    continue
                error = message.error()
                if error is not None:
                    raise KafkaException(error)
                i = i + 1
                with sentry_sdk.start_transaction(
                        op="handle_message",
                        name="query_subscription_consumer_process_message",
                        sampled=True,
                ), metrics.timer("snuba_query_subscriber.handle_message"):
                    self.handle_message(message)
                # Track latest completed message here, for use in `shutdown` handler.
                # (offset + 1: a committed offset is the NEXT message to read)
                self.offsets[message.partition()] = message.offset() + 1
                if i % self.commit_batch_size == 0:
                    logger.debug("Committing offsets")
                    self.commit_offsets()
        except KeyboardInterrupt:
            pass
        self.shutdown()

    def commit_offsets(self, partitions=None):
        """Commit tracked offsets synchronously.

        :param partitions: iterable of partition numbers to commit; defaults
            to every tracked partition.  Partitions with no known offset are
            skipped.  Note: committed entries are intentionally NOT removed
            from `self.offsets` here (revocation handles removal).
        """
        logger.info(
            "query-subscription-consumer.commit_offsets",
            extra={
                "offsets": six.text_type(self.offsets),
                "partitions": six.text_type(partitions)
            },
        )
        if self.offsets and self.consumer:
            if partitions is None:
                partitions = self.offsets.keys()
            to_commit = []
            for partition in partitions:
                offset = self.offsets.get(partition)
                if offset is None:
                    # Skip partitions that have no offset
                    continue
                to_commit.append(TopicPartition(self.topic, partition,
                                                offset))
            self.consumer.commit(offsets=to_commit)

    def shutdown(self):
        """Flush outstanding offsets and close the consumer cleanly."""
        logger.debug("Committing offsets and closing consumer")
        self.commit_offsets()
        self.consumer.close()

    def handle_message(self, message):
        """
        Parses the value from Kafka, and if valid passes the payload to the
        callback defined by the subscription. If the subscription has been
        removed, or no longer has a valid callback then just log
        metrics/errors and continue.
        :param message: confluent_kafka Message
        :return: None
        """
        with sentry_sdk.push_scope() as scope:
            try:
                with metrics.timer(
                        "snuba_query_subscriber.parse_message_value"):
                    contents = self.parse_message_value(message.value())
            except InvalidMessageError:
                # If the message is in an invalid format, just log the error
                # and continue
                logger.exception(
                    "Subscription update could not be parsed",
                    extra={
                        "offset": message.offset(),
                        "partition": message.partition(),
                        "value": message.value(),
                    },
                )
                return
            scope.set_tag("query_subscription_id", contents["subscription_id"])
            try:
                with metrics.timer(
                        "snuba_query_subscriber.fetch_subscription"):
                    subscription = QuerySubscription.objects.get_from_cache(
                        subscription_id=contents["subscription_id"])
                    # Ignore updates for subscriptions that are disabled/deleting.
                    if subscription.status != QuerySubscription.Status.ACTIVE.value:
                        metrics.incr(
                            "snuba_query_subscriber.subscription_inactive")
                        return
            except QuerySubscription.DoesNotExist:
                metrics.incr(
                    "snuba_query_subscriber.subscription_doesnt_exist")
                logger.error(
                    "Received subscription update, but subscription does not exist",
                    extra={
                        "offset": message.offset(),
                        "partition": message.partition(),
                        "value": message.value(),
                    },
                )
                # Best-effort removal of the orphaned subscription from snuba
                # so it stops producing updates we can't route.
                try:
                    _delete_from_snuba(self.topic_to_dataset[message.topic()],
                                       contents["subscription_id"])
                except Exception:
                    logger.exception(
                        "Failed to delete unused subscription from snuba.")
                return
            if subscription.type not in subscriber_registry:
                metrics.incr(
                    "snuba_query_subscriber.subscription_type_not_registered")
                logger.error(
                    "Received subscription update, but no subscription handler registered",
                    extra={
                        "offset": message.offset(),
                        "partition": message.partition(),
                        "value": message.value(),
                    },
                )
                return
            logger.debug(
                "query-subscription-consumer.handle_message",
                extra={
                    "timestamp": contents["timestamp"],
                    "query_subscription_id": contents["subscription_id"],
                    "project_id": subscription.project_id,
                    "subscription_dataset":
                    subscription.snuba_query.dataset,
                    "subscription_query": subscription.snuba_query.query,
                    "subscription_aggregation":
                    subscription.snuba_query.aggregate,
                    "subscription_time_window":
                    subscription.snuba_query.time_window,
                    "subscription_resolution":
                    subscription.snuba_query.resolution,
                    "offset": message.offset(),
                    "partition": message.partition(),
                    "value": message.value(),
                },
            )
            callback = subscriber_registry[subscription.type]
            with sentry_sdk.start_span(
                    op="process_message") as span, metrics.timer(
                        "snuba_query_subscriber.callback.duration",
                        instance=subscription.type):
                span.set_data("payload", contents)
                callback(contents, subscription)

    def parse_message_value(self, value):
        """
        Parses the value received via the Kafka consumer and verifies that it
        matches the expected schema.
        :param value: A json formatted string
        :return: A dict with the parsed message
        """
        with metrics.timer(
                "snuba_query_subscriber.parse_message_value.json_parse"):
            wrapper = json.loads(value)
        # Validate the outer wrapper first; it carries the payload version.
        with metrics.timer(
                "snuba_query_subscriber.parse_message_value.json_validate_wrapper"
        ):
            try:
                jsonschema.validate(wrapper, SUBSCRIPTION_WRAPPER_SCHEMA)
            except jsonschema.ValidationError:
                metrics.incr("snuba_query_subscriber.message_wrapper_invalid")
                raise InvalidSchemaError(
                    "Message wrapper does not match schema")
        schema_version = wrapper["version"]
        if schema_version not in SUBSCRIPTION_PAYLOAD_VERSIONS:
            metrics.incr(
                "snuba_query_subscriber.message_wrapper_invalid_version")
            raise InvalidMessageError(
                "Version specified in wrapper has no schema")
        payload = wrapper["payload"]
        # Then validate the payload against its version-specific schema.
        with metrics.timer(
                "snuba_query_subscriber.parse_message_value.json_validate_payload"
        ):
            try:
                jsonschema.validate(
                    payload, SUBSCRIPTION_PAYLOAD_VERSIONS[schema_version])
            except jsonschema.ValidationError:
                metrics.incr("snuba_query_subscriber.message_payload_invalid")
                raise InvalidSchemaError(
                    "Message payload does not match schema")
        # XXX: Since we just return the raw dict here, when the payload changes it'll
        # break things. This should convert the payload into a class rather than passing
        # the dict around, but until we get time to refactor we can keep things working
        # here.
        payload.setdefault("values", payload.get("result"))
        payload["timestamp"] = parse_date(
            payload["timestamp"]).replace(tzinfo=pytz.utc)
        return payload
class KafkaStreamingClient(AbstractStreamingClient):
    """Kafka streaming client."""

    def __init__(self, config):  # pragma: no cover
        """
        Streaming client implementation based on Kafka.

        Acts as a Producer when KAFKA_CONSUMER_GROUP is unset, otherwise as a
        Consumer (with a SIGTERM handler for graceful shutdown).

        Configuration keys:
          KAFKA_ADDRESS
          KAFKA_CONSUMER_GROUP
          KAFKA_TOPIC
          TIMEOUT
          EVENT_HUB_KAFKA_CONNECTION_STRING
        """
        self.logger = Logger()
        self.topic = config.get("KAFKA_TOPIC")
        if not self.topic:
            raise ValueError("KAFKA_TOPIC is not set in the config object.")
        if not config.get("KAFKA_ADDRESS"):
            raise ValueError("KAFKA_ADDRESS is not set in the config object.")
        # TIMEOUT is optional; a non-integer value silently disables it.
        if config.get("TIMEOUT"):
            try:
                self.timeout = int(config.get("TIMEOUT"))
            except ValueError:
                self.timeout = None
        else:
            self.timeout = None
        kafka_config = self.create_kafka_config(config)
        self.admin = admin.AdminClient(kafka_config)
        if config.get("KAFKA_CONSUMER_GROUP") is None:
            self.logger.info('Creating Producer')
            self.producer = Producer(kafka_config)
            # `run` gates the consume loop; producers never enter it.
            self.run = False
        else:
            self.logger.info('Creating Consumer')
            self.consumer = Consumer(kafka_config)
            self.run = True
            # Allow SIGTERM (e.g. container stop) to end the consume loop.
            signal.signal(signal.SIGTERM, self.exit_gracefully)

    @staticmethod
    def create_kafka_config(user_config: dict) -> dict:  # pragma: no cover
        """Create the kafka configuration.

        Builds a base config from KAFKA_ADDRESS, then layers in the consumer
        group, debug flags and — when EVENT_HUB_KAFKA_CONNECTION_STRING is
        present — the SASL_SSL settings needed for Azure Event Hubs' Kafka
        endpoint.
        """
        config = {
            "bootstrap.servers": user_config.get("KAFKA_ADDRESS"),
            "enable.auto.commit": False,
            "auto.offset.reset": "latest",
            "default.topic.config": {'auto.offset.reset': 'latest'},
        }
        if user_config.get('KAFKA_CONSUMER_GROUP') is not None:
            config['group.id'] = user_config['KAFKA_CONSUMER_GROUP']
        if user_config.get('KAFKA_DEBUG') is not None:
            config['debug'] = user_config['KAFKA_DEBUG']
        if user_config.get('EVENT_HUB_KAFKA_CONNECTION_STRING'):
            ssl_location = user_config.get('SSL_CERT_LOCATION') or '/etc/ssl/certs/ca-certificates.crt'
            # NOTE(review): variable name "kakfa_config" is a typo for
            # "kafka_config"; harmless (purely local) but worth fixing.
            kakfa_config = {
                'security.protocol': "SASL_SSL",
                'sasl.mechanism': "PLAIN",
                'ssl.ca.location': ssl_location,
                'sasl.username': '******',
                'sasl.password': user_config.get('EVENT_HUB_KAFKA_CONNECTION_STRING'),
                'client.id': 'agogosml',
            }
            config = {**config, **kakfa_config}
        return config

    def delivery_report(self, err, msg):  # pragma: no cover
        """
        Indicate delivery result.

        Called once for each message produced. Triggered by poll() or flush().

        :param err: An error message.
        :param msg: A string input to be uploaded to kafka.
        """
        if err is not None:
            self.logger.error('Message delivery failed: %s', err)
        else:
            self.logger.info('Message delivered to %s [%s]', msg.topic(),
                             msg.partition())

    def send(self, message: str):  # pragma: no cover
        """Produce one UTF-8 encoded message; return True on success."""
        if not isinstance(message, str):
            raise TypeError('str type expected for message')
        try:
            mutated_message = message.encode('utf-8')
            self.logger.info('Sending message to kafka topic: %s', self.topic)
            self.producer.poll(0)
            self.producer.produce(self.topic,
                                  mutated_message,
                                  callback=self.delivery_report)
            # flush() blocks until delivery; throughput trades for certainty.
            self.producer.flush()
            return True
        except Exception as ex:
            self.logger.error('Error sending message to kafka: %s', ex)
            return False

    def stop(self):
        """Stop streaming client."""
        self.run = False

    def check_timeout(self, start: datetime):  # pragma: no cover
        """Interrupts if too much time has elapsed since the kafka client started running."""
        if self.timeout is not None:
            elapsed = datetime.now() - start
            if elapsed.seconds >= self.timeout:
                # KeyboardInterrupt is reused as the loop-abort signal.
                raise KeyboardInterrupt

    def handle_kafka_error(self, msg):  # pragma: no cover
        """Handle an error in kafka."""
        if msg.error().code() == KafkaError._PARTITION_EOF:
            # End of partition event
            self.logger.info('%% %s [%d] reached end at offset %d\n',
                             msg.topic(), msg.partition(), msg.offset())
        else:
            # Error
            raise KafkaException(msg.error())

    def start_receiving(self, on_message_received_callback):  # pragma: no cover
        """Consume until stop()/SIGTERM/timeout, invoking the callback per message."""
        try:
            self.subscribe_to_topic()
            start = datetime.now()
            while self.run:
                # Stop loop after timeout if exists
                self.check_timeout(start)
                # Poll messages from topic
                msg = self.read_single_message()
                if msg is not None:
                    on_message_received_callback(msg)
        except KeyboardInterrupt:
            self.logger.info('Aborting listener...')
            raise
        finally:
            # Close down consumer to commit final offsets.
            self.consumer.close()

    def exit_gracefully(self, signum, frame):  # pylint: disable=unused-argument
        """Handle interrupt signal or calls to stop and exit gracefully."""
        self.logger.info("Handling interrupt signal %s gracefully." % signum)
        self.stop()

    def subscribe_to_topic(self):  # pragma: no cover
        """Subscribe to topic."""
        self.consumer.subscribe([self.topic])

    def read_single_message(self):  # pragma: no cover
        """Poll messages from topic.

        Returns the message value (bytes) or None; commits the offset as soon
        as the message is read — i.e. at-most-once relative to the callback.
        """
        msg = self.consumer.poll(0.000001)
        if msg is None:
            return None
        if msg.error():
            # Error or event
            self.handle_kafka_error(msg)
            return None
        # Proper message
        # self.logger.info('kafka read message: %s, from topic: %s', msg.value(), msg.topic())
        self.consumer.commit(msg)
        return msg.value()
raise KafkaException(record.error()) else: recrods_pulled = True # ** 在這裡進行商業邏輯與訊息處理 ** # 取出相關的metadata topic = record.topic() partition = record.partition() offset = record.offset() timestamp = record.timestamp() # 取出msgKey與msgValue msgKey = try_decode_utf8(record.key()) msgValue = try_decode_utf8(record.value()) # 秀出metadata與msgKey & msgValue訊息 print('%s-%d-%d : (%s , %s)' % (topic, partition, offset, msgKey, msgValue)) # 同步地執行commit (Sync commit) if (recrods_pulled): offsets = consumer.commit(asynchronous=False) print_sync_commit_result(offsets) except KeyboardInterrupt as e: sys.stderr.write('Aborted by user\n') except Exception as e: sys.stderr.write(str(e)) finally: # 步驟6.關掉Consumer實例的連線 consumer.close()
def test_basic_api():
    """Basic API tests; these won't really do anything since there is no
    broker configured — every network call is expected to time out or fail,
    and the assertions pin the client-side argument validation and error
    codes."""
    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test', 'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke,
                 on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    if msg is not None:
        assert msg.timestamp() == (TIMESTAMP_NOT_AVAILABLE, -1)

    msglist = kc.consume(num_messages=10, timeout=0.001)
    assert len(msglist) == 0, "expected 0 messages, not %d" % len(msglist)

    with pytest.raises(ValueError) as ex:
        kc.consume(-100)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    with pytest.raises(ValueError) as ex:
        kc.consume(1000001)
    assert 'num_messages must be between 0 and 1000000 (1M)' == str(ex.value)

    partitions = list(map(lambda part: TopicPartition("test", part),
                          range(0, 100, 3)))
    kc.assign(partitions)

    with pytest.raises(KafkaException) as ex:
        kc.seek(TopicPartition("test", 0, 123))
    assert 'Erroneous state' in str(ex.value)

    # Verify assignment
    assignment = kc.assignment()
    assert partitions == assignment

    # Pause partitions
    kc.pause(partitions)

    # Resume partitions
    kc.resume(partitions)

    # Get cached watermarks, should all be invalid.
    # (dropped the redundant literal `-1001` check — OFFSET_INVALID is that value)
    lo, hi = kc.get_watermark_offsets(partitions[0], cached=True)
    assert lo == OFFSET_INVALID and hi == OFFSET_INVALID

    # Query broker for watermarks, should raise an exception.
    try:
        lo, hi = kc.get_watermark_offsets(partitions[0], timeout=0.5,
                                          cached=False)
    except KafkaException as e:
        # BUGFIX: was `str(e.args([0]))` — `e.args` is a tuple, not callable,
        # so the assertion-failure message itself raised TypeError.
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE), \
            str(e.args[0])

    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions
                if p.offset == OFFSET_INVALID]) == len(partitions)

    try:
        kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    try:
        kc.list_topics(timeout=0.2)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    try:
        kc.list_topics(topic="hi", timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._TRANSPORT)

    kc.close()
class SynchronizedConsumer(object):
    """
    This class implements the framework for a consumer that is intended to only
    consume messages that have already been consumed and committed by members
    of another consumer group.

    This works similarly to the Kafka built-in ``__consumer_offsets`` topic.
    The consumer group that is being "followed" (the one that must make
    progress for our consumer here to make progress, identified by the
    ``synchronize_commit_group`` constructor parameter/instance attribute) must
    report its offsets to a topic (identified by the ``commit_log_topic``
    constructor parameter/instance attribute). This consumer subscribes to
    both commit log topic, as well as the topic(s) that we are actually
    interested in consuming messages from. The messages received from the
    commit log topic control whether or not consumption from partitions
    belonging to the main topic is paused, resumed, or allowed to continue in
    its current state without changes.

    The furthest point in any partition that this consumer should ever consume
    to is the maximum offset that has been recorded to the commit log topic
    for that partition. If the offsets recorded to that topic move
    non-monotonically (due to an intentional offset rollback, for instance)
    this consumer *may* consume up to the highest watermark point. (The
    implementation here tries to pause consuming from the partition as soon as
    possible, but this makes no explicit guarantees about that behavior.)
    """
    # Strategy table used when no committed offset exists for a partition.
    initial_offset_reset_strategies = {
        'earliest': get_earliest_offset,
        'latest': get_latest_offset,
    }

    def __init__(self, bootstrap_servers, consumer_group, commit_log_topic,
                 synchronize_commit_group, initial_offset_reset='latest', on_commit=None):
        self.bootstrap_servers = bootstrap_servers
        self.consumer_group = consumer_group
        self.commit_log_topic = commit_log_topic
        self.synchronize_commit_group = synchronize_commit_group
        self.initial_offset_reset = self.initial_offset_reset_strategies[initial_offset_reset]

        # Tracks per-partition pause/resume state; its callback manipulates
        # the consumer from the commit-log consumer thread (see TODO below).
        self.__partition_state_manager = SynchronizedPartitionStateManager(
            self.__on_partition_state_change)
        self.__commit_log_consumer, self.__commit_log_consumer_stop_request = self.__start_commit_log_consumer()

        # (topic, partition) -> next local offset to consume.
        self.__positions = {}

        def commit_callback(error, partitions):
            if on_commit is not None:
                return on_commit(error, partitions)

        # ``auto.offset.reset`` is 'error' on purpose: an offset reset must be
        # resolved explicitly in the assignment callback, never silently.
        consumer_configuration = {
            'bootstrap.servers': self.bootstrap_servers,
            'group.id': self.consumer_group,
            'enable.auto.commit': 'false',
            'enable.auto.offset.store': 'true',
            'enable.partition.eof': 'false',
            'default.topic.config': {
                'auto.offset.reset': 'error',
            },
            'on_commit': commit_callback,
        }

        self.__consumer = Consumer(consumer_configuration)

    def __start_commit_log_consumer(self, timeout=None):
        """
        Starts running the commit log consumer.

        Returns the (future, stop_request_event) pair used by close() and the
        health check.
        """
        stop_request_event = threading.Event()
        start_event = threading.Event()
        result = execute(
            functools.partial(
                run_commit_log_consumer,
                bootstrap_servers=self.bootstrap_servers,
                # Unique group id so the sync consumer never shares offsets.
                consumer_group='{}:sync:{}'.format(self.consumer_group, uuid.uuid1().hex),
                commit_log_topic=self.commit_log_topic,
                synchronize_commit_group=self.synchronize_commit_group,
                partition_state_manager=self.__partition_state_manager,
                start_event=start_event,
                stop_request_event=stop_request_event,
            ),
        )
        start_event.wait(timeout)
        return result, stop_request_event

    def __check_commit_log_consumer_running(self):
        # Fail fast on every public call if the background thread died.
        if not self.__commit_log_consumer.running():
            try:
                result = self.__commit_log_consumer.result(timeout=0)  # noqa
            except TimeoutError:
                pass  # not helpful

            raise Exception('Commit log consumer unexpectedly exit!')

    def __on_partition_state_change(
            self, topic, partition, previous_state_and_offsets, current_state_and_offsets):
        """
        Callback that is invoked when a partition state changes.
        """
        logger.debug('State change for %r: %r to %r', (topic, partition),
                     previous_state_and_offsets, current_state_and_offsets)

        current_state, current_offsets = current_state_and_offsets
        if current_offsets.local is None:
            # It only makes sense to manipulate the consumer if we've got an
            # assignment. (This block should only be entered at startup if the
            # remote offsets are retrieved from the commit log before the local
            # consumer has received its assignment.)
            return

        # TODO: This will be called from the commit log consumer thread, so need
        # to verify that calling the ``consumer.{pause,resume}`` methods is
        # thread safe!
        if current_state in (SynchronizedPartitionState.UNKNOWN, SynchronizedPartitionState.SYNCHRONIZED,
                             SynchronizedPartitionState.REMOTE_BEHIND):
            # We've caught up to (or passed) the followed group: pause.
            self.__consumer.pause([TopicPartition(topic, partition, current_offsets.local)])
        elif current_state is SynchronizedPartitionState.LOCAL_BEHIND:
            # The followed group is ahead of us: safe to consume.
            self.__consumer.resume([TopicPartition(topic, partition, current_offsets.local)])
        else:
            raise NotImplementedError('Unexpected partition state: %s' % (current_state,))

    def subscribe(self, topics, on_assign=None, on_revoke=None):
        """
        Subscribe to a topic.
        """
        self.__check_commit_log_consumer_running()

        def assignment_callback(consumer, assignment):
            # Since ``auto.offset.reset`` is set to ``error`` to force human
            # interaction on an offset reset, we have to explicitly specify the
            # starting offset if no offset has been committed for this topic during
            # the ``__consumer_offsets`` topic retention period.
            assignment = {
                (i.topic, i.partition): self.__positions.get((i.topic, i.partition))
                for i in assignment
            }

            # Fill in unknown positions from committed offsets, falling back to
            # the configured initial-offset-reset strategy.
            for i in self.__consumer.committed([TopicPartition(topic, partition) for (
                    topic, partition), offset in assignment.items() if offset is None]):
                k = (i.topic, i.partition)
                if i.offset > -1:
                    assignment[k] = i.offset
                else:
                    assignment[k] = self.initial_offset_reset(consumer, i.topic, i.partition)

            self.__consumer.assign([TopicPartition(topic, partition, offset)
                                    for (topic, partition), offset in assignment.items()])

            for (topic, partition), offset in assignment.items():
                # Setting the local offsets will either cause the partition to be
                # paused (if the remote offset is unknown or the local offset is
                # not trailing the remote offset) or resumed.
                self.__partition_state_manager.set_local_offset(topic, partition, offset)
                self.__positions[(topic, partition)] = offset

            if on_assign is not None:
                on_assign(self, [TopicPartition(topic, partition)
                                 for topic, partition in assignment.keys()])

        def revocation_callback(consumer, assignment):
            for item in assignment:
                # TODO: This should probably also be removed from the state manager.
                self.__positions.pop((item.topic, item.partition))

            if on_revoke is not None:
                on_revoke(self, assignment)

        self.__consumer.subscribe(
            topics, on_assign=assignment_callback, on_revoke=revocation_callback)

    def poll(self, timeout):
        """Poll one message, validating and advancing the local position;
        returns None on timeout, or the Message (possibly carrying an error)."""
        self.__check_commit_log_consumer_running()

        message = self.__consumer.poll(timeout)
        if message is None:
            return

        if message.error() is not None:
            return message

        self.__partition_state_manager.validate_local_message(
            message.topic(), message.partition(), message.offset())
        self.__partition_state_manager.set_local_offset(
            message.topic(), message.partition(), message.offset() + 1)
        self.__positions[(message.topic(), message.partition())] = message.offset() + 1

        return message

    def commit(self, *args, **kwargs):
        """Delegate to the underlying consumer's commit."""
        self.__check_commit_log_consumer_running()

        return self.__consumer.commit(*args, **kwargs)

    def close(self):
        """Stop the commit-log consumer thread and close the main consumer."""
        self.__check_commit_log_consumer_running()

        self.__commit_log_consumer_stop_request.set()
        try:
            self.__consumer.close()
        finally:
            # Surface any error raised by the background consumer thread.
            self.__commit_log_consumer.result()
class KafkaConsumerWorker(BaseWorker):
    """Worker that consumes Avro-encoded messages from a single Kafka topic.

    Subclasses must define ``topic_name`` and ``consumer_name`` (the consumer
    group id) and typically override ``consume_message``. Field-name class
    attributes (``timestamp_fields`` etc.) drive type coercion in
    ``parse_message``.
    """

    # Subclass-provided identifiers; accessing them unset raises NotImplementedError.
    topic_name = None
    consumer_name = None
    # Per-subclass overrides merged over the defaults in get_consumer_settings().
    consumer_settings = {}
    commit_on_complete = True
    async_commit = True
    poll_timeout = 0.01
    sleep_time = 0.05
    timestamp_fields = ['timestamp']
    decimal_fields = []
    boolean_fields = []

    def setup(self):
        # Build the consumer and Avro serializer once, then subscribe.
        self.consumer = Consumer(**self.get_consumer_settings())
        self.serializer = self.get_message_serializer()
        self.set_topic()

    def teardown(self):
        self.consumer.close()

    def get_topic_name(self):
        if self.topic_name is None:
            raise NotImplementedError
        return self.topic_name

    def get_consumer_name(self):
        if self.consumer_name is None:
            raise NotImplementedError
        return self.consumer_name

    def get_broker_url(self):
        broker_url = settings.BROKER_URL
        if broker_url is None:
            raise NotImplementedError
        return broker_url

    def get_zookeeper_url(self):
        zookeeper_url = settings.ZOOKEEPER_URL
        if zookeeper_url is None:
            raise NotImplementedError
        return zookeeper_url

    def get_consumer_settings(self):
        """Return the librdkafka config dict: defaults overlaid with
        ``self.consumer_settings`` via generate_client_settings()."""
        broker_url = self.get_broker_url()
        logger.debug('connecting to kafka: ' + broker_url)
        consumer_name = self.get_consumer_name()
        logger.debug('using group id: ' + consumer_name)
        initial_settings = {
            'api.version.request': True,
            'broker.version.fallback': '0.9.0',
            'client.id': 'JanglConsumer',
            'bootstrap.servers': broker_url,
            'group.id': consumer_name,
            'default.topic.config': {'auto.offset.reset': 'earliest'},
            # Commits are issued manually from commit() below.
            'enable.auto.commit': False,
            'on_commit': self.on_commit,
            'session.timeout.ms': 10000,
            'heartbeat.interval.ms': 1000,
        }
        return generate_client_settings(initial_settings, self.consumer_settings)

    def get_message_serializer(self):
        schema_registry_url = self.get_schema_registry_url()
        logger.debug('loading schema registry: ' + schema_registry_url)
        schema_client = CachedSchemaRegistryClient(url=schema_registry_url)
        return MessageSerializer(schema_client)

    def get_schema_registry_url(self):
        # Prefer a service-discovered registry; fall back to the static setting.
        schema_microservice = settings.SCHEMA_MICROSERVICE
        if schema_microservice:
            schema_registry_url = get_service_url(schema_microservice)
        else:
            schema_registry_url = settings.SCHEMA_REGISTRY_URL
        if schema_registry_url is None:
            raise NotImplementedError
        return schema_registry_url

    def set_topic(self):
        topic_name = self.get_topic_name()
        logger.debug('set kafka topic: ' + topic_name)
        self.consumer.subscribe([topic_name], on_assign=self.on_assign,
                                on_revoke=self.on_revoke)

    def on_assign(self, consumer, partitions):
        logger.debug('partitions assigned: {}'.format(partitions))
        consumer.assign(partitions)

    def on_revoke(self, consumer, partitions):
        logger.debug('partitions revoked: {}'.format(partitions))
        try:
            # FIX: the keyword was ``async``, which is a reserved word since
            # Python 3.7 (SyntaxError); confluent-kafka >= 1.0 renamed the
            # Consumer.commit() parameter to ``asynchronous``.
            consumer.commit(asynchronous=False)
        except KafkaException:
            # Best-effort flush before losing the partitions; a failed commit
            # here only means some messages may be redelivered.
            pass
        consumer.unassign()

    def on_commit(self, err, partitions):
        # librdkafka offset-commit result callback (configured in
        # get_consumer_settings via 'on_commit').
        if err is None:
            logger.debug('commit done: {}'.format(partitions))
        else:
            logger.error('commit error: {} - {}'.format(err, partitions))

    def handle(self):
        """Poll once: dispatch a decoded message, log partition EOF, raise on
        other errors, or wait when the poll times out."""
        message = self.consumer.poll(timeout=self.poll_timeout)
        if message is not None:
            if message.error():
                if message.error().code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    logger.info('%% %s [%d] reached end at offset %d\n'
                                % (message.topic(), message.partition(),
                                   message.offset()))
                elif message.error():
                    raise KafkaException(message.error())
            else:
                message = DecodedMessage(self.serializer, message)
                message = self.parse_message(message)
                self.consume_message(message)
                if self.commit_on_complete:
                    self.commit()
                self.done()
        else:
            self.wait()

    def parse_message(self, message):
        """Coerce configured fields in-place: epoch timestamps (seconds, with a
        milliseconds fallback) to aware datetimes, decimals, and booleans.
        Un-coercible values are left untouched."""
        for field in self.timestamp_fields:
            if field in message:
                try:
                    message[field] = datetime.fromtimestamp(message[field], utc)
                except ValueError:
                    # Out of range as seconds — retry assuming milliseconds.
                    try:
                        message[field] = datetime.fromtimestamp(message[field] / 1000, utc)
                    except TypeError:
                        pass
                except TypeError:
                    pass
        for field in self.decimal_fields:
            if field in message:
                try:
                    message[field] = decimal.Decimal(message[field])
                except (TypeError, decimal.InvalidOperation):
                    pass
        for field in self.boolean_fields:
            if field in message:
                try:
                    message[field] = bool(message[field])
                except TypeError:
                    pass
        return message

    def commit(self):
        # NOTE(review): this consults only the subclass override dict, not the
        # merged settings (which always force 'enable.auto.commit': False) —
        # presumably intentional, so a subclass can opt back into auto-commit.
        if not self.consumer_settings.get('enable.auto.commit'):
            # FIX: was ``async=self.async_commit`` — ``async`` is a reserved
            # keyword since Python 3.7; confluent-kafka uses ``asynchronous``.
            self.consumer.commit(asynchronous=self.async_commit)

    def consume_message(self, message):
        # Hook for subclasses; default is a no-op.
        pass
class VerifiableConsumer(VerifiableClient):
    """
    confluent-kafka-python backed VerifiableConsumer class for use with
    Kafka's kafkatests client tests.
    """

    def __init__(self, conf):
        """
        conf is a config dict passed to confluent_kafka.Consumer()
        """
        super(VerifiableConsumer, self).__init__(conf)
        # Install our commit-result callback before constructing the consumer.
        self.conf['on_commit'] = self.on_commit
        self.consumer = Consumer(**conf)
        self.consumed_msgs = 0
        self.consumed_msgs_last_reported = 0
        self.consumed_msgs_at_last_commit = 0
        self.use_auto_commit = False
        self.use_async_commit = False
        self.max_msgs = -1          # -1 == unlimited
        self.assignment = []        # list of AssignedPartition
        self.assignment_dict = dict()  # skey ('topic partition') -> AssignedPartition

    def find_assignment(self, topic, partition):
        """ Find and return existing assignment based on topic and partition,
        or None on miss. """
        skey = '%s %d' % (topic, partition)
        return self.assignment_dict.get(skey)

    def send_records_consumed(self, immediate=False):
        """ Send records_consumed, every 100 messages, on timeout,
        or if immediate is set. """
        if self.consumed_msgs <= self.consumed_msgs_last_reported + (0 if immediate else 100):
            return

        if len(self.assignment) == 0:
            return

        d = {'name': 'records_consumed',
             'count': self.consumed_msgs - self.consumed_msgs_last_reported,
             'partitions': []}

        for a in self.assignment:
            if a.min_offset == -1:
                # Skip partitions that havent had any messages since last time.
                # This is to circumvent some minOffset checks in kafkatest.
                continue
            d['partitions'].append(a.to_dict())
            # Reset so the next report only covers newly seen offsets.
            a.min_offset = -1

        self.send(d)
        self.consumed_msgs_last_reported = self.consumed_msgs

    def send_assignment(self, evtype, partitions):
        """ Send assignment update, evtype is either 'assigned' or 'revoked' """
        d = {'name': 'partitions_' + evtype,
             'partitions': [{'topic': x.topic, 'partition': x.partition}
                            for x in partitions]}
        self.send(d)

    def on_assign(self, consumer, partitions):
        """ Rebalance on_assign callback """
        old_assignment = self.assignment
        self.assignment = [AssignedPartition(p.topic, p.partition) for p in partitions]

        # FIX: rebuild the lookup dict BEFORE carrying offsets over. The
        # original code rebuilt it after the loop, so find_assignment()
        # resolved against the OLD dict and the min_offset copy below was a
        # no-op (the new AssignedPartition objects never received the old
        # offsets).
        self.assignment_dict = {a.skey: a for a in self.assignment}

        # Move over our last seen offsets so that we can report a proper
        # minOffset even after a rebalance loop.
        for a in old_assignment:
            b = self.find_assignment(a.topic, a.partition)
            # A previously held partition may not be in the new assignment.
            if b is not None:
                b.min_offset = a.min_offset

        self.send_assignment('assigned', partitions)

    def on_revoke(self, consumer, partitions):
        """ Rebalance on_revoke callback """
        # Send final consumed records prior to rebalancing to make sure
        # latest consumed is in par with what is going to be committed.
        self.send_records_consumed(immediate=True)
        self.do_commit(immediate=True, asynchronous=False)
        self.assignment = list()
        self.assignment_dict = dict()
        self.send_assignment('revoked', partitions)

    def on_commit(self, err, partitions):
        """ Offsets Committed callback """
        if err is not None and err.code() == KafkaError._NO_OFFSET:
            self.dbg('on_commit(): no offsets to commit')
            return

        # Report consumed messages to make sure consumed position >= committed position
        self.send_records_consumed(immediate=True)

        d = {'name': 'offsets_committed',
             'offsets': []}

        if err is not None:
            d['success'] = False
            d['error'] = str(err)
        else:
            d['success'] = True
            d['error'] = ''

        for p in partitions:
            pd = {'topic': p.topic, 'partition': p.partition, 'offset': p.offset}
            if p.error is not None:
                pd['error'] = str(p.error)
            d['offsets'].append(pd)

        if len(self.assignment) == 0:
            self.dbg('Not sending offsets_committed: No current assignment: would be: %s' % d)
            return

        self.send(d)

    def do_commit(self, immediate=False, asynchronous=None):
        """ Commit every 1000 messages or whenever there is a consume timeout
        or immediate. """
        if (self.use_auto_commit
                or self.consumed_msgs_at_last_commit + (0 if immediate else 1000) > self.consumed_msgs):
            return

        # Make sure we report consumption before commit,
        # otherwise tests may fail because of commit > consumed
        if self.consumed_msgs_at_last_commit < self.consumed_msgs:
            self.send_records_consumed(immediate=True)

        if asynchronous is None:
            async_mode = self.use_async_commit
        else:
            async_mode = asynchronous

        self.dbg('Committing %d messages (Async=%s)' %
                 (self.consumed_msgs - self.consumed_msgs_at_last_commit,
                  async_mode))

        retries = 3
        while True:
            try:
                self.dbg('Commit')
                offsets = self.consumer.commit(asynchronous=async_mode)
                self.dbg('Commit done: offsets %s' % offsets)

                # For a synchronous commit the on_commit callback is not
                # triggered by poll(), so report the result directly.
                if not async_mode:
                    self.on_commit(None, offsets)

                break

            except KafkaException as e:
                if e.args[0].code() == KafkaError._NO_OFFSET:
                    self.dbg('No offsets to commit')
                    break
                elif e.args[0].code() in (KafkaError.REQUEST_TIMED_OUT,
                                          KafkaError.NOT_COORDINATOR_FOR_GROUP,
                                          KafkaError._WAIT_COORD):
                    # Transient coordinator errors: retry a few times.
                    self.dbg('Commit failed: %s (%d retries)' % (str(e), retries))
                    if retries <= 0:
                        raise
                    retries -= 1
                    time.sleep(1)
                    continue
                else:
                    raise

        self.consumed_msgs_at_last_commit = self.consumed_msgs

    def msg_consume(self, msg):
        """ Handle consumed message (or error event) """
        if msg.error():
            self.err('Consume failed: %s' % msg.error(), term=False)
            return

        # Per-message read logging disabled (too verbose for kafkatest runs):
        # self.dbg('Read msg from %s [%d] @ %d' %
        #          (msg.topic(), msg.partition(), msg.offset()))

        if self.max_msgs >= 0 and self.consumed_msgs >= self.max_msgs:
            return  # ignore extra messages

        # Find assignment.
        a = self.find_assignment(msg.topic(), msg.partition())
        if a is None:
            self.err('Received message on unassigned partition %s [%d] @ %d' %
                     (msg.topic(), msg.partition(), msg.offset()), term=True)
            # FIX: guard against falling through if err(term=True) ever
            # returns instead of terminating — 'a' is None here.
            return

        a.consumed_msgs += 1
        if a.min_offset == -1:
            a.min_offset = msg.offset()
        if a.max_offset < msg.offset():
            a.max_offset = msg.offset()

        self.consumed_msgs += 1

        self.consumer.store_offsets(message=msg)
        self.send_records_consumed(immediate=False)
        self.do_commit(immediate=False)