def open_consumer(stream_host_and_port_list, topic_name, group_name):
    consumer = Consumer({'bootstrap.servers': stream_host_and_port_list,  # kafka broker
                         'group.id': group_name,                          # consumer group
                         'api.version.request': 'true'})
    consumer.subscribe([topic_name])
    return consumer
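# A minimal usage sketch for open_consumer() above (not from the original source):
# it assumes a broker at localhost:9092 and a topic named 'events' (both hypothetical),
# polls in a loop, prints message values, and always closes the consumer on exit.
from confluent_kafka import KafkaError

consumer = open_consumer('localhost:9092', 'events', 'example-group')
try:
    while True:
        msg = consumer.poll(1.0)
        if msg is None:
            continue
        if msg.error():
            # ignore end-of-partition events, report everything else
            if msg.error().code() != KafkaError._PARTITION_EOF:
                print('Consumer error: {}'.format(msg.error()))
            continue
        print(msg.value().decode('utf-8'))
finally:
    consumer.close()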
class KafkaWorkflowResultsReceiver(object):
    _requires = ['confluent-kafka']

    def __init__(self, message_converter=ProtobufWorkflowResultsConverter, current_app=None):
        import walkoff.server.workflowresults  # Need this import
        self.thread_exit = False
        kafka_config = walkoff.config.Config.WORKFLOW_RESULTS_KAFKA_CONFIG
        self.receiver = Consumer(kafka_config)
        self.topic = walkoff.config.Config.WORKFLOW_RESULTS_KAFKA_TOPIC
        self.message_converter = message_converter
        self.workflows_executed = 0

        if current_app is None:
            self.current_app = Flask(__name__)
            self.current_app.config.from_object(walkoff.config.Config)
            self.current_app.running_context = context.Context(init_all=False)
        else:
            self.current_app = current_app

    def receive_results(self):
        """Constantly receives data from the Kafka Consumer and handles it accordingly"""
        logger.info('Starting Kafka workflow results receiver')
        self.receiver.subscribe(['{}.*'.format(self.topic)])
        while not self.thread_exit:
            raw_message = self.receiver.poll(1.0)
            if raw_message is None:
                gevent.sleep(0.1)
                continue
            if raw_message.error():
                if raw_message.error().code() == KafkaError._PARTITION_EOF:
                    gevent.sleep(0.1)
                    continue
                else:
                    logger.error('Received an error in Kafka receiver: {}'.format(raw_message.error()))
                    gevent.sleep(0.1)
                    continue
            with self.current_app.app_context():
                self._send_callback(raw_message.value())
        self.receiver.close()
        return

    def _send_callback(self, message_bytes):
        event, sender, data = self.message_converter.to_event_callback(message_bytes)
        if sender is not None and event is not None:
            with self.current_app.app_context():
                event.send(sender, data=data)
            if event in [WalkoffEvent.WorkflowShutdown, WalkoffEvent.WorkflowAborted]:
                self._increment_execution_count()

    def _increment_execution_count(self):
        self.workflows_executed += 1
def test_offsets_for_times():
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})
    # Query broker for timestamps for partition
    try:
        test_topic_partition = TopicPartition("test", 0, 100)
        c.offsets_for_times([test_topic_partition], timeout=0.1)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT,
                                    KafkaError._WAIT_COORD,
                                    KafkaError.LEADER_NOT_AVAILABLE), \
            str(e.args[0])
    c.close()
class KafkaWorkflowCommunicationReceiver(object):
    """Receives communication via Kafka and sends it to the executing workflow"""
    _requires = ['confluent-kafka']

    def __init__(self, message_converter=ProtobufWorkflowCommunicationConverter):
        self._ready = False
        kafka_config = walkoff.config.Config.WORKFLOW_COMMUNICATION_KAFKA_CONFIG
        self.receiver = Consumer(kafka_config)
        self.topic = walkoff.config.Config.WORKFLOW_COMMUNICATION_KAFKA_TOPIC
        self.message_converter = message_converter
        self.exit = False

        if self.check_status():
            self._ready = True

    def shutdown(self):
        self.exit = True
        self.receiver.close()

    def receive_communications(self):
        """Constantly receives data from Kafka and handles it accordingly"""
        logger.info('Starting workflow communication receiver')
        while not self.exit:
            raw_message = self.receiver.poll(1.0)
            if raw_message is None:
                continue
            if raw_message.error():
                if raw_message.error().code() == KafkaError._PARTITION_EOF:
                    continue
                else:
                    logger.error('Received an error in Kafka receiver: {}'.format(raw_message.error()))
                    continue
            message = self.message_converter.to_received_message(raw_message.value())
            if message is not None:
                yield message
            else:
                break
        return  # end the generator (raising StopIteration here would become a RuntimeError under PEP 479)

    def is_ready(self):
        return self._ready

    def check_status(self):
        if self.receiver is not None:
            return True
        return False
def __init__(self, bootstrap_servers, consumer_group, commit_log_topic,
             synchronize_commit_group, initial_offset_reset='latest', on_commit=None):
    self.bootstrap_servers = bootstrap_servers
    self.consumer_group = consumer_group
    self.commit_log_topic = commit_log_topic
    self.synchronize_commit_group = synchronize_commit_group
    self.initial_offset_reset = self.initial_offset_reset_strategies[initial_offset_reset]

    self.__partition_state_manager = SynchronizedPartitionStateManager(
        self.__on_partition_state_change)
    self.__commit_log_consumer, self.__commit_log_consumer_stop_request = self.__start_commit_log_consumer()

    self.__positions = {}

    def commit_callback(error, partitions):
        if on_commit is not None:
            return on_commit(error, partitions)

    consumer_configuration = {
        'bootstrap.servers': self.bootstrap_servers,
        'group.id': self.consumer_group,
        'enable.auto.commit': 'false',
        'enable.auto.offset.store': 'true',
        'enable.partition.eof': 'false',
        'default.topic.config': {
            'auto.offset.reset': 'error',
        },
        'on_commit': commit_callback,
    }

    self.__consumer = Consumer(consumer_configuration)
def __init__(self, message_converter=ProtobufWorkflowCommunicationConverter):
    self._ready = False
    kafka_config = walkoff.config.Config.WORKFLOW_COMMUNICATION_KAFKA_CONFIG
    self.receiver = Consumer(kafka_config)
    self.topic = walkoff.config.Config.WORKFLOW_COMMUNICATION_KAFKA_TOPIC
    self.message_converter = message_converter
    self.exit = False

    if self.check_status():
        self._ready = True
def test_multiple_close_throw_exception():
    """ Calling Consumer.close() multiple times should throw Runtime Exception """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])
    c.unsubscribe()
    c.close()

    with pytest.raises(RuntimeError) as ex:
        c.close()
    assert 'Consumer already closed' == str(ex.value)
def analytics_internet3_logs():
    consumer = Consumer({'bootstrap.servers': kafka_hosts,
                         'group.id': 'Internet3_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    tm = time.strftime('%Y%m%d%H%M', time.localtime())
                    if Msg:
                        Msg = Msg.split()
                        if len(Msg) >= 17:
                            internet_access_minute = 'internet_access_minute_%s' % tm
                            RC.incr(internet_access_minute)
                            RC.expire(internet_access_minute, 3600)
                except Exception as e:
                    logging.error(e)
                    continue
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
                continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
def test_store_offsets():
    """ Basic store_offsets() tests """
    c = Consumer({'group.id': 'test',
                  'enable.auto.commit': True,
                  'enable.auto.offset.store': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100})

    c.subscribe(["test"])

    try:
        c.store_offsets(offsets=[TopicPartition("test", 0, 42)])
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._UNKNOWN_PARTITION

    c.unsubscribe()
    c.close()
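# The configuration in test_store_offsets() (auto commit on, auto offset store off) is the
# usual at-least-once pattern: offsets are stored manually only after a message has been
# processed, and the auto-committer then commits whatever has been stored. A minimal sketch
# of that pattern, assuming a broker at localhost:9092, a topic named 'test', and a
# hypothetical process() step (none of these come from the snippet above):
from confluent_kafka import Consumer

c = Consumer({'bootstrap.servers': 'localhost:9092',
              'group.id': 'at-least-once-demo',
              'enable.auto.commit': True,          # commits only offsets that were stored
              'enable.auto.offset.store': False})  # so store them explicitly below
c.subscribe(['test'])
try:
    while True:
        msg = c.poll(1.0)
        if msg is None or msg.error():
            continue
        process(msg)                  # hypothetical processing step
        c.store_offsets(message=msg)  # mark as done only after processing succeeded
finally:
    c.close()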
def __init__(self, conf):
    """ conf is a config dict passed to confluent_kafka.Consumer() """
    super(VerifiableConsumer, self).__init__(conf)
    self.conf['on_commit'] = self.on_commit
    self.consumer = Consumer(**conf)
    self.consumed_msgs = 0
    self.consumed_msgs_last_reported = 0
    self.consumed_msgs_at_last_commit = 0
    self.use_auto_commit = False
    self.use_async_commit = False
    self.max_msgs = -1
    self.assignment = []
    self.assignment_dict = dict()
def subscribe():
    c = Consumer({'bootstrap.servers': '0',
                  'group.id': 'test-consumer-group',
                  'default.topic.config': {'auto.offset.reset': 'smallest'}})
    c.subscribe(['neuronraindata'])
    while True:
        msg = c.poll()
        if not msg.error() and msg.value():
            # message value is bytes, so decode it for printing
            print('Received message: ', msg.value().decode("utf-8"))
        else:
            print(msg.error())
    c.close()
def consume():
    c = Consumer({'bootstrap.servers': KAFKA_SERVER,
                  'group.id': 'mygroup',
                  'default.topic.config': {'auto.offset.reset': 'smallest'}})
    c.subscribe([KAFKA_TOPIC])
    while True:
        msg = c.poll()
        if not msg.error():
            print('Received message: %s' % msg.value().decode('utf-8'))
    c.close()
def test_on_commit():
    """ Verify that on_commit is only called once per commit() (issue #71) """

    class CommitState(object):
        def __init__(self, topic, partition):
            self.topic = topic
            self.partition = partition
            self.once = True

    def commit_cb(cs, err, ps):
        print('on_commit: err %s, partitions %s' % (err, ps))
        assert cs.once is True
        assert err == KafkaError._NO_OFFSET
        assert len(ps) == 1
        p = ps[0]
        assert p.topic == cs.topic
        assert p.partition == cs.partition
        cs.once = False

    cs = CommitState('test', 2)

    c = Consumer({'group.id': 'x',
                  'enable.auto.commit': False,
                  'socket.timeout.ms': 50,
                  'session.timeout.ms': 100,
                  'on_commit': lambda err, ps: commit_cb(cs, err, ps)})

    c.assign([TopicPartition(cs.topic, cs.partition)])

    for i in range(1, 3):
        c.poll(0.1)

        if cs.once:
            # Try commit once
            try:
                c.commit(asynchronous=False)
            except KafkaException as e:
                print('commit failed with %s (expected)' % e)
                assert e.args[0].code() == KafkaError._NO_OFFSET

    c.close()
def __init__(self, message_converter=ProtobufWorkflowResultsConverter, current_app=None):
    import walkoff.server.workflowresults  # Need this import
    self.thread_exit = False
    kafka_config = walkoff.config.Config.WORKFLOW_RESULTS_KAFKA_CONFIG
    self.receiver = Consumer(kafka_config)
    self.topic = walkoff.config.Config.WORKFLOW_RESULTS_KAFKA_TOPIC
    self.message_converter = message_converter
    self.workflows_executed = 0

    if current_app is None:
        self.current_app = Flask(__name__)
        self.current_app.config.from_object(walkoff.config.Config)
        self.current_app.running_context = context.Context(init_all=False)
    else:
        self.current_app = current_app
def connect(self):
    self.consumer = Consumer({
        'bootstrap.servers': self.bootstrap_servers,
        'group.id': self.group_id,
        'default.topic.config': {
            'auto.offset.reset': 'smallest'
        }
    })
    print("subscribing to %s" % self.consumer_topic)
    self.consumer.subscribe([self.consumer_topic])
    print("Subscribed to topic %s " % self.consumer_topic)

    self.producer = Producer({
        'bootstrap.servers': self.bootstrap_servers,
        'group.id': self.group_id
    })
async def consume_events(topic, group, brokers, callback, schema=None, registry=None,
                         delay=0.01, **kwargs):
    """
    Connect to the Kafka endpoint and start consuming
    messages from the given `topic`.
    The given callback is applied on each message.
    """
    global consumer
    if topic in consumers:
        raise RuntimeError("A consumer already exists for topic: %s" % topic)

    if (not registry_serializer or not registry_client) and registry:
        r_client, serializer = create_registry_client(registry)

    consumer = Consumer({'bootstrap.servers': brokers,
                         'group.id': group,
                         'default.topic.config': {'auto.offset.reset': 'largest'}})
    consumer.subscribe([topic])
    consumers[topic] = consumer

    try:
        while True:
            message = consumer.poll(1)
            if message:
                if not message.error():
                    if registry:
                        message = serializer.decode_message(message.value())
                    else:
                        message = message.value()
                    await callback(message)
                    consumer.commit()
            else:
                await asyncio.sleep(delay)
    except KafkaException as ex:
        pass
    else:
        consumer.close()
    finally:
        consumers.pop(topic, None)
def analytics_intranet_logs():
    consumer = Consumer({'bootstrap.servers': kafka_hosts,
                         'group.id': 'Intranet_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy2_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    tt = time.strftime('%Y%m%d', time.localtime())
                    th = time.strftime('%Y%m%d%H', time.localtime())
                    tm = time.strftime('%Y%m%d%H%M', time.localtime())
                    H_key = 'haproxy2_topic_%s' % tt
                    top2_url_hour = 'top2_url_hour_%s' % th
                    top2_url_minute = 'top2_url_minute_%s' % tm
                    if len(Msg.split()) >= 17:
                        val = Msg.split('{')
                        if len(val) >= 2:
                            Topic = val[1].split('}')[0]
                            Rtime = val[0].split()[8]
                            Rtime = int(Rtime.split('/')[4])
                            if ':' in Topic:
                                Topic = str(Topic.split(':')[0])
                            if '|' in Topic:
                                Topic = str(Topic.split('|')[0])
                            if '.baihe.com' in Topic:
                                Key = 'haproxy2_logs_%s_%s' % (tt, Topic)
                                Rt_Key = 'Rtime2_%s_%s' % (tt, Topic)
                                # URL (endpoint)
                                PATH = str(Msg.split()[17]).split('?')[0]
                                URL = 'http://%s%s' % (Topic, PATH)
                                RC.zincrby(top2_url_hour, URL, 1)
                                RC.zincrby(top2_url_minute, URL, 1)
                                for KEY in (H_key, Key, Rt_Key, top2_url_hour, top2_url_minute):
                                    RC.expire(KEY, 3600)
                                RC.sadd(H_key, Topic)
                                RC.incr(Key)
                                if Rtime:
                                    RC.lpush(Rt_Key, Rtime)
                except Exception as e:
                    logging.error(e)
                    continue
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
                continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
def analytics_internet_logs():
    consumer = Consumer({'bootstrap.servers': kafka_hosts,
                         'group.id': 'Internet_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    tt = time.strftime('%Y%m%d', time.localtime())
                    th = time.strftime('%Y%m%d%H', time.localtime())
                    pv_key = 'baihe_pv_%s' % tt
                    if Msg:
                        Msg = Msg.split()
                        RC.incr(pv_key)
                        if len(Msg) >= 17:
                            Topic = str(Msg[14]).split('|')[0].replace('{', '').strip()
                            IP = str(Msg[5])
                            H_key = 'haproxy_topic_%s' % tt
                            top_ip = 'top_ip_%s' % tt
                            top_ip_hour = 'top_ip_%s' % th
                            top_url_hour = 'top_url_%s' % th
                            PATH = str(Msg[16]).split('?')[0]
                            URL = 'http://%s%s' % (Topic, PATH)
                            Ha_Key = 'haproxy_logs_%s_%s' % (tt, Topic)
                            top_ip_domain = 'top_%s_domain_%s' % (IP, tt)
                            top_ip_domain_hour = 'top_%s_domain_%s' % (IP, th)
                            for KEY in (H_key, pv_key, top_ip, top_url_hour, top_ip_hour,
                                        Ha_Key, top_ip_domain, top_ip_domain_hour):
                                RC.expire(KEY, 3600)
                            RC.sadd(H_key, Topic)
                            RC.incr(Ha_Key)
                            # ip
                            RC.zincrby(top_ip, IP, 1)
                            RC.zincrby(top_ip_hour, IP, 1)
                            # ip -> URL
                            RC.zincrby(top_ip_domain, URL, 1)
                            RC.zincrby(top_ip_domain_hour, URL, 1)
                            # URL
                            RC.zincrby(top_url_hour, URL, 1)
                except Exception:
                    continue
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
                continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
def WAF_logs():
    consumer = Consumer({'bootstrap.servers': kafka_hosts,
                         'group.id': 'Waf_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['haproxy_logs'])
    try:
        while True:
            msg = consumer.poll()
            if not msg.error():
                Msg = msg.value().decode('utf-8').strip()
                try:
                    tm = time.strftime('%Y%m%d%H%M', time.localtime())
                    if Msg:
                        Msg = Msg.split()
                        if len(Msg) >= 17:
                            url_code = Msg[9]
                            Topic = str(Msg[14]).split('|')[0].replace('{', '').strip()
                            IP = str(Msg[5])
                            if url_code in ('200', '206', '301', '302', '304', '404'):
                                top_ip_minute = 'top_ip_%s' % tm
                                top_url_minute = 'top_url_%s' % tm
                                PATH = str(Msg[16]).split('?')[0]
                                URL = 'http://%s%s' % (Topic, PATH)
                                top_ip_domain_minute = 'top_%s_domain_%s' % (IP, tm)
                                top_url_ip_minute = 'top_%s_ip_%s' % (URL, tm)
                                # ip
                                RC.zincrby(top_ip_minute, IP, 1)
                                RC.expire(top_ip_minute, 300)
                                # ip -> URL
                                RC.zincrby(top_ip_domain_minute, URL, 1)
                                RC.expire(top_ip_domain_minute, 300)
                                # URL
                                RC.zincrby(top_url_minute, URL, 1)
                                RC.expire(top_url_minute, 300)
                                # URL -> ip
                                RC.zincrby(top_url_ip_minute, IP, 1)
                                RC.expire(top_url_ip_minute, 300)
                except Exception as e:
                    logging.error(e)
                    continue
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                logging.error(msg.error())
                continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
# corresponding to register / deregister event
insertRegisterDeregisterMsg(msg)

settings = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'mygroup',
    'client.id': 'client-1',
    'enable.auto.commit': True,
    'session.timeout.ms': 6000,
    'default.topic.config': {
        'auto.offset.reset': 'smallest'
    }
}

c = Consumer(settings)
c.subscribe([vehicleConstants.KAFKA_TOPIC_VEHICLE_REGISTER_DEREGISTER])

try:
    while True:
        msg = c.poll(0.1)
        if msg is None:
            continue
        elif not msg.error():
            print('Received message: {0}'.format(msg.value()))
        elif msg.error().code() == KafkaError._PARTITION_EOF:
            print('End of partition reached {0}/{1}'.format(
                msg.topic(), msg.partition()))
        else:
            print('Error occurred: {0}'.format(msg.error().str()))
            print(msg)
finally:
    # clean up the consumer
    c.close()
def ConsumeMessages(self): c = Consumer({ 'bootstrap.servers': 'localhost:9092', 'group.id': 'mygroup2', 'auto.offset.reset': 'largest' }) c.subscribe(['test']) self.counter = 0 if self.counter == 0: self.now = math.floor(time.time() - (time.time() % 10) + 1) self.end = self.now + 5 self.counter += 1 while True: msg = c.poll(0.1) if msg is None: continue if msg.error(): print("Consumer error: {}".format(msg.error())) continue self.data = literal_eval(msg.value().decode('utf-8')) if math.floor(self.data['timestamp']) < self.now: pass if (math.floor(self.data['timestamp']) >= self.now) and ( math.floor(self.data['timestamp']) < self.end): self.dict_ttl.append_values(self.data['sensor'], self.data["data"]) if math.floor(self.data['timestamp']) >= self.end: print('\nLength of dict is {}'.format(len(list( self.dict_ttl)))) if len(list(self.dict_ttl)) == 0: self.now = self.now + 5 self.end = self.end + 5 continue for k, v in self.dict_ttl.ttl_items(): self.df = self.df.append( { 'timestamp': self.now, 'data': v[0][1], 'sensor': k }, ignore_index=True) if self.df.empty: print('empty df') continue print('sum of items is {}'.format( str(self.df['data'].apply(lambda x: sum(x)).tolist()[0]))) self.total = self.df['data'].apply( lambda x: sum(x)).tolist()[0] if int(self.total) < 7000: print('This row would be discarded') self.df['data'] = self.df['data'].apply( lambda x: sum(x) / len(x)) self.df = self.df.groupby(['timestamp', 'sensor' ]).agg('mean').reset_index() print( "Mean of all items for this sensor within window {},{} is " .format(self.now, self.end)) print(self.df) print('\n---------------------------') self.df = pd.DataFrame() self.dict_ttl = DefaultDictTTL(5) self.now = self.now + 5 self.end = self.end + 5 c.close()
class KafkaConsumer: """Defines the base kafka consumer class""" def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest # # # TODO: Configure the broker properties below. Make sure to reference the project README # and use the Host URL for Kafka and Schema Registry! # # self.broker_properties = { "bootstrap.servers": ",".join(["PLAINTEXT://localhost:9092"]), "group.id": f"{topic_name_pattern}", "default.topic.config": { "auto.offset.reset": "earliest" }, } # TODO: Create the Consumer, using the appropriate type. if is_avro is True: # TODO: Make sure to set schema registry self.broker_properties[ "schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) else: self.consumer = Consumer(self.broker_properties) # # # TODO: Configure the AvroConsumer and subscribe to the topics. Make sure to think about # how the `on_assign` callback should be invoked. # # # self.consumer.subscribe( TODO ) self.consumer.subscribe([self.topic_name_pattern], on_assign=self.on_assign) def on_assign(self, consumer, partitions): """Callback for when topic assignment takes place""" # TODO: If the topic is configured to use `offset_earliest` set the partition offset to # the beginning or earliest for partition in partitions: if self.offset_earliest is True: partition.offset = confluent_kafka.OFFSET_BEGINNING logger.info("partitions assigned for {self.topic_name_pattern}") consumer.assign(partitions) async def consume(self): """Asynchronously consumes data from kafka topic""" while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): """Polls for a message. Returns 1 if a message was received, 0 otherwise""" # # # TODO: Poll Kafka for messages. Make sure to handle any errors or exceptions. # Additionally, make sure you return 1 when a message is processed, and 0 when no message # is retrieved. # # logger.debug(f"consuming from topic pattern {self.topic_name_pattern}") try: message = self.consumer.poll(timeout=self.consume_timeout) except SerializerError as e: logger.error( f"failed to deserialize message {self.topic_name_pattern}: {e}" ) return 0 if message is None: logger.debug("no messages to be consumed") return 0 elif message.error() is not None: logger.error( f"failed to consume message {self.topic_name_pattern}: {message.error()}" ) return 0 logger.debug(f"message received: ({message.key()}) {message.value()}") self.message_handler(message) return 1 def close(self): """Cleans up any open kafka consumers""" # # # TODO: Cleanup the kafka consumer # # logger.debug("closing consumer...") self.consumer.close()
class ConsumerServer:

    def __init__(
        self,
        topic_name_pattern=TOPIC,
        broker_urls=BROADCAST_URL,
        message_handler=handle_json_message,
        offset_earliest=False,
        sleep_secs=1.0,
        consume_timeout=0.1,
    ):
        """Creates a consumer object for asynchronous use"""
        self.topic_name_pattern = topic_name_pattern
        self.message_handler = message_handler
        self.sleep_secs = sleep_secs
        self.consume_timeout = consume_timeout
        self.offset_earliest = offset_earliest

        self.broker_properties = {
            'bootstrap.servers': broker_urls,
            'group.id': '0'
        }

        self.consumer = Consumer(self.broker_properties)
        self.consumer.subscribe([self.topic_name_pattern], on_assign=self.on_assign)

    def on_assign(self, consumer, partitions):
        """Callback for when topic assignment takes place"""
        for partition in partitions:
            if self.offset_earliest:
                partition.offset = OFFSET_BEGINNING

        logger.info("partitions assigned for %s", self.topic_name_pattern)
        consumer.assign(partitions)

    async def consume(self):
        """Asynchronously consumes data from kafka topic"""
        while True:
            time.sleep(1)
            num_results = 1
            while num_results > 0:
                num_results = self._consume()
            await gen.sleep(self.sleep_secs)

    def _consume(self):
        """Polls for a message. Returns 1 if a message was received, 0 otherwise"""
        message = self.consumer.poll(timeout=self.consume_timeout)
        if message is None:
            logger.info("no message received for pattern %s", self.topic_name_pattern)
            return 0
        elif message.error():
            logger.error("error - failed to consume data")
            return 0
        else:
            self.message_handler(message)
            return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        logger.info("Shutdown consumer")
        self.consumer.close()
class KafkaConsumer: """Defines the base kafka consumer class""" def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest # # # TODO: Configure the broker properties below. Make sure to reference the project README # and use the Host URL for Kafka and Schema Registry! # # #self.broker_properties = { # # # # TODO # # # "kafka" : "PLAINTEXT://localhost:9092", # "schema_registry" : "http://localhost:8081" #} self.broker_properties = { "bootstrap.servers": "localhost:9092", #"bootstrap.servers": "PLAINTEXT://localhost:9092", "group.id": "udacity", "auto.offset.reset": "earliest" if offset_earliest else "latest" } # TODO: Create the Consumer, using the appropriate type. if is_avro is True: self.broker_properties[ "schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) #self.consumer = AvroConsumer( # { # "bootstrap.servers": self.broker_properties["kafka"], # "schema.registry.url": self.broker_properties["schema_registry"], # "group.id": "0", # "auto.offset.reset": "earliest" # self.broker_properties["schema.registry.url"] = "http://localhost:8081" # # } #) logger.info("__init__ - AvroConsumer was created") else: self.consumer = Consumer(self.broker_properties) #self.consumer = Consumer( # { # "bootstrap.servers": self.broker_properties["kafka"], # "group.id": "0", # "auto.offset.reset": "earliest" # } #) #pass logger.info("__init__ - Consumer was created") # # # TODO: Configure the AvroConsumer and subscribe to the topics. Make sure to think about # how the `on_assign` callback should be invoked. # # logger.info("Consumer will subscribe - %s", self.topic_name_pattern) self.consumer.subscribe([self.topic_name_pattern], on_assign=self.on_assign) def on_assign(self, consumer, partitions): """Callback for when topic assignment takes place""" # TODO: If the topic is configured to use `offset_earliest` set the partition offset to # the beginning or earliest #logger.info("on_assign is incomplete - skipping") logger.info("on_assign - self.topic_name_pattern: %s", self.topic_name_pattern) logger.info("on_assign - partitions: %s", partitions) logger.info("on_assign - self.consumer: %s", self.consumer) #for partition in partitions: # pass # # # # # # TODO # # # # for partition in partitions: logger.info("on_assign - partition: %s", partition) partition.offset = OFFSET_BEGINNING logger.info("BEFORE partitions assigned for %s", self.topic_name_pattern) consumer.assign(partitions) logger.info("AFTER partitions assigned for %s", self.topic_name_pattern) async def consume(self): """Asynchronously consumes data from kafka topic""" while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): """Polls for a message. Returns 1 if a message was received, 0 otherwise""" # # # TODO: Poll Kafka for messages. Make sure to handle any errors or exceptions. # Additionally, make sure you return 1 when a message is processed, and 0 when no message # is retrieved. 
        # logger.info("_consume is incomplete - skipping")
        # return 0
        message = self.consumer.poll(1.0)
        if message is None:
            logger.info("no message received by consumer: %s", self.topic_name_pattern)
            return 0
        elif message.error() is not None:
            logger.info(f"error from consumer {message.error()}")
            return 0
        else:
            logger.info(f"consumed message {message.key()}: {message.value()}")
            self.message_handler(message)
            return 1

    def close(self):
        """Cleans up any open kafka consumers"""
        #
        # TODO: Cleanup the kafka consumer
        #
        self.consumer.close()
class KafkaConsumer: """Defines the base kafka consumer class""" def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest self.broker_properties = { "bootstrap.servers": ",".join(["PLAINTEXT://localhost:9092"]), "group.id": f"{topic_name_pattern}", "default.topic.config": { "auto.offset.reset": "earliest" }, } # TODO: Create the Consumer, using the appropriate type. if is_avro is True: self.broker_properties[ "schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) else: self.consumer = Consumer(self.broker_properties) self.consumer.subscribe([self.topic_name_pattern], on_assign=self.on_assign) def on_assign(self, consumer, partitions): """Callback for when topic assignment takes place""" for partition in partitions: if self.offset_earliest is True: partition.offset = confluent_kafka.OFFSET_BEGINNING logger.info("partitions assigned for %s", self.topic_name_pattern) consumer.assign(partitions) async def consume(self): """Asynchronously consumes data from kafka topic""" while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): """Polls for a message. Returns 1 if a message was received, 0 otherwise""" try: message = self.consumer.poll(timeout=self.consume_timeout) except SerializerError as e: return 0 if message is None: return 0 elif message.error() is not None: return 0 self.message_handler(message) return 1 def close(self): """Cleans up any open kafka consumers""" logger.debug("Cleaning up the kafka consumer") self.consumer.close()
def _connect(self):
    connection = {'bootstrap.servers': self.host + ":" + self.port,
                  'group.id': self.group,
                  'session.timeout.ms': 6000,
                  'default.topic.config': {'auto.offset.reset': 'largest'}}
    logging.info("Connecting to Kafka at %s...", connection)
    self.consumer = Consumer(**connection)
    self.consumer.subscribe(self.topic, on_assign=self.on_assign)
class QuerySubscriptionConsumer: """ A Kafka consumer that processes query subscription update messages. Each message has a related subscription id and the latest values related to the subscribed query. These values are passed along to a callback associated with the subscription. """ topic_to_dataset: Dict[str, QueryDatasets] = { settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS: QueryDatasets.EVENTS, settings.KAFKA_TRANSACTIONS_SUBSCRIPTIONS_RESULTS: QueryDatasets.TRANSACTIONS, } def __init__( self, group_id: str, topic: Optional[str] = None, commit_batch_size: int = 100, initial_offset_reset: str = "earliest", force_offset_reset: Optional[str] = None, ): self.group_id = group_id if not topic: # TODO(typing): Need a way to get the actual value of settings to avoid this topic = cast(str, settings.KAFKA_EVENTS_SUBSCRIPTIONS_RESULTS) self.topic = topic cluster_name: str = settings.KAFKA_TOPICS[topic]["cluster"] self.commit_batch_size = commit_batch_size self.initial_offset_reset = initial_offset_reset self.offsets: Dict[int, Optional[int]] = {} self.consumer: Consumer = None self.cluster_options = kafka_config.get_kafka_consumer_cluster_options( cluster_name, { "group.id": self.group_id, "session.timeout.ms": 6000, "auto.offset.reset": self.initial_offset_reset, "enable.auto.commit": "false", "enable.auto.offset.store": "false", "enable.partition.eof": "false", "default.topic.config": { "auto.offset.reset": self.initial_offset_reset }, }, ) self.admin_cluster_options = kafka_config.get_kafka_admin_cluster_options( cluster_name, {"allow.auto.create.topics": "true"}) self.resolve_partition_force_offset = self.offset_reset_name_to_func( force_offset_reset) self.__shutdown_requested = False def offset_reset_name_to_func( self, offset_reset: Optional[str] ) -> Optional[Callable[[TopicPartition], TopicPartition]]: if offset_reset in {"smallest", "earliest", "beginning"}: return self.resolve_partition_offset_earliest elif offset_reset in {"largest", "latest", "end"}: return self.resolve_partition_offset_latest return None def resolve_partition_offset_earliest( self, partition: TopicPartition) -> TopicPartition: low, high = self.consumer.get_watermark_offsets(partition) return TopicPartition(partition.topic, partition.partition, low) def resolve_partition_offset_latest( self, partition: TopicPartition) -> TopicPartition: low, high = self.consumer.get_watermark_offsets(partition) return TopicPartition(partition.topic, partition.partition, high) def run(self) -> None: logger.debug("Starting snuba query subscriber") self.offsets.clear() def on_assign(consumer: Consumer, partitions: List[TopicPartition]) -> None: updated_partitions: List[TopicPartition] = [] for partition in partitions: if self.resolve_partition_force_offset: partition = self.resolve_partition_force_offset(partition) updated_partitions.append(partition) if partition.offset == OFFSET_INVALID: updated_offset = None else: updated_offset = partition.offset self.offsets[partition.partition] = updated_offset if updated_partitions: self.consumer.assign(updated_partitions) logger.info( "query-subscription-consumer.on_assign", extra={ "offsets": str(self.offsets), "partitions": str(partitions), }, ) def on_revoke(consumer: Consumer, partitions: List[TopicPartition]) -> None: partition_numbers = [ partition.partition for partition in partitions ] self.commit_offsets(partition_numbers) for partition_number in partition_numbers: self.offsets.pop(partition_number, None) logger.info( "query-subscription-consumer.on_revoke", extra={ "offsets": 
str(self.offsets), "partitions": str(partitions), }, ) self.consumer = Consumer(self.cluster_options) self.__shutdown_requested = False if settings.KAFKA_CONSUMER_AUTO_CREATE_TOPICS: # This is required for confluent-kafka>=1.5.0, otherwise the topics will # not be automatically created. admin_client = AdminClient(self.admin_cluster_options) wait_for_topics(admin_client, [self.topic]) self.consumer.subscribe([self.topic], on_assign=on_assign, on_revoke=on_revoke) i = 0 while not self.__shutdown_requested: message = self.consumer.poll(0.1) if message is None: continue error = message.error() if error is not None: raise KafkaException(error) i = i + 1 with sentry_sdk.start_transaction( op="handle_message", name="query_subscription_consumer_process_message", sampled=random() <= options.get("subscriptions-query.sample-rate"), ), metrics.timer("snuba_query_subscriber.handle_message"): self.handle_message(message) # Track latest completed message here, for use in `shutdown` handler. self.offsets[message.partition()] = message.offset() + 1 if i % self.commit_batch_size == 0: logger.debug("Committing offsets") self.commit_offsets() logger.debug("Committing offsets and closing consumer") self.commit_offsets() self.consumer.close() def commit_offsets(self, partitions: Optional[Iterable[int]] = None) -> None: logger.info( "query-subscription-consumer.commit_offsets", extra={ "offsets": str(self.offsets), "partitions": str(partitions) }, ) if self.offsets and self.consumer: if partitions is None: partitions = self.offsets.keys() to_commit = [] for partition in partitions: offset = self.offsets.get(partition) if offset is None: # Skip partitions that have no offset continue to_commit.append(TopicPartition(self.topic, partition, offset)) self.consumer.commit(offsets=to_commit) def shutdown(self) -> None: self.__shutdown_requested = True def handle_message(self, message: Message) -> None: """ Parses the value from Kafka, and if valid passes the payload to the callback defined by the subscription. If the subscription has been removed, or no longer has a valid callback then just log metrics/errors and continue. 
:param message: :return: """ with sentry_sdk.push_scope() as scope: try: with metrics.timer( "snuba_query_subscriber.parse_message_value"): contents = self.parse_message_value(message.value()) except InvalidMessageError: # If the message is in an invalid format, just log the error # and continue logger.exception( "Subscription update could not be parsed", extra={ "offset": message.offset(), "partition": message.partition(), "value": message.value(), }, ) return scope.set_tag("query_subscription_id", contents["subscription_id"]) try: with metrics.timer( "snuba_query_subscriber.fetch_subscription"): subscription: QuerySubscription = QuerySubscription.objects.get_from_cache( subscription_id=contents["subscription_id"]) if subscription.status != QuerySubscription.Status.ACTIVE.value: metrics.incr( "snuba_query_subscriber.subscription_inactive") return except QuerySubscription.DoesNotExist: metrics.incr( "snuba_query_subscriber.subscription_doesnt_exist") logger.error( "Received subscription update, but subscription does not exist", extra={ "offset": message.offset(), "partition": message.partition(), "value": message.value(), }, ) try: _delete_from_snuba(self.topic_to_dataset[message.topic()], contents["subscription_id"]) except Exception: logger.exception( "Failed to delete unused subscription from snuba.") return if subscription.type not in subscriber_registry: metrics.incr( "snuba_query_subscriber.subscription_type_not_registered") logger.error( "Received subscription update, but no subscription handler registered", extra={ "offset": message.offset(), "partition": message.partition(), "value": message.value(), }, ) return sentry_sdk.set_tag("project_id", subscription.project_id) sentry_sdk.set_tag("query_subscription_id", contents["subscription_id"]) callback = subscriber_registry[subscription.type] with sentry_sdk.start_span( op="process_message") as span, metrics.timer( "snuba_query_subscriber.callback.duration", instance=subscription.type): span.set_data("payload", contents) span.set_data("subscription_dataset", subscription.snuba_query.dataset) span.set_data("subscription_query", subscription.snuba_query.query) span.set_data("subscription_aggregation", subscription.snuba_query.aggregate) span.set_data("subscription_time_window", subscription.snuba_query.time_window) span.set_data("subscription_resolution", subscription.snuba_query.resolution) span.set_data("message_offset", message.offset()) span.set_data("message_partition", message.partition()) span.set_data("message_value", message.value()) callback(contents, subscription) def parse_message_value(self, value: str) -> Dict[str, Any]: """ Parses the value received via the Kafka consumer and verifies that it matches the expected schema. 
:param value: A json formatted string :return: A dict with the parsed message """ with metrics.timer( "snuba_query_subscriber.parse_message_value.json_parse"): wrapper: Dict[str, Any] = json.loads(value) with metrics.timer( "snuba_query_subscriber.parse_message_value.json_validate_wrapper" ): try: jsonschema.validate(wrapper, SUBSCRIPTION_WRAPPER_SCHEMA) except jsonschema.ValidationError: metrics.incr("snuba_query_subscriber.message_wrapper_invalid") raise InvalidSchemaError( "Message wrapper does not match schema") schema_version: int = wrapper["version"] if schema_version not in SUBSCRIPTION_PAYLOAD_VERSIONS: metrics.incr( "snuba_query_subscriber.message_wrapper_invalid_version") raise InvalidMessageError( "Version specified in wrapper has no schema") payload: Dict[str, Any] = wrapper["payload"] with metrics.timer( "snuba_query_subscriber.parse_message_value.json_validate_payload" ): try: jsonschema.validate( payload, SUBSCRIPTION_PAYLOAD_VERSIONS[schema_version]) except jsonschema.ValidationError: metrics.incr("snuba_query_subscriber.message_payload_invalid") raise InvalidSchemaError( "Message payload does not match schema") # XXX: Since we just return the raw dict here, when the payload changes it'll # break things. This should convert the payload into a class rather than passing # the dict around, but until we get time to refactor we can keep things working # here. payload.setdefault("values", payload.get("result")) payload["timestamp"] = parse_date( payload["timestamp"]).replace(tzinfo=pytz.utc) return payload
def __init__(self, host: str, port: int, topic_name: str, max_polling_timeout: float = 0.001, **kwargs): """ Init Kafka RPCClient. Not like the most of the RPC protocols, Only one KRPCClient can run on a single Kafka topic. If you insist using multiple KRPCClient instances, redis must be used, pass argument use_redis=True. Args: host: kafka broker host port: kafka broker port topic_name: kafka topic_name, if topic exists, the existing topic will be used, create a new topic otherwise. max_polling_timeout: maximum time(seconds) to block waiting for message, event or callback. encrypt: default None, if not None, will encrypt the message with the given password. It will slow down performance. verify: default False, if True, will verify the message with the given sha3 checksum from the headers. use_redis: default False, if True, use redis as cache, built-in QueueDict instead. """ bootstrap_servers = '{}:{}'.format(host, port) self.topic_name = topic_name self.server_topic = 'krpc_{}_server'.format(topic_name) self.client_topic = 'krpc_{}_client'.format(topic_name) # set max_polling_timeout assert max_polling_timeout > 0, 'max_polling_timeout must be greater than 0' self.max_polling_timeout = max_polling_timeout self.consumer = Consumer({ 'bootstrap.servers': bootstrap_servers, 'group.id': 'krpc', 'auto.offset.reset': 'earliest', 'auto.commit.interval.ms': 1000 }) self.producer = Producer({ 'bootstrap.servers': bootstrap_servers, 'on_delivery': self.delivery_report, }) # add redis cache, for temporarily storage of returned data self.use_redis = kwargs.get('use_redis', False) self.expire_time = kwargs.get('expire_time', 600) if self.use_redis: import redis redis_port = kwargs.get('redis_port', 6379) redis_db = kwargs.get('redis_db', 0) redis_password = kwargs.get('redis_password', None) self.cache = redis.Redis(host, redis_port, redis_db, redis_password) self.cache_channel = self.cache.pubsub() else: self.cache = QueueDict(maxlen=2048, expire=self.expire_time) self.consumer.subscribe([self.client_topic]) # set msgpack packer & unpacker self.packer = msgpack.Packer(use_bin_type=True) self.unpacker = msgpack.Unpacker(use_list=False, raw=False) self.verify = kwargs.get('verify', False) self.verification_method = kwargs.get('verification', 'crc32') if self.verification_method == 'crc32': self.verification_method = lambda x: hex(zlib.crc32(x)).encode() elif isinstance(self.verification_method, Callable): self.verification_method = self.verification_method else: raise AssertionError('not supported verification function.') self.encrypt = kwargs.get('encrypt', None) if self.encrypt is not None: self.encrypt = AESEncryption(self.encrypt, encrypt_length=16) self.is_closed = False # coroutine pool self.pool = ThreadAsyncPoolExecutor(pool_size=1) self.pool.submit(self.wait_forever) # handshake, if's ok not to handshake, but the first rpc would be slow. if kwargs.get('handshake', True): self.handshaked = False self.producer.produce(self.server_topic, b'handshake', b'handshake', headers={'checksum': None}) self.producer.poll(0.0) logger.info('sending handshake') while True: if self.handshaked: break time.sleep(1) # acknowledge, disable ack will double the speed, but not exactly safe. self.ack = kwargs.get('ack', False)
if __name__ == '__main__':
    # Step 1. Configure the connection to the Kafka cluster
    # Consumer configuration
    # See https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
    props = {
        'bootstrap.servers': 'localhost:9092',   # where is the Kafka cluster? (replace with your brokers)
        'group.id': 'tdea',                      # consumer group name (replace with your student ID)
        'auto.offset.reset': 'earliest',         # start from offsets this group has not yet read
        'enable.auto.commit': True,              # enable auto commit
        'auto.commit.interval.ms': 5000,         # auto-commit interval
        'on_commit': print_commit_result,        # callback for commit results
        'error_cb': error_cb                     # callback for errors
    }

    # Step 2. Create a Kafka Consumer instance
    consumer = Consumer(props)

    # Step 3. The topic to subscribe to
    topicName = 'ak03.test'

    # Step 4. Subscribe the consumer to the topic
    consumer.subscribe([topicName])

    # Step 5. Keep pulling new messages from Kafka
    try:
        while True:
            # Ask Kafka for new messages (batch read)
            records = consumer.consume(num_messages=500, timeout=1.0)
            if records is None:
                continue

            for record in records:
                # Check whether an error occurred
class KRPCClient: def __init__(self, host: str, port: int, topic_name: str, max_polling_timeout: float = 0.001, **kwargs): """ Init Kafka RPCClient. Not like the most of the RPC protocols, Only one KRPCClient can run on a single Kafka topic. If you insist using multiple KRPCClient instances, redis must be used, pass argument use_redis=True. Args: host: kafka broker host port: kafka broker port topic_name: kafka topic_name, if topic exists, the existing topic will be used, create a new topic otherwise. max_polling_timeout: maximum time(seconds) to block waiting for message, event or callback. encrypt: default None, if not None, will encrypt the message with the given password. It will slow down performance. verify: default False, if True, will verify the message with the given sha3 checksum from the headers. use_redis: default False, if True, use redis as cache, built-in QueueDict instead. """ bootstrap_servers = '{}:{}'.format(host, port) self.topic_name = topic_name self.server_topic = 'krpc_{}_server'.format(topic_name) self.client_topic = 'krpc_{}_client'.format(topic_name) # set max_polling_timeout assert max_polling_timeout > 0, 'max_polling_timeout must be greater than 0' self.max_polling_timeout = max_polling_timeout self.consumer = Consumer({ 'bootstrap.servers': bootstrap_servers, 'group.id': 'krpc', 'auto.offset.reset': 'earliest', 'auto.commit.interval.ms': 1000 }) self.producer = Producer({ 'bootstrap.servers': bootstrap_servers, 'on_delivery': self.delivery_report, }) # add redis cache, for temporarily storage of returned data self.use_redis = kwargs.get('use_redis', False) self.expire_time = kwargs.get('expire_time', 600) if self.use_redis: import redis redis_port = kwargs.get('redis_port', 6379) redis_db = kwargs.get('redis_db', 0) redis_password = kwargs.get('redis_password', None) self.cache = redis.Redis(host, redis_port, redis_db, redis_password) self.cache_channel = self.cache.pubsub() else: self.cache = QueueDict(maxlen=2048, expire=self.expire_time) self.consumer.subscribe([self.client_topic]) # set msgpack packer & unpacker self.packer = msgpack.Packer(use_bin_type=True) self.unpacker = msgpack.Unpacker(use_list=False, raw=False) self.verify = kwargs.get('verify', False) self.verification_method = kwargs.get('verification', 'crc32') if self.verification_method == 'crc32': self.verification_method = lambda x: hex(zlib.crc32(x)).encode() elif isinstance(self.verification_method, Callable): self.verification_method = self.verification_method else: raise AssertionError('not supported verification function.') self.encrypt = kwargs.get('encrypt', None) if self.encrypt is not None: self.encrypt = AESEncryption(self.encrypt, encrypt_length=16) self.is_closed = False # coroutine pool self.pool = ThreadAsyncPoolExecutor(pool_size=1) self.pool.submit(self.wait_forever) # handshake, if's ok not to handshake, but the first rpc would be slow. if kwargs.get('handshake', True): self.handshaked = False self.producer.produce(self.server_topic, b'handshake', b'handshake', headers={'checksum': None}) self.producer.poll(0.0) logger.info('sending handshake') while True: if self.handshaked: break time.sleep(1) # acknowledge, disable ack will double the speed, but not exactly safe. 
self.ack = kwargs.get('ack', False) @staticmethod def delivery_report(err, msg): if err is not None: logger.error('request failed: {}'.format(err)) else: logger.info('request sent to {} [{}]'.format( msg.topic(), msg.partition())) def parse_response(self, msg_value): try: self.unpacker.feed(msg_value) res = next(self.unpacker) except Exception as e: logger.exception(e) res = None return res def call(self, method_name, *args, **kwargs): # rpc call timeout # WARNING: if the rpc method has an argument named timeout, it will be not be passed. timeout = kwargs.pop('timeout', 10) start_time = time.time() # send request back to server req = {'method_name': method_name, 'args': args, 'kwargs': kwargs} req = self.packer.pack(req) if self.encrypt: req = self.encrypt.encrypt(req) if self.verify: checksum = self.verification_method(req) else: checksum = None task_id = uuid.uuid4().hex self.producer.produce(self.server_topic, req, task_id, headers={'checksum': checksum}) # waiting for response from server sync/async res = self.poll_result_from_redis_cache(task_id, timeout) if self.ack: self.producer.poll(0.0) # do something to the response ret = res['ret'] tact_time_server = res['tact_time'] server_id = res['server_id'] end_time = time.time() return { 'ret': ret, 'tact_time': end_time - start_time, 'tact_time_server': tact_time_server, 'server_id': server_id } def wait_forever(self): while True: if self.is_closed: logger.info('user exit') break try: msg = self.consumer.poll(self.max_polling_timeout) if msg is None: continue if msg.error(): logger.error("consumer error: {}".format(msg.error())) continue task_id = msg.key( ) # an uuid, the only id that pairs the request and the response if task_id == b'handshake': logger.info('handshake succeeded.') self.handshaked = True continue res = msg.value() headers = msg.headers() checksum = headers[0][1] if self.verify: signature = self.verification_method(res) if checksum != signature: logger.error( 'checksum mismatch of task {}'.format(task_id)) continue if self.use_redis: self.cache.publish(task_id, res) self.cache.set(task_id, res) self.cache.expire(task_id, self.expire_time) else: self.cache[task_id] = res # send signal for polling to search for result ... except Exception as e: logger.exception(e) def poll_result_from_redis_cache(self, task_id, timeout=10): """ poll_result_from_cache after receiving a signal from waiting Args: task_id: timeout: Returns: """ loop_times = int(timeout / self.max_polling_timeout) task_id = task_id.encode() if self.use_redis: self.cache_channel.subscribe(task_id) for _ in range(loop_times): # if no completion, get message from subscribed channel message = self.cache_channel.get_message( timeout=self.max_polling_timeout) # else get response from redis db cache if message is None: res = self.cache.get(task_id) # if still no response yet, continue polling if res is None: continue break if isinstance(message, dict): if isinstance(message['data'], int): continue res = message['data'] break else: for _ in range(loop_times): try: res = self.cache[task_id] break except: time.sleep(self.max_polling_timeout) try: if self.encrypt: res = self.encrypt.decrypt(res) res = self.parse_response(res) except NameError: raise TimeoutError return res def __getattr__(self, method_name): return lambda *args, **kwargs: self.call(method_name, *args, **kwargs) def close(self): self.is_closed = True if self.use_redis: self.cache_channel.close() self.cache.close() self.consumer.close() self.producer.flush() self.pool.shutdown()
class KafkaConsumer: def __init__(self, broker_manager, msg_monitor, consumer_id, test_number): self.consumer = None self.broker_manager = broker_manager self.msg_monitor = msg_monitor self.consumer_id = consumer_id self.actor = f"CONSUMER(Test:{test_number} Id:C{consumer_id})" self.terminate = False self.topic = None self.on_assignment_ctr = 0 def get_partitions(self, partitions): ps = list() for p in partitions: ps.append(str(p.partition)) if len(ps) == 0: return "none" else: return ",".join(ps) def on_assignment(self, con, partitions): console_out(f"Assigned partitions: {self.get_partitions(partitions)}", self.actor) if self.on_assignment_ctr == 0: self.on_assignment_ctr += 1 for part in partitions: part.offset = 0 self.consumer.assign(partitions) def on_revoke(self, con, partitions): console_out( f"Unassigned partitions: {self.get_partitions(partitions)}", self.actor) self.consumer.unassign() def create_consumer(self, group_id, topic): self.terminate = False console_out( f"Creating a consumer with bootstrap.servers: {self.broker_manager.get_bootstrap_servers()}", self.actor) self.consumer = Consumer({ 'bootstrap.servers': self.broker_manager.get_bootstrap_servers(), 'api.version.request': True, 'enable.auto.commit': True, 'group.id': group_id, 'auto.offset.reset': 'earliest', 'default.topic.config': { 'auto.offset.reset': 'smallest' } }) self.topic = topic def subscribe(self): subscribed = False while not subscribed: try: console_out(f"Starting subscription to {self.topic}", self.actor) self.consumer.subscribe([self.topic], on_assign=self.on_assignment, on_revoke=self.on_revoke) console_out(f"Subscribed to {self.topic}", self.actor) subscribed = True except KafkaError as e: console_out(f"Failed to subscribe: {e}", self.actor) time.sleep(5) def start_consuming(self): self.subscribe() try: msg_ctr = 0 while not self.terminate: msg = self.consumer.poll(2.0) if msg is None: continue if msg.error(): if msg.error().code() == KafkaError._PARTITION_EOF: continue else: console_out(msg.error(), self.actor) break self.msg_monitor.append(msg.value(), self.consumer_id, self.actor) console_out("Consumption terminated", self.actor) self.consumer.close() except Exception as e: console_out("Consumption terminated due to error", self.actor) template = "An exception of type {0} occurred. Arguments:{1!r}" message = template.format(type(e).__name__, e.args) console_out(message, self.actor) def stop_consuming(self): self.terminate = True
from confluent_kafka import Consumer

c = Consumer({
    'bootstrap.servers': 'mybroker',
    'group.id': 'mygroup',
    'auto.offset.reset': 'earliest'
})

c.subscribe(['mytopic'])

while True:
    msg = c.poll(1.0)

    if msg is None:
        continue
    if msg.error():
        print("Consumer error: {}".format(msg.error()))
        continue

    print('Received message: {}'.format(msg.value().decode('utf-8')))

c.close()
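# In the snippet above c.close() is only reached if the loop is exited; a common variant
# (a sketch, not part of the original example) catches KeyboardInterrupt so the consumer
# always leaves the group cleanly and commits its final offsets:
try:
    while True:
        msg = c.poll(1.0)
        if msg is None:
            continue
        if msg.error():
            print("Consumer error: {}".format(msg.error()))
            continue
        print('Received message: {}'.format(msg.value().decode('utf-8')))
except KeyboardInterrupt:
    pass
finally:
    c.close()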
def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest # # # TODO: Configure the broker properties below. Make sure to reference the project README # and use the Host URL for Kafka and Schema Registry! # # #self.broker_properties = { # # # # TODO # # # "kafka" : "PLAINTEXT://localhost:9092", # "schema_registry" : "http://localhost:8081" #} self.broker_properties = { "bootstrap.servers": "localhost:9092", #"bootstrap.servers": "PLAINTEXT://localhost:9092", "group.id": "udacity", "auto.offset.reset": "earliest" if offset_earliest else "latest" } # TODO: Create the Consumer, using the appropriate type. if is_avro is True: self.broker_properties[ "schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) #self.consumer = AvroConsumer( # { # "bootstrap.servers": self.broker_properties["kafka"], # "schema.registry.url": self.broker_properties["schema_registry"], # "group.id": "0", # "auto.offset.reset": "earliest" # self.broker_properties["schema.registry.url"] = "http://localhost:8081" # # } #) logger.info("__init__ - AvroConsumer was created") else: self.consumer = Consumer(self.broker_properties) #self.consumer = Consumer( # { # "bootstrap.servers": self.broker_properties["kafka"], # "group.id": "0", # "auto.offset.reset": "earliest" # } #) #pass logger.info("__init__ - Consumer was created") # # # TODO: Configure the AvroConsumer and subscribe to the topics. Make sure to think about # how the `on_assign` callback should be invoked. # # logger.info("Consumer will subscribe - %s", self.topic_name_pattern) self.consumer.subscribe([self.topic_name_pattern], on_assign=self.on_assign)
def test_basic_api():
    """ Basic API tests, these won't really do anything since there is no
        broker configured. """

    try:
        kc = Consumer()
    except TypeError as e:
        assert str(e) == "expected configuration dict"

    def dummy_commit_cb(err, partitions):
        pass

    kc = Consumer({'group.id': 'test',
                   'socket.timeout.ms': '100',
                   'session.timeout.ms': 1000,  # Avoid close() blocking too long
                   'on_commit': dummy_commit_cb})

    kc.subscribe(["test"])
    kc.unsubscribe()

    def dummy_assign_revoke(consumer, partitions):
        pass

    kc.subscribe(["test"], on_assign=dummy_assign_revoke, on_revoke=dummy_assign_revoke)
    kc.unsubscribe()

    msg = kc.poll(timeout=0.001)
    if msg is None:
        print('OK: poll() timeout')
    elif msg.error():
        print('OK: consumer error: %s' % msg.error().str())
    else:
        print('OK: consumed message')

    partitions = list(map(lambda p: TopicPartition("test", p), range(0, 100, 3)))
    kc.assign(partitions)
    kc.unassign()

    kc.commit(asynchronous=True)

    try:
        kc.commit(asynchronous=False)
    except KafkaException as e:
        assert e.args[0].code() in (KafkaError._TIMED_OUT, KafkaError._NO_OFFSET)

    # Get current position, should all be invalid.
    kc.position(partitions)
    assert len([p for p in partitions if p.offset == -1001]) == len(partitions)

    try:
        offsets = kc.committed(partitions, timeout=0.001)
    except KafkaException as e:
        assert e.args[0].code() == KafkaError._TIMED_OUT

    kc.close()
def run(self) -> None: def fail_fast(err: Any, _msg: Any) -> None: if err is not None: print("Kafka producer delivery error: {}".format(err)) print("Bailing out...") # TODO: should it be sys.exit(-1)? raise KafkaException(err) def on_commit(err: Any, partitions: List[Any]) -> None: if err is not None: print("Kafka consumer commit error: {}".format(err)) print("Bailing out...") # TODO: should it be sys.exit(-1)? raise KafkaException(err) for p in partitions: # check for partition-specific commit errors print(p) if p.error: print("Kafka consumer commit error: {}".format(p.error)) print("Bailing out...") # TODO: should it be sys.exit(-1)? raise KafkaException(p.error) print("Kafka consumer commit successful") pass def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None: for p in partitions: if p.error: raise KafkaException(p.error) print("Kafka partitions rebalanced: {} / {}".format( consumer, partitions)) consumer_conf = self.kafka_config.copy() consumer_conf.update({ "group.id": self.consumer_group, "on_commit": fail_fast, # messages don't have offset marked as stored until pushed to # elastic, but we do auto-commit stored offsets to broker "enable.auto.commit": True, "enable.auto.offset.store": False, # user code timeout; if no poll after this long, assume user code # hung and rebalance (default: 5min) "max.poll.interval.ms": 180000, "default.topic.config": { "auto.offset.reset": "latest", }, }) consumer = Consumer(consumer_conf) producer_conf = self.kafka_config.copy() producer_conf.update({ "delivery.report.only.error": True, "default.topic.config": { "request.required.acks": -1, # all brokers must confirm }, }) producer = Producer(producer_conf) consumer.subscribe( [self.consume_topic], on_assign=on_rebalance, on_revoke=on_rebalance, ) print("Kafka consuming {}".format(self.consume_topic)) while True: msg = consumer.poll(self.poll_interval) if not msg: print("nothing new from kafka (poll_interval: {} sec)".format( self.poll_interval)) continue if msg.error(): raise KafkaException(msg.error()) cle = json.loads(msg.value().decode("utf-8")) # print(cle) print("processing changelog index {}".format(cle["index"])) release_ids = [] new_release_ids = [] file_ids = [] fileset_ids = [] webcapture_ids = [] container_ids = [] work_ids = [] release_edits = cle["editgroup"]["edits"]["releases"] for re in release_edits: release_ids.append(re["ident"]) # filter to direct release edits which are not updates if not re.get("prev_revision") and not re.get( "redirect_ident"): new_release_ids.append(re["ident"]) file_edits = cle["editgroup"]["edits"]["files"] for e in file_edits: file_ids.append(e["ident"]) fileset_edits = cle["editgroup"]["edits"]["filesets"] for e in fileset_edits: fileset_ids.append(e["ident"]) webcapture_edits = cle["editgroup"]["edits"]["webcaptures"] for e in webcapture_edits: webcapture_ids.append(e["ident"]) container_edits = cle["editgroup"]["edits"]["containers"] for e in container_edits: container_ids.append(e["ident"]) work_edits = cle["editgroup"]["edits"]["works"] for e in work_edits: work_ids.append(e["ident"]) # TODO: do these fetches in parallel using a thread pool? 
for ident in set(file_ids): file_entity = self.api.get_file(ident, expand=None) # update release when a file changes # TODO: also fetch old version of file and update any *removed* # release idents (and same for filesets, webcapture updates) release_ids.extend(file_entity.release_ids or []) file_dict = self.api.api_client.sanitize_for_serialization( file_entity) producer.produce( self.file_topic, json.dumps(file_dict).encode("utf-8"), key=ident.encode("utf-8"), on_delivery=fail_fast, ) # TODO: topic for fileset updates for ident in set(fileset_ids): fileset_entity = self.api.get_fileset(ident, expand=None) # update release when a fileset changes release_ids.extend(fileset_entity.release_ids or []) # TODO: topic for webcapture updates for ident in set(webcapture_ids): webcapture_entity = self.api.get_webcapture(ident, expand=None) # update release when a webcapture changes release_ids.extend(webcapture_entity.release_ids or []) for ident in set(container_ids): container = self.api.get_container(ident) container_dict = self.api.api_client.sanitize_for_serialization( container) producer.produce( self.container_topic, json.dumps(container_dict).encode("utf-8"), key=ident.encode("utf-8"), on_delivery=fail_fast, ) for ident in set(release_ids): release = self.api.get_release( ident, expand="files,filesets,webcaptures,container,creators") if release.work_id: work_ids.append(release.work_id) release_dict = self.api.api_client.sanitize_for_serialization( release) producer.produce( self.release_topic, json.dumps(release_dict).encode("utf-8"), key=ident.encode("utf-8"), on_delivery=fail_fast, ) # for ingest requests, filter to "new" active releases with no matched files if release.ident in new_release_ids: ir = release_ingest_request( release, ingest_request_source="fatcat-changelog") if ir and not release.files and self.want_live_ingest( release, ir): producer.produce( self.ingest_file_request_topic, json.dumps(ir).encode("utf-8"), # key=None, on_delivery=fail_fast, ) # send work updates (just ident and changelog metadata) to scholar for re-indexing for ident in set(work_ids): assert ident key = f"work_{ident}" work_ident_dict = dict( key=key, type="fatcat_work", work_ident=ident, updated=cle["timestamp"], fatcat_changelog_index=cle["index"], ) producer.produce( self.work_ident_topic, json.dumps(work_ident_dict).encode("utf-8"), key=key.encode("utf-8"), on_delivery=fail_fast, ) producer.flush() # TODO: publish updated 'work' entities to a topic consumer.store_offsets(message=msg)
from confluent_kafka import Consumer, KafkaError, TopicPartition

c = Consumer({
    'bootstrap.servers': '10.211.55.3:29092',
    'group.id': 'mygroup2',
    'default.topic.config': {
        'auto.offset.reset': 'smallest'  # largest
    }
})

# tp = TopicPartition("mytopic", 2, 0)
# c.assign([tp])
# c.seek(tp)
c.subscribe(['mytopic'])

while True:
    msg = c.poll(1.0)
    if msg is None:
        continue
    if msg.error():
        if msg.error().code() == KafkaError._PARTITION_EOF:
            continue
        else:
            print(msg.error())
            break
    print('Received message: {} {} {}'.format(msg.value().decode('utf-8'),
                                              msg.topic(), msg.partition()))
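# The loop above never calls close(), so the consumer only leaves its group when
# the process dies. A minimal sketch of a cleaner variant is below (not part of
# the original snippet); it assumes the same Consumer construction as above and
# stops on Ctrl-C.
from confluent_kafka import KafkaError

def consume_until_interrupted(consumer):
    try:
        while True:
            msg = consumer.poll(1.0)
            if msg is None:
                continue
            if msg.error():
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    continue
                print(msg.error())
                break
            print('Received message: {}'.format(msg.value().decode('utf-8')))
    except KeyboardInterrupt:
        pass
    finally:
        # close() commits final offsets (when auto-commit is enabled) and
        # leaves the consumer group cleanly.
        consumer.close()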
def run(self) -> None: logger.debug("Starting snuba query subscriber") self.offsets.clear() def on_assign(consumer: Consumer, partitions: List[TopicPartition]) -> None: updated_partitions: List[TopicPartition] = [] for partition in partitions: if self.resolve_partition_force_offset: partition = self.resolve_partition_force_offset(partition) updated_partitions.append(partition) if partition.offset == OFFSET_INVALID: updated_offset = None else: updated_offset = partition.offset self.offsets[partition.partition] = updated_offset if updated_partitions: self.consumer.assign(updated_partitions) logger.info( "query-subscription-consumer.on_assign", extra={ "offsets": str(self.offsets), "partitions": str(partitions), }, ) def on_revoke(consumer: Consumer, partitions: List[TopicPartition]) -> None: partition_numbers = [ partition.partition for partition in partitions ] self.commit_offsets(partition_numbers) for partition_number in partition_numbers: self.offsets.pop(partition_number, None) logger.info( "query-subscription-consumer.on_revoke", extra={ "offsets": str(self.offsets), "partitions": str(partitions), }, ) self.consumer = Consumer(self.cluster_options) self.__shutdown_requested = False if settings.KAFKA_CONSUMER_AUTO_CREATE_TOPICS: # This is required for confluent-kafka>=1.5.0, otherwise the topics will # not be automatically created. admin_client = AdminClient(self.admin_cluster_options) wait_for_topics(admin_client, [self.topic]) self.consumer.subscribe([self.topic], on_assign=on_assign, on_revoke=on_revoke) i = 0 while not self.__shutdown_requested: message = self.consumer.poll(0.1) if message is None: continue error = message.error() if error is not None: raise KafkaException(error) i = i + 1 with sentry_sdk.start_transaction( op="handle_message", name="query_subscription_consumer_process_message", sampled=random() <= options.get("subscriptions-query.sample-rate"), ), metrics.timer("snuba_query_subscriber.handle_message"): self.handle_message(message) # Track latest completed message here, for use in `shutdown` handler. self.offsets[message.partition()] = message.offset() + 1 if i % self.commit_batch_size == 0: logger.debug("Committing offsets") self.commit_offsets() logger.debug("Committing offsets and closing consumer") self.commit_offsets() self.consumer.close()
def setup(self):
    self.consumer = Consumer(**self.get_consumer_settings())
    self.serializer = self.get_message_serializer()
    self.set_topic()
def _create_consumer(self, config) -> Consumer:
    return Consumer(config)
class VerifiableConsumer(VerifiableClient): """ confluent-kafka-python backed VerifiableConsumer class for use with Kafka's kafkatests client tests. """ def __init__(self, conf): """ conf is a config dict passed to confluent_kafka.Consumer() """ super(VerifiableConsumer, self).__init__(conf) self.conf['on_commit'] = self.on_commit self.consumer = Consumer(**conf) self.consumed_msgs = 0 self.consumed_msgs_last_reported = 0 self.consumed_msgs_at_last_commit = 0 self.use_auto_commit = False self.use_async_commit = False self.max_msgs = -1 self.assignment = [] self.assignment_dict = dict() def find_assignment(self, topic, partition): """ Find and return existing assignment based on topic and partition, or None on miss. """ skey = '%s %d' % (topic, partition) return self.assignment_dict.get(skey) def send_records_consumed(self, immediate=False): """ Send records_consumed, every 100 messages, on timeout, or if immediate is set. """ if self.consumed_msgs <= self.consumed_msgs_last_reported + (0 if immediate else 100): return if len(self.assignment) == 0: return d = {'name': 'records_consumed', 'count': self.consumed_msgs - self.consumed_msgs_last_reported, 'partitions': []} for a in self.assignment: if a.min_offset == -1: # Skip partitions that havent had any messages since last time. # This is to circumvent some minOffset checks in kafkatest. continue d['partitions'].append(a.to_dict()) a.min_offset = -1 self.send(d) self.consumed_msgs_last_reported = self.consumed_msgs def send_assignment(self, evtype, partitions): """ Send assignment update, evtype is either 'assigned' or 'revoked' """ d = {'name': 'partitions_' + evtype, 'partitions': [{'topic': x.topic, 'partition': x.partition} for x in partitions]} self.send(d) def on_assign(self, consumer, partitions): """ Rebalance on_assign callback """ old_assignment = self.assignment self.assignment = [AssignedPartition(p.topic, p.partition) for p in partitions] # Move over our last seen offsets so that we can report a proper # minOffset even after a rebalance loop. for a in old_assignment: b = self.find_assignment(a.topic, a.partition) b.min_offset = a.min_offset self.assignment_dict = {a.skey: a for a in self.assignment} self.send_assignment('assigned', partitions) def on_revoke(self, consumer, partitions): """ Rebalance on_revoke callback """ # Send final consumed records prior to rebalancing to make sure # latest consumed is in par with what is going to be committed. self.send_records_consumed(immediate=True) self.do_commit(immediate=True, asynchronous=False) self.assignment = list() self.assignment_dict = dict() self.send_assignment('revoked', partitions) def on_commit(self, err, partitions): """ Offsets Committed callback """ if err is not None and err.code() == KafkaError._NO_OFFSET: self.dbg('on_commit(): no offsets to commit') return # Report consumed messages to make sure consumed position >= committed position self.send_records_consumed(immediate=True) d = {'name': 'offsets_committed', 'offsets': []} if err is not None: d['success'] = False d['error'] = str(err) else: d['success'] = True d['error'] = '' for p in partitions: pd = {'topic': p.topic, 'partition': p.partition, 'offset': p.offset} if p.error is not None: pd['error'] = str(p.error) d['offsets'].append(pd) if len(self.assignment) == 0: self.dbg('Not sending offsets_committed: No current assignment: would be: %s' % d) return self.send(d) def do_commit(self, immediate=False, asynchronous=None): """ Commit every 1000 messages or whenever there is a consume timeout or immediate. 
""" if (self.use_auto_commit or self.consumed_msgs_at_last_commit + (0 if immediate else 1000) > self.consumed_msgs): return # Make sure we report consumption before commit, # otherwise tests may fail because of commit > consumed if self.consumed_msgs_at_last_commit < self.consumed_msgs: self.send_records_consumed(immediate=True) if asynchronous is None: async_mode = self.use_async_commit else: async_mode = asynchronous self.dbg('Committing %d messages (Async=%s)' % (self.consumed_msgs - self.consumed_msgs_at_last_commit, async_mode)) retries = 3 while True: try: self.dbg('Commit') offsets = self.consumer.commit(asynchronous=async_mode) self.dbg('Commit done: offsets %s' % offsets) if not async_mode: self.on_commit(None, offsets) break except KafkaException as e: if e.args[0].code() == KafkaError._NO_OFFSET: self.dbg('No offsets to commit') break elif e.args[0].code() in (KafkaError.REQUEST_TIMED_OUT, KafkaError.NOT_COORDINATOR_FOR_GROUP, KafkaError._WAIT_COORD): self.dbg('Commit failed: %s (%d retries)' % (str(e), retries)) if retries <= 0: raise retries -= 1 time.sleep(1) continue else: raise self.consumed_msgs_at_last_commit = self.consumed_msgs def msg_consume(self, msg): """ Handle consumed message (or error event) """ if msg.error(): self.err('Consume failed: %s' % msg.error(), term=False) return if False: self.dbg('Read msg from %s [%d] @ %d' % (msg.topic(), msg.partition(), msg.offset())) if self.max_msgs >= 0 and self.consumed_msgs >= self.max_msgs: return # ignore extra messages # Find assignment. a = self.find_assignment(msg.topic(), msg.partition()) if a is None: self.err('Received message on unassigned partition %s [%d] @ %d' % (msg.topic(), msg.partition(), msg.offset()), term=True) a.consumed_msgs += 1 if a.min_offset == -1: a.min_offset = msg.offset() if a.max_offset < msg.offset(): a.max_offset = msg.offset() self.consumed_msgs += 1 self.consumer.store_offsets(message=msg) self.send_records_consumed(immediate=False) self.do_commit(immediate=False)
from confluent_kafka import Consumer, KafkaError, KafkaException
import sys

conf = {
    "bootstrap.servers": "0.0.0.0:9092,0.0.0.0:9092",
    "group.id": "foo",
    "auto.offset.reset": "smallest",
    "enable.auto.commit": True,
}

consumer = Consumer(conf)

try:
    consumer.subscribe(["test2"])
    while True:
        msg = consumer.poll(timeout=1.0)
        if msg is None:
            continue
        if msg.error():
            if msg.error().code() == KafkaError._PARTITION_EOF:
                # End of partition event
                sys.stderr.write("%% %s [%d] reached end at offset %d\n" %
                                 (msg.topic(), msg.partition(), msg.offset()))
            elif msg.error():
                raise KafkaException(msg.error())
        else:
            print(msg.value())
            message = msg.value()
finally:
    # Close the consumer to commit final offsets and leave the group cleanly.
    consumer.close()
def __init__(self, conf, topic_name):
    self.consumer = Consumer(conf)
    self.topic_name = topic_name
    self.running = True
    self._observers = []
if __name__ == '__main__':

    # Initialization
    args = ccloud_lib.parse_args()
    config_file = args.config_file
    topic = args.topic
    conf = ccloud_lib.read_ccloud_config(config_file)

    # Create Consumer instance
    # 'auto.offset.reset=earliest' to start reading from the beginning of the
    # topic if no committed offsets exist
    c = Consumer({
        'bootstrap.servers': conf['bootstrap.servers'],
        'sasl.mechanisms': 'PLAIN',
        'security.protocol': 'SASL_SSL',
        'sasl.username': conf['sasl.username'],
        'sasl.password': conf['sasl.password'],
        'group.id': 'python_example_group_1',
        'auto.offset.reset': 'earliest'
    })

    # Subscribe to topic
    c.subscribe([topic])

    # Process messages
    total_count = 0
    try:
        while True:
            print("Waiting for message or event/error in poll()")
            msg = c.poll(1.0)
            if msg is None:
class EventProcessor(object): __metaclass__ = abc.ABC _DEFAULT_KAFKA_CONSUMER_CONFIG = { 'bootstrap.servers': 'kafka:9092', 'enable.auto.commit': True, 'auto.commit.interval.ms': 10000, 'session.timeout.ms': 30000, 'fetch.max.bytes': 5000012, 'auto.offset.reset': 'latest', } _DEFAULT_KAFKA_PRODUCER_CONFIG = { 'bootstrap.servers': 'kafka:9092', 'acks': 'all', 'retries': 0, 'linger.ms': 20, } def __init__(self, **kwargs): """initialize EventProcessor with Kafka Prodcuer and Consumer""" self.logger = logging.getLogger(__name__) self._input_topics = kwargs.get('input_topics') self._output_topics = kwargs.get('output_topics') self._invocation_id = kwargs.get('invocation_id') self._bootstrap_servers = kwargs.get('bootstrap_servers') self.static_properties = kwargs.get('static_properties') self._running = False self._threads = {} if self._bootstrap_servers is not None: self._DEFAULT_KAFKA_CONSUMER_CONFIG[ 'bootstrap.servers'] = self._bootstrap_servers self._DEFAULT_KAFKA_PRODUCER_CONFIG[ 'bootstrap.servers'] = self._bootstrap_servers self._DEFAULT_KAFKA_CONSUMER_CONFIG[ 'group.id'] = 'streampipes_python_' + self._invocation_id self._producer = Producer(self._DEFAULT_KAFKA_PRODUCER_CONFIG) self._consumer = Consumer(self._DEFAULT_KAFKA_CONSUMER_CONFIG) #self._create_topic(topic=self._output_topics, conf=self._DEFAULT_KAFKA_PRODUCER_CONFIG) self.on_invocation() def init(self): self.logger.info('start processor {}'.format(self.invocation_id)) thread = threading.Thread(target=self._consume, name=self.invocation_id) thread.start() self._threads['kafka'] = thread def active_threads(self): return self._threads @property def invocation_id(self): return self._invocation_id def __del__(self): pass @abc.abstractmethod def on_invocation(self): """ on_invocation is called when processor is started """ @abc.abstractmethod def on_event(self, event): """ on_event receives kafka consumer messages """ pass @abc.abstractmethod def on_detach(self): """ on_detach is called when processor is stopped """ pass def _on_event(self, event): result = self.on_event(event) if result is not None: self._produce(result) def _consume(self): """ retrieve events from kafka """ self._consumer.subscribe(topics=[self._input_topics]) self._running = True while self._running: # fetch records from kafka and send to msg = self._consumer.poll(timeout=1.0) if msg is None: continue elif msg.error(): if msg.error().str() != "Broker: No more messages": self.logger.error("Consumer error: {}".format(msg.error())) continue else: try: # json -> dict event = json.loads(msg.value().decode('utf-8')) if isinstance(event, int): self.logger.info( "Integer not allowed {}".format(event)) continue except ValueError as e: self.logger.info("Not a valid json {}".format(e)) continue self._on_event(event) def _produce(self, result): """ send events to kafka """ event = json.dumps(result).encode('utf-8') try: # dict -> json self._producer.produce(self._output_topics, value=event) except BufferError: self._producer.poll(1) # def _create_topic(self, topic=None, conf=None): # """ Create the topic if it doesn't exist """ # admin = AdminClient(conf) # fs = admin.create_topics([NewTopic(topic, num_partitions=1, replication_factor=1)]) # f = fs[topic] # try: # f.result() # except KafkaException as ex: # if ex.args[0].code() == KafkaError.TOPIC_ALREADY_EXISTS: # self.logger.warning("Topic {} already exists: continue".format(topic)) # else: # raise def stop(self): self.logger.info('stop processor {}'.format(self.invocation_id)) self._running = False self._consumer.close() 
self._producer.flush() self.on_detach()
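# EventProcessor above is abstract: subclasses supply on_invocation(), on_event(),
# and on_detach(), and whatever on_event() returns (if not None) is serialized and
# produced to the output topic. A minimal concrete subclass might look like this
# sketch; the class name, the uppercase transformation, and the constructor kwargs
# shown in the comment are illustrative assumptions.
class UppercaseProcessor(EventProcessor):
    # e.g. UppercaseProcessor(input_topics='in-topic', output_topics='out-topic',
    #                         invocation_id='example-id', bootstrap_servers='kafka:9092')

    def on_invocation(self):
        # Called once when the processor is started.
        self.logger.info('UppercaseProcessor attached')

    def on_event(self, event):
        # `event` is the decoded JSON dict consumed from Kafka.
        return {k: v.upper() if isinstance(v, str) else v for k, v in event.items()}

    def on_detach(self):
        # Called when the processor is stopped.
        self.logger.info('UppercaseProcessor detached')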
from confluent_kafka import Consumer, KafkaError
import sys
import uuid

topic = sys.argv[1]

c = Consumer({
    # 'bootstrap.servers': '172.17.0.3:9092,172.17.0.4:9093,172.17.0.5:9094',
    'bootstrap.servers': '172.17.0.4:9093,172.17.0.5:9094',
    'api.version.request': True,
    'enable.auto.commit': True,
    'group.id': str(uuid.uuid1()),
    'auto.offset.reset': 'earliest'
    # 'default.topic.config': {
    #     'auto.offset.reset': 'smallest'
    # }
})

def print_assignment(consumer, partitions):
    for p in partitions:
        p.offset = 0
    print('assign', partitions)
    consumer.assign(partitions)

# Subscribe to topics
c.subscribe([topic], on_assign=print_assignment)

while True:
class SyncReport: def __init__(self, group, token, optimalq_connector, pool_uid, call_reports_topic): self._consumer = Consumer({ "bootstrap.servers": "", "security.protocol": "SASL_SSL", "sasl.mechanisms": "PLAIN", "sasl.username": "", "sasl.password": "", 'group.id': group, 'enable.auto.commit': False, 'auto.offset.reset': 'earliest' }) self._consumer.subscribe([call_reports_topic]) self._headers = { "X-Auth-Token": "{}".format(token), "Content-Type": "application/json" } self._optimalq_connector = optimalq_connector self._pool_uid = pool_uid self._optimalq_url = '' def start(self): """ Get messages from call_report_topic. Send the call reports to post_call_report """ while True: msg = self._consumer.poll(0.1) if msg is None: continue elif not msg.error(): #Received message self.post_call_report(msg.value()) elif msg.error().code() == KafkaError._PARTITION_EOF: logging.info('End of partition reached {}/{}'.format( msg.topic(), msg.partition())) else: logging.error('Error occurred: {}'.format(msg.error().str())) def post_call_report(self, call_report): """ Post call report json to OptimalQ API by pool uid :param call_report: :return: """ url = '{}/v1/pools/{}/call_reports'.format(self._optimalq_url, self._pool_uid) success_post_call_report = requests.post(url=url, data=call_report, headers=self._headers) code = success_post_call_report.status_code counter = 5 while (counter > 0) and ((code < 200) or (code > 299)): counter -= 1 token = self._optimalq_connector.get_token() if token is not None: self._headers = { "X-Auth-Token": "{}".format(token), "Content-Type": "application/json" } success_post_call_report = requests.post(url=url, data=call_report, headers=self._headers) code = success_post_call_report.status_code if (code > 199) and (code < 300): self._consumer.commit() logging.info('Sent call report for pool: {}'.format( self._pool_uid)) return logging.error( 'Connection to OptimalQ failed while trying to send call report {}. code: {}, error: {}' .format(self._pool_uid, code, success_post_call_report.content)) def terminate(self): self._consumer.close()
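# A sketch of how SyncReport above might be wired up; the group name, pool uid,
# and topic are placeholders, and OptimalQConnector is a hypothetical stand-in
# for whatever object provides get_token() in the original code base.
if __name__ == '__main__':
    connector = OptimalQConnector()          # hypothetical, provides get_token()
    reporter = SyncReport(
        group='call-report-sync',            # assumed consumer group name
        token=connector.get_token(),
        optimalq_connector=connector,
        pool_uid='example-pool-uid',         # assumed pool identifier
        call_reports_topic='call_reports',   # assumed topic name
    )
    try:
        reporter.start()                     # blocks, polling and posting reports
    finally:
        reporter.terminate()                 # closes the underlying consumer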
dest="topic", help="Topic to listen", ) parser.add_option( "-s", "--servers", #default=str(DEFAULT_SERVER), dest="servers", help="Kafka servers", ) (options, _) = parser.parse_args() c = Consumer({ 'bootstrap.servers': options.servers, 'group.id': 'mygroup', 'auto.offset.reset': 'earliest' }) c.subscribe([options.topic]) #c.subscribe(['Cisco-IOS-XR-qos-ma-oper.qos.nodes.node.policy-map.interface-table.interface.member-interfaces.member-interface.output.service-policy-names.service-policy-instance.statistics']) print("waiting for packets") while True: msg = c.poll(1.0) if msg is None: continue if msg.error(): print("Consumer error: {}".format(msg.error())) continue
print("Program Started") from confluent_kafka import Consumer from configFile import * from FinalProducer import KafkaProducer from URLReader import UrlReader from Mongodb import MongoDb producer = KafkaProducer() url_reader = UrlReader() mongodb = MongoDb() c = Consumer( { 'bootstrap.servers': BOOTSTRAP_SERVERS, 'group.id': GROUP_CONTENT_CONSUMER } ) #, 'max.partition.fetch.bytes': 200000000, 'receive.message.max.bytes': 1000000000}); c.subscribe([UNPROCESSED_URL_TOPIC]) running = True while running: print("Waiting for unprocessed url to be fetched") data = c.poll() if not data.error(): url = data.value() print("Fetched url from Kafka - " + str(url)) print("Checking if this url is already processed...") if mongodb.is_url_processed(url): print("Url already processed... Skipping it") else: print("Url not processed.... adding to the topic")
def test_consumer_rebalance_from_committed_offset(requires_kafka): consumer_group = "consumer-{}".format(uuid.uuid1().hex) synchronize_commit_group = "consumer-{}".format(uuid.uuid1().hex) messages_delivered = defaultdict(list) def record_message_delivered(error, message): assert error is None messages_delivered[message.topic()].append(message) producer = Producer({ "bootstrap.servers": os.environ["SENTRY_KAFKA_HOSTS"], "on_delivery": record_message_delivered, }) with create_topic( partitions=2) as topic, create_topic() as commit_log_topic: # Produce some messages into the topic. for i in range(4): producer.produce(topic, "{}".format(i).encode("utf8"), partition=i % 2) assert producer.flush( 5) == 0, "producer did not successfully flush queue" Consumer({ "bootstrap.servers": os.environ["SENTRY_KAFKA_HOSTS"], "group.id": consumer_group }).commit( offsets=[ TopicPartition(message.topic(), message.partition(), message.offset() + 1) for message in messages_delivered[topic][:2] ], asynchronous=False, ) consumer_a = SynchronizedConsumer( bootstrap_servers=os.environ["SENTRY_KAFKA_HOSTS"], consumer_group=consumer_group, commit_log_topic=commit_log_topic, synchronize_commit_group=synchronize_commit_group, initial_offset_reset="earliest", ) assignments_received = defaultdict(list) def on_assign(consumer, assignment): assignments_received[consumer].append(assignment) consumer_a.subscribe([topic], on_assign=on_assign) # Wait until the first consumer has received its assignments. for i in xrange(10): # this takes a while assert consumer_a.poll(1) is None if assignments_received[consumer_a]: break assert (len(assignments_received[consumer_a]) == 1 ), "expected to receive partition assignment" assert set( (i.topic, i.partition) for i in assignments_received[consumer_a][0]) == set([(topic, 0), (topic, 1)]) assignments_received[consumer_a].pop() consumer_b = SynchronizedConsumer( bootstrap_servers=os.environ["SENTRY_KAFKA_HOSTS"], consumer_group=consumer_group, commit_log_topic=commit_log_topic, synchronize_commit_group=synchronize_commit_group, initial_offset_reset="earliest", ) consumer_b.subscribe([topic], on_assign=on_assign) assignments = {} # Wait until *both* consumers have received updated assignments. for consumer in [consumer_a, consumer_b]: for i in xrange(10): # this takes a while assert consumer.poll(1) is None if assignments_received[consumer]: break assert (len(assignments_received[consumer]) == 1 ), "expected to receive partition assignment" assert (len(assignments_received[consumer][0]) == 1 ), "expected to have a single partition assignment" i = assignments_received[consumer][0][0] assignments[(i.topic, i.partition)] = consumer assert set(assignments.keys()) == set([(topic, 0), (topic, 1)]) for expected_message in messages_delivered[topic][2:]: consumer = assignments[(expected_message.topic(), expected_message.partition())] # Make sure that there are no messages ready to consume. assert consumer.poll(1) is None # Move the committed offset forward for our synchronizing group. producer.produce( commit_log_topic, key="{}:{}:{}".format(expected_message.topic(), expected_message.partition(), synchronize_commit_group).encode("utf8"), value="{}".format(expected_message.offset() + 1).encode("utf8"), ) assert producer.flush( 5) == 0, "producer did not successfully flush queue" # We should have received a single message. # TODO: Can we also assert that the position is unpaused?) 
for i in xrange(5): received_message = consumer.poll(1) if received_message is not None: break assert received_message is not None, "no message received" assert received_message.topic() == expected_message.topic() assert received_message.partition() == expected_message.partition() assert received_message.offset() == expected_message.offset() # We should not be able to continue reading into the topic. # TODO: Can we assert that the position is paused? assert consumer.poll(1) is None
def test_consumer_start_from_committed_offset(requires_kafka): consumer_group = "consumer-{}".format(uuid.uuid1().hex) synchronize_commit_group = "consumer-{}".format(uuid.uuid1().hex) messages_delivered = defaultdict(list) def record_message_delivered(error, message): assert error is None messages_delivered[message.topic()].append(message) producer = Producer({ "bootstrap.servers": os.environ["SENTRY_KAFKA_HOSTS"], "on_delivery": record_message_delivered, }) with create_topic() as topic, create_topic() as commit_log_topic: # Produce some messages into the topic. for i in range(3): producer.produce(topic, "{}".format(i).encode("utf8")) assert producer.flush( 5) == 0, "producer did not successfully flush queue" Consumer({ "bootstrap.servers": os.environ["SENTRY_KAFKA_HOSTS"], "group.id": consumer_group }).commit(message=messages_delivered[topic][0], asynchronous=False) # Create the synchronized consumer. consumer = SynchronizedConsumer( bootstrap_servers=os.environ["SENTRY_KAFKA_HOSTS"], consumer_group=consumer_group, commit_log_topic=commit_log_topic, synchronize_commit_group=synchronize_commit_group, initial_offset_reset="earliest", ) assignments_received = [] def on_assign(c, assignment): assert c is consumer assignments_received.append(assignment) consumer.subscribe([topic], on_assign=on_assign) # Wait until we have received our assignments. for i in xrange(10): # this takes a while assert consumer.poll(1) is None if assignments_received: break assert len(assignments_received ) == 1, "expected to receive partition assignment" assert set((i.topic, i.partition) for i in assignments_received[0]) == set([(topic, 0)]) # TODO: Make sure that all partitions are paused on assignment. # Move the committed offset forward for our synchronizing group. message = messages_delivered[topic][0] producer.produce( commit_log_topic, key="{}:{}:{}".format(message.topic(), message.partition(), synchronize_commit_group).encode("utf8"), value="{}".format(message.offset() + 1).encode("utf8"), ) # Make sure that there are no messages ready to consume. assert consumer.poll(1) is None # Move the committed offset forward for our synchronizing group. message = messages_delivered[topic][0 + 1] # second message producer.produce( commit_log_topic, key="{}:{}:{}".format(message.topic(), message.partition(), synchronize_commit_group).encode("utf8"), value="{}".format(message.offset() + 1).encode("utf8"), ) assert producer.flush( 5) == 0, "producer did not successfully flush queue" # We should have received a single message. # TODO: Can we also assert that the position is unpaused?) for i in xrange(5): message = consumer.poll(1) if message is not None: break assert message is not None, "no message received" expected_message = messages_delivered[topic][0 + 1] # second message assert message.topic() == expected_message.topic() assert message.partition() == expected_message.partition() assert message.offset() == expected_message.offset() # We should not be able to continue reading into the topic. # TODO: Can we assert that the position is paused? assert consumer.poll(1) is None
from confluent_kafka import Consumer, KafkaError
import downloader

settings = {
    'bootstrap.servers': 'localhost:9092',
    'group.id': 'mygroup',
    'client.id': 'client-1',
    'enable.auto.commit': True,
    'session.timeout.ms': 6000,
    'default.topic.config': {
        'auto.offset.reset': 'smallest'
    }
}

c = Consumer(settings)
c.subscribe(['downloadvideo'])

try:
    while True:
        msg = c.poll(0.1)
        if msg is None:
            continue
        elif not msg.error():
            url = str(msg.value())
            url_formated = url.replace('b\'http', 'http').replace('"', '').replace("'", '').strip()
            print('Received message: {0}'.format(url))
            downloader.run(url_formated)
class KafkaConsumer: """Defines the base kafka consumer class""" def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest # # # TODO: Configure the broker properties below. Make sure to reference the project README # and use the Host URL for Kafka and Schema Registry! # # self.broker_properties = { 'bootstrap.servers': 'PLAINTEXT://localhost:9092', "default.topic.config": { "auto.offset.reset": "earliest" }, "group.id": "0", } # TODO: Create the Consumer, using the appropriate type. if is_avro is True: self.broker_properties[ "schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) else: self.consumer = Consumer(self.broker_properties) # # # TODO: Configure the AvroConsumer and subscribe to the topics. Make sure to think about # how the `on_assign` callback should be invoked. # # self.consumer.subscribe([topic_name_pattern], on_assign=self.on_assign) def on_assign(self, consumer, partitions): """Callback for when topic assignment takes place""" # TODO: If the topic is configured to use `offset_earliest` set the partition offset to # the beginning or earliest for partition in partitions: if self.offset_earliest: partition.offset = confluent_kafka.OFFSET_BEGINNING logger.info("partitions assigned for %s", self.topic_name_pattern) consumer.assign(partitions) async def consume(self): """Asynchronously consumes data from kafka topic""" while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): """Polls for a message. Returns 1 if a message was received, 0 otherwise""" # # # TODO: Poll Kafka for messages. Make sure to handle any errors or exceptions. # Additionally, make sure you return 1 when a message is processed, and 0 when no message # is retrieved. # # try: msg = self.consumer.poll(self.consume_timeout) if msg: err = msg.error() if not err: self.message_handler(msg) rc = 1 else: logger.error(err) rc = 0 else: logger.warn("no messages to consume") rc = 0 except Exception as e: logger.error(e) rc = 0 return rc def close(self): """Cleans up any open kafka consumers""" if self.consumer: self.consumer.close()
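# A sketch of how the KafkaConsumer wrapper above might be used; the topic
# pattern, handler, and the tornado IOLoop wiring are assumptions inferred from
# the class's use of `async def consume` and `gen.sleep`, not code from the
# original project.
from tornado import ioloop

def print_message(message):
    print("{}: {}".format(message.topic(), message.value()))

consumer = KafkaConsumer(
    topic_name_pattern="^com.example.purchases.*",  # assumed topic pattern
    message_handler=print_message,
    is_avro=False,            # plain Consumer instead of AvroConsumer
    offset_earliest=True,     # rewind assigned partitions to OFFSET_BEGINNING
)

try:
    # Runs the infinite consume() coroutine on the current IOLoop.
    ioloop.IOLoop.current().run_sync(consumer.consume)
except KeyboardInterrupt:
    consumer.close()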
class ConfluentKafkaReader(object): def __init__(self, host, port, group, topic, buffer_size, reconnect_wait_time=2): """ Initialize Kafka reader """ logging.info("Initializing Confluent Kafka Consumer") self.host = host self.port = str(port) self.group = group self.topic = [topic] self.buffer_size = buffer_size self.reconnect_wait_time = reconnect_wait_time self.reconnect_retries = 0 self.max_reconnect_retries = 10 # TODO: implement config parameter self.buffer = [] # Initialized on read self.consumer = None def on_assign(self, consumer, partitions): # for p in partitions: # p.offset=-2 # consumer.assign(partitions) logging.debug('on_assignment callback...') logging.info('Assignment:', partitions) def _connect(self): connection = {'bootstrap.servers': self.host+":"+self.port, 'group.id': self.group, 'session.timeout.ms': 6000, 'default.topic.config': {'auto.offset.reset': 'largest'}} logging.info("Connecting to Kafka at %s...", connection) self.consumer = Consumer(**connection) self.consumer.subscribe(self.topic, on_assign=self.on_assign) def read(self): """ Read from Kafka. Reconnect on error. """ try: self._connect() msgcn = 0 while True: msg = self.consumer.poll(timeout=1.0) if msg is None: continue if msg.error(): # Error or event if msg.error().code() == KafkaError._PARTITION_EOF: # End of partition event logging.debug('Catching KafkaError._PARTITION_EOF') logging.error('%s [%d] reached end at offset %d\n', msg.topic(), msg.partition(), msg.offset()) logging.error('%s [%d] at offset %d with key %s:\n', msg.topic(), msg.partition(), msg.offset(), str(msg.key())) break elif msg.error(): # Error # TODO : extend exception handling scope as we will end here # for a lot of reasons ! logging.debug('Catching other errors...') logging.error("Kafka error: %s.", msg.error()) logging.error("Trying to reconnect to %s:%s", self.host, self.port) self.reconnect_retries += 1 time.sleep(self.reconnect_wait_time) if self.reconnect_retries >= self.max_reconnect_retries: logging.error("Max reconnection attempt limit reached (%d). Aborting", self.max_reconnect_retries) break else: self.consumer.close() self._connect() pass #raise KafkaException(msg.error()) else: # Proper message logging.error('%s [%d] at offset %d with key %s:\n', msg.topic(), msg.partition(), msg.offset(), str(msg.key())) (self.buffer).append(msg.value().rstrip('\n')) # otherwise the #writter will add extra \n msgcn += 1 #self.consumer.commit(async=False) if msgcn >= self.buffer_size: logging.debug("Read buffer [%d] reached.",self.buffer_size) break except KeyboardInterrupt: logging.info('Aborted by user\n') # Close down consumer to commit final offsets. self.consumer.close() return(self.buffer)
# or to pick up all messages that the consumer has missed ('earliest').
# Using 'latest' means the consumer must be started before the producer.
read_topic_from = 'latest'

# How often to indicate data rate in seconds
throughput_debug_interval_in_sec = 1

###
### Consumer code
###

kbs_in_mb = 1000

c = Consumer({
    'bootstrap.servers': kafka_servers,
    'group.id': 'mygroup',
    'auto.offset.reset': read_topic_from
})

c.subscribe([topic_name])

kbs_so_far = 0
window_start_time = int(time.time())

while True:
    # Waits 1 second to receive a message; if it doesn't find one, goes round the loop again
    msg = c.poll(1.0)
    if msg is None:
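# The consumer loop above is cut off. Given the bookkeeping it sets up
# (kbs_so_far, window_start_time, throughput_debug_interval_in_sec, kbs_in_mb),
# the data-rate reporting it was building toward plausibly resembles this
# sketch; the helper name and exact arithmetic are assumptions.
import time

def report_throughput(msg_size_bytes, kbs_so_far, window_start_time,
                      interval_sec=1, kbs_in_mb=1000):
    """Accumulate message sizes and print MB/s once per reporting window."""
    kbs_so_far += msg_size_bytes / 1000.0
    now = int(time.time())
    if now - window_start_time >= interval_sec:
        elapsed = now - window_start_time
        print('Throughput: {:.2f} MB/s'.format(kbs_so_far / kbs_in_mb / elapsed))
        kbs_so_far = 0
        window_start_time = now
    return kbs_so_far, window_start_time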
sys.stderr.write("-T option value needs to be larger than zero: %s\n" % opt[1]) sys.exit(1) conf['stats_cb'] = stats_cb conf['statistics.interval.ms'] = int(opt[1]) # Create logger for consumer (logs will be emitted when poll() is called) logger = logging.getLogger('consumer') logger.setLevel(logging.DEBUG) handler = logging.StreamHandler() handler.setFormatter(logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s')) logger.addHandler(handler) # Create Consumer instance # Hint: try debug='fetch' to generate some log messages c = Consumer(conf, logger=logger) def print_assignment(consumer, partitions): print('Assignment:', partitions) # Subscribe to topics c.subscribe(topics, on_assign=print_assignment) # Read messages from Kafka, print to stdout try: while True: msg = c.poll(timeout=1.0) if msg is None: continue if msg.error(): raise KafkaException(msg.error())
class KafkaConsumer: """Defines the base kafka consumer class""" def __init__( self, topic_name_pattern, message_handler, is_avro=True, offset_earliest=False, sleep_secs=1.0, consume_timeout=0.1, ): """Creates a consumer object for asynchronous use""" self.topic_name_pattern = topic_name_pattern self.message_handler = message_handler self.sleep_secs = sleep_secs self.consume_timeout = consume_timeout self.offset_earliest = offset_earliest # # Configure the broker properties below. # Use the Host URL for Kafka and Schema Registry! # # self.broker_properties = { "bootstrap.servers": "PLAINTEXT://localhost:9092", "group.id": "0", } # Create the proper Consumer, AvroConsumer or Consumer. if is_avro is True: self.broker_properties[ "schema.registry.url"] = "http://localhost:8081" self.consumer = AvroConsumer(self.broker_properties) else: self.consumer = Consumer(self.broker_properties) # Configure the AvroConsumer and subscribe to the topics. on_assign is Callback function self.consumer.subscribe([self.topic_name_pattern], on_assign=self.on_assign) def on_assign(self, consumer, partitions): """Callback for when topic assignment takes place""" # Set the topic partition offset to the beginning or earliest for each partition for partition in partitions: partition.offset = OFFSET_BEGINNING logger.info("partitions assigned for %s", self.topic_name_pattern) consumer.assign(partitions) async def consume(self): """Asynchronously consumes data from kafka topic""" while True: num_results = 1 while num_results > 0: num_results = self._consume() await gen.sleep(self.sleep_secs) def _consume(self): """Polls for a message. Returns 1 if a message was received, 0 otherwise""" # # Poll Kafka for messages with handle of any errors or exceptions. # Make sure to return 1 when a message is processed, and 0 when no message # is retrieved. while True: message = self.consumer.poll(1.0) if message is None: logger.info("_consume is incomplete - skipping") return 0 elif message.error() is not None: print(f"error from consumer {message.error()}") else: return 1 def close(self): """Cleans up any open kafka consumers""" # Cleanup the kafka consumer self.consumer.close()
def run_commit_log_consumer(bootstrap_servers, consumer_group, commit_log_topic, partition_state_manager, synchronize_commit_group, start_event, stop_request_event): start_event.set() logging.debug('Starting commit log consumer...') positions = {} # NOTE: The commit log consumer group should not be persisted into the # ``__consumer_offsets`` topic since no offsets are committed by this # consumer. The group membership metadata messages will be published # initially but as long as this group remains a single consumer it will # be deleted after the consumer is closed. # It is very important to note that the ``group.id`` **MUST** be unique to # this consumer process!!! This ensures that it is able to consume from all # partitions of the commit log topic and get a comprehensive view of the # state of the consumer groups it is tracking. consumer = Consumer({ 'bootstrap.servers': bootstrap_servers, 'group.id': consumer_group, 'enable.auto.commit': 'false', 'enable.auto.offset.store': 'true', 'enable.partition.eof': 'false', 'default.topic.config': { 'auto.offset.reset': 'error', }, }) def rewind_partitions_on_assignment(consumer, assignment): # The commit log consumer must start consuming from the beginning of # the commit log topic to ensure that it has a comprehensive view of # all active partitions. consumer.assign([ TopicPartition( i.topic, i.partition, positions.get((i.topic, i.partition), OFFSET_BEGINNING), ) for i in assignment ]) consumer.subscribe( [commit_log_topic], on_assign=rewind_partitions_on_assignment, ) while not stop_request_event.is_set(): message = consumer.poll(1) if message is None: continue error = message.error() if error is not None: raise Exception(error) positions[(message.topic(), message.partition())] = message.offset() + 1 group, topic, partition, offset = get_commit_data(message) if group != synchronize_commit_group: logger.debug('Received consumer offsets update from %r, ignoring...', group) continue if offset in LOGICAL_OFFSETS: logger.debug( 'Skipping invalid logical offset (%r) from %s/%s...', offset, topic, partition) continue elif offset < 0: logger.warning( 'Received unexpected negative offset (%r) from %s/%s!', offset, topic, partition) partition_state_manager.set_remote_offset(topic, partition, offset)
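# run_commit_log_consumer() above relies on a get_commit_data() helper that is
# not shown in this excerpt. Based on how the commit log messages are produced
# elsewhere in this document (key "topic:partition:group", value = offset as a
# string), a plausible sketch of that helper is below; treat it as an assumption
# about the real implementation, not the actual code.
def get_commit_data(message):
    # Key format used by the commit log producers in this document.
    topic, partition, group = message.key().decode("utf-8").split(":", 2)
    offset = int(message.value().decode("utf-8"))
    return group, topic, int(partition), offset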
class SynchronizedConsumer(object): """ This class implements the framework for a consumer that is intended to only consume messages that have already been consumed and committed by members of another consumer group. This works similarly to the Kafka built-in ``__consumer_offsets`` topic. The consumer group that is being "followed" (the one that must make progress for our consumer here to make progress, identified by the ``synchronize_commit_group`` constructor parameter/instance attribute) must report its offsets to a topic (identified by the ``commit_log_topic`` constructor parameter/instance attribute). This consumer subscribes to both commit log topic, as well as the topic(s) that we are actually interested in consuming messages from. The messages received from the commit log topic control whether or not consumption from partitions belonging to the main topic is paused, resumed, or allowed to continue in its current state without changes. The furthest point in any partition that this consumer should ever consume to is the maximum offset that has been recorded to the commit log topic for that partition. If the offsets recorded to that topic move non-monotonically (due to an intentional offset rollback, for instance) this consumer *may* consume up to the highest watermark point. (The implementation here tries to pause consuming from the partition as soon as possible, but this makes no explicit guarantees about that behavior.) """ initial_offset_reset_strategies = { 'earliest': get_earliest_offset, 'latest': get_latest_offset, } def __init__(self, bootstrap_servers, consumer_group, commit_log_topic, synchronize_commit_group, initial_offset_reset='latest', on_commit=None): self.bootstrap_servers = bootstrap_servers self.consumer_group = consumer_group self.commit_log_topic = commit_log_topic self.synchronize_commit_group = synchronize_commit_group self.initial_offset_reset = self.initial_offset_reset_strategies[initial_offset_reset] self.__partition_state_manager = SynchronizedPartitionStateManager( self.__on_partition_state_change) self.__commit_log_consumer, self.__commit_log_consumer_stop_request = self.__start_commit_log_consumer() self.__positions = {} def commit_callback(error, partitions): if on_commit is not None: return on_commit(error, partitions) consumer_configuration = { 'bootstrap.servers': self.bootstrap_servers, 'group.id': self.consumer_group, 'enable.auto.commit': 'false', 'enable.auto.offset.store': 'true', 'enable.partition.eof': 'false', 'default.topic.config': { 'auto.offset.reset': 'error', }, 'on_commit': commit_callback, } self.__consumer = Consumer(consumer_configuration) def __start_commit_log_consumer(self, timeout=None): """ Starts running the commit log consumer. 
""" stop_request_event = threading.Event() start_event = threading.Event() result = execute( functools.partial( run_commit_log_consumer, bootstrap_servers=self.bootstrap_servers, consumer_group='{}:sync:{}'.format(self.consumer_group, uuid.uuid1().hex), commit_log_topic=self.commit_log_topic, synchronize_commit_group=self.synchronize_commit_group, partition_state_manager=self.__partition_state_manager, start_event=start_event, stop_request_event=stop_request_event, ), ) start_event.wait(timeout) return result, stop_request_event def __check_commit_log_consumer_running(self): if not self.__commit_log_consumer.running(): try: result = self.__commit_log_consumer.result(timeout=0) # noqa except TimeoutError: pass # not helpful raise Exception('Commit log consumer unexpectedly exit!') def __on_partition_state_change( self, topic, partition, previous_state_and_offsets, current_state_and_offsets): """ Callback that is invoked when a partition state changes. """ logger.debug('State change for %r: %r to %r', (topic, partition), previous_state_and_offsets, current_state_and_offsets) current_state, current_offsets = current_state_and_offsets if current_offsets.local is None: # It only makes sense to manipulate the consumer if we've got an # assignment. (This block should only be entered at startup if the # remote offsets are retrieved from the commit log before the local # consumer has received its assignment.) return # TODO: This will be called from the commit log consumer thread, so need # to verify that calling the ``consumer.{pause,resume}`` methods is # thread safe! if current_state in (SynchronizedPartitionState.UNKNOWN, SynchronizedPartitionState.SYNCHRONIZED, SynchronizedPartitionState.REMOTE_BEHIND): self.__consumer.pause([TopicPartition(topic, partition, current_offsets.local)]) elif current_state is SynchronizedPartitionState.LOCAL_BEHIND: self.__consumer.resume([TopicPartition(topic, partition, current_offsets.local)]) else: raise NotImplementedError('Unexpected partition state: %s' % (current_state,)) def subscribe(self, topics, on_assign=None, on_revoke=None): """ Subscribe to a topic. """ self.__check_commit_log_consumer_running() def assignment_callback(consumer, assignment): # Since ``auto.offset.reset`` is set to ``error`` to force human # interaction on an offset reset, we have to explicitly specify the # starting offset if no offset has been committed for this topic during # the ``__consumer_offsets`` topic retention period. assignment = { (i.topic, i.partition): self.__positions.get((i.topic, i.partition)) for i in assignment } for i in self.__consumer.committed([TopicPartition(topic, partition) for ( topic, partition), offset in assignment.items() if offset is None]): k = (i.topic, i.partition) if i.offset > -1: assignment[k] = i.offset else: assignment[k] = self.initial_offset_reset(consumer, i.topic, i.partition) self.__consumer.assign([TopicPartition(topic, partition, offset) for (topic, partition), offset in assignment.items()]) for (topic, partition), offset in assignment.items(): # Setting the local offsets will either cause the partition to be # paused (if the remote offset is unknown or the local offset is # not trailing the remote offset) or resumed. 
self.__partition_state_manager.set_local_offset(topic, partition, offset) self.__positions[(topic, partition)] = offset if on_assign is not None: on_assign(self, [TopicPartition(topic, partition) for topic, partition in assignment.keys()]) def revocation_callback(consumer, assignment): for item in assignment: # TODO: This should probably also be removed from the state manager. self.__positions.pop((item.topic, item.partition)) if on_revoke is not None: on_revoke(self, assignment) self.__consumer.subscribe( topics, on_assign=assignment_callback, on_revoke=revocation_callback) def poll(self, timeout): self.__check_commit_log_consumer_running() message = self.__consumer.poll(timeout) if message is None: return if message.error() is not None: return message self.__partition_state_manager.validate_local_message( message.topic(), message.partition(), message.offset()) self.__partition_state_manager.set_local_offset( message.topic(), message.partition(), message.offset() + 1) self.__positions[(message.topic(), message.partition())] = message.offset() + 1 return message def commit(self, *args, **kwargs): self.__check_commit_log_consumer_running() return self.__consumer.commit(*args, **kwargs) def close(self): self.__check_commit_log_consumer_running() self.__commit_log_consumer_stop_request.set() try: self.__consumer.close() finally: self.__commit_log_consumer.result()
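# A sketch of how SynchronizedConsumer above might be driven outside of the
# tests in this document; the broker list, group names, topics, and the
# process() handler are placeholders, and the commit cadence is an assumption
# rather than the project's actual runner.
consumer = SynchronizedConsumer(
    bootstrap_servers='localhost:9092',
    consumer_group='follower-group',
    commit_log_topic='example-commit-log',
    synchronize_commit_group='leader-group',
    initial_offset_reset='earliest',
)
consumer.subscribe(['example-events'])

try:
    while True:
        message = consumer.poll(1.0)
        if message is None:
            continue
        if message.error() is not None:
            raise Exception(message.error())
        # Only offsets already committed by 'leader-group' are delivered here.
        process(message)                                   # hypothetical handler
        consumer.commit(message=message, asynchronous=False)
finally:
    consumer.close()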
def httpry_logs():
    consumer = Consumer({'bootstrap.servers': kafka_hosts,
                         'group.id': 'Httpry_logs_%s' % dt,
                         'default.topic.config': {'auto.offset.reset': 'latest',
                                                  'auto.commit.enable': 'true'}})
    consumer.subscribe(['httpry_logs'])
    try:
        while True:
            msg = consumer.poll()
            if msg:
                if not msg.error():
                    Msg = msg.value().decode('utf-8').strip()
                    try:
                        tm = time.strftime('%Y%m%d%H%M', time.localtime())
                        httpry_Key = 'httpry_domain.%s' % tm
                        if Msg:
                            msg = Msg.split()
                            if len(msg) == 11:
                                if msg[6] != '-':
                                    RC.zincrby(httpry_Key, msg[6], 1)
                                    RC.expire(httpry_Key, 600)
                    except Exception as e:
                        logging.error(e)
                        continue
                elif msg.error().code() != KafkaError._PARTITION_EOF:
                    logging.error(msg.error())
                    continue
    except Exception as e:
        logging.error(e)
    finally:
        consumer.close()
def analytics_internet2_logs(): consumer = Consumer({'bootstrap.servers': kafka_hosts, 'group.id': 'Internet2_logs_%s' %dt,'default.topic.config': {'auto.offset.reset': 'latest','auto.commit.enable':'true'}}) consumer.subscribe(['haproxy_logs']) try: while True: msg = consumer.poll() if not msg.error(): Msg = msg.value().decode('utf-8').strip() try: tt = time.strftime('%Y%m%d', time.localtime()) tm = time.strftime('%Y%m%d%H%M', time.localtime()) Tm = time.strftime('%H:%M', time.localtime()) Tra_ser_minute_Key = 'traffic.ser.%s' % tm Tra_cli_minute_Key = 'traffic.cli.%s' % tm if Msg: Msg = Msg.split() if len(Msg) >= 17: traffic_cli = Msg[10] traffic_ser = Msg[11] Topic = str(Msg[14]).split('|')[0].replace('{', '').strip() IP = str(Msg[5]) Rtime = Msg[8].split('/')[-1] if Rtime.isdigit(): Rtime = int(Rtime) else: Rtime = 0 uv_key = 'baihe_uv_%s' % tt Rt_Key = 'Rtime_%s_%s' % (tt, Topic) PATH = str(Msg[16]).split('?')[0] URL = 'http://%s%s' % (Topic,PATH) Tra_ser_url_minute_Key = 'traffic.ser.url_%s' % Tm Tra_cli_url_minute_Key = 'traffic.cli.url_%s' % Tm for KEY in (uv_key,Rt_Key,Tra_ser_url_minute_Key,Tra_cli_url_minute_Key): RC.expire(KEY,3600) # 流量 if traffic_ser.isdigit() and traffic_cli.isdigit(): RC.zincrby(Tra_cli_url_minute_Key, URL, int(traffic_cli)) RC.zincrby(Tra_ser_url_minute_Key,URL, int(traffic_ser)) # 实时流量 RC.zincrby(Tra_cli_minute_Key, Topic, int(traffic_cli)) RC.expire(Tra_cli_minute_Key, 300) RC.zincrby(Tra_ser_minute_Key, Topic, int(traffic_ser)) RC.expire(Tra_ser_minute_Key, 300) # if Rtime: RC.lpush(Rt_Key, Rtime) RC.sadd(uv_key, IP) except Exception as e: logging.error(e) continue elif msg.error().code() != KafkaError._PARTITION_EOF: logging.error(msg.error()) continue except Exception as e: logging.error(e) finally: consumer.close()